Include custom id (CID) to distinguish CodeQL databases
The current API (as of 2024-07-26) identifies databases only by (owner, name).
This is insufficient for distinguishing CodeQL databases.
Other differences must be considered; this patch combines the fields
| cliVersion |
| creationTime |
| language |
| sha |
into one called CID. The CID field is a hash of these others and therefore can be
changed in the future without affecting workflows or the server.
The CID is combined with the owner/name to form a single
identifier. This requires no changes to server or client -- the DB
selection's interface is separate from VS Code and gh-mrva in any case.
To test this, this version imports multiple versions of the same owner/repo pairs from multiple directories; in this case, from
~/work-gh/mrva/mrva-open-source-download/repos
and
~/work-gh/mrva/mrva-open-source-download/repos-2024-04-29/
The unique database count increases from 3000 to 5360 -- see README.md, and run
./bin/mc-db-view-info < db-info-3.csv &
Other code modifications:
- Push (owner,repo,cid) names to minio
- Generate databases.json for use in vs code extension
- Generate list-databases.json for use by gh-mrva client
This commit is contained in:
committed by
=Michael Hohn
parent
b4f1a2b8a6
commit
1e1daf9330
103
client/qldbtools/bin/mc-db-generate-selection
Executable file
103
client/qldbtools/bin/mc-db-generate-selection
Executable file
@@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env python
""" Read a table of CodeQL DB information
    and generate the selection files for
    1. the VS Code CodeQL plugin
    2. the gh-mrva command-line client
"""
import argparse
import json
import logging
import sys

import numpy as np
import pandas as pd

import qldbtools.utils as utils

#
#* Configure logger
#
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
# Overwrite log level set by minio
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

#
#* Process command line
#
parser = argparse.ArgumentParser(
    description=""" Read a table of CodeQL DB information
    and generate the selection files for
    1. the VS Code CodeQL plugin
    2. the gh-mrva command-line client
    """,
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('vscode_selection', type=str,
                    help='VS Code selection file to generate')
parser.add_argument('gh_mrva_selection', type=str,
                    help='gh-mrva cli selection file to generate')
parser.add_argument('-n', '--num-entries', type=int,
                    help='Only use N entries',
                    default=None)
parser.add_argument('-s', '--seed', type=int,
                    help='Random number seed',
                    default=4242)
parser.add_argument('-l', '--list-name', type=str,
                    help='Name of the repository list',
                    default='mirva-list')

args = parser.parse_args()

#
#* Load the information
#
df0 = pd.read_csv(sys.stdin)

if args.num_entries is None:
    # Use all entries
    df1 = df0
else:
    # Use num_entries, chosen via pseudo-random numbers; a fixed seed
    # keeps the sample reproducible across runs.
    df1 = df0.sample(n=args.num_entries,
                     random_state=np.random.RandomState(args.seed))

#
#* Form and save structures
#
# Each request name combines (owner, name, CID) so that multiple
# database versions of the same repository remain distinguishable.
repos = [utils.form_db_req_name(owner, name, CID)
         for owner, name, CID in
         df1[['owner', 'name', 'CID']].itertuples(index=False)]

repo_list_name = args.list_name

# Selection structure consumed by the VS Code CodeQL extension.
vsc = {
    "version": 1,
    "databases": {
        "variantAnalysis": {
            "repositoryLists": [
                {
                    "name": repo_list_name,
                    "repositories": repos,
                }
            ],
            "owners": [],
            "repositories": []
        }
    },
    "selected": {
        "kind": "variantAnalysisUserDefinedList",
        "listName": repo_list_name
    }
}

# Selection structure consumed by the gh-mrva command-line client.
gh = {
    repo_list_name: repos
}

with open(args.vscode_selection, "w") as fc:
    json.dump(vsc, fc, indent=4)

with open(args.gh_mrva_selection, "w") as fc:
    json.dump(gh, fc, indent=4)

# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
|
||||
@@ -72,9 +72,10 @@ except S3Error as err:
|
||||
logging.error(f"Error creating bucket: {err}")
|
||||
|
||||
# Get info from dataframe and push the files
|
||||
for index, row in entries[['owner', 'name', 'path']].iterrows():
|
||||
owner, name, path = row
|
||||
new_name = f'{owner}${name}.zip'
|
||||
# XX: include CID.
|
||||
for index, row in entries[['owner', 'name', 'CID', 'path']].iterrows():
|
||||
owner, name, CID, path = row
|
||||
new_name = utils.form_db_bucket_name(owner, name, CID)
|
||||
try:
|
||||
client.fput_object(QL_DB_BUCKET_NAME, new_name, path)
|
||||
logging.info(f"Uploaded {path} as {new_name} to bucket {QL_DB_BUCKET_NAME}")
|
||||
|
||||
@@ -43,6 +43,14 @@ for left_index in range(0, len(d)-1):
|
||||
joiners_df = pd.concat(joiners, axis=0)
|
||||
full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer')
|
||||
|
||||
#** Add single uniqueness field -- CID (Cumulative ID)
|
||||
full_df['CID'] = full_df.apply(lambda row:
|
||||
utils.cid_hash( (row['creationTime'],
|
||||
row['sha'],
|
||||
row['cliVersion'],
|
||||
row['language'])
|
||||
), axis=1)
|
||||
|
||||
#** Re-order the dataframe columns by importance
|
||||
# - Much of the data
|
||||
# 1. Is only conditionally present
|
||||
@@ -70,11 +78,13 @@ full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='o
|
||||
# | primaryLanguage |
|
||||
# | finalised |
|
||||
|
||||
final_df = full_df.reindex(columns=['owner', 'name', 'language', 'size', 'cliVersion',
|
||||
'creationTime', 'sha', 'baselineLinesOfCode', 'path',
|
||||
'db_lang', 'db_lang_displayName', 'db_lang_file_count',
|
||||
'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
|
||||
'finalised', 'left_index'])
|
||||
final_df = full_df.reindex( columns=['owner', 'name', 'cliVersion',
|
||||
'creationTime', 'language', 'sha','CID',
|
||||
'baselineLinesOfCode', 'path', 'db_lang',
|
||||
'db_lang_displayName', 'db_lang_file_count',
|
||||
'db_lang_linesOfCode', 'ctime',
|
||||
'primaryLanguage', 'finalised', 'left_index',
|
||||
'size'])
|
||||
|
||||
final_df.to_csv(sys.stdout, index=False)
|
||||
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
#!/usr/bin/env python
|
||||
""" Read a table of CodeQL DB information,
|
||||
group entries by (owner,name), sort each group by
|
||||
creationTime and keep only the top (newest) element.
|
||||
group entries by (owner,name,CID),
|
||||
sort each group by creationTime,
|
||||
and keep only the top (newest) element.
|
||||
"""
|
||||
import argparse
|
||||
import logging
|
||||
@@ -32,8 +33,8 @@ import sys
|
||||
|
||||
df0 = pd.read_csv(sys.stdin)
|
||||
|
||||
df_sorted = df0.sort_values(by=['owner', 'name', 'creationTime'])
|
||||
df_unique = df_sorted.groupby(['owner', 'name']).first().reset_index()
|
||||
df_sorted = df0.sort_values(by=['owner', 'name', 'CID', 'creationTime'])
|
||||
df_unique = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index()
|
||||
|
||||
df_unique.to_csv(sys.stdout, index=False)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user