Include custom id (CID) to distinguish CodeQL databases
The current API (as of 2024-07-26) is keyed only by (owner, name); this is
insufficient for distinguishing CodeQL databases.
Other differences must be considered; this patch combines the fields
| cliVersion |
| creationTime |
| language |
| sha |
into one called CID. The CID field is a hash of these others and therefore can be
changed in the future without affecting workflows or the server.
The CID is combined with the owner/name to form one
identifier. This requires no changes to server or client -- the db
selection's interface is separate from VS Code and gh-mrva in any case.
To test this, this version imports multiple versions of the same owner/repo pairs from multiple directories. In this case, from
~/work-gh/mrva/mrva-open-source-download/repos
and
~/work-gh/mrva/mrva-open-source-download/repos-2024-04-29/
The unique database count increases from 3000 to 5360 -- see README.md,
./bin/mc-db-view-info < db-info-3.csv &
Other code modifications:
- Push (owner,repo,cid) names to minio
- Generate databases.json for use in vs code extension
- Generate list-databases.json for use by gh-mrva client
This commit is contained in:
committed by
Michael Hohn
parent
b4f1a2b8a6
commit
1e1daf9330
61
client/qldbtools/qldbtools/session-generate-selection.py
Normal file
61
client/qldbtools/qldbtools/session-generate-selection.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""Read a table of CodeQL DB information

and generate the selection files for

1. the VS Code CodeQL plugin
2. the gh-mrva command-line client
"""

#
#* Collect the information and write files
#
import json

import pandas as pd
import numpy as np

import qldbtools.utils as utils
import importlib
importlib.reload(utils)  # session script: pick up edits to utils without restarting

# Source table produced upstream; one row per distinct CodeQL database,
# including the CID uniqueness column and the on-disk path.
df0 = pd.read_csv('db-info-3.csv')

# Take a small pseudo-random sample; the fixed seed keeps the selection
# reproducible across runs.
num_entries = 3
df1 = df0.sample(n=num_entries, random_state=np.random.RandomState(4242))

# Build the request names (owner/name plus CID marker) understood by
# the MRVA server.
repos = []
for index, row in df1[['owner', 'name', 'CID', 'path']].iterrows():
    owner, name, CID, path = row
    repos.append(utils.form_db_req_name(owner, name, CID))

repo_list_name = "mirva-list"

# Selection structure for the VS Code CodeQL extension.
vsc = {
    "version": 1,
    "databases": {
        "variantAnalysis": {
            "repositoryLists": [
                {
                    "name": repo_list_name,
                    "repositories": repos,
                }
            ],
            "owners": [],
            "repositories": []
        }
    },
    "selected": {
        "kind": "variantAnalysisUserDefinedList",
        "listName": repo_list_name
    }
}

# Selection structure for the gh-mrva command-line client.
gh = {
    repo_list_name: repos
}


# Write the files.
with open("tmp-selection-vsc.json", "w") as fc:
    json.dump(vsc, fc, indent=4)
with open("tmp-selection-gh.json", "w") as fc:
    json.dump(gh, fc, indent=4)

# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
|
||||
45
client/qldbtools/qldbtools/session-post-refine-info.py
Normal file
45
client/qldbtools/qldbtools/session-post-refine-info.py
Normal file
@@ -0,0 +1,45 @@
|
||||
"""Post-process db-info-2.csv: add a CID column that uniquely
identifies each CodeQL database build.
"""

import qldbtools.utils as utils  # kept for interactive-session use; not referenced below
import pandas as pd

#
#* Collect the information
#
df1 = pd.read_csv("db-info-2.csv")

# Add a single uniqueness field -- CID -- derived from
# - creationTime
# - sha
# - cliVersion
# - language

from hashlib import blake2b

def cid_hash(row_tuple: tuple):
    """
    cid_hash(row_tuple)

    Hash row_tuple's str() representation with BLAKE2b (digest_size=3)
    and return the 6-character hex string.
    """
    h = blake2b(digest_size = 3)
    h.update(str(row_tuple).encode())
    return h.hexdigest()

# Apply cid_hash to the distinguishing columns to create the 'CID' column
df1['CID'] = df1.apply(lambda row: cid_hash( (row['creationTime'],
                                              row['sha'],
                                              row['cliVersion'],
                                              row['language'])
                                             ), axis=1)

# Put the identifying columns first; reindex keeps only the listed columns.
df2 = df1.reindex(columns=['owner', 'name', 'cliVersion', 'creationTime',
                           'language', 'sha', 'CID', 'baselineLinesOfCode', 'path',
                           'db_lang', 'db_lang_displayName', 'db_lang_file_count',
                           'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
                           'finalised', 'left_index', 'size'])

# BUG FIX: the original ended with the bare expression df1['cid'],
# which raises KeyError at runtime (the column created above is 'CID')
# and was dead code in any case; removed.
# NOTE(review): df2 is computed but never written out, yet the
# downstream selection script reads db-info-3.csv -- confirm whether
# df2.to_csv('db-info-3.csv', index=False) was intended here.

# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
|
||||
@@ -175,6 +175,38 @@ def metadata_details(left_index, codeql_content, meta_content):
|
||||
|
||||
# NOTE(review): presumably raised when expected DB metadata fields are
# absent -- confirm against metadata_details() above.
class DetailsMissing(Exception): pass
|
||||
|
||||
from hashlib import blake2b


def cid_hash(row_tuple: tuple) -> str:
    """Return a short hex digest identifying *row_tuple*.

    The tuple's str() representation is hashed with BLAKE2b at
    digest_size=3, yielding a 6-character hexadecimal string.

    NOTE(review): a 24-bit digest spans only ~16.7M values; with
    thousands of databases the birthday-collision risk is non-trivial
    -- confirm this is acceptable before scaling up.
    """
    digest = blake2b(digest_size=3)
    digest.update(str(row_tuple).encode())
    return digest.hexdigest()
|
||||
|
||||
def form_db_bucket_name(owner, name, CID):
    """Return the object name used for this database in minio storage.

    Trivial on purpose: centralising the format here keeps every
    producer and consumer of bucket names consistent.

    The 'ctsj' infix is a random, unique marker separating the repo
    name from the CID.
    """
    return f"{owner}${name}ctsj{CID}.zip"
|
||||
|
||||
def form_db_req_name(owner, name, CID):
    """Return the identifier used for this database in mrva requests.

    Trivial on purpose: centralising the format here keeps every
    producer and consumer of request names consistent.

    The 'ctsj' infix is a random, unique marker separating the repo
    name from the CID.
    """
    return f"{owner}/{name}ctsj{CID}"
|
||||
|
||||
|
||||
# Local Variables:
|
||||
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
|
||||
|
||||
Reference in New Issue
Block a user