Include custom id (CID) to distinguish CodeQL databases
The current API (as of 2024-07-26) is keyed only by (owner, name); this is
insufficient for distinguishing CodeQL databases.
Other differences must be considered; this patch combines the fields
| cliVersion |
| creationTime |
| language |
| sha |
into one called CID. The CID field is a hash of these others and therefore can be
changed in the future without affecting workflows or the server.
The CID is combined with the owner/name to form one
identifier. This requires no changes to server or client -- the db
selection's interface is separate from VS Code and gh-mrva in any case.
To test this, this version imports multiple versions of the same owner/repo pairs from multiple directories. In this case, from
~/work-gh/mrva/mrva-open-source-download/repos
and
~/work-gh/mrva/mrva-open-source-download/repos-2024-04-29/
The unique database count increases from 3000 to 5360 -- see README.md,
./bin/mc-db-view-info < db-info-3.csv &
Other code modifications:
- Push (owner,repo,cid) names to minio
- Generate databases.json for use in vs code extension
- Generate list-databases.json for use by gh-mrva client
This commit is contained in:
committed by
Michael Hohn
parent
b4f1a2b8a6
commit
1e1daf9330
61
client/qldbtools/qldbtools/session-generate-selection.py
Normal file
61
client/qldbtools/qldbtools/session-generate-selection.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""Read a table of CodeQL DB information

and generate the selection files for

1. the VS Code CodeQL plugin
2. the gh-mrva command-line client
"""

#
#* Collect the information and write files
#
import json

import pandas as pd
import numpy as np

import qldbtools.utils as utils
import importlib
importlib.reload(utils)  # session script: pick up edits to utils without restarting

# Source table produced upstream; one row per distinct CodeQL database,
# including the CID uniqueness column and the on-disk path.
df0 = pd.read_csv('db-info-3.csv')

# Take a small pseudo-random sample; the fixed seed keeps the selection
# reproducible across runs.
num_entries = 3
df1 = df0.sample(n=num_entries, random_state=np.random.RandomState(4242))

# Build the request names (owner/name plus CID marker) understood by
# the MRVA server.
repos = []
for index, row in df1[['owner', 'name', 'CID', 'path']].iterrows():
    owner, name, CID, path = row
    repos.append(utils.form_db_req_name(owner, name, CID))

repo_list_name = "mirva-list"

# Selection structure for the VS Code CodeQL extension.
vsc = {
    "version": 1,
    "databases": {
        "variantAnalysis": {
            "repositoryLists": [
                {
                    "name": repo_list_name,
                    "repositories": repos,
                }
            ],
            "owners": [],
            "repositories": []
        }
    },
    "selected": {
        "kind": "variantAnalysisUserDefinedList",
        "listName": repo_list_name
    }
}

# Selection structure for the gh-mrva command-line client.
gh = {
    repo_list_name: repos
}


# Write the files.
with open("tmp-selection-vsc.json", "w") as fc:
    json.dump(vsc, fc, indent=4)
with open("tmp-selection-gh.json", "w") as fc:
    json.dump(gh, fc, indent=4)

# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
|
||||
45
client/qldbtools/qldbtools/session-post-refine-info.py
Normal file
45
client/qldbtools/qldbtools/session-post-refine-info.py
Normal file
@@ -0,0 +1,45 @@
|
||||
"""Post-process db-info-2.csv: add a CID column that uniquely
identifies each CodeQL database build.
"""

import qldbtools.utils as utils  # kept for interactive-session use; not referenced below
import pandas as pd

#
#* Collect the information
#
df1 = pd.read_csv("db-info-2.csv")

# Add a single uniqueness field -- CID -- derived from
# - creationTime
# - sha
# - cliVersion
# - language

from hashlib import blake2b

def cid_hash(row_tuple: tuple):
    """
    cid_hash(row_tuple)

    Hash row_tuple's str() representation with BLAKE2b (digest_size=3)
    and return the 6-character hex string.
    """
    h = blake2b(digest_size = 3)
    h.update(str(row_tuple).encode())
    return h.hexdigest()

# Apply cid_hash to the distinguishing columns to create the 'CID' column
df1['CID'] = df1.apply(lambda row: cid_hash( (row['creationTime'],
                                              row['sha'],
                                              row['cliVersion'],
                                              row['language'])
                                             ), axis=1)

# Put the identifying columns first; reindex keeps only the listed columns.
df2 = df1.reindex(columns=['owner', 'name', 'cliVersion', 'creationTime',
                           'language', 'sha', 'CID', 'baselineLinesOfCode', 'path',
                           'db_lang', 'db_lang_displayName', 'db_lang_file_count',
                           'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
                           'finalised', 'left_index', 'size'])

# BUG FIX: the original ended with the bare expression df1['cid'],
# which raises KeyError at runtime (the column created above is 'CID')
# and was dead code in any case; removed.
# NOTE(review): df2 is computed but never written out, yet the
# downstream selection script reads db-info-3.csv -- confirm whether
# df2.to_csv('db-info-3.csv', index=False) was intended here.

# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
|
||||
@@ -175,6 +175,38 @@ def metadata_details(left_index, codeql_content, meta_content):
|
||||
|
||||
# NOTE(review): presumably raised when expected DB metadata fields are
# absent -- confirm against metadata_details() above.
class DetailsMissing(Exception): pass
|
||||
|
||||
from hashlib import blake2b


def cid_hash(row_tuple: tuple) -> str:
    """Return a short hex digest identifying *row_tuple*.

    The tuple's str() representation is hashed with BLAKE2b at
    digest_size=3, yielding a 6-character hexadecimal string.

    NOTE(review): a 24-bit digest spans only ~16.7M values; with
    thousands of databases the birthday-collision risk is non-trivial
    -- confirm this is acceptable before scaling up.
    """
    digest = blake2b(digest_size=3)
    digest.update(str(row_tuple).encode())
    return digest.hexdigest()
|
||||
|
||||
def form_db_bucket_name(owner, name, CID):
    """Return the object name used for this database in minio storage.

    Trivial on purpose: centralising the format here keeps every
    producer and consumer of bucket names consistent.

    The 'ctsj' infix is a random, unique marker separating the repo
    name from the CID.
    """
    return f"{owner}${name}ctsj{CID}.zip"
|
||||
|
||||
def form_db_req_name(owner, name, CID):
    """Return the identifier used for this database in mrva requests.

    Trivial on purpose: centralising the format here keeps every
    producer and consumer of request names consistent.

    The 'ctsj' infix is a random, unique marker separating the repo
    name from the CID.
    """
    return f"{owner}/{name}ctsj{CID}"
|
||||
|
||||
|
||||
# Local Variables:
|
||||
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
|
||||
|
||||
Reference in New Issue
Block a user