Include custom id (CID) to distinguish CodeQL databases

The current API (as of <2024-07-26 Fri>) identifies a database only by
(owner, name).  This is insufficient for distinguishing CodeQL databases: a
single repository can have several databases that differ in language, CLI
version, or build time.

Other differentiating fields must be considered; this patch combines the fields
    | cliVersion   |
    | creationTime |
    | language     |
    | sha          |
into a single field called CID.  The CID is a hash of these fields, so its
composition can be changed in the future without affecting workflows or the
server.
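
The hash helper lives in qldbtools.utils and is not shown in this commit
view; a minimal sketch of one possible implementation, assuming a truncated
SHA-1 over the stringified field tuple:

    import hashlib

    def cid_hash(fields):
        # Sketch only -- the actual qldbtools.utils.cid_hash may differ.
        # 'fields' is a tuple such as (creationTime, sha, cliVersion, language).
        joined = '\x00'.join(str(field) for field in fields)
        return hashlib.sha1(joined.encode('utf-8')).hexdigest()[:10]

Because consumers treat the CID as opaque, both the field set and the hash
function can change later without breaking them.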

The CID is combined with the owner/name to form a single identifier.  This
requires no changes to the server or the clients -- the database-selection
interface is separate from VS Code and gh-mrva in any case.
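
The helpers that build these combined names, form_db_req_name and
form_db_bucket_name, are defined in qldbtools.utils and not shown in this
view.  A plausible sketch, extrapolating from the f'{owner}${name}.zip'
bucket naming this patch replaces (the separator and layout are assumptions):

    def form_db_req_name(owner, name, cid):
        # Hypothetical: identifier listed in the selection files.
        return f'{owner}/{name}${cid}'

    def form_db_bucket_name(owner, name, cid):
        # Hypothetical: object name for the database zip in minio.
        return f'{owner}${name}${cid}.zip'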

To test this, multiple versions of the same owner/repo pairs are imported
from multiple directories, in this case from
    ~/work-gh/mrva/mrva-open-source-download/repos
and
    ~/work-gh/mrva/mrva-open-source-download/repos-2024-04-29/
The unique database count increases from 3000 to 5360 -- see README.md, or
inspect the result with
    ./bin/mc-db-view-info < db-info-3.csv &

Other code modifications:
    - Push (owner, repo, cid) names to minio
    - Generate databases.json for use in the VS Code extension (sample shape below)
    - Generate list-databases.json for use by the gh-mrva client (sample shape below)
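
For reference, the two generated files have the following shapes, taken from
the generator script included in this commit; the repository strings are
illustrative, assuming form_db_req_name yields 'owner/name$CID'-style names:

    databases.json (VS Code plugin):
        {
            "version": 1,
            "databases": {
                "variantAnalysis": {
                    "repositoryLists": [
                        {"name": "mirva-list",
                         "repositories": ["owner/name$<CID>", ...]}
                    ],
                    "owners": [],
                    "repositories": []
                }
            },
            "selected": {
                "kind": "variantAnalysisUserDefinedList",
                "listName": "mirva-list"
            }
        }

    list-databases.json (gh-mrva):
        {"mirva-list": ["owner/name$<CID>", ...]}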
Michael Hohn authored 2024-07-30 10:47:29 -07:00, committed by Michael Hohn
parent b4f1a2b8a6, commit 1e1daf9330
8 changed files with 322 additions and 52 deletions


@@ -0,0 +1,103 @@
#!/usr/bin/env python
""" Read a table of CodeQL DB information
and generate the selection files for
1. the VS Code CodeQL plugin
2. the gh-mrva command-line client
"""
import argparse
import logging
import qldbtools.utils as utils
import numpy as np
#
#* Configure logger
#
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
# Overwrite log level set by minio
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
#
#* Process command line
#
parser = argparse.ArgumentParser(
    description=""" Read a table of CodeQL DB information
    and generate the selection files for
    1. the VS Code CodeQL plugin
    2. the gh-mrva command-line client
    """,
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('vscode_selection', type=str,
                    help='VS Code selection file to generate')
parser.add_argument('gh_mrva_selection', type=str,
                    help='gh-mrva cli selection file to generate')
parser.add_argument('-n', '--num-entries', type=int,
                    help='Only use N entries',
                    default=None)
parser.add_argument('-s', '--seed', type=int,
                    help='Random number seed',
                    default=4242)
parser.add_argument('-l', '--list-name', type=str,
                    help='Name of the repository list',
                    default='mirva-list')
args = parser.parse_args()
#
#* Load the information
#
import pandas as pd
import sys
df0 = pd.read_csv(sys.stdin)
if args.num_entries is None:
    # Use all entries
    df1 = df0
else:
    # Use num_entries entries, chosen via pseudo-random numbers
    df1 = df0.sample(n=args.num_entries,
                     random_state=np.random.RandomState(args.seed))
#
#* Form and save structures
#
repos = []
for index, row in df1[['owner', 'name', 'CID', 'path']].iterrows():
    owner, name, CID, path = row
    repos.append(utils.form_db_req_name(owner, name, CID))
repo_list_name = args.list_name
vsc = {
    "version": 1,
    "databases": {
        "variantAnalysis": {
            "repositoryLists": [
                {
                    "name": repo_list_name,
                    "repositories": repos,
                }
            ],
            "owners": [],
            "repositories": []
        }
    },
    "selected": {
        "kind": "variantAnalysisUserDefinedList",
        "listName": repo_list_name
    }
}
gh = {
    repo_list_name: repos
}
import json
with open(args.vscode_selection, "w") as fc:
    json.dump(vsc, fc, indent=4)
with open(args.gh_mrva_selection, "w") as fc:
    json.dump(gh, fc, indent=4)
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
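
A typical invocation might look as follows, assuming the script is installed
as bin/mc-db-generate-selections (its file name is not visible in this view)
and reads the deduplicated CSV on stdin:

    ./bin/mc-db-generate-selections databases.json list-databases.json \
        --num-entries 100 --seed 4242 --list-name mirva-list < db-info-3.csv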


@@ -72,9 +72,10 @@ except S3Error as err:
     logging.error(f"Error creating bucket: {err}")
 # Get info from dataframe and push the files
-for index, row in entries[['owner', 'name', 'path']].iterrows():
-    owner, name, path = row
-    new_name = f'{owner}${name}.zip'
+# XX: include CID.
+for index, row in entries[['owner', 'name', 'CID', 'path']].iterrows():
+    owner, name, CID, path = row
+    new_name = utils.form_db_bucket_name(owner, name, CID)
     try:
         client.fput_object(QL_DB_BUCKET_NAME, new_name, path)
         logging.info(f"Uploaded {path} as {new_name} to bucket {QL_DB_BUCKET_NAME}")
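
For symmetry, a hypothetical inverse helper for consumers that list the
bucket, assuming the '$'-separated naming sketched earlier:

    def parse_db_bucket_name(bucket_name):
        # Hypothetical: split 'owner$name$CID.zip' back into its parts.
        owner, name, cid = bucket_name.removesuffix('.zip').split('$')
        return owner, name, cid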


@@ -43,6 +43,14 @@ for left_index in range(0, len(d)-1):
 joiners_df = pd.concat(joiners, axis=0)
 full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer')
+#** Add single uniqueness field -- CID (Cumulative ID)
+full_df['CID'] = full_df.apply(lambda row:
+                               utils.cid_hash((row['creationTime'],
+                                               row['sha'],
+                                               row['cliVersion'],
+                                               row['language'])
+                                              ), axis=1)
 #** Re-order the dataframe columns by importance
 # - Much of the data
 #   1. Is only conditionally present
@@ -70,11 +78,13 @@ full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer')
 # | primaryLanguage |
 # | finalised       |
-final_df = full_df.reindex(columns=['owner', 'name', 'language', 'size', 'cliVersion',
-                                    'creationTime', 'sha', 'baselineLinesOfCode', 'path',
-                                    'db_lang', 'db_lang_displayName', 'db_lang_file_count',
-                                    'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
-                                    'finalised', 'left_index'])
+final_df = full_df.reindex(columns=['owner', 'name', 'cliVersion',
+                                    'creationTime', 'language', 'sha', 'CID',
+                                    'baselineLinesOfCode', 'path', 'db_lang',
+                                    'db_lang_displayName', 'db_lang_file_count',
+                                    'db_lang_linesOfCode', 'ctime',
+                                    'primaryLanguage', 'finalised', 'left_index',
+                                    'size'])
 final_df.to_csv(sys.stdout, index=False)


@@ -1,7 +1,8 @@
 #!/usr/bin/env python
 """ Read a table of CodeQL DB information,
-group entries by (owner,name), sort each group by
-creationTime and keep only the top (newest) element.
+group entries by (owner,name,CID),
+sort each group by creationTime,
+and keep only the top (newest) element.
 """
 import argparse
 import logging
@@ -32,8 +33,8 @@ import sys
 df0 = pd.read_csv(sys.stdin)
-df_sorted = df0.sort_values(by=['owner', 'name', 'creationTime'])
-df_unique = df_sorted.groupby(['owner', 'name']).first().reset_index()
+df_sorted = df0.sort_values(by=['owner', 'name', 'CID', 'creationTime'])
+df_unique = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index()
 df_unique.to_csv(sys.stdout, index=False)
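
A toy illustration of this dedup step: groupby(...).first() keeps the first
row of each group in sort order, so the newest element per (owner, name, CID)
is kept when creationTime is ordered newest-first within each group:

    import pandas as pd

    df0 = pd.DataFrame({
        'owner': ['a', 'a'], 'name': ['r', 'r'], 'CID': ['c1', 'c1'],
        'creationTime': ['2024-04-29', '2024-07-26'],
    })
    # Sort newest-first within each (owner, name, CID) group ...
    df_sorted = df0.sort_values(by=['owner', 'name', 'CID', 'creationTime'],
                                ascending=[True, True, True, False])
    # ... so that .first() keeps the newest row of each group.
    df_unique = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index()
    print(df_unique)  # one row, creationTime 2024-07-26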