From 1e1daf9330082bb09a6a823460ce367a22a7e396 Mon Sep 17 00:00:00 2001
From: Michael Hohn <hohn@github.com>
Date: Tue, 30 Jul 2024 10:47:29 -0700
Subject: [PATCH] Include custom id (CID) to distinguish CodeQL databases

The current api (<2024-07-26 Fri>) is set up only for (owner,name).  This is
insufficient for distinguishing CodeQL databases.

Other differences must be considered;  this patch combines the fields
    | cliVersion   |
    | creationTime |
    | language     |
    | sha          |
into one called CID.  The CID field is a hash of these others and therefore can be
changed in the future without affecting workflows or the server.

The cid is combined with the owner/name to form one
identifier.  This requires no changes to server or client -- the db
selection's interface is separate from VS Code and gh-mrva in any case.

To test this, this version imports multiple versions of the same owner/repo pairs from multiple directories.  In this case, from
    ~/work-gh/mrva/mrva-open-source-download/repos
and
    ~/work-gh/mrva/mrva-open-source-download/repos-2024-04-29/
The unique database count increases from 3000 to 5360 -- see README.md,
    ./bin/mc-db-view-info < db-info-3.csv &

Other code modifications:
    - Push (owner,name,CID) names to minio
    - Generate databases.json for use in the VS Code extension
    - Generate list-databases.json for use by the gh-mrva client
---
 client/qldbtools/README.md                    |  97 ++++++++++-------
 client/qldbtools/bin/mc-db-generate-selection | 103 ++++++++++++++++++
 client/qldbtools/bin/mc-db-populate-minio     |   7 +-
 client/qldbtools/bin/mc-db-refine-info        |  20 +++-
 client/qldbtools/bin/mc-db-unique             |   9 +-
 .../qldbtools/session-generate-selection.py   |  61 +++++++++++
 .../qldbtools/session-post-refine-info.py     |  45 ++++++++
 client/qldbtools/qldbtools/utils.py           |  32 ++++++
 8 files changed, 322 insertions(+), 52 deletions(-)
 create mode 100755 client/qldbtools/bin/mc-db-generate-selection
 create mode 100644 client/qldbtools/qldbtools/session-generate-selection.py
 create mode 100644 client/qldbtools/qldbtools/session-post-refine-info.py

diff --git a/client/qldbtools/README.md b/client/qldbtools/README.md
index 4400754..05279d4 100644
--- a/client/qldbtools/README.md
+++ b/client/qldbtools/README.md
@@ -6,48 +6,48 @@ qldbtools is a Python package for working with CodeQL databases
 
 -   Set up the virtual environment and install tools
 
-        cd ~/work-gh/mrva/mrvacommander/client/qldbtools/
-        python3.11 -m venv venv
-        source venv/bin/activate
-        pip install --upgrade pip
+                cd ~/work-gh/mrva/mrvacommander/client/qldbtools/
+                python3.11 -m venv venv
+                source venv/bin/activate
+                pip install --upgrade pip
 
-        # From requirements.txt
-        pip install -r requirements.txt
-        # Or explicitly
-        pip install jupyterlab pandas ipython
-        pip install lckr-jupyterlab-variableinspector
+                # From requirements.txt
+                pip install -r requirements.txt
+                # Or explicitly
+                pip install jupyterlab pandas ipython
+                pip install lckr-jupyterlab-variableinspector
 
 -   Run jupyterlab
 
-        cd ~/work-gh/mrva/mrvacommander/client
-        source venv/bin/activate
-        jupyter lab &
-        
-    The variable inspector is a right-click on an open console or notebook.
-    
-    The `jupyter` command produces output including
-    
-        Jupyter Server 2.14.1 is running at:
-        http://127.0.0.1:8888/lab?token=4c91308819786fe00a33b76e60f3321840283486457516a1
+                cd ~/work-gh/mrva/mrvacommander/client
+                source venv/bin/activate
+                jupyter lab &
+               
+        The variable inspector is a right-click on an open console or notebook.
+       
+        The `jupyter` command produces output including
+       
+                Jupyter Server 2.14.1 is running at:
+                http://127.0.0.1:8888/lab?token=4c91308819786fe00a33b76e60f3321840283486457516a1
 
-    Use this to connect multiple front ends
+        Use this to connect multiple front ends
 
 -   Local development
 
-    ```bash
-    cd ~/work-gh/mrva/mrvacommander/client/qldbtools
-    source venv/bin/activate
-    pip install --editable .
-    ```
+        ```bash
+        cd ~/work-gh/mrva/mrvacommander/client/qldbtools
+        source venv/bin/activate
+        pip install --editable .
+        ```
 
-    The `--editable` *should* use symlinks for all scripts; use `./bin/*` to be sure.
+        The `--editable` *should* use symlinks for all scripts; use `./bin/*` to be sure.
 
 
 -   Full installation
 
-    ```bash
-    pip install qldbtools
-    ```
+        ```bash
+        pip install qldbtools
+        ```
 
 
 ## Use as library
@@ -58,15 +58,32 @@ import qldbtools as ql
 
 ## Command-line use
 
-    cd ~/work-gh/mrva/mrvacommander/client/qldbtools
-    ./bin/mc-db-initial-info ~/work-gh/mrva/mrva-open-source-download > db-info-1.csv
-    
-    ./bin/mc-db-refine-info < db-info-1.csv > db-info-2.csv
-    
-    ./bin/mc-db-populate-minio < db-info-2.csv -n 3
+   Initial information collection requires a unique file path so it can be run
+   repeatedly over DB collections with the same (owner,name) but other differences
+   -- namely, in one or more of
 
-    ./bin/mc-db-view-info < db-info-2.csv 
-    
-    ./bin/mc-db-unique < db-info-2.csv > db-info-3.csv
-    
-        
+   - creationTime
+   - sha
+   - cliVersion
+   - language
+
+   Those fields are collected and a single name addendum formed in
+   `bin/mc-db-refine-info`.
+
+   XX: Add `mc-db-generate-selection`
+
+   The command sequence, grouped by data files, is
+
+        cd ~/work-gh/mrva/mrvacommander/client/qldbtools
+        ./bin/mc-db-initial-info ~/work-gh/mrva/mrva-open-source-download > db-info-1.csv
+        ./bin/mc-db-refine-info < db-info-1.csv > db-info-2.csv
+       
+        ./bin/mc-db-view-info < db-info-2.csv &
+        ./bin/mc-db-unique < db-info-2.csv > db-info-3.csv
+        ./bin/mc-db-view-info < db-info-3.csv &
+
+        ./bin/mc-db-populate-minio -n 23 < db-info-3.csv
+        ./bin/mc-db-generate-selection -n 23 vscode-selection.json gh-mrva-selection.json < db-info-3.csv 
+       
+       
+               
diff --git a/client/qldbtools/bin/mc-db-generate-selection b/client/qldbtools/bin/mc-db-generate-selection
new file mode 100755
index 0000000..009f7f2
--- /dev/null
+++ b/client/qldbtools/bin/mc-db-generate-selection
@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+""" Read a table of CodeQL DB information
+    and generate the selection files for
+    1. the VS Code CodeQL plugin
+    2. the gh-mrva command-line client
+"""
+import argparse
+import logging
+import qldbtools.utils as utils
+import numpy as np
+
+#
+#* Configure logger
+# 
+logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
+# Overwrite log level set by minio
+root_logger = logging.getLogger()
+root_logger.setLevel(logging.INFO)
+
+#
+#* Process command line
+#
+parser = argparse.ArgumentParser(
+    description=""" Read a table of CodeQL DB information
+    and generate the selection files for
+    1. the VS Code CodeQL plugin
+    2. the gh-mrva command-line client
+    """,
+    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('vscode_selection', type=str,
+                    help='VS Code selection file to generate')
+parser.add_argument('gh_mrva_selection', type=str,
+                    help='gh-mrva cli selection file to generate')
+parser.add_argument('-n', '--num-entries', type=int, 
+                    help='Only use N entries', 
+                    default=None)
+parser.add_argument('-s', '--seed', type=int, 
+                    help='Random number seed', 
+                    default=4242)
+parser.add_argument('-l', '--list-name', type=str, 
+                    help='Name of the repository list',
+                    default='mirva-list')
+
+args = parser.parse_args()
+#
+#* Load the information
+#
+import pandas as pd
+import sys
+
+df0 = pd.read_csv(sys.stdin)
+
+if args.num_entries is None:
+    # No -n given: use all entries
+    df1 = df0
+else:
+    # Use num_entries rows; the fixed seed makes the pseudo-random choice repeatable
+    df1 = df0.sample(n=args.num_entries,
+                    random_state=np.random.RandomState(args.seed))
+
+#
+#* Form and save structures
+#
+repos = []
+for index, row in df1[['owner', 'name', 'CID', 'path']].iterrows():
+    owner, name, CID, path = row
+    repos.append(utils.form_db_req_name(owner, name, CID))
+
+repo_list_name = args.list_name
+vsc = {
+    "version": 1,
+    "databases": {
+        "variantAnalysis": {
+            "repositoryLists": [
+                {
+                    "name": repo_list_name,
+                    "repositories": repos,
+                }
+            ],
+            "owners": [],
+            "repositories": []
+        }
+    },
+    "selected": {
+        "kind": "variantAnalysisUserDefinedList",
+        "listName": repo_list_name
+    }
+}
+
+gh = {
+    repo_list_name:  repos
+}
+
+import json
+with open(args.vscode_selection, "w") as fc:
+    json.dump(vsc, fc, indent=4)
+
+with open(args.gh_mrva_selection, "w") as fc:
+    json.dump(gh, fc, indent=4)
+ 
+# Local Variables:
+# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
+# End:
diff --git a/client/qldbtools/bin/mc-db-populate-minio b/client/qldbtools/bin/mc-db-populate-minio
index 175803a..3a8652c 100755
--- a/client/qldbtools/bin/mc-db-populate-minio
+++ b/client/qldbtools/bin/mc-db-populate-minio
@@ -72,9 +72,10 @@ except S3Error as err:
     logging.error(f"Error creating bucket: {err}")
 
 # Get info from dataframe and push the files
-for index, row in entries[['owner', 'name', 'path']].iterrows():
-    owner, name, path = row
-    new_name = f'{owner}${name}.zip'
+# The CID is part of the object name, so several DBs of one (owner,name) can coexist.
+for index, row in entries[['owner', 'name', 'CID', 'path']].iterrows():
+    owner, name, CID, path = row
+    new_name = utils.form_db_bucket_name(owner, name, CID)
     try:
         client.fput_object(QL_DB_BUCKET_NAME, new_name, path)
         logging.info(f"Uploaded {path} as {new_name} to bucket {QL_DB_BUCKET_NAME}")
diff --git a/client/qldbtools/bin/mc-db-refine-info b/client/qldbtools/bin/mc-db-refine-info
index 6f71027..39b000e 100755
--- a/client/qldbtools/bin/mc-db-refine-info
+++ b/client/qldbtools/bin/mc-db-refine-info
@@ -43,6 +43,14 @@ for left_index in range(0, len(d)-1):
 joiners_df = pd.concat(joiners, axis=0)
 full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer')    
 
+#** Add single uniqueness field -- CID (Cumulative ID)
+full_df['CID'] = full_df.apply(lambda row: 
+                               utils.cid_hash( (row['creationTime'],
+                                                row['sha'], 
+                                                row['cliVersion'], 
+                                                row['language'])
+                                              ), axis=1)
+
 #** Re-order the dataframe columns by importance
 # - Much of the data
 #   1. Is only conditionally present
@@ -70,11 +78,13 @@ full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='o
 #     | primaryLanguage     |
 #     | finalised           |
 
-final_df = full_df.reindex(columns=['owner', 'name', 'language', 'size', 'cliVersion',
-	                                'creationTime', 'sha', 'baselineLinesOfCode', 'path',
-	                                'db_lang', 'db_lang_displayName', 'db_lang_file_count',
-	                                'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
-	                                'finalised', 'left_index'])
+final_df = full_df.reindex( columns=['owner', 'name', 'cliVersion',
+                                     'creationTime', 'language', 'sha','CID',
+                                     'baselineLinesOfCode', 'path', 'db_lang',
+                                     'db_lang_displayName', 'db_lang_file_count',
+                                     'db_lang_linesOfCode', 'ctime',
+                                     'primaryLanguage', 'finalised', 'left_index',
+                                     'size'])
 
 final_df.to_csv(sys.stdout, index=False)
 
diff --git a/client/qldbtools/bin/mc-db-unique b/client/qldbtools/bin/mc-db-unique
index 4c44f6e..fd6fb91 100755
--- a/client/qldbtools/bin/mc-db-unique
+++ b/client/qldbtools/bin/mc-db-unique
@@ -1,7 +1,8 @@
 #!/usr/bin/env python
 """ Read a table of CodeQL DB information, 
-    group entries by (owner,name),  sort each group by
-    creationTime and keep only the top (newest) element.
+    group entries by (owner,name,CID),  
+    sort each group by creationTime,
+    and keep only the top (newest) element.
 """
 import argparse
 import logging
@@ -32,8 +33,8 @@ import sys
 
 df0 = pd.read_csv(sys.stdin)
 
-df_sorted = df0.sort_values(by=['owner', 'name', 'creationTime'])
-df_unique = df_sorted.groupby(['owner', 'name']).first().reset_index()
+df_sorted = df0.sort_values(by=['owner', 'name', 'CID', 'creationTime'])
+df_unique = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index()
 
 df_unique.to_csv(sys.stdout, index=False)
 
diff --git a/client/qldbtools/qldbtools/session-generate-selection.py b/client/qldbtools/qldbtools/session-generate-selection.py
new file mode 100644
index 0000000..9f1200e
--- /dev/null
+++ b/client/qldbtools/qldbtools/session-generate-selection.py
@@ -0,0 +1,61 @@
+""" Read a table of CodeQL DB information
+    and generate the selection files for
+    1. the VS Code CodeQL plugin
+    2. the gh-mrva command-line client
+"""
+#
+#* Collect the information and write files
+#
+import pandas as pd
+import sys
+import qldbtools.utils as utils
+import numpy as np
+import importlib
+importlib.reload(utils)
+
+df0 = pd.read_csv('db-info-3.csv')
+
+# Use num_entries, chosen via pseudo-random numbers
+df1 = df0.sample(n=3, random_state=np.random.RandomState(4242))
+
+repos = []
+for index, row in df1[['owner', 'name', 'CID', 'path']].iterrows():
+    owner, name, CID, path = row
+    repos.append(utils.form_db_req_name(owner, name, CID))
+
+repo_list_name = "mirva-list"
+vsc = {
+    "version": 1,
+    "databases": {
+        "variantAnalysis": {
+            "repositoryLists": [
+                {
+                    "name": repo_list_name,
+                    "repositories": repos,
+                }
+            ],
+            "owners": [],
+            "repositories": []
+        }
+    },
+    "selected": {
+        "kind": "variantAnalysisUserDefinedList",
+        "listName": repo_list_name
+    }
+}
+
+gh = {
+    repo_list_name:  repos
+}
+
+
+# write the files
+import json
+with open("tmp-selection-vsc.json", "w") as fc:
+    json.dump(vsc, fc, indent=4)
+with open("tmp-selection-gh.json", "w") as fc:
+    json.dump(gh, fc, indent=4)
+    
+# Local Variables:
+# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
+# End:
diff --git a/client/qldbtools/qldbtools/session-post-refine-info.py b/client/qldbtools/qldbtools/session-post-refine-info.py
new file mode 100644
index 0000000..18f01df
--- /dev/null
+++ b/client/qldbtools/qldbtools/session-post-refine-info.py
@@ -0,0 +1,45 @@
+import qldbtools.utils as utils
+import pandas as pd
+
+#
+#* Collect the information
+#
+df1 = pd.read_csv("db-info-2.csv")
+
+# Add single uniqueness field -- CID (Cumulative ID) -- using
+# - creationTime
+# - sha
+# - cliVersion
+# - language
+
+from hashlib import blake2b
+
+def cid_hash(row_tuple: tuple):
+    """
+        cid_hash(row_tuple)
+    Hash str(row_tuple) with 3-byte BLAKE2b and return the 6-character hex digest
+    """
+    h = blake2b(digest_size = 3)
+    h.update(str(row_tuple).encode())
+    # return int.from_bytes(h.digest(), byteorder='big')
+    return h.hexdigest()
+
+# Apply the cid_hash function to the specified columns and create the 'CID' column
+df1['CID'] = df1.apply(lambda row: cid_hash( (row['creationTime'],
+                                              row['sha'], 
+                                              row['cliVersion'], 
+                                              row['language'])
+                                            ), axis=1)
+
+df2 = df1.reindex(columns=['owner', 'name', 'cliVersion', 'creationTime',
+	                       'language', 'sha','CID', 'baselineLinesOfCode', 'path',
+	                       'db_lang', 'db_lang_displayName', 'db_lang_file_count',
+	                       'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
+	                       'finalised', 'left_index', 'size'])
+
+df1['CID']
+
+
+# Local Variables:
+# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
+# End:
diff --git a/client/qldbtools/qldbtools/utils.py b/client/qldbtools/qldbtools/utils.py
index 203e923..f034d04 100644
--- a/client/qldbtools/qldbtools/utils.py
+++ b/client/qldbtools/qldbtools/utils.py
@@ -175,6 +175,38 @@ def metadata_details(left_index, codeql_content, meta_content):
 
 class DetailsMissing(Exception): pass                        
 
+from hashlib import blake2b
+
+def cid_hash(row_tuple: tuple):
+    """
+        cid_hash(row_tuple)
+    Hash str(row_tuple) with 3-byte BLAKE2b and return the 6-character hex digest
+    """
+    h = blake2b(digest_size = 3)
+    h.update(str(row_tuple).encode())
+    # return int.from_bytes(h.digest(), byteorder='big')
+    return h.hexdigest()
+
+def form_db_bucket_name(owner, name, CID):
+    """
+        form_db_bucket_name(owner, name, CID)
+    Return the name to use in minio storage, '<owner>$<name>ctsj<CID>.zip'; this
+    function is trivial and used to enforce consistent naming.
+
+    'ctsj' (placed between name and CID) is a random, unique key to identify the information.
+    """
+    return f'{owner}${name}ctsj{CID}.zip'
+
+def form_db_req_name(owner, name, CID):
+    """
+        form_db_req_name(owner, name, CID)
+    Return the name to use in mrva requests, '<owner>/<name>ctsj<CID>'; this
+    function is trivial and used to enforce consistent naming.
+
+    'ctsj' (placed between name and CID) is a random, unique key to identify the information.
+    """
+    return f'{owner}/{name}ctsj{CID}'
+
 
 # Local Variables:
 # python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"