From 1e1daf9330082bb09a6a823460ce367a22a7e396 Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Tue, 30 Jul 2024 10:47:29 -0700 Subject: [PATCH] Include custom id (CID) to distinguish CodeQL databases The current api (<2024-07-26 Fri>) is set up only for (owner,name). This is insufficient for distinguishing CodeQL databases. Other differences must be considered; this patch combines the fields | cliVersion | | creationTime | | language | | sha | into one called CID. The CID field is a hash of these others and therefore can be changed in the future without affecting workflows or the server. The cid is combined with the owner/name to form one identifier. This requires no changes to server or client -- the db selection's interface is separate from VS Code and gh-mrva in any case. To test this, this version imports multiple versions of the same owner/repo pairs from multiple directories. In this case, from ~/work-gh/mrva/mrva-open-source-download/repos and ~/work-gh/mrva/mrva-open-source-download/repos-2024-04-29/ The unique database count increases from 3000 to 5360 -- see README.md, ./bin/mc-db-view-info < db-info-3.csv & Other code modifications: - Push (owner,repo,cid) names to minio - Generate databases.json for use in vs code extension - Generate list-databases.json for use by gh-mrva client --- client/qldbtools/README.md | 97 ++++++++++------- client/qldbtools/bin/mc-db-generate-selection | 103 ++++++++++++++++++ client/qldbtools/bin/mc-db-populate-minio | 7 +- client/qldbtools/bin/mc-db-refine-info | 20 +++- client/qldbtools/bin/mc-db-unique | 9 +- .../qldbtools/session-generate-selection.py | 61 +++++++++++ .../qldbtools/session-post-refine-info.py | 45 ++++++++ client/qldbtools/qldbtools/utils.py | 32 ++++++ 8 files changed, 322 insertions(+), 52 deletions(-) create mode 100755 client/qldbtools/bin/mc-db-generate-selection create mode 100644 client/qldbtools/qldbtools/session-generate-selection.py create mode 100644 
client/qldbtools/qldbtools/session-post-refine-info.py diff --git a/client/qldbtools/README.md b/client/qldbtools/README.md index 4400754..05279d4 100644 --- a/client/qldbtools/README.md +++ b/client/qldbtools/README.md @@ -6,48 +6,48 @@ qldbtools is a Python package for working with CodeQL databases - Set up the virtual environment and install tools - cd ~/work-gh/mrva/mrvacommander/client/qldbtools/ - python3.11 -m venv venv - source venv/bin/activate - pip install --upgrade pip + cd ~/work-gh/mrva/mrvacommander/client/qldbtools/ + python3.11 -m venv venv + source venv/bin/activate + pip install --upgrade pip - # From requirements.txt - pip install -r requirements.txt - # Or explicitly - pip install jupyterlab pandas ipython - pip install lckr-jupyterlab-variableinspector + # From requirements.txt + pip install -r requirements.txt + # Or explicitly + pip install jupyterlab pandas ipython + pip install lckr-jupyterlab-variableinspector - Run jupyterlab - cd ~/work-gh/mrva/mrvacommander/client - source venv/bin/activate - jupyter lab & - - The variable inspector is a right-click on an open console or notebook. - - The `jupyter` command produces output including - - Jupyter Server 2.14.1 is running at: - http://127.0.0.1:8888/lab?token=4c91308819786fe00a33b76e60f3321840283486457516a1 + cd ~/work-gh/mrva/mrvacommander/client + source venv/bin/activate + jupyter lab & + + The variable inspector is a right-click on an open console or notebook. + + The `jupyter` command produces output including + + Jupyter Server 2.14.1 is running at: + http://127.0.0.1:8888/lab?token=4c91308819786fe00a33b76e60f3321840283486457516a1 - Use this to connect multiple front ends + Use this to connect multiple front ends - Local development - ```bash - cd ~/work-gh/mrva/mrvacommander/client/qldbtools - source venv/bin/activate - pip install --editable . - ``` + ```bash + cd ~/work-gh/mrva/mrvacommander/client/qldbtools + source venv/bin/activate + pip install --editable . 
+ ``` - The `--editable` *should* use symlinks for all scripts; use `./bin/*` to be sure. + The `--editable` *should* use symlinks for all scripts; use `./bin/*` to be sure. - Full installation - ```bash - pip install qldbtools - ``` + ```bash + pip install qldbtools + ``` ## Use as library @@ -58,15 +58,32 @@ import qldbtools as ql ## Command-line use - cd ~/work-gh/mrva/mrvacommander/client/qldbtools - ./bin/mc-db-initial-info ~/work-gh/mrva/mrva-open-source-download > db-info-1.csv - - ./bin/mc-db-refine-info < db-info-1.csv > db-info-2.csv - - ./bin/mc-db-populate-minio < db-info-2.csv -n 3 + Initial information collection requires a unique file path so it can be run + repeatedly over DB collections with the same (owner,name) but other differences + -- namely, in one or more of - ./bin/mc-db-view-info < db-info-2.csv - - ./bin/mc-db-unique < db-info-2.csv > db-info-3.csv - - + - creationTime + - sha + - cliVersion + - language + + Those fields are collected and a single name addendum is formed in + `bin/mc-db-refine-info`. + + XX: Add `mc-db-generate-selection` + + The command sequence, grouped by data files, is + + cd ~/work-gh/mrva/mrvacommander/client/qldbtools + ./bin/mc-db-initial-info ~/work-gh/mrva/mrva-open-source-download > db-info-1.csv + ./bin/mc-db-refine-info < db-info-1.csv > db-info-2.csv + + ./bin/mc-db-view-info < db-info-2.csv & + ./bin/mc-db-unique < db-info-2.csv > db-info-3.csv + ./bin/mc-db-view-info < db-info-3.csv & + + ./bin/mc-db-populate-minio -n 23 < db-info-3.csv + ./bin/mc-db-generate-selection -n 23 vscode-selection.json gh-mrva-selection.json < db-info-3.csv + + + diff --git a/client/qldbtools/bin/mc-db-generate-selection b/client/qldbtools/bin/mc-db-generate-selection new file mode 100755 index 0000000..009f7f2 --- /dev/null +++ b/client/qldbtools/bin/mc-db-generate-selection @@ -0,0 +1,103 @@ +#!/usr/bin/env python +""" Read a table of CodeQL DB information + and generate the selection files for + 1. the VS Code CodeQL plugin + 2. 
the gh-mrva command-line client +""" +import argparse +import logging +import qldbtools.utils as utils +import numpy as np + +# +#* Configure logger +# +logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') +# Overwrite log level set by minio +root_logger = logging.getLogger() +root_logger.setLevel(logging.INFO) + +# +#* Process command line +# +parser = argparse.ArgumentParser( + description=""" Read a table of CodeQL DB information + and generate the selection files for + 1. the VS Code CodeQL plugin + 2. the gh-mrva command-line client + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('vscode_selection', type=str, + help='VS Code selection file to generate') +parser.add_argument('gh_mrva_selection', type=str, + help='gh-mrva cli selection file to generate') +parser.add_argument('-n', '--num-entries', type=int, + help='Only use N entries', + default=None) +parser.add_argument('-s', '--seed', type=int, + help='Random number seed', + default=4242) +parser.add_argument('-l', '--list-name', type=str, + help='Name of the repository list', + default='mirva-list') + +args = parser.parse_args() +# +#* Load the information +# +import pandas as pd +import sys + +df0 = pd.read_csv(sys.stdin) + +if args.num_entries is None: + # Use all entries (no -n given) + df1 = df0 +else: + # Use num_entries, chosen via pseudo-random numbers seeded by --seed + df1 = df0.sample(n=args.num_entries, + random_state=np.random.RandomState(args.seed)) + +# +#* Form and save structures +# +repos = [] +for index, row in df1[['owner', 'name', 'CID', 'path']].iterrows(): + owner, name, CID, path = row + repos.append(utils.form_db_req_name(owner, name, CID)) + +repo_list_name = args.list_name +vsc = { + "version": 1, + "databases": { + "variantAnalysis": { + "repositoryLists": [ + { + "name": repo_list_name, + "repositories": repos, + } + ], + "owners": [], + "repositories": [] + } + }, + "selected": { + "kind": "variantAnalysisUserDefinedList", + "listName": repo_list_name 
+ } +} + +gh = { + repo_list_name: repos +} + +import json +with open(args.vscode_selection, "w") as fc: + json.dump(vsc, fc, indent=4) + +with open(args.gh_mrva_selection, "w") as fc: + json.dump(gh, fc, indent=4) + +# Local Variables: +# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/" +# End: diff --git a/client/qldbtools/bin/mc-db-populate-minio b/client/qldbtools/bin/mc-db-populate-minio index 175803a..3a8652c 100755 --- a/client/qldbtools/bin/mc-db-populate-minio +++ b/client/qldbtools/bin/mc-db-populate-minio @@ -72,9 +72,10 @@ except S3Error as err: logging.error(f"Error creating bucket: {err}") # Get info from dataframe and push the files -for index, row in entries[['owner', 'name', 'path']].iterrows(): - owner, name, path = row - new_name = f'{owner}${name}.zip' +# XX: include CID. +for index, row in entries[['owner', 'name', 'CID', 'path']].iterrows(): + owner, name, CID, path = row + new_name = utils.form_db_bucket_name(owner, name, CID) try: client.fput_object(QL_DB_BUCKET_NAME, new_name, path) logging.info(f"Uploaded {path} as {new_name} to bucket {QL_DB_BUCKET_NAME}") diff --git a/client/qldbtools/bin/mc-db-refine-info b/client/qldbtools/bin/mc-db-refine-info index 6f71027..39b000e 100755 --- a/client/qldbtools/bin/mc-db-refine-info +++ b/client/qldbtools/bin/mc-db-refine-info @@ -43,6 +43,14 @@ for left_index in range(0, len(d)-1): joiners_df = pd.concat(joiners, axis=0) full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer') +#** Add single uniqueness field -- CID (Cumulative ID) +full_df['CID'] = full_df.apply(lambda row: + utils.cid_hash( (row['creationTime'], + row['sha'], + row['cliVersion'], + row['language']) + ), axis=1) + #** Re-order the dataframe columns by importance # - Much of the data # 1. 
Is only conditionally present @@ -70,11 +78,13 @@ full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='o # | primaryLanguage | # | finalised | -final_df = full_df.reindex(columns=['owner', 'name', 'language', 'size', 'cliVersion', - 'creationTime', 'sha', 'baselineLinesOfCode', 'path', - 'db_lang', 'db_lang_displayName', 'db_lang_file_count', - 'db_lang_linesOfCode', 'ctime', 'primaryLanguage', - 'finalised', 'left_index']) +final_df = full_df.reindex( columns=['owner', 'name', 'cliVersion', + 'creationTime', 'language', 'sha','CID', + 'baselineLinesOfCode', 'path', 'db_lang', + 'db_lang_displayName', 'db_lang_file_count', + 'db_lang_linesOfCode', 'ctime', + 'primaryLanguage', 'finalised', 'left_index', + 'size']) final_df.to_csv(sys.stdout, index=False) diff --git a/client/qldbtools/bin/mc-db-unique b/client/qldbtools/bin/mc-db-unique index 4c44f6e..fd6fb91 100755 --- a/client/qldbtools/bin/mc-db-unique +++ b/client/qldbtools/bin/mc-db-unique @@ -1,7 +1,8 @@ #!/usr/bin/env python """ Read a table of CodeQL DB information, - group entries by (owner,name), sort each group by - creationTime and keep only the top (newest) element. + group entries by (owner,name,CID), + sort each group by creationTime, + and keep only the top (newest) element. 
""" import argparse import logging @@ -32,8 +33,8 @@ import sys df0 = pd.read_csv(sys.stdin) -df_sorted = df0.sort_values(by=['owner', 'name', 'creationTime']) -df_unique = df_sorted.groupby(['owner', 'name']).first().reset_index() +df_sorted = df0.sort_values(by=['owner', 'name', 'CID', 'creationTime']) +df_unique = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index() df_unique.to_csv(sys.stdout, index=False) diff --git a/client/qldbtools/qldbtools/session-generate-selection.py b/client/qldbtools/qldbtools/session-generate-selection.py new file mode 100644 index 0000000..9f1200e --- /dev/null +++ b/client/qldbtools/qldbtools/session-generate-selection.py @@ -0,0 +1,61 @@ +""" Read a table of CodeQL DB information + and generate the selection files for + 1. the VS Code CodeQL plugin + 2. the gh-mrva command-line client +""" +# +#* Collect the information and write files +# +import pandas as pd +import sys +import qldbtools.utils as utils +import numpy as np +import importlib +importlib.reload(utils) + +df0 = pd.read_csv('db-info-3.csv') + +# Use num_entries, chosen via pseudo-random numbers +df1 = df0.sample(n=3, random_state=np.random.RandomState(4242)) + +repos = [] +for index, row in df1[['owner', 'name', 'CID', 'path']].iterrows(): + owner, name, CID, path = row + repos.append(utils.form_db_req_name(owner, name, CID)) + +repo_list_name = "mirva-list" +vsc = { + "version": 1, + "databases": { + "variantAnalysis": { + "repositoryLists": [ + { + "name": repo_list_name, + "repositories": repos, + } + ], + "owners": [], + "repositories": [] + } + }, + "selected": { + "kind": "variantAnalysisUserDefinedList", + "listName": repo_list_name + } +} + +gh = { + repo_list_name: repos +} + + +# write the files +import json +with open("tmp-selection-vsc.json", "w") as fc: + json.dump(vsc, fc, indent=4) +with open("tmp-selection-gh.json", "w") as fc: + json.dump(gh, fc, indent=4) + +# Local Variables: +# python-shell-virtualenv-root: 
"~/work-gh/mrva/mrvacommander/client/qldbtools/venv/" +# End: diff --git a/client/qldbtools/qldbtools/session-post-refine-info.py b/client/qldbtools/qldbtools/session-post-refine-info.py new file mode 100644 index 0000000..18f01df --- /dev/null +++ b/client/qldbtools/qldbtools/session-post-refine-info.py @@ -0,0 +1,45 @@ +import qldbtools.utils as utils +import pandas as pd + +# +#* Collect the information +# +df1 = pd.read_csv("db-info-2.csv") + +# Add single uniqueness field -- CID (Cumulative ID) -- using +# - creationTime +# - sha +# - cliVersion +# - language + +from hashlib import blake2b + +def cid_hash(row_tuple: tuple): + """ + cid_hash(row_tuple) + Take a tuple of row fields and return the hash of its str() form as a hex string + """ + h = blake2b(digest_size = 3) + h.update(str(row_tuple).encode()) + # return int.from_bytes(h.digest(), byteorder='big') + return h.hexdigest() + +# Apply the cid_hash function to the specified columns and create the 'CID' column +df1['CID'] = df1.apply(lambda row: cid_hash( (row['creationTime'], + row['sha'], + row['cliVersion'], + row['language']) + ), axis=1) + +df2 = df1.reindex(columns=['owner', 'name', 'cliVersion', 'creationTime', + 'language', 'sha','CID', 'baselineLinesOfCode', 'path', + 'db_lang', 'db_lang_displayName', 'db_lang_file_count', + 'db_lang_linesOfCode', 'ctime', 'primaryLanguage', + 'finalised', 'left_index', 'size']) + +df1['CID'] + + +# Local Variables: +# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/" +# End: diff --git a/client/qldbtools/qldbtools/utils.py b/client/qldbtools/qldbtools/utils.py index 203e923..f034d04 100644 --- a/client/qldbtools/qldbtools/utils.py +++ b/client/qldbtools/qldbtools/utils.py @@ -175,6 +175,38 @@ def metadata_details(left_index, codeql_content, meta_content): class DetailsMissing(Exception): pass +from hashlib import blake2b + +def cid_hash(row_tuple: tuple): + """ + cid_hash(row_tuple) + Take a tuple of row fields and return the hash of its str() form as a hex string + """ + h = 
blake2b(digest_size = 3) + h.update(str(row_tuple).encode()) + # return int.from_bytes(h.digest(), byteorder='big') + return h.hexdigest() + +def form_db_bucket_name(owner, name, CID): + """ + form_db_bucket_name(owner, name, CID) + Return the name to use in minio storage; this function is trivial and used to + enforce consistent naming. + + The 'ctsj' marker between the name and the CID is a random, unique key to identify the information. + """ + return f'{owner}${name}ctsj{CID}.zip' + +def form_db_req_name(owner, name, CID): + """ + form_db_req_name(owner, name, CID) + Return the name to use in mrva requests; this function is trivial and used to + enforce consistent naming. + + The 'ctsj' marker between the name and the CID is a random, unique key to identify the information. + """ + return f'{owner}/{name}ctsj{CID}' + # Local Variables: # python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"