Move session scripts to separate directory

This commit is contained in:
Michael Hohn
2024-08-02 13:56:47 -07:00
committed by Michael Hohn
parent 582d933130
commit 349d758c14
7 changed files with 48 additions and 1 deletion

View File

@@ -0,0 +1,61 @@
""" Read a table of CodeQL DB information
and generate the selection files for
1. the VS Code CodeQL plugin
2. the gh-mrva command-line client
"""
#
#* Collect the information and write files
#
import pandas as pd
import sys
import qldbtools.utils as utils
import numpy as np
import importlib
importlib.reload(utils)
df0 = pd.read_csv('scratch/db-info-3.csv')
# Use a small sample of entries, chosen via a seeded pseudo-random generator
df1 = df0.sample(n=3, random_state=np.random.RandomState(4242))
repos = []
for index, row in df1[['owner', 'name', 'CID', 'path']].iterrows():
    owner, name, CID, path = row
    repos.append(utils.form_db_req_name(owner, name, CID))
repo_list_name = "mirva-list"
vsc = {
    "version": 1,
    "databases": {
        "variantAnalysis": {
            "repositoryLists": [
                {
                    "name": repo_list_name,
                    "repositories": repos,
                }
            ],
            "owners": [],
            "repositories": []
        }
    },
    "selected": {
        "kind": "variantAnalysisUserDefinedList",
        "listName": repo_list_name
    }
}
gh = {
    repo_list_name: repos
}
# write the files
import json
with open("tmp-selection-vsc.json", "w") as fc:
json.dump(vsc, fc, indent=4)
with open("tmp-selection-gh.json", "w") as fc:
json.dump(gh, fc, indent=4)
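# Optional sanity check (a sketch, not part of the original session): reload the
# files just written and confirm the repository list round-trips.
with open("tmp-selection-vsc.json") as fc:
    assert json.load(fc)["databases"]["variantAnalysis"]["repositoryLists"][0]["repositories"] == repos
with open("tmp-selection-gh.json") as fc:
    assert json.load(fc)[repo_list_name] == repos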
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -0,0 +1,59 @@
#* Experimental work with utils.py, to be merged into it.
# The rest of this interactive script is available as a CLI script in
# mc-db-initial-info
from utils import *
#* Data collection
# Get the db information in list of DBInfo form
db_base = "~/work-gh/mrva/mrva-open-source-download/"
dbs = list(collect_dbs(db_base))
# Inspect:
from pprint import pprint
pprint(["len", len(dbs)])
pprint(["dbs[0]", dbs[0].__dict__])
pprint(["dbs[-1]", dbs[-1].__dict__])
#
# Get a dataframe
dbdf = pd.DataFrame([d.__dict__ for d in dbs])
#
#* Experiments with on-disk format
# The raw information is used again in a separate session.
#
# PosixPath is a problem for JSON and Parquet, so convert the path column to str
#
dbdf['path'] = dbdf['path'].astype(str)
#
dbdf.to_csv('dbdf.csv')
#
dbdf.to_csv('dbdf.csv.gz', compression='gzip', index=False)
#
dbdf.to_json('dbdf.json')
#
# dbdf.to_hdf('dbdf.h5', key='dbdf', mode='w')
#
# fast, binary
dbdf.to_parquet('dbdf.parquet')
#
# fast
import sqlite3
conn = sqlite3.connect('dbdf.db')
dbdf.to_sql('qldbs', conn, if_exists='replace', index=False)
conn.close()
#
# Sizes:
# ls -laSr dbdf.*
# -rw-r--r--@ 1 hohn staff 101390 Jul 12 14:17 dbdf.csv.gz
# -rw-r--r--@ 1 hohn staff 202712 Jul 12 14:17 dbdf.parquet
# -rw-r--r--@ 1 hohn staff 560623 Jul 12 14:17 dbdf.csv
# -rw-r--r--@ 1 hohn staff 610304 Jul 12 14:17 dbdf.db
# -rw-r--r--@ 1 hohn staff 735097 Jul 12 14:17 dbdf.json
#
# parquet has many libraries, including go: xitongsys/parquet-go
# https://parquet.apache.org/
#
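# Read-back check (a sketch; assumes dbdf.parquet was written above and a
# parquet engine such as pyarrow is installed):
dbdf_rt = pd.read_parquet('dbdf.parquet')
assert len(dbdf_rt) == len(dbdf)
#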
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -0,0 +1,65 @@
import qldbtools.utils as utils
import pandas as pd
import numpy as np
import sys
from minio import Minio
from minio.error import S3Error
from pathlib import Path
#
#* Collect the information and select subset
#
df = pd.read_csv('scratch/db-info-2.csv')
seed = 4242
if 0:
    # Use all entries
    entries = df
else:
    # Use num_entries, chosen via pseudo-random numbers
    entries = df.sample(n=3,
                        random_state=np.random.RandomState(seed))
#
#* Push the DBs
#
# Configuration
MINIO_URL = "http://localhost:9000"
MINIO_ROOT_USER = "user"
MINIO_ROOT_PASSWORD = "mmusty8432"
QL_DB_BUCKET_NAME = "qldb"
# Initialize MinIO client
client = Minio(
    MINIO_URL.replace("http://", "").replace("https://", ""),
    access_key=MINIO_ROOT_USER,
    secret_key=MINIO_ROOT_PASSWORD,
    secure=False
)
# Create the bucket if it doesn't exist
try:
    if not client.bucket_exists(QL_DB_BUCKET_NAME):
        client.make_bucket(QL_DB_BUCKET_NAME)
    else:
        print(f"Bucket '{QL_DB_BUCKET_NAME}' already exists.")
except S3Error as err:
    print(f"Error creating bucket: {err}")
# (test) File paths and new names
files_to_upload = {
"cmd/server/codeql/dbs/google/flatbuffers/google_flatbuffers_db.zip": "google$flatbuffers.zip",
"cmd/server/codeql/dbs/psycopg/psycopg2/psycopg_psycopg2_db.zip": "psycopg$psycopg2.zip"
}
# (test) Push the files
prefix = Path('/Users/hohn/work-gh/mrva/mrvacommander')
for local_path, new_name in files_to_upload.items():
    try:
        client.fput_object(QL_DB_BUCKET_NAME, new_name, prefix / Path(local_path))
        print(f"Uploaded {local_path} as {new_name} to bucket {QL_DB_BUCKET_NAME}")
    except S3Error as err:
        print(f"Error uploading file {local_path}: {err}")
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -0,0 +1,46 @@
# Session around bin/mc-db-unique
import qldbtools.utils as utils
import pandas as pd
#
#* Collect the information
#
df1 = pd.read_csv("scratch/db-info-2.csv")
# Add single uniqueness field -- CID (Cumulative ID) -- using
# - creationTime
# - sha
# - cliVersion
# - language
from hashlib import blake2b
def cid_hash(row_tuple: tuple):
    """
    cid_hash(row_tuple)
    Hash the stringified row tuple and return the digest as a hex string.
    """
    h = blake2b(digest_size=3)
    h.update(str(row_tuple).encode())
    # return int.from_bytes(h.digest(), byteorder='big')
    return h.hexdigest()
# Apply the cid_hash function to the specified columns and create the 'CID' column
df1['CID'] = df1.apply(lambda row: cid_hash((row['creationTime'],
                                             row['sha'],
                                             row['cliVersion'],
                                             row['language'])),
                       axis=1)
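# Optional collision check (a sketch, not in the original session): digest_size=3
# gives only a 24-bit CID space, so confirm that rows sharing a CID also share
# the underlying key tuple.
key_cols = ['creationTime', 'sha', 'cliVersion', 'language']
cid_collisions = df1.groupby('CID')[key_cols].nunique().gt(1).any(axis=1)
print("CID collisions:", int(cid_collisions.sum()))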
df2 = df1.reindex(columns=['owner', 'name', 'cliVersion', 'creationTime',
                           'language', 'sha', 'CID', 'baselineLinesOfCode', 'path',
                           'db_lang', 'db_lang_displayName', 'db_lang_file_count',
                           'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
                           'finalised', 'left_index', 'size'])
df1['CID']   # inspect the new column
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -0,0 +1,118 @@
# Experimental work to be merged into bin/mc-db-refine-info
from utils import *
from pprint import pprint
#* Reload gzipped CSV file to continue work
dbdf_1 = pd.read_csv('dbdf.csv.gz', compression='gzip')
#
# (old) Consistency check:
# dbdf_1.columns == dbdf.columns
# dbmask = (dbdf_1 != dbdf)
# dbdf_1[dbmask]
# dbdf_1[dbmask].dropna(how='all')
# ctime_raw is different in places, so don't use it.
#
#* Interact with/visualize the dataframe
# Using pandasgui -- qt
from pandasgui import show
import os
os.environ['APPDATA'] = "needed-for-pandasgui"
show(dbdf_1)
# Using dtale -- web
import dtale
dtale.show(dbdf_1)
#
#
#* Collect metadata from DB zip files
#
#** A manual sample
#
d = dbdf_1
left_index = 0
d.path[0]
cqlc, metac = extract_metadata(d.path[0])
cqlc['baselineLinesOfCode']
cqlc['primaryLanguage']
cqlc['creationMetadata']['sha']
cqlc['creationMetadata']['cliVersion']
cqlc['creationMetadata']['creationTime'].isoformat()
cqlc['finalised']
for lang, lang_cont in metac['languages'].items():
    print(lang)
    indent = " "
    for prop, val in lang_cont.items():
        if prop == 'files':
            print("%sfiles count %d" % (indent, len(val)))
        elif prop == 'linesOfCode':
            print("%slinesOfCode %d" % (indent, val))
        elif prop == 'displayName':
            print("%sdisplayName %s" % (indent, val))
#** Automated for all entries
# The rest of this interactive script is available as a CLI script in
# mc-db-refine-info
d = dbdf_1
joiners = []
for left_index in range(len(d)):
    try:
        cqlc, metac = extract_metadata(d.path[left_index])
    except ExtractNotZipfile:
        continue
    except ExtractNoCQLDB:
        continue
    try:
        detail_df = metadata_details(left_index, cqlc, metac)
    except DetailsMissing:
        continue
    joiners.append(detail_df)
joiners_df = pd.concat(joiners, axis=0)
full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer')
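# Quick coverage check (a sketch): how many databases yielded usable metadata.
print(joiners_df['left_index'].nunique(), "of", len(d), "databases have metadata details")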
#** View the full dataframe with metadata
from pandasgui import show
os.environ['APPDATA'] = "needed-for-pandasgui"
show(full_df)
#** Re-order the dataframe columns by importance
# - Much of the data
# 1. Is only conditionally present
# 2. Is extra info, not for the DB proper
# 3. May have various names
# - The essential columns are
# | owner |
# | name |
# | language |
# | size |
# | cliVersion |
# | creationTime |
# | sha |
# | baselineLinesOfCode |
# | path |
# - The rest are useful; put them last
# | db_lang |
# | db_lang_displayName |
# | db_lang_file_count |
# | db_lang_linesOfCode |
# | left_index |
# | ctime |
# | primaryLanguage |
# | finalised |
final_df = full_df.reindex(columns=['owner', 'name', 'language', 'size', 'cliVersion',
                                    'creationTime', 'sha', 'baselineLinesOfCode', 'path',
                                    'db_lang', 'db_lang_displayName', 'db_lang_file_count',
                                    'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
                                    'finalised', 'left_index'])
final_df.to_csv('all-info-table.csv.gz', compression='gzip', index=False)
#
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
#

View File

@@ -0,0 +1,41 @@
# Experimental work for ../bin/mc-db-unique, to be merged into it.
import qldbtools.utils as utils
from pprint import pprint
import pandas as pd
# cd ../
#* Reload CSV file to continue work
df2 = df_refined = pd.read_csv('scratch/db-info-2.csv')
# Identify rows missing specific entries
rows = (df2['cliVersion'].isna() |
        df2['creationTime'].isna() |
        df2['language'].isna() |
        df2['sha'].isna())
df2[rows]
df3 = df2[~rows]
df3
#* post-save work
df4 = pd.read_csv('scratch/db-info-3.csv')
# Sort and group
df_sorted = df4.sort_values(by=['owner', 'name', 'CID', 'creationTime'])
df_unique = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index()
# Find duplicates
df_dups = df_unique[df_unique['CID'].duplicated(keep=False)]
len(df_dups)
df_dups['CID']
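# Inspect the duplicate groups in context (a sketch; only informative when
# len(df_dups) > 0):
if len(df_dups) > 0:
    print(df_dups.sort_values('CID')[['owner', 'name', 'CID', 'creationTime']])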
# Set display options
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 140)
#
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
#

View File

@@ -0,0 +1,46 @@
# Session around bin/mc-db-unique
import qldbtools.utils as utils
import pandas as pd
#
#* Collect the information
#
df1 = pd.read_csv("scratch/db-info-2.csv")
# Add single uniqueness field -- CID (Cumulative ID) -- using
# - creationTime
# - sha
# - cliVersion
# - language
from hashlib import blake2b
def cid_hash(row_tuple: tuple):
    """
    cid_hash(row_tuple)
    Hash the stringified row tuple and return the digest as a hex string.
    """
    h = blake2b(digest_size=3)
    h.update(str(row_tuple).encode())
    # return int.from_bytes(h.digest(), byteorder='big')
    return h.hexdigest()
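# Example call (illustrative values, not taken from the dataset): with
# digest_size=3, hexdigest() returns a 6-character string.
example_cid = cid_hash(("2024-07-01T00:00:00", "0123abcd", "2.17.0", "cpp"))
print("example CID:", example_cid, len(example_cid))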
# Apply the cid_hash function to the specified columns and create the 'CID' column
df1['CID'] = df1.apply(lambda row: cid_hash((row['creationTime'],
                                             row['sha'],
                                             row['cliVersion'],
                                             row['language'])),
                       axis=1)
df2 = df1.reindex(columns=['owner', 'name', 'cliVersion', 'creationTime',
                           'language', 'sha', 'CID', 'baselineLinesOfCode', 'path',
                           'db_lang', 'db_lang_displayName', 'db_lang_file_count',
                           'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
                           'finalised', 'left_index', 'size'])
df1['CID']   # inspect the new column
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End: