Move session scripts to separate directory
Committed by Michael Hohn
parent 582d933130
commit 349d758c14
@@ -1,41 +0,0 @@
# Experimental work for ../bin/mc-db-unique, to be merged into it.
import qldbtools.utils as utils
from pprint import pprint
import pandas as pd
# cd ../

#* Reload CSV file to continue work
df2 = df_refined = pd.read_csv('scratch/db-info-2.csv')

# Identify rows missing specific entries
rows = ( df2['cliVersion'].isna() |
         df2['creationTime'].isna() |
         df2['language'].isna() |
         df2['sha'].isna() )
df2[rows]
df3 = df2[~rows]
df3

#* post-save work
df4 = pd.read_csv('scratch/db-info-3.csv')

# Sort and group
df_sorted = df4.sort_values(by=['owner', 'name', 'CID', 'creationTime'])
df_unique = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index()

# Find duplicates
df_dups = df_unique[df_unique['CID'].duplicated(keep=False)]
len(df_dups)
df_dups['CID']
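# A minimal, self-contained illustration of the sort-and-group step above,
# using a synthetic frame instead of scratch/db-info-3.csv (all values are
# made up): first() keeps the earliest creationTime per (owner, name, CID).
toy = pd.DataFrame({
    'owner':        ['google', 'google', 'psycopg'],
    'name':         ['flatbuffers', 'flatbuffers', 'psycopg2'],
    'CID':          ['ab12cd', 'ab12cd', 'ef34ab'],
    'creationTime': ['2024-01-02', '2024-03-01', '2024-02-15'],
})
toy_sorted = toy.sort_values(by=['owner', 'name', 'CID', 'creationTime'])
toy_unique = toy_sorted.groupby(['owner', 'name', 'CID']).first().reset_index()
assert len(toy_unique) == 2
toy_unique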

# Set display options
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 140)


#
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
#
@@ -1,61 +0,0 @@
""" Read a table of CodeQL DB information
and generate the selection files for
1. the VS Code CodeQL plugin
2. the gh-mrva command-line client
"""
#
#* Collect the information and write files
#
import pandas as pd
import sys
import qldbtools.utils as utils
import numpy as np
import importlib
importlib.reload(utils)

df0 = pd.read_csv('scratch/db-info-3.csv')

# Use a small sample of entries, chosen via pseudo-random numbers
df1 = df0.sample(n=3, random_state=np.random.RandomState(4242))

repos = []
for index, row in df1[['owner', 'name', 'CID', 'path']].iterrows():
    owner, name, CID, path = row
    repos.append(utils.form_db_req_name(owner, name, CID))

repo_list_name = "mirva-list"
vsc = {
    "version": 1,
    "databases": {
        "variantAnalysis": {
            "repositoryLists": [
                {
                    "name": repo_list_name,
                    "repositories": repos,
                }
            ],
            "owners": [],
            "repositories": []
        }
    },
    "selected": {
        "kind": "variantAnalysisUserDefinedList",
        "listName": repo_list_name
    }
}

gh = {
    repo_list_name: repos
}


# Write the files
import json
with open("tmp-selection-vsc.json", "w") as fc:
    json.dump(vsc, fc, indent=4)
with open("tmp-selection-gh.json", "w") as fc:
    json.dump(gh, fc, indent=4)
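# A quick sanity check on the file just written: reload tmp-selection-vsc.json
# and confirm the selected list name matches one of the repository lists.
with open("tmp-selection-vsc.json") as fc:
    sel = json.load(fc)
list_names = [rl["name"] for rl in sel["databases"]["variantAnalysis"]["repositoryLists"]]
assert sel["selected"]["listName"] in list_names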

# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
@@ -1,65 +0,0 @@
import qldbtools.utils as utils
import pandas as pd
import numpy as np
import sys
from minio import Minio
from minio.error import S3Error
from pathlib import Path

#
#* Collect the information and select subset
#
df = pd.read_csv('scratch/db-info-2.csv')
seed = 4242
if 0:
    # Use all entries
    entries = df
else:
    # Use a small sample of entries, chosen via pseudo-random numbers
    entries = df.sample(n=3,
                        random_state=np.random.RandomState(seed))
#
#* Push the DBs
#
# Configuration
MINIO_URL = "http://localhost:9000"
MINIO_ROOT_USER = "user"
MINIO_ROOT_PASSWORD = "mmusty8432"
QL_DB_BUCKET_NAME = "qldb"

# Initialize MinIO client
client = Minio(
    MINIO_URL.replace("http://", "").replace("https://", ""),
    access_key=MINIO_ROOT_USER,
    secret_key=MINIO_ROOT_PASSWORD,
    secure=False
)

# Create the bucket if it doesn't exist
try:
    if not client.bucket_exists(QL_DB_BUCKET_NAME):
        client.make_bucket(QL_DB_BUCKET_NAME)
    else:
        print(f"Bucket '{QL_DB_BUCKET_NAME}' already exists.")
except S3Error as err:
    print(f"Error creating bucket: {err}")

# (test) File paths and new names
files_to_upload = {
    "cmd/server/codeql/dbs/google/flatbuffers/google_flatbuffers_db.zip": "google$flatbuffers.zip",
    "cmd/server/codeql/dbs/psycopg/psycopg2/psycopg_psycopg2_db.zip": "psycopg$psycopg2.zip"
}

# (test) Push the files
prefix = Path('/Users/hohn/work-gh/mrva/mrvacommander')
for local_path, new_name in files_to_upload.items():
    try:
        client.fput_object(QL_DB_BUCKET_NAME, new_name, prefix / Path(local_path))
        print(f"Uploaded {local_path} as {new_name} to bucket {QL_DB_BUCKET_NAME}")
    except S3Error as err:
        print(f"Error uploading file {local_path}: {err}")
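# Optional verification, assuming the MinIO instance configured above is
# reachable: list what actually landed in the bucket.
try:
    for obj in client.list_objects(QL_DB_BUCKET_NAME, recursive=True):
        print(f"    {obj.object_name}  {obj.size} bytes")
except S3Error as err:
    print(f"Error listing bucket {QL_DB_BUCKET_NAME}: {err}")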


# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
@@ -1,45 +0,0 @@
import qldbtools.utils as utils
import pandas as pd

#
#* Collect the information
#
df1 = pd.read_csv("scratch/db-info-2.csv")

# Add single uniqueness field -- CID (Cumulative ID) -- using
# - creationTime
# - sha
# - cliVersion
# - language

from hashlib import blake2b

def cid_hash(row_tuple: tuple):
    """
    cid_hash(row_tuple)
    Hash a row tuple and return the digest as a hex string
    """
    h = blake2b(digest_size = 3)
    h.update(str(row_tuple).encode())
    # return int.from_bytes(h.digest(), byteorder='big')
    return h.hexdigest()

# Apply the cid_hash function to the specified columns and create the 'CID' column
df1['CID'] = df1.apply(lambda row: cid_hash( (row['creationTime'],
                                              row['sha'],
                                              row['cliVersion'],
                                              row['language'])
                                            ), axis=1)
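# A quick check of cid_hash on an illustrative tuple (values made up):
# digest_size=3 gives a 3-byte digest, so the CID is a 6-character hex string,
# and the same input always produces the same CID.
sample = ('2024-01-02T03:04:05', 'deadbeef', '2.17.0', 'python')
assert cid_hash(sample) == cid_hash(sample)
assert len(cid_hash(sample)) == 6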

df2 = df1.reindex(columns=['owner', 'name', 'cliVersion', 'creationTime',
                           'language', 'sha', 'CID', 'baselineLinesOfCode', 'path',
                           'db_lang', 'db_lang_displayName', 'db_lang_file_count',
                           'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
                           'finalised', 'left_index', 'size'])

df1['CID']


# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
@@ -1,59 +0,0 @@
#* Experimental work with utils.py, to be merged into it.
# The rest of this interactive script is available as a CLI script in
# mc-db-initial-info
from utils import *

#* Data collection
# Get the db information in list of DBInfo form
db_base = "~/work-gh/mrva/mrva-open-source-download/"
dbs = list(collect_dbs(db_base))

# Inspect:
from pprint import pprint
pprint(["len", len(dbs)])
pprint(["dbs[0]", dbs[0].__dict__])
pprint(["dbs[-1]", dbs[-1].__dict__])
#
# Get a dataframe
dbdf = pd.DataFrame([d.__dict__ for d in dbs])
#
#* Experiments with on-disk format
# Continue using the raw information in a separate session.
#
# PosixPath is a problem for json and parquet
#
dbdf['path'] = dbdf['path'].astype(str)
#
dbdf.to_csv('dbdf.csv')
#
dbdf.to_csv('dbdf.csv.gz', compression='gzip', index=False)
#
dbdf.to_json('dbdf.json')
#
# dbdf.to_hdf('dbdf.h5', key='dbdf', mode='w')
#
# fast, binary
dbdf.to_parquet('dbdf.parquet')
#
# fast
import sqlite3
conn = sqlite3.connect('dbdf.db')
dbdf.to_sql('qldbs', conn, if_exists='replace', index=False)
conn.close()
#
# Sizes:
# ls -laSr dbdf.*
# -rw-r--r--@ 1 hohn staff 101390 Jul 12 14:17 dbdf.csv.gz
# -rw-r--r--@ 1 hohn staff 202712 Jul 12 14:17 dbdf.parquet
# -rw-r--r--@ 1 hohn staff 560623 Jul 12 14:17 dbdf.csv
# -rw-r--r--@ 1 hohn staff 610304 Jul 12 14:17 dbdf.db
# -rw-r--r--@ 1 hohn staff 735097 Jul 12 14:17 dbdf.json
#
# parquet has many libraries, including Go: xitongsys/parquet-go
# https://parquet.apache.org/
#
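# To reproduce the size comparison above after regenerating the files,
# without leaving Python:
import os
for f in ['dbdf.csv.gz', 'dbdf.parquet', 'dbdf.csv', 'dbdf.db', 'dbdf.json']:
    print("%-15s %8d bytes" % (f, os.path.getsize(f)))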


# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
@@ -1,118 +0,0 @@
# Experimental work with utils.py, to be merged into it.
from utils import *
from pprint import pprint
import os

#* Reload gzipped CSV file to continue work
dbdf_1 = pd.read_csv('dbdf.csv.gz', compression='gzip')
#
# (old) Consistency check:
# dbdf_1.columns == dbdf.columns
# dbmask = (dbdf_1 != dbdf)
# dbdf_1[dbmask]
# dbdf_1[dbmask].dropna(how='all')
# ctime_raw is different in places, so don't use it.

#
#* Interact with/visualize the dataframe
# Using pandasgui -- qt
from pandasgui import show
os.environ['APPDATA'] = "needed-for-pandasgui"
show(dbdf_1)
# Using dtale -- web
import dtale
dtale.show(dbdf_1)
#

#
#* Collect metadata from DB zip files
#
#** A manual sample
#
d = dbdf_1
left_index = 0
d.path[0]
cqlc, metac = extract_metadata(d.path[0])

cqlc['baselineLinesOfCode']
cqlc['primaryLanguage']
cqlc['creationMetadata']['sha']
cqlc['creationMetadata']['cliVersion']
cqlc['creationMetadata']['creationTime'].isoformat()
cqlc['finalised']

for lang, lang_cont in metac['languages'].items():
    print(lang)
    indent = " "
    for prop, val in lang_cont.items():
        if prop == 'files':
            print("%sfiles count %d" % (indent, len(val)))
        elif prop == 'linesOfCode':
            print("%slinesOfCode %d" % (indent, val))
        elif prop == 'displayName':
            print("%sdisplayName %s" % (indent, val))
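# The same walk, collected into one flat record per language rather than
# printed; purely illustrative here -- the real flattening is done by
# metadata_details below.
lang_rows = []
for lang, lang_cont in metac['languages'].items():
    lang_rows.append({
        'db_lang': lang,
        'db_lang_displayName': lang_cont.get('displayName'),
        'db_lang_file_count': len(lang_cont.get('files', [])),
        'db_lang_linesOfCode': lang_cont.get('linesOfCode'),
    })
pprint(lang_rows)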

#** Automated for all entries
# The rest of this interactive script is available as a CLI script in
# mc-db-refine-info
d = dbdf_1
joiners = []
for left_index in range(0, len(d)):
    try:
        cqlc, metac = extract_metadata(d.path[left_index])
    except ExtractNotZipfile:
        continue
    except ExtractNoCQLDB:
        continue
    try:
        detail_df = metadata_details(left_index, cqlc, metac)
    except DetailsMissing:
        continue
    joiners.append(detail_df)
joiners_df = pd.concat(joiners, axis=0)
full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer')
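# Because the merge is an outer join, databases whose zip produced no usable
# metadata keep their row but get NaN in the joined columns; a quick count
# (db_lang is one of the columns contributed by metadata_details, per the
# reindex list below):
print("%d of %d databases have no extracted metadata"
      % (full_df['db_lang'].isna().sum(), len(full_df)))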

#** View the full dataframe with metadata
from pandasgui import show
os.environ['APPDATA'] = "needed-for-pandasgui"
show(full_df)

#** Re-order the dataframe columns by importance
# - Much of the data
#   1. Is only conditionally present
#   2. Is extra info, not for the DB proper
#   3. May have various names

# - The essential columns are
#   | owner               |
#   | name                |
#   | language            |
#   | size                |
#   | cliVersion          |
#   | creationTime        |
#   | sha                 |
#   | baselineLinesOfCode |
#   | path                |

# - The rest are useful; put them last
#   | db_lang             |
#   | db_lang_displayName |
#   | db_lang_file_count  |
#   | db_lang_linesOfCode |
#   | left_index          |
#   | ctime               |
#   | primaryLanguage     |
#   | finalised           |

final_df = full_df.reindex(columns=['owner', 'name', 'language', 'size', 'cliVersion',
                                    'creationTime', 'sha', 'baselineLinesOfCode', 'path',
                                    'db_lang', 'db_lang_displayName', 'db_lang_file_count',
                                    'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
                                    'finalised', 'left_index'])

final_df.to_csv('all-info-table.csv.gz', compression='gzip', index=False)

#
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
#