Move session scripts to separate directory

This commit is contained in:
Michael Hohn
2024-08-02 13:56:47 -07:00
committed by Michael Hohn
parent 582d933130
commit 349d758c14
7 changed files with 48 additions and 1 deletion

View File

@@ -0,0 +1,61 @@
""" Read a table of CodeQL DB information
and generate the selection files for
1. the VS Code CodeQL plugin
2. the gh-mrva command-line client
"""
#
#* Collect the information and write files
#
import pandas as pd
import sys
import qldbtools.utils as utils
import numpy as np
import importlib
importlib.reload(utils)
df0 = pd.read_csv('scratch/db-info-3.csv')
# Use a small sample of entries, chosen via a seeded pseudo-random generator
df1 = df0.sample(n=3, random_state=np.random.RandomState(4242))
repos = []
for index, row in df1[['owner', 'name', 'CID', 'path']].iterrows():
    owner, name, CID, path = row
    repos.append(utils.form_db_req_name(owner, name, CID))
repo_list_name = "mirva-list"
vsc = {
    "version": 1,
    "databases": {
        "variantAnalysis": {
            "repositoryLists": [
                {
                    "name": repo_list_name,
                    "repositories": repos,
                }
            ],
            "owners": [],
            "repositories": []
        }
    },
    "selected": {
        "kind": "variantAnalysisUserDefinedList",
        "listName": repo_list_name
    }
}
gh = {
    repo_list_name: repos
}
# write the files
import json
with open("tmp-selection-vsc.json", "w") as fc:
json.dump(vsc, fc, indent=4)
with open("tmp-selection-gh.json", "w") as fc:
json.dump(gh, fc, indent=4)
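# Optional sanity check (a sketch, not part of the original session): reload the
# files just written and confirm the repository list round-trips.
with open("tmp-selection-vsc.json") as fc:
    assert json.load(fc)["databases"]["variantAnalysis"]["repositoryLists"][0]["repositories"] == repos
with open("tmp-selection-gh.json") as fc:
    assert json.load(fc)[repo_list_name] == repos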
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -0,0 +1,59 @@
#* Experimental work with utils.py, to be merged into it.
# The rest of this interactive script is available as a CLI script in
# mc-db-initial-info
from utils import *
#* Data collection
# Get the db information in list of DBInfo form
db_base = "~/work-gh/mrva/mrva-open-source-download/"
dbs = list(collect_dbs(db_base))
# Inspect:
from pprint import pprint
pprint(["len", len(dbs)])
pprint(["dbs[0]", dbs[0].__dict__])
pprint(["dbs[-1]", dbs[-1].__dict__])
#
# Get a dataframe
dbdf = pd.DataFrame([d.__dict__ for d in dbs])
#
#* Experiments with on-disk format
# The raw information is used again in a separate session.
#
# PosixPath is a problem for JSON and Parquet, so convert the path column to str
#
dbdf['path'] = dbdf['path'].astype(str)
#
dbdf.to_csv('dbdf.csv')
#
dbdf.to_csv('dbdf.csv.gz', compression='gzip', index=False)
#
dbdf.to_json('dbdf.json')
#
# dbdf.to_hdf('dbdf.h5', key='dbdf', mode='w')
#
# fast, binary
dbdf.to_parquet('dbdf.parquet')
#
# fast
import sqlite3
conn = sqlite3.connect('dbdf.db')
dbdf.to_sql('qldbs', conn, if_exists='replace', index=False)
conn.close()
#
# Sizes:
# ls -laSr dbdf.*
# -rw-r--r--@ 1 hohn staff 101390 Jul 12 14:17 dbdf.csv.gz
# -rw-r--r--@ 1 hohn staff 202712 Jul 12 14:17 dbdf.parquet
# -rw-r--r--@ 1 hohn staff 560623 Jul 12 14:17 dbdf.csv
# -rw-r--r--@ 1 hohn staff 610304 Jul 12 14:17 dbdf.db
# -rw-r--r--@ 1 hohn staff 735097 Jul 12 14:17 dbdf.json
#
# parquet has many libraries, including go: xitongsys/parquet-go
# https://parquet.apache.org/
#
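# Read-back check (a sketch; assumes dbdf.parquet was written above and a
# parquet engine such as pyarrow is installed):
dbdf_rt = pd.read_parquet('dbdf.parquet')
assert len(dbdf_rt) == len(dbdf)
#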
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -0,0 +1,65 @@
import qldbtools.utils as utils
import pandas as pd
import numpy as np
import sys
from minio import Minio
from minio.error import S3Error
from pathlib import Path
#
#* Collect the information and select subset
#
df = pd.read_csv('scratch/db-info-2.csv')
seed = 4242
if 0:
    # Use all entries
    entries = df
else:
    # Use num_entries, chosen via pseudo-random numbers
    entries = df.sample(n=3,
                        random_state=np.random.RandomState(seed))
#
#* Push the DBs
#
# Configuration
MINIO_URL = "http://localhost:9000"
MINIO_ROOT_USER = "user"
MINIO_ROOT_PASSWORD = "mmusty8432"
QL_DB_BUCKET_NAME = "qldb"
# Initialize MinIO client
client = Minio(
    MINIO_URL.replace("http://", "").replace("https://", ""),
    access_key=MINIO_ROOT_USER,
    secret_key=MINIO_ROOT_PASSWORD,
    secure=False
)
# Create the bucket if it doesn't exist
try:
    if not client.bucket_exists(QL_DB_BUCKET_NAME):
        client.make_bucket(QL_DB_BUCKET_NAME)
    else:
        print(f"Bucket '{QL_DB_BUCKET_NAME}' already exists.")
except S3Error as err:
    print(f"Error creating bucket: {err}")
# (test) File paths and new names
files_to_upload = {
"cmd/server/codeql/dbs/google/flatbuffers/google_flatbuffers_db.zip": "google$flatbuffers.zip",
"cmd/server/codeql/dbs/psycopg/psycopg2/psycopg_psycopg2_db.zip": "psycopg$psycopg2.zip"
}
# (test) Push the files
prefix = Path('/Users/hohn/work-gh/mrva/mrvacommander')
for local_path, new_name in files_to_upload.items():
    try:
        client.fput_object(QL_DB_BUCKET_NAME, new_name, prefix / Path(local_path))
        print(f"Uploaded {local_path} as {new_name} to bucket {QL_DB_BUCKET_NAME}")
    except S3Error as err:
        print(f"Error uploading file {local_path}: {err}")
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -0,0 +1,46 @@
# Session around bin/mc-db-unique
import qldbtools.utils as utils
import pandas as pd
#
#* Collect the information
#
df1 = pd.read_csv("scratch/db-info-2.csv")
# Add single uniqueness field -- CID (Cumulative ID) -- using
# - creationTime
# - sha
# - cliVersion
# - language
from hashlib import blake2b
def cid_hash(row_tuple: tuple):
    """
    cid_hash(row_tuple)
    Hash the stringified row tuple and return the digest as a hex string.
    """
    h = blake2b(digest_size=3)
    h.update(str(row_tuple).encode())
    # return int.from_bytes(h.digest(), byteorder='big')
    return h.hexdigest()
# Apply the cid_hash function to the specified columns and create the 'CID' column
df1['CID'] = df1.apply(lambda row: cid_hash((row['creationTime'],
                                             row['sha'],
                                             row['cliVersion'],
                                             row['language'])),
                       axis=1)
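# Optional collision check (a sketch, not in the original session): digest_size=3
# gives only a 24-bit CID space, so confirm that rows sharing a CID also share
# the underlying key tuple.
key_cols = ['creationTime', 'sha', 'cliVersion', 'language']
cid_collisions = df1.groupby('CID')[key_cols].nunique().gt(1).any(axis=1)
print("CID collisions:", int(cid_collisions.sum()))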
df2 = df1.reindex(columns=['owner', 'name', 'cliVersion', 'creationTime',
                           'language', 'sha', 'CID', 'baselineLinesOfCode', 'path',
                           'db_lang', 'db_lang_displayName', 'db_lang_file_count',
                           'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
                           'finalised', 'left_index', 'size'])
df1['CID']   # inspect the new column
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -0,0 +1,118 @@
# Experimental work to be merged into bin/mc-db-refine-info
from utils import *
from pprint import pprint
#* Reload gzipped CSV file to continue work
dbdf_1 = pd.read_csv('dbdf.csv.gz', compression='gzip')
#
# (old) Consistency check:
# dbdf_1.columns == dbdf.columns
# dbmask = (dbdf_1 != dbdf)
# dbdf_1[dbmask]
# dbdf_1[dbmask].dropna(how='all')
# ctime_raw is different in places, so don't use it.
#
#* Interact with/visualize the dataframe
# Using pandasgui -- qt
from pandasgui import show
import os
os.environ['APPDATA'] = "needed-for-pandasgui"
show(dbdf_1)
# Using dtale -- web
import dtale
dtale.show(dbdf_1)
#
#
#* Collect metadata from DB zip files
#
#** A manual sample
#
d = dbdf_1
left_index = 0
d.path[0]
cqlc, metac = extract_metadata(d.path[0])
cqlc['baselineLinesOfCode']
cqlc['primaryLanguage']
cqlc['creationMetadata']['sha']
cqlc['creationMetadata']['cliVersion']
cqlc['creationMetadata']['creationTime'].isoformat()
cqlc['finalised']
for lang, lang_cont in metac['languages'].items():
    print(lang)
    indent = " "
    for prop, val in lang_cont.items():
        if prop == 'files':
            print("%sfiles count %d" % (indent, len(val)))
        elif prop == 'linesOfCode':
            print("%slinesOfCode %d" % (indent, val))
        elif prop == 'displayName':
            print("%sdisplayName %s" % (indent, val))
#** Automated for all entries
# The rest of this interactive script is available as a CLI script in
# mc-db-refine-info
d = dbdf_1
joiners = []
for left_index in range(len(d)):
    try:
        cqlc, metac = extract_metadata(d.path[left_index])
    except ExtractNotZipfile:
        continue
    except ExtractNoCQLDB:
        continue
    try:
        detail_df = metadata_details(left_index, cqlc, metac)
    except DetailsMissing:
        continue
    joiners.append(detail_df)
joiners_df = pd.concat(joiners, axis=0)
full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer')
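# Quick coverage check (a sketch): how many databases yielded usable metadata.
print(joiners_df['left_index'].nunique(), "of", len(d), "databases have metadata details")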
#** View the full dataframe with metadata
from pandasgui import show
os.environ['APPDATA'] = "needed-for-pandasgui"
show(full_df)
#** Re-order the dataframe columns by importance
# - Much of the data
# 1. Is only conditionally present
# 2. Is extra info, not for the DB proper
# 3. May have various names
# - The essential columns are
# | owner |
# | name |
# | language |
# | size |
# | cliVersion |
# | creationTime |
# | sha |
# | baselineLinesOfCode |
# | path |
# - The rest are useful; put them last
# | db_lang |
# | db_lang_displayName |
# | db_lang_file_count |
# | db_lang_linesOfCode |
# | left_index |
# | ctime |
# | primaryLanguage |
# | finalised |
final_df = full_df.reindex(columns=['owner', 'name', 'language', 'size', 'cliVersion',
                                    'creationTime', 'sha', 'baselineLinesOfCode', 'path',
                                    'db_lang', 'db_lang_displayName', 'db_lang_file_count',
                                    'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
                                    'finalised', 'left_index'])
final_df.to_csv('all-info-table.csv.gz', compression='gzip', index=False)
#
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
#

View File

@@ -0,0 +1,41 @@
# Experimental work for ../bin/mc-db-unique, to be merged into it.
import qldbtools.utils as utils
from pprint import pprint
import pandas as pd
# cd ../
#* Reload CSV file to continue work
df2 = df_refined = pd.read_csv('scratch/db-info-2.csv')
# Identify rows missing specific entries
rows = (df2['cliVersion'].isna() |
        df2['creationTime'].isna() |
        df2['language'].isna() |
        df2['sha'].isna())
df2[rows]
df3 = df2[~rows]
df3
#* post-save work
df4 = pd.read_csv('scratch/db-info-3.csv')
# Sort and group
df_sorted = df4.sort_values(by=['owner', 'name', 'CID', 'creationTime'])
df_unique = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index()
# Find duplicates
df_dups = df_unique[df_unique['CID'].duplicated(keep=False)]
len(df_dups)
df_dups['CID']
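# Inspect the duplicate groups in context (a sketch; only informative when
# len(df_dups) > 0):
if len(df_dups) > 0:
    print(df_dups.sort_values('CID')[['owner', 'name', 'CID', 'creationTime']])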
# Set display options
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 140)
#
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
#

View File

@@ -0,0 +1,46 @@
# Session around bin/mc-db-unique
import qldbtools.utils as utils
import pandas as pd
#
#* Collect the information
#
df1 = pd.read_csv("scratch/db-info-2.csv")
# Add single uniqueness field -- CID (Cumulative ID) -- using
# - creationTime
# - sha
# - cliVersion
# - language
from hashlib import blake2b
def cid_hash(row_tuple: tuple):
    """
    cid_hash(row_tuple)
    Hash the stringified row tuple and return the digest as a hex string.
    """
    h = blake2b(digest_size=3)
    h.update(str(row_tuple).encode())
    # return int.from_bytes(h.digest(), byteorder='big')
    return h.hexdigest()
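# Example call (illustrative values, not taken from the dataset): with
# digest_size=3, hexdigest() returns a 6-character string.
example_cid = cid_hash(("2024-07-01T00:00:00", "0123abcd", "2.17.0", "cpp"))
print("example CID:", example_cid, len(example_cid))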
# Apply the cid_hash function to the specified columns and create the 'CID' column
df1['CID'] = df1.apply(lambda row: cid_hash((row['creationTime'],
                                             row['sha'],
                                             row['cliVersion'],
                                             row['language'])),
                       axis=1)
df2 = df1.reindex(columns=['owner', 'name', 'cliVersion', 'creationTime',
                           'language', 'sha', 'CID', 'baselineLinesOfCode', 'path',
                           'db_lang', 'db_lang_displayName', 'db_lang_file_count',
                           'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
                           'finalised', 'left_index', 'size'])
df1['CID']   # inspect the new column
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End: