Move session scripts to separate directory
Committed by Michael Hohn
parent 582d933130
commit 349d758c14
@@ -1,41 +0,0 @@
# Experimental work for ../bin/mc-db-unique, to be merged into it.
import qldbtools.utils as utils
from pprint import pprint
import pandas as pd
# cd ../

#* Reload CSV file to continue work
df2 = df_refined = pd.read_csv('scratch/db-info-2.csv')

# Identify rows missing specific entries
rows = ( df2['cliVersion'].isna() |
         df2['creationTime'].isna() |
         df2['language'].isna() |
         df2['sha'].isna() )
df2[rows]
df3 = df2[~rows]
df3

#* post-save work
df4 = pd.read_csv('scratch/db-info-3.csv')

# Sort and group
df_sorted = df4.sort_values(by=['owner', 'name', 'CID', 'creationTime'])
df_unique = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index()

# Find duplicates
df_dups = df_unique[df_unique['CID'].duplicated(keep=False)]
len(df_dups)
df_dups['CID']
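# A minimal, self-contained illustration of the sort-and-group step above,
# using a synthetic frame instead of scratch/db-info-3.csv (all values are
# made up): first() keeps the earliest creationTime per (owner, name, CID).
toy = pd.DataFrame({
    'owner':        ['google', 'google', 'psycopg'],
    'name':         ['flatbuffers', 'flatbuffers', 'psycopg2'],
    'CID':          ['ab12cd', 'ab12cd', 'ef34ab'],
    'creationTime': ['2024-01-02', '2024-03-01', '2024-02-15'],
})
toy_sorted = toy.sort_values(by=['owner', 'name', 'CID', 'creationTime'])
toy_unique = toy_sorted.groupby(['owner', 'name', 'CID']).first().reset_index()
assert len(toy_unique) == 2
toy_unique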

# Set display options
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 140)


#
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
#
@@ -1,61 +0,0 @@
""" Read a table of CodeQL DB information
and generate the selection files for
1. the VS Code CodeQL plugin
2. the gh-mrva command-line client
"""
#
#* Collect the information and write files
#
import pandas as pd
import sys
import qldbtools.utils as utils
import numpy as np
import importlib
importlib.reload(utils)

df0 = pd.read_csv('scratch/db-info-3.csv')

# Use a small sample of entries, chosen via pseudo-random numbers
df1 = df0.sample(n=3, random_state=np.random.RandomState(4242))

repos = []
for index, row in df1[['owner', 'name', 'CID', 'path']].iterrows():
    owner, name, CID, path = row
    repos.append(utils.form_db_req_name(owner, name, CID))

repo_list_name = "mirva-list"
vsc = {
    "version": 1,
    "databases": {
        "variantAnalysis": {
            "repositoryLists": [
                {
                    "name": repo_list_name,
                    "repositories": repos,
                }
            ],
            "owners": [],
            "repositories": []
        }
    },
    "selected": {
        "kind": "variantAnalysisUserDefinedList",
        "listName": repo_list_name
    }
}

gh = {
    repo_list_name: repos
}


# Write the files
import json
with open("tmp-selection-vsc.json", "w") as fc:
    json.dump(vsc, fc, indent=4)
with open("tmp-selection-gh.json", "w") as fc:
    json.dump(gh, fc, indent=4)
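# A quick sanity check on the file just written: reload tmp-selection-vsc.json
# and confirm the selected list name matches one of the repository lists.
with open("tmp-selection-vsc.json") as fc:
    sel = json.load(fc)
list_names = [rl["name"] for rl in sel["databases"]["variantAnalysis"]["repositoryLists"]]
assert sel["selected"]["listName"] in list_names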

# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
@@ -1,65 +0,0 @@
import qldbtools.utils as utils
import pandas as pd
import numpy as np
import sys
from minio import Minio
from minio.error import S3Error
from pathlib import Path

#
#* Collect the information and select subset
#
df = pd.read_csv('scratch/db-info-2.csv')
seed = 4242
if 0:
    # Use all entries
    entries = df
else:
    # Use a small sample of entries, chosen via pseudo-random numbers
    entries = df.sample(n=3,
                        random_state=np.random.RandomState(seed))
#
#* Push the DBs
#
# Configuration
MINIO_URL = "http://localhost:9000"
MINIO_ROOT_USER = "user"
MINIO_ROOT_PASSWORD = "mmusty8432"
QL_DB_BUCKET_NAME = "qldb"

# Initialize MinIO client
client = Minio(
    MINIO_URL.replace("http://", "").replace("https://", ""),
    access_key=MINIO_ROOT_USER,
    secret_key=MINIO_ROOT_PASSWORD,
    secure=False
)

# Create the bucket if it doesn't exist
try:
    if not client.bucket_exists(QL_DB_BUCKET_NAME):
        client.make_bucket(QL_DB_BUCKET_NAME)
    else:
        print(f"Bucket '{QL_DB_BUCKET_NAME}' already exists.")
except S3Error as err:
    print(f"Error creating bucket: {err}")

# (test) File paths and new names
files_to_upload = {
    "cmd/server/codeql/dbs/google/flatbuffers/google_flatbuffers_db.zip": "google$flatbuffers.zip",
    "cmd/server/codeql/dbs/psycopg/psycopg2/psycopg_psycopg2_db.zip": "psycopg$psycopg2.zip"
}

# (test) Push the files
prefix = Path('/Users/hohn/work-gh/mrva/mrvacommander')
for local_path, new_name in files_to_upload.items():
    try:
        client.fput_object(QL_DB_BUCKET_NAME, new_name, prefix / Path(local_path))
        print(f"Uploaded {local_path} as {new_name} to bucket {QL_DB_BUCKET_NAME}")
    except S3Error as err:
        print(f"Error uploading file {local_path}: {err}")
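# Optional verification, assuming the MinIO instance configured above is
# reachable: list what actually landed in the bucket.
try:
    for obj in client.list_objects(QL_DB_BUCKET_NAME, recursive=True):
        print(f"    {obj.object_name}  {obj.size} bytes")
except S3Error as err:
    print(f"Error listing bucket {QL_DB_BUCKET_NAME}: {err}")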


# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
@@ -1,45 +0,0 @@
import qldbtools.utils as utils
import pandas as pd

#
#* Collect the information
#
df1 = pd.read_csv("scratch/db-info-2.csv")

# Add single uniqueness field -- CID (Cumulative ID) -- using
# - creationTime
# - sha
# - cliVersion
# - language

from hashlib import blake2b

def cid_hash(row_tuple: tuple):
    """
    cid_hash(row_tuple)
    Hash a row tuple and return the digest as a hex string
    """
    h = blake2b(digest_size = 3)
    h.update(str(row_tuple).encode())
    # return int.from_bytes(h.digest(), byteorder='big')
    return h.hexdigest()

# Apply the cid_hash function to the specified columns and create the 'CID' column
df1['CID'] = df1.apply(lambda row: cid_hash( (row['creationTime'],
                                              row['sha'],
                                              row['cliVersion'],
                                              row['language'])
                                            ), axis=1)
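# A quick check of cid_hash on an illustrative tuple (values made up):
# digest_size=3 gives a 3-byte digest, so the CID is a 6-character hex string,
# and the same input always produces the same CID.
sample = ('2024-01-02T03:04:05', 'deadbeef', '2.17.0', 'python')
assert cid_hash(sample) == cid_hash(sample)
assert len(cid_hash(sample)) == 6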

df2 = df1.reindex(columns=['owner', 'name', 'cliVersion', 'creationTime',
                           'language', 'sha', 'CID', 'baselineLinesOfCode', 'path',
                           'db_lang', 'db_lang_displayName', 'db_lang_file_count',
                           'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
                           'finalised', 'left_index', 'size'])

df1['CID']


# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
@@ -1,59 +0,0 @@
#* Experimental work with utils.py, to be merged into it.
# The rest of this interactive script is available as a CLI script in
# mc-db-initial-info
from utils import *

#* Data collection
# Get the db information in list of DBInfo form
db_base = "~/work-gh/mrva/mrva-open-source-download/"
dbs = list(collect_dbs(db_base))

# Inspect:
from pprint import pprint
pprint(["len", len(dbs)])
pprint(["dbs[0]", dbs[0].__dict__])
pprint(["dbs[-1]", dbs[-1].__dict__])
#
# Get a dataframe
dbdf = pd.DataFrame([d.__dict__ for d in dbs])
#
#* Experiments with on-disk format
# Continue using the raw information in a separate session.
#
# PosixPath is a problem for json and parquet
#
dbdf['path'] = dbdf['path'].astype(str)
#
dbdf.to_csv('dbdf.csv')
#
dbdf.to_csv('dbdf.csv.gz', compression='gzip', index=False)
#
dbdf.to_json('dbdf.json')
#
# dbdf.to_hdf('dbdf.h5', key='dbdf', mode='w')
#
# fast, binary
dbdf.to_parquet('dbdf.parquet')
#
# fast
import sqlite3
conn = sqlite3.connect('dbdf.db')
dbdf.to_sql('qldbs', conn, if_exists='replace', index=False)
conn.close()
#
# Sizes:
# ls -laSr dbdf.*
# -rw-r--r--@ 1 hohn staff 101390 Jul 12 14:17 dbdf.csv.gz
# -rw-r--r--@ 1 hohn staff 202712 Jul 12 14:17 dbdf.parquet
# -rw-r--r--@ 1 hohn staff 560623 Jul 12 14:17 dbdf.csv
# -rw-r--r--@ 1 hohn staff 610304 Jul 12 14:17 dbdf.db
# -rw-r--r--@ 1 hohn staff 735097 Jul 12 14:17 dbdf.json
#
# parquet has many libraries, including Go: xitongsys/parquet-go
# https://parquet.apache.org/
#
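# To reproduce the size comparison above after regenerating the files,
# without leaving Python:
import os
for f in ['dbdf.csv.gz', 'dbdf.parquet', 'dbdf.csv', 'dbdf.db', 'dbdf.json']:
    print("%-15s %8d bytes" % (f, os.path.getsize(f)))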


# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
@@ -1,118 +0,0 @@
# Experimental work with utils.py, to be merged into it.
from utils import *
from pprint import pprint
import os

#* Reload gzipped CSV file to continue work
dbdf_1 = pd.read_csv('dbdf.csv.gz', compression='gzip')
#
# (old) Consistency check:
# dbdf_1.columns == dbdf.columns
# dbmask = (dbdf_1 != dbdf)
# dbdf_1[dbmask]
# dbdf_1[dbmask].dropna(how='all')
# ctime_raw is different in places, so don't use it.

#
#* Interact with/visualize the dataframe
# Using pandasgui -- qt
from pandasgui import show
os.environ['APPDATA'] = "needed-for-pandasgui"
show(dbdf_1)
# Using dtale -- web
import dtale
dtale.show(dbdf_1)
#

#
#* Collect metadata from DB zip files
#
#** A manual sample
#
d = dbdf_1
left_index = 0
d.path[0]
cqlc, metac = extract_metadata(d.path[0])

cqlc['baselineLinesOfCode']
cqlc['primaryLanguage']
cqlc['creationMetadata']['sha']
cqlc['creationMetadata']['cliVersion']
cqlc['creationMetadata']['creationTime'].isoformat()
cqlc['finalised']

for lang, lang_cont in metac['languages'].items():
    print(lang)
    indent = " "
    for prop, val in lang_cont.items():
        if prop == 'files':
            print("%sfiles count %d" % (indent, len(val)))
        elif prop == 'linesOfCode':
            print("%slinesOfCode %d" % (indent, val))
        elif prop == 'displayName':
            print("%sdisplayName %s" % (indent, val))
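# The same walk, collected into one flat record per language rather than
# printed; purely illustrative here -- the real flattening is done by
# metadata_details below.
lang_rows = []
for lang, lang_cont in metac['languages'].items():
    lang_rows.append({
        'db_lang': lang,
        'db_lang_displayName': lang_cont.get('displayName'),
        'db_lang_file_count': len(lang_cont.get('files', [])),
        'db_lang_linesOfCode': lang_cont.get('linesOfCode'),
    })
pprint(lang_rows)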

#** Automated for all entries
# The rest of this interactive script is available as a CLI script in
# mc-db-refine-info
d = dbdf_1
joiners = []
for left_index in range(0, len(d)):
    try:
        cqlc, metac = extract_metadata(d.path[left_index])
    except ExtractNotZipfile:
        continue
    except ExtractNoCQLDB:
        continue
    try:
        detail_df = metadata_details(left_index, cqlc, metac)
    except DetailsMissing:
        continue
    joiners.append(detail_df)
joiners_df = pd.concat(joiners, axis=0)
full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer')
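# Because the merge is an outer join, databases whose zip produced no usable
# metadata keep their row but get NaN in the joined columns; a quick count
# (db_lang is one of the columns contributed by metadata_details, per the
# reindex list below):
print("%d of %d databases have no extracted metadata"
      % (full_df['db_lang'].isna().sum(), len(full_df)))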

#** View the full dataframe with metadata
from pandasgui import show
os.environ['APPDATA'] = "needed-for-pandasgui"
show(full_df)

#** Re-order the dataframe columns by importance
# - Much of the data
#   1. Is only conditionally present
#   2. Is extra info, not for the DB proper
#   3. May have various names

# - The essential columns are
#   | owner               |
#   | name                |
#   | language            |
#   | size                |
#   | cliVersion          |
#   | creationTime        |
#   | sha                 |
#   | baselineLinesOfCode |
#   | path                |

# - The rest are useful; put them last
#   | db_lang             |
#   | db_lang_displayName |
#   | db_lang_file_count  |
#   | db_lang_linesOfCode |
#   | left_index          |
#   | ctime               |
#   | primaryLanguage     |
#   | finalised           |

final_df = full_df.reindex(columns=['owner', 'name', 'language', 'size', 'cliVersion',
                                    'creationTime', 'sha', 'baselineLinesOfCode', 'path',
                                    'db_lang', 'db_lang_displayName', 'db_lang_file_count',
                                    'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
                                    'finalised', 'left_index'])

final_df.to_csv('all-info-table.csv.gz', compression='gzip', index=False)

#
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
#