Experiment with formats for saving/loading the database index

The .csv.gz format is the simplest and most universal.  It's also the smallest
on disk.
A comparison of the saved and reloaded dataframe shows no differences.
The ctime_raw column caused serialization problems, so only ctime (in
ISO-8601 format) is kept.
Michael Hohn
2024-07-12 14:41:05 -07:00
committed by Michael Hohn
parent 3df1cac5ae
commit 6b4e753e69
2 changed files with 127 additions and 28 deletions


@@ -0,0 +1,100 @@
#* Interactive use only
# Experimental work with utils.py, to be merged into it.
if 0:
    from utils import *
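# (evaluate the import above manually; utils.py provides pd, os, Path,
# and the collect_dbs used below)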
#* Data collection
# Get the db information in list of DBInfo form
db_base = "~/work-gh/mrva/mrva-open-source-download/"
dbs = list(collect_dbs(db_base))
# XX: add metadata
# codeql, meta = extract_metadata('path_to_your_zipfile.zip')
# print(codeql)
# print(meta)
# Inspect:
from pprint import pprint
pprint(["len", len(dbs)])
pprint(["dbs[0]", dbs[0].__dict__])
#
# Get a dataframe
dbdf = pd.DataFrame([d.__dict__ for d in dbs])
#
# XX: save to disk, continue use in separate session
#
# PosixPath is a problem for json and parquet:
#
dbdf['path'] = dbdf['path'].astype(str)
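# (str survives every format below; to recover Path objects after a
# reload, dbdf['path'].map(Path) would do)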
#
dbdf.to_csv('dbdf.csv')
#
dbdf.to_csv('dbdf.csv.gz', compression='gzip', index=False)
#
dbdf.to_json('dbdf.json')
#
# dbdf.to_hdf('dbdf.h5', key='dbdf', mode='w')
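# (to_hdf would additionally need the optional PyTables package: pip install tables)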
#
# fast, binary
dbdf.to_parquet('dbdf.parquet')
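# (needs pyarrow or fastparquet installed)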
#
# fast
import sqlite3
conn = sqlite3.connect('dbdf.db')
dbdf.to_sql('qldbs', conn, if_exists='replace', index=False)
conn.close()
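# Reload from sqlite in a later session, if needed:
# conn = sqlite3.connect('dbdf.db')
# dbdf_sql = pd.read_sql('SELECT * FROM qldbs', conn)
# conn.close()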
#
# Sizes:
# ls -laSr dbdf.*
# -rw-r--r--@ 1 hohn staff 101390 Jul 12 14:17 dbdf.csv.gz
# -rw-r--r--@ 1 hohn staff 202712 Jul 12 14:17 dbdf.parquet
# -rw-r--r--@ 1 hohn staff 560623 Jul 12 14:17 dbdf.csv
# -rw-r--r--@ 1 hohn staff 610304 Jul 12 14:17 dbdf.db
# -rw-r--r--@ 1 hohn staff 735097 Jul 12 14:17 dbdf.json
#
# parquet has many libraries, including go: xitongsys/parquet-go
# https://parquet.apache.org/
#
# Reload to continue work
dbdf_1 = pd.read_csv('dbdf.csv.gz', compression='gzip')
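# (read_csv re-infers dtypes: path and ctime come back as str, size as int64)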
#
# Consistency check:
dbdf_1.columns == dbdf.columns
dbmask = (dbdf_1 != dbdf)
dbdf_1[dbmask]
cmp_df = dbdf_1[dbmask].dropna(how='all')
cmp_df
# ctime_raw is different in places, so don't use it.
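# Note: elementwise != also counts NaN-vs-NaN cells as differences;
# dbdf_1.compare(dbdf) (pandas >= 1.1) is a stricter check.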
#
# Interact with/visualize the dataframe
os.environ['APPDATA'] = "needed-for-pandasgui"
from pandasgui import show
show(dbdf)
show(cmp_df)
#
import dtale
dtale.show(dbdf)
#
import pandas as pd
# Example large DataFrame
data = {
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'age': [25, 30, 35, 40, 22],
    'city': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
}
large_df = pd.DataFrame(data)
# Create a boolean mask: select rows where age is greater than 30
mask = large_df['age'] > 30
# Apply the boolean mask to get the smaller DataFrame
small_df = large_df[mask]
print(small_df)
#
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:


@@ -8,9 +8,13 @@
 #* Imports
 import pandas as pd
 from pathlib import Path
-import os
+import datetime
+import json
 import logging
+import os
 import time
+import yaml
+import zipfile
 
 #* Setup
 logging.basicConfig(
@@ -51,8 +55,9 @@ def collect_dbs(db_base):
         db.path = path
         s = path.stat()
         db.size = s.st_size
-        db.ctime_raw = s.st_ctime
-        db.ctime = time.ctime(s.st_ctime)
+        # db.ctime_raw = s.st_ctime
+        # db.ctime = time.ctime(s.st_ctime)
+        db.ctime = datetime.datetime.fromtimestamp(s.st_ctime).isoformat()
         yield db
 
 def dbdf_from_tree():
@@ -61,30 +66,24 @@ def dbdf_from_tree():
     dbdf = pd.DataFrame([d.__dict__ for d in dbs])
     return dbdf
 
-#* Interactive use only
-if 0:
-    #* Data collection
-    # Get the db information in list of DBInfo form
-    db_base = "~/work-gh/mrva/mrva-open-source-download/"
-    dbs = list(collect_dbs(db_base))
-    #
-    # Inspect:
-    from pprint import pprint
-    pprint(["len", len(dbs)])
-    pprint(["dbs[0]", dbs[0].__dict__])
-    #
-    # Get a dataframe
-    dbdf = pd.DataFrame([d.__dict__ for d in dbs])
-    #
-    # Interact with/visualize it
-    os.environ['APPDATA'] = "needed-for-pandasgui"
-    from pandasgui import show
-    show(dbdf)
-    #
-    import dtale
-    dtale.show(dbdf)
-    #
+# extract_metadata(zipfile)
+#
+# Unzip zipfile into memory and return the contents of the files
+# codeql-database.yml and baseline-info.json that it contains in a tuple
+#
+def extract_metadata(zipfile_path):
+    codeql_content = None
+    meta_content = None
+    with zipfile.ZipFile(zipfile_path, 'r') as z:
+        for file_info in z.infolist():
+            if file_info.filename == 'codeql_db/codeql-database.yml':
+                with z.open(file_info) as f:
+                    codeql_content = yaml.safe_load(f)
+            elif file_info.filename == 'codeql_db/baseline-info.json':
+                with z.open(file_info) as f:
+                    meta_content = json.load(f)
+    return codeql_content, meta_content
 
 # Local Variables:
-# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/venv/"
+# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
 # End:
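
Usage sketch for the new helper, assuming the modified file above is
utils.py and using the placeholder path from the experiment script:

from utils import extract_metadata
codeql, meta = extract_metadata('path_to_your_zipfile.zip')
print(codeql)   # parsed codeql-database.yml, or None if the zip lacks it
print(meta)     # parsed baseline-info.json, or None if the zip lacks it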