Experiment with formats for saving/loading the database index
The .csv.gz format is the simplest and most universal, and it is also the smallest on disk. A comparison of the saved and reloaded dataframes shows no differences. The ctime_raw column caused serialization problems, so only ctime (in ISO 8601 format) is used.
This commit is contained in:
committed by
=Michael Hohn
parent
3df1cac5ae
commit
6b4e753e69
100
client/qldbtools/qldbtools/utils-dev.py
Normal file
100
client/qldbtools/qldbtools/utils-dev.py
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
#* Interactive use only
|
||||||
|
# Experimental work with utils.py, to be merged into it.
|
||||||
|
if 0:
|
||||||
|
from utils import *
|
||||||
|
|
||||||
|
#* Data collection
|
||||||
|
# Get the db information in list of DBInfo form
|
||||||
|
db_base = "~/work-gh/mrva/mrva-open-source-download/"
|
||||||
|
dbs = list(collect_dbs(db_base))
|
||||||
|
|
||||||
|
# XX: add metadata
|
||||||
|
# codeql, meta = extract_metadata('path_to_your_zipfile.zip')
|
||||||
|
# print(codeql)
|
||||||
|
# print(meta)
|
||||||
|
|
||||||
|
# Inspect:
|
||||||
|
from pprint import pprint
|
||||||
|
pprint(["len", len(dbs)])
|
||||||
|
pprint(["dbs[0]", dbs[0].__dict__])
|
||||||
|
#
|
||||||
|
# Get a dataframe
|
||||||
|
dbdf = pd.DataFrame([d.__dict__ for d in dbs])
|
||||||
|
#
|
||||||
|
# XX: save to disk, continue use in separate session
|
||||||
|
#
|
||||||
|
# PosixPath is a problem for json and parquet, so keep the path as a string.
dbdf['path'] = dbdf['path'].astype(str)
#
# Write the plain csv without the index column, like the gzipped variant
# below, so the two files differ only in compression and a reload yields
# the same columns as dbdf.
dbdf.to_csv('dbdf.csv', index=False)
#
dbdf.to_csv('dbdf.csv.gz', compression='gzip', index=False)
#
dbdf.to_json('dbdf.json')
#
# dbdf.to_hdf('dbdf.h5', key='dbdf', mode='w')
#
# fast, binary
dbdf.to_parquet('dbdf.parquet')
#
# fast
import sqlite3
from contextlib import closing
# closing() guarantees the connection is closed even if to_sql raises.
with closing(sqlite3.connect('dbdf.db')) as conn:
    dbdf.to_sql('qldbs', conn, if_exists='replace', index=False)
|
||||||
|
#
|
||||||
|
# Sizes:
|
||||||
|
# ls -laSr dbdf.*
|
||||||
|
# -rw-r--r--@ 1 hohn staff 101390 Jul 12 14:17 dbdf.csv.gz
|
||||||
|
# -rw-r--r--@ 1 hohn staff 202712 Jul 12 14:17 dbdf.parquet
|
||||||
|
# -rw-r--r--@ 1 hohn staff 560623 Jul 12 14:17 dbdf.csv
|
||||||
|
# -rw-r--r--@ 1 hohn staff 610304 Jul 12 14:17 dbdf.db
|
||||||
|
# -rw-r--r--@ 1 hohn staff 735097 Jul 12 14:17 dbdf.json
|
||||||
|
#
|
||||||
|
# parquet has many libraries, including go: xitongsys/parquet-go
|
||||||
|
# https://parquet.apache.org/
|
||||||
|
#
|
||||||
|
# Reload to continue work
|
||||||
|
dbdf_1 = pd.read_csv('dbdf.csv.gz', compression='gzip')
#
# Consistency check: the reloaded frame should match the original.
dbdf_1.columns == dbdf.columns
# True for every cell that differs between the reloaded and original frame.
dbmask = (dbdf_1 != dbdf)
# Selecting with the mask keeps differing cells and turns the rest into NaN.
dbdf_1[dbmask]
# Keep only the rows that contain at least one difference.  The result is
# bound to `cmp` because it is displayed with pandasgui below.
cmp = dbdf_1[dbmask].dropna(how='all')
cmp
# ctime_raw is different in places, so don't use it.

#
# Interact with/visualize the dataframe
os.environ['APPDATA'] = "needed-for-pandasgui"
from pandasgui import show
show(dbdf)
show(cmp)
|
||||||
|
#
|
||||||
|
import dtale
|
||||||
|
dtale.show(dbdf)
|
||||||
|
#
|
||||||
|
|
||||||
|
# Local Variables:
|
||||||
|
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
|
||||||
|
# End:
|
||||||
|
|
||||||
|
|
||||||
|
import pandas as pd

# Example: carve a smaller DataFrame out of a larger one with a boolean mask.
data = dict(
    name=['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    age=[25, 30, 35, 40, 22],
    city=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
)
large_df = pd.DataFrame(data)

# One boolean per row: True where age is greater than 30.
mask = large_df['age'].gt(30)

# Rows where the mask is True form the smaller DataFrame.
small_df = large_df.loc[mask]

print(small_df)
|
||||||
@@ -8,9 +8,13 @@
|
|||||||
#* Imports
|
#* Imports
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import os
|
import datetime
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import time
|
import time
|
||||||
|
import yaml
|
||||||
|
import zipfile
|
||||||
|
|
||||||
#* Setup
|
#* Setup
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
@@ -51,8 +55,9 @@ def collect_dbs(db_base):
|
|||||||
db.path = path
|
db.path = path
|
||||||
s = path.stat()
|
s = path.stat()
|
||||||
db.size = s.st_size
|
db.size = s.st_size
|
||||||
db.ctime_raw = s.st_ctime
|
# db.ctime_raw = s.st_ctime
|
||||||
db.ctime = time.ctime(s.st_ctime)
|
# db.ctime = time.ctime(s.st_ctime)
|
||||||
|
db.ctime = datetime.datetime.fromtimestamp(s.st_ctime).isoformat()
|
||||||
yield db
|
yield db
|
||||||
|
|
||||||
def dbdf_from_tree():
|
def dbdf_from_tree():
|
||||||
@@ -61,30 +66,24 @@ def dbdf_from_tree():
|
|||||||
dbdf = pd.DataFrame([d.__dict__ for d in dbs])
|
dbdf = pd.DataFrame([d.__dict__ for d in dbs])
|
||||||
return dbdf
|
return dbdf
|
||||||
|
|
||||||
#* Interactive use only
|
# extract_metadata(zipfile)
|
||||||
if 0:
|
|
||||||
#* Data collection
|
|
||||||
# Get the db information in list of DBInfo form
|
|
||||||
db_base = "~/work-gh/mrva/mrva-open-source-download/"
|
|
||||||
dbs = list(collect_dbs(db_base))
|
|
||||||
#
|
#
|
||||||
# Inspect:
|
# Unzip zipfile into memory and return the contents of the files
|
||||||
from pprint import pprint
|
# codeql-database.yml and baseline-info.json that it contains in a tuple
|
||||||
pprint(["len", len(dbs)])
|
|
||||||
pprint(["dbs[0]", dbs[0].__dict__])
|
|
||||||
#
|
|
||||||
# Get a dataframe
|
|
||||||
dbdf = pd.DataFrame([d.__dict__ for d in dbs])
|
|
||||||
#
|
|
||||||
# Interact with/visualize it
|
|
||||||
os.environ['APPDATA'] = "needed-for-pandasgui"
|
|
||||||
from pandasgui import show
|
|
||||||
show(dbdf)
|
|
||||||
#
|
|
||||||
import dtale
|
|
||||||
dtale.show(dbdf)
|
|
||||||
#
|
#
|
||||||
|
def extract_metadata(zipfile_path):
    """Read the CodeQL metadata files out of a zipped database.

    The archive members are read through the zipfile module; nothing is
    extracted to disk.

    Args:
        zipfile_path: The zip archive, in any form zipfile.ZipFile accepts.

    Returns:
        A tuple (codeql_content, meta_content) holding the parsed contents
        of codeql_db/codeql-database.yml (via yaml.safe_load) and
        codeql_db/baseline-info.json (via json.load).  An entry is None
        when the archive has no member of that name.
    """
    codeql_content = None
    meta_content = None
    with zipfile.ZipFile(zipfile_path, 'r') as z:
        # Look the two members up directly instead of scanning every entry
        # of the archive.
        codeql_info = _zip_member_or_none(z, 'codeql_db/codeql-database.yml')
        if codeql_info is not None:
            with z.open(codeql_info) as f:
                codeql_content = yaml.safe_load(f)
        meta_info = _zip_member_or_none(z, 'codeql_db/baseline-info.json')
        if meta_info is not None:
            with z.open(meta_info) as f:
                meta_content = json.load(f)
    return codeql_content, meta_content


def _zip_member_or_none(archive, member_name):
    """Return the ZipInfo for member_name, or None if the archive lacks it."""
    try:
        return archive.getinfo(member_name)
    except KeyError:
        # ZipFile.getinfo raises KeyError for a name not in the archive.
        return None
|
||||||
|
|
||||||
# Local Variables:
|
# Local Variables:
|
||||||
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/venv/"
|
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
|
||||||
# End:
|
# End:
|
||||||
|
|||||||
Reference in New Issue
Block a user