Experiment with formats for saving/loading the database index

The .csv.gz format is the simplest and most universal.  It's also the smallest
on disk.
A comparison of the saved and reloaded dataframe shows no differences.
The ctime_raw column caused serialization problems, so only ctime (in
ISO-8601 format) is kept.
Michael Hohn
2024-07-12 14:41:05 -07:00
committed by Michael Hohn
parent 3df1cac5ae
commit 6b4e753e69
2 changed files with 127 additions and 28 deletions


@@ -0,0 +1,100 @@
#* Interactive use only
# Experimental work with utils.py, to be merged into it.
if 0:
    from utils import *
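# (evaluate the import above manually; utils.py provides pd, os, Path,
# and the collect_dbs used below)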
#* Data collection
# Get the db information in list of DBInfo form
db_base = "~/work-gh/mrva/mrva-open-source-download/"
dbs = list(collect_dbs(db_base))
# XX: add metadata
# codeql, meta = extract_metadata('path_to_your_zipfile.zip')
# print(codeql)
# print(meta)
# Inspect:
from pprint import pprint
pprint(["len", len(dbs)])
pprint(["dbs[0]", dbs[0].__dict__])
#
# Get a dataframe
dbdf = pd.DataFrame([d.__dict__ for d in dbs])
#
# XX: save to disk, continue use in separate session
#
# PosixPath is a problem for json and parquet:
#
dbdf['path'] = dbdf['path'].astype(str)
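# (str survives every format below; to recover Path objects after a
# reload, dbdf['path'].map(Path) would do)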
#
dbdf.to_csv('dbdf.csv')
#
dbdf.to_csv('dbdf.csv.gz', compression='gzip', index=False)
#
dbdf.to_json('dbdf.json')
#
# dbdf.to_hdf('dbdf.h5', key='dbdf', mode='w')
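# (to_hdf would additionally need the optional PyTables package: pip install tables)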
#
# fast, binary
dbdf.to_parquet('dbdf.parquet')
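# (needs pyarrow or fastparquet installed)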
#
# fast
import sqlite3
conn = sqlite3.connect('dbdf.db')
dbdf.to_sql('qldbs', conn, if_exists='replace', index=False)
conn.close()
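# Reload from sqlite in a later session, if needed:
# conn = sqlite3.connect('dbdf.db')
# dbdf_sql = pd.read_sql('SELECT * FROM qldbs', conn)
# conn.close()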
#
# Sizes:
# ls -laSr dbdf.*
# -rw-r--r--@ 1 hohn staff 101390 Jul 12 14:17 dbdf.csv.gz
# -rw-r--r--@ 1 hohn staff 202712 Jul 12 14:17 dbdf.parquet
# -rw-r--r--@ 1 hohn staff 560623 Jul 12 14:17 dbdf.csv
# -rw-r--r--@ 1 hohn staff 610304 Jul 12 14:17 dbdf.db
# -rw-r--r--@ 1 hohn staff 735097 Jul 12 14:17 dbdf.json
#
# parquet has many libraries, including go: xitongsys/parquet-go
# https://parquet.apache.org/
#
# Reload to continue work
dbdf_1 = pd.read_csv('dbdf.csv.gz', compression='gzip')
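# (read_csv re-infers dtypes: path and ctime come back as str, size as int64)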
#
# Consistency check:
dbdf_1.columns == dbdf.columns
dbmask = (dbdf_1 != dbdf)
dbdf_1[dbmask]
cmp_df = dbdf_1[dbmask].dropna(how='all')
cmp_df
# ctime_raw is different in places, so don't use it.
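# Note: elementwise != also counts NaN-vs-NaN cells as differences;
# dbdf_1.compare(dbdf) (pandas >= 1.1) is a stricter check.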
#
# Interact with/visualize the dataframe
os.environ['APPDATA'] = "needed-for-pandasgui"
from pandasgui import show
show(dbdf)
show(cmp_df)
#
import dtale
dtale.show(dbdf)
#
import pandas as pd
# Example large DataFrame
data = {
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'age': [25, 30, 35, 40, 22],
    'city': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
}
large_df = pd.DataFrame(data)
# Create a boolean mask: select rows where age is greater than 30
mask = large_df['age'] > 30
# Apply the boolean mask to get the smaller DataFrame
small_df = large_df[mask]
print(small_df)
#
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:


@@ -8,9 +8,13 @@
 #* Imports
 import pandas as pd
 from pathlib import Path
-import os
+import datetime
+import json
 import logging
+import os
 import time
+import yaml
+import zipfile
 
 #* Setup
 logging.basicConfig(
@@ -51,8 +55,9 @@ def collect_dbs(db_base):
         db.path = path
         s = path.stat()
         db.size = s.st_size
-        db.ctime_raw = s.st_ctime
-        db.ctime = time.ctime(s.st_ctime)
+        # db.ctime_raw = s.st_ctime
+        # db.ctime = time.ctime(s.st_ctime)
+        db.ctime = datetime.datetime.fromtimestamp(s.st_ctime).isoformat()
         yield db
 
 def dbdf_from_tree():
@@ -61,30 +66,24 @@ def dbdf_from_tree():
     dbdf = pd.DataFrame([d.__dict__ for d in dbs])
     return dbdf
 
-#* Interactive use only
-if 0:
-    #* Data collection
-    # Get the db information in list of DBInfo form
-    db_base = "~/work-gh/mrva/mrva-open-source-download/"
-    dbs = list(collect_dbs(db_base))
-    #
-    # Inspect:
-    from pprint import pprint
-    pprint(["len", len(dbs)])
-    pprint(["dbs[0]", dbs[0].__dict__])
-    #
-    # Get a dataframe
-    dbdf = pd.DataFrame([d.__dict__ for d in dbs])
-    #
-    # Interact with/visualize it
-    os.environ['APPDATA'] = "needed-for-pandasgui"
-    from pandasgui import show
-    show(dbdf)
-    #
-    import dtale
-    dtale.show(dbdf)
-    #
+# extract_metadata(zipfile)
+#
+# Unzip zipfile into memory and return the contents of the files
+# codeql-database.yml and baseline-info.json that it contains in a tuple
+#
+def extract_metadata(zipfile_path):
+    codeql_content = None
+    meta_content = None
+    with zipfile.ZipFile(zipfile_path, 'r') as z:
+        for file_info in z.infolist():
+            if file_info.filename == 'codeql_db/codeql-database.yml':
+                with z.open(file_info) as f:
+                    codeql_content = yaml.safe_load(f)
+            elif file_info.filename == 'codeql_db/baseline-info.json':
+                with z.open(file_info) as f:
+                    meta_content = json.load(f)
+    return codeql_content, meta_content
 
 # Local Variables:
-# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/venv/"
+# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
 # End:
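
Usage sketch for the new helper, assuming the modified file above is
utils.py and using the placeholder path from the experiment script:

from utils import extract_metadata
codeql, meta = extract_metadata('path_to_your_zipfile.zip')
print(codeql)   # parsed codeql-database.yml, or None if the zip lacks it
print(meta)     # parsed baseline-info.json, or None if the zip lacks it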