Experiment with formats for saving/loading the database index

The .csv.gz format is the simplest and most universal; it is also the
smallest on disk.
A comparison of the saved and reloaded dataframe shows no difference.
The ctime_raw column caused serialization problems, so only ctime (in
ISO 8601 format) is kept.
This commit is contained in:
Michael Hohn
2024-07-12 14:41:05 -07:00
committed by Michael Hohn
parent 3df1cac5ae
commit 6b4e753e69
2 changed files with 127 additions and 28 deletions


@@ -0,0 +1,100 @@
#* Interactive use only
# Experimental work with utils.py, to be merged into it.
if 0:
    from utils import *
#* Data collection
# Get the db information in list of DBInfo form
db_base = "~/work-gh/mrva/mrva-open-source-download/"
dbs = list(collect_dbs(db_base))
# XX: add metadata
# codeql, meta = extract_metadata('path_to_your_zipfile.zip')
# print(codeql)
# print(meta)
# Inspect:
from pprint import pprint
pprint(["len", len(dbs)])
pprint(["dbs[0]", dbs[0].__dict__])
#
# Get a dataframe
dbdf = pd.DataFrame([d.__dict__ for d in dbs])
#
# XX: save to disk, continue use in separate session
#
# PosixPath is a problem for json and parquet:
#
dbdf['path'] = dbdf['path'].astype(str)
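# For example, json cannot serialize Path objects at all:
#   json.dumps({'p': Path('/tmp')})  # TypeError: Object of type PosixPath
#                                    # is not JSON serializable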
#
dbdf.to_csv('dbdf.csv')                 # plain csv, keeps the row index
#
dbdf.to_csv('dbdf.csv.gz', compression='gzip', index=False)
#
dbdf.to_json('dbdf.json')
#
# dbdf.to_hdf('dbdf.h5', key='dbdf', mode='w')
#
# fast, binary
dbdf.to_parquet('dbdf.parquet')
#
# fast
import sqlite3
conn = sqlite3.connect('dbdf.db')
dbdf.to_sql('qldbs', conn, if_exists='replace', index=False)
conn.close()
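#
# Reload sketch for the sqlite variant (assumes the 'qldbs' table written
# above); kept commented since csv.gz is the format actually used:
# conn = sqlite3.connect('dbdf.db')
# dbdf_sql = pd.read_sql('SELECT * FROM qldbs', conn)
# conn.close()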
#
# Sizes:
# ls -laSr dbdf.*
# -rw-r--r--@ 1 hohn staff 101390 Jul 12 14:17 dbdf.csv.gz
# -rw-r--r--@ 1 hohn staff 202712 Jul 12 14:17 dbdf.parquet
# -rw-r--r--@ 1 hohn staff 560623 Jul 12 14:17 dbdf.csv
# -rw-r--r--@ 1 hohn staff 610304 Jul 12 14:17 dbdf.db
# -rw-r--r--@ 1 hohn staff 735097 Jul 12 14:17 dbdf.json
#
# parquet has many libraries, including go: xitongsys/parquet-go
# https://parquet.apache.org/
#
# Reload to continue work
dbdf_1 = pd.read_csv('dbdf.csv.gz', compression='gzip')
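#
# The ISO-8601 ctime column reloads as plain strings; parse if timestamps
# are needed (sketch, kept commented so the comparison below is unaffected):
# dbdf_1['ctime'] = pd.to_datetime(dbdf_1['ctime'])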
#
# Consistency check:
dbdf_1.columns == dbdf.columns
dbmask = (dbdf_1 != dbdf)
dbdf_1[dbmask]
cmp = dbdf_1[dbmask].dropna(how='all')  # only the rows that actually differ
cmp
# ctime_raw differed between save and reload, so it is no longer used.
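#
# A stricter round-trip check, as a sketch: unlike the elementwise !=
# above, assert_frame_equal treats NaNs in matching positions as equal.
from pandas.testing import assert_frame_equal
assert_frame_equal(dbdf_1, dbdf, check_dtype=False)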
#
# Interact with/visualize the dataframe
os.environ['APPDATA'] = "needed-for-pandasgui"
from pandasgui import show
show(dbdf)
show(cmp)
#
import dtale
dtale.show(dbdf)
#
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
import pandas as pd
# Example large DataFrame
data = {
'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
'age': [25, 30, 35, 40, 22],
'city': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
}
large_df = pd.DataFrame(data)
# Create a boolean mask: select rows where age is greater than 30
mask = large_df['age'] > 30
# Apply the boolean mask to get the smaller DataFrame
small_df = large_df[mask]
print(small_df)
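# The same masking pattern applies to the db index, e.g. filtering by the
# size column (hypothetical threshold):
if 0:
    big_dbs = dbdf[dbdf['size'] > 100_000_000]
    print(len(big_dbs))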

--- a/utils.py
+++ b/utils.py

@@ -8,9 +8,13 @@
 #* Imports
 import pandas as pd
 from pathlib import Path
-import os
+import datetime
+import json
 import logging
+import os
 import time
+import yaml
+import zipfile
 #* Setup
 logging.basicConfig(
@@ -51,8 +55,9 @@ def collect_dbs(db_base):
         db.path = path
         s = path.stat()
         db.size = s.st_size
-        db.ctime_raw = s.st_ctime
-        db.ctime = time.ctime(s.st_ctime)
+        # db.ctime_raw = s.st_ctime
+        # db.ctime = time.ctime(s.st_ctime)
+        db.ctime = datetime.datetime.fromtimestamp(s.st_ctime).isoformat()
         yield db
 def dbdf_from_tree():
@@ -61,30 +66,24 @@ def dbdf_from_tree():
     dbdf = pd.DataFrame([d.__dict__ for d in dbs])
     return dbdf
-#* Interactive use only
-if 0:
-    #* Data collection
-    # Get the db information in list of DBInfo form
-    db_base = "~/work-gh/mrva/mrva-open-source-download/"
-    dbs = list(collect_dbs(db_base))
-    #
-    # Inspect:
-    from pprint import pprint
-    pprint(["len", len(dbs)])
-    pprint(["dbs[0]", dbs[0].__dict__])
-    #
-    # Get a dataframe
-    dbdf = pd.DataFrame([d.__dict__ for d in dbs])
-    #
-    # Interact with/visualize it
-    os.environ['APPDATA'] = "needed-for-pandasgui"
-    from pandasgui import show
-    show(dbdf)
-    #
-    import dtale
-    dtale.show(dbdf)
-    #
+# extract_metadata(zipfile)
+#
+# Unzip zipfile into memory and return the contents of the files
+# codeql-database.yml and baseline-info.json that it contains in a tuple
+#
+def extract_metadata(zipfile_path):
+    codeql_content = None
+    meta_content = None
+    with zipfile.ZipFile(zipfile_path, 'r') as z:
+        for file_info in z.infolist():
+            if file_info.filename == 'codeql_db/codeql-database.yml':
+                with z.open(file_info) as f:
+                    codeql_content = yaml.safe_load(f)
+            elif file_info.filename == 'codeql_db/baseline-info.json':
+                with z.open(file_info) as f:
+                    meta_content = json.load(f)
+    return codeql_content, meta_content
 # Local Variables:
-# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/venv/"
+# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
 # End:
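
A minimal usage sketch for extract_metadata, with a hypothetical archive
path; the exact keys in codeql-database.yml and baseline-info.json depend
on the CodeQL version that built the database:

if 0:
    from pprint import pprint
    codeql, meta = extract_metadata('some-repo-db.zip')
    pprint(codeql)
    pprint(meta)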