#* Interactive use only
# Experimental work with utils.py, to be merged into it.
#
# The section guarded by `if 0:` is never executed when the file is run or
# imported; it is meant to be evaluated statement by statement from an
# interactive Python session.  It relies on `from utils import *` to bring
# `pd`, `os` and `collect_dbs` into scope.
if 0:
    from utils import *

    #* Data collection
    # Get the db information in list of DBInfo form
    db_base = "~/work-gh/mrva/mrva-open-source-download/"
    dbs = list(collect_dbs(db_base))

    # XX: add metadata
    # codeql, meta = extract_metadata('path_to_your_zipfile.zip')
    # print(codeql)
    # print(meta)

    # Inspect:
    from pprint import pprint
    pprint(["len", len(dbs)])
    pprint(["dbs[0]", dbs[0].__dict__])
    #
    # Get a dataframe
    dbdf = pd.DataFrame([d.__dict__ for d in dbs])
    #
    # XX: save to disk, continue use in separate session
    #
    # PosixPath is a problem for json and parquet:
    #
    dbdf['path'] = dbdf['path'].astype(str)
    #
    dbdf.to_csv('dbdf.csv')
    #
    dbdf.to_csv('dbdf.csv.gz', compression='gzip', index=False)
    #
    dbdf.to_json('dbdf.json')
    #
    # dbdf.to_hdf('dbdf.h5', key='dbdf', mode='w')
    #
    # fast, binary
    dbdf.to_parquet('dbdf.parquet')
    #
    # fast
    import sqlite3
    conn = sqlite3.connect('dbdf.db')
    dbdf.to_sql('qldbs', conn, if_exists='replace', index=False)
    conn.close()
    #
    # Sizes:
    # ls -laSr dbdf.*
    # -rw-r--r--@ 1 hohn staff 101390 Jul 12 14:17 dbdf.csv.gz
    # -rw-r--r--@ 1 hohn staff 202712 Jul 12 14:17 dbdf.parquet
    # -rw-r--r--@ 1 hohn staff 560623 Jul 12 14:17 dbdf.csv
    # -rw-r--r--@ 1 hohn staff 610304 Jul 12 14:17 dbdf.db
    # -rw-r--r--@ 1 hohn staff 735097 Jul 12 14:17 dbdf.json
    #
    # parquet has many libraries, including go: xitongsys/parquet-go
    # https://parquet.apache.org/
    #
    # Reload to continue work
    dbdf_1 = pd.read_csv('dbdf.csv.gz', compression='gzip')
    #
    # Consistency check:
    dbdf_1.columns == dbdf.columns
    dbmask = (dbdf_1 != dbdf)
    dbdf_1[dbmask]
    # Keep the rows that differ after the round trip so they can be
    # inspected in the GUI below.
    # NOTE(review): `cmp` was previously used without ever being assigned;
    # binding it to the comparison result is the presumed intent -- confirm.
    cmp = dbdf_1[dbmask].dropna(how='all')
    cmp
    # ctime_raw is different in places, so don't use it.

    #
    # Interact with/visualize the dataframe
    os.environ['APPDATA'] = "needed-for-pandasgui"
    from pandasgui import show
    show(dbdf)
    show(cmp)
    #
    import dtale
    dtale.show(dbdf)
    #

# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:


import pandas as pd

# Example large DataFrame
data = {
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'age': [25, 30, 35, 40, 22],
    'city': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
}
large_df = pd.DataFrame(data)

# Create a boolean mask: select rows where age is greater than 30
mask = large_df['age'] > 30

# Apply the boolean mask to get the smaller DataFrame
small_df = large_df[mask]

print(small_df)
def extract_metadata(zipfile_path, db_root='codeql_db'):
    """Return the parsed metadata files stored in a CodeQL database archive.

    The two members are read directly from the zip archive; nothing is
    extracted to disk.

    Parameters:
        zipfile_path: Path to the zip archive, or an open binary file-like
            object (anything `zipfile.ZipFile` accepts).
        db_root: Directory inside the archive that holds the metadata
            files.  Leading/trailing slashes are ignored; an empty string
            means the files sit at the top level of the archive.

    Returns:
        A tuple `(codeql_content, meta_content)` where
        `codeql_content` is `<db_root>/codeql-database.yml` parsed with
        `yaml.safe_load` and `meta_content` is
        `<db_root>/baseline-info.json` parsed with `json.load`.
        An element is `None` when the corresponding member is absent.

    Raises:
        Whatever `zipfile.ZipFile`, `yaml.safe_load` or `json.load` raise
        for an unreadable archive or malformed member; nothing is caught
        here.
    """
    prefix = db_root.strip('/')
    if prefix:
        prefix += '/'
    yml_name = prefix + 'codeql-database.yml'
    json_name = prefix + 'baseline-info.json'

    codeql_content = None
    meta_content = None
    with zipfile.ZipFile(zipfile_path, 'r') as z:
        # Look the two members up by name instead of scanning every entry.
        members = set(z.namelist())
        if yml_name in members:
            with z.open(yml_name) as f:
                codeql_content = yaml.safe_load(f)
        if json_name in members:
            with z.open(json_name) as f:
                meta_content = json.load(f)
    return codeql_content, meta_content

# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End: