From d64522d16867a23088bf726641c8235506de2ce3 Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Mon, 22 Jul 2024 11:07:00 -0700 Subject: [PATCH] Collect CodeQL database information from the file system and save as CSV This collection already provides significant meta-information: ctime : str = '2024-05-13T12:04:01.593586' language : str = 'cpp' name : str = 'nanobind' owner : str = 'wjakob' path : Path = Path('/Users/hohn/work-gh/mrva/mrva-open-source-download/repos/wjakob/nanobind/code-scanning/codeql/databases/cpp/db.zip') size : int = 63083064 More metadata is available inside the db.zip files and will be added later. --- client/qldbtools/qldbtools/session1.py | 62 +++++++++++++++ client/qldbtools/qldbtools/session2.py | 28 +++++++ client/qldbtools/qldbtools/utils-dev.py | 100 ------------------------ client/qldbtools/qldbtools/utils.py | 13 ++- client/qldbtools/requirements.txt | 14 ++++ 5 files changed, 115 insertions(+), 102 deletions(-) create mode 100644 client/qldbtools/qldbtools/session1.py create mode 100644 client/qldbtools/qldbtools/session2.py delete mode 100644 client/qldbtools/qldbtools/utils-dev.py diff --git a/client/qldbtools/qldbtools/session1.py b/client/qldbtools/qldbtools/session1.py new file mode 100644 index 0000000..3a893fe --- /dev/null +++ b/client/qldbtools/qldbtools/session1.py @@ -0,0 +1,62 @@ +#* Experimental work with utils.py, to be merged into it. +from utils import * + +#* Data collection +# Get the db information in list of DBInfo form +db_base = "~/work-gh/mrva/mrva-open-source-download/" +dbs = list(collect_dbs(db_base)) + +# XX: add metadata +# codeql, meta = extract_metadata('path_to_your_zipfile.zip') +# print(codeql) +# print(meta) + +# Inspect: +from pprint import pprint +pprint(["len", len(dbs)]) +pprint(["dbs[0]", dbs[0].__dict__]) +pprint(["dbs[-1]", dbs[-1].__dict__]) +# +# Get a dataframe +dbdf = pd.DataFrame([d.__dict__ for d in dbs]) +# +#* Experiments with on-disk format +# Continue use of raw information in separate session. 
+# +# PosixPath is a problem for json and parquet +# +dbdf['path'] = dbdf['path'].astype(str) +# +dbdf.to_csv('dbdf.csv') +# +dbdf.to_csv('dbdf.csv.gz', compression='gzip', index=False) +# +dbdf.to_json('dbdf.json') +# +# dbdf.to_hdf('dbdf.h5', key='dbdf', mode='w') +# +# fast, binary +dbdf.to_parquet('dbdf.parquet') +# +# fast +import sqlite3 +conn = sqlite3.connect('dbdf.db') +dbdf.to_sql('qldbs', conn, if_exists='replace', index=False) +conn.close() +# +# Sizes: +# ls -laSr dbdf.* +# -rw-r--r--@ 1 hohn staff 101390 Jul 12 14:17 dbdf.csv.gz +# -rw-r--r--@ 1 hohn staff 202712 Jul 12 14:17 dbdf.parquet +# -rw-r--r--@ 1 hohn staff 560623 Jul 12 14:17 dbdf.csv +# -rw-r--r--@ 1 hohn staff 610304 Jul 12 14:17 dbdf.db +# -rw-r--r--@ 1 hohn staff 735097 Jul 12 14:17 dbdf.json +# +# parquet has many libraries, including go: xitongsys/parquet-go +# https://parquet.apache.org/ +# + + +# Local Variables: +# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/" +# End: diff --git a/client/qldbtools/qldbtools/session2.py b/client/qldbtools/qldbtools/session2.py new file mode 100644 index 0000000..68408fa --- /dev/null +++ b/client/qldbtools/qldbtools/session2.py @@ -0,0 +1,28 @@ +# Experimental work with utils.py, to be merged into it. +from utils import * +from pprint import pprint + +#* Reload gzipped CSV file to continue work +dbdf_1 = pd.read_csv('dbdf.csv.gz', compression='gzip') +# +# (old) Consistency check: +# dbdf_1.columns == dbdf.columns +# dbmask = (dbdf_1 != dbdf) +# dbdf_1[dbmask] +# dbdf_1[dbmask].dropna(how='all') +# ctime_raw is different in places, so don't use it. 
+ +# +#* Interact with/visualize the dataframe +# Using pandasgui -- qt +from pandasgui import show +os.environ['APPDATA'] = "needed-for-pandasgui" +show(dbdf_1) +# Using dtale -- web +import dtale +dtale.show(dbdf_1) +# + +# Local Variables: +# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/" +# End: diff --git a/client/qldbtools/qldbtools/utils-dev.py b/client/qldbtools/qldbtools/utils-dev.py deleted file mode 100644 index 77829e3..0000000 --- a/client/qldbtools/qldbtools/utils-dev.py +++ /dev/null @@ -1,100 +0,0 @@ -#* Interactive use only -# Experimental work with utils.py, to be merged into it. -if 0: - from utils import * - - #* Data collection - # Get the db information in list of DBInfo form - db_base = "~/work-gh/mrva/mrva-open-source-download/" - dbs = list(collect_dbs(db_base)) - - # XX: add metadata - # codeql, meta = extract_metadata('path_to_your_zipfile.zip') - # print(codeql) - # print(meta) - - # Inspect: - from pprint import pprint - pprint(["len", len(dbs)]) - pprint(["dbs[0]", dbs[0].__dict__]) - # - # Get a dataframe - dbdf = pd.DataFrame([d.__dict__ for d in dbs]) - # - # XX: save to disk, continue use in separate session - # - # PosixPath is a problem for json and parquet: - # - dbdf['path'] = dbdf['path'].astype(str) - # - dbdf.to_csv('dbdf.csv') - # - dbdf.to_csv('dbdf.csv.gz', compression='gzip', index=False) - # - dbdf.to_json('dbdf.json') - # - # dbdf.to_hdf('dbdf.h5', key='dbdf', mode='w') - # - # fast, binary - dbdf.to_parquet('dbdf.parquet') - # - # fast - import sqlite3 - conn = sqlite3.connect('dbdf.db') - dbdf.to_sql('qldbs', conn, if_exists='replace', index=False) - conn.close() - # - # Sizes: - # ls -laSr dbdf.* - # -rw-r--r--@ 1 hohn staff 101390 Jul 12 14:17 dbdf.csv.gz - # -rw-r--r--@ 1 hohn staff 202712 Jul 12 14:17 dbdf.parquet - # -rw-r--r--@ 1 hohn staff 560623 Jul 12 14:17 dbdf.csv - # -rw-r--r--@ 1 hohn staff 610304 Jul 12 14:17 dbdf.db - # -rw-r--r--@ 1 hohn staff 735097 Jul 12 14:17 
dbdf.json - # - # parquet has many libraries, including go: xitongsys/parquet-go - # https://parquet.apache.org/ - # - # Reload to continue work - dbdf_1 = pd.read_csv('dbdf.csv.gz', compression='gzip') - # - # Consistency check: - dbdf_1.columns == dbdf.columns - dbmask = (dbdf_1 != dbdf) - dbdf_1[dbmask] - dbdf_1[dbmask].dropna(how='all') - # ctime_raw is different in places, so don't use it. - - # - # Interact with/visualize the dataframe - os.environ['APPDATA'] = "needed-for-pandasgui" - from pandasgui import show - show(dbdf) - show(cmp) - # - import dtale - dtale.show(dbdf) - # - -# Local Variables: -# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/" -# End: - - -import pandas as pd - -# Example large DataFrame -data = { - 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'], - 'age': [25, 30, 35, 40, 22], - 'city': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'] -} -large_df = pd.DataFrame(data) - -# Create a boolean mask: select rows where age is greater than 30 -mask = large_df['age'] > 30 - -# Apply the boolean mask to get the smaller DataFrame -small_df = large_df[mask] - -print(small_df) diff --git a/client/qldbtools/qldbtools/utils.py b/client/qldbtools/qldbtools/utils.py index cdf0478..4ffb55d 100644 --- a/client/qldbtools/qldbtools/utils.py +++ b/client/qldbtools/qldbtools/utils.py @@ -6,12 +6,14 @@ # 4. creation date # 5. 
db size #* Imports -import pandas as pd +from dataclasses import dataclass from pathlib import Path + import datetime import json import logging import os +import pandas as pd import time import yaml import zipfile @@ -40,8 +42,15 @@ def traverse_tree(root): pass # Collect information in one 'struct' +@dataclass class DBInfo: - pass + ctime : str = '2024-05-13T12:04:01.593586' + language : str = 'cpp' + name : str = 'nanobind' + owner : str = 'wjakob' + path : Path = Path('/Users/hohn/work-gh/mrva/mrva-open-source-download/repos/wjakob/nanobind/code-scanning/codeql/databases/cpp/db.zip') + size : int = 63083064 + def collect_dbs(db_base): for path in traverse_tree(db_base): diff --git a/client/qldbtools/requirements.txt b/client/qldbtools/requirements.txt index b118cc3..30a0e60 100644 --- a/client/qldbtools/requirements.txt +++ b/client/qldbtools/requirements.txt @@ -10,7 +10,9 @@ async-lru==2.0.4 attrs==23.2.0 Babel==2.15.0 beautifulsoup4==4.12.3 +black==24.4.2 bleach==6.1.0 +blosc2==2.7.0 Brotli==1.1.0 certifi==2024.7.4 cffi==1.16.0 @@ -29,10 +31,12 @@ dash_daq==0.5.0 debugpy==1.8.2 decorator==5.1.1 defusedxml==0.7.1 +distlib==0.3.8 dtale==3.13.1 et-xmlfile==1.1.0 executing==2.0.1 fastjsonschema==2.20.0 +filelock==3.15.4 Flask==2.2.5 Flask-Compress==1.15 flask-ngrok==0.0.25 @@ -50,6 +54,7 @@ ipython==8.26.0 ipython-genutils==0.2.0 ipywidgets==8.1.3 isoduration==20.11.0 +isort==5.13.2 itsdangerous==2.2.0 jedi==0.19.1 Jinja2==3.1.4 @@ -77,12 +82,17 @@ matplotlib==3.9.1 matplotlib-inline==0.1.7 missingno==0.5.2 mistune==3.0.2 +msgpack==1.0.8 +mypy==1.11.0 +mypy-extensions==1.0.0 nbclient==0.10.0 nbconvert==7.16.4 nbformat==5.10.4 +ndindex==1.8 nest-asyncio==1.6.0 networkx==3.3 notebook_shim==0.2.4 +numexpr==2.10.1 numpy==2.0.0 openpyxl==3.1.5 overrides==7.7.0 @@ -91,6 +101,7 @@ pandas==2.2.2 pandasgui==0.2.14 pandocfilters==1.5.1 parso==0.8.4 +pathspec==0.12.1 patsy==0.5.6 pexpect==4.9.0 pillow==10.4.0 @@ -101,6 +112,7 @@ prompt_toolkit==3.0.47 psutil==6.0.0 
ptyprocess==0.7.0 pure-eval==0.2.2 +py-cpuinfo==9.0.0 pyarrow==16.1.0 pycparser==2.22 Pygments==2.18.0 @@ -139,6 +151,7 @@ squarify==0.4.3 stack-data==0.6.3 statsmodels==0.14.2 strsimpy==0.2.1 +tables==3.9.2 tenacity==8.5.0 terminado==0.18.1 threadpoolctl==3.5.0 @@ -150,6 +163,7 @@ typing_extensions==4.12.2 tzdata==2024.1 uri-template==1.3.0 urllib3==2.2.2 +virtualenv==20.26.3 wcwidth==0.2.13 webcolors==24.6.0 webencodings==0.5.1