Collect CodeQL database information from the file system and save as CSV

This collection already provides significant meta-information:

    ctime : str = '2024-05-13T12:04:01.593586'
    language : str = 'cpp'
    name : str = 'nanobind'
    owner : str = 'wjakob'
    path : Path = Path('/Users/hohn/work-gh/mrva/mrva-open-source-download/repos/wjakob/nanobind/code-scanning/codeql/databases/cpp/db.zip')
    size : int = 63083064

There is more metadata inside the db.zip files; extracting it remains to be done.
This commit is contained in:
Michael Hohn
2024-07-22 11:07:00 -07:00
committed by Michael Hohn
parent 6b4e753e69
commit d64522d168
5 changed files with 115 additions and 102 deletions

View File

@@ -0,0 +1,62 @@
#* Experimental work with utils.py, to be merged into it.
from utils import *
#* Data collection
# Get the db information in list of DBInfo form
db_base = "~/work-gh/mrva/mrva-open-source-download/"
dbs = list(collect_dbs(db_base))
# XX: add metadata
# codeql, meta = extract_metadata('path_to_your_zipfile.zip')
# print(codeql)
# print(meta)
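# A minimal sketch of what extract_metadata could do, assuming each db.zip
# carries a codeql-database.yml somewhere in the archive (file name and
# layout are assumptions here, not verified against the actual zips):
# def extract_metadata(zip_path):
#     with zipfile.ZipFile(zip_path) as zf:
#         yml_name = next(n for n in zf.namelist()
#                         if n.endswith('codeql-database.yml'))
#         meta = yaml.safe_load(zf.read(yml_name))
#     return meta.get('primaryLanguage'), meta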
# Inspect:
from pprint import pprint
pprint(["len", len(dbs)])
pprint(["dbs[0]", dbs[0].__dict__])
pprint(["dbs[-1]", dbs[-1].__dict__])
#
# Get a dataframe
dbdf = pd.DataFrame([d.__dict__ for d in dbs])
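# Expected columns, one per DBInfo field: ctime, language, name, owner, path, size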
#
#* Experiments with on-disk formats
# Continue using the raw information in a separate session.
#
# PosixPath is a problem for json and parquet
#
dbdf['path'] = dbdf['path'].astype(str)
#
dbdf.to_csv('dbdf.csv')
#
dbdf.to_csv('dbdf.csv.gz', compression='gzip', index=False)
#
dbdf.to_json('dbdf.json')
#
# dbdf.to_hdf('dbdf.h5', key='dbdf', mode='w')
#
# fast, binary
dbdf.to_parquet('dbdf.parquet')
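# Optional round-trip check (pyarrow is in requirements.txt); after the
# str conversion above this should compare equal:
# pd.read_parquet('dbdf.parquet').equals(dbdf)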
#
# fast
import sqlite3
conn = sqlite3.connect('dbdf.db')
dbdf.to_sql('qldbs', conn, if_exists='replace', index=False)
conn.close()
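# Read-back sketch for a later session:
# conn = sqlite3.connect('dbdf.db')
# dbdf_back = pd.read_sql_query('SELECT * FROM qldbs', conn)
# conn.close()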
#
# Sizes:
# ls -laSr dbdf.*
# -rw-r--r--@ 1 hohn staff 101390 Jul 12 14:17 dbdf.csv.gz
# -rw-r--r--@ 1 hohn staff 202712 Jul 12 14:17 dbdf.parquet
# -rw-r--r--@ 1 hohn staff 560623 Jul 12 14:17 dbdf.csv
# -rw-r--r--@ 1 hohn staff 610304 Jul 12 14:17 dbdf.db
# -rw-r--r--@ 1 hohn staff 735097 Jul 12 14:17 dbdf.json
#
# parquet has libraries in many languages, including Go: xitongsys/parquet-go
# https://parquet.apache.org/
#
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -0,0 +1,28 @@
# Experimental work with utils.py, to be merged into it.
from utils import *
from pprint import pprint
#* Reload gzipped CSV file to continue work
dbdf_1 = pd.read_csv('dbdf.csv.gz', compression='gzip')
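# The path column was saved as plain strings; to get Path objects back
# (only needed if later code expects them):
# dbdf_1['path'] = dbdf_1['path'].map(Path)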
#
# (old) Consistency check:
# dbdf_1.columns == dbdf.columns
# dbmask = (dbdf_1 != dbdf)
# dbdf_1[dbmask]
# dbdf_1[dbmask].dropna(how='all')
# ctime_raw is different in places, so don't use it.
#
#* Interact with/visualize the dataframe
# Using pandasgui -- qt
from pandasgui import show
os.environ['APPDATA'] = "needed-for-pandasgui"
show(dbdf_1)
# Using dtale -- web
import dtale
dtale.show(dbdf_1)
#
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -1,100 +0,0 @@
#* Interactive use only
# Experimental work with utils.py, to be merged into it.
if 0:
    from utils import *
#* Data collection
# Get the db information in list of DBInfo form
db_base = "~/work-gh/mrva/mrva-open-source-download/"
dbs = list(collect_dbs(db_base))
# XX: add metadata
# codeql, meta = extract_metadata('path_to_your_zipfile.zip')
# print(codeql)
# print(meta)
# Inspect:
from pprint import pprint
pprint(["len", len(dbs)])
pprint(["dbs[0]", dbs[0].__dict__])
#
# Get a dataframe
dbdf = pd.DataFrame([d.__dict__ for d in dbs])
#
# XX: save to disk, continue use in separate session
#
# PosixPath is a problem for json and parquet:
#
dbdf['path'] = dbdf['path'].astype(str)
#
dbdf.to_csv('dbdf.csv')
#
dbdf.to_csv('dbdf.csv.gz', compression='gzip', index=False)
#
dbdf.to_json('dbdf.json')
#
# dbdf.to_hdf('dbdf.h5', key='dbdf', mode='w')
#
# fast, binary
dbdf.to_parquet('dbdf.parquet')
#
# fast
import sqlite3
conn = sqlite3.connect('dbdf.db')
dbdf.to_sql('qldbs', conn, if_exists='replace', index=False)
conn.close()
#
# Sizes:
# ls -laSr dbdf.*
# -rw-r--r--@ 1 hohn staff 101390 Jul 12 14:17 dbdf.csv.gz
# -rw-r--r--@ 1 hohn staff 202712 Jul 12 14:17 dbdf.parquet
# -rw-r--r--@ 1 hohn staff 560623 Jul 12 14:17 dbdf.csv
# -rw-r--r--@ 1 hohn staff 610304 Jul 12 14:17 dbdf.db
# -rw-r--r--@ 1 hohn staff 735097 Jul 12 14:17 dbdf.json
#
# parquet has many libraries, including go: xitongsys/parquet-go
# https://parquet.apache.org/
#
# Reload to continue work
dbdf_1 = pd.read_csv('dbdf.csv.gz', compression='gzip')
#
# Consistency check:
dbdf_1.columns == dbdf.columns
dbmask = (dbdf_1 != dbdf)
dbdf_1[dbmask]
dbdf_1[dbmask].dropna(how='all')
# ctime_raw is different in places, so don't use it.
#
# Interact with/visualize the dataframe
os.environ['APPDATA'] = "needed-for-pandasgui"
from pandasgui import show
show(dbdf)
show(cmp)
#
import dtale
dtale.show(dbdf)
#
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
import pandas as pd
# Example large DataFrame
data = {
'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
'age': [25, 30, 35, 40, 22],
'city': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
}
large_df = pd.DataFrame(data)
# Create a boolean mask: select rows where age is greater than 30
mask = large_df['age'] > 30
# Apply the boolean mask to get the smaller DataFrame
small_df = large_df[mask]
print(small_df)

View File

@@ -6,12 +6,14 @@
# 4. creation date
# 5. db size
#* Imports
import pandas as pd
from dataclasses import dataclass
from pathlib import Path
import datetime
import json
import logging
import os
import pandas as pd
import time
import yaml
import zipfile
@@ -40,8 +42,15 @@ def traverse_tree(root):
    pass
# Collect information in one 'struct'
@dataclass
class DBInfo:
    ctime : str = '2024-05-13T12:04:01.593586'
    language : str = 'cpp'
    name : str = 'nanobind'
    owner : str = 'wjakob'
    path : Path = Path('/Users/hohn/work-gh/mrva/mrva-open-source-download/repos/wjakob/nanobind/code-scanning/codeql/databases/cpp/db.zip')
    size : int = 63083064
def collect_dbs(db_base):
    for path in traverse_tree(db_base):
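A hedged sketch of how collect_dbs might fill the DBInfo fields, assuming
the layout repos/<owner>/<name>/code-scanning/codeql/databases/<language>/db.zip
seen in the example path above, and that traverse_tree yields Path objects;
the committed implementation is truncated in this hunk:

    def collect_dbs(db_base):
        for path in traverse_tree(db_base):
            if path.name != 'db.zip':
                continue
            parts = path.parts
            i = parts.index('repos')          # .../repos/<owner>/<name>/...
            st = path.stat()
            yield DBInfo(
                ctime=datetime.datetime.fromtimestamp(st.st_ctime).isoformat(),
                language=path.parent.name,    # .../databases/<language>/db.zip
                name=parts[i + 2],
                owner=parts[i + 1],
                path=path,
                size=st.st_size,
            )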

View File

@@ -10,7 +10,9 @@ async-lru==2.0.4
attrs==23.2.0
Babel==2.15.0
beautifulsoup4==4.12.3
black==24.4.2
bleach==6.1.0
blosc2==2.7.0
Brotli==1.1.0
certifi==2024.7.4
cffi==1.16.0
@@ -29,10 +31,12 @@ dash_daq==0.5.0
debugpy==1.8.2
decorator==5.1.1
defusedxml==0.7.1
distlib==0.3.8
dtale==3.13.1
et-xmlfile==1.1.0
executing==2.0.1
fastjsonschema==2.20.0
filelock==3.15.4
Flask==2.2.5
Flask-Compress==1.15
flask-ngrok==0.0.25
@@ -50,6 +54,7 @@ ipython==8.26.0
ipython-genutils==0.2.0
ipywidgets==8.1.3
isoduration==20.11.0
isort==5.13.2
itsdangerous==2.2.0
jedi==0.19.1
Jinja2==3.1.4
@@ -77,12 +82,17 @@ matplotlib==3.9.1
matplotlib-inline==0.1.7
missingno==0.5.2
mistune==3.0.2
msgpack==1.0.8
mypy==1.11.0
mypy-extensions==1.0.0
nbclient==0.10.0
nbconvert==7.16.4
nbformat==5.10.4
ndindex==1.8
nest-asyncio==1.6.0
networkx==3.3
notebook_shim==0.2.4
numexpr==2.10.1
numpy==2.0.0
openpyxl==3.1.5
overrides==7.7.0
@@ -91,6 +101,7 @@ pandas==2.2.2
pandasgui==0.2.14
pandocfilters==1.5.1
parso==0.8.4
pathspec==0.12.1
patsy==0.5.6
pexpect==4.9.0
pillow==10.4.0
@@ -101,6 +112,7 @@ prompt_toolkit==3.0.47
psutil==6.0.0
ptyprocess==0.7.0
pure-eval==0.2.2
py-cpuinfo==9.0.0
pyarrow==16.1.0
pycparser==2.22
Pygments==2.18.0
@@ -139,6 +151,7 @@ squarify==0.4.3
stack-data==0.6.3
statsmodels==0.14.2
strsimpy==0.2.1
tables==3.9.2
tenacity==8.5.0
terminado==0.18.1
threadpoolctl==3.5.0
@@ -150,6 +163,7 @@ typing_extensions==4.12.2
tzdata==2024.1
uri-template==1.3.0
urllib3==2.2.2
virtualenv==20.26.3
wcwidth==0.2.13
webcolors==24.6.0
webencodings==0.5.1