Collect CodeQL database information from the file system and save as CSV
The collection step already provides significant metadata for each database, for example:
ctime : str = '2024-05-13T12:04:01.593586'
language : str = 'cpp'
name : str = 'nanobind'
owner : str = 'wjakob'
path : Path = Path('/Users/hohn/work-gh/mrva/mrva-open-source-download/repos/wjakob/nanobind/code-scanning/codeql/databases/cpp/db.zip')
size : int = 63083064
The db.zip files contain further metadata, which is still to be added.
This commit is contained in:
committed by
=Michael Hohn
parent
6b4e753e69
commit
d64522d168
62
client/qldbtools/qldbtools/session1.py
Normal file
62
client/qldbtools/qldbtools/session1.py
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
#* Experimental work with utils.py, to be merged into it.
|
||||||
|
from utils import *
|
||||||
|
|
||||||
|
#* Data collection
|
||||||
|
# Get the db information in list of DBInfo form
|
||||||
|
db_base = "~/work-gh/mrva/mrva-open-source-download/"
|
||||||
|
dbs = list(collect_dbs(db_base))
|
||||||
|
|
||||||
|
# XX: add metadata
|
||||||
|
# codeql, meta = extract_metadata('path_to_your_zipfile.zip')
|
||||||
|
# print(codeql)
|
||||||
|
# print(meta)
|
||||||
|
|
||||||
|
# Inspect:
|
||||||
|
from pprint import pprint
|
||||||
|
pprint(["len", len(dbs)])
|
||||||
|
pprint(["dbs[0]", dbs[0].__dict__])
|
||||||
|
pprint(["dbs[-1]", dbs[-1].__dict__])
|
||||||
|
#
|
||||||
|
# Get a dataframe
|
||||||
|
dbdf = pd.DataFrame([d.__dict__ for d in dbs])
|
||||||
|
#
|
||||||
|
#* Experiments with on-disk format
|
||||||
|
# Continue using the raw information in a separate session.
|
||||||
|
#
|
||||||
|
# PosixPath is a problem for json and parquet
|
||||||
|
#
|
||||||
|
dbdf['path'] = dbdf['path'].astype(str)
|
||||||
|
#
|
||||||
|
dbdf.to_csv('dbdf.csv')
|
||||||
|
#
|
||||||
|
dbdf.to_csv('dbdf.csv.gz', compression='gzip', index=False)
|
||||||
|
#
|
||||||
|
dbdf.to_json('dbdf.json')
|
||||||
|
#
|
||||||
|
# dbdf.to_hdf('dbdf.h5', key='dbdf', mode='w')
|
||||||
|
#
|
||||||
|
# fast, binary
|
||||||
|
dbdf.to_parquet('dbdf.parquet')
|
||||||
|
#
|
||||||
|
# fast
|
||||||
|
import sqlite3
|
||||||
|
conn = sqlite3.connect('dbdf.db')
|
||||||
|
dbdf.to_sql('qldbs', conn, if_exists='replace', index=False)
|
||||||
|
conn.close()
|
||||||
|
#
|
||||||
|
# Sizes:
|
||||||
|
# ls -laSr dbdf.*
|
||||||
|
# -rw-r--r--@ 1 hohn staff 101390 Jul 12 14:17 dbdf.csv.gz
|
||||||
|
# -rw-r--r--@ 1 hohn staff 202712 Jul 12 14:17 dbdf.parquet
|
||||||
|
# -rw-r--r--@ 1 hohn staff 560623 Jul 12 14:17 dbdf.csv
|
||||||
|
# -rw-r--r--@ 1 hohn staff 610304 Jul 12 14:17 dbdf.db
|
||||||
|
# -rw-r--r--@ 1 hohn staff 735097 Jul 12 14:17 dbdf.json
|
||||||
|
#
|
||||||
|
# parquet has many libraries, including go: xitongsys/parquet-go
|
||||||
|
# https://parquet.apache.org/
|
||||||
|
#
|
||||||
|
|
||||||
|
|
||||||
|
# Local Variables:
|
||||||
|
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
|
||||||
|
# End:
|
||||||
28
client/qldbtools/qldbtools/session2.py
Normal file
28
client/qldbtools/qldbtools/session2.py
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
# Experimental work with utils.py, to be merged into it.
|
||||||
|
from utils import *
|
||||||
|
from pprint import pprint
|
||||||
|
|
||||||
|
#* Reload gzipped CSV file to continue work
|
||||||
|
dbdf_1 = pd.read_csv('dbdf.csv.gz', compression='gzip')
|
||||||
|
#
|
||||||
|
# (old) Consistency check:
|
||||||
|
# dbdf_1.columns == dbdf.columns
|
||||||
|
# dbmask = (dbdf_1 != dbdf)
|
||||||
|
# dbdf_1[dbmask]
|
||||||
|
# dbdf_1[dbmask].dropna(how='all')
|
||||||
|
# ctime_raw is different in places, so don't use it.
|
||||||
|
|
||||||
|
#
|
||||||
|
#* Interact with/visualize the dataframe
|
||||||
|
# Using pandasgui -- qt
|
||||||
|
from pandasgui import show
|
||||||
|
os.environ['APPDATA'] = "needed-for-pandasgui"
|
||||||
|
show(dbdf_1)
|
||||||
|
# Using dtale -- web
|
||||||
|
import dtale
|
||||||
|
dtale.show(dbdf_1)
|
||||||
|
#
|
||||||
|
|
||||||
|
# Local Variables:
|
||||||
|
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
|
||||||
|
# End:
|
||||||
@@ -1,100 +0,0 @@
|
|||||||
#* Interactive use only
|
|
||||||
# Experimental work with utils.py, to be merged into it.
|
|
||||||
if 0:
|
|
||||||
from utils import *
|
|
||||||
|
|
||||||
#* Data collection
|
|
||||||
# Get the db information in list of DBInfo form
|
|
||||||
db_base = "~/work-gh/mrva/mrva-open-source-download/"
|
|
||||||
dbs = list(collect_dbs(db_base))
|
|
||||||
|
|
||||||
# XX: add metadata
|
|
||||||
# codeql, meta = extract_metadata('path_to_your_zipfile.zip')
|
|
||||||
# print(codeql)
|
|
||||||
# print(meta)
|
|
||||||
|
|
||||||
# Inspect:
|
|
||||||
from pprint import pprint
|
|
||||||
pprint(["len", len(dbs)])
|
|
||||||
pprint(["dbs[0]", dbs[0].__dict__])
|
|
||||||
#
|
|
||||||
# Get a dataframe
|
|
||||||
dbdf = pd.DataFrame([d.__dict__ for d in dbs])
|
|
||||||
#
|
|
||||||
# XX: save to disk, continue use in separate session
|
|
||||||
#
|
|
||||||
# PosixPath is a problem for json and parquet:
|
|
||||||
#
|
|
||||||
dbdf['path'] = dbdf['path'].astype(str)
|
|
||||||
#
|
|
||||||
dbdf.to_csv('dbdf.csv')
|
|
||||||
#
|
|
||||||
dbdf.to_csv('dbdf.csv.gz', compression='gzip', index=False)
|
|
||||||
#
|
|
||||||
dbdf.to_json('dbdf.json')
|
|
||||||
#
|
|
||||||
# dbdf.to_hdf('dbdf.h5', key='dbdf', mode='w')
|
|
||||||
#
|
|
||||||
# fast, binary
|
|
||||||
dbdf.to_parquet('dbdf.parquet')
|
|
||||||
#
|
|
||||||
# fast
|
|
||||||
import sqlite3
|
|
||||||
conn = sqlite3.connect('dbdf.db')
|
|
||||||
dbdf.to_sql('qldbs', conn, if_exists='replace', index=False)
|
|
||||||
conn.close()
|
|
||||||
#
|
|
||||||
# Sizes:
|
|
||||||
# ls -laSr dbdf.*
|
|
||||||
# -rw-r--r--@ 1 hohn staff 101390 Jul 12 14:17 dbdf.csv.gz
|
|
||||||
# -rw-r--r--@ 1 hohn staff 202712 Jul 12 14:17 dbdf.parquet
|
|
||||||
# -rw-r--r--@ 1 hohn staff 560623 Jul 12 14:17 dbdf.csv
|
|
||||||
# -rw-r--r--@ 1 hohn staff 610304 Jul 12 14:17 dbdf.db
|
|
||||||
# -rw-r--r--@ 1 hohn staff 735097 Jul 12 14:17 dbdf.json
|
|
||||||
#
|
|
||||||
# parquet has many libraries, including go: xitongsys/parquet-go
|
|
||||||
# https://parquet.apache.org/
|
|
||||||
#
|
|
||||||
# Reload to continue work
|
|
||||||
dbdf_1 = pd.read_csv('dbdf.csv.gz', compression='gzip')
|
|
||||||
#
|
|
||||||
# Consistency check:
|
|
||||||
dbdf_1.columns == dbdf.columns
|
|
||||||
dbmask = (dbdf_1 != dbdf)
|
|
||||||
dbdf_1[dbmask]
|
|
||||||
dbdf_1[dbmask].dropna(how='all')
|
|
||||||
# ctime_raw is different in places, so don't use it.
|
|
||||||
|
|
||||||
#
|
|
||||||
# Interact with/visualize the dataframe
|
|
||||||
os.environ['APPDATA'] = "needed-for-pandasgui"
|
|
||||||
from pandasgui import show
|
|
||||||
show(dbdf)
|
|
||||||
show(cmp)
|
|
||||||
#
|
|
||||||
import dtale
|
|
||||||
dtale.show(dbdf)
|
|
||||||
#
|
|
||||||
|
|
||||||
# Local Variables:
|
|
||||||
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
|
|
||||||
# End:
|
|
||||||
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
# Example large DataFrame
|
|
||||||
data = {
|
|
||||||
'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
|
|
||||||
'age': [25, 30, 35, 40, 22],
|
|
||||||
'city': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
|
|
||||||
}
|
|
||||||
large_df = pd.DataFrame(data)
|
|
||||||
|
|
||||||
# Create a boolean mask: select rows where age is greater than 30
|
|
||||||
mask = large_df['age'] > 30
|
|
||||||
|
|
||||||
# Apply the boolean mask to get the smaller DataFrame
|
|
||||||
small_df = large_df[mask]
|
|
||||||
|
|
||||||
print(small_df)
|
|
||||||
@@ -6,12 +6,14 @@
|
|||||||
# 4. creation date
|
# 4. creation date
|
||||||
# 5. db size
|
# 5. db size
|
||||||
#* Imports
|
#* Imports
|
||||||
import pandas as pd
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import pandas as pd
|
||||||
import time
|
import time
|
||||||
import yaml
|
import yaml
|
||||||
import zipfile
|
import zipfile
|
||||||
@@ -40,8 +42,15 @@ def traverse_tree(root):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
# Collect information in one 'struct'
|
# Collect information in one 'struct'
|
||||||
|
@dataclass
|
||||||
class DBInfo:
|
class DBInfo:
|
||||||
pass
|
ctime : str = '2024-05-13T12:04:01.593586'
|
||||||
|
language : str = 'cpp'
|
||||||
|
name : str = 'nanobind'
|
||||||
|
owner : str = 'wjakob'
|
||||||
|
path : Path = Path('/Users/hohn/work-gh/mrva/mrva-open-source-download/repos/wjakob/nanobind/code-scanning/codeql/databases/cpp/db.zip')
|
||||||
|
size : int = 63083064
|
||||||
|
|
||||||
|
|
||||||
def collect_dbs(db_base):
|
def collect_dbs(db_base):
|
||||||
for path in traverse_tree(db_base):
|
for path in traverse_tree(db_base):
|
||||||
|
|||||||
@@ -10,7 +10,9 @@ async-lru==2.0.4
|
|||||||
attrs==23.2.0
|
attrs==23.2.0
|
||||||
Babel==2.15.0
|
Babel==2.15.0
|
||||||
beautifulsoup4==4.12.3
|
beautifulsoup4==4.12.3
|
||||||
|
black==24.4.2
|
||||||
bleach==6.1.0
|
bleach==6.1.0
|
||||||
|
blosc2==2.7.0
|
||||||
Brotli==1.1.0
|
Brotli==1.1.0
|
||||||
certifi==2024.7.4
|
certifi==2024.7.4
|
||||||
cffi==1.16.0
|
cffi==1.16.0
|
||||||
@@ -29,10 +31,12 @@ dash_daq==0.5.0
|
|||||||
debugpy==1.8.2
|
debugpy==1.8.2
|
||||||
decorator==5.1.1
|
decorator==5.1.1
|
||||||
defusedxml==0.7.1
|
defusedxml==0.7.1
|
||||||
|
distlib==0.3.8
|
||||||
dtale==3.13.1
|
dtale==3.13.1
|
||||||
et-xmlfile==1.1.0
|
et-xmlfile==1.1.0
|
||||||
executing==2.0.1
|
executing==2.0.1
|
||||||
fastjsonschema==2.20.0
|
fastjsonschema==2.20.0
|
||||||
|
filelock==3.15.4
|
||||||
Flask==2.2.5
|
Flask==2.2.5
|
||||||
Flask-Compress==1.15
|
Flask-Compress==1.15
|
||||||
flask-ngrok==0.0.25
|
flask-ngrok==0.0.25
|
||||||
@@ -50,6 +54,7 @@ ipython==8.26.0
|
|||||||
ipython-genutils==0.2.0
|
ipython-genutils==0.2.0
|
||||||
ipywidgets==8.1.3
|
ipywidgets==8.1.3
|
||||||
isoduration==20.11.0
|
isoduration==20.11.0
|
||||||
|
isort==5.13.2
|
||||||
itsdangerous==2.2.0
|
itsdangerous==2.2.0
|
||||||
jedi==0.19.1
|
jedi==0.19.1
|
||||||
Jinja2==3.1.4
|
Jinja2==3.1.4
|
||||||
@@ -77,12 +82,17 @@ matplotlib==3.9.1
|
|||||||
matplotlib-inline==0.1.7
|
matplotlib-inline==0.1.7
|
||||||
missingno==0.5.2
|
missingno==0.5.2
|
||||||
mistune==3.0.2
|
mistune==3.0.2
|
||||||
|
msgpack==1.0.8
|
||||||
|
mypy==1.11.0
|
||||||
|
mypy-extensions==1.0.0
|
||||||
nbclient==0.10.0
|
nbclient==0.10.0
|
||||||
nbconvert==7.16.4
|
nbconvert==7.16.4
|
||||||
nbformat==5.10.4
|
nbformat==5.10.4
|
||||||
|
ndindex==1.8
|
||||||
nest-asyncio==1.6.0
|
nest-asyncio==1.6.0
|
||||||
networkx==3.3
|
networkx==3.3
|
||||||
notebook_shim==0.2.4
|
notebook_shim==0.2.4
|
||||||
|
numexpr==2.10.1
|
||||||
numpy==2.0.0
|
numpy==2.0.0
|
||||||
openpyxl==3.1.5
|
openpyxl==3.1.5
|
||||||
overrides==7.7.0
|
overrides==7.7.0
|
||||||
@@ -91,6 +101,7 @@ pandas==2.2.2
|
|||||||
pandasgui==0.2.14
|
pandasgui==0.2.14
|
||||||
pandocfilters==1.5.1
|
pandocfilters==1.5.1
|
||||||
parso==0.8.4
|
parso==0.8.4
|
||||||
|
pathspec==0.12.1
|
||||||
patsy==0.5.6
|
patsy==0.5.6
|
||||||
pexpect==4.9.0
|
pexpect==4.9.0
|
||||||
pillow==10.4.0
|
pillow==10.4.0
|
||||||
@@ -101,6 +112,7 @@ prompt_toolkit==3.0.47
|
|||||||
psutil==6.0.0
|
psutil==6.0.0
|
||||||
ptyprocess==0.7.0
|
ptyprocess==0.7.0
|
||||||
pure-eval==0.2.2
|
pure-eval==0.2.2
|
||||||
|
py-cpuinfo==9.0.0
|
||||||
pyarrow==16.1.0
|
pyarrow==16.1.0
|
||||||
pycparser==2.22
|
pycparser==2.22
|
||||||
Pygments==2.18.0
|
Pygments==2.18.0
|
||||||
@@ -139,6 +151,7 @@ squarify==0.4.3
|
|||||||
stack-data==0.6.3
|
stack-data==0.6.3
|
||||||
statsmodels==0.14.2
|
statsmodels==0.14.2
|
||||||
strsimpy==0.2.1
|
strsimpy==0.2.1
|
||||||
|
tables==3.9.2
|
||||||
tenacity==8.5.0
|
tenacity==8.5.0
|
||||||
terminado==0.18.1
|
terminado==0.18.1
|
||||||
threadpoolctl==3.5.0
|
threadpoolctl==3.5.0
|
||||||
@@ -150,6 +163,7 @@ typing_extensions==4.12.2
|
|||||||
tzdata==2024.1
|
tzdata==2024.1
|
||||||
uri-template==1.3.0
|
uri-template==1.3.0
|
||||||
urllib3==2.2.2
|
urllib3==2.2.2
|
||||||
|
virtualenv==20.26.3
|
||||||
wcwidth==0.2.13
|
wcwidth==0.2.13
|
||||||
webcolors==24.6.0
|
webcolors==24.6.0
|
||||||
webencodings==0.5.1
|
webencodings==0.5.1
|
||||||
|
|||||||
Reference in New Issue
Block a user