Add scripts for automatic codeql db data and metadata collection

- updated instructions
- cli scripts mirror the interactive session*.py files
This commit is contained in:
Michael Hohn
2024-07-23 15:05:03 -07:00
committed by =Michael Hohn
parent aaeafa9e88
commit 731b44b187
6 changed files with 174 additions and 4 deletions

View File

@@ -46,10 +46,17 @@ qldbtools is a Python package for working with CodeQL databases
```
## Usage
## Use as library
```python
import qldbtools as ql
```
## Command-line use
```sh
cd ~/work-gh/mrva/mrvacommander/client/qldbtools
./bin/mc-db-initial-info ~/work-gh/mrva/mrva-open-source-download | gzip > db-info-1.csv.gz
gunzip < db-info-1.csv.gz | ./bin/mc-db-refine-info | gzip > db-info-2.csv.gz
```

View File

@@ -0,0 +1,40 @@
#!/usr/bin/env python
""" Collect information about CodeQL databases from the file system and write out
a table in CSV format.
"""
import qldbtools.utils as utils
import argparse
import logging
import sys
import pandas as pd
#
#* Logging setup
#
# Timestamp every message emitted through the root logger.
LOG_FORMAT = '%(asctime)s %(message)s'
logging.basicConfig(format=LOG_FORMAT)
#
#* Command-line interface
#
# A single positional argument: the directory tree to scan.
cli_description = """Find all CodeQL DBs in and below starting_dir and export a CSV
file with relevant data."""
parser = argparse.ArgumentParser(description=cli_description)
parser.add_argument(
    'starting_dir',
    type=str,
    help='The starting directory to search for codeql.',
)
args = parser.parse_args()
#
#* Gather the database records
#
# utils.collect_dbs yields one record object per database found under the
# starting directory; each record's attribute dict becomes one table row.
rows = [vars(db_record) for db_record in utils.collect_dbs(args.starting_dir)]
dbdf = pd.DataFrame(rows)
#
#* Emit the table
#
# CSV on stdout, without the pandas row index, so the output can be piped.
dbdf.to_csv(sys.stdout, index=False)
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -0,0 +1,83 @@
#!/usr/bin/env python
""" Read an initial table of CodeQL DB information, produced by
mc-db-initial-info, and collect more detailed information from the database
files. Write out an extended table in CSV format.
"""
import qldbtools.utils as utils
import argparse
import logging
import pandas as pd
import sys
#
#* Logging setup
#
# Timestamp every message emitted through the root logger.
LOG_FORMAT = '%(asctime)s %(message)s'
logging.basicConfig(format=LOG_FORMAT)
#
#* Command-line interface
#
# The script takes no arguments of its own; the parser exists so that
# -h/--help prints the description and unknown arguments are rejected.
cli_description = """Read an initial table of CodeQL DB information, produced by
mc-db-initial-info, and collect more detailed information from the database
files. Write out an extended table in CSV format. """
parser = argparse.ArgumentParser(description=cli_description)
args = parser.parse_args()
#
#* Collect the information
#
# The initial table (output of mc-db-initial-info) arrives on stdin.  It is
# read without an index column, so rows carry pandas' default 0..n-1 index and
# a row's position doubles as the merge key passed on as left_index.
d = pd.read_csv(sys.stdin)
joiners = []
# Visit EVERY row.  The earlier range(0, len(d)-1) stopped one short, so the
# last database in the table never had its details collected.
for left_index, db_path in enumerate(d.path):
    try:
        cqlc, metac = utils.extract_metadata(db_path)
    except (utils.ExtractNotZipfile, utils.ExtractNoCQLDB):
        # Best effort: an entry without extractable metadata is skipped here
        # and still appears in the output through the outer merge below.
        continue
    try:
        detail_df = utils.metadata_details(left_index, cqlc, metac)
    except utils.DetailsMissing:
        continue
    joiners.append(detail_df)

if joiners:
    # Stack the per-database detail frames and attach them to the initial
    # table.  The outer join keeps rows for which no details were collected.
    joiners_df = pd.concat(joiners, axis=0)
    full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index',
                       how='outer')
else:
    # pd.concat raises ValueError on an empty list.  With no details at all,
    # pass the initial table through, adding only the left_index key column.
    logging.warning("no detailed metadata collected for any of %d entries",
                    len(d))
    full_df = d.assign(left_index=d.index)
#** Order the columns by importance and write the table
# - Many of the columns
#   1. are only conditionally present,
#   2. carry extra information that is not about the DB proper, or
#   3. may appear under various names.
#
# - Essential columns come first, in this order.
essential_columns = [
    'owner',
    'name',
    'language',
    'size',
    'cliVersion',
    'creationTime',
    'sha',
    'baselineLinesOfCode',
    'path',
]
# - The remaining columns are useful but secondary; they follow in this order.
supplementary_columns = [
    'db_lang',
    'db_lang_displayName',
    'db_lang_file_count',
    'db_lang_linesOfCode',
    'ctime',
    'primaryLanguage',
    'finalised',
    'left_index',
]
# reindex selects exactly these columns in this order; a listed column that is
# absent from full_df is created and filled with missing values.
final_df = full_df.reindex(columns=essential_columns + supplementary_columns)
# CSV on stdout, without the pandas row index, so the output can be piped.
final_df.to_csv(sys.stdout, index=False)
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -1,4 +1,6 @@
#* Experimental work with utils.py, to be merged into it.
# The rest of this interactive script is available as cli script in
# mc-db-initial-info
from utils import *
#* Data collection

View File

@@ -52,6 +52,8 @@ for lang, lang_cont in metac['languages'].items():
print("%sdisplayName %s" % (indent, val))
#** Automated for all entries
# The rest of this interactive script is available as cli script in
# mc-db-refine-info
d = dbdf_1
joiners = []
for left_index in range(0, len(d)-1):
@@ -74,6 +76,41 @@ from pandasgui import show
os.environ['APPDATA'] = "needed-for-pandasgui"
show(full_df)
#** Order the columns by importance and save the table
# - Many of the columns
#   1. are only conditionally present,
#   2. carry extra information that is not about the DB proper, or
#   3. may appear under various names.
# - Essential columns come first, in this order.
essential_columns = [
    'owner',
    'name',
    'language',
    'size',
    'cliVersion',
    'creationTime',
    'sha',
    'baselineLinesOfCode',
    'path',
]
# - The remaining columns are useful but secondary; they follow in this order.
supplementary_columns = [
    'db_lang',
    'db_lang_displayName',
    'db_lang_file_count',
    'db_lang_linesOfCode',
    'ctime',
    'primaryLanguage',
    'finalised',
    'left_index',
]
# reindex selects exactly these columns in this order; a listed column that is
# absent from full_df is created and filled with missing values.
final_df = full_df.reindex(columns=essential_columns + supplementary_columns)
# Gzip-compressed CSV in the current directory, without the pandas row index.
final_df.to_csv('all-info-table.csv.gz', compression='gzip', index=False)
#
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"

View File

@@ -1,4 +1,5 @@
from setuptools import setup, find_packages
import glob
setup(
name='qldbtools',
@@ -6,7 +7,7 @@ setup(
description='A Python package for working with CodeQL databases',
author='Michael Hohn',
author_email='hohn@github.com',
packages=find_packages(),
install_requires=[
],
packages=['qldbtools'],
install_requires=[],
scripts=glob.glob("bin/mc-*"),
)