Add scripts for automatic codeql db data and metadata collection

- updated instructions
- cli scripts mirror the interactive session*.py files
This commit is contained in:
Michael Hohn
2024-07-23 15:05:03 -07:00
committed by Michael Hohn
parent aaeafa9e88
commit 731b44b187
6 changed files with 174 additions and 4 deletions

View File

@@ -0,0 +1,40 @@
#!/usr/bin/env python
""" Collect information about CodeQL databases from the file system and write out
a table in CSV format.
"""
import qldbtools.utils as utils
import argparse
import logging
import sys
import pandas as pd
#
#* Configure logger
#
logging.basicConfig(format='%(asctime)s %(message)s')
#
#* Process command line
#
arg_parser = argparse.ArgumentParser(
    description="""Find all CodeQL DBs in and below starting_dir and export a CSV
file with relevant data.""")
arg_parser.add_argument('starting_dir', type=str,
                        help='The starting directory to search for codeql.')
cli_args = arg_parser.parse_args()
#
#* Collect info
#
# Walk the tree rooted at starting_dir, gathering one DBInfo record per
# CodeQL database found, then flatten each record's attributes into a
# dataframe row (one column per DBInfo attribute).
search_root = cli_args.starting_dir
db_infos = list(utils.collect_dbs(search_root))
info_frame = pd.DataFrame([vars(info) for info in db_infos])
#
#
#* Write info out
#
# Emit the table on stdout as CSV, without the pandas row index.
info_frame.to_csv(sys.stdout, index=False)
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -0,0 +1,83 @@
#!/usr/bin/env python
""" Read an initial table of CodeQL DB information, produced by
mc-db-initial-info, and collect more detailed information from the database
files. Write out an extended table in CSV format.
"""
import qldbtools.utils as utils
import argparse
import logging
import pandas as pd
import sys
#
#* Configure logger
#
logging.basicConfig(format='%(asctime)s %(message)s')
#
#* Process command line
#
parser = argparse.ArgumentParser(
    description="""Read an initial table of CodeQL DB information, produced by
mc-db-initial-info, and collect more detailed information from the database
files. Write out an extended table in CSV format. """)
args = parser.parse_args()
#
#* Collect the information
#
# Read the initial table from stdin; each row's `path` column points at a
# (possibly zipped) CodeQL database on disk.
d = pd.read_csv(sys.stdin)
joiners = []
# BUG FIX: the original loop used range(0, len(d)-1), which silently
# skipped the last row of the input table.  Iterate over every row.
for left_index in range(len(d)):
    # Rows whose path is not a zip file, or contains no CodeQL DB, are
    # skipped; their detail columns stay empty after the outer merge below.
    try:
        cqlc, metac = utils.extract_metadata(d.path[left_index])
    except (utils.ExtractNotZipfile, utils.ExtractNoCQLDB):
        continue
    # Rows with incomplete metadata are likewise skipped.
    try:
        detail_df = utils.metadata_details(left_index, cqlc, metac)
    except utils.DetailsMissing:
        continue
    joiners.append(detail_df)
# Guard against an empty detail list: pd.concat([]) raises ValueError.
# With no details, keep the input table as-is; the reindex below adds the
# missing detail columns filled with NaN.
if joiners:
    joiners_df = pd.concat(joiners, axis=0)
    # Outer merge keeps input rows that produced no details (left side) as
    # well as any detail rows; `left_index` links detail rows back to d.
    full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index',
                       how='outer')
else:
    full_df = d.copy()
#** Re-order the dataframe columns by importance
# - Much of the data
#   1. Is only conditionally present
#   2. Is extra info, not for the DB proper
#   3. May have various names
#
# - The essential columns are
#   | owner |
#   | name |
#   | language |
#   | size |
#   | cliVersion |
#   | creationTime |
#   | sha |
#   | baselineLinesOfCode |
#   | path |
#
# - The rest are useful; put them last
#   | db_lang |
#   | db_lang_displayName |
#   | db_lang_file_count |
#   | db_lang_linesOfCode |
#   | left_index |
#   | ctime |
#   | primaryLanguage |
#   | finalised |
final_df = full_df.reindex(columns=['owner', 'name', 'language', 'size', 'cliVersion',
                                    'creationTime', 'sha', 'baselineLinesOfCode', 'path',
                                    'db_lang', 'db_lang_displayName', 'db_lang_file_count',
                                    'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
                                    'finalised', 'left_index'])
final_df.to_csv(sys.stdout, index=False)
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End: