Add scripts for automatic codeql db data and metadata collection
- updated instructions - cli scripts mirror the interactive session*.py files
This commit is contained in:
committed by
=Michael Hohn
parent
aaeafa9e88
commit
731b44b187
@@ -46,10 +46,17 @@ qldbtools is a Python package for working with CodeQL databases
|
||||
```
|
||||
|
||||
|
||||
## Usage
|
||||
## Use as library
|
||||
|
||||
```python
|
||||
import qldbtools as ql
|
||||
```
|
||||
|
||||
## Command-line use
|
||||
|
||||
cd ~/work-gh/mrva/mrvacommander/client/qldbtools
|
||||
./bin/mc-db-initial-info ~/work-gh/mrva/mrva-open-source-download | gzip > db-info-1.csv.gz
|
||||
|
||||
gunzip < db-info-1.csv.gz | ./bin/mc-db-refine-info | gzip > db-info-2.csv.gz
|
||||
|
||||
|
||||
|
||||
40
client/qldbtools/bin/mc-db-initial-info
Executable file
40
client/qldbtools/bin/mc-db-initial-info
Executable file
@@ -0,0 +1,40 @@
|
||||
#!/usr/bin/env python
"""Search a directory tree for CodeQL databases and print one CSV row per
database found to standard output.
"""
import argparse
import logging
import sys

import pandas as pd

import qldbtools.utils as utils

# Text shown by --help; kept at module level so the literal stays unindented.
DESCRIPTION = """Find all CodeQL DBs in and below starting_dir and export a CSV
file with relevant data."""


def main():
    """Parse the command line, gather the DB records and emit them as CSV."""
    #
    #* Configure logger
    #
    logging.basicConfig(format='%(asctime)s %(message)s')

    #
    #* Process command line
    #
    cli = argparse.ArgumentParser(description=DESCRIPTION)
    cli.add_argument('starting_dir', type=str,
                     help='The starting directory to search for codeql.')
    options = cli.parse_args()

    #
    #* Collect info
    #
    # Each item produced by collect_dbs is a DBInfo-style record; its
    # attribute dictionary becomes one row of the table.
    records = [vars(info) for info in utils.collect_dbs(options.starting_dir)]
    table = pd.DataFrame(records)

    #
    #* Write info out
    #
    table.to_csv(sys.stdout, index=False)


if __name__ == "__main__":
    main()

# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
|
||||
83
client/qldbtools/bin/mc-db-refine-info
Executable file
83
client/qldbtools/bin/mc-db-refine-info
Executable file
@@ -0,0 +1,83 @@
|
||||
#!/usr/bin/env python
""" Read an initial table of CodeQL DB information, produced by
mc-db-initial-info, and collect more detailed information from the database
files.  Write out an extended table in CSV format.

The initial table is read from stdin and the extended table is written to
stdout.  Every input row is kept (outer merge); rows for which no details
could be extracted have empty detail columns.
"""
import qldbtools.utils as utils
import argparse
import logging
import pandas as pd
import sys

#
#* Configure logger
#
logging.basicConfig(format='%(asctime)s %(message)s')

#
#* Process command line
#
# There are no arguments; the parser exists to provide --help.
parser = argparse.ArgumentParser(
    description="""Read an initial table of CodeQL DB information, produced by
    mc-db-initial-info, and collect more detailed information from the database
    files.  Write out an extended table in CSV format.  """)
args = parser.parse_args()

#
#* Collect the information
#
d = pd.read_csv(sys.stdin)
joiners = []
# Visit every row.  (The previous bound, len(d)-1, never looked at the last
# row of the table.)
for left_index in range(len(d)):
    try:
        cqlc, metac = utils.extract_metadata(d.path[left_index])
    except (utils.ExtractNotZipfile, utils.ExtractNoCQLDB):
        # Deliberate best effort: skip this entry's details, the row itself
        # is still written out below.
        continue
    try:
        detail_df = utils.metadata_details(left_index, cqlc, metac)
    except utils.DetailsMissing:
        continue
    joiners.append(detail_df)

if joiners:
    joiners_df = pd.concat(joiners, axis=0)
    full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index',
                       how='outer')
else:
    # pd.concat raises ValueError on an empty list.  With no details at all,
    # pass the input rows through; the reindex below adds the missing detail
    # columns as empty ones.
    full_df = d.assign(left_index=d.index)

#** Re-order the dataframe columns by importance
# - Much of the data
#   1. Is only conditionally present
#   2. Is extra info, not for the DB proper
#   3. May have various names
#
# - The essential columns are
#   | owner               |
#   | name                |
#   | language            |
#   | size                |
#   | cliVersion          |
#   | creationTime        |
#   | sha                 |
#   | baselineLinesOfCode |
#   | path                |
#
# - The rest are useful; put them last
#   | db_lang             |
#   | db_lang_displayName |
#   | db_lang_file_count  |
#   | db_lang_linesOfCode |
#   | ctime               |
#   | primaryLanguage     |
#   | finalised           |
#   | left_index          |

final_df = full_df.reindex(columns=['owner', 'name', 'language', 'size', 'cliVersion',
                                    'creationTime', 'sha', 'baselineLinesOfCode', 'path',
                                    'db_lang', 'db_lang_displayName', 'db_lang_file_count',
                                    'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
                                    'finalised', 'left_index'])

final_df.to_csv(sys.stdout, index=False)

# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
|
||||
@@ -1,4 +1,6 @@
|
||||
#* Experimental work with utils.py, to be merged into it.
|
||||
# The rest of this interactive script is available as cli script in
|
||||
# mc-db-initial-info
|
||||
from utils import *
|
||||
|
||||
#* Data collection
|
||||
|
||||
@@ -52,6 +52,8 @@ for lang, lang_cont in metac['languages'].items():
|
||||
print("%sdisplayName %s" % (indent, val))
|
||||
|
||||
#** Automated for all entries
|
||||
# The rest of this interactive script is available as cli script in
|
||||
# mc-db-refine-info
|
||||
d = dbdf_1
|
||||
joiners = []
|
||||
for left_index in range(0, len(d)-1):
|
||||
@@ -74,6 +76,41 @@ from pandasgui import show
|
||||
os.environ['APPDATA'] = "needed-for-pandasgui"
|
||||
show(full_df)
|
||||
|
||||
#** Re-order the dataframe columns by importance
|
||||
# - Much of the data
|
||||
# 1. Is only conditionally present
|
||||
# 2. Is extra info, not for the DB proper
|
||||
# 3. May have various names
|
||||
|
||||
# - The essential columns are
|
||||
# | owner |
|
||||
# | name |
|
||||
# | language |
|
||||
# | size |
|
||||
# | cliVersion |
|
||||
# | creationTime |
|
||||
# | sha |
|
||||
# | baselineLinesOfCode |
|
||||
# | path |
|
||||
|
||||
# - The rest are useful; put them last
|
||||
# | db_lang |
|
||||
# | db_lang_displayName |
|
||||
# | db_lang_file_count |
|
||||
# | db_lang_linesOfCode |
|
||||
# | left_index |
|
||||
# | ctime |
|
||||
# | primaryLanguage |
|
||||
# | finalised |
|
||||
|
||||
final_df = full_df.reindex(columns=['owner', 'name', 'language', 'size', 'cliVersion',
|
||||
'creationTime', 'sha', 'baselineLinesOfCode', 'path',
|
||||
'db_lang', 'db_lang_displayName', 'db_lang_file_count',
|
||||
'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
|
||||
'finalised', 'left_index'])
|
||||
|
||||
final_df.to_csv('all-info-table.csv.gz', compression='gzip', index=False)
|
||||
|
||||
#
|
||||
# Local Variables:
|
||||
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from setuptools import setup, find_packages
|
||||
import glob
|
||||
|
||||
setup(
|
||||
name='qldbtools',
|
||||
@@ -6,7 +7,7 @@ setup(
|
||||
description='A Python package for working with CodeQL databases',
|
||||
author='Michael Hohn',
|
||||
author_email='hohn@github.com',
|
||||
packages=find_packages(),
|
||||
install_requires=[
|
||||
],
|
||||
packages=['qldbtools'],
|
||||
install_requires=[],
|
||||
scripts=glob.glob("bin/mc-*"),
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user