Add scripts for automatic CodeQL DB data and metadata collection
- updated instructions; the CLI scripts mirror the interactive session*.py files
This commit is contained in:
committed by
=Michael Hohn
parent
aaeafa9e88
commit
731b44b187
@@ -46,10 +46,17 @@ qldbtools is a Python package for working with CodeQL databases
|
|||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
## Usage
|
## Use as library
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import qldbtools as ql
|
import qldbtools as ql
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Command-line use
|
||||||
|
|
||||||
|
cd ~/work-gh/mrva/mrvacommander/client/qldbtools
|
||||||
|
./bin/mc-db-initial-info ~/work-gh/mrva/mrva-open-source-download | gzip > db-info-1.csv.gz
|
||||||
|
|
||||||
|
gunzip < db-info-1.csv.gz | ./bin/mc-db-refine-info | gzip > db-info-2.csv.gz
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
40
client/qldbtools/bin/mc-db-initial-info
Executable file
40
client/qldbtools/bin/mc-db-initial-info
Executable file
@@ -0,0 +1,40 @@
|
|||||||
|
#!/usr/bin/env python
"""Scan a directory tree for CodeQL databases and print a CSV summary.

Every item produced by qldbtools.utils.collect_dbs for the starting directory
becomes one row of the table written to standard output.
"""
import argparse
import logging
import sys

import pandas as pd

import qldbtools.utils as utils

#
#* Logging setup
#
logging.basicConfig(format='%(asctime)s %(message)s')

#
#* Command-line interface
#
cli = argparse.ArgumentParser(
    description="""Find all CodeQL DBs in and below starting_dir and export a CSV
file with relevant data.""")
cli.add_argument('starting_dir', type=str,
                 help='The starting directory to search for codeql.')
args = cli.parse_args()

#
#* Gather the database records
#
# Each object yielded by collect_dbs contributes its attribute dictionary as
# one table row, so the CSV columns are the attribute names.
search_root = args.starting_dir
records = [vars(db) for db in utils.collect_dbs(search_root)]
table = pd.DataFrame(records)

#
#* Emit the table as CSV on stdout (no index column)
#
table.to_csv(sys.stdout, index=False)

# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
|
||||||
83
client/qldbtools/bin/mc-db-refine-info
Executable file
83
client/qldbtools/bin/mc-db-refine-info
Executable file
@@ -0,0 +1,83 @@
|
|||||||
|
#!/usr/bin/env python
"""Extend the CSV table written by mc-db-initial-info with per-database details.

The initial table is read from standard input, more detailed information is
collected from the database files, and the extended table is written to
standard output in CSV format.
"""
import argparse
import logging
import sys

import pandas as pd

import qldbtools.utils as utils

#
#* Logging setup
#
logging.basicConfig(format='%(asctime)s %(message)s')

#
#* Command-line interface
#
# No options are defined; parsing still provides -h/--help and rejects
# unexpected arguments.
cli = argparse.ArgumentParser(
    description="""Read an initial table of CodeQL DB information, produced by
mc-db-initial-info, and collect more detailed information from the database
files. Write out an extended table in CSV format. """)
args = cli.parse_args()
|
||||||
|
|
||||||
|
#
|
||||||
|
#* Collect the information
|
||||||
|
#
|
||||||
|
# Read the initial table from stdin; its 'path' column is handed to
# utils.extract_metadata for every row.
d = pd.read_csv(sys.stdin)

# Gather one detail frame per row whose metadata can be extracted.  The row
# position is passed to metadata_details as left_index; the merge below joins
# on a column of that name.
joiners = []
# Visit every row.  The previous range(0, len(d)-1) stopped one short, so the
# last database never got its details.
for left_index, db_path in enumerate(d['path']):
    try:
        cqlc, metac = utils.extract_metadata(db_path)
    except (utils.ExtractNotZipfile, utils.ExtractNoCQLDB) as err:
        # Best effort: keep the row in the output without detail columns, but
        # say so (logging goes to stderr by default; stdout carries the CSV).
        logging.warning("no metadata extracted from %s: %r", db_path, err)
        continue
    try:
        detail_df = utils.metadata_details(left_index, cqlc, metac)
    except utils.DetailsMissing as err:
        logging.warning("metadata details missing for %s: %r", db_path, err)
        continue
    joiners.append(detail_df)

if joiners:
    joiners_df = pd.concat(joiners, axis=0)
else:
    # pd.concat raises ValueError on an empty list; fall back to an empty frame
    # that still has the join key so the merge below works unchanged.
    joiners_df = pd.DataFrame({'left_index': pd.Series(dtype='int64')})

# Outer join: rows without details are kept, with missing values in the
# detail columns.
full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer')
|
||||||
|
|
||||||
|
#** Put the columns in order of importance
# Much of the collected data is only conditionally present, is extra
# information rather than a property of the DB proper, or may appear under
# various names.  reindex selects exactly the columns named below, in this
# order; a listed column that is absent from full_df comes out empty.

# - Essential columns, written first
essential_columns = [
    'owner',
    'name',
    'language',
    'size',
    'cliVersion',
    'creationTime',
    'sha',
    'baselineLinesOfCode',
    'path',
]

# - Remaining useful columns, written last
extra_columns = [
    'db_lang',
    'db_lang_displayName',
    'db_lang_file_count',
    'db_lang_linesOfCode',
    'ctime',
    'primaryLanguage',
    'finalised',
    'left_index',
]

final_df = full_df.reindex(columns=essential_columns + extra_columns)

#** Write the extended table as CSV on stdout (no index column)
final_df.to_csv(sys.stdout, index=False)

# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
|
||||||
@@ -1,4 +1,6 @@
|
|||||||
#* Experimental work with utils.py, to be merged into it.
|
#* Experimental work with utils.py, to be merged into it.
|
||||||
|
# The rest of this interactive script is available as cli script in
|
||||||
|
# mc-db-initial-info
|
||||||
from utils import *
|
from utils import *
|
||||||
|
|
||||||
#* Data collection
|
#* Data collection
|
||||||
|
|||||||
@@ -52,6 +52,8 @@ for lang, lang_cont in metac['languages'].items():
|
|||||||
print("%sdisplayName %s" % (indent, val))
|
print("%sdisplayName %s" % (indent, val))
|
||||||
|
|
||||||
#** Automated for all entries
|
#** Automated for all entries
|
||||||
|
# The rest of this interactive script is available as cli script in
|
||||||
|
# mc-db-refine-info
|
||||||
d = dbdf_1
|
d = dbdf_1
|
||||||
joiners = []
|
joiners = []
|
||||||
for left_index in range(0, len(d)-1):
|
for left_index in range(0, len(d)-1):
|
||||||
@@ -74,6 +76,41 @@ from pandasgui import show
|
|||||||
os.environ['APPDATA'] = "needed-for-pandasgui"
|
os.environ['APPDATA'] = "needed-for-pandasgui"
|
||||||
show(full_df)
|
show(full_df)
|
||||||
|
|
||||||
|
#** Re-order the dataframe columns by importance
|
||||||
|
# - Much of the data
|
||||||
|
# 1. Is only conditionally present
|
||||||
|
# 2. Is extra info, not for the DB proper
|
||||||
|
# 3. May have various names
|
||||||
|
|
||||||
|
# - The essential columns are
|
||||||
|
# | owner |
|
||||||
|
# | name |
|
||||||
|
# | language |
|
||||||
|
# | size |
|
||||||
|
# | cliVersion |
|
||||||
|
# | creationTime |
|
||||||
|
# | sha |
|
||||||
|
# | baselineLinesOfCode |
|
||||||
|
# | path |
|
||||||
|
|
||||||
|
# - The rest are useful; put them last
|
||||||
|
# | db_lang |
|
||||||
|
# | db_lang_displayName |
|
||||||
|
# | db_lang_file_count |
|
||||||
|
# | db_lang_linesOfCode |
|
||||||
|
# | left_index |
|
||||||
|
# | ctime |
|
||||||
|
# | primaryLanguage |
|
||||||
|
# | finalised |
|
||||||
|
|
||||||
|
final_df = full_df.reindex(columns=['owner', 'name', 'language', 'size', 'cliVersion',
|
||||||
|
'creationTime', 'sha', 'baselineLinesOfCode', 'path',
|
||||||
|
'db_lang', 'db_lang_displayName', 'db_lang_file_count',
|
||||||
|
'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
|
||||||
|
'finalised', 'left_index'])
|
||||||
|
|
||||||
|
final_df.to_csv('all-info-table.csv.gz', compression='gzip', index=False)
|
||||||
|
|
||||||
#
|
#
|
||||||
# Local Variables:
|
# Local Variables:
|
||||||
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
|
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
from setuptools import setup, find_packages
|
from setuptools import setup, find_packages
|
||||||
|
import glob
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name='qldbtools',
|
name='qldbtools',
|
||||||
@@ -6,7 +7,7 @@ setup(
|
|||||||
description='A Python package for working with CodeQL databases',
|
description='A Python package for working with CodeQL databases',
|
||||||
author='Michael Hohn',
|
author='Michael Hohn',
|
||||||
author_email='hohn@github.com',
|
author_email='hohn@github.com',
|
||||||
packages=find_packages(),
|
packages=['qldbtools'],
|
||||||
install_requires=[
|
install_requires=[],
|
||||||
],
|
scripts=glob.glob("bin/mc-*"),
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user