diff --git a/client/qldbtools/README.md b/client/qldbtools/README.md index 6db732a..4bc1e9a 100644 --- a/client/qldbtools/README.md +++ b/client/qldbtools/README.md @@ -46,10 +46,17 @@ qldbtools is a Python package for working with CodeQL databases ``` -## Usage +## Use as library ```python import qldbtools as ql ``` +## Command-line use + cd ~/work-gh/mrva/mrvacommander/client/qldbtools + ./bin/mc-db-initial-info ~/work-gh/mrva/mrva-open-source-download | gzip > db-info-1.csv.gz + + gunzip < db-info-1.csv.gz | ./bin/mc-db-refine-info | gzip > db-info-2.csv.gz + + diff --git a/client/qldbtools/bin/mc-db-initial-info b/client/qldbtools/bin/mc-db-initial-info new file mode 100755 index 0000000..793d233 --- /dev/null +++ b/client/qldbtools/bin/mc-db-initial-info @@ -0,0 +1,40 @@ +#!/usr/bin/env python +""" Collect information about CodeQL databases from the file system and write out + a table in CSV format. +""" +import qldbtools.utils as utils +import argparse +import logging +import sys +import pandas as pd +# +#* Configure logger +# +logging.basicConfig(format='%(asctime)s %(message)s') + +# +#* Process command line +# +parser = argparse.ArgumentParser( + description="""Find all CodeQL DBs in and below starting_dir and export a CSV +file with relevant data.""") +parser.add_argument('starting_dir', type=str, + help='The starting directory to search for codeql.') +args = parser.parse_args() + +# +#* Collect info +# +# Gather what utils.collect_dbs yields under db_base (DBInfo objects, per the original note); each object's attribute dict becomes one table row +db_base = args.starting_dir +dbs = list(utils.collect_dbs(db_base)) +dbdf = pd.DataFrame([d.__dict__ for d in dbs]) +# +# +#* Write info out (CSV on stdout, dataframe index column omitted) +# +dbdf.to_csv(sys.stdout, index=False) + +# Local Variables: +# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/" +# End: diff --git a/client/qldbtools/bin/mc-db-refine-info b/client/qldbtools/bin/mc-db-refine-info new file mode 100755 index 0000000..6f71027 --- /dev/null +++ b/client/qldbtools/bin/mc-db-refine-info @@ -0,0 +1,83 @@
+#!/usr/bin/env python
+""" Read an initial table of CodeQL DB information, produced by
+    mc-db-initial-info, and collect more detailed information from the database
+    files. Write out an extended table in CSV format.
+"""
+import qldbtools.utils as utils
+import argparse
+import logging
+import pandas as pd
+import sys
+
+#
+#* Configure logger
+#
+logging.basicConfig(format='%(asctime)s %(message)s')
+
+#
+#* Process command line
+#
+parser = argparse.ArgumentParser(
+    description="""Read an initial table of CodeQL DB information, produced by
+    mc-db-initial-info, and collect more detailed information from the database
+    files. Write out an extended table in CSV format. """)
+args = parser.parse_args()
+
+#
+#* Collect the information
+# Every input row is visited (including the last one); rows for which utils raises ExtractNotZipfile, ExtractNoCQLDB or DetailsMissing are skipped.
+d = pd.read_csv(sys.stdin)
+joiners = []
+for left_index in range(len(d)):
+    try:
+        cqlc, metac = utils.extract_metadata(d.path[left_index])
+    except (utils.ExtractNotZipfile, utils.ExtractNoCQLDB):
+        continue
+    try:
+        detail_df = utils.metadata_details(left_index, cqlc, metac)
+    except utils.DetailsMissing:
+        continue
+    joiners.append(detail_df)
+if not joiners:  # pd.concat raises ValueError on an empty list; stop with a clear message instead
+    sys.exit("mc-db-refine-info: no database details could be extracted from the input table")
+joiners_df = pd.concat(joiners, axis=0)
+full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer')
+
+#** Re-order the dataframe columns by importance
+# - Much of the data
+#   1. Is only conditionally present
+#   2. Is extra info, not for the DB proper
+#   3. May have various names
+#
+# - The essential columns are
+#   | owner               |
+#   | name                |
+#   | language            |
+#   | size                |
+#   | cliVersion          |
+#   | creationTime        |
+#   | sha                 |
+#   | baselineLinesOfCode |
+#   | path                |
+#
+# - The rest are useful; put them last
+#   | db_lang             |
+#   | db_lang_displayName |
+#   | db_lang_file_count  |
+#   | db_lang_linesOfCode |
+#   | left_index          |
+#   | ctime               |
+#   | primaryLanguage     |
+#   | finalised           |
+
+final_df = full_df.reindex(columns=['owner', 'name', 'language', 'size', 'cliVersion',
+                                    'creationTime', 'sha', 'baselineLinesOfCode', 'path',
+                                    'db_lang', 'db_lang_displayName', 'db_lang_file_count',
+                                    'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
+                                    'finalised', 'left_index'])
+
+final_df.to_csv(sys.stdout, index=False)
+
+# Local Variables:
+# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
+# End:
diff --git a/client/qldbtools/qldbtools/session1.py b/client/qldbtools/qldbtools/session1.py index 3a893fe..2803f36 100644 --- a/client/qldbtools/qldbtools/session1.py +++ b/client/qldbtools/qldbtools/session1.py @@ -1,4 +1,6 @@ #* Experimental work with utils.py, to be merged into it. +# The rest of this interactive script is available as cli script in +# mc-db-initial-info from utils import * #* Data collection diff --git a/client/qldbtools/qldbtools/session2.py b/client/qldbtools/qldbtools/session2.py index 6318ef7..978da84 100644 --- a/client/qldbtools/qldbtools/session2.py +++ b/client/qldbtools/qldbtools/session2.py @@ -52,6 +52,8 @@ for lang, lang_cont in metac['languages'].items(): print("%sdisplayName %s" % (indent, val)) #** Automated for all entries +# The rest of this interactive script is available as cli script in +# mc-db-refine-info d = dbdf_1 joiners = [] for left_index in range(0, len(d)-1): @@ -74,6 +76,41 @@ from pandasgui import show os.environ['APPDATA'] = "needed-for-pandasgui" show(full_df) +#** Re-order the dataframe columns by importance +# - Much of the data +# 1. Is only conditionally present +# 2. 
Is extra info, not for the DB proper +# 3. May have various names + +# - The essential columns are +# | owner | +# | name | +# | language | +# | size | +# | cliVersion | +# | creationTime | +# | sha | +# | baselineLinesOfCode | +# | path | + +# - The rest are useful; put them last +# | db_lang | +# | db_lang_displayName | +# | db_lang_file_count | +# | db_lang_linesOfCode | +# | left_index | +# | ctime | +# | primaryLanguage | +# | finalised | + +final_df = full_df.reindex(columns=['owner', 'name', 'language', 'size', 'cliVersion', + 'creationTime', 'sha', 'baselineLinesOfCode', 'path', + 'db_lang', 'db_lang_displayName', 'db_lang_file_count', + 'db_lang_linesOfCode', 'ctime', 'primaryLanguage', + 'finalised', 'left_index']) + +final_df.to_csv('all-info-table.csv.gz', compression='gzip', index=False) + # # Local Variables: # python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
diff --git a/client/qldbtools/setup.py b/client/qldbtools/setup.py
index bfbdb05..1608a05 100644
--- a/client/qldbtools/setup.py
+++ b/client/qldbtools/setup.py
@@ -1,4 +1,5 @@
 from setuptools import setup, find_packages
+import glob
 
 setup(
     name='qldbtools',
@@ -6,7 +7,8 @@ setup(
     description='A Python package for working with CodeQL databases',
     author='Michael Hohn',
     author_email='hohn@github.com',
-    packages=find_packages(),
-    install_requires=[
-    ],
+    packages=['qldbtools'],
+    # The installed bin/mc-db-* scripts import pandas, so declare it as a runtime dependency.
+    install_requires=['pandas'],
+    scripts=glob.glob("bin/mc-*"),
 )