Add scripts for automatic codeql db data and metadata collection

- updated instructions
- cli scripts mirror the interactive session*.py files
This commit is contained in:
Michael Hohn
2024-07-23 15:05:03 -07:00
committed by Michael Hohn
parent aaeafa9e88
commit 731b44b187
6 changed files with 174 additions and 4 deletions

View File

@@ -0,0 +1,40 @@
#!/usr/bin/env python
""" Collect information about CodeQL databases from the file system and write out
a table in CSV format.
"""
import qldbtools.utils as utils
import argparse
import logging
import sys
import pandas as pd
#
#* Configure logger
#
logging.basicConfig(format='%(asctime)s %(message)s')
#
#* Process command line
#
arg_parser = argparse.ArgumentParser(
    description="""Find all CodeQL DBs in and below starting_dir and export a CSV
file with relevant data.""")
arg_parser.add_argument('starting_dir', type=str,
                        help='The starting directory to search for codeql.')
cli_args = arg_parser.parse_args()
#
#* Collect info
#
# Walk the tree rooted at starting_dir, gathering one DBInfo record per
# CodeQL database found, then flatten each record's attributes into a
# dataframe row (one column per DBInfo attribute).
search_root = cli_args.starting_dir
db_infos = list(utils.collect_dbs(search_root))
info_frame = pd.DataFrame([vars(info) for info in db_infos])
#
#
#* Write info out
#
# Emit the table on stdout as CSV, without the pandas row index.
info_frame.to_csv(sys.stdout, index=False)
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -0,0 +1,83 @@
#!/usr/bin/env python
""" Read an initial table of CodeQL DB information, produced by
mc-db-initial-info, and collect more detailed information from the database
files. Write out an extended table in CSV format.
"""
import qldbtools.utils as utils
import argparse
import logging
import pandas as pd
import sys
#
#* Configure logger
#
logging.basicConfig(format='%(asctime)s %(message)s')
#
#* Process command line
#
parser = argparse.ArgumentParser(
    description="""Read an initial table of CodeQL DB information, produced by
mc-db-initial-info, and collect more detailed information from the database
files. Write out an extended table in CSV format. """)
args = parser.parse_args()
#
#* Collect the information
#
# Read the initial table from stdin; each row's `path` column points at a
# (possibly zipped) CodeQL database on disk.
d = pd.read_csv(sys.stdin)
joiners = []
# BUG FIX: the original loop used range(0, len(d)-1), which silently
# skipped the last row of the input table.  Iterate over every row.
for left_index in range(len(d)):
    # Rows whose path is not a zip file, or contains no CodeQL DB, are
    # skipped; their detail columns stay empty after the outer merge below.
    try:
        cqlc, metac = utils.extract_metadata(d.path[left_index])
    except (utils.ExtractNotZipfile, utils.ExtractNoCQLDB):
        continue
    # Rows with incomplete metadata are likewise skipped.
    try:
        detail_df = utils.metadata_details(left_index, cqlc, metac)
    except utils.DetailsMissing:
        continue
    joiners.append(detail_df)
# Guard against an empty detail list: pd.concat([]) raises ValueError.
# With no details, keep the input table as-is; the reindex below adds the
# missing detail columns filled with NaN.
if joiners:
    joiners_df = pd.concat(joiners, axis=0)
    # Outer merge keeps input rows that produced no details (left side) as
    # well as any detail rows; `left_index` links detail rows back to d.
    full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index',
                       how='outer')
else:
    full_df = d.copy()
#** Re-order the dataframe columns by importance
# - Much of the data
#   1. Is only conditionally present
#   2. Is extra info, not for the DB proper
#   3. May have various names
#
# - The essential columns are
#   | owner |
#   | name |
#   | language |
#   | size |
#   | cliVersion |
#   | creationTime |
#   | sha |
#   | baselineLinesOfCode |
#   | path |
#
# - The rest are useful; put them last
#   | db_lang |
#   | db_lang_displayName |
#   | db_lang_file_count |
#   | db_lang_linesOfCode |
#   | left_index |
#   | ctime |
#   | primaryLanguage |
#   | finalised |
final_df = full_df.reindex(columns=['owner', 'name', 'language', 'size', 'cliVersion',
                                    'creationTime', 'sha', 'baselineLinesOfCode', 'path',
                                    'db_lang', 'db_lang_displayName', 'db_lang_file_count',
                                    'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
                                    'finalised', 'left_index'])
final_df.to_csv(sys.stdout, index=False)
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End: