Rewrite sarif-combine-tables.py as full tool, bin/sarif-aggregate-scans

Michael Hohn
2022-08-10 17:34:35 -07:00
committed by Michael Hohn
parent 38af30ead9
commit 03a9ef0477
3 changed files with 167 additions and 94 deletions

156 bin/sarif-aggregate-scans Normal file

@@ -0,0 +1,156 @@
#!/usr/bin/env python3
"""Traverse the org/project.scantables/ directories produced by
./sarif-extract-scans-runner and concatenate the collection of individual tables
(codeflows.csv results.csv scans.csv projects.csv) into 4 large tables.
"""
from copy import deepcopy
from datetime import datetime
import argparse
import csv
import numpy
import os
import pandas as pd
import sys
from sarif_cli import scan_tables
from sarif_cli import table_joins
#
# TODO: Factor out functionality / structures in common with
# ./sarif-extract-scans-runner
#
#
# Handle arguments
#
parser = argparse.ArgumentParser(description='Aggregate the per-scan tables produced by sarif-extract-scans-runner')
parser.add_argument('sarif_files', metavar='sarif-files', type=str,
help='File containing list of sarif files, use - for stdin')
parser.add_argument('aggregate_dir', metavar='aggregate-dir', type=str,
help='Directory for writing the combined scan tables')
parser.add_argument('-m', '--max-files', metavar='M', type=int, default=100,
help='Maximum number of files to process.'
' Default: %(default)d')
parser.add_argument('-i', '--update-interval', metavar='N', type=int, default=100,
help='Update status after processing N files.'
' Default: %(default)d')
parser.add_argument('--doc', dest='fulldoc', default=False,
action='store_true',
help='Print full documentation for this script')
# Avoid argparse error when only --doc is given
if len(sys.argv) == 2 and sys.argv[1] == '--doc':
print(__doc__)
sys.exit(0)
args = parser.parse_args()
#
# Utilities
#
_extract_scans_tables = {
"scans" : [],
"results" : [],
"projects" : [],
"codeflows" : [],
}
_table_output_dtypes = {
"scans" : scan_tables.ScanTablesTypes.scans,
"results" : scan_tables.ScanTablesTypes.results,
"projects" : scan_tables.ScanTablesTypes.projects,
"codeflows" : table_joins.BaseTablesTypes.codeflows,
}
# Accommodate special dtype cases for parsing, to avoid
#
# TypeError: the dtype datetime64 is not supported for parsing, pass this
# column using parse_dates instead
#
_parse_dates = {
"scans" : [],
"results" : [],
"projects" : [],
"codeflows" : [],
}
# Prepare for in-place modification by copying the original module values
_table_input_dtypes = { key: deepcopy(val) for key, val in _table_output_dtypes.items()}
# Replace datetime64 with str and track the affected columns
for tab_name, tab_dtypes in _table_input_dtypes.items():
for col_key, col_dtype in tab_dtypes.items():
# Let pandas parse datetime64 as str, then convert to date
if col_dtype == numpy.dtype('M'):
# Note: pd.StringDtype() here will cause parsing failure later
tab_dtypes[col_key] = str
_parse_dates[tab_name].append(col_key)
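# Illustration (hypothetical column name 'scan_start_date'; the real names
# come from scan_tables.ScanTablesTypes): after this loop,
#   _table_input_dtypes['scans']['scan_start_date'] is str
#   'scan_start_date' in _parse_dates['scans']
# so pd.read_csv reads the column as str and converts it via parse_dates.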
def _all_csv_files_exist(output_dir):
for file_prefix in _extract_scans_tables.keys():
csv_fname = os.path.join(output_dir, file_prefix + ".csv")
if not os.path.exists(csv_fname):
return False
return True
#
# Prepare output directory first; can't really run without it
#
try: os.mkdir(args.aggregate_dir, mode=0o755)
except FileExistsError: pass
#
# Collect sarif file information
#
with open(args.sarif_files, 'r') if args.sarif_files != '-' else sys.stdin as fp:
paths = fp.readlines()
#
# Traverse all possible scantable-containing directories
#
count = -1
for path in paths:
count += 1
    if count >= args.max_files: break
#
# Paths and components
#
path = path.rstrip()
project, component = path.split('/')
#
# Validate input data directory and content
#
output_dir = os.path.join(project, component + ".scantables")
if not os.path.exists(output_dir):
continue
if not _all_csv_files_exist(output_dir):
continue
#
# Append data for every table
#
for file_prefix in _extract_scans_tables.keys():
csv_fname = os.path.join(output_dir, file_prefix + ".csv")
data = pd.read_csv(csv_fname, dtype = _table_input_dtypes[file_prefix],
parse_dates = _parse_dates[file_prefix])
_extract_scans_tables[file_prefix].append(data)
# Some timing information
if count % args.update_interval == 0:
print("{:6} {:6}/{:6}".format("COUNT", count, len(paths)))
print("{:6} {}".format("DATE", datetime.now().isoformat()))
sys.stdout.flush()
#
# Create and write the combined dataframes
#
for file_prefix in _extract_scans_tables.keys():
    combined = (pd.concat(_extract_scans_tables[file_prefix], ignore_index=True, axis='index')
                .astype(_table_output_dtypes[file_prefix]).reset_index(drop=True))
    with open(os.path.join(args.aggregate_dir, file_prefix + ".csv"), 'w') as fh:
        combined.to_csv(fh, index=False, quoting=csv.QUOTE_NONNUMERIC)
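# To read an aggregate back with the same dtype handling, the inverse of
# the write above is (a sketch using the structures defined in this file):
#   pd.read_csv(os.path.join(args.aggregate_dir, "scans.csv"),
#               dtype=_table_input_dtypes["scans"],
#               parse_dates=_parse_dates["scans"])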

94 sarif-combine-tables.py Deleted file

@@ -1,94 +0,0 @@
#!/usr/bin/env python3
#
# Traverse the org/project.scantables/ directories produced by ./sarif-runner.py
# and concatenate the collection of (codeflows.csv results.csv scans.csv
# projects.csv) tables into 4 tables.
#
import os
import sys
from datetime import datetime
import pandas as pd
#
# TODO: Factor out functionality / structures in common with ./sarif-runner.py
#
#
# Parameters
#
max_files = 80000
sarif_file_list = 'sarif-files.txt'
combined_tables_dir = 'sarif-scantables-combined'
#
# Utilities
#
_extract_scans_tables = {
"scans" : [],
"results" : [],
"projects" : [],
"codeflows" : [],
}
def _all_csv_files_exist(output_dir):
for file_prefix in _extract_scans_tables.keys():
csv_fname = os.path.join(output_dir, file_prefix + ".csv")
if not os.path.exists(csv_fname):
return False
return True
#
# Prepare output directory first; can't really run without it
#
try: os.mkdir(combined_tables_dir, mode=0o755)
except FileExistsError: pass
#
# Collect sarif file information
#
paths = open(sarif_file_list, 'r').readlines()
#
# Traverse all possible output directories
#
count = 0
for path in paths:
count += 1
if count > max_files: break
#
# Paths and components
#
path = path.rstrip()
project, sarif_file = path.split('/')
component = sarif_file.removesuffix('.json')
#
# Validate data directory
#
output_dir = os.path.join(project, component + ".scantables")
if not os.path.exists(output_dir):
continue
if not _all_csv_files_exist(output_dir):
continue
#
# Append data for every table
#
for file_prefix in _extract_scans_tables.keys():
csv_fname = os.path.join(output_dir, file_prefix + ".csv")
data = pd.read_csv(csv_fname)
_extract_scans_tables[file_prefix].append(data)
# Some timing information
if count % 100 == 0:
print("{:6} {:6}/{:6}".format("COUNT", count, len(paths)))
print("{:6} {}".format("DATE", datetime.now().isoformat()))
sys.stdout.flush()
#
# Create and write the combined dataframes
#
for file_prefix in _extract_scans_tables.keys():
all = pd.concat(_extract_scans_tables[file_prefix], ignore_index=True, axis='index')
with open(os.path.join(combined_tables_dir, file_prefix + ".csv"), 'w') as fh:
all.to_csv(fh, index=False)


@@ -28,3 +28,14 @@ EOF
2022-02-25/results.sarif
EOF
)
# Aggregate multiple results
( cd ../data/treeio/
cat > test-sas-files <<EOF
2021-12-09/results.sarif
2022-02-25/results.sarif
EOF
sarif-extract-scans-runner test-sas-files
sarif-aggregate-scans -i1 test-sas-files aggregated.scantables
)
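# After the aggregation step, aggregated.scantables/ holds the four
# combined tables written by sarif-aggregate-scans: codeflows.csv,
# projects.csv, results.csv, scans.csv.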