mirror of https://github.com/hohn/sarif-cli.git, synced 2025-12-16 17:23:03 +01:00
Add sarif-pad-aggregate to fill scan values
Fills the scans table's db_create_start/stop and scan_start/stop_date columns with realistic random values.
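As a quick sanity check of the padded output, the four timestamp columns should come out in strictly increasing order per row (database build before scan, both with positive duration). A minimal sketch, assuming pandas is installed and the padded tables were written to aggregated.scantables.padded, the output directory used in the test-script change below:

import pandas as pd

date_cols = ["db_create_start", "db_create_stop",
             "scan_start_date", "scan_stop_date"]
scans = pd.read_csv("aggregated.scantables.padded/scans.csv",
                    parse_dates=date_cols)

# The padding leaves at least a few minutes between consecutive timestamps.
assert (scans.db_create_start < scans.db_create_stop).all()
assert (scans.db_create_stop < scans.scan_start_date).all()
assert (scans.scan_start_date < scans.scan_stop_date).all()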
This commit is contained in:
committed by Michael Hohn
parent 2b42a7d306
commit 203343df07
bin/sarif-pad-aggregate: 136 lines (new executable file)
@@ -0,0 +1,136 @@
#!/usr/bin/env python3

from copy import deepcopy
from datetime import datetime
import argparse
import csv
import numpy
import os
import pandas as pd
import random

from sarif_cli import scan_tables
from sarif_cli import table_joins

#
# Handle arguments
#
parser = argparse.ArgumentParser(
    description="Fill the scans table's db_create_start/stop and "
    "scan_start/stop_date columns with realistic random values" )

parser.add_argument('aggregate_dir', metavar='aggregate-dir', type=str,
                    help='Directory containing combined scan tables')

parser.add_argument('output_dir', metavar='output-dir', type=str,
                    help='Directory for writing the combined and padded scan tables')

args = parser.parse_args()

#
# Prepare output directory
#
try: os.mkdir(args.output_dir, mode=0o755)
except FileExistsError: pass

#
# TODO: factor out code in common with ./sarif-aggregate-scans
#

#
# Utilities
#
_extract_scans_tables = {
    "scans" : [],
    "results" : [],
    "projects" : [],
    "codeflows" : [],
}
_table_output_dtypes = {
    "scans" : scan_tables.ScanTablesTypes.scans,
    "results" : scan_tables.ScanTablesTypes.results,
    "projects" : scan_tables.ScanTablesTypes.projects,
    "codeflows" : table_joins.BaseTablesTypes.codeflows,
}

# Accommodate special dtype cases for parsing to avoid
#
# TypeError: the dtype datetime64 is not supported for parsing, pass this
# column using parse_dates instead
#
_parse_dates = {
    "scans" : [],
    "results" : [],
    "projects" : [],
    "codeflows" : [],
}

# Prep for in-place modification, use copies of original module values
_table_input_dtypes = { key: deepcopy(val) for key, val in _table_output_dtypes.items()}

# Replace datetime64 with str and track the affected columns
for tab_name, tab_dtypes in _table_input_dtypes.items():
    for col_key, col_dtype in tab_dtypes.items():
        # Let pandas parse datetime64 as str, then convert to date
        if col_dtype == numpy.dtype('M'):
            # Note: pd.StringDtype() here will cause parsing failure later
            tab_dtypes[col_key] = str
            _parse_dates[tab_name].append(col_key)

def _all_csv_files_exist(output_dir):
    for file_prefix in _extract_scans_tables.keys():
        csv_fname = os.path.join(output_dir, file_prefix + ".csv")
        if not os.path.exists(csv_fname):
            return False
    return True

#
# Read the combined dataframes
#
for file_prefix in _extract_scans_tables.keys():
    csv_fname = os.path.join(args.aggregate_dir, file_prefix + ".csv")
    data = pd.read_csv(csv_fname, dtype = _table_input_dtypes[file_prefix],
                       parse_dates = _parse_dates[file_prefix])
    _extract_scans_tables[file_prefix].append(data)

#
# Pad the dataframes
#
# ---- placeholder dates ----
# - Across scans, these should spread over one year to avoid massive jumps
#   in display.
# - For individual scans, the scan duration should be between a few minutes and
#   several hours
# - db creation times can be between a few minutes and several hours
# - scans follow db creation
#
scans = _extract_scans_tables["scans"][0]
rows = len(scans)
rng = numpy.random.default_rng(seed=7)

def rcol():
    return rng.uniform(0, 1, rows)

def day():
    return numpy.timedelta64(1, 'D')

def minute():
    return numpy.timedelta64(1, 'm')

scans.db_create_start = (numpy.datetime64('today', 's') + 23*59*rcol()*minute() -
                         365 * rcol() * day())
scans.db_create_stop = scans.db_create_start + (5 + 3 * 60 * rcol()) * minute()
scans.scan_start_date = scans.db_create_stop + (1 + 13 * rcol()) * minute()
scans.scan_stop_date = scans.scan_start_date + (5 + 3*60 * rcol()) * minute()

_extract_scans_tables["scans"][0] = scans

#
# Write all dataframes
#
for file_prefix in _extract_scans_tables.keys():
    csv_fname = os.path.join(args.output_dir, file_prefix + ".csv")
    frame = (_extract_scans_tables[file_prefix][0]
             .astype(_table_output_dtypes[file_prefix]))
    with open(csv_fname, 'w') as fh:
        frame.to_csv(fh, index=False, quoting=csv.QUOTE_NONNUMERIC)
||||
@@ -38,4 +38,5 @@ EOF

sarif-extract-scans-runner test-sas-files
sarif-aggregate-scans -i1 test-sas-files aggregated.scantables
sarif-pad-aggregate aggregated.scantables aggregated.scantables.padded
)
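For reference, the placeholder-date arithmetic in bin/sarif-pad-aggregate can be tried standalone; a minimal sketch with three rows, assuming only numpy:

import numpy as np

rng = np.random.default_rng(seed=7)
rows = 3

def rcol():
    # one fresh random factor per scan row, as in the script above
    return rng.uniform(0, 1, rows)

minute = np.timedelta64(1, 'm')
day = np.timedelta64(1, 'D')

# Spread start times over the past year with up to 23*59 minutes of
# within-day jitter, then add a 5-minute to roughly 3-hour database build.
db_create_start = (np.datetime64('today', 's')
                   + 23 * 59 * rcol() * minute
                   - 365 * rcol() * day)
db_create_stop = db_create_start + (5 + 3 * 60 * rcol()) * minute
print(db_create_stop - db_create_start)   # per-row build durations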