diff --git a/bin/sarif-pad-aggregate b/bin/sarif-pad-aggregate
new file mode 100755
index 0000000..b9c9796
--- /dev/null
+++ b/bin/sarif-pad-aggregate
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+
+from copy import deepcopy
+from datetime import datetime
+import argparse
+import csv
+import numpy
+import os
+import pandas as pd
+import random
+
+from sarif_cli import scan_tables
+from sarif_cli import table_joins
+
+#
+# Handle arguments
+#
+parser = argparse.ArgumentParser(
+    description="Fill the scans table's db_create_start/stop and "
+    "scan_start/stop_date columns with realistic random values" )
+
+parser.add_argument('aggregate_dir', metavar='aggregate-dir', type=str,
+                    help='Directory containing combined scan tables')
+
+parser.add_argument('output_dir', metavar='output-dir', type=str,
+                    help='Directory for writing the combined and padded scan tables')
+
+args = parser.parse_args()
+
+#
+# Prepare output directory
+#
+try: os.mkdir(args.output_dir, mode=0o755)
+except FileExistsError: pass
+
+#
+# TODO: factor out code in common with ./sarif-aggregate-scans
+#
+
+#
+# Utilities
+#
+_extract_scans_tables = {
+    "scans" : [],
+    "results" : [],
+    "projects" : [],
+    "codeflows" : [],
+}
+_table_output_dtypes = {
+    "scans" : scan_tables.ScanTablesTypes.scans,
+    "results" : scan_tables.ScanTablesTypes.results,
+    "projects" : scan_tables.ScanTablesTypes.projects,
+    "codeflows" : table_joins.BaseTablesTypes.codeflows,
+}
+
+# Accommodate special dtype cases for parsing to avoid
+#
+#     TypeError: the dtype datetime64 is not supported for parsing, pass this
+#     column using parse_dates instead
+#
+_parse_dates = {
+    "scans" : [],
+    "results" : [],
+    "projects" : [],
+    "codeflows" : [],
+}
+
+# Prep for in-place modification, use copies of original module values
+_table_input_dtypes = { key: deepcopy(val) for key, val in _table_output_dtypes.items()}
+
+# Replace datetime64 with str and track the affected columns
+for tab_name, tab_dtypes in _table_input_dtypes.items():
+    for col_key, col_dtype in tab_dtypes.items():
+        # Let pandas parse datetime64 as str, then convert to date
+        if col_dtype == numpy.dtype('M'):
+            # Note: pd.StringDtype() here will cause parsing failure later
+            tab_dtypes[col_key] = str
+            _parse_dates[tab_name].append(col_key)
+
+def _all_csv_files_exist(output_dir):
+    for file_prefix in _extract_scans_tables.keys():
+        csv_fname = os.path.join(output_dir, file_prefix + ".csv")
+        if not os.path.exists(csv_fname):
+            return False
+    return True
+
+#
+# Read the combined dataframes
+#
+for file_prefix in _extract_scans_tables.keys():
+    csv_fname = os.path.join(args.aggregate_dir, file_prefix + ".csv")
+    data = pd.read_csv(csv_fname, dtype = _table_input_dtypes[file_prefix],
+                       parse_dates = _parse_dates[file_prefix])
+    _extract_scans_tables[file_prefix].append(data)
+
+#
+# Pad the dataframes
+#
+# ---- placeholder dates ----
+# - Across scans, these should spread over one year to avoid massive jumps
+#   in display.
+# - For individual scans, the scan duration should be between a few minutes and
+#   several hours
+# - db creation times can be between a few minutes and several hours
+# - scans follow db creation
+#
+scans = _extract_scans_tables["scans"][0]
+rows = len(scans)
+rng = numpy.random.default_rng(seed=7)
+
+def rcol():
+    return rng.uniform(0, 1, rows)
+
+def day():
+    return numpy.timedelta64(1, 'D')
+
+def minute():
+    return numpy.timedelta64(1, 'm')
+
+scans.db_create_start = (numpy.datetime64('today', 's') + 23*59*rcol()*minute() -
+                         365 * rcol() * day())
+scans.db_create_stop = scans.db_create_start + (5 + 3 * 60 * rcol()) * minute()
+scans.scan_start_date = scans.db_create_stop + (1 + 13 * rcol()) * minute()
+scans.scan_stop_date = scans.scan_start_date + (5 + 3*60 * rcol()) * minute()
+
+_extract_scans_tables["scans"][0] = scans
+
+#
+# Write all dataframes
+#
+for file_prefix in _extract_scans_tables.keys():
+    csv_fname = os.path.join(args.output_dir, file_prefix + ".csv")
+    frame = (_extract_scans_tables[file_prefix][0]
+             .astype(_table_output_dtypes[file_prefix]))
+    with open(csv_fname, 'w') as fh:
+        frame.to_csv(fh, index=False, quoting=csv.QUOTE_NONNUMERIC)
diff --git a/scripts/table-tests.sh b/scripts/table-tests.sh
index b842b53..2d64d31 100644
--- a/scripts/table-tests.sh
+++ b/scripts/table-tests.sh
@@ -38,4 +38,5 @@
 EOF
 sarif-extract-scans-runner test-sas-files
 sarif-aggregate-scans -i1 test-sas-files aggregated.scantables
+sarif-pad-aggregate aggregated.scantables aggregated.scantables.padded
 )