diff --git a/bin/sarif-extract-scans b/bin/sarif-extract-scans index 2b7e371..adf58b9 100755 --- a/bin/sarif-extract-scans +++ b/bin/sarif-extract-scans @@ -34,6 +34,17 @@ parser.add_argument('outdir', metavar='output-dir', type=str, help='output direc parser.add_argument('csvout', metavar='csv-outfile', type=str, help='processing status csv output file name to use') parser.add_argument('-r', '--write-raw-tables', action="store_true", help='Write the raw sarif tables to the output directory') + +parser.add_argument('-t', '--with-timestamps', action='store_true', + help='Read name of files containing timestamp information ' + 'from the scan-spec.json file. ' + 'The file format changes from ' + 'e.g., ' + '{"scan_id": 15092319597255524458, "sarif_file_name": "sqlidb-0.1.sarif"} ' + 'to ' + '{"scan_id": 15092319597255524458, "sarif_file_name": "sqlidb-0.1.sarif", timestamp_file_name: "sqlidb-0.1.timestamps"}' + ) + parser.add_argument('-f','--input-signature', metavar='input-signature', type=str, default="CLI", help='Signature of the sarif, as in, where it was generated it may affect the signature.\n' 'Options: LGTM, CLI\n' @@ -64,6 +75,22 @@ def load(fname): scan_spec = load(args.file) sarif_struct = load(scan_spec['sarif_file_name']) +if args.with_timestamps: + t1 = load(scan_spec['timestamp_file_name']) + # TODO Remove this kludge for wrong keywords. + timestamps = { + **t1, + "scan_start_date" : t1["scan_start"], + "scan_stop_date" : t1["scan_stop"], + } +else: + timestamps = { + "db_create_start" : pd.Timestamp(0.0, unit='s'), + "db_create_stop" : pd.Timestamp(0.0, unit='s'), + "scan_start_date" : pd.Timestamp(0.0, unit='s'), + "scan_stop_date" : pd.Timestamp(0.0, unit='s'), + } + status_writer.setup_status_filenames(scan_spec['sarif_file_name']) # @@ -189,7 +216,9 @@ scantabs.columns_to_reindex = { # joins for projects has to happen first as it backfills the guess about the project_id scantabs.projects = st.joins_for_projects(bt, external_info) scantabs.results = st.joins_for_results(bt, external_info) -scantabs.scans = st.joins_for_scans(bt, external_info, scantabs, args.input_signature) +scantabs.scans = \ + st.joins_for_scans(bt, external_info, scantabs, + args.input_signature, timestamps) # # Replace the remaining internal ids with snowflake ids diff --git a/bin/sarif-extract-scans-runner b/bin/sarif-extract-scans-runner index b323bea..04e3965 100755 --- a/bin/sarif-extract-scans-runner +++ b/bin/sarif-extract-scans-runner @@ -110,6 +110,16 @@ parser.add_argument('-s', '--successful-runs', metavar='filename', type=str, 'new/failed entries from sarif-files.' ' Default: "%(default)s"') +parser.add_argument('-t', '--with-timestamps', action='store_true', + help='Read names of files containing timestamp information ' + 'following the name of the sarif source file.' + 'E.g., ' + 'sarif-extract-scans-runner --with-timestamps - << EOF ' + 'foo.sarif,timestamps.json ' + 'EOF ' + 'Note: spaces are NOT stripped, so foo.sarif,timestamps.json ' + 'and foo.sarif, timestamps.json are different.' + ) parser.add_argument('--doc', dest='fulldoc', default=False, action='store_true', @@ -154,7 +164,13 @@ if use_successful_runs: successful_runs = set() count = -1 -for path in paths: +for path_timestamp in paths: + if args.with_timestamps: + path, t1 = path_timestamp.split(',') + timestamp_fname = t1.strip() + else: + path = path_timestamp + count += 1 if count > args.max_files: break # @@ -169,11 +185,17 @@ for path in paths: data = f.read() scan_id = hash.hash_unique(data) - scan_spec = { - "scan_id": scan_id, # pd.Int64Dtype() - "sarif_file_name": path, # pd.StringDtype() - } - + if args.with_timestamps: + scan_spec = { + "scan_id": scan_id, # pd.Int64Dtype() + "sarif_file_name": path, # pd.StringDtype() + "timestamp_file_name": timestamp_fname + } + else: + scan_spec = { + "scan_id": scan_id, # pd.Int64Dtype() + "sarif_file_name": path, # pd.StringDtype() + } # # If using outermost output directory, create project directory: # (like //*.scantables) @@ -211,7 +233,13 @@ for path in paths: scan_log_file = os.path.join(outer_dir+ path + ".scanlog") csv_outfile = os.path.join(outer_dir+ path) - runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, csv_outfile, "-f", args.input_signature], + if args.with_timestamps: + timestamp_options = ['--with-timestamps'] + else: + timestamp_options = [] + runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, + csv_outfile, "-f", args.input_signature, + *timestamp_options], capture_output=True, text=True) if runstats.returncode == 0: print("{:6} {}".format("OK", path)) diff --git a/sarif_cli/scan_tables.py b/sarif_cli/scan_tables.py index 852101b..4566424 100644 --- a/sarif_cli/scan_tables.py +++ b/sarif_cli/scan_tables.py @@ -116,7 +116,7 @@ def joins_for_projects(basetables, external_info): # # Scans table # -def joins_for_scans(basetables, external_info, scantables, sarif_type): +def joins_for_scans(basetables, external_info, scantables, sarif_type, timestamps : dict): """ Form the `scans` table for the ScanTables dataclass """ @@ -135,12 +135,7 @@ def joins_for_scans(basetables, external_info, scantables, sarif_type): "id" : e.scan_id, "commit_id" : commit_id, "project_id" : e.project_id, - # TODO extract real date information from somewhere external - "db_create_start" : pd.Timestamp(0.0, unit='s'), - "db_create_stop" : pd.Timestamp(0.0, unit='s'), - "scan_start_date" : pd.Timestamp(0.0, unit='s'), - "scan_stop_date" : pd.Timestamp(0.0, unit='s'), - # + **timestamps, "tool_name" : driver_name[0], "tool_version" : driver_version[0], "tool_query_commit_id" : pd.NA, diff --git a/scripts/test-timestamps.sh b/scripts/test-timestamps.sh new file mode 100644 index 0000000..19a4fba --- /dev/null +++ b/scripts/test-timestamps.sh @@ -0,0 +1,86 @@ +#!/bin/bash +#* Setup +cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection +ls -la sqlidb-0.sarif sqlidb-1.sarif + +# +source ~/local/sarif-cli/.venv/bin/activate + +#* Utility functions +function get-csv() { + #* Insert versionControlProvenance + sarif-insert-vcp $1.sarif > $1.1.sarif + + #* Populate CSV with provided timestamps + cat > $1.timestamp << EOF +{ + "db_create_start": "2023-07-03T00:56:15.576222", + "db_create_stop": "2023-07-03T00:56:42.781839", + "scan_start": "2023-07-03T00:56:47.546696", + "scan_stop": "2023-07-03T00:57:55.988059" +} +EOF + + sarif-extract-scans-runner --input-signature CLI --with-timestamps - < $1.1.sarif + + #* Get CSV with dummy timestamps + sarif-extract-scans-runner --input-signature CLI - <