Add support for external timestamps

This allows external files containing timestamps to provide the values of

    timestamps = {
        "db_create_start" : pd.Timestamp(0.0, unit='s'),
        "db_create_stop"  : pd.Timestamp(0.0, unit='s'),
        "scan_start_date" : pd.Timestamp(0.0, unit='s'),
        "scan_stop_date"  : pd.Timestamp(0.0, unit='s'),
    }

instead of the above defaults.

This patch changes the top-level scripts

    bin/sarif-extract-scans
    bin/sarif-extract-scans-runner

and provides scripts/test-timestamps.sh for verification.

The following keys are also accepted:

    { "db_create_start": ..., "db_create_stop": ..., "scan_start": ..., "scan_stop": ... }
2025-12-15 17:03:04 +01:00 · 2023-08-18 17:00:11 -07:00
parent 57710bdd14
commit ee11214aee
4 changed files with 153 additions and 15 deletions
--- a/bin/sarif-extract-scans
+++ b/bin/sarif-extract-scans
@@ -34,6 +34,17 @@ parser.add_argument('outdir', metavar='output-dir', type=str, help='output direc
 parser.add_argument('csvout', metavar='csv-outfile', type=str, help='processing status csv output file name to use')
 parser.add_argument('-r', '--write-raw-tables', action="store_true",
                    help='Write the raw sarif tables to the output directory')
+
+# -t: the scan-spec JSON must then also carry a "timestamp_file_name" entry.
+parser.add_argument('-t', '--with-timestamps', action='store_true',
+                    help='Read name of files containing timestamp information '
+                    'from the scan-spec.json file.  '
+                    'The file format changes from e.g., '
+                    '{"scan_id": 15092319597255524458, "sarif_file_name": "sqlidb-0.1.sarif"} '
+                    'to '
+                    '{"scan_id": 15092319597255524458, "sarif_file_name": "sqlidb-0.1.sarif", "timestamp_file_name": "sqlidb-0.1.timestamps"}'
+                    )
+
 parser.add_argument('-f','--input-signature', metavar='input-signature', type=str, default="CLI", 
                    help='Signature of the sarif, as in, where it was generated it may affect the signature.\n'
                    'Options: LGTM, CLI\n'
@@ -64,6 +75,22 @@ def load(fname):

 scan_spec = load(args.file)
 sarif_struct = load(scan_spec['sarif_file_name'])
+if args.with_timestamps:
+    t1 = load(scan_spec['timestamp_file_name'])
+    # TODO Remove this kludge for wrong keywords.
+    timestamps = {
+        **t1,
+        "scan_start_date" : t1["scan_start"],
+        "scan_stop_date"  : t1["scan_stop"],
+    }
+else:
+    timestamps = {
+        "db_create_start"      : pd.Timestamp(0.0, unit='s'),
+        "db_create_stop"       : pd.Timestamp(0.0, unit='s'),
+        "scan_start_date"      : pd.Timestamp(0.0, unit='s'),
+        "scan_stop_date"       : pd.Timestamp(0.0, unit='s'),
+    }
+    
 status_writer.setup_status_filenames(scan_spec['sarif_file_name'])

 #
@@ -189,7 +216,9 @@ scantabs.columns_to_reindex = {
 # joins for projects has to happen first as it backfills the guess about the project_id
 scantabs.projects = st.joins_for_projects(bt, external_info)
 scantabs.results = st.joins_for_results(bt, external_info)
-scantabs.scans = st.joins_for_scans(bt, external_info, scantabs, args.input_signature)
+scantabs.scans = \
+    st.joins_for_scans(bt, external_info, scantabs,
+                       args.input_signature, timestamps)

 #
 # Replace the remaining internal ids with snowflake ids
--- a/bin/sarif-extract-scans-runner
+++ b/bin/sarif-extract-scans-runner
@@ -110,6 +110,16 @@ parser.add_argument('-s', '--successful-runs', metavar='filename', type=str,
                    'new/failed entries from sarif-files.'
                    '  Default: "%(default)s"')

+# -t: each input line is "<sarif-file>,<timestamp-file>" rather than a bare sarif path.
+parser.add_argument('-t', '--with-timestamps', action='store_true',
+                    help='Read names of files containing timestamp information '
+                    'following the name of the sarif source file.  '
+                    'E.g., sarif-extract-scans-runner --with-timestamps - << EOF '
+                    'foo.sarif,timestamps.json '
+                    'EOF '
+                    'Note: spaces are NOT stripped, so foo.sarif,timestamps.json '
+                    'and foo.sarif, timestamps.json are different.'
+                    )

 parser.add_argument('--doc', dest='fulldoc', default=False,
                    action='store_true', 
@@ -154,7 +164,13 @@ if use_successful_runs:
        successful_runs = set()

 count = -1
-for path in paths:
+for path_timestamp in paths:
+    if args.with_timestamps:
+        path, t1 = path_timestamp.split(',')
+        timestamp_fname = t1.strip()
+    else:
+        path = path_timestamp
+
    count += 1
    if count > args.max_files: break
    # 
@@ -169,11 +185,17 @@ for path in paths:
        data = f.read()
        scan_id = hash.hash_unique(data)

-    scan_spec = {
-        "scan_id": scan_id,                        # pd.Int64Dtype()
-        "sarif_file_name": path,                   # pd.StringDtype()
-    }
-    
+    if args.with_timestamps:
+        scan_spec = {
+            "scan_id": scan_id,                        # pd.Int64Dtype()
+            "sarif_file_name": path,                   # pd.StringDtype()
+            "timestamp_file_name": timestamp_fname
+        }
+    else:
+        scan_spec = {
+            "scan_id": scan_id,                        # pd.Int64Dtype()
+            "sarif_file_name": path,                   # pd.StringDtype()
+        }
    # 
    # If using outermost output directory, create project directory:
    # (like <outer_dir>/<repositoryUri>/*.scantables)
@@ -211,7 +233,13 @@ for path in paths:

    scan_log_file = os.path.join(outer_dir+ path + ".scanlog")
    csv_outfile = os.path.join(outer_dir+ path)
-    runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, csv_outfile, "-f", args.input_signature],
+    if args.with_timestamps:
+        timestamp_options = ['--with-timestamps']
+    else:
+        timestamp_options = []
+    runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir,
+                               csv_outfile, "-f", args.input_signature,
+                               *timestamp_options],
                              capture_output=True, text=True)
    if runstats.returncode == 0:
        print("{:6} {}".format("OK", path))
--- a/sarif_cli/scan_tables.py
+++ b/sarif_cli/scan_tables.py
@@ -116,7 +116,7 @@ def joins_for_projects(basetables, external_info):
 #
 # Scans table
 # 
-def joins_for_scans(basetables, external_info, scantables, sarif_type):
+def joins_for_scans(basetables, external_info, scantables, sarif_type, timestamps : dict):
    """ 
    Form the `scans` table for the ScanTables dataclass
    """
@@ -135,12 +135,7 @@ def joins_for_scans(basetables, external_info, scantables, sarif_type):
        "id"                   : e.scan_id,
        "commit_id"            : commit_id,
        "project_id"           : e.project_id,
-        # TODO extract real date information from somewhere external
-        "db_create_start"      : pd.Timestamp(0.0, unit='s'),
-        "db_create_stop"       : pd.Timestamp(0.0, unit='s'),
-        "scan_start_date"      : pd.Timestamp(0.0, unit='s'),
-        "scan_stop_date"       : pd.Timestamp(0.0, unit='s'),
-        # 
+        **timestamps, 
        "tool_name"            : driver_name[0],
        "tool_version"         : driver_version[0],
        "tool_query_commit_id" : pd.NA,
--- a/scripts/test-timestamps.sh
+++ b/scripts/test-timestamps.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+#* Setup: work in the data directory; abort if it is missing
+cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection || exit 1
+ls -la sqlidb-0.sarif sqlidb-1.sarif
+
+# Activate the project virtualenv (presumably puts the sarif-* tools on PATH)
+source ~/local/sarif-cli/.venv/bin/activate
+
+#* Utility functions
+function get-csv() {        # get-csv BASENAME: run the extractor on BASENAME.sarif --with-timestamps
+    #* Insert versionControlProvenance
+    sarif-insert-vcp "$1.sarif" > "$1.1.sarif"
+
+    #* Populate CSV with provided timestamps
+    cat > "$1.timestamp" << EOF
+{
+    "db_create_start": "2023-07-03T00:56:15.576222",
+    "db_create_stop": "2023-07-03T00:56:42.781839",
+    "scan_start": "2023-07-03T00:56:47.546696",
+    "scan_stop": "2023-07-03T00:57:55.988059"
+}
+EOF
+
+    sarif-extract-scans-runner --input-signature CLI --with-timestamps - <<EOF
+$1.1.sarif,$1.timestamp
+EOF
+
+    #* List CSV messages
+    cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection || return 1
+    head -4 "$1.1.sarif.csv"
+
+    #* List CSV output
+    ls -la "$1".1*
+    find "$1.1.sarif.scantables" -print
+    csvcut -c "db_create_start,db_create_stop,scan_start_date,scan_stop_date" \
+           "$1.1.sarif.scantables/scans.csv"
+
+    # #* show log
+    # echo "run log:"
+    # cat $1.1.sarif.scanlog
+}
+
+function get-csv-no-ts() {  # get-csv-no-ts BASENAME: same run, without a timestamp file
+    #* Insert versionControlProvenance
+    sarif-insert-vcp "$1.sarif" > "$1.1.sarif"
+
+    #* Get CSV with dummy timestamps
+    sarif-extract-scans-runner --input-signature CLI - <<EOF
+$1.1.sarif
+EOF
+
+    #* List CSV messages
+    cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection || return 1
+    head -4 "$1.1.sarif.csv"
+
+    #* List CSV output
+    ls -la "$1".1*
+    find "$1.1.sarif.scantables" -print
+    csvcut -c "db_create_start,db_create_stop,scan_start_date,scan_stop_date" \
+           "$1.1.sarif.scantables/scans.csv"
+}
+
+clean-csv () {              # clean-csv BASENAME: remove earlier outputs; never rm outside the data dir
+    cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection || return 1
+    rm -f "$1.1.sarif.csv"
+    rm -f "$1".1*scan{log,spec}
+    rm -fR "$1.1.sarif.scantables"
+}
+
+#* Clean up and run tool
+cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection || exit 1
+clean-csv sqlidb-0
+get-csv sqlidb-0
+
+clean-csv sqlidb-1
+get-csv-no-ts sqlidb-1
+
+#* Look for the timestamp value
+function check-timestamp() {
+    ag -C1 "00:56:15.57622|1970-01-01" ${1}   # ${1} left unquoted: callers pass a glob to expand here
+}
+# With custom stamp:
+check-timestamp 'sqlidb-0.1*/scans.csv'
+# With default stamp:
+check-timestamp 'sqlidb-1.1*/scans.csv'
+#