mirror of https://github.com/hohn/sarif-cli.git
Fix subtle type problem: M8 is required for early steps, datetime64[ns] later
@@ -50,8 +50,25 @@ parser.add_argument('-f','--input-signature', metavar='input-signature', type=st
                     'Options: LGTM, CLI\n'
                     'If current represented signatures are not sufficient, view signature_single.py for how to support further signatures.'
                     ' Default: "%(default)s"')
 
+parser.add_argument("-d", "--debug", action="store_true",
+                    help="Run inside IPython with --pdb for post-mortem debugging")
+
 args = parser.parse_args()
+
+
+import sys, pdb, traceback
+
+def debug_excepthook(type, value, tb):
+    traceback.print_exception(type, value, tb)
+    print("\nEntering post-mortem debugger...\n")
+    pdb.post_mortem(tb)
+
+# XX:
+if args.debug:
+    sys.excepthook = debug_excepthook
+
+
 if args.input_signature not in ["LGTM","CLI"]:
     print("Unsupported sarif signature requested.")
     print("Use one of [LGTM, CLI].")
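The new =--debug= flag installs a custom =sys.excepthook=, so any uncaught exception drops into pdb's post-mortem debugger at the failing frame. A minimal standalone sketch of the same pattern, outside sarif-cli:

#+BEGIN_SRC python
import sys, pdb, traceback

def debug_excepthook(exc_type, value, tb):
    # Print the usual traceback first, then open pdb at the failing frame.
    traceback.print_exception(exc_type, value, tb)
    print("\nEntering post-mortem debugger...\n")
    pdb.post_mortem(tb)

sys.excepthook = debug_excepthook

# Any uncaught exception now lands at a (Pdb) prompt with the full stack:
1 / 0
#+END_SRC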
@@ -237,10 +237,12 @@ for path_timestamp in paths:
         timestamp_options = ['--with-timestamps']
     else:
         timestamp_options = []
-    runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir,
-                               csv_outfile, "-f", args.input_signature,
-                               *timestamp_options],
+    # XX:
+    runstats = subprocess.run(['sarif-extract-scans', scan_spec_file,
+                               output_dir, csv_outfile, "-f",
+                               args.input_signature],
                               capture_output=True, text=True)
 
     if runstats.returncode == 0:
         print("{:6} {}".format("OK", path))
         if use_successful_runs:
@@ -250,10 +252,25 @@ for path_timestamp in paths:
         # log error
         with open(scan_log_file, 'w') as fp:
             fp.write(runstats.stderr)
-        # report only tail
-        print("{:6} {}".format("", "Error tail: "))
-        for t1 in runstats.stderr.split('\n')[-6:-1]:
-            print("{:6} {}".format("", t1))
+
+        # show command for manual re-run
+        cmd = [
+            "sarif-extract-scans",
+            scan_spec_file,
+            output_dir,
+            csv_outfile,
+            "-f", args.input_signature,
+        ]
+        print("{:6} {}".format("", "Command was:"))
+        print("{:6} {}".format("", " ".join(cmd)))
+
+        # report only tail of stderr
+        print("{:6} {}".format("", "Error tail:"))
+        if runstats.stderr:
+            for line in runstats.stderr.splitlines()[-6:]:
+                print("{:6} {}".format("", line))
 
 
 if use_successful_runs:
     with open(args.successful_runs, 'wb') as outfile:
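The error-reporting rewrite also swaps =runstats.stderr.split('\n')[-6:-1]= for =splitlines()[-6:]=; the old slice assumed a trailing newline and could silently drop the real last line. A quick illustration with hypothetical stderr text:

#+BEGIN_SRC python
stderr = "line1\nline2\nline3"          # no trailing newline

# Old: [-6:-1] assumes a trailing '\n' produced an empty final element;
# without one, the real last line is lost.
print(stderr.split('\n')[-6:-1])        # ['line1', 'line2']

# New: splitlines() never yields a phantom empty element,
# so a plain [-6:] keeps the true last line either way.
print(stderr.splitlines()[-6:])         # ['line1', 'line2', 'line3']
#+END_SRC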
@@ -205,9 +205,9 @@
 #+BEGIN_SRC sh :session shared :results output :eval never-export
 cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection
 sarif-extract-scans \
-    sqlidb-1.1.sarif.scanspec \
-    sqlidb-1.1.sarif.scantables \
-    sqlidb-1.1.sarif.csv \
+    sqlidb-1.sarif.scanspec \
+    sqlidb-1.sarif.scantables \
+    sqlidb-1.sarif.csv \
     -f CLI
 #+END_SRC
 
notes/quickstart.org (new file, 67 lines)
@@ -0,0 +1,67 @@
* sarif-cli quickstart
Set up the virtual environment and install the packages:
#+BEGIN_SRC sh
cd ~/work-gh/sarif-cli/

# set up virtual environment
python3 -m venv .venv
. .venv/bin/activate

# Use requirementsDEV.txt
python -m pip install -r requirementsDEV.txt

# install scripts
pip install -e .

# force symlinks for development
rm -f "$VIRTUAL_ENV/bin/sarif-"*
ln -sf "$PWD/bin/sarif-"* "$VIRTUAL_ENV/bin/"

#+END_SRC

Run SARIF extraction for one test file and inspect the results.
This assumes you are in the above virtual environment, where all =sarif-*= tools
are on =$PATH=.

#+BEGIN_SRC sh
cd ~/work-gh/sarif-cli/data/codeql-dataflow-sql-injection

# ---------------------------------------------------------------------
# 1. Set base name of the original SARIF file (without extension)
# ---------------------------------------------------------------------
orig="sqlidb-1"

# ---------------------------------------------------------------------
# 2. Remove any stale output from previous runs
# ---------------------------------------------------------------------
rm -fR -- "${orig}.1.sarif."*

# ---------------------------------------------------------------------
# 3. Ensure the versionControlProvenance field is present
# ---------------------------------------------------------------------
sarif-insert-vcp "${orig}.sarif" > "${orig}.1.sarif"

# ---------------------------------------------------------------------
# 4. Run the converter (CLI input signature)
#    - Logs are written only if errors occur.
# ---------------------------------------------------------------------
sarif-extract-scans-runner --input-signature CLI - > /dev/null <<EOF
${orig}.1.sarif
EOF

# ---------------------------------------------------------------------
# 5. If errors occurred, show the scan log.
#    The log lists the exact commands that can be re-run manually under pdb.
# ---------------------------------------------------------------------
if [[ -f "${orig}.1.sarif.scanlog" ]]; then
    echo "Conversion errors logged in ${orig}.1.sarif.scanlog"
    cat "${orig}.1.sarif.scanlog"
fi

# ---------------------------------------------------------------------
# 6. Examine results (converted SARIF, logs, etc.)
# ---------------------------------------------------------------------
ls -l "${orig}.1.sarif"*
#+END_SRC

For interactive examination / debugging, see [[file:README.org::*Run using embedded repls][Run using embedded repls]]
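To inspect the per-scan summary from step 4 without leaving Python, the CSV can be loaded directly; a minimal sketch, assuming the quickstart's =${orig}.1.sarif= naming and the column layout shown in notes/update.org below:

#+BEGIN_SRC python
import pandas as pd

# Per-scan summary written by the runner; the name follows the
# quickstart's file layout above (assumed, adjust to your base name).
df = pd.read_csv("sqlidb-1.1.sarif.csv")
print(df.columns.tolist())   # e.g. sarif_file, level, levelcode, message, extra_info
print(df[["sarif_file", "level", "message"]])
#+END_SRC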
notes/update.org (new file, 76 lines)
@@ -0,0 +1,76 @@
* issues <2025-10-18 Sat>
** DONE
CLOSED: [2025-10-18 Sat 22:34]

- State "DONE" from "NEXT" [2025-10-18 Sat 22:34]
#+BEGIN_SRC text
~/work-gh/sarif-cli/data/codeql-dataflow-sql-injection]$
1:$ bat sqlidb-1.sarif.scanspec sqlidb-1.sarif.scantables sqlidb-1.sarif.csv
───────┬──────────────────────────────────────────────────────────────────────
       │ File: sqlidb-1.sarif.scanspec
───────┼──────────────────────────────────────────────────────────────────────
   1   │ {"scan_id": 12314655876769447717, "sarif_file_name": "sqlidb-1.sarif"}
───────┴──────────────────────────────────────────────────────────────────────
[bat error]: 'sqlidb-1.sarif.scantables' is a directory.
───────┬──────────────────────────────────────────────────────────────────────
       │ File: sqlidb-1.sarif.csv
───────┼──────────────────────────────────────────────────────────────────────
   1   │ sarif_file,level,levelcode,message,extra_info
   2   │ sqlidb-1.sarif,WARNING,2,Input sarif is missing neccesary properties.,"Missing: {'newlineSequence
       │ s', 'versionControlProvenance'}, "
───────┴──────────────────────────────────────────────────────────────────────
(.venv-m325) (base) [hohn@m325 ~/work-gh/sarif-cli/data/codeql-dataflow-sql-injection]$
#+END_SRC

The CSV content, unwrapped:

sarif_file,level,levelcode,message,extra_info
sqlidb-1.sarif,WARNING,2,Input sarif is missing neccesary properties.,"Missing:
{'newlineSequences', 'versionControlProvenance'}

see

File: ./bin/sarif-insert-vcp
   2  11    # Add the versionControlProvenance key to a SARIF file
   9   6    | ( .versionControlProvenance |=

File: ./scripts/test-vcp.sh
  21  15    #* Insert versionControlProvenance


- The CLI sarif **MUST** contain one additional property
  `versionControlProvenance`, which needs to look like:
#+BEGIN_SRC json
"versionControlProvenance": [
    {
        "repositoryUri": "https://github.com/testorg/testrepo.git",
        "revisionId": "testsha"
    }
]
#+END_SRC

The script

bin/sarif-insert-vcp
[[file:~/work-gh/sarif-cli/bin/sarif-insert-vcp::uri=vcp-no-uri]]

will add that entry to a SARIF file.
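For illustration, a rough Python equivalent of that insertion; the actual tool is the jq filter in bin/sarif-insert-vcp, and the repositoryUri/revisionId values are the placeholders shown above:

#+BEGIN_SRC python
import json, sys

# Read a SARIF file and add versionControlProvenance to each run that
# lacks it (a sketch; the real tool is jq-based, and SARIF places this
# property at the run level).
with open(sys.argv[1]) as fp:
    sarif = json.load(fp)

for run in sarif.get("runs", []):
    run.setdefault("versionControlProvenance", [
        {"repositoryUri": "https://github.com/testorg/testrepo.git",
         "revisionId": "testsha"}
    ])

json.dump(sarif, sys.stdout, indent=2)
#+END_SRC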
Also,
./sarif_cli/signature.py:308:    # Ensure newlineSequences is present when versionControlProvenance is
./sarif_cli/signature.py:309:    full_elem['newlineSequences'] = elem.get('newlineSequences', dummy_newlineSequences)

So:
- adding versionControlProvenance first will also add newlineSequences later


** TODO sarif-cli type error
#+BEGIN_SRC text
~/work-gh/sarif-cli/data/codeql-dataflow-sql-injection]$
0:$ less sqlidb-1.1.sarif.scanlog

...
  File "/Users/hohn/work-gh/sarif-cli/.venv-m325/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py", line 734, in astype
    raise TypeError(
TypeError: Casting to unit-less dtype 'datetime64' is not supported. Pass e.g. 'datetime64[ns]' instead.

#+END_SRC
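This TODO is the type problem named in the commit message: with pandas unpinned to 2.x, =astype()= no longer accepts the unit-less 'datetime64' dtype (numpy's plain 'M8') that the early table-building steps use. A minimal repro and fix:

#+BEGIN_SRC python
import pandas as pd

# A datetime64[ns] column, as produced by the early table-building steps.
s = pd.Series(pd.to_datetime(["2025-10-18"]))

# pandas >= 2.0 rejects the unit-less dtype:
try:
    s.astype("datetime64")
except TypeError as e:
    print(e)   # Casting to unit-less dtype 'datetime64' is not supported. ...

# Passing the unit explicitly works:
print(s.astype("datetime64[ns]").dtype)   # datetime64[ns]
#+END_SRC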
@@ -33,9 +33,9 @@ nbconvert==6.4.4
 nbformat==5.2.0
 nest-asyncio==1.5.4
 notebook==6.4.10
-numpy==1.22.3
+numpy
 packaging==21.3
-pandas==1.4.1
+pandas
 pandocfilters==1.5.0
 parso==0.8.3
 pexpect==4.8.0
@@ -52,7 +52,7 @@ pyrsistent==0.18.1
 python-dateutil==2.8.2
 pytz==2021.3
 PyYAML==6.0
-pyzmq==22.3.0
+pyzmq
 qtconsole==5.2.2
 QtPy==2.0.1
 Send2Trash==1.8.0
@@ -5,6 +5,7 @@ from . import snowflake_id
 
 import logging
 import numpy
+import numpy as np
 import pandas as pd
 import re
 from sarif_cli import hash
@@ -108,8 +109,10 @@ def joins_for_projects(basetables, external_info):
         "automationDetails" : automationDetails,
     }, index=[0])
 
-    # Force all column types to ensure appropriate formatting
-    res1 = res.astype(ScanTablesTypes.projects).reset_index(drop=True)
+    #
+    # - Now (not before), "creation_date" needs type numpy.dtype('datetime64[ns]')
+    # - Force all column types to ensure appropriate formatting
+    res1 = normalize_dataframe_types(res, ScanTablesTypes.projects)
+    #
     return res1
 
@@ -144,9 +147,33 @@ def joins_for_scans(basetables, external_info, scantables, sarif_type, timestamp
         "rules_count" : len(b.rules['id'].unique()),
     },index=[0])
     # Force all column types to ensure correct writing and type checks on reading.
-    res1 = res.astype(ScanTablesTypes.scans).reset_index(drop=True)
+    res1 = normalize_dataframe_types(res, ScanTablesTypes.scans)
     return res1
 
 
+def normalize_dataframe_types(df: pd.DataFrame, type_map: dict) -> pd.DataFrame:
+    """
+    Normalize dtypes in a DataFrame according to a given type map.
+
+    - Converts any ambiguous datetime64 types (e.g. 'M', '<M8') to 'datetime64[ns]'
+    - Coerces corresponding columns with pd.to_datetime()
+    - Returns a new DataFrame with types enforced and index reset
+    """
+    fixed_types = dict(type_map)  # shallow copy to avoid mutating globals
+
+    for col, dtype in fixed_types.items():
+        dtype_str = str(dtype)
+
+        # Normalize datetime-like dtypes
+        if dtype_str.startswith("datetime64") or dtype_str.startswith("<M8") or dtype_str == "M8":
+            fixed_types[col] = np.dtype("datetime64[ns]")
+            if col in df.columns:
+                df[col] = pd.to_datetime(df[col], errors="coerce", utc=False)
+
+    # Enforce all column types consistently
+    df1 = df.astype(fixed_types).reset_index(drop=True)
+    return df1
+
 #
 # Results table
 #
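A small self-contained sketch of what normalize_dataframe_types does; the type map and column names here are illustrative stand-ins, not the real ScanTablesTypes entries:

#+BEGIN_SRC python
import numpy as np
import pandas as pd

# Toy stand-ins; the real map lives in ScanTablesTypes.
type_map = {"creation_date": "M8", "project_name": "object"}
df = pd.DataFrame({"creation_date": ["2025-10-18"],
                   "project_name": ["sqlidb"]})

# The same normalization step the helper applies:
fixed = dict(type_map)
for col, dtype in fixed.items():
    s = str(dtype)
    if s.startswith("datetime64") or s.startswith("<M8") or s == "M8":
        fixed[col] = np.dtype("datetime64[ns]")
        df[col] = pd.to_datetime(df[col], errors="coerce", utc=False)

out = df.astype(fixed).reset_index(drop=True)
print(out.dtypes)   # creation_date -> datetime64[ns], project_name -> object
#+END_SRC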
@@ -175,28 +202,66 @@ def joins_for_results(basetables, external_info):
     res = tables[0]
 
     # Force all column types to ensure appropriate formatting
-    res1 = res.astype(ScanTablesTypes.results).reset_index(drop=True)
+    res1 = normalize_dataframe_types(res, ScanTablesTypes.results)
     return res1
 
-#id as primary key
+def _lookup_rule_value(basetable, rule_id, column_name, join_tags=False):
+    """
+    Look up a value in basetable.rules by id == rule_id.
+    If join_tags=True, concatenate all tag_text values with '_'.
+    """
+    df = basetable.rules
+
+    # Defensive check: avoid duplicate 'id' rows or missing entries
+    match = df.loc[df["id"] == rule_id, column_name]
+
+    if match.empty:
+        return None
+
+    if join_tags:
+        return match.str.cat(sep="_")
+
+    # For scalar columns, pick first entry safely
+    return match.head(1).item()
+
+# id as primary key
 def _populate_from_rule_table_code_flow_tag_text(basetable, flowtable):
-    val = flowtable.rule_id.values[0]
-    return basetable.rules.query("id == @val")["tag_text"].str.cat(sep='_')
+    return _lookup_rule_value(basetable, flowtable.rule_id.values[0], "tag_text", join_tags=True)
 
-#id as primary key
+# id as primary key
 def _populate_from_rule_table_tag_text(basetable, i):
-    val = basetable.kind_problem.rule_id[i]
-    return basetable.rules.query("id == @val")["tag_text"].str.cat(sep='_')
+    return _lookup_rule_value(basetable, basetable.kind_problem.rule_id[i], "tag_text", join_tags=True)
 
-#id as primary key
+# id as primary key
 def _populate_from_rule_table(column_name, basetable, i):
-    val = basetable.kind_problem.rule_id[i]
-    return basetable.rules.query("id == @val")[column_name].head(1).item()
+    return _lookup_rule_value(basetable, basetable.kind_problem.rule_id[i], column_name)
 
-#id as primary key
+# id as primary key
 def _populate_from_rule_table_code_flow(column_name, basetable, flowtable):
-    val = flowtable.rule_id.values[0]
-    return basetable.rules.query("id == @val")[column_name].head(1).item()
+    return _lookup_rule_value(basetable, flowtable.rule_id.values[0], column_name)
+
+
+# #id as primary key
+# def _populate_from_rule_table_code_flow_tag_text(basetable, flowtable):
+#     val = flowtable.rule_id.values[0]
+#     return basetable.rules.query("id == @val")["tag_text"].str.cat(sep='_')
+
+# #id as primary key
+# def _populate_from_rule_table_tag_text(basetable, i):
+#     val = basetable.kind_problem.rule_id[i]
+#     return basetable.rules.query("id == @val")["tag_text"].str.cat(sep='_')
+
+# #id as primary key
+# def _populate_from_rule_table(column_name, basetable, i):
+#     val = basetable.kind_problem.rule_id[i]
+#     # return basetable.rules.query("id == @val")[column_name].head(1).item()
+#     return basetable.rules.loc[basetable.rules["id"] == val, column_name].head(1).item()
+
+# #id as primary key
+# def _populate_from_rule_table_code_flow(column_name, basetable, flowtable):
+#     val = flowtable.rule_id.values[0]
+#     # return basetable.rules.query("id == @val")[column_name].head(1).item()
+#     return basetable.rules.loc[basetable.rules["id"] == val, column_name].head(1).item()
 
 def _results_from_kind_problem(basetables, external_info):
     b = basetables; e = external_info
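A toy illustration of the consolidated lookup; SimpleNamespace stands in for the real basetable object, the rule ids and columns are made up, and the function body is trimmed from the diff above:

#+BEGIN_SRC python
from types import SimpleNamespace
import pandas as pd

# Toy stand-in for basetable; the real one carries the parsed SARIF tables.
basetable = SimpleNamespace(rules=pd.DataFrame({
    "id":       ["cpp/sql-injection", "cpp/sql-injection"],
    "severity": ["9.8", "9.8"],
    "tag_text": ["security", "external/cwe/cwe-089"],
}))

def _lookup_rule_value(basetable, rule_id, column_name, join_tags=False):
    match = basetable.rules.loc[basetable.rules["id"] == rule_id, column_name]
    if match.empty:
        return None
    if join_tags:
        return match.str.cat(sep="_")
    return match.head(1).item()

print(_lookup_rule_value(basetable, "cpp/sql-injection", "severity"))
# -> '9.8'
print(_lookup_rule_value(basetable, "cpp/sql-injection", "tag_text", join_tags=True))
# -> 'security_external/cwe/cwe-089'
print(_lookup_rule_value(basetable, "missing/rule", "severity"))
# -> None
#+END_SRC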