Enforce types when forming the scan tables (internal and output formatting)

Force all column types to ensure appropriate formatting for writing. In particular: no character data in place of integers, no floats where integers are expected, and no generic objects in place of strings. Table formation in the functions st.joins_for_results, st.joins_for_scans, and st.joins_for_projects now enforces types.
2025-12-16 09:13:04 +01:00 · 2022-08-07 18:57:14 -07:00
parent 581419afde
commit 560b9ecf35
2 changed files with 71 additions and 6 deletions
--- a/bin/sarif-extract-scans
+++ b/bin/sarif-extract-scans
@@ -104,7 +104,8 @@ external_info = ExternalInfo(
    scan_spec["project_id"],
    scan_spec["scan_id"],
    scan_spec["sarif_file_name"],
-    'deadbeef00',               # TODO: Take ql_query_id from where?
+    # TODO: Take ql_query_id from where? (git commit id of the ql query set)
+    'deadbeef00',               
 )

 # 
--- a/sarif_cli/scan_tables.py
+++ b/sarif_cli/scan_tables.py
@@ -2,9 +2,13 @@

 """
 import pandas as pd
+import numpy
 import re
 from . import snowflake_id

+class ZeroResults(Exception):
+    pass
+
 #
 # Projects table
 # 
@@ -28,13 +32,23 @@ def joins_for_projects(basetables, external_info, scantables):
    res = pd.DataFrame(data={
        "id"                 : e.project_id,
        "project_name"       : project_name,
-        "creation_date"      : pd.NA,    # TODO: external info 
+        "creation_date"      : pd.Timestamp(0.0, unit='s'), # TODO: external info 
        "repo_url"           : repo_url, 
        "primary_language"   : b.project['semmle.sourceLanguage'][0], # TODO: external info
        "languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage']))
    },index=[0])

-    return res
+    # Force all column types to ensure appropriate formatting
+    res1 = res.astype({
+        "id"                 : pd.UInt64Dtype(),
+        "project_name"       : pd.StringDtype(),
+        "creation_date"      : numpy.datetime64(),
+        "repo_url"           : pd.StringDtype(),
+        "primary_language"   : pd.StringDtype(),
+        "languages_analyzed" : pd.StringDtype(),
+    }).reset_index(drop=True)
+
+    return res1

 #
 # Scans table
@@ -66,7 +80,25 @@ def joins_for_scans(basetables, external_info, scantables):
        "results_count"        : scantables.results.shape[0],
        "rules_count"          : len(b.rules['id'].unique()),
    },index=[0])
-    return res
+
+    # Force all column types to ensure correct writing and type checks on reading.
+    res1 = res.astype({
+        "id"                   : pd.UInt64Dtype(),
+        "commit_id"            : pd.StringDtype(),
+        "project_id"           : pd.UInt64Dtype(),
+        "db_create_start"      : numpy.datetime64(),
+        "db_create_stop"       : numpy.datetime64(),
+        "scan_start_date"      : numpy.datetime64(),
+        "scan_stop_date"       : numpy.datetime64(),
+        "tool_name"            : pd.StringDtype(),
+        "tool_version"         : pd.StringDtype(),
+        "tool_query_commit_id" : pd.StringDtype(),
+        "sarif_file_name"      : pd.StringDtype(),
+        "results_count"        : pd.Int64Dtype(),
+        "rules_count"          : pd.Int64Dtype(),
+    }).reset_index(drop=True)
+
+    return res1

 # 
 # Results table
@@ -89,9 +121,42 @@ def joins_for_results(basetables, external_info):
    if len(stack) > 0:
        res = pd.concat(stack)
    else:
+        if stack == []:
+            # Sanity check: The case of zero results must be handled at
+            # sarif read time and should never reach here.
+            raise ZeroResults("Zero problem/path_problem results found in sarif "
+                              "file but processing anyway.  Internal error.")
        res = tables[0]
        
-    return res
+    # Force all column types to ensure appropriate formatting
+    res1 = res.astype({
+        'id'               : pd.UInt64Dtype(),
+        'scan_id'          : pd.UInt64Dtype(),
+        'query_id'         : pd.StringDtype(),
+        
+        'result_type'      : pd.StringDtype(),
+        'codeFlow_id'      : pd.UInt64Dtype(),
+        
+        'message'          : pd.StringDtype(),
+        'message_object'   : numpy.dtype('O'),
+        'location'         : pd.StringDtype(),
+        
+        'source_startLine' : pd.Int64Dtype(),
+        'source_startCol'  : pd.Int64Dtype(),
+        'source_endLine'   : pd.Int64Dtype(),
+        'source_endCol'    : pd.Int64Dtype(),
+        
+        'sink_startLine'   : pd.Int64Dtype(),
+        'sink_startCol'    : pd.Int64Dtype(),
+        'sink_endLine'     : pd.Int64Dtype(),
+        'sink_endCol'      : pd.Int64Dtype(),
+        
+        # TODO Find high-level info from query name or tags?
+        'source_object'    : numpy.dtype('O'),
+        'sink_object'      : numpy.dtype('O'),
+    }).reset_index(drop=True)
+
+    return res1

 def _results_from_kind_problem(basetables, external_info):
    b = basetables; e = external_info
@@ -126,7 +191,6 @@ def _results_from_kind_problem(basetables, external_info):
        })
    # Force column type(s) to avoid floats in output.
    res1 = res.astype({ 'id' : 'uint64', 'scan_id': 'uint64'}).reset_index(drop=True)
-
    return res1