Rework project name format and project id format

This commit is contained in:
Kristen Newbury
2022-11-07 13:56:50 -05:00
parent 4121072088
commit 1caf03f5f0
4 changed files with 42 additions and 14 deletions

View File

@@ -38,6 +38,7 @@ def load(fname):
try:
content = json.load(fp)
except json.decoder.JSONDecodeError as err:
# TODO knewbury error handling
logging.error('Error reading from {}: {}: line {}, column {}'
.format(args.file, err.msg, err.lineno, err.colno))
sys.exit(1)
@@ -61,8 +62,16 @@ sarif_struct = signature.fillsig(args, sarif_struct, context)
#
# Use reference type graph (signature) to traverse sarif and attach values to tables
#
# try:
# tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
# typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)
# except json.decoder.JSONDecodeError as err:
# logging.error('Error reading from {}: {}: line {}, column {}'
# .format(args.file, err.msg, err.lineno, err.colno))
# sys.exit(1)
tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)
#
# Form output tables
#
@@ -125,9 +134,11 @@ bt.rules = tj.joins_for_rules(tgraph)
#
# Form scan tables
#
# joins for projects has to happen first as it backfills the guess about the project_id
scantabs.projects = st.joins_for_projects(bt, external_info, scantabs)
scantabs.results = st.joins_for_results(bt, external_info)
scantabs.scans = st.joins_for_scans(bt, external_info, scantabs)
scantabs.projects = st.joins_for_projects(bt, external_info, scantabs)
#

View File

@@ -80,13 +80,7 @@ import os
import sys
import pickle
from datetime import datetime
from hashlib import blake2b
def hash_unique(item_to_hash, size):
h = blake2b(digest_size = size)
h.update(item_to_hash.encode())
return abs(int.from_bytes(h.digest(), byteorder='big'))
from sarif_cli import hash
#
# Handle arguments
#
@@ -147,12 +141,21 @@ for path in paths:
#
# Scan specification
#
# scan id as hash of sarif file contents
with open(path, 'rb') as f:
data = f.read()
scan_id = hash.hash_unique(data)
scan_spec = {
"project_id": hash_unique(project, 8), # pd.UInt64Dtype()
"scan_id": hash_unique(path, 8), # pd.Int64Dtype()
# assuming sarif file names are like <org>/<repo>
# however this will be replaced down the line with the repoURI if possible
# still, leaving here in case later versions of this tool do not rely on that property being there
# in that case this will be the best guess
"project_id": hash.hash_unique((project+"-"+component).encode()), # pd.UInt64Dtype()
"scan_id": scan_id, # pd.Int64Dtype()
"sarif_file_name": path, # pd.StringDtype()
}
scan_spec_file = os.path.join(project, component + ".scanspec")
with open(scan_spec_file, 'w') as fp:
json.dump(scan_spec, fp)

7
sarif_cli/hash.py Normal file
View File

@@ -0,0 +1,7 @@
from hashlib import blake2b

def hash_unique(item_to_hash, size=8):
    """Return a deterministic non-negative integer hash of a bytes object.

    Parameters
    ----------
    item_to_hash : bytes
        The bytes to hash (callers pass e.g. file contents or an
        encoded project name).
    size : int, optional
        Digest size in bytes; defaults to 8 (a 64-bit value, suitable
        for a pd.UInt64Dtype() column).

    Returns
    -------
    int
        The big-endian integer value of the blake2b digest.  With the
        default signed=False, int.from_bytes is already non-negative,
        so no abs() is needed.
    """
    h = blake2b(digest_size=size)
    h.update(item_to_hash)
    return int.from_bytes(h.digest(), byteorder='big')

View File

@@ -7,6 +7,8 @@ import logging
import numpy
import pandas as pd
import re
import sys
from sarif_cli import hash
class ZeroResults(Exception):
pass
@@ -81,10 +83,16 @@ def joins_for_projects(basetables, external_info, scantables):
# (git|https)://*/org/project.*
# use the org/project part as the project_name.
#
# TODO knewbury error handling for if the signature is slotted out?
repo_url = b.project.repositoryUri[0]
url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/([^/.]+).*', repo_url)
if url_parts:
project_name = f"{url_parts.group(2)}/{url_parts.group(3)}"
project_name = f"{url_parts.group(2)}-{url_parts.group(3)}"
project, component = e.sarif_file_name.rstrip().split('/')
# if the runners guess from the filename was bad, replace with real info
# and continue to use that scanspec to pass that around
if project_name != project+"-"+component:
e.project_id = hash.hash_unique(project_name.encode())
else:
project_name = pd.NA
@@ -131,7 +139,6 @@ def joins_for_scans(basetables, external_info, scantables):
"results_count" : scantables.results.shape[0],
"rules_count" : len(b.rules['id'].unique()),
},index=[0])
# Force all column types to ensure correct writing and type checks on reading.
res1 = res.astype(ScanTablesTypes.scans).reset_index(drop=True)
return res1
@@ -158,7 +165,7 @@ def joins_for_results(basetables, external_info):
res = pd.concat(stack)
else:
if stack == []:
# TODO: The case of zero results should be handled at sarif read time
# TODO knewbury: add error handling here
logging.warning("Zero problem/path_problem results found in sarif "
"file but processing anyway.")
res = tables[0]