Rework project and scan id generation

Goals:
- Deterministic IDs across multiple scan runs over the same SARIF file.
- No collisions between SARIF files produced by different scan runs, regardless of whether they belong to the same project.

Assumption: SARIF file names follow the <project>/<unique_filename_per_analysis> format.
This commit is contained in:
Kristen Newbury
2022-10-26 12:00:38 -04:00
parent c51dbba577
commit 4121072088

View File

@@ -80,7 +80,12 @@ import os
import sys
import pickle
from datetime import datetime
from sarif_cli import snowflake_id
from hashlib import blake2b
def hash_unique(item_to_hash, size):
    """Return a deterministic, non-negative integer hash of *item_to_hash*.

    Uses BLAKE2b with a *size*-byte digest, so the result fits in
    size*8 bits (e.g. size=8 yields a value in [0, 2**64), matching
    the pd.UInt64Dtype() columns this script writes).

    Parameters
    ----------
    item_to_hash : str
        The string to hash (e.g. a project name or SARIF file path).
    size : int
        Digest size in bytes (1..64 for BLAKE2b).

    Returns
    -------
    int
        Non-negative integer decoding of the digest.

    Note: int.from_bytes with the default signed=False can never be
    negative, so the previous abs() wrapper was redundant and has been
    removed — the returned values are byte-identical.
    NOTE(review): with size=8 the value may exceed the Int64 range; the
    "scan_id" column is commented as pd.Int64Dtype() — confirm downstream
    handles the full unsigned 64-bit range.
    """
    h = blake2b(digest_size=size)
    h.update(item_to_hash.encode())
    return int.from_bytes(h.digest(), byteorder='big')
#
# Handle arguments
@@ -130,9 +135,6 @@ if use_successful_runs:
else:
successful_runs = set()
# Scan id guaranteed unique - do not rely on external info
flakegen1 = snowflake_id.Snowflake(0)
count = -1
for path in paths:
count += 1
@@ -146,13 +148,15 @@ for path in paths:
# Scan specification
#
scan_spec = {
"project_id": abs(hash(project + component)), # pd.UInt64Dtype()
"scan_id": flakegen1.next(), # pd.Int64Dtype()
"sarif_file_name": path, # pd.StringDtype()
"project_id": hash_unique(project, 8), # pd.UInt64Dtype()
"scan_id": hash_unique(path, 8), # pd.Int64Dtype()
"sarif_file_name": path, # pd.StringDtype()
}
scan_spec_file = os.path.join(project, component + ".scanspec")
with open(scan_spec_file, 'w') as fp:
json.dump(scan_spec, fp)
#
# Table output directory
#