Update how project_id is generated

Previously this relied on the assumption that the
repositoryUri was named like <org>/<project>.
Now the full repositoryUri is used instead.
Kristen Newbury
2023-01-05 16:37:55 -05:00
parent fc2c6bac99
commit 1a915e4de8
4 changed files with 16 additions and 41 deletions
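For quick reference, a minimal sketch of the new derivation (the helper matches hash_unique in this repo; the example repositoryUri is invented):

    from hashlib import blake2b

    def hash_unique(item_to_hash):
        # 8-byte BLAKE2b digest read as an unsigned big-endian integer
        h = blake2b(digest_size=8)
        h.update(item_to_hash)
        return int.from_bytes(h.digest(), byteorder='big')

    # Invented repositoryUri, for illustration only.
    repo_uri = "https://github.com/some-org/some-project.git"

    # Before: parse <org>/<project> out of the URI and hash "org-project".
    # After: hash the full repositoryUri string directly.
    project_id = hash_unique(repo_uri.encode())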


@@ -130,17 +130,14 @@ scantabs = ScanTables()
 @dataclass
 class ExternalInfo:
-    project_id : int
+    project_id: pd.UInt64Dtype()
     scan_id : pd.UInt64Dtype()
     sarif_file_name : str
-    ql_query_id : str
 
 external_info = ExternalInfo(
-    scan_spec["project_id"],
+    pd.NA,
     scan_spec["scan_id"],
-    scan_spec["sarif_file_name"],
-    # TODO: Take ql_query_id from where? (git commit id of the ql query set)
-    'deadbeef00',
+    scan_spec["sarif_file_name"]
 )
 #
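A minimal sketch of the trimmed dataclass as it is now populated (pandas assumed as pd; the scan_id and file name below are invented):

    from dataclasses import dataclass
    import pandas as pd

    @dataclass
    class ExternalInfo:
        project_id: pd.UInt64Dtype()
        scan_id: pd.UInt64Dtype()
        sarif_file_name: str

    # project_id starts out as pd.NA and is filled in later,
    # once the repositoryUri has been read from the sarif itself.
    info = ExternalInfo(pd.NA, 1234567890, "some-org/some-repo.sarif")  # invented values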


@@ -161,7 +161,6 @@ for path in paths:
     # Paths and components
     #
     path = path.rstrip()
-    project, component = path.split('/')
     #
     # Scan specification
     #
@@ -171,30 +170,25 @@ for path in paths:
     scan_id = hash.hash_unique(data)
     scan_spec = {
-        # assuming sarif file names are like <org>/<repo>
-        # however this will be replaced down the line with the repoURI if possible
-        # still, leaving here in case later versions of this tool do not rely on that property being there
-        # in that case this will be the best guess
-        "project_id": hash.hash_unique((project+"-"+component).encode()), # pd.UInt64Dtype()
         "scan_id": scan_id, # pd.Int64Dtype()
         "sarif_file_name": path, # pd.StringDtype()
     }
     #
     # If using outermost output directory, create project directory:
-    # (like <outer_dir>/<project>/*.scantables)
+    # (like <outer_dir>/<repositoryUri>/*.scantables)
     #
-    try: os.mkdir(outer_dir+ project, mode=0o755)
+    try: os.mkdir(outer_dir+ path, mode=0o755)
     except FileExistsError: pass
-    scan_spec_file = os.path.join(outer_dir+ project, component + ".scanspec")
+    scan_spec_file = os.path.join(outer_dir+ path + ".scanspec")
     with open(scan_spec_file, 'w') as fp:
         json.dump(scan_spec, fp)
     #
     # Table output directory
     #
-    output_dir = os.path.join(outer_dir+ project, component + ".scantables")
+    output_dir = os.path.join(outer_dir+ path + ".scantables")
     try: os.mkdir(output_dir, mode=0o755)
     except FileExistsError: pass
     #
@@ -215,8 +209,8 @@ for path in paths:
     with open(args.successful_runs, 'wb') as outfile:
         pickle.dump(successful_runs, outfile)
-    scan_log_file = os.path.join(outer_dir+ project, component + ".scanlog")
-    csv_outfile = os.path.join(outer_dir+ project, component)
+    scan_log_file = os.path.join(outer_dir+ path + ".scanlog")
+    csv_outfile = os.path.join(outer_dir+ path)
     runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, csv_outfile, "-f", args.input_signature],
                               capture_output=True, text=True)
     if runstats.returncode == 0:
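With project and component gone, every per-scan artifact is now keyed by the sarif path itself; a sketch of the resulting layout (outer_dir and path values are invented):

    import os

    outer_dir = "out/"
    path = "some-org/some-repo.sarif"   # one line from the input path list

    scan_spec_file = os.path.join(outer_dir + path + ".scanspec")
    output_dir = os.path.join(outer_dir + path + ".scantables")
    scan_log_file = os.path.join(outer_dir + path + ".scanlog")
    # -> out/some-org/some-repo.sarif.scanspec
    # -> out/some-org/some-repo.sarif.scantables/
    # -> out/some-org/some-repo.sarif.scanlog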


@@ -4,4 +4,4 @@ from hashlib import blake2b
 def hash_unique(item_to_hash):
     h = blake2b(digest_size = 8)
     h.update(item_to_hash)
-    return abs(int.from_bytes(h.digest(), byteorder='big'))
+    return int.from_bytes(h.digest(), byteorder='big')
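Dropping abs() is safe here: int.from_bytes without signed=True is always non-negative, so the old wrapper was a no-op; a quick check (input chosen arbitrarily):

    from hashlib import blake2b

    h = blake2b(digest_size=8)
    h.update(b"example")               # arbitrary input, for illustration
    value = int.from_bytes(h.digest(), byteorder='big')
    assert 0 <= value < 2**64          # always fits pandas' UInt64Dtype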


@@ -79,34 +79,18 @@ def joins_for_projects(basetables, external_info):
     """
     b = basetables; e = external_info
-    # if the sarif does not have versionControlProvenance, semmle.sourceLanguage ect
-    # there is no reliable way to know the project name
-    # and will still need to use a guess about the project id
+    # if the sarif does have versionControlProvenance
     if "repositoryUri" in b.project:
-        repo_url = b.project.repositoryUri[0]
-        # For a repository url of the form
-        # (git|https)://*/org/project.*
-        # use the org/project part as the project_name.
-        #
-        url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url)
-        if url_parts:
-            project_name = f"{url_parts.group(2)}-{url_parts.group(3)}"
-            project, component = e.sarif_file_name.rstrip().split('/')
-            # if the runners guess from the filename was bad, replace with real info
-            # and continue to use that scanspec to pass that around
-            if project_name != project+"-"+component:
-                e.project_id = hash.hash_unique(project_name.encode())
-        else:
-            project_name = pd.NA
+        repoUri = b.project.repositoryUri[0]
+        e.project_id = hash.hash_unique(repoUri.encode())
     else:
-        repo_url = "unknown"
-        project_name = pd.NA
+        repoUri = "unknown"
     res = pd.DataFrame(data={
         "id" : e.project_id,
-        "project_name" : project_name,
+        "project_name" : repoUri,
         "creation_date" : pd.Timestamp(0.0, unit='s'), # TODO: external info
-        "repo_url" : repo_url,
+        "repo_url" : repoUri,
         "primary_language" : b.project['semmle.sourceLanguage'][0],
         "languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage']))
     }, index=[0])
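For context, repositoryUri comes from the sarif run's versionControlProvenance; a hand-written fragment (not from this repo's test data) showing the shape of the lookup:

    # Hand-written SARIF fragment, for illustration only.
    run_fragment = {
        "versionControlProvenance": [
            {"repositoryUri": "https://github.com/some-org/some-project"}
        ]
    }

    repo_uri = run_fragment["versionControlProvenance"][0]["repositoryUri"]
    # then: e.project_id = hash.hash_unique(repo_uri.encode()), as in the hunk above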