Update how project_id is generated

Previously this relied on the assumption that repositoryUri was named like <org>/<project>; it now just uses the full repositoryUri.
2025-12-16 09:13:04 +01:00 · 2023-01-05 16:37:55 -05:00
parent fc2c6bac99
commit 1a915e4de8
4 changed files with 16 additions and 41 deletions
--- a/sarif_cli/scan_tables.py
+++ b/sarif_cli/scan_tables.py
@@ -79,34 +79,18 @@ def joins_for_projects(basetables, external_info):
    """
    b = basetables; e = external_info
   
-    # if the sarif does not have versionControlProvenance, semmle.sourceLanguage ect
-    # there is no reliable way to know the project name 
-    # and will still need to use a guess about the project id
+    # if the sarif does have versionControlProvenance
    if "repositoryUri" in b.project:
-        repo_url = b.project.repositoryUri[0]
-         # For a repository url of the form
-        #   (git|https)://*/org/project.*
-        # use the org/project part as the project_name.
-        # 
-        url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url)
-        if url_parts:
-            project_name = f"{url_parts.group(2)}-{url_parts.group(3)}"
-            project, component = e.sarif_file_name.rstrip().split('/')
-            # if the runners guess from the filename was bad, replace with real info
-            # and continue to use that scanspec to pass that around
-            if project_name != project+"-"+component:
-                e.project_id = hash.hash_unique(project_name.encode())
-        else:
-            project_name = pd.NA
+        repoUri = b.project.repositoryUri[0]
+        e.project_id = hash.hash_unique(repoUri.encode())
    else:
-        repo_url = "unknown"
-        project_name = pd.NA
+        repoUri = "unknown"
    
    res = pd.DataFrame(data={
        "id"                 : e.project_id,
-        "project_name"       : project_name,
+        "project_name"       : repoUri,
        "creation_date"      : pd.Timestamp(0.0, unit='s'), # TODO: external info 
-        "repo_url"           : repo_url, 
+        "repo_url"           : repoUri, 
        "primary_language"   : b.project['semmle.sourceLanguage'][0],
        "languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage']))
    }, index=[0])