Add CLI support

enabled by -f flag with CLI value tested on sarif from CodeQL CLIs: 2.6.3, 2.9.4, 2.11.4 MUST contain versionControlProvenance property however
2025-12-16 09:13:04 +01:00 · 2022-12-01 11:37:56 -05:00
parent 009cf12d2c
commit 04a5aae14d
11 changed files with 757 additions and 68 deletions
--- a/sarif_cli/scan_tables.py
+++ b/sarif_cli/scan_tables.py
@@ -73,36 +73,49 @@ class ScanTablesTypes:
 #
 # Projects table
 # 
-def joins_for_projects(basetables, external_info, scantables):
+def joins_for_projects(basetables, external_info):
    """ 
    Form the 'projects' table for the ScanTables dataclass
    """
    b = basetables; e = external_info
-
-    # For a repository url of the form
-    #   (git|https)://*/org/project.*
-    # use the org/project part as the project_name.
-    # 
-    # TODO knewbury error handling for if the signature is slotted out?
-    repo_url = b.project.repositoryUri[0]
-    url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url)
-    if url_parts:
-        project_name = f"{url_parts.group(2)}-{url_parts.group(3)}"
-        project, component = e.sarif_file_name.rstrip().split('/')
-        # if the runners guess from the filename was bad, replace with real info
-        # and continue to use that scanspec to pass that around
-        if project_name != project+"-"+component:
-            e.project_id = hash.hash_unique(project_name.encode())
+   
+    # if the sarif does not have versionControlProvenance, semmle.sourceLanguage ect
+    # there is no reliable way to know the project name 
+    # and will still need to use a guess about the project id
+    if "repositoryUri" in b.project:
+        repo_url = b.project.repositoryUri[0]
+         # For a repository url of the form
+        #   (git|https)://*/org/project.*
+        # use the org/project part as the project_name.
+        # 
+        url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url)
+        if url_parts:
+            project_name = f"{url_parts.group(2)}-{url_parts.group(3)}"
+            project, component = e.sarif_file_name.rstrip().split('/')
+            # if the runners guess from the filename was bad, replace with real info
+            # and continue to use that scanspec to pass that around
+            if project_name != project+"-"+component:
+                e.project_id = hash.hash_unique(project_name.encode())
+        else:
+            project_name = pd.NA
    else:
+        repo_url = "unknown"
        project_name = pd.NA
+
+    if 'semmle.sourceLanguage' in b.project:
+        srcLang = b.project['semmle.sourceLanguage'][0]
+        allLang = ",".join(list(b.project['semmle.sourceLanguage']))
+    else: 
+        srcLang = "unknown"
+        allLang = "unknown"
    
    res = pd.DataFrame(data={
        "id"                 : e.project_id,
        "project_name"       : project_name,
        "creation_date"      : pd.Timestamp(0.0, unit='s'), # TODO: external info 
        "repo_url"           : repo_url, 
-        "primary_language"   : b.project['semmle.sourceLanguage'][0], # TODO: external info
-        "languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage']))
+        "primary_language"   : srcLang, # TODO: external info if CLI sarif
+        "languages_analyzed" : allLang  # TODO: external info if CLI sarif
    }, index=[0])

    # Force all column types to ensure appropriate formatting
@@ -112,7 +125,7 @@ def joins_for_projects(basetables, external_info, scantables):
 #
 # Scans table
 # 
-def joins_for_scans(basetables, external_info, scantables):
+def joins_for_scans(basetables, external_info, scantables, sarif_type):
    """ 
    Form the `scans` table for the ScanTables dataclass
    """
@@ -122,9 +135,14 @@ def joins_for_scans(basetables, external_info, scantables):
    driver_version = b.project.driver_version.unique()
    assert len(driver_version) == 1, \
        "More than one driver version found for single sarif file."
+    # TODO if commit id exists in external info for CLI gen'd sarif, add?
+    if sarif_type == "LGTM":
+        commit_id = b.project.revisionId[0]
+    else:
+        commit_id = "unknown"
    res = pd.DataFrame(data={
        "id"                   : e.scan_id,
-        "commit_id"            : b.project.revisionId[0],
+        "commit_id"            : commit_id,
        "project_id"           : e.project_id,
        # TODO extract real date information from somewhere external
        "db_create_start"      : pd.Timestamp(0.0, unit='s'),
@@ -159,7 +177,7 @@ def joins_for_results(basetables, external_info):
    tables = [_results_from_kind_problem(basetables, external_info),
              _results_from_kind_pathproblem(basetables, external_info)]
    stack = [table for table in tables if len(table) > 0]
-
+    
    # Concatenation fails without at least one table, so avoid that.
    if len(stack) > 0:
        res = pd.concat(stack)
@@ -207,7 +225,6 @@ def _results_from_kind_problem(basetables, external_info):
            'query_precision'  :  [_populate_from_rule_table("precision", b, i) for i in range(len(b.kind_problem))],
            'query_severity'   :  [_populate_from_rule_table("problem.severity", b, i) for i in range(len(b.kind_problem))],
            'query_tags'   : [_populate_from_rule_table_tag_text(b, i) for i in range(len(b.kind_problem))],
-
            'codeFlow_id' : 0,      # link to codeflows (kind_pathproblem only, NULL here)
            
            'message': b.kind_problem.message_text,
@@ -249,6 +266,7 @@ def _results_from_kind_pathproblem(basetables, external_info):
    # The `result` table has no entry to distinguish these, so we use a simplified
    # version of `kind_pathproblem`.

+
    reduced_kind_pathp = b.kind_pathproblem.drop(
        columns=[
            'relatedLocation_array_index',
@@ -295,7 +313,6 @@ def _results_from_kind_pathproblem(basetables, external_info):
                    'query_precision'  : _populate_from_rule_table_code_flow("precision", b, cfid0ppt0),
                    'query_severity'   : _populate_from_rule_table_code_flow("problem.severity", b, cfid0ppt0),
                    'query_tags'   : _populate_from_rule_table_code_flow_tag_text(b, cfid0ppt0),
-
                    'codeFlow_id' : cfid0,
                    # 
                    'message': cfid0ppt0.message_text.values[0],