From 04a5aae14de19f21ea0f097356f9cbfa2b4ef651 Mon Sep 17 00:00:00 2001
From: Kristen Newbury
Date: Thu, 1 Dec 2022 11:37:56 -0500
Subject: [PATCH 1/8] Add CLI support enabled by -f flag with CLI value

Tested on sarif from CodeQL CLIs: 2.6.3, 2.9.4, 2.11.4.
Note: the CLI sarif MUST contain the versionControlProvenance property.
---
 bin/sarif-extract-multi           |   2 +-
 bin/sarif-extract-scans           |  81 ++++--
 bin/sarif-extract-scans-runner    |  16 +-
 bin/sarif-extract-tables          |   6 +-
 sarif_cli/scan_tables.py          |  63 ++--
 sarif_cli/signature.py            |   2 +-
 sarif_cli/signature_single.py     |   7 +-
 sarif_cli/signature_single_CLI.py | 161 +++++++++++
 sarif_cli/table_joins.py          |   3 +-
 sarif_cli/table_joins_CLI.py      | 462 ++++++++++++++++++++++++++++++
 sarif_cli/typegraph.py            |  22 +-
 11 files changed, 757 insertions(+), 68 deletions(-)
 create mode 100644 sarif_cli/signature_single_CLI.py
 create mode 100644 sarif_cli/table_joins_CLI.py

diff --git a/bin/sarif-extract-multi b/bin/sarif-extract-multi
index 66f40ac..c5f5655 100755
--- a/bin/sarif-extract-multi
+++ b/bin/sarif-extract-multi
@@ -81,7 +81,7 @@ bt = BaseTables()
 #
 # Add dataframes
 #
-sf_2683 = tj.joins_for_sf_2683(tgraph)
+sf_2683 = tj.joins_for_location_info(tgraph)
 af_0350_location = tj.joins_for_af_0350_location(tgraph)
 bt.artifacts = tj.joins_for_artifacts(tgraph)
 bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683)
diff --git a/bin/sarif-extract-scans b/bin/sarif-extract-scans
index d676378..d891f71 100755
--- a/bin/sarif-extract-scans
+++ b/bin/sarif-extract-scans
@@ -2,7 +2,7 @@
 """ Extract scan data from multiple sarif files in table form.
 """
 from dataclasses import dataclass
-from sarif_cli import signature, signature_single
+from sarif_cli import signature, signature_single, signature_single_CLI
 from sarif_cli import typegraph
 from sarif_cli import snowflake_id
 from sarif_cli import status_writer
@@ -14,6 +14,7 @@ import logging
 import pandas as pd
 import pathlib
 import sarif_cli.table_joins as tj
+import sarif_cli.table_joins_CLI as tj_CLI
 import sarif_cli.scan_tables as st
 import sys
 
@@ -32,8 +33,18 @@ parser.add_argument('outdir', metavar='output-dir', type=str, help='output direc
 parser.add_argument('csvout', metavar='csv-outfile', type=str, help='processing status csv output file name to use')
 parser.add_argument('-r', '--write-raw-tables', action="store_true",
                     help='Write the raw sarif tables to the output directory')
+parser.add_argument('-f','--input-signature', metavar='input-signature', type=str, default="LGTM",
+                    help='Signature of the sarif, i.e. where it was generated, as that may affect the signature. '
+                    'Options: LGTM, CLI. '
+                    'If the currently represented signatures are not sufficient, see signature_single.py for how to support further signatures.'
+                    ' Default: "%(default)s"')
 args = parser.parse_args()
 
+if args.input_signature not in ["LGTM","CLI"]:
+    print("Unsupported sarif signature requested.")
+    print("Use one of [LGTM, CLI].")
+    sys.exit(1)
+
 # Setup csv error writer
 status_writer.setup_csv_writer(args.csvout)
 
@@ -66,11 +77,20 @@ context = signature.Context(
 )
 sarif_struct = signature.fillsig(args, sarif_struct, context)
 
+#
+# Setup which signature to use
+if args.input_signature == "LGTM":
+    signature_to_use = signature_single.struct_graph_LGTM
+    start_node = signature_single.start_node_LGTM
+else:
+    signature_to_use = signature_single_CLI.struct_graph_CLI
+    start_node = signature_single_CLI.start_node_CLI
 #
 # Use reference type graph (signature) to traverse sarif and attach values to tables
 try:
-    tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
-    typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)
+    tgraph = typegraph.Typegraph(signature_to_use)
+    typegraph.destructure(tgraph, start_node, sarif_struct)
 except Exception:
     # will have gathered errors/warnings
     status_writer.csv_write_warnings()
@@ -126,31 +146,29 @@ external_info = ExternalInfo(
 #
 # Add dataframes for base tables
 #
-sf_2683 = tj.joins_for_sf_2683(tgraph)
-af_0350_location = tj.joins_for_af_0350_location(tgraph)
-bt.artifacts = tj.joins_for_artifacts(tgraph)
-bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683)
-bt.kind_pathproblem = tj.joins_for_path_problem(tgraph, af_0350_location)
-bt.kind_problem = tj.joins_for_problem(tgraph, af_0350_location)
-bt.project = tj.joins_for_project_single(tgraph)
-bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, sf_2683)
-bt.rules = tj.joins_for_rules(tgraph)
+# (relies on some specifics of the signature type)
+if args.input_signature != "LGTM":
+    # rebind the table-joins module alias to the CLI variant
+    tj = tj_CLI
+
+try:
+    location_info = tj.joins_for_location_info(tgraph)
+    af_0350_location = tj.joins_for_af_0350_location(tgraph)
+    bt.artifacts = tj.joins_for_artifacts(tgraph)
+    bt.codeflows = tj.joins_for_codeflows(tgraph, location_info)
+    bt.kind_pathproblem = tj.joins_for_path_problem(tgraph, af_0350_location)
+    bt.kind_problem = tj.joins_for_problem(tgraph, af_0350_location)
+    bt.project = tj.joins_for_project_single(tgraph)
+    bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, location_info)
+    bt.rules = tj.joins_for_rules(tgraph)
+except Exception:
+    # possible warnings accumulated
+    status_writer.csv_write_warnings()
+    raise
 
 #
-# Form scan tables
+# Setup rest of the base tables
 #
-# joins for projects has to happen first as it backfills the guess about the project_id
-scantabs.projects = st.joins_for_projects(bt, external_info, scantabs)
-scantabs.results = st.joins_for_results(bt, external_info)
-scantabs.scans = st.joins_for_scans(bt, external_info, scantabs)
-
-
-
-#
-# Replace the remaining internal ids with snowflake ids
-#
-flakegen = snowflake_id.Snowflake(0)
-
 bt.columns_to_reindex = {
     # template from {field.name : [''] for field in dc.fields(bt)}
     'artifacts': ['artifacts_id'],
@@ -167,6 +185,19 @@ scantabs.columns_to_reindex = {
     'results': ['codeFlow_id'],
 }
 
+#
+# Form scan tables
+#
+# joins for projects has to happen first as it backfills the guess about the project_id
+scantabs.projects = st.joins_for_projects(bt, external_info)
+scantabs.results = st.joins_for_results(bt, external_info)
+scantabs.scans = st.joins_for_scans(bt, external_info, scantabs, args.input_signature)
+
+#
+# Replace the remaining internal ids with snowflake ids
+#
+flakegen = snowflake_id.Snowflake(0)
+
 _id_to_flake = {}
 def _get_flake(id):
     flake = _id_to_flake.get(id, -1)
diff --git a/bin/sarif-extract-scans-runner b/bin/sarif-extract-scans-runner
index 5c5a983..a069493 100755
--- a/bin/sarif-extract-scans-runner
+++ b/bin/sarif-extract-scans-runner
@@ -87,7 +87,14 @@ from sarif_cli import hash
 parser = argparse.ArgumentParser(description='Run sarif-extract-scans over a directory hierarchy')
 parser.add_argument('sarif_files', metavar='sarif-files', type=str, help='File containing list of sarif files, use - for stdin')
-parser.add_argument('-o','--outdir', metavar='output-dir', type=str, default="", help='output directory')
+
+parser.add_argument('-f','--input-signature', metavar='input-signature', type=str, default="LGTM",
+                    help='Signature of the sarif, as in, where it was generated it may affect the signature.'
+                    'Options: LGTM, CLI'
+                    'If current represented signatures are not sufficient, view signature_single.py for how to support further signatures.'
+                    ' Default: "%(default)s"')
+
+parser.add_argument('-o','--outdir', metavar='output-dir', type=str, default="", help='Output directory')
 parser.add_argument('-m', '--max-files', metavar='number', type=int, default=100000,
                     help='Maximum number of files to process.'
@@ -126,6 +133,11 @@ if outer_dir != "":
     except FileExistsError: pass
 
+if args.input_signature not in ["LGTM","CLI"]:
+    print("Unsupported sarif signature requested.")
+    print("Use one of [LGTM, CLI].")
+    sys.exit(1)
+
 #
 # Collect sarif file information
 #
@@ -205,7 +217,7 @@ for path in paths:
     scan_log_file = os.path.join(outer_dir+ project, component + ".scanlog")
     csv_outfile = os.path.join(outer_dir+ project, component)
 
-    runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, csv_outfile],
+    runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, csv_outfile, "-f", args.input_signature],
                               capture_output=True, text=True)
     if runstats.returncode == 0:
         print("{:6} {}".format("OK", path))
diff --git a/bin/sarif-extract-tables b/bin/sarif-extract-tables
index 439b335..97820b3 100755
--- a/bin/sarif-extract-tables
+++ b/bin/sarif-extract-tables
@@ -59,8 +59,8 @@
 sarif_struct = signature.fillsig(args, sarif_struct, context)
 
 #
 # Use reference type graph (signature) to traverse sarif and attach values to tables
 #
-tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
-typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)
+tgraph = typegraph.Typegraph(signature_single.struct_graph_LGTM)
+typegraph.destructure(tgraph, signature_single.start_node_LGTM, sarif_struct)
 
 #
 # Form output tables
@@ -84,7 +84,7 @@ bt = BaseTables()
 #
 # Add dataframes
 #
-sf_2683 = tj.joins_for_sf_2683(tgraph)
+sf_2683 = tj.joins_for_location_info(tgraph)
 af_0350_location = tj.joins_for_af_0350_location(tgraph)
 bt.artifacts = tj.joins_for_artifacts(tgraph)
 bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683)
diff --git a/sarif_cli/scan_tables.py b/sarif_cli/scan_tables.py
index 12702e3..716d940 100644
--- a/sarif_cli/scan_tables.py
+++ b/sarif_cli/scan_tables.py
@@ -73,36 +73,49 @@ class ScanTablesTypes:
 #
 # Projects table
 #
-def joins_for_projects(basetables, external_info, scantables):
+def joins_for_projects(basetables, external_info):
    """ Form the 'projects' table for the ScanTables dataclass
    """
    b = basetables; e = external_info
-
-    # For a repository url of the form
-    #   (git|https)://*/org/project.*
-    # use the org/project part as the project_name. 
- # - # TODO knewbury error handling for if the signature is slotted out? - repo_url = b.project.repositoryUri[0] - url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url) - if url_parts: - project_name = f"{url_parts.group(2)}-{url_parts.group(3)}" - project, component = e.sarif_file_name.rstrip().split('/') - # if the runners guess from the filename was bad, replace with real info - # and continue to use that scanspec to pass that around - if project_name != project+"-"+component: - e.project_id = hash.hash_unique(project_name.encode()) + + # if the sarif does not have versionControlProvenance, semmle.sourceLanguage ect + # there is no reliable way to know the project name + # and will still need to use a guess about the project id + if "repositoryUri" in b.project: + repo_url = b.project.repositoryUri[0] + # For a repository url of the form + # (git|https)://*/org/project.* + # use the org/project part as the project_name. + # + url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url) + if url_parts: + project_name = f"{url_parts.group(2)}-{url_parts.group(3)}" + project, component = e.sarif_file_name.rstrip().split('/') + # if the runners guess from the filename was bad, replace with real info + # and continue to use that scanspec to pass that around + if project_name != project+"-"+component: + e.project_id = hash.hash_unique(project_name.encode()) + else: + project_name = pd.NA else: + repo_url = "unknown" project_name = pd.NA + + if 'semmle.sourceLanguage' in b.project: + srcLang = b.project['semmle.sourceLanguage'][0] + allLang = ",".join(list(b.project['semmle.sourceLanguage'])) + else: + srcLang = "unknown" + allLang = "unknown" res = pd.DataFrame(data={ "id" : e.project_id, "project_name" : project_name, "creation_date" : pd.Timestamp(0.0, unit='s'), # TODO: external info "repo_url" : repo_url, - "primary_language" : b.project['semmle.sourceLanguage'][0], # TODO: external info - "languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage'])) + "primary_language" : srcLang, # TODO: external info if CLI sarif + "languages_analyzed" : allLang # TODO: external info if CLI sarif }, index=[0]) # Force all column types to ensure appropriate formatting @@ -112,7 +125,7 @@ def joins_for_projects(basetables, external_info, scantables): # # Scans table # -def joins_for_scans(basetables, external_info, scantables): +def joins_for_scans(basetables, external_info, scantables, sarif_type): """ Form the `scans` table for the ScanTables dataclass """ @@ -122,9 +135,14 @@ def joins_for_scans(basetables, external_info, scantables): driver_version = b.project.driver_version.unique() assert len(driver_version) == 1, \ "More than one driver version found for single sarif file." + # TODO if commit id exists in external info for CLI gen'd sarif, add? + if sarif_type == "LGTM": + commit_id = b.project.revisionId[0] + else: + commit_id = "unknown" res = pd.DataFrame(data={ "id" : e.scan_id, - "commit_id" : b.project.revisionId[0], + "commit_id" : commit_id, "project_id" : e.project_id, # TODO extract real date information from somewhere external "db_create_start" : pd.Timestamp(0.0, unit='s'), @@ -159,7 +177,7 @@ def joins_for_results(basetables, external_info): tables = [_results_from_kind_problem(basetables, external_info), _results_from_kind_pathproblem(basetables, external_info)] stack = [table for table in tables if len(table) > 0] - + # Concatenation fails without at least one table, so avoid that. 
if len(stack) > 0:
         res = pd.concat(stack)
@@ -207,7 +225,6 @@ def _results_from_kind_problem(basetables, external_info):
         'query_precision' : [_populate_from_rule_table("precision", b, i) for i in range(len(b.kind_problem))],
         'query_severity' : [_populate_from_rule_table("problem.severity", b, i) for i in range(len(b.kind_problem))],
         'query_tags' : [_populate_from_rule_table_tag_text(b, i) for i in range(len(b.kind_problem))],
-
         'codeFlow_id' : 0, # link to codeflows (kind_pathproblem only, NULL here)
 
         'message': b.kind_problem.message_text,
@@ -249,6 +266,7 @@ def _results_from_kind_pathproblem(basetables, external_info):
     # The `result` table has no entry to distinguish these, so we use a simplified
     # version of `kind_pathproblem`.
 
+
     reduced_kind_pathp = b.kind_pathproblem.drop(
         columns=[
             'relatedLocation_array_index',
@@ -295,7 +313,6 @@ def _results_from_kind_pathproblem(basetables, external_info):
         'query_precision' : _populate_from_rule_table_code_flow("precision", b, cfid0ppt0),
         'query_severity' : _populate_from_rule_table_code_flow("problem.severity", b, cfid0ppt0),
         'query_tags' : _populate_from_rule_table_code_flow_tag_text(b, cfid0ppt0),
-
         'codeFlow_id' : cfid0,
 
         #
         'message': cfid0ppt0.message_text.values[0],
diff --git a/sarif_cli/signature.py b/sarif_cli/signature.py
index ea99552..42957fb 100644
--- a/sarif_cli/signature.py
+++ b/sarif_cli/signature.py
@@ -225,7 +225,7 @@ dummy_newlineSequences = ['\r\n', '\n', '\u2028', '\u2029']
 dummy_relatedLocations_entry = [
     {'id': -1,
      'physicalLocation': {'artifactLocation': {'uri': 'scli-dyys dummy value',
-                                               'uriBaseId': 'scli-dyys dummy value',
+                                               'uriBaseId': 'scli-dyys uriBaseId',
                                                'index': -1},
                           'region': {'startLine': -1,
                                      'startColumn': -1,
diff --git a/sarif_cli/signature_single.py b/sarif_cli/signature_single.py
index 72b05e0..c810cb5 100644
--- a/sarif_cli/signature_single.py
+++ b/sarif_cli/signature_single.py
@@ -12,9 +12,9 @@ is marked below
 #
 # The starting node the leftmost node in ../notes/typegraph.pdf
 #
-start_node_2022_02_01 = 'Struct6787'
+start_node_LGTM = 'Struct6787'
 
-struct_graph_2022_02_01 = (
+struct_graph_LGTM = (
     [ ('String', 'string'),
       ('Int', 'int'),
       ('Bool', 'bool'),
@@ -121,5 +121,4 @@ struct_graph_2022_02_01 = (
                 ('$schema', 'String'),
                 ('runs', 'Array0177'),
                 ('version', 'String')))]
-)
-
+)
\ No newline at end of file
diff --git a/sarif_cli/signature_single_CLI.py b/sarif_cli/signature_single_CLI.py
new file mode 100644
index 0000000..fd8dfa5
--- /dev/null
+++ b/sarif_cli/signature_single_CLI.py
@@ -0,0 +1,161 @@
+""" The signature for a single sarif file

+Produced by

+    sarif-to-dot -u -t -f 2021-12-09/results.sarif

+with some arrays manually sorted so that the signature with more fields comes first. 
The case + ('Array6343', ('array', (1, 'Struct9699'), (0, 'Struct4055'))), # MANUALLY SORTED +is marked below +""" + +# +# The starting node the leftmost node in ../notes/typegraph.pdf +# +start_node_CLI = 'Struct5521' + +# generated with CLI 2.9.4 +struct_graph_CLI = ( + [ ('String', 'string'), + ('Int', 'int'), + ('Bool', 'bool'), + ( 'Struct2685', + ( 'struct', + ('index', 'Int'), + ('uri', 'String'), + ('uriBaseId', 'String'))), + ('Struct5277', ('struct', ('location', 'Struct2685'))), + ('Struct3497', ('struct', ('index', 'Int'), ('uri', 'String'))), + ('Struct9567', ('struct', ('location', 'Struct3497'))), + ('Array6920', ('array', (0, 'Struct5277'), (1, 'Struct9567'))), + ('Struct1509', ('struct', ('semmle.formatSpecifier', 'String'))), + ('Struct2774', ('struct', ('text', 'String'))), + ( 'Struct6299', + ( 'struct', + ('endColumn', 'Int'), + ('endLine', 'Int'), + ('startColumn', 'Int'), + ('startLine', 'Int'))), + ( 'Struct4963', + ( 'struct', + ('artifactLocation', 'Struct2685'), + ('region', 'Struct6299'))), + ( 'Struct2683', + ( 'struct', + ('id', 'Int'), + ('message', 'Struct2774'), + ('physicalLocation', 'Struct4963'))), + ('Array0350', ('array', (0, 'Struct2683'))), + ( 'Struct4199', + ( 'struct', + ('primaryLocationLineHash', 'String'), + ('primaryLocationStartColumnFingerprint', 'String'))), + ('Struct3942', ('struct', ('id', 'String'), ('index', 'Int'))), + ( 'Struct4055', + ( 'struct', + ('locations', 'Array0350'), + ('message', 'Struct2774'), + ('partialFingerprints', 'Struct4199'), + ('relatedLocations', 'Array0350'), + ('rule', 'Struct3942'), + ('ruleId', 'String'), + ('ruleIndex', 'Int'))), + ( 'Struct7125', + ( 'struct', + ('artifactLocation', 'Struct3497'), + ('region', 'Struct6299'))), + ( 'Struct6772', + ( 'struct', + ('id', 'Int'), + ('message', 'Struct2774'), + ('physicalLocation', 'Struct7125'))), + ('Array8753', ('array', (0, 'Struct6772'))), + ( 'Struct0102', + ( 'struct', + ('locations', 'Array0350'), + ('message', 'Struct2774'), + ('partialFingerprints', 'Struct4199'), + ('relatedLocations', 'Array8753'), + ('rule', 'Struct3942'), + ('ruleId', 'String'), + ('ruleIndex', 'Int'))), + ('Struct0987', ('struct', ('location', 'Struct2683'))), + ('Array1075', ('array', (0, 'Struct0987'))), + ('Struct4194', ('struct', ('locations', 'Array1075'))), + ('Array1597', ('array', (0, 'Struct4194'))), + ('Struct7122', ('struct', ('threadFlows', 'Array1597'))), + ('Array9799', ('array', (0, 'Struct7122'))), + ( 'Struct9699', + ( 'struct', + ('codeFlows', 'Array9799'), + ('locations', 'Array0350'), + ('message', 'Struct2774'), + ('partialFingerprints', 'Struct4199'), + ('relatedLocations', 'Array0350'), + ('rule', 'Struct3942'), + ('ruleId', 'String'), + ('ruleIndex', 'Int'))), + ( 'Array1768', + #('array', (2, 'Struct9699'), (1, 'Struct4055'),(0, 'Struct0102'))), + #('array',(0, 'Struct0102'), (1, 'Struct4055'), (2, 'Struct9699'))), + #omitting (0, 'Struct0102') means we will never find column info + ('array', (2, 'Struct9699'), (1, 'Struct4055'))), + ('Struct8581', ('struct', ('enabled', 'Bool'), ('level', 'String'))), + ('Array7069', ('array', (0, 'String'))), + ( 'Struct6853', + ( 'struct', + ('description', 'String'), + ('id', 'String'), + ('kind', 'String'), + ('name', 'String'), + ('precision', 'String'), + ('problem.severity', 'String'), + ('security-severity', 'String'), + ('severity', 'String'), + ('sub-severity', 'String'), + ('tags', 'Array7069'))), + ( 'Struct7100', + ( 'struct', + ('defaultConfiguration', 'Struct8581'), + ('fullDescription', 'Struct2774'), + 
('id', 'String'), + ('name', 'String'), + ('properties', 'Struct6853'), + ('shortDescription', 'Struct2774'))), + ('Array0147', ('array', (0, 'Struct7100'))), + ( 'Struct7828', + ( 'struct', + ('name', 'String'), + ('organization', 'String'), + ('rules', 'Array0147'), + ('semanticVersion', 'String'))), + ( 'Struct9027', + ('struct', ('description', 'Struct2774'), ('uri', 'String'))), + ('Array4813', ('array', (0, 'Struct9027'))), + ( 'Struct6152', + ( 'struct', + ('locations', 'Array4813'), + ('name', 'String'), + ('semanticVersion', 'String'))), + ('Struct7826', ('struct', ('locations', 'Array4813'), ('name', 'String'))), + ('Array9357', ('array', (0, 'Struct6152'), (1, 'Struct7826'))), + ( 'Struct0032', + ('struct', ('driver', 'Struct7828'), ('extensions', 'Array9357'))), + ( 'Struct3081', + ('struct', ('repositoryUri', 'String'), ('revisionId', 'String'))), + ('Array5511', ('array', (0, 'Struct3081'))), + ( 'Struct9786', + ( 'struct', + ('artifacts', 'Array6920'), + ('columnKind', 'String'), + ('newlineSequences', 'Array7069'), + ('properties', 'Struct1509'), + ('results', 'Array1768'), + ('tool', 'Struct0032'), + ('versionControlProvenance', 'Array5511'))), + ('Array1273', ('array', (0, 'Struct9786'))), + ( 'Struct5521', + ( 'struct', + ('$schema', 'String'), + ('runs', 'Array1273'), + ('version', 'String')))] ) diff --git a/sarif_cli/table_joins.py b/sarif_cli/table_joins.py index 5209f84..41c5faa 100644 --- a/sarif_cli/table_joins.py +++ b/sarif_cli/table_joins.py @@ -73,13 +73,12 @@ def joins_for_af_0350_location(tgraph): ) return af_0350_location -def joins_for_sf_2683(tgraph): +def joins_for_location_info(tgraph): """ Join all the tables used by 2683's right side into one. """ # Access convenience functions sf = lambda num: tgraph.dataframes['Struct' + str(num)] - af = lambda num: tgraph.dataframes['Array' + str(num)] # sf_2683 = ( # diff --git a/sarif_cli/table_joins_CLI.py b/sarif_cli/table_joins_CLI.py new file mode 100644 index 0000000..71b8c42 --- /dev/null +++ b/sarif_cli/table_joins_CLI.py @@ -0,0 +1,462 @@ +""" Collection of joins for the base tables provided by typegraph.attach_tables() + + The `problem` and `path-problem` entries provide that information; the + `relatedLocations` table provides the details when multiple results are + present for either. `project` is the high-level overview; `artifacts` + provides those for the other tables. +""" +import pandas as pd +import re +from .typegraph import tagged_array_columns, tagged_struct_columns + +class BaseTablesTypes: + codeflows = { + "codeflow_id" : pd.UInt64Dtype(), + "codeflow_index" : pd.Int64Dtype(), + "threadflow_index" : pd.Int64Dtype(), + "location_index" : pd.Int64Dtype(), + "endColumn" : pd.Int64Dtype(), + "endLine" : pd.Int64Dtype(), + "startColumn" : pd.Int64Dtype(), + "startLine" : pd.Int64Dtype(), + "artifact_index" : pd.Int64Dtype(), + "uri" : pd.StringDtype(), + "uriBaseId" : pd.StringDtype(), + "message" : pd.StringDtype(), + } + +def joins_for_af_0350_location(tgraph): + """ + Join all the tables used by 0350's right side into one. 
+ """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id)) + aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id)) + + af_0350_location = ( + aft('0350') + # + .merge(sft(2683), how="left", left_on='t0350_id_or_value_at_index', right_on='t2683_struct_id', + validate="1:m") + .drop(columns=['t0350_id_or_value_at_index', 't2683_struct_id', 't0350_type_at_index']) + # + .merge(sft(4963), how="left", left_on='t2683_physicalLocation', right_on='t4963_struct_id', + validate="1:m") + .drop(columns=['t2683_physicalLocation', 't4963_struct_id']) + # + .merge(sft(6299), how="left", left_on='t4963_region', right_on='t6299_struct_id', + validate="1:m") + .drop(columns=['t4963_region', 't6299_struct_id']) + # + .merge(sft(2685), how="left", left_on='t4963_artifactLocation', right_on='t2685_struct_id', + validate="1:m") + .drop(columns=['t4963_artifactLocation', 't2685_struct_id']) + # + .merge(sft(2774), how="left", left_on='t2683_message', right_on='t2774_struct_id', + validate="1:m") + .drop(columns=['t2683_message', 't2774_struct_id']) + # + .rename(columns={'t0350_array_id' : 'm0350_location_array_id', + 't0350_value_index' : 'm0350_location_array_index', + 't2683_id' : 'm0350_location_id', + 't6299_endColumn' : 'm0350_location_endColumn', + 't6299_endLine' : 'm0350_location_endLine', + 't6299_startColumn' : 'm0350_location_startColumn', + 't6299_startLine' : 'm0350_location_startLine', + 't2685_index' : 'm0350_location_index', + 't2685_uri' : 'm0350_location_uri', + 't2685_uriBaseId' : 'm0350_location_uriBaseId', + 't2774_text' : 'm0350_location_message', + }) + ) + return af_0350_location + +def joins_for_location_info(tgraph): + """ + Join all the tables used by 2683's right side into one. + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + # + sf_2683 = ( + # + sf(2683) + .rename(columns={"struct_id": "struct_id_2683", "id": "id_2683"}) + # + .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'physicalLocation']) + # + .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'region']) + # + .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'artifactLocation']) + .rename(columns={"index": "location_index_2685"}) + # + .merge(sf(2774), how="left", left_on='message', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'message']) + .rename(columns={"text": "message_text_2683"}) + # + ) + + return sf_2683 + +def joins_for_problem(tgraph, af_0350_location): + """ + Return table providing the `problem` information. 
+ """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id)) + aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id)) + # + # Form the message dataframe (@kind problem) via joins + # + + kind_problem_1 = ( + aft(1768) + .merge(sft(4055), how="inner", + left_on='t1768_id_or_value_at_index', right_on='t4055_struct_id', + validate="1:m") + .drop(columns=['t1768_type_at_index', 't1768_id_or_value_at_index', + 't4055_struct_id']) + # + .merge(af_0350_location, how="left", left_on='t4055_locations', + right_on='m0350_location_array_id', validate="1:m") + .drop(columns=['t4055_locations', 'm0350_location_array_id']) + # + .merge(af_0350_location.rename(columns=lambda x: re.sub('m0350_location', + 'm0350_relatedLocation', + x)), + how="left", left_on='t4055_relatedLocations', + right_on='m0350_relatedLocation_array_id', validate="1:m") + .drop(columns=['t4055_relatedLocations', 'm0350_relatedLocation_array_id']) + # + .merge(sft(2774), how="left", left_on='t4055_message', right_on='t2774_struct_id') + .drop(columns=['t4055_message', 't2774_struct_id']) + .rename(columns={"t2774_text": "t4055_message_text"}) + # + .merge(sft(4199), how="left", left_on='t4055_partialFingerprints', + right_on='t4199_struct_id') + .drop(columns=['t4055_partialFingerprints', 't4199_struct_id']) + # + .merge(sft(3942), how="left", left_on='t4055_rule', + right_on='t3942_struct_id') + .drop(columns=['t4055_rule', 't3942_struct_id']) + ) + + kind_problem_2 = ( + kind_problem_1 + .rename({ + 't1768_array_id' : 'results_array_id', + 't1768_value_index' : 'results_array_index', + 't4055_ruleId' : 'ruleId', + 't4055_ruleIndex' : 'ruleIndex', + 't4055_message_text' : 'message_text', + 't3942_id' : 'rule_id', + 't3942_index' : 'rule_index', + }, axis='columns') + # Strip type prefix for the rest + .rename(columns = lambda x: re.sub('m0350_|t4199_', '', x)) + ) + + return kind_problem_2 + + +def joins_for_codeflows(tgraph, sf_2683): + """ + Return the table providing the `codeFlows` for a `path-problem table. 
+ """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + # + codeflows = ( + af(9799).rename(columns={"array_id": "t9799_array_id", "value_index": "t9799_idx"}) + # + .merge(sf(7122), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m") + .drop(columns=['id_or_value_at_index', 'struct_id', 'type_at_index']) + # + .merge(af(1597).rename(columns={"array_id": "t1597_array_id", "value_index": "t1597_idx"}), + how="left", left_on='threadFlows', right_on='t1597_array_id', validate="1:m") + .drop(columns=['threadFlows', 't1597_array_id', 'type_at_index']) + # + .merge(sf(4194), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m") + .drop(columns=['id_or_value_at_index', 'struct_id']) + # + .merge(af(1075).rename(columns={"array_id": "t1075_array_id", "value_index": "t1075_idx"}), + how="left", left_on='locations', right_on='t1075_array_id', validate="1:m") + .drop(columns=['locations', 't1075_array_id', 'type_at_index']) + .rename(columns={"t1075_idx": "t1075_locations_idx"}) + # + .merge(sf('0987'), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m") + .drop(columns=['id_or_value_at_index', 'struct_id']) + # + .merge(sf_2683, how="left", left_on='location', right_on='struct_id_2683', validate="1:m") + .drop(columns=['location', 'struct_id_2683']) + ) + codeflows_1 = ( + codeflows + .drop(columns=['id_2683']) + .rename({ + 't9799_array_id': 'codeflow_id', + 't9799_idx': 'codeflow_index', + 't1597_idx': 'threadflow_index', + 't1075_locations_idx': 'location_index', + 'location_index_2685': 'artifact_index', + 'message_text_2683': 'message', + }, axis='columns') + ) + codeflows_2 = codeflows_1.astype(BaseTablesTypes.codeflows).reset_index(drop=True) + return codeflows_2 + +def joins_for_path_problem(tgraph, af_0350_location): + """ + Return table providing the `path-problem` information. 
+ """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id)) + aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id)) + + kind_pathproblem_1 = ( + aft(1768) + .merge(sft(9699), how="inner", left_on='t1768_id_or_value_at_index', right_on='t9699_struct_id', + validate="1:m") + .drop(columns=['t1768_id_or_value_at_index', 't9699_struct_id', 't1768_type_at_index']) + # + .merge(af_0350_location, how="left", left_on='t9699_locations', + right_on='m0350_location_array_id', validate="1:m") + .drop(columns=['t9699_locations', 'm0350_location_array_id']) + # + .merge(af_0350_location.rename(columns=lambda x: re.sub('m0350_location', + 'm0350_relatedLocation', + x)), + how="left", left_on='t9699_relatedLocations', + right_on='m0350_relatedLocation_array_id', validate="1:m") + .drop(columns=['t9699_relatedLocations', 'm0350_relatedLocation_array_id']) + # + .merge(sft(2774), how="left", left_on='t9699_message', right_on='t2774_struct_id') + .drop(columns=['t9699_message', 't2774_struct_id']) + .rename(columns={"t2774_text": "t9699_message_text"}) + # + .merge(sft(4199), how="left", left_on='t9699_partialFingerprints', + right_on='t4199_struct_id') + .drop(columns=['t9699_partialFingerprints', 't4199_struct_id']) + # + .merge(sft(3942), how="left", left_on='t9699_rule', + right_on='t3942_struct_id') + .drop(columns=['t9699_rule', 't3942_struct_id']) + ) + strip_colums = lambda x: re.sub('t9699_|m0350_|t4199_', '', x) + kind_pathproblem_2 = (kind_pathproblem_1 + .rename({ + 't1768_array_id' : 'results_array_id', + 't1768_value_index' : 'results_array_index', + 't9699_codeFlows' : 'codeFlows_id', + 't9699_ruleId' : 'ruleId', + 't9699_ruleIndex' : 'ruleIndex', + 't9699_message_text' : 'message_text', + 't3942_id' : 'rule_id', + 't3942_index' : 'rule_index', + }, axis='columns') + # Strip type prefix for the rest + .rename(columns = strip_colums)) + + return kind_pathproblem_2 + +def joins_for_relatedLocations(tgraph, sf_2683): + """ + Return table providing the `relatedLocations` and `locations` information. + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + # + # Form the relatedLocation dataframe via joins, starting from the union of + # relatedLocations from `kind problem` (sf(4055)) and `kind path-problem` + # (sf(9699)). 
+ # + related_locations_1 = ( + pd.concat([sf(4055)[['relatedLocations', 'struct_id']], sf(9699)[['relatedLocations', 'struct_id']]]) + .merge(af('0350'), how="left", left_on='relatedLocations', right_on='array_id', validate="1:m") + .drop(columns=['relatedLocations', 'array_id', 'value_index', 'type_at_index']) + # + .merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id', + suffixes=("_4055_9699", "_2683"), validate="1:m") + .drop(columns=['struct_id_2683', 'id_or_value_at_index']) + # + .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'physicalLocation']) + # + .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'region']) + # + .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'artifactLocation']) + # + .merge(sf(2774), how="left", left_on='message', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'message']) + ) + + # Keep columns of interest + related_locations_2 = (related_locations_1[['struct_id_4055_9699', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text']] + .rename({'text': 'message', 'struct_id_4055_9699': 'struct_id'}, axis='columns')) + + # Remove dummy locations previously injected by signature.fillsig + related_locations_3 = related_locations_2[related_locations_2.uri != 'scli-dyys dummy value'] + + return related_locations_3 + +def joins_for_project_single(tgraph): + """ + Return table providing the `project` information for sarif-extract-scans + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + # + project_df_temp1 = ( + sf(5521) + .rename(columns={"version": "version_5521", "struct_id": "struct_id_5521"}) + # + .merge(af('1273'), how="left", left_on='runs', right_on='array_id', + validate="1:m") + .drop(columns=['runs', 'array_id', 'type_at_index']) + .rename(columns={"value_index": "value_index_1273"}) + # + .merge(sf(9786), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m") + .drop(columns=['id_or_value_at_index', 'struct_id'])) + # + #newlines there or not - handle + if 'newlineSequences' in project_df_temp1: + project_df_temp2 = project_df_temp1.drop(columns=['newlineSequences']) + + project_df_temp2 = ( + project_df_temp1 + # + .merge(sf(1509), how="left", left_on='properties', right_on='struct_id', validate="1:m") + .drop(columns=['properties', 'struct_id']) + # + # tool - driver - rules - defaultConfiguration - ( properties - tags ) + # + .merge(sf('0032'), how="left", left_on='tool', right_on='struct_id', validate="1:m") + .drop(columns=['tool', 'struct_id']) + # + .merge(sf(7828), how="left", left_on='driver', right_on='struct_id', validate="1:m") + .drop(columns=['driver', 'struct_id']) + .rename(columns={"semanticVersion": "driver_version_7828", "name": "driver_name_7828"}) + # + #assumet to be there + .merge(af(5511), how="left", left_on='versionControlProvenance', right_on='array_id') + .drop(columns=['versionControlProvenance', 'array_id', 'type_at_index']) + .rename(columns={"value_index": "versionControl_value_index_5511"}) + # + .merge(sf(3081), how="left", left_on='id_or_value_at_index', right_on='struct_id') + .drop(columns=['id_or_value_at_index', 'struct_id']) + ) + # + + # Keep columns of interest + project_df_1 = ( + project_df_temp2 + 
.drop(columns=['struct_id_5521', 'versionControl_value_index_5511']) + .rename({ + 'version_5521': 'sarif_version', + 'value_index_1273': 'run_index', + 'driver_name_7828': 'driver_name', + 'driver_version_7828': 'driver_version', + }, axis='columns') + ) + return project_df_1 + +def joins_for_rules(tgraph): + """ + Return table providing the `rules` information. + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id)) + af = lambda num: tgraph.dataframes['Array' + str(num)] + aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id)) + # + rules_df = ( + aft('0147') + # + .drop(columns=['t0147_type_at_index']) + # + .merge(sft(7100), how="left", left_on='t0147_id_or_value_at_index', + right_on='t7100_struct_id', + validate="1:m") + .drop(columns=['t0147_id_or_value_at_index', 't7100_struct_id']) + # + .merge(sft(8581), how="left", left_on='t7100_defaultConfiguration', + right_on='t8581_struct_id', validate="1:m") + .drop(columns=['t7100_defaultConfiguration', 't8581_struct_id']) + # + .merge(sft(2774), how="left", left_on='t7100_fullDescription', + right_on='t2774_struct_id', validate="1:m") + .drop(columns=['t7100_fullDescription', 't2774_struct_id']) + .rename(columns={'t2774_text': "t7100_t2774_fullDescription"}) + # + .merge(sft(2774), how="left", left_on='t7100_shortDescription', + right_on='t2774_struct_id', validate="1:m") + .drop(columns=['t7100_shortDescription', 't2774_struct_id']) + .rename(columns={"t2774_text": 't7100_t2774_shortDescription'}) + # + .merge(sft(6853), how="left", left_on='t7100_properties', + right_on='t6853_struct_id', validate="1:m") + .drop(columns=['t7100_properties', 't6853_struct_id', 't6853_id']) + # + .merge(aft(7069), how="left", left_on='t6853_tags', + right_on='t7069_array_id', validate="1:m") + .drop(columns=['t6853_tags', 't7069_array_id', 't7069_type_at_index']) + ) + rules_2 = ( + rules_df + .rename({ + 't0147_array_id' : 'rules_array_id', + 't0147_value_index' : 'rules_array_index', + 't7069_value_index' : 'tag_index', + 't7069_id_or_value_at_index' : 'tag_text', + }, axis='columns') + # Strip type prefix for the rest + .rename(columns = lambda x: re.sub('t7100_t2774_|t7100_|t8581_|t6853_', '', x)) + ) + return rules_2 + +def joins_for_artifacts(tgraph): + """ + Return table providing the `artifacts` information. 
+ """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + # + artifacts_df = ( + af(6920) + # + .merge(sf(5277), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m") + .drop(columns=['id_or_value_at_index', 'struct_id', 'type_at_index']) + .rename(columns={"value_index": "artifact_index_6920"}) + # + .merge(sf(2685), how="left", left_on='location', right_on='struct_id', validate="1:m") + .drop(columns=['location', 'struct_id']) + ) + # Keep columns of interest and rename + df_1 = ( + artifacts_df + .rename({ + 'array_id': 'artifacts_id', + 'artifact_index_6920': 'artifacts_array_index', + }, axis='columns') + ) + + if (df_1['artifacts_array_index'] == df_1['index']).all(): + df_1 = df_1.drop(columns=['artifacts_array_index']) + + return df_1 diff --git a/sarif_cli/typegraph.py b/sarif_cli/typegraph.py index 5761943..3769fc6 100644 --- a/sarif_cli/typegraph.py +++ b/sarif_cli/typegraph.py @@ -179,13 +179,21 @@ def _destructure_dict(typegraph: Typegraph, node, tree): if specific_missing not in status_writer.input_sarif_missing["extra_info"]: status_writer.input_sarif_missing["extra_info"] += specific_missing status_writer.warning_set["input_sarif_missing"]+=1 - raise MissingFieldException( - f"(Sub)tree is missing fields required by typedef.\n" - f"Expected {type_fields}, found {tree_fields}.\n" - f"Missing {set(type_fields) - set(tree_fields)}\n" - f"Note: these fields are post-signature fill and may be more extensive than the orginal. \n" - f"Check input file for the original signature." - ) + + #special case of no longer trying other signatures + #else exception here triggers a retry - mainly needed for Struct9699 or Struct4055 + difference = set(type_fields) - set(tree_fields) + if "uriBaseId" in difference: + tree["uriBaseId"] = "default" + _destructure_dict_1(typegraph, node, tree) + else: + raise MissingFieldException( + f"(Sub)tree is missing fields required by typedef.\n" + f"Expected {type_fields}, found {tree_fields}.\n" + f"Missing {set(type_fields) - set(tree_fields)}\n" + f"Note: these fields are post-signature fill and may be more extensive than the orginal. \n" + f"Check input file for the original signature." + ) else: status_writer.unknown_sarif_parsing_shape["extra_info"] = "type fields {} do not match tree fields {}.".format(type_fields, tree_fields) From 202f7f53a5c574a9e34f123e539ce43560273d88 Mon Sep 17 00:00:00 2001 From: Kristen Newbury Date: Tue, 13 Dec 2022 18:32:34 -0500 Subject: [PATCH 2/8] Update README for CLI usage instructions --- README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md b/README.md index 49d9706..db6f7f4 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,23 @@ The tool was implemented using Python 3.9. +# Sarif format information + + The tool operates on sarif generated by LGTM 1.27.0 (by default) or by the CodeQL CLI (enabled with the -f flag given a value of `CLI`). + + The values that the -f flag accepts are: `LGTM` and `CLI`. + + The CLI versions used against development of the CLI support were: 2.6.3, 2.9.4, and 2.11.4. 
+ + The CLI sarif **MUST** contain one additional property `versionControlProvenance` - which needs to look like: + ``` + "versionControlProvenance": [ + { + "repositoryUri": "https://github.com/testorg/testrepo.git", + "revisionId": "testsha" + } + ``` + # Test Setup This repository includes some test data (in `data`) and uses =git lfs= for storing those test files; installation steps are at [[https://git-lfs.github.com][git-lfs]]; on a mac with homebrew, install it via From dc4fd09e63d74ad24cc2f372d269f6851681f4f6 Mon Sep 17 00:00:00 2001 From: Kristen Newbury Date: Tue, 13 Dec 2022 18:42:45 -0500 Subject: [PATCH 3/8] Update README missing minor syntax --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index db6f7f4..6c7850e 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ "repositoryUri": "https://github.com/testorg/testrepo.git", "revisionId": "testsha" } + ] ``` # Test Setup From dae6c50d5b3e1f6cc765a9bd05600e8f17481f6a Mon Sep 17 00:00:00 2001 From: Kristen Newbury Date: Tue, 13 Dec 2022 20:13:13 -0500 Subject: [PATCH 4/8] Bugfix CLI signature merge mistake --- sarif_cli/signature_single_CLI.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sarif_cli/signature_single_CLI.py b/sarif_cli/signature_single_CLI.py index fd8dfa5..d773cf2 100644 --- a/sarif_cli/signature_single_CLI.py +++ b/sarif_cli/signature_single_CLI.py @@ -111,7 +111,6 @@ struct_graph_CLI = ( ('precision', 'String'), ('problem.severity', 'String'), ('security-severity', 'String'), - ('severity', 'String'), ('sub-severity', 'String'), ('tags', 'Array7069'))), ( 'Struct7100', From d602efd3f07038cb5509ef3a6b434735ba893d00 Mon Sep 17 00:00:00 2001 From: Kristen Newbury Date: Thu, 15 Dec 2022 18:46:32 -0500 Subject: [PATCH 5/8] Bugfix signature subset superset mismatch when the template signature portion contains codeflows it was previously possible that a valid sarif problem portion that contains extra fields would be misdiagnosed as not parsable --- sarif_cli/typegraph.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sarif_cli/typegraph.py b/sarif_cli/typegraph.py index 3769fc6..4dce356 100644 --- a/sarif_cli/typegraph.py +++ b/sarif_cli/typegraph.py @@ -196,9 +196,14 @@ def _destructure_dict(typegraph: Typegraph, node, tree): ) else: - status_writer.unknown_sarif_parsing_shape["extra_info"] = "type fields {} do not match tree fields {}.".format(type_fields, tree_fields) - status_writer.csv_write(status_writer.unknown_sarif_parsing_shape) - raise Exception("typegraph: unhandled case reached: cannot match type " + # possibly looks like: (Struct9699)type_fields: [codeflows...] vs tree_fields: [...extra_properties] + # in that case we need to also try the Struct4055 signature here + if "codeFlows" in type_fields: + _destructure_dict(typegraph, "Struct4055", tree) + else: + status_writer.unknown_sarif_parsing_shape["extra_info"] = "type fields {} do not match tree fields {}.".format(type_fields, tree_fields) + status_writer.csv_write(status_writer.unknown_sarif_parsing_shape) + raise Exception("typegraph: unhandled case reached: cannot match type " "fields {} to tree fields {}. Data is invalid." 
.format(type_fields, tree_fields)) From fc2c6bac9993db0f6a0d8ffbcceb8714d33d7bf6 Mon Sep 17 00:00:00 2001 From: Kristen Newbury Date: Thu, 5 Jan 2023 12:50:54 -0500 Subject: [PATCH 6/8] Add capability to read sourceLanguage if exists in CLI sarif otherwise dummy val previously assumed never present in CLI sarif --- sarif_cli/scan_tables.py | 11 ++--------- sarif_cli/signature.py | 6 ++++++ sarif_cli/signature_single_CLI.py | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/sarif_cli/scan_tables.py b/sarif_cli/scan_tables.py index 716d940..4218478 100644 --- a/sarif_cli/scan_tables.py +++ b/sarif_cli/scan_tables.py @@ -101,21 +101,14 @@ def joins_for_projects(basetables, external_info): else: repo_url = "unknown" project_name = pd.NA - - if 'semmle.sourceLanguage' in b.project: - srcLang = b.project['semmle.sourceLanguage'][0] - allLang = ",".join(list(b.project['semmle.sourceLanguage'])) - else: - srcLang = "unknown" - allLang = "unknown" res = pd.DataFrame(data={ "id" : e.project_id, "project_name" : project_name, "creation_date" : pd.Timestamp(0.0, unit='s'), # TODO: external info "repo_url" : repo_url, - "primary_language" : srcLang, # TODO: external info if CLI sarif - "languages_analyzed" : allLang # TODO: external info if CLI sarif + "primary_language" : b.project['semmle.sourceLanguage'][0], + "languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage'])) }, index=[0]) # Force all column types to ensure appropriate formatting diff --git a/sarif_cli/signature.py b/sarif_cli/signature.py index 42957fb..f81a111 100644 --- a/sarif_cli/signature.py +++ b/sarif_cli/signature.py @@ -235,6 +235,8 @@ dummy_relatedLocations_entry = [ dummy_message_entry = {'text': 'scli-dyys dummy value'} +dummy_sourceLanguage = 'unknown' + def fillsig_dict(args, elem, context): """ Fill in the missing fields in dictionary signatures. 
""" @@ -286,6 +288,10 @@ def fillsig_dict(args, elem, context): if 'level' in elem.keys(): full_elem['enabled'] = elem.get('enabled', True) + if 'semmle.formatSpecifier' in elem.keys(): + # Ensure semmle.sourceLanguage is present at least in dummy form + full_elem['semmle.sourceLanguage'] = elem.get('semmle.sourceLanguage', dummy_sourceLanguage) + if 'versionControlProvenance' in elem.keys(): # Ensure newlineSequences is present when versionControlProvenance is full_elem['newlineSequences'] = elem.get('newlineSequences', dummy_newlineSequences) diff --git a/sarif_cli/signature_single_CLI.py b/sarif_cli/signature_single_CLI.py index d773cf2..1b6b747 100644 --- a/sarif_cli/signature_single_CLI.py +++ b/sarif_cli/signature_single_CLI.py @@ -28,7 +28,7 @@ struct_graph_CLI = ( ('Struct3497', ('struct', ('index', 'Int'), ('uri', 'String'))), ('Struct9567', ('struct', ('location', 'Struct3497'))), ('Array6920', ('array', (0, 'Struct5277'), (1, 'Struct9567'))), - ('Struct1509', ('struct', ('semmle.formatSpecifier', 'String'))), + ('Struct1509', ('struct', ('semmle.formatSpecifier', 'String'), ('semmle.sourceLanguage', 'String'))), ('Struct2774', ('struct', ('text', 'String'))), ( 'Struct6299', ( 'struct', From 1a915e4de8046ff3f276de63d6b31cb5a20f336f Mon Sep 17 00:00:00 2001 From: Kristen Newbury Date: Thu, 5 Jan 2023 16:37:55 -0500 Subject: [PATCH 7/8] Update how project_id is generated previously relied on assumption: naming like: / in repositoryUri now just uses full repositoryUri --- bin/sarif-extract-scans | 9 +++------ bin/sarif-extract-scans-runner | 18 ++++++------------ sarif_cli/hash.py | 2 +- sarif_cli/scan_tables.py | 28 ++++++---------------------- 4 files changed, 16 insertions(+), 41 deletions(-) diff --git a/bin/sarif-extract-scans b/bin/sarif-extract-scans index d891f71..a171e8b 100755 --- a/bin/sarif-extract-scans +++ b/bin/sarif-extract-scans @@ -130,17 +130,14 @@ scantabs = ScanTables() @dataclass class ExternalInfo: - project_id : int + project_id: pd.UInt64Dtype() scan_id : pd.UInt64Dtype() sarif_file_name : str - ql_query_id : str external_info = ExternalInfo( - scan_spec["project_id"], + pd.NA, scan_spec["scan_id"], - scan_spec["sarif_file_name"], - # TODO: Take ql_query_id from where? 
(git commit id of the ql query set) - 'deadbeef00', + scan_spec["sarif_file_name"] ) # diff --git a/bin/sarif-extract-scans-runner b/bin/sarif-extract-scans-runner index a069493..df9c266 100755 --- a/bin/sarif-extract-scans-runner +++ b/bin/sarif-extract-scans-runner @@ -161,7 +161,6 @@ for path in paths: # Paths and components # path = path.rstrip() - project, component = path.split('/') # # Scan specification # @@ -171,30 +170,25 @@ for path in paths: scan_id = hash.hash_unique(data) scan_spec = { - # assuming sarif file names are like / - # however this will be replaced down the line with the repoURI if possible - # still, leaving here in case later versions of this tool do not rely on that property being there - # in that case this will be the best guess - "project_id": hash.hash_unique((project+"-"+component).encode()), # pd.UInt64Dtype() "scan_id": scan_id, # pd.Int64Dtype() "sarif_file_name": path, # pd.StringDtype() } # # If using outermost output directory, create project directory: - # (like //*.scantables) + # (like //*.scantables) # - try: os.mkdir(outer_dir+ project, mode=0o755) + try: os.mkdir(outer_dir+ path, mode=0o755) except FileExistsError: pass - scan_spec_file = os.path.join(outer_dir+ project, component + ".scanspec") + scan_spec_file = os.path.join(outer_dir+ path + ".scanspec") with open(scan_spec_file, 'w') as fp: json.dump(scan_spec, fp) # # Table output directory # - output_dir = os.path.join(outer_dir+ project, component + ".scantables") + output_dir = os.path.join(outer_dir+ path + ".scantables") try: os.mkdir(output_dir, mode=0o755) except FileExistsError: pass # @@ -215,8 +209,8 @@ for path in paths: with open(args.successful_runs, 'wb') as outfile: pickle.dump(successful_runs, outfile) - scan_log_file = os.path.join(outer_dir+ project, component + ".scanlog") - csv_outfile = os.path.join(outer_dir+ project, component) + scan_log_file = os.path.join(outer_dir+ path + ".scanlog") + csv_outfile = os.path.join(outer_dir+ path) runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, csv_outfile, "-f", args.input_signature], capture_output=True, text=True) if runstats.returncode == 0: diff --git a/sarif_cli/hash.py b/sarif_cli/hash.py index 9c107ba..f900897 100644 --- a/sarif_cli/hash.py +++ b/sarif_cli/hash.py @@ -4,4 +4,4 @@ from hashlib import blake2b def hash_unique(item_to_hash): h = blake2b(digest_size = 8) h.update(item_to_hash) - return abs(int.from_bytes(h.digest(), byteorder='big')) + return int.from_bytes(h.digest(), byteorder='big') diff --git a/sarif_cli/scan_tables.py b/sarif_cli/scan_tables.py index 4218478..b2cd8f0 100644 --- a/sarif_cli/scan_tables.py +++ b/sarif_cli/scan_tables.py @@ -79,34 +79,18 @@ def joins_for_projects(basetables, external_info): """ b = basetables; e = external_info - # if the sarif does not have versionControlProvenance, semmle.sourceLanguage ect - # there is no reliable way to know the project name - # and will still need to use a guess about the project id + # if the sarif does have versionControlProvenance if "repositoryUri" in b.project: - repo_url = b.project.repositoryUri[0] - # For a repository url of the form - # (git|https)://*/org/project.* - # use the org/project part as the project_name. 
- # - url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url) - if url_parts: - project_name = f"{url_parts.group(2)}-{url_parts.group(3)}" - project, component = e.sarif_file_name.rstrip().split('/') - # if the runners guess from the filename was bad, replace with real info - # and continue to use that scanspec to pass that around - if project_name != project+"-"+component: - e.project_id = hash.hash_unique(project_name.encode()) - else: - project_name = pd.NA + repoUri = b.project.repositoryUri[0] + e.project_id = hash.hash_unique(repoUri.encode()) else: - repo_url = "unknown" - project_name = pd.NA + repoUri = "unknown" res = pd.DataFrame(data={ "id" : e.project_id, - "project_name" : project_name, + "project_name" : repoUri, "creation_date" : pd.Timestamp(0.0, unit='s'), # TODO: external info - "repo_url" : repo_url, + "repo_url" : repoUri, "primary_language" : b.project['semmle.sourceLanguage'][0], "languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage'])) }, index=[0]) From 7dad175d4df5d1ca2cb8ae1875edcabd0f7939d7 Mon Sep 17 00:00:00 2001 From: Kristen Newbury Date: Thu, 12 Jan 2023 12:03:51 -0500 Subject: [PATCH 8/8] Fix tool to default CLI not LGTM sarif input update readme minor improvement --- README.md | 4 +++- bin/sarif-extract-scans-runner | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 6c7850e..3916397 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,9 @@ # CLI tools for SARIF processing Each of these tools present a high-level command-line interface to extract a - specific subset of information from a SARIF file. The main tools are: `sarif-extract-scans-runner`,`sarif-aggregate-scans`,`sarif-create-aggregate-report` + specific subset of information from a SARIF file. The main tools are: `sarif-extract-scans-runner`,`sarif-aggregate-scans`,`sarif-create-aggregate-report`. + + Each tool can print its options and description like: `sarif-extract-scans-runner --help`. The tool was implemented using Python 3.9. diff --git a/bin/sarif-extract-scans-runner b/bin/sarif-extract-scans-runner index df9c266..b323bea 100755 --- a/bin/sarif-extract-scans-runner +++ b/bin/sarif-extract-scans-runner @@ -88,9 +88,9 @@ parser = argparse.ArgumentParser(description='Run sarif-extract-scans over a dir parser.add_argument('sarif_files', metavar='sarif-files', type=str, help='File containing list of sarif files, use - for stdin') -parser.add_argument('-f','--input-signature', metavar='input-signature', type=str, default="LGTM", - help='Signature of the sarif, as in, where it was generated it may affect the signature.' - 'Options: LGTM, CLI' +parser.add_argument('-f','--input-signature', metavar='input-signature', type=str, default="CLI", + help='Signature of the sarif, as in, where it was generated it may affect the signature.\n' + 'Options: LGTM, CLI.\n' 'If current represented signatures are not sufficient, view signature_single.py for how to support further signatures.' ' Default: "%(default)s"')