sarif-extract-multi: extract combined tables from multiple sarif files

This command is built around a new tree structure that pulls in a collection
of sarif files.  In yaml format, an example is

    - creation_date: '2021-12-09'   # Repository creation date
      primary_language: javascript  # By lines of code
      project_name: treeio/treeio   # Repo name-short name
      query_commit_id: fa9571646c   # Commit id for custom (non-library) queries
      sarif_content: {}             # The sarif content will be attached here
      sarif_file_name: 2021-12-09/results.sarif # Path to sarif file
      scan_start_date: '2021-12-09'             # Beginning date/time of scan
      scan_stop_date:  '2021-12-10'             # End date/time of scan
      tool_name: codeql
      tool_version: v1.27

    - creation_date: '2022-02-25'
      primary_language: javascript
      ...

At run time,

    cd ~/local/sarif-cli/data/treeio
    sarif-extract-multi multi-sarif-01.json test-multi-table

will load the referenced sarif files, attach each one in place of its
`sarif_content` placeholder, build tables against the new signature found in
sarif_cli/signature_multi.py, and merge those into six larger tables.  The
exported tables are

    artifacts.csv  path-problem.csv  project.csv
    codeflows.csv  problem.csv       related-locations.csv

and they have join keys for further operations.
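
For example, each row of path-problem.csv keeps the id of its codeFlows
array, which codeflows.csv carries as `t9799_array_id`.  A minimal pandas
sketch of that join, assuming the column names produced by
sarif_cli/table_joins.py in this commit:

    import pandas as pd

    # Assumed column names, taken from table_joins.py in this commit:
    # path-problem.csv keeps the codeFlows array id as 't9699_codeFlows';
    # codeflows.csv keeps it as 't9799_array_id'.
    path_problem = pd.read_csv("test-multi-table/path-problem.csv")
    codeflows = pd.read_csv("test-multi-table/codeflows.csv")

    # One row per (path-problem result, code-flow step) pair.
    steps = path_problem.merge(
        codeflows, how="left",
        left_on="t9699_codeFlows", right_on="t9799_array_id",
        suffixes=("_result", "_step"))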

The new typegraph is rendered in

    notes/typegraph-multi.pdf

using the instructions in

    sarif_cli/signature_multi.py
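
which, per that file's docstring, amount to

    cd sarif-cli/data/treeio
    sarif-extract-multi -c multi-sarif-01.json none | \
        sarif-to-dot -u -t -f -n -d - | dot -Tpdf > typegraph-multi.pdf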
Michael Hohn authored on 2022-03-11 23:00:53 -08:00; committed by =Michael Hohn
parent 9c151e295b, commit 0f070a6ae4
19 changed files with 78848 additions and 25044 deletions

.gitattributes (vendored, 1 addition)

@@ -1 +1,2 @@
*.sarif filter=lfs diff=lfs merge=lfs -text
*.pdf filter=lfs diff=lfs merge=lfs -text

bin/sarif-extract-multi (new executable file, 88 lines)

@@ -0,0 +1,88 @@
#!/usr/bin/env python
""" Extract data from multiple sarif files in table form.
"""
import argparse
import json
import pathlib
from sarif_cli import signature, signature_multi
from sarif_cli import typegraph
import sarif_cli.table_joins as tj
import sys
from collections import defaultdict
import pandas as pd
#
# Start processing
#
parser = argparse.ArgumentParser(description='Read a collection of sarif files and produce tabular output.')
parser.add_argument('file', metavar='sarif-files.json', type=str,
                    help="json file containing the metadata array. Use - for stdin. ")
parser.add_argument('outdir', metavar='output-dir', type=str, help='output directory')
parser.add_argument('-c', '--combine-only', action="store_true",
                    help='Read the referenced input file(s) and write the combined structure to stdout')
args = parser.parse_args()
# Load meta info
with open(args.file, 'r') if args.file != '-' else sys.stdin as fp:
    meta_struct = json.load(fp)
# Attach referenced files
def load(fname):
    with open(fname, 'rb') as fp:
        content = json.load(fp)
    return content
for sarif_meta in meta_struct:
    sarif_meta['sarif_content'] = load(sarif_meta['sarif_file_name'])
# Only output composite?
if args.combine_only:
    json.dump(meta_struct, sys.stdout, indent=4)
    sys.exit(0)
#
# Preprocess raw SARIF to get smaller signature
#
context = signature.Context(
    {
        "string" : "String",
        "int" : "Int",
        "bool" : "Bool"
    }
)
meta_struct = signature.fillsig(args, meta_struct, context)
#
# Use reference type graph (signature) to traverse sarif and attach values to tables
#
tgraph = typegraph.Typegraph(signature_multi.struct_graph_2022_03_08)
typegraph.destructure(tgraph, signature_multi.start_node_2022_03_08, meta_struct)
#
# Form output tables
#
typegraph.attach_tables(tgraph)
#
# Form dataframes originally introduced by sarif-extract-tables
#
sf_2683 = tj.joins_for_sf_2683(tgraph)
kind_problem = tj.joins_for_problem(tgraph, sf_2683)
kind_pathproblem = tj.joins_for_path_problem(tgraph, sf_2683)
codeflows_9799 = tj.joins_for_codeflows(tgraph, sf_2683)
related_locations = tj.joins_for_relatedLocations(tgraph, sf_2683)
#
# Form the new dataframes
#
project_df = tj.joins_for_project(tgraph)
artifacts_df = tj.joins_for_artifacts(tgraph)
#
# Write output
#
p = pathlib.Path(args.outdir)
p.mkdir(exist_ok=True)
def write(path, frame):
    with p.joinpath(path).open(mode='wb') as fh:
        frame.to_csv(fh, index_label='index')
write('problem.csv', kind_problem)
write('path-problem.csv', kind_pathproblem)
write('codeflows.csv', codeflows_9799)
write('related-locations.csv', related_locations)
write('project.csv', project_df)
write('artifacts.csv', artifacts_df)

bin/sarif-extract-tables

@@ -1,10 +1,21 @@
#!/usr/bin/env python
""" Extract data from sarif files in table form.
These particular table joins create tables matching the content of
./sarif-results-summary
Return tables providing the `problem`, `path-problem` and `relatedLocations`
information.
The `problem` and `path-problem` entries provide that information; the
`relatedLocations` table provides the details when multiple results are present
for either.
"""
import argparse
import json
import pathlib
from sarif_cli import signature
from sarif_cli import signature, signature_single
from sarif_cli import typegraph
import sys
from collections import defaultdict
@@ -43,8 +54,8 @@ sarif_struct = signature.fillsig(args, sarif_struct, context)
#
# Use reference type graph (signature) to traverse sarif and attach values to tables
#
tgraph = typegraph.Typegraph(typegraph.struct_graph_2022_02_01)
typegraph.destructure(tgraph, typegraph.start_node_2022_02_01, sarif_struct)
tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)
#
# Form output tables

3 file diffs suppressed because they are too large

data/treeio/multi-sarif-01.json (new file, 26 lines)

@@ -0,0 +1,26 @@
[
    {
        "creation_date": "2021-12-09",
        "primary_language": "javascript",
        "project_name": "treeio/treeio",
        "query_commit_id": "fa9571646c",
        "sarif_content": {},
        "sarif_file_name": "2021-12-09/results.sarif",
        "scan_start_date": "2021-12-09",
        "scan_stop_date": "2021-12-10",
        "tool_name": "codeql",
        "tool_version": "v1.27"
    },
    {
        "creation_date": "2022-02-25",
        "primary_language": "javascript",
        "project_name": "treeio/treeio",
        "query_commit_id": "fa9571646c",
        "sarif_content": {},
        "sarif_file_name": "2022-02-25/results.sarif",
        "scan_start_date": "2022-02-25",
        "scan_stop_date": "2022-02-26",
        "tool_name": "codeql",
        "tool_version": "v1.29"
    }
]


@@ -0,0 +1,21 @@
- creation_date: '2021-12-09'   # Repository creation date
  primary_language: javascript  # By lines of code
  project_name: treeio/treeio   # Repo name-short name
  query_commit_id: fa9571646c   # Commit id for custom (non-library) queries
  sarif_content: {}             # The sarif content will be attached here
  sarif_file_name: 2021-12-09/results.sarif # Path to sarif file
  scan_start_date: '2021-12-09'             # Beginning date/time of scan
  scan_stop_date:  '2021-12-10'             # End date/time of scan
  tool_name: codeql
  tool_version: v1.27
- creation_date: '2022-02-25'
  primary_language: javascript
  project_name: treeio/treeio
  query_commit_id: fa9571646c
  sarif_content: {}
  sarif_file_name: 2022-02-25/results.sarif
  scan_start_date: '2022-02-25'
  scan_stop_date: '2022-02-26'
  tool_name: codeql
  tool_version: v1.29

File diff suppressed because it is too large

notes/typegraph-multi.pdf (new binary file, not shown)

sarif_cli/signature.py

@@ -4,7 +4,7 @@ These functions convert a SARIF (or any json structure) to its signature, with v
See sarif-to-dot for options and examples.
"""
from dataclasses import dataclass
import sarif_cli.traverse as traverse
from . import traverse
import zlib
#

sarif_cli/signature_multi.py (new file, 148 lines)

@@ -0,0 +1,148 @@
""" The signature for a multi-sarif result file
Produced by

    cd sarif-cli/data/treeio
    sarif-extract-multi -c multi-sarif-01.json none | sarif-to-dot -utf -

with some arrays manually sorted so that the signature with more fields comes first.  The case

    ('Array6343', ('array', (1, 'Struct9699'), (0, 'Struct4055'))), # MANUALLY SORTED

is marked below.

Also, this struct should be (and is) identical to struct_graph_2022_02_01 in the
leading entries, but there are two extras.

To get a map of this type graph, use

    cd sarif-cli/data/treeio
    sarif-extract-multi -c multi-sarif-01.json none | \
        sarif-to-dot -u -t -f -n -d - | dot -Tpdf > typegraph-multi.pdf
"""
#
# The starting node is the leftmost node in ../notes/typegraph-multi.pdf
#
start_node_2022_03_08 = 'Array6785'
struct_graph_2022_03_08 = (
[ ('String', 'string'),
('Int', 'int'),
('Bool', 'bool'),
( 'Struct2685',
( 'struct',
('index', 'Int'),
('uri', 'String'),
('uriBaseId', 'String'))),
('Struct5277', ('struct', ('location', 'Struct2685'))),
('Array4640', ('array', (0, 'Struct5277'))),
('Array7069', ('array', (0, 'String'))),
( 'Struct9543',
( 'struct',
('semmle.formatSpecifier', 'String'),
('semmle.sourceLanguage', 'String'))),
('Struct2774', ('struct', ('text', 'String'))),
( 'Struct6299',
( 'struct',
('endColumn', 'Int'),
('endLine', 'Int'),
('startColumn', 'Int'),
('startLine', 'Int'))),
( 'Struct4963',
( 'struct',
('artifactLocation', 'Struct2685'),
('region', 'Struct6299'))),
( 'Struct2683',
( 'struct',
('id', 'Int'),
('message', 'Struct2774'),
('physicalLocation', 'Struct4963'))),
('Array0350', ('array', (0, 'Struct2683'))),
( 'Struct4199',
( 'struct',
('primaryLocationLineHash', 'String'),
('primaryLocationStartColumnFingerprint', 'String'))),
('Struct3942', ('struct', ('id', 'String'), ('index', 'Int'))),
( 'Struct4055',
( 'struct',
('locations', 'Array0350'),
('message', 'Struct2774'),
('partialFingerprints', 'Struct4199'),
('relatedLocations', 'Array0350'),
('rule', 'Struct3942'),
('ruleId', 'String'),
('ruleIndex', 'Int'))),
('Struct0987', ('struct', ('location', 'Struct2683'))),
('Array1075', ('array', (0, 'Struct0987'))),
('Struct4194', ('struct', ('locations', 'Array1075'))),
('Array1597', ('array', (0, 'Struct4194'))),
('Struct7122', ('struct', ('threadFlows', 'Array1597'))),
('Array9799', ('array', (0, 'Struct7122'))),
( 'Struct9699',
( 'struct',
('codeFlows', 'Array9799'),
('locations', 'Array0350'),
('message', 'Struct2774'),
('partialFingerprints', 'Struct4199'),
('relatedLocations', 'Array0350'),
('rule', 'Struct3942'),
('ruleId', 'String'),
('ruleIndex', 'Int'))),
('Array6343', ('array', (1, 'Struct9699'), (0, 'Struct4055'))), # MANUALLY SORTED
('Struct8581', ('struct', ('enabled', 'Bool'), ('level', 'String'))),
( 'Struct7849',
( 'struct',
('kind', 'String'),
('precision', 'String'),
('security-severity', 'String'),
('severity', 'String'),
('sub-severity', 'String'),
('tags', 'Array7069'))),
( 'Struct6818',
( 'struct',
('defaultConfiguration', 'Struct8581'),
('fullDescription', 'Struct2774'),
('id', 'String'),
('name', 'String'),
('properties', 'Struct7849'),
('shortDescription', 'Struct2774'))),
('Array8754', ('array', (0, 'Struct6818'))),
( 'Struct7820',
( 'struct',
('name', 'String'),
('organization', 'String'),
('rules', 'Array8754'),
('version', 'String'))),
('Struct8972', ('struct', ('driver', 'Struct7820'))),
( 'Struct3081',
('struct', ('repositoryUri', 'String'), ('revisionId', 'String'))),
('Array5511', ('array', (0, 'Struct3081'))),
( 'Struct3388',
( 'struct',
('artifacts', 'Array4640'),
('columnKind', 'String'),
('newlineSequences', 'Array7069'),
('properties', 'Struct9543'),
('results', 'Array6343'),
('tool', 'Struct8972'),
('versionControlProvenance', 'Array5511'))),
('Array0177', ('array', (0, 'Struct3388'))),
( 'Struct6787',
( 'struct',
('$schema', 'String'),
('runs', 'Array0177'),
('version', 'String'))), # Up to here identical to struct_graph_2022_02_01
( 'Struct3739',
( 'struct',
('creation_date', 'String'),
('primary_language', 'String'),
('project_name', 'String'),
('query_commit_id', 'String'),
('sarif_content', 'Struct6787'),
('sarif_file_name', 'String'),
('scan_start_date', 'String'),
('scan_stop_date', 'String'),
('tool_name', 'String'),
('tool_version', 'String'))),
('Array6785', ('array', (0, 'Struct3739')))]
)

sarif_cli/signature_single.py (new file, 125 lines)

@@ -0,0 +1,125 @@
""" The signature for a single sarif file
Produced by

    sarif-to-dot -u -t -f 2021-12-09/results.sarif

with some arrays manually sorted so that the signature with more fields comes first.  The case

    ('Array6343', ('array', (1, 'Struct9699'), (0, 'Struct4055'))), # MANUALLY SORTED

is marked below.
"""
#
# The starting node is the leftmost node in ../notes/typegraph.pdf
#
start_node_2022_02_01 = 'Struct6787'
struct_graph_2022_02_01 = (
[ ('String', 'string'),
('Int', 'int'),
('Bool', 'bool'),
( 'Struct2685',
( 'struct',
('index', 'Int'),
('uri', 'String'),
('uriBaseId', 'String'))),
('Struct5277', ('struct', ('location', 'Struct2685'))),
('Array4640', ('array', (0, 'Struct5277'))),
('Array7069', ('array', (0, 'String'))),
( 'Struct9543',
( 'struct',
('semmle.formatSpecifier', 'String'),
('semmle.sourceLanguage', 'String'))),
('Struct2774', ('struct', ('text', 'String'))),
( 'Struct6299',
( 'struct',
('endColumn', 'Int'),
('endLine', 'Int'),
('startColumn', 'Int'),
('startLine', 'Int'))),
( 'Struct4963',
( 'struct',
('artifactLocation', 'Struct2685'),
('region', 'Struct6299'))),
( 'Struct2683',
( 'struct',
('id', 'Int'),
('message', 'Struct2774'),
('physicalLocation', 'Struct4963'))),
('Array0350', ('array', (0, 'Struct2683'))),
( 'Struct4199',
( 'struct',
('primaryLocationLineHash', 'String'),
('primaryLocationStartColumnFingerprint', 'String'))),
('Struct3942', ('struct', ('id', 'String'), ('index', 'Int'))),
( 'Struct4055',
( 'struct',
('locations', 'Array0350'),
('message', 'Struct2774'),
('partialFingerprints', 'Struct4199'),
('relatedLocations', 'Array0350'),
('rule', 'Struct3942'),
('ruleId', 'String'),
('ruleIndex', 'Int'))),
('Struct0987', ('struct', ('location', 'Struct2683'))),
('Array1075', ('array', (0, 'Struct0987'))),
('Struct4194', ('struct', ('locations', 'Array1075'))),
('Array1597', ('array', (0, 'Struct4194'))),
('Struct7122', ('struct', ('threadFlows', 'Array1597'))),
('Array9799', ('array', (0, 'Struct7122'))),
( 'Struct9699',
( 'struct',
('codeFlows', 'Array9799'),
('locations', 'Array0350'),
('message', 'Struct2774'),
('partialFingerprints', 'Struct4199'),
('relatedLocations', 'Array0350'),
('rule', 'Struct3942'),
('ruleId', 'String'),
('ruleIndex', 'Int'))),
('Array6343', ('array', (1, 'Struct9699'), (0, 'Struct4055'))), # MANUALLY SORTED
('Struct8581', ('struct', ('enabled', 'Bool'), ('level', 'String'))),
( 'Struct7849',
( 'struct',
('kind', 'String'),
('precision', 'String'),
('security-severity', 'String'),
('severity', 'String'),
('sub-severity', 'String'),
('tags', 'Array7069'))),
( 'Struct6818',
( 'struct',
('defaultConfiguration', 'Struct8581'),
('fullDescription', 'Struct2774'),
('id', 'String'),
('name', 'String'),
('properties', 'Struct7849'),
('shortDescription', 'Struct2774'))),
('Array8754', ('array', (0, 'Struct6818'))),
( 'Struct7820',
( 'struct',
('name', 'String'),
('organization', 'String'),
('rules', 'Array8754'),
('version', 'String'))),
('Struct8972', ('struct', ('driver', 'Struct7820'))),
( 'Struct3081',
('struct', ('repositoryUri', 'String'), ('revisionId', 'String'))),
('Array5511', ('array', (0, 'Struct3081'))),
( 'Struct3388',
( 'struct',
('artifacts', 'Array4640'),
('columnKind', 'String'),
('newlineSequences', 'Array7069'),
('properties', 'Struct9543'),
('results', 'Array6343'),
('tool', 'Struct8972'),
('versionControlProvenance', 'Array5511'))),
('Array0177', ('array', (0, 'Struct3388'))),
( 'Struct6787',
( 'struct',
('$schema', 'String'),
('runs', 'Array0177'),
('version', 'String')))]
)

sarif_cli/table_joins.py (new file, 316 lines)

@@ -0,0 +1,316 @@
""" Collection of joins for the base tables provided by typegraph.attach_tables()
The `problem` and `path-problem` entries provide that information; the
`relatedLocations` table provides the details when multiple results are
present for either. `project` is the high-level overview; `artifacts`
provides those for the other tables.
"""
import pandas as pd

def joins_for_sf_2683(tgraph):
    """
    Join all the tables used by 2683's right side into one.
    """
    # Access convenience functions
    sf = lambda num: tgraph.dataframes['Struct' + str(num)]
    af = lambda num: tgraph.dataframes['Array' + str(num)]
    #
    sf_2683 = (
        #
        sf(2683)
        .rename(columns={"struct_id": "struct_id_2683", "id": "id_2683"})
        #
        .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'physicalLocation'])
        #
        .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'region'])
        #
        .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'artifactLocation'])
        .rename(columns={"index": "location_index_2685"})
        #
        .merge(sf(2774), how="left", left_on='message', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'message'])
        .rename(columns={"text": "message_text_2683"})
        #
    )
    return sf_2683

def joins_for_problem(tgraph, sf_2683):
    """
    Return table providing the `problem` information.
    """
    # Access convenience functions
    sf = lambda num: tgraph.dataframes['Struct' + str(num)]
    af = lambda num: tgraph.dataframes['Array' + str(num)]
    #
    # Form the message dataframe (@kind problem) via joins
    #
    kind_problem_1 = (
        af(6343)
        .rename(columns={"value_index": "results_idx_6343", "array_id": "result_id_6343"})
        .merge(sf(4055), how="inner", left_on='id_or_value_at_index', right_on='struct_id',
               validate="1:m")
        .drop(columns=['type_at_index', 'id_or_value_at_index', 'struct_id'])
        .rename(columns={"message": "result_message_4055",
                         "relatedLocations": "relatedLocations_id"})
        # locations
        .merge(af('0350'), how="left", left_on='locations', right_on='array_id', validate="1:m")
        .drop(columns=['locations', 'array_id', 'type_at_index'])
        #
        .merge(sf_2683, how="left", left_on='id_or_value_at_index', right_on='struct_id_2683', validate="1:m")
        .drop(columns=['id_or_value_at_index', 'struct_id_2683'])
        #
        .merge(sf(2774), how="left", left_on='result_message_4055', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'result_message_4055'])
        .rename(columns={"text": "message_text_4055"})
        #
        .merge(sf(4199), how="left", left_on='partialFingerprints', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'partialFingerprints'])
        #
        .merge(
            sf(3942).rename(columns={"id": "rule_id", "index": "rule_index"}),
            how="left", left_on='rule', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'rule'])
        #
    )
    return kind_problem_1

def joins_for_codeflows(tgraph, sf_2683):
    """
    Return the table providing the `codeFlows` for a `path-problem` table.
    """
    # Access convenience functions
    sf = lambda num: tgraph.dataframes['Struct' + str(num)]
    af = lambda num: tgraph.dataframes['Array' + str(num)]
    #
    af_9799 = (
        af(9799).rename(columns={"array_id": "t9799_array_id", "value_index": "t9799_idx"})
        #
        .merge(sf(7122), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
        .drop(columns=['id_or_value_at_index', 'struct_id', 'type_at_index'])
        #
        .merge(af(1597).rename(columns={"array_id": "t1597_array_id", "value_index": "t1597_idx"}),
               how="left", left_on='threadFlows', right_on='t1597_array_id', validate="1:m")
        .drop(columns=['threadFlows', 't1597_array_id', 'type_at_index'])
        #
        .merge(sf(4194), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
        .drop(columns=['id_or_value_at_index', 'struct_id'])
        #
        .merge(af(1075).rename(columns={"array_id": "t1075_array_id", "value_index": "t1075_idx"}),
               how="left", left_on='locations', right_on='t1075_array_id', validate="1:m")
        .drop(columns=['locations', 't1075_array_id', 'type_at_index'])
        .rename(columns={"t1075_idx": "t1075_locations_idx"})
        #
        .merge(sf('0987'), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
        .drop(columns=['id_or_value_at_index', 'struct_id'])
        #
        .merge(sf_2683, how="left", left_on='location', right_on='struct_id_2683', validate="1:m")
        .drop(columns=['location', 'struct_id_2683'])
    )
    return af_9799

def joins_for_path_problem(tgraph, sf_2683):
    """
    Return table providing the `path-problem` information.
    """
    # Access convenience functions
    sf = lambda num: tgraph.dataframes['Struct' + str(num)]
    af = lambda num: tgraph.dataframes['Array' + str(num)]
    #
    kind_pathproblem_1 = (
        af(6343)
        .rename(columns={"value_index": "t6343_result_idx", "array_id": "t6343_result_id"})
        .merge(sf(9699), how="inner", left_on='id_or_value_at_index', right_on='struct_id',
               validate="1:m")
        .rename(columns={"codeFlows" : "t9699_codeFlows",
                         "locations" : "t9699_locations",
                         "message" : "t9699_message",
                         "partialFingerprints" : "t9699_partialFingerprints",
                         "relatedLocations" : "t9699_relatedLocations",
                         "rule" : "t9699_rule",
                         "ruleId" : "t9699_ruleId",
                         "ruleIndex" : "t9699_ruleIndex",
                         })
        .drop(columns=['id_or_value_at_index', 'struct_id', 'type_at_index'])
        # 9699.locations
        .merge(af('0350').rename(columns={"value_index": "t0350_location_idx"}),
               how="left", left_on='t9699_locations', right_on='array_id', validate="1:m")
        .drop(columns=['t9699_locations', 'array_id', 'type_at_index'])
        #
        .merge(sf_2683, how="left", left_on='id_or_value_at_index', right_on='struct_id_2683', validate="1:m")
        .drop(columns=['id_or_value_at_index', 'struct_id_2683'])
        #
        # # TODO: merge or keep separate?
        # # 9699.codeFlows
        # .merge(af_9799, how="left", left_on='t9699_codeFlows', right_on='t9799_array_id', validate="1:m")
        #
        # 9699.message
        .merge(sf(2774), how="left", left_on='t9699_message', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 't9699_message'])
        .rename(columns={"text": "t9699_message_text"})
        #
        # 9699.partialFingerprints
        .merge(sf(4199), how="left", left_on='t9699_partialFingerprints', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 't9699_partialFingerprints'])
        #
        # 9699.relatedLocations -- keep ids
        #
        # 9699.rule
        .merge(
            sf(3942).rename(columns={"id": "t3942_rule_id", "index": "t3942_rule_idx"}),
            how="left", left_on='t9699_rule', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 't9699_rule'])
    )
    # # TODO potential cleanup
    # # Remove dummy locations previously injected by signature.fillsig
    # kind_pathproblem_2 = kind_pathproblem_1[kind_pathproblem_1.uri != 'scli-dyys dummy value']
    # #
    return kind_pathproblem_1

def joins_for_relatedLocations(tgraph, sf_2683):
    """
    Return table providing the `relatedLocations` and `locations` information.
    """
    # Access convenience functions
    sf = lambda num: tgraph.dataframes['Struct' + str(num)]
    af = lambda num: tgraph.dataframes['Array' + str(num)]
    #
    # Form the relatedLocation dataframe via joins, starting from the union of
    # relatedLocations from `kind problem` (sf(4055)) and `kind path-problem`
    # (sf(9699)).
    #
    related_locations_1 = (
        pd.concat([sf(4055)[['relatedLocations', 'struct_id']], sf(9699)[['relatedLocations', 'struct_id']]])
        .merge(af('0350'), how="left", left_on='relatedLocations', right_on='array_id', validate="1:m")
        .drop(columns=['relatedLocations', 'array_id', 'value_index', 'type_at_index'])
        #
        .merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id',
               suffixes=("_4055_9699", "_2683"), validate="1:m")
        .drop(columns=['struct_id_2683', 'id_or_value_at_index'])
        #
        .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'physicalLocation'])
        #
        .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'region'])
        #
        .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'artifactLocation'])
        #
        .merge(sf(2774), how="left", left_on='message', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'message'])
    )
    # Keep columns of interest
    related_locations_2 = (related_locations_1[['struct_id_4055_9699', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text']]
                           .rename({'text': 'message', 'struct_id_4055_9699': 'struct_id'}, axis='columns'))
    # Remove dummy locations previously injected by signature.fillsig
    related_locations_3 = related_locations_2[related_locations_2.uri != 'scli-dyys dummy value']
    return related_locations_3

def joins_for_project(tgraph):
    """
    Return table providing the `project` information.
    """
    # Access convenience functions
    sf = lambda num: tgraph.dataframes['Struct' + str(num)]
    af = lambda num: tgraph.dataframes['Array' + str(num)]
    #
    project_df = (
        af(6785)
        #
        .merge(sf(3739), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
        .drop(columns=['id_or_value_at_index', 'struct_id', 'array_id', 'type_at_index'])
        #
        .merge(sf(6787), how="left", left_on='sarif_content', right_on='struct_id', validate="1:m")
        .drop(columns=['sarif_content', 'struct_id'])
        .rename(columns={"version": "version_6787"})
        #
        .merge(af('0177'), how="left", left_on='runs', right_on='array_id',
               suffixes=("_6785", "_0177"), validate="1:m")
        .drop(columns=['runs', 'array_id', 'type_at_index'])
        #
        .merge(sf(3388), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
        .drop(columns=['id_or_value_at_index', 'struct_id'])
        #
        # .merge(af(7069), how="left", left_on='newlineSequences', right_on='array_id',
        #        validate="1:m")
        # .drop(columns=['newlineSequences', 'array_id', 'type_at_index'])
        .drop(columns=['newlineSequences'])
        #
        .merge(sf(9543), how="left", left_on='properties', right_on='struct_id', validate="1:m")
        .drop(columns=['properties', 'struct_id'])
        #
        # tool - driver - rules - defaultConfiguration - ( properties - tags )
        #
        .merge(sf(8972), how="left", left_on='tool', right_on='struct_id', validate="1:m")
        .drop(columns=['tool', 'struct_id'])
        #
        .merge(sf(7820), how="left", left_on='driver', right_on='struct_id', validate="1:m")
        .drop(columns=['driver', 'struct_id'])
        .rename(columns={"version": "driver_version_7820", "name": "driver_name_7820"})
        #
        .merge(af(8754), how="left", left_on='rules', right_on='array_id', validate="1:m")
        .drop(columns=['rules', 'array_id', 'type_at_index'])
        .rename(columns={"value_index": "rule_value_index_8754"}) # rule index
        #
        .merge(sf(6818), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
        .drop(columns=['id_or_value_at_index', 'struct_id'])
        .rename(columns={"id": "rule_id_6818", "name": "rule_name_6818"})
        #
        .merge(sf(8581), how="left", left_on='defaultConfiguration', right_on='struct_id', validate="1:m")
        .drop(columns=['defaultConfiguration', 'struct_id'])
        #
        .merge(sf(2774), how="left", left_on='fullDescription', right_on='struct_id', validate="1:m")
        .drop(columns=['fullDescription', 'struct_id'])
        .rename(columns={"text": "rule_fullDescription_6818"})
        #
        .merge(sf(2774), how="left", left_on='shortDescription', right_on='struct_id', validate="1:m")
        .drop(columns=['shortDescription', 'struct_id'])
        .rename(columns={"text": "rule_shortDescription_6818"})
        #
        .merge(sf(7849), how="left", left_on='properties', right_on='struct_id', validate="1:m")
        .drop(columns=['properties', 'struct_id'])
        #
        .merge(af(7069), how="left", left_on='tags', right_on='array_id', validate="1:m")
        .drop(columns=['tags', 'array_id', 'type_at_index'])
        .rename(columns={"value_index": "tag_index_7069", "id_or_value_at_index": "tag_text_7069"})
        # versionControlProvenance - repositoryUri
        # The merge with af(8754) replicates versionControlProvenance, no 1:m validation
        .merge(af(5511), how="left", left_on='versionControlProvenance', right_on='array_id')
        .drop(columns=['versionControlProvenance', 'array_id', 'type_at_index'])
        .rename(columns={"value_index": "versionControl_value_index_5511"})
        #
        .merge(sf(3081), how="left", left_on='id_or_value_at_index', right_on='struct_id')
        .drop(columns=['id_or_value_at_index', 'struct_id'])
        #
    )
    return project_df

def joins_for_artifacts(tgraph):
    """
    Return table providing the `artifacts` information.
    """
    # Access convenience functions
    sf = lambda num: tgraph.dataframes['Struct' + str(num)]
    af = lambda num: tgraph.dataframes['Array' + str(num)]
    #
    artifacts_df = (
        af(4640)
        #
        .merge(sf(5277), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
        .drop(columns=['id_or_value_at_index', 'struct_id', 'type_at_index'])
        .rename(columns={"value_index": "artifact_index_4640"})
        #
        .merge(sf(2685), how="left", left_on='location', right_on='struct_id', validate="1:m")
        .drop(columns=['location', 'struct_id'])
        .rename(columns={"index": "location_index_2685", "uri": "location_uri_2685",
                         "uriBaseId": "location_uriBaseId_2685"})
    )
    return artifacts_df

sarif_cli/typegraph.py

@@ -11,124 +11,6 @@ from dataclasses import dataclass
from typing import *
import pandas as pd
#
# Structure graph from ../../bin/sarif-to-dot -u -t -f results.sarif
#
struct_graph_2022_02_01 = (
[ ('String', 'string'),
('Int', 'int'),
('Bool', 'bool'),
( 'Struct2685',
( 'struct',
('index', 'Int'),
('uri', 'String'),
('uriBaseId', 'String'))),
('Struct5277', ('struct', ('location', 'Struct2685'))),
('Array4640', ('array', (0, 'Struct5277'))),
('Array7069', ('array', (0, 'String'))),
( 'Struct9543',
( 'struct',
('semmle.formatSpecifier', 'String'),
('semmle.sourceLanguage', 'String'))),
('Struct2774', ('struct', ('text', 'String'))),
( 'Struct6299',
( 'struct',
('endColumn', 'Int'),
('endLine', 'Int'),
('startColumn', 'Int'),
('startLine', 'Int'))),
( 'Struct4963',
( 'struct',
('artifactLocation', 'Struct2685'),
('region', 'Struct6299'))),
( 'Struct2683',
( 'struct',
('id', 'Int'),
('message', 'Struct2774'),
('physicalLocation', 'Struct4963'))),
('Array0350', ('array', (0, 'Struct2683'))),
( 'Struct4199',
( 'struct',
('primaryLocationLineHash', 'String'),
('primaryLocationStartColumnFingerprint', 'String'))),
('Struct3942', ('struct', ('id', 'String'), ('index', 'Int'))),
( 'Struct4055',
( 'struct',
('locations', 'Array0350'),
('message', 'Struct2774'),
('partialFingerprints', 'Struct4199'),
('relatedLocations', 'Array0350'),
('rule', 'Struct3942'),
('ruleId', 'String'),
('ruleIndex', 'Int'))),
('Struct0987', ('struct', ('location', 'Struct2683'))),
('Array1075', ('array', (0, 'Struct0987'))),
('Struct4194', ('struct', ('locations', 'Array1075'))),
('Array1597', ('array', (0, 'Struct4194'))),
('Struct7122', ('struct', ('threadFlows', 'Array1597'))),
('Array9799', ('array', (0, 'Struct7122'))),
( 'Struct9699',
( 'struct',
('codeFlows', 'Array9799'),
('locations', 'Array0350'),
('message', 'Struct2774'),
('partialFingerprints', 'Struct4199'),
('relatedLocations', 'Array0350'),
('rule', 'Struct3942'),
('ruleId', 'String'),
('ruleIndex', 'Int'))),
('Array6343', ('array', (1, 'Struct9699'), (0, 'Struct4055'))), # MANUALLY SORTED
('Struct8581', ('struct', ('enabled', 'Bool'), ('level', 'String'))),
( 'Struct7849',
( 'struct',
('kind', 'String'),
('precision', 'String'),
('security-severity', 'String'),
('severity', 'String'),
('sub-severity', 'String'),
('tags', 'Array7069'))),
( 'Struct6818',
( 'struct',
('defaultConfiguration', 'Struct8581'),
('fullDescription', 'Struct2774'),
('id', 'String'),
('name', 'String'),
('properties', 'Struct7849'),
('shortDescription', 'Struct2774'))),
('Array8754', ('array', (0, 'Struct6818'))),
( 'Struct7820',
( 'struct',
('name', 'String'),
('organization', 'String'),
('rules', 'Array8754'),
('version', 'String'))),
('Struct8972', ('struct', ('driver', 'Struct7820'))),
( 'Struct3081',
('struct', ('repositoryUri', 'String'), ('revisionId', 'String'))),
('Array5511', ('array', (0, 'Struct3081'))),
( 'Struct3388',
( 'struct',
('artifacts', 'Array4640'),
('columnKind', 'String'),
('newlineSequences', 'Array7069'),
('properties', 'Struct9543'),
('results', 'Array6343'),
('tool', 'Struct8972'),
('versionControlProvenance', 'Array5511'))),
('Array0177', ('array', (0, 'Struct3388'))),
( 'Struct6787',
( 'struct',
('$schema', 'String'),
('runs', 'Array0177'),
('version', 'String')))]
)
#
# The starting node is the typedef with '$schema' in the struct, also the leftmost
# node in ../notes/sarif-structure-from-sarif-to-dot.pdf
#
start_node_2022_02_01 = 'Struct6787'
#
# Utility classes
#


@@ -29,7 +29,7 @@ done
# cases covering the different output options. They are intended for manual use
# and review.
#
read -r file srcroot <<< "../data/treeio/results.sarif ../data/treeio/treeio"
read -r file srcroot <<< "../data/treeio/2021-12-09/results.sarif ../data/treeio/treeio"
# All results, minimal output
sarif-results-summary $file | less

scripts/table-tests.sh (new file, 11 lines)

@@ -0,0 +1,11 @@
# -*- sh -*-
#
# Sanity tests for the table-producing scripts. Should succeed and produce
# nothing on stdout/stderr
#
cd ~/local/sarif-cli/data/treeio/2021-12-09
sarif-extract-tables results.sarif test-tables
cd ~/local/sarif-cli/data/treeio
sarif-extract-multi multi-sarif-01.json test-multi-table
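# A possible extra check: confirm that the six tables listed in the
# sarif-extract-multi commit message were written.  `test -f` stays silent on
# success, keeping stdout/stderr empty.
for t in artifacts codeflows path-problem problem project related-locations; do
    test -f test-multi-table/$t.csv || echo "missing: test-multi-table/$t.csv" >&2
done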