diff --git a/bin/sarif-extract-scans b/bin/sarif-extract-scans
index 6da1f44..c2d6e4c 100755
--- a/bin/sarif-extract-scans
+++ b/bin/sarif-extract-scans
@@ -12,7 +12,7 @@ import logging
import pandas as pd
import pathlib
import sarif_cli.table_joins as tj
-import sarif_cli.derived_joins as derived
+import sarif_cli.scan_tables as st
import sys
#
@@ -88,6 +88,16 @@ class ScanTables:
def __init__(self): pass
scantabs = ScanTables()
+@dataclass
+class ExternalInfo:
+ # Values handed to the scan-table joins alongside the base tables (see the
+ # st.joins_for_* calls further down in this script).
+ scan_id : int # filled from scan_spec['scan_id'] below
+ ql_query_id : str # filled with a fixed placeholder string below (see TODO)
+
+# Fields are passed positionally: scan_id first, then ql_query_id.
+external_info = ExternalInfo(
+ scan_spec['scan_id'],
+ 'deadbeef00', # TODO: Take ql_query_id from where?
+)
+
#
# Add dataframes for base tables
#
@@ -102,13 +112,10 @@ bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, sf_2683)
bt.rules = tj.joins_for_rules(tgraph)
#
-# Form derived query tables
+# Form scan tables
#
-# XX
-# scantabs.project = derived.joins_for_project(bt)
-# scantabs.scans = derived.joins_for_scans(bt)
-# scantabs.results = derived.joins_for_results(bt)
-
+# results: the kind_problem and kind_pathproblem result rows, stacked
+scantabs.results = st.joins_for_results(bt, external_info)
+# NOTE: st.joins_for_scans is still a stub, so scantabs.scans is None for now
+scantabs.scans = st.joins_for_scans(bt, external_info)
#
# Replace the remaining internal ids with snowflake ids
diff --git a/notes/state-and-tables.drawio b/notes/state-and-tables.drawio
index 452c396..e048873 100644
--- a/notes/state-and-tables.drawio
+++ b/notes/state-and-tables.drawio
@@ -1 +1 @@

\ No newline at end of file
+7Z1Zb9u6Esc/TR4TiPKmPGbpdpEcFKf3nLZPASMztlpZdCkqifvpLymL8kLGcbYZFXeAArUoKbLIH6nRfxYf9M5m9x8Un08v5VjkB3E0vj/onR/EMWNDZv6zLYtlSzLqLRsmKhsvm6JVw5fst2jOdK1VNhZl07Zs0lLmOptvNqayKESqN9q4UvJu87AbmY83GuZ8IryGLynP/dav2VhPm7sYRKv2jyKbTN2VWdTsuebpz4mSVdFcr5CFWO6ZcfdnmkPLKR/Lu7Wm3ruD3pmSUi8/ze7PRG67dbPH3j+wt/3KShR6nxNOi+z2Ov4kv5bV5ck/avrX5w8fD5u/UuqF6woxNj3TbEqlp3IiC56/W7We1rcr7F9lZmt1zIWU86bxh9B60Qwzr7Q0TVM9y5u94j7T38zn6GjQbH1f23NuqYrcxsJtFFotlifFA7f9fX3n6rx6y53od5O7Z1mptLnpX98uTiT7fcyuvnJ9ov57efXp8DBuGNZcTYTedeBweaDturVLNMPwQciZMN/IHKBEznV2uwkdb9idtMetBtF8aMYxPKY7v/ctz6vmUgfxMDd3cFrOeWE+T+rPXGU3h+JeK57qw1llZps56WBwejg9GJw3H9PVx9J2o8qkOixTXpSrHfN6h/xhJmbTWh9w9KOU5mKRrPS80ofjTLlvYe5q/Yt4CG4CdjfNtPgy5/VY3Zn1ZxMmfl3KvNLiRKUNcHXraqtv71Ur+bOd2mZoTm9kod/zWZZbTmYG31TuguVWKC3udw5us7fv1rRmKRwkzfbdamGJHTHT9UXFrSmvD8TQA2I5KHbF5prXK6aya5ZQk6yYeANS3mWznBfiFfrNruF5zuelGD+nN5NAb7Jh5PemmwShzmyu9rcBlhcTc5NPutzxIHC5ttFdj+daqIJrcWphLr0xbG/1+cM6YN6w2hlpn4g6uzGTujxKy9tl02ND6lZ8O3fSaZaPL/jCMGJaSm0ecW7rdCpV9tscz930M7uVbqaa4Wz9iC/2zGYVVqI0x3x2gLCtpkt+v3HgBS9109Dgkl2332/GLaWnUms5aw4KU3mTZ/N/H3wM7J4qe8/3eJOY/p58st6O2f4woHtcLQnNhiE8nbFHZ8vllTlzG0jT3bpdqc9kbhak3nltTpmBzPJ8q4nn2aQwm7m4safZ8cqMPXfSNGtritjHXWrWs4v6mPP+quXvpmtskzTn3uS1ZTbNxmNR1CaONuviEjlLz1xmha77bnBq/pkePrPGi33YnZlttto2/+zhSp/JwtwLz2rchOH5Tlim915Ad0/5x/FcbI78o+vl8GEcX/TwGfjWSGae7vcEAAwAbh7iAdD3AKhURsMPM/yjBHv4B6HhP+Wl+EQPASAIWNTHpsB/BVkapqkcC9vxZKtC2Kq95FHzcRTAoufOe6KxusflumKtjjw+HZlkrD53ndqfzx3GaojHt1umkh0YkNUKR0LIaoUl4dgjQU+V4GNiAZqFkAkLyoKzlNZYyGXKdSYLIgGShKAdC4uCr7mKYmyGs5oVRAEQBb09zda3o8DXNg0FF5nz/RIDb87AANtUHPq6Zv1+SWsBKAcJtqE49OXNmgNaDQDfGyN0E9FXOZ23i0xEUBR66CaiL3WSvwNQQkI3Dn0pkRwe0BQk6OahryTORFnaOFRiAISBHkM3DX0Ncenh+mlMgqs519O5kqarZ+T7envfV7/X4PCwMyruBehIoh147IgkfOblEHxfI1/eNKNb5bq84krxBTnAnrlmtfP/JQ6wvSh5tSVr5OubWyzQywwcDgEvGDAOvtBp/aHvbcAGLQtgHAQ8YMAc+GKnqnJ6oQEjIOT5AkbA1zlrBOh5AEhBwPMFTIGvc7aucDIQ4IEIuMGAgfDVzlVsBD0eoDgIuMGAOfBVz5YDipCAfo0M+MOAcfDlz3UcyDkKCEPAIwYMg6+DtjBQzASCyIRtRSa+4rgJBK0PgDgEfGXAOPiiI8XXYqAQcpkBo+ALji0KFEUBB0I
P24BMfMVxHQQKpwDFYYhtQia++tjiQHEVsDAco5uPvghZV9YS4wvSItG46DN0O9LXIre5IEkSDIc+ui3pS5LbOJAyCU3FEN2w9JXJABUkQAAycYxuXfoC5TYTpFMiRD9hG5rHvk4Z5IJWC0Aq+thm5nEoRnLLzKQXD0AiXJQiHhG+arlNBImXcDwcY9uYx4FwSZ8H0jAhqRjG2FbmcSCCcosKkjJhmRigW5i+lNkwcFWPOYEAA8II3aj0tcu5ymZcLdziYF8yPvJySkwAJV1E6GalL2BuMfFlpUq8N4MjlNlf0KoBRUiMbmgGxMwqF+TmgGNggG5WBsTLmgHSICAz9LAtyfbngx6oNkCVBg6AKg0MnTvhian/LUNPqzSwz+XYHiQCVBpg0aPp5fTgetaKtZr9f06tAeYMbCo20Akg0KsNsIjSzJEZQK800Ca6U545Igb45QZYRJnm3UICvfYAi3yNlHLNEUhALzrAIso27xIQ6NUHWET55p3BAb/6AIso47xbSKDXIGBOAqOc804AgV6DgDHKOu8KDOgVCJhLfKe8c1wU8CsQMEaZ510CAr0SAWOUe94dHNArETBG2eedJAO9LAFjvj5J+edoQODXI2CMMtA7yAV6YQLGKAe9c1SgFyZgjLLQO0kGenkC5qI1KQ+9Q1zglydgMWWid4wJ9OIELPaVTMpFRyQCvTgBiykbvYNcoBcpYDHlo3eNCvwiBSymjPRuoIBem4DFvp5JOenIVKAXKmAxZaV3mxH8wgUsprx0dArQixOwmDLTuwACeoUCNvBtyoM6CX3rfaP8P09P3xOCp6eku/5sI7T3zRF3odxPS0nvbV6u91ylFCIlfeAbuWbaVqmmh9Vz16h2ur8oF32fmgWvuET5Vi3JpHAAhHLPgQHwTVbyqUFjEEo/B8bAt1nJ6Q4PQjABHZYEl6GyRgIF5QBTEMo5B6bAd7FTyB44B6GMc2AOYo8Dco4BUxBKMwemwHelN0JGlQtSL2DUi36gwl37tF7noI0Af5p6EbsfDtshX7BBR+SLYbhcEtXTe+Hy1M70l2gYQSrfcHUKZCats0CyOyAOAUUDGgdf2aTlAGz8A1IG9Pj7kmbBZ2SvQhEQ0jCgEfBFTVHY3qV1AIyCgIYBTUGgGpK4FTkxAMVAQL8AZmDkq5k3VZ6fizJV2dw634kGKBoCOgY0Db6qWU5NFxEOCO+NgXJ50Dj44qb90QhCAAqBQHk8aAR8ZXOuRJqVtBRASkjotuLI1xNLkVYq04vD0hiN9gPxAMVDoDAeNA++pkgYAGMQKooHjUEgSrK6phUBHIVAOTxoFHxZUfMJeRdgOQjUwYPmwNcWLQeUJAqJQaDoHTQGvrh4UMdAlCkvzNVOCnHn8UDhEC8Jh3DPgDZpfE8IWjviafEQo634i07HQyS+zpnK2SyjdI5nL0vtDP+DQiESX+CcK/lDUFoPJAf4MRBOal/jYHx9lSphVqFlzTSCAQgG/ICIxNc412EwI0UswLDQgdCIJCB0Got1uShcjQ0SRAMUDfghEklA5lzSIOcEAygM+LESiS92ainzK4qeg+QAP0oiCSidlgNzE+QWhXydxI+QSAJip0XhVyXU4ooUBnAk8CMmkkCmOFfZjaGh0ILqnQGqTejm47GvOi5ZMOMqyG4ApQE9ZqKtruS5RBoJkhJFtwef7T3KzV6XAtr+1s+eS0BrVT4xJXTreiy05CR7QAbgAokjX/smy+SZzrnn/LIt+vLja97O90FPIkAS0L0fceQL3rXabUu7k6gFiAK67yOOQsX+57LMtDQvsJWiZDAoFvB9H3Hkq91NCe8rY3tOKqpqA4gDuvMjjny922FQXvGC54vflDAMBwSkA+S0yG6v40/ya1ldnvyjpn99/vDxcOiNtRhPhHsVNV01lRPTD/m7VasZAPM+I+xfta+Cq2MupB3YuvGH0HrRvJLySttenOqZe2EV95n+ZkfvKB40m9/Xdp27F9F6Y+E2CnO/y7M
GbvP7+r7VafWWO88fveadqZSVSsXOAW2cAub9eiJ2Dn1jfNm+2znudRHs7FZsfJE3mOW+N8OFaCrxqzKwByY5yRH7yRFsD33gFUMy97peR0IyW+8NhWS+3lNltDeh3ZElfH8JhWSCc4AvSrili0Iy0WHAlyWYr1VTSCYKCx2QJZwtRCGZ+DTgqxLMF68pJBMHBvSQzJj58jWFZIJzgB6SaVAMc0AhmcCvk+ghmTF7IEqbQjKRkEAPyYxZQOKkkEwUtQnffAz8nheFZOLQgB+SyQLB2o3zQ9xnpTYD4uFAvo/nhGIOk7d1fQz+IM9H7EudZJe8bPVpJ/If5PmIfZGTPB/gHOB7PmJf3yTPBw4M+J6PmIpRdISFDng+YipG0Rka8D0fMRWj6AoM+J6P+AGZkxQMSA7wPR8xFaPoBAod8HzEVIyiW0jgez5iKkbRERbwPR89X3UkzwcSDZCej2AuT88ba8BcHve5Tsk5cmk9j6XyHKwn8hyN2sSe18/lcfp8h3N5goPa/9MHNX7DQXXZjH/aoPr2vVmUrYvKnmne+rld7uyX4eNDWeT+b66s+Ss3XI47+/MJZee3qsAHHnRxaGlrHYIv6bH4svhPv5Rx70dvcfytd5H0//qMtLY5pNkm0o8R/UT/6zrnwZtnnad819feKI00zN2z2ey54c1tD39V0rY3fdVum0+T2lhJeXFUGzbuL5gvtPwjywP8CVKbN18bXuPNcS2nfG6Pm91PFJ9Pj6xVkk7NNDoay7SamfGxJ9xNMy2+zJff8c4caM9cotKsaq8w00bR1lQbBWzKKDTVnj7TzKaStl/bfR/s7V/KsbBH/A8=
\ No newline at end of file
diff --git a/sarif_cli/scan_tables.py b/sarif_cli/scan_tables.py
new file mode 100644
index 0000000..9c4292c
--- /dev/null
+++ b/sarif_cli/scan_tables.py
@@ -0,0 +1,165 @@
+""" Collection of joins for the scan tables
+
+"""
+import pandas as pd
+from . import snowflake_id
+
+#
+# Scans table
+#
+# Notes on the columns of the `scans` table.  Where one is given, the text after
+# `--` names where the value comes from.  (Presumably this is the target schema
+# for joins_for_scans below -- confirm.)
+#
+# id --
+# commit_id -- pathval(r02s01, 'commit_sha')
+# project_id -- project.id
+# db_create_start -- pathval(r02s01, 'created_at')
+# db_create_stop
+# scan_start_date
+# scan_stop_date
+# tool_name -- pathval(r02s01, 'tool', 'name')
+# tool_version -- pathval(r02s01, 'tool', 'version')
+# tool_query_commit_id -- pathval(r02, 0, 'tool', 'version') is sufficient
+# sarif_content -- r02s02
+# sarif_file_name -- used on upload
+# sarif_id -- pathval(r02s01, 'sarif_id')
+# results_count -- pathval(r02s01, 'results_count')
+# rules_count -- pathval(r02s01, 'rules_count')
+#
+def joins_for_scans(basetables, external_info):
+ """
+ Return the `scans` table
+
+ Not implemented yet: the body is only a placeholder, so a call returns
+ None and neither argument is used.
+ """
+ # XX
+ pass
+
+#
+# Results table
+#
+def joins_for_results(basetables, external_info):
+    """
+    Build the `results` table and return it.
+
+    One sub-table is produced per result type -- kind_problem first, then
+    kind_pathproblem -- and the sub-tables are stacked in that order.
+    """
+    per_kind_builders = (
+        _results_from_kind_problem,
+        _results_from_kind_pathproblem,
+    )
+    stacked_parts = [build(basetables, external_info)
+                     for build in per_kind_builders]
+    return pd.concat(stacked_parts)
+
+def _results_from_kind_problem(basetables, external_info):
+    """
+    Build the `results` rows that come from the kind_problem table.
+
+    Each row of basetables.kind_problem becomes one result row with its own
+    snowflake id.  scan_id and ql_query_id are taken from external_info and
+    repeated on every row.  A kind_problem result has a single location, so
+    the same line/column span is stored as both source and sink.
+    """
+    id_source = snowflake_id.Snowflake(2)
+    problems = basetables.kind_problem
+
+    # The one location span of a problem, reused for source and sink below.
+    start_line = problems.location_startLine
+    start_col = problems.location_startColumn
+    end_line = problems.location_endLine
+    end_col = problems.location_endColumn
+
+    # NOTE(review): the kind_pathproblem rows also carry 'source_location' and
+    # 'sink_location' columns, which are not produced here -- confirm whether
+    # the two row sets are meant to share one schema.
+    columns = {
+        'id': [id_source.next() for _ in range(len(problems))],
+
+        # Scalars are repeated on every row.
+        'scan_id': external_info.scan_id,
+        'query_id': external_info.ql_query_id,
+
+        'result_type': "kind_problem",
+        # Codeflows exist for kind_pathproblem only; 0 is the filler used here.
+        'codeFlow_id': 0,
+
+        'message': problems.message_text,
+        'message_object': pd.NA,
+        'location': problems.location_uri,
+
+        'source_startLine': start_line,
+        'source_startCol': start_col,
+        'source_endLine': end_line,
+        'source_endCol': end_col,
+
+        'sink_startLine': start_line,
+        'sink_startCol': start_col,
+        'sink_endLine': end_line,
+        'sink_endCol': end_col,
+
+        # TODO: find high-level info from query name or tags?
+        'source_object': pd.NA,
+        'sink_object': pd.NA,
+    }
+    return pd.DataFrame(data=columns)
+
+
+def _results_from_kind_pathproblem(basetables, external_info):
+    """
+    Form the `results` rows that come from the kind_pathproblem table.
+
+    Only the source and the sink of each path are kept, no intermediate steps.
+    For every (codeflow_index, threadflow_index) pair found in
+    basetables.codeflows for a given codeFlows_id, the row with the smallest
+    location_index is used as the source and the row with the largest as the
+    sink.  Identical rows are collapsed, then every remaining row gets a
+    snowflake id.
+
+    basetables is read through its kind_pathproblem and codeflows tables;
+    external_info supplies scan_id and ql_query_id.
+
+    A codeFlows_id without matching rows in basetables.codeflows contributes
+    no result rows.
+
+    Raises AssertionError if a codeFlows_id still has more than one distinct
+    kind_pathproblem row once the relatedLocation_* columns are removed.
+    """
+    flakegen = snowflake_id.Snowflake(3)
+    codeflows = basetables.codeflows
+
+    # The sarif tables have relatedLocation information, which results in
+    # multiple rows for a single codeFlows_id.  The `results` table has no
+    # entry to distinguish these, so a simplified version of
+    # `kind_pathproblem` is used.
+    reduced_kind_pathp = basetables.kind_pathproblem.drop(
+        columns=[
+            'relatedLocation_array_index',
+            'relatedLocation_endColumn',
+            'relatedLocation_endLine',
+            'relatedLocation_id',
+            'relatedLocation_index',
+            'relatedLocation_message',
+            'relatedLocation_startColumn',
+            'relatedLocation_startLine',
+            'relatedLocation_uri',
+            'relatedLocation_uriBaseId',
+        ])
+
+    # Per codeFlows_id it should suffice to take one codeflow_index, one
+    # threadflow_index, and the first and last location_index.  To ensure
+    # nothing is missed, all entries are collected and duplicates are dropped
+    # afterwards.
+    source_sink_coll = []
+    for cfid0 in reduced_kind_pathp['codeFlows_id'].unique():
+        cfid0ppt0 = reduced_kind_pathp[
+            reduced_kind_pathp['codeFlows_id'] == cfid0].drop_duplicates()
+        if cfid0ppt0.shape[0] != 1:
+            # Raised explicitly instead of via `assert` so that the check is
+            # not removed by `python -O`; the exception type is unchanged.
+            raise AssertionError(
+                "Reduced kind_pathproblem table still has multiple entries")
+        message = cfid0ppt0.message_text.values[0]
+        location = cfid0ppt0.location_uri.values[0]
+
+        cfid0t0 = codeflows[codeflows['codeflow_id'] == cfid0]
+        # groupby visits only the (codeflow_index, threadflow_index) pairs
+        # that are present, in ascending order.  Unlike counting from 0 to
+        # max(), this copes with an empty selection and with gaps in the
+        # index values.
+        for _, tf0 in cfid0t0.groupby(['codeflow_index', 'threadflow_index']):
+            loc_index = tf0['location_index']
+            source = tf0[loc_index == loc_index.min()]
+            sink = tf0[loc_index == loc_index.max()]
+            # The unique row ids are added after the full table is done,
+            # below.
+            source_sink_coll.append({
+                'scan_id': external_info.scan_id,
+                'query_id': external_info.ql_query_id,
+                #
+                'result_type': "kind_pathproblem",
+                'codeFlow_id': cfid0,
+                #
+                'message': message,
+                'message_object': pd.NA,
+                'location': location,
+                #
+                'source_location': source.uri.values[0],
+                'source_startLine': source.startLine.values[0],
+                'source_startCol': source.startColumn.values[0],
+                'source_endLine': source.endLine.values[0],
+                'source_endCol': source.endColumn.values[0],
+                #
+                'sink_location': sink.uri.values[0],
+                'sink_startLine': sink.startLine.values[0],
+                'sink_startCol': sink.startColumn.values[0],
+                'sink_endLine': sink.endLine.values[0],
+                'sink_endCol': sink.endColumn.values[0],
+                #
+                # TODO: find high-level info from query name or tags?
+                'source_object': pd.NA,
+                'sink_object': pd.NA,
+            })
+
+    results0 = (pd.DataFrame(data=source_sink_coll)
+                .drop_duplicates()
+                .reset_index(drop=True))
+
+    # Now add the snowflake ids
+    results0['id'] = [flakegen.next() for _ in range(len(results0))]
+
+    return results0
diff --git a/scripts/table-tests.sh b/scripts/table-tests.sh
index cabb6aa..e759647 100644
--- a/scripts/table-tests.sh
+++ b/scripts/table-tests.sh
@@ -5,3 +5,4 @@
#
( cd ../data/treeio/2021-12-09 && sarif-extract-tables results.sarif test-tables )
( cd ../data/treeio && sarif-extract-multi multi-sarif-01.json test-multi-table )
+# Run sarif-extract-scans on scan-spec-0.json from the treeio data directory
+( cd ../data/treeio && sarif-extract-scans scan-spec-0.json test-scan )