sarif-extract-multi: extract combined tables from multiple sarif files

This command is built around a new tree structure that pulls in a collection
of sarif files.  In yaml format, an example is

    - creation_date: '2021-12-09'   # Repository creation date
      primary_language: javascript  # By lines of code
      project_name: treeio/treeio   # Repo name-short name
      query_commit_id: fa9571646c   # Commit id for custom (non-library) queries
      sarif_content: {}             # The sarif content will be attached here
      sarif_file_name: 2021-12-09/results.sarif # Path to sarif file
      scan_start_date: '2021-12-09'             # Beginning date/time of scan
      scan_stop_date:  '2021-12-10'             # End date/time of scan
      tool_name: codeql
      tool_version: v1.27

    - creation_date: '2022-02-25'
      primary_language: javascript
      ...

At run time,

    cd ~/local/sarif-cli/data/treeio
    sarif-extract-multi multi-sarif-01.json test-multi-table

will load the referenced sarif files, attach each one in place of its
`sarif_content` placeholder, build tables against the new signature found in
sarif_cli/signature_multi.py, and merge those into six larger tables.  The
exported tables are

    artifacts.csv  path-problem.csv  project.csv
    codeflows.csv  problem.csv       related-locations.csv

and they have join keys for further operations.
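
For example, each row of path-problem.csv keeps the id of its codeFlows
array, which codeflows.csv carries as `t9799_array_id`.  A minimal pandas
sketch of that join, assuming the column names produced by
sarif_cli/table_joins.py in this commit:

    import pandas as pd

    # Assumed column names, taken from table_joins.py in this commit:
    # path-problem.csv keeps the codeFlows array id as 't9699_codeFlows';
    # codeflows.csv keeps it as 't9799_array_id'.
    path_problem = pd.read_csv("test-multi-table/path-problem.csv")
    codeflows = pd.read_csv("test-multi-table/codeflows.csv")

    # One row per (path-problem result, code-flow step) pair.
    steps = path_problem.merge(
        codeflows, how="left",
        left_on="t9699_codeFlows", right_on="t9799_array_id",
        suffixes=("_result", "_step"))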

The new typegraph is rendered in

    notes/typegraph-multi.pdf

using the instructions in

    sarif_cli/signature_multi.py
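
which, per that file's docstring, amount to

    cd sarif-cli/data/treeio
    sarif-extract-multi -c multi-sarif-01.json none | \
        sarif-to-dot -u -t -f -n -d - | dot -Tpdf > typegraph-multi.pdf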
Michael Hohn authored on 2022-03-11 23:00:53 -08:00; committed by =Michael Hohn
parent 9c151e295b, commit 0f070a6ae4
19 changed files with 78848 additions and 25044 deletions

.gitattributes (vendored, 1 addition)

@@ -1 +1,2 @@
*.sarif filter=lfs diff=lfs merge=lfs -text
*.pdf filter=lfs diff=lfs merge=lfs -text

bin/sarif-extract-multi (new executable file, 88 lines)

@@ -0,0 +1,88 @@
#!/usr/bin/env python
""" Extract data from multiple sarif files in table form.
"""
import argparse
import json
import pathlib
from sarif_cli import signature, signature_multi
from sarif_cli import typegraph
import sarif_cli.table_joins as tj
import sys
from collections import defaultdict
import pandas as pd
#
# Start processing
#
parser = argparse.ArgumentParser(description='Read a collection of sarif files and produce tabular output.')
parser.add_argument('file', metavar='sarif-files.json', type=str,
                    help="json file containing the metadata array. Use - for stdin. ")
parser.add_argument('outdir', metavar='output-dir', type=str, help='output directory')
parser.add_argument('-c', '--combine-only', action="store_true",
                    help='Read the referenced input file(s) and write the combined structure to stdout')
args = parser.parse_args()
# Load meta info
with open(args.file, 'r') if args.file != '-' else sys.stdin as fp:
    meta_struct = json.load(fp)
# Attach referenced files
def load(fname):
    with open(fname, 'rb') as fp:
        content = json.load(fp)
    return content
for sarif_meta in meta_struct:
    sarif_meta['sarif_content'] = load(sarif_meta['sarif_file_name'])
# Only output composite?
if args.combine_only:
    json.dump(meta_struct, sys.stdout, indent=4)
    sys.exit(0)
#
# Preprocess raw SARIF to get smaller signature
#
context = signature.Context(
    {
        "string" : "String",
        "int" : "Int",
        "bool" : "Bool"
    }
)
meta_struct = signature.fillsig(args, meta_struct, context)
#
# Use reference type graph (signature) to traverse sarif and attach values to tables
#
tgraph = typegraph.Typegraph(signature_multi.struct_graph_2022_03_08)
typegraph.destructure(tgraph, signature_multi.start_node_2022_03_08, meta_struct)
#
# Form output tables
#
typegraph.attach_tables(tgraph)
#
# Form dataframes originally introduced by sarif-extract-tables
#
sf_2683 = tj.joins_for_sf_2683(tgraph)
kind_problem = tj.joins_for_problem(tgraph, sf_2683)
kind_pathproblem = tj.joins_for_path_problem(tgraph, sf_2683)
codeflows_9799 = tj.joins_for_codeflows(tgraph, sf_2683)
related_locations = tj.joins_for_relatedLocations(tgraph, sf_2683)
#
# Form the new dataframes
#
project_df = tj.joins_for_project(tgraph)
artifacts_df = tj.joins_for_artifacts(tgraph)
#
# Write output
#
p = pathlib.Path(args.outdir)
p.mkdir(exist_ok=True)
def write(path, frame):
    with p.joinpath(path).open(mode='wb') as fh:
        frame.to_csv(fh, index_label='index')
write('problem.csv', kind_problem)
write('path-problem.csv', kind_pathproblem)
write('codeflows.csv', codeflows_9799)
write('related-locations.csv', related_locations)
write('project.csv', project_df)
write('artifacts.csv', artifacts_df)

bin/sarif-extract-tables

@@ -1,10 +1,21 @@
#!/usr/bin/env python
""" Extract data from sarif files in table form.
These particular table joins create tables matching the content of
./sarif-results-summary
Return tables providing the `problem`, `path-problem` and `relatedLocations`
information.
The `problem` and `path-problem` entries provide that information; the
`relatedLocations` table provides the details when multiple results are present
for either.
"""
import argparse
import json
import pathlib
from sarif_cli import signature
from sarif_cli import signature, signature_single
from sarif_cli import typegraph
import sys
from collections import defaultdict
@@ -43,8 +54,8 @@ sarif_struct = signature.fillsig(args, sarif_struct, context)
#
# Use reference type graph (signature) to traverse sarif and attach values to tables
#
tgraph = typegraph.Typegraph(typegraph.struct_graph_2022_02_01)
typegraph.destructure(tgraph, typegraph.start_node_2022_02_01, sarif_struct)
tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)
#
# Form output tables

3 file diffs suppressed because they are too large

data/treeio/multi-sarif-01.json (new file, 26 lines)

@@ -0,0 +1,26 @@
[
    {
        "creation_date": "2021-12-09",
        "primary_language": "javascript",
        "project_name": "treeio/treeio",
        "query_commit_id": "fa9571646c",
        "sarif_content": {},
        "sarif_file_name": "2021-12-09/results.sarif",
        "scan_start_date": "2021-12-09",
        "scan_stop_date": "2021-12-10",
        "tool_name": "codeql",
        "tool_version": "v1.27"
    },
    {
        "creation_date": "2022-02-25",
        "primary_language": "javascript",
        "project_name": "treeio/treeio",
        "query_commit_id": "fa9571646c",
        "sarif_content": {},
        "sarif_file_name": "2022-02-25/results.sarif",
        "scan_start_date": "2022-02-25",
        "scan_stop_date": "2022-02-26",
        "tool_name": "codeql",
        "tool_version": "v1.29"
    }
]


@@ -0,0 +1,21 @@
- creation_date: '2021-12-09'   # Repository creation date
  primary_language: javascript  # By lines of code
  project_name: treeio/treeio   # Repo name-short name
  query_commit_id: fa9571646c   # Commit id for custom (non-library) queries
  sarif_content: {}             # The sarif content will be attached here
  sarif_file_name: 2021-12-09/results.sarif # Path to sarif file
  scan_start_date: '2021-12-09'             # Beginning date/time of scan
  scan_stop_date:  '2021-12-10'             # End date/time of scan
  tool_name: codeql
  tool_version: v1.27
- creation_date: '2022-02-25'
  primary_language: javascript
  project_name: treeio/treeio
  query_commit_id: fa9571646c
  sarif_content: {}
  sarif_file_name: 2022-02-25/results.sarif
  scan_start_date: '2022-02-25'
  scan_stop_date: '2022-02-26'
  tool_name: codeql
  tool_version: v1.29

File diff suppressed because it is too large

notes/typegraph-multi.pdf (new binary file, not shown)

sarif_cli/signature.py

@@ -4,7 +4,7 @@ These functions convert a SARIF (or any json structure) to its signature, with v
See sarif-to-dot for options and examples.
"""
from dataclasses import dataclass
import sarif_cli.traverse as traverse
from . import traverse
import zlib
#

sarif_cli/signature_multi.py (new file, 148 lines)

@@ -0,0 +1,148 @@
""" The signature for a multi-sarif result file
Produced by

    cd sarif-cli/data/treeio
    sarif-extract-multi -c multi-sarif-01.json none | sarif-to-dot -utf -

with some arrays manually sorted so that the signature with more fields comes first.  The case

    ('Array6343', ('array', (1, 'Struct9699'), (0, 'Struct4055'))), # MANUALLY SORTED

is marked below.

Also, this struct should be (and is) identical to struct_graph_2022_02_01 in the
leading entries, but there are two extras.

To get a map of this type graph, use

    cd sarif-cli/data/treeio
    sarif-extract-multi -c multi-sarif-01.json none | \
        sarif-to-dot -u -t -f -n -d - | dot -Tpdf > typegraph-multi.pdf
"""
#
# The starting node is the leftmost node in ../notes/typegraph-multi.pdf
#
start_node_2022_03_08 = 'Array6785'
struct_graph_2022_03_08 = (
[ ('String', 'string'),
('Int', 'int'),
('Bool', 'bool'),
( 'Struct2685',
( 'struct',
('index', 'Int'),
('uri', 'String'),
('uriBaseId', 'String'))),
('Struct5277', ('struct', ('location', 'Struct2685'))),
('Array4640', ('array', (0, 'Struct5277'))),
('Array7069', ('array', (0, 'String'))),
( 'Struct9543',
( 'struct',
('semmle.formatSpecifier', 'String'),
('semmle.sourceLanguage', 'String'))),
('Struct2774', ('struct', ('text', 'String'))),
( 'Struct6299',
( 'struct',
('endColumn', 'Int'),
('endLine', 'Int'),
('startColumn', 'Int'),
('startLine', 'Int'))),
( 'Struct4963',
( 'struct',
('artifactLocation', 'Struct2685'),
('region', 'Struct6299'))),
( 'Struct2683',
( 'struct',
('id', 'Int'),
('message', 'Struct2774'),
('physicalLocation', 'Struct4963'))),
('Array0350', ('array', (0, 'Struct2683'))),
( 'Struct4199',
( 'struct',
('primaryLocationLineHash', 'String'),
('primaryLocationStartColumnFingerprint', 'String'))),
('Struct3942', ('struct', ('id', 'String'), ('index', 'Int'))),
( 'Struct4055',
( 'struct',
('locations', 'Array0350'),
('message', 'Struct2774'),
('partialFingerprints', 'Struct4199'),
('relatedLocations', 'Array0350'),
('rule', 'Struct3942'),
('ruleId', 'String'),
('ruleIndex', 'Int'))),
('Struct0987', ('struct', ('location', 'Struct2683'))),
('Array1075', ('array', (0, 'Struct0987'))),
('Struct4194', ('struct', ('locations', 'Array1075'))),
('Array1597', ('array', (0, 'Struct4194'))),
('Struct7122', ('struct', ('threadFlows', 'Array1597'))),
('Array9799', ('array', (0, 'Struct7122'))),
( 'Struct9699',
( 'struct',
('codeFlows', 'Array9799'),
('locations', 'Array0350'),
('message', 'Struct2774'),
('partialFingerprints', 'Struct4199'),
('relatedLocations', 'Array0350'),
('rule', 'Struct3942'),
('ruleId', 'String'),
('ruleIndex', 'Int'))),
('Array6343', ('array', (1, 'Struct9699'), (0, 'Struct4055'))), # MANUALLY SORTED
('Struct8581', ('struct', ('enabled', 'Bool'), ('level', 'String'))),
( 'Struct7849',
( 'struct',
('kind', 'String'),
('precision', 'String'),
('security-severity', 'String'),
('severity', 'String'),
('sub-severity', 'String'),
('tags', 'Array7069'))),
( 'Struct6818',
( 'struct',
('defaultConfiguration', 'Struct8581'),
('fullDescription', 'Struct2774'),
('id', 'String'),
('name', 'String'),
('properties', 'Struct7849'),
('shortDescription', 'Struct2774'))),
('Array8754', ('array', (0, 'Struct6818'))),
( 'Struct7820',
( 'struct',
('name', 'String'),
('organization', 'String'),
('rules', 'Array8754'),
('version', 'String'))),
('Struct8972', ('struct', ('driver', 'Struct7820'))),
( 'Struct3081',
('struct', ('repositoryUri', 'String'), ('revisionId', 'String'))),
('Array5511', ('array', (0, 'Struct3081'))),
( 'Struct3388',
( 'struct',
('artifacts', 'Array4640'),
('columnKind', 'String'),
('newlineSequences', 'Array7069'),
('properties', 'Struct9543'),
('results', 'Array6343'),
('tool', 'Struct8972'),
('versionControlProvenance', 'Array5511'))),
('Array0177', ('array', (0, 'Struct3388'))),
( 'Struct6787',
( 'struct',
('$schema', 'String'),
('runs', 'Array0177'),
('version', 'String'))), # Up to here identical to struct_graph_2022_02_01
( 'Struct3739',
( 'struct',
('creation_date', 'String'),
('primary_language', 'String'),
('project_name', 'String'),
('query_commit_id', 'String'),
('sarif_content', 'Struct6787'),
('sarif_file_name', 'String'),
('scan_start_date', 'String'),
('scan_stop_date', 'String'),
('tool_name', 'String'),
('tool_version', 'String'))),
('Array6785', ('array', (0, 'Struct3739')))]
)

sarif_cli/signature_single.py (new file, 125 lines)

@@ -0,0 +1,125 @@
""" The signature for a single sarif file
Produced by

    sarif-to-dot -u -t -f 2021-12-09/results.sarif

with some arrays manually sorted so that the signature with more fields comes first.  The case

    ('Array6343', ('array', (1, 'Struct9699'), (0, 'Struct4055'))), # MANUALLY SORTED

is marked below.
"""
#
# The starting node is the leftmost node in ../notes/typegraph.pdf
#
start_node_2022_02_01 = 'Struct6787'
struct_graph_2022_02_01 = (
[ ('String', 'string'),
('Int', 'int'),
('Bool', 'bool'),
( 'Struct2685',
( 'struct',
('index', 'Int'),
('uri', 'String'),
('uriBaseId', 'String'))),
('Struct5277', ('struct', ('location', 'Struct2685'))),
('Array4640', ('array', (0, 'Struct5277'))),
('Array7069', ('array', (0, 'String'))),
( 'Struct9543',
( 'struct',
('semmle.formatSpecifier', 'String'),
('semmle.sourceLanguage', 'String'))),
('Struct2774', ('struct', ('text', 'String'))),
( 'Struct6299',
( 'struct',
('endColumn', 'Int'),
('endLine', 'Int'),
('startColumn', 'Int'),
('startLine', 'Int'))),
( 'Struct4963',
( 'struct',
('artifactLocation', 'Struct2685'),
('region', 'Struct6299'))),
( 'Struct2683',
( 'struct',
('id', 'Int'),
('message', 'Struct2774'),
('physicalLocation', 'Struct4963'))),
('Array0350', ('array', (0, 'Struct2683'))),
( 'Struct4199',
( 'struct',
('primaryLocationLineHash', 'String'),
('primaryLocationStartColumnFingerprint', 'String'))),
('Struct3942', ('struct', ('id', 'String'), ('index', 'Int'))),
( 'Struct4055',
( 'struct',
('locations', 'Array0350'),
('message', 'Struct2774'),
('partialFingerprints', 'Struct4199'),
('relatedLocations', 'Array0350'),
('rule', 'Struct3942'),
('ruleId', 'String'),
('ruleIndex', 'Int'))),
('Struct0987', ('struct', ('location', 'Struct2683'))),
('Array1075', ('array', (0, 'Struct0987'))),
('Struct4194', ('struct', ('locations', 'Array1075'))),
('Array1597', ('array', (0, 'Struct4194'))),
('Struct7122', ('struct', ('threadFlows', 'Array1597'))),
('Array9799', ('array', (0, 'Struct7122'))),
( 'Struct9699',
( 'struct',
('codeFlows', 'Array9799'),
('locations', 'Array0350'),
('message', 'Struct2774'),
('partialFingerprints', 'Struct4199'),
('relatedLocations', 'Array0350'),
('rule', 'Struct3942'),
('ruleId', 'String'),
('ruleIndex', 'Int'))),
('Array6343', ('array', (1, 'Struct9699'), (0, 'Struct4055'))), # MANUALLY SORTED
('Struct8581', ('struct', ('enabled', 'Bool'), ('level', 'String'))),
( 'Struct7849',
( 'struct',
('kind', 'String'),
('precision', 'String'),
('security-severity', 'String'),
('severity', 'String'),
('sub-severity', 'String'),
('tags', 'Array7069'))),
( 'Struct6818',
( 'struct',
('defaultConfiguration', 'Struct8581'),
('fullDescription', 'Struct2774'),
('id', 'String'),
('name', 'String'),
('properties', 'Struct7849'),
('shortDescription', 'Struct2774'))),
('Array8754', ('array', (0, 'Struct6818'))),
( 'Struct7820',
( 'struct',
('name', 'String'),
('organization', 'String'),
('rules', 'Array8754'),
('version', 'String'))),
('Struct8972', ('struct', ('driver', 'Struct7820'))),
( 'Struct3081',
('struct', ('repositoryUri', 'String'), ('revisionId', 'String'))),
('Array5511', ('array', (0, 'Struct3081'))),
( 'Struct3388',
( 'struct',
('artifacts', 'Array4640'),
('columnKind', 'String'),
('newlineSequences', 'Array7069'),
('properties', 'Struct9543'),
('results', 'Array6343'),
('tool', 'Struct8972'),
('versionControlProvenance', 'Array5511'))),
('Array0177', ('array', (0, 'Struct3388'))),
( 'Struct6787',
( 'struct',
('$schema', 'String'),
('runs', 'Array0177'),
('version', 'String')))]
)

sarif_cli/table_joins.py (new file, 316 lines)

@@ -0,0 +1,316 @@
""" Collection of joins for the base tables provided by typegraph.attach_tables()
The `problem` and `path-problem` entries provide that information; the
`relatedLocations` table provides the details when multiple results are
present for either. `project` is the high-level overview; `artifacts`
provides those for the other tables.
"""
import pandas as pd

def joins_for_sf_2683(tgraph):
    """
    Join all the tables used by 2683's right side into one.
    """
    # Access convenience functions
    sf = lambda num: tgraph.dataframes['Struct' + str(num)]
    af = lambda num: tgraph.dataframes['Array' + str(num)]
    #
    sf_2683 = (
        #
        sf(2683)
        .rename(columns={"struct_id": "struct_id_2683", "id": "id_2683"})
        #
        .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'physicalLocation'])
        #
        .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'region'])
        #
        .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'artifactLocation'])
        .rename(columns={"index": "location_index_2685"})
        #
        .merge(sf(2774), how="left", left_on='message', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'message'])
        .rename(columns={"text": "message_text_2683"})
        #
    )
    return sf_2683

def joins_for_problem(tgraph, sf_2683):
    """
    Return table providing the `problem` information.
    """
    # Access convenience functions
    sf = lambda num: tgraph.dataframes['Struct' + str(num)]
    af = lambda num: tgraph.dataframes['Array' + str(num)]
    #
    # Form the message dataframe (@kind problem) via joins
    #
    kind_problem_1 = (
        af(6343)
        .rename(columns={"value_index": "results_idx_6343", "array_id": "result_id_6343"})
        .merge(sf(4055), how="inner", left_on='id_or_value_at_index', right_on='struct_id',
               validate="1:m")
        .drop(columns=['type_at_index', 'id_or_value_at_index', 'struct_id'])
        .rename(columns={"message": "result_message_4055",
                         "relatedLocations": "relatedLocations_id"})
        # locations
        .merge(af('0350'), how="left", left_on='locations', right_on='array_id', validate="1:m")
        .drop(columns=['locations', 'array_id', 'type_at_index'])
        #
        .merge(sf_2683, how="left", left_on='id_or_value_at_index', right_on='struct_id_2683', validate="1:m")
        .drop(columns=['id_or_value_at_index', 'struct_id_2683'])
        #
        .merge(sf(2774), how="left", left_on='result_message_4055', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'result_message_4055'])
        .rename(columns={"text": "message_text_4055"})
        #
        .merge(sf(4199), how="left", left_on='partialFingerprints', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'partialFingerprints'])
        #
        .merge(
            sf(3942).rename(columns={"id": "rule_id", "index": "rule_index"}),
            how="left", left_on='rule', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'rule'])
        #
    )
    return kind_problem_1

def joins_for_codeflows(tgraph, sf_2683):
    """
    Return the table providing the `codeFlows` for a `path-problem` table.
    """
    # Access convenience functions
    sf = lambda num: tgraph.dataframes['Struct' + str(num)]
    af = lambda num: tgraph.dataframes['Array' + str(num)]
    #
    af_9799 = (
        af(9799).rename(columns={"array_id": "t9799_array_id", "value_index": "t9799_idx"})
        #
        .merge(sf(7122), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
        .drop(columns=['id_or_value_at_index', 'struct_id', 'type_at_index'])
        #
        .merge(af(1597).rename(columns={"array_id": "t1597_array_id", "value_index": "t1597_idx"}),
               how="left", left_on='threadFlows', right_on='t1597_array_id', validate="1:m")
        .drop(columns=['threadFlows', 't1597_array_id', 'type_at_index'])
        #
        .merge(sf(4194), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
        .drop(columns=['id_or_value_at_index', 'struct_id'])
        #
        .merge(af(1075).rename(columns={"array_id": "t1075_array_id", "value_index": "t1075_idx"}),
               how="left", left_on='locations', right_on='t1075_array_id', validate="1:m")
        .drop(columns=['locations', 't1075_array_id', 'type_at_index'])
        .rename(columns={"t1075_idx": "t1075_locations_idx"})
        #
        .merge(sf('0987'), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
        .drop(columns=['id_or_value_at_index', 'struct_id'])
        #
        .merge(sf_2683, how="left", left_on='location', right_on='struct_id_2683', validate="1:m")
        .drop(columns=['location', 'struct_id_2683'])
    )
    return af_9799

def joins_for_path_problem(tgraph, sf_2683):
    """
    Return table providing the `path-problem` information.
    """
    # Access convenience functions
    sf = lambda num: tgraph.dataframes['Struct' + str(num)]
    af = lambda num: tgraph.dataframes['Array' + str(num)]
    #
    kind_pathproblem_1 = (
        af(6343)
        .rename(columns={"value_index": "t6343_result_idx", "array_id": "t6343_result_id"})
        .merge(sf(9699), how="inner", left_on='id_or_value_at_index', right_on='struct_id',
               validate="1:m")
        .rename(columns={"codeFlows" : "t9699_codeFlows",
                         "locations" : "t9699_locations",
                         "message" : "t9699_message",
                         "partialFingerprints" : "t9699_partialFingerprints",
                         "relatedLocations" : "t9699_relatedLocations",
                         "rule" : "t9699_rule",
                         "ruleId" : "t9699_ruleId",
                         "ruleIndex" : "t9699_ruleIndex",
                         })
        .drop(columns=['id_or_value_at_index', 'struct_id', 'type_at_index'])
        # 9699.locations
        .merge(af('0350').rename(columns={"value_index": "t0350_location_idx"}),
               how="left", left_on='t9699_locations', right_on='array_id', validate="1:m")
        .drop(columns=['t9699_locations', 'array_id', 'type_at_index'])
        #
        .merge(sf_2683, how="left", left_on='id_or_value_at_index', right_on='struct_id_2683', validate="1:m")
        .drop(columns=['id_or_value_at_index', 'struct_id_2683'])
        #
        # # TODO: merge or keep separate?
        # # 9699.codeFlows
        # .merge(af_9799, how="left", left_on='t9699_codeFlows', right_on='t9799_array_id', validate="1:m")
        #
        # 9699.message
        .merge(sf(2774), how="left", left_on='t9699_message', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 't9699_message'])
        .rename(columns={"text": "t9699_message_text"})
        #
        # 9699.partialFingerprints
        .merge(sf(4199), how="left", left_on='t9699_partialFingerprints', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 't9699_partialFingerprints'])
        #
        # 9699.relatedLocations -- keep ids
        #
        # 9699.rule
        .merge(
            sf(3942).rename(columns={"id": "t3942_rule_id", "index": "t3942_rule_idx"}),
            how="left", left_on='t9699_rule', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 't9699_rule'])
    )
    # # TODO potential cleanup
    # # Remove dummy locations previously injected by signature.fillsig
    # kind_pathproblem_2 = kind_pathproblem_1[kind_pathproblem_1.uri != 'scli-dyys dummy value']
    # #
    return kind_pathproblem_1

def joins_for_relatedLocations(tgraph, sf_2683):
    """
    Return table providing the `relatedLocations` and `locations` information.
    """
    # Access convenience functions
    sf = lambda num: tgraph.dataframes['Struct' + str(num)]
    af = lambda num: tgraph.dataframes['Array' + str(num)]
    #
    # Form the relatedLocation dataframe via joins, starting from the union of
    # relatedLocations from `kind problem` (sf(4055)) and `kind path-problem`
    # (sf(9699)).
    #
    related_locations_1 = (
        pd.concat([sf(4055)[['relatedLocations', 'struct_id']], sf(9699)[['relatedLocations', 'struct_id']]])
        .merge(af('0350'), how="left", left_on='relatedLocations', right_on='array_id', validate="1:m")
        .drop(columns=['relatedLocations', 'array_id', 'value_index', 'type_at_index'])
        #
        .merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id',
               suffixes=("_4055_9699", "_2683"), validate="1:m")
        .drop(columns=['struct_id_2683', 'id_or_value_at_index'])
        #
        .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'physicalLocation'])
        #
        .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'region'])
        #
        .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'artifactLocation'])
        #
        .merge(sf(2774), how="left", left_on='message', right_on='struct_id', validate="1:m")
        .drop(columns=['struct_id', 'message'])
    )
    # Keep columns of interest
    related_locations_2 = (related_locations_1[['struct_id_4055_9699', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text']]
                           .rename({'text': 'message', 'struct_id_4055_9699': 'struct_id'}, axis='columns'))
    # Remove dummy locations previously injected by signature.fillsig
    related_locations_3 = related_locations_2[related_locations_2.uri != 'scli-dyys dummy value']
    return related_locations_3

def joins_for_project(tgraph):
    """
    Return table providing the `project` information.
    """
    # Access convenience functions
    sf = lambda num: tgraph.dataframes['Struct' + str(num)]
    af = lambda num: tgraph.dataframes['Array' + str(num)]
    #
    project_df = (
        af(6785)
        #
        .merge(sf(3739), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
        .drop(columns=['id_or_value_at_index', 'struct_id', 'array_id', 'type_at_index'])
        #
        .merge(sf(6787), how="left", left_on='sarif_content', right_on='struct_id', validate="1:m")
        .drop(columns=['sarif_content', 'struct_id'])
        .rename(columns={"version": "version_6787"})
        #
        .merge(af('0177'), how="left", left_on='runs', right_on='array_id',
               suffixes=("_6785", "_0177"), validate="1:m")
        .drop(columns=['runs', 'array_id', 'type_at_index'])
        #
        .merge(sf(3388), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
        .drop(columns=['id_or_value_at_index', 'struct_id'])
        #
        # .merge(af(7069), how="left", left_on='newlineSequences', right_on='array_id',
        #        validate="1:m")
        # .drop(columns=['newlineSequences', 'array_id', 'type_at_index'])
        .drop(columns=['newlineSequences'])
        #
        .merge(sf(9543), how="left", left_on='properties', right_on='struct_id', validate="1:m")
        .drop(columns=['properties', 'struct_id'])
        #
        # tool - driver - rules - defaultConfiguration - ( properties - tags )
        #
        .merge(sf(8972), how="left", left_on='tool', right_on='struct_id', validate="1:m")
        .drop(columns=['tool', 'struct_id'])
        #
        .merge(sf(7820), how="left", left_on='driver', right_on='struct_id', validate="1:m")
        .drop(columns=['driver', 'struct_id'])
        .rename(columns={"version": "driver_version_7820", "name": "driver_name_7820"})
        #
        .merge(af(8754), how="left", left_on='rules', right_on='array_id', validate="1:m")
        .drop(columns=['rules', 'array_id', 'type_at_index'])
        .rename(columns={"value_index": "rule_value_index_8754"}) # rule index
        #
        .merge(sf(6818), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
        .drop(columns=['id_or_value_at_index', 'struct_id'])
        .rename(columns={"id": "rule_id_6818", "name": "rule_name_6818"})
        #
        .merge(sf(8581), how="left", left_on='defaultConfiguration', right_on='struct_id', validate="1:m")
        .drop(columns=['defaultConfiguration', 'struct_id'])
        #
        .merge(sf(2774), how="left", left_on='fullDescription', right_on='struct_id', validate="1:m")
        .drop(columns=['fullDescription', 'struct_id'])
        .rename(columns={"text": "rule_fullDescription_6818"})
        #
        .merge(sf(2774), how="left", left_on='shortDescription', right_on='struct_id', validate="1:m")
        .drop(columns=['shortDescription', 'struct_id'])
        .rename(columns={"text": "rule_shortDescription_6818"})
        #
        .merge(sf(7849), how="left", left_on='properties', right_on='struct_id', validate="1:m")
        .drop(columns=['properties', 'struct_id'])
        #
        .merge(af(7069), how="left", left_on='tags', right_on='array_id', validate="1:m")
        .drop(columns=['tags', 'array_id', 'type_at_index'])
        .rename(columns={"value_index": "tag_index_7069", "id_or_value_at_index": "tag_text_7069"})
        # versionControlProvenance - repositoryUri
        # The merge with af(8754) replicates versionControlProvenance, no 1:m validation
        .merge(af(5511), how="left", left_on='versionControlProvenance', right_on='array_id')
        .drop(columns=['versionControlProvenance', 'array_id', 'type_at_index'])
        .rename(columns={"value_index": "versionControl_value_index_5511"})
        #
        .merge(sf(3081), how="left", left_on='id_or_value_at_index', right_on='struct_id')
        .drop(columns=['id_or_value_at_index', 'struct_id'])
        #
    )
    return project_df

def joins_for_artifacts(tgraph):
    """
    Return table providing the `artifacts` information.
    """
    # Access convenience functions
    sf = lambda num: tgraph.dataframes['Struct' + str(num)]
    af = lambda num: tgraph.dataframes['Array' + str(num)]
    #
    artifacts_df = (
        af(4640)
        #
        .merge(sf(5277), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
        .drop(columns=['id_or_value_at_index', 'struct_id', 'type_at_index'])
        .rename(columns={"value_index": "artifact_index_4640"})
        #
        .merge(sf(2685), how="left", left_on='location', right_on='struct_id', validate="1:m")
        .drop(columns=['location', 'struct_id'])
        .rename(columns={"index": "location_index_2685", "uri": "location_uri_2685",
                         "uriBaseId": "location_uriBaseId_2685"})
    )
    return artifacts_df

sarif_cli/typegraph.py

@@ -11,124 +11,6 @@ from dataclasses import dataclass
from typing import *
import pandas as pd
#
# Structure graph from ../../bin/sarif-to-dot -u -t -f results.sarif
#
struct_graph_2022_02_01 = (
[ ('String', 'string'),
('Int', 'int'),
('Bool', 'bool'),
( 'Struct2685',
( 'struct',
('index', 'Int'),
('uri', 'String'),
('uriBaseId', 'String'))),
('Struct5277', ('struct', ('location', 'Struct2685'))),
('Array4640', ('array', (0, 'Struct5277'))),
('Array7069', ('array', (0, 'String'))),
( 'Struct9543',
( 'struct',
('semmle.formatSpecifier', 'String'),
('semmle.sourceLanguage', 'String'))),
('Struct2774', ('struct', ('text', 'String'))),
( 'Struct6299',
( 'struct',
('endColumn', 'Int'),
('endLine', 'Int'),
('startColumn', 'Int'),
('startLine', 'Int'))),
( 'Struct4963',
( 'struct',
('artifactLocation', 'Struct2685'),
('region', 'Struct6299'))),
( 'Struct2683',
( 'struct',
('id', 'Int'),
('message', 'Struct2774'),
('physicalLocation', 'Struct4963'))),
('Array0350', ('array', (0, 'Struct2683'))),
( 'Struct4199',
( 'struct',
('primaryLocationLineHash', 'String'),
('primaryLocationStartColumnFingerprint', 'String'))),
('Struct3942', ('struct', ('id', 'String'), ('index', 'Int'))),
( 'Struct4055',
( 'struct',
('locations', 'Array0350'),
('message', 'Struct2774'),
('partialFingerprints', 'Struct4199'),
('relatedLocations', 'Array0350'),
('rule', 'Struct3942'),
('ruleId', 'String'),
('ruleIndex', 'Int'))),
('Struct0987', ('struct', ('location', 'Struct2683'))),
('Array1075', ('array', (0, 'Struct0987'))),
('Struct4194', ('struct', ('locations', 'Array1075'))),
('Array1597', ('array', (0, 'Struct4194'))),
('Struct7122', ('struct', ('threadFlows', 'Array1597'))),
('Array9799', ('array', (0, 'Struct7122'))),
( 'Struct9699',
( 'struct',
('codeFlows', 'Array9799'),
('locations', 'Array0350'),
('message', 'Struct2774'),
('partialFingerprints', 'Struct4199'),
('relatedLocations', 'Array0350'),
('rule', 'Struct3942'),
('ruleId', 'String'),
('ruleIndex', 'Int'))),
('Array6343', ('array', (1, 'Struct9699'), (0, 'Struct4055'))), # MANUALLY SORTED
('Struct8581', ('struct', ('enabled', 'Bool'), ('level', 'String'))),
( 'Struct7849',
( 'struct',
('kind', 'String'),
('precision', 'String'),
('security-severity', 'String'),
('severity', 'String'),
('sub-severity', 'String'),
('tags', 'Array7069'))),
( 'Struct6818',
( 'struct',
('defaultConfiguration', 'Struct8581'),
('fullDescription', 'Struct2774'),
('id', 'String'),
('name', 'String'),
('properties', 'Struct7849'),
('shortDescription', 'Struct2774'))),
('Array8754', ('array', (0, 'Struct6818'))),
( 'Struct7820',
( 'struct',
('name', 'String'),
('organization', 'String'),
('rules', 'Array8754'),
('version', 'String'))),
('Struct8972', ('struct', ('driver', 'Struct7820'))),
( 'Struct3081',
('struct', ('repositoryUri', 'String'), ('revisionId', 'String'))),
('Array5511', ('array', (0, 'Struct3081'))),
( 'Struct3388',
( 'struct',
('artifacts', 'Array4640'),
('columnKind', 'String'),
('newlineSequences', 'Array7069'),
('properties', 'Struct9543'),
('results', 'Array6343'),
('tool', 'Struct8972'),
('versionControlProvenance', 'Array5511'))),
('Array0177', ('array', (0, 'Struct3388'))),
( 'Struct6787',
( 'struct',
('$schema', 'String'),
('runs', 'Array0177'),
('version', 'String')))]
)
#
# The starting node is the typedef with '$schema' in the struct, also the leftmost
# node in ../notes/sarif-structure-from-sarif-to-dot.pdf
#
start_node_2022_02_01 = 'Struct6787'
#
# Utility classes
#


@@ -29,7 +29,7 @@ done
# cases covering the different output options. They are intended for manual use
# and review.
#
read -r file srcroot <<< "../data/treeio/results.sarif ../data/treeio/treeio"
read -r file srcroot <<< "../data/treeio/2021-12-09/results.sarif ../data/treeio/treeio"
# All results, minimal output
sarif-results-summary $file | less

scripts/table-tests.sh (new file, 11 lines)

@@ -0,0 +1,11 @@
# -*- sh -*-
#
# Sanity tests for the table-producing scripts. Should succeed and produce
# nothing on stdout/stderr
#
cd ~/local/sarif-cli/data/treeio/2021-12-09
sarif-extract-tables results.sarif test-tables
cd ~/local/sarif-cli/data/treeio
sarif-extract-multi multi-sarif-01.json test-multi-table
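# A possible extra check: confirm that the six tables listed in the
# sarif-extract-multi commit message were written.  `test -f` stays silent on
# success, keeping stdout/stderr empty.
for t in artifacts codeflows path-problem problem project related-locations; do
    test -f test-multi-table/$t.csv || echo "missing: test-multi-table/$t.csv" >&2
done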