sarif-extract-tables: interim commit

Internal destructuring and array aggregation run, but still need to be tested.
Tables need to be formed, and pandas selections/joins/etc. used for custom table output.
This commit is contained in:
Michael Hohn
2022-02-04 14:44:55 -08:00
committed by =Michael Hohn
parent cf8096446b
commit 7a517fa06c
2 changed files with 479 additions and 0 deletions

131
bin/sarif-extract-tables Executable file
View File

@@ -0,0 +1,131 @@
#!/usr/bin/env python
""" Extract data from sarif files in table form.
"""
import argparse
import json
import sarif_cli.signature as S
import sarif_cli.typegraph as T
import sys
from pprint import pprint
from collections import defaultdict
#
# Start processing
#
# Command-line interface: one positional argument naming the SARIF input.
parser = argparse.ArgumentParser(
    description="Read a sarif file and produce tabular output.",
)
parser.add_argument(
    "file",
    metavar="sarif-file",
    type=str,
    help="input file, - for stdin",
)
# XX: disabled option, kept for reference.
# parser.add_argument('-t', '--typedef-signatures', action="store_true",
#                     help='Give every object signature a type and report by types')
args = parser.parse_args()
#
# Load data
#
# Parse the whole SARIF document into memory.
#
# Standard input is read directly instead of through `with`: using sys.stdin
# as a context manager closes it on exit, which leaves the process without a
# usable standard input for anything that runs afterwards.
if args.file == '-':
    sarif_struct = json.load(sys.stdin)
else:
    with open(args.file, 'r') as fp:
        sarif_struct = json.load(fp)
#
# Preprocess raw SARIF to get smaller signature
#
# Names given to the three primitive JSON types; they match the primitive
# typedefs 'String', 'Int' and 'Bool' of the reference type graph.
primitive_typedefs = {
    "string": "String",
    "int": "Int",
    "bool": "Bool",
}
context = S.Context(primitive_typedefs)
# NOTE(review): fillsig presumably returns the SARIF structure in a form with
# a smaller signature (per the section comment) -- confirm in sarif_cli.signature.
sarif_struct = S.fillsig(args, sarif_struct, context)
#
# Use reference type graph (signature) to traverse sarif and attach values to tables
#
# Start from empty tables for every typedef of the reference graph, then walk
# the SARIF tree from the graph's start node; rows accumulate in
# tgraph.instances.
tgraph = T.Typegraph(signature_graph=T.struct_graph_2022_02_01)
T.destructure(
    typegraph=tgraph,
    node=T.start_node_2022_02_01,
    tree=sarif_struct,
)
# Interim debugging aid: open an interactive IPython session at this point so
# the populated `tgraph` can be inspected by hand.  The header text shown at
# startup records sample sanity checks from such a session (the numeric ids in
# it are values of id() and differ from run to run).
# IPython is imported here rather than at the top of the file, so it is only
# required once execution reaches this point.
import IPython
IPython.embed(header="""
---------------------------------
ipython repl for
tgraph = T.Typegraph(T.struct_graph_2022_02_01)
---------------------------------
Sanity checks:
In [4]: tgraph.fields
Out[4]:
{'String': None,
'Int': None,
'Bool': None,
...
}
In [6]: tgraph.instances['String']
Out[6]: []
In [7]: tgraph.instances['Int']
Out[7]: []
In [8]: tgraph.instances['Bool']
Out[8]: []
Select value checks:
In [9]: tgraph.instances['Struct6787']
Out[9]:
[(4358601472,
'https://raw.githubusercontent.com/oasis-tcs/sarif-spec/master/Schemata/sarif-schema-2.1.0.json',
4362190016,
'2.1.0')]
In [10]: tgraph.fields['Struct6787']
Out[10]: ['$schema', 'runs', 'version']
In [5]: tgraph.instances['Array0177']
Out[5]:
[(4337396800, 0, 'Struct3388', 4337396928),
(4337396800, 1, 'Struct3388', 4337397056)]
In [12]: tgraph.fields['Array0177']
Out[12]: [0]
In [9]: tgraph.instances['Array7069'][0:5]
Out[9]:
[(4337397248, 0, 'String', '\r\n'),
(4337397248, 1, 'String', '\n'),
(4337397248, 2, 'String', '\u2028'),
(4337397248, 3, 'String', '\u2029'),
(4339863424, 0, 'String', 'maintainability')]
In [10]: tgraph.instances['Struct6299'][:3]
Out[10]:
[(4315110720, 17, 1214, 13, 1214),
(4315111232, -1, -1, 1, -1),
(4315124096, 30, 847, 17, 847)]
In [11]: tgraph.fields['Struct6299']
Out[11]: ['endColumn', 'endLine', 'startColumn', 'startLine']
""")
#
# Form output tables
#
# Select the output form.
#
# This script's parser defines only the positional `file` argument, so the
# two selection flags are read with a False default; accessing them as plain
# attributes raised AttributeError.  Without the flags the plain signature is
# pretty-printed.
if getattr(args, 'dot_output', False):
    # Dot graph: header, one node per typedef, then the edges, then footer.
    S._signature(args, sarif_struct, context)
    struct_graph = [(typedef, sig) for sig, typedef in context.sig_to_typedef.items()]
    S.write_header(sys.stdout)
    for typedef, sig in struct_graph:
        S.write_node(sys.stdout, typedef, sig)
    for typedef, sig in struct_graph:
        S.write_edges(args, sys.stdout, typedef, sig)
    S.write_footer(sys.stdout)
elif getattr(args, 'typedef_signatures', False):
    # (typedef -> signature) mapping, inverted from context.sig_to_typedef.
    S._signature(args, sarif_struct, context)
    struct_graph = {typedef: sig for sig, typedef in context.sig_to_typedef.items()}
    pprint(struct_graph, sys.stdout, indent=4)
else:
    pprint(S._signature(args, sarif_struct, context), sys.stdout, indent=2)

348
sarif_cli/typegraph.py Normal file
View File

@@ -0,0 +1,348 @@
"""Operations on the type graph produced by sarif-to-dot -u -t -f
Also contains some type graph reference values; these may be moved out into
separate files at some point.
"""
from dataclasses import dataclass
from typing import *
#
# Structure graph from ../../bin/sarif-to-dot -u -t -f results.sarif
#
# A list of (typedef, signature) pairs.  A signature is either
#   - the name of a primitive type: 'string', 'int' or 'bool', or
#   - a tuple ('struct', (fieldname, typedef), ...), or
#   - a tuple ('array', (index, typedef), ...).
# Array6343 is the only array here with more than one entry signature; its
# entries were ordered by hand (see the MANUALLY SORTED marker) so that the
# typedef with more fields, Struct9699, comes before Struct4055.
#
struct_graph_2022_02_01 = (
[ ('String', 'string'),
('Int', 'int'),
('Bool', 'bool'),
( 'Struct2685',
( 'struct',
('index', 'Int'),
('uri', 'String'),
('uriBaseId', 'String'))),
('Struct5277', ('struct', ('location', 'Struct2685'))),
('Array4640', ('array', (0, 'Struct5277'))),
('Array7069', ('array', (0, 'String'))),
( 'Struct9543',
( 'struct',
('semmle.formatSpecifier', 'String'),
('semmle.sourceLanguage', 'String'))),
('Struct2774', ('struct', ('text', 'String'))),
( 'Struct6299',
( 'struct',
('endColumn', 'Int'),
('endLine', 'Int'),
('startColumn', 'Int'),
('startLine', 'Int'))),
( 'Struct4963',
( 'struct',
('artifactLocation', 'Struct2685'),
('region', 'Struct6299'))),
( 'Struct2683',
( 'struct',
('id', 'Int'),
('message', 'Struct2774'),
('physicalLocation', 'Struct4963'))),
('Array0350', ('array', (0, 'Struct2683'))),
( 'Struct4199',
( 'struct',
('primaryLocationLineHash', 'String'),
('primaryLocationStartColumnFingerprint', 'String'))),
('Struct3942', ('struct', ('id', 'String'), ('index', 'Int'))),
( 'Struct4055',
( 'struct',
('locations', 'Array0350'),
('message', 'Struct2774'),
('partialFingerprints', 'Struct4199'),
('relatedLocations', 'Array0350'),
('rule', 'Struct3942'),
('ruleId', 'String'),
('ruleIndex', 'Int'))),
('Struct0987', ('struct', ('location', 'Struct2683'))),
('Array1075', ('array', (0, 'Struct0987'))),
('Struct4194', ('struct', ('locations', 'Array1075'))),
('Array1597', ('array', (0, 'Struct4194'))),
('Struct7122', ('struct', ('threadFlows', 'Array1597'))),
('Array9799', ('array', (0, 'Struct7122'))),
( 'Struct9699',
( 'struct',
('codeFlows', 'Array9799'),
('locations', 'Array0350'),
('message', 'Struct2774'),
('partialFingerprints', 'Struct4199'),
('relatedLocations', 'Array0350'),
('rule', 'Struct3942'),
('ruleId', 'String'),
('ruleIndex', 'Int'))),
('Array6343', ('array', (1, 'Struct9699'), (0, 'Struct4055'))), # MANUALLY SORTED
('Struct8581', ('struct', ('enabled', 'Bool'), ('level', 'String'))),
( 'Struct7849',
( 'struct',
('kind', 'String'),
('precision', 'String'),
('security-severity', 'String'),
('severity', 'String'),
('sub-severity', 'String'),
('tags', 'Array7069'))),
( 'Struct6818',
( 'struct',
('defaultConfiguration', 'Struct8581'),
('fullDescription', 'Struct2774'),
('id', 'String'),
('name', 'String'),
('properties', 'Struct7849'),
('shortDescription', 'Struct2774'))),
('Array8754', ('array', (0, 'Struct6818'))),
( 'Struct7820',
( 'struct',
('name', 'String'),
('organization', 'String'),
('rules', 'Array8754'),
('version', 'String'))),
('Struct8972', ('struct', ('driver', 'Struct7820'))),
( 'Struct3081',
('struct', ('repositoryUri', 'String'), ('revisionId', 'String'))),
('Array5511', ('array', (0, 'Struct3081'))),
( 'Struct3388',
( 'struct',
('artifacts', 'Array4640'),
('columnKind', 'String'),
('newlineSequences', 'Array7069'),
('properties', 'Struct9543'),
('results', 'Array6343'),
('tool', 'Struct8972'),
('versionControlProvenance', 'Array5511'))),
('Array0177', ('array', (0, 'Struct3388'))),
( 'Struct6787',
( 'struct',
('$schema', 'String'),
('runs', 'Array0177'),
('version', 'String')))]
)
#
# The starting node is the typedef with '$schema' in the struct, also the leftmost
# node in ../notes/sarif-structure-from-sarif-to-dot.pdf
#
# It names the typedef of the whole SARIF document ('$schema', 'runs',
# 'version') and is the `node` to pass to destructure() together with the
# complete document tree.
#
start_node_2022_02_01 = 'Struct6787'
#
# Utility classes
#
class MissingFieldException(Exception):
    """A (sub)tree lacks one or more fields required by its typedef.

    Raised while destructuring dictionaries; the list destructuring catches it
    in order to try the next candidate signature of an array entry.
    """
class SignatureMismatch(Exception):
    """The signature stored for a typedef does not fit the tree being destructured.

    Raised when a dictionary is destructured against a typedef whose signature
    is not a tuple.
    """
# A parsed JSON (sub)tree: the value types handled by destructure().
Tree = Union[Dict, List, int, str, bool]
# Name of a typedef in a signature graph, e.g. 'Struct6787' or 'Array0177'.
NodeId = str
#
# Data aggregate
#
@dataclass
class Typegraph:
    """Tables of destructured instances, keyed by the typedefs of a signature graph.

    Attributes:
    signature_graph -- (typedef -> signature) dict
    instances       -- (typedef -> row list) dict; rows are tuples
    fields          -- (typedef -> sorted field list) dict; None for the
                       primitive typedefs

    Given this typedef

        ( 'Struct6787',
          ( 'struct',
            ('$schema', 'String'),
            ('runs', 'Array0177'),
            ('version', 'String')))

    and an instance SI of Struct6787, we start with

        instances['Struct6787'] = []
        fields['Struct6787'] = ['$schema',     # Sorted from here
                                'runs',
                                'version']

    The corresponding table header (not stored in this class) is

        ('id',
         '$schema',                            # Sorted from here
         'runs',
         'version')

    The values are filled via

        instances['Struct6787'].append( (id(SI),           # "uplink" id
                                         SI['$schema'],    # value for int|string|bool
                                         id(SI['runs']),   # "downlink" id
                                         SI['version']) )

    which may evaluate to, e.g.,

        instances['Struct6787'].append( (4543584064,
                                         'schema-sarif...',
                                         4543582656,
                                         '2.1') )

    Array entries use a fixed header with labeled entries,
    (array_id, value_index, value_type, id_or_value_at_index):

        ('id',
         'value_index',
         'value_type',
         'value_or_id')
    """
    signature_graph : Dict[NodeId, Any]     # (typedef -> signature) dict
    instances : Dict[NodeId, List[Tuple]]   # (node -> (row list)) dict
    fields: Dict[NodeId, Optional[List]]    # (node -> (field list)) dict

    def __init__(self, signature_graph : Iterable[Tuple[NodeId, Any]]):
        """
        Arguments:
        signature_graph -- The graph of typedefs (signatures), as
                           (typedef, signature) pairs; see
                           struct_graph_2022_02_01 as example
        """
        # Consume the argument exactly once, so that one-shot iterables
        # (e.g. generators) work as well as lists.
        self.signature_graph = dict(signature_graph)
        # Every typedef starts with an empty table.
        self.instances = {typedef: [] for typedef in self.signature_graph}
        # The module-level fields() helper yields the sorted field names, or
        # None for primitive typedefs.
        self.fields = {typedef: fields(signature)
                       for typedef, signature in self.signature_graph.items()}
def fields(signature):
    """Return the sorted field names (or indices) of a typedef signature.

    A signature that is not a tuple -- the primitive 'bool', 'int', 'string'
    -- has no fields and yields None.  Otherwise the leading tag ('struct' or
    'array') is dropped and the first element of every remaining
    (name, typedef) pair is collected.
    """
    if type(signature) is not tuple:
        return None
    _tag, *members = signature
    names = [member_name for member_name, _member_type in members]
    names.sort()
    return names
def dict_fields(tree: Dict):
    """Return the keys of the dictionary `tree` as a sorted list."""
    names = list(tree.keys())
    names.sort()
    return names
#
# Destructuring functions use the typegraph to destructure all subtrees into tables
#
def destructure(typegraph: Typegraph, node: NodeId, tree: Tree):
    """Destructure `tree`, an instance of typedef `node`, into the typegraph tables.

    Dictionaries and lists are handed to their dedicated functions; strings,
    integers and booleans are leaves and need no work here, their values are
    recorded by the enclosing dictionary or list.

    Raises Exception for any other type of `tree`.
    """
    kind = type(tree)
    if kind in (str, int, bool):
        return
    if kind == dict:
        _destructure_dict(typegraph, node, tree)
        return
    if kind == list:
        _destructure_list(typegraph, node, tree)
        return
    raise Exception("Unhandled type: %s" % kind)
def _destructure_dict_1(typegraph, node, tree):
    """Append one row for the dictionary `tree` to the table of struct typedef `node`.

    The row is

        (id(tree), v1, v2, ...)

    with one entry per field of the typedef, in sorted field-name order --
    the same order as typegraph.fields[node].  For fields of primitive type
    ('Bool', 'Int', 'String') the entry is the value itself; for all other
    fields it is the id() of the subtree, which is then destructured
    recursively into the table of the field's typedef.

    Signature layout, for reference:

        typ, *sig = dict(struct_graph_2022_02_01)['Struct6787']
        typ == 'struct'
        sig == [('$schema', 'String'), ('runs', 'Array0177'), ('version', 'String')]

    Raises SignatureMismatch if the signature of `node` is not a struct
    signature, and KeyError if `tree` lacks a field of the signature.
    """
    leaf_types = ('Bool', 'Int', 'String')

    # Sanity check: only ('struct', (fieldname, typedef), ...) can describe a
    # dictionary.
    sig = typegraph.signature_graph[node]
    if not (isinstance(sig, tuple) and sig[:1] == ('struct',)):
        raise SignatureMismatch("typedef %s does not describe a struct" % node)

    # Order the columns by field name, independent of the order in which the
    # signature lists them, so rows always line up with the sorted field list.
    _subtype, *signature = sig
    columns = sorted(signature, key=lambda field: field[0])

    # Destructure this dictionary: value for leaves, id for recursive types.
    typegraph.instances[node].append(
        (id(tree),
         *[tree[fieldname] if fieldtype in leaf_types else id(tree[fieldname])
           for fieldname, fieldtype in columns]))

    # Destructure recursive entries
    for fieldname, fieldtype in columns:
        if fieldtype not in leaf_types:
            destructure(typegraph, fieldtype, tree[fieldname])
def _destructure_dict(typegraph: Typegraph, node, tree):
    """Check the dictionary `tree` against typedef `node`, then destructure it.

    The tree is accepted when it provides every field of the typedef; fields
    the typedef does not know about are tolerated.

    Raises MissingFieldException when at least one field of the typedef is
    absent from `tree` -- whether or not the tree also carries unrecognized
    fields -- so that list destructuring can fall back to the next candidate
    signature.  Raises SignatureMismatch when `node` is a primitive typedef,
    which has no field list.
    """
    type_fields = typegraph.fields[node]
    if type_fields is None:
        # 'String', 'Int' and 'Bool' have no fields; a dictionary is never an
        # instance of them.
        raise SignatureMismatch("typedef %s does not describe a struct" % node)

    tree_fields = dict_fields(tree)
    if set(type_fields) - set(tree_fields):
        raise MissingFieldException("XX: (Sub)tree is missing fields required by typedef")

    # Here the tree has all required fields, possibly more.
    # Log a warning for the latter
    # log.warning("XX: Tree has unrecognized fields")
    _destructure_dict_1(typegraph, node, tree)
def _destructure_list(typegraph, node: str, tree: List):
    """Destructure the list `tree`, an instance of array typedef `node`.

    For every entry one row is appended to typegraph.instances[node], using
    the fixed array layout

        (array_id, value_index, type_at_index, id_or_value_at_index)

    Leaf entries ('Bool', 'Int', 'String') store the value itself; all other
    entries are destructured recursively and store the id() of the entry.

    The considered array signatures:

    Multiple signatures (this is minimized by signature.fillsig()):

        subtype, *signature = typegraph.signature_graph['Array6343']
        subtype == 'array'
        signature == [(1, 'Struct9699'), (0, 'Struct4055')]

    Single signature, with recursive subtype:

        signature == [(0, 'Struct4194')]        # Array1597

    Single signature, leaf value:

        signature == [(0, 'String')]            # Array7069

    Candidate signatures are tried in the order listed, and every entry is
    recorded exactly once, under the first candidate that fits:
    - a leaf candidate fits when the entry has the matching Python type; the
      last candidate accepts the entry regardless, so single-signature
      arrays record every entry;
    - any other candidate fits when destructuring the entry against it does
      not raise MissingFieldException.

    Raises MissingFieldException if the last candidate fails that way.
    """
    # List entries with multiple distinct signatures must be in order from most specific
    # to least specific.
    #
    # HERE, WE ASSUME THAT THE `signature` list (see below) IS SORTED IN THE CORRECT ORDER
    #
    # For the cases in struct_graph_2022_02_01, Struct4055 and
    # Struct9699, the signature with more fields takes precedence -- that is,
    # ('Array6343', ('array', (1, 'Struct9699'), (0, 'Struct4055'))), # MANUALLY SORTED
    #
    leaf_types = {'Bool': bool, 'Int': int, 'String': str}

    _subtype, *signature = typegraph.signature_graph[node]
    last = len(signature) - 1
    rows = typegraph.instances[node]
    array_id = id(tree)

    for valueindex, value in enumerate(tree):
        for position, (_sigindex, sigtype) in enumerate(signature):
            if sigtype in leaf_types:
                # Exact type comparison: a bool must not pass for an 'Int'.
                if position != last and type(value) is not leaf_types[sigtype]:
                    continue
                # Destructure array leaf entries
                rows.append((array_id, valueindex, sigtype, value))
                break

            # Destructure recursive entries
            try:
                destructure(typegraph, sigtype, value)
            except MissingFieldException:
                # Re-raise if last available signature failed, otherwise try
                # next `signature`
                if position == last:
                    raise
                continue
            rows.append((array_id, valueindex, sigtype, id(value)))
            # Next `value` on success
            break