mirror of
https://github.com/hohn/sarif-cli.git
synced 2025-12-16 17:23:03 +01:00
sarif-extract-tables: interim commit
Internal destructuring and array aggregation run, but still need to be tested. Tables need to be formed, and pandas selections/joins/etc. used for custom table output.
This commit is contained in:
committed by
=Michael Hohn
parent
cf8096446b
commit
7a517fa06c
131
bin/sarif-extract-tables
Executable file
131
bin/sarif-extract-tables
Executable file
@@ -0,0 +1,131 @@
|
||||
#!/usr/bin/env python
|
||||
""" Extract data from sarif files in table form.
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import sarif_cli.signature as S
|
||||
import sarif_cli.typegraph as T
|
||||
import sys
|
||||
from pprint import pprint
|
||||
from collections import defaultdict
|
||||
|
||||
#
# Start processing: command line handling
#
parser = argparse.ArgumentParser(
    description='Read a sarif file and produce tabular output.',
)
parser.add_argument(
    'file',
    metavar='sarif-file',
    type=str,
    help='input file, - for stdin',
)
# XX: this option is currently disabled
# parser.add_argument('-t', '--typedef-signatures', action="store_true",
#                     help='Give every object signature a type and report by types')
args = parser.parse_args()
|
||||
|
||||
#
# Load data
#
# sys.stdin is deliberately NOT managed by a `with` block: leaving the block
# would close it, which breaks any later interactive use of the terminal
# (e.g. the IPython session started further down).  Only files opened here
# are closed here.
if args.file == '-':
    sarif_struct = json.load(sys.stdin)
else:
    with open(args.file, 'r') as fp:
        sarif_struct = json.load(fp)
|
||||
|
||||
#
# Preprocess raw SARIF to get smaller signature
#
# The values are the leaf typedef names that also appear in
# T.struct_graph_2022_02_01 ('String', 'Int', 'Bool').
context = S.Context(dict(string="String", int="Int", bool="Bool"))
sarif_struct = S.fillsig(args, sarif_struct, context)
|
||||
|
||||
#
# Use reference type graph (signature) to traverse sarif and attach values to tables
#
# T.destructure() walks sarif_struct starting at the root typedef and appends
# one row per visited dict / array element to tgraph.instances, in place.
tgraph = T.Typegraph(T.struct_graph_2022_02_01)
T.destructure(tgraph, T.start_node_2022_02_01, sarif_struct)

# Drop into an interactive IPython session so the filled `tgraph` can be
# inspected by hand; the header text lists sample sanity checks.
# NOTE(review): this runs unconditionally -- presumably temporary scaffolding
# for this interim commit; confirm before relying on non-interactive use.
import IPython
IPython.embed(header="""
---------------------------------
ipython repl for

tgraph = T.Typegraph(T.struct_graph_2022_02_01)

---------------------------------
Sanity checks:
In [4]: tgraph.fields
Out[4]:
{'String': None,
'Int': None,
'Bool': None,
...
}
In [6]: tgraph.instances['String']
Out[6]: []

In [7]: tgraph.instances['Int']
Out[7]: []

In [8]: tgraph.instances['Bool']
Out[8]: []

Select value checks:
In [9]: tgraph.instances['Struct6787']
Out[9]:
[(4358601472,
'https://raw.githubusercontent.com/oasis-tcs/sarif-spec/master/Schemata/sarif-schema-2.1.0.json',
4362190016,
'2.1.0')]

In [10]: tgraph.fields['Struct6787']
Out[10]: ['$schema', 'runs', 'version']

In [5]: tgraph.instances['Array0177']
Out[5]:
[(4337396800, 0, 'Struct3388', 4337396928),
(4337396800, 1, 'Struct3388', 4337397056)]

In [12]: tgraph.fields['Array0177']
Out[12]: [0]

In [9]: tgraph.instances['Array7069'][0:5]
Out[9]:
[(4337397248, 0, 'String', '\r\n'),
(4337397248, 1, 'String', '\n'),
(4337397248, 2, 'String', '\u2028'),
(4337397248, 3, 'String', '\u2029'),
(4339863424, 0, 'String', 'maintainability')]


In [10]: tgraph.instances['Struct6299'][:3]
Out[10]:
[(4315110720, 17, 1214, 13, 1214),
(4315111232, -1, -1, 1, -1),
(4315124096, 30, 847, 17, 847)]

In [11]: tgraph.fields['Struct6299']
Out[11]: ['endColumn', 'endLine', 'startColumn', 'startLine']


""")
|
||||
|
||||
#
# Form output tables
#
# The argument parser of this script does not define --dot-output or
# --typedef-signatures, so reading args.dot_output / args.typedef_signatures
# directly raises AttributeError.  Treat a missing option as "not requested";
# if the options are added to the parser later, their values are used as-is.
dot_output = getattr(args, 'dot_output', False)
typedef_signatures = getattr(args, 'typedef_signatures', False)

if dot_output:
    # Emit the typedef graph in dot format: header, all nodes, all edges, footer.
    S._signature(args, sarif_struct, context)
    struct_graph = [(typedef, sig) for sig, typedef in context.sig_to_typedef.items()]
    S.write_header(sys.stdout)
    for typedef, sig in struct_graph:
        S.write_node(sys.stdout, typedef, sig)
    for typedef, sig in struct_graph:
        S.write_edges(args, sys.stdout, typedef, sig)
    S.write_footer(sys.stdout)

elif typedef_signatures:
    # Pretty-print the (typedef -> signature) mapping.
    S._signature(args, sarif_struct, context)
    struct_graph = {typedef: sig for sig, typedef in context.sig_to_typedef.items()}
    pprint(struct_graph, sys.stdout, indent=4)

else:
    # Default: pretty-print the signature of the whole structure.
    pprint(S._signature(args, sarif_struct, context), sys.stdout, indent=2)
|
||||
348
sarif_cli/typegraph.py
Normal file
348
sarif_cli/typegraph.py
Normal file
@@ -0,0 +1,348 @@
|
||||
"""Operations on the type graph produced by sarif-to-dot -u -t -f
|
||||
|
||||
Also contains some type graph reference values; these may be moved out into
|
||||
separate files at some point.
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
from typing import *
|
||||
|
||||
#
# Structure graph from ../../bin/sarif-to-dot -u -t -f results.sarif
#
# A list of (typedef, signature) pairs.  A signature is either
#   - a plain string ('string', 'int', 'bool') for leaf types, or
#   - a tuple ('struct', (fieldname, typedef), ...) for objects, or
#   - a tuple ('array', (index, typedef), ...) for arrays.
# For arrays with several element signatures the ORDER of the entries matters:
# _destructure_list() tries them first to last, so the more specific signature
# has to come first (see the MANUALLY SORTED entry below).
#
struct_graph_2022_02_01 = (
    [   ('String', 'string'),
        ('Int', 'int'),
        ('Bool', 'bool'),
        (   'Struct2685',
            (   'struct',
                ('index', 'Int'),
                ('uri', 'String'),
                ('uriBaseId', 'String'))),
        ('Struct5277', ('struct', ('location', 'Struct2685'))),
        ('Array4640', ('array', (0, 'Struct5277'))),
        ('Array7069', ('array', (0, 'String'))),
        (   'Struct9543',
            (   'struct',
                ('semmle.formatSpecifier', 'String'),
                ('semmle.sourceLanguage', 'String'))),
        ('Struct2774', ('struct', ('text', 'String'))),
        (   'Struct6299',
            (   'struct',
                ('endColumn', 'Int'),
                ('endLine', 'Int'),
                ('startColumn', 'Int'),
                ('startLine', 'Int'))),
        (   'Struct4963',
            (   'struct',
                ('artifactLocation', 'Struct2685'),
                ('region', 'Struct6299'))),
        (   'Struct2683',
            (   'struct',
                ('id', 'Int'),
                ('message', 'Struct2774'),
                ('physicalLocation', 'Struct4963'))),
        ('Array0350', ('array', (0, 'Struct2683'))),
        (   'Struct4199',
            (   'struct',
                ('primaryLocationLineHash', 'String'),
                ('primaryLocationStartColumnFingerprint', 'String'))),
        ('Struct3942', ('struct', ('id', 'String'), ('index', 'Int'))),
        (   'Struct4055',
            (   'struct',
                ('locations', 'Array0350'),
                ('message', 'Struct2774'),
                ('partialFingerprints', 'Struct4199'),
                ('relatedLocations', 'Array0350'),
                ('rule', 'Struct3942'),
                ('ruleId', 'String'),
                ('ruleIndex', 'Int'))),
        ('Struct0987', ('struct', ('location', 'Struct2683'))),
        ('Array1075', ('array', (0, 'Struct0987'))),
        ('Struct4194', ('struct', ('locations', 'Array1075'))),
        ('Array1597', ('array', (0, 'Struct4194'))),
        ('Struct7122', ('struct', ('threadFlows', 'Array1597'))),
        ('Array9799', ('array', (0, 'Struct7122'))),
        (   'Struct9699',
            (   'struct',
                ('codeFlows', 'Array9799'),
                ('locations', 'Array0350'),
                ('message', 'Struct2774'),
                ('partialFingerprints', 'Struct4199'),
                ('relatedLocations', 'Array0350'),
                ('rule', 'Struct3942'),
                ('ruleId', 'String'),
                ('ruleIndex', 'Int'))),
        ('Array6343', ('array', (1, 'Struct9699'), (0, 'Struct4055'))), # MANUALLY SORTED
        ('Struct8581', ('struct', ('enabled', 'Bool'), ('level', 'String'))),
        (   'Struct7849',
            (   'struct',
                ('kind', 'String'),
                ('precision', 'String'),
                ('security-severity', 'String'),
                ('severity', 'String'),
                ('sub-severity', 'String'),
                ('tags', 'Array7069'))),
        (   'Struct6818',
            (   'struct',
                ('defaultConfiguration', 'Struct8581'),
                ('fullDescription', 'Struct2774'),
                ('id', 'String'),
                ('name', 'String'),
                ('properties', 'Struct7849'),
                ('shortDescription', 'Struct2774'))),
        ('Array8754', ('array', (0, 'Struct6818'))),
        (   'Struct7820',
            (   'struct',
                ('name', 'String'),
                ('organization', 'String'),
                ('rules', 'Array8754'),
                ('version', 'String'))),
        ('Struct8972', ('struct', ('driver', 'Struct7820'))),
        (   'Struct3081',
            ('struct', ('repositoryUri', 'String'), ('revisionId', 'String'))),
        ('Array5511', ('array', (0, 'Struct3081'))),
        (   'Struct3388',
            (   'struct',
                ('artifacts', 'Array4640'),
                ('columnKind', 'String'),
                ('newlineSequences', 'Array7069'),
                ('properties', 'Struct9543'),
                ('results', 'Array6343'),
                ('tool', 'Struct8972'),
                ('versionControlProvenance', 'Array5511'))),
        ('Array0177', ('array', (0, 'Struct3388'))),
        (   'Struct6787',
            (   'struct',
                ('$schema', 'String'),
                ('runs', 'Array0177'),
                ('version', 'String')))]
)

#
# The starting node is the typedef with '$schema' in the struct, also the leftmost
# node in ../notes/sarif-structure-from-sarif-to-dot.pdf
#
# This is the `node` to pass to destructure() together with a whole SARIF tree.
#
start_node_2022_02_01 = 'Struct6787'
|
||||
|
||||
#
|
||||
# Utility classes
|
||||
#
|
||||
class MissingFieldException(Exception):
    """Raised when a (sub)tree lacks fields that its typedef requires.

    _destructure_list() catches this to fall back to the next candidate
    signature of an array element.
    """
|
||||
|
||||
class SignatureMismatch(Exception):
    """Raised when a dict is destructured against a typedef whose signature
    is not a tuple (i.e. a leaf type such as 'string')."""
|
||||
|
||||
|
||||
# Any value destructure() accepts: a JSON object, a JSON array, or one of the
# leaf types it recognizes.
Tree = Union[Dict, List, int, str, bool]
# Name of a typedef in the signature graph, e.g. 'Struct6787' or 'Array0177'.
NodeId = str
|
||||
|
||||
#
|
||||
# Data aggregate
|
||||
#
|
||||
@dataclass
class Typegraph:
    """Signature graph plus the row tables collected while destructuring.

    The table layout is easiest to see from an example.  Given this typedef

        ( 'Struct6787',
          ( 'struct',
            ('$schema', 'String'),
            ('runs', 'Array0177'),
            ('version', 'String')))

    and an instance SI of Struct6787, we have the following fields:

        instances['Struct6787'] = []

        fields['Struct6787'] = ('$schema',      # Sorted from here
                                'runs',
                                'version')

        table_header['Struct6787'] = ('id',
                                      '$schema',  # Sorted from here
                                      'runs',
                                      'version')

    The values are filled via

        instances['Struct6787'].append( (id(SI),          # "uplink" id
                                         SI['$schema'],   # value for int|string|bool
                                         id(SI['runs']),  # "downlink" id
                                         SI['version']) )

    which may evaluate to, e.g.,

        instances['Struct6787'].append( (4543584064,
                                         'schema-sarif...',
                                         4543582656,
                                         '2.1') )

    Array entries use a fixed header with labeled entries:
    (array_id, value_index, value_type, id_or_value_at_index)

        array_header['Array7069'] = ('id',
                                     'value_index',
                                     'value_type',
                                     'value_or_id')
    """
    signature_graph : Dict[NodeId, Any]     # (typedef -> signature) dict
    instances : Dict[NodeId, List[Tuple]]   # (node -> (row list)) dict
    fields: Dict[NodeId, List]              # (node -> (field list)) dict

    def __init__(self, signature_graph : List):
        """
        Arguments:
        signature_graph -- The graph of typedefs (signatures), see
                           struct_graph_2022_02_01 as example
        """
        self.signature_graph = dict(signature_graph)
        # Every typedef starts with an empty row table ...
        self.instances = {typedef: [] for typedef, _ in signature_graph}
        # ... and its sorted field names (None for leaf types); `fields` here
        # is the module-level function, not the attribute being assigned.
        self.fields = {typedef: fields(signature)
                       for typedef, signature in signature_graph}
|
||||
|
||||
def fields(signature):
    """Return the sorted field names (or array indices) of a signature.

    Arguments:
    signature -- either a leaf signature ('bool', 'int', 'string') or a tuple
                 (kind, (name, typedef), ...) as found in the signature graph

    Returns None for anything that is not a tuple, otherwise a sorted list of
    the first components of the entries following the leading kind tag.
    """
    if type(signature) is not tuple:
        # 'bool', 'int', 'string'
        return None
    _kind, *members = signature
    names = [name for name, _typedef in members]
    names.sort()
    return names
|
||||
|
||||
def dict_fields(tree: Dict):
    """Return the keys of `tree` as a sorted list."""
    names = list(tree.keys())
    names.sort()
    return names
|
||||
|
||||
#
|
||||
# Destructuring functions use the typegraph to destructure all subtrees into tables
|
||||
#
|
||||
def destructure(typegraph: Typegraph, node: NodeId, tree: Tree):
    """Record `tree` (and everything below it) in the tables of `typegraph`.

    Arguments:
    typegraph -- the Typegraph whose `instances` tables are extended in place
    node      -- typedef name that `tree` is expected to match
    tree      -- the value to destructure

    Dicts and lists are handed to the matching helper; str/int/bool leaves
    produce no rows of their own.  Any other type raises Exception.
    """
    kind = type(tree)
    if kind in (str, int, bool):
        # Leaves are stored inline in the row of their parent.
        return
    if kind == dict:
        _destructure_dict(typegraph, node, tree)
        return
    if kind == list:
        _destructure_list(typegraph, node, tree)
        return
    raise Exception("Unhandled type: %s" % kind)
|
||||
|
||||
def _destructure_dict_1(typegraph, node, tree):
|
||||
"""
|
||||
# typegraph.signature_graph destructuring
|
||||
d1 = dict(struct_graph_2022_02_01)
|
||||
In [765]: typ, *sig = d1['Struct6787']
|
||||
|
||||
In [766]: sig
|
||||
Out[766]: [('$schema', 'String'), ('runs', 'Array0177'), ('version', 'String')]
|
||||
|
||||
In [767]: typ
|
||||
Out[774]: 'struct'
|
||||
"""
|
||||
def id_or_value(tree, fieldname, fieldtype):
|
||||
""" Id for recursive types, value for leaves
|
||||
"""
|
||||
if fieldtype in ['Bool', 'Int', 'String']:
|
||||
return tree[fieldname]
|
||||
else:
|
||||
return id(tree[fieldname])
|
||||
|
||||
# Sanity check
|
||||
sig = typegraph.signature_graph[node]
|
||||
if type(sig) != tuple:
|
||||
raise SignatureMismatch()
|
||||
|
||||
# Destructure this dictionary
|
||||
subtype, *signature = sig
|
||||
typegraph.instances[node].append(
|
||||
(id(tree),
|
||||
*[id_or_value(tree, fieldname, fieldtype)
|
||||
for fieldname, fieldtype in signature]))
|
||||
|
||||
# Destructure recursive entries
|
||||
for fieldname, fieldtype in signature:
|
||||
if fieldtype not in ['Bool', 'Int', 'String']:
|
||||
destructure(typegraph, fieldtype, tree[fieldname])
|
||||
|
||||
|
||||
def _destructure_dict(typegraph: Typegraph, node, tree):
    """Check the dict `tree` against typedef `node`, then destructure it.

    Arguments:
    typegraph -- the Typegraph whose tables are extended in place
    node      -- typedef name the dict is expected to match
    tree      -- the dict to destructure

    Fields of `tree` that the typedef does not know are tolerated and
    ignored.  Fields the typedef requires must all be present.

    Raises:
    SignatureMismatch     -- `node` is a leaf typedef (it has no field list)
    MissingFieldException -- `tree` lacks at least one required field.  This
                             includes trees that also carry unknown extra
                             fields, so _destructure_list() can still fall
                             back to the next candidate signature.
    """
    type_fields = typegraph.fields[node]
    if type_fields is None:
        # Leaf typedefs have no field list; a dict can never match them.
        raise SignatureMismatch(
            "typedef %s is a leaf type and cannot describe a dict" % (node,))

    missing = set(type_fields) - set(dict_fields(tree))
    if missing:
        raise MissingFieldException(
            "XX: (Sub)tree is missing fields required by typedef %s: %s"
            % (node, sorted(missing, key=str)))

    # Either an exact match or a superset of the typedef's fields.
    # log.warning("XX: Tree has unrecognized fields") would go here for the
    # superset case.
    _destructure_dict_1(typegraph, node, tree)
|
||||
|
||||
|
||||
def _destructure_list(typegraph, node: str, tree: List):
|
||||
"""
|
||||
"""
|
||||
# List entries with multiple distinct signatures must be in order from most specific
|
||||
# to least specific.
|
||||
#
|
||||
# HERE, WE ASSUME THAT THE `signature` list (see below) IS SORTED IN THE CORRECT ORDER
|
||||
#
|
||||
# For the cases in struct_graph_2022_02_01, Struct4055 and
|
||||
# Struct9699, the signature with more fields takes precedence -- that is,
|
||||
# ('Array6343', ('array', (1, 'Struct9699'), (0, 'Struct4055'))), # MANUALLY SORTED
|
||||
#
|
||||
"""
|
||||
The three considered array signatures:
|
||||
|
||||
Multiple signatures (this is minimized by signature.fillsig()):
|
||||
In [753]: d1 = typegraph.signature_graph
|
||||
|
||||
In [949]: subtype, *signature = d1['Array6343']
|
||||
In [950]: subtype, signature
|
||||
Out[952]: ('array', [(0, 'Struct4055'), (1, 'Struct9699')])
|
||||
|
||||
In [953]: subtype
|
||||
Out[953]: 'array'
|
||||
|
||||
Single signature, with recursive subtype:
|
||||
|
||||
In [954]: subtype, *signature = d1['Array1597']
|
||||
In [955]: signature
|
||||
Out[955]: [(0, 'Struct4194')]
|
||||
|
||||
Single signature, leaf value:
|
||||
|
||||
In [956]: subtype, *signature = d1['Array7069']
|
||||
In [957]: signature
|
||||
Out[957]: [(0, 'String')]
|
||||
"""
|
||||
# Array entries use a fixed header with labeled entries:
|
||||
# (array_id, value_index, type_at_index, id_or_value_at_index)
|
||||
|
||||
subtype, *signature = typegraph.signature_graph[node]
|
||||
for value, valueindex in zip(tree, range(0,len(tree))):
|
||||
for sigindex, sigtype in signature:
|
||||
if sigtype in ['Bool', 'Int', 'String']:
|
||||
# Destructure array leaf entries
|
||||
typegraph.instances[node].append(
|
||||
(id(tree),
|
||||
valueindex,
|
||||
sigtype,
|
||||
value))
|
||||
else:
|
||||
# Destructure recursive entries
|
||||
try:
|
||||
destructure(typegraph, sigtype, value)
|
||||
typegraph.instances[node].append(
|
||||
(id(tree),
|
||||
valueindex,
|
||||
sigtype,
|
||||
id(value)))
|
||||
# Next `value` on success
|
||||
break
|
||||
except MissingFieldException as e:
|
||||
# Re-raise if last available signature failed, otherwise try
|
||||
# next `signature`
|
||||
if (sigindex, sigtype) == signature[-1]:
|
||||
raise
|
||||
Reference in New Issue
Block a user