mirror of
https://github.com/hohn/sarif-cli.git
synced 2025-12-16 09:13:04 +01:00
279 lines
9.2 KiB
Python
279 lines
9.2 KiB
Python
"""Operations on the type graph produced by sarif-to-dot -u -t -f
|
|
|
|
To get a map of this type graph, use
|
|
cd sarif-cli/data/treeio
|
|
../../bin/sarif-to-dot -u -t -f -n -d results.sarif | dot -Tpdf > typegraph.pdf
|
|
|
|
This file also contains some type graph reference values; these may be moved out into
|
|
separate files at some point.
|
|
"""
|
|
from dataclasses import dataclass
|
|
from typing import Any, Dict, List, Tuple, Union
|
|
import pandas as pd
|
|
|
|
#
|
|
# Utility classes
|
|
#
|
|
class MissingFieldException(Exception):
    """Raised when a (sub)tree is missing fields required by its typedef."""
|
|
|
|
class SignatureMismatch(Exception):
    """Raised when a dict is destructured against a signature that is not a tuple."""
|
|
|
|
|
|
# A (sub)tree handed to the destructuring functions: a container (dict or
# list) or one of the leaf value types.
Tree = Union[Dict, List, int, str, bool]

# Key into the signature graph, i.e. the name of a typedef such as
# 'Struct6787' or 'Array0177'.
NodeId = str
|
|
|
|
#
|
|
# Data aggregate
|
|
#
|
|
@dataclass
class Typegraph:
    """Signature graph plus the per-typedef tables destructured from a tree.

    Given this typedef

        ( 'Struct6787',
          ( 'struct',
            ('$schema', 'String'),
            ('runs', 'Array0177'),
            ('version', 'String')))

    and an instance SI of Struct6787, the constructor prepares

        instances['Struct6787'] = []

        fields['Struct6787'] = ['$schema',     # sorted by field name
                                'runs',
                                'version']

    The struct table header (as built by attach_tables) is

        ('struct_id',
         '$schema',                             # sorted from here
         'runs',
         'version')

    The rows are filled via

        instances['Struct6787'].append( (id(SI),         # "uplink" id
                                         SI['$schema'],  # value for int|string|bool
                                         id(SI['runs']), # "downlink" id
                                         SI['version']) )

    which may evaluate to, e.g.,

        instances['Struct6787'].append( (4543584064,
                                         'schema-sarif...',
                                         4543582656,
                                         '2.1') )

    Array entries use a fixed header with labeled entries (see attach_tables):

        ('array_id', 'value_index', 'type_at_index', 'id_or_value_at_index')
    """
    signature_graph: Dict[NodeId, Any]     # (typedef -> signature) dict
    instances: Dict[NodeId, List[Tuple]]   # (node -> (row list)) dict
    fields: Dict[NodeId, List]             # (node -> (field list)) dict
    dataframes: Dict[NodeId, Any]          # (node -> dataframe) dict

    def __init__(self, signature_graph: Union[List, Dict]):
        """
        Arguments:
        signature_graph -- The graph of typedefs (signatures), see
                           struct_graph_2022_02_01 as example.  Anything
                           accepted by dict() works: a list (or other
                           iterable) of (typedef, signature) pairs, or a
                           mapping from typedef to signature.
        """
        self.signature_graph = dict(signature_graph)
        self.instances = {}
        self.fields = {}
        self.dataframes = {}
        # Iterate the stored dict rather than the raw argument: the argument
        # may be a mapping (iterating it yields keys only) or a one-shot
        # iterable that dict() has already consumed.
        for typedef, signature in self.signature_graph.items():
            self.instances[typedef] = []
            # `fields` here is the module-level function, not the attribute.
            self.fields[typedef] = fields(signature)
|
|
|
|
def fields(signature):
    """Return the sorted field names of a signature, or None for a leaf.

    A tuple signature has the form (kind, (name, type), (name, type), ...);
    the names are collected and returned as a sorted list.  Any signature
    that is not exactly a tuple ('bool', 'int', 'string') yields None.
    """
    if type(signature) is tuple:
        _kind, *entries = signature
        names = [entry_name for entry_name, _entry_type in entries]
        names.sort()
        return names
    # 'bool', 'int', 'string'
    return None
|
|
|
|
def dict_fields(tree: Dict):
    """Return the keys of `tree` as a sorted list."""
    names = list(tree.keys())
    names.sort()
    return names
|
|
|
|
#
|
|
# Destructuring functions use the typegraph to destructure all subtrees into tables
|
|
#
|
|
def destructure(typegraph: Typegraph, node: NodeId, tree: Tree):
    """Destructure `tree` into `typegraph`, dispatching on its exact type.

    Dicts and lists are handed to the matching helper together with the
    typedef name `node`; str, int and bool leaves need no work here.  Any
    other type raises an Exception.
    """
    kind = type(tree)
    if kind in [str, int, bool]:
        # Leaf values are recorded by the enclosing struct/array row.
        return
    if kind == dict:
        _destructure_dict(typegraph, node, tree)
        return
    if kind == list:
        _destructure_list(typegraph, node, tree)
        return
    raise Exception("Unhandled type: %s" % kind)
|
|
|
|
def _destructure_dict_1(typegraph, node, tree):
|
|
"""
|
|
# typegraph.signature_graph destructuring
|
|
d1 = dict(struct_graph_2022_02_01)
|
|
In [765]: typ, *sig = d1['Struct6787']
|
|
|
|
In [766]: sig
|
|
Out[766]: [('$schema', 'String'), ('runs', 'Array0177'), ('version', 'String')]
|
|
|
|
In [767]: typ
|
|
Out[774]: 'struct'
|
|
"""
|
|
def id_or_value(tree, fieldname, fieldtype):
|
|
""" Id for recursive types, value for leaves
|
|
"""
|
|
if fieldtype in ['Bool', 'Int', 'String']:
|
|
return tree[fieldname]
|
|
else:
|
|
return id(tree[fieldname])
|
|
|
|
# Sanity check
|
|
sig = typegraph.signature_graph[node]
|
|
if type(sig) != tuple:
|
|
raise SignatureMismatch()
|
|
|
|
# Destructure this dictionary
|
|
subtype, *signature = sig
|
|
typegraph.instances[node].append(
|
|
(id(tree),
|
|
*[id_or_value(tree, fieldname, fieldtype)
|
|
for fieldname, fieldtype in signature]))
|
|
|
|
# Destructure recursive entries
|
|
for fieldname, fieldtype in signature:
|
|
if fieldtype not in ['Bool', 'Int', 'String']:
|
|
destructure(typegraph, fieldtype, tree[fieldname])
|
|
|
|
|
|
def _destructure_dict(typegraph: Typegraph, node, tree):
    """Check the keys of `tree` against typedef `node`, then destructure it.

    The tree is accepted when its keys equal the typedef's fields or are a
    superset of them.  A tree whose keys are a (strict) subset raises
    MissingFieldException; any other combination raises a plain Exception.
    """
    present = dict_fields(tree)
    required = typegraph.fields[node]
    if present != required:
        present_set = set(present)
        required_set = set(required)
        if not present_set.issuperset(required_set):
            if present_set.issubset(required_set):
                raise MissingFieldException("XX: (Sub)tree is missing fields required by typedef")
            raise Exception("typegraph: unhandled case reached. Internal error")
        # Superset: the tree carries fields the typedef does not know about.
        # Log a warning
        # log.warning("XX: Tree has unrecognized fields")
    _destructure_dict_1(typegraph, node, tree)
|
|
|
|
|
|
def _destructure_list(typegraph, node: str, tree: List):
|
|
"""
|
|
"""
|
|
# List entries with multiple distinct signatures must be in order from most specific
|
|
# to least specific.
|
|
#
|
|
# HERE, WE ASSUME THAT THE `signature` list (see below) IS SORTED IN THE CORRECT ORDER
|
|
#
|
|
# For the cases in struct_graph_2022_02_01, Struct4055 and
|
|
# Struct9699, the signature with more fields takes precedence -- that is,
|
|
# ('Array6343', ('array', (1, 'Struct9699'), (0, 'Struct4055'))), # MANUALLY SORTED
|
|
#
|
|
"""
|
|
The three considered array signatures:
|
|
|
|
Multiple signatures (this is minimized by signature.fillsig()):
|
|
In [753]: d1 = typegraph.signature_graph
|
|
|
|
In [949]: subtype, *signature = d1['Array6343']
|
|
In [950]: subtype, signature
|
|
Out[952]: ('array', [(0, 'Struct4055'), (1, 'Struct9699')])
|
|
|
|
In [953]: subtype
|
|
Out[953]: 'array'
|
|
|
|
Single signature, with recursive subtype:
|
|
|
|
In [954]: subtype, *signature = d1['Array1597']
|
|
In [955]: signature
|
|
Out[955]: [(0, 'Struct4194')]
|
|
|
|
Single signature, leaf value:
|
|
|
|
In [956]: subtype, *signature = d1['Array7069']
|
|
In [957]: signature
|
|
Out[957]: [(0, 'String')]
|
|
"""
|
|
# Array entries use a fixed header with labeled entries:
|
|
# (array_id, value_index, type_at_index, id_or_value_at_index)
|
|
|
|
subtype, *signature = typegraph.signature_graph[node]
|
|
for value, valueindex in zip(tree, range(0,len(tree))):
|
|
for sigindex, sigtype in signature:
|
|
if sigtype in ['Bool', 'Int', 'String']:
|
|
# Destructure array leaf entries
|
|
typegraph.instances[node].append(
|
|
(id(tree),
|
|
valueindex,
|
|
sigtype,
|
|
value))
|
|
else:
|
|
# Destructure recursive entries
|
|
try:
|
|
destructure(typegraph, sigtype, value)
|
|
typegraph.instances[node].append(
|
|
(id(tree),
|
|
valueindex,
|
|
sigtype,
|
|
id(value)))
|
|
# Next `value` on success
|
|
break
|
|
except MissingFieldException:
|
|
# Re-raise if last available signature failed, otherwise try
|
|
# next `signature`
|
|
if (sigindex, sigtype) == signature[-1]:
|
|
raise
|
|
|
|
#
|
|
# Form tables from destructured json/sarif
|
|
#
|
|
def attach_tables(typegraph):
    """Build a pandas DataFrame from the collected rows of every typedef.

    Typedefs whose name starts with 'Array' get the fixed array header;
    those starting with 'Struct' get 'struct_id' followed by the typedef's
    field list.  All other typedefs (String etc.) are skipped.  The frames
    are stored in typegraph.dataframes, keyed by typedef name.
    """
    array_columns = ('array_id', 'value_index', 'type_at_index', 'id_or_value_at_index')
    for typedef, rows in typegraph.instances.items():
        is_array = typedef.startswith('Array')
        if not is_array and not typedef.startswith('Struct'):
            continue            # skip String etc.
        if is_array:
            columns = array_columns
        else:
            columns = ('struct_id', *typegraph.fields[typedef])
        typegraph.dataframes[typedef] = pd.DataFrame(rows, columns=columns)
|
|
|
|
|
|
def tagged_array_columns(typegraph, array_id):
    """ Return a dict mapping the array column names to versions tagged with the id.

    Example:
    The original table headers are

        array_id  value_index  type_at_index  id_or_value_at_index

    the tagged versions become

        t8754_array_id  t8754_value_index  t8754_type_at_index  t8754_id_or_value_at_index

    `typegraph` is not consulted; array tables share one fixed header.
    """
    tag = str(array_id)
    tagged = {}
    for header in ('array_id', 'value_index', 'type_at_index', 'id_or_value_at_index'):
        tagged[header] = "t{:s}_{:s}".format(tag, header)
    return tagged
|
|
|
|
|
|
def tagged_struct_columns(typegraph, struct_id):
    """ Return a dict mapping the struct column names to versions tagged with the id.

    The column names are 'struct_id' followed by the field list stored in
    typegraph.fields for the typedef 'Struct<struct_id>'; each maps to
    't<struct_id>_<column name>'.
    """
    tag = str(struct_id)
    field_names = typegraph.fields['Struct' + tag]
    tagged = {}
    for header in ('struct_id', *field_names):
        tagged[header] = "t{:s}_{:s}".format(tag, header)
    return tagged
|