sarif-extract-tables: interim commit: form tables

Tables are now formed as pandas DataFrames and kept in the Typegraph instance (in its `dataframes` dict).
They will be tested by using pandas operations to reproduce one of the earlier outputs (the sarif-results-summary listing).
This commit is contained in:
Michael Hohn
2022-02-04 23:56:01 -08:00
committed by =Michael Hohn
parent 7a517fa06c
commit f246f06d4e
3 changed files with 162 additions and 68 deletions

View File

@@ -32,12 +32,13 @@
Set up the virtual environment and install the packages:
# pip freeze > requirements.txt
#+BEGIN_SRC sh
# Using requirements.txt
python3 -m venv .venv
. .venv/bin/activate
python3 -m pip install -r requirements.txt
# Or separately:
pip install --upgrade pip
pip install ipython pyyaml
pip install ipython pyyaml pandas
#+END_SRC
"Install" for local development:

View File

@@ -3,11 +3,12 @@
"""
import argparse
import json
import sarif_cli.signature as S
import sarif_cli.typegraph as T
from sarif_cli import signature
from sarif_cli import typegraph
import sys
from pprint import pprint
from collections import defaultdict
import pandas as pd
#
# Start processing
@@ -28,104 +29,173 @@ with open(args.file, 'r') if args.file != '-' else sys.stdin as fp:
#
# Preprocess raw SARIF to get smaller signature
#
context = S.Context(
context = signature.Context(
{
"string" : "String",
"int" : "Int",
"bool" : "Bool"
}
)
sarif_struct = S.fillsig(args, sarif_struct, context)
sarif_struct = signature.fillsig(args, sarif_struct, context)
#
# Use reference type graph (signature) to traverse sarif and attach values to tables
#
tgraph = T.Typegraph(T.struct_graph_2022_02_01)
T.destructure(tgraph, T.start_node_2022_02_01, sarif_struct)
tgraph = typegraph.Typegraph(typegraph.struct_graph_2022_02_01)
typegraph.destructure(tgraph, typegraph.start_node_2022_02_01, sarif_struct)
import IPython
IPython.embed(header="""
---------------------------------
ipython repl for
if 0:
import IPython
IPython.embed(header="""
---------------------------------
ipython repl for
tgraph = T.Typegraph(T.struct_graph_2022_02_01)
tgraph = typegraph.Typegraph(typegraph.struct_graph_2022_02_01)
---------------------------------
Sanity checks:
In [4]: tgraph.fields
Out[4]:
{'String': None,
'Int': None,
'Bool': None,
...
}
In [6]: tgraph.instances['String']
Out[6]: []
---------------------------------
Sanity checks:
In [4]: tgraph.fields
Out[4]:
{'String': None,
'Int': None,
'Bool': None,
...
}
In [6]: tgraph.instances['String']
Out[6]: []
In [7]: tgraph.instances['Int']
Out[7]: []
In [7]: tgraph.instances['Int']
Out[7]: []
In [8]: tgraph.instances['Bool']
Out[8]: []
In [8]: tgraph.instances['Bool']
Out[8]: []
Select value checks:
In [9]: tgraph.instances['Struct6787']
Out[9]:
[(4358601472,
'https://raw.githubusercontent.com/oasis-tcs/sarif-spec/master/Schemata/sarif-schema-2.1.0.json',
4362190016,
'2.1.0')]
Select value checks:
In [9]: tgraph.instances['Struct6787']
Out[9]:
[(4358601472,
'https://raw.githubusercontent.com/oasis-tcs/sarif-spec/master/Schemata/sarif-schema-2.1.0.json',
4362190016,
'2.1.0')]
In [10]: tgraph.fields['Struct6787']
Out[10]: ['$schema', 'runs', 'version']
In [10]: tgraph.fields['Struct6787']
Out[10]: ['$schema', 'runs', 'version']
In [5]: tgraph.instances['Array0177']
Out[5]:
[(4337396800, 0, 'Struct3388', 4337396928),
(4337396800, 1, 'Struct3388', 4337397056)]
In [5]: tgraph.instances['Array0177']
Out[5]:
[(4337396800, 0, 'Struct3388', 4337396928),
(4337396800, 1, 'Struct3388', 4337397056)]
In [12]: tgraph.fields['Array0177']
Out[12]: [0]
In [12]: tgraph.fields['Array0177']
Out[12]: [0]
In [9]: tgraph.instances['Array7069'][0:5]
Out[9]:
[(4337397248, 0, 'String', '\r\n'),
(4337397248, 1, 'String', '\n'),
(4337397248, 2, 'String', '\u2028'),
(4337397248, 3, 'String', '\u2029'),
(4339863424, 0, 'String', 'maintainability')]
In [9]: tgraph.instances['Array7069'][0:5]
Out[9]:
[(4337397248, 0, 'String', '\r\n'),
(4337397248, 1, 'String', '\n'),
(4337397248, 2, 'String', '\u2028'),
(4337397248, 3, 'String', '\u2029'),
(4339863424, 0, 'String', 'maintainability')]
In [10]: tgraph.instances['Struct6299'][:3]
Out[10]:
[(4315110720, 17, 1214, 13, 1214),
(4315111232, -1, -1, 1, -1),
(4315124096, 30, 847, 17, 847)]
In [10]: tgraph.instances['Struct6299'][:3]
Out[10]:
[(4315110720, 17, 1214, 13, 1214),
(4315111232, -1, -1, 1, -1),
(4315124096, 30, 847, 17, 847)]
In [11]: tgraph.fields['Struct6299']
Out[11]: ['endColumn', 'endLine', 'startColumn', 'startLine']
""")
In [11]: tgraph.fields['Struct6299']
Out[11]: ['endColumn', 'endLine', 'startColumn', 'startLine']
""")
#
# Form output tables
#
typegraph.attach_tables(tgraph)
import IPython
IPython.embed(header="""
---------------------------------
ipython repl for tables
---------------------------------
tgraph.dataframes
In [7]: sorted(tgraph.dataframes.keys())
Out[7]:
['Array0177',
'Array0350',
'Array1075',...]
sorted(tgraph.dataframes.keys())
tgraph.dataframes['Array0177']
tgraph.dataframes['Struct3388']
tgraph.signature_graph['Struct3388']
XX: reproduce the
file:line:col:line:col: message
output from
../../bin/sarif-results-summary results.sarif | less
as test. Sample:
RESULT: static/js/fileuploader.js:1214:13:1214:17: Unused variable size.
Collect typedef/fields via typegraph.pdf:
static/js/fileuploader.js
Struct2685/uri
In [22]: d1 = tgraph.dataframes['Struct2685']
In [24]: d1[d1.uri == "static/js/fileuploader.js"]
Out[24]:
struct_id index uri uriBaseId
0 4856718656 0 static/js/fileuploader.js %SRCROOT%
77 4856758336 0 static/js/fileuploader.js %SRCROOT%
...
:1214:13:1214:17:
Struct6299/startLine/startColumn/endLine/endColumn
Unused variable size.
Struct2774/message
d1 = tgraph.dataframes['Struct2774']
In [31]: d1[d1.text.str.contains("Unused variable size")]
Out[31]:
struct_id text
1 4856749504 Unused variable size.
103 4856879296 Unused variable size.
Follow the edges in typegraph.pdf to find joining typedefs and paths.
Struct4963
Struct2683
""")
if args.dot_output:
S._signature(args, sarif_struct, context)
signature._signature(args, sarif_struct, context)
struct_graph = [(typedef, sig) for sig, typedef in context.sig_to_typedef.items()]
S.write_header(sys.stdout)
signature.write_header(sys.stdout)
for typedef, sig in struct_graph:
S.write_node(sys.stdout, typedef, sig)
signature.write_node(sys.stdout, typedef, sig)
for typedef, sig in struct_graph:
S.write_edges(args, sys.stdout, typedef, sig)
S.write_footer(sys.stdout)
signature.write_edges(args, sys.stdout, typedef, sig)
signature.write_footer(sys.stdout)
elif args.typedef_signatures:
S._signature(args, sarif_struct, context)
signature._signature(args, sarif_struct, context)
struct_graph = dict((typedef, sig) for sig,typedef in context.sig_to_typedef.items())
pprint(struct_graph, sys.stdout, indent=4)
else:
pprint(S._signature(args, sarif_struct, context), sys.stdout, indent=2)
pprint(signature._signature(args, sarif_struct, context), sys.stdout, indent=2)

View File

@@ -1,10 +1,15 @@
"""Operations on the type graph produced by sarif-to-dot -u -t -f
"""Operations on the type graph produced by sarif-to-dot -u -t -f
Also contains some type graph reference values; these may be moved out into
To get a map of this type graph, use
cd sarif-cli/data/treeio
../../bin/sarif-to-dot -u -t -f -n -d results.sarif | dot -Tpdf > typegraph.pdf
This file also contains some type graph reference values; these may be moved out into
separate files at some point.
"""
from dataclasses import dataclass
from typing import *
import pandas as pd
#
# Structure graph from ../../bin/sarif-to-dot -u -t -f results.sarif
@@ -145,6 +150,7 @@ class Typegraph:
signature_graph : Dict[NodeId, Any] # (typedef -> signature) dict
instances : Dict[NodeId, List[Tuple]] # (node -> (row list)) dict
fields: Dict[NodeId, List] # (node -> (field list)) dict
dataframes: Dict[NodeId, Any] # (node -> dataframe) dict
"""
# Given this typedef
@@ -195,6 +201,7 @@ class Typegraph:
self.signature_graph = dict(signature_graph)
self.instances = {}
self.fields = {}
self.dataframes = {}
for typedef, signature in signature_graph:
self.instances[typedef] = []
self.fields[typedef] = fields(signature)
@@ -346,3 +353,19 @@ def _destructure_list(typegraph, node: str, tree: List):
# next `signature`
if (sigindex, sigtype) == signature[-1]:
raise
#
# Form tables from destructured json/sarif
#
def attach_tables(typegraph):
    """Form one pandas DataFrame per Array/Struct typedef and attach it.

    Walks `typegraph.instances` (typedef -> list of row tuples) and, for
    every typedef whose name starts with 'Array' or 'Struct', stores
    `pd.DataFrame(rows, columns=...)` in `typegraph.dataframes[typedef]`.

    Column layout:
      Array*  -- array_id, value_index, type_at_index, id_or_value_at_index
      Struct* -- 'struct_id' followed by the names in typegraph.fields[typedef]

    Typedefs with any other prefix (String etc.) get no table.  The
    function returns None; its result is the update of
    `typegraph.dataframes`.
    """
    for typedef, valarray in typegraph.instances.items():
        if typedef.startswith('Array'):
            # Arrays: every array typedef shares the same four columns.
            colheader = ['array_id', 'value_index', 'type_at_index',
                         'id_or_value_at_index']
        elif typedef.startswith('Struct'):
            # Structs: the leading row entry is the struct's id, the rest
            # follow the typedef's field list.
            colheader = ['struct_id', *typegraph.fields[typedef]]
        else:
            continue            # skip String etc.
        typegraph.dataframes[typedef] = pd.DataFrame(valarray, columns=colheader)