sarif-extract-tables: initial version, reproduces known output as table

Reproduce the

    file:line:col:line:col: message

output from

    ../../bin/sarif-results-summary results.sarif | grep size

as test/example.

Original sample output is

    RESULT: static/js/fileuploader.js:1214:13:1214:17: Unused variable size.
    RESULT: static/js/tinymce/jscripts/tiny_mce/plugins/media/js/media.js:438:30:438:34: Unused variable size.

The table result here is

    0:$ ../../bin/sarif-extract-tables results.sarif | grep size
    0,static/js/fileuploader.js,1214,13,1214,17,Unused variable size.
    34,static/js/tinymce/jscripts/tiny_mce/plugins/media/js/media.js,438,30,438,34,Unused variable size.
This commit is contained in:
Michael Hohn
2022-02-08 20:04:28 -08:00
committed by =Michael Hohn
parent f5e73e90ba
commit ec9a0b5590
2 changed files with 78 additions and 236 deletions

View File

@@ -6,7 +6,6 @@ import json
from sarif_cli import signature
from sarif_cli import typegraph
import sys
from pprint import pprint
from collections import defaultdict
import pandas as pd
@@ -15,9 +14,10 @@ import pandas as pd
#
# Command-line interface: one positional input file and an optional output
# format for the resulting table.
parser = argparse.ArgumentParser(description='Read a sarif file and produce tabular output.')
parser.add_argument('file', metavar='sarif-file', type=str, help='input file, - for stdin')
# XX
# parser.add_argument('-t', '--typedef-signatures', action="store_true",
# help='Give every object signature a type and report by types')
# -f/--output-format defaults to "csv"; the value is stored as
# args.output_format.
parser.add_argument('-f', '--output-format', metavar='format', type=str, default="csv",
help='Output format for table. Currently just csv; '
' other formats supported by pandas can be added.')
# Parse sys.argv into the `args` namespace.
args = parser.parse_args()
#
@@ -44,265 +44,107 @@ sarif_struct = signature.fillsig(args, sarif_struct, context)
# Build the type graph from the fixed 2022-02-01 struct graph, then process
# sarif_struct starting at the matching start node.
# NOTE(review): destructure presumably fills tgraph.instances with the values
# found in the input -- confirm in typegraph.py.
tgraph = typegraph.Typegraph(typegraph.struct_graph_2022_02_01)
typegraph.destructure(tgraph, typegraph.start_node_2022_02_01, sarif_struct)
if 0:
import IPython
IPython.embed(header="""
---------------------------------
ipython repl for
tgraph = typegraph.Typegraph(typegraph.struct_graph_2022_02_01)
---------------------------------
Sanity checks:
In [4]: tgraph.fields
Out[4]:
{'String': None,
'Int': None,
'Bool': None,
...
}
In [6]: tgraph.instances['String']
Out[6]: []
In [7]: tgraph.instances['Int']
Out[7]: []
In [8]: tgraph.instances['Bool']
Out[8]: []
Select value checks:
In [9]: tgraph.instances['Struct6787']
Out[9]:
[(4358601472,
'https://raw.githubusercontent.com/oasis-tcs/sarif-spec/master/Schemata/sarif-schema-2.1.0.json',
4362190016,
'2.1.0')]
In [10]: tgraph.fields['Struct6787']
Out[10]: ['$schema', 'runs', 'version']
In [5]: tgraph.instances['Array0177']
Out[5]:
[(4337396800, 0, 'Struct3388', 4337396928),
(4337396800, 1, 'Struct3388', 4337397056)]
In [12]: tgraph.fields['Array0177']
Out[12]: [0]
In [9]: tgraph.instances['Array7069'][0:5]
Out[9]:
[(4337397248, 0, 'String', '\r\n'),
(4337397248, 1, 'String', '\n'),
(4337397248, 2, 'String', '\u2028'),
(4337397248, 3, 'String', '\u2029'),
(4339863424, 0, 'String', 'maintainability')]
In [10]: tgraph.instances['Struct6299'][:3]
Out[10]:
[(4315110720, 17, 1214, 13, 1214),
(4315111232, -1, -1, 1, -1),
(4315124096, 30, 847, 17, 847)]
In [11]: tgraph.fields['Struct6299']
Out[11]: ['endColumn', 'endLine', 'startColumn', 'startLine']
""")
#
# Form output tables
#
typegraph.attach_tables(tgraph)
import IPython
IPython.embed(header="""
---------------------------------
ipython repl for tables
---------------------------------
tgraph.dataframes
In [7]: sorted(tgraph.dataframes.keys())
Out[7]:
['Array0177',
'Array0350',
'Array1075',...]
sorted(tgraph.dataframes.keys())
tgraph.dataframes['Array0177']
tgraph.dataframes['Struct3388']
tgraph.signature_graph['Struct3388']
XX: reproduce the
"""
Reproduce the
file:line:col:line:col: message
output from
../../bin/sarif-results-summary results.sarif | less
../../bin/sarif-results-summary results.sarif | grep size
as test. Sample:
as test/example. Sample output is
RESULT: static/js/fileuploader.js:1214:13:1214:17: Unused variable size.
RESULT: static/js/tinymce/jscripts/tiny_mce/plugins/media/js/media.js:438:30:438:34: Unused variable size.
Collect typedef/fields via typegraph.pdf:
The tree paths that match up .startLine with .text and .uri are
- .results > .[] > .message > .text
- .results > .[] > .locations > .[] > .physicalLocation > .region > .startLine
- .results > .[] > .locations > .[] > .physicalLocation > .artifactLocation > .uri
static/js/fileuploader.js
Struct2685/uri
Note that this IGNORES the path
- .results > .[] > .relatedLocations > .[] > .physicalLocation > .text
In [22]: d1 = tgraph.dataframes['Struct2685']
In [24]: d1[d1.uri == "static/js/fileuploader.js"]
Out[24]:
struct_id index uri uriBaseId
0 4856718656 0 static/js/fileuploader.js %SRCROOT%
77 4856758336 0 static/js/fileuploader.js %SRCROOT%
...
We need appropriate table joins to replicate those tree paths; following the edges
in typegraph.pdf is the most direct way to find relevant tables and keys.
:1214:13:1214:17:
Struct6299/startLine/startColumn/endLine/endColumn
We only care about .message with matching .startLine, so left joins should
work without losing any data. Here are the tree paths and their corresponding
tables; the tree paths are from left to right and the joins can be done in the
same order.
Unused variable size.
Struct2774/message
d1 = tgraph.dataframes['Struct2774']
In [31]: d1[d1.text.str.contains("Unused variable size")]
Out[31]:
struct_id text
1 4856749504 Unused variable size.
103 4856879296 Unused variable size.
Using ../notes/typegraph.pdf, we find these:
Follow the edges in typegraph.pdf to find joining typedefs and paths.
|------------+----------+---------+-------------------+-------------------+------------|
| .locations | | .[] | .physicalLocation | .artifactLocation | .uri |
| sf(4055) | | af(350) | sf(2683) | sf(4963) | sf(2685) |
|------------+----------+---------+-------------------+-------------------+------------|
| .locations | | .[] | .physicalLocation | .region | .startLine |
| sf(4055) | | af(350) | sf(2683) | sf(4963) | sf(6299) |
|------------+----------+---------+-------------------+-------------------+------------|
| .message | .text | | | | |
| sf(4055) | sf(2774) | | | | |
|------------+----------+---------+-------------------+-------------------+------------|
Struct4963
Struct2683
""")
"""
#
# Access convenience functions
#
def sf(num):
    """Return the table stored in ``tgraph.dataframes`` for ``Struct<num>``.

    ``num`` is converted with ``str`` and appended to ``'Struct'``, so both
    ``sf(4055)`` and ``sf('4055')`` look up the key ``'Struct4055'``.
    """
    return tgraph.dataframes['Struct' + str(num)]


def af(num):
    """Return the table stored in ``tgraph.dataframes`` for ``Array<num>``.

    ``num`` is converted with ``str`` and appended to ``'Array'``.  Pass a
    string when the typedef number has leading zeros, e.g. ``af('0350')``,
    because ``0350`` is not a valid integer literal.
    """
    return tgraph.dataframes['Array' + str(num)]
#
# These merges are for reconstructing ../../bin/sarif-results-summary output, but
# they also form the "bottom right" dataframe on the type graph (see the .pdf) and
# can be used for other result-oriented output.
# Form the dataframe via joins
#
# original dataframes
#
# Struct2685/uri
f2685 = odf_location = tgraph.dataframes['Struct2685']
# Struct6299/startLine/startColumn/endLine/endColumn
f6299 = odf_region = tgraph.dataframes['Struct6299']
# Struct2774/message
f2774 = odf_message = tgraph.dataframes['Struct2774']
#
# Linking dataframes
#
f4963 = ldf_physicalLocation = tgraph.dataframes['Struct4963']
f2683 = tgraph.dataframes['Struct2683']
# f4963 -> f6299
m_f4963_f6299 = pd.merge(
f4963,
f6299,
how="inner",
on=None,
left_on='region',
right_on='struct_id',
left_index=False,
right_index=False,
sort=True,
suffixes=("_f4963", "_f6299"),
copy=True,
indicator=False,
validate="1:m",
)
# m_f4963_f6299 -> f2685
m_f4963_f6299_f2685 = pd.merge(
m_f4963_f6299,
f2685,
how="inner",
on=None,
left_on='artifactLocation',
right_on='struct_id',
left_index=False,
right_index=False,
sort=True,
suffixes=("_m_f4963_f6299", "_f2685"),
copy=True,
indicator=False,
validate="1:m",
)
# f2683 -> m_f4963_f6299_f2685
m_f2683_f4963_f6299_f2685 = pd.merge(
f2683,
m_f4963_f6299_f2685,
how="inner",
on=None,
left_on='physicalLocation',
right_on='struct_id_f4963',
left_index=False,
right_index=False,
sort=True,
suffixes=("_f2683", "_m_f4963_f6299_f2685"),
copy=True,
indicator=False,
validate="1:m",
)
# m_f2683_f4963_f6299_f2685 -> f2774
m_f2683_f4963_f6299_f2685_f2774 = pd.merge(
m_f2683_f4963_f6299_f2685,
f2774,
how="inner",
on=None,
left_on='message',
right_on='struct_id',
left_index=False,
right_index=False,
sort=True,
suffixes=("_m_f2683_f4963_f6299_f2685", "_f2774"),
copy=True,
indicator=False,
validate="1:m",
# Build one wide table by chaining left joins.  Each merge follows a
# foreign-key column on the left to the array_id / struct_id column of the
# next table, and the key columns are dropped once they have been used.
#
# NOTE(review): validate="1:m" makes pandas check that the LEFT keys are
# unique.  For the struct lookups below the right-hand struct_id looks like
# the unique side, so "m:1" may be the intended check; e.g. a result with
# several locations would repeat message_4055 on the left and fail the
# validation.  Confirm against real data before changing.
d1 = (
sf(4055)
# Expand the locations array; the number is passed as a string to keep the
# leading zero of 'Array0350'.
.merge(af('0350'), how="left", left_on='locations', right_on='array_id', validate="1:m")
.drop(columns=['struct_id', 'locations', 'array_id', 'value_index', 'type_at_index'])
# Array element -> Struct2683.  Both sides carry a `message` column, so the
# suffixes produce message_4055 and message_2683.
.merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id',
suffixes=("_4055", "_2683"), validate="1:m")
.drop(columns=['struct_id', 'id_or_value_at_index'])
# physicalLocation -> Struct4963
.merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'physicalLocation'])
# region -> Struct6299
.merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'region'])
# artifactLocation -> Struct2685
.merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'artifactLocation'])
# message_4055 -> Struct2774
.merge(sf(2774), how="left", left_on='message_4055', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'message_4055'])
# message_2683 -> Struct2774 again; the suffixes are given so the two `text`
# columns become text_4055 and text_2683.  The key columns of this last
# merge are not dropped.
.merge(sf(2774), how="left", left_on='message_2683', right_on='struct_id',
suffixes=("_4055", "_2683"), validate="1:m")
)
#
# As expected from the above note
#
# Note that this IGNORES the path
# - .results > .[] > .relatedLocations > .[] > .physicalLocation > .text
#
# we have no text entries in that table:
#
# In [88]: d1[d1.text_2683 != '']
# Out[88]:
# Empty DataFrame
#
# Remove indexing columns. Note: each row corresponds to the fields of an
# original table.
# Reproduce ALL `file:line:col:line:col: message` entries as a table
#
qdf = m_f2683_f4963_f6299_f2685_f2774[
['id', 'message', 'physicalLocation',
'artifactLocation', 'region',
'endColumn', 'endLine', 'startColumn', 'startLine',
'index', 'uri', 'uriBaseId',
'text']]
qdf[qdf.uri == "static/js/fileuploader.js"]
qdf[qdf.text.str.contains("Unused variable size")]
# Select the file/position columns plus the text_4055 column, and expose the
# latter under the name `message`.
d2 = d1[[
    'uri',
    'startLine', 'startColumn',
    'endLine', 'endColumn',
    'text_4055',
]].rename(columns={'text_4055': 'message'})
#
# Write output
#
#
if args.dot_output:
signature._signature(args, sarif_struct, context)
struct_graph = [(typedef, sig) for sig, typedef in context.sig_to_typedef.items()]
signature.write_header(sys.stdout)
for typedef, sig in struct_graph:
signature.write_node(sys.stdout, typedef, sig)
for typedef, sig in struct_graph:
signature.write_edges(args, sys.stdout, typedef, sig)
signature.write_footer(sys.stdout)
elif args.typedef_signatures:
signature._signature(args, sarif_struct, context)
struct_graph = dict((typedef, sig) for sig,typedef in context.sig_to_typedef.items())
pprint(struct_graph, sys.stdout, indent=4)
if args.output_format == 'csv':
d2.to_csv(sys.stdout, index_label='index')
else:
pprint(signature._signature(args, sarif_struct, context), sys.stdout, indent=2)
# Report the problem on stderr (newline-terminated so the shell prompt does
# not end up on the same line) and exit with a non-zero status.
sys.stderr.write("unknown output format\n")
sys.exit(1)

BIN
notes/typegraph.pdf Normal file

Binary file not shown.