mirror of
https://github.com/hohn/sarif-cli.git
synced 2025-12-16 09:13:04 +01:00
Reproduce the
file:line:col:line:col: message
output from
../../bin/sarif-results-summary results.sarif | grep size
as test/example.
Original sample output is
RESULT: static/js/fileuploader.js:1214:13:1214:17: Unused variable size.
RESULT: static/js/tinymce/jscripts/tiny_mce/plugins/media/js/media.js:438:30:438:34: Unused variable size.
The table result here is
0:$ ../../bin/sarif-extract-tables results.sarif | grep size
0,static/js/fileuploader.js,1214,13,1214,17,Unused variable size.
34,static/js/tinymce/jscripts/tiny_mce/plugins/media/js/media.js,438,30,438,34,Unused variable size.
151 lines
5.3 KiB
Python
Executable File
151 lines
5.3 KiB
Python
Executable File
#!/usr/bin/env python
|
|
""" Extract data from sarif files in table form.
|
|
"""
|
|
import argparse
|
|
import json
|
|
from sarif_cli import signature
|
|
from sarif_cli import typegraph
|
|
import sys
|
|
from collections import defaultdict
|
|
import pandas as pd
|
|
|
|
#
# Start processing
#
# Command line: one positional input path ('-' selects stdin) and an optional
# output format that defaults to csv.
parser = argparse.ArgumentParser(
    description='Read a sarif file and produce tabular output.',
)
parser.add_argument(
    'file',
    metavar='sarif-file',
    type=str,
    help='input file, - for stdin',
)
parser.add_argument(
    '-f', '--output-format',
    metavar='format',
    type=str,
    default="csv",
    help=('Output format for table. Currently just csv; '
          ' other formats supported by pandas can be added.'),
)

args = parser.parse_args()
|
|
|
|
#
# Load data
#
# Files are decoded explicitly as UTF-8 (the encoding SARIF/JSON documents are
# interchanged in) rather than with the locale default.  Standard input is
# read directly and left open: wrapping it in `with` would close a stream
# that belongs to the process, not to this script.
if args.file == '-':
    sarif_struct = json.load(sys.stdin)
else:
    with open(args.file, 'r', encoding='utf-8') as fp:
        sarif_struct = json.load(fp)
|
|
|
|
#
# Preprocess raw SARIF to get smaller signature
#
# The dict pairs lowercase type names with capitalized ones; presumably the
# spelling of primitive types used in signatures -- confirm in
# sarif_cli.signature.
context = signature.Context({"string": "String", "int": "Int", "bool": "Bool"})

# The value returned by fillsig replaces the raw structure for all later steps.
sarif_struct = signature.fillsig(args, sarif_struct, context)
|
|
|
|
#
# Use reference type graph (signature) to traverse sarif and attach values to tables
#
# The graph and its start node are the 2022_02_01 definitions shipped in
# sarif_cli.typegraph.
tgraph = typegraph.Typegraph(typegraph.struct_graph_2022_02_01)
typegraph.destructure(
    tgraph,
    typegraph.start_node_2022_02_01,
    sarif_struct,
)

#
# Form output tables
#
# The joins further down read their inputs from tgraph.dataframes; presumably
# this call is what populates it -- confirm in sarif_cli.typegraph.
typegraph.attach_tables(tgraph)
|
|
|
|
"""
|
|
Reproduce the

    file:line:col:line:col: message

output from

    ../../bin/sarif-results-summary results.sarif | grep size

as test/example.  Sample output is

    RESULT: static/js/fileuploader.js:1214:13:1214:17: Unused variable size.
    RESULT: static/js/tinymce/jscripts/tiny_mce/plugins/media/js/media.js:438:30:438:34: Unused variable size.

The tree paths that match up .startLine with .text and .uri are
- .results > .[] > .message > .text
- .results > .[] > .locations > .[] > .physicalLocation > .region > .startLine
- .results > .[] > .locations > .[] > .physicalLocation > .artifactLocation > .uri

Note that this IGNORES the path
- .results > .[] > .relatedLocations > .[] > .physicalLocation > .text

We need appropriate table joins to replicate those tree paths; following the edges
in typegraph.pdf is the most direct way to find relevant tables and keys.

We only care about .message with matching .startLine, so left joins should
work without losing any data.  Here are the tree paths and their corresponding
tables; the tree paths are from left to right and the joins can be done in the
same order.

Using ../notes/typegraph.pdf, we find these:

|------------+----------+---------+-------------------+-------------------+------------|
| .locations |          | .[]     | .physicalLocation | .artifactLocation | .uri       |
| sf(4055)   |          | af(350) | sf(2683)          | sf(4963)          | sf(2685)   |
|------------+----------+---------+-------------------+-------------------+------------|
| .locations |          | .[]     | .physicalLocation | .region           | .startLine |
| sf(4055)   |          | af(350) | sf(2683)          | sf(4963)          | sf(6299)   |
|------------+----------+---------+-------------------+-------------------+------------|
| .message   | .text    |         |                   |                   |            |
| sf(4055)   | sf(2774) |         |                   |                   |            |
|------------+----------+---------+-------------------+-------------------+------------|
|
|
|
|
"""
|
|
#
# Access convenience functions
#
def sf(num):
    """Return the table stored in tgraph.dataframes under 'Struct' + str(num)."""
    return tgraph.dataframes['Struct' + str(num)]


def af(num):
    """Return the table stored in tgraph.dataframes under 'Array' + str(num).

    num is converted with str(), so pass a string when leading zeros matter,
    e.g. af('0350') looks up 'Array0350'.
    """
    return tgraph.dataframes['Array' + str(num)]
|
|
|
|
#
# Form the dataframe via joins
#
# Every step is a left join onto the next table along the tree path, followed
# by a drop of the key columns that were only needed for that join.
#
# NOTE(review): all merges pass validate="1:m", which asks pandas to check
# that the left-hand key is unique.  It looks like this could raise a
# MergeError once a result carries more than one location (message_4055 would
# then repeat on the left) -- confirm against multi-location SARIF input.
d1 = (
    sf(4055)
    # Struct4055.locations -> Array0350.array_id
    .merge(af('0350'), how="left",
           left_on='locations', right_on='array_id',
           validate="1:m")
    .drop(columns=['struct_id', 'locations', 'array_id',
                   'value_index', 'type_at_index'])
    # array element -> Struct2683; overlapping column names get a suffix
    # naming the table they came from (message_4055 / message_2683 below)
    .merge(sf(2683), how="left",
           left_on='id_or_value_at_index', right_on='struct_id',
           suffixes=("_4055", "_2683"),
           validate="1:m")
    .drop(columns=['struct_id', 'id_or_value_at_index'])
    # .physicalLocation -> Struct4963
    .merge(sf(4963), how="left",
           left_on='physicalLocation', right_on='struct_id',
           validate="1:m")
    .drop(columns=['struct_id', 'physicalLocation'])
    # .region -> Struct6299
    .merge(sf(6299), how="left",
           left_on='region', right_on='struct_id',
           validate="1:m")
    .drop(columns=['struct_id', 'region'])
    # .artifactLocation -> Struct2685
    .merge(sf(2685), how="left",
           left_on='artifactLocation', right_on='struct_id',
           validate="1:m")
    .drop(columns=['struct_id', 'artifactLocation'])
    # message referenced from Struct4055 -> Struct2774
    .merge(sf(2774), how="left",
           left_on='message_4055', right_on='struct_id',
           validate="1:m")
    .drop(columns=['struct_id', 'message_4055'])
    # message referenced from Struct2683 -> Struct2774; the overlapping text
    # column is split into text_4055 / text_2683.  No drop follows, so
    # struct_id and message_2683 remain in d1.
    .merge(sf(2774), how="left",
           left_on='message_2683', right_on='struct_id',
           suffixes=("_4055", "_2683"),
           validate="1:m")
)

#
# As expected from the above note
#
#     Note that this IGNORES the path
#     - .results > .[] > .relatedLocations > .[] > .physicalLocation > .text
#
# we have no text entries in that table:
#
#     In [88]: d1[d1.text_2683 != '']
#     Out[88]:
#     Empty DataFrame
|
|
|
|
#
# Reproduce ALL `file:line:col:line:col: message` entries as a table
#
# Keep only the location columns plus the result message text, and expose
# text_4055 under the name 'message'.
d2 = d1[
    ['uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text_4055']
].rename(columns={'text_4055': 'message'})
|
|
|
|
#
# Write output
#
if args.output_format != 'csv':
    # sys.exit() with a string prints it to stderr followed by a newline and
    # exits with status 1, so the message no longer runs into the shell
    # prompt; the rejected value is included to make the error actionable.
    sys.exit("unknown output format: " + args.output_format)

# csv is the only supported format; the dataframe index is written as the
# first column under the header 'index'.
d2.to_csv(sys.stdout, index_label='index')
|
|
|