#!/usr/bin/env python
""" Extract data from sarif files in table form.

These particular table joins create tables matching the content of

    ./sarif-results-summary

Return tables providing the `problem`, `path-problem` and `relatedLocations`
information.

The `problem` and `path-problem` tables provide that information; the
`relatedLocations` table provides the details when multiple results are present
for either.

"""
import argparse
import json
import pathlib
import sys

import pandas as pd

from sarif_cli import signature, signature_single
from sarif_cli import typegraph

#
# Start processing
#
parser = argparse.ArgumentParser(description='Read a sarif file and produce tabular output.')
parser.add_argument('file', metavar='sarif-file', type=str, help='input file, - for stdin')
parser.add_argument('outdir', metavar='output-dir', type=str, help='output directory')
parser.add_argument('-f', '--output-format', metavar='format', type=str, default="csv",
                    help='Output format for tables.  Currently just csv; '
                    'other formats supported by pandas can be added.')

args = parser.parse_args()

#
# Load data
#
with open(args.file, 'r') if args.file != '-' else sys.stdin as fp:
    sarif_struct = json.load(fp)

#
# Preprocess raw SARIF to get smaller signature
#
context = signature.Context(
    {
        "string" : "String",
        "int" : "Int",
        "bool" : "Bool"
    }
)
sarif_struct = signature.fillsig(args, sarif_struct, context)

#
# Use reference type graph (signature) to traverse the sarif and attach values to tables
#
tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)

#
# Form output tables
#
typegraph.attach_tables(tgraph)

"""
Reproduce the

    file:line:col:line:col: message

output from

    ../../bin/sarif-results-summary results.sarif | grep size

as test/example.  Sample output is

    RESULT: static/js/fileuploader.js:1214:13:1214:17: Unused variable size.
    RESULT: static/js/tinymce/jscripts/tiny_mce/plugins/media/js/media.js:438:30:438:34: Unused variable size.

The tree paths that match up .startLine with .text and .uri are

- .results > .[] > .message > .text
- .results > .[] > .locations > .[] > .physicalLocation > .region > .startLine
- .results > .[] > .locations > .[] > .physicalLocation > .artifactLocation > .uri

Note that this IGNORES the path

- .results > .[] > .relatedLocations > .[] > .physicalLocation > .text

We need appropriate table joins to replicate those tree paths; following the edges
in typegraph.pdf is the most direct way to find the relevant tables and keys.

We only care about .message with a matching .startLine, so left joins should
work without losing any data.  Here are the tree paths and their corresponding
tables; the tree paths read from left to right and the joins can be done in the
same order.

Using ../notes/typegraph.pdf, we find these:

|------------+----------+---------+-------------------+-------------------+------------|
| .locations |          | .[]     | .physicalLocation | .artifactLocation | .uri       |
| sf(4055)   |          | af(350) | sf(2683)          | sf(4963)          | sf(2685)   |
|------------+----------+---------+-------------------+-------------------+------------|
| .locations |          | .[]     | .physicalLocation | .region           | .startLine |
| sf(4055)   |          | af(350) | sf(2683)          | sf(4963)          | sf(6299)   |
|------------+----------+---------+-------------------+-------------------+------------|
| .message   | .text    |         |                   |                   |            |
| sf(4055)   | sf(2774) |         |                   |                   |            |
|------------+----------+---------+-------------------+-------------------+------------|

"""
#
# Access convenience functions
#
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
af = lambda num: tgraph.dataframes['Array' + str(num)]
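# Note: the dataframe names keep leading zeros, so a table like 'Array0350'
# must be fetched with a string argument -- af('0350') -- while af(350) in
# the notes above refers to the same table.  E.g. sf(4055) is
# tgraph.dataframes['Struct4055'].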

#
# Form the message dataframe via joins
#
d1 = (
    sf(4055)
    .merge(af('0350'), how="left", left_on='locations', right_on='array_id', validate="1:m")
    .drop(columns=['locations', 'array_id', 'value_index', 'type_at_index'])
    .merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id',
           suffixes=("_4055", "_2683"), validate="1:m")
    .drop(columns=['struct_id_2683', 'id_or_value_at_index'])
    .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'physicalLocation'])
    .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'region'])
    .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'artifactLocation'])
    .merge(sf(2774), how="left", left_on='message_4055', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'message_4055'])
    .merge(sf(2774), how="left", left_on='message_2683', right_on='struct_id',
           suffixes=("_4055", "_2683"), validate="1:m")
)
#
# As expected from the above note
#
#     Note that this IGNORES the path
#     - .results > .[] > .relatedLocations > .[] > .physicalLocation > .text
#
# we have no text entries in that table:
#
#     In [88]: d1[d1.text_2683 != '']
#     Out[88]:
#     Empty DataFrame

#
# Reproduce ALL `file:line:col:line:col: message` entries as a table
#
d2 = (d1[['struct_id_4055', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text_4055']]
      .rename({'text_4055': 'message'}, axis='columns'))
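
# To render d2 back into the `file:line:col:line:col: message` lines quoted
# above (a sketch using only d2's columns):
#
#     for _, row in d2.iterrows():
#         print("RESULT: {}:{}:{}:{}:{}: {}".format(
#             row.uri, row.startLine, row.startColumn,
#             row.endLine, row.endColumn, row.message))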
#
# Form the codeFlows dataframe
#
dco1 = (
    sf(9699)
    .merge(af(9799), how="left", left_on='codeFlows', right_on='array_id', validate="1:m")
    .drop(columns=['struct_id', 'codeFlows', 'array_id', 'type_at_index'])
    #
    .merge(sf(7122), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
    .drop(columns=['id_or_value_at_index', 'struct_id'])
    #
    .merge(af(1597), how="left", left_on='threadFlows', right_on='array_id',
           suffixes=("_codeFlow_9799", "_threadFlows_1597"), validate="1:m")
    .drop(columns=['threadFlows', 'array_id', 'type_at_index'])
    #
    .merge(sf(4194), how="left", left_on='id_or_value_at_index', right_on='struct_id',
           suffixes=("_9699", "_4194"), validate="1:m")
    .drop(columns=['id_or_value_at_index', 'struct_id'])
    #
    .merge(af(1075), how="left", left_on='locations_4194', right_on='array_id', validate="1:m")
    .drop(columns=['locations_4194', 'array_id', 'type_at_index'])
    .rename(columns={"value_index": "value_index_locations_1075"})
    #
    .merge(sf('0987'), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
    .drop(columns=['id_or_value_at_index', 'struct_id'])
    #
    .merge(sf(2683), how="left", left_on='location', right_on='struct_id',
           suffixes=("_9699", "_2683"), validate="1:m")
    .drop(columns=['location', 'struct_id'])
    #
    # The joins below are similar to those in dr1
    #
    .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'physicalLocation'])
    #
    .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'region'])
    #
    .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'artifactLocation'])
    #
    .merge(sf(2774), how="left", left_on='message_2683', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'message_2683'])
)

# Keep columns of interest
dco2 = (dco1[['uri',
              'startLine', 'startColumn', 'endLine', 'endColumn',
              'text',
              'ruleIndex', 'value_index_codeFlow_9799',
              'value_index_threadFlows_1597', 'value_index_locations_1075',
              ]]
        .rename({'text': 'message',
                 'value_index_codeFlow_9799': 'idx_codeFlow',
                 'value_index_threadFlows_1597': 'idx_threadFlows',
                 'value_index_locations_1075': 'idx_locations'}, axis='columns'))

# Remove dummy locations previously injected by signature.fillsig
dco3 = dco2[dco2.uri != 'scli-dyys dummy value']

#
# Form the relatedLocations dataframe via joins, starting from the union of
# the relatedLocations from `kind problem` (sf(4055)) and `kind path-problem`
# (sf(9699)).  This is only slightly different from d1: left_on='relatedLocations',
# and there is no left_on='message_4055' join.
#
dr1 = (
    pd.concat([sf(4055)[['relatedLocations', 'struct_id']], sf(9699)[['relatedLocations', 'struct_id']]])
    .merge(af('0350'), how="left", left_on='relatedLocations', right_on='array_id', validate="1:m")
    .drop(columns=['relatedLocations', 'array_id', 'value_index', 'type_at_index'])
    #
    .merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id',
           suffixes=("_4055_9699", "_2683"), validate="1:m")
    .drop(columns=['struct_id_2683', 'id_or_value_at_index'])
    #
    .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'physicalLocation'])
    #
    .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'region'])
    #
    .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'artifactLocation'])
    #
    .merge(sf(2774), how="left", left_on='message', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'message'])
)

# Keep columns of interest
dr2 = (dr1[['struct_id_4055_9699', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text']]
       .rename({'text': 'message', 'struct_id_4055_9699': 'struct_id'}, axis='columns'))

# Remove dummy locations previously injected by signature.fillsig
dr3 = dr2[dr2.uri != 'scli-dyys dummy value']

#
# Write output
#
if args.output_format == 'csv':
    p = pathlib.Path(args.outdir)
    p.mkdir(exist_ok=True)
    with p.joinpath('problem.csv').open(mode='wb') as problem:
        d2.to_csv(problem, index_label='index')
    with p.joinpath('path-problem.csv').open(mode='wb') as path_problem:
        dco3.to_csv(path_problem, index_label='index')
    with p.joinpath('relatedLocations.csv').open(mode='wb') as relo:
        dr3.to_csv(relo, index_label='index')

else:
    sys.stderr.write("unknown output format\n")
    sys.exit(1)