sarif-cli/bin/sarif-extract-tables

#!/usr/bin/env python
""" Extract data from sarif files in table form.
"""
import argparse
import json
from sarif_cli import signature
from sarif_cli import typegraph
import sys
from collections import defaultdict
import pandas as pd

#
# Start processing
#
# Command line: one positional input path plus an optional output-format switch.
parser = argparse.ArgumentParser(
    description='Read a sarif file and produce tabular output.',
)
parser.add_argument(
    'file',
    metavar='sarif-file',
    type=str,
    help='input file, - for stdin',
)
parser.add_argument(
    '-f', '--output-format',
    metavar='format',
    type=str,
    default="csv",
    help=('Output format for table.  Currently just csv; '
          '  other formats supported by pandas can be added.'),
)

# Parsed from sys.argv; `args.file` and `args.output_format` are read further down.
args = parser.parse_args()

#
# Load data
#
# '-' selects standard input.  It is read directly rather than through a `with`
# block so that sys.stdin is not closed as a side effect of loading.
if args.file == '-':
    sarif_struct = json.load(sys.stdin)
else:
    # SARIF logs are JSON text encoded as UTF-8; decode explicitly instead of
    # relying on the platform's locale default encoding.
    with open(args.file, 'r', encoding='utf-8') as fp:
        sarif_struct = json.load(fp)

#
# Preprocess raw SARIF to get smaller signature
#
# The Context is constructed from a mapping of lower-case names to their
# capitalized forms (string/int/bool).
context = signature.Context(dict(string="String", int="Int", bool="Bool"))

# The value returned by fillsig replaces the raw structure loaded above.
sarif_struct = signature.fillsig(args, sarif_struct, context)

#
# Use reference type graph (signature) to traverse sarif and attach values to tables
#
# Reference graph and start node, both taken from the 2022_02_01 definitions
# in the typegraph module.
reference_graph = typegraph.struct_graph_2022_02_01
root_node = typegraph.start_node_2022_02_01

tgraph = typegraph.Typegraph(reference_graph)
typegraph.destructure(tgraph, root_node, sarif_struct)

#
# Form output tables
#
# NOTE(review): presumably this is what populates tgraph.dataframes, which the
# joins further down read -- confirm against typegraph.attach_tables.
typegraph.attach_tables(tgraph)

"""
Reproduce the

    file:line:col:line:col: message

output from

    ../../bin/sarif-results-summary results.sarif | grep size

as test/example.  Sample output is

    RESULT: static/js/fileuploader.js:1214:13:1214:17: Unused variable size.
    RESULT: static/js/tinymce/jscripts/tiny_mce/plugins/media/js/media.js:438:30:438:34: Unused variable size.

The tree paths that match up .startLine with .text and .uri are
- .results > .[] > .message > .text
- .results > .[] > .locations > .[] > .physicalLocation > .region > .startLine
- .results > .[] > .locations > .[] > .physicalLocation > .artifactLocation > .uri

Note that this IGNORES the path
- .results > .[] > .relatedLocations > .[] > .physicalLocation > .text

We need appropriate table joins to replicate those tree paths; following the edges
in typegraph.pdf is the most direct way to find relevant tables and keys.

We only care about .message with matching .startLine, so left joins should
work without losing any data.  Here are the tree paths and their corresponding
tables; the tree paths are from left to right and the joins can be done in the
same order.

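Using ../notes/typegraph.pdf, we find these (the table written af(350) here is
looked up as af('0350') in the code below, keeping the leading zero of its key):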

    |------------+----------+---------+-------------------+-------------------+------------|
    | .locations |          | .[]     | .physicalLocation | .artifactLocation | .uri       |
    | sf(4055)   |          | af(350) | sf(2683)          | sf(4963)          | sf(2685)   |
    |------------+----------+---------+-------------------+-------------------+------------|
    | .locations |          | .[]     | .physicalLocation | .region           | .startLine |
    | sf(4055)   |          | af(350) | sf(2683)          | sf(4963)          | sf(6299)   |
    |------------+----------+---------+-------------------+-------------------+------------|
    | .message   | .text    |         |                   |                   |            |
    | sf(4055)   | sf(2774) |         |                   |                   |            |
    |------------+----------+---------+-------------------+-------------------+------------|

"""
#
# Access convenience functions
#
def sf(num):
    """Return the dataframe stored in tgraph.dataframes under 'Struct<num>'."""
    return tgraph.dataframes['Struct' + str(num)]


def af(num):
    """Return the dataframe stored in tgraph.dataframes under 'Array<num>'.

    `num` is converted with str(), so pass a string (e.g. '0350') when the
    key contains leading zeros.
    """
    return tgraph.dataframes['Array' + str(num)]

#
# Form the dataframe via joins
#
def _follow_edge(frame, target, key, **merge_options):
    """Left-join `target` onto `frame`, then discard the consumed key columns.

    Rows are matched where frame[key] equals target['struct_id'].  After the
    join both 'struct_id' and `key` are dropped from the result.  Additional
    keyword arguments (e.g. `suffixes`) are forwarded to DataFrame.merge.
    """
    joined = frame.merge(target, how="left", left_on=key, right_on='struct_id',
                         validate="1:m", **merge_options)
    return joined.drop(columns=['struct_id', key])


# Suffixes applied where columns of the 4055 side and the 2683 side share a name.
message_suffixes = ("_4055", "_2683")

# .locations: join the array table on locations == array_id and keep only the
# per-entry reference column (id_or_value_at_index) from it.
d1 = sf(4055).merge(af('0350'), how="left", left_on='locations', right_on='array_id',
                    validate="1:m")
d1 = d1.drop(columns=['struct_id', 'locations', 'array_id', 'value_index', 'type_at_index'])

# Walk the remaining edges one struct table at a time, in the same order as
# the tree paths listed in the note above.
d1 = _follow_edge(d1, sf(2683), 'id_or_value_at_index', suffixes=message_suffixes)
d1 = _follow_edge(d1, sf(4963), 'physicalLocation')
d1 = _follow_edge(d1, sf(6299), 'region')
d1 = _follow_edge(d1, sf(2685), 'artifactLocation')
d1 = _follow_edge(d1, sf(2774), 'message_4055')

# Final join on message_2683; its key columns are left in place, and the two
# resulting text columns are told apart by the suffixes.
d1 = d1.merge(sf(2774), how="left", left_on='message_2683', right_on='struct_id',
              suffixes=message_suffixes, validate="1:m")
#
# As expected from the above note
#
#     Note that this IGNORES the path
#     - .results > .[] > .relatedLocations > .[] > .physicalLocation > .text
#
# we have no text entries in that table:
#
#     In [88]: d1[d1.text_2683 != '']
#     Out[88]:
#     Empty DataFrame

#
# Reproduce ALL `file:line:col:line:col: message` entries as a table
#
# Columns making up one `file:line:col:line:col: message` entry, in output order.
output_columns = ['uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text_4055']

# Select those columns and expose text_4055 under the name 'message'.
d2 = d1[output_columns].rename(columns={'text_4055': 'message'})

#
# Write output
#
# Only csv is implemented; any other format is rejected.
if args.output_format != 'csv':
    # sys.exit with a string prints it to stderr (newline-terminated, unlike a
    # bare stderr.write) and exits with status 1.
    sys.exit("unknown output format")

# Write the table to stdout, labelling the index column 'index'.
d2.to_csv(sys.stdout, index_label='index')