sarif-cli/bin/sarif-extract-tables
Michael Hohn 0f070a6ae4 sarif-extract-multi: extract combined tables from multiple sarif files
This command introduces a new tree structure that pulls in a collection
of sarif files.  In yaml format, an example is

    - creation_date: '2021-12-09'   # Repository creation date
      primary_language: javascript  # By lines of code
      project_name: treeio/treeio   # Repository name (short form)
      query_commit_id: fa9571646c   # Commit id for custom (non-library) queries
      sarif_content: {}             # The sarif content will be attached here
      sarif_file_name: 2021-12-09/results.sarif # Path to sarif file
      scan_start_date: '2021-12-09'             # Beginning date/time of scan
      scan_stop_date:  '2021-12-10'             # End date/time of scan
      tool_name: codeql
      tool_version: v1.27

    - creation_date: '2022-02-25'
      primary_language: javascript
      ...

At run time,

    cd ~/local/sarif-cli/data/treeio
    sarif-extract-multi multi-sarif-01.json test-multi-table

will load the specified sarif files and put them in place of
`sarif_content`, then build tables against the new signature found in
sarif_cli/signature_multi.py, and merge those into 6 larger tables.  The
exported tables are

    artifacts.csv  path-problem.csv  project.csv
    codeflows.csv  problem.csv       related-locations.csv

and they have join keys for further operations.
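For example, the per-result rows can be joined back to their project
metadata. A minimal sketch, assuming pandas and a shared join-key column
named `project_id` (the actual key names are defined by the signature in
sarif_cli/signature_multi.py):

    import pandas as pd

    # Hypothetical column name 'project_id'; check the exported headers
    # for the real join keys.
    project = pd.read_csv('test-multi-table/project.csv')
    problem = pd.read_csv('test-multi-table/problem.csv')
    merged = problem.merge(project, on='project_id', how='left')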

The new typegraph is rendered in

    notes/typegraph-multi.pdf

using the instructions in

    sarif_cli/signature_multi.py

#!/usr/bin/env python
""" Extract data from sarif files in table form.
These particular table joins create tables matching the content of
./sarif-results-summary
Return tables providing the `problem`, `path-problem` and `relatedLocations`
information.
The `problem` and `path-problem` entries provide that information; the
`relatedLocations` table provides the details when multiple results are present
for either.
"""
import argparse
import json
import pathlib
from sarif_cli import signature, signature_single
from sarif_cli import typegraph
import sys
from collections import defaultdict
import pandas as pd
#
# Start processing
#
parser = argparse.ArgumentParser(description='Read a sarif file and produce tabular output.')
parser.add_argument('file', metavar='sarif-file', type=str, help='input file, - for stdin')
parser.add_argument('outdir', metavar='output-dir', type=str, help='output directory')
parser.add_argument('-f', '--output-format', metavar='format', type=str, default="csv",
                    help='Output format for table. Currently just csv; '
                         'other formats supported by pandas can be added.')
args = parser.parse_args()
#
# Load data
#
with open(args.file, 'r') if args.file != '-' else sys.stdin as fp:
    sarif_struct = json.load(fp)
#
# Preprocess raw SARIF to get smaller signature
#
context = signature.Context(
    {
        "string" : "String",
        "int" : "Int",
        "bool" : "Bool"
    }
)
sarif_struct = signature.fillsig(args, sarif_struct, context)
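#
# Note: fillsig pads missing optional fields so the document matches the
# reference signature; the placeholder locations it injects (the
# 'scli-dyys dummy value' entries) are filtered back out of the final
# tables below.
#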
#
# Use reference type graph (signature) to traverse sarif and attach values to tables
#
tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)
#
# Form output tables
#
typegraph.attach_tables(tgraph)
"""
Reproduce the
file:line:col:line:col: message
output from
../../bin/sarif-results-summary results.sarif | grep size
as test/example. Sample output is
RESULT: static/js/fileuploader.js:1214:13:1214:17: Unused variable size.
RESULT: static/js/tinymce/jscripts/tiny_mce/plugins/media/js/media.js:438:30:438:34: Unused variable size.
The tree paths that match up .startLine with .text and .uri are
- .results > .[] > .message > .text
- .results > .[] > .locations > .[] > .physicalLocation > .region > .startLine
- .results > .[] > .locations > .[] > .physicalLocation > .artifactLocation > .uri
Note that this IGNORES the path
- .results > .[] > .relatedLocations > .[] > .physicalLocation > .text
We need appropriate table joins to replicate those tree paths; following the edges
in typegraph.pdf is the most direct way to find relevant tables and keys.
We only care about .message with matching .startLine, so left joins should
work without losing any data. Here are the tree paths and their corresponding
tables; the tree paths are from left to right and the joins can be done in the
same order.
Using ../notes/typegraph.pdf, we find these:
|------------+----------+---------+-------------------+-------------------+------------|
| .locations |          | .[]     | .physicalLocation | .artifactLocation | .uri       |
| sf(4055)   |          | af(350) | sf(2683)          | sf(4963)          | sf(2685)   |
|------------+----------+---------+-------------------+-------------------+------------|
| .locations |          | .[]     | .physicalLocation | .region           | .startLine |
| sf(4055)   |          | af(350) | sf(2683)          | sf(4963)          | sf(6299)   |
|------------+----------+---------+-------------------+-------------------+------------|
| .message   | .text    |         |                   |                   |            |
| sf(4055)   | sf(2774) |         |                   |                   |            |
|------------+----------+---------+-------------------+-------------------+------------|
"""
#
# Access convenience functions
#
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
af = lambda num: tgraph.dataframes['Array' + str(num)]
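#
# `sf(n)` and `af(n)` fetch the flattened tables that typegraph.attach_tables
# produced for signature node `n`: struct tables are keyed by `struct_id` and
# carry one column per field, while array tables carry (`array_id`,
# `value_index`, `type_at_index`, `id_or_value_at_index`) rows.  These columns
# are the join keys used in the merges below.
#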
#
# Form the message dataframe via joins
#
d1 = (
    sf(4055)
    .merge(af('0350'), how="left", left_on='locations', right_on='array_id', validate="1:m")
    .drop(columns=['locations', 'array_id', 'value_index', 'type_at_index'])
    .merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id',
           suffixes=("_4055", "_2683"), validate="1:m")
    .drop(columns=['struct_id_2683', 'id_or_value_at_index'])
    .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'physicalLocation'])
    .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'region'])
    .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'artifactLocation'])
    .merge(sf(2774), how="left", left_on='message_4055', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'message_4055'])
    .merge(sf(2774), how="left", left_on='message_2683', right_on='struct_id',
           suffixes=("_4055", "_2683"), validate="1:m")
)
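# d1 now holds one row per (result, location) pair, with the joined `uri`,
# `startLine`/`startColumn`, `endLine`/`endColumn`, and the message texts
# `text_4055` (result message) and `text_2683` (location message).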
#
# As expected from the above note
#
# Note that this IGNORES the path
# - .results > .[] > .relatedLocations > .[] > .physicalLocation > .text
#
# we have no text entries in that table:
#
# In [88]: d1[d1.text_2683 != '']
# Out[88]:
# Empty DataFrame
#
# Reproduce ALL `file:line:col:line:col: message` entries as a table
#
d2 = (d1[['struct_id_4055', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text_4055']]
      .rename({'text_4055': 'message'}, axis='columns'))
#
# Form the codeFlows dataframe
#
dco1 = (
    sf(9699)
    .merge(af(9799), how="left", left_on='codeFlows', right_on='array_id', validate="1:m")
    .drop(columns=['struct_id', 'codeFlows', 'array_id', 'type_at_index'])
    #
    .merge(sf(7122), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
    .drop(columns=['id_or_value_at_index', 'struct_id'])
    #
    .merge(af(1597), how="left", left_on='threadFlows', right_on='array_id',
           suffixes=("_codeFlow_9799", "_threadFlows_1597"), validate="1:m")
    .drop(columns=['threadFlows', 'array_id', 'type_at_index'])
    #
    .merge(sf(4194), how="left", left_on='id_or_value_at_index', right_on='struct_id',
           suffixes=("_9699", "_4194"), validate="1:m")
    .drop(columns=['id_or_value_at_index', 'struct_id'])
    #
    .merge(af(1075), how="left", left_on='locations_4194', right_on='array_id', validate="1:m")
    .drop(columns=['locations_4194', 'array_id', 'type_at_index'])
    .rename(columns={"value_index": "value_index_locations_1075"})
    #
    .merge(sf('0987'), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
    .drop(columns=['id_or_value_at_index', 'struct_id'])
    #
    .merge(sf(2683), how="left", left_on='location', right_on='struct_id',
           suffixes=("_9699", "_2683"), validate="1:m")
    .drop(columns=['location', 'struct_id'])
    #
    # The below is similar to dr1
    #
    .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'physicalLocation'])
    #
    .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'region'])
    #
    .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'artifactLocation'])
    #
    .merge(sf(2774), how="left", left_on='message_2683', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'message_2683'])
)
# Keep columns of interest
dco2 = (dco1[['uri',
              'startLine', 'startColumn', 'endLine', 'endColumn',
              'text',
              'ruleIndex', 'value_index_codeFlow_9799',
              'value_index_threadFlows_1597', 'value_index_locations_1075',
              ]]
        .rename({'text': 'message',
                 'value_index_codeFlow_9799': 'idx_codeFlow',
                 'value_index_threadFlows_1597': 'idx_threadFlows',
                 'value_index_locations_1075': 'idx_locations'}, axis='columns'))
# Remove dummy locations previously injected by signature.fillsig
dco3 = dco2[dco2.uri != 'scli-dyys dummy value']
#
# Form the relatedLocations dataframe via joins, starting from the union of
# relatedLocations from `kind problem` (sf(4055)) and `kind path-problem`
# (sf(9699)). This is only slightly different from d1: left_on='relatedLocations',
# and there is no left_on='message_4055' join.
#
dr1 = (
    pd.concat([sf(4055)[['relatedLocations', 'struct_id']], sf(9699)[['relatedLocations', 'struct_id']]])
    .merge(af('0350'), how="left", left_on='relatedLocations', right_on='array_id', validate="1:m")
    .drop(columns=['relatedLocations', 'array_id', 'value_index', 'type_at_index'])
    #
    .merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id',
           suffixes=("_4055_9699", "_2683"), validate="1:m")
    .drop(columns=['struct_id_2683', 'id_or_value_at_index'])
    #
    .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'physicalLocation'])
    #
    .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'region'])
    #
    .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'artifactLocation'])
    #
    .merge(sf(2774), how="left", left_on='message', right_on='struct_id', validate="1:m")
    .drop(columns=['struct_id', 'message'])
)
# Keep columns of interest
dr2 = (dr1[['struct_id_4055_9699', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text']]
       .rename({'text': 'message', 'struct_id_4055_9699': 'struct_id'}, axis='columns'))
# Remove dummy locations previously injected by signature.fillsig
dr3 = dr2[dr2.uri != 'scli-dyys dummy value']
#
# Write output
#
if args.output_format == 'csv':
    p = pathlib.Path(args.outdir)
    p.mkdir(exist_ok=True)
    with p.joinpath('problem.csv').open(mode='wb') as problem:
        d2.to_csv(problem, index_label='index')
    with p.joinpath('path-problem.csv').open(mode='wb') as path_problem:
        dco3.to_csv(path_problem, index_label='index')
    with p.joinpath('relatedLocations.csv').open(mode='wb') as relo:
        dr3.to_csv(relo, index_label='index')
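elif args.output_format == 'parquet':
    # Hypothetical extension, not part of the original tool: the argparse help
    # notes that other pandas-supported formats can be added.  to_parquet
    # requires an optional dependency (pyarrow or fastparquet).
    p = pathlib.Path(args.outdir)
    p.mkdir(exist_ok=True)
    d2.to_parquet(p.joinpath('problem.parquet'))
    dco3.to_parquet(p.joinpath('path-problem.parquet'))
    dr3.to_parquet(p.joinpath('relatedLocations.parquet'))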
else:
    sys.stderr.write("unknown output format\n")
    sys.exit(1)