sarif-cli/bin/sarif-extract-tables

#!/usr/bin/env python
"""Extract data from sarif files in table form.

The table joins for `problem`, `path-problem` and `relatedLocations` create tables
matching the content of ./sarif-results-summary.

The `artifacts`, `codeflows`, `relatedLocations` and `rules` tables provide the
remaining information from the sarif file; see
../notes/typegraph-multi-with-tables.pdf for details.

The `problem` and `path-problem` tables provide the summary information
described above; the `relatedLocations` table provides the details when
multiple results are present for either.

"""
from dataclasses import dataclass
from sarif_cli import signature, signature_single
from sarif_cli import typegraph
from sarif_cli import snowflake_id
import argparse
import dataclasses as dc
import json
import pandas as pd
import pathlib
import sarif_cli.table_joins as tj
import sys

#
# Start processing
#
# Command-line interface: two positionals (input file, output directory) and
# one optional output-format switch.
parser = argparse.ArgumentParser(
    description='Read a sarif file and produce tabular output.',
)
parser.add_argument(
    'file',
    metavar='sarif-file',
    type=str,
    help='input file, - for stdin',
)
parser.add_argument(
    'outdir',
    metavar='output-dir',
    type=str,
    help='output directory',
)
parser.add_argument(
    '-f', '--output-format',
    metavar='format',
    type=str,
    default="csv",
    help=('Output format for table.  Currently just csv; '
          '  other formats supported by pandas can be added.'),
)

# Parsed once at module level; the resulting namespace is used by the rest of
# the script.
args = parser.parse_args()

#
# Load data
#
if args.file == '-':
    # Read from stdin directly rather than through `with`, so the
    # process-wide stream is not closed as a side effect of loading.
    sarif_struct = json.load(sys.stdin)
else:
    # SARIF logs are UTF-8 encoded JSON; pass the encoding explicitly instead
    # of relying on the platform's locale default.
    with open(args.file, 'r', encoding='utf-8') as fp:
        sarif_struct = json.load(fp)

#
# Preprocess raw SARIF to get smaller signature
#
# Primitive type names mapped to the capitalized names handed to
# signature.Context.
context = signature.Context(dict(string="String", int="Int", bool="Bool"))

# Rebind sarif_struct to the result of fillsig; the rest of the script works
# on this preprocessed structure.
sarif_struct = signature.fillsig(args, sarif_struct, context)

#
# Use reference type graph (signature) to traverse sarif and attach values to tables
#
# Build the type graph from the reference signature (the 2022_02_01 variant
# shipped in sarif_cli.signature_single).
tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
# Traverse sarif_struct starting at the signature's start node.  The return
# value is not used, so the results presumably end up inside tgraph itself --
# NOTE(review): confirm in sarif_cli.typegraph.
typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)

#
# Form output tables
#
# Runs after destructure() on the same tgraph; the join helpers further down
# all take this tgraph as their input.
typegraph.attach_tables(tgraph)

#
# Dataframe / table collection
#
@dataclass
class BaseTables:
    """Container for the output dataframes, one field per table.

    The explicit no-argument ``__init__`` below is kept by ``@dataclass``
    instead of a generated one, so a fresh instance has no field values set;
    each field is assigned after construction.
    """
    artifacts: pd.DataFrame
    codeflows: pd.DataFrame
    kind_pathproblem: pd.DataFrame
    kind_problem: pd.DataFrame
    relatedLocations: pd.DataFrame
    rules: pd.DataFrame

    def __init__(self):
        # Intentionally empty: fields are filled in by the caller.
        return None

# Start with an empty container; every field is assigned below.
bt = BaseTables()
#
# Add dataframes
#
# sf_2683 and af_0350_location are intermediate join results, computed once
# and passed to several of the table builders below.  NOTE(review): the names
# appear to refer to type graph nodes sf(2683) and af(350) -- confirm against
# sarif_cli.table_joins.
sf_2683 = tj.joins_for_sf_2683(tgraph)
af_0350_location = tj.joins_for_af_0350_location(tgraph)
bt.artifacts = tj.joins_for_artifacts(tgraph)
bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683)
bt.kind_pathproblem = tj.joins_for_path_problem(tgraph, af_0350_location)
bt.kind_problem = tj.joins_for_problem(tgraph, af_0350_location)
bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, sf_2683)
bt.rules = tj.joins_for_rules(tgraph)

#
# Replace the remaining internal ids with snowflake ids
#
# A single generator is shared by all tables.  NOTE(review): the argument 0 is
# presumably a worker/machine id -- confirm in sarif_cli.snowflake_id.
flakegen = snowflake_id.Snowflake(0)

# For each table (keyed by BaseTables field name), the id columns whose values
# are swapped for snowflake ids.
# template from {field.name : [''] for field in dc.fields(bt)}
columns_to_reindex = dict(
    artifacts=['artifacts_id'],
    codeflows=['codeflow_id'],
    kind_pathproblem=['results_array_id', 'codeFlows_id'],
    kind_problem=['results_array_id'],
    relatedLocations=['struct_id'],
    rules=['rules_array_id'],
)

# Cache of internal id -> snowflake id, shared across all tables so the same
# internal id always maps to the same flake.
_id_to_flake = {}
def _get_flake(id):
    """Return the snowflake id for `id`, allocating a new one on first use.

    The mapping is remembered in `_id_to_flake`; -1 serves as the "not yet
    assigned" sentinel for the lookup.
    """
    known = _id_to_flake.get(id, -1)
    if known != -1:
        return known
    fresh = flakegen.next()
    _id_to_flake[id] = fresh
    return fresh


# Replace the internal ids in every table's id columns with snowflake ids.
for field in dc.fields(bt):
    table_name = field.name
    table = getattr(bt, table_name)
    id_columns = columns_to_reindex[table_name]
    # Turn all snowflake columns into uint64 and reset indexing to 0..len(table)
    newtable = table.astype(
        {colname: 'uint64' for colname in id_columns}
    ).reset_index(drop=True)
    # Swap ids for flakes one whole column at a time.  Series.map visits the
    # values in row order, so flakes are allocated in the same order as a
    # row-by-row loop, without a pair of .loc scalar accesses per cell.  The
    # cast keeps the column uint64 whatever dtype map() infers.
    for colname in id_columns:
        newtable[colname] = newtable[colname].map(_get_flake).astype('uint64')
    # Replace the table
    setattr(bt, table_name, newtable)
#
# Write output
#
p = pathlib.Path(args.outdir)
# parents=True so that a nested output directory (e.g. out/run1) is created
# even when its parent directories do not exist yet.
p.mkdir(parents=True, exist_ok=True)
def write(path, frame):
    """Write `frame` as CSV to `<output-dir>/<path>.csv`, without the index.

    NOTE: args.output_format is not consulted here; output is always CSV.
    """
    with p.joinpath(path + ".csv").open(mode='wb') as fh:
        frame.to_csv(fh, index=False)
# One CSV file per BaseTables field, named after the field.
for field in dc.fields(bt):
    table = getattr(bt, field.name)
    write(field.name, table)


# TODO:
"""
Reproduce the

    file:line:col:line:col: message

output from

    ../../bin/sarif-results-summary results.sarif | grep size

as test/example.  Sample output is

    RESULT: static/js/fileuploader.js:1214:13:1214:17: Unused variable size.
    RESULT: static/js/tinymce/jscripts/tiny_mce/plugins/media/js/media.js:438:30:438:34: Unused variable size.

The tree paths that match up .startLine with .text and .uri are
- .results > .[] > .message > .text
- .results > .[] > .locations > .[] > .physicalLocation > .region > .startLine
- .results > .[] > .locations > .[] > .physicalLocation > .artifactLocation > .uri

Note that this IGNORES the path
- .results > .[] > .relatedLocations > .[] > .physicalLocation > .text

We need appropriate table joins to replicate those tree paths; following the edges
in typegraph.pdf is the most direct way to find relevant tables and keys.

We only care about .message with matching .startLine, so left joins should
work without losing any data.  Here are the tree paths and their corresponding
tables; the tree paths are from left to right and the joins can be done in the
same order.

Using ../notes/typegraph.pdf, we find these:

    |------------+----------+---------+-------------------+-------------------+------------|
    | .locations |          | .[]     | .physicalLocation | .artifactLocation | .uri       |
    | sf(4055)   |          | af(350) | sf(2683)          | sf(4963)          | sf(2685)   |
    |------------+----------+---------+-------------------+-------------------+------------|
    | .locations |          | .[]     | .physicalLocation | .region           | .startLine |
    | sf(4055)   |          | af(350) | sf(2683)          | sf(4963)          | sf(6299)   |
    |------------+----------+---------+-------------------+-------------------+------------|
    | .message   | .text    |         |                   |                   |            |
    | sf(4055)   | sf(2774) |         |                   |                   |            |
    |------------+----------+---------+-------------------+-------------------+------------|

"""