Files
sarif-cli/bin/sarif-extract-tables

190 lines
6.3 KiB
Python
Executable File

#!/usr/bin/env python
"""Extract data from sarif files in table form.
The table joins for `problem`, `path-problem` and `relatedLocations` create tables
matching the content of ./sarif-results-summary.
The `artifacts`, `codeflows`, `relatedLocations` and `rules` tables provide the
remaining information from the sarif file; see
../notes/typegraph-multi-with-tables.pdf for details.
The `problem` and `path-problem` entries provide that information; the
`relatedLocations` table provides the details when multiple results are present
for either.
"""
from dataclasses import dataclass
from sarif_cli import signature, signature_single
from sarif_cli import typegraph
from sarif_cli import snowflake_id
import argparse
import dataclasses as dc
import json
import pandas as pd
import pathlib
import sarif_cli.table_joins as tj
import sys
#
# Start processing
#
# Command line: an input SARIF file ('-' selects stdin), an output directory,
# and an optional output format that defaults to "csv".
parser = argparse.ArgumentParser(
    description='Read a sarif file and produce tabular output.',
)
parser.add_argument(
    'file',
    metavar='sarif-file',
    type=str,
    help='input file, - for stdin',
)
parser.add_argument(
    'outdir',
    metavar='output-dir',
    type=str,
    help='output directory',
)
parser.add_argument(
    '-f', '--output-format',
    metavar='format',
    type=str,
    default="csv",
    help=('Output format for table. Currently just csv; '
          ' other formats supported by pandas can be added.'),
)
# Parsed once at start-up; the rest of the script reads args.file and
# args.outdir, and passes args on to signature.fillsig.
args = parser.parse_args()
#
# Load data
#
# Read the whole SARIF document into memory as plain Python data.
# A named file is decoded explicitly as UTF-8: SARIF logs are JSON and are
# required to be UTF-8, whereas open() without an encoding falls back to the
# platform's locale encoding and can mis-decode non-ASCII text.
# '-' selects stdin, which is used as-is (and closed when the block exits).
with (open(args.file, 'r', encoding='utf-8')
      if args.file != '-' else sys.stdin) as fp:
    sarif_struct = json.load(fp)
#
# Preprocess raw SARIF to get smaller signature
#
# Signature context, built from a mapping of the lower-case names
# "string"/"int"/"bool" to "String"/"Int"/"Bool".
# NOTE(review): presumably these are the display names used for primitive
# JSON types in the signature -- confirm in sarif_cli.signature.
context = signature.Context({"string": "String", "int": "Int", "bool": "Bool"})

# fillsig receives the parsed arguments, the raw structure and the context;
# its result replaces the raw structure for all later steps.
sarif_struct = signature.fillsig(args, sarif_struct, context)
#
# Use reference type graph (signature) to traverse sarif and attach values to tables
#
# Build the type graph from the 2022-02-01 reference signature, then walk the
# SARIF structure from that signature's start node.
tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
typegraph.destructure(
    tgraph,
    signature_single.start_node_2022_02_01,
    sarif_struct,
)
#
# Form output tables
#
# Must follow destructure(); the table joins further down take tgraph as input.
typegraph.attach_tables(tgraph)
#
# Dataframe / table collection
#
@dataclass(init=False)
class BaseTables:
    """Named collection of the output tables, one DataFrame per field.

    No field-taking constructor is used: an instance starts with none of the
    fields set, and each table is assigned as an attribute once it has been
    built.  The field declarations exist so that `dataclasses.fields` can
    enumerate the table names.
    """
    artifacts: pd.DataFrame
    codeflows: pd.DataFrame
    kind_pathproblem: pd.DataFrame
    kind_problem: pd.DataFrame
    relatedLocations: pd.DataFrame
    rules: pd.DataFrame


# The single, initially empty, table collection used by the rest of the script.
bt = BaseTables()
#
# Add dataframes
#
# Two intermediate joins are computed first because several of the tables
# below take them as input.
# NOTE(review): the names presumably refer to type-graph nodes sf(2683) and
# af(350) -- confirm against sarif_cli.table_joins.
sf_2683 = tj.joins_for_sf_2683(tgraph)
af_0350_location = tj.joins_for_af_0350_location(tgraph)

# One join per BaseTables field, evaluated in field order.
bt.artifacts        = tj.joins_for_artifacts(tgraph)
bt.codeflows        = tj.joins_for_codeflows(tgraph, sf_2683)
bt.kind_pathproblem = tj.joins_for_path_problem(tgraph, af_0350_location)
bt.kind_problem     = tj.joins_for_problem(tgraph, af_0350_location)
bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, sf_2683)
bt.rules            = tj.joins_for_rules(tgraph)
#
# Replace the remaining internal ids with snowflake ids
#
# Source of the replacement ids; _get_flake draws a new one with .next().
# NOTE(review): the meaning of the 0 argument is not visible here --
# presumably a generator/worker id; confirm in sarif_cli.snowflake_id.
flakegen = snowflake_id.Snowflake(0)
# For every BaseTables field: the columns holding internal ids that are to be
# replaced by snowflake ids.  Every field name must have an entry, because the
# reindexing loop looks tables up here by field name.
# template from {field.name : [''] for field in dc.fields(bt)}
columns_to_reindex = {
    'artifacts': ['artifacts_id'],
    'codeflows': ['codeflow_id'],
    'kind_pathproblem': [
        'results_array_id',
        'codeFlows_id',
    ],
    'kind_problem': ['results_array_id'],
    'relatedLocations': ['struct_id'],
    'rules': ['rules_array_id'],
}
# Memo of internal id -> snowflake id, shared by all tables so that the same
# internal id receives the same snowflake id wherever it appears.
_id_to_flake = {}


def _get_flake(id):
    """Return the snowflake id for internal id `id`, creating it on first use.

    -1 serves as the "not seen yet" marker for the memo lookup.
    """
    known = _id_to_flake.get(id, -1)
    if known != -1:
        return known
    fresh = flakegen.next()
    _id_to_flake[id] = fresh
    return fresh
# Replace the internal ids in every table with snowflake ids.
for field in dc.fields(bt):
    table_name = field.name
    table = getattr(bt, table_name)
    id_columns = columns_to_reindex[table_name]
    # Turn all snowflake columns into uint64 and reset indexing to 0..len(table)
    newtable = table.astype(
        {colname: 'uint64' for colname in id_columns}
    ).reset_index(drop=True)
    # Swap ids for flakes, one whole column at a time.  Series.map visits the
    # values in row order, so flakes are handed out in the same order as a
    # row-by-row loop would, without a .loc read and write for every cell.
    # The final astype keeps the column uint64 whatever dtype map() inferred
    # (including for an empty table).
    for colname in id_columns:
        newtable[colname] = newtable[colname].map(_get_flake).astype('uint64')
    # Replace the table
    setattr(bt, table_name, newtable)
#
# Write output
#
# Output directory.  parents=True also creates missing intermediate
# directories, so a nested path such as out/run1/tables works on first use;
# exist_ok=True lets an existing directory be reused.
p = pathlib.Path(args.outdir)
p.mkdir(parents=True, exist_ok=True)


def write(path, frame):
    """Write `frame` to `<path>.csv` inside the output directory.

    The DataFrame index is not written.
    NOTE(review): args.output_format is not consulted; csv is always written.
    """
    with p.joinpath(path + ".csv").open(mode='wb') as fh:
        frame.to_csv(fh, index=False)


# One csv file per BaseTables field, named after the field.
for field in dc.fields(bt):
    table = getattr(bt, field.name)
    write(field.name, table)
# TODO:
"""
Reproduce the
file:line:col:line:col: message
output from
../../bin/sarif-results-summary results.sarif | grep size
as test/example. Sample output is
RESULT: static/js/fileuploader.js:1214:13:1214:17: Unused variable size.
RESULT: static/js/tinymce/jscripts/tiny_mce/plugins/media/js/media.js:438:30:438:34: Unused variable size.
The tree paths that match up .startLine with .text and .uri are
- .results > .[] > .message > .text
- .results > .[] > .locations > .[] > .physicalLocation > .region > .startLine
- .results > .[] > .locations > .[] > .physicalLocation > .artifactLocation > .uri
Note that this IGNORES the path
- .results > .[] > .relatedLocations > .[] > .physicalLocation > .text
We need appropriate table joins to replicate those tree paths; following the edges
in typegraph.pdf is the most direct way to find relevant tables and keys.
We only care about .message with matching .startLine, so left joins should
work without losing any data. Here are the tree paths and their corresponding
tables; the tree paths are from left to right and the joins can be done in the
same order.
Using ../notes/typegraph.pdf, we find these:
|------------+----------+---------+-------------------+-------------------+------------|
| .locations | | .[] | .physicalLocation | .artifactLocation | .uri |
| sf(4055) | | af(350) | sf(2683) | sf(4963) | sf(2685) |
|------------+----------+---------+-------------------+-------------------+------------|
| .locations | | .[] | .physicalLocation | .region | .startLine |
| sf(4055) | | af(350) | sf(2683) | sf(4963) | sf(6299) |
|------------+----------+---------+-------------------+-------------------+------------|
| .message | .text | | | | |
| sf(4055) | sf(2774) | | | | |
|------------+----------+---------+-------------------+-------------------+------------|
"""