mirror of
https://github.com/hohn/sarif-cli.git
synced 2025-12-16 17:23:03 +01:00
enabled by -f flag with CLI value tested on sarif from CodeQL CLIs: 2.6.3, 2.9.4, 2.11.4 MUST contain versionControlProvenance property however
191 lines
6.3 KiB
Python
Executable File
191 lines
6.3 KiB
Python
Executable File
#!/usr/bin/env python
|
|
"""Extract data from sarif files in table form.
|
|
|
|
The table joins for `problem`, `path-problem` and `relatedLocations` create tables
|
|
matching the content of ./sarif-results-summary.
|
|
|
|
The `artifacts`, `codeflows`, `relatedLocations` and `rules` tables provide the
|
|
remaining information from the sarif file; see
|
|
../notes/typegraph-multi-with-tables.pdf for details.
|
|
|
|
The `problem` and `path-problem` entries provide that information; the
|
|
`relatedLocations` table provides the details when multiple results are present
|
|
for either.
|
|
|
|
"""
|
|
from dataclasses import dataclass
|
|
from sarif_cli import signature, signature_single
|
|
from sarif_cli import typegraph
|
|
from sarif_cli import snowflake_id
|
|
import argparse
|
|
import csv
|
|
import dataclasses as dc
|
|
import json
|
|
import pandas as pd
|
|
import pathlib
|
|
import sarif_cli.table_joins as tj
|
|
import sys
|
|
|
|
#
|
|
# Start processing
|
|
#
|
|
# Command-line interface: two positional arguments (input file, output
# directory) and one optional output-format argument.
parser = argparse.ArgumentParser(
    description='Read a sarif file and produce tabular output.',
)
parser.add_argument(
    'file',
    metavar='sarif-file',
    type=str,
    help='input file, - for stdin',
)
parser.add_argument(
    'outdir',
    metavar='output-dir',
    type=str,
    help='output directory',
)
parser.add_argument(
    '-f', '--output-format',
    metavar='format',
    type=str,
    default="csv",
    help=('Output format for table. Currently just csv; '
          ' other formats supported by pandas can be added.'),
)

# Parsed once at start-up; `args` is read by the later processing steps.
args = parser.parse_args()
|
|
|
|
#
|
|
# Load data
|
|
#
|
|
# Read the whole SARIF document into memory; a file name of '-' selects
# stdin.  A named file is opened explicitly as UTF-8 instead of relying on
# the platform's locale encoding, so non-ASCII content is decoded the same
# way on every platform.
with open(args.file, 'r', encoding='utf-8') if args.file != '-' else sys.stdin as fp:
    sarif_struct = json.load(fp)
|
|
|
|
#
|
|
# Preprocess raw SARIF to get smaller signature
|
|
#
|
|
# The context carries the mapping "string" -> "String", "int" -> "Int",
# "bool" -> "Bool"; fillsig() receives it together with the parsed
# arguments and the loaded sarif structure, and its result replaces
# sarif_struct.
context = signature.Context(
    dict(string="String", int="Int", bool="Bool")
)
sarif_struct = signature.fillsig(args, sarif_struct, context)

#
# Traverse the sarif data along the reference type graph (signature) and
# attach the values to tables
#
tgraph = typegraph.Typegraph(signature_single.struct_graph_LGTM)
typegraph.destructure(
    tgraph,
    signature_single.start_node_LGTM,
    sarif_struct,
)

#
# Form output tables
#
typegraph.attach_tables(tgraph)
|
|
|
|
#
|
|
# Dataframe / table collection
|
|
#
|
|
@dataclass
class BaseTables:
    """Collection of the output tables, one DataFrame per field.

    Every field defaults to None, so `BaseTables()` yields an empty
    collection whose tables are assigned afterwards.  Using the generated
    dataclass constructor (instead of a no-op `__init__`) means all
    attributes exist from the start, `repr()` works on a partially filled
    instance, and tables may also be passed as keyword arguments.

    The field names double as the output table names: later code iterates
    over `dataclasses.fields()` of the instance.
    """
    # String annotations keep the "DataFrame or None" type accurate without
    # requiring any additional import.
    artifacts: "pd.DataFrame | None" = None
    codeflows: "pd.DataFrame | None" = None
    kind_pathproblem: "pd.DataFrame | None" = None
    kind_problem: "pd.DataFrame | None" = None
    relatedLocations: "pd.DataFrame | None" = None
    rules: "pd.DataFrame | None" = None


# The single table collection filled in and written out by this script.
bt = BaseTables()
|
|
#
|
|
# Add dataframes
|
|
#
|
|
# Two intermediate join results that are each passed to two of the joins below.
# NOTE(review): the names presumably refer to type-graph node ids (compare
# the sf(2683) / af(350) entries in the TODO notes at the end of this
# file) -- confirm against sarif_cli.table_joins.
sf_2683 = tj.joins_for_location_info(tgraph)
af_0350_location = tj.joins_for_af_0350_location(tgraph)
# One join per BaseTables field; each result is stored on `bt` under the
# field's name.
bt.artifacts = tj.joins_for_artifacts(tgraph)
bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683)
bt.kind_pathproblem = tj.joins_for_path_problem(tgraph, af_0350_location)
bt.kind_problem = tj.joins_for_problem(tgraph, af_0350_location)
bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, sf_2683)
bt.rules = tj.joins_for_rules(tgraph)
|
|
|
|
#
|
|
# Replace the remaining internal ids with snowflake ids
|
|
#
|
|
# Generator from which all replacement ids are drawn.
flakegen = snowflake_id.Snowflake(0)

# Per table name: the columns whose internal ids are swapped for flakes.
# template from {field.name : [''] for field in dc.fields(bt)}
columns_to_reindex = dict(
    artifacts=['artifacts_id'],
    codeflows=['codeflow_id'],
    kind_pathproblem=['results_array_id', 'codeFlows_id'],
    kind_problem=['results_array_id'],
    relatedLocations=['struct_id'],
    rules=['rules_array_id'],
)

# Internal id -> flake already handed out for it.
_id_to_flake = {}


def _get_flake(id):
    """Return the flake recorded for `id`, drawing a new one on first use.

    The first call for a given `id` takes the next value from `flakegen`
    and records it in `_id_to_flake`; subsequent calls with the same `id`
    return the recorded value.  -1 is the "not recorded" marker.
    """
    known = _id_to_flake.get(id, -1)
    if known != -1:
        return known
    fresh = flakegen.next()
    _id_to_flake[id] = fresh
    return fresh
|
|
|
|
|
|
# Replace the internal ids in every table of `bt` with flakes.
for field in dc.fields(bt):
    table_name = field.name
    table = getattr(bt, table_name)
    id_columns = columns_to_reindex[table_name]
    # Turn all snowflake columns into uint64 and reset indexing to 0..len(table)
    newtable = table.astype(
        {colname: 'uint64' for colname in id_columns}
    ).reset_index(drop=True)
    # Swap ids for flakes one whole column at a time.  Series.map calls
    # _get_flake on the values in row order, so flakes are handed out in
    # the same order as a row-by-row loop, without the per-cell .loc
    # read/write.  The cast keeps the column dtype at uint64.
    for colname in id_columns:
        newtable[colname] = newtable[colname].map(_get_flake).astype('uint64')
    # Replace the table
    setattr(bt, table_name, newtable)
|
|
#
|
|
# Write output
|
|
#
|
|
# Output directory.  parents=True also creates missing intermediate
# directories, so a nested output path works on first use; exist_ok=True
# lets an already existing directory be reused.
p = pathlib.Path(args.outdir)
p.mkdir(parents=True, exist_ok=True)


def write(path, frame):
    """Write `frame` to the file `<path>.csv` inside the output directory `p`.

    Parameters:
        path:  file name without extension; ".csv" is appended.
        frame: the pandas DataFrame to write.

    The DataFrame index is not written (index=False) and the csv quoting
    mode is csv.QUOTE_NONNUMERIC.  An existing file of the same name is
    overwritten.

    NOTE: the output is always CSV; the parsed -f/--output-format value is
    not consulted here.
    """
    with p.joinpath(path + ".csv").open(mode='wb') as fh:
        frame.to_csv(fh, index=False, quoting=csv.QUOTE_NONNUMERIC)


# One output file per table, named after the BaseTables field.
for field in dc.fields(bt):
    write(field.name, getattr(bt, field.name))
|
|
|
|
|
|
# TODO:
|
|
"""
|
|
Reproduce the
|
|
|
|
file:line:col:line:col: message
|
|
|
|
output from
|
|
|
|
../../bin/sarif-results-summary results.sarif | grep size
|
|
|
|
as test/example. Sample output is
|
|
|
|
RESULT: static/js/fileuploader.js:1214:13:1214:17: Unused variable size.
|
|
RESULT: static/js/tinymce/jscripts/tiny_mce/plugins/media/js/media.js:438:30:438:34: Unused variable size.
|
|
|
|
The tree paths that match up .startLine with .text and .uri are
|
|
- .results > .[] > .message > .text
|
|
- .results > .[] > .locations > .[] > .physicalLocation > .region > .startLine
|
|
- .results > .[] > .locations > .[] > .physicalLocation > .artifactLocation > .uri
|
|
|
|
Note that this IGNORES the path
|
|
- .results > .[] > .relatedLocations > .[] > .physicalLocation > .text
|
|
|
|
We need appropriate table joins to replicate those tree paths; following the edges
|
|
in typegraph.pdf is the most direct way to find relevant tables and keys.
|
|
|
|
We only care about .message with matching .startLine, so left joins should
|
|
work without losing any data. Here are the tree paths and their corresponding
|
|
tables; the tree paths are from left to right and the joins can be done in the
|
|
same order.
|
|
|
|
Using ../notes/typegraph.pdf, we find these:
|
|
|
|
|------------+----------+---------+-------------------+-------------------+------------|
|
|
| .locations | | .[] | .physicalLocation | .artifactLocation | .uri |
|
|
| sf(4055) | | af(350) | sf(2683) | sf(4963) | sf(2685) |
|
|
|------------+----------+---------+-------------------+-------------------+------------|
|
|
| .locations | | .[] | .physicalLocation | .region | .startLine |
|
|
| sf(4055) | | af(350) | sf(2683) | sf(4963) | sf(6299) |
|
|
|------------+----------+---------+-------------------+-------------------+------------|
|
|
| .message | .text | | | | |
|
|
| sf(4055) | sf(2774) | | | | |
|
|
|------------+----------+---------+-------------------+-------------------+------------|
|
|
|
|
"""
|