#!/usr/bin/env python
"""Extract data from sarif files in table form.

The table joins for `problem`, `path-problem` and `relatedLocations` create
tables matching the content of ./sarif-results-summary.

The `artifacts`, `codeflows`, `relatedLocations` and `rules` tables provide the
remaining information from the sarif file; see
../notes/typegraph-multi-with-tables.pdf for details.

The `problem` and `path-problem` tables provide the per-result information; the
`relatedLocations` table provides the details when multiple locations are
present for either kind of result.
"""
from dataclasses import dataclass
from sarif_cli import signature, signature_single
from sarif_cli import typegraph
from sarif_cli import snowflake_id
import argparse
import csv
import dataclasses as dc
import json
import pandas as pd
import pathlib
import sarif_cli.table_joins as tj
import sys

#
# Start processing
#
parser = argparse.ArgumentParser(description='Read a sarif file and produce tabular output.')
parser.add_argument('file', metavar='sarif-file', type=str,
                    help='input file, - for stdin')
parser.add_argument('outdir', metavar='output-dir', type=str,
                    help='output directory')
parser.add_argument('-f', '--output-format', metavar='format', type=str, default="csv",
                    help='Output format for tables.  Currently just csv; '
                    'other formats supported by pandas can be added.')
args = parser.parse_args()

#
# Load data
#
with open(args.file, 'r') if args.file != '-' else sys.stdin as fp:
    sarif_struct = json.load(fp)

#
# Preprocess raw SARIF to get smaller signature
#
context = signature.Context(
    {
        "string" : "String",
        "int" : "Int",
        "bool" : "Bool"
    })
sarif_struct = signature.fillsig(args, sarif_struct, context)

#
# Use reference type graph (signature) to traverse sarif and attach values to tables
#
tgraph = typegraph.Typegraph(signature_single.struct_graph_LGTM)
typegraph.destructure(tgraph, signature_single.start_node_LGTM, sarif_struct)

#
# Form output tables
#
typegraph.attach_tables(tgraph)

#
# Dataframe / table collection
#
@dataclass
class BaseTables:
    artifacts : pd.DataFrame
    codeflows : pd.DataFrame
    kind_pathproblem : pd.DataFrame
    kind_problem : pd.DataFrame
    relatedLocations : pd.DataFrame
    rules : pd.DataFrame
    # The fields are filled in one by one below, so skip the generated __init__.
    def __init__(self):
        pass

bt = BaseTables()

#
# Add dataframes
#
sf_2683 = tj.joins_for_location_info(tgraph)
af_0350_location = tj.joins_for_af_0350_location(tgraph)

bt.artifacts = tj.joins_for_artifacts(tgraph)
bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683)
bt.kind_pathproblem = tj.joins_for_path_problem(tgraph, af_0350_location)
bt.kind_problem = tj.joins_for_problem(tgraph, af_0350_location)
bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, sf_2683)
bt.rules = tj.joins_for_rules(tgraph)

#
# Replace the remaining internal ids with snowflake ids
#
flakegen = snowflake_id.Snowflake(0)
columns_to_reindex = {
    # template from {field.name : [''] for field in dc.fields(bt)}
    'artifacts': ['artifacts_id'],
    'codeflows': ['codeflow_id'],
    'kind_pathproblem': ['results_array_id', 'codeFlows_id'],
    'kind_problem': ['results_array_id'],
    'relatedLocations': ['struct_id'],
    'rules': ['rules_array_id']}

_id_to_flake = {}

def _get_flake(id):
    """Return the snowflake id for `id`, allocating one on first use."""
    flake = _id_to_flake.get(id, -1)
    if flake == -1:
        flake = flakegen.next()
        _id_to_flake[id] = flake
    return flake
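# Note: the per-element loop below is the straightforward approach.  A
# vectorized equivalent would be (sketch, untested assumption; a trailing
# .astype('uint64') may be needed to preserve the column dtype):
#
#     newtable[colname] = newtable[colname].map(_get_flake)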
for field in dc.fields(bt):
    table_name = field.name
    table = getattr(bt, field.name)
    # Turn all snowflake columns into uint64 and reset indexing to 0..len(table)
    newtable = table.astype(
        {colname : 'uint64' for colname in columns_to_reindex[table_name]}
    ).reset_index(drop=True)
    # Swap ids for flakes
    for colname in columns_to_reindex[table_name]:
        for i in range(0, len(newtable)):
            newtable.loc[i, colname] = _get_flake(newtable.loc[i, colname])
    # Replace the table
    setattr(bt, field.name, newtable)

#
# Write output
#
p = pathlib.Path(args.outdir)
p.mkdir(exist_ok=True)

def write(path, frame):
    with p.joinpath(path + ".csv").open(mode='wb') as fh:
        frame.to_csv(fh, index=False, quoting=csv.QUOTE_NONNUMERIC)

for field in dc.fields(bt):
    table = getattr(bt, field.name)
    write(field.name, table)

# TODO:
"""
Reproduce the

    file:line:col:line:col: message

output from

    ../../bin/sarif-results-summary results.sarif | grep size

as test/example.  Sample output is

    RESULT: static/js/fileuploader.js:1214:13:1214:17: Unused variable size.
    RESULT: static/js/tinymce/jscripts/tiny_mce/plugins/media/js/media.js:438:30:438:34: Unused variable size.

The tree paths that match up .startLine with .text and .uri are

    - .results > .[] > .message > .text
    - .results > .[] > .locations > .[] > .physicalLocation > .region > .startLine
    - .results > .[] > .locations > .[] > .physicalLocation > .artifactLocation > .uri

Note that this IGNORES the path

    - .results > .[] > .relatedLocations > .[] > .physicalLocation > .text

We need appropriate table joins to replicate those tree paths; following the
edges in typegraph.pdf is the most direct way to find relevant tables and keys.

We only care about .message with matching .startLine, so left joins should work
without losing any data.

Here are the tree paths and their corresponding tables; the tree paths are from
left to right and the joins can be done in the same order.  Using
../notes/typegraph.pdf, we find these:

|------------+----------+---------+-------------------+-------------------+------------|
| .locations |          | .[]     | .physicalLocation | .artifactLocation | .uri       |
| sf(4055)   |          | af(350) | sf(2683)          | sf(4963)          | sf(2685)   |
|------------+----------+---------+-------------------+-------------------+------------|
| .locations |          | .[]     | .physicalLocation | .region           | .startLine |
| sf(4055)   |          | af(350) | sf(2683)          | sf(4963)          | sf(6299)   |
|------------+----------+---------+-------------------+-------------------+------------|
| .message   | .text    |         |                   |                   |            |
| sf(4055)   | sf(2774) |         |                   |                   |            |
|------------+----------+---------+-------------------+-------------------+------------|
"""
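#
# A minimal sketch of the joins described in the TODO above, expressed as
# pandas left merges.  The dataframe names (results, locations, physical,
# artifact, region, message) and the join-key column names are hypothetical;
# the real keys must be read off ../notes/typegraph.pdf.  This function is a
# sketch only and is never called by this script.
#
def _sketch_summary_joins(results, locations, physical, artifact, region, message):
    """Rebuild `file:line: message` rows via left joins.

    Each merge follows one edge of the tree paths listed above, left to
    right; left joins keep every .message row even when a location is
    missing.  Only .uri, .startLine and .text are joined here, matching the
    listed tree paths; the column fields are omitted for brevity.
    """
    t = (results
         .merge(locations, how='left', on='locations_id')  # .locations > .[]
         .merge(physical, how='left', on='location_id')    # .[] > .physicalLocation
         .merge(artifact, how='left', on='physical_id')    # ... > .artifactLocation
         .merge(region, how='left', on='physical_id')      # ... > .region
         .merge(message, how='left', on='results_id'))     # .message > .text
    return ('RESULT: ' + t['uri'] + ':'
            + t['startLine'].astype(str) + ': ' + t['text'])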