diff --git a/bin/sarif-extract-scans b/bin/sarif-extract-scans
new file mode 100755
index 0000000..6da1f44
--- /dev/null
+++ b/bin/sarif-extract-scans
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+""" Extract scan data from multiple sarif files in table form.
+"""
+from dataclasses import dataclass
+from sarif_cli import signature, signature_single
+from sarif_cli import typegraph
+from sarif_cli import snowflake_id
+import argparse
+import dataclasses as dc
+import json
+import logging
+import pandas as pd
+import pathlib
+import sarif_cli.table_joins as tj
+import sarif_cli.derived_joins as derived
+import sys
+
+#
+# Configure logger
+#
+logging.basicConfig(format='%(asctime)s %(message)s')
+
+#
+# Start processing
+#
+parser = argparse.ArgumentParser(description='Read a collection of sarif files and produce tabular output.')
+parser.add_argument('file', metavar='scan-spec.json', type=str,
+                    help="json file containing required external scan information.")
+parser.add_argument('outdir', metavar='output-dir', type=str, help='output directory')
+args = parser.parse_args()
+
+# Load meta info
+def load(fname):
+    with open(fname, 'rb') if fname != '-' else sys.stdin as fp:
+        try:
+            content = json.load(fp)
+        except json.decoder.JSONDecodeError as err:
+            logging.error('Error reading from {}: {}: line {}, column {}'
+                          .format(fname, err.msg, err.lineno, err.colno))
+            sys.exit(1)
+    return content
+
+scan_spec = load(args.file)
+sarif_struct = load(scan_spec['sarif_file_name'])
+
+#
+# Preprocess raw SARIF to get smaller signature
+#
+context = signature.Context(
+    {
+        "string" : "String",
+        "int" : "Int",
+        "bool" : "Bool"
+    }
+)
+sarif_struct = signature.fillsig(args, sarif_struct, context)
+
+#
+# Use reference type graph (signature) to traverse sarif and attach values to tables
+#
+tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
+typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)
+#
+# Form output tables
+#
+typegraph.attach_tables(tgraph)
+
+#
+# Dataframe / table collection
+#
+@dataclass
+class BaseTables:
+    artifacts : pd.DataFrame
+    codeflows : pd.DataFrame
+    kind_pathproblem : pd.DataFrame
+    kind_problem : pd.DataFrame
+    project : pd.DataFrame
+    relatedLocations : pd.DataFrame
+    rules : pd.DataFrame
+    def __init__(self): pass
+bt = BaseTables()
+
+@dataclass
+class ScanTables:
+    # project: External table with project information
+    scans : pd.DataFrame
+    results : pd.DataFrame
+    def __init__(self): pass
+scantabs = ScanTables()
+
+#
+# Add dataframes for base tables
+#
+sf_2683 = tj.joins_for_sf_2683(tgraph)
+af_0350_location = tj.joins_for_af_0350_location(tgraph)
+bt.artifacts = tj.joins_for_artifacts(tgraph)
+bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683)
+bt.kind_pathproblem = tj.joins_for_path_problem(tgraph, af_0350_location)
+bt.kind_problem = tj.joins_for_problem(tgraph, af_0350_location)
+bt.project = tj.joins_for_project_single(tgraph)
+bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, sf_2683)
+bt.rules = tj.joins_for_rules(tgraph)
+
+#
+# Form derived query tables
+#
+# XX
+# scantabs.project = derived.joins_for_project(bt)
+# scantabs.scans = derived.joins_for_scans(bt)
+# scantabs.results = derived.joins_for_results(bt)
+
+
+#
+# Replace the remaining internal ids with snowflake ids
+#
+flakegen = snowflake_id.Snowflake(0)
+
+columns_to_reindex = {
+    # template from {field.name : [''] for field in dc.fields(bt)}
+    'artifacts': ['artifacts_id'],
+    'codeflows': ['codeflow_id'],
+    'kind_pathproblem': ['results_array_id', 'codeFlows_id'],
+    'kind_problem': ['results_array_id'],
+    'project': ['artifacts', 'results', 'rules'],
+    'relatedLocations': ['struct_id'],
+    'rules': ['rules_array_id']}
+
+_id_to_flake = {}
+def _get_flake(id):
+    flake = _id_to_flake.get(id, -1)
+    if flake == -1:
+        flake = flakegen.next()
+        _id_to_flake[id] = flake
+    return flake
+
+#
+# Cleaner, but makes far too many copies; keep the loop below
+#
+# def _reindex(table, colname):
+#     newtable = table.astype({ colname : 'uint64'}).reset_index(drop=True)
+#     for i in range(0, len(newtable)):
+#         newtable.loc[i, colname] = _get_flake(newtable.loc[i, colname])
+#     return newtable
+#
+# for field in dc.fields(bt):
+#     table_name = field.name
+#     for colname in columns_to_reindex[table_name]:
+#         setattr(bt, field.name, _reindex(getattr(bt, field.name), colname))
+#
+for field in dc.fields(bt):
+    table_name = field.name
+    table = getattr(bt, field.name)
+    # Turn all snowflake columns into uint64 and reset indexing to 0..len(table)
+    newtable = table.astype(
+        { colname : 'uint64'
+          for colname in columns_to_reindex[table_name]}
+    ).reset_index(drop=True)
+    # Swap ids for flakes
+    for colname in columns_to_reindex[table_name]:
+        for i in range(0, len(newtable)):
+            newtable.loc[i, colname] = _get_flake(newtable.loc[i, colname])
+    # Replace the table
+    setattr(bt, field.name, newtable)
+#
+# Write output
+#
+p = pathlib.Path(args.outdir)
+p.mkdir(exist_ok=True)
+def write(path, frame):
+    with p.joinpath(path + ".csv").open(mode='wb') as fh:
+        frame.to_csv(fh, index=False)
+for field in dc.fields(bt):
+    table = getattr(bt, field.name)
+    write(field.name, table)
diff --git a/data/treeio/scan-spec-0.json b/data/treeio/scan-spec-0.json
new file mode 100644
index 0000000..5672109
--- /dev/null
+++ b/data/treeio/scan-spec-0.json
@@ -0,0 +1,5 @@
+{
+    "project_id": 13243,
+    "scan_id": 123457,
+    "sarif_file_name": "2022-02-25/results.sarif"
+}
diff --git a/data/treeio/scan-spec-1.json b/data/treeio/scan-spec-1.json
new file mode 100644
index 0000000..8818266
--- /dev/null
+++ b/data/treeio/scan-spec-1.json
@@ -0,0 +1,5 @@
+{
+    "project_id": 13243,
+    "scan_id": 123456,
+    "sarif_file_name": "2021-12-09/results.sarif"
+}
diff --git a/notes/tables.org b/notes/tables.org
index 534976a..9cdff3c 100644
--- a/notes/tables.org
+++ b/notes/tables.org
@@ -181,7 +181,8 @@
 * Tables or entries to be removed
   The top of the [Mar-23-2022] =projects.csv= table, enumerated below, is ad-hoc
   and included in the other tables below; the information for its fields is not
-  yet collected to it can be discarded.
+  yet collected so it can be discarded.
+
   #+BEGIN_SRC text
   ==> project-meta.csv <==
   creation_date
@@ -196,6 +197,17 @@
   tool_version
   #+END_SRC
 
+  This information was used to expand the sarif tree (see Struct3452 and Array7481
+  in typegraph-multi-with-tables.pdf and the code).  In retrospect, that was a
+  poor choice.  All additional information needed can be represented by one or
+  more tables, so the sarif-extract* tools do this after commit 30e3dd3a3.
+
+  The minimal information required to drive the sarif-to-table conversion is
+  | project_id      | 13243                      |   |
+  | scan_id         | 123456                     |   |
+  | sarif_file_name | "2021-12-09/results.sarif" |   |
+
+
 * New tables to be exported
   This section enumerates new tables intended for reporting infrastructure.
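Editorial note, not part of the patch: the id-to-flake swap in bin/sarif-extract-scans above rewrites every row through per-cell .loc assignments. A minimal sketch of a vectorized alternative is shown below; reindex_with_flakes is a name invented here, and it assumes the script's memoizing _get_flake and its columns_to_reindex mapping are in scope.

#+BEGIN_SRC python
# Sketch only: vectorized variant of the per-row reindexing loop in
# bin/sarif-extract-scans.  Assumes _get_flake (memoizing id -> snowflake)
# and columns_to_reindex from the script above.
def reindex_with_flakes(table, snowflake_columns):
    # Cast all snowflake columns at once and renumber rows 0..len(table)-1,
    # mirroring the loop in the script.
    newtable = table.astype({c: 'uint64' for c in snowflake_columns}).reset_index(drop=True)
    for c in snowflake_columns:
        # Series.map applies _get_flake element-wise; repeated ids reuse
        # the same flake because _get_flake memoizes.
        newtable[c] = newtable[c].map(_get_flake)
    return newtable

# Usage mirroring the script's loop:
# for field in dc.fields(bt):
#     setattr(bt, field.name,
#             reindex_with_flakes(getattr(bt, field.name),
#                                 columns_to_reindex[field.name]))
#+END_SRC

Whether this is preferable depends on the copy behaviour noted in the script's own comment; the sketch keeps the single astype per table, so it should copy no more than the loop it replaces.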
diff --git a/sarif_cli/table_joins.py b/sarif_cli/table_joins.py
index a133c60..d15c4c2 100644
--- a/sarif_cli/table_joins.py
+++ b/sarif_cli/table_joins.py
@@ -305,7 +305,7 @@ def joins_for_relatedLocations(tgraph, sf_2683):
 
 def joins_for_project(tgraph):
     """
-    Return table providing the `project` information.
+    Return table providing the `project` information for sarif-extract-multi.
     """
     # Access convenience functions
     sf = lambda num: tgraph.dataframes['Struct' + str(num)]
@@ -368,6 +368,64 @@ def joins_for_project(tgraph):
     )
     return project_df_1
 
+def joins_for_project_single(tgraph):
+    """
+    Return table providing the `project` information for sarif-extract-scans
+    """
+    # Access convenience functions
+    sf = lambda num: tgraph.dataframes['Struct' + str(num)]
+    af = lambda num: tgraph.dataframes['Array' + str(num)]
+    #
+    project_df = (
+        sf(6787)
+        .rename(columns={"version": "version_6787", "struct_id": "struct_id_6787"})
+        #
+        .merge(af('0177'), how="left", left_on='runs', right_on='array_id',
+               validate="1:m")
+        .drop(columns=['runs', 'array_id', 'type_at_index'])
+        .rename(columns={"value_index": "value_index_0177"})
+        #
+        .merge(sf(3388), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
+        .drop(columns=['id_or_value_at_index', 'struct_id'])
+        #
+        # .merge(af(7069), how="left", left_on='newlineSequences', right_on='array_id',
+        #        validate="1:m")
+        # .drop(columns=['newlineSequences', 'array_id', 'type_at_index'])
+        .drop(columns=['newlineSequences'])
+        #
+        .merge(sf(9543), how="left", left_on='properties', right_on='struct_id', validate="1:m")
+        .drop(columns=['properties', 'struct_id'])
+        #
+        # tool - driver - rules - defaultConfiguration - ( properties - tags )
+        #
+        .merge(sf(8972), how="left", left_on='tool', right_on='struct_id', validate="1:m")
+        .drop(columns=['tool', 'struct_id'])
+        #
+        .merge(sf(7820), how="left", left_on='driver', right_on='struct_id', validate="1:m")
+        .drop(columns=['driver', 'struct_id'])
+        .rename(columns={"version": "driver_version_7820", "name": "driver_name_7820"})
+        #
+        .merge(af(5511), how="left", left_on='versionControlProvenance', right_on='array_id')
+        .drop(columns=['versionControlProvenance', 'array_id', 'type_at_index'])
+        .rename(columns={"value_index": "versionControl_value_index_5511"})
+        #
+        .merge(sf(3081), how="left", left_on='id_or_value_at_index', right_on='struct_id')
+        .drop(columns=['id_or_value_at_index', 'struct_id'])
+        #
+    )
+    # Keep columns of interest
+    project_df_1 = (
+        project_df
+        .drop(columns=['struct_id_6787', 'versionControl_value_index_5511'])
+        .rename({
+            'version_6787': 'sarif_version',
+            'value_index_0177': 'run_index',
+            'driver_name_7820': 'driver_name',
+            'driver_version_7820': 'driver_version',
+        }, axis='columns')
+    )
+    return project_df_1
+
 def joins_for_rules(tgraph):
     """
     Return table providing the `rules` information.
diff --git a/sarif_cli/typegraph.py b/sarif_cli/typegraph.py
index 06bf62d..348013b 100644
--- a/sarif_cli/typegraph.py
+++ b/sarif_cli/typegraph.py
@@ -8,6 +8,7 @@ This file also contains some type graph reference values; these may be moved
 out separate files at some point.
 """
 from dataclasses import dataclass
+import logging
 from typing import Any, Dict, List, Tuple, Union
 
 import pandas as pd
@@ -160,13 +161,19 @@ def _destructure_dict(typegraph: Typegraph, node, tree):
     elif set(tree_fields).issuperset(set(type_fields)):
         # Log a warning
         # log.warning("XX: Tree has unrecognized fields")
+        logging.warning('Input tree has unrecognized fields, collecting only '
+                        'known entries: {}'.format(tree))
+        logging.warning('tree fields: {} type fields: {}'
+                        .format(tree_fields, type_fields))
         _destructure_dict_1(typegraph, node, tree)
 
     elif set(tree_fields).issubset(set(type_fields)):
         raise MissingFieldException("XX: (Sub)tree is missing fields required by typedef")
 
     else:
-        raise Exception("typegraph: unhandled case reached. Internal error")
+        raise Exception("typegraph: unhandled case reached: cannot match type "
+                        "fields {} to tree fields {}. Data is invalid."
+                        .format(type_fields, tree_fields))
 
 
 def _destructure_list(typegraph, node: str, tree: List):