WIP: sarif-extract-scans: back to single sarif file handling, incorporate multi-file libraries

This commit is contained in:
Michael Hohn
2022-05-10 19:01:38 -07:00
committed by =Michael Hohn
parent 675a5a4008
commit b212423907
6 changed files with 264 additions and 3 deletions

bin/sarif-extract-scans Executable file

@@ -0,0 +1,174 @@
#!/usr/bin/env python
""" Extract scan data from multiple sarif files in table form.
"""
from dataclasses import dataclass
from sarif_cli import signature, signature_single
from sarif_cli import typegraph
from sarif_cli import snowflake_id
import argparse
import dataclasses as dc
import json
import logging
import pandas as pd
import pathlib
import sarif_cli.table_joins as tj
import sarif_cli.derived_joins as derived
import sys
#
# Configure logger
#
logging.basicConfig(format='%(asctime)s %(message)s')
#
# Start processing
#
parser = argparse.ArgumentParser(description='Read a scan spec and its sarif file and produce tabular output.')
parser.add_argument('file', metavar='scan-spec.json', type=str,
help="json file containing required external scan information.")
parser.add_argument('outdir', metavar='output-dir', type=str, help='output directory')
args = parser.parse_args()
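# Example invocation (file and directory names are illustrative):
#   sarif-extract-scans scan-spec.json scan-tables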
# Load meta info
def load(fname):
    with open(fname, 'rb') if fname != '-' else sys.stdin as fp:
        try:
            content = json.load(fp)
        except json.decoder.JSONDecodeError as err:
            logging.error('Error reading from {}: {}: line {}, column {}'
                          .format(fname, err.msg, err.lineno, err.colno))
            sys.exit(1)
    return content
scan_spec = load(args.file)
sarif_struct = load(scan_spec['sarif_file_name'])
#
# Preprocess raw SARIF to get smaller signature
#
context = signature.Context(
{
"string" : "String",
"int" : "Int",
"bool" : "Bool"
}
)
sarif_struct = signature.fillsig(args, sarif_struct, context)
#
# Use reference type graph (signature) to traverse sarif and attach values to tables
#
tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)
#
# Form output tables
#
typegraph.attach_tables(tgraph)
#
# Dataframe / table collection
#
@dataclass
class BaseTables:
artifacts : pd.DataFrame
codeflows : pd.DataFrame
kind_pathproblem : pd.DataFrame
kind_problem : pd.DataFrame
project : pd.DataFrame
relatedLocations : pd.DataFrame
rules : pd.DataFrame
def __init__(self): pass
bt = BaseTables()
@dataclass
class ScanTables:
# project: External table with project information
scans : pd.DataFrame
results : pd.DataFrame
def __init__(self): pass
scantabs = ScanTables()
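# scantabs.scans and scantabs.results are meant to be filled by the derived
# joins below, which are still disabled in this WIP commit.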
#
# Add dataframes for base tables
#
sf_2683 = tj.joins_for_sf_2683(tgraph)
af_0350_location = tj.joins_for_af_0350_location(tgraph)
bt.artifacts = tj.joins_for_artifacts(tgraph)
bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683)
bt.kind_pathproblem = tj.joins_for_path_problem(tgraph, af_0350_location)
bt.kind_problem = tj.joins_for_problem(tgraph, af_0350_location)
bt.project = tj.joins_for_project_single(tgraph)
bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, sf_2683)
bt.rules = tj.joins_for_rules(tgraph)
#
# Form derived query tables
#
# XX
# scantabs.project = derived.joins_for_project(bt)
# scantabs.scans = derived.joins_for_scans(bt)
# scantabs.results = derived.joins_for_results(bt)
#
# Replace the remaining internal ids with snowflake ids
#
flakegen = snowflake_id.Snowflake(0)
columns_to_reindex = {
# template from {field.name : [''] for field in dc.fields(bt)}
'artifacts': ['artifacts_id'],
'codeflows': ['codeflow_id'],
'kind_pathproblem': ['results_array_id', 'codeFlows_id'],
'kind_problem': ['results_array_id'],
'project': ['artifacts', 'results', 'rules'],
'relatedLocations': ['struct_id'],
'rules': ['rules_array_id']}
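# Memoize the internal-id -> snowflake mapping so that every occurrence of an
# internal id, in any table, is replaced by the same flake.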
_id_to_flake = {}
def _get_flake(id):
flake = _id_to_flake.get(id, -1)
if flake == -1:
flake = flakegen.next()
_id_to_flake[id] = flake
return flake
#
# Cleaner, but makes far too many copies; keep the loop below
#
# def _reindex(table, colname):
# newtable = table.astype({ colname : 'uint64'}).reset_index(drop=True)
# for i in range(0, len(newtable)):
# newtable.loc[i, colname] = _get_flake(newtable.loc[i, colname])
# return newtable
#
# for field in dc.fields(bt):
# table_name = field.name
# for colname in columns_to_reindex[table_name]:
# setattr(bt, field.name, _reindex(getattr(bt, field.name), colname))
#
for field in dc.fields(bt):
table_name = field.name
table = getattr(bt, field.name)
# Turn all snowflake columns into uint64 and reset indexing to 0..len(table)
newtable = table.astype(
{ colname : 'uint64'
for colname in columns_to_reindex[table_name]}
).reset_index(drop=True)
# Swap ids for flakes
for colname in columns_to_reindex[table_name]:
for i in range(0, len(newtable)):
newtable.loc[i, colname] = _get_flake(newtable.loc[i, colname])
# Replace the table
setattr(bt, field.name, newtable)
#
# Write output
#
p = pathlib.Path(args.outdir)
p.mkdir(exist_ok=True)
def write(path, frame):
with p.joinpath(path + ".csv").open(mode='wb') as fh:
frame.to_csv(fh, index=False)
for field in dc.fields(bt):
table = getattr(bt, field.name)
write(field.name, table)


@@ -0,0 +1,5 @@
{
"project_id": 13243,
"scan_id": 123457,
"sarif_file_name": "2022-02-25/results.sarif"
}


@@ -0,0 +1,5 @@
{
"project_id": 13243,
"scan_id": 123456,
"sarif_file_name": "2021-12-09/results.sarif"
}


@@ -181,7 +181,8 @@
* Tables or entries to be removed
The top of the [Mar-23-2022] =projects.csv= table, enumerated below, is ad-hoc
and included in the other tables below; the information for its fields is not
yet collected so it can be discarded.
#+BEGIN_SRC text
==> project-meta.csv <==
creation_date
@@ -196,6 +197,17 @@
tool_version
#+END_SRC
This information was used to expand the sarif tree (see Struct3452 and Array7481
in typegraph-multi-with-tables.pdf and the code). In retrospect, that was a
poor choice. All additional information needed can be represented by one or
more tables, so the sarif-extract* tools do exactly that after commit 30e3dd3a3.
The minimal information required to drive the sarif-to-table conversion is
| project_id | 13243 | |
| scan_id | 123456 | |
| sarif_file_name | "2021-12-09/results.sarif" | |
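As a concrete example (file and directory names here are illustrative), the spec
is saved as a small json file and passed to sarif-extract-scans together with an
output directory for the generated csv tables:
#+BEGIN_SRC text
sarif-extract-scans scan-spec.json scan-tables
#+END_SRC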
* New tables to be exported
This section enumerates new tables intended for reporting infrastructure.


@@ -305,7 +305,7 @@ def joins_for_relatedLocations(tgraph, sf_2683):
def joins_for_project(tgraph):
    """
    Return table providing the `project` information for sarif-extract-multi.
    """
    # Access convenience functions
    sf = lambda num: tgraph.dataframes['Struct' + str(num)]
@@ -368,6 +368,64 @@ def joins_for_project(tgraph):
    )
    return project_df_1
def joins_for_project_single(tgraph):
"""
Return table providing the `project` information for sarif-extract-scans
"""
# Access convenience functions
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
af = lambda num: tgraph.dataframes['Array' + str(num)]
#
project_df = (
sf(6787)
.rename(columns={"version": "version_6787", "struct_id": "struct_id_6787"})
#
.merge(af('0177'), how="left", left_on='runs', right_on='array_id',
validate="1:m")
.drop(columns=['runs', 'array_id', 'type_at_index'])
.rename(columns={"value_index": "value_index_0177"})
#
.merge(sf(3388), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
.drop(columns=['id_or_value_at_index', 'struct_id'])
#
# .merge(af(7069), how="left", left_on='newlineSequences', right_on='array_id',
# validate="1:m")
# .drop(columns=['newlineSequences', 'array_id', 'type_at_index'])
.drop(columns=['newlineSequences'])
#
.merge(sf(9543), how="left", left_on='properties', right_on='struct_id', validate="1:m")
.drop(columns=['properties', 'struct_id'])
#
# tool - driver - rules - defaultConfiguration - ( properties - tags )
#
.merge(sf(8972), how="left", left_on='tool', right_on='struct_id', validate="1:m")
.drop(columns=['tool', 'struct_id'])
#
.merge(sf(7820), how="left", left_on='driver', right_on='struct_id', validate="1:m")
.drop(columns=['driver', 'struct_id'])
.rename(columns={"version": "driver_version_7820", "name": "driver_name_7820"})
#
.merge(af(5511), how="left", left_on='versionControlProvenance', right_on='array_id')
.drop(columns=['versionControlProvenance', 'array_id', 'type_at_index'])
.rename(columns={"value_index": "versionControl_value_index_5511"})
#
.merge(sf(3081), how="left", left_on='id_or_value_at_index', right_on='struct_id')
.drop(columns=['id_or_value_at_index', 'struct_id'])
#
)
# Keep columns of interest
project_df_1 = (
project_df
.drop(columns=['struct_id_6787', 'versionControl_value_index_5511'])
.rename({
'version_6787': 'sarif_version',
'value_index_0177': 'run_index',
'driver_name_7820': 'driver_name',
'driver_version_7820': 'driver_version',
}, axis='columns')
)
return project_df_1
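# Note: joins_for_project_single backs bt.project in bin/sarif-extract-scans;
# joins_for_project above remains the variant used by sarif-extract-multi.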
def joins_for_rules(tgraph):
    """
    Return table providing the `rules` information.


@@ -8,6 +8,7 @@ This file also contains some type graph reference values; these may be moved out
separate files at some point.
"""
from dataclasses import dataclass
import logging
from typing import Any, Dict, List, Tuple, Union
import pandas as pd
@@ -160,13 +161,19 @@ def _destructure_dict(typegraph: Typegraph, node, tree):
    elif set(tree_fields).issuperset(set(type_fields)):
        # Log a warning
        # log.warning("XX: Tree has unrecognized fields")
        logging.warning('Input tree has unrecognized fields, collecting only '
                        'known entries: {}'.format(tree))
        logging.warning('tree fields: {} type fields: {}'
                        .format(tree_fields, type_fields))
        _destructure_dict_1(typegraph, node, tree)
    elif set(tree_fields).issubset(set(type_fields)):
        raise MissingFieldException("XX: (Sub)tree is missing fields required by typedef")
    else:
        raise Exception("typegraph: unhandled case reached: cannot match type "
                        "fields {} to tree fields {}. Data is invalid."
                        .format(type_fields, tree_fields))
def _destructure_list(typegraph, node: str, tree: List):