Mirror of https://github.com/hohn/sarif-cli.git, synced 2025-12-16 17:23:03 +01:00
WIP: sarif-extract-scans: back to single sarif file handling, incorporate multi-file libraries
committed by Michael Hohn
parent 675a5a4008
commit b212423907
bin/sarif-extract-scans (Executable file, 174 lines)
@@ -0,0 +1,174 @@
#!/usr/bin/env python
""" Extract scan data from multiple sarif files in table form.
"""
from dataclasses import dataclass
from sarif_cli import signature, signature_single
from sarif_cli import typegraph
from sarif_cli import snowflake_id
import argparse
import dataclasses as dc
import json
import logging
import pandas as pd
import pathlib
import sarif_cli.table_joins as tj
import sarif_cli.derived_joins as derived
import sys

#
# Configure logger
#
logging.basicConfig(format='%(asctime)s %(message)s')

#
# Start processing
#
parser = argparse.ArgumentParser(description='Read a collection of sarif files and produce tabular output.')
parser.add_argument('file', metavar='scan-spec.json', type=str,
                    help="json file containing required external scan information.")
parser.add_argument('outdir', metavar='output-dir', type=str, help='output directory')
args = parser.parse_args()

# Load meta info
def load(fname):
    with open(fname, 'rb') if fname != '-' else sys.stdin as fp:
        try:
            content = json.load(fp)
        except json.decoder.JSONDecodeError as err:
            logging.error('Error reading from {}: {}: line {}, column {}'
                          .format(args.file, err.msg, err.lineno, err.colno))
            sys.exit(1)
        return content

scan_spec = load(args.file)
sarif_struct = load(scan_spec['sarif_file_name'])

#
# Preprocess raw SARIF to get smaller signature
#
context = signature.Context(
    {
        "string" : "String",
        "int" : "Int",
        "bool" : "Bool"
    }
)
sarif_struct = signature.fillsig(args, sarif_struct, context)

#
# Use reference type graph (signature) to traverse sarif and attach values to tables
#
tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)
#
# Form output tables
#
typegraph.attach_tables(tgraph)

#
# Dataframe / table collection
#
@dataclass
class BaseTables:
    artifacts : pd.DataFrame
    codeflows : pd.DataFrame
    kind_pathproblem : pd.DataFrame
    kind_problem : pd.DataFrame
    project : pd.DataFrame
    relatedLocations : pd.DataFrame
    rules : pd.DataFrame
    def __init__(self): pass
bt = BaseTables()

@dataclass
class ScanTables:
    # project: External table with project information
    scans : pd.DataFrame
    results : pd.DataFrame
    def __init__(self): pass
scantabs = ScanTables()

#
# Add dataframes for base tables
#
sf_2683 = tj.joins_for_sf_2683(tgraph)
af_0350_location = tj.joins_for_af_0350_location(tgraph)
bt.artifacts = tj.joins_for_artifacts(tgraph)
bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683)
bt.kind_pathproblem = tj.joins_for_path_problem(tgraph, af_0350_location)
bt.kind_problem = tj.joins_for_problem(tgraph, af_0350_location)
bt.project = tj.joins_for_project_single(tgraph)
bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, sf_2683)
bt.rules = tj.joins_for_rules(tgraph)

#
# Form derived query tables
#
# XX
# scantabs.project = derived.joins_for_project(bt)
# scantabs.scans = derived.joins_for_scans(bt)
# scantabs.results = derived.joins_for_results(bt)


#
# Replace the remaining internal ids with snowflake ids
#
flakegen = snowflake_id.Snowflake(0)

columns_to_reindex = {
    # template from {field.name : [''] for field in dc.fields(bt)}
    'artifacts': ['artifacts_id'],
    'codeflows': ['codeflow_id'],
    'kind_pathproblem': ['results_array_id', 'codeFlows_id'],
    'kind_problem': ['results_array_id'],
    'project': ['artifacts', 'results', 'rules'],
    'relatedLocations': ['struct_id'],
    'rules': ['rules_array_id']}

_id_to_flake = {}
def _get_flake(id):
    flake = _id_to_flake.get(id, -1)
    if flake == -1:
        flake = flakegen.next()
        _id_to_flake[id] = flake
    return flake

#
# Cleaner, but makes far too many copies; keep the loop below
#
# def _reindex(table, colname):
#     newtable = table.astype({ colname : 'uint64'}).reset_index(drop=True)
#     for i in range(0, len(newtable)):
#         newtable.loc[i, colname] = _get_flake(newtable.loc[i, colname])
#     return newtable
#
# for field in dc.fields(bt):
#     table_name = field.name
#     for colname in columns_to_reindex[table_name]:
#         setattr(bt, field.name, _reindex(getattr(bt, field.name), colname))
#
for field in dc.fields(bt):
    table_name = field.name
    table = getattr(bt, field.name)
    # Turn all snowflake columns into uint64 and reset indexing to 0..len(table)
    newtable = table.astype(
        { colname : 'uint64'
          for colname in columns_to_reindex[table_name]}
    ).reset_index(drop=True)
    # Swap ids for flakes
    for colname in columns_to_reindex[table_name]:
        for i in range(0, len(newtable)):
            newtable.loc[i, colname] = _get_flake(newtable.loc[i, colname])
    # Replace the table
    setattr(bt, field.name, newtable)
#
# Write output
#
p = pathlib.Path(args.outdir)
p.mkdir(exist_ok=True)
def write(path, frame):
    with p.joinpath(path + ".csv").open(mode='wb') as fh:
        frame.to_csv(fh, index=False)
for field in dc.fields(bt):
    table = getattr(bt, field.name)
    write(field.name, table)
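For orientation, a minimal usage sketch based on the argparse definitions above. It assumes the working directory is the repository root, that the SARIF file referenced by the spec exists relative to it, and that the output directory name is only an example:

# Minimal driver sketch: run the new script on one of the scan specs added below.
import subprocess

subprocess.run(
    ["bin/sarif-extract-scans", "data/treeio/scan-spec-1.json", "scan-tables"],
    check=True,
)
# Each base table (artifacts, codeflows, kind_problem, ..., rules) is written to
# scan-tables/<table>.csv, one CSV file per field of BaseTables.
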
data/treeio/scan-spec-0.json (Normal file, 5 lines)
@@ -0,0 +1,5 @@
{
    "project_id": 13243,
    "scan_id": 123457,
    "sarif_file_name": "2022-02-25/results.sarif"
}
data/treeio/scan-spec-1.json (Normal file, 5 lines)
@@ -0,0 +1,5 @@
{
    "project_id": 13243,
    "scan_id": 123456,
    "sarif_file_name": "2021-12-09/results.sarif"
}
@@ -181,7 +181,8 @@
* Tables or entries to be removed
The top of the [Mar-23-2022] =projects.csv= table, enumerated below, is ad-hoc
and included in the other tables below; the information for its fields is not
yet collected so it can be discarded.

#+BEGIN_SRC text
==> project-meta.csv <==
creation_date
@@ -196,6 +197,17 @@
tool_version
#+END_SRC

This information was used to expand the sarif tree (see Struct3452 and Array7481
in typegraph-multi-with-tables.pdf and the code). In retrospect, that was a
poor choice. All additional information needed can be represented by one or
more tables, so the sarif-extract* tools do so after commit 30e3dd3a3.

The minimal information required to drive the sarif-to-table conversion is:
| project_id      | 13243                      |   |
| scan_id         | 123456                     |   |
| sarif_file_name | "2021-12-09/results.sarif" |   |

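In JSON form this spec is a small file; the =data/treeio/scan-spec-*.json= examples use exactly these fields, for instance:

#+BEGIN_SRC text
{
    "project_id": 13243,
    "scan_id": 123456,
    "sarif_file_name": "2021-12-09/results.sarif"
}
#+END_SRC

Such a spec file plus an output directory are the two arguments to =sarif-extract-scans=.
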
* New tables to be exported
This section enumerates new tables intended for reporting infrastructure.
@@ -305,7 +305,7 @@ def joins_for_relatedLocations(tgraph, sf_2683):

def joins_for_project(tgraph):
    """
    Return table providing the `project` information for sarif-extract-multi.
    """
    # Access convenience functions
    sf = lambda num: tgraph.dataframes['Struct' + str(num)]
@@ -368,6 +368,64 @@ def joins_for_project(tgraph):
    )
    return project_df_1

def joins_for_project_single(tgraph):
    """
    Return table providing the `project` information for sarif-extract-scans
    """
    # Access convenience functions
    sf = lambda num: tgraph.dataframes['Struct' + str(num)]
    af = lambda num: tgraph.dataframes['Array' + str(num)]
    #
    project_df = (
        sf(6787)
        .rename(columns={"version": "version_6787", "struct_id": "struct_id_6787"})
        #
        .merge(af('0177'), how="left", left_on='runs', right_on='array_id',
               validate="1:m")
        .drop(columns=['runs', 'array_id', 'type_at_index'])
        .rename(columns={"value_index": "value_index_0177"})
        #
        .merge(sf(3388), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
        .drop(columns=['id_or_value_at_index', 'struct_id'])
        #
        # .merge(af(7069), how="left", left_on='newlineSequences', right_on='array_id',
        #        validate="1:m")
        # .drop(columns=['newlineSequences', 'array_id', 'type_at_index'])
        .drop(columns=['newlineSequences'])
        #
        .merge(sf(9543), how="left", left_on='properties', right_on='struct_id', validate="1:m")
        .drop(columns=['properties', 'struct_id'])
        #
        # tool - driver - rules - defaultConfiguration - ( properties - tags )
        #
        .merge(sf(8972), how="left", left_on='tool', right_on='struct_id', validate="1:m")
        .drop(columns=['tool', 'struct_id'])
        #
        .merge(sf(7820), how="left", left_on='driver', right_on='struct_id', validate="1:m")
        .drop(columns=['driver', 'struct_id'])
        .rename(columns={"version": "driver_version_7820", "name": "driver_name_7820"})
        #
        .merge(af(5511), how="left", left_on='versionControlProvenance', right_on='array_id')
        .drop(columns=['versionControlProvenance', 'array_id', 'type_at_index'])
        .rename(columns={"value_index": "versionControl_value_index_5511"})
        #
        .merge(sf(3081), how="left", left_on='id_or_value_at_index', right_on='struct_id')
        .drop(columns=['id_or_value_at_index', 'struct_id'])
        #
    )
    # Keep columns of interest
    project_df_1 = (
        project_df
        .drop(columns=['struct_id_6787', 'versionControl_value_index_5511'])
        .rename({
            'version_6787': 'sarif_version',
            'value_index_0177': 'run_index',
            'driver_name_7820': 'driver_name',
            'driver_version_7820': 'driver_version',
        }, axis='columns')
    )
    return project_df_1

def joins_for_rules(tgraph):
    """
    Return table providing the `rules` information.
@@ -8,6 +8,7 @@ This file also contains some type graph reference values; these may be moved out
separate files at some point.
"""
from dataclasses import dataclass
import logging
from typing import Any, Dict, List, Tuple, Union
import pandas as pd

@@ -160,13 +161,19 @@ def _destructure_dict(typegraph: Typegraph, node, tree):
    elif set(tree_fields).issuperset(set(type_fields)):
        # Log a warning
        # log.warning("XX: Tree has unrecognized fields")
        logging.warning('Input tree has unrecognized fields, collecting only '
                        'known entries: {}'.format(tree))
        logging.warning('tree fields: {} type fields: {}'
                        .format(tree_fields, type_fields))
        _destructure_dict_1(typegraph, node, tree)

    elif set(tree_fields).issubset(set(type_fields)):
        raise MissingFieldException("XX: (Sub)tree is missing fields required by typedef")

    else:
        raise Exception("typegraph: unhandled case reached: cannot match type "
                        "fields {} to tree fields {}. Data is invalid."
                        .format(type_fields, tree_fields))


def _destructure_list(typegraph, node: str, tree: List):