Initial version of sarif-extract-scans, to be tested

Running

    cd ~/local/sarif-cli/data/treeio
    sarif-extract-scans scan-spec-0.json test-scan

produces the two derived tables and the one sarif-based table (codeflows.csv):

    ls test-scan/
    codeflows.csv  results.csv  scans.csv

Adding -r via

    sarif-extract-scans -r scan-spec-0.json test-scan

writes all tables:

    ls test-scan/
    artifacts.csv  kind_pathproblem.csv  project.csv           results.csv  scans.csv
    codeflows.csv  kind_problem.csv      relatedLocations.csv  rules.csv
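
The scan spec itself is a json file; this commit reads project_id, scan_id,
and sarif_file_name from it (see the diff below).  A minimal sketch of writing
such a spec, with placeholder values and no claim about other required fields:

    # Hypothetical spec contents; only the three fields read in this commit
    # are shown, and the values are placeholders.
    import json

    spec = {
        "project_id": 1,
        "scan_id": 1,
        "sarif_file_name": "treeio.sarif",  # assumed example file name
    }
    with open("scan-spec-0.json", "w") as fh:
        json.dump(spec, fh, indent=4)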

Author:       Michael Hohn
Date:         2022-05-16 18:58:53 -07:00
Committed by: =Michael Hohn
Parent:       3dd8522b7f
Commit:       eb8e2f18e9

2 changed files with 89 additions and 39 deletions

First changed file, the sarif-extract-scans script:

@@ -27,6 +27,8 @@ parser = argparse.ArgumentParser(description='Read a collection of sarif files a
 parser.add_argument('file', metavar='scan-spec.json', type=str,
                     help="json file containing required external scan information.")
 parser.add_argument('outdir', metavar='output-dir', type=str, help='output directory')
+parser.add_argument('-r', '--write-raw-tables', action="store_true",
+                    help='Write the raw sarif tables to the output directory')
 args = parser.parse_args()
 
 # Load meta info
@@ -77,6 +79,7 @@ class BaseTables:
     project : pd.DataFrame
     relatedLocations : pd.DataFrame
     rules : pd.DataFrame
+    columns_to_reindex : dict # (name -> name list) dict
     def __init__(self): pass
 
 bt = BaseTables()
@@ -85,16 +88,21 @@ class ScanTables:
     # project: External table with project information
     scans : pd.DataFrame
     results : pd.DataFrame
+    columns_to_reindex : dict # (name -> name list) dict
     def __init__(self): pass
 
 scantabs = ScanTables()
 
 @dataclass
 class ExternalInfo:
+    project_id : int
     scan_id : int
+    sarif_file_name : str
     ql_query_id : str
 
 external_info = ExternalInfo(
-    scan_spec['scan_id'],
+    scan_spec["project_id"],
+    scan_spec["scan_id"],
+    scan_spec["sarif_file_name"],
     'deadbeef00', # TODO: Take ql_query_id from where?
 )
@@ -115,14 +123,14 @@ bt.rules = tj.joins_for_rules(tgraph)
 # Form scan tables
 #
 scantabs.results = st.joins_for_results(bt, external_info)
-scantabs.scans = st.joins_for_scans(bt, external_info)
+scantabs.scans = st.joins_for_scans(bt, external_info, scantabs)
 
 #
 # Replace the remaining internal ids with snowflake ids
 #
 flakegen = snowflake_id.Snowflake(0)
-columns_to_reindex = {
+bt.columns_to_reindex = {
     # template from {field.name : [''] for field in dc.fields(bt)}
     'artifacts': ['artifacts_id'],
     'codeflows': ['codeflow_id'],
@@ -132,6 +140,11 @@ columns_to_reindex = {
     'relatedLocations': ['struct_id'],
     'rules': ['rules_array_id']}
 
+scantabs.columns_to_reindex = {
+    'scans': [],
+    'results': ['codeFlow_id'],
+}
+
 _id_to_flake = {}
 def _get_flake(id):
     flake = _id_to_flake.get(id, -1)
@@ -154,28 +167,56 @@ def _get_flake(id):
 #     for colname in columns_to_reindex[table_name]:
 #         setattr(bt, field.name, _reindex(getattr(bt, field.name), colname))
 #
-for field in dc.fields(bt):
-    table_name = field.name
-    table = getattr(bt, field.name)
-    # Turn all snowflake columns into uint64 and reset indexing to 0..len(table)
-    newtable = table.astype(
-        { colname : 'uint64'
-          for colname in columns_to_reindex[table_name]}
-    ).reset_index(drop=True)
-    # Swap ids for flakes
-    for colname in columns_to_reindex[table_name]:
-        for i in range(0, len(newtable)):
-            newtable.loc[i, colname] = _get_flake(newtable.loc[i, colname])
-    # Replace the table
-    setattr(bt, field.name, newtable)
+def _replace_ids(tables_dataclass):
+    tdc = tables_dataclass
+    for field in dc.fields(tdc):
+        if field.type != pd.DataFrame:
+            continue
+        table_name = field.name
+        table = getattr(tdc, field.name)
+        # Turn all snowflake columns into uint64 and reset indexing to 0..len(table)
+        newtable = table.astype(
+            { colname : 'uint64'
+              for colname in tdc.columns_to_reindex[table_name]}
+        ).reset_index(drop=True)
+        # Swap ids for flakes
+        for colname in tdc.columns_to_reindex[table_name]:
+            for i in range(0, len(newtable)):
+                oid = newtable.loc[i, colname]
+                if oid in [0,-1]:
+                    # Ignore special values
+                    continue
+                newtable.loc[i, colname] = _get_flake(oid)
+        # Replace the table
+        setattr(tdc, field.name, newtable)
+
+# Replace id()s of the base and derived tables
+_replace_ids(bt)
+_replace_ids(scantabs)
 
 #
 # Write output
 #
 p = pathlib.Path(args.outdir)
 p.mkdir(exist_ok=True)
 def write(path, frame):
     with p.joinpath(path + ".csv").open(mode='wb') as fh:
         frame.to_csv(fh, index=False)
-for field in dc.fields(bt):
-    table = getattr(bt, field.name)
-    write(field.name, table)
+
+def _write_dataframes_of(tables_dataclass):
+    for field in dc.fields(tables_dataclass):
+        if field.type != pd.DataFrame:
+            continue
+        table = getattr(tables_dataclass, field.name)
+        write(field.name, table)
+
+# Write sarif-based tables
+if args.write_raw_tables:
+    _write_dataframes_of(bt)
+
+# Write derived tables and codeflows
+_write_dataframes_of(scantabs)
+write('codeflows', bt.codeflows)
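
To see the id replacement pattern in isolation: the loop above casts each
reindex column to uint64 and swaps every id for a snowflake id, skipping the
special values 0 and -1.  A standalone sketch of that pattern on a toy table,
using a plain counter as a stand-in for snowflake_id.Snowflake:

    # Stand-in illustration only; fake_flake replaces the real Snowflake
    # generator, and the column name is taken from the reindex spec above.
    import itertools
    import pandas as pd

    _counter = itertools.count(1 << 20)
    _id_to_flake = {}

    def fake_flake(oid):
        # memoized like _get_flake, so equal ids map to the same flake
        if oid not in _id_to_flake:
            _id_to_flake[oid] = next(_counter)
        return _id_to_flake[oid]

    table = pd.DataFrame({'codeflow_id': [140123, 140123, 0, 140456]})
    table = table.astype({'codeflow_id': 'uint64'}).reset_index(drop=True)
    for i in range(0, len(table)):
        oid = table.loc[i, 'codeflow_id']
        if oid in [0, -1]:
            continue  # special values are left alone
        table.loc[i, 'codeflow_id'] = fake_flake(oid)
    # rows 0 and 1 now share one flake; the 0 in row 2 is untouched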

Second changed file, the module that provides st.joins_for_scans:

@@ -4,28 +4,37 @@
 import pandas as pd
 from . import snowflake_id
 
-# id                   --
-# commit_id            -- pathval(r02s01, 'commit_sha')
-# project_id           -- project.id
-# db_create_start      -- pathval(r02s01, 'created_at')
-# db_create_stop
-# scan_start_date
-# scan_stop_date
-# tool_name            -- pathval(r02s01, 'tool', 'name')
-# tool_version         -- pathval(r02s01, 'tool', 'version')
-# tool_query_commit_id -- pathval(r02, 0, 'tool', 'version') is sufficient
-# sarif_content        -- r02s02
-# sarif_file_name      -- used on upload
-# sarif_id             -- pathval(r02s01, 'sarif_id')
-# results_count        -- pathval(r02s01, 'results_count')
-# rules_count          -- pathval(r02s01, 'rules_count')
+#
+# Scans table
 #
-def joins_for_scans(basetables, external_info):
+def joins_for_scans(basetables, external_info, scantables):
     """
-    Return the `scans` table
+    Form the `scans` table for the ScanTables dataclass
     """
-    # XX
-    pass
+    b = basetables; e = external_info
+    driver_name = b.project.driver_name.unique()
+    assert len(driver_name) == 1, "More than one driver name found for single sarif file."
+    driver_version = b.project.driver_version.unique()
+    assert len(driver_version) == 1, \
+        "More than one driver version found for single sarif file."
+    res = pd.DataFrame(data={
+        "id" : e.scan_id,
+        "commit_id" : pd.NA,
+        "project_id" : e.project_id,
+        #
+        "db_create_start" : pd.NA,
+        "db_create_stop" : pd.NA,
+        "scan_start_date" : pd.NA,
+        "scan_stop_date" : pd.NA,
+        #
+        "tool_name" : driver_name[0],
+        "tool_version" : driver_version[0],
+        "tool_query_commit_id" : pd.NA,
+        "sarif_file_name" : e.sarif_file_name,
+        "results_count" : scantables.results.shape[0],
+        "rules_count" : len(b.rules['id'].unique()),
+        },index=[0])
+    return res
+
 #
 # Results table
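
A rough usage sketch for the new joins_for_scans, with stand-in inputs: the
column names follow the diff above, the driver name, version, and row counts
are illustrative only, and joins_for_scans itself is assumed to be in scope as
defined above.

    # Hypothetical inputs; the real basetables/external_info/scantables are
    # built by sarif-extract-scans from the sarif file and the scan spec.
    import pandas as pd
    from types import SimpleNamespace

    b = SimpleNamespace(
        project=pd.DataFrame({'driver_name': ['CodeQL'],      # illustrative
                              'driver_version': ['2.9.3']}),  # illustrative
        rules=pd.DataFrame({'id': ['py/rule-1', 'py/rule-2']}),
    )
    e = SimpleNamespace(project_id=1, scan_id=1,
                        sarif_file_name='treeio.sarif', ql_query_id='deadbeef00')
    scantabs = SimpleNamespace(results=pd.DataFrame({'result_id': range(3)}))

    scans = joins_for_scans(b, e, scantabs)
    # scans is a single-row frame: tool_name='CodeQL', results_count=3,
    # rules_count=2, and pd.NA in the columns not yet populated.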