mirror of https://github.com/hohn/sarif-cli.git, synced 2025-12-16 17:23:03 +01:00
Initial version of sarif-extract-scans, to be tested

Running

    cd ~/local/sarif-cli/data/treeio
    sarif-extract-scans scan-spec-0.json test-scan

produces the two derived tables and one sarif-based table (codeflows.csv):

    ls test-scan/
    codeflows.csv  results.csv  scans.csv

Adding -r via

    sarif-extract-scans -r scan-spec-0.json test-scan

writes all tables:

    ls test-scan/
    artifacts.csv  kind_pathproblem.csv  project.csv           results.csv  scans.csv
    codeflows.csv  kind_problem.csv      relatedLocations.csv  rules.csv
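The scan spec named on the command line is the external-information JSON file. Its exact layout is not shown in this commit, but judging from the fields the script reads in the diff below (scan_spec["project_id"], scan_spec["scan_id"], scan_spec["sarif_file_name"]), a minimal scan-spec-0.json would look roughly like the following; the values here are illustrative only, not taken from the repository:

    {
        "project_id": 17,
        "scan_id": 53,
        "sarif_file_name": "treeio.sarif"
    }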
committed by Michael Hohn
parent 3dd8522b7f
commit eb8e2f18e9
@@ -27,6 +27,8 @@ parser = argparse.ArgumentParser(description='Read a collection of sarif files a
 parser.add_argument('file', metavar='scan-spec.json', type=str,
                     help="json file containing required external scan information.")
 parser.add_argument('outdir', metavar='output-dir', type=str, help='output directory')
+parser.add_argument('-r', '--write-raw-tables', action="store_true",
+                    help='Write the raw sarif tables to the output directory')
 args = parser.parse_args()

 # Load meta info
@@ -77,6 +79,7 @@ class BaseTables:
     project : pd.DataFrame
     relatedLocations : pd.DataFrame
     rules : pd.DataFrame
+    columns_to_reindex : dict   # (name -> name list) dict
     def __init__(self): pass
 bt = BaseTables()

@@ -85,16 +88,21 @@ class ScanTables:
     # project: External table with project information
     scans : pd.DataFrame
     results : pd.DataFrame
+    columns_to_reindex : dict   # (name -> name list) dict
     def __init__(self): pass
 scantabs = ScanTables()

 @dataclass
 class ExternalInfo:
+    project_id : int
     scan_id : int
+    sarif_file_name : str
     ql_query_id : str

 external_info = ExternalInfo(
-    scan_spec['scan_id'],
+    scan_spec["project_id"],
+    scan_spec["scan_id"],
+    scan_spec["sarif_file_name"],
     'deadbeef00',               # TODO: Take ql_query_id from where?
 )

@@ -115,14 +123,14 @@ bt.rules = tj.joins_for_rules(tgraph)
 # Form scan tables
 #
 scantabs.results = st.joins_for_results(bt, external_info)
-scantabs.scans = st.joins_for_scans(bt, external_info)
+scantabs.scans = st.joins_for_scans(bt, external_info, scantabs)

 #
 # Replace the remaining internal ids with snowflake ids
 #
 flakegen = snowflake_id.Snowflake(0)

-columns_to_reindex = {
+bt.columns_to_reindex = {
     # template from {field.name : [''] for field in dc.fields(bt)}
     'artifacts': ['artifacts_id'],
     'codeflows': ['codeflow_id'],
@@ -132,6 +140,11 @@ columns_to_reindex = {
     'relatedLocations': ['struct_id'],
     'rules': ['rules_array_id']}

+scantabs.columns_to_reindex = {
+    'scans': [],
+    'results': ['codeFlow_id'],
+}
+
 _id_to_flake = {}
 def _get_flake(id):
     flake = _id_to_flake.get(id, -1)
@@ -154,28 +167,56 @@ def _get_flake(id):
 # for colname in columns_to_reindex[table_name]:
 #     setattr(bt, field.name, _reindex(getattr(bt, field.name), colname))
 #
-for field in dc.fields(bt):
-    table_name = field.name
-    table = getattr(bt, field.name)
-    # Turn all snowflake columns into uint64 and reset indexing to 0..len(table)
-    newtable = table.astype(
-        { colname : 'uint64'
-          for colname in columns_to_reindex[table_name]}
-    ).reset_index(drop=True)
-    # Swap ids for flakes
-    for colname in columns_to_reindex[table_name]:
-        for i in range(0, len(newtable)):
-            newtable.loc[i, colname] = _get_flake(newtable.loc[i, colname])
-    # Replace the table
-    setattr(bt, field.name, newtable)
+
+def _replace_ids(tables_dataclass):
+    tdc = tables_dataclass
+    for field in dc.fields(tdc):
+        if field.type != pd.DataFrame:
+            continue
+        table_name = field.name
+        table = getattr(tdc, field.name)
+        # Turn all snowflake columns into uint64 and reset indexing to 0..len(table)
+        newtable = table.astype(
+            { colname : 'uint64'
+              for colname in tdc.columns_to_reindex[table_name]}
+        ).reset_index(drop=True)
+        # Swap ids for flakes
+        for colname in tdc.columns_to_reindex[table_name]:
+            for i in range(0, len(newtable)):
+                oid = newtable.loc[i, colname]
+                if oid in [0,-1]:
+                    # Ignore special values
+                    continue
+                newtable.loc[i, colname] = _get_flake(oid)
+        # Replace the table
+        setattr(tdc, field.name, newtable)
+
+# Replace id()s of the base and derived tables
+_replace_ids(bt)
+_replace_ids(scantabs)
+
 #
 # Write output
 #
 p = pathlib.Path(args.outdir)
 p.mkdir(exist_ok=True)
+
 def write(path, frame):
     with p.joinpath(path + ".csv").open(mode='wb') as fh:
         frame.to_csv(fh, index=False)
-for field in dc.fields(bt):
-    table = getattr(bt, field.name)
-    write(field.name, table)
+
+def _write_dataframes_of(tables_dataclass):
+    for field in dc.fields(tables_dataclass):
+        if field.type != pd.DataFrame:
+            continue
+        table = getattr(tables_dataclass, field.name)
+        write(field.name, table)
+
+# Write sarif-based tables
+if args.write_raw_tables:
+    _write_dataframes_of(bt)
+
+# Write derived tables and codeflows
+_write_dataframes_of(scantabs)
+
+write('codeflows', bt.codeflows)
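To summarize the re-indexing change above: _replace_ids walks every DataFrame field of a tables dataclass, casts the columns listed in its columns_to_reindex to uint64, and swaps each remaining internal id for a snowflake id via the memoizing _get_flake, skipping the special values 0 and -1. A minimal self-contained sketch of that pattern, using itertools.count as a stand-in for the repository's snowflake_id.Snowflake generator and a made-up results table:

    import itertools
    import pandas as pd

    flake_source = itertools.count(1000)     # stand-in for snowflake_id.Snowflake(0)
    _id_to_flake = {}

    def _get_flake(internal_id):
        # Memoize: the same internal id always maps to the same flake.
        if internal_id not in _id_to_flake:
            _id_to_flake[internal_id] = next(flake_source)
        return _id_to_flake[internal_id]

    # Toy 'results' table whose codeFlow_id column holds internal id() values;
    # 0 and -1 are sentinels that must stay untouched.
    results = pd.DataFrame({"codeFlow_id": [140234, 140234, 0, 140567]})
    results = results.astype({"codeFlow_id": "uint64"}).reset_index(drop=True)

    for i in range(len(results)):
        oid = results.loc[i, "codeFlow_id"]
        if oid in [0, -1]:                   # ignore special values, as in the commit
            continue
        results.loc[i, "codeFlow_id"] = _get_flake(oid)

    print(results)                           # identical internal ids map to the same flake

The next hunk is in a different file: its line numbers restart at 4 and it defines joins_for_scans, so it is presumably the module the driver script imports as st.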
@@ -4,28 +4,37 @@
 import pandas as pd
 from . import snowflake_id

-# id --
-# commit_id -- pathval(r02s01, 'commit_sha')
-# project_id -- project.id
-# db_create_start -- pathval(r02s01, 'created_at')
-# db_create_stop
-# scan_start_date
-# scan_stop_date
-# tool_name -- pathval(r02s01, 'tool', 'name')
-# tool_version -- pathval(r02s01, 'tool', 'version')
-# tool_query_commit_id -- pathval(r02, 0, 'tool', 'version') is sufficient
-# sarif_content -- r02s02
-# sarif_file_name -- used on upload
-# sarif_id -- pathval(r02s01, 'sarif_id')
-# results_count -- pathval(r02s01, 'results_count')
-# rules_count -- pathval(r02s01, 'rules_count')
+#
+# Scans table
 #
-def joins_for_scans(basetables, external_info):
+def joins_for_scans(basetables, external_info, scantables):
     """
-    Return the `scans` table
+    Form the `scans` table for the ScanTables dataclass
     """
-    # XX
-    pass
+    b = basetables; e = external_info
+    driver_name = b.project.driver_name.unique()
+    assert len(driver_name) == 1, "More than one driver name found for single sarif file."
+    driver_version = b.project.driver_version.unique()
+    assert len(driver_version) == 1, \
+        "More than one driver version found for single sarif file."
+    res = pd.DataFrame(data={
+        "id" : e.scan_id,
+        "commit_id" : pd.NA,
+        "project_id" : e.project_id,
+        #
+        "db_create_start" : pd.NA,
+        "db_create_stop" : pd.NA,
+        "scan_start_date" : pd.NA,
+        "scan_stop_date" : pd.NA,
+        #
+        "tool_name" : driver_name[0],
+        "tool_version" : driver_version[0],
+        "tool_query_commit_id" : pd.NA,
+        "sarif_file_name" : e.sarif_file_name,
+        "results_count" : scantables.results.shape[0],
+        "rules_count" : len(b.rules['id'].unique()),
+    },index=[0])
+    return res

 #
 # Results table
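For a quick check of the resulting scans table, one could load the CSV written to the output directory. A sketch, assuming the test-scan directory from the commit message and the column names from joins_for_scans above:

    import pandas as pd

    # scans.csv holds a single row for the sarif file processed.
    scans = pd.read_csv("test-scan/scans.csv")
    print(scans[["id", "project_id", "tool_name", "tool_version",
                 "results_count", "rules_count"]])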