Bring sarif-extract-tables up to date with sarif-extract-scans

This commit is contained in:
Michael Hohn
2022-07-19 15:42:26 -07:00
committed by =Michael Hohn
parent da7d669eb9
commit ef00559408
2 changed files with 93 additions and 168 deletions

View File

@@ -1,24 +1,29 @@
 #!/usr/bin/env python
-""" Extract data from sarif files in table form.
-These particular table joins create tables matching the content of
-./sarif-results-summary
-Return tables providing the `problem`, `path-problem` and `relatedLocations`
-information.
+"""Extract data from sarif files in table form.
+The table joins for `problem`, `path-problem` and `relatedLocations` create tables
+matching the content of ./sarif-results-summary.
+The `artifacts`, `codeflows`, `relatedLocations` and `rules` tables provide the
+remaining information from the sarif file; see
+../notes/typegraph-multi-with-tables.pdf for details.
 The `problem` and `path-problem` entries provide that information; the
 `relatedLocations` table provides the details when multiple results are present
 for either.
 """
-import argparse
-import json
-import pathlib
+from dataclasses import dataclass
 from sarif_cli import signature, signature_single
 from sarif_cli import typegraph
-import sys
+from sarif_cli import snowflake_id
+import argparse
+import dataclasses as dc
+import json
 import pandas as pd
+import pathlib
+import sarif_cli.table_joins as tj
+import sys
 #
 # Start processing
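
The import changes above pair with an argument surface that this diff never shows: the new code below relies on args.outdir, and the test driver at the end of this commit runs sarif-extract-tables results.sarif test-tables. A minimal sketch of argument parsing consistent with that usage; the positional names sarif_file and outdir are assumptions, since the script's actual argparse block sits outside the changed lines:

    import argparse

    # Hypothetical reconstruction of the CLI; only args.outdir is actually
    # visible in the diff below.
    parser = argparse.ArgumentParser(
        description='Extract data from sarif files in table form.')
    parser.add_argument('sarif_file', help='input SARIF file (assumed name)')
    parser.add_argument('outdir', help='output directory; used as args.outdir')
    args = parser.parse_args(['results.sarif', 'test-tables'])  # mirrors the test driver
    print(args.outdir)  # -> test-tables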
@@ -61,6 +66,83 @@ typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_stru
 #
 typegraph.attach_tables(tgraph)
+#
+# Dataframe / table collection
+#
+@dataclass
+class BaseTables:
+    artifacts : pd.DataFrame
+    codeflows : pd.DataFrame
+    kind_pathproblem : pd.DataFrame
+    kind_problem : pd.DataFrame
+    relatedLocations : pd.DataFrame
+    rules : pd.DataFrame
+    def __init__(self): pass
+
+bt = BaseTables()
+#
+# Add dataframes
+#
+sf_2683 = tj.joins_for_sf_2683(tgraph)
+af_0350_location = tj.joins_for_af_0350_location(tgraph)
+bt.artifacts = tj.joins_for_artifacts(tgraph)
+bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683)
+bt.kind_pathproblem = tj.joins_for_path_problem(tgraph, af_0350_location)
+bt.kind_problem = tj.joins_for_problem(tgraph, af_0350_location)
+bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, sf_2683)
+bt.rules = tj.joins_for_rules(tgraph)
+#
+# Replace the remaining internal ids with snowflake ids
+#
+flakegen = snowflake_id.Snowflake(0)
+columns_to_reindex = {
+    # template from {field.name : [''] for field in dc.fields(bt)}
+    'artifacts': ['artifacts_id'],
+    'codeflows': ['codeflow_id'],
+    'kind_pathproblem': ['results_array_id', 'codeFlows_id'],
+    'kind_problem': ['results_array_id'],
+    'relatedLocations': ['struct_id'],
+    'rules': ['rules_array_id']}
+
+_id_to_flake = {}
+def _get_flake(id):
+    flake = _id_to_flake.get(id, -1)
+    if flake == -1:
+        flake = flakegen.next()
+        _id_to_flake[id] = flake
+    return flake
+
+for field in dc.fields(bt):
+    table_name = field.name
+    table = getattr(bt, field.name)
+    # Turn all snowflake columns into uint64 and reset indexing to 0..len(table)
+    newtable = table.astype(
+        { colname : 'uint64'
+          for colname in columns_to_reindex[table_name]}
+    ).reset_index(drop=True)
+    # Swap ids for flakes
+    for colname in columns_to_reindex[table_name]:
+        for i in range(0, len(newtable)):
+            newtable.loc[i, colname] = _get_flake(newtable.loc[i, colname])
+    # Replace the table
+    setattr(bt, field.name, newtable)
+#
+# Write output
+#
+p = pathlib.Path(args.outdir)
+p.mkdir(exist_ok=True)
+
+def write(path, frame):
+    with p.joinpath(path + ".csv").open(mode='wb') as fh:
+        frame.to_csv(fh, index=False)
+
+for field in dc.fields(bt):
+    table = getattr(bt, field.name)
+    write(field.name, table)
+
+# TODO:
""" """
Reproduce the Reproduce the
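
The next hunk removes the inline merge chains whose work now lives in sarif_cli.table_joins. As a self-contained sketch of the merge-and-drop pattern those chains use (toy frames with invented ids; the real Struct/Array tables come from typegraph.attach_tables):

    import pandas as pd

    # Toy stand-ins for the typegraph's Struct/Array dataframes.
    results = pd.DataFrame({'struct_id': [1, 2], 'locations': [10, 11]})
    locations = pd.DataFrame({'array_id': [10, 10, 11],
                              'id_or_value_at_index': [100, 101, 102]})
    messages = pd.DataFrame({'struct_id': [100, 101, 102],
                             'text': ['m0', 'm1', 'm2']})

    joined = (
        results
        # Expand each result's locations array into one row per element.
        .merge(locations, how='left', left_on='locations', right_on='array_id',
               validate='1:m')
        .drop(columns=['locations', 'array_id'])
        # Resolve each array element to its message struct.
        .merge(messages, how='left', left_on='id_or_value_at_index',
               right_on='struct_id', suffixes=('_result', '_message'),
               validate='1:m')
        .drop(columns=['id_or_value_at_index', 'struct_id_message'])
    )
    print(joined)  # struct_id_result, text: three rows, result 1 contributing two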
@@ -105,161 +187,3 @@ Using ../notes/typegraph.pdf, we find these:
 |------------+----------+---------+-------------------+-------------------+------------|
 """
-#
-# Access convenience functions
-#
-sf = lambda num: tgraph.dataframes['Struct' + str(num)]
-af = lambda num: tgraph.dataframes['Array' + str(num)]
-#
-# Form the message dataframe via joins
-#
-d1 = (
-    sf(4055)
-    .merge(af('0350'), how="left", left_on='locations', right_on='array_id', validate="1:m")
-    .drop(columns=['locations', 'array_id', 'value_index', 'type_at_index'])
-    .merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id',
-           suffixes=("_4055", "_2683"), validate="1:m")
-    .drop(columns=['struct_id_2683', 'id_or_value_at_index'])
-    .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
-    .drop(columns=['struct_id', 'physicalLocation'])
-    .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
-    .drop(columns=['struct_id', 'region'])
-    .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m")
-    .drop(columns=['struct_id', 'artifactLocation'])
-    .merge(sf(2774), how="left", left_on='message_4055', right_on='struct_id', validate="1:m")
-    .drop(columns=['struct_id', 'message_4055'])
-    .merge(sf(2774), how="left", left_on='message_2683', right_on='struct_id',
-           suffixes=("_4055", "_2683"), validate="1:m")
-)
-#
-# As expected from the above note
-#
-# Note that this IGNORES the path
-# - .results > .[] > .relatedLocations > .[] > .physicalLocation > .text
-#
-# we have no text entries in that table:
-#
-# In [88]: d1[d1.text_2683 != '']
-# Out[88]:
-# Empty DataFrame
-#
-# Reproduce ALL `file:line:col:line:col: message` entries as a table
-#
-d2 = (d1[['struct_id_4055', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text_4055']]
-      .rename({'text_4055': 'message'}, axis='columns'))
-#
-# Form the codeFlows dataframe
-#
-dco1 = (
-    sf(9699)
-    .merge(af(9799), how="left", left_on='codeFlows', right_on='array_id', validate="1:m")
-    .drop(columns=['struct_id', 'codeFlows', 'array_id', 'type_at_index'])
-    #
-    .merge(sf(7122), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
-    .drop(columns=['id_or_value_at_index', 'struct_id'])
-    #
-    .merge(af(1597), how="left", left_on='threadFlows', right_on='array_id',
-           suffixes=("_codeFlow_9799", "_threadFlows_1597"), validate="1:m")
-    .drop(columns=['threadFlows', 'array_id', 'type_at_index'])
-    #
-    .merge(sf(4194), how="left", left_on='id_or_value_at_index', right_on='struct_id',
-           suffixes=("_9699", "_4194"), validate="1:m")
-    .drop(columns=['id_or_value_at_index', 'struct_id'])
-    #
-    .merge(af(1075), how="left", left_on='locations_4194', right_on='array_id', validate="1:m")
-    .drop(columns=['locations_4194', 'array_id', 'type_at_index'])
-    .rename(columns={"value_index": "value_index_locations_1075"})
-    #
-    .merge(sf('0987'), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
-    .drop(columns=['id_or_value_at_index', 'struct_id'])
-    #
-    .merge(sf(2683), how="left", left_on='location', right_on='struct_id',
-           suffixes=("_9699", "_2683"), validate="1:m")
-    .drop(columns=['location', 'struct_id'])
-    #
-    # The below is similar to dr1
-    #
-    .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
-    .drop(columns=['struct_id', 'physicalLocation'])
-    #
-    .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
-    .drop(columns=['struct_id', 'region'])
-    #
-    .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m")
-    .drop(columns=['struct_id', 'artifactLocation'])
-    #
-    .merge(sf(2774), how="left", left_on='message_2683', right_on='struct_id', validate="1:m")
-    .drop(columns=['struct_id', 'message_2683'])
-)
-# Keep columns of interest
-dco2 = (dco1[['uri',
-              'startLine', 'startColumn', 'endLine', 'endColumn',
-              'text',
-              'ruleIndex', 'value_index_codeFlow_9799',
-              'value_index_threadFlows_1597', 'value_index_locations_1075',
-              ]]
-        .rename({'text': 'message',
-                 'value_index_codeFlow_9799': 'idx_codeFlow',
-                 'value_index_threadFlows_1597': 'idx_threadFlows',
-                 'value_index_locations_1075': 'idx_locations'}, axis='columns'))
-# Remove dummy locations previously injected by signature.fillsig
-dco3 = dco2[dco2.uri != 'scli-dyys dummy value']
-#
-# Form the relatedLocation dataframe via joins, starting from the union of
-# relatedLocations from `kind problem` (sf(4055)) and `kind path-problem`
-# (sf(9699)). This is only slightly different from d1: left_on=relatedLocations,
-# and no left_on='message_4055'
-#
-dr1 = (
-    pd.concat([sf(4055)[['relatedLocations', 'struct_id']], sf(9699)[['relatedLocations', 'struct_id']]])
-    .merge(af('0350'), how="left", left_on='relatedLocations', right_on='array_id', validate="1:m")
-    .drop(columns=['relatedLocations', 'array_id', 'value_index', 'type_at_index'])
-    #
-    .merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id',
-           suffixes=("_4055_9699", "_2683"), validate="1:m")
-    .drop(columns=['struct_id_2683', 'id_or_value_at_index'])
-    #
-    .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
-    .drop(columns=['struct_id', 'physicalLocation'])
-    #
-    .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
-    .drop(columns=['struct_id', 'region'])
-    #
-    .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m")
-    .drop(columns=['struct_id', 'artifactLocation'])
-    #
-    .merge(sf(2774), how="left", left_on='message', right_on='struct_id', validate="1:m")
-    .drop(columns=['struct_id', 'message'])
-)
-# Keep columns of interest
-dr2 = (dr1[['struct_id_4055_9699', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text']]
-       .rename({'text': 'message', 'struct_id_4055_9699': 'struct_id'}, axis='columns'))
-# Remove dummy locations previously injected by signature.fillsig
-dr3 = dr2[dr2.uri != 'scli-dyys dummy value']
-#
-# Write output
-#
-if args.output_format == 'csv':
-    p = pathlib.Path(args.outdir)
-    p.mkdir(exist_ok=True)
-    with p.joinpath('problem.csv').open(mode='wb') as problem:
-        d2.to_csv(problem, index_label='index')
-    with p.joinpath('path-problem.csv').open(mode='wb') as path_problem:
-        dco3.to_csv(path_problem, index_label='index')
-    with p.joinpath('relatedLocations.csv').open(mode='wb') as relo:
-        dr3.to_csv(relo, index_label='index')
-else:
-    sys.stderr.write("unknown output format")
-    sys.exit(1)
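
The cell-by-cell loop in the added reindexing code above maps every internal id to a snowflake id exactly once, so ids shared across rows stay shared. A runnable sketch of the same idea on a toy table, with a plain counter standing in for sarif_cli.snowflake_id.Snowflake (an assumption; only Snowflake(0) and .next() are visible in this diff):

    import pandas as pd

    class Counter:
        # Stand-in for snowflake_id.Snowflake; the real generator is assumed
        # to hand out unique uint64 values from .next().
        def __init__(self, start=0):
            self._n = start
        def next(self):
            self._n += 1
            return self._n

    flakegen = Counter()
    id_to_flake = {}

    def get_flake(internal_id):
        # Reuse the flake on repeat encounters so shared ids stay shared.
        if internal_id not in id_to_flake:
            id_to_flake[internal_id] = flakegen.next()
        return id_to_flake[internal_id]

    table = pd.DataFrame({'results_array_id': [4055, 4055, 9699],
                          'message': ['a', 'b', 'c']})
    table = table.astype({'results_array_id': 'uint64'}).reset_index(drop=True)
    # Column-wise map; equivalent to the element-wise loc[] loop in the script.
    table['results_array_id'] = table['results_array_id'].map(get_flake).astype('uint64')
    print(table)  # rows sharing internal id 4055 share flake 1; 9699 gets flake 2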

View File

@@ -4,5 +4,6 @@
 # nothing on stdout/stderr
 #
 ( cd ../data/treeio/2021-12-09 && sarif-extract-tables results.sarif test-tables )
+( cd ../data/treeio/2022-02-25 && sarif-extract-tables results.sarif test-tables )
 ( cd ../data/treeio && sarif-extract-multi multi-sarif-01.json test-multi-table )
 ( cd ../data/treeio && sarif-extract-scans scan-spec-0.json test-scan )
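
With the 2022-02-25 dataset added above, each sarif-extract-tables run should now leave one CSV per BaseTables field in its output directory (write() appends .csv and drops the index). A quick read-back check, reusing the test-tables directory name from the driver:

    import pathlib
    import pandas as pd

    outdir = pathlib.Path('test-tables')  # created by the sarif-extract-tables run
    for name in ('artifacts', 'codeflows', 'kind_pathproblem',
                 'kind_problem', 'relatedLocations', 'rules'):
        frame = pd.read_csv(outdir / (name + '.csv'))
        print(name, frame.shape)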