sarif-extract-tables: also output relatedLocations table

With --related-locations,

    ../../bin/sarif-results-summary -r results.sarif

produces the details

    RESULT: static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js:722:72:722:73: Character ''' is repeated [here](1) in the same character class.
    Character ''' is repeated [here](2) in the same character class.
    Character ''' is repeated [here](3) in the same character class.
    REFERENCE: static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js:722:74:722:75: here
    REFERENCE: static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js:722:76:722:77: here
    REFERENCE: static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js:722:78:722:79: here

Via
    ../../bin/sarif-extract-tables results.sarif tables

sarif-extract-tables now produces two output tables,

    tables/
    ├── messages.csv
    └── relatedLocations.csv

that contain the relevant information and can be joined or otherwise combined on
the struct_id_4055 key.

For example, adding to the end of sarif-extract-tables:
    import IPython
    IPython.embed()

    msg = d2[d2.message.str.startswith("Character ''' is repeated [here]")]
    dr3[dr3.struct_id_4055 == msg.struct_id_4055.values[0]]

    In [24]: msg
    Out[24]:
         struct_id_4055  ...                                            message
    180      4796917312  ...  Character ''' is repeated [here](1) in the sam...

    [1 rows x 7 columns]

    In [25]: dr3[dr3.struct_id_4055 == msg.struct_id_4055.values[0]]
    Out[25]:
         struct_id_4055                                                uri  startLine  startColumn  endLine  endColumn message
    180      4796917312  static/js/tinymce/jscripts/tiny_mce/plugins/pa...        722           74      722         75    here
    181      4796917312  static/js/tinymce/jscripts/tiny_mce/plugins/pa...        722           76      722         77    here
    182      4796917312  static/js/tinymce/jscripts/tiny_mce/plugins/pa...        722           78      722         79    here

or manually from the shell:

    # pick up the struct_id_4055:
    0:$ grep "static.*Character ''' is repeated \[here\]" tables/messages.csv
    180,4927448704,static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js,722,72,722,73,"Character ''' is repeated [here](1) in the same character class.

    # and find relatedLocations:
    0:$ grep 4927448704 tables/relatedLocations.csv
    180,4927448704,static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js,722,74,722,75,here
    181,4927448704,static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js,722,76,722,77,here
    182,4927448704,static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js,722,78,722,79,here

Changes:
- Introduce scli-dyys, a random id string for later identification and removal of
  dummy table rows.

- Keep the struct_id_4055 column to join tables as needed.

- Output is now written to a directory, as there are always multiple files.

This commit is contained in:
Michael Hohn
2022-02-16 17:03:58 -08:00
committed by =Michael Hohn
parent ec9a0b5590
commit ad738abed3
2 changed files with 54 additions and 15 deletions

View File

@@ -3,6 +3,7 @@
""" """
import argparse import argparse
import json import json
import pathlib
from sarif_cli import signature from sarif_cli import signature
from sarif_cli import typegraph from sarif_cli import typegraph
import sys import sys
@@ -14,6 +15,7 @@ import pandas as pd
# #
parser = argparse.ArgumentParser(description='Read a sarif file and produce tabular output.') parser = argparse.ArgumentParser(description='Read a sarif file and produce tabular output.')
parser.add_argument('file', metavar='sarif-file', type=str, help='input file, - for stdin') parser.add_argument('file', metavar='sarif-file', type=str, help='input file, - for stdin')
parser.add_argument('outdir', metavar='output-dir', type=str, help='output directory')
parser.add_argument('-f', '--output-format', metavar='format', type=str, default="csv", parser.add_argument('-f', '--output-format', metavar='format', type=str, default="csv",
help='Output format for table. Currently just csv; ' help='Output format for table. Currently just csv; '
' other formats supported by pandas can be added.') ' other formats supported by pandas can be added.')
@@ -100,15 +102,15 @@ sf = lambda num: tgraph.dataframes['Struct' + str(num)]
af = lambda num: tgraph.dataframes['Array' + str(num)] af = lambda num: tgraph.dataframes['Array' + str(num)]
# #
# Form the dataframe via joins # Form the message dataframe via joins
# #
d1 = ( d1 = (
sf(4055) sf(4055)
.merge(af('0350'), how="left", left_on='locations', right_on='array_id', validate="1:m") .merge(af('0350'), how="left", left_on='locations', right_on='array_id', validate="1:m")
.drop(columns=['struct_id', 'locations', 'array_id', 'value_index', 'type_at_index']) .drop(columns=['locations', 'array_id', 'value_index', 'type_at_index'])
.merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id', .merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id',
suffixes=("_4055", "_2683"), validate="1:m") suffixes=("_4055", "_2683"), validate="1:m")
.drop(columns=['struct_id', 'id_or_value_at_index']) .drop(columns=['struct_id_2683', 'id_or_value_at_index'])
.merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m") .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'physicalLocation']) .drop(columns=['struct_id', 'physicalLocation'])
.merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m") .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
@@ -135,14 +137,48 @@ d1 = (
# #
# Reproduce ALL `file:line:col:line:col: message` entries as a table # Reproduce ALL `file:line:col:line:col: message` entries as a table
# #
d2 = (d1[['uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text_4055']] d2 = (d1[['struct_id_4055', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text_4055']]
.rename({'text_4055': 'message'}, axis='columns')) .rename({'text_4055': 'message'}, axis='columns'))
#
# Form the relatedLocation dataframe via joins. This is subtly different from d1:
# left_on=relatedLocations, and no left_on='message_4055'
dr1 = (
sf(4055)
.merge(af('0350'), how="left", left_on='relatedLocations', right_on='array_id', validate="1:m")
.drop(columns=['relatedLocations', 'array_id', 'value_index', 'type_at_index'])
#
.merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id',
suffixes=("_4055", "_2683"), validate="1:m")
.drop(columns=['struct_id_2683', 'id_or_value_at_index'])
.merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
#
.drop(columns=['struct_id', 'physicalLocation'])
.merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'region'])
.merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m")
#
.drop(columns=['struct_id', 'artifactLocation'])
.merge(sf(2774), how="left", left_on='message_2683', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'message_2683'])
)
dr2 = (dr1[['struct_id_4055', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text']]
.rename({'text': 'message'}, axis='columns'))
# Remove dummy locations previously injected by signature.fillsig
dr3 = dr2[dr2.uri != 'scli-dyys dummy value']
# #
# Write output # Write output
# #
if args.output_format == 'csv': if args.output_format == 'csv':
d2.to_csv(sys.stdout, index_label='index') p = pathlib.Path(args.outdir)
p.mkdir(exist_ok=True)
with p.joinpath('messages.csv').open(mode='wb') as messages:
d2.to_csv(messages, index_label='index')
with p.joinpath('relatedLocations.csv').open(mode='wb') as relo:
dr3.to_csv(relo, index_label='index')
else: else:
sys.stderr.write("unknown output format") sys.stderr.write("unknown output format")

View File

@@ -203,12 +203,15 @@ properties_keys = set([first for first, _ in
('sub-severity', 'String'), ('sub-severity', 'String'),
('tags', 'Array003'), ('tags', 'Array003'),
]]) ]])
dummy_properties = { 'kind' : 'unspecified', #
'precision' : 'unspecified', # scli-dyys is a random id string for later identification of dummy values
'security-severity' : 'unspecified', #
'severity' : 'unspecified', dummy_properties = { 'kind' : 'scli-dyys dummy value',
'sub-severity' : 'unspecified', 'precision' : 'scli-dyys dummy value',
'tags' : ['unspecified'], 'security-severity' : 'scli-dyys dummy value',
'severity' : 'scli-dyys dummy value',
'sub-severity' : 'scli-dyys dummy value',
'tags' : ['scli-dyys dummy value'],
} }
relatedLocations_keys = set([first for first, _ in relatedLocations_keys = set([first for first, _ in
@@ -221,16 +224,16 @@ dummy_newlineSequences = ['\r\n', '\n', '\u2028', '\u2029']
dummy_relatedLocations_entry = [ dummy_relatedLocations_entry = [
{'id': -1, {'id': -1,
'physicalLocation': {'artifactLocation': {'uri': '', 'physicalLocation': {'artifactLocation': {'uri': 'scli-dyys dummy value',
'uriBaseId': '%SRCROOT%', 'uriBaseId': 'scli-dyys dummy value',
'index': -1}, 'index': -1},
'region': {'startLine': -1, 'region': {'startLine': -1,
'startColumn': -1, 'startColumn': -1,
'endLine': -1, 'endLine': -1,
'endColumn': -1}}, 'endColumn': -1}},
'message': {'text': ''}}] 'message': {'text': 'scli-dyys dummy value'}}]
dummy_message_entry = {'text': ''} dummy_message_entry = {'text': 'scli-dyys dummy value'}
def fillsig_dict(args, elem, context): def fillsig_dict(args, elem, context):
""" Fill in the missing fields in dictionary signatures. """ Fill in the missing fields in dictionary signatures.