From ad738abed3b10e85d894b533fe0586d4018e3ee6 Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Wed, 16 Feb 2022 17:03:58 -0800 Subject: [PATCH] sarif-extract-tables: also output relatedLocations table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With --related-locations, ../../bin/sarif-results-summary -r results.sarif produces the details RESULT: static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js:722:72:722:73: Character ''' is repeated [here](1) in the same character class. Character ''' is repeated [here](2) in the same character class. Character ''' is repeated [here](3) in the same character class. REFERENCE: static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js:722:74:722:75: here REFERENCE: static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js:722:76:722:77: here REFERENCE: static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js:722:78:722:79: here Via ../../bin/sarif-extract-tables results.sarif tables sarif-extract-tables now produces two output tables, tables/ ├── messages.csv └── relatedLocations.csv that contain the relevant information and can be joined or otherwise combined on the struct_id_4055 key. For example, adding to the end of sarif-extract-tables: import IPython IPython.embed() msg = d2[d2.message.str.startswith("Character ''' is repeated [here]")] dr3[dr3.struct_id_4055 == msg.struct_id_4055.values[0]] In [24]: msg Out[24]: struct_id_4055 ... message 180 4796917312 ... Character ''' is repeated [here](1) in the sam... [1 rows x 7 columns] In [25]: dr3[dr3.struct_id_4055 == msg.struct_id_4055.values[0]] Out[25]: struct_id_4055 uri startLine startColumn endLine endColumn message 180 4796917312 static/js/tinymce/jscripts/tiny_mce/plugins/pa... 722 74 722 75 here 181 4796917312 static/js/tinymce/jscripts/tiny_mce/plugins/pa... 722 76 722 77 here 182 4796917312 static/js/tinymce/jscripts/tiny_mce/plugins/pa... 
722 78 722 79 here or manually from the shell: # pick up the struct_id_4055: 0:$ grep "static.*Character ''' is repeated \[here\]" tables/messages.csv 180,4927448704,static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js,722,72,722,73,"Character ''' is repeated [here](1) in the same character class. # and find relatedLocations: 0:$ grep 4927448704 tables/relatedLocations.csv 180,4927448704,static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js,722,74,722,75,here 181,4927448704,static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js,722,76,722,77,here 182,4927448704,static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js,722,78,722,79,here Changes: - Introduce scli-dyys, a random id string for later identification and removal of dummy table rows. - Keep the struct_id_4055 column to join tables as needed. - Output is now written to a directory as there are always multiple files. --- bin/sarif-extract-tables | 46 +++++++++++++++++++++++++++++++++++----- sarif_cli/signature.py | 23 +++++++++++--------- 2 files changed, 54 insertions(+), 15 deletions(-) diff --git a/bin/sarif-extract-tables b/bin/sarif-extract-tables index 0159ea4..6e5a24d 100755 --- a/bin/sarif-extract-tables +++ b/bin/sarif-extract-tables @@ -3,6 +3,7 @@ """ import argparse import json +import pathlib from sarif_cli import signature from sarif_cli import typegraph import sys @@ -14,6 +15,7 @@ import pandas as pd # parser = argparse.ArgumentParser(description='Read a sarif file and produce tabular output.') parser.add_argument('file', metavar='sarif-file', type=str, help='input file, - for stdin') +parser.add_argument('outdir', metavar='output-dir', type=str, help='output directory') parser.add_argument('-f', '--output-format', metavar='format', type=str, default="csv", help='Output format for table. 
Currently just csv; ' ' other formats supported by pandas can be added.') @@ -100,15 +102,15 @@ sf = lambda num: tgraph.dataframes['Struct' + str(num)] af = lambda num: tgraph.dataframes['Array' + str(num)] # -# Form the dataframe via joins +# Form the message dataframe via joins # d1 = ( sf(4055) .merge(af('0350'), how="left", left_on='locations', right_on='array_id', validate="1:m") - .drop(columns=['struct_id', 'locations', 'array_id', 'value_index', 'type_at_index']) + .drop(columns=['locations', 'array_id', 'value_index', 'type_at_index']) .merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id', suffixes=("_4055", "_2683"), validate="1:m") - .drop(columns=['struct_id', 'id_or_value_at_index']) + .drop(columns=['struct_id_2683', 'id_or_value_at_index']) .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m") .drop(columns=['struct_id', 'physicalLocation']) .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m") @@ -135,14 +137,48 @@ d1 = ( # # Reproduce ALL `file:line:col:line:col: message` entries as a table # -d2 = (d1[['uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text_4055']] +d2 = (d1[['struct_id_4055', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text_4055']] .rename({'text_4055': 'message'}, axis='columns')) +# +# Form the relatedLocation dataframe via joins. 
This is subtly different from d1: +# left_on=relatedLocations, and no left_on='message_4055' +dr1 = ( + sf(4055) + .merge(af('0350'), how="left", left_on='relatedLocations', right_on='array_id', validate="1:m") + .drop(columns=['relatedLocations', 'array_id', 'value_index', 'type_at_index']) + # + .merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id', + suffixes=("_4055", "_2683"), validate="1:m") + .drop(columns=['struct_id_2683', 'id_or_value_at_index']) + .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m") + # + .drop(columns=['struct_id', 'physicalLocation']) + .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'region']) + .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m") + # + .drop(columns=['struct_id', 'artifactLocation']) + .merge(sf(2774), how="left", left_on='message_2683', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'message_2683']) +) + +dr2 = (dr1[['struct_id_4055', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text']] + .rename({'text': 'message'}, axis='columns')) + +# Remove dummy locations previously injected by signature.fillsig +dr3 = dr2[dr2.uri != 'scli-dyys dummy value'] + # # Write output # if args.output_format == 'csv': - d2.to_csv(sys.stdout, index_label='index') + p = pathlib.Path(args.outdir) + p.mkdir(exist_ok=True) + with p.joinpath('messages.csv').open(mode='wb') as messages: + d2.to_csv(messages, index_label='index') + with p.joinpath('relatedLocations.csv').open(mode='wb') as relo: + dr3.to_csv(relo, index_label='index') else: sys.stderr.write("unknown output format") diff --git a/sarif_cli/signature.py b/sarif_cli/signature.py index 59f9a27..418df09 100644 --- a/sarif_cli/signature.py +++ b/sarif_cli/signature.py @@ -203,12 +203,15 @@ properties_keys = set([first for first, _ in ('sub-severity', 'String'), ('tags', 
'Array003'), ]]) -dummy_properties = { 'kind' : 'unspecified', - 'precision' : 'unspecified', - 'security-severity' : 'unspecified', - 'severity' : 'unspecified', - 'sub-severity' : 'unspecified', - 'tags' : ['unspecified'], +# +# scli-dyys is a random id string for later identification of dummy values +# +dummy_properties = { 'kind' : 'scli-dyys dummy value', + 'precision' : 'scli-dyys dummy value', + 'security-severity' : 'scli-dyys dummy value', + 'severity' : 'scli-dyys dummy value', + 'sub-severity' : 'scli-dyys dummy value', + 'tags' : ['scli-dyys dummy value'], } relatedLocations_keys = set([first for first, _ in @@ -221,16 +224,16 @@ dummy_newlineSequences = ['\r\n', '\n', '\u2028', '\u2029'] dummy_relatedLocations_entry = [ {'id': -1, - 'physicalLocation': {'artifactLocation': {'uri': '', - 'uriBaseId': '%SRCROOT%', + 'physicalLocation': {'artifactLocation': {'uri': 'scli-dyys dummy value', + 'uriBaseId': 'scli-dyys dummy value', 'index': -1}, 'region': {'startLine': -1, 'startColumn': -1, 'endLine': -1, 'endColumn': -1}}, - 'message': {'text': ''}}] + 'message': {'text': 'scli-dyys dummy value'}}] -dummy_message_entry = {'text': ''} +dummy_message_entry = {'text': 'scli-dyys dummy value'} def fillsig_dict(args, elem, context): """ Fill in the missing fields in dictionary signatures.