sarif-extract-tables: also output relatedLocations table

With --related-locations,

    ../../bin/sarif-results-summary -r results.sarif

produces the details

    RESULT: static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js:722:72:722:73: Character ''' is repeated [here](1) in the same character class.
    Character ''' is repeated [here](2) in the same character class.
    Character ''' is repeated [here](3) in the same character class.
    REFERENCE: static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js:722:74:722:75: here
    REFERENCE: static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js:722:76:722:77: here
    REFERENCE: static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js:722:78:722:79: here

Via
    ../../bin/sarif-extract-tables results.sarif tables

sarif-extract-tables now produces two output tables,

    tables/
    ├── messages.csv
    └── relatedLocations.csv

that contain the relevant information and can be joined or otherwise combined on
the struct_id_4055 key.

For example, adding to the end of sarif-extract-tables:
    import IPython
    IPython.embed()

    msg = d2[d2.message.str.startswith("Character ''' is repeated [here]")]
    dr3[dr3.struct_id_4055 == msg.struct_id_4055.values[0]]

    In [24]: msg
    Out[24]:
         struct_id_4055  ...                                            message
    180      4796917312  ...  Character ''' is repeated [here](1) in the sam...

    [1 rows x 7 columns]

    In [25]: dr3[dr3.struct_id_4055 == msg.struct_id_4055.values[0]]
    Out[25]:
         struct_id_4055                                                uri  startLine  startColumn  endLine  endColumn message
    180      4796917312  static/js/tinymce/jscripts/tiny_mce/plugins/pa...        722           74      722         75    here
    181      4796917312  static/js/tinymce/jscripts/tiny_mce/plugins/pa...        722           76      722         77    here
    182      4796917312  static/js/tinymce/jscripts/tiny_mce/plugins/pa...        722           78      722         79    here

or manually from the shell:

    # pick up the struct_id_4055:
    0:$ grep "static.*Character ''' is repeated \[here\]" tables/messages.csv
    180,4927448704,static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js,722,72,722,73,"Character ''' is repeated [here](1) in the same character class.

    # and find relatedLocations:
    0:$ grep 4927448704 tables/relatedLocations.csv
    180,4927448704,static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js,722,74,722,75,here
    181,4927448704,static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js,722,76,722,77,here
    182,4927448704,static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js,722,78,722,79,here

Changes:
- Introduce scli-dyys, a random id string for later identification and removal of
  dummy table rows.

- Keep the struct_id_4055 column to join tables as needed.

- Output is now written to a directory, as there are always multiple files.

This commit is contained in:
Michael Hohn
2022-02-16 17:03:58 -08:00
committed by =Michael Hohn
parent ec9a0b5590
commit ad738abed3
2 changed files with 54 additions and 15 deletions

View File

@@ -3,6 +3,7 @@
""" """
import argparse import argparse
import json import json
import pathlib
from sarif_cli import signature from sarif_cli import signature
from sarif_cli import typegraph from sarif_cli import typegraph
import sys import sys
@@ -14,6 +15,7 @@ import pandas as pd
# #
parser = argparse.ArgumentParser(description='Read a sarif file and produce tabular output.') parser = argparse.ArgumentParser(description='Read a sarif file and produce tabular output.')
parser.add_argument('file', metavar='sarif-file', type=str, help='input file, - for stdin') parser.add_argument('file', metavar='sarif-file', type=str, help='input file, - for stdin')
parser.add_argument('outdir', metavar='output-dir', type=str, help='output directory')
parser.add_argument('-f', '--output-format', metavar='format', type=str, default="csv", parser.add_argument('-f', '--output-format', metavar='format', type=str, default="csv",
help='Output format for table. Currently just csv; ' help='Output format for table. Currently just csv; '
' other formats supported by pandas can be added.') ' other formats supported by pandas can be added.')
@@ -100,15 +102,15 @@ sf = lambda num: tgraph.dataframes['Struct' + str(num)]
af = lambda num: tgraph.dataframes['Array' + str(num)] af = lambda num: tgraph.dataframes['Array' + str(num)]
# #
# Form the dataframe via joins # Form the message dataframe via joins
# #
d1 = ( d1 = (
sf(4055) sf(4055)
.merge(af('0350'), how="left", left_on='locations', right_on='array_id', validate="1:m") .merge(af('0350'), how="left", left_on='locations', right_on='array_id', validate="1:m")
.drop(columns=['struct_id', 'locations', 'array_id', 'value_index', 'type_at_index']) .drop(columns=['locations', 'array_id', 'value_index', 'type_at_index'])
.merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id', .merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id',
suffixes=("_4055", "_2683"), validate="1:m") suffixes=("_4055", "_2683"), validate="1:m")
.drop(columns=['struct_id', 'id_or_value_at_index']) .drop(columns=['struct_id_2683', 'id_or_value_at_index'])
.merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m") .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'physicalLocation']) .drop(columns=['struct_id', 'physicalLocation'])
.merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m") .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
@@ -135,14 +137,48 @@ d1 = (
# #
# Reproduce ALL `file:line:col:line:col: message` entries as a table # Reproduce ALL `file:line:col:line:col: message` entries as a table
# #
d2 = (d1[['uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text_4055']] d2 = (d1[['struct_id_4055', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text_4055']]
.rename({'text_4055': 'message'}, axis='columns')) .rename({'text_4055': 'message'}, axis='columns'))
#
# Form the relatedLocation dataframe via joins. This is subtly different from d1:
# left_on=relatedLocations, and no left_on='message_4055'
dr1 = (
sf(4055)
.merge(af('0350'), how="left", left_on='relatedLocations', right_on='array_id', validate="1:m")
.drop(columns=['relatedLocations', 'array_id', 'value_index', 'type_at_index'])
#
.merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id',
suffixes=("_4055", "_2683"), validate="1:m")
.drop(columns=['struct_id_2683', 'id_or_value_at_index'])
.merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
#
.drop(columns=['struct_id', 'physicalLocation'])
.merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'region'])
.merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m")
#
.drop(columns=['struct_id', 'artifactLocation'])
.merge(sf(2774), how="left", left_on='message_2683', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'message_2683'])
)
dr2 = (dr1[['struct_id_4055', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text']]
.rename({'text': 'message'}, axis='columns'))
# Remove dummy locations previously injected by signature.fillsig
dr3 = dr2[dr2.uri != 'scli-dyys dummy value']
# #
# Write output # Write output
# #
if args.output_format == 'csv': if args.output_format == 'csv':
d2.to_csv(sys.stdout, index_label='index') p = pathlib.Path(args.outdir)
p.mkdir(exist_ok=True)
with p.joinpath('messages.csv').open(mode='wb') as messages:
d2.to_csv(messages, index_label='index')
with p.joinpath('relatedLocations.csv').open(mode='wb') as relo:
dr3.to_csv(relo, index_label='index')
else: else:
sys.stderr.write("unknown output format") sys.stderr.write("unknown output format")

View File

@@ -203,12 +203,15 @@ properties_keys = set([first for first, _ in
('sub-severity', 'String'), ('sub-severity', 'String'),
('tags', 'Array003'), ('tags', 'Array003'),
]]) ]])
dummy_properties = { 'kind' : 'unspecified', #
'precision' : 'unspecified', # scli-dyys is a random id string for later identification of dummy values
'security-severity' : 'unspecified', #
'severity' : 'unspecified', dummy_properties = { 'kind' : 'scli-dyys dummy value',
'sub-severity' : 'unspecified', 'precision' : 'scli-dyys dummy value',
'tags' : ['unspecified'], 'security-severity' : 'scli-dyys dummy value',
'severity' : 'scli-dyys dummy value',
'sub-severity' : 'scli-dyys dummy value',
'tags' : ['scli-dyys dummy value'],
} }
relatedLocations_keys = set([first for first, _ in relatedLocations_keys = set([first for first, _ in
@@ -221,16 +224,16 @@ dummy_newlineSequences = ['\r\n', '\n', '\u2028', '\u2029']
dummy_relatedLocations_entry = [ dummy_relatedLocations_entry = [
{'id': -1, {'id': -1,
'physicalLocation': {'artifactLocation': {'uri': '', 'physicalLocation': {'artifactLocation': {'uri': 'scli-dyys dummy value',
'uriBaseId': '%SRCROOT%', 'uriBaseId': 'scli-dyys dummy value',
'index': -1}, 'index': -1},
'region': {'startLine': -1, 'region': {'startLine': -1,
'startColumn': -1, 'startColumn': -1,
'endLine': -1, 'endLine': -1,
'endColumn': -1}}, 'endColumn': -1}},
'message': {'text': ''}}] 'message': {'text': 'scli-dyys dummy value'}}]
dummy_message_entry = {'text': ''} dummy_message_entry = {'text': 'scli-dyys dummy value'}
def fillsig_dict(args, elem, context): def fillsig_dict(args, elem, context):
""" Fill in the missing fields in dictionary signatures. """ Fill in the missing fields in dictionary signatures.