From ad738abed3b10e85d894b533fe0586d4018e3ee6 Mon Sep 17 00:00:00 2001
From: Michael Hohn <hohn@github.com>
Date: Wed, 16 Feb 2022 17:03:58 -0800
Subject: [PATCH] sarif-extract-tables: also output relatedLocations table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With --related-locations,

    ../../bin/sarif-results-summary -r results.sarif

produces the details

    RESULT: static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js:722:72:722:73: Character ''' is repeated [here](1) in the same character class.
    Character ''' is repeated [here](2) in the same character class.
    Character ''' is repeated [here](3) in the same character class.
    REFERENCE: static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js:722:74:722:75: here
    REFERENCE: static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js:722:76:722:77: here
    REFERENCE: static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js:722:78:722:79: here

Running

    ../../bin/sarif-extract-tables results.sarif tables

now produces two output tables,

    tables/
    ├── messages.csv
    └── relatedLocations.csv

that contain the relevant information and can be joined or otherwise combined on
the struct_id_4055 key.

For example, after adding

    import IPython
    IPython.embed()

to the end of sarif-extract-tables, the related locations of a message can be
looked up in the interactive session:

    msg = d2[d2.message.str.startswith("Character ''' is repeated [here]")]
    dr3[dr3.struct_id_4055 == msg.struct_id_4055.values[0]]

    In [24]: msg
    Out[24]:
         struct_id_4055  ...                                            message
    180      4796917312  ...  Character ''' is repeated [here](1) in the sam...

    [1 rows x 7 columns]

    In [25]: dr3[dr3.struct_id_4055 == msg.struct_id_4055.values[0]]
    Out[25]:
         struct_id_4055                                                uri  startLine  startColumn  endLine  endColumn message
    180      4796917312  static/js/tinymce/jscripts/tiny_mce/plugins/pa...        722           74      722         75    here
    181      4796917312  static/js/tinymce/jscripts/tiny_mce/plugins/pa...        722           76      722         77    here
    182      4796917312  static/js/tinymce/jscripts/tiny_mce/plugins/pa...        722           78      722         79    here

or manually from the shell:

    # pick up the struct_id_4055:
    0:$ grep "static.*Character ''' is repeated \[here\]" tables/messages.csv
    180,4927448704,static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js,722,72,722,73,"Character ''' is repeated [here](1) in the same character class.

    # and find relatedLocations:
    0:$ grep 4927448704 tables/relatedLocations.csv
    180,4927448704,static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js,722,74,722,75,here
    181,4927448704,static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js,722,76,722,77,here
    182,4927448704,static/js/tinymce/jscripts/tiny_mce/plugins/paste/editor_plugin_src.js,722,78,722,79,here

Changes:
- Introduce scli-dyys, a random id string for later identification and removal of
  dummy table rows.

- Keep the struct_id_4055 column to join tables as needed.

- Output is now written to a directory as there are always multiple files.
---
 bin/sarif-extract-tables | 46 +++++++++++++++++++++++++++++++++++-----
 sarif_cli/signature.py   | 23 +++++++++++---------
 2 files changed, 54 insertions(+), 15 deletions(-)

diff --git a/bin/sarif-extract-tables b/bin/sarif-extract-tables
index 0159ea4..6e5a24d 100755
--- a/bin/sarif-extract-tables
+++ b/bin/sarif-extract-tables
@@ -3,6 +3,7 @@
 """
 import argparse
 import json
+import pathlib
 from sarif_cli import signature
 from sarif_cli import typegraph
 import sys
@@ -14,6 +15,7 @@ import pandas as pd
 #
 parser = argparse.ArgumentParser(description='Read a sarif file and produce tabular output.')
 parser.add_argument('file', metavar='sarif-file', type=str, help='input file, - for stdin')
+parser.add_argument('outdir', metavar='output-dir', type=str, help='output directory')
 parser.add_argument('-f', '--output-format', metavar='format', type=str, default="csv",
                     help='Output format for table.  Currently just csv; '
                     '  other formats supported by pandas can be added.')
@@ -100,15 +102,15 @@ sf = lambda num: tgraph.dataframes['Struct' + str(num)]
 af = lambda num: tgraph.dataframes['Array' + str(num)]
 
 # 
-# Form the dataframe via joins
+# Form the message dataframe via joins
 # 
 d1 = (
     sf(4055)
     .merge(af('0350'), how="left", left_on='locations', right_on='array_id', validate="1:m")
-    .drop(columns=['struct_id', 'locations', 'array_id', 'value_index', 'type_at_index'])
+    .drop(columns=['locations', 'array_id', 'value_index', 'type_at_index'])
     .merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id',
            suffixes=("_4055", "_2683"), validate="1:m")
-    .drop(columns=['struct_id', 'id_or_value_at_index'])
+    .drop(columns=['struct_id_2683', 'id_or_value_at_index'])
     .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
     .drop(columns=['struct_id', 'physicalLocation'])
     .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
@@ -135,14 +137,48 @@ d1 = (
 # 
 # Reproduce ALL `file:line:col:line:col: message` entries as a table
 # 
-d2 = (d1[['uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text_4055']]
+d2 = (d1[['struct_id_4055', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text_4055']]
       .rename({'text_4055': 'message'}, axis='columns'))
 
+# 
+# Form the relatedLocation dataframe via joins.  This is subtly different from d1:
+# left_on=relatedLocations, and no left_on='message_4055'
+dr1 = (
+    sf(4055)
+    .merge(af('0350'), how="left", left_on='relatedLocations', right_on='array_id', validate="1:m")
+    .drop(columns=['relatedLocations', 'array_id', 'value_index', 'type_at_index'])
+    # 
+    .merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id',
+           suffixes=("_4055", "_2683"), validate="1:m")
+    .drop(columns=['struct_id_2683', 'id_or_value_at_index'])
+    .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
+    # 
+    .drop(columns=['struct_id', 'physicalLocation'])
+    .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
+    .drop(columns=['struct_id', 'region'])
+    .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m")
+    # 
+    .drop(columns=['struct_id', 'artifactLocation'])
+    .merge(sf(2774), how="left", left_on='message_2683', right_on='struct_id', validate="1:m")
+    .drop(columns=['struct_id', 'message_2683'])
+)
+
+dr2 = (dr1[['struct_id_4055', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text']]
+      .rename({'text': 'message'}, axis='columns'))
+
+# Remove dummy locations previously injected by signature.fillsig
+dr3 = dr2[dr2.uri != 'scli-dyys dummy value']
+
 #
 # Write output
 #
 if args.output_format == 'csv':
-    d2.to_csv(sys.stdout, index_label='index')
+    p = pathlib.Path(args.outdir)
+    p.mkdir(exist_ok=True)
+    with p.joinpath('messages.csv').open(mode='wb') as messages:
+        d2.to_csv(messages, index_label='index')
+    with p.joinpath('relatedLocations.csv').open(mode='wb') as relo:
+        dr3.to_csv(relo, index_label='index')
 
 else:
     sys.stderr.write("unknown output format")
diff --git a/sarif_cli/signature.py b/sarif_cli/signature.py
index 59f9a27..418df09 100644
--- a/sarif_cli/signature.py
+++ b/sarif_cli/signature.py
@@ -203,12 +203,15 @@ properties_keys = set([first for first, _ in
                          ('sub-severity', 'String'),
                          ('tags', 'Array003'),
                         ]])
-dummy_properties = { 'kind' : 'unspecified',
-                     'precision' : 'unspecified',
-                     'security-severity' : 'unspecified',
-                     'severity' : 'unspecified',
-                     'sub-severity' : 'unspecified',
-                     'tags' : ['unspecified'],
+# 
+# scli-dyys is a random id string for later identification of dummy values
+# 
+dummy_properties = { 'kind' : 'scli-dyys dummy value',
+                     'precision' : 'scli-dyys dummy value',
+                     'security-severity' : 'scli-dyys dummy value',
+                     'severity' : 'scli-dyys dummy value',
+                     'sub-severity' : 'scli-dyys dummy value',
+                     'tags' : ['scli-dyys dummy value'],
                     }
 
 relatedLocations_keys = set([first for first, _ in
@@ -221,16 +224,16 @@ dummy_newlineSequences = ['\r\n', '\n', '\u2028', '\u2029']
 
 dummy_relatedLocations_entry = [
     {'id': -1,
-     'physicalLocation': {'artifactLocation': {'uri': '',
-                                               'uriBaseId': '%SRCROOT%',
+     'physicalLocation': {'artifactLocation': {'uri': 'scli-dyys dummy value',
+                                               'uriBaseId': 'scli-dyys dummy value',
                                                'index': -1},
                           'region': {'startLine': -1, 
                                      'startColumn': -1,
                                      'endLine': -1, 
                                      'endColumn': -1}},
-     'message': {'text': ''}}]
+     'message': {'text': 'scli-dyys dummy value'}}]
 
-dummy_message_entry = {'text': ''}
+dummy_message_entry = {'text': 'scli-dyys dummy value'}
 
 def fillsig_dict(args, elem, context):
     """ Fill in the missing fields in dictionary signatures.