Add endpoints-only option for path output and a collection of usage samples

2025-12-15 17:03:04 +01:00 · 2021-12-21 14:04:46 -08:00
parent 79649a6226
commit 558e218d3b
5 changed files with 183 additions and 7832 deletions
--- a/README.org
+++ b/README.org
@@ -3,12 +3,23 @@

  Each of these tools present a high-level command-line interface to extract a
  specific subset of information from a SARIF file.  The format of each tool's
-  /output/ is versioned and, as much as possible, independent of the input.
+  /output/ will be versioned and, as much as possible, independent of the input.

-  It is the intent of these tools to
-  - hide the internals of sarif when /used/
-  - provide examples of extracting information from SARIF files /while writing
-    your own/ or extending the tools
+  For human use and to fit with existing tools, the default output format is
+  line-oriented and resembles compiler error formatting.
+
+  The goal of this tool set is to support working with SARIF files
+  - at the shell / file level,
+  - across multiple versions of the same SARIF result set,
+  - and across many repositories.
+
+  The implementation language is Python, but that is a detail.  The scripts should
+  work well when used with other shell tools, especially =diff= and =git=.
+
+  # It is the intent of these tools to
+  # - hide the internals of sarif when /used/,
+  # - provide examples of extracting information from SARIF files /while writing
+  #   your own/ or extending the tools.

 * Setup for development
  This repository uses =git lfs= for some larger files; installation steps are at
--- a/bin/sarif-results-summary
+++ b/bin/sarif-results-summary
@@ -12,6 +12,8 @@ parser.add_argument('-s', '--list-source', metavar='srcroot', type=str,
                    help='list source snippets using srcroot as sarif SRCROOT')
 parser.add_argument('-r', '--related-locations', action="store_true",
                    help='list related locations like "hides [parameter](1)"')
+parser.add_argument('-e', '--endpoints-only', action="store_true",
+                    help='only list source and sink, dropping the path.  Identical, successive source/sink pairs are combined')
 # TODO mutually exclusive options
 parser.add_argument('-c', '--csv', action="store_true",
                    help='output csv instead of human-readable summary')
@@ -59,7 +61,6 @@ for runi in S.indices(sarif_struct, 'runs'):
                    for line, line_num in zip(lines, range(l1, l2+1)):
                        S.display_underlined(l1, c1, l2, c2, line, line_num)
            if args.related_locations:
-                # Full path: S.get(sarif_struct, 'runs', runi, 'results', resi, 'relatedLocations')
                relatedLocations = result.get('relatedLocations', None)
                if type(relatedLocations) == list:
                    # Linking is explicit in output, so no need to get id(s) from message string.
@@ -89,15 +90,37 @@ for runi in S.indices(sarif_struct, 'runs'):
                                        S.display_underlined(l1, c1, l2, c2, line, line_num)
        if 'codeFlows' in result:
            # Path problems
+            last_codeFlow = None
            for codefi in S.indices(result, 'codeFlows'):
                codeFlow = S.get(result, 'codeFlows', codefi)
                if args.csv:
                    S.write_csv(cw, "path", codefi)
                else:
                    S.msg("PATH %d\n" % codefi)
+                    
                for threadi in S.indices(codeFlow, 'threadFlows'):
                    threadFlow = S.get(codeFlow, 'threadFlows', threadi)
-                    for loci in S.indices(threadFlow, 'locations'):
+
+                    if args.endpoints_only:
+                        # 
+                        # Pick the range to list only the endpoints (source/sink) of a threadFlow.
+                        # 
+                        t1 = S.indices(threadFlow, 'locations')
+                        location_range = [t1[0], t1[-1]]
+                        # 
+                        # If the previous path had the same (source,sink) pair,
+                        # we don't need to repeat it.
+                        # 
+                        if (last_codeFlow and
+                            ( S.get(last_codeFlow, 'threadFlows', threadi, 'locations', 0) ==
+                              S.get(codeFlow, 'threadFlows', threadi, 'locations', 0)) and
+                            ( S.get(last_codeFlow, 'threadFlows', threadi, 'locations', -1) ==
+                              S.get(codeFlow, 'threadFlows', threadi, 'locations', -1))):
+                            continue
+                    else:
+                        location_range = S.indices(threadFlow, 'locations')
+
+                    for loci in location_range:
                        location = S.get(threadFlow, 'locations', loci, 'location')
                        message, artifact, region = S.get_relatedlocation_message_info(location)
                        if artifact == S.NoFile:
@@ -122,6 +145,7 @@ for runi in S.indices(sarif_struct, 'runs'):
                                else:
                                    for line, line_num in zip(lines, range(l1, l2+1)):
                                        S.display_underlined(l1, c1, l2, c2, line, line_num)
+                last_codeFlow = codeFlow
        if args.csv:
            pass
        else:
--- a/data/treeio/test_set_1.yaml
+++ b/data/treeio/test_set_1.yaml
--- a/docs/sarif-handling.org
+++ b/docs/sarif-handling.org
@@ -61,7 +61,8 @@
       "'$.fn." + plugin.getPluginName() + "' plugin"
   #+end_src

-   Results are
+   The full results are found in [[file:../data/treeio/results.yaml::Potential XSS vulnerability in the \['$.fn.datepicker' plugin\](1).][results.yaml]], with a testing subset in [[file:../data/treeio/test_set_1.yaml::Potential XSS vulnerability in the \['$.fn.datepicker'
+ plugin\](1).][test_set_1.yaml]]; the results for this query are 
   #+BEGIN_SRC text
     message:
       text: |-
@@ -71,7 +72,7 @@
   #+END_SRC
   with 3 =relatedLocations= and 6 =threadFlows=.

-   The the original query's first column is a sink (=sink.getNode()=), so the
+   The original query's first column is a sink (=sink.getNode()=), so the
   =threadFlows= should terminate there -- and they do.
   #+BEGIN_SRC text
     locations:
@@ -152,6 +153,78 @@
   obvious connections between them.  More importantly, the ordering is
   consistent. 

+** Multiple message values and source/sink pairs
+   As a special case of [[*Multiple message values and flow paths][Multiple message values and flow paths]], we can report only
+   the (source, sink) pairs and drop the flow paths.  This is useful in result
+   reports spanning many repositories and multiple tools.
+
+   Considering
+   #+BEGIN_SRC text
+     Potential XSS vulnerability in the ['$.fn.datepicker' plugin](1).
+   #+END_SRC
+   found in [[file:../data/treeio/test_set_1.yaml::Potential XSS vulnerability in the \['$.fn.datepicker'    plugin\](1).][test_set_1.yaml]], stripping the =threadFlows= paths, and looking at the
+   first two =threadFlows= gives the following simplified structure.
+   Note that without the flow paths, the first two code flows (paths 0 and 1)
+   reduce to identical =(source, sink)= pairs; the same holds for paths 2,3 and 4,5.
+
+   #+BEGIN_SRC yaml
+     - ruleId: com.lgtm/javascript-queries:js/unsafe-jquery-plugin
+       codeFlows:
+         - threadFlows:
+             - locations:
+                 - location:
+                     physicalLocation:
+                       artifactLocation:
+                         uri: static/js/jquery-ui-1.10.3/ui/jquery-ui.js
+                         uriBaseId: '%SRCROOT%'
+                         index: 72
+                       region:
+                         startLine: 9598
+                         startColumn: 28
+                         endColumn: 35
+                     message:
+                       text: options
+                 - location:
+                     physicalLocation:
+                       artifactLocation:
+                         uri: static/js/jquery-ui-1.10.3/ui/jquery.ui.datepicker.js
+                         uriBaseId: '%SRCROOT%'
+                         index: 61
+                       region:
+                         startLine: 1027
+                         startColumn: 6
+                         endColumn: 14
+                     message:
+                       text: altField
+         - threadFlows:
+             - locations:
+                 - location:
+                     physicalLocation:
+                       artifactLocation:
+                         uri: static/js/jquery-ui-1.10.3/ui/jquery-ui.js
+                         uriBaseId: '%SRCROOT%'
+                         index: 72
+                       region:
+                         startLine: 9598
+                         startColumn: 28
+                         endColumn: 35
+                     message:
+                       text: options
+                 - location:
+                     physicalLocation:
+                       artifactLocation:
+                         uri: static/js/jquery-ui-1.10.3/ui/jquery.ui.datepicker.js
+                         uriBaseId: '%SRCROOT%'
+                         index: 61
+                       region:
+                         startLine: 1027
+                         startColumn: 6
+                         endColumn: 14
+                     message:
+                       text: altField
+
+   #+END_SRC
+
 # 
 #+OPTIONS: ^:{}

--- a/scripts/file-level-tests.sh
+++ b/scripts/file-level-tests.sh
@@ -0,0 +1,66 @@
+# -*- sh -*-
+# The purpose of this tool set is working with sarif at the shell / file level,
+# across multiple versions of the same sarif result set, and across many
+# repositories.
+#
+# These tests mirror that goal: they work on files using the tools and use
+# standard unix utilities to verify contents.
+# 
+
+sarif-results-summary  -h
+
+#
+# Simple failure checks.  These should produce no output.
+# 
+test_files="
+../data/wxWidgets_wxWidgets__2021-11-21_16_06_30__export.sarif
+../data/torvalds_linux__2021-10-21_10_07_00__export.sarif
+../data/treeio/results.sarif
+"
+for file in $test_files ; do
+    sarif-results-summary $file > /dev/null 
+done
+for file in $test_files ; do
+    sarif-results-summary -r $file > /dev/null 
+done
+          
+#
+# The following are for iterating and evolving result inspection to find test
+# cases covering the different output options.  They are intended for manual use
+# and review.
+#
+read -r file srcroot <<< "../data/treeio/results.sarif ../data/treeio/treeio"
+
+# All results, minimal output
+sarif-results-summary             $file | less
+
+# All results, related locations output
+sarif-results-summary -r           $file | less
+
+# All results, related locations and source output
+sarif-results-summary -r -s $srcroot $file | less
+
+# single-line result, no flow steps
+start="sanitizer.py:8:1:8:16"
+sarif-results-summary             $file | sed -n "/$start/,/RESULT/p" | sed '$d' | less
+
+# single-line result, with flow steps
+start="treeio.core.middleware.chat.py:395:29:395:33"
+sarif-results-summary             $file | sed -n "/$start/,/RESULT/p" | sed '$d' | less
+
+# single-line result, with flow steps, with relatedLocations
+start="treeio.core.middleware.chat.py:395:29:395:33"
+sarif-results-summary -r           $file | sed -n "/$start/,/RESULT/p" | sed '$d' | less
+
+# single-line result, with flow steps compacted
+start="treeio.core.middleware.chat.py:395:29:395:33"
+sarif-results-summary -e          $file | sed -n "/$start/,/RESULT/p" | sed '$d' | less
+
+# multi-line result, no flow steps, with relatedLocations and source
+start=editor_plugin_src.js:722:72:722:73
+sarif-results-summary -r -s $srcroot $file | sed -n "/$start/,/RESULT/p" | sed '$d' | less
+
+# multi-line result, with flow steps, with relatedLocations and source
+start=modal-form.html:89:35:93:14
+sarif-results-summary -r -s $srcroot $file | sed -n "/$start/,/RESULT/p" | sed '$d' | less
+