Add endpoints-only option for path output and a collection of usage samples

2025-12-16 01:13:03 +01:00 · 2021-12-21 14:04:46 -08:00
parent 79649a6226
commit 558e218d3b
5 changed files with 183 additions and 7832 deletions
--- a/README.org
+++ b/README.org
@@ -3,12 +3,23 @@
  Each of these tools presents a high-level command-line interface to extract a
  specific subset of information from a SARIF file.  The format of each tool's
-  /output/ is versioned and, as much as possible, independent of the input.
+  /output/ will be versioned and, as much as possible, independent of the input.
-  It is the intent of these tools to
+  For human use and to fit with existing tools, the default output format is
-  - hide the internals of sarif when /used/
+  line-oriented and resembles compiler error formatting.
-  - provide examples of extracting information from SARIF files /while writing
+
-    your own/ or extending the tools
+  The goal of this tool set is to support working with sarif files 
  - at the shell / file level, 
  - across multiple versions of the same sarif result set, 
  - and across many repositories.
  The implementation language is Python, but that is a detail.  The scripts should
  work well when used with other shell tools, especially =diff= and =git=.
  # It is the intent of these tools to
  # - hide the internals of sarif when /used/,
  # - provide examples of extracting information from SARIF files /while writing
  #   your own/ or extending the tools.
 * Setup for development
  This repository uses =git lfs= for some larger files; installation steps are at
--- a/bin/sarif-results-summary
+++ b/bin/sarif-results-summary
@@ -12,6 +12,8 @@ parser.add_argument('-s', '--list-source', metavar='srcroot', type=str,
                    help='list source snippets using srcroot as sarif SRCROOT')
 parser.add_argument('-r', '--related-locations', action="store_true",
                    help='list related locations like "hides [parameter](1)"')
 parser.add_argument('-e', '--endpoints-only', action="store_true",
                    help='only list source and sink, dropping the path.  Identical, successive source/sink pairs are combined')
 # TODO mutually exclusive options
 parser.add_argument('-c', '--csv', action="store_true",
                    help='output csv instead of human-readable summary')
@@ -59,7 +61,6 @@ for runi in S.indices(sarif_struct, 'runs'):
                    for line, line_num in zip(lines, range(l1, l2+1)):
                        S.display_underlined(l1, c1, l2, c2, line, line_num)
            if args.related_locations:
                # Full path: S.get(sarif_struct, 'runs', runi, 'results', resi, 'relatedLocations')
                relatedLocations = result.get('relatedLocations', None)
                if type(relatedLocations) == list:
                    # Linking is explicit in output, so no need to get id(s) from message string.
@@ -89,15 +90,37 @@ for runi in S.indices(sarif_struct, 'runs'):
                                        S.display_underlined(l1, c1, l2, c2, line, line_num)
        if 'codeFlows' in result:
            # Path problems
            last_codeFlow = None
            for codefi in S.indices(result, 'codeFlows'):
                codeFlow = S.get(result, 'codeFlows', codefi)
                if args.csv:
                    S.write_csv(cw, "path", codefi)
                else:
                    S.msg("PATH %d\n" % codefi)
                for threadi in S.indices(codeFlow, 'threadFlows'):
                    threadFlow = S.get(codeFlow, 'threadFlows', threadi)
-                    for loci in S.indices(threadFlow, 'locations'):
+
                     # Choose which location indices of this threadFlow are listed by the
                     # loop that follows: only the two endpoints, or every location.
                     if args.endpoints_only:
                         #
                         # Pick the range to list only the endpoints (source/sink) of a threadFlow:
                         # the first and the last index of its 'locations'.
                         #
                         # NOTE(review): presumably t1[0] fails when 'locations' is empty, and a
                         # threadFlow with a single location yields the same index twice --
                         # confirm against S.indices and the expected output.
                         #
                         t1 = S.indices(threadFlow, 'locations')
                         location_range = [t1[0], t1[-1]]
                         #
                         # If the previous path had the same (source,sink) pair,
                         # we don't need to repeat it.
                         #
                         # last_codeFlow starts as None for each result, so the first codeFlow is
                         # never skipped.  The comparison looks at the first (0) and last (-1)
                         # location of the threadFlow at the same index threadi in both codeFlows.
                         #
                         # NOTE(review): assumes the previous codeFlow has a threadFlow at index
                         # threadi -- confirm how S.get behaves otherwise.  The "PATH" header / csv
                         # row for this codeFlow has already been written before this loop, so a
                         # skipped pair may leave a header with no locations under it.
                         #
                         if (last_codeFlow and
                             ( S.get(last_codeFlow, 'threadFlows', threadi, 'locations', 0) ==
                               S.get(codeFlow, 'threadFlows', threadi, 'locations', 0)) and
                             ( S.get(last_codeFlow, 'threadFlows', threadi, 'locations', -1) ==
                               S.get(codeFlow, 'threadFlows', threadi, 'locations', -1))):
                             continue
                     else:
                         # Full path output: list every location of the threadFlow.
                         location_range = S.indices(threadFlow, 'locations')
                    for loci in location_range:
                        location = S.get(threadFlow, 'locations', loci, 'location')
                        message, artifact, region = S.get_relatedlocation_message_info(location)
                        if artifact == S.NoFile:
@@ -122,6 +145,7 @@ for runi in S.indices(sarif_struct, 'runs'):
                                else:
                                    for line, line_num in zip(lines, range(l1, l2+1)):
                                        S.display_underlined(l1, c1, l2, c2, line, line_num)
                last_codeFlow = codeFlow
        if args.csv:
            pass
        else:
--- a/data/treeio/test_set_1.yaml
+++ b/data/treeio/test_set_1.yaml
--- a/docs/sarif-handling.org
+++ b/docs/sarif-handling.org
@@ -61,7 +61,8 @@
       "'$.fn." + plugin.getPluginName() + "' plugin"
   #+end_src
-   Results are
+   The full results are found in [[file:../data/treeio/results.yaml::Potential XSS vulnerability in the \['$.fn.datepicker' plugin\](1).][results.yaml]], with a testing subset in [[file:../data/treeio/test_set_1.yaml::Potential XSS vulnerability in the \['$.fn.datepicker' plugin\](1).][test_set_1.yaml]]; the results for this query are
   #+BEGIN_SRC text
     message:
       text: |-
@@ -71,7 +72,7 @@
   #+END_SRC
   with 3 =relatedLocations= and 6 =threadFlows=.
-   The the original query's first column is a sink (=sink.getNode()=), so the
+   The original query's first column is a sink (=sink.getNode()=), so the
   =threadFlows= should terminate there -- and they do.
   #+BEGIN_SRC text
     locations:
@@ -152,6 +153,78 @@
   obvious connections between them.  More importantly, the ordering is
   consistent. 
 ** Multiple message values and source/sink pairs
   As a special case of [[*Multiple message values and flow paths][Multiple message values and flow paths]], we can report only
   the (source, sink) pairs and drop the flow paths.  This is useful in result
   reports spanning many repositories and multiple tools.
   Considering
   #+BEGIN_SRC text
     Potential XSS vulnerability in the ['$.fn.datepicker' plugin](1).
   #+END_SRC
   found in [[file:../data/treeio/test_set_1.yaml::Potential XSS vulnerability in the \['$.fn.datepicker'    plugin\](1).][test_set_1.yaml]], stripping the =threadFlows= paths, and looking at the
   first two =threadFlows= gives the following simplified structure.
   Note that without the flow paths, the first two results are now identical
   =(source, sink)= pairs; the same holds for 2,3 and 4,5.
   #+BEGIN_SRC yaml
     - ruleId: com.lgtm/javascript-queries:js/unsafe-jquery-plugin
       codeFlows:
         - threadFlows:
             - locations:
                 - location:
                     physicalLocation:
                       artifactLocation:
                         uri: static/js/jquery-ui-1.10.3/ui/jquery-ui.js
                         uriBaseId: '%SRCROOT%'
                         index: 72
                       region:
                         startLine: 9598
                         startColumn: 28
                         endColumn: 35
                     message:
                       text: options
                 - location:
                     physicalLocation:
                       artifactLocation:
                         uri: static/js/jquery-ui-1.10.3/ui/jquery.ui.datepicker.js
                         uriBaseId: '%SRCROOT%'
                         index: 61
                       region:
                         startLine: 1027
                         startColumn: 6
                         endColumn: 14
                     message:
                       text: altField
         - threadFlows:
             - locations:
                 - location:
                     physicalLocation:
                       artifactLocation:
                         uri: static/js/jquery-ui-1.10.3/ui/jquery-ui.js
                         uriBaseId: '%SRCROOT%'
                         index: 72
                       region:
                         startLine: 9598
                         startColumn: 28
                         endColumn: 35
                     message:
                       text: options
                 - location:
                     physicalLocation:
                       artifactLocation:
                         uri: static/js/jquery-ui-1.10.3/ui/jquery.ui.datepicker.js
                         uriBaseId: '%SRCROOT%'
                         index: 61
                       region:
                         startLine: 1027
                         startColumn: 6
                         endColumn: 14
                     message:
                       text: altField
   #+END_SRC
 # 
 #+OPTIONS: ^:{}
--- a/scripts/file-level-tests.sh
+++ b/scripts/file-level-tests.sh
@@ -0,0 +1,66 @@
 # -*- sh -*-
 # The purpose of this tool set is working with sarif at the shell / file level,
 # across multiple versions of the same sarif result set, and across many
 # repositories.
 #
 # These tests mirror that goal: they work on files using the tools and use
 # standard unix utilities to verify contents.
 #
 # All data paths below are relative (../data/...), and sarif-results-summary is
 # invoked by bare name, so it has to be on PATH.
 #
 sarif-results-summary -h
 #
 # Simple failure checks.  These should produce no output.
 #
 # test_files is a whitespace-separated list of paths, none of which contains
 # blanks.  It is expanded unquoted on purpose so the shell splits it into one
 # word per file; each individual path is quoted where it is used.
 #
 test_files="
 ../data/wxWidgets_wxWidgets__2021-11-21_16_06_30__export.sarif
 ../data/torvalds_linux__2021-10-21_10_07_00__export.sarif
 ../data/treeio/results.sarif
 "
 # shellcheck disable=SC2086
 for file in $test_files ; do
   sarif-results-summary "$file" > /dev/null
 done
 # shellcheck disable=SC2086
 for file in $test_files ; do
   sarif-results-summary -r "$file" > /dev/null
 done
 #
 # The following are for iterating and evolving result inspection to find test
 # cases covering the different output options.  They are intended for manual use
 # and review.
 #
 # Plain assignments keep this section usable from any POSIX sh, matching the
 # mode line above.
 file="../data/treeio/results.sarif"
 srcroot="../data/treeio/treeio"
 # All results, minimal output
 sarif-results-summary "$file" | less
 # All results, related locations output
 sarif-results-summary -r "$file" | less
 # All results, related locations and source output
 sarif-results-summary -r -s "$srcroot" "$file" | less
 #
 # The single-result extractions below all use the same filter: the first sed
 # prints from the first line matching $start through the next line matching
 # RESULT, and the second sed drops that final line.  $start is used as a
 # regular expression, so each '.' in it matches any character.
 #
 # single-line result, no flow steps
 start="sanitizer.py:8:1:8:16"
 sarif-results-summary "$file" | sed -n "/$start/,/RESULT/p" | sed '$d' | less
 # single-line result, with flow steps
 start="treeio.core.middleware.chat.py:395:29:395:33"
 sarif-results-summary "$file" | sed -n "/$start/,/RESULT/p" | sed '$d' | less
 # single-line result, with flow steps, with relatedLocations
 start="treeio.core.middleware.chat.py:395:29:395:33"
 sarif-results-summary -r "$file" | sed -n "/$start/,/RESULT/p" | sed '$d' | less
 # single-line result, with flow steps compacted
 start="treeio.core.middleware.chat.py:395:29:395:33"
 sarif-results-summary -e "$file" | sed -n "/$start/,/RESULT/p" | sed '$d' | less
 # multi-line result, no flow steps, with relatedLocations and source
 start="editor_plugin_src.js:722:72:722:73"
 sarif-results-summary -r -s "$srcroot" "$file" | sed -n "/$start/,/RESULT/p" | sed '$d' | less
 # multi-line result, with flow steps, with relatedLocations and source
 start="modal-form.html:89:35:93:14"
 sarif-results-summary -r -s "$srcroot" "$file" | sed -n "/$start/,/RESULT/p" | sed '$d' | less