Add endpoints-only option for path output and a collection of usage samples

This commit is contained in:
Michael Hohn
2021-12-21 14:04:46 -08:00
committed by =Michael Hohn
parent 79649a6226
commit 558e218d3b
5 changed files with 183 additions and 7832 deletions

View File

@@ -3,12 +3,23 @@
Each of these tools presents a high-level command-line interface to extract a Each of these tools presents a high-level command-line interface to extract a
specific subset of information from a SARIF file. The format of each tool's specific subset of information from a SARIF file. The format of each tool's
/output/ is versioned and, as much as possible, independent of the input. /output/ will be versioned and, as much as possible, independent of the input.
It is the intent of these tools to For human use and to fit with existing tools, the default output format is
- hide the internals of sarif when /used/ line-oriented and resembles compiler error formatting.
- provide examples of extracting information from SARIF files /while writing
your own/ or extending the tools The goal of this tool set is to support working with sarif files
- at the shell / file level,
- across multiple versions of the same sarif result set,
- and across many repositories.
The implementation language is Python, but that is a detail. The scripts should
work well when used with other shell tools, especially =diff= and =git=.
# It is the intent of these tools to
# - hide the internals of sarif when /used/,
# - provide examples of extracting information from SARIF files /while writing
# your own/ or extending the tools.
* Setup for development * Setup for development
This repository uses =git lfs= for some larger files; installation steps are at This repository uses =git lfs= for some larger files; installation steps are at

View File

@@ -12,6 +12,8 @@ parser.add_argument('-s', '--list-source', metavar='srcroot', type=str,
help='list source snippets using srcroot as sarif SRCROOT') help='list source snippets using srcroot as sarif SRCROOT')
parser.add_argument('-r', '--related-locations', action="store_true", parser.add_argument('-r', '--related-locations', action="store_true",
help='list related locations like "hides [parameter](1)"') help='list related locations like "hides [parameter](1)"')
parser.add_argument('-e', '--endpoints-only', action="store_true",
help='only list source and sink, dropping the path. Identical, successive source/sink pairs are combined')
# TODO mutually exclusive options # TODO mutually exclusive options
parser.add_argument('-c', '--csv', action="store_true", parser.add_argument('-c', '--csv', action="store_true",
help='output csv instead of human-readable summary') help='output csv instead of human-readable summary')
@@ -59,7 +61,6 @@ for runi in S.indices(sarif_struct, 'runs'):
for line, line_num in zip(lines, range(l1, l2+1)): for line, line_num in zip(lines, range(l1, l2+1)):
S.display_underlined(l1, c1, l2, c2, line, line_num) S.display_underlined(l1, c1, l2, c2, line, line_num)
if args.related_locations: if args.related_locations:
# Full path: S.get(sarif_struct, 'runs', runi, 'results', resi, 'relatedLocations')
relatedLocations = result.get('relatedLocations', None) relatedLocations = result.get('relatedLocations', None)
if type(relatedLocations) == list: if type(relatedLocations) == list:
# Linking is explicit in output, so no need to get id(s) from message string. # Linking is explicit in output, so no need to get id(s) from message string.
@@ -89,15 +90,37 @@ for runi in S.indices(sarif_struct, 'runs'):
S.display_underlined(l1, c1, l2, c2, line, line_num) S.display_underlined(l1, c1, l2, c2, line, line_num)
if 'codeFlows' in result: if 'codeFlows' in result:
# Path problems # Path problems
last_codeFlow = None
for codefi in S.indices(result, 'codeFlows'): for codefi in S.indices(result, 'codeFlows'):
codeFlow = S.get(result, 'codeFlows', codefi) codeFlow = S.get(result, 'codeFlows', codefi)
if args.csv: if args.csv:
S.write_csv(cw, "path", codefi) S.write_csv(cw, "path", codefi)
else: else:
S.msg("PATH %d\n" % codefi) S.msg("PATH %d\n" % codefi)
for threadi in S.indices(codeFlow, 'threadFlows'): for threadi in S.indices(codeFlow, 'threadFlows'):
threadFlow = S.get(codeFlow, 'threadFlows', threadi) threadFlow = S.get(codeFlow, 'threadFlows', threadi)
for loci in S.indices(threadFlow, 'locations'):
if args.endpoints_only:
#
# Pick the range to list only the endpoints (source/sink) of a threadFlow.
#
t1 = S.indices(threadFlow, 'locations')
location_range = [t1[0], t1[-1]]
#
# If the previous path had the same (source,sink) pair,
# we don't need to repeat it.
#
if (last_codeFlow and
( S.get(last_codeFlow, 'threadFlows', threadi, 'locations', 0) ==
S.get(codeFlow, 'threadFlows', threadi, 'locations', 0)) and
( S.get(last_codeFlow, 'threadFlows', threadi, 'locations', -1) ==
S.get(codeFlow, 'threadFlows', threadi, 'locations', -1))):
continue
else:
location_range = S.indices(threadFlow, 'locations')
for loci in location_range:
location = S.get(threadFlow, 'locations', loci, 'location') location = S.get(threadFlow, 'locations', loci, 'location')
message, artifact, region = S.get_relatedlocation_message_info(location) message, artifact, region = S.get_relatedlocation_message_info(location)
if artifact == S.NoFile: if artifact == S.NoFile:
@@ -122,6 +145,7 @@ for runi in S.indices(sarif_struct, 'runs'):
else: else:
for line, line_num in zip(lines, range(l1, l2+1)): for line, line_num in zip(lines, range(l1, l2+1)):
S.display_underlined(l1, c1, l2, c2, line, line_num) S.display_underlined(l1, c1, l2, c2, line, line_num)
last_codeFlow = codeFlow
if args.csv: if args.csv:
pass pass
else: else:

File diff suppressed because it is too large Load Diff

View File

@@ -61,7 +61,8 @@
"'$.fn." + plugin.getPluginName() + "' plugin" "'$.fn." + plugin.getPluginName() + "' plugin"
#+end_src #+end_src
Results are The full results are found in [[file:../data/treeio/results.yaml::Potential XSS vulnerability in the \['$.fn.datepicker' plugin\](1).][results.yaml]], with a testing subset in [[file:../data/treeio/test_set_1.yaml::Potential XSS vulnerability in the \['$.fn.datepicker'
plugin\](1).][test_set_1.yaml]]; the results for this query are
#+BEGIN_SRC text #+BEGIN_SRC text
message: message:
text: |- text: |-
@@ -71,7 +72,7 @@
#+END_SRC #+END_SRC
with 3 =relatedLocations= and 6 =threadFlows=. with 3 =relatedLocations= and 6 =threadFlows=.
The the original query's first column is a sink (=sink.getNode()=), so the The original query's first column is a sink (=sink.getNode()=), so the
=threadFlows= should terminate there -- and they do. =threadFlows= should terminate there -- and they do.
#+BEGIN_SRC text #+BEGIN_SRC text
locations: locations:
@@ -152,6 +153,78 @@
obvious connections between them. More importantly, the ordering is obvious connections between them. More importantly, the ordering is
consistent. consistent.
** Multiple message values and source/sink pairs
As a special case of [[*Multiple message values and flow paths][Multiple message values and flow paths]], we can report only
the (source, sink) pairs and drop the flow paths. This is useful in result
reports spanning many repositories and multiple tools.
Considering
#+BEGIN_SRC text
Potential XSS vulnerability in the ['$.fn.datepicker' plugin](1).
#+END_SRC
found in [[file:../data/treeio/test_set_1.yaml::Potential XSS vulnerability in the \['$.fn.datepicker' plugin\](1).][test_set_1.yaml]], stripping the =threadFlows= paths, and looking at the
first two =threadFlows= gives the following simplified structure.
Note that without the flow paths, the first two entries (0 and 1) are now identical
=(source, sink)= pairs; the same holds for entries 2,3 and 4,5.
#+BEGIN_SRC yaml
- ruleId: com.lgtm/javascript-queries:js/unsafe-jquery-plugin
codeFlows:
- threadFlows:
- locations:
- location:
physicalLocation:
artifactLocation:
uri: static/js/jquery-ui-1.10.3/ui/jquery-ui.js
uriBaseId: '%SRCROOT%'
index: 72
region:
startLine: 9598
startColumn: 28
endColumn: 35
message:
text: options
- location:
physicalLocation:
artifactLocation:
uri: static/js/jquery-ui-1.10.3/ui/jquery.ui.datepicker.js
uriBaseId: '%SRCROOT%'
index: 61
region:
startLine: 1027
startColumn: 6
endColumn: 14
message:
text: altField
- threadFlows:
- locations:
- location:
physicalLocation:
artifactLocation:
uri: static/js/jquery-ui-1.10.3/ui/jquery-ui.js
uriBaseId: '%SRCROOT%'
index: 72
region:
startLine: 9598
startColumn: 28
endColumn: 35
message:
text: options
- location:
physicalLocation:
artifactLocation:
uri: static/js/jquery-ui-1.10.3/ui/jquery.ui.datepicker.js
uriBaseId: '%SRCROOT%'
index: 61
region:
startLine: 1027
startColumn: 6
endColumn: 14
message:
text: altField
#+END_SRC
# #
#+OPTIONS: ^:{} #+OPTIONS: ^:{}

View File

@@ -0,0 +1,66 @@
# -*- sh -*-
# Usage samples and smoke checks for the sarif-results-summary command.
#
# The purpose of this tool set is to work with sarif at the shell / file level,
# across multiple versions of the same sarif result set, and across many
# repositories.
#
# These tests mirror that goal: they work on files using the tools and use
# standard unix utilities to verify contents.
#
# NOTE(review): the here-string (<<<) used further down is not POSIX sh, so
# this file presumably needs bash or a compatible shell -- confirm.
#
# Show the command-line help.
sarif-results-summary -h
#
# Simple failure checks. These should produce no output: stdout is sent to
# /dev/null, so anything that still appears was written to stderr.
#
test_files="
../data/wxWidgets_wxWidgets__2021-11-21_16_06_30__export.sarif
../data/torvalds_linux__2021-10-21_10_07_00__export.sarif
../data/treeio/results.sarif
"
# Default summary for each test file.
for file in $test_files ; do
sarif-results-summary $file > /dev/null
done
# Same files again, with related locations (-r) enabled.
for file in $test_files ; do
sarif-results-summary -r $file > /dev/null
done
#
# The following are for iterating and evolving result inspection to find test
# cases covering the different output options. They are intended for manual use
# and review; each command pipes into less and waits for the reader.
#
# Split the string on whitespace: first word -> file, second word -> srcroot.
read -r file srcroot <<< "../data/treeio/results.sarif ../data/treeio/treeio"
# All results, minimal output
sarif-results-summary $file | less
# All results, related locations output
sarif-results-summary -r $file | less
# All results, related locations and source output
sarif-results-summary -r -s $srcroot $file | less
#
# The remaining samples isolate one result from the full listing:
#   sed -n "/$start/,/RESULT/p"  prints from a line matching $start through the
#                                next line matching RESULT;
#   sed '$d'                     drops the last line of that output.
# $start is used as a regular expression, so each "." in it matches any
# character, not only a literal dot.
#
# single-line result, no flow steps
start="sanitizer.py:8:1:8:16"
sarif-results-summary $file | sed -n "/$start/,/RESULT/p" | sed '$d' | less
# single-line result, with flow steps
start="treeio.core.middleware.chat.py:395:29:395:33"
sarif-results-summary $file | sed -n "/$start/,/RESULT/p" | sed '$d' | less
# single-line result, with flow steps, with relatedLocations
start="treeio.core.middleware.chat.py:395:29:395:33"
sarif-results-summary -r $file | sed -n "/$start/,/RESULT/p" | sed '$d' | less
# single-line result, with flow steps compacted (-e, --endpoints-only)
start="treeio.core.middleware.chat.py:395:29:395:33"
sarif-results-summary -e $file | sed -n "/$start/,/RESULT/p" | sed '$d' | less
# multi-line result, no flow steps, with relatedLocations and source
start=editor_plugin_src.js:722:72:722:73
sarif-results-summary -r -s $srcroot $file | sed -n "/$start/,/RESULT/p" | sed '$d' | less
# multi-line result, with flow steps, with relatedLocations and source
start=modal-form.html:89:35:93:14
sarif-results-summary -r -s $srcroot $file | sed -n "/$start/,/RESULT/p" | sed '$d' | less