diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..48213d9 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.sarif filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c0ca852 --- /dev/null +++ b/.gitignore @@ -0,0 +1,15 @@ +# Compiled python modules. +*.pyc + +# Setuptools distribution folder. +/dist/ + +# Python egg metadata, regenerated from source files by setuptools. +/*.egg-info + +# virtual environment +/.venv/ + +# Backup files +*~ + diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..1fcc1e5 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include README.org diff --git a/README.org b/README.org new file mode 100644 index 0000000..3f391d9 --- /dev/null +++ b/README.org @@ -0,0 +1,55 @@ +* Collection of cli tools for SARIF processing + This is a work in progress; the plan is as follows: + + Each of these tools presents a high-level command-line interface to extract a + specific subset of information from a SARIF file. The format of each tool's + /output/ is versioned and, as much as possible, independent of the input. + + It is the intent of these tools to + - hide the internals of SARIF when /used/ + - provide examples of extracting information from SARIF files while writing your + own or extending the tools + +* Setup for development + Set up the virtual environment and install the packages: + # pip freeze > requirements.txt + #+BEGIN_SRC sh + python3 -m venv .venv + . .venv/bin/activate + python3 -m pip install -r requirements.txt + # Or separately: + pip install --upgrade pip + pip install ipython pyyaml + #+END_SRC + + "Install" for local development: + #+BEGIN_SRC sh + pip install -e . + #+END_SRC + +* Sample Data + The query results in =data/= are taken from lgtm.com, which ran the + : ql/$LANG/ql/src/codeql-suites/$LANG-lgtm.qls + queries. 
+ + The linux kernel has both single-location results (="kind": "problem"=) and path + results (="kind": "path-problem"=). It also has results for multiple source + languages. + + The subset of files referenced by the sarif results is in =data/linux-small/= + and is taken from + #+begin_src javascript + "versionControlProvenance": [ + { + "repositoryUri": "https://github.com/torvalds/linux.git", + "revisionId": "d9abdee5fd5abffd0e763e52fbfa3116de167822" + } + ] + #+end_src + +* Commands + + + +#+OPTIONS: ^:{} + diff --git a/bin/json-to-yaml b/bin/json-to-yaml new file mode 100755 index 0000000..ee42064 --- /dev/null +++ b/bin/json-to-yaml @@ -0,0 +1,6 @@ +#!/usr/bin/env python +import json +import yaml +import sys + +yaml.dump(json.load(sys.stdin), stream=sys.stdout) diff --git a/bin/sarif-digest b/bin/sarif-digest new file mode 100755 index 0000000..75470ab --- /dev/null +++ b/bin/sarif-digest @@ -0,0 +1,38 @@ +#!/usr/bin/env python +import json +import sarif_cli as S +import sys + +# TODO command-line: sarif-digest [] +# +# Reduce size: every list longer than two elements is cut to a banner plus its first and last element +fpath = sys.argv[1] # path of the file to digest; raises IndexError when no argument is given +with open(fpath, 'r') as fp: + sarif_struct = json.load(fp) + +def _show_dict(elem, context): # context is unused; each value is compacted with its key as the new context + return {key : _compact(val, key) for key, val in elem.items()} + +def _show_list(elem, context): # context is unused here as well + if len(elem) > 2: + # first and last + return ["------------%d items, showing first and last ----------" % len(elem), + _compact(elem[0], 0), + _compact(elem[-1], -1)] + if len(elem) > 0: + return [_compact(elem[i], i) for i in range(0, len(elem))] + else: + return elem + +def _compact(elem, context): # dispatch on the exact type; anything but dict/list is returned unchanged + t = type(elem) + if t == dict: + return _show_dict(elem, context) + elif t == list: + return _show_list(elem, context) + else: + return elem + +json.dump(_compact(sarif_struct, "starting"), sys.stdout, indent=2) # write the digest to stdout as indented JSON + + diff --git a/bin/sarif-labeled b/bin/sarif-labeled new file mode 100644 index 0000000..4e06158 --- /dev/null +++ b/bin/sarif-labeled @@ -0,0 +1,50 @@ +#!/usr/bin/env 
python +import argparse +import json +import sarif_cli as S +import sys +import collections + +# TODO +# require python 3.7+ for ordered dictionaries? + +parser = argparse.ArgumentParser(description='Output a sarif file with labeled paths preceding arrays and objects') +parser.add_argument('file', metavar='file', type=str, help='input file, - for stdin') + +args = parser.parse_args() +with open(args.file, 'r') if args.file != '-' else sys.stdin as fp: + sarif_struct = json.load(fp) + +def _label_dict(elem, path): + d = collections.OrderedDict() + for key, val in elem.items(): + subpath = path + "['%s']" % key + if type(val) in [dict, list]: # announce containers with a path-named marker entry + d[subpath] = "----path----" + d[key] = _label(val, subpath) + return d + +def _label_list(elem, path): + if len(elem) > 0: + l = [] + for i in range(0, len(elem)): + subpath = path + "[%d]" % i + if i % 4 == 0: # path label before every fourth element + l.append("---- %s ----" % subpath) + l.append(_label(elem[i], subpath)) + return l + else: + return elem + +def _label(elem, path): + t = type(elem) + if t == dict: + return _label_dict(elem, path) + elif t == list: + return _label_list(elem, path) + else: + return elem + +json.dump(_label(sarif_struct, "sarif_struct"), sys.stdout, indent=2) + + diff --git a/bin/sarif-list-files b/bin/sarif-list-files new file mode 100755 index 0000000..d3c39e6 --- /dev/null +++ b/bin/sarif-list-files @@ -0,0 +1,46 @@ +#!/usr/bin/env python +import argparse +import json +import sarif_cli as S +import sys +import collections + +parser = argparse.ArgumentParser(description='list source files referenced by sarif file') +parser.add_argument('file', metavar='sarif-file', type=str, + help='input file, - for stdin') +args = parser.parse_args() + +# Grab the file +with open(args.file, 'r') if args.file != '-' else sys.stdin as fp: + sarif_struct = json.load(fp) + +# Make sure there are some results +num_results = len(S.get(sarif_struct, 'runs', 0, 'results')) +if num_results == 0: + sys.exit(0) # sarif_cli has no exit(); S.exit raised AttributeError + +# Collect the file names +uris = set() + +# 
Locations for @kind problem +# e.g., +# sarif_struct['runs'][0]['results'][5]['locations'][0]['physicalLocation']['artifactLocation'] +for resi in range(0, len(S.get(sarif_struct, 'runs', 0, 'results'))): + uri = S.get(sarif_struct, 'runs', 0, 'results', resi, 'locations', 0, + 'physicalLocation', 'artifactLocation', 'uri') + uris.add(uri) + +# Locations for @kind path-problem +# e.g. sarif_struct['runs'][0]['results'][22]['codeFlows'][0]['threadFlows'][0]['locations'][1]['location'] +for resi in range(0, len(S.get(sarif_struct, 'runs', 0, 'results'))): + if 'codeFlows' in S.get(sarif_struct, 'runs', 0, 'results', resi).keys(): + locations = S.get(sarif_struct, 'runs', 0, 'results', resi, 'codeFlows', 0, + 'threadFlows', 0, 'locations') + for loci in range(0, len(locations)): + uri = S.get(locations, loci, 'location', 'physicalLocation', + 'artifactLocation', 'uri') + uris.add(uri) +uris = list(uris) +uris.sort() # the set is unordered; print the uris in sorted order +for u in uris: + print(u) diff --git a/bin/sarif-results-summary b/bin/sarif-results-summary new file mode 100644 index 0000000..4099c54 --- /dev/null +++ b/bin/sarif-results-summary @@ -0,0 +1,32 @@ +#!/usr/bin/env python +import argparse +import json +import sarif_cli as S +import sys +import collections + +parser = argparse.ArgumentParser(description='summary of results') +parser.add_argument('file', metavar='sarif-file', type=str, help='input file, - for stdin') + +args = parser.parse_args() +with open(args.file, 'r') if args.file != '-' else sys.stdin as fp: + sarif_struct = json.load(fp) + +num_results = len(S.get(sarif_struct, 'runs', 0, 'results')) +S.msg("Found %d results\n\n" % num_results) +if num_results == 0: + sys.exit(0) # sarif_cli has no exit(); S.exit raised AttributeError + +for resi in range(0, len(S.get(sarif_struct, 'runs', 0, 'results'))): + message = S.get(sarif_struct, 'runs', 0, 'results', resi, 'message', 'text') + artifact = S.get(sarif_struct, 'runs', 0, 'results', resi, 'locations', 0, + 'physicalLocation', 'artifactLocation') + region = S.get(sarif_struct, 'runs', 0, 'results', 
resi, 'locations', 0, + 'physicalLocation', 'region') + filepath = "%s:%d:%d" % (artifact['uri'], region['startLine'], + region.get('startColumn', -1)) # -1 stands in for a missing startColumn + S.msg("%s: %s\n" % (filepath, message)) + + + + diff --git a/data/torvalds_linux__2021-10-21_10_07_00__export.sarif b/data/torvalds_linux__2021-10-21_10_07_00__export.sarif new file mode 100644 index 0000000..3106831 --- /dev/null +++ b/data/torvalds_linux__2021-10-21_10_07_00__export.sarif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6336c9f7d5c6f6c21ccfd5a5a63adcad710c4ddbe326ec37b5964ac172f8e817 +size 1196634 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..807c031 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,18 @@ +appnope==0.1.2 +attrs==21.2.0 +backcall==0.2.0 +decorator==5.1.0 +ipython==7.28.0 +jedi==0.18.0 +matplotlib-inline==0.1.3 +parso==0.8.2 +pbr==5.6.0 +pexpect==4.8.0 +pickleshare==0.7.5 +prompt-toolkit==3.0.20 +ptyprocess==0.7.0 +Pygments==2.10.0 +PyYAML==6.0 +sarif-om==1.0.4 +traitlets==5.1.0 +wcwidth==0.2.5 diff --git a/sarif_cli/__init__.py b/sarif_cli/__init__.py new file mode 100644 index 0000000..67709c7 --- /dev/null +++ b/sarif_cli/__init__.py @@ -0,0 +1,18 @@ +import sys + +MIN_PYTHON = (3, 7) # enforced below, when the package is imported +if sys.version_info < MIN_PYTHON: + sys.exit("Python %s.%s or later is required.\n" % MIN_PYTHON) + +def get(sarif_struct, *path): + """ Return the entry of SARIF_STRUCT reached by indexing with each element of PATH in turn """ + res = sarif_struct + for p in path: + res = res[p] + return res + +def msg(message): + """ Write MESSAGE to stdout, followed by a newline """ + sys.stdout.write(message) + sys.stdout.write('\n') + diff --git a/sarif_cli/tests/__init__.py b/sarif_cli/tests/__init__.py new file mode 100644 index 0000000..792d600 --- /dev/null +++ b/sarif_cli/tests/__init__.py @@ -0,0 +1 @@ +# diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..f95913a --- /dev/null +++ b/setup.py @@ -0,0 +1,17 @@ +from setuptools import setup +import glob + +setup(name='sarif_cli', + version='0.1', 
+ description='Collection of command line tools for sarif files', + url='https://github.com/hohn/sarif-cli', + author='Michael Hohn', + author_email='hohn@github.com', + license='MIT', + packages=['sarif_cli'], # sarif_cli.tests is not listed here + install_requires=[], + include_package_data=True, + scripts=glob.glob("bin/sarif-*"), # every bin/sarif-* file is installed; bin/json-to-yaml does not match + zip_safe=False, + python_requires='>=3.7' + )