sarif-cli/bin/sarif-extract-scans-runner
Latest commit: Rework project and scan id generation

Goal:
- deterministic ids across multiple scan instances over the same sarif file
- no id collisions between sarif files from different scan instances, whether or not they belong to the same project

Assumption: sarif file naming follows the <project>/<unique_filename_per_analysis> format (illustrated in the sketch below).
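For illustration only (not part of the script below): both ids are blake2b digests, of the project directory and of the full sarif path respectively, so re-running a scan on the same path reproduces the same id pair, and distinct paths get distinct ids up to a 64-bit hash collision. A minimal, standalone sketch (the helper name stable_id is illustrative; the paths are taken from the treeio example in the docstring):

    from hashlib import blake2b

    def stable_id(text, size=8):
        # Same input -> same 64-bit integer on every run.
        h = blake2b(digest_size=size)
        h.update(text.encode())
        return int.from_bytes(h.digest(), byteorder='big')

    # Repeating a scan of 2021-12-09/results.sarif reproduces its scan id ...
    assert stable_id("2021-12-09/results.sarif") == stable_id("2021-12-09/results.sarif")
    # ... while the project id (hash of the directory alone) differs from it.
    assert stable_id("2021-12-09") != stable_id("2021-12-09/results.sarif")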


#!/usr/bin/env python3
"""Run `sarif-extract-scans` over a directory hierarchy of the form
organization/project-sarif.json
org2/proj2.sarif
...
mirroring the github org/project structure. The list of sarif files to ingest is
in the file given by the sarif-files argument, a list of paths of the form
<organization>/<project-sarif>
sarif-extract-scans-runner creates these files:

- successful_runs -- optional, one file. Tracks which files were processed
  successfully so that only new/failed attempts are re-run.
- org*/project*.scanspec -- one scanspec file per org/project listed in
  sarif-files. Required by `sarif-extract-scans`; its format is shown below.
- org*/project*.scanlog -- failures from `sarif-extract-scans` are logged here.
It also creates the directories

- org*/project*.scantables -- one directory per org/project, each holding the
  tables produced by `sarif-extract-scans`:
      ├── codeflows.csv
      ├── projects.csv
      ├── results.csv
      └── scans.csv
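
A scanspec file is a one-line JSON document; schematically (the id values are
blake2b hashes computed by this script, shown here as placeholders):

    {"project_id": <hash of organization>,
     "scan_id": <hash of organization/project-sarif>,
     "sarif_file_name": "organization/project-sarif"}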

As an example,

cd ../data/treeio
sarif-extract-scans-runner -i1 -s successful-runs - <<EOF
2021-12-09/results.sarif
2022-02-25/results.sarif
EOF

writes out

DATE 2022-08-08T14:32:41.962219
OK 2021-12-09/results.sarif
DATE 2022-08-08T14:32:42.970712
OK 2022-02-25/results.sarif

and produces the files

2021-12-09/
├── results.sarif.scanspec
└── results.sarif.scantables
    ├── codeflows.csv
    ├── projects.csv
    ├── results.csv
    └── scans.csv
2022-02-25/
├── results.sarif.scanspec
└── results.sarif.scantables
    ├── codeflows.csv
    ├── projects.csv
    ├── results.csv
    └── scans.csv

on the first run; repeating

sarif-extract-scans-runner -i1 -s successful-runs - <<EOF
2021-12-09/results.sarif
2022-02-25/results.sarif
EOF

will produce no output but run much faster.

Typical use for larger sarif file collections:

    cd /path/to/scans/root
    # create sarif-files.txt, one <org>/<sarif-file> path per line
    nohup sarif-extract-scans-runner -s ses-successful-runs sarif-files.txt &
"""
import argparse
import subprocess
import json
import os
import sys
import pickle
from datetime import datetime
from hashlib import blake2b
def hash_unique(item_to_hash, size):
    h = blake2b(digest_size=size)
    h.update(item_to_hash.encode())
    return abs(int.from_bytes(h.digest(), byteorder='big'))
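# Note: blake2b is deterministic, so repeated runs over the same project/path
# reproduce the same project_id and scan_id values.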
#
# Handle arguments
#
parser = argparse.ArgumentParser(description='Run sarif-extract-scans over a directory hierarchy')
parser.add_argument('sarif_files', metavar='sarif-files', type=str, help='File containing list of sarif files, use - for stdin')
parser.add_argument('-m', '--max-files', metavar='number', type=int, default=100,
                    help='Maximum number of files to process.'
                    ' Default: %(default)d')
parser.add_argument('-i', '--update-interval', metavar='N', type=int, default=10,
                    help='Update status and save state after processing N files.'
                    ' Default: %(default)d')
parser.add_argument('-s', '--successful-runs', metavar='filename', type=str,
                    default="",
                    help='Incremental running support: Track successful runs and only (re)run '
                    'new/failed entries from sarif-files.'
                    ' Default: "%(default)s"')
parser.add_argument('--doc', dest='fulldoc', default=False,
                    action='store_true',
                    help='Print full documentation for this script')
# Avoid argparse error when only --doc is given
if len(sys.argv) == 2 and sys.argv[1] == '--doc':
    print(__doc__)
    sys.exit(0)
args = parser.parse_args()
#
# Collect sarif file information
#
with open(args.sarif_files, 'r') if args.sarif_files != '-' else sys.stdin as fp:
    paths = fp.readlines()
# Use saved status, only re-run failed attempts
use_successful_runs = args.successful_runs != ""
if use_successful_runs:
    if os.path.exists(args.successful_runs):
        with open(args.successful_runs, "rb") as infile:
            successful_runs = pickle.load(infile)
    else:
        successful_runs = set()
count = -1
for path in paths:
    count += 1
    if count >= args.max_files: break
    #
    # Paths and components
    #
    path = path.rstrip()
    project, component = path.split('/')
    #
    # Scan specification
    #
    scan_spec = {
        "project_id": hash_unique(project, 8),  # pd.UInt64Dtype()
        "scan_id": hash_unique(path, 8),        # pd.Int64Dtype()
        "sarif_file_name": path,                # pd.StringDtype()
    }
    scan_spec_file = os.path.join(project, component + ".scanspec")
    with open(scan_spec_file, 'w') as fp:
        json.dump(scan_spec, fp)
    #
    # Table output directory
    #
    output_dir = os.path.join(project, component + ".scantables")
    try: os.mkdir(output_dir, mode=0o755)
    except FileExistsError: pass
    #
    # Run sarif-extract-scans
    #
    if use_successful_runs:
        if path in successful_runs:
            # Don't rerun
            continue
    # Some timing information
    if count % args.update_interval == 0:
        print("{:6} {}".format("DATE", datetime.now().isoformat()))
    # Save occasionally
    if count % args.update_interval == 0:
        if use_successful_runs:
            with open(args.successful_runs, 'wb') as outfile:
                pickle.dump(successful_runs, outfile)
    scan_log_file = os.path.join(project, component + ".scanlog")
    runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir],
                              capture_output=True, text=True)
    if runstats.returncode == 0:
        print("{:6} {}".format("OK", path))
        if use_successful_runs:
            successful_runs.add(path)
    else:
        print("{:6} {} {}".format("FAIL", path, scan_log_file))
        # log error
        with open(scan_log_file, 'w') as fp:
            fp.write(runstats.stderr)
        # report only tail
        print("{:6} {}".format("", "Error tail: "))
        for t1 in runstats.stderr.split('\n')[-6:-1]:
            print("{:6} {}".format("", t1))
#
# Save final run status after the loop finishes
#
if use_successful_runs:
    with open(args.successful_runs, 'wb') as outfile:
        pickle.dump(successful_runs, outfile)