5 Commits
master ... v0.1

Author SHA1 Message Date
edce50fb79 Add missing result[] handling
fixes missing result errors
       #+BEGIN_SRC text
         Traceback (most recent call last):
           File "/mnt/common/home/hohn/work-gh/sarif-cli/.venv-ubuserv/bin/sarif-to-table", line 125, in <module>
             num_results = len(S.get(sarif_struct, 'runs', runi, 'results'))
           File "/mnt/common/home/hohn/work-gh/sarif-cli/sarif_cli/traverse.py", line 169, in get
             res = res[p]
         KeyError: 'results'
         76% 3204:1006=7s ./repos/RasaHQ/rasa/code-scanning/analyses/132221999.sarif                                Traceback (most recent call last):
           File "/mnt/common/home/hohn/work-gh/sarif-cli/.venv-ubuserv/bin/sarif-to-table", line 125, in <module>
             num_results = len(S.get(sarif_struct, 'runs', runi, 'results'))
           File "/mnt/common/home/hohn/work-gh/sarif-cli/sarif_cli/traverse.py", line 169, in get
             res = res[p]
         KeyError: 'results'
       #+END_SRC
2025-10-20 21:33:02 -07:00
1909517804 added rule_id to sarif-to-table 2025-10-20 21:20:02 -07:00
8741e12860 wip: sarif-to-table: full table output in parallel to text 2025-10-20 18:57:34 -07:00
8b3181fbf7 wip: sarif-to-table: no csv option 2025-10-20 18:20:17 -07:00
f98af0295e tested simple pull extractor. fail. 2025-10-20 13:30:34 -07:00
14 changed files with 407 additions and 82817 deletions

View File

@@ -1,18 +1,15 @@
#!/usr/bin/env python3
"""
Create SQLite schema for SARIF importer.
"""
"""Create SQLite schema for SARIF importer."""
import sqlite3, sys
schemas = {
"runs": """CREATE TABLE IF NOT EXISTS runs (
"runs": """CREATE TABLE IF NOT EXISTS runs(
run_id TEXT PRIMARY KEY,
timestamp TIMESTAMP,
tool TEXT,
version TEXT,
exit_code INTEGER
);""",
"results": """CREATE TABLE IF NOT EXISTS results (
exit_code INTEGER);""",
"results": """CREATE TABLE IF NOT EXISTS results(
run_id TEXT,
rule_id TEXT,
severity TEXT,
@@ -22,18 +19,16 @@ schemas = {
line_end INTEGER,
column_start INTEGER,
column_end INTEGER,
PRIMARY KEY (run_id, rule_id, file_path, line_start)
);""",
"alerts": """CREATE TABLE IF NOT EXISTS alerts (
PRIMARY KEY(run_id,rule_id,file_path,line_start));""",
"alerts": """CREATE TABLE IF NOT EXISTS alerts(
alert_id TEXT PRIMARY KEY,
run_id TEXT,
rule_id TEXT,
kind TEXT,
file_path TEXT,
message TEXT,
severity TEXT
);""",
"referenced_source_regions": """CREATE TABLE IF NOT EXISTS referenced_source_regions (
severity TEXT);""",
"referenced_source_regions": """CREATE TABLE IF NOT EXISTS referenced_source_regions(
region_id TEXT PRIMARY KEY,
result_id TEXT,
file_path TEXT,
@@ -42,22 +37,18 @@ schemas = {
start_column INTEGER,
end_column INTEGER,
snippet TEXT,
source_hash TEXT
);"""
source_hash TEXT);"""
}
def main():
if len(sys.argv) < 2:
if len(sys.argv)<2:
print("Usage: sarif-make-schema dbfile")
sys.exit(1)
db = sys.argv[1]
con = sqlite3.connect(db)
cur = con.cursor()
for name, sql in schemas.items():
cur.executescript(sql)
con.commit()
con.close()
print(f"Created/verified schema in {db}")
db=sys.argv[1]
con=sqlite3.connect(db)
cur=con.cursor()
for sql in schemas.values(): cur.executescript(sql)
con.commit(); con.close()
print(f"Schema ready in {db}")
if __name__ == "__main__":
main()
if __name__=="__main__": main()

View File

@@ -1,84 +1,69 @@
#!/usr/bin/env python3
"""
Pull-style SARIF to SQLite converter.
Example: sarif-pull foo.sarif foo.db
Pull-style SARIF SQLite importer.
Populates runs, results, alerts, referenced_source_regions.
"""
import sqlite3, sys, os, uuid
import json, fnmatch, hashlib, datetime
def load_json(path):
with open(path, 'r', encoding='utf-8') as f:
return json.load(f)
def flatten_json(obj, prefix="", sep="/"):
"""Yield (path, value) pairs from nested dicts/lists."""
if isinstance(obj, dict):
for k, v in obj.items():
yield from flatten_json(v, f"{prefix}{sep}{k}" if prefix else k, sep)
elif isinstance(obj, list):
for i, v in enumerate(obj):
yield from flatten_json(v, f"{prefix}{sep}{i}" if prefix else str(i), sep)
else:
yield prefix, obj
def hash_snippet(text):
return hashlib.sha1(text.encode('utf-8', 'ignore')).hexdigest()
def now_timestamp():
return datetime.datetime.utcnow().isoformat(sep=' ', timespec='seconds')
import sqlite3, sys, os
from sarif_util import load_json, hash_snippet, now_timestamp
import subprocess
def ensure_schema(db):
import subprocess
subprocess.run(["sarif-make-schema", db], check=True)
def extract_results(run_id, run):
results = []
tool = run.get("tool", {}).get("driver", {}).get("name", "")
version = run.get("tool", {}).get("driver", {}).get("semanticVersion", "")
for res in run.get("results", []) or []:
msg = (res.get("message") or {}).get("text", "")
rule_id = res.get("ruleId", "")
sev = (res.get("properties") or {}).get("problem.severity", "")
locs = res.get("locations") or []
def extract_all(run_id, run):
results, alerts, regions = [], [], []
tool = run.get("tool",{}).get("driver",{}).get("name","")
version = run.get("tool",{}).get("driver",{}).get("semanticVersion","")
for res in run.get("results",[]) or []:
msg=(res.get("message") or {}).get("text","")
rule_id=res.get("ruleId","")
sev=(res.get("properties") or {}).get("problem.severity","")
locs=res.get("locations") or []
for loc in locs:
ploc = loc.get("physicalLocation", {}) if loc else {}
file_path = (ploc.get("artifactLocation") or {}).get("uri", "")
region = ploc.get("region") or {}
results.append({
"run_id": run_id,
"rule_id": rule_id,
"severity": sev,
"message": msg,
"file_path": file_path,
"line_start": region.get("startLine"),
"line_end": region.get("endLine"),
"column_start": region.get("startColumn"),
"column_end": region.get("endColumn"),
})
return results, tool, version
ploc=loc.get("physicalLocation",{}) if loc else {}
file_path=(ploc.get("artifactLocation") or {}).get("uri","")
region=ploc.get("region") or {}
ls,le,cs,ce=(region.get("startLine"),region.get("endLine"),
region.get("startColumn"),region.get("endColumn"))
rid=hash_snippet(f"{run_id}|{rule_id}|{file_path}|{ls}|{le}|{cs}|{ce}")
results.append(dict(run_id=run_id,rule_id=rule_id,severity=sev,
message=msg,file_path=file_path,
line_start=ls,line_end=le,
column_start=cs,column_end=ce))
alerts.append(dict(alert_id=rid,run_id=run_id,rule_id=rule_id,
kind="result",file_path=file_path,
message=msg,severity=sev))
regions.append(dict(region_id=hash_snippet(f"{file_path}|{ls}|{le}|{cs}|{ce}"),
result_id=rid,file_path=file_path,
start_line=ls,end_line=le,
start_column=cs,end_column=ce,
snippet=None,source_hash=None))
return results, alerts, regions, tool, version
def main():
if len(sys.argv) < 3:
if len(sys.argv)<3:
print("Usage: sarif-pull input.sarif output.db")
sys.exit(1)
sarif_file, dbfile = sys.argv[1], sys.argv[2]
sarif_file,dbfile=sys.argv[1:3]
ensure_schema(dbfile)
sarif = load_json(sarif_file)
con = sqlite3.connect(dbfile)
cur = con.cursor()
for i, run in enumerate(sarif.get("runs", [])):
run_id = f"{os.path.basename(sarif_file)}#{i}"
results, tool, version = extract_results(run_id, run)
cur.execute("INSERT OR REPLACE INTO runs VALUES (?, ?, ?, ?, ?)",
(run_id, now_timestamp(), tool, version, 0))
sarif=load_json(sarif_file)
con=sqlite3.connect(dbfile)
cur=con.cursor()
for i,run in enumerate(sarif.get("runs",[])):
run_id=f"{os.path.basename(sarif_file)}#{i}"
results,alerts,regions,tool,version=extract_all(run_id,run)
cur.execute("INSERT OR REPLACE INTO runs VALUES (?,?,?,?,?)",
(run_id,now_timestamp(),tool,version,0))
cur.executemany("""INSERT OR REPLACE INTO results VALUES
(:run_id, :rule_id, :severity, :message, :file_path,
:line_start, :line_end, :column_start, :column_end)""", results)
con.commit()
con.close()
print(f"Inserted {len(results)} results into {dbfile}")
(:run_id,:rule_id,:severity,:message,:file_path,
:line_start,:line_end,:column_start,:column_end)""",results)
cur.executemany("""INSERT OR REPLACE INTO alerts VALUES
(:alert_id,:run_id,:rule_id,:kind,:file_path,:message,:severity)""",alerts)
cur.executemany("""INSERT OR REPLACE INTO referenced_source_regions VALUES
(:region_id,:result_id,:file_path,:start_line,:end_line,
:start_column,:end_column,:snippet,:source_hash)""",regions)
con.commit(); con.close()
print(f"Inserted {len(results)} results, {len(alerts)} alerts, "
f"{len(regions)} regions into {dbfile}")
if __name__ == "__main__":
main()
if __name__=="__main__": main()

305
bin/sarif-to-table Executable file
View File

@@ -0,0 +1,305 @@
#!/usr/bin/env python
import argparse
import json
import sarif_cli.traverse as S
import sys
import sqlite3
import hashlib
# --------------------------------------------------------------------
# Argument parsing
# --------------------------------------------------------------------
# Command line: two required positionals (the SARIF input and the sqlite
# database that receives the rows) and three optional switches.
parser = argparse.ArgumentParser(description='summary of results, stored in sqlite')
parser.add_argument('file', metavar='sarif-file', type=str,
help='input file, - for stdin')
parser.add_argument('dbfile', metavar='db-file', type=str,
help='sqlite database file to append results to')
# Stored as args.list_source; its value is the source root passed to S.load_lines.
parser.add_argument('-s', '--list-source', metavar='srcroot', type=str,
help='list source snippets using srcroot as sarif SRCROOT')
# Stored as args.related_locations.
parser.add_argument('-r', '--related-locations', action="store_true",
help='list related locations like "hides "')
# Stored as args.endpoints_only.
parser.add_argument('-e', '--endpoints-only', action="store_true",
help='only list source and sink, dropping the path. Identical, successive source/sink pairs are combined')
# Parsed at import time; everything below reads the module-level args.
args = parser.parse_args()
# --------------------------------------------------------------------
# Read SARIF
# --------------------------------------------------------------------
# Load the whole SARIF document as JSON, from the named file or from stdin
# when the file argument is "-".
with open(args.file, 'r') if args.file != '-' else sys.stdin as fp:
sarif_struct = json.load(fp)
# Stop when S.is_sarif_struct rejects the parsed document.
# NOTE(review): the exit status is 0 on this error path -- confirm that
# callers (e.g. batch drivers) rely on that before changing it.
if not S.is_sarif_struct(sarif_struct):
S.msg("ERROR: invalid json contents in %s\n" % (args.file))
S.dbg("invalid json contents in %s\n" % (args.file))
sys.exit(0)
# --------------------------------------------------------------------
# Compute unique id (tool version, git commit, date)
# --------------------------------------------------------------------
def compute_unique_id(sarif_struct, runi, sarif_file):
    """Return a SHA-1 hex digest that identifies one run of a SARIF file.

    The digest covers, joined with '|': the driver version, the revisionId of
    the first versionControlProvenance entry, the startTimeUtc of the first
    invocation, and sarif_file.  A SARIF field whose lookup fails, or whose
    value is falsy, contributes an empty string.
    """
    run_prefix = (sarif_struct, 'runs', runi)
    field_paths = (
        ('tool', 'driver', 'version'),
        ('versionControlProvenance', 0, 'revisionId'),
        ('invocations', 0, 'startTimeUtc'),
    )

    looked_up = []
    for tail in field_paths:
        try:
            found = S.get(*run_prefix, *tail)
        except Exception:
            # Best effort: any failed lookup is treated as "field absent".
            found = None
        looked_up.append(found)

    pieces = [f"{value or ''}" for value in looked_up]
    pieces.append(f"{sarif_file}")
    seed = "|".join(pieces)
    return hashlib.sha1(seed.encode('utf-8')).hexdigest()
# --------------------------------------------------------------------
# SQLite storage setup, then define keep_with_context inside S
# --------------------------------------------------------------------
def _init_db(dbfile):
    """Open dbfile with sqlite3 and make sure the results table exists.

    Creates the sarif_results table and its five indexes when they are
    missing, commits, and returns the open connection.
    """
    column_defs = (
        "sarif_file TEXT",
        "unique_id TEXT",
        "runi INTEGER",
        "resi INTEGER",
        "codefi INTEGER",
        "threadi INTEGER",
        "loci INTEGER",
        "related_index INTEGER",
        "artifact_uri TEXT",
        "l1 INTEGER",
        "c1 INTEGER",
        "l2 INTEGER",
        "c2 INTEGER",
        "line_num INTEGER",
        "msg_type TEXT",
        "message TEXT",
        "source_line TEXT",
        "rule_id TEXT",
    )
    create_table = (
        "\nCREATE TABLE IF NOT EXISTS sarif_results (\n"
        + ",\n".join(column_defs)
        + "\n);\n"
    )
    # (index name, indexed column list) in creation order.
    index_specs = (
        ("idx_artifact_uri", "artifact_uri"),
        ("idx_runi_resi", "runi, resi"),
        ("idx_msg_type", "msg_type"),
        ("idx_unique_id", "unique_id"),
        ("idx_rule_id", "rule_id"),
    )

    db = sqlite3.connect(dbfile)
    cursor = db.cursor()
    cursor.execute(create_table)
    for index_name, indexed_columns in index_specs:
        cursor.execute(
            f"CREATE INDEX IF NOT EXISTS {index_name} ON sarif_results({indexed_columns});"
        )
    db.commit()
    return db
# Module-level state shared by _flush_buffer and keep_with_context.
# The database named on the command line is opened as soon as this runs.
_conn = _init_db(args.dbfile)
# Rows waiting to be inserted into sarif_results.
_buffer = []
# Buffered row count at which keep_with_context triggers a flush.
_COMMIT_INTERVAL = 1000
def _flush_buffer():
    """Insert every buffered row into sarif_results and commit.

    Does nothing when the buffer is empty.  After the commit the module-level
    _buffer is rebound to a new empty list.
    """
    global _buffer
    if _buffer:
        # Named placeholders: each buffered row is a dict keyed by column name.
        insert_sql = (
            "\nINSERT INTO sarif_results (\n"
            "sarif_file, unique_id, runi, resi, codefi, threadi, loci, related_index,\n"
            "artifact_uri, l1, c1, l2, c2, line_num, msg_type, message, source_line, rule_id\n"
            ") VALUES (\n"
            ":sarif_file, :unique_id, :runi, :resi, :codefi, :threadi, :loci, :related_index,\n"
            ":artifact_uri, :l1, :c1, :l2, :c2, :line_num, :msg_type, :message, :source_line, :rule_id\n"
            ")\n"
        )
        writer = _conn.cursor()
        writer.executemany(insert_sql, _buffer)
        _conn.commit()
        _buffer = []
def keep_with_context(ctx):
    """Queue one row for insertion and flush once the buffer is full.

    ctx is appended to the module-level _buffer; when the buffer then holds
    at least _COMMIT_INTERVAL rows, _flush_buffer() is called.
    """
    global _buffer
    _buffer.append(ctx)
    if len(_buffer) < _COMMIT_INTERVAL:
        return
    _flush_buffer()
# Attach the row collector to the traverse module so the traversal below can
# call it as S.keep_with_context(...).
S.keep_with_context = keep_with_context
import atexit
# Write out whatever is still buffered (fewer than _COMMIT_INTERVAL rows)
# when the interpreter exits.
atexit.register(_flush_buffer)
# --------------------------------------------------------------------
# Traverse SARIF
# --------------------------------------------------------------------
# Walk every run of the SARIF document.  Each row handed to
# S.keep_with_context is a dict with the same 18 keys as the sarif_results
# columns; index keys that do not apply to a row are None, and rows with no
# file (PATH, NEWLINE, <NoFile>) use "" for artifact_uri and -1 for l1/c1/l2/c2.
for runi in S.indices(sarif_struct, 'runs'):
unique_id = compute_unique_id(sarif_struct, runi, args.file)
run_obj = S.get(sarif_struct, 'runs', runi)
results = run_obj.get('results', [])
# A missing 'results' key and an empty results list are both skipped here.
if not results:
S.dbg(f"Skipping {args.file} run {runi}: no results key\n")
continue
# NOTE(review): num_results is not read again in the visible code.
num_results = len(results)
for resi in S.indices(sarif_struct, 'runs', runi, 'results'):
result = S.get(sarif_struct, 'runs', runi, 'results', resi)
# Prefer result["ruleId"]; otherwise try S.get(result, "rule", "id"),
# falling back to None when that lookup raises.
rule_id = result.get("ruleId")
if not rule_id:
try:
rule_id = S.get(result, "rule", "id")
except Exception:
rule_id = None
# ---------------- Locations (non-path problems)
if 'locations' in result:
message, artifact, region = S.get_location_message_info(result)
# A whole-file region has no line/column data; -1 marks all four fields.
if region == S.WholeFile:
l1, c1, l2, c2 = -1, -1, -1, -1
else:
l1, c1, l2, c2 = S.lineinfo(region)
filepath = "%s:%d:%d:%d:%d" % (artifact['uri'], l1, c1, l2, c2)
S.msg("RESULT: %s: %s\n" % (filepath, message))
S.keep_with_context({
"sarif_file": args.file, "unique_id": unique_id,
"runi": runi, "resi": resi,
"codefi": None, "threadi": None, "loci": None, "related_index": None,
"artifact_uri": artifact.get('uri', ''),
"l1": l1, "c1": c1, "l2": l2, "c2": c2,
"line_num": None, "msg_type": "RESULT",
"message": message, "source_line": "", "rule_id": rule_id
})
# With --list-source, also emit one SOURCE row per loaded source line.
if region != S.WholeFile and args.list_source:
lines = S.load_lines(args.list_source, artifact['uri'], l1, l2)
for line, line_num in zip(lines, range(l1, l2 + 1)):
S.display_underlined(l1, c1, l2, c2, line, line_num)
S.keep_with_context({
"sarif_file": args.file, "unique_id": unique_id,
"runi": runi, "resi": resi,
"codefi": None, "threadi": None, "loci": None, "related_index": None,
"artifact_uri": artifact.get('uri', ''),
"l1": l1, "c1": c1, "l2": l2, "c2": c2,
"line_num": line_num, "msg_type": "SOURCE",
"message": message, "source_line": line, "rule_id": rule_id
})
# With --related-locations, emit a REFERENCE row for each entry of
# result['relatedLocations'] (ignored unless that value is a list).
if args.related_locations:
relatedLocations = result.get('relatedLocations', None)
if isinstance(relatedLocations, list):
for relo_index, relo in enumerate(relatedLocations):
message, artifact, region = S.get_relatedlocation_message_info(relo)
if artifact == S.NoFile:
S.msg("REFERENCE: %s: %s\n" % ("<NoFile>", message))
S.keep_with_context({
"sarif_file": args.file, "unique_id": unique_id,
"runi": runi, "resi": resi,
"codefi": None, "threadi": None,
"loci": None, "related_index": relo_index,
"artifact_uri": "", "l1": -1, "c1": -1, "l2": -1, "c2": -1,
"line_num": None, "msg_type": "REFERENCE",
"message": message, "source_line": "", "rule_id": rule_id
})
else:
if region == S.WholeFile:
l1, c1, l2, c2 = -1, -1, -1, -1
else:
l1, c1, l2, c2 = S.lineinfo(region)
filepath = "%s:%d:%d:%d:%d" % (artifact['uri'], l1, c1, l2, c2)
S.msg("REFERENCE: %s: %s\n" % (filepath, message))
S.keep_with_context({
"sarif_file": args.file, "unique_id": unique_id,
"runi": runi, "resi": resi,
"codefi": None, "threadi": None,
"loci": None, "related_index": relo_index,
"artifact_uri": artifact.get('uri', ''),
"l1": l1, "c1": c1, "l2": l2, "c2": c2,
"line_num": None, "msg_type": "REFERENCE",
"message": message, "source_line": "", "rule_id": rule_id
})
if args.list_source:
lines = S.load_lines(args.list_source, artifact['uri'], l1, l2)
for line, line_num in zip(lines, range(l1, l2 + 1)):
S.display_underlined(l1, c1, l2, c2, line, line_num)
S.keep_with_context({
"sarif_file": args.file, "unique_id": unique_id,
"runi": runi, "resi": resi,
"codefi": None, "threadi": None,
"loci": None, "related_index": relo_index,
"artifact_uri": artifact.get('uri', ''),
"l1": l1, "c1": c1, "l2": l2, "c2": c2,
"line_num": line_num, "msg_type": "SOURCE",
"message": message, "source_line": line, "rule_id": rule_id
})
# ---------------- CodeFlows (path problems)
if 'codeFlows' in result:
# Previous code flow of this result, compared against under --endpoints-only.
last_codeFlow = None
for codefi in S.indices(result, 'codeFlows'):
codeFlow = S.get(result, 'codeFlows', codefi)
S.msg("PATH %d\n" % codefi)
S.keep_with_context({
"sarif_file": args.file, "unique_id": unique_id,
"runi": runi, "resi": resi, "codefi": codefi,
"threadi": None, "loci": None, "related_index": None,
"artifact_uri": "", "l1": -1, "c1": -1, "l2": -1, "c2": -1,
"line_num": None, "msg_type": "PATH",
"message": "", "source_line": "", "rule_id": rule_id
})
for threadi in S.indices(codeFlow, 'threadFlows'):
threadFlow = S.get(codeFlow, 'threadFlows', threadi)
# --endpoints-only: visit just the first and last location index, and
# skip a thread flow whose first and last locations equal those of
# the previous code flow at the same thread index.
if args.endpoints_only:
t1 = S.indices(threadFlow, 'locations')
location_range = [t1[0], t1[-1]]
if (last_codeFlow and
(S.get(last_codeFlow, 'threadFlows', threadi, 'locations', 0) ==
S.get(codeFlow, 'threadFlows', threadi, 'locations', 0)) and
(S.get(last_codeFlow, 'threadFlows', threadi, 'locations', -1) ==
S.get(codeFlow, 'threadFlows', threadi, 'locations', -1))):
continue
else:
location_range = S.indices(threadFlow, 'locations')
# One FLOW_STEP row per visited location of the thread flow.
for loci in location_range:
location = S.get(threadFlow, 'locations', loci, 'location')
message, artifact, region = S.get_relatedlocation_message_info(location)
if artifact == S.NoFile:
S.msg("FLOW STEP %d: %s: %s\n" % (loci, "<NoFile>", message))
S.keep_with_context({
"sarif_file": args.file, "unique_id": unique_id,
"runi": runi, "resi": resi,
"codefi": codefi, "threadi": threadi,
"loci": loci, "related_index": None,
"artifact_uri": "", "l1": -1, "c1": -1, "l2": -1, "c2": -1,
"line_num": None, "msg_type": "FLOW_STEP",
"message": message, "source_line": "", "rule_id": rule_id
})
else:
if region == S.WholeFile:
l1, c1, l2, c2 = -1, -1, -1, -1
else:
l1, c1, l2, c2 = S.lineinfo(region)
filepath = "%s:%d:%d:%d:%d" % (artifact['uri'], l1, c1, l2, c2)
S.msg("FLOW STEP %d: %s: %s\n" % (loci, filepath, message))
S.keep_with_context({
"sarif_file": args.file, "unique_id": unique_id,
"runi": runi, "resi": resi,
"codefi": codefi, "threadi": threadi,
"loci": loci, "related_index": None,
"artifact_uri": artifact.get('uri', ''),
"l1": l1, "c1": c1, "l2": l2, "c2": c2,
"line_num": None, "msg_type": "FLOW_STEP",
"message": message, "source_line": "", "rule_id": rule_id
})
if args.list_source:
lines = S.load_lines(args.list_source, artifact['uri'], l1, l2)
for line, line_num in zip(lines, range(l1, l2 + 1)):
S.display_underlined(l1, c1, l2, c2, line, line_num)
S.keep_with_context({
"sarif_file": args.file, "unique_id": unique_id,
"runi": runi, "resi": resi,
"codefi": codefi, "threadi": threadi,
"loci": loci, "related_index": None,
"artifact_uri": artifact.get('uri', ''),
"l1": l1, "c1": c1, "l2": l2, "c2": c2,
"line_num": line_num, "msg_type": "SOURCE",
"message": message, "source_line": line, "rule_id": rule_id
})
# Remember this code flow for the --endpoints-only comparison.
last_codeFlow = codeFlow
# Blank separator line in the text output, mirrored as a NEWLINE row.
S.msg("\n")
S.keep_with_context({
"sarif_file": args.file, "unique_id": unique_id,
"runi": runi, "resi": resi,
"codefi": None, "threadi": None, "loci": None, "related_index": None,
"artifact_uri": "", "l1": -1, "c1": -1, "l2": -1, "c2": -1,
"line_num": None, "msg_type": "NEWLINE",
"message": "", "source_line": "", "rule_id": rule_id
})

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -16,7 +16,7 @@
135: ( 'Struct3739',
147: ('Array6785', ('array', (0, 'Struct3739')))]
#+END_SRC
1. First update the signature. The file [[./sarif_cli/signature_multi.py]] has
1. First update the signature. The file [[../sarif_cli/signature_multi.py]] has
instructions for updating (or creating) a typegraph.
The update from commit 0f070a6ae to 0f070a6ae+1 introduces the changes

View File

@@ -14,8 +14,9 @@
pip install -e .
# force symlinks for development
rm -f "$VIRTUAL_ENV/bin/sarif-"*
ln -sf "$PWD/bin/sarif-"* "$VIRTUAL_ENV/bin/"
[ x"$VIRTUAL_ENV" != x ] &&\
rm -f "$VIRTUAL_ENV/bin/sarif-"* && \
( cd ~/work-gh/sarif-cli/ && ln -sf "$PWD/bin/sarif-"* "$VIRTUAL_ENV/bin/")
#+END_SRC