Tested the simple pull extractor; it fails.

2025-10-20 13:28:53 -07:00
committed by michael hohn
parent 6c9e992b0e
commit f98af0295e
11 changed files with 98 additions and 82814 deletions

@@ -1,18 +1,15 @@
#!/usr/bin/env python3
"""
Create SQLite schema for SARIF importer.
"""
"""Create SQLite schema for SARIF importer."""
import sqlite3, sys
schemas = {
"runs": """CREATE TABLE IF NOT EXISTS runs (
"runs": """CREATE TABLE IF NOT EXISTS runs(
run_id TEXT PRIMARY KEY,
timestamp TIMESTAMP,
tool TEXT,
version TEXT,
exit_code INTEGER
);""",
"results": """CREATE TABLE IF NOT EXISTS results (
exit_code INTEGER);""",
"results": """CREATE TABLE IF NOT EXISTS results(
run_id TEXT,
rule_id TEXT,
severity TEXT,
@@ -22,18 +19,16 @@ schemas = {
line_end INTEGER,
column_start INTEGER,
column_end INTEGER,
PRIMARY KEY (run_id, rule_id, file_path, line_start)
);""",
"alerts": """CREATE TABLE IF NOT EXISTS alerts (
PRIMARY KEY(run_id,rule_id,file_path,line_start));""",
"alerts": """CREATE TABLE IF NOT EXISTS alerts(
alert_id TEXT PRIMARY KEY,
run_id TEXT,
rule_id TEXT,
kind TEXT,
file_path TEXT,
message TEXT,
severity TEXT
);""",
"referenced_source_regions": """CREATE TABLE IF NOT EXISTS referenced_source_regions (
severity TEXT);""",
"referenced_source_regions": """CREATE TABLE IF NOT EXISTS referenced_source_regions(
region_id TEXT PRIMARY KEY,
result_id TEXT,
file_path TEXT,
@@ -42,22 +37,18 @@ schemas = {
start_column INTEGER,
end_column INTEGER,
snippet TEXT,
source_hash TEXT
);"""
source_hash TEXT);"""
}
def main():
if len(sys.argv) < 2:
if len(sys.argv)<2:
print("Usage: sarif-make-schema dbfile")
sys.exit(1)
db = sys.argv[1]
con = sqlite3.connect(db)
cur = con.cursor()
for name, sql in schemas.items():
cur.executescript(sql)
con.commit()
con.close()
print(f"Created/verified schema in {db}")
db=sys.argv[1]
con=sqlite3.connect(db)
cur=con.cursor()
for sql in schemas.values(): cur.executescript(sql)
con.commit(); con.close()
print(f"Schema ready in {db}")
if __name__ == "__main__":
main()
if __name__=="__main__": main()
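
For a quick check that the schema script behaves as the rewritten version intends, a minimal sketch along these lines can be run against a scratch database. It assumes the script is installed on PATH under the name sarif-make-schema, as its usage string suggests.

# Sanity check for the schema script: create a scratch database and list
# the tables it defines. Assumes `sarif-make-schema` is on PATH, as the
# usage string "Usage: sarif-make-schema dbfile" suggests.
import os
import sqlite3
import subprocess
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    db = os.path.join(tmp, "scratch.db")
    subprocess.run(["sarif-make-schema", db], check=True)
    con = sqlite3.connect(db)
    tables = [name for (name,) in con.execute(
        "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")]
    con.close()
    print(tables)  # expect: ['alerts', 'referenced_source_regions', 'results', 'runs']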

@@ -1,84 +1,69 @@
#!/usr/bin/env python3
"""
Pull-style SARIF to SQLite converter.
Example: sarif-pull foo.sarif foo.db
Pull-style SARIF SQLite importer.
Populates runs, results, alerts, referenced_source_regions.
"""
import sqlite3, sys, os, uuid
import json, fnmatch, hashlib, datetime
def load_json(path):
with open(path, 'r', encoding='utf-8') as f:
return json.load(f)
def flatten_json(obj, prefix="", sep="/"):
"""Yield (path, value) pairs from nested dicts/lists."""
if isinstance(obj, dict):
for k, v in obj.items():
yield from flatten_json(v, f"{prefix}{sep}{k}" if prefix else k, sep)
elif isinstance(obj, list):
for i, v in enumerate(obj):
yield from flatten_json(v, f"{prefix}{sep}{i}" if prefix else str(i), sep)
else:
yield prefix, obj
def hash_snippet(text):
return hashlib.sha1(text.encode('utf-8', 'ignore')).hexdigest()
def now_timestamp():
return datetime.datetime.utcnow().isoformat(sep=' ', timespec='seconds')
import sqlite3, sys, os
from sarif_util import load_json, hash_snippet, now_timestamp
import subprocess
def ensure_schema(db):
import subprocess
subprocess.run(["sarif-make-schema", db], check=True)
def extract_results(run_id, run):
results = []
tool = run.get("tool", {}).get("driver", {}).get("name", "")
version = run.get("tool", {}).get("driver", {}).get("semanticVersion", "")
for res in run.get("results", []) or []:
msg = (res.get("message") or {}).get("text", "")
rule_id = res.get("ruleId", "")
sev = (res.get("properties") or {}).get("problem.severity", "")
locs = res.get("locations") or []
def extract_all(run_id, run):
results, alerts, regions = [], [], []
tool = run.get("tool",{}).get("driver",{}).get("name","")
version = run.get("tool",{}).get("driver",{}).get("semanticVersion","")
for res in run.get("results",[]) or []:
msg=(res.get("message") or {}).get("text","")
rule_id=res.get("ruleId","")
sev=(res.get("properties") or {}).get("problem.severity","")
locs=res.get("locations") or []
for loc in locs:
ploc = loc.get("physicalLocation", {}) if loc else {}
file_path = (ploc.get("artifactLocation") or {}).get("uri", "")
region = ploc.get("region") or {}
results.append({
"run_id": run_id,
"rule_id": rule_id,
"severity": sev,
"message": msg,
"file_path": file_path,
"line_start": region.get("startLine"),
"line_end": region.get("endLine"),
"column_start": region.get("startColumn"),
"column_end": region.get("endColumn"),
})
return results, tool, version
ploc=loc.get("physicalLocation",{}) if loc else {}
file_path=(ploc.get("artifactLocation") or {}).get("uri","")
region=ploc.get("region") or {}
ls,le,cs,ce=(region.get("startLine"),region.get("endLine"),
region.get("startColumn"),region.get("endColumn"))
rid=hash_snippet(f"{run_id}|{rule_id}|{file_path}|{ls}|{le}|{cs}|{ce}")
results.append(dict(run_id=run_id,rule_id=rule_id,severity=sev,
message=msg,file_path=file_path,
line_start=ls,line_end=le,
column_start=cs,column_end=ce))
alerts.append(dict(alert_id=rid,run_id=run_id,rule_id=rule_id,
kind="result",file_path=file_path,
message=msg,severity=sev))
regions.append(dict(region_id=hash_snippet(f"{file_path}|{ls}|{le}|{cs}|{ce}"),
result_id=rid,file_path=file_path,
start_line=ls,end_line=le,
start_column=cs,end_column=ce,
snippet=None,source_hash=None))
return results, alerts, regions, tool, version
def main():
if len(sys.argv) < 3:
if len(sys.argv)<3:
print("Usage: sarif-pull input.sarif output.db")
sys.exit(1)
sarif_file, dbfile = sys.argv[1], sys.argv[2]
sarif_file,dbfile=sys.argv[1:3]
ensure_schema(dbfile)
sarif = load_json(sarif_file)
con = sqlite3.connect(dbfile)
cur = con.cursor()
for i, run in enumerate(sarif.get("runs", [])):
run_id = f"{os.path.basename(sarif_file)}#{i}"
results, tool, version = extract_results(run_id, run)
cur.execute("INSERT OR REPLACE INTO runs VALUES (?, ?, ?, ?, ?)",
(run_id, now_timestamp(), tool, version, 0))
sarif=load_json(sarif_file)
con=sqlite3.connect(dbfile)
cur=con.cursor()
for i,run in enumerate(sarif.get("runs",[])):
run_id=f"{os.path.basename(sarif_file)}#{i}"
results,alerts,regions,tool,version=extract_all(run_id,run)
cur.execute("INSERT OR REPLACE INTO runs VALUES (?,?,?,?,?)",
(run_id,now_timestamp(),tool,version,0))
cur.executemany("""INSERT OR REPLACE INTO results VALUES
(:run_id, :rule_id, :severity, :message, :file_path,
:line_start, :line_end, :column_start, :column_end)""", results)
con.commit()
con.close()
print(f"Inserted {len(results)} results into {dbfile}")
(:run_id,:rule_id,:severity,:message,:file_path,
:line_start,:line_end,:column_start,:column_end)""",results)
cur.executemany("""INSERT OR REPLACE INTO alerts VALUES
(:alert_id,:run_id,:rule_id,:kind,:file_path,:message,:severity)""",alerts)
cur.executemany("""INSERT OR REPLACE INTO referenced_source_regions VALUES
(:region_id,:result_id,:file_path,:start_line,:end_line,
:start_column,:end_column,:snippet,:source_hash)""",regions)
con.commit(); con.close()
print(f"Inserted {len(results)} results, {len(alerts)} alerts, "
f"{len(regions)} regions into {dbfile}")
if __name__ == "__main__":
main()
if __name__=="__main__": main()
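
The rewritten script now imports load_json, hash_snippet, and now_timestamp from a sarif_util module that is not among the files in this commit; without it on the import path the script cannot start. Below is a minimal sketch of that module, assembled from the helper definitions removed above; the module layout itself is an assumption.

# sarif_util.py -- minimal sketch of the helper module sarif-pull now imports.
# The function bodies are taken from the helpers removed from sarif-pull above;
# the module itself is not shown in this commit, so this layout is an assumption.
import datetime
import hashlib
import json

def load_json(path):
    # Read a SARIF (JSON) file from disk.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def hash_snippet(text):
    # Stable SHA-1 of a text snippet, used as a cheap content id.
    return hashlib.sha1(text.encode('utf-8', 'ignore')).hexdigest()

def now_timestamp():
    # UTC timestamp in 'YYYY-MM-DD HH:MM:SS' form for the runs table.
    return datetime.datetime.utcnow().isoformat(sep=' ', timespec='seconds')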

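An end-to-end sketch of how the importer would be exercised, reusing the foo.sarif / foo.db names from the example line the old docstring carried; it assumes sarif-pull is on PATH and foo.sarif exists.

# End-to-end sketch: import a SARIF file, then summarize alerts per rule.
# Assumes `sarif-pull` is on PATH and `foo.sarif` exists; the file names come
# from the example removed from the old docstring above.
import sqlite3
import subprocess

subprocess.run(["sarif-pull", "foo.sarif", "foo.db"], check=True)
con = sqlite3.connect("foo.db")
query = """SELECT rule_id, COUNT(*) AS n
           FROM alerts GROUP BY rule_id ORDER BY n DESC"""
for rule_id, n in con.execute(query):
    print(f"{n:6d}  {rule_id}")
con.close()
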
6 binary files changed (not shown).

3 file diffs suppressed because they are too large.