Fix subtle type problem: M8 is required for early steps, datetime64[ns] later
@@ -50,8 +50,25 @@ parser.add_argument('-f','--input-signature', metavar='input-signature', type=str,
                     'Options: LGTM, CLI\n'
                     'If current represented signatures are not sufficient, view signature_single.py for how to support further signatures.'
                     ' Default: "%(default)s"')
+
+parser.add_argument("-d", "--debug", action="store_true",
+                    help="Run inside IPython with --pdb for post-mortem debugging")
+
 args = parser.parse_args()
+
+import sys, pdb, traceback
+
+def debug_excepthook(type, value, tb):
+    traceback.print_exception(type, value, tb)
+    print("\nEntering post-mortem debugger...\n")
+    pdb.post_mortem(tb)
+
+# XX:
+if args.debug:
+    sys.excepthook = debug_excepthook
+
+
 if args.input_signature not in ["LGTM","CLI"]:
     print("Unsupported sarif signature requested.")
     print("Use one of [LGTM, CLI].")
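
Note: a standalone sketch of the post-mortem hook this hunk adds. The `-d/--debug` flag and `debug_excepthook` mirror the diff; the deliberately failing statement at the end is only an illustration, not repo code:

```python
import argparse, pdb, sys, traceback

def debug_excepthook(exc_type, value, tb):
    # Print the traceback, then drop into pdb at the raise site.
    traceback.print_exception(exc_type, value, tb)
    print("\nEntering post-mortem debugger...\n")
    pdb.post_mortem(tb)

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--debug", action="store_true")
args = parser.parse_args()
if args.debug:
    sys.excepthook = debug_excepthook

raise RuntimeError("synthetic failure")  # with -d, lands at the pdb prompt
```
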
@@ -237,10 +237,12 @@ for path_timestamp in paths:
         timestamp_options = ['--with-timestamps']
     else:
         timestamp_options = []
-    runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir,
-                               csv_outfile, "-f", args.input_signature,
-                               *timestamp_options],
+    # XX:
+    runstats = subprocess.run(['sarif-extract-scans', scan_spec_file,
+                               output_dir, csv_outfile, "-f",
+                               args.input_signature],
                               capture_output=True, text=True)

     if runstats.returncode == 0:
         print("{:6} {}".format("OK", path))
         if use_successful_runs:
@@ -250,10 +252,25 @@ for path_timestamp in paths:
         # log error
         with open(scan_log_file, 'w') as fp:
             fp.write(runstats.stderr)
-        # report only tail
-        print("{:6} {}".format("", "Error tail: "))
-        for t1 in runstats.stderr.split('\n')[-6:-1]:
-            print("{:6} {}".format("", t1))
+
+        # show command for manual re-run
+        cmd = [
+            "sarif-extract-scans",
+            scan_spec_file,
+            output_dir,
+            csv_outfile,
+            "-f", args.input_signature,
+        ]
+        print("{:6} {}".format("", "Command was:"))
+        print("{:6} {}".format("", " ".join(cmd)))
+
+
+        # report only tail of stderr
+        print("{:6} {}".format("", "Error tail:"))
+        if runstats.stderr:
+            for line in runstats.stderr.splitlines()[-6:]:
+                print("{:6} {}".format("", line))
+
+
     if use_successful_runs:
         with open(args.successful_runs, 'wb') as outfile:
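
Note: the same report-and-rerun pattern in isolation, assuming nothing from the repo; the failing command here is a synthetic stand-in for `sarif-extract-scans`:

```python
import subprocess

cmd = ["python3", "-c", "import sys; sys.exit('synthetic failure')"]
runstats = subprocess.run(cmd, capture_output=True, text=True)
if runstats.returncode != 0:
    print("{:6} {}".format("", "Command was:"))
    print("{:6} {}".format("", " ".join(cmd)))
    print("{:6} {}".format("", "Error tail:"))
    if runstats.stderr:
        for line in runstats.stderr.splitlines()[-6:]:
            print("{:6} {}".format("", line))
```
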
@@ -205,9 +205,9 @@
 #+BEGIN_SRC sh :session shared :results output :eval never-export
 cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection
 sarif-extract-scans \
-    sqlidb-1.1.sarif.scanspec \
-    sqlidb-1.1.sarif.scantables \
-    sqlidb-1.1.sarif.csv \
+    sqlidb-1.sarif.scanspec \
+    sqlidb-1.sarif.scantables \
+    sqlidb-1.sarif.csv \
     -f CLI
 #+END_SRC

--- /dev/null
+++ b/notes/quickstart.org
@@ -0,0 +1,67 @@
+* sarif-cli quickstart
+Set up the virtual environment and install the packages:
+#+BEGIN_SRC sh
+cd ~/work-gh/sarif-cli/
+
+# set up virtual environment
+python3 -m venv .venv
+. .venv/bin/activate
+
+# Use requirementsDEV.txt
+python -m pip install -r requirementsDEV.txt
+
+# install scripts
+pip install -e .
+
+# force symlinks for development
+rm -f "$VIRTUAL_ENV/bin/sarif-"*
+ln -sf "$PWD/bin/sarif-"* "$VIRTUAL_ENV/bin/"
+
+#+END_SRC
+
+Run SARIF extraction for one test file and inspect results.
+This assumes you are in the above virtual environment where all =sarif-*= tools
+are on =$PATH=.
+
+#+BEGIN_SRC sh
+cd ~/work-gh/sarif-cli/data/codeql-dataflow-sql-injection
+
+# ---------------------------------------------------------------------
+# 1. Set base name of the original SARIF file (without extension)
+# ---------------------------------------------------------------------
+orig="sqlidb-1"
+
+# ---------------------------------------------------------------------
+# 2. Remove any stale output from previous runs
+# ---------------------------------------------------------------------
+rm -fR -- "${orig}.1.sarif."*
+
+# ---------------------------------------------------------------------
+# 3. Ensure versionControlProvenance field is present
+# ---------------------------------------------------------------------
+sarif-insert-vcp "${orig}.sarif" > "${orig}.1.sarif"
+
+# ---------------------------------------------------------------------
+# 4. Run the converter (CLI input signature)
+#    - Logs are written only if errors occur.
+# ---------------------------------------------------------------------
+sarif-extract-scans-runner --input-signature CLI - > /dev/null <<EOF
+${orig}.1.sarif
+EOF
+
+# ---------------------------------------------------------------------
+# 5. If errors occurred, show the scan log.
+#    The log lists the exact commands that can be re-run manually under pdb.
+# ---------------------------------------------------------------------
+if [[ -f "${orig}.1.sarif.scanlog" ]]; then
+    echo "Conversion errors logged in ${orig}.1.sarif.scanlog"
+    cat "${orig}.1.sarif.scanlog"
+fi
+
+# ---------------------------------------------------------------------
+# 6. Examine results (converted SARIF, logs, etc.)
+# ---------------------------------------------------------------------
+ls -l "${orig}.1.sarif"*
+#+END_SRC
+
+For interactive examination / debugging, see [[file:README.org::*Run using embedded repls][Run using embedded repls]]
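
Note: as a hedged follow-up to step 3 of the quickstart (not part of the repo), one can confirm the inserted property directly; the file name follows the quickstart, and SARIF 2.1.0 places versionControlProvenance under each run:

```python
import json

with open("sqlidb-1.1.sarif") as fp:
    sarif = json.load(fp)

# versionControlProvenance is an array of versionControlDetails per run
for run in sarif.get("runs", []):
    for entry in run.get("versionControlProvenance", []):
        print(entry.get("repositoryUri"), entry.get("revisionId"))
```
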
--- /dev/null
+++ b/notes/update.org
@@ -0,0 +1,76 @@
+
+* issues <2025-10-18 Sat>
+** DONE
+CLOSED: [2025-10-18 Sat 22:34]
+
+- State "DONE" from "NEXT" [2025-10-18 Sat 22:34]
+#+BEGIN_SRC text
+~/work-gh/sarif-cli/data/codeql-dataflow-sql-injection]$
+1:$ bat sqlidb-1.sarif.scanspec sqlidb-1.sarif.scantables sqlidb-1.sarif.csv
+───────┬──────────────────────────────────────────────────────────────────────
+       │ File: sqlidb-1.sarif.scanspec
+───────┼──────────────────────────────────────────────────────────────────────
+   1   │ {"scan_id": 12314655876769447717, "sarif_file_name": "sqlidb-1.sarif"}
+───────┴──────────────────────────────────────────────────────────────────────
+[bat error]: 'sqlidb-1.sarif.scantables' is a directory.
+───────┬──────────────────────────────────────────────────────────────────────
+       │ File: sqlidb-1.sarif.csv
+───────┼──────────────────────────────────────────────────────────────────────
+   1   │ sarif_file,level,levelcode,message,extra_info
+   2   │ sqlidb-1.sarif,WARNING,2,Input sarif is missing neccesary properties.,"Missing: {'newlineSequence
+       │ s', 'versionControlProvenance'}, "
+───────┴──────────────────────────────────────────────────────────────────────
+(.venv-m325) (base) [hohn@m325 ~/work-gh/sarif-cli/data/codeql-dataflow-sql-injection]$
+#+END_SRC
+
+sarif_file,level,levelcode,message,extra_info
+sqlidb-1.sarif,WARNING,2,Input sarif is missing neccesary properties.,"Missing:
+{'newlineSequences', 'versionControlProvenance'}
+
+see
+
+File: ./bin/sarif-insert-vcp
+   2    11   # Add the versionControlProvenance key to a SARIF file
+   9     6   |     ( .versionControlProvenance |=
+
+File: ./scripts/test-vcp.sh
+  21    15   #* Insert versionControlProvenance
+
+
+o The CLI sarif **MUST** contain one additional property `versionControlProvenance` - which needs to look like:
+```
+"versionControlProvenance": [
+    {
+        "repositoryUri": "https://github.com/testorg/testrepo.git",
+        "revisionId": "testsha"
+    }
+]
+```
+
+The script
+
+bin/sarif-insert-vcp
+[[file:~/work-gh/sarif-cli/bin/sarif-insert-vcp::uri=vcp-no-uri]]
+
+will add that entry to a SARIF file.
+
+
+Also,
+./sarif_cli/signature.py:308:        # Ensure newlineSequences is present when versionControlProvenance is
+./sarif_cli/signature.py:309:        full_elem['newlineSequences'] = elem.get('newlineSequences', dummy_newlineSequences)
+
+So:
+- adding versionControlProvenance first will add newlineSequences later also
+
+
+** TODO sarif-cli type error
+#+BEGIN_SRC text
+~/work-gh/sarif-cli/data/codeql-dataflow-sql-injection]$
+0:$ less sqlidb-1.1.sarif.scanlog
+
+...
+  File "/Users/hohn/work-gh/sarif-cli/.venv-m325/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py", line 734, in astype
+    raise TypeError(
+TypeError: Casting to unit-less dtype 'datetime64' is not supported. Pass e.g. 'datetime64[ns]' instead.
+
+#+END_SRC
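
Note: a minimal repro of the TypeError quoted in that TODO, plus the unit-qualified cast this commit switches to. Illustrative only, assuming a recent pandas (2.x); the column name mirrors the diff:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"creation_date": pd.to_datetime(["2025-10-18"])})

try:
    # Unit-less 'M8' / 'datetime64' is rejected by recent pandas.
    df.astype({"creation_date": np.dtype("M8")})
except TypeError as err:
    print(err)  # Casting to unit-less dtype 'datetime64' is not supported...

# Unit-qualified dtype works.
ok = df.astype({"creation_date": "datetime64[ns]"})
print(ok.dtypes)
```
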
@@ -33,9 +33,9 @@ nbconvert==6.4.4
 nbformat==5.2.0
 nest-asyncio==1.5.4
 notebook==6.4.10
-numpy==1.22.3
+numpy
 packaging==21.3
-pandas==1.4.1
+pandas
 pandocfilters==1.5.0
 parso==0.8.3
 pexpect==4.8.0
@@ -52,7 +52,7 @@ pyrsistent==0.18.1
 python-dateutil==2.8.2
 pytz==2021.3
 PyYAML==6.0
-pyzmq==22.3.0
+pyzmq
 qtconsole==5.2.2
 QtPy==2.0.1
 Send2Trash==1.8.0
@@ -5,6 +5,7 @@ from . import snowflake_id

 import logging
 import numpy
+import numpy as np
 import pandas as pd
 import re
 from sarif_cli import hash
@@ -108,8 +109,10 @@ def joins_for_projects(basetables, external_info):
         "automationDetails" : automationDetails,
     }, index=[0])

-    # Force all column types to ensure appropriate formatting
-    res1 = res.astype(ScanTablesTypes.projects).reset_index(drop=True)
+    #
+    # - Now (not before), "creation_date" needs type numpy.dtype('datetime64[ns]')
+    # - Force all column types to ensure appropriate formatting
+    res1 = normalize_dataframe_types(res, ScanTablesTypes.projects)
     #
     return res1

@@ -144,9 +147,33 @@ def joins_for_scans(basetables, external_info, scantables, sarif_type, timestamp
         "rules_count" : len(b.rules['id'].unique()),
     },index=[0])
     # Force all column types to ensure correct writing and type checks on reading.
-    res1 = res.astype(ScanTablesTypes.scans).reset_index(drop=True)
+    res1 = normalize_dataframe_types(res, ScanTablesTypes.scans)
     return res1

+
+def normalize_dataframe_types(df: pd.DataFrame, type_map: dict) -> pd.DataFrame:
+    """
+    Normalize dtypes in a DataFrame according to a given type map.
+
+    - Converts any ambiguous datetime64 types (e.g. 'M', '<M8') to 'datetime64[ns]'
+    - Coerces corresponding columns with pd.to_datetime()
+    - Returns a new DataFrame with types enforced and index reset
+    """
+    fixed_types = dict(type_map)  # shallow copy to avoid mutating globals
+
+    for col, dtype in fixed_types.items():
+        dtype_str = str(dtype)
+
+        # Normalize datetime-like dtypes
+        if dtype_str.startswith("datetime64") or dtype_str.startswith("<M8") or dtype_str == "M8":
+            fixed_types[col] = np.dtype("datetime64[ns]")
+            if col in df.columns:
+                df[col] = pd.to_datetime(df[col], errors="coerce", utc=False)
+
+    # Enforce all column types consistently
+    df1 = df.astype(fixed_types).reset_index(drop=True)
+    return df1
+
 #
 # Results table
 #
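
Note: a hedged standalone sketch of the dtype-map rewrite that normalize_dataframe_types performs; the column names and type map below are fabricated for illustration, not the real ScanTablesTypes fields:

```python
import numpy as np
import pandas as pd

# Illustrative map: "creation_date" is still declared with the unit-less 'M8'.
type_map = {"id": np.dtype("int64"), "creation_date": np.dtype("M8")}
df = pd.DataFrame({"id": [1], "creation_date": ["2025-10-18"]})

fixed_types = dict(type_map)
for col, dtype in fixed_types.items():
    dtype_str = str(dtype)
    if dtype_str.startswith("datetime64") or dtype_str.startswith("<M8") or dtype_str == "M8":
        fixed_types[col] = np.dtype("datetime64[ns]")
        df[col] = pd.to_datetime(df[col], errors="coerce")

print(df.astype(fixed_types).reset_index(drop=True).dtypes)
# id                        int64
# creation_date    datetime64[ns]
```
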
@@ -175,28 +202,66 @@ def joins_for_results(basetables, external_info):
     res = tables[0]

     # Force all column types to ensure appropriate formatting
-    res1 = res.astype(ScanTablesTypes.results).reset_index(drop=True)
+    res1 = normalize_dataframe_types(res, ScanTablesTypes.results)
     return res1

-#id as primary key
+def _lookup_rule_value(basetable, rule_id, column_name, join_tags=False):
+    """
+    Look up a value in basetable.rules by id == rule_id.
+    If join_tags=True, concatenate all tag_text values with '_'.
+    """
+    df = basetable.rules
+
+    # Defensive check: avoid duplicate 'id' rows or missing entries
+    match = df.loc[df["id"] == rule_id, column_name]
+
+    if match.empty:
+        return None
+
+    if join_tags:
+        return match.str.cat(sep="_")
+
+    # For scalar columns, pick first entry safely
+    return match.head(1).item()
+
+# id as primary key
 def _populate_from_rule_table_code_flow_tag_text(basetable, flowtable):
-    val = flowtable.rule_id.values[0]
-    return basetable.rules.query("id == @val")["tag_text"].str.cat(sep='_')
+    return _lookup_rule_value(basetable, flowtable.rule_id.values[0], "tag_text", join_tags=True)

-#id as primary key
+# id as primary key
 def _populate_from_rule_table_tag_text(basetable, i):
-    val = basetable.kind_problem.rule_id[i]
-    return basetable.rules.query("id == @val")["tag_text"].str.cat(sep='_')
+    return _lookup_rule_value(basetable, basetable.kind_problem.rule_id[i], "tag_text", join_tags=True)

-#id as primary key
+# id as primary key
 def _populate_from_rule_table(column_name, basetable, i):
-    val = basetable.kind_problem.rule_id[i]
-    return basetable.rules.query("id == @val")[column_name].head(1).item()
+    return _lookup_rule_value(basetable, basetable.kind_problem.rule_id[i], column_name)

-#id as primary key
+# id as primary key
 def _populate_from_rule_table_code_flow(column_name, basetable, flowtable):
-    val = flowtable.rule_id.values[0]
-    return basetable.rules.query("id == @val")[column_name].head(1).item()
+    return _lookup_rule_value(basetable, flowtable.rule_id.values[0], column_name)
+
+# #id as primary key
+# def _populate_from_rule_table_code_flow_tag_text(basetable, flowtable):
+#     val = flowtable.rule_id.values[0]
+#     return basetable.rules.query("id == @val")["tag_text"].str.cat(sep='_')
+
+# #id as primary key
+# def _populate_from_rule_table_tag_text(basetable, i):
+#     val = basetable.kind_problem.rule_id[i]
+#     return basetable.rules.query("id == @val")["tag_text"].str.cat(sep='_')
+
+# #id as primary key
+# def _populate_from_rule_table(column_name, basetable, i):
+#     val = basetable.kind_problem.rule_id[i]
+#     # return basetable.rules.query("id == @val")[column_name].head(1).item()
+#     return basetable.rules.loc[basetable.rules["id"] == val, column_name].head(1).item()
+
+# #id as primary key
+# def _populate_from_rule_table_code_flow(column_name, basetable, flowtable):
+#     val = flowtable.rule_id.values[0]
+#     # return basetable.rules.query("id == @val")[column_name].head(1).item()
+#     return basetable.rules.loc[basetable.rules["id"] == val, column_name].head(1).item()

 def _results_from_kind_problem(basetables, external_info):
     b = basetables; e = external_info
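
Note: a hedged toy run of the new _lookup_rule_value helper; the rules table below is fabricated, and SimpleNamespace stands in for the real basetable object (assumed only to expose .rules as a DataFrame):

```python
import pandas as pd
from types import SimpleNamespace

def _lookup_rule_value(basetable, rule_id, column_name, join_tags=False):
    match = basetable.rules.loc[basetable.rules["id"] == rule_id, column_name]
    if match.empty:
        return None
    if join_tags:
        return match.str.cat(sep="_")
    return match.head(1).item()

# Fabricated rules table: one rule id with two tag rows.
rules = pd.DataFrame({
    "id": ["cpp/sql-injection", "cpp/sql-injection"],
    "tag_text": ["security", "external/cwe/cwe-089"],
})
basetable = SimpleNamespace(rules=rules)

print(_lookup_rule_value(basetable, "cpp/sql-injection", "tag_text", join_tags=True))
# -> security_external/cwe/cwe-089
print(_lookup_rule_value(basetable, "missing-rule", "tag_text"))
# -> None
```
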