diff --git a/bin/sarif-extract-scans b/bin/sarif-extract-scans index adf58b9..882faa0 100755 --- a/bin/sarif-extract-scans +++ b/bin/sarif-extract-scans @@ -50,8 +50,26 @@ parser.add_argument('-f','--input-signature', metavar='input-signature', type=st 'Options: LGTM, CLI\n' 'If current represented signatures are not sufficient, view signature_single.py for how to support further signatures.' ' Default: "%(default)s"') + +parser.add_argument("-d", "--debug", action="store_true", + help="Enter the pdb post-mortem debugger on an uncaught exception") + args = parser.parse_args() + +import sys, pdb, traceback + +def debug_excepthook(exc_type, exc_value, tb): + """Print the traceback, then open pdb post-mortem on it.""" + traceback.print_exception(exc_type, exc_value, tb) + print("\nEntering post-mortem debugger...\n") + pdb.post_mortem(tb) + +# Install the post-mortem hook only when --debug was given +if args.debug: + sys.excepthook = debug_excepthook + + if args.input_signature not in ["LGTM","CLI"]: print("Unsupported sarif signature requested.") print("Use one of [LGTM, CLI].") diff --git a/bin/sarif-extract-scans-runner b/bin/sarif-extract-scans-runner index 04e3965..bd9fc03 100755 --- a/bin/sarif-extract-scans-runner +++ b/bin/sarif-extract-scans-runner @@ -237,10 +237,12 @@ for path_timestamp in paths: timestamp_options = ['--with-timestamps'] else: timestamp_options = [] - runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, - csv_outfile, "-f", args.input_signature, - *timestamp_options], + # Forward --with-timestamps (set above) to the extractor + runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, + output_dir, csv_outfile, "-f", + args.input_signature, *timestamp_options], capture_output=True, text=True) + if runstats.returncode == 0: print("{:6} {}".format("OK", path)) if use_successful_runs: @@ -250,10 +252,25 @@ for path_timestamp in paths: # log error with open(scan_log_file, 'w') as fp: fp.write(runstats.stderr) - # report only tail - print("{:6} {}".format("", "Error tail: ")) - for t1 in runstats.stderr.split('\n')[-6:-1]: - print("{:6} {}".format("", t1)) + + # show command for manual re-run + cmd = [ +
"sarif-extract-scans", + scan_spec_file, + output_dir, + csv_outfile, + "-f", args.input_signature, *timestamp_options, + ] + print("{:6} {}".format("", "Command was:")) + print("{:6} {}".format("", " ".join(cmd))) + + + # report only tail of stderr + print("{:6} {}".format("", "Error tail:")) + if runstats.stderr: + for line in runstats.stderr.splitlines()[-6:]: + print("{:6} {}".format("", line)) + if use_successful_runs: with open(args.successful_runs, 'wb') as outfile: diff --git a/notes/README.org b/notes/README.org index 09b581a..09a6018 100644 --- a/notes/README.org +++ b/notes/README.org @@ -205,9 +205,9 @@ #+BEGIN_SRC sh :session shared :results output :eval never-export cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection sarif-extract-scans \ - sqlidb-1.1.sarif.scanspec \ - sqlidb-1.1.sarif.scantables \ - sqlidb-1.1.sarif.csv \ + sqlidb-1.sarif.scanspec \ + sqlidb-1.sarif.scantables \ + sqlidb-1.sarif.csv \ -f CLI #+END_SRC diff --git a/notes/quickstart.org b/notes/quickstart.org new file mode 100644 index 0000000..58eeba0 --- /dev/null +++ b/notes/quickstart.org @@ -0,0 +1,67 @@ +* sarif-cli quickstart + Set up the virtual environment and install the packages: + #+BEGIN_SRC sh + cd ~/work-gh/sarif-cli/ + + # set up virtual environment + python3 -m venv .venv + . .venv/bin/activate + + # Use requirementsDEV.txt + python -m pip install -r requirementsDEV.txt + + # install scripts + pip install -e . + + # force symlinks for development + rm -f "$VIRTUAL_ENV/bin/sarif-"* + ln -sf "$PWD/bin/sarif-"* "$VIRTUAL_ENV/bin/" + + #+END_SRC + + Run SARIF extraction for one test file and inspect results. + This assumes you are in the above virtual environment where all =sarif-*= tools + are on =$PATH=. + + #+BEGIN_SRC sh + cd ~/work-gh/sarif-cli/data/codeql-dataflow-sql-injection + + # --------------------------------------------------------------------- + # 1.
Set base name of the original SARIF file (without extension) + # --------------------------------------------------------------------- + orig="sqlidb-1" + + # --------------------------------------------------------------------- + # 2. Remove any stale output from previous runs + # --------------------------------------------------------------------- + rm -fR -- "${orig}.1.sarif."* + + # --------------------------------------------------------------------- + # 3. Ensure versionControlProvenance field is present + # --------------------------------------------------------------------- + sarif-insert-vcp "${orig}.sarif" > "${orig}.1.sarif" + + # --------------------------------------------------------------------- + # 4. Run the converter (CLI input signature) + # - Logs are written only if errors occur. + # --------------------------------------------------------------------- + sarif-extract-scans-runner --input-signature CLI - > /dev/null < +** DONE + CLOSED: [2025-10-18 Sat 22:34] + + - State "DONE" from "NEXT" [2025-10-18 Sat 22:34] + #+BEGIN_SRC text + ~/work-gh/sarif-cli/data/codeql-dataflow-sql-injection]$ + 1:$ bat sqlidb-1.sarif.scanspec sqlidb-1.sarif.scantables sqlidb-1.sarif.csv + ───────┬────────────────────────────────────────────────────────────────────────────────────────────────── + │ File: sqlidb-1.sarif.scanspec + ───────┼────────────────────────────────────────────────────────────────────────────────────────────────── + 1 │ {"scan_id": 12314655876769447717, "sarif_file_name": "sqlidb-1.sarif"} + ───────┴────────────────────────────────────────────────────────────────────────────────────────────────── + [bat error]: 'sqlidb-1.sarif.scantables' is a directory. 
+ ───────┬────────────────────────────────────────────────────────────────────────────────────────────────── + │ File: sqlidb-1.sarif.csv + ───────┼────────────────────────────────────────────────────────────────────────────────────────────────── + 1 │ sarif_file,level,levelcode,message,extra_info + 2 │ sqlidb-1.sarif,WARNING,2,Input sarif is missing neccesary properties.,"Missing: {'newlineSequence + │ s', 'versionControlProvenance'}, " + ───────┴────────────────────────────────────────────────────────────────────────────────────────────────── + (.venv-m325) (base) [hohn@m325 ~/work-gh/sarif-cli/data/codeql-dataflow-sql-injection]$ + #+END_SRC + + sarif_file,level,levelcode,message,extra_info + sqlidb-1.sarif,WARNING,2,Input sarif is missing neccesary properties.,"Missing: + {'newlineSequences', 'versionControlProvenance'} + + see + + File: ./bin/sarif-insert-vcp + 2 11 # Add the versionControlProvenance key to a SARIF file + 9 6 | ( .versionControlProvenance |= + + File: ./scripts/test-vcp.sh + 21 15 #* Insert versionControlProvenance + + + o The CLI sarif **MUST** contain one additional property `versionControlProvenance` - which needs to look like: + ``` + "versionControlProvenance": [ + { + "repositoryUri": "https://github.com/testorg/testrepo.git", + "revisionId": "testsha" + } + ] + ``` + + The script + + bin/sarif-insert-vcp + [[file:~/work-gh/sarif-cli/bin/sarif-insert-vcp::uri=vcp-no-uri]] + + will add that entry to a SARIF file. + + + Also, + ./sarif_cli/signature.py:308: # Ensure newlineSequences is present when versionControlProvenance is + ./sarif_cli/signature.py:309: full_elem['newlineSequences'] = elem.get('newlineSequences', dummy_newlineSequences) + + So: + - adding versionControlProvenance first will add newlineSequences later also + + +** TODO sarif-cli type error + #+BEGIN_SRC text + ~/work-gh/sarif-cli/data/codeql-dataflow-sql-injection]$ + 0:$ less sqlidb-1.1.sarif.scanlog + + ... 
+ File "/Users/hohn/work-gh/sarif-cli/.venv-m325/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py", line 734, in astype + raise TypeError( + TypeError: Casting to unit-less dtype 'datetime64' is not supported. Pass e.g. 'datetime64[ns]' instead. + + #+END_SRC diff --git a/requirementsDEV.txt b/requirementsDEV.txt index a442d69..2521640 100644 --- a/requirementsDEV.txt +++ b/requirementsDEV.txt @@ -33,9 +33,9 @@ nbconvert==6.4.4 nbformat==5.2.0 nest-asyncio==1.5.4 notebook==6.4.10 -numpy==1.22.3 +numpy packaging==21.3 -pandas==1.4.1 +pandas pandocfilters==1.5.0 parso==0.8.3 pexpect==4.8.0 @@ -52,7 +52,7 @@ pyrsistent==0.18.1 python-dateutil==2.8.2 pytz==2021.3 PyYAML==6.0 -pyzmq==22.3.0 +pyzmq qtconsole==5.2.2 QtPy==2.0.1 Send2Trash==1.8.0 diff --git a/sarif_cli/scan_tables.py b/sarif_cli/scan_tables.py index 4566424..4038606 100644 --- a/sarif_cli/scan_tables.py +++ b/sarif_cli/scan_tables.py @@ -5,6 +5,7 @@ from . import snowflake_id import logging import numpy +import numpy as np import pandas as pd import re from sarif_cli import hash @@ -108,8 +109,10 @@ def joins_for_projects(basetables, external_info): "automationDetails" : automationDetails, }, index=[0]) - # Force all column types to ensure appropriate formatting - res1 = res.astype(ScanTablesTypes.projects).reset_index(drop=True) + # + # - Now (not before), "creation_date" needs type numpy.dtype('datetime64[ns]') + # - Force all column types to ensure appropriate formatting + res1 = normalize_dataframe_types(res, ScanTablesTypes.projects) # return res1 @@ -144,9 +147,33 @@ def joins_for_scans(basetables, external_info, scantables, sarif_type, timestamp "rules_count" : len(b.rules['id'].unique()), },index=[0]) # Force all column types to ensure correct writing and type checks on reading. 
- res1 = res.astype(ScanTablesTypes.scans).reset_index(drop=True) + res1 = normalize_dataframe_types(res, ScanTablesTypes.scans) return res1 + +def normalize_dataframe_types(df: pd.DataFrame, type_map: dict) -> pd.DataFrame: + """ + Normalize dtypes in a DataFrame according to a given type map. + + - Converts any ambiguous datetime64 types (e.g. 'M', '