diff --git a/README.md b/README.md index 4928713..8a0a188 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,10 @@ The CLI versions used against development of the CLI support were: 2.6.3, 2.9.4, and 2.11.4. + Minimal tests are also run against the versions in + [this build script](./build-multiple-codeql-versions.sh). Currently, those are + 2.9.4, 2.12.7, 2.13.5, 2.14.0. + The CLI sarif **MUST** contain one additional property `versionControlProvenance` - which needs to look like: ``` "versionControlProvenance": [ @@ -25,6 +29,12 @@ ] ``` + The script + + bin/sarif-insert-vcp + + will add that entry to a SARIF file. + # Test Setup This repository includes some test data (in `data`) and uses =git lfs= for storing those test files; installation steps are at [[https://git-lfs.github.com][git-lfs]]; on a mac with homebrew, install it via diff --git a/bin/sarif-insert-vcp b/bin/sarif-insert-vcp new file mode 100755 index 0000000..52fc772 --- /dev/null +++ b/bin/sarif-insert-vcp @@ -0,0 +1,19 @@ +#!/bin/sh +# Add the versionControlProvenance key to a SARIF file +# usage: $0 file +uri=vcp-no-uri +revid=vcp-no-revid +jq --arg uri "$uri" --arg revid "$revid" ' {"$schema" : ."$schema", + "version" : .version, + "runs" : [ .runs | .[] +| ( .versionControlProvenance |= +[ + { + "repositoryUri": $uri, + "revisionId": $revid + } +] +) ] +} +' "$@" + diff --git a/build-multiple-codeql-versions.sh b/build-multiple-codeql-versions.sh new file mode 100644 index 0000000..8f0872e --- /dev/null +++ b/build-multiple-codeql-versions.sh @@ -0,0 +1,136 @@ +#!/bin/bash -e +#* Following are the steps needed to build a codeql db using different versions of +# the codeql cli. +# +# Some files from prior runs are found in ./data/codeql-dataflow-sql-injection/ +# +usage=" +This script's purpose is to run the sarif-cli against SARIF files +produced by different versions of the codeql cli. + +This script is intended for interactive use only. Take one block at a time, +run it, and check results as you go.
+ +A (subset) of this script may be automated in the future. +" + +echo "$0: Interactive use only" +echo "$usage" +exit 1 + +#* Use virtual environment. See README for setup. +source ~/local/sarif-cli/.venv/bin/activate + +#* What can we use? +gh codeql list-versions + +#* History +open https://github.com/github/codeql-cli-binaries/blob/HEAD/CHANGELOG.md + +#* Get repo +cd ~/local/sarif-cli +git clone git@github.com:hohn/codeql-dataflow-sql-injection.git +cd codeql-dataflow-sql-injection/ + +#* Choose +v2.14.0 +v2.13.5 +v2.13.4 +v2.13.3 +v2.13.1 +v2.13.0 +v2.12.7 +v2.12.6 +v2.11.6 +v2.10.5 +v2.9.4 + +CLI_VERSION=v2.9.4 +CLI_VERSION=v2.12.7 +CLI_VERSION=v2.13.5 +CLI_VERSION=v2.14.0 +gh codeql set-version $CLI_VERSION + +#* Build vanilla DB +cd ~/local/sarif-cli/codeql-dataflow-sql-injection +rm -fR sqlidb +codeql database create --language=cpp -s . -j 8 -v sqlidb --command='./build.sh' +cp -r sqlidb sqlidb-$CLI_VERSION + +#* Pack compatibility with CLI +function codeql-complib() { + if [ -z "$1" ]; then + echo "Usage: codeql-complib " + return 1 + fi + curl --silent https://raw.githubusercontent.com/github/codeql/codeql-cli/v$(codeql version --format=json | jq -r .version)/$1/ql/lib/qlpack.yml | grep version | cut -d':' -f2 | sed 's/^[ ]*//' +} + +# Create the qlpack file using commands: +cd ~/local/sarif-cli +# Bug: drops the codeql- prefix +rm -fR dataflow-sql-injection +codeql pack init codeql-dataflow-sql-injection +cp -f dataflow-sql-injection/qlpack.yml codeql-dataflow-sql-injection/ +# Add correct library dependency +codeql pack add --dir=codeql-dataflow-sql-injection codeql/cpp-all@"$(codeql-complib cpp)" +cat codeql-dataflow-sql-injection/qlpack.yml + +#* Install packs +cd ~/local/sarif-cli/codeql-dataflow-sql-injection +rm -f *lock* +codeql pack install + +#* Run the analyze command with options +# +cd ~/local/sarif-cli/codeql-dataflow-sql-injection +codeql database analyze \ + -v \ + --sarif-category santa-chap \ + --ram=16000 \ + -j8 \ + 
--format=sarif-latest \ + --output sqlidb-$CLI_VERSION.sarif \ + -- \ + sqlidb-$CLI_VERSION \ + SqlInjection.ql + +# Verify cli version in SARIF output +SAVER=`jq -r '.runs |.[] |.tool.driver.semanticVersion ' sqlidb-$CLI_VERSION.sarif` +printf "db %s\ncli %s\n" $SAVER $CLI_VERSION +if [ v$SAVER != $CLI_VERSION ] ; +then + echo "---: codeql version inconsistency" +fi + +# Check sarif-category flag +grep -A2 automationDetails sqlidb-$CLI_VERSION.sarif + +#* Insert versionControlProvenance +cd ~/local/sarif-cli/codeql-dataflow-sql-injection +sarif-insert-vcp sqlidb-$CLI_VERSION.sarif > sqlidb-$CLI_VERSION-1.sarif + +#* Get CSV. +cd ~/local/sarif-cli/codeql-dataflow-sql-injection +sarif-extract-scans-runner --input-signature CLI - > /dev/null <" + return 1 + fi + curl --silent https://raw.githubusercontent.com/github/codeql/codeql-cli/v$(codeql version --format=json | jq -r .version)/$1/ql/lib/qlpack.yml | grep version | cut -d':' -f2 | sed 's/^[ ]*//' +} + +: ' +0:$ codeql-complib cpp +0.2.3 + +Put the version into the qlpack: +... +dependencies: + codeql/cpp-all: ^0.2.3 +... 
+ +Then follow the rest; that is + codeql pack install +followed by + codeql database analyze +without + --additional-packs $HOME/local/codeql-v2.11.6/ \ + + +Or create the qlpack file using commands: + codeql pack init foo + codeql pack add --dir=foo codeql/cpp-all@"$(codeql-complib cpp)" + +' + +#* Install packs +cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection +rm -f *lock* +codeql pack install + +#* Run the analyze command's plain version +cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection + +# Note workaround for using --additional-packs +if false +then + source ../scripts/grab.sh + grab v2.11.6 osx64 $HOME/local + + codeql database analyze \ + -v \ + --ram=14000 \ + -j12 \ + --rerun \ + --format=sarif-latest \ + --additional-packs $HOME/local/codeql-v2.11.6/ \ + --output sqlidb-0.sarif \ + -- \ + sqlidb \ + SqlInjection.ql +fi + +codeql database analyze \ + -v \ + --ram=14000 \ + -j12 \ + --rerun \ + --format=sarif-latest \ + --output sqlidb-0.sarif \ + -- \ + sqlidb \ + SqlInjection.ql + +# This field should not be there: +grep automationDetails sqlidb-0.sarif + +#* Run the analyze command with options +# but don't rerun the analysis. We just want another SARIF file. +# +cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection + +codeql database analyze \ + -v \ + --sarif-category mast-issue \ + --ram=14000 \ + -j12 \ + --format=sarif-latest \ + --output sqlidb-1.sarif \ + -- \ + sqlidb \ + SqlInjection.ql + +# Now it's present: +grep -A2 automationDetails sqlidb-1.sarif +: ' + "automationDetails" : { + "id" : "mast-issue/" + }, +' + +# Follow the installation in sarif-cli/README.md. + +#* Verify versionControlProvenance location +jq '.runs | .[] | .versionControlProvenance' \ + ~/local/sarif-cli/data/treeio/test_set_1.sarif + +#* Insert versionControlProvenance +cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection +sarif-insert-vcp sqlidb-0.sarif > sqlidb-0.1.sarif + +#* Get CSV. 
+cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection +sarif-extract-scans-runner --input-signature CLI - > /dev/null < sqlidb-1.1.sarif + +#* Get CSV. +cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection +sarif-extract-scans-runner --input-signature CLI - > /dev/null < +#+HTML:
+#+TOC: headlines 3 insert TOC here, with three headline levels +#+HTML:
+# +#+HTML:
+ * The notes directory This directory is for notes that may be useful, but aren't complete enough to serve as documentation in their current state. Think of it as staging for [[../docs]]. + Short notes start as sections in this README. They will be moved if separate + files make more sense. + ** The typegraphs The type graph files are derived from a sarif input file, with various options controlling output. @@ -27,3 +43,259 @@ ../../../bin/sarif-to-dot -td -nuf results.sarif | dot -Tpdf > typegraph-tdnuf.pdf #+END_SRC + +** Debugging the absence of automationDetails.id + The =automationDetails.id= entry is produced by CodeQL when using the + =--sarif-category= flag. + + The prerequisites for tracing its flow through the tools are set up in + [[../data/build-multiple-sarifs.sh]] + + For testing the following is injected into =sqlidb-1.1.sarif=. + #+BEGIN_SRC text + : ' + "automationDetails" : { + "id" : "mast-issue/" + }, + ' + + #+END_SRC + +*** Add repl as appropriate, then examine. + Make sure the input is correct + #+BEGIN_SRC sh :session shared :results output :eval never-export + cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection + grep -A2 automationDetails sqlidb-1.1.sarif + #+END_SRC + + #+RESULTS: + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection + : "automationDetails" : { + : "id" : "mast-issue/" + : }, + : + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection + +*** Create the CSV + #+BEGIN_SRC sh :session shared :results output :eval never-export + source ~/local/sarif-cli/.venv/bin/activate + cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection + sarif-extract-scans-runner --input-signature CLI - > /dev/null < > (.venv) + hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection + #+end_example + + #+BEGIN_SRC sh :session shared :results output :eval never-export + cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection + ls -la sqlidb-1.1* + find sqlidb-1.1.sarif.scantables -print + #+END_SRC + + 
#+RESULTS: + #+begin_example + hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection + -rw-r--r-- 1 hohn staff 8.2K Jul 11 19:25 sqlidb-1.1.sarif + -rw-r--r-- 1 hohn staff 326 Jul 12 16:39 sqlidb-1.1.sarif.csv + -rw-r--r-- 1 hohn staff 72 Jul 12 16:39 sqlidb-1.1.sarif.scanspec + + sqlidb-1.1.sarif.scantables: + total 16K + drwxr-xr-x 6 hohn staff 192 Jul 12 16:39 ./ + drwxr-xr-x 43 hohn staff 1.4K Jul 12 16:39 ../ + -rw-r--r-- 1 hohn staff 622 Jul 12 16:39 codeflows.csv + -rw-r--r-- 1 hohn staff 165 Jul 12 16:39 projects.csv + -rw-r--r-- 1 hohn staff 589 Jul 12 16:39 results.csv + -rw-r--r-- 1 hohn staff 343 Jul 12 16:39 scans.csv + (.venv) + hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection + sqlidb-1.1.sarif.scantables + sqlidb-1.1.sarif.scantables/codeflows.csv + sqlidb-1.1.sarif.scantables/scans.csv + sqlidb-1.1.sarif.scantables/results.csv + sqlidb-1.1.sarif.scantables/projects.csv + (.venv) + hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection + #+end_example + +*** Check if =automationDetails= or its value is in output + #+BEGIN_SRC sh :session shared :results output :eval never-export + cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + ag automationDetails | cat + #+END_SRC + + #+RESULTS: + : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + : projects.csv:1:"id","project_name","creation_date","repo_url","primary_language","languages_analyzed","automationDetails" + : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + + #+RESULTS: + : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + + #+RESULTS: + : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables 
+ : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + + See if the magic value is present + #+BEGIN_SRC sh :session shared :results output :eval never-export + cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + ag mast-issue |cat + #+END_SRC + + #+RESULTS: + : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + : projects.csv:2:490227419655596076,"vcp-no-uri","1970-01-01","vcp-no-uri","unknown","unknown","mast-issue/" + : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + + #+RESULTS: + : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + +*** Nothing is in the output, so trace execution to see where it's dropped + #+BEGIN_SRC sh :session shared :results output :eval never-export + cd ~/local/sarif-cli/notes && ag -l automationDetails ../sarif_cli |cat + #+END_SRC + + #+RESULTS: + : ../sarif_cli/scan_tables.py + : ../sarif_cli/signature_single_CLI.py + : ../sarif_cli/table_joins_CLI.py + : ../sarif_cli/signature.py + : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/notes + +*** Trace the call chain + Trace the call chain to one of + : ../sarif_cli/scan_tables.py + : ../sarif_cli/table_joins_CLI.py + : ../sarif_cli/signature.py + + Entry is + #+BEGIN_SRC sh :session shared :results output :eval never-export + sarif-extract-scans-runner --input-signature CLI - > /dev/null < + diff --git a/notes/l3style.css b/notes/l3style.css index 9b71bbd..b508a59 100644 --- a/notes/l3style.css +++ b/notes/l3style.css @@ -1,3 +1,7 @@ +:root { + --margin-left: 40%; + --body-width: 60%; +} /* The sum of width and margin percentages must not exceed 100.*/ div#toc { @@ -8,30 +12,33 @@ div#toc { /* OR */ /* use a 
fixed-position toc */ position: fixed; - top: 80px; + top: 8px; left: 0px; /* match toc, org-content, postamble */ - width: 26%; + width: var(--margin-left); margin-right: 1%; margin-left: 1%; + + overflow-y: scroll; + height: calc(100% - 10px); + } div#org-content { float: right; - width: 70%; + width: var(--body-width); /* match toc, org-content, postamble */ - margin-left: 28%; + margin-left: var(--margin-left); } div#postamble { float: right; - width: 70%; + width: var(--body-width); /* match toc, org-content, postamble */ - margin-left: 28%; + margin-left: var(--margin-left); } - p.author { clear: both; font-size: 1em; @@ -107,9 +114,9 @@ h1 { color: #cc8c00; /* padding-top: 5px; */ border-bottom: 2px solid #aaa; - width: 70%; - /* match toc, org-content, postamble */ - margin-left: 28%; /* Align with div#content */ + width: var(--body-width); + /* match toc, org-content, postamble */ + margin-left: var(--margin-left); /* Align with div#content */ } h2 { @@ -167,4 +174,3 @@ td, th { vertical-align: top; border: 1pt solid #ADB9CC; } - diff --git a/sarif_cli/columns.py b/sarif_cli/columns.py index 71d8dda..3e37266 100644 --- a/sarif_cli/columns.py +++ b/sarif_cli/columns.py @@ -46,7 +46,8 @@ columns = { "creation_date", "repo_url" , "primary_language" , - "languages_analyzed" + "languages_analyzed", + # "automationDetails", ], "codeflows" : [ "codeflow_id", @@ -62,4 +63,4 @@ columns = { "uriBaseId", "message" ] -} \ No newline at end of file +} diff --git a/sarif_cli/scan_tables.py b/sarif_cli/scan_tables.py index 0613d82..95cea09 100644 --- a/sarif_cli/scan_tables.py +++ b/sarif_cli/scan_tables.py @@ -70,6 +70,7 @@ class ScanTablesTypes: "repo_url" : pd.StringDtype(), "primary_language" : pd.StringDtype(), "languages_analyzed" : pd.StringDtype(), + # "automationDetails" : pd.StringDtype(), } # @@ -88,21 +89,24 @@ def joins_for_projects(basetables, external_info): # if the sarif does have versionControlProvenance if "repositoryUri" in b.project: repoUri = 
b.project.repositoryUri[0] + project_name = b.project.repositoryUri[0] + "-" + extra e.project_id = hash.hash_unique((repoUri+extra).encode()) else: repoUri = "unknown" - + res = pd.DataFrame(data={ "id" : e.project_id, - "project_name" : repoUri, + "project_name" : project_name, "creation_date" : pd.Timestamp(0.0, unit='s'), # TODO: external info "repo_url" : repoUri, "primary_language" : b.project['semmle.sourceLanguage'][0], - "languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage'])) + "languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage'])), + "automationDetails" : extra, }, index=[0]) # Force all column types to ensure appropriate formatting res1 = res.astype(ScanTablesTypes.projects).reset_index(drop=True) + # return res1 # diff --git a/sarif_cli/signature.py b/sarif_cli/signature.py index c668a1d..82771a5 100644 --- a/sarif_cli/signature.py +++ b/sarif_cli/signature.py @@ -82,6 +82,8 @@ def _signature_list(args, elem, context): if args.typedef_signatures: # Give every unique array a name and use a reference to it as value. 
if signature not in context.sig_to_typedef: + #cannot have leading 0 hashes later in table joins so replace now + #context.sig_to_typedef[signature] = str("Array%04d" % shorthash(signature)).replace("0", "1") context.sig_to_typedef[signature] = "Array%04d" % shorthash(signature) typedef = context.sig_to_typedef[signature] return typedef @@ -252,7 +254,7 @@ def fillsig_dict(args, elem, context): if 'results' in elem.keys() and not 'automationDetails' in elem.keys(): #want this to be blank if not present- ie no submodule info added/no sarif-category used - full_elem['automationDetails'] = {'id' : ""} + full_elem['automationDetails'] = {'id' : "no-value-for-ad"} if {'locations', 'message', 'partialFingerprints', 'ruleId', 'ruleIndex'}.issubset(elem.keys()): diff --git a/sarif_cli/table_joins.py b/sarif_cli/table_joins.py index 41c5faa..7133626 100644 --- a/sarif_cli/table_joins.py +++ b/sarif_cli/table_joins.py @@ -115,7 +115,6 @@ def joins_for_problem(tgraph, af_0350_location): # # Form the message dataframe (@kind problem) via joins # - kind_problem_1 = ( aft(6343) .merge(sft(4055), how="inner", diff --git a/sarif_cli/table_joins_CLI.py b/sarif_cli/table_joins_CLI.py index 94f9af9..3859b3e 100644 --- a/sarif_cli/table_joins_CLI.py +++ b/sarif_cli/table_joins_CLI.py @@ -335,7 +335,7 @@ def joins_for_project_single(tgraph): .merge(sf(1111), how="left", left_on='automationDetails', right_on='struct_id', validate="1:m") .drop(columns=['automationDetails', 'struct_id']) .rename(columns={"id": "automationDetails"})) - # + # #newlines there or not - handle if 'newlineSequences' in project_df_temp1: project_df_temp2 = project_df_temp1.drop(columns=['newlineSequences']) diff --git a/scripts/grab.sh b/scripts/grab.sh new file mode 100644 index 0000000..ad8d0b8 --- /dev/null +++ b/scripts/grab.sh @@ -0,0 +1,40 @@ +# Reference urls: +# https://github.com/github/codeql-cli-binaries/releases/download/v2.8.0/codeql-linux64.zip +# 
+# https://github.com/github/codeql/archive/refs/tags/codeql-cli/v2.8.0.zip +# +# grab -- retrieve and extract codeql cli and library +# Usage: grab version platform prefix +grab() { + version=$1; shift + platform=$1; shift + prefix=$1; shift + mkdir -p "$prefix/codeql-$version" && + cd "$prefix/codeql-$version" || return + + # Get cli + wget "https://github.com/github/codeql-cli-binaries/releases/download/$version/codeql-$platform.zip" + # Get lib + wget "https://github.com/github/codeql/archive/refs/tags/codeql-cli/$version.zip" + # Fix attributes + if [ "$(uname)" = Darwin ] ; then + xattr -c *.zip + fi + # Extract + unzip -q "codeql-$platform.zip" + unzip -q "$version.zip" + # Rename library directory for VS Code + mv "codeql-codeql-cli-$version/" ql + # remove archives? + # rm codeql-$platform.zip + # rm $version.zip +} + +# grab v2.7.6 osx64 $HOME/local +# grab v2.8.3 osx64 $HOME/local +# grab v2.8.4 osx64 $HOME/local + +# grab v2.6.3 linux64 /opt + +# grab v2.6.3 osx64 $HOME/local +# grab v2.4.6 osx64 $HOME/local + diff --git a/scripts/table-tests.sh b/scripts/table-tests.sh index 2d64d31..195686a 100644 --- a/scripts/table-tests.sh +++ b/scripts/table-tests.sh @@ -40,3 +40,12 @@ EOF sarif-aggregate-scans -i1 test-sas-files aggregated.scantables sarif-pad-aggregate aggregated.scantables aggregated.scantables.padded ) + +#* Tests for the automationDetails flag +#** Simple run +# This requires the tool setup, [[file:~/local/sarif-cli/README.md::Tool Setup]] +( cd ../data/codeql-dataflow-sql-injection/ && + sarif-extract-scans-runner - > /dev/null < test-vcp.out 2>&1 +# +# An output sample -- not suitable for automatic testing yet -- is in test-vcp.sample + +#* Two databases, one with and one without +# --sarif-category mast-issue +cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection +ls -la sqlidb-0.sarif sqlidb-1.sarif +grep -A2 automationDetails sqlidb-0.sarif sqlidb-1.sarif + +source ~/local/sarif-cli/.venv/bin/activate + +function get-csv() { + #* Insert 
versionControlProvenance + sarif-insert-vcp $1.sarif > $1.1.sarif + + #* Get CSV. + cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection + sarif-extract-scans-runner --input-signature CLI - > /dev/null <