wip: debug and get automationDetails into CSV output

This commit is contained in:
Michael Hohn
2023-07-12 17:04:23 -07:00
committed by =Michael Hohn
parent 742392338e
commit 68b43e0514
6 changed files with 267 additions and 24 deletions

View File

@@ -167,6 +167,3 @@ head -4 sqlidb-1.1.sarif.csv
#* Check CSV output
ls -la sqlidb-1.1*
find sqlidb-1.1.sarif.scantables -print

View File

@@ -2,11 +2,11 @@
#+OPTIONS: org-confirm-babel-evaluate:nil
#+LANGUAGE: en
#+TEXT:
#+OPTIONS: ^:{} H:2 num:t \n:nil @:t ::t |:t ^:nil f:t *:t TeX:t LaTeX:t skip:nil p:nil
#+OPTIONS: ^:{} H:3 num:t \n:nil @:t ::t |:t ^:nil f:t *:t TeX:t LaTeX:t skip:nil p:nil
#+OPTIONS: toc:nil
#+HTML_HEAD: <link rel="stylesheet" type="text/css" href="./l3style.css"/>
#+HTML: <div id="toc">
#+TOC: headlines 2 insert TOC here, with two headline levels
#+TOC: headlines 3 insert TOC here, with three headline levels
#+HTML: </div>
#
#+HTML: <div id="org-content">
@@ -44,27 +44,258 @@
#+END_SRC
** The automationDetails.id
** Debugging the absence of automationDetails.id
The =automationDetails.id= entry is produced by CodeQL when using the
=--sarif-category= flag.
Setting up the prerequisites for tracing its flow through the tools is started in
[[../data/build-multiple-sarifs.sh]]
#+BEGIN_SRC sh :session shared :results output
cd ~/local/sarif-cli/ && ag -l automationDetails |cat
For testing, the following is injected into =sqlidb-1.1.sarif=.
#+BEGIN_SRC text
: '
"automationDetails" : {
"id" : "mast-issue/"
},
'
#+END_SRC
*** Add repl as appropriate, then examine.
Make sure the input is correct
#+BEGIN_SRC sh :session shared :results output :eval never-export
cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection
grep -A2 automationDetails sqlidb-1.1.sarif
#+END_SRC
#+RESULTS:
: notes/README.org
: notes/README.html
: scripts/table-tests.sh
: sarif_cli/signature_single_CLI.py
: sarif_cli/table_joins_CLI.py
: sarif_cli/scan_tables.py
: sarif_cli/signature.py
:
: hohn@gh-hohn ~/local/sarif-cli
#+HTML: </div>
#+RESULTS:
: hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection
: "automationDetails" : {
: "id" : "mast-issue/"
: },
:
: hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection
*** Create the CSV
#+BEGIN_SRC sh :session shared :results output :eval never-export
source ~/local/sarif-cli/.venv/bin/activate
cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection
sarif-extract-scans-runner --input-signature CLI - > /dev/null <<EOF
sqlidb-1.1.sarif
EOF
#+END_SRC
#+RESULTS:
#+begin_example
hohn@gh-hohn ~/local/sarif-cli/notes
(.venv)
hohn@gh-hohn ~/local/sarif-cli/notes
(.venv)
hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection
> > (.venv)
hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection
#+end_example
#+BEGIN_SRC sh :session shared :results output :eval never-export
cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection
ls -la sqlidb-1.1*
find sqlidb-1.1.sarif.scantables -print
#+END_SRC
#+RESULTS:
#+begin_example
hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection
-rw-r--r-- 1 hohn staff 8.2K Jul 11 19:25 sqlidb-1.1.sarif
-rw-r--r-- 1 hohn staff 326 Jul 12 16:39 sqlidb-1.1.sarif.csv
-rw-r--r-- 1 hohn staff 72 Jul 12 16:39 sqlidb-1.1.sarif.scanspec
sqlidb-1.1.sarif.scantables:
total 16K
drwxr-xr-x 6 hohn staff 192 Jul 12 16:39 ./
drwxr-xr-x 43 hohn staff 1.4K Jul 12 16:39 ../
-rw-r--r-- 1 hohn staff 622 Jul 12 16:39 codeflows.csv
-rw-r--r-- 1 hohn staff 165 Jul 12 16:39 projects.csv
-rw-r--r-- 1 hohn staff 589 Jul 12 16:39 results.csv
-rw-r--r-- 1 hohn staff 343 Jul 12 16:39 scans.csv
(.venv)
hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection
sqlidb-1.1.sarif.scantables
sqlidb-1.1.sarif.scantables/codeflows.csv
sqlidb-1.1.sarif.scantables/scans.csv
sqlidb-1.1.sarif.scantables/results.csv
sqlidb-1.1.sarif.scantables/projects.csv
(.venv)
hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection
#+end_example
*** Check if =automationDetails= or its value is in output
#+BEGIN_SRC sh :session shared :results output :eval never-export
cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables
ag automationDetails | cat
#+END_SRC
#+RESULTS:
: (.venv)
: hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables
: projects.csv:1:"id","project_name","creation_date","repo_url","primary_language","languages_analyzed","automationDetails"
: (.venv)
: hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables
#+RESULTS:
: (.venv)
: hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables
: (.venv)
: hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables
#+RESULTS:
: (.venv)
: hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables
: (.venv)
: hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables
See if the magic value is present
#+BEGIN_SRC sh :session shared :results output :eval never-export
cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables
ag mast-issue |cat
#+END_SRC
#+RESULTS:
: (.venv)
: hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables
: projects.csv:2:490227419655596076,"vcp-no-uri","1970-01-01","vcp-no-uri","unknown","unknown","mast-issue/"
: (.venv)
: hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables
#+RESULTS:
: (.venv)
: hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables
: (.venv)
: hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables
*** Nothing is in the output, so trace execution to see where it's dropped
#+BEGIN_SRC sh :session shared :results output :eval never-export
cd ~/local/sarif-cli/notes && ag -l automationDetails ../sarif_cli |cat
#+END_SRC
#+RESULTS:
: ../sarif_cli/scan_tables.py
: ../sarif_cli/signature_single_CLI.py
: ../sarif_cli/table_joins_CLI.py
: ../sarif_cli/signature.py
: (.venv)
: hohn@gh-hohn ~/local/sarif-cli/notes
*** Trace the call chain
Trace the call chain to one of
: ../sarif_cli/scan_tables.py
: ../sarif_cli/table_joins_CLI.py
: ../sarif_cli/signature.py
Entry is
#+BEGIN_SRC sh :session shared :results output :eval never-export
sarif-extract-scans-runner --input-signature CLI - > /dev/null <<EOF
sqlidb-1.1.sarif
EOF
#+END_SRC
1. sarif-extract-scans-runner
1. calls [[file:~/local/sarif-cli/bin/sarif-extract-scans-runner::runstats = subprocess.run(\['sarif-extract-scans', scan_spec_file, output_dir, csv_outfile, "-f", args.input_signature\],]]
The following will drop into the inserted repls:
#+BEGIN_SRC sh :session shared :results output :eval never-export
cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection
sarif-extract-scans \
sqlidb-1.1.sarif.scanspec \
sqlidb-1.1.sarif.scantables \
sqlidb-1.1.sarif.csv \
-f CLI
#+END_SRC
1. calls [[file:~/local/sarif-cli/bin/sarif-extract-scans::sarif_struct = load(scan_spec\['sarif_file_name'\])]]
2. uses [[file:~/local/sarif-cli/bin/sarif-extract-scans::location_info = tj.joins_for_location_info(tgraph)]]
*** Run using embedded repls
The following will drop into the inserted repls:
#+BEGIN_SRC sh :session shared :results output :eval never-export
cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection
sarif-extract-scans \
sqlidb-1.1.sarif.scanspec \
sqlidb-1.1.sarif.scantables \
sqlidb-1.1.sarif.csv \
-f CLI
#+END_SRC
The line
: .rename(columns={"id": "automationDetails"})
has the right effect:
#+BEGIN_SRC text
In [3]: project_df_temp1.T
Out[3]:
0
struct_id_5521 4796854592
$schema https://json.schemastore.org/sarif-2.1.0.json
version_5521 2.1.0
value_index_1273 0
artifacts 4797197888
columnKind utf16CodeUnits
newlineSequences 4797197568
properties 4797244480
results 4797198208
tool 4797244672
versionControlProvenance 4797218944
automationDetails mast-issue/
#+END_SRC
The line
: extra = b.project.automationDetails[0]
also works:
#+BEGIN_SRC text
In [1]: extra
Out[1]: 'mast-issue/'
#+END_SRC
but
: extra
is only used in
: e.project_id = hash.hash_unique((repoUri+extra).encode())
when
#+BEGIN_SRC text
In [5]: "repositoryUri" in b.project
Out[5]: True
#+END_SRC
For reference:
#+BEGIN_SRC text
In [8]: b.project.automationDetails
Out[8]:
0 mast-issue/
Name: automationDetails, dtype: object
#+END_SRC
This is in joins_for_projects, called from
: scantabs.projects = st.joins_for_projects(bt, external_info)
Add
: "automationDetails" : extra,
to the
: # Projects table
And repeat the [[*Check if =automationDetails= or its value is in output][Check if =automationDetails= or its value is in output]]
Still missing. Must be dropped between dataframe creation and output.
Use project_name to search.
: class ScanTablesTypes:
has no entry for
: automationDetails
Add
: "automationDetails" : pd.StringDtype(),
Similar for
: File: sarif_cli/columns.py
And repeat [[*Run using embedded repls][Run using embedded repls]], then
[[*Check if =automationDetails= or its value is in output][Check if =automationDetails= or its value is in output]]
* Footnotes
#+HTML: </div>

View File

@@ -46,7 +46,8 @@ columns = {
"creation_date",
"repo_url" ,
"primary_language" ,
"languages_analyzed"
"languages_analyzed",
"automationDetails",
],
"codeflows" : [
"codeflow_id",
@@ -62,4 +63,4 @@ columns = {
"uriBaseId",
"message"
]
}
}

View File

@@ -70,6 +70,7 @@ class ScanTablesTypes:
"repo_url" : pd.StringDtype(),
"primary_language" : pd.StringDtype(),
"languages_analyzed" : pd.StringDtype(),
"automationDetails" : pd.StringDtype(),
}
#
@@ -98,11 +99,16 @@ def joins_for_projects(basetables, external_info):
"creation_date" : pd.Timestamp(0.0, unit='s'), # TODO: external info
"repo_url" : repoUri,
"primary_language" : b.project['semmle.sourceLanguage'][0],
"languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage']))
"languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage'])),
"automationDetails" : extra,
}, index=[0])
# Force all column types to ensure appropriate formatting
res1 = res.astype(ScanTablesTypes.projects).reset_index(drop=True)
# XX: automationDetails?
import IPython
IPython.embed(header="spot 11")
#
return res1
#

View File

@@ -256,7 +256,11 @@ def fillsig_dict(args, elem, context):
if 'results' in elem.keys() and not 'automationDetails' in elem.keys():
#want this to be blank if not present- ie no submodule info added/no sarif-category used
full_elem['automationDetails'] = {'id' : ""}
full_elem['automationDetails'] = {'id' : "no-value-for-ad"}
# XX: automationDetails?
import IPython
IPython.embed(header="spot 2")
#
if {'locations', 'message', 'partialFingerprints', 'ruleId',
'ruleIndex'}.issubset(elem.keys()):

View File

@@ -336,6 +336,10 @@ def joins_for_project_single(tgraph):
.drop(columns=['automationDetails', 'struct_id'])
.rename(columns={"id": "automationDetails"}))
#
# XX: automationDetails?
import IPython
IPython.embed(header="spot 3")
#
#newlines there or not - handle
if 'newlineSequences' in project_df_temp1:
project_df_temp2 = project_df_temp1.drop(columns=['newlineSequences'])