From 44f1d2f179d5d11406c80a3f896550e5bf764d1a Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Wed, 20 Apr 2022 15:22:20 -0700 Subject: [PATCH] Description of current and upcoming tables and their information sources --- notes/tables-info.py | 89 ++++++++++++++++++++++++++ notes/tables.org | 145 +++++++++++++++++++++++++++---------------- 2 files changed, 179 insertions(+), 55 deletions(-) create mode 100644 notes/tables-info.py diff --git a/notes/tables-info.py b/notes/tables-info.py new file mode 100644 index 0000000..e3838f7 --- /dev/null +++ b/notes/tables-info.py @@ -0,0 +1,89 @@ +# +# Simple utilities to retrieve and view Github API information +# +import urllib3 +import os +import sys +import json +from pprint import pprint +from contextlib import redirect_stdout + +#* Init +header_auth = {'Authorization': 'token %s' % os.environ['GITHUB_TOKEN']} + +http = urllib3.PoolManager() + +owner = 'hohn' +repo = 'tabu-soda' +header_accept = {'Accept' : 'application/vnd.github.v3+json'} +GET = 'GET' + +#* Local utility functions using lexical variables +def gith(verb, path, headers={}): + res = http.request( + verb, + 'https://api.github.com' + path, + headers={**header_auth, **header_accept, **headers} + ) + return res + +def topy(result): + return json.loads(result.data.decode('utf-8')) + +def pathval(result, *path): + v = topy(result) + for p in path: + v = v[p] + print(f'path: {path} value: {v}') + return (path, v) + + +#* GET /repos/{owner}/{repo}/events +r01 = gith(GET, f'/repos/{owner}/{repo}/events') +pathval(r01, 0, 'repo', 'name') +pathval(r01, 0, 'repo', 'url') + +#* GET /repos/{owner}/{repo}/code-scanning/analyses +r02 = gith(GET, f'/repos/{owner}/{repo}/code-scanning/analyses') +topy(r02) +# ? 
'sarif_id': '9df9fbb4-bf4b-11ec-9ca6-b32c61360f89', +

#** GET /repos/{owner}/{repo}/code-scanning/analyses/{analysis_id}, overview only:
_, analysis_id = pathval(r02, 0, 'id')
r02s01 = gith(GET, f'/repos/{owner}/{repo}/code-scanning/analyses/{analysis_id}')
topy(r02s01)
pathval(r02s01, 'commit_sha')
pathval(r02s01, 'created_at')
pathval(r02s01, 'results_count')
pathval(r02s01, 'rules_count')
pathval(r02s01, 'sarif_id')
pathval(r02s01, 'tool', 'name')
pathval(r02s01, 'tool', 'version')

#** GET /repos/{owner}/{repo}/code-scanning/analyses/{analysis_id}, full sarif:
r02s02 = gith(GET, f'/repos/{owner}/{repo}/code-scanning/analyses/{analysis_id}',
               headers = {'Accept': 'application/sarif+json'})

pprint(topy(r02s02), open("r02s02", "w", encoding='utf-8'))
json.dump(topy(r02s02), open("r02s02.json", "w", encoding='utf-8'), indent=4)

#* GET /repos/{owner}/{repo}
r03 = gith(GET, f'/repos/{owner}/{repo}')
topy(r03)
pathval(r03, 'created_at')
pathval(r03, 'full_name')
pathval(r03, 'git_url')
pathval(r03, 'clone_url')
pathval(r03, 'language')

#* POST /repos/{owner}/{repo}/code-scanning/sarifs
# TODO: to be tested...
r04 = gith(POST, f'/repos/{owner}/{repo}/code-scanning/sarifs',
           fields={'commit_sha': 'aa22233',
                   'ref': 'refs/heads/',
                   'sarif': 'gzip < sarif | base64 -w0',
                   'tool_name' : 'codeql',
                   'started_at': 'when the analysis started',
                   },
           headers = {'Accept': 'application/sarif+json'})
 
diff --git a/notes/tables.org b/notes/tables.org
index da0bedd..534976a 100644
--- a/notes/tables.org
+++ b/notes/tables.org
@@ -80,7 +80,6 @@
 #+END_SRC
 
-
 #+BEGIN_SRC text
    ==> kind_problem.csv <==
    results_array_id
@@ -115,7 +114,8 @@
 #+END_SRC
 
-
+ The parts above =$schema= in the =projects.csv= table are ad-hoc and the
+ information for those fields is not yet collected. They can be discarded. 
#+BEGIN_SRC text
    ==> project.csv <==
    creation_date
@@ -178,11 +178,92 @@
    tag_text
 #+END_SRC
 
-* New tables to be exported
-  Possible splits from =project.csv=
-
-  The scan results are the root of the sarif tree, so this is a required base
-  table.
+* Tables or entries to be removed
+  The top of the [Mar-23-2022] =projects.csv= table, enumerated below, is ad-hoc
+  and included in the other tables below; the information for its fields is not
+  yet collected so it can be discarded.
   #+BEGIN_SRC text
+  ==> project-meta.csv <==
+  creation_date
+  primary_language
+  project_name
+  query_commit_id
+  sarif_file_name
+  scan_id
+  scan_start_date
+  scan_stop_date
+  tool_name
+  tool_version
+  #+END_SRC
+
+* New tables to be exported
+  This section enumerates new tables intended for reporting infrastructure.
+
+  Using the github API starting points
+  #+BEGIN_SRC python
+    # Code scanning information
+    # Get the full list:
+    r02 = gith(GET, f'/repos/{owner}/{repo}/code-scanning/analyses')
+
+    # Work with one entry
+    _, analysis_id = pathval(r02, 0, 'id')
+    r02s01 = gith(GET, f'/repos/{owner}/{repo}/code-scanning/analyses/{analysis_id}')
+
+    r02s02 = gith(GET, f'/repos/{owner}/{repo}/code-scanning/analyses/{analysis_id}',
+                   headers = {'Accept': 'application/sarif+json'})
+
+    # Repository information via GET /repos/{owner}/{repo}
+    r03 = gith(GET, f'/repos/{owner}/{repo}')
+  #+END_SRC
+  we can populate the =project.csv= and =scans.csv= tables:
+  #+BEGIN_SRC sql
+  ==> project.csv <==
+  id
+  project_name          -- pathval(r03, 'full_name')
+  creation_date         -- pathval(r03, 'created_at')
+  owner                 -- r03
+  repo                  -- r03 = gith(GET, f'/repos/{owner}/{repo}')
+  repository_url        -- pathval(r03, 'clone_url')
+  primary_language      -- pathval(r03, 'language')
+  languages_analyzed    --
+  #+END_SRC
+  #+BEGIN_SRC sql
+  ==> scans.csv <==
+  id                    --
+  commit_id             -- pathval(r02s01, 'commit_sha')
+  project_id            -- project.id
+  db_create_start       -- pathval(r02s01, 'created_at')
+  db_create_stop
+  scan_start_date
+  scan_stop_date
+ 
tool_name -- pathval(r02s01, 'tool', 'name') + tool_version -- pathval(r02s01, 'tool', 'version') + tool_query_commit_id -- pathval(r02, 0, 'tool', 'version') is sufficient + sarif_content -- r02s02 + sarif_file_name -- used on upload + sarif_id -- pathval(r02s01, 'sarif_id') + results_count -- pathval(r02s01, 'results_count') + rules_count -- pathval(r02s01, 'rules_count') + #+END_SRC + + The sarif upload from codeql analysis to github uses the following API and + parameters which naturally are the minimal parameters needed to run the + analysis. + #+BEGIN_SRC python + # untested + r04 = gith(POST, f'/repos/{owner}/{repo}/code-scanning/sarifs', + fields={'commit_sha': 'aa22233', + 'ref': 'refs/heads/', + 'sarif': 'gzip < sarif | base64 -w0', + 'tool_name' : 'codeql', + 'started_at': 'when the analysis started', + }, + headers = {'Accept': 'application/sarif+json'}) + #+END_SRC + + The scan results from =project.csv= are the root of the sarif tree, so this is a + required base table. + #+BEGIN_SRC sql ==> project-scan-result.csv <== $schema sarif_version @@ -200,52 +281,9 @@ revisionId #+END_SRC - The rest of the [Mar-23-2022] =projects.csv= table is ad-hoc and included in the - other tables below; it can be discarded. 
- #+BEGIN_SRC text - ==> project-meta.csv <== - creation_date - primary_language - project_name - query_commit_id - sarif_file_name - scan_id - scan_start_date - scan_stop_date - tool_name - tool_version - - #+END_SRC - - New tables intended for reporting infrastructure: - #+BEGIN_SRC text - ==> project.csv <== - id - project_name - creation_date - repository_url -- new - primary_language -- from github api - languages_analyzed - #+END_SRC - - #+BEGIN_SRC text - ==> scans.csv <== - id - commit_id - project_id - db_create_start - db_create_stop - scan_start_date - scan_stop_date - tool_name - tool_version - tool_query_commit_id - sarif_content - sarif_file_name - - #+END_SRC - - #+BEGIN_SRC text + Using joins of the =project-scan-result.csv= table and the + other [[*Currently Exported Tables][Currently Exported Tables]], the =results.csv= table can be formed: + #+BEGIN_SRC sql ==> results.csv <== id INT, -- primary key scan_id INT, -- scans.id @@ -271,9 +309,6 @@ -- source_object STRING, -- higher-level info: 'args', 'request', etc. sink_object string, -- higher level: 'execute', 'sql statement', etc. - #+END_SRC - - #+HTML: