From 505ee8ea66e0e47e9ab2fcc914fe7242ac476848 Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Mon, 8 Aug 2022 16:48:17 -0700 Subject: [PATCH] Export column types for scan-related pandas tables --- sarif_cli/scan_tables.py | 111 ++++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 53 deletions(-) diff --git a/sarif_cli/scan_tables.py b/sarif_cli/scan_tables.py index 34ca882..20f8869 100644 --- a/sarif_cli/scan_tables.py +++ b/sarif_cli/scan_tables.py @@ -9,6 +9,60 @@ from . import snowflake_id class ZeroResults(Exception): pass +# +# Column types for scan-related pandas tables +# +class ScanTablesTypes: + scans = { + "id" : pd.UInt64Dtype(), + "commit_id" : pd.StringDtype(), + "project_id" : pd.UInt64Dtype(), + "db_create_start" : numpy.datetime64(), + "db_create_stop" : numpy.datetime64(), + "scan_start_date" : numpy.datetime64(), + "scan_stop_date" : numpy.datetime64(), + "tool_name" : pd.StringDtype(), + "tool_version" : pd.StringDtype(), + "tool_query_commit_id" : pd.StringDtype(), + "sarif_file_name" : pd.StringDtype(), + "results_count" : pd.Int64Dtype(), + "rules_count" : pd.Int64Dtype(), + } + results = { + 'id' : pd.UInt64Dtype(), + 'scan_id' : pd.UInt64Dtype(), + 'query_id' : pd.StringDtype(), + + 'result_type' : pd.StringDtype(), + 'codeFlow_id' : pd.UInt64Dtype(), + + 'message' : pd.StringDtype(), + 'message_object' : numpy.dtype('O'), + 'location' : pd.StringDtype(), + + 'source_startLine' : pd.Int64Dtype(), + 'source_startCol' : pd.Int64Dtype(), + 'source_endLine' : pd.Int64Dtype(), + 'source_endCol' : pd.Int64Dtype(), + + 'sink_startLine' : pd.Int64Dtype(), + 'sink_startCol' : pd.Int64Dtype(), + 'sink_endLine' : pd.Int64Dtype(), + 'sink_endCol' : pd.Int64Dtype(), + + # TODO Find high-level info from query name or tags? + 'source_object' : numpy.dtype('O'), + 'sink_object' : numpy.dtype('O'), + } + projects = { + "id" : pd.UInt64Dtype(), + "project_name" : pd.StringDtype(), + "creation_date" : numpy.datetime64(), + "repo_url" : pd.StringDtype(), + "primary_language" : pd.StringDtype(), + "languages_analyzed" : pd.StringDtype(), + } + # # Projects table # @@ -36,18 +90,10 @@ def joins_for_projects(basetables, external_info, scantables): "repo_url" : repo_url, "primary_language" : b.project['semmle.sourceLanguage'][0], # TODO: external info "languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage'])) - },index=[0]) + }, index=[0]) # Force all column types to ensure appropriate formatting - res1 = res.astype({ - "id" : pd.UInt64Dtype(), - "project_name" : pd.StringDtype(), - "creation_date" : numpy.datetime64(), - "repo_url" : pd.StringDtype(), - "primary_language" : pd.StringDtype(), - "languages_analyzed" : pd.StringDtype(), - }).reset_index(drop=True) - + res1 = res.astype(ScanTablesTypes.projects).reset_index(drop=True) return res1 # @@ -82,22 +128,7 @@ def joins_for_scans(basetables, external_info, scantables): },index=[0]) # Force all column types to ensure correct writing and type checks on reading. - res1 = res.astype({ - "id" : pd.UInt64Dtype(), - "commit_id" : pd.StringDtype(), - "project_id" : pd.UInt64Dtype(), - "db_create_start" : numpy.datetime64(), - "db_create_stop" : numpy.datetime64(), - "scan_start_date" : numpy.datetime64(), - "scan_stop_date" : numpy.datetime64(), - "tool_name" : pd.StringDtype(), - "tool_version" : pd.StringDtype(), - "tool_query_commit_id" : pd.StringDtype(), - "sarif_file_name" : pd.StringDtype(), - "results_count" : pd.Int64Dtype(), - "rules_count" : pd.Int64Dtype(), - }).reset_index(drop=True) - + res1 = res.astype(ScanTablesTypes.scans).reset_index(drop=True) return res1 # @@ -129,33 +160,7 @@ def joins_for_results(basetables, external_info): res = tables[0] # Force all column types to ensure appropriate formatting - res1 = res.astype({ - 'id' : pd.UInt64Dtype(), - 'scan_id' : pd.UInt64Dtype(), - 'query_id' : pd.StringDtype(), - - 'result_type' : pd.StringDtype(), - 'codeFlow_id' : pd.UInt64Dtype(), - - 'message' : pd.StringDtype(), - 'message_object' : numpy.dtype('O'), - 'location' : pd.StringDtype(), - - 'source_startLine' : pd.Int64Dtype(), - 'source_startCol' : pd.Int64Dtype(), - 'source_endLine' : pd.Int64Dtype(), - 'source_endCol' : pd.Int64Dtype(), - - 'sink_startLine' : pd.Int64Dtype(), - 'sink_startCol' : pd.Int64Dtype(), - 'sink_endLine' : pd.Int64Dtype(), - 'sink_endCol' : pd.Int64Dtype(), - - # TODO Find high-level info from query name or tags? - 'source_object' : numpy.dtype('O'), - 'sink_object' : numpy.dtype('O'), - }).reset_index(drop=True) - + res1 = res.astype(ScanTablesTypes.results).reset_index(drop=True) return res1 def _results_from_kind_problem(basetables, external_info):