Add CLI support

Enabled by the -f flag with the CLI value.
Tested on sarif from CodeQL CLIs 2.6.3, 2.9.4, and 2.11.4.
Note: the sarif MUST contain the versionControlProvenance property.
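Example invocation (a sketch; file names are illustrative):
    sarif-extract-scans scan-spec.json output-dir status.csv -f CLI
The provenance requirement refers to the per-run sarif property
versionControlProvenance, which carries repositoryUri and revisionId.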
Kristen Newbury
2022-12-01 11:37:56 -05:00
parent 009cf12d2c
commit 04a5aae14d
11 changed files with 757 additions and 68 deletions

View File

@@ -81,7 +81,7 @@ bt = BaseTables()
 #
 # Add dataframes
 #
-sf_2683 = tj.joins_for_sf_2683(tgraph)
+sf_2683 = tj.joins_for_location_info(tgraph)
 af_0350_location = tj.joins_for_af_0350_location(tgraph)
 bt.artifacts = tj.joins_for_artifacts(tgraph)
 bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683)

View File

@@ -2,7 +2,7 @@
""" Extract scan data from multiple sarif files in table form. """ Extract scan data from multiple sarif files in table form.
""" """
from dataclasses import dataclass from dataclasses import dataclass
from sarif_cli import signature, signature_single from sarif_cli import signature, signature_single, signature_single_CLI
from sarif_cli import typegraph from sarif_cli import typegraph
from sarif_cli import snowflake_id from sarif_cli import snowflake_id
from sarif_cli import status_writer from sarif_cli import status_writer
@@ -14,6 +14,7 @@ import logging
 import pandas as pd
 import pathlib
 import sarif_cli.table_joins as tj
+import sarif_cli.table_joins_CLI as tj_CLI
 import sarif_cli.scan_tables as st
 import sys
@@ -32,8 +33,18 @@ parser.add_argument('outdir', metavar='output-dir', type=str, help='output direc
 parser.add_argument('csvout', metavar='csv-outfile', type=str, help='processing status csv output file name to use')
 parser.add_argument('-r', '--write-raw-tables', action="store_true",
                     help='Write the raw sarif tables to the output directory')
+parser.add_argument('-f', '--input-signature', metavar='input-signature', type=str, default="LGTM",
+                    help='Signature of the sarif; where it was generated may affect the signature. '
+                         'Options: LGTM, CLI. '
+                         'If the currently supported signatures are not sufficient, see signature_single.py '
+                         'for how to support further signatures. '
+                         'Default: "%(default)s"')
 args = parser.parse_args()
+if args.input_signature not in ["LGTM", "CLI"]:
+    print("Unsupported sarif signature requested.")
+    print("Use one of [LGTM, CLI].")
+    sys.exit(1)
 # Setup csv error writer
 status_writer.setup_csv_writer(args.csvout)
@@ -66,11 +77,20 @@ context = signature.Context(
 )
 sarif_struct = signature.fillsig(args, sarif_struct, context)
+#
+# Setup which signature to use
+if args.input_signature == "LGTM":
+    signature_to_use = signature_single.struct_graph_LGTM
+    start_node = signature_single.start_node_LGTM
+else:
+    signature_to_use = signature_single_CLI.struct_graph_CLI
+    start_node = signature_single_CLI.start_node_CLI
 #
 # Use reference type graph (signature) to traverse sarif and attach values to tables
 try:
-    tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
-    typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)
+    tgraph = typegraph.Typegraph(signature_to_use)
+    typegraph.destructure(tgraph, start_node, sarif_struct)
 except Exception:
     # will have gathered errors/warnings
     status_writer.csv_write_warnings()
@@ -126,31 +146,29 @@ external_info = ExternalInfo(
 #
 # Add dataframes for base tables
 #
-sf_2683 = tj.joins_for_sf_2683(tgraph)
-af_0350_location = tj.joins_for_af_0350_location(tgraph)
-bt.artifacts = tj.joins_for_artifacts(tgraph)
-bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683)
-bt.kind_pathproblem = tj.joins_for_path_problem(tgraph, af_0350_location)
-bt.kind_problem = tj.joins_for_problem(tgraph, af_0350_location)
-bt.project = tj.joins_for_project_single(tgraph)
-bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, sf_2683)
-bt.rules = tj.joins_for_rules(tgraph)
+# Pick the joins module (relies on some specifics of the signature type)
+if args.input_signature != "LGTM":
+    tj = tj_CLI
+try:
+    location_info = tj.joins_for_location_info(tgraph)
+    af_0350_location = tj.joins_for_af_0350_location(tgraph)
+    bt.artifacts = tj.joins_for_artifacts(tgraph)
+    bt.codeflows = tj.joins_for_codeflows(tgraph, location_info)
+    bt.kind_pathproblem = tj.joins_for_path_problem(tgraph, af_0350_location)
+    bt.kind_problem = tj.joins_for_problem(tgraph, af_0350_location)
+    bt.project = tj.joins_for_project_single(tgraph)
+    bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, location_info)
+    bt.rules = tj.joins_for_rules(tgraph)
+except Exception:
+    # possible warnings accumulated
+    status_writer.csv_write_warnings()
+    raise
 #
-# Form scan tables
+# Setup rest of basetables
 #
-# joins for projects has to happen first as it backfills the guess about the project_id
-scantabs.projects = st.joins_for_projects(bt, external_info, scantabs)
-scantabs.results = st.joins_for_results(bt, external_info)
-scantabs.scans = st.joins_for_scans(bt, external_info, scantabs)
-#
-# Replace the remaining internal ids with snowflake ids
-#
-flakegen = snowflake_id.Snowflake(0)
 bt.columns_to_reindex = {
     # template from {field.name : [''] for field in dc.fields(bt)}
     'artifacts': ['artifacts_id'],
@@ -167,6 +185,19 @@ scantabs.columns_to_reindex = {
     'results': ['codeFlow_id'],
 }
+#
+# Form scan tables
+#
+# joins for projects has to happen first as it backfills the guess about the project_id
+scantabs.projects = st.joins_for_projects(bt, external_info)
+scantabs.results = st.joins_for_results(bt, external_info)
+scantabs.scans = st.joins_for_scans(bt, external_info, scantabs, args.input_signature)
+#
+# Replace the remaining internal ids with snowflake ids
+#
+flakegen = snowflake_id.Snowflake(0)
 _id_to_flake = {}
 def _get_flake(id):
     flake = _id_to_flake.get(id, -1)

View File

@@ -87,7 +87,14 @@ from sarif_cli import hash
 parser = argparse.ArgumentParser(description='Run sarif-extract-scans over a directory hierarchy')
 parser.add_argument('sarif_files', metavar='sarif-files', type=str, help='File containing list of sarif files, use - for stdin')
-parser.add_argument('-o','--outdir', metavar='output-dir', type=str, default="", help='output directory')
+parser.add_argument('-f', '--input-signature', metavar='input-signature', type=str, default="LGTM",
+                    help='Signature of the sarif; where it was generated may affect the signature. '
+                         'Options: LGTM, CLI. '
+                         'If the currently supported signatures are not sufficient, see signature_single.py '
+                         'for how to support further signatures. '
+                         'Default: "%(default)s"')
+parser.add_argument('-o', '--outdir', metavar='output-dir', type=str, default="", help='Output directory')
 parser.add_argument('-m', '--max-files', metavar='number', type=int, default=100000,
                     help='Maximum number of files to process.'
@@ -126,6 +133,11 @@ if outer_dir != "":
     except FileExistsError:
         pass
+if args.input_signature not in ["LGTM", "CLI"]:
+    print("Unsupported sarif signature requested.")
+    print("Use one of [LGTM, CLI].")
+    sys.exit(1)
 #
 # Collect sarif file information
 #
@@ -205,7 +217,7 @@ for path in paths:
     scan_log_file = os.path.join(outer_dir + project, component + ".scanlog")
     csv_outfile = os.path.join(outer_dir + project, component)
-    runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, csv_outfile],
+    runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, csv_outfile,
+                               "-f", args.input_signature],
                               capture_output=True, text=True)
     if runstats.returncode == 0:
         print("{:6} {}".format("OK", path))

View File

@@ -59,8 +59,8 @@ sarif_struct = signature.fillsig(args, sarif_struct, context)
 #
 # Use reference type graph (signature) to traverse sarif and attach values to tables
 #
-tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
-typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)
+tgraph = typegraph.Typegraph(signature_single.struct_graph_LGTM)
+typegraph.destructure(tgraph, signature_single.start_node_LGTM, sarif_struct)
 #
 # Form output tables
@@ -84,7 +84,7 @@ bt = BaseTables()
 #
 # Add dataframes
 #
-sf_2683 = tj.joins_for_sf_2683(tgraph)
+sf_2683 = tj.joins_for_location_info(tgraph)
 af_0350_location = tj.joins_for_af_0350_location(tgraph)
 bt.artifacts = tj.joins_for_artifacts(tgraph)
 bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683)

View File

@@ -73,18 +73,21 @@ class ScanTablesTypes:
 #
 # Projects table
 #
-def joins_for_projects(basetables, external_info, scantables):
+def joins_for_projects(basetables, external_info):
     """
     Form the 'projects' table for the ScanTables dataclass
     """
     b = basetables; e = external_info
+    # If the sarif does not have versionControlProvenance, semmle.sourceLanguage, etc.,
+    # there is no reliable way to know the project name,
+    # and we still need to use a guess about the project id.
+    if "repositoryUri" in b.project:
+        repo_url = b.project.repositoryUri[0]
         # For a repository url of the form
         #     (git|https)://*/org/project.*
         # use the org/project part as the project_name.
         #
-    # TODO knewbury error handling for if the signature is slotted out?
-    repo_url = b.project.repositoryUri[0]
         url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url)
         if url_parts:
             project_name = f"{url_parts.group(2)}-{url_parts.group(3)}"
@@ -95,14 +98,24 @@ def joins_for_projects(basetables, external_info, scantables):
             e.project_id = hash.hash_unique(project_name.encode())
         else:
             project_name = pd.NA
+    else:
+        repo_url = "unknown"
+        project_name = pd.NA
+    if 'semmle.sourceLanguage' in b.project:
+        srcLang = b.project['semmle.sourceLanguage'][0]
+        allLang = ",".join(list(b.project['semmle.sourceLanguage']))
+    else:
+        srcLang = "unknown"
+        allLang = "unknown"
     res = pd.DataFrame(data={
         "id" : e.project_id,
         "project_name" : project_name,
         "creation_date" : pd.Timestamp(0.0, unit='s'), # TODO: external info
         "repo_url" : repo_url,
-        "primary_language" : b.project['semmle.sourceLanguage'][0], # TODO: external info
-        "languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage']))
+        "primary_language" : srcLang, # TODO: external info if CLI sarif
+        "languages_analyzed" : allLang # TODO: external info if CLI sarif
     }, index=[0])
     # Force all column types to ensure appropriate formatting
# Force all column types to ensure appropriate formatting # Force all column types to ensure appropriate formatting
@@ -112,7 +125,7 @@ def joins_for_projects(basetables, external_info, scantables):
 #
 # Scans table
 #
-def joins_for_scans(basetables, external_info, scantables):
+def joins_for_scans(basetables, external_info, scantables, sarif_type):
     """
     Form the `scans` table for the ScanTables dataclass
     """
@@ -122,9 +135,14 @@ def joins_for_scans(basetables, external_info, scantables):
     driver_version = b.project.driver_version.unique()
     assert len(driver_version) == 1, \
         "More than one driver version found for single sarif file."
+    # TODO: if a commit id exists in external info for CLI-generated sarif, add it?
+    if sarif_type == "LGTM":
+        commit_id = b.project.revisionId[0]
+    else:
+        commit_id = "unknown"
     res = pd.DataFrame(data={
         "id" : e.scan_id,
-        "commit_id" : b.project.revisionId[0],
+        "commit_id" : commit_id,
         "project_id" : e.project_id,
         # TODO extract real date information from somewhere external
         "db_create_start" : pd.Timestamp(0.0, unit='s'),
@@ -207,7 +225,6 @@ def _results_from_kind_problem(basetables, external_info):
     'query_precision' : [_populate_from_rule_table("precision", b, i) for i in range(len(b.kind_problem))],
     'query_severity' : [_populate_from_rule_table("problem.severity", b, i) for i in range(len(b.kind_problem))],
     'query_tags' : [_populate_from_rule_table_tag_text(b, i) for i in range(len(b.kind_problem))],
-
     'codeFlow_id' : 0, # link to codeflows (kind_pathproblem only, NULL here)
     'message': b.kind_problem.message_text,
@@ -249,6 +266,7 @@ def _results_from_kind_pathproblem(basetables, external_info):
 # The `result` table has no entry to distinguish these, so we use a simplified
 # version of `kind_pathproblem`.
+
 reduced_kind_pathp = b.kind_pathproblem.drop(
     columns=[
         'relatedLocations_array_index',
@@ -295,7 +313,6 @@ def _results_from_kind_pathproblem(basetables, external_info):
     'query_precision' : _populate_from_rule_table_code_flow("precision", b, cfid0ppt0),
     'query_severity' : _populate_from_rule_table_code_flow("problem.severity", b, cfid0ppt0),
     'query_tags' : _populate_from_rule_table_code_flow_tag_text(b, cfid0ppt0),
-
     'codeFlow_id' : cfid0,
     #
     'message': cfid0ppt0.message_text.values[0],

View File

@@ -225,7 +225,7 @@ dummy_newlineSequences = ['\r\n', '\n', '\u2028', '\u2029']
 dummy_relatedLocations_entry = [
     {'id': -1,
      'physicalLocation': {'artifactLocation': {'uri': 'scli-dyys dummy value',
-                                               'uriBaseId': 'scli-dyys dummy value',
+                                               'uriBaseId': 'scli-dyys uriBaseId',
                                                'index': -1},
                           'region': {'startLine': -1,
                                      'startColumn': -1,

View File

@@ -12,9 +12,9 @@ is marked below
 #
 # The starting node is the leftmost node in ../notes/typegraph.pdf
 #
-start_node_2022_02_01 = 'Struct6787'
-struct_graph_2022_02_01 = (
+start_node_LGTM = 'Struct6787'
+struct_graph_LGTM = (
     [ ('String', 'string'),
       ('Int', 'int'),
       ('Bool', 'bool'),
@@ -122,4 +122,3 @@ struct_graph_2022_02_01 = (
       ('runs', 'Array0177'),
       ('version', 'String')))]
     )
-

View File

@@ -0,0 +1,161 @@
""" The signature for a single sarif file
Produced by
sarif-to-dot -u -t -f 2021-12-09/results.sarif
with some arrays manually sorted so the the signature with more fields comes first. The case
('Array6343', ('array', (1, 'Struct9699'), (0, 'Struct4055'))), # MANUALLY SORTED
is marked below
"""
#
# The starting node is the leftmost node in ../notes/typegraph.pdf
#
start_node_CLI = 'Struct5521'
# Generated with CodeQL CLI 2.9.4
struct_graph_CLI = (
[ ('String', 'string'),
('Int', 'int'),
('Bool', 'bool'),
( 'Struct2685',
( 'struct',
('index', 'Int'),
('uri', 'String'),
('uriBaseId', 'String'))),
('Struct5277', ('struct', ('location', 'Struct2685'))),
('Struct3497', ('struct', ('index', 'Int'), ('uri', 'String'))),
('Struct9567', ('struct', ('location', 'Struct3497'))),
('Array6920', ('array', (0, 'Struct5277'), (1, 'Struct9567'))),
('Struct1509', ('struct', ('semmle.formatSpecifier', 'String'))),
('Struct2774', ('struct', ('text', 'String'))),
( 'Struct6299',
( 'struct',
('endColumn', 'Int'),
('endLine', 'Int'),
('startColumn', 'Int'),
('startLine', 'Int'))),
( 'Struct4963',
( 'struct',
('artifactLocation', 'Struct2685'),
('region', 'Struct6299'))),
( 'Struct2683',
( 'struct',
('id', 'Int'),
('message', 'Struct2774'),
('physicalLocation', 'Struct4963'))),
('Array0350', ('array', (0, 'Struct2683'))),
( 'Struct4199',
( 'struct',
('primaryLocationLineHash', 'String'),
('primaryLocationStartColumnFingerprint', 'String'))),
('Struct3942', ('struct', ('id', 'String'), ('index', 'Int'))),
( 'Struct4055',
( 'struct',
('locations', 'Array0350'),
('message', 'Struct2774'),
('partialFingerprints', 'Struct4199'),
('relatedLocations', 'Array0350'),
('rule', 'Struct3942'),
('ruleId', 'String'),
('ruleIndex', 'Int'))),
( 'Struct7125',
( 'struct',
('artifactLocation', 'Struct3497'),
('region', 'Struct6299'))),
( 'Struct6772',
( 'struct',
('id', 'Int'),
('message', 'Struct2774'),
('physicalLocation', 'Struct7125'))),
('Array8753', ('array', (0, 'Struct6772'))),
( 'Struct0102',
( 'struct',
('locations', 'Array0350'),
('message', 'Struct2774'),
('partialFingerprints', 'Struct4199'),
('relatedLocations', 'Array8753'),
('rule', 'Struct3942'),
('ruleId', 'String'),
('ruleIndex', 'Int'))),
('Struct0987', ('struct', ('location', 'Struct2683'))),
('Array1075', ('array', (0, 'Struct0987'))),
('Struct4194', ('struct', ('locations', 'Array1075'))),
('Array1597', ('array', (0, 'Struct4194'))),
('Struct7122', ('struct', ('threadFlows', 'Array1597'))),
('Array9799', ('array', (0, 'Struct7122'))),
( 'Struct9699',
( 'struct',
('codeFlows', 'Array9799'),
('locations', 'Array0350'),
('message', 'Struct2774'),
('partialFingerprints', 'Struct4199'),
('relatedLocations', 'Array0350'),
('rule', 'Struct3942'),
('ruleId', 'String'),
('ruleIndex', 'Int'))),
( 'Array1768',
  # Full form: ('array', (0, 'Struct0102'), (1, 'Struct4055'), (2, 'Struct9699'))
  # Omitting (0, 'Struct0102') means we will never find column info.
  ('array', (2, 'Struct9699'), (1, 'Struct4055'))), # MANUALLY SORTED
('Struct8581', ('struct', ('enabled', 'Bool'), ('level', 'String'))),
('Array7069', ('array', (0, 'String'))),
( 'Struct6853',
( 'struct',
('description', 'String'),
('id', 'String'),
('kind', 'String'),
('name', 'String'),
('precision', 'String'),
('problem.severity', 'String'),
('security-severity', 'String'),
('severity', 'String'),
('sub-severity', 'String'),
('tags', 'Array7069'))),
( 'Struct7100',
( 'struct',
('defaultConfiguration', 'Struct8581'),
('fullDescription', 'Struct2774'),
('id', 'String'),
('name', 'String'),
('properties', 'Struct6853'),
('shortDescription', 'Struct2774'))),
('Array0147', ('array', (0, 'Struct7100'))),
( 'Struct7828',
( 'struct',
('name', 'String'),
('organization', 'String'),
('rules', 'Array0147'),
('semanticVersion', 'String'))),
( 'Struct9027',
('struct', ('description', 'Struct2774'), ('uri', 'String'))),
('Array4813', ('array', (0, 'Struct9027'))),
( 'Struct6152',
( 'struct',
('locations', 'Array4813'),
('name', 'String'),
('semanticVersion', 'String'))),
('Struct7826', ('struct', ('locations', 'Array4813'), ('name', 'String'))),
('Array9357', ('array', (0, 'Struct6152'), (1, 'Struct7826'))),
( 'Struct0032',
('struct', ('driver', 'Struct7828'), ('extensions', 'Array9357'))),
( 'Struct3081',
('struct', ('repositoryUri', 'String'), ('revisionId', 'String'))),
('Array5511', ('array', (0, 'Struct3081'))),
( 'Struct9786',
( 'struct',
('artifacts', 'Array6920'),
('columnKind', 'String'),
('newlineSequences', 'Array7069'),
('properties', 'Struct1509'),
('results', 'Array1768'),
('tool', 'Struct0032'),
('versionControlProvenance', 'Array5511'))),
('Array1273', ('array', (0, 'Struct9786'))),
( 'Struct5521',
( 'struct',
('$schema', 'String'),
('runs', 'Array1273'),
('version', 'String')))]
)

View File

@@ -73,13 +73,12 @@ def joins_for_af_0350_location(tgraph):
     )
     return af_0350_location
-def joins_for_sf_2683(tgraph):
+def joins_for_location_info(tgraph):
     """
     Join all the tables used by 2683's right side into one.
     """
     # Access convenience functions
     sf = lambda num: tgraph.dataframes['Struct' + str(num)]
-    af = lambda num: tgraph.dataframes['Array' + str(num)]
     #
     sf_2683 = (
         #

View File

@@ -0,0 +1,462 @@
""" Collection of joins for the base tables provided by typegraph.attach_tables()
The `problem` and `path-problem` entries provide that information; the
`relatedLocations` table provides the details when multiple results are
present for either. `project` is the high-level overview; `artifacts`
provides those for the other tables.
"""
import pandas as pd
import re
from .typegraph import tagged_array_columns, tagged_struct_columns
class BaseTablesTypes:
codeflows = {
"codeflow_id" : pd.UInt64Dtype(),
"codeflow_index" : pd.Int64Dtype(),
"threadflow_index" : pd.Int64Dtype(),
"location_index" : pd.Int64Dtype(),
"endColumn" : pd.Int64Dtype(),
"endLine" : pd.Int64Dtype(),
"startColumn" : pd.Int64Dtype(),
"startLine" : pd.Int64Dtype(),
"artifact_index" : pd.Int64Dtype(),
"uri" : pd.StringDtype(),
"uriBaseId" : pd.StringDtype(),
"message" : pd.StringDtype(),
}
def joins_for_af_0350_location(tgraph):
"""
Join all the tables used by 0350's right side into one.
"""
# Access convenience functions
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
af = lambda num: tgraph.dataframes['Array' + str(num)]
sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id))
aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id))
af_0350_location = (
aft('0350')
#
.merge(sft(2683), how="left", left_on='t0350_id_or_value_at_index', right_on='t2683_struct_id',
validate="1:m")
.drop(columns=['t0350_id_or_value_at_index', 't2683_struct_id', 't0350_type_at_index'])
#
.merge(sft(4963), how="left", left_on='t2683_physicalLocation', right_on='t4963_struct_id',
validate="1:m")
.drop(columns=['t2683_physicalLocation', 't4963_struct_id'])
#
.merge(sft(6299), how="left", left_on='t4963_region', right_on='t6299_struct_id',
validate="1:m")
.drop(columns=['t4963_region', 't6299_struct_id'])
#
.merge(sft(2685), how="left", left_on='t4963_artifactLocation', right_on='t2685_struct_id',
validate="1:m")
.drop(columns=['t4963_artifactLocation', 't2685_struct_id'])
#
.merge(sft(2774), how="left", left_on='t2683_message', right_on='t2774_struct_id',
validate="1:m")
.drop(columns=['t2683_message', 't2774_struct_id'])
#
.rename(columns={'t0350_array_id' : 'm0350_location_array_id',
't0350_value_index' : 'm0350_location_array_index',
't2683_id' : 'm0350_location_id',
't6299_endColumn' : 'm0350_location_endColumn',
't6299_endLine' : 'm0350_location_endLine',
't6299_startColumn' : 'm0350_location_startColumn',
't6299_startLine' : 'm0350_location_startLine',
't2685_index' : 'm0350_location_index',
't2685_uri' : 'm0350_location_uri',
't2685_uriBaseId' : 'm0350_location_uriBaseId',
't2774_text' : 'm0350_location_message',
})
)
return af_0350_location
def joins_for_location_info(tgraph):
"""
Join all the tables used by 2683's right side into one.
"""
# Access convenience functions
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
#
sf_2683 = (
#
sf(2683)
.rename(columns={"struct_id": "struct_id_2683", "id": "id_2683"})
#
.merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'physicalLocation'])
#
.merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'region'])
#
.merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'artifactLocation'])
.rename(columns={"index": "location_index_2685"})
#
.merge(sf(2774), how="left", left_on='message', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'message'])
.rename(columns={"text": "message_text_2683"})
#
)
return sf_2683
def joins_for_problem(tgraph, af_0350_location):
"""
Return table providing the `problem` information.
"""
# Access convenience functions
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
af = lambda num: tgraph.dataframes['Array' + str(num)]
sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id))
aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id))
#
# Form the message dataframe (@kind problem) via joins
#
kind_problem_1 = (
aft(1768)
.merge(sft(4055), how="inner",
left_on='t1768_id_or_value_at_index', right_on='t4055_struct_id',
validate="1:m")
.drop(columns=['t1768_type_at_index', 't1768_id_or_value_at_index',
't4055_struct_id'])
#
.merge(af_0350_location, how="left", left_on='t4055_locations',
right_on='m0350_location_array_id', validate="1:m")
.drop(columns=['t4055_locations', 'm0350_location_array_id'])
#
.merge(af_0350_location.rename(columns=lambda x: re.sub('m0350_location',
'm0350_relatedLocation',
x)),
how="left", left_on='t4055_relatedLocations',
right_on='m0350_relatedLocation_array_id', validate="1:m")
.drop(columns=['t4055_relatedLocations', 'm0350_relatedLocation_array_id'])
#
.merge(sft(2774), how="left", left_on='t4055_message', right_on='t2774_struct_id')
.drop(columns=['t4055_message', 't2774_struct_id'])
.rename(columns={"t2774_text": "t4055_message_text"})
#
.merge(sft(4199), how="left", left_on='t4055_partialFingerprints',
right_on='t4199_struct_id')
.drop(columns=['t4055_partialFingerprints', 't4199_struct_id'])
#
.merge(sft(3942), how="left", left_on='t4055_rule',
right_on='t3942_struct_id')
.drop(columns=['t4055_rule', 't3942_struct_id'])
)
kind_problem_2 = (
kind_problem_1
.rename({
't1768_array_id' : 'results_array_id',
't1768_value_index' : 'results_array_index',
't4055_ruleId' : 'ruleId',
't4055_ruleIndex' : 'ruleIndex',
't4055_message_text' : 'message_text',
't3942_id' : 'rule_id',
't3942_index' : 'rule_index',
}, axis='columns')
# Strip type prefix for the rest
.rename(columns = lambda x: re.sub('m0350_|t4199_', '', x))
)
return kind_problem_2
def joins_for_codeflows(tgraph, sf_2683):
"""
Return the table providing the `codeFlows` for a `path-problem` table.
"""
# Access convenience functions
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
af = lambda num: tgraph.dataframes['Array' + str(num)]
#
codeflows = (
af(9799).rename(columns={"array_id": "t9799_array_id", "value_index": "t9799_idx"})
#
.merge(sf(7122), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
.drop(columns=['id_or_value_at_index', 'struct_id', 'type_at_index'])
#
.merge(af(1597).rename(columns={"array_id": "t1597_array_id", "value_index": "t1597_idx"}),
how="left", left_on='threadFlows', right_on='t1597_array_id', validate="1:m")
.drop(columns=['threadFlows', 't1597_array_id', 'type_at_index'])
#
.merge(sf(4194), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
.drop(columns=['id_or_value_at_index', 'struct_id'])
#
.merge(af(1075).rename(columns={"array_id": "t1075_array_id", "value_index": "t1075_idx"}),
how="left", left_on='locations', right_on='t1075_array_id', validate="1:m")
.drop(columns=['locations', 't1075_array_id', 'type_at_index'])
.rename(columns={"t1075_idx": "t1075_locations_idx"})
#
.merge(sf('0987'), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
.drop(columns=['id_or_value_at_index', 'struct_id'])
#
.merge(sf_2683, how="left", left_on='location', right_on='struct_id_2683', validate="1:m")
.drop(columns=['location', 'struct_id_2683'])
)
codeflows_1 = (
codeflows
.drop(columns=['id_2683'])
.rename({
't9799_array_id': 'codeflow_id',
't9799_idx': 'codeflow_index',
't1597_idx': 'threadflow_index',
't1075_locations_idx': 'location_index',
'location_index_2685': 'artifact_index',
'message_text_2683': 'message',
}, axis='columns')
)
codeflows_2 = codeflows_1.astype(BaseTablesTypes.codeflows).reset_index(drop=True)
return codeflows_2
def joins_for_path_problem(tgraph, af_0350_location):
"""
Return table providing the `path-problem` information.
"""
# Access convenience functions
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
af = lambda num: tgraph.dataframes['Array' + str(num)]
sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id))
aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id))
kind_pathproblem_1 = (
aft(1768)
.merge(sft(9699), how="inner", left_on='t1768_id_or_value_at_index', right_on='t9699_struct_id',
validate="1:m")
.drop(columns=['t1768_id_or_value_at_index', 't9699_struct_id', 't1768_type_at_index'])
#
.merge(af_0350_location, how="left", left_on='t9699_locations',
right_on='m0350_location_array_id', validate="1:m")
.drop(columns=['t9699_locations', 'm0350_location_array_id'])
#
.merge(af_0350_location.rename(columns=lambda x: re.sub('m0350_location',
'm0350_relatedLocation',
x)),
how="left", left_on='t9699_relatedLocations',
right_on='m0350_relatedLocation_array_id', validate="1:m")
.drop(columns=['t9699_relatedLocations', 'm0350_relatedLocation_array_id'])
#
.merge(sft(2774), how="left", left_on='t9699_message', right_on='t2774_struct_id')
.drop(columns=['t9699_message', 't2774_struct_id'])
.rename(columns={"t2774_text": "t9699_message_text"})
#
.merge(sft(4199), how="left", left_on='t9699_partialFingerprints',
right_on='t4199_struct_id')
.drop(columns=['t9699_partialFingerprints', 't4199_struct_id'])
#
.merge(sft(3942), how="left", left_on='t9699_rule',
right_on='t3942_struct_id')
.drop(columns=['t9699_rule', 't3942_struct_id'])
)
strip_columns = lambda x: re.sub('t9699_|m0350_|t4199_', '', x)
kind_pathproblem_2 = (kind_pathproblem_1
.rename({
't1768_array_id' : 'results_array_id',
't1768_value_index' : 'results_array_index',
't9699_codeFlows' : 'codeFlows_id',
't9699_ruleId' : 'ruleId',
't9699_ruleIndex' : 'ruleIndex',
't9699_message_text' : 'message_text',
't3942_id' : 'rule_id',
't3942_index' : 'rule_index',
}, axis='columns')
# Strip type prefix for the rest
.rename(columns = strip_columns))
return kind_pathproblem_2
def joins_for_relatedLocations(tgraph, sf_2683):
"""
Return table providing the `relatedLocations` and `locations` information.
"""
# Access convenience functions
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
af = lambda num: tgraph.dataframes['Array' + str(num)]
#
# Form the relatedLocation dataframe via joins, starting from the union of
# relatedLocations from `kind problem` (sf(4055)) and `kind path-problem`
# (sf(9699)).
#
related_locations_1 = (
pd.concat([sf(4055)[['relatedLocations', 'struct_id']], sf(9699)[['relatedLocations', 'struct_id']]])
.merge(af('0350'), how="left", left_on='relatedLocations', right_on='array_id', validate="1:m")
.drop(columns=['relatedLocations', 'array_id', 'value_index', 'type_at_index'])
#
.merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id',
suffixes=("_4055_9699", "_2683"), validate="1:m")
.drop(columns=['struct_id_2683', 'id_or_value_at_index'])
#
.merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'physicalLocation'])
#
.merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'region'])
#
.merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'artifactLocation'])
#
.merge(sf(2774), how="left", left_on='message', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'message'])
)
# Keep columns of interest
related_locations_2 = (related_locations_1[['struct_id_4055_9699', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text']]
.rename({'text': 'message', 'struct_id_4055_9699': 'struct_id'}, axis='columns'))
# Remove dummy locations previously injected by signature.fillsig
related_locations_3 = related_locations_2[related_locations_2.uri != 'scli-dyys dummy value']
return related_locations_3
def joins_for_project_single(tgraph):
"""
Return table providing the `project` information for sarif-extract-scans
"""
# Access convenience functions
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
af = lambda num: tgraph.dataframes['Array' + str(num)]
#
project_df_temp1 = (
sf(5521)
.rename(columns={"version": "version_5521", "struct_id": "struct_id_5521"})
#
.merge(af('1273'), how="left", left_on='runs', right_on='array_id',
validate="1:m")
.drop(columns=['runs', 'array_id', 'type_at_index'])
.rename(columns={"value_index": "value_index_1273"})
#
.merge(sf(9786), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
.drop(columns=['id_or_value_at_index', 'struct_id']))
#
# newlineSequences may or may not be present; drop it if it is
if 'newlineSequences' in project_df_temp1:
    project_df_temp1 = project_df_temp1.drop(columns=['newlineSequences'])
project_df_temp2 = (
    project_df_temp1
#
.merge(sf(1509), how="left", left_on='properties', right_on='struct_id', validate="1:m")
.drop(columns=['properties', 'struct_id'])
#
# tool - driver - rules - defaultConfiguration - ( properties - tags )
#
.merge(sf('0032'), how="left", left_on='tool', right_on='struct_id', validate="1:m")
.drop(columns=['tool', 'struct_id'])
#
.merge(sf(7828), how="left", left_on='driver', right_on='struct_id', validate="1:m")
.drop(columns=['driver', 'struct_id'])
.rename(columns={"semanticVersion": "driver_version_7828", "name": "driver_name_7828"})
#
# versionControlProvenance is assumed to be present
.merge(af(5511), how="left", left_on='versionControlProvenance', right_on='array_id')
.drop(columns=['versionControlProvenance', 'array_id', 'type_at_index'])
.rename(columns={"value_index": "versionControl_value_index_5511"})
#
.merge(sf(3081), how="left", left_on='id_or_value_at_index', right_on='struct_id')
.drop(columns=['id_or_value_at_index', 'struct_id'])
)
#
# Keep columns of interest
project_df_1 = (
project_df_temp2
.drop(columns=['struct_id_5521', 'versionControl_value_index_5511'])
.rename({
'version_5521': 'sarif_version',
'value_index_1273': 'run_index',
'driver_name_7828': 'driver_name',
'driver_version_7828': 'driver_version',
}, axis='columns')
)
return project_df_1
def joins_for_rules(tgraph):
"""
Return table providing the `rules` information.
"""
# Access convenience functions
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id))
af = lambda num: tgraph.dataframes['Array' + str(num)]
aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id))
#
rules_df = (
aft('0147')
#
.drop(columns=['t0147_type_at_index'])
#
.merge(sft(7100), how="left", left_on='t0147_id_or_value_at_index',
right_on='t7100_struct_id',
validate="1:m")
.drop(columns=['t0147_id_or_value_at_index', 't7100_struct_id'])
#
.merge(sft(8581), how="left", left_on='t7100_defaultConfiguration',
right_on='t8581_struct_id', validate="1:m")
.drop(columns=['t7100_defaultConfiguration', 't8581_struct_id'])
#
.merge(sft(2774), how="left", left_on='t7100_fullDescription',
right_on='t2774_struct_id', validate="1:m")
.drop(columns=['t7100_fullDescription', 't2774_struct_id'])
.rename(columns={'t2774_text': "t7100_t2774_fullDescription"})
#
.merge(sft(2774), how="left", left_on='t7100_shortDescription',
right_on='t2774_struct_id', validate="1:m")
.drop(columns=['t7100_shortDescription', 't2774_struct_id'])
.rename(columns={"t2774_text": 't7100_t2774_shortDescription'})
#
.merge(sft(6853), how="left", left_on='t7100_properties',
right_on='t6853_struct_id', validate="1:m")
.drop(columns=['t7100_properties', 't6853_struct_id', 't6853_id'])
#
.merge(aft(7069), how="left", left_on='t6853_tags',
right_on='t7069_array_id', validate="1:m")
.drop(columns=['t6853_tags', 't7069_array_id', 't7069_type_at_index'])
)
rules_2 = (
rules_df
.rename({
't0147_array_id' : 'rules_array_id',
't0147_value_index' : 'rules_array_index',
't7069_value_index' : 'tag_index',
't7069_id_or_value_at_index' : 'tag_text',
}, axis='columns')
# Strip type prefix for the rest
.rename(columns = lambda x: re.sub('t7100_t2774_|t7100_|t8581_|t6853_', '', x))
)
return rules_2
def joins_for_artifacts(tgraph):
"""
Return table providing the `artifacts` information.
"""
# Access convenience functions
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
af = lambda num: tgraph.dataframes['Array' + str(num)]
#
artifacts_df = (
af(6920)
#
.merge(sf(5277), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
.drop(columns=['id_or_value_at_index', 'struct_id', 'type_at_index'])
.rename(columns={"value_index": "artifact_index_6920"})
#
.merge(sf(2685), how="left", left_on='location', right_on='struct_id', validate="1:m")
.drop(columns=['location', 'struct_id'])
)
# Keep columns of interest and rename
df_1 = (
artifacts_df
.rename({
'array_id': 'artifacts_id',
'artifact_index_6920': 'artifacts_array_index',
}, axis='columns')
)
if (df_1['artifacts_array_index'] == df_1['index']).all():
df_1 = df_1.drop(columns=['artifacts_array_index'])
return df_1

View File

@@ -179,6 +179,14 @@ def _destructure_dict(typegraph: Typegraph, node, tree):
         if specific_missing not in status_writer.input_sarif_missing["extra_info"]:
             status_writer.input_sarif_missing["extra_info"] += specific_missing
         status_writer.warning_set["input_sarif_missing"] += 1
+        # Special case: avoid raising here, since an exception triggers a retry with
+        # other signatures - mainly needed for Struct9699 or Struct4055. A missing
+        # uriBaseId can be patched with a default instead.
+        difference = set(type_fields) - set(tree_fields)
+        if "uriBaseId" in difference:
+            tree["uriBaseId"] = "default"
+            _destructure_dict_1(typegraph, node, tree)
+        else:
             raise MissingFieldException(
                 f"(Sub)tree is missing fields required by typedef.\n"
                 f"Expected {type_fields}, found {tree_fields}.\n"