Add CLI support

Enabled by the -f flag with the CLI value.
Tested on SARIF from CodeQL CLIs 2.6.3, 2.9.4, and 2.11.4.
The SARIF MUST contain the versionControlProvenance property, however.
Kristen Newbury
2022-12-01 11:37:56 -05:00
parent 2bda917a4e
commit 2ba9593d70
11 changed files with 765 additions and 68 deletions
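Below, a minimal argparse sketch of how such a flag could be wired; the option name --sarif-type and its wiring are assumptions, since the commit only says -f selects the sarif flavor (LGTM vs. CLI) and threads a sarif_type value into joins_for_scans.

import argparse

# Hypothetical wiring for the new flag (names assumed, not from the commit).
parser = argparse.ArgumentParser()
parser.add_argument("-f", "--sarif-type", default="LGTM",
                    help="origin of the input sarif: LGTM or CLI")
args = parser.parse_args(["-f", "CLI"])
assert args.sarif_type == "CLI"  # value then passed through to joins_for_scans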

View File

@@ -73,36 +73,49 @@ class ScanTablesTypes:
#
# Projects table
#
def joins_for_projects(basetables, external_info, scantables):
def joins_for_projects(basetables, external_info):
"""
Form the 'projects' table for the ScanTables dataclass
"""
b = basetables; e = external_info
# For a repository url of the form
# (git|https)://*/org/project.*
# use the org/project part as the project_name.
#
# TODO knewbury error handling for if the signature is slotted out?
repo_url = b.project.repositoryUri[0]
url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url)
if url_parts:
project_name = f"{url_parts.group(2)}-{url_parts.group(3)}"
project, component = e.sarif_file_name.rstrip().split('/')
# if the runner's guess from the filename was bad, replace with real info
# and continue to use that scanspec to pass that around
if project_name != project+"-"+component:
e.project_id = hash.hash_unique(project_name.encode())
# if the sarif does not have versionControlProvenance, semmle.sourceLanguage, etc.,
# there is no reliable way to know the project name,
# and we will still need to use a guess for the project id
if "repositoryUri" in b.project:
repo_url = b.project.repositoryUri[0]
# For a repository url of the form
# (git|https)://*/org/project.*
# use the org/project part as the project_name.
#
url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url)
if url_parts:
project_name = f"{url_parts.group(2)}-{url_parts.group(3)}"
project, component = e.sarif_file_name.rstrip().split('/')
# if the runner's guess from the filename was bad, replace with real info
# and continue to use that scanspec to pass that around
if project_name != project+"-"+component:
e.project_id = hash.hash_unique(project_name.encode())
else:
project_name = pd.NA
else:
repo_url = "unknown"
project_name = pd.NA
if 'semmle.sourceLanguage' in b.project:
srcLang = b.project['semmle.sourceLanguage'][0]
allLang = ",".join(list(b.project['semmle.sourceLanguage']))
else:
srcLang = "unknown"
allLang = "unknown"
res = pd.DataFrame(data={
"id" : e.project_id,
"project_name" : project_name,
"creation_date" : pd.Timestamp(0.0, unit='s'), # TODO: external info
"repo_url" : repo_url,
"primary_language" : b.project['semmle.sourceLanguage'][0], # TODO: external info
"languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage']))
"primary_language" : srcLang, # TODO: external info if CLI sarif
"languages_analyzed" : allLang # TODO: external info if CLI sarif
}, index=[0])
# Force all column types to ensure appropriate formatting
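As a side note, a small sketch (sample URL invented) of how the regex above derives project_name from a repository url:

import re

repo_url = "https://github.com/someorg/someproject.git"  # made-up example
url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url)
if url_parts:
    # org/project becomes the project name
    print(f"{url_parts.group(2)}-{url_parts.group(3)}")  # someorg-someproject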
@@ -112,7 +125,7 @@ def joins_for_projects(basetables, external_info, scantables):
#
# Scans table
#
def joins_for_scans(basetables, external_info, scantables):
def joins_for_scans(basetables, external_info, scantables, sarif_type):
"""
Form the `scans` table for the ScanTables dataclass
"""
@@ -122,9 +135,14 @@ def joins_for_scans(basetables, external_info, scantables):
driver_version = b.project.driver_version.unique()
assert len(driver_version) == 1, \
"More than one driver version found for single sarif file."
# TODO if commit id exists in external info for CLI gen'd sarif, add?
if sarif_type == "LGTM":
commit_id = b.project.revisionId[0]
else:
commit_id = "unknown"
res = pd.DataFrame(data={
"id" : e.scan_id,
"commit_id" : b.project.revisionId[0],
"commit_id" : commit_id,
"project_id" : e.project_id,
# TODO extract real date information from somewhere external
"db_create_start" : pd.Timestamp(0.0, unit='s'),
@@ -159,7 +177,7 @@ def joins_for_results(basetables, external_info):
tables = [_results_from_kind_problem(basetables, external_info),
_results_from_kind_pathproblem(basetables, external_info)]
stack = [table for table in tables if len(table) > 0]
# Concatenation fails without at least one table, so avoid that.
if len(stack) > 0:
res = pd.concat(stack)
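The guard exists because pandas refuses to concatenate nothing; a one-line demonstration:

import pandas as pd

try:
    pd.concat([])  # no tables at all
except ValueError as e:
    print(e)  # "No objects to concatenate"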
@@ -195,7 +213,7 @@ def _results_from_kind_problem(basetables, external_info):
'query_id' : b.kind_problem.rule_id,
'query_kind' : "problem",
'query_precision' : [_populate_from_rule_table("precision", b, i) for i in range(len(b.kind_problem))],
'query_severity' : [_populate_from_rule_table("severity", b, i) for i in range(len(b.kind_problem))],
'query_severity' : [_populate_from_rule_table("problem.severity", b, i) for i in range(len(b.kind_problem))],
'result_type' : "kind_problem",
'codeFlow_id' : 0, # link to codeflows (kind_pathproblem only, NULL here)
@@ -240,6 +258,7 @@ def _results_from_kind_pathproblem(basetables, external_info):
# The `result` table has no entry to distinguish these, so we use a simplified
# version of `kind_pathproblem`.
reduced_kind_pathp = b.kind_pathproblem.drop(
columns=[
'relatedLocation_array_index',
@@ -284,7 +303,7 @@ def _results_from_kind_pathproblem(basetables, external_info):
'query_id' : cfid0ppt0.rule_id.values[0],
'query_kind' : "path-problem",
'query_precision' : _populate_from_rule_table_code_flow("precision", b, cfid0ppt0),
'query_severity' : _populate_from_rule_table_code_flow("severity", b, cfid0ppt0),
'query_severity' : _populate_from_rule_table_code_flow("problem.severity", b, cfid0ppt0),
#
'result_type' : "kind_pathproblem",
'codeFlow_id' : cfid0,

View File

@@ -53,6 +53,8 @@ def _signature_dict(args, elem, context: Context):
if args.typedef_signatures:
# Give every unique struct a name and use a reference to it as value.
if signature not in context.sig_to_typedef:
#cannot have leading 0 hashes later in table joins so replace now
#context.sig_to_typedef[signature] = str("Struct%04d" % shorthash(signature)).replace("0", "1")
context.sig_to_typedef[signature] = "Struct%04d" % shorthash(signature)
typedef = context.sig_to_typedef[signature]
return typedef
@@ -79,6 +81,8 @@ def _signature_list(args, elem, context):
if args.typedef_signatures:
# Give every unique array a name and use a reference to it as value.
if signature not in context.sig_to_typedef:
#cannot have leading 0 hashes later in table joins so replace now
#context.sig_to_typedef[signature] = str("Array%04d" % shorthash(signature)).replace("0", "1")
context.sig_to_typedef[signature] = "Array%04d" % shorthash(signature)
typedef = context.sig_to_typedef[signature]
return typedef
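For illustration, a toy version of the naming scheme (shorthash here is a stand-in for the repo's real helper):

def shorthash(sig: str) -> int:
    # stand-in: the real shorthash lives elsewhere in this repo
    return hash(sig) % 10000

sig_to_typedef = {}
signature = "('struct', ('uri', 'String'))"
if signature not in sig_to_typedef:
    sig_to_typedef[signature] = "Struct%04d" % shorthash(signature)
print(sig_to_typedef[signature])  # e.g. Struct0829 - note a leading 0 is possible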
@@ -225,7 +229,7 @@ dummy_newlineSequences = ['\r\n', '\n', '\u2028', '\u2029']
dummy_relatedLocations_entry = [
{'id': -1,
'physicalLocation': {'artifactLocation': {'uri': 'scli-dyys dummy value',
'uriBaseId': 'scli-dyys dummy value',
'uriBaseId': 'scli-dyys uriBaseId',
'index': -1},
'region': {'startLine': -1,
'startColumn': -1,

View File

@@ -12,9 +12,9 @@ is marked below
#
# The starting node is the leftmost node in ../notes/typegraph.pdf
#
start_node_2022_02_01 = 'Struct6787'
start_node_LGTM = 'Struct6787'
struct_graph_2022_02_01 = (
struct_graph_LGTM = (
[ ('String', 'string'),
('Int', 'int'),
('Bool', 'bool'),
@@ -121,5 +121,4 @@ struct_graph_2022_02_01 = (
('$schema', 'String'),
('runs', 'Array0177'),
('version', 'String')))]
)
)

View File

@@ -0,0 +1,161 @@
""" The signature for a single sarif file
Produced by
sarif-to-dot -u -t -f 2021-12-09/results.sarif
with some arrays manually sorted so that the signature with more fields comes first. The case
('Array6343', ('array', (1, 'Struct9699'), (0, 'Struct4055'))), # MANUALLY SORTED
is marked below
"""
#
# The starting node is the leftmost node in ../notes/typegraph.pdf
#
start_node_CLI = 'Struct5521'
# generated with CLI 2.9.4
struct_graph_CLI = (
[ ('String', 'string'),
('Int', 'int'),
('Bool', 'bool'),
( 'Struct2685',
( 'struct',
('index', 'Int'),
('uri', 'String'),
('uriBaseId', 'String'))),
('Struct5277', ('struct', ('location', 'Struct2685'))),
('Struct3497', ('struct', ('index', 'Int'), ('uri', 'String'))),
('Struct9567', ('struct', ('location', 'Struct3497'))),
('Array6920', ('array', (0, 'Struct5277'), (1, 'Struct9567'))),
('Struct1509', ('struct', ('semmle.formatSpecifier', 'String'))),
('Struct2774', ('struct', ('text', 'String'))),
( 'Struct6299',
( 'struct',
('endColumn', 'Int'),
('endLine', 'Int'),
('startColumn', 'Int'),
('startLine', 'Int'))),
( 'Struct4963',
( 'struct',
('artifactLocation', 'Struct2685'),
('region', 'Struct6299'))),
( 'Struct2683',
( 'struct',
('id', 'Int'),
('message', 'Struct2774'),
('physicalLocation', 'Struct4963'))),
('Array0350', ('array', (0, 'Struct2683'))),
( 'Struct4199',
( 'struct',
('primaryLocationLineHash', 'String'),
('primaryLocationStartColumnFingerprint', 'String'))),
('Struct3942', ('struct', ('id', 'String'), ('index', 'Int'))),
( 'Struct4055',
( 'struct',
('locations', 'Array0350'),
('message', 'Struct2774'),
('partialFingerprints', 'Struct4199'),
('relatedLocations', 'Array0350'),
('rule', 'Struct3942'),
('ruleId', 'String'),
('ruleIndex', 'Int'))),
( 'Struct7125',
( 'struct',
('artifactLocation', 'Struct3497'),
('region', 'Struct6299'))),
( 'Struct6772',
( 'struct',
('id', 'Int'),
('message', 'Struct2774'),
('physicalLocation', 'Struct7125'))),
('Array8753', ('array', (0, 'Struct6772'))),
( 'Struct0102',
( 'struct',
('locations', 'Array0350'),
('message', 'Struct2774'),
('partialFingerprints', 'Struct4199'),
('relatedLocations', 'Array8753'),
('rule', 'Struct3942'),
('ruleId', 'String'),
('ruleIndex', 'Int'))),
('Struct0987', ('struct', ('location', 'Struct2683'))),
('Array1075', ('array', (0, 'Struct0987'))),
('Struct4194', ('struct', ('locations', 'Array1075'))),
('Array1597', ('array', (0, 'Struct4194'))),
('Struct7122', ('struct', ('threadFlows', 'Array1597'))),
('Array9799', ('array', (0, 'Struct7122'))),
( 'Struct9699',
( 'struct',
('codeFlows', 'Array9799'),
('locations', 'Array0350'),
('message', 'Struct2774'),
('partialFingerprints', 'Struct4199'),
('relatedLocations', 'Array0350'),
('rule', 'Struct3942'),
('ruleId', 'String'),
('ruleIndex', 'Int'))),
( 'Array1768',
#('array', (2, 'Struct9699'), (1, 'Struct4055'),(0, 'Struct0102'))),
#('array',(0, 'Struct0102'), (1, 'Struct4055'), (2, 'Struct9699'))),
#omitting (0, 'Struct0102') means we will never find column info
('array', (2, 'Struct9699'), (1, 'Struct4055'))),
('Struct8581', ('struct', ('enabled', 'Bool'), ('level', 'String'))),
('Array7069', ('array', (0, 'String'))),
( 'Struct6853',
( 'struct',
('description', 'String'),
('id', 'String'),
('kind', 'String'),
('name', 'String'),
('precision', 'String'),
('problem.severity', 'String'),
('security-severity', 'String'),
('severity', 'String'),
('sub-severity', 'String'),
('tags', 'Array7069'))),
( 'Struct7100',
( 'struct',
('defaultConfiguration', 'Struct8581'),
('fullDescription', 'Struct2774'),
('id', 'String'),
('name', 'String'),
('properties', 'Struct6853'),
('shortDescription', 'Struct2774'))),
('Array0147', ('array', (0, 'Struct7100'))),
( 'Struct7828',
( 'struct',
('name', 'String'),
('organization', 'String'),
('rules', 'Array0147'),
('semanticVersion', 'String'))),
( 'Struct9027',
('struct', ('description', 'Struct2774'), ('uri', 'String'))),
('Array4813', ('array', (0, 'Struct9027'))),
( 'Struct6152',
( 'struct',
('locations', 'Array4813'),
('name', 'String'),
('semanticVersion', 'String'))),
('Struct7826', ('struct', ('locations', 'Array4813'), ('name', 'String'))),
('Array9357', ('array', (0, 'Struct6152'), (1, 'Struct7826'))),
( 'Struct0032',
('struct', ('driver', 'Struct7828'), ('extensions', 'Array9357'))),
( 'Struct3081',
('struct', ('repositoryUri', 'String'), ('revisionId', 'String'))),
('Array5511', ('array', (0, 'Struct3081'))),
( 'Struct9786',
( 'struct',
('artifacts', 'Array6920'),
('columnKind', 'String'),
('newlineSequences', 'Array7069'),
('properties', 'Struct1509'),
('results', 'Array1768'),
('tool', 'Struct0032'),
('versionControlProvenance', 'Array5511'))),
('Array1273', ('array', (0, 'Struct9786'))),
( 'Struct5521',
( 'struct',
('$schema', 'String'),
('runs', 'Array1273'),
('version', 'String')))] )
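A quick sanity check of how this signature can be consumed (toy walk, assuming the pairs load as shown):

graph = dict(struct_graph_CLI)           # name -> definition
root = graph[start_node_CLI]             # the Struct5521 definition
print([field for field, _ in root[1:]])  # ['$schema', 'runs', 'version']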

View File

@@ -73,13 +73,12 @@ def joins_for_af_0350_location(tgraph):
)
return af_0350_location
def joins_for_sf_2683(tgraph):
def joins_for_location_info(tgraph):
"""
Join all the tables used by 2683's right side into one.
"""
# Access convenience functions
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
af = lambda num: tgraph.dataframes['Array' + str(num)]
#
sf_2683 = (
#
@@ -116,6 +115,8 @@ def joins_for_problem(tgraph, af_0350_location):
#
# Form the message dataframe (@kind problem) via joins
#
import IPython
IPython.embed(header="spot 1")
kind_problem_1 = (
aft(6343)

View File

@@ -0,0 +1,462 @@
""" Collection of joins for the base tables provided by typegraph.attach_tables()
The `problem` and `path-problem` entries provide that information; the
`relatedLocations` table provides the details when multiple results are
present for either. `project` is the high-level overview; `artifacts`
provides those for the other tables.
"""
import pandas as pd
import re
from .typegraph import tagged_array_columns, tagged_struct_columns
class BaseTablesTypes:
codeflows = {
"codeflow_id" : pd.UInt64Dtype(),
"codeflow_index" : pd.Int64Dtype(),
"threadflow_index" : pd.Int64Dtype(),
"location_index" : pd.Int64Dtype(),
"endColumn" : pd.Int64Dtype(),
"endLine" : pd.Int64Dtype(),
"startColumn" : pd.Int64Dtype(),
"startLine" : pd.Int64Dtype(),
"artifact_index" : pd.Int64Dtype(),
"uri" : pd.StringDtype(),
"uriBaseId" : pd.StringDtype(),
"message" : pd.StringDtype(),
}
def joins_for_af_0350_location(tgraph):
"""
Join all the tables used by 0350's right side into one.
"""
# Access convenience functions
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
af = lambda num: tgraph.dataframes['Array' + str(num)]
sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id))
aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id))
af_0350_location = (
aft('0350')
#
.merge(sft(2683), how="left", left_on='t0350_id_or_value_at_index', right_on='t2683_struct_id',
validate="1:m")
.drop(columns=['t0350_id_or_value_at_index', 't2683_struct_id', 't0350_type_at_index'])
#
.merge(sft(4963), how="left", left_on='t2683_physicalLocation', right_on='t4963_struct_id',
validate="1:m")
.drop(columns=['t2683_physicalLocation', 't4963_struct_id'])
#
.merge(sft(6299), how="left", left_on='t4963_region', right_on='t6299_struct_id',
validate="1:m")
.drop(columns=['t4963_region', 't6299_struct_id'])
#
.merge(sft(2685), how="left", left_on='t4963_artifactLocation', right_on='t2685_struct_id',
validate="1:m")
.drop(columns=['t4963_artifactLocation', 't2685_struct_id'])
#
.merge(sft(2774), how="left", left_on='t2683_message', right_on='t2774_struct_id',
validate="1:m")
.drop(columns=['t2683_message', 't2774_struct_id'])
#
.rename(columns={'t0350_array_id' : 'm0350_location_array_id',
't0350_value_index' : 'm0350_location_array_index',
't2683_id' : 'm0350_location_id',
't6299_endColumn' : 'm0350_location_endColumn',
't6299_endLine' : 'm0350_location_endLine',
't6299_startColumn' : 'm0350_location_startColumn',
't6299_startLine' : 'm0350_location_startLine',
't2685_index' : 'm0350_location_index',
't2685_uri' : 'm0350_location_uri',
't2685_uriBaseId' : 'm0350_location_uriBaseId',
't2774_text' : 'm0350_location_message',
})
)
return af_0350_location
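A toy reduction of the tagged rename-then-merge pattern used above (column tags invented for the sketch; the real ones come from tagged_array_columns/tagged_struct_columns):

import pandas as pd

locations = pd.DataFrame({"t0350_array_id": [10], "t0350_id_or_value_at_index": [1]})
structs = pd.DataFrame({"t2683_struct_id": [1], "t2683_id": [42]})
joined = (locations
          .merge(structs, how="left",
                 left_on="t0350_id_or_value_at_index",
                 right_on="t2683_struct_id", validate="1:m")
          .drop(columns=["t0350_id_or_value_at_index", "t2683_struct_id"]))
print(list(joined.columns))  # ['t0350_array_id', 't2683_id']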
def joins_for_location_info(tgraph):
"""
Join all the tables used by 2683's right side into one.
"""
# Access convenience functions
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
#
sf_2683 = (
#
sf(2683)
.rename(columns={"struct_id": "struct_id_2683", "id": "id_2683"})
#
.merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'physicalLocation'])
#
.merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'region'])
#
.merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'artifactLocation'])
.rename(columns={"index": "location_index_2685"})
#
.merge(sf(2774), how="left", left_on='message', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'message'])
.rename(columns={"text": "message_text_2683"})
#
)
return sf_2683
def joins_for_problem(tgraph, af_0350_location):
"""
Return table providing the `problem` information.
"""
# Access convenience functions
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
af = lambda num: tgraph.dataframes['Array' + str(num)]
sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id))
aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id))
#
# Form the message dataframe (@kind problem) via joins
#
kind_problem_1 = (
aft(1768)
.merge(sft(4055), how="inner",
left_on='t1768_id_or_value_at_index', right_on='t4055_struct_id',
validate="1:m")
.drop(columns=['t1768_type_at_index', 't1768_id_or_value_at_index',
't4055_struct_id'])
#
.merge(af_0350_location, how="left", left_on='t4055_locations',
right_on='m0350_location_array_id', validate="1:m")
.drop(columns=['t4055_locations', 'm0350_location_array_id'])
#
.merge(af_0350_location.rename(columns=lambda x: re.sub('m0350_location',
'm0350_relatedLocation',
x)),
how="left", left_on='t4055_relatedLocations',
right_on='m0350_relatedLocation_array_id', validate="1:m")
.drop(columns=['t4055_relatedLocations', 'm0350_relatedLocation_array_id'])
#
.merge(sft(2774), how="left", left_on='t4055_message', right_on='t2774_struct_id')
.drop(columns=['t4055_message', 't2774_struct_id'])
.rename(columns={"t2774_text": "t4055_message_text"})
#
.merge(sft(4199), how="left", left_on='t4055_partialFingerprints',
right_on='t4199_struct_id')
.drop(columns=['t4055_partialFingerprints', 't4199_struct_id'])
#
.merge(sft(3942), how="left", left_on='t4055_rule',
right_on='t3942_struct_id')
.drop(columns=['t4055_rule', 't3942_struct_id'])
)
kind_problem_2 = (
kind_problem_1
.rename({
't1768_array_id' : 'results_array_id',
't1768_value_index' : 'results_array_index',
't4055_ruleId' : 'ruleId',
't4055_ruleIndex' : 'ruleIndex',
't4055_message_text' : 'message_text',
't3942_id' : 'rule_id',
't3942_index' : 'rule_index',
}, axis='columns')
# Strip type prefix for the rest
.rename(columns = lambda x: re.sub('m0350_|t4199_', '', x))
)
return kind_problem_2
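The final prefix-strip rename behaves as follows on a toy frame (column names invented):

import re
import pandas as pd

df = pd.DataFrame(columns=["m0350_location_uri", "t4199_primaryLocationLineHash"])
df = df.rename(columns=lambda x: re.sub("m0350_|t4199_", "", x))
print(list(df.columns))  # ['location_uri', 'primaryLocationLineHash']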
def joins_for_codeflows(tgraph, sf_2683):
"""
Return the table providing the `codeFlows` for a `path-problem` table.
"""
# Access convenience functions
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
af = lambda num: tgraph.dataframes['Array' + str(num)]
#
codeflows = (
af(9799).rename(columns={"array_id": "t9799_array_id", "value_index": "t9799_idx"})
#
.merge(sf(7122), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
.drop(columns=['id_or_value_at_index', 'struct_id', 'type_at_index'])
#
.merge(af(1597).rename(columns={"array_id": "t1597_array_id", "value_index": "t1597_idx"}),
how="left", left_on='threadFlows', right_on='t1597_array_id', validate="1:m")
.drop(columns=['threadFlows', 't1597_array_id', 'type_at_index'])
#
.merge(sf(4194), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
.drop(columns=['id_or_value_at_index', 'struct_id'])
#
.merge(af(1075).rename(columns={"array_id": "t1075_array_id", "value_index": "t1075_idx"}),
how="left", left_on='locations', right_on='t1075_array_id', validate="1:m")
.drop(columns=['locations', 't1075_array_id', 'type_at_index'])
.rename(columns={"t1075_idx": "t1075_locations_idx"})
#
.merge(sf('0987'), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
.drop(columns=['id_or_value_at_index', 'struct_id'])
#
.merge(sf_2683, how="left", left_on='location', right_on='struct_id_2683', validate="1:m")
.drop(columns=['location', 'struct_id_2683'])
)
codeflows_1 = (
codeflows
.drop(columns=['id_2683'])
.rename({
't9799_array_id': 'codeflow_id',
't9799_idx': 'codeflow_index',
't1597_idx': 'threadflow_index',
't1075_locations_idx': 'location_index',
'location_index_2685': 'artifact_index',
'message_text_2683': 'message',
}, axis='columns')
)
codeflows_2 = codeflows_1.astype(BaseTablesTypes.codeflows).reset_index(drop=True)
return codeflows_2
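The closing astype() matters because the nullable dtypes in BaseTablesTypes.codeflows preserve missing values; a two-column toy:

import pandas as pd

df = pd.DataFrame({"codeflow_index": [0, None], "message": ["taint step", None]})
df = df.astype({"codeflow_index": pd.Int64Dtype(), "message": pd.StringDtype()})
print(df.dtypes)  # codeflow_index -> Int64, message -> string; None becomes <NA>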
def joins_for_path_problem(tgraph, af_0350_location):
"""
Return table providing the `path-problem` information.
"""
# Access convenience functions
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
af = lambda num: tgraph.dataframes['Array' + str(num)]
sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id))
aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id))
kind_pathproblem_1 = (
aft(1768)
.merge(sft(9699), how="inner", left_on='t1768_id_or_value_at_index', right_on='t9699_struct_id',
validate="1:m")
.drop(columns=['t1768_id_or_value_at_index', 't9699_struct_id', 't1768_type_at_index'])
#
.merge(af_0350_location, how="left", left_on='t9699_locations',
right_on='m0350_location_array_id', validate="1:m")
.drop(columns=['t9699_locations', 'm0350_location_array_id'])
#
.merge(af_0350_location.rename(columns=lambda x: re.sub('m0350_location',
'm0350_relatedLocation',
x)),
how="left", left_on='t9699_relatedLocations',
right_on='m0350_relatedLocation_array_id', validate="1:m")
.drop(columns=['t9699_relatedLocations', 'm0350_relatedLocation_array_id'])
#
.merge(sft(2774), how="left", left_on='t9699_message', right_on='t2774_struct_id')
.drop(columns=['t9699_message', 't2774_struct_id'])
.rename(columns={"t2774_text": "t9699_message_text"})
#
.merge(sft(4199), how="left", left_on='t9699_partialFingerprints',
right_on='t4199_struct_id')
.drop(columns=['t9699_partialFingerprints', 't4199_struct_id'])
#
.merge(sft(3942), how="left", left_on='t9699_rule',
right_on='t3942_struct_id')
.drop(columns=['t9699_rule', 't3942_struct_id'])
)
strip_columns = lambda x: re.sub('t9699_|m0350_|t4199_', '', x)
kind_pathproblem_2 = (kind_pathproblem_1
.rename({
't1768_array_id' : 'results_array_id',
't1768_value_index' : 'results_array_index',
't9699_codeFlows' : 'codeFlows_id',
't9699_ruleId' : 'ruleId',
't9699_ruleIndex' : 'ruleIndex',
't9699_message_text' : 'message_text',
't3942_id' : 'rule_id',
't3942_index' : 'rule_index',
}, axis='columns')
# Strip type prefix for the rest
.rename(columns = strip_columns))
return kind_pathproblem_2
def joins_for_relatedLocations(tgraph, sf_2683):
"""
Return table providing the `relatedLocations` and `locations` information.
"""
# Access convenience functions
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
af = lambda num: tgraph.dataframes['Array' + str(num)]
#
# Form the relatedLocation dataframe via joins, starting from the union of
# relatedLocations from `kind problem` (sf(4055)) and `kind path-problem`
# (sf(9699)).
#
related_locations_1 = (
pd.concat([sf(4055)[['relatedLocations', 'struct_id']], sf(9699)[['relatedLocations', 'struct_id']]])
.merge(af('0350'), how="left", left_on='relatedLocations', right_on='array_id', validate="1:m")
.drop(columns=['relatedLocations', 'array_id', 'value_index', 'type_at_index'])
#
.merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id',
suffixes=("_4055_9699", "_2683"), validate="1:m")
.drop(columns=['struct_id_2683', 'id_or_value_at_index'])
#
.merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'physicalLocation'])
#
.merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'region'])
#
.merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'artifactLocation'])
#
.merge(sf(2774), how="left", left_on='message', right_on='struct_id', validate="1:m")
.drop(columns=['struct_id', 'message'])
)
# Keep columns of interest
related_locations_2 = (related_locations_1[['struct_id_4055_9699', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text']]
.rename({'text': 'message', 'struct_id_4055_9699': 'struct_id'}, axis='columns'))
# Remove dummy locations previously injected by signature.fillsig
related_locations_3 = related_locations_2[related_locations_2.uri != 'scli-dyys dummy value']
return related_locations_3
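The dummy filter drops the sentinel rows injected by signature.fillsig; with toy rows (the sentinel string is the real one):

import pandas as pd

df = pd.DataFrame({"uri": ["src/main.c", "scli-dyys dummy value"],
                   "startLine": [3, -1]})
print(df[df.uri != "scli-dyys dummy value"])  # only src/main.c survives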
def joins_for_project_single(tgraph):
"""
Return table providing the `project` information for sarif-extract-scans
"""
# Access convenience functions
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
af = lambda num: tgraph.dataframes['Array' + str(num)]
#
project_df_temp1 = (
sf(5521)
.rename(columns={"version": "version_5521", "struct_id": "struct_id_5521"})
#
.merge(af('1273'), how="left", left_on='runs', right_on='array_id',
validate="1:m")
.drop(columns=['runs', 'array_id', 'type_at_index'])
.rename(columns={"value_index": "value_index_1273"})
#
.merge(sf(9786), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
.drop(columns=['id_or_value_at_index', 'struct_id']))
#
# newlineSequences may be present or not - handle both cases
if 'newlineSequences' in project_df_temp1:
project_df_temp2 = project_df_temp1.drop(columns=['newlineSequences'])
project_df_temp2 = (
project_df_temp1
#
.merge(sf(1509), how="left", left_on='properties', right_on='struct_id', validate="1:m")
.drop(columns=['properties', 'struct_id'])
#
# tool - driver - rules - defaultConfiguration - ( properties - tags )
#
.merge(sf('0032'), how="left", left_on='tool', right_on='struct_id', validate="1:m")
.drop(columns=['tool', 'struct_id'])
#
.merge(sf(7828), how="left", left_on='driver', right_on='struct_id', validate="1:m")
.drop(columns=['driver', 'struct_id'])
.rename(columns={"semanticVersion": "driver_version_7828", "name": "driver_name_7828"})
#
# assumed to be present
.merge(af(5511), how="left", left_on='versionControlProvenance', right_on='array_id')
.drop(columns=['versionControlProvenance', 'array_id', 'type_at_index'])
.rename(columns={"value_index": "versionControl_value_index_5511"})
#
.merge(sf(3081), how="left", left_on='id_or_value_at_index', right_on='struct_id')
.drop(columns=['id_or_value_at_index', 'struct_id'])
)
#
# Keep columns of interest
project_df_1 = (
project_df_temp2
.drop(columns=['struct_id_5521', 'versionControl_value_index_5511'])
.rename({
'version_5521': 'sarif_version',
'value_index_1273': 'run_index',
'driver_name_7828': 'driver_name',
'driver_version_7828': 'driver_version',
}, axis='columns')
)
return project_df_1
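Note the membership test in joins_for_project_single checks column names, not values; a one-liner to confirm:

import pandas as pd

df = pd.DataFrame({"newlineSequences": [["\n"]]})
print("newlineSequences" in df)  # True - `in` on a DataFrame scans column labels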
def joins_for_rules(tgraph):
"""
Return table providing the `rules` information.
"""
# Access convenience functions
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id))
af = lambda num: tgraph.dataframes['Array' + str(num)]
aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id))
#
rules_df = (
aft('0147')
#
.drop(columns=['t0147_type_at_index'])
#
.merge(sft(7100), how="left", left_on='t0147_id_or_value_at_index',
right_on='t7100_struct_id',
validate="1:m")
.drop(columns=['t0147_id_or_value_at_index', 't7100_struct_id'])
#
.merge(sft(8581), how="left", left_on='t7100_defaultConfiguration',
right_on='t8581_struct_id', validate="1:m")
.drop(columns=['t7100_defaultConfiguration', 't8581_struct_id'])
#
.merge(sft(2774), how="left", left_on='t7100_fullDescription',
right_on='t2774_struct_id', validate="1:m")
.drop(columns=['t7100_fullDescription', 't2774_struct_id'])
.rename(columns={'t2774_text': "t7100_t2774_fullDescription"})
#
.merge(sft(2774), how="left", left_on='t7100_shortDescription',
right_on='t2774_struct_id', validate="1:m")
.drop(columns=['t7100_shortDescription', 't2774_struct_id'])
.rename(columns={"t2774_text": 't7100_t2774_shortDescription'})
#
.merge(sft(6853), how="left", left_on='t7100_properties',
right_on='t6853_struct_id', validate="1:m")
.drop(columns=['t7100_properties', 't6853_struct_id', 't6853_id'])
#
.merge(aft(7069), how="left", left_on='t6853_tags',
right_on='t7069_array_id', validate="1:m")
.drop(columns=['t6853_tags', 't7069_array_id', 't7069_type_at_index'])
)
rules_2 = (
rules_df
.rename({
't0147_array_id' : 'rules_array_id',
't0147_value_index' : 'rules_array_index',
't7069_value_index' : 'tag_index',
't7069_id_or_value_at_index' : 'tag_text',
}, axis='columns')
# Strip type prefix for the rest
.rename(columns = lambda x: re.sub('t7100_t2774_|t7100_|t8581_|t6853_', '', x))
)
return rules_2
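Merging the same lookup table twice, renaming after each pass so the two text columns don't collide, mirrors the fullDescription/shortDescription joins above (toy data):

import pandas as pd

text = pd.DataFrame({"t2774_struct_id": [1, 2], "t2774_text": ["long", "short"]})
rules = pd.DataFrame({"t7100_fullDescription": [1], "t7100_shortDescription": [2]})
out = (rules
       .merge(text, left_on="t7100_fullDescription", right_on="t2774_struct_id")
       .drop(columns=["t7100_fullDescription", "t2774_struct_id"])
       .rename(columns={"t2774_text": "fullDescription"})
       .merge(text, left_on="t7100_shortDescription", right_on="t2774_struct_id")
       .drop(columns=["t7100_shortDescription", "t2774_struct_id"])
       .rename(columns={"t2774_text": "shortDescription"}))
print(out)  # fullDescription='long', shortDescription='short'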
def joins_for_artifacts(tgraph):
"""
Return table providing the `artifacts` information.
"""
# Access convenience functions
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
af = lambda num: tgraph.dataframes['Array' + str(num)]
#
artifacts_df = (
af(6920)
#
.merge(sf(5277), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
.drop(columns=['id_or_value_at_index', 'struct_id', 'type_at_index'])
.rename(columns={"value_index": "artifact_index_6920"})
#
.merge(sf(2685), how="left", left_on='location', right_on='struct_id', validate="1:m")
.drop(columns=['location', 'struct_id'])
)
# Keep columns of interest and rename
df_1 = (
artifacts_df
.rename({
'array_id': 'artifacts_id',
'artifact_index_6920': 'artifacts_array_index',
}, axis='columns')
)
if (df_1['artifacts_array_index'] == df_1['index']).all():
df_1 = df_1.drop(columns=['artifacts_array_index'])
return df_1
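The redundancy check at the end drops artifacts_array_index only when it duplicates index; in miniature:

import pandas as pd

df = pd.DataFrame({"artifacts_array_index": [0, 1], "index": [0, 1]})
if (df["artifacts_array_index"] == df["index"]).all():
    df = df.drop(columns=["artifacts_array_index"])
print(list(df.columns))  # ['index']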

View File

@@ -179,13 +179,21 @@ def _destructure_dict(typegraph: Typegraph, node, tree):
if specific_missing not in status_writer.input_sarif_missing["extra_info"]:
status_writer.input_sarif_missing["extra_info"] += specific_missing
status_writer.warning_set["input_sarif_missing"]+=1
raise MissingFieldException(
f"(Sub)tree is missing fields required by typedef.\n"
f"Expected {type_fields}, found {tree_fields}.\n"
f"Missing {set(type_fields) - set(tree_fields)}\n"
f"Note: these fields are post-signature fill and may be more extensive than the orginal. \n"
f"Check input file for the original signature."
)
# special case where we no longer try other signatures
# else the exception here triggers a retry - mainly needed for Struct9699 or Struct4055
difference = set(type_fields) - set(tree_fields)
if "uriBaseId" in difference:
tree["uriBaseId"] = "default"
_destructure_dict_1(typegraph, node, tree)
else:
raise MissingFieldException(
f"(Sub)tree is missing fields required by typedef.\n"
f"Expected {type_fields}, found {tree_fields}.\n"
f"Missing {set(type_fields) - set(tree_fields)}\n"
f"Note: these fields are post-signature fill and may be more extensive than the orginal. \n"
f"Check input file for the original signature."
)
else:
status_writer.unknown_sarif_parsing_shape["extra_info"] = "type fields {} do not match tree fields {}.".format(type_fields, tree_fields)
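The recovery path above, in miniature (toy fields; only a missing uriBaseId triggers the patch-and-retry):

type_fields = {"uri", "uriBaseId", "index"}
tree = {"uri": "src/main.c", "index": 0}
difference = type_fields - set(tree)
if "uriBaseId" in difference:
    tree["uriBaseId"] = "default"  # same default the commit injects
print(tree)  # uriBaseId filled in, so _destructure_dict_1 can proceed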