From 2ba9593d7087226bb25354c32614021996e2d369 Mon Sep 17 00:00:00 2001
From: Kristen Newbury
Date: Thu, 1 Dec 2022 11:37:56 -0500
Subject: [PATCH 1/4] Add CLI support, enabled by -f flag with value CLI

Tested on sarif from CodeQL CLIs 2.6.3, 2.9.4, and 2.11.4.
Note: the input sarif MUST contain the versionControlProvenance property.
---
 bin/sarif-extract-multi           |   2 +-
 bin/sarif-extract-scans           |  81 ++++--
 bin/sarif-extract-scans-runner    |  16 +-
 bin/sarif-extract-tables          |   6 +-
 sarif_cli/scan_tables.py          |  65 +++--
 sarif_cli/signature.py            |   6 +-
 sarif_cli/signature_single.py     |   7 +-
 sarif_cli/signature_single_CLI.py | 161 +++++++++++
 sarif_cli/table_joins.py          |   5 +-
 sarif_cli/table_joins_CLI.py      | 462 ++++++++++++++++++++++++++++++
 sarif_cli/typegraph.py            |  22 +-
 11 files changed, 765 insertions(+), 68 deletions(-)
 create mode 100644 sarif_cli/signature_single_CLI.py
 create mode 100644 sarif_cli/table_joins_CLI.py

diff --git a/bin/sarif-extract-multi b/bin/sarif-extract-multi
index 66f40ac..c5f5655 100755
--- a/bin/sarif-extract-multi
+++ b/bin/sarif-extract-multi
@@ -81,7 +81,7 @@ bt = BaseTables()
 #
 # Add dataframes
 #
-sf_2683 = tj.joins_for_sf_2683(tgraph)
+sf_2683 = tj.joins_for_location_info(tgraph)
 af_0350_location = tj.joins_for_af_0350_location(tgraph)
 bt.artifacts = tj.joins_for_artifacts(tgraph)
 bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683)
diff --git a/bin/sarif-extract-scans b/bin/sarif-extract-scans
index 94052e9..c5b5cb9 100755
--- a/bin/sarif-extract-scans
+++ b/bin/sarif-extract-scans
@@ -2,7 +2,7 @@
 """ Extract scan data from multiple sarif files in table form.
 """
 from dataclasses import dataclass
-from sarif_cli import signature, signature_single
+from sarif_cli import signature, signature_single, signature_single_CLI
 from sarif_cli import typegraph
 from sarif_cli import snowflake_id
 from sarif_cli import status_writer
@@ -14,6 +14,7 @@ import logging
 import pandas as pd
 import pathlib
 import sarif_cli.table_joins as tj
+import sarif_cli.table_joins_CLI as tj_CLI
 import sarif_cli.scan_tables as st
 import sys
 
@@ -32,8 +33,18 @@ parser.add_argument('outdir', metavar='output-dir', type=str, help='output direc
 parser.add_argument('csvout', metavar='csv-outfile', type=str, help='processing status csv output file name to use')
 parser.add_argument('-r', '--write-raw-tables', action="store_true",
                     help='Write the raw sarif tables to the output directory')
+parser.add_argument('-f', '--input-signature', metavar='input-signature', type=str, default="LGTM",
+                    help='Signature of the sarif: where it was generated may affect its signature. '
+                    'Options: LGTM, CLI. '
+                    'If the currently supported signatures are not sufficient, see signature_single.py for how to support further signatures.'
+                    ' Default: "%(default)s"')
 args = parser.parse_args()
 
+if args.input_signature not in ["LGTM","CLI"]:
+    print("Unsupported sarif signature requested.")
+    print("Use one of [LGTM, CLI].")
+    sys.exit(1)
+
 # Setup csv error writer
 status_writer.setup_csv_writer(args.csvout)
 
@@ -66,11 +77,20 @@ context = signature.Context(
 )
 sarif_struct = signature.fillsig(args, sarif_struct, context)
 
+#
+# Set up which signature to use
+if args.input_signature == "LGTM":
+    signature_to_use = signature_single.struct_graph_LGTM
+    start_node = signature_single.start_node_LGTM
+else:
+    signature_to_use = signature_single_CLI.struct_graph_CLI
+    start_node = signature_single_CLI.start_node_CLI
 #
 # Use reference type graph (signature) to traverse sarif and attach values to tables
 try:
-    tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
-    typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)
+    tgraph = typegraph.Typegraph(signature_to_use)
+    typegraph.destructure(tgraph, start_node, sarif_struct)
 except Exception:
     # will have gathered errors/warnings
     status_writer.csv_write_warnings()
@@ -126,31 +146,29 @@ external_info = ExternalInfo(
 #
 # Add dataframes for base tables
 #
-sf_2683 = tj.joins_for_sf_2683(tgraph)
-af_0350_location = tj.joins_for_af_0350_location(tgraph)
-bt.artifacts = tj.joins_for_artifacts(tgraph)
-bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683)
-bt.kind_pathproblem = tj.joins_for_path_problem(tgraph, af_0350_location)
-bt.kind_problem = tj.joins_for_problem(tgraph, af_0350_location)
-bt.project = tj.joins_for_project_single(tgraph)
-bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, sf_2683)
-bt.rules = tj.joins_for_rules(tgraph)
+# (relies on some specifics of the signature type)
+if args.input_signature == "CLI":
+    tj = tj_CLI
+try:
+    location_info = tj.joins_for_location_info(tgraph)
+    af_0350_location = tj.joins_for_af_0350_location(tgraph)
+    bt.artifacts = tj.joins_for_artifacts(tgraph)
+    bt.codeflows = tj.joins_for_codeflows(tgraph, location_info)
+    bt.kind_pathproblem = tj.joins_for_path_problem(tgraph, af_0350_location)
+    bt.kind_problem = tj.joins_for_problem(tgraph, af_0350_location)
+    bt.project = tj.joins_for_project_single(tgraph)
+    bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, location_info)
+    bt.rules = tj.joins_for_rules(tgraph)
+except Exception:
+    # warnings may have accumulated; write them before re-raising
+    status_writer.csv_write_warnings()
+    raise
 
 #
-# Form scan tables
+# Set up the rest of the base tables
 #
-# joins for projects has to happen first as it backfills the guess about the project_id
-scantabs.projects = st.joins_for_projects(bt, external_info, scantabs)
-scantabs.results = st.joins_for_results(bt, external_info)
-scantabs.scans = st.joins_for_scans(bt, external_info, scantabs)
-
-
-
-#
-# Replace the remaining internal ids with snowflake ids
-#
-flakegen = snowflake_id.Snowflake(0)
-
 bt.columns_to_reindex = {
     # template from {field.name : [''] for field in dc.fields(bt)}
     'artifacts': ['artifacts_id'],
@@ -167,6 +185,19 @@ scantabs.columns_to_reindex = {
     'results': ['codeFlow_id'],
 }
 
+#
+# Form scan tables
+#
+# joins for projects has to happen first as it backfills the guess about the project_id
+scantabs.projects = st.joins_for_projects(bt, external_info)
+scantabs.results = st.joins_for_results(bt, external_info)
+scantabs.scans = st.joins_for_scans(bt, external_info, scantabs, args.input_signature)
+
+#
+# Replace the remaining internal ids with snowflake ids
+#
+flakegen = snowflake_id.Snowflake(0)
+
 _id_to_flake = {}
 def _get_flake(id):
     flake = _id_to_flake.get(id, -1)
diff --git a/bin/sarif-extract-scans-runner b/bin/sarif-extract-scans-runner
index 5c5a983..a069493 100755
--- a/bin/sarif-extract-scans-runner
+++ b/bin/sarif-extract-scans-runner
@@ -87,7 +87,14 @@ from sarif_cli import hash
 parser = argparse.ArgumentParser(description='Run sarif-extract-scans over a directory hierarchy')
 parser.add_argument('sarif_files', metavar='sarif-files', type=str,
                     help='File containing list of sarif files, use - for stdin')
-parser.add_argument('-o','--outdir', metavar='output-dir', type=str, default="", help='output directory')
+
+parser.add_argument('-f', '--input-signature', metavar='input-signature', type=str, default="LGTM",
+                    help='Signature of the sarif: where it was generated may affect its signature. '
+                    'Options: LGTM, CLI. '
+                    'If the currently supported signatures are not sufficient, see signature_single.py for how to support further signatures.'
+                    ' Default: "%(default)s"')
+
+parser.add_argument('-o','--outdir', metavar='output-dir', type=str, default="", help='Output directory')
 parser.add_argument('-m', '--max-files', metavar='number', type=int, default=100000,
                     help='Maximum number of files to process.'
@@ -126,6 +133,11 @@ if outer_dir != "":
     except FileExistsError:
         pass
 
+if args.input_signature not in ["LGTM","CLI"]:
+    print("Unsupported sarif signature requested.")
+    print("Use one of [LGTM, CLI].")
+    sys.exit(1)
+
 #
 # Collect sarif file information
 #
@@ -205,7 +217,7 @@ for path in paths:
         scan_log_file = os.path.join(outer_dir+ project, component + ".scanlog")
         csv_outfile = os.path.join(outer_dir+ project, component)
 
-        runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, csv_outfile],
+        runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, csv_outfile, "-f", args.input_signature],
                                   capture_output=True, text=True)
         if runstats.returncode == 0:
             print("{:6} {}".format("OK", path))
diff --git a/bin/sarif-extract-tables b/bin/sarif-extract-tables
index 439b335..97820b3 100755
--- a/bin/sarif-extract-tables
+++ b/bin/sarif-extract-tables
@@ -59,8 +59,8 @@ sarif_struct = signature.fillsig(args, sarif_struct, context)
 #
 # Use reference type graph (signature) to traverse sarif and attach values to tables
 #
-tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
-typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)
+tgraph = typegraph.Typegraph(signature_single.struct_graph_LGTM)
+typegraph.destructure(tgraph, signature_single.start_node_LGTM, sarif_struct)
 
 #
 # Form output tables
@@ -84,7 +84,7 @@ bt = BaseTables()
 #
 # Add dataframes
 #
-sf_2683 = tj.joins_for_sf_2683(tgraph)
+sf_2683 = tj.joins_for_location_info(tgraph)
 af_0350_location = tj.joins_for_af_0350_location(tgraph)
 bt.artifacts = tj.joins_for_artifacts(tgraph)
 bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683)
diff --git a/sarif_cli/scan_tables.py b/sarif_cli/scan_tables.py
index d6b79a7..ce709d1 100644
--- a/sarif_cli/scan_tables.py
+++ b/sarif_cli/scan_tables.py
@@ -73,36 +73,49 @@ class ScanTablesTypes:
 #
 # Projects table
 #
-def joins_for_projects(basetables, external_info, scantables):
+def joins_for_projects(basetables, external_info):
     """
     Form the 'projects' table for the ScanTables dataclass
     """
     b = basetables; e = external_info
-
-    # For a repository url of the form
-    #    (git|https)://*/org/project.*
-    # use the org/project part as the project_name.
-    #
-    # TODO knewbury error handling for if the signature is slotted out?
-    repo_url = b.project.repositoryUri[0]
-    url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url)
-    if url_parts:
-        project_name = f"{url_parts.group(2)}-{url_parts.group(3)}"
-        project, component = e.sarif_file_name.rstrip().split('/')
-        # if the runners guess from the filename was bad, replace with real info
-        # and continue to use that scanspec to pass that around
-        if project_name != project+"-"+component:
-            e.project_id = hash.hash_unique(project_name.encode())
+
+    # if the sarif does not have versionControlProvenance, semmle.sourceLanguage etc.,
+    # there is no reliable way to know the project name,
+    # and we still need to use a guess about the project id
+    if "repositoryUri" in b.project:
+        repo_url = b.project.repositoryUri[0]
+        # For a repository url of the form
+        #    (git|https)://*/org/project.*
+        # use the org/project part as the project_name.
+        #
+        url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url)
+        if url_parts:
+            project_name = f"{url_parts.group(2)}-{url_parts.group(3)}"
+            project, component = e.sarif_file_name.rstrip().split('/')
+            # if the runner's guess from the filename was bad, replace with real info
+            # and continue to use that scanspec to pass that around
+            if project_name != project+"-"+component:
+                e.project_id = hash.hash_unique(project_name.encode())
+        else:
+            project_name = pd.NA
     else:
+        repo_url = "unknown"
         project_name = pd.NA
+
+    if 'semmle.sourceLanguage' in b.project:
+        srcLang = b.project['semmle.sourceLanguage'][0]
+        allLang = ",".join(list(b.project['semmle.sourceLanguage']))
+    else:
+        srcLang = "unknown"
+        allLang = "unknown"
 
     res = pd.DataFrame(data={
         "id" : e.project_id,
         "project_name" : project_name,
         "creation_date" : pd.Timestamp(0.0, unit='s'), # TODO: external info
         "repo_url" : repo_url,
-        "primary_language" : b.project['semmle.sourceLanguage'][0], # TODO: external info
-        "languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage']))
+        "primary_language" : srcLang, # TODO: external info if CLI sarif
+        "languages_analyzed" : allLang # TODO: external info if CLI sarif
     }, index=[0])
 
     # Force all column types to ensure appropriate formatting
@@ -112,7 +125,7 @@ def joins_for_projects(basetables, external_info, scantables):
 #
 # Scans table
 #
-def joins_for_scans(basetables, external_info, scantables):
+def joins_for_scans(basetables, external_info, scantables, sarif_type):
     """
     Form the `scans` table for the ScanTables dataclass
     """
@@ -122,9 +135,14 @@ def joins_for_scans(basetables, external_info, scantables):
     driver_version = b.project.driver_version.unique()
     assert len(driver_version) == 1, \
         "More than one driver version found for single sarif file."
+    # TODO if commit id exists in external info for CLI gen'd sarif, add?
+    if sarif_type == "LGTM":
+        commit_id = b.project.revisionId[0]
+    else:
+        commit_id = "unknown"
     res = pd.DataFrame(data={
         "id" : e.scan_id,
-        "commit_id" : b.project.revisionId[0],
+        "commit_id" : commit_id,
         "project_id" : e.project_id,
         # TODO extract real date information from somewhere external
         "db_create_start" : pd.Timestamp(0.0, unit='s'),
@@ -159,7 +177,7 @@ def joins_for_results(basetables, external_info):
     tables = [_results_from_kind_problem(basetables, external_info),
               _results_from_kind_pathproblem(basetables, external_info)]
     stack = [table for table in tables if len(table) > 0]
-    
+    # Concatenation fails without at least one table, so avoid that.
     if len(stack) > 0:
         res = pd.concat(stack)
@@ -195,7 +213,7 @@ def _results_from_kind_problem(basetables, external_info):
         'query_id' : b.kind_problem.rule_id,
         'query_kind' : "problem",
         'query_precision' : [_populate_from_rule_table("precision", b, i) for i in range(len(b.kind_problem))],
-        'query_severity' : [_populate_from_rule_table("severity", b, i) for i in range(len(b.kind_problem))],
+        'query_severity' : [_populate_from_rule_table("problem.severity", b, i) for i in range(len(b.kind_problem))],
         'result_type' : "kind_problem",
 
         'codeFlow_id' : 0, # link to codeflows (kind_pathproblem only, NULL here)
@@ -240,6 +258,7 @@ def _results_from_kind_pathproblem(basetables, external_info):
 
     # The `result` table has no entry to distinguish these, so we use a simplified
     # version of `kind_pathproblem`.
+
     reduced_kind_pathp = b.kind_pathproblem.drop(
         columns=[
             'relatedLocation_array_index',
@@ -284,7 +303,7 @@ def _results_from_kind_pathproblem(basetables, external_info):
         'query_id' : cfid0ppt0.rule_id.values[0],
         'query_kind' : "path-problem",
         'query_precision' : _populate_from_rule_table_code_flow("precision", b, cfid0ppt0),
-        'query_severity' : _populate_from_rule_table_code_flow("severity", b, cfid0ppt0),
+        'query_severity' : _populate_from_rule_table_code_flow("problem.severity", b, cfid0ppt0),
         #
         'result_type' : "kind_pathproblem",
         'codeFlow_id' : cfid0,
diff --git a/sarif_cli/signature.py b/sarif_cli/signature.py
index 582dbee..d3e76e4 100644
--- a/sarif_cli/signature.py
+++ b/sarif_cli/signature.py
@@ -53,6 +53,8 @@ def _signature_dict(args, elem, context: Context):
     if args.typedef_signatures:
         # Give every unique struct a name and use a reference to it as value.
         if signature not in context.sig_to_typedef:
+            # NOTE: typedef hashes with a leading 0 can cause problems in the later table
+            # joins; a remap like .replace("0", "1") on the name was considered here.
             context.sig_to_typedef[signature] = "Struct%04d" % shorthash(signature)
         typedef = context.sig_to_typedef[signature]
         return typedef
@@ -79,6 +81,8 @@ def _signature_list(args, elem, context):
     if args.typedef_signatures:
         # Give every unique array a name and use a reference to it as value.
         if signature not in context.sig_to_typedef:
+            # NOTE: typedef hashes with a leading 0 can cause problems in the later table
+            # joins; a remap like .replace("0", "1") on the name was considered here.
             context.sig_to_typedef[signature] = "Array%04d" % shorthash(signature)
         typedef = context.sig_to_typedef[signature]
         return typedef
@@ -225,7 +229,7 @@ dummy_newlineSequences = ['\r\n', '\n', '\u2028', '\u2029']
 dummy_relatedLocations_entry = [
     {'id': -1,
      'physicalLocation': {'artifactLocation': {'uri': 'scli-dyys dummy value',
-                                               'uriBaseId': 'scli-dyys dummy value',
+                                               'uriBaseId': 'scli-dyys uriBaseId',
                                                'index': -1},
                           'region': {'startLine': -1,
                                      'startColumn': -1,
diff --git a/sarif_cli/signature_single.py b/sarif_cli/signature_single.py
index 050cab7..4cc20a5 100644
--- a/sarif_cli/signature_single.py
+++ b/sarif_cli/signature_single.py
@@ -12,9 +12,9 @@ is marked below
 #
 # The starting node the leftmost node in ../notes/typegraph.pdf
 #
-start_node_2022_02_01 = 'Struct6787'
+start_node_LGTM = 'Struct6787'
 
-struct_graph_2022_02_01 = (
+struct_graph_LGTM = (
     [ ('String', 'string'),
       ('Int', 'int'),
       ('Bool', 'bool'),
@@ -121,5 +121,4 @@ struct_graph_LGTM = (
             ('$schema', 'String'),
             ('runs', 'Array0177'),
             ('version', 'String')))]
-)
-
+)
\ No newline at end of file
diff --git a/sarif_cli/signature_single_CLI.py b/sarif_cli/signature_single_CLI.py
new file mode 100644
index 0000000..fd8dfa5
--- /dev/null
+++ b/sarif_cli/signature_single_CLI.py
@@ -0,0 +1,161 @@
+""" The signature for a single sarif file
+
+Produced by
+
+    sarif-to-dot -u -t -f 2021-12-09/results.sarif
+
+with some arrays manually sorted so that the signature with more fields comes first. The case
+    ('Array1768', ('array', (2, 'Struct9699'), (1, 'Struct4055'))), # MANUALLY SORTED
+is marked below
+"""
+
+#
+# The starting node is the leftmost node in ../notes/typegraph.pdf
+#
+start_node_CLI = 'Struct5521'
+
+# generated with CLI 2.9.4
+struct_graph_CLI = (
+    [ ('String', 'string'),
+      ('Int', 'int'),
+      ('Bool', 'bool'),
+      ( 'Struct2685',
+        ( 'struct',
+          ('index', 'Int'),
+          ('uri', 'String'),
+          ('uriBaseId', 'String'))),
+      ('Struct5277', ('struct', ('location', 'Struct2685'))),
+      ('Struct3497', ('struct', ('index', 'Int'), ('uri', 'String'))),
+      ('Struct9567', ('struct', ('location', 'Struct3497'))),
+      ('Array6920', ('array', (0, 'Struct5277'), (1, 'Struct9567'))),
+      ('Struct1509', ('struct', ('semmle.formatSpecifier', 'String'))),
+      ('Struct2774', ('struct', ('text', 'String'))),
+      ( 'Struct6299',
+        ( 'struct',
+          ('endColumn', 'Int'),
+          ('endLine', 'Int'),
+          ('startColumn', 'Int'),
+          ('startLine', 'Int'))),
+      ( 'Struct4963',
+        ( 'struct',
+          ('artifactLocation', 'Struct2685'),
+          ('region', 'Struct6299'))),
+      ( 'Struct2683',
+        ( 'struct',
+          ('id', 'Int'),
+          ('message', 'Struct2774'),
+          ('physicalLocation', 'Struct4963'))),
+      ('Array0350', ('array', (0, 'Struct2683'))),
+      ( 'Struct4199',
+        ( 'struct',
+          ('primaryLocationLineHash', 'String'),
+          ('primaryLocationStartColumnFingerprint', 'String'))),
+      ('Struct3942', ('struct', ('id', 'String'), ('index', 'Int'))),
+      ( 'Struct4055',
+        ( 'struct',
+          ('locations', 'Array0350'),
+          ('message', 'Struct2774'),
+          ('partialFingerprints', 'Struct4199'),
+          ('relatedLocations', 'Array0350'),
+          ('rule', 'Struct3942'),
+          ('ruleId', 'String'),
+          ('ruleIndex', 'Int'))),
+      ( 'Struct7125',
+        ( 'struct',
+          ('artifactLocation', 'Struct3497'),
+          ('region', 'Struct6299'))),
+      ( 'Struct6772',
+        ( 'struct',
+          ('id', 'Int'),
+          ('message', 'Struct2774'),
+          ('physicalLocation', 'Struct7125'))),
+      ('Array8753', ('array', (0, 'Struct6772'))),
+      ( 'Struct0102',
+        ( 'struct',
+          ('locations', 'Array0350'),
+          ('message', 'Struct2774'),
+          ('partialFingerprints', 'Struct4199'),
+          ('relatedLocations', 'Array8753'),
+          ('rule', 'Struct3942'),
+          ('ruleId', 'String'),
+          ('ruleIndex', 'Int'))),
+      ('Struct0987', ('struct', ('location', 'Struct2683'))),
+      ('Array1075', ('array', (0, 'Struct0987'))),
+      ('Struct4194', ('struct', ('locations', 'Array1075'))),
+      ('Array1597', ('array', (0, 'Struct4194'))),
+      ('Struct7122', ('struct', ('threadFlows', 'Array1597'))),
+      ('Array9799', ('array', (0, 'Struct7122'))),
+      ( 'Struct9699',
+        ( 'struct',
+          ('codeFlows', 'Array9799'),
+          ('locations', 'Array0350'),
+          ('message', 'Struct2774'),
+          ('partialFingerprints', 'Struct4199'),
+          ('relatedLocations', 'Array0350'),
+          ('rule', 'Struct3942'),
+          ('ruleId', 'String'),
+          ('ruleIndex', 'Int'))),
+      ( 'Array1768',
+        # NOTE: (0, 'Struct0102') is deliberately omitted here; results of that
+        # shape will never yield column info.
+        ('array', (2, 'Struct9699'), (1, 'Struct4055'))), # MANUALLY SORTED
+      ('Struct8581', ('struct', ('enabled', 'Bool'), ('level', 'String'))),
+      ('Array7069', ('array', (0, 'String'))),
+      ( 'Struct6853',
+        ( 'struct',
+          ('description', 'String'),
+          ('id', 'String'),
+          ('kind', 'String'),
+          ('name', 'String'),
+          ('precision', 'String'),
+          ('problem.severity', 'String'),
+          ('security-severity', 'String'),
+          ('severity', 'String'),
+          ('sub-severity', 'String'),
+          ('tags', 'Array7069'))),
+      ( 'Struct7100',
+        ( 'struct',
+          ('defaultConfiguration', 'Struct8581'),
+          ('fullDescription', 'Struct2774'),
+          ('id', 'String'),
+          ('name', 'String'),
+          ('properties', 'Struct6853'),
+          ('shortDescription', 'Struct2774'))),
+      ('Array0147', ('array', (0, 'Struct7100'))),
+      ( 'Struct7828',
+        ( 'struct',
+          ('name', 'String'),
+          ('organization', 'String'),
+          ('rules', 'Array0147'),
+          ('semanticVersion', 'String'))),
+      ( 'Struct9027',
+        ('struct', ('description', 'Struct2774'), ('uri', 'String'))),
+      ('Array4813', ('array', (0, 'Struct9027'))),
+      ( 'Struct6152',
+        ( 'struct',
+          ('locations', 'Array4813'),
+          ('name', 'String'),
+          ('semanticVersion', 'String'))),
+      ('Struct7826', ('struct', ('locations', 'Array4813'), ('name', 'String'))),
+      ('Array9357', ('array', (0, 'Struct6152'), (1, 'Struct7826'))),
+      ( 'Struct0032',
+        ('struct', ('driver', 'Struct7828'), ('extensions', 'Array9357'))),
+      ( 'Struct3081',
+        ('struct', ('repositoryUri', 'String'), ('revisionId', 'String'))),
+      ('Array5511', ('array', (0, 'Struct3081'))),
+      ( 'Struct9786',
+        ( 'struct',
+          ('artifacts', 'Array6920'),
+          ('columnKind', 'String'),
+          ('newlineSequences', 'Array7069'),
+          ('properties', 'Struct1509'),
+          ('results', 'Array1768'),
+          ('tool', 'Struct0032'),
+          ('versionControlProvenance', 'Array5511'))),
+      ('Array1273', ('array', (0, 'Struct9786'))),
+      ( 'Struct5521',
+        ( 'struct',
+          ('$schema', 'String'),
+          ('runs', 'Array1273'),
+          ('version', 'String')))]
+)
diff --git a/sarif_cli/table_joins.py b/sarif_cli/table_joins.py
index 5209f84..520f0c6 100644
--- a/sarif_cli/table_joins.py
+++ b/sarif_cli/table_joins.py
@@ -73,13 +73,12 @@ def joins_for_af_0350_location(tgraph):
     )
     return af_0350_location
 
-def joins_for_sf_2683(tgraph):
+def joins_for_location_info(tgraph):
     """
     Join all the tables used by 2683's right side into one.
""" # Access convenience functions sf = lambda num: tgraph.dataframes['Struct' + str(num)] - af = lambda num: tgraph.dataframes['Array' + str(num)] # sf_2683 = ( # @@ -116,6 +115,8 @@ def joins_for_problem(tgraph, af_0350_location): # # Form the message dataframe (@kind problem) via joins # + import IPython + IPython.embed(header="spot 1") kind_problem_1 = ( aft(6343) diff --git a/sarif_cli/table_joins_CLI.py b/sarif_cli/table_joins_CLI.py new file mode 100644 index 0000000..71b8c42 --- /dev/null +++ b/sarif_cli/table_joins_CLI.py @@ -0,0 +1,462 @@ +""" Collection of joins for the base tables provided by typegraph.attach_tables() + + The `problem` and `path-problem` entries provide that information; the + `relatedLocations` table provides the details when multiple results are + present for either. `project` is the high-level overview; `artifacts` + provides those for the other tables. +""" +import pandas as pd +import re +from .typegraph import tagged_array_columns, tagged_struct_columns + +class BaseTablesTypes: + codeflows = { + "codeflow_id" : pd.UInt64Dtype(), + "codeflow_index" : pd.Int64Dtype(), + "threadflow_index" : pd.Int64Dtype(), + "location_index" : pd.Int64Dtype(), + "endColumn" : pd.Int64Dtype(), + "endLine" : pd.Int64Dtype(), + "startColumn" : pd.Int64Dtype(), + "startLine" : pd.Int64Dtype(), + "artifact_index" : pd.Int64Dtype(), + "uri" : pd.StringDtype(), + "uriBaseId" : pd.StringDtype(), + "message" : pd.StringDtype(), + } + +def joins_for_af_0350_location(tgraph): + """ + Join all the tables used by 0350's right side into one. + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id)) + aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id)) + + af_0350_location = ( + aft('0350') + # + .merge(sft(2683), how="left", left_on='t0350_id_or_value_at_index', right_on='t2683_struct_id', + validate="1:m") + .drop(columns=['t0350_id_or_value_at_index', 't2683_struct_id', 't0350_type_at_index']) + # + .merge(sft(4963), how="left", left_on='t2683_physicalLocation', right_on='t4963_struct_id', + validate="1:m") + .drop(columns=['t2683_physicalLocation', 't4963_struct_id']) + # + .merge(sft(6299), how="left", left_on='t4963_region', right_on='t6299_struct_id', + validate="1:m") + .drop(columns=['t4963_region', 't6299_struct_id']) + # + .merge(sft(2685), how="left", left_on='t4963_artifactLocation', right_on='t2685_struct_id', + validate="1:m") + .drop(columns=['t4963_artifactLocation', 't2685_struct_id']) + # + .merge(sft(2774), how="left", left_on='t2683_message', right_on='t2774_struct_id', + validate="1:m") + .drop(columns=['t2683_message', 't2774_struct_id']) + # + .rename(columns={'t0350_array_id' : 'm0350_location_array_id', + 't0350_value_index' : 'm0350_location_array_index', + 't2683_id' : 'm0350_location_id', + 't6299_endColumn' : 'm0350_location_endColumn', + 't6299_endLine' : 'm0350_location_endLine', + 't6299_startColumn' : 'm0350_location_startColumn', + 't6299_startLine' : 'm0350_location_startLine', + 't2685_index' : 'm0350_location_index', + 't2685_uri' : 'm0350_location_uri', + 't2685_uriBaseId' : 'm0350_location_uriBaseId', + 't2774_text' : 'm0350_location_message', + }) + ) + return af_0350_location + +def joins_for_location_info(tgraph): + """ + Join all the tables used by 2683's right side into one. 
+ """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + # + sf_2683 = ( + # + sf(2683) + .rename(columns={"struct_id": "struct_id_2683", "id": "id_2683"}) + # + .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'physicalLocation']) + # + .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'region']) + # + .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'artifactLocation']) + .rename(columns={"index": "location_index_2685"}) + # + .merge(sf(2774), how="left", left_on='message', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'message']) + .rename(columns={"text": "message_text_2683"}) + # + ) + + return sf_2683 + +def joins_for_problem(tgraph, af_0350_location): + """ + Return table providing the `problem` information. + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id)) + aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id)) + # + # Form the message dataframe (@kind problem) via joins + # + + kind_problem_1 = ( + aft(1768) + .merge(sft(4055), how="inner", + left_on='t1768_id_or_value_at_index', right_on='t4055_struct_id', + validate="1:m") + .drop(columns=['t1768_type_at_index', 't1768_id_or_value_at_index', + 't4055_struct_id']) + # + .merge(af_0350_location, how="left", left_on='t4055_locations', + right_on='m0350_location_array_id', validate="1:m") + .drop(columns=['t4055_locations', 'm0350_location_array_id']) + # + .merge(af_0350_location.rename(columns=lambda x: re.sub('m0350_location', + 'm0350_relatedLocation', + x)), + how="left", left_on='t4055_relatedLocations', + right_on='m0350_relatedLocation_array_id', validate="1:m") + .drop(columns=['t4055_relatedLocations', 'm0350_relatedLocation_array_id']) + # + .merge(sft(2774), how="left", left_on='t4055_message', right_on='t2774_struct_id') + .drop(columns=['t4055_message', 't2774_struct_id']) + .rename(columns={"t2774_text": "t4055_message_text"}) + # + .merge(sft(4199), how="left", left_on='t4055_partialFingerprints', + right_on='t4199_struct_id') + .drop(columns=['t4055_partialFingerprints', 't4199_struct_id']) + # + .merge(sft(3942), how="left", left_on='t4055_rule', + right_on='t3942_struct_id') + .drop(columns=['t4055_rule', 't3942_struct_id']) + ) + + kind_problem_2 = ( + kind_problem_1 + .rename({ + 't1768_array_id' : 'results_array_id', + 't1768_value_index' : 'results_array_index', + 't4055_ruleId' : 'ruleId', + 't4055_ruleIndex' : 'ruleIndex', + 't4055_message_text' : 'message_text', + 't3942_id' : 'rule_id', + 't3942_index' : 'rule_index', + }, axis='columns') + # Strip type prefix for the rest + .rename(columns = lambda x: re.sub('m0350_|t4199_', '', x)) + ) + + return kind_problem_2 + + +def joins_for_codeflows(tgraph, sf_2683): + """ + Return the table providing the `codeFlows` for a `path-problem table. 
+ """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + # + codeflows = ( + af(9799).rename(columns={"array_id": "t9799_array_id", "value_index": "t9799_idx"}) + # + .merge(sf(7122), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m") + .drop(columns=['id_or_value_at_index', 'struct_id', 'type_at_index']) + # + .merge(af(1597).rename(columns={"array_id": "t1597_array_id", "value_index": "t1597_idx"}), + how="left", left_on='threadFlows', right_on='t1597_array_id', validate="1:m") + .drop(columns=['threadFlows', 't1597_array_id', 'type_at_index']) + # + .merge(sf(4194), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m") + .drop(columns=['id_or_value_at_index', 'struct_id']) + # + .merge(af(1075).rename(columns={"array_id": "t1075_array_id", "value_index": "t1075_idx"}), + how="left", left_on='locations', right_on='t1075_array_id', validate="1:m") + .drop(columns=['locations', 't1075_array_id', 'type_at_index']) + .rename(columns={"t1075_idx": "t1075_locations_idx"}) + # + .merge(sf('0987'), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m") + .drop(columns=['id_or_value_at_index', 'struct_id']) + # + .merge(sf_2683, how="left", left_on='location', right_on='struct_id_2683', validate="1:m") + .drop(columns=['location', 'struct_id_2683']) + ) + codeflows_1 = ( + codeflows + .drop(columns=['id_2683']) + .rename({ + 't9799_array_id': 'codeflow_id', + 't9799_idx': 'codeflow_index', + 't1597_idx': 'threadflow_index', + 't1075_locations_idx': 'location_index', + 'location_index_2685': 'artifact_index', + 'message_text_2683': 'message', + }, axis='columns') + ) + codeflows_2 = codeflows_1.astype(BaseTablesTypes.codeflows).reset_index(drop=True) + return codeflows_2 + +def joins_for_path_problem(tgraph, af_0350_location): + """ + Return table providing the `path-problem` information. 
+ """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id)) + aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id)) + + kind_pathproblem_1 = ( + aft(1768) + .merge(sft(9699), how="inner", left_on='t1768_id_or_value_at_index', right_on='t9699_struct_id', + validate="1:m") + .drop(columns=['t1768_id_or_value_at_index', 't9699_struct_id', 't1768_type_at_index']) + # + .merge(af_0350_location, how="left", left_on='t9699_locations', + right_on='m0350_location_array_id', validate="1:m") + .drop(columns=['t9699_locations', 'm0350_location_array_id']) + # + .merge(af_0350_location.rename(columns=lambda x: re.sub('m0350_location', + 'm0350_relatedLocation', + x)), + how="left", left_on='t9699_relatedLocations', + right_on='m0350_relatedLocation_array_id', validate="1:m") + .drop(columns=['t9699_relatedLocations', 'm0350_relatedLocation_array_id']) + # + .merge(sft(2774), how="left", left_on='t9699_message', right_on='t2774_struct_id') + .drop(columns=['t9699_message', 't2774_struct_id']) + .rename(columns={"t2774_text": "t9699_message_text"}) + # + .merge(sft(4199), how="left", left_on='t9699_partialFingerprints', + right_on='t4199_struct_id') + .drop(columns=['t9699_partialFingerprints', 't4199_struct_id']) + # + .merge(sft(3942), how="left", left_on='t9699_rule', + right_on='t3942_struct_id') + .drop(columns=['t9699_rule', 't3942_struct_id']) + ) + strip_colums = lambda x: re.sub('t9699_|m0350_|t4199_', '', x) + kind_pathproblem_2 = (kind_pathproblem_1 + .rename({ + 't1768_array_id' : 'results_array_id', + 't1768_value_index' : 'results_array_index', + 't9699_codeFlows' : 'codeFlows_id', + 't9699_ruleId' : 'ruleId', + 't9699_ruleIndex' : 'ruleIndex', + 't9699_message_text' : 'message_text', + 't3942_id' : 'rule_id', + 't3942_index' : 'rule_index', + }, axis='columns') + # Strip type prefix for the rest + .rename(columns = strip_colums)) + + return kind_pathproblem_2 + +def joins_for_relatedLocations(tgraph, sf_2683): + """ + Return table providing the `relatedLocations` and `locations` information. + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + # + # Form the relatedLocation dataframe via joins, starting from the union of + # relatedLocations from `kind problem` (sf(4055)) and `kind path-problem` + # (sf(9699)). 
+ # + related_locations_1 = ( + pd.concat([sf(4055)[['relatedLocations', 'struct_id']], sf(9699)[['relatedLocations', 'struct_id']]]) + .merge(af('0350'), how="left", left_on='relatedLocations', right_on='array_id', validate="1:m") + .drop(columns=['relatedLocations', 'array_id', 'value_index', 'type_at_index']) + # + .merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id', + suffixes=("_4055_9699", "_2683"), validate="1:m") + .drop(columns=['struct_id_2683', 'id_or_value_at_index']) + # + .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'physicalLocation']) + # + .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'region']) + # + .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'artifactLocation']) + # + .merge(sf(2774), how="left", left_on='message', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'message']) + ) + + # Keep columns of interest + related_locations_2 = (related_locations_1[['struct_id_4055_9699', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text']] + .rename({'text': 'message', 'struct_id_4055_9699': 'struct_id'}, axis='columns')) + + # Remove dummy locations previously injected by signature.fillsig + related_locations_3 = related_locations_2[related_locations_2.uri != 'scli-dyys dummy value'] + + return related_locations_3 + +def joins_for_project_single(tgraph): + """ + Return table providing the `project` information for sarif-extract-scans + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + # + project_df_temp1 = ( + sf(5521) + .rename(columns={"version": "version_5521", "struct_id": "struct_id_5521"}) + # + .merge(af('1273'), how="left", left_on='runs', right_on='array_id', + validate="1:m") + .drop(columns=['runs', 'array_id', 'type_at_index']) + .rename(columns={"value_index": "value_index_1273"}) + # + .merge(sf(9786), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m") + .drop(columns=['id_or_value_at_index', 'struct_id'])) + # + #newlines there or not - handle + if 'newlineSequences' in project_df_temp1: + project_df_temp2 = project_df_temp1.drop(columns=['newlineSequences']) + + project_df_temp2 = ( + project_df_temp1 + # + .merge(sf(1509), how="left", left_on='properties', right_on='struct_id', validate="1:m") + .drop(columns=['properties', 'struct_id']) + # + # tool - driver - rules - defaultConfiguration - ( properties - tags ) + # + .merge(sf('0032'), how="left", left_on='tool', right_on='struct_id', validate="1:m") + .drop(columns=['tool', 'struct_id']) + # + .merge(sf(7828), how="left", left_on='driver', right_on='struct_id', validate="1:m") + .drop(columns=['driver', 'struct_id']) + .rename(columns={"semanticVersion": "driver_version_7828", "name": "driver_name_7828"}) + # + #assumet to be there + .merge(af(5511), how="left", left_on='versionControlProvenance', right_on='array_id') + .drop(columns=['versionControlProvenance', 'array_id', 'type_at_index']) + .rename(columns={"value_index": "versionControl_value_index_5511"}) + # + .merge(sf(3081), how="left", left_on='id_or_value_at_index', right_on='struct_id') + .drop(columns=['id_or_value_at_index', 'struct_id']) + ) + # + + # Keep columns of interest + project_df_1 = ( + project_df_temp2 + 
.drop(columns=['struct_id_5521', 'versionControl_value_index_5511']) + .rename({ + 'version_5521': 'sarif_version', + 'value_index_1273': 'run_index', + 'driver_name_7828': 'driver_name', + 'driver_version_7828': 'driver_version', + }, axis='columns') + ) + return project_df_1 + +def joins_for_rules(tgraph): + """ + Return table providing the `rules` information. + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id)) + af = lambda num: tgraph.dataframes['Array' + str(num)] + aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id)) + # + rules_df = ( + aft('0147') + # + .drop(columns=['t0147_type_at_index']) + # + .merge(sft(7100), how="left", left_on='t0147_id_or_value_at_index', + right_on='t7100_struct_id', + validate="1:m") + .drop(columns=['t0147_id_or_value_at_index', 't7100_struct_id']) + # + .merge(sft(8581), how="left", left_on='t7100_defaultConfiguration', + right_on='t8581_struct_id', validate="1:m") + .drop(columns=['t7100_defaultConfiguration', 't8581_struct_id']) + # + .merge(sft(2774), how="left", left_on='t7100_fullDescription', + right_on='t2774_struct_id', validate="1:m") + .drop(columns=['t7100_fullDescription', 't2774_struct_id']) + .rename(columns={'t2774_text': "t7100_t2774_fullDescription"}) + # + .merge(sft(2774), how="left", left_on='t7100_shortDescription', + right_on='t2774_struct_id', validate="1:m") + .drop(columns=['t7100_shortDescription', 't2774_struct_id']) + .rename(columns={"t2774_text": 't7100_t2774_shortDescription'}) + # + .merge(sft(6853), how="left", left_on='t7100_properties', + right_on='t6853_struct_id', validate="1:m") + .drop(columns=['t7100_properties', 't6853_struct_id', 't6853_id']) + # + .merge(aft(7069), how="left", left_on='t6853_tags', + right_on='t7069_array_id', validate="1:m") + .drop(columns=['t6853_tags', 't7069_array_id', 't7069_type_at_index']) + ) + rules_2 = ( + rules_df + .rename({ + 't0147_array_id' : 'rules_array_id', + 't0147_value_index' : 'rules_array_index', + 't7069_value_index' : 'tag_index', + 't7069_id_or_value_at_index' : 'tag_text', + }, axis='columns') + # Strip type prefix for the rest + .rename(columns = lambda x: re.sub('t7100_t2774_|t7100_|t8581_|t6853_', '', x)) + ) + return rules_2 + +def joins_for_artifacts(tgraph): + """ + Return table providing the `artifacts` information. 
+ """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + # + artifacts_df = ( + af(6920) + # + .merge(sf(5277), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m") + .drop(columns=['id_or_value_at_index', 'struct_id', 'type_at_index']) + .rename(columns={"value_index": "artifact_index_6920"}) + # + .merge(sf(2685), how="left", left_on='location', right_on='struct_id', validate="1:m") + .drop(columns=['location', 'struct_id']) + ) + # Keep columns of interest and rename + df_1 = ( + artifacts_df + .rename({ + 'array_id': 'artifacts_id', + 'artifact_index_6920': 'artifacts_array_index', + }, axis='columns') + ) + + if (df_1['artifacts_array_index'] == df_1['index']).all(): + df_1 = df_1.drop(columns=['artifacts_array_index']) + + return df_1 diff --git a/sarif_cli/typegraph.py b/sarif_cli/typegraph.py index 5761943..3769fc6 100644 --- a/sarif_cli/typegraph.py +++ b/sarif_cli/typegraph.py @@ -179,13 +179,21 @@ def _destructure_dict(typegraph: Typegraph, node, tree): if specific_missing not in status_writer.input_sarif_missing["extra_info"]: status_writer.input_sarif_missing["extra_info"] += specific_missing status_writer.warning_set["input_sarif_missing"]+=1 - raise MissingFieldException( - f"(Sub)tree is missing fields required by typedef.\n" - f"Expected {type_fields}, found {tree_fields}.\n" - f"Missing {set(type_fields) - set(tree_fields)}\n" - f"Note: these fields are post-signature fill and may be more extensive than the orginal. \n" - f"Check input file for the original signature." - ) + + #special case of no longer trying other signatures + #else exception here triggers a retry - mainly needed for Struct9699 or Struct4055 + difference = set(type_fields) - set(tree_fields) + if "uriBaseId" in difference: + tree["uriBaseId"] = "default" + _destructure_dict_1(typegraph, node, tree) + else: + raise MissingFieldException( + f"(Sub)tree is missing fields required by typedef.\n" + f"Expected {type_fields}, found {tree_fields}.\n" + f"Missing {set(type_fields) - set(tree_fields)}\n" + f"Note: these fields are post-signature fill and may be more extensive than the orginal. \n" + f"Check input file for the original signature." + ) else: status_writer.unknown_sarif_parsing_shape["extra_info"] = "type fields {} do not match tree fields {}.".format(type_fields, tree_fields) From b45d868f89d618de2621647f28a26f2b57695fab Mon Sep 17 00:00:00 2001 From: Kristen Newbury Date: Tue, 13 Dec 2022 18:32:34 -0500 Subject: [PATCH 2/4] Update README for CLI usage instructions --- README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md b/README.md index 49d9706..db6f7f4 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,23 @@ The tool was implemented using Python 3.9. +# Sarif format information + + The tool operates on sarif generated by LGTM 1.27.0 (by default) or by the CodeQL CLI (enabled with the -f flag given a value of `CLI`). + + The values that the -f flag accepts are: `LGTM` and `CLI`. + + The CLI versions used against development of the CLI support were: 2.6.3, 2.9.4, and 2.11.4. 
+
+  The CLI sarif **MUST** contain one additional property, `versionControlProvenance`, which needs to look like:
+  ```
+  "versionControlProvenance": [
+    {
+      "repositoryUri": "https://github.com/testorg/testrepo.git",
+      "revisionId": "testsha"
+    }
+  ```
+
 # Test Setup
 
 This repository includes some test data (in `data`) and uses =git lfs= for storing those test files; installation steps are at [[https://git-lfs.github.com][git-lfs]]; on a mac with homebrew, install it via

From efc87d4f08d5709d73f0b433ad32505e100bcaca Mon Sep 17 00:00:00 2001
From: Kristen Newbury
Date: Tue, 13 Dec 2022 18:42:45 -0500
Subject: [PATCH 3/4] Update README: add missing closing bracket

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index db6f7f4..6c7850e 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,7 @@
       "repositoryUri": "https://github.com/testorg/testrepo.git",
       "revisionId": "testsha"
     }
+  ]
   ```
 
 # Test Setup

From bbeba14dec73fa5c1f89d78bf1288df4a7b38bbd Mon Sep 17 00:00:00 2001
From: Kristen Newbury
Date: Tue, 13 Dec 2022 20:13:13 -0500
Subject: [PATCH 4/4] Bugfix: remove 'severity' field mistakenly merged into CLI signature

---
 sarif_cli/signature_single_CLI.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sarif_cli/signature_single_CLI.py b/sarif_cli/signature_single_CLI.py
index fd8dfa5..d773cf2 100644
--- a/sarif_cli/signature_single_CLI.py
+++ b/sarif_cli/signature_single_CLI.py
@@ -111,7 +111,6 @@ struct_graph_CLI = (
             ('precision', 'String'),
             ('problem.severity', 'String'),
             ('security-severity', 'String'),
-            ('severity', 'String'),
             ('sub-severity', 'String'),
             ('tags', 'Array7069'))),
     ( 'Struct7100',
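
Since these patches make the `versionControlProvenance` property a hard requirement for CLI-generated sarif, a quick pre-flight check can save a failed extraction run. The following is a minimal sketch, not part of the patch series; it assumes only the sarif layout shown in the README example above (a top-level `runs` array whose entries carry `versionControlProvenance` with a `repositoryUri`):

```python
import json
import sys

# Pre-flight check for `-f CLI` input: every run in the sarif file should carry
# the versionControlProvenance property, with at least a repositoryUri entry.
with open(sys.argv[1]) as f:
    sarif = json.load(f)

for i, run in enumerate(sarif.get("runs", [])):
    vcp = run.get("versionControlProvenance", [])
    if not vcp or "repositoryUri" not in vcp[0]:
        sys.exit(f"run {i}: versionControlProvenance missing or incomplete")

print("ok: versionControlProvenance present in all runs")
```

Files that pass the check can then be listed in a file and handed to the runner, e.g. `sarif-extract-scans-runner -f CLI -o out sarif-files.txt`, using the flag added in patch 1.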