From 04a5aae14de19f21ea0f097356f9cbfa2b4ef651 Mon Sep 17 00:00:00 2001
From: Kristen Newbury
Date: Thu, 1 Dec 2022 11:37:56 -0500
Subject: [PATCH 1/8] Add CLI support enabled by -f flag with CLI value

Tested on sarif from CodeQL CLIs: 2.6.3, 2.9.4, 2.11.4.
Note: the CLI sarif MUST contain the versionControlProvenance property.
---
 bin/sarif-extract-multi           |   2 +-
 bin/sarif-extract-scans           |  81 ++++--
 bin/sarif-extract-scans-runner    |  16 +-
 bin/sarif-extract-tables          |   6 +-
 sarif_cli/scan_tables.py          |  63 ++--
 sarif_cli/signature.py            |   2 +-
 sarif_cli/signature_single.py     |   7 +-
 sarif_cli/signature_single_CLI.py | 161 +++++++++++
 sarif_cli/table_joins.py          |   3 +-
 sarif_cli/table_joins_CLI.py      | 462 ++++++++++++++++++++++++++++++
 sarif_cli/typegraph.py            |  22 +-
 11 files changed, 757 insertions(+), 68 deletions(-)
 create mode 100644 sarif_cli/signature_single_CLI.py
 create mode 100644 sarif_cli/table_joins_CLI.py

diff --git a/bin/sarif-extract-multi b/bin/sarif-extract-multi
index 66f40ac..c5f5655 100755
--- a/bin/sarif-extract-multi
+++ b/bin/sarif-extract-multi
@@ -81,7 +81,7 @@ bt = BaseTables()
 #
 # Add dataframes
 #
-sf_2683 = tj.joins_for_sf_2683(tgraph)
+sf_2683 = tj.joins_for_location_info(tgraph)
 af_0350_location = tj.joins_for_af_0350_location(tgraph)
 bt.artifacts = tj.joins_for_artifacts(tgraph)
 bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683)
diff --git a/bin/sarif-extract-scans b/bin/sarif-extract-scans
index d676378..d891f71 100755
--- a/bin/sarif-extract-scans
+++ b/bin/sarif-extract-scans
@@ -2,7 +2,7 @@
 """ Extract scan data from multiple sarif files in table form.
 """
 from dataclasses import dataclass
-from sarif_cli import signature, signature_single
+from sarif_cli import signature, signature_single, signature_single_CLI
 from sarif_cli import typegraph
 from sarif_cli import snowflake_id
 from sarif_cli import status_writer
@@ -14,6 +14,7 @@ import logging
 import pandas as pd
 import pathlib
 import sarif_cli.table_joins as tj
+import sarif_cli.table_joins_CLI as tj_CLI
 import sarif_cli.scan_tables as st
 import sys
 
@@ -32,8 +33,18 @@ parser.add_argument('outdir', metavar='output-dir', type=str, help='output direc
 parser.add_argument('csvout', metavar='csv-outfile', type=str, help='processing status csv output file name to use')
 parser.add_argument('-r', '--write-raw-tables', action="store_true",
                     help='Write the raw sarif tables to the output directory')
+parser.add_argument('-f','--input-signature', metavar='input-signature', type=str, default="LGTM",
+                    help='Signature of the sarif, i.e. where it was generated, as that may affect the signature. '
+                    'Options: LGTM, CLI. '
+                    'If the currently represented signatures are not sufficient, see signature_single.py for how to support further signatures.'
+                    ' Default: "%(default)s"')
 args = parser.parse_args()
 
+if args.input_signature not in ["LGTM","CLI"]:
+    print("Unsupported sarif signature requested.")
+    print("Use one of [LGTM, CLI].")
+    sys.exit(1)
+
 # Setup csv error writer
 status_writer.setup_csv_writer(args.csvout)
 
@@ -66,11 +77,20 @@ context = signature.Context(
 )
 sarif_struct = signature.fillsig(args, sarif_struct, context)
 
+#
+# Setup which signature to use
+if args.input_signature == "LGTM":
+    signature_to_use = signature_single.struct_graph_LGTM
+    start_node = signature_single.start_node_LGTM
+else:
+    signature_to_use = signature_single_CLI.struct_graph_CLI
+    start_node = signature_single_CLI.start_node_CLI
 #
 # Use reference type graph (signature) to traverse sarif and attach values to tables
 try:
-    tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
-    typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)
+    tgraph = typegraph.Typegraph(signature_to_use)
+    typegraph.destructure(tgraph, start_node, sarif_struct)
 except Exception:
     # will have gathered errors/warnings
     status_writer.csv_write_warnings()
@@ -126,31 +146,29 @@ external_info = ExternalInfo(
 #
 # Add dataframes for base tables
 #
-sf_2683 = tj.joins_for_sf_2683(tgraph)
-af_0350_location = tj.joins_for_af_0350_location(tgraph)
-bt.artifacts = tj.joins_for_artifacts(tgraph)
-bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683)
-bt.kind_pathproblem = tj.joins_for_path_problem(tgraph, af_0350_location)
-bt.kind_problem = tj.joins_for_problem(tgraph, af_0350_location)
-bt.project = tj.joins_for_project_single(tgraph)
-bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, sf_2683)
-bt.rules = tj.joins_for_rules(tgraph)
+# (relies on some specifics of the signature type)
+if args.input_signature != "LGTM":
+    # rebind the table-joins module alias to the CLI variant
+    tj = tj_CLI
+
+try:
+    location_info = tj.joins_for_location_info(tgraph)
+    af_0350_location = tj.joins_for_af_0350_location(tgraph)
+    bt.artifacts = tj.joins_for_artifacts(tgraph)
+    bt.codeflows = tj.joins_for_codeflows(tgraph, location_info)
+    bt.kind_pathproblem = tj.joins_for_path_problem(tgraph, af_0350_location)
+    bt.kind_problem = tj.joins_for_problem(tgraph, af_0350_location)
+    bt.project = tj.joins_for_project_single(tgraph)
+    bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, location_info)
+    bt.rules = tj.joins_for_rules(tgraph)
+except Exception:
+    # possible warnings accumulated
+    status_writer.csv_write_warnings()
+    raise
 
 #
-# Form scan tables
+# Setup rest of the base tables
 #
-# joins for projects has to happen first as it backfills the guess about the project_id
-scantabs.projects = st.joins_for_projects(bt, external_info, scantabs)
-scantabs.results = st.joins_for_results(bt, external_info)
-scantabs.scans = st.joins_for_scans(bt, external_info, scantabs)
-
-
-
-#
-# Replace the remaining internal ids with snowflake ids
-#
-flakegen = snowflake_id.Snowflake(0)
-
 bt.columns_to_reindex = {
     # template from {field.name : [''] for field in dc.fields(bt)}
     'artifacts': ['artifacts_id'],
@@ -167,6 +185,19 @@ scantabs.columns_to_reindex = {
     'results': ['codeFlow_id'],
 }
 
+#
+# Form scan tables
+#
+# joins for projects has to happen first as it backfills the guess about the project_id
+scantabs.projects = st.joins_for_projects(bt, external_info)
+scantabs.results = st.joins_for_results(bt, external_info)
+scantabs.scans = st.joins_for_scans(bt, external_info, scantabs, args.input_signature)
+
+#
+# Replace the remaining internal ids with snowflake ids
+#
+flakegen = snowflake_id.Snowflake(0)
+
 _id_to_flake = {}
 def _get_flake(id):
     flake = _id_to_flake.get(id, -1)
diff --git a/bin/sarif-extract-scans-runner b/bin/sarif-extract-scans-runner
index 5c5a983..a069493 100755
--- a/bin/sarif-extract-scans-runner
+++ b/bin/sarif-extract-scans-runner
@@ -87,7 +87,14 @@ from sarif_cli import hash
 parser = argparse.ArgumentParser(description='Run sarif-extract-scans over a directory hierarchy')
 parser.add_argument('sarif_files', metavar='sarif-files', type=str, help='File containing list of sarif files, use - for stdin')
-parser.add_argument('-o','--outdir', metavar='output-dir', type=str, default="", help='output directory')
+
+parser.add_argument('-f','--input-signature', metavar='input-signature', type=str, default="LGTM",
+                    help='Signature of the sarif, as in, where it was generated it may affect the signature.'
+                    'Options: LGTM, CLI'
+                    'If current represented signatures are not sufficient, view signature_single.py for how to support further signatures.'
+                    ' Default: "%(default)s"')
+
+parser.add_argument('-o','--outdir', metavar='output-dir', type=str, default="", help='Output directory')
 parser.add_argument('-m', '--max-files', metavar='number', type=int, default=100000,
                     help='Maximum number of files to process.'
@@ -126,6 +133,11 @@ if outer_dir != "":
     except FileExistsError: pass
 
+if args.input_signature not in ["LGTM","CLI"]:
+    print("Unsupported sarif signature requested.")
+    print("Use one of [LGTM, CLI].")
+    sys.exit(1)
+
 #
 # Collect sarif file information
 #
@@ -205,7 +217,7 @@ for path in paths:
     scan_log_file = os.path.join(outer_dir+ project, component + ".scanlog")
     csv_outfile = os.path.join(outer_dir+ project, component)
 
-    runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, csv_outfile],
+    runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, csv_outfile, "-f", args.input_signature],
                               capture_output=True, text=True)
     if runstats.returncode == 0:
         print("{:6} {}".format("OK", path))
diff --git a/bin/sarif-extract-tables b/bin/sarif-extract-tables
index 439b335..97820b3 100755
--- a/bin/sarif-extract-tables
+++ b/bin/sarif-extract-tables
@@ -59,8 +59,8 @@
 sarif_struct = signature.fillsig(args, sarif_struct, context)
 
 #
 # Use reference type graph (signature) to traverse sarif and attach values to tables
 #
-tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
-typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)
+tgraph = typegraph.Typegraph(signature_single.struct_graph_LGTM)
+typegraph.destructure(tgraph, signature_single.start_node_LGTM, sarif_struct)
 
 #
 # Form output tables
@@ -84,7 +84,7 @@ bt = BaseTables()
 #
 # Add dataframes
 #
-sf_2683 = tj.joins_for_sf_2683(tgraph)
+sf_2683 = tj.joins_for_location_info(tgraph)
 af_0350_location = tj.joins_for_af_0350_location(tgraph)
 bt.artifacts = tj.joins_for_artifacts(tgraph)
 bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683)
diff --git a/sarif_cli/scan_tables.py b/sarif_cli/scan_tables.py
index 12702e3..716d940 100644
--- a/sarif_cli/scan_tables.py
+++ b/sarif_cli/scan_tables.py
@@ -73,36 +73,49 @@ class ScanTablesTypes:
 #
 # Projects table
 #
-def joins_for_projects(basetables, external_info, scantables):
+def joins_for_projects(basetables, external_info):
    """ Form the 'projects' table for the ScanTables dataclass
    """
    b = basetables; e = external_info
-
-    # For a repository url of the form
-    #   (git|https)://*/org/project.*
-    # use the org/project part as the project_name. 
- # - # TODO knewbury error handling for if the signature is slotted out? - repo_url = b.project.repositoryUri[0] - url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url) - if url_parts: - project_name = f"{url_parts.group(2)}-{url_parts.group(3)}" - project, component = e.sarif_file_name.rstrip().split('/') - # if the runners guess from the filename was bad, replace with real info - # and continue to use that scanspec to pass that around - if project_name != project+"-"+component: - e.project_id = hash.hash_unique(project_name.encode()) + + # if the sarif does not have versionControlProvenance, semmle.sourceLanguage ect + # there is no reliable way to know the project name + # and will still need to use a guess about the project id + if "repositoryUri" in b.project: + repo_url = b.project.repositoryUri[0] + # For a repository url of the form + # (git|https)://*/org/project.* + # use the org/project part as the project_name. + # + url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url) + if url_parts: + project_name = f"{url_parts.group(2)}-{url_parts.group(3)}" + project, component = e.sarif_file_name.rstrip().split('/') + # if the runners guess from the filename was bad, replace with real info + # and continue to use that scanspec to pass that around + if project_name != project+"-"+component: + e.project_id = hash.hash_unique(project_name.encode()) + else: + project_name = pd.NA else: + repo_url = "unknown" project_name = pd.NA + + if 'semmle.sourceLanguage' in b.project: + srcLang = b.project['semmle.sourceLanguage'][0] + allLang = ",".join(list(b.project['semmle.sourceLanguage'])) + else: + srcLang = "unknown" + allLang = "unknown" res = pd.DataFrame(data={ "id" : e.project_id, "project_name" : project_name, "creation_date" : pd.Timestamp(0.0, unit='s'), # TODO: external info "repo_url" : repo_url, - "primary_language" : b.project['semmle.sourceLanguage'][0], # TODO: external info - "languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage'])) + "primary_language" : srcLang, # TODO: external info if CLI sarif + "languages_analyzed" : allLang # TODO: external info if CLI sarif }, index=[0]) # Force all column types to ensure appropriate formatting @@ -112,7 +125,7 @@ def joins_for_projects(basetables, external_info, scantables): # # Scans table # -def joins_for_scans(basetables, external_info, scantables): +def joins_for_scans(basetables, external_info, scantables, sarif_type): """ Form the `scans` table for the ScanTables dataclass """ @@ -122,9 +135,14 @@ def joins_for_scans(basetables, external_info, scantables): driver_version = b.project.driver_version.unique() assert len(driver_version) == 1, \ "More than one driver version found for single sarif file." + # TODO if commit id exists in external info for CLI gen'd sarif, add? + if sarif_type == "LGTM": + commit_id = b.project.revisionId[0] + else: + commit_id = "unknown" res = pd.DataFrame(data={ "id" : e.scan_id, - "commit_id" : b.project.revisionId[0], + "commit_id" : commit_id, "project_id" : e.project_id, # TODO extract real date information from somewhere external "db_create_start" : pd.Timestamp(0.0, unit='s'), @@ -159,7 +177,7 @@ def joins_for_results(basetables, external_info): tables = [_results_from_kind_problem(basetables, external_info), _results_from_kind_pathproblem(basetables, external_info)] stack = [table for table in tables if len(table) > 0] - + # Concatenation fails without at least one table, so avoid that. 
if len(stack) > 0:
         res = pd.concat(stack)
@@ -207,7 +225,6 @@ def _results_from_kind_problem(basetables, external_info):
         'query_precision' : [_populate_from_rule_table("precision", b, i) for i in range(len(b.kind_problem))],
         'query_severity' : [_populate_from_rule_table("problem.severity", b, i) for i in range(len(b.kind_problem))],
         'query_tags' : [_populate_from_rule_table_tag_text(b, i) for i in range(len(b.kind_problem))],
-
         'codeFlow_id' : 0, # link to codeflows (kind_pathproblem only, NULL here)
 
         'message': b.kind_problem.message_text,
@@ -249,6 +266,7 @@ def _results_from_kind_pathproblem(basetables, external_info):
     # The `result` table has no entry to distinguish these, so we use a simplified
     # version of `kind_pathproblem`.
 
+
     reduced_kind_pathp = b.kind_pathproblem.drop(
         columns=[
             'relatedLocation_array_index',
@@ -295,7 +313,6 @@ def _results_from_kind_pathproblem(basetables, external_info):
         'query_precision' : _populate_from_rule_table_code_flow("precision", b, cfid0ppt0),
         'query_severity' : _populate_from_rule_table_code_flow("problem.severity", b, cfid0ppt0),
         'query_tags' : _populate_from_rule_table_code_flow_tag_text(b, cfid0ppt0),
-
         'codeFlow_id' : cfid0,
 
         #
         'message': cfid0ppt0.message_text.values[0],
diff --git a/sarif_cli/signature.py b/sarif_cli/signature.py
index ea99552..42957fb 100644
--- a/sarif_cli/signature.py
+++ b/sarif_cli/signature.py
@@ -225,7 +225,7 @@ dummy_newlineSequences = ['\r\n', '\n', '\u2028', '\u2029']
 dummy_relatedLocations_entry = [
     {'id': -1,
      'physicalLocation': {'artifactLocation': {'uri': 'scli-dyys dummy value',
-                                               'uriBaseId': 'scli-dyys dummy value',
+                                               'uriBaseId': 'scli-dyys uriBaseId',
                                                'index': -1},
                           'region': {'startLine': -1,
                                      'startColumn': -1,
diff --git a/sarif_cli/signature_single.py b/sarif_cli/signature_single.py
index 72b05e0..c810cb5 100644
--- a/sarif_cli/signature_single.py
+++ b/sarif_cli/signature_single.py
@@ -12,9 +12,9 @@ is marked below
 #
 # The starting node the leftmost node in ../notes/typegraph.pdf
 #
-start_node_2022_02_01 = 'Struct6787'
+start_node_LGTM = 'Struct6787'
 
-struct_graph_2022_02_01 = (
+struct_graph_LGTM = (
     [ ('String', 'string'),
       ('Int', 'int'),
       ('Bool', 'bool'),
@@ -121,5 +121,4 @@ struct_graph_2022_02_01 = (
                 ('$schema', 'String'),
                 ('runs', 'Array0177'),
                 ('version', 'String')))]
-)
-
+)
\ No newline at end of file
diff --git a/sarif_cli/signature_single_CLI.py b/sarif_cli/signature_single_CLI.py
new file mode 100644
index 0000000..fd8dfa5
--- /dev/null
+++ b/sarif_cli/signature_single_CLI.py
@@ -0,0 +1,161 @@
+""" The signature for a single sarif file

+Produced by

+    sarif-to-dot -u -t -f 2021-12-09/results.sarif

+with some arrays manually sorted so that the signature with more fields comes first. 
The case + ('Array6343', ('array', (1, 'Struct9699'), (0, 'Struct4055'))), # MANUALLY SORTED +is marked below +""" + +# +# The starting node the leftmost node in ../notes/typegraph.pdf +# +start_node_CLI = 'Struct5521' + +# generated with CLI 2.9.4 +struct_graph_CLI = ( + [ ('String', 'string'), + ('Int', 'int'), + ('Bool', 'bool'), + ( 'Struct2685', + ( 'struct', + ('index', 'Int'), + ('uri', 'String'), + ('uriBaseId', 'String'))), + ('Struct5277', ('struct', ('location', 'Struct2685'))), + ('Struct3497', ('struct', ('index', 'Int'), ('uri', 'String'))), + ('Struct9567', ('struct', ('location', 'Struct3497'))), + ('Array6920', ('array', (0, 'Struct5277'), (1, 'Struct9567'))), + ('Struct1509', ('struct', ('semmle.formatSpecifier', 'String'))), + ('Struct2774', ('struct', ('text', 'String'))), + ( 'Struct6299', + ( 'struct', + ('endColumn', 'Int'), + ('endLine', 'Int'), + ('startColumn', 'Int'), + ('startLine', 'Int'))), + ( 'Struct4963', + ( 'struct', + ('artifactLocation', 'Struct2685'), + ('region', 'Struct6299'))), + ( 'Struct2683', + ( 'struct', + ('id', 'Int'), + ('message', 'Struct2774'), + ('physicalLocation', 'Struct4963'))), + ('Array0350', ('array', (0, 'Struct2683'))), + ( 'Struct4199', + ( 'struct', + ('primaryLocationLineHash', 'String'), + ('primaryLocationStartColumnFingerprint', 'String'))), + ('Struct3942', ('struct', ('id', 'String'), ('index', 'Int'))), + ( 'Struct4055', + ( 'struct', + ('locations', 'Array0350'), + ('message', 'Struct2774'), + ('partialFingerprints', 'Struct4199'), + ('relatedLocations', 'Array0350'), + ('rule', 'Struct3942'), + ('ruleId', 'String'), + ('ruleIndex', 'Int'))), + ( 'Struct7125', + ( 'struct', + ('artifactLocation', 'Struct3497'), + ('region', 'Struct6299'))), + ( 'Struct6772', + ( 'struct', + ('id', 'Int'), + ('message', 'Struct2774'), + ('physicalLocation', 'Struct7125'))), + ('Array8753', ('array', (0, 'Struct6772'))), + ( 'Struct0102', + ( 'struct', + ('locations', 'Array0350'), + ('message', 'Struct2774'), + ('partialFingerprints', 'Struct4199'), + ('relatedLocations', 'Array8753'), + ('rule', 'Struct3942'), + ('ruleId', 'String'), + ('ruleIndex', 'Int'))), + ('Struct0987', ('struct', ('location', 'Struct2683'))), + ('Array1075', ('array', (0, 'Struct0987'))), + ('Struct4194', ('struct', ('locations', 'Array1075'))), + ('Array1597', ('array', (0, 'Struct4194'))), + ('Struct7122', ('struct', ('threadFlows', 'Array1597'))), + ('Array9799', ('array', (0, 'Struct7122'))), + ( 'Struct9699', + ( 'struct', + ('codeFlows', 'Array9799'), + ('locations', 'Array0350'), + ('message', 'Struct2774'), + ('partialFingerprints', 'Struct4199'), + ('relatedLocations', 'Array0350'), + ('rule', 'Struct3942'), + ('ruleId', 'String'), + ('ruleIndex', 'Int'))), + ( 'Array1768', + #('array', (2, 'Struct9699'), (1, 'Struct4055'),(0, 'Struct0102'))), + #('array',(0, 'Struct0102'), (1, 'Struct4055'), (2, 'Struct9699'))), + #omitting (0, 'Struct0102') means we will never find column info + ('array', (2, 'Struct9699'), (1, 'Struct4055'))), + ('Struct8581', ('struct', ('enabled', 'Bool'), ('level', 'String'))), + ('Array7069', ('array', (0, 'String'))), + ( 'Struct6853', + ( 'struct', + ('description', 'String'), + ('id', 'String'), + ('kind', 'String'), + ('name', 'String'), + ('precision', 'String'), + ('problem.severity', 'String'), + ('security-severity', 'String'), + ('severity', 'String'), + ('sub-severity', 'String'), + ('tags', 'Array7069'))), + ( 'Struct7100', + ( 'struct', + ('defaultConfiguration', 'Struct8581'), + ('fullDescription', 'Struct2774'), + 
('id', 'String'), + ('name', 'String'), + ('properties', 'Struct6853'), + ('shortDescription', 'Struct2774'))), + ('Array0147', ('array', (0, 'Struct7100'))), + ( 'Struct7828', + ( 'struct', + ('name', 'String'), + ('organization', 'String'), + ('rules', 'Array0147'), + ('semanticVersion', 'String'))), + ( 'Struct9027', + ('struct', ('description', 'Struct2774'), ('uri', 'String'))), + ('Array4813', ('array', (0, 'Struct9027'))), + ( 'Struct6152', + ( 'struct', + ('locations', 'Array4813'), + ('name', 'String'), + ('semanticVersion', 'String'))), + ('Struct7826', ('struct', ('locations', 'Array4813'), ('name', 'String'))), + ('Array9357', ('array', (0, 'Struct6152'), (1, 'Struct7826'))), + ( 'Struct0032', + ('struct', ('driver', 'Struct7828'), ('extensions', 'Array9357'))), + ( 'Struct3081', + ('struct', ('repositoryUri', 'String'), ('revisionId', 'String'))), + ('Array5511', ('array', (0, 'Struct3081'))), + ( 'Struct9786', + ( 'struct', + ('artifacts', 'Array6920'), + ('columnKind', 'String'), + ('newlineSequences', 'Array7069'), + ('properties', 'Struct1509'), + ('results', 'Array1768'), + ('tool', 'Struct0032'), + ('versionControlProvenance', 'Array5511'))), + ('Array1273', ('array', (0, 'Struct9786'))), + ( 'Struct5521', + ( 'struct', + ('$schema', 'String'), + ('runs', 'Array1273'), + ('version', 'String')))] ) diff --git a/sarif_cli/table_joins.py b/sarif_cli/table_joins.py index 5209f84..41c5faa 100644 --- a/sarif_cli/table_joins.py +++ b/sarif_cli/table_joins.py @@ -73,13 +73,12 @@ def joins_for_af_0350_location(tgraph): ) return af_0350_location -def joins_for_sf_2683(tgraph): +def joins_for_location_info(tgraph): """ Join all the tables used by 2683's right side into one. """ # Access convenience functions sf = lambda num: tgraph.dataframes['Struct' + str(num)] - af = lambda num: tgraph.dataframes['Array' + str(num)] # sf_2683 = ( # diff --git a/sarif_cli/table_joins_CLI.py b/sarif_cli/table_joins_CLI.py new file mode 100644 index 0000000..71b8c42 --- /dev/null +++ b/sarif_cli/table_joins_CLI.py @@ -0,0 +1,462 @@ +""" Collection of joins for the base tables provided by typegraph.attach_tables() + + The `problem` and `path-problem` entries provide that information; the + `relatedLocations` table provides the details when multiple results are + present for either. `project` is the high-level overview; `artifacts` + provides those for the other tables. +""" +import pandas as pd +import re +from .typegraph import tagged_array_columns, tagged_struct_columns + +class BaseTablesTypes: + codeflows = { + "codeflow_id" : pd.UInt64Dtype(), + "codeflow_index" : pd.Int64Dtype(), + "threadflow_index" : pd.Int64Dtype(), + "location_index" : pd.Int64Dtype(), + "endColumn" : pd.Int64Dtype(), + "endLine" : pd.Int64Dtype(), + "startColumn" : pd.Int64Dtype(), + "startLine" : pd.Int64Dtype(), + "artifact_index" : pd.Int64Dtype(), + "uri" : pd.StringDtype(), + "uriBaseId" : pd.StringDtype(), + "message" : pd.StringDtype(), + } + +def joins_for_af_0350_location(tgraph): + """ + Join all the tables used by 0350's right side into one. 
+ """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id)) + aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id)) + + af_0350_location = ( + aft('0350') + # + .merge(sft(2683), how="left", left_on='t0350_id_or_value_at_index', right_on='t2683_struct_id', + validate="1:m") + .drop(columns=['t0350_id_or_value_at_index', 't2683_struct_id', 't0350_type_at_index']) + # + .merge(sft(4963), how="left", left_on='t2683_physicalLocation', right_on='t4963_struct_id', + validate="1:m") + .drop(columns=['t2683_physicalLocation', 't4963_struct_id']) + # + .merge(sft(6299), how="left", left_on='t4963_region', right_on='t6299_struct_id', + validate="1:m") + .drop(columns=['t4963_region', 't6299_struct_id']) + # + .merge(sft(2685), how="left", left_on='t4963_artifactLocation', right_on='t2685_struct_id', + validate="1:m") + .drop(columns=['t4963_artifactLocation', 't2685_struct_id']) + # + .merge(sft(2774), how="left", left_on='t2683_message', right_on='t2774_struct_id', + validate="1:m") + .drop(columns=['t2683_message', 't2774_struct_id']) + # + .rename(columns={'t0350_array_id' : 'm0350_location_array_id', + 't0350_value_index' : 'm0350_location_array_index', + 't2683_id' : 'm0350_location_id', + 't6299_endColumn' : 'm0350_location_endColumn', + 't6299_endLine' : 'm0350_location_endLine', + 't6299_startColumn' : 'm0350_location_startColumn', + 't6299_startLine' : 'm0350_location_startLine', + 't2685_index' : 'm0350_location_index', + 't2685_uri' : 'm0350_location_uri', + 't2685_uriBaseId' : 'm0350_location_uriBaseId', + 't2774_text' : 'm0350_location_message', + }) + ) + return af_0350_location + +def joins_for_location_info(tgraph): + """ + Join all the tables used by 2683's right side into one. + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + # + sf_2683 = ( + # + sf(2683) + .rename(columns={"struct_id": "struct_id_2683", "id": "id_2683"}) + # + .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'physicalLocation']) + # + .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'region']) + # + .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'artifactLocation']) + .rename(columns={"index": "location_index_2685"}) + # + .merge(sf(2774), how="left", left_on='message', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'message']) + .rename(columns={"text": "message_text_2683"}) + # + ) + + return sf_2683 + +def joins_for_problem(tgraph, af_0350_location): + """ + Return table providing the `problem` information. 
+ """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id)) + aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id)) + # + # Form the message dataframe (@kind problem) via joins + # + + kind_problem_1 = ( + aft(1768) + .merge(sft(4055), how="inner", + left_on='t1768_id_or_value_at_index', right_on='t4055_struct_id', + validate="1:m") + .drop(columns=['t1768_type_at_index', 't1768_id_or_value_at_index', + 't4055_struct_id']) + # + .merge(af_0350_location, how="left", left_on='t4055_locations', + right_on='m0350_location_array_id', validate="1:m") + .drop(columns=['t4055_locations', 'm0350_location_array_id']) + # + .merge(af_0350_location.rename(columns=lambda x: re.sub('m0350_location', + 'm0350_relatedLocation', + x)), + how="left", left_on='t4055_relatedLocations', + right_on='m0350_relatedLocation_array_id', validate="1:m") + .drop(columns=['t4055_relatedLocations', 'm0350_relatedLocation_array_id']) + # + .merge(sft(2774), how="left", left_on='t4055_message', right_on='t2774_struct_id') + .drop(columns=['t4055_message', 't2774_struct_id']) + .rename(columns={"t2774_text": "t4055_message_text"}) + # + .merge(sft(4199), how="left", left_on='t4055_partialFingerprints', + right_on='t4199_struct_id') + .drop(columns=['t4055_partialFingerprints', 't4199_struct_id']) + # + .merge(sft(3942), how="left", left_on='t4055_rule', + right_on='t3942_struct_id') + .drop(columns=['t4055_rule', 't3942_struct_id']) + ) + + kind_problem_2 = ( + kind_problem_1 + .rename({ + 't1768_array_id' : 'results_array_id', + 't1768_value_index' : 'results_array_index', + 't4055_ruleId' : 'ruleId', + 't4055_ruleIndex' : 'ruleIndex', + 't4055_message_text' : 'message_text', + 't3942_id' : 'rule_id', + 't3942_index' : 'rule_index', + }, axis='columns') + # Strip type prefix for the rest + .rename(columns = lambda x: re.sub('m0350_|t4199_', '', x)) + ) + + return kind_problem_2 + + +def joins_for_codeflows(tgraph, sf_2683): + """ + Return the table providing the `codeFlows` for a `path-problem table. 
+ """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + # + codeflows = ( + af(9799).rename(columns={"array_id": "t9799_array_id", "value_index": "t9799_idx"}) + # + .merge(sf(7122), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m") + .drop(columns=['id_or_value_at_index', 'struct_id', 'type_at_index']) + # + .merge(af(1597).rename(columns={"array_id": "t1597_array_id", "value_index": "t1597_idx"}), + how="left", left_on='threadFlows', right_on='t1597_array_id', validate="1:m") + .drop(columns=['threadFlows', 't1597_array_id', 'type_at_index']) + # + .merge(sf(4194), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m") + .drop(columns=['id_or_value_at_index', 'struct_id']) + # + .merge(af(1075).rename(columns={"array_id": "t1075_array_id", "value_index": "t1075_idx"}), + how="left", left_on='locations', right_on='t1075_array_id', validate="1:m") + .drop(columns=['locations', 't1075_array_id', 'type_at_index']) + .rename(columns={"t1075_idx": "t1075_locations_idx"}) + # + .merge(sf('0987'), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m") + .drop(columns=['id_or_value_at_index', 'struct_id']) + # + .merge(sf_2683, how="left", left_on='location', right_on='struct_id_2683', validate="1:m") + .drop(columns=['location', 'struct_id_2683']) + ) + codeflows_1 = ( + codeflows + .drop(columns=['id_2683']) + .rename({ + 't9799_array_id': 'codeflow_id', + 't9799_idx': 'codeflow_index', + 't1597_idx': 'threadflow_index', + 't1075_locations_idx': 'location_index', + 'location_index_2685': 'artifact_index', + 'message_text_2683': 'message', + }, axis='columns') + ) + codeflows_2 = codeflows_1.astype(BaseTablesTypes.codeflows).reset_index(drop=True) + return codeflows_2 + +def joins_for_path_problem(tgraph, af_0350_location): + """ + Return table providing the `path-problem` information. 
+ """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id)) + aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id)) + + kind_pathproblem_1 = ( + aft(1768) + .merge(sft(9699), how="inner", left_on='t1768_id_or_value_at_index', right_on='t9699_struct_id', + validate="1:m") + .drop(columns=['t1768_id_or_value_at_index', 't9699_struct_id', 't1768_type_at_index']) + # + .merge(af_0350_location, how="left", left_on='t9699_locations', + right_on='m0350_location_array_id', validate="1:m") + .drop(columns=['t9699_locations', 'm0350_location_array_id']) + # + .merge(af_0350_location.rename(columns=lambda x: re.sub('m0350_location', + 'm0350_relatedLocation', + x)), + how="left", left_on='t9699_relatedLocations', + right_on='m0350_relatedLocation_array_id', validate="1:m") + .drop(columns=['t9699_relatedLocations', 'm0350_relatedLocation_array_id']) + # + .merge(sft(2774), how="left", left_on='t9699_message', right_on='t2774_struct_id') + .drop(columns=['t9699_message', 't2774_struct_id']) + .rename(columns={"t2774_text": "t9699_message_text"}) + # + .merge(sft(4199), how="left", left_on='t9699_partialFingerprints', + right_on='t4199_struct_id') + .drop(columns=['t9699_partialFingerprints', 't4199_struct_id']) + # + .merge(sft(3942), how="left", left_on='t9699_rule', + right_on='t3942_struct_id') + .drop(columns=['t9699_rule', 't3942_struct_id']) + ) + strip_colums = lambda x: re.sub('t9699_|m0350_|t4199_', '', x) + kind_pathproblem_2 = (kind_pathproblem_1 + .rename({ + 't1768_array_id' : 'results_array_id', + 't1768_value_index' : 'results_array_index', + 't9699_codeFlows' : 'codeFlows_id', + 't9699_ruleId' : 'ruleId', + 't9699_ruleIndex' : 'ruleIndex', + 't9699_message_text' : 'message_text', + 't3942_id' : 'rule_id', + 't3942_index' : 'rule_index', + }, axis='columns') + # Strip type prefix for the rest + .rename(columns = strip_colums)) + + return kind_pathproblem_2 + +def joins_for_relatedLocations(tgraph, sf_2683): + """ + Return table providing the `relatedLocations` and `locations` information. + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + # + # Form the relatedLocation dataframe via joins, starting from the union of + # relatedLocations from `kind problem` (sf(4055)) and `kind path-problem` + # (sf(9699)). 
+ # + related_locations_1 = ( + pd.concat([sf(4055)[['relatedLocations', 'struct_id']], sf(9699)[['relatedLocations', 'struct_id']]]) + .merge(af('0350'), how="left", left_on='relatedLocations', right_on='array_id', validate="1:m") + .drop(columns=['relatedLocations', 'array_id', 'value_index', 'type_at_index']) + # + .merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id', + suffixes=("_4055_9699", "_2683"), validate="1:m") + .drop(columns=['struct_id_2683', 'id_or_value_at_index']) + # + .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'physicalLocation']) + # + .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'region']) + # + .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'artifactLocation']) + # + .merge(sf(2774), how="left", left_on='message', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'message']) + ) + + # Keep columns of interest + related_locations_2 = (related_locations_1[['struct_id_4055_9699', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text']] + .rename({'text': 'message', 'struct_id_4055_9699': 'struct_id'}, axis='columns')) + + # Remove dummy locations previously injected by signature.fillsig + related_locations_3 = related_locations_2[related_locations_2.uri != 'scli-dyys dummy value'] + + return related_locations_3 + +def joins_for_project_single(tgraph): + """ + Return table providing the `project` information for sarif-extract-scans + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + # + project_df_temp1 = ( + sf(5521) + .rename(columns={"version": "version_5521", "struct_id": "struct_id_5521"}) + # + .merge(af('1273'), how="left", left_on='runs', right_on='array_id', + validate="1:m") + .drop(columns=['runs', 'array_id', 'type_at_index']) + .rename(columns={"value_index": "value_index_1273"}) + # + .merge(sf(9786), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m") + .drop(columns=['id_or_value_at_index', 'struct_id'])) + # + #newlines there or not - handle + if 'newlineSequences' in project_df_temp1: + project_df_temp2 = project_df_temp1.drop(columns=['newlineSequences']) + + project_df_temp2 = ( + project_df_temp1 + # + .merge(sf(1509), how="left", left_on='properties', right_on='struct_id', validate="1:m") + .drop(columns=['properties', 'struct_id']) + # + # tool - driver - rules - defaultConfiguration - ( properties - tags ) + # + .merge(sf('0032'), how="left", left_on='tool', right_on='struct_id', validate="1:m") + .drop(columns=['tool', 'struct_id']) + # + .merge(sf(7828), how="left", left_on='driver', right_on='struct_id', validate="1:m") + .drop(columns=['driver', 'struct_id']) + .rename(columns={"semanticVersion": "driver_version_7828", "name": "driver_name_7828"}) + # + #assumet to be there + .merge(af(5511), how="left", left_on='versionControlProvenance', right_on='array_id') + .drop(columns=['versionControlProvenance', 'array_id', 'type_at_index']) + .rename(columns={"value_index": "versionControl_value_index_5511"}) + # + .merge(sf(3081), how="left", left_on='id_or_value_at_index', right_on='struct_id') + .drop(columns=['id_or_value_at_index', 'struct_id']) + ) + # + + # Keep columns of interest + project_df_1 = ( + project_df_temp2 + 
.drop(columns=['struct_id_5521', 'versionControl_value_index_5511']) + .rename({ + 'version_5521': 'sarif_version', + 'value_index_1273': 'run_index', + 'driver_name_7828': 'driver_name', + 'driver_version_7828': 'driver_version', + }, axis='columns') + ) + return project_df_1 + +def joins_for_rules(tgraph): + """ + Return table providing the `rules` information. + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id)) + af = lambda num: tgraph.dataframes['Array' + str(num)] + aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id)) + # + rules_df = ( + aft('0147') + # + .drop(columns=['t0147_type_at_index']) + # + .merge(sft(7100), how="left", left_on='t0147_id_or_value_at_index', + right_on='t7100_struct_id', + validate="1:m") + .drop(columns=['t0147_id_or_value_at_index', 't7100_struct_id']) + # + .merge(sft(8581), how="left", left_on='t7100_defaultConfiguration', + right_on='t8581_struct_id', validate="1:m") + .drop(columns=['t7100_defaultConfiguration', 't8581_struct_id']) + # + .merge(sft(2774), how="left", left_on='t7100_fullDescription', + right_on='t2774_struct_id', validate="1:m") + .drop(columns=['t7100_fullDescription', 't2774_struct_id']) + .rename(columns={'t2774_text': "t7100_t2774_fullDescription"}) + # + .merge(sft(2774), how="left", left_on='t7100_shortDescription', + right_on='t2774_struct_id', validate="1:m") + .drop(columns=['t7100_shortDescription', 't2774_struct_id']) + .rename(columns={"t2774_text": 't7100_t2774_shortDescription'}) + # + .merge(sft(6853), how="left", left_on='t7100_properties', + right_on='t6853_struct_id', validate="1:m") + .drop(columns=['t7100_properties', 't6853_struct_id', 't6853_id']) + # + .merge(aft(7069), how="left", left_on='t6853_tags', + right_on='t7069_array_id', validate="1:m") + .drop(columns=['t6853_tags', 't7069_array_id', 't7069_type_at_index']) + ) + rules_2 = ( + rules_df + .rename({ + 't0147_array_id' : 'rules_array_id', + 't0147_value_index' : 'rules_array_index', + 't7069_value_index' : 'tag_index', + 't7069_id_or_value_at_index' : 'tag_text', + }, axis='columns') + # Strip type prefix for the rest + .rename(columns = lambda x: re.sub('t7100_t2774_|t7100_|t8581_|t6853_', '', x)) + ) + return rules_2 + +def joins_for_artifacts(tgraph): + """ + Return table providing the `artifacts` information. 
+ """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + # + artifacts_df = ( + af(6920) + # + .merge(sf(5277), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m") + .drop(columns=['id_or_value_at_index', 'struct_id', 'type_at_index']) + .rename(columns={"value_index": "artifact_index_6920"}) + # + .merge(sf(2685), how="left", left_on='location', right_on='struct_id', validate="1:m") + .drop(columns=['location', 'struct_id']) + ) + # Keep columns of interest and rename + df_1 = ( + artifacts_df + .rename({ + 'array_id': 'artifacts_id', + 'artifact_index_6920': 'artifacts_array_index', + }, axis='columns') + ) + + if (df_1['artifacts_array_index'] == df_1['index']).all(): + df_1 = df_1.drop(columns=['artifacts_array_index']) + + return df_1 diff --git a/sarif_cli/typegraph.py b/sarif_cli/typegraph.py index 5761943..3769fc6 100644 --- a/sarif_cli/typegraph.py +++ b/sarif_cli/typegraph.py @@ -179,13 +179,21 @@ def _destructure_dict(typegraph: Typegraph, node, tree): if specific_missing not in status_writer.input_sarif_missing["extra_info"]: status_writer.input_sarif_missing["extra_info"] += specific_missing status_writer.warning_set["input_sarif_missing"]+=1 - raise MissingFieldException( - f"(Sub)tree is missing fields required by typedef.\n" - f"Expected {type_fields}, found {tree_fields}.\n" - f"Missing {set(type_fields) - set(tree_fields)}\n" - f"Note: these fields are post-signature fill and may be more extensive than the orginal. \n" - f"Check input file for the original signature." - ) + + #special case of no longer trying other signatures + #else exception here triggers a retry - mainly needed for Struct9699 or Struct4055 + difference = set(type_fields) - set(tree_fields) + if "uriBaseId" in difference: + tree["uriBaseId"] = "default" + _destructure_dict_1(typegraph, node, tree) + else: + raise MissingFieldException( + f"(Sub)tree is missing fields required by typedef.\n" + f"Expected {type_fields}, found {tree_fields}.\n" + f"Missing {set(type_fields) - set(tree_fields)}\n" + f"Note: these fields are post-signature fill and may be more extensive than the orginal. \n" + f"Check input file for the original signature." + ) else: status_writer.unknown_sarif_parsing_shape["extra_info"] = "type fields {} do not match tree fields {}.".format(type_fields, tree_fields) From 202f7f53a5c574a9e34f123e539ce43560273d88 Mon Sep 17 00:00:00 2001 From: Kristen Newbury Date: Tue, 13 Dec 2022 18:32:34 -0500 Subject: [PATCH 2/8] Update README for CLI usage instructions --- README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md b/README.md index 49d9706..db6f7f4 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,23 @@ The tool was implemented using Python 3.9. +# Sarif format information + + The tool operates on sarif generated by LGTM 1.27.0 (by default) or by the CodeQL CLI (enabled with the -f flag given a value of `CLI`). + + The values that the -f flag accepts are: `LGTM` and `CLI`. + + The CLI versions used against development of the CLI support were: 2.6.3, 2.9.4, and 2.11.4. 
+ + The CLI sarif **MUST** contain one additional property `versionControlProvenance` - which needs to look like: + ``` + "versionControlProvenance": [ + { + "repositoryUri": "https://github.com/testorg/testrepo.git", + "revisionId": "testsha" + } + ``` + # Test Setup This repository includes some test data (in `data`) and uses =git lfs= for storing those test files; installation steps are at [[https://git-lfs.github.com][git-lfs]]; on a mac with homebrew, install it via From dc4fd09e63d74ad24cc2f372d269f6851681f4f6 Mon Sep 17 00:00:00 2001 From: Kristen Newbury Date: Tue, 13 Dec 2022 18:42:45 -0500 Subject: [PATCH 3/8] Update README missing minor syntax --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index db6f7f4..6c7850e 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ "repositoryUri": "https://github.com/testorg/testrepo.git", "revisionId": "testsha" } + ] ``` # Test Setup From dae6c50d5b3e1f6cc765a9bd05600e8f17481f6a Mon Sep 17 00:00:00 2001 From: Kristen Newbury Date: Tue, 13 Dec 2022 20:13:13 -0500 Subject: [PATCH 4/8] Bugfix CLI signature merge mistake --- sarif_cli/signature_single_CLI.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sarif_cli/signature_single_CLI.py b/sarif_cli/signature_single_CLI.py index fd8dfa5..d773cf2 100644 --- a/sarif_cli/signature_single_CLI.py +++ b/sarif_cli/signature_single_CLI.py @@ -111,7 +111,6 @@ struct_graph_CLI = ( ('precision', 'String'), ('problem.severity', 'String'), ('security-severity', 'String'), - ('severity', 'String'), ('sub-severity', 'String'), ('tags', 'Array7069'))), ( 'Struct7100', From d602efd3f07038cb5509ef3a6b434735ba893d00 Mon Sep 17 00:00:00 2001 From: Kristen Newbury Date: Thu, 15 Dec 2022 18:46:32 -0500 Subject: [PATCH 5/8] Bugfix signature subset superset mismatch when the template signature portion contains codeflows it was previously possible that a valid sarif problem portion that contains extra fields would be misdiagnosed as not parsable --- sarif_cli/typegraph.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sarif_cli/typegraph.py b/sarif_cli/typegraph.py index 3769fc6..4dce356 100644 --- a/sarif_cli/typegraph.py +++ b/sarif_cli/typegraph.py @@ -196,9 +196,14 @@ def _destructure_dict(typegraph: Typegraph, node, tree): ) else: - status_writer.unknown_sarif_parsing_shape["extra_info"] = "type fields {} do not match tree fields {}.".format(type_fields, tree_fields) - status_writer.csv_write(status_writer.unknown_sarif_parsing_shape) - raise Exception("typegraph: unhandled case reached: cannot match type " + # possibly looks like: (Struct9699)type_fields: [codeflows...] vs tree_fields: [...extra_properties] + # in that case we need to also try the Struct4055 signature here + if "codeFlows" in type_fields: + _destructure_dict(typegraph, "Struct4055", tree) + else: + status_writer.unknown_sarif_parsing_shape["extra_info"] = "type fields {} do not match tree fields {}.".format(type_fields, tree_fields) + status_writer.csv_write(status_writer.unknown_sarif_parsing_shape) + raise Exception("typegraph: unhandled case reached: cannot match type " "fields {} to tree fields {}. Data is invalid." 
.format(type_fields, tree_fields)) From fc2c6bac9993db0f6a0d8ffbcceb8714d33d7bf6 Mon Sep 17 00:00:00 2001 From: Kristen Newbury Date: Thu, 5 Jan 2023 12:50:54 -0500 Subject: [PATCH 6/8] Add capability to read sourceLanguage if exists in CLI sarif otherwise dummy val previously assumed never present in CLI sarif --- sarif_cli/scan_tables.py | 11 ++--------- sarif_cli/signature.py | 6 ++++++ sarif_cli/signature_single_CLI.py | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/sarif_cli/scan_tables.py b/sarif_cli/scan_tables.py index 716d940..4218478 100644 --- a/sarif_cli/scan_tables.py +++ b/sarif_cli/scan_tables.py @@ -101,21 +101,14 @@ def joins_for_projects(basetables, external_info): else: repo_url = "unknown" project_name = pd.NA - - if 'semmle.sourceLanguage' in b.project: - srcLang = b.project['semmle.sourceLanguage'][0] - allLang = ",".join(list(b.project['semmle.sourceLanguage'])) - else: - srcLang = "unknown" - allLang = "unknown" res = pd.DataFrame(data={ "id" : e.project_id, "project_name" : project_name, "creation_date" : pd.Timestamp(0.0, unit='s'), # TODO: external info "repo_url" : repo_url, - "primary_language" : srcLang, # TODO: external info if CLI sarif - "languages_analyzed" : allLang # TODO: external info if CLI sarif + "primary_language" : b.project['semmle.sourceLanguage'][0], + "languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage'])) }, index=[0]) # Force all column types to ensure appropriate formatting diff --git a/sarif_cli/signature.py b/sarif_cli/signature.py index 42957fb..f81a111 100644 --- a/sarif_cli/signature.py +++ b/sarif_cli/signature.py @@ -235,6 +235,8 @@ dummy_relatedLocations_entry = [ dummy_message_entry = {'text': 'scli-dyys dummy value'} +dummy_sourceLanguage = 'unknown' + def fillsig_dict(args, elem, context): """ Fill in the missing fields in dictionary signatures. 
""" @@ -286,6 +288,10 @@ def fillsig_dict(args, elem, context): if 'level' in elem.keys(): full_elem['enabled'] = elem.get('enabled', True) + if 'semmle.formatSpecifier' in elem.keys(): + # Ensure semmle.sourceLanguage is present at least in dummy form + full_elem['semmle.sourceLanguage'] = elem.get('semmle.sourceLanguage', dummy_sourceLanguage) + if 'versionControlProvenance' in elem.keys(): # Ensure newlineSequences is present when versionControlProvenance is full_elem['newlineSequences'] = elem.get('newlineSequences', dummy_newlineSequences) diff --git a/sarif_cli/signature_single_CLI.py b/sarif_cli/signature_single_CLI.py index d773cf2..1b6b747 100644 --- a/sarif_cli/signature_single_CLI.py +++ b/sarif_cli/signature_single_CLI.py @@ -28,7 +28,7 @@ struct_graph_CLI = ( ('Struct3497', ('struct', ('index', 'Int'), ('uri', 'String'))), ('Struct9567', ('struct', ('location', 'Struct3497'))), ('Array6920', ('array', (0, 'Struct5277'), (1, 'Struct9567'))), - ('Struct1509', ('struct', ('semmle.formatSpecifier', 'String'))), + ('Struct1509', ('struct', ('semmle.formatSpecifier', 'String'), ('semmle.sourceLanguage', 'String'))), ('Struct2774', ('struct', ('text', 'String'))), ( 'Struct6299', ( 'struct', From 1a915e4de8046ff3f276de63d6b31cb5a20f336f Mon Sep 17 00:00:00 2001 From: Kristen Newbury Date: Thu, 5 Jan 2023 16:37:55 -0500 Subject: [PATCH 7/8] Update how project_id is generated previously relied on assumption: naming like: / in repositoryUri now just uses full repositoryUri --- bin/sarif-extract-scans | 9 +++------ bin/sarif-extract-scans-runner | 18 ++++++------------ sarif_cli/hash.py | 2 +- sarif_cli/scan_tables.py | 28 ++++++---------------------- 4 files changed, 16 insertions(+), 41 deletions(-) diff --git a/bin/sarif-extract-scans b/bin/sarif-extract-scans index d891f71..a171e8b 100755 --- a/bin/sarif-extract-scans +++ b/bin/sarif-extract-scans @@ -130,17 +130,14 @@ scantabs = ScanTables() @dataclass class ExternalInfo: - project_id : int + project_id: pd.UInt64Dtype() scan_id : pd.UInt64Dtype() sarif_file_name : str - ql_query_id : str external_info = ExternalInfo( - scan_spec["project_id"], + pd.NA, scan_spec["scan_id"], - scan_spec["sarif_file_name"], - # TODO: Take ql_query_id from where? 
(git commit id of the ql query set) - 'deadbeef00', + scan_spec["sarif_file_name"] ) # diff --git a/bin/sarif-extract-scans-runner b/bin/sarif-extract-scans-runner index a069493..df9c266 100755 --- a/bin/sarif-extract-scans-runner +++ b/bin/sarif-extract-scans-runner @@ -161,7 +161,6 @@ for path in paths: # Paths and components # path = path.rstrip() - project, component = path.split('/') # # Scan specification # @@ -171,30 +170,25 @@ for path in paths: scan_id = hash.hash_unique(data) scan_spec = { - # assuming sarif file names are like / - # however this will be replaced down the line with the repoURI if possible - # still, leaving here in case later versions of this tool do not rely on that property being there - # in that case this will be the best guess - "project_id": hash.hash_unique((project+"-"+component).encode()), # pd.UInt64Dtype() "scan_id": scan_id, # pd.Int64Dtype() "sarif_file_name": path, # pd.StringDtype() } # # If using outermost output directory, create project directory: - # (like //*.scantables) + # (like //*.scantables) # - try: os.mkdir(outer_dir+ project, mode=0o755) + try: os.mkdir(outer_dir+ path, mode=0o755) except FileExistsError: pass - scan_spec_file = os.path.join(outer_dir+ project, component + ".scanspec") + scan_spec_file = os.path.join(outer_dir+ path + ".scanspec") with open(scan_spec_file, 'w') as fp: json.dump(scan_spec, fp) # # Table output directory # - output_dir = os.path.join(outer_dir+ project, component + ".scantables") + output_dir = os.path.join(outer_dir+ path + ".scantables") try: os.mkdir(output_dir, mode=0o755) except FileExistsError: pass # @@ -215,8 +209,8 @@ for path in paths: with open(args.successful_runs, 'wb') as outfile: pickle.dump(successful_runs, outfile) - scan_log_file = os.path.join(outer_dir+ project, component + ".scanlog") - csv_outfile = os.path.join(outer_dir+ project, component) + scan_log_file = os.path.join(outer_dir+ path + ".scanlog") + csv_outfile = os.path.join(outer_dir+ path) runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, csv_outfile, "-f", args.input_signature], capture_output=True, text=True) if runstats.returncode == 0: diff --git a/sarif_cli/hash.py b/sarif_cli/hash.py index 9c107ba..f900897 100644 --- a/sarif_cli/hash.py +++ b/sarif_cli/hash.py @@ -4,4 +4,4 @@ from hashlib import blake2b def hash_unique(item_to_hash): h = blake2b(digest_size = 8) h.update(item_to_hash) - return abs(int.from_bytes(h.digest(), byteorder='big')) + return int.from_bytes(h.digest(), byteorder='big') diff --git a/sarif_cli/scan_tables.py b/sarif_cli/scan_tables.py index 4218478..b2cd8f0 100644 --- a/sarif_cli/scan_tables.py +++ b/sarif_cli/scan_tables.py @@ -79,34 +79,18 @@ def joins_for_projects(basetables, external_info): """ b = basetables; e = external_info - # if the sarif does not have versionControlProvenance, semmle.sourceLanguage ect - # there is no reliable way to know the project name - # and will still need to use a guess about the project id + # if the sarif does have versionControlProvenance if "repositoryUri" in b.project: - repo_url = b.project.repositoryUri[0] - # For a repository url of the form - # (git|https)://*/org/project.* - # use the org/project part as the project_name. 
- # - url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url) - if url_parts: - project_name = f"{url_parts.group(2)}-{url_parts.group(3)}" - project, component = e.sarif_file_name.rstrip().split('/') - # if the runners guess from the filename was bad, replace with real info - # and continue to use that scanspec to pass that around - if project_name != project+"-"+component: - e.project_id = hash.hash_unique(project_name.encode()) - else: - project_name = pd.NA + repoUri = b.project.repositoryUri[0] + e.project_id = hash.hash_unique(repoUri.encode()) else: - repo_url = "unknown" - project_name = pd.NA + repoUri = "unknown" res = pd.DataFrame(data={ "id" : e.project_id, - "project_name" : project_name, + "project_name" : repoUri, "creation_date" : pd.Timestamp(0.0, unit='s'), # TODO: external info - "repo_url" : repo_url, + "repo_url" : repoUri, "primary_language" : b.project['semmle.sourceLanguage'][0], "languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage'])) }, index=[0]) From 7dad175d4df5d1ca2cb8ae1875edcabd0f7939d7 Mon Sep 17 00:00:00 2001 From: Kristen Newbury Date: Thu, 12 Jan 2023 12:03:51 -0500 Subject: [PATCH 8/8] Fix tool to default CLI not LGTM sarif input update readme minor improvement --- README.md | 4 +++- bin/sarif-extract-scans-runner | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 6c7850e..3916397 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,9 @@ # CLI tools for SARIF processing Each of these tools present a high-level command-line interface to extract a - specific subset of information from a SARIF file. The main tools are: `sarif-extract-scans-runner`,`sarif-aggregate-scans`,`sarif-create-aggregate-report` + specific subset of information from a SARIF file. The main tools are: `sarif-extract-scans-runner`,`sarif-aggregate-scans`,`sarif-create-aggregate-report`. + + Each tool can print its options and description like: `sarif-extract-scans-runner --help`. The tool was implemented using Python 3.9. diff --git a/bin/sarif-extract-scans-runner b/bin/sarif-extract-scans-runner index df9c266..b323bea 100755 --- a/bin/sarif-extract-scans-runner +++ b/bin/sarif-extract-scans-runner @@ -88,9 +88,9 @@ parser = argparse.ArgumentParser(description='Run sarif-extract-scans over a dir parser.add_argument('sarif_files', metavar='sarif-files', type=str, help='File containing list of sarif files, use - for stdin') -parser.add_argument('-f','--input-signature', metavar='input-signature', type=str, default="LGTM", - help='Signature of the sarif, as in, where it was generated it may affect the signature.' - 'Options: LGTM, CLI' +parser.add_argument('-f','--input-signature', metavar='input-signature', type=str, default="CLI", + help='Signature of the sarif, as in, where it was generated it may affect the signature.\n' + 'Options: LGTM, CLI.\n' 'If current represented signatures are not sufficient, view signature_single.py for how to support further signatures.' ' Default: "%(default)s"')