From 2ba9593d7087226bb25354c32614021996e2d369 Mon Sep 17 00:00:00 2001 From: Kristen Newbury Date: Thu, 1 Dec 2022 11:37:56 -0500 Subject: [PATCH 01/23] Add CLI support enabled by -f flag with CLI value tested on sarif from CodeQL CLIs: 2.6.3, 2.9.4, 2.11.4 MUST contain versionControlProvenance property however --- bin/sarif-extract-multi | 2 +- bin/sarif-extract-scans | 81 ++++-- bin/sarif-extract-scans-runner | 16 +- bin/sarif-extract-tables | 6 +- sarif_cli/scan_tables.py | 65 +++-- sarif_cli/signature.py | 6 +- sarif_cli/signature_single.py | 7 +- sarif_cli/signature_single_CLI.py | 161 +++++++++++ sarif_cli/table_joins.py | 5 +- sarif_cli/table_joins_CLI.py | 462 ++++++++++++++++++++++++++++++ sarif_cli/typegraph.py | 22 +- 11 files changed, 765 insertions(+), 68 deletions(-) create mode 100644 sarif_cli/signature_single_CLI.py create mode 100644 sarif_cli/table_joins_CLI.py diff --git a/bin/sarif-extract-multi b/bin/sarif-extract-multi index 66f40ac..c5f5655 100755 --- a/bin/sarif-extract-multi +++ b/bin/sarif-extract-multi @@ -81,7 +81,7 @@ bt = BaseTables() # # Add dataframes # -sf_2683 = tj.joins_for_sf_2683(tgraph) +sf_2683 = tj.joins_for_location_info(tgraph) af_0350_location = tj.joins_for_af_0350_location(tgraph) bt.artifacts = tj.joins_for_artifacts(tgraph) bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683) diff --git a/bin/sarif-extract-scans b/bin/sarif-extract-scans index 94052e9..c5b5cb9 100755 --- a/bin/sarif-extract-scans +++ b/bin/sarif-extract-scans @@ -2,7 +2,7 @@ """ Extract scan data from multiple sarif files in table form. """ from dataclasses import dataclass -from sarif_cli import signature, signature_single +from sarif_cli import signature, signature_single, signature_single_CLI from sarif_cli import typegraph from sarif_cli import snowflake_id from sarif_cli import status_writer @@ -14,6 +14,7 @@ import logging import pandas as pd import pathlib import sarif_cli.table_joins as tj +import sarif_cli.table_joins_CLI as tj_CLI import sarif_cli.scan_tables as st import sys @@ -32,8 +33,18 @@ parser.add_argument('outdir', metavar='output-dir', type=str, help='output direc parser.add_argument('csvout', metavar='csv-outfile', type=str, help='processing status csv output file name to use') parser.add_argument('-r', '--write-raw-tables', action="store_true", help='Write the raw sarif tables to the output directory') +parser.add_argument('-f','--input-signature', metavar='input-signature', type=str, default="LGTM", + help='Signature of the sarif, as in, where it was generated it may affect the signature.' + 'Options: LGTM, CLI' + 'If current represented signatures are not sufficient, view signature_single.py for how to support further signatures.' + ' Default: "%(default)s"') args = parser.parse_args() +if args.input_signature not in ["LGTM","CLI"]: + print("Unsupported sarif signature requested.") + print("Use one of [LGTM, CLI].") + sys.exit(0) + # Setup csv error writer status_writer.setup_csv_writer(args.csvout) @@ -66,11 +77,20 @@ context = signature.Context( ) sarif_struct = signature.fillsig(args, sarif_struct, context) +# +# Setup which signature to use +if args.input_signature == "LGTM": + signature_to_use = signature_single.struct_graph_LGTM + start_node = signature_single.start_node_LGTM +else: + #signature_to_use = signature_single.struct_graph_CLI + signature_to_use = signature_single_CLI.struct_graph_CLI + start_node = signature_single_CLI.start_node_CLI # # Use reference type graph (signature) to traverse sarif and attach values to tables try: - tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01) - typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct) + tgraph = typegraph.Typegraph(signature_to_use) + typegraph.destructure(tgraph, start_node, sarif_struct) except Exception: # will have gathered errors/warnings status_writer.csv_write_warnings() @@ -126,31 +146,29 @@ external_info = ExternalInfo( # # Add dataframes for base tables # -sf_2683 = tj.joins_for_sf_2683(tgraph) -af_0350_location = tj.joins_for_af_0350_location(tgraph) -bt.artifacts = tj.joins_for_artifacts(tgraph) -bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683) -bt.kind_pathproblem = tj.joins_for_path_problem(tgraph, af_0350_location) -bt.kind_problem = tj.joins_for_problem(tgraph, af_0350_location) -bt.project = tj.joins_for_project_single(tgraph) -bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, sf_2683) -bt.rules = tj.joins_for_rules(tgraph) +# (relies on some specifics of the sigature type) +if args.input_signature == "LGTM": + tj = tj +else: + tj = tj_CLI +try: + location_info = tj.joins_for_location_info(tgraph) + af_0350_location = tj.joins_for_af_0350_location(tgraph) + bt.artifacts = tj.joins_for_artifacts(tgraph) + bt.codeflows = tj.joins_for_codeflows(tgraph, location_info) + bt.kind_pathproblem = tj.joins_for_path_problem(tgraph, af_0350_location) + bt.kind_problem = tj.joins_for_problem(tgraph, af_0350_location) + bt.project = tj.joins_for_project_single(tgraph) + bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, location_info) + bt.rules = tj.joins_for_rules(tgraph) +except Exception: + #possible warnings accumulated + status_writer.csv_write_warnings() + raise Exception # -# Form scan tables +# Setup rest of basetables # -# joins for projects has to happen first as it backfills the guess about the project_id -scantabs.projects = st.joins_for_projects(bt, external_info, scantabs) -scantabs.results = st.joins_for_results(bt, external_info) -scantabs.scans = st.joins_for_scans(bt, external_info, scantabs) - - - -# -# Replace the remaining internal ids with snowflake ids -# -flakegen = snowflake_id.Snowflake(0) - bt.columns_to_reindex = { # template from {field.name : [''] for field in dc.fields(bt)} 'artifacts': ['artifacts_id'], @@ -167,6 +185,19 @@ scantabs.columns_to_reindex = { 'results': ['codeFlow_id'], } +# +# Form scan tables +# +# joins for projects has to happen first as it backfills the guess about the project_id +scantabs.projects = st.joins_for_projects(bt, external_info) +scantabs.results = st.joins_for_results(bt, external_info) +scantabs.scans = st.joins_for_scans(bt, external_info, scantabs, args.input_signature) + +# +# Replace the remaining internal ids with snowflake ids +# +flakegen = snowflake_id.Snowflake(0) + _id_to_flake = {} def _get_flake(id): flake = _id_to_flake.get(id, -1) diff --git a/bin/sarif-extract-scans-runner b/bin/sarif-extract-scans-runner index 5c5a983..a069493 100755 --- a/bin/sarif-extract-scans-runner +++ b/bin/sarif-extract-scans-runner @@ -87,7 +87,14 @@ from sarif_cli import hash parser = argparse.ArgumentParser(description='Run sarif-extract-scans over a directory hierarchy') parser.add_argument('sarif_files', metavar='sarif-files', type=str, help='File containing list of sarif files, use - for stdin') -parser.add_argument('-o','--outdir', metavar='output-dir', type=str, default="", help='output directory') + +parser.add_argument('-f','--input-signature', metavar='input-signature', type=str, default="LGTM", + help='Signature of the sarif, as in, where it was generated it may affect the signature.' + 'Options: LGTM, CLI' + 'If current represented signatures are not sufficient, view signature_single.py for how to support further signatures.' + ' Default: "%(default)s"') + +parser.add_argument('-o','--outdir', metavar='output-dir', type=str, default="", help='Output directory') parser.add_argument('-m', '--max-files', metavar='number', type=int, default=100000, help='Maximum number of files to process.' @@ -126,6 +133,11 @@ if outer_dir != "": except FileExistsError: pass +if args.input_signature not in ["LGTM","CLI"]: + print("Unsupported sarif signature requested.") + print("Use one of [LGTM, CLI].") + sys.exit(0) + # # Collect sarif file information # @@ -205,7 +217,7 @@ for path in paths: scan_log_file = os.path.join(outer_dir+ project, component + ".scanlog") csv_outfile = os.path.join(outer_dir+ project, component) - runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, csv_outfile], + runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, csv_outfile, "-f", args.input_signature], capture_output=True, text=True) if runstats.returncode == 0: print("{:6} {}".format("OK", path)) diff --git a/bin/sarif-extract-tables b/bin/sarif-extract-tables index 439b335..97820b3 100755 --- a/bin/sarif-extract-tables +++ b/bin/sarif-extract-tables @@ -59,8 +59,8 @@ sarif_struct = signature.fillsig(args, sarif_struct, context) # # Use reference type graph (signature) to traverse sarif and attach values to tables # -tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01) -typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct) +tgraph = typegraph.Typegraph(signature_single.struct_graph_LGTM) +typegraph.destructure(tgraph, signature_single.start_node_LGTM, sarif_struct) # # Form output tables @@ -84,7 +84,7 @@ bt = BaseTables() # # Add dataframes # -sf_2683 = tj.joins_for_sf_2683(tgraph) +sf_2683 = tj.joins_for_location_info(tgraph) af_0350_location = tj.joins_for_af_0350_location(tgraph) bt.artifacts = tj.joins_for_artifacts(tgraph) bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683) diff --git a/sarif_cli/scan_tables.py b/sarif_cli/scan_tables.py index d6b79a7..ce709d1 100644 --- a/sarif_cli/scan_tables.py +++ b/sarif_cli/scan_tables.py @@ -73,36 +73,49 @@ class ScanTablesTypes: # # Projects table # -def joins_for_projects(basetables, external_info, scantables): +def joins_for_projects(basetables, external_info): """ Form the 'projects' table for the ScanTables dataclass """ b = basetables; e = external_info - - # For a repository url of the form - # (git|https)://*/org/project.* - # use the org/project part as the project_name. - # - # TODO knewbury error handling for if the signature is slotted out? - repo_url = b.project.repositoryUri[0] - url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url) - if url_parts: - project_name = f"{url_parts.group(2)}-{url_parts.group(3)}" - project, component = e.sarif_file_name.rstrip().split('/') - # if the runners guess from the filename was bad, replace with real info - # and continue to use that scanspec to pass that around - if project_name != project+"-"+component: - e.project_id = hash.hash_unique(project_name.encode()) + + # if the sarif does not have versionControlProvenance, semmle.sourceLanguage ect + # there is no reliable way to know the project name + # and will still need to use a guess about the project id + if "repositoryUri" in b.project: + repo_url = b.project.repositoryUri[0] + # For a repository url of the form + # (git|https)://*/org/project.* + # use the org/project part as the project_name. + # + url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url) + if url_parts: + project_name = f"{url_parts.group(2)}-{url_parts.group(3)}" + project, component = e.sarif_file_name.rstrip().split('/') + # if the runners guess from the filename was bad, replace with real info + # and continue to use that scanspec to pass that around + if project_name != project+"-"+component: + e.project_id = hash.hash_unique(project_name.encode()) + else: + project_name = pd.NA else: + repo_url = "unknown" project_name = pd.NA + + if 'semmle.sourceLanguage' in b.project: + srcLang = b.project['semmle.sourceLanguage'][0] + allLang = ",".join(list(b.project['semmle.sourceLanguage'])) + else: + srcLang = "unknown" + allLang = "unknown" res = pd.DataFrame(data={ "id" : e.project_id, "project_name" : project_name, "creation_date" : pd.Timestamp(0.0, unit='s'), # TODO: external info "repo_url" : repo_url, - "primary_language" : b.project['semmle.sourceLanguage'][0], # TODO: external info - "languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage'])) + "primary_language" : srcLang, # TODO: external info if CLI sarif + "languages_analyzed" : allLang # TODO: external info if CLI sarif }, index=[0]) # Force all column types to ensure appropriate formatting @@ -112,7 +125,7 @@ def joins_for_projects(basetables, external_info, scantables): # # Scans table # -def joins_for_scans(basetables, external_info, scantables): +def joins_for_scans(basetables, external_info, scantables, sarif_type): """ Form the `scans` table for the ScanTables dataclass """ @@ -122,9 +135,14 @@ def joins_for_scans(basetables, external_info, scantables): driver_version = b.project.driver_version.unique() assert len(driver_version) == 1, \ "More than one driver version found for single sarif file." + # TODO if commit id exists in external info for CLI gen'd sarif, add? + if sarif_type == "LGTM": + commit_id = b.project.revisionId[0] + else: + commit_id = "unknown" res = pd.DataFrame(data={ "id" : e.scan_id, - "commit_id" : b.project.revisionId[0], + "commit_id" : commit_id, "project_id" : e.project_id, # TODO extract real date information from somewhere external "db_create_start" : pd.Timestamp(0.0, unit='s'), @@ -159,7 +177,7 @@ def joins_for_results(basetables, external_info): tables = [_results_from_kind_problem(basetables, external_info), _results_from_kind_pathproblem(basetables, external_info)] stack = [table for table in tables if len(table) > 0] - + # Concatenation fails without at least one table, so avoid that. if len(stack) > 0: res = pd.concat(stack) @@ -195,7 +213,7 @@ def _results_from_kind_problem(basetables, external_info): 'query_id' : b.kind_problem.rule_id, 'query_kind' : "problem", 'query_precision' : [_populate_from_rule_table("precision", b, i) for i in range(len(b.kind_problem))], - 'query_severity' : [_populate_from_rule_table("severity", b, i) for i in range(len(b.kind_problem))], + 'query_severity' : [_populate_from_rule_table("problem.severity", b, i) for i in range(len(b.kind_problem))], 'result_type' : "kind_problem", 'codeFlow_id' : 0, # link to codeflows (kind_pathproblem only, NULL here) @@ -240,6 +258,7 @@ def _results_from_kind_pathproblem(basetables, external_info): # The `result` table has no entry to distinguish these, so we use a simplified # version of `kind_pathproblem`. + reduced_kind_pathp = b.kind_pathproblem.drop( columns=[ 'relatedLocation_array_index', @@ -284,7 +303,7 @@ def _results_from_kind_pathproblem(basetables, external_info): 'query_id' : cfid0ppt0.rule_id.values[0], 'query_kind' : "path-problem", 'query_precision' : _populate_from_rule_table_code_flow("precision", b, cfid0ppt0), - 'query_severity' : _populate_from_rule_table_code_flow("severity", b, cfid0ppt0), + 'query_severity' : _populate_from_rule_table_code_flow("problem.severity", b, cfid0ppt0), # 'result_type' : "kind_pathproblem", 'codeFlow_id' : cfid0, diff --git a/sarif_cli/signature.py b/sarif_cli/signature.py index 582dbee..d3e76e4 100644 --- a/sarif_cli/signature.py +++ b/sarif_cli/signature.py @@ -53,6 +53,8 @@ def _signature_dict(args, elem, context: Context): if args.typedef_signatures: # Give every unique struct a name and use a reference to it as value. if signature not in context.sig_to_typedef: + #cannot have leading 0 hashes later in table joins so replace now + #context.sig_to_typedef[signature] = str("Struct%04d" % shorthash(signature)).replace("0", "1") context.sig_to_typedef[signature] = "Struct%04d" % shorthash(signature) typedef = context.sig_to_typedef[signature] return typedef @@ -79,6 +81,8 @@ def _signature_list(args, elem, context): if args.typedef_signatures: # Give every unique array a name and use a reference to it as value. if signature not in context.sig_to_typedef: + #cannot have leading 0 hashes later in table joins so replace now + #context.sig_to_typedef[signature] = str("Array%04d" % shorthash(signature)).replace("0", "1") context.sig_to_typedef[signature] = "Array%04d" % shorthash(signature) typedef = context.sig_to_typedef[signature] return typedef @@ -225,7 +229,7 @@ dummy_newlineSequences = ['\r\n', '\n', '\u2028', '\u2029'] dummy_relatedLocations_entry = [ {'id': -1, 'physicalLocation': {'artifactLocation': {'uri': 'scli-dyys dummy value', - 'uriBaseId': 'scli-dyys dummy value', + 'uriBaseId': 'scli-dyys uriBaseId', 'index': -1}, 'region': {'startLine': -1, 'startColumn': -1, diff --git a/sarif_cli/signature_single.py b/sarif_cli/signature_single.py index 050cab7..4cc20a5 100644 --- a/sarif_cli/signature_single.py +++ b/sarif_cli/signature_single.py @@ -12,9 +12,9 @@ is marked below # # The starting node the leftmost node in ../notes/typegraph.pdf # -start_node_2022_02_01 = 'Struct6787' +start_node_LGTM = 'Struct6787' -struct_graph_2022_02_01 = ( +struct_graph_LGTM = ( [ ('String', 'string'), ('Int', 'int'), ('Bool', 'bool'), @@ -121,5 +121,4 @@ struct_graph_2022_02_01 = ( ('$schema', 'String'), ('runs', 'Array0177'), ('version', 'String')))] -) - +) \ No newline at end of file diff --git a/sarif_cli/signature_single_CLI.py b/sarif_cli/signature_single_CLI.py new file mode 100644 index 0000000..fd8dfa5 --- /dev/null +++ b/sarif_cli/signature_single_CLI.py @@ -0,0 +1,161 @@ +""" The signature for a single sarif file + +Produced by + + sarif-to-dot -u -t -f 2021-12-09/results.sarif + +with some arrays manually sorted so the the signature with more fields comes first. The case + ('Array6343', ('array', (1, 'Struct9699'), (0, 'Struct4055'))), # MANUALLY SORTED +is marked below +""" + +# +# The starting node the leftmost node in ../notes/typegraph.pdf +# +start_node_CLI = 'Struct5521' + +# generated with CLI 2.9.4 +struct_graph_CLI = ( + [ ('String', 'string'), + ('Int', 'int'), + ('Bool', 'bool'), + ( 'Struct2685', + ( 'struct', + ('index', 'Int'), + ('uri', 'String'), + ('uriBaseId', 'String'))), + ('Struct5277', ('struct', ('location', 'Struct2685'))), + ('Struct3497', ('struct', ('index', 'Int'), ('uri', 'String'))), + ('Struct9567', ('struct', ('location', 'Struct3497'))), + ('Array6920', ('array', (0, 'Struct5277'), (1, 'Struct9567'))), + ('Struct1509', ('struct', ('semmle.formatSpecifier', 'String'))), + ('Struct2774', ('struct', ('text', 'String'))), + ( 'Struct6299', + ( 'struct', + ('endColumn', 'Int'), + ('endLine', 'Int'), + ('startColumn', 'Int'), + ('startLine', 'Int'))), + ( 'Struct4963', + ( 'struct', + ('artifactLocation', 'Struct2685'), + ('region', 'Struct6299'))), + ( 'Struct2683', + ( 'struct', + ('id', 'Int'), + ('message', 'Struct2774'), + ('physicalLocation', 'Struct4963'))), + ('Array0350', ('array', (0, 'Struct2683'))), + ( 'Struct4199', + ( 'struct', + ('primaryLocationLineHash', 'String'), + ('primaryLocationStartColumnFingerprint', 'String'))), + ('Struct3942', ('struct', ('id', 'String'), ('index', 'Int'))), + ( 'Struct4055', + ( 'struct', + ('locations', 'Array0350'), + ('message', 'Struct2774'), + ('partialFingerprints', 'Struct4199'), + ('relatedLocations', 'Array0350'), + ('rule', 'Struct3942'), + ('ruleId', 'String'), + ('ruleIndex', 'Int'))), + ( 'Struct7125', + ( 'struct', + ('artifactLocation', 'Struct3497'), + ('region', 'Struct6299'))), + ( 'Struct6772', + ( 'struct', + ('id', 'Int'), + ('message', 'Struct2774'), + ('physicalLocation', 'Struct7125'))), + ('Array8753', ('array', (0, 'Struct6772'))), + ( 'Struct0102', + ( 'struct', + ('locations', 'Array0350'), + ('message', 'Struct2774'), + ('partialFingerprints', 'Struct4199'), + ('relatedLocations', 'Array8753'), + ('rule', 'Struct3942'), + ('ruleId', 'String'), + ('ruleIndex', 'Int'))), + ('Struct0987', ('struct', ('location', 'Struct2683'))), + ('Array1075', ('array', (0, 'Struct0987'))), + ('Struct4194', ('struct', ('locations', 'Array1075'))), + ('Array1597', ('array', (0, 'Struct4194'))), + ('Struct7122', ('struct', ('threadFlows', 'Array1597'))), + ('Array9799', ('array', (0, 'Struct7122'))), + ( 'Struct9699', + ( 'struct', + ('codeFlows', 'Array9799'), + ('locations', 'Array0350'), + ('message', 'Struct2774'), + ('partialFingerprints', 'Struct4199'), + ('relatedLocations', 'Array0350'), + ('rule', 'Struct3942'), + ('ruleId', 'String'), + ('ruleIndex', 'Int'))), + ( 'Array1768', + #('array', (2, 'Struct9699'), (1, 'Struct4055'),(0, 'Struct0102'))), + #('array',(0, 'Struct0102'), (1, 'Struct4055'), (2, 'Struct9699'))), + #omitting (0, 'Struct0102') means we will never find column info + ('array', (2, 'Struct9699'), (1, 'Struct4055'))), + ('Struct8581', ('struct', ('enabled', 'Bool'), ('level', 'String'))), + ('Array7069', ('array', (0, 'String'))), + ( 'Struct6853', + ( 'struct', + ('description', 'String'), + ('id', 'String'), + ('kind', 'String'), + ('name', 'String'), + ('precision', 'String'), + ('problem.severity', 'String'), + ('security-severity', 'String'), + ('severity', 'String'), + ('sub-severity', 'String'), + ('tags', 'Array7069'))), + ( 'Struct7100', + ( 'struct', + ('defaultConfiguration', 'Struct8581'), + ('fullDescription', 'Struct2774'), + ('id', 'String'), + ('name', 'String'), + ('properties', 'Struct6853'), + ('shortDescription', 'Struct2774'))), + ('Array0147', ('array', (0, 'Struct7100'))), + ( 'Struct7828', + ( 'struct', + ('name', 'String'), + ('organization', 'String'), + ('rules', 'Array0147'), + ('semanticVersion', 'String'))), + ( 'Struct9027', + ('struct', ('description', 'Struct2774'), ('uri', 'String'))), + ('Array4813', ('array', (0, 'Struct9027'))), + ( 'Struct6152', + ( 'struct', + ('locations', 'Array4813'), + ('name', 'String'), + ('semanticVersion', 'String'))), + ('Struct7826', ('struct', ('locations', 'Array4813'), ('name', 'String'))), + ('Array9357', ('array', (0, 'Struct6152'), (1, 'Struct7826'))), + ( 'Struct0032', + ('struct', ('driver', 'Struct7828'), ('extensions', 'Array9357'))), + ( 'Struct3081', + ('struct', ('repositoryUri', 'String'), ('revisionId', 'String'))), + ('Array5511', ('array', (0, 'Struct3081'))), + ( 'Struct9786', + ( 'struct', + ('artifacts', 'Array6920'), + ('columnKind', 'String'), + ('newlineSequences', 'Array7069'), + ('properties', 'Struct1509'), + ('results', 'Array1768'), + ('tool', 'Struct0032'), + ('versionControlProvenance', 'Array5511'))), + ('Array1273', ('array', (0, 'Struct9786'))), + ( 'Struct5521', + ( 'struct', + ('$schema', 'String'), + ('runs', 'Array1273'), + ('version', 'String')))] ) diff --git a/sarif_cli/table_joins.py b/sarif_cli/table_joins.py index 5209f84..520f0c6 100644 --- a/sarif_cli/table_joins.py +++ b/sarif_cli/table_joins.py @@ -73,13 +73,12 @@ def joins_for_af_0350_location(tgraph): ) return af_0350_location -def joins_for_sf_2683(tgraph): +def joins_for_location_info(tgraph): """ Join all the tables used by 2683's right side into one. """ # Access convenience functions sf = lambda num: tgraph.dataframes['Struct' + str(num)] - af = lambda num: tgraph.dataframes['Array' + str(num)] # sf_2683 = ( # @@ -116,6 +115,8 @@ def joins_for_problem(tgraph, af_0350_location): # # Form the message dataframe (@kind problem) via joins # + import IPython + IPython.embed(header="spot 1") kind_problem_1 = ( aft(6343) diff --git a/sarif_cli/table_joins_CLI.py b/sarif_cli/table_joins_CLI.py new file mode 100644 index 0000000..71b8c42 --- /dev/null +++ b/sarif_cli/table_joins_CLI.py @@ -0,0 +1,462 @@ +""" Collection of joins for the base tables provided by typegraph.attach_tables() + + The `problem` and `path-problem` entries provide that information; the + `relatedLocations` table provides the details when multiple results are + present for either. `project` is the high-level overview; `artifacts` + provides those for the other tables. +""" +import pandas as pd +import re +from .typegraph import tagged_array_columns, tagged_struct_columns + +class BaseTablesTypes: + codeflows = { + "codeflow_id" : pd.UInt64Dtype(), + "codeflow_index" : pd.Int64Dtype(), + "threadflow_index" : pd.Int64Dtype(), + "location_index" : pd.Int64Dtype(), + "endColumn" : pd.Int64Dtype(), + "endLine" : pd.Int64Dtype(), + "startColumn" : pd.Int64Dtype(), + "startLine" : pd.Int64Dtype(), + "artifact_index" : pd.Int64Dtype(), + "uri" : pd.StringDtype(), + "uriBaseId" : pd.StringDtype(), + "message" : pd.StringDtype(), + } + +def joins_for_af_0350_location(tgraph): + """ + Join all the tables used by 0350's right side into one. + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id)) + aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id)) + + af_0350_location = ( + aft('0350') + # + .merge(sft(2683), how="left", left_on='t0350_id_or_value_at_index', right_on='t2683_struct_id', + validate="1:m") + .drop(columns=['t0350_id_or_value_at_index', 't2683_struct_id', 't0350_type_at_index']) + # + .merge(sft(4963), how="left", left_on='t2683_physicalLocation', right_on='t4963_struct_id', + validate="1:m") + .drop(columns=['t2683_physicalLocation', 't4963_struct_id']) + # + .merge(sft(6299), how="left", left_on='t4963_region', right_on='t6299_struct_id', + validate="1:m") + .drop(columns=['t4963_region', 't6299_struct_id']) + # + .merge(sft(2685), how="left", left_on='t4963_artifactLocation', right_on='t2685_struct_id', + validate="1:m") + .drop(columns=['t4963_artifactLocation', 't2685_struct_id']) + # + .merge(sft(2774), how="left", left_on='t2683_message', right_on='t2774_struct_id', + validate="1:m") + .drop(columns=['t2683_message', 't2774_struct_id']) + # + .rename(columns={'t0350_array_id' : 'm0350_location_array_id', + 't0350_value_index' : 'm0350_location_array_index', + 't2683_id' : 'm0350_location_id', + 't6299_endColumn' : 'm0350_location_endColumn', + 't6299_endLine' : 'm0350_location_endLine', + 't6299_startColumn' : 'm0350_location_startColumn', + 't6299_startLine' : 'm0350_location_startLine', + 't2685_index' : 'm0350_location_index', + 't2685_uri' : 'm0350_location_uri', + 't2685_uriBaseId' : 'm0350_location_uriBaseId', + 't2774_text' : 'm0350_location_message', + }) + ) + return af_0350_location + +def joins_for_location_info(tgraph): + """ + Join all the tables used by 2683's right side into one. + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + # + sf_2683 = ( + # + sf(2683) + .rename(columns={"struct_id": "struct_id_2683", "id": "id_2683"}) + # + .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'physicalLocation']) + # + .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'region']) + # + .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'artifactLocation']) + .rename(columns={"index": "location_index_2685"}) + # + .merge(sf(2774), how="left", left_on='message', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'message']) + .rename(columns={"text": "message_text_2683"}) + # + ) + + return sf_2683 + +def joins_for_problem(tgraph, af_0350_location): + """ + Return table providing the `problem` information. + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id)) + aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id)) + # + # Form the message dataframe (@kind problem) via joins + # + + kind_problem_1 = ( + aft(1768) + .merge(sft(4055), how="inner", + left_on='t1768_id_or_value_at_index', right_on='t4055_struct_id', + validate="1:m") + .drop(columns=['t1768_type_at_index', 't1768_id_or_value_at_index', + 't4055_struct_id']) + # + .merge(af_0350_location, how="left", left_on='t4055_locations', + right_on='m0350_location_array_id', validate="1:m") + .drop(columns=['t4055_locations', 'm0350_location_array_id']) + # + .merge(af_0350_location.rename(columns=lambda x: re.sub('m0350_location', + 'm0350_relatedLocation', + x)), + how="left", left_on='t4055_relatedLocations', + right_on='m0350_relatedLocation_array_id', validate="1:m") + .drop(columns=['t4055_relatedLocations', 'm0350_relatedLocation_array_id']) + # + .merge(sft(2774), how="left", left_on='t4055_message', right_on='t2774_struct_id') + .drop(columns=['t4055_message', 't2774_struct_id']) + .rename(columns={"t2774_text": "t4055_message_text"}) + # + .merge(sft(4199), how="left", left_on='t4055_partialFingerprints', + right_on='t4199_struct_id') + .drop(columns=['t4055_partialFingerprints', 't4199_struct_id']) + # + .merge(sft(3942), how="left", left_on='t4055_rule', + right_on='t3942_struct_id') + .drop(columns=['t4055_rule', 't3942_struct_id']) + ) + + kind_problem_2 = ( + kind_problem_1 + .rename({ + 't1768_array_id' : 'results_array_id', + 't1768_value_index' : 'results_array_index', + 't4055_ruleId' : 'ruleId', + 't4055_ruleIndex' : 'ruleIndex', + 't4055_message_text' : 'message_text', + 't3942_id' : 'rule_id', + 't3942_index' : 'rule_index', + }, axis='columns') + # Strip type prefix for the rest + .rename(columns = lambda x: re.sub('m0350_|t4199_', '', x)) + ) + + return kind_problem_2 + + +def joins_for_codeflows(tgraph, sf_2683): + """ + Return the table providing the `codeFlows` for a `path-problem table. + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + # + codeflows = ( + af(9799).rename(columns={"array_id": "t9799_array_id", "value_index": "t9799_idx"}) + # + .merge(sf(7122), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m") + .drop(columns=['id_or_value_at_index', 'struct_id', 'type_at_index']) + # + .merge(af(1597).rename(columns={"array_id": "t1597_array_id", "value_index": "t1597_idx"}), + how="left", left_on='threadFlows', right_on='t1597_array_id', validate="1:m") + .drop(columns=['threadFlows', 't1597_array_id', 'type_at_index']) + # + .merge(sf(4194), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m") + .drop(columns=['id_or_value_at_index', 'struct_id']) + # + .merge(af(1075).rename(columns={"array_id": "t1075_array_id", "value_index": "t1075_idx"}), + how="left", left_on='locations', right_on='t1075_array_id', validate="1:m") + .drop(columns=['locations', 't1075_array_id', 'type_at_index']) + .rename(columns={"t1075_idx": "t1075_locations_idx"}) + # + .merge(sf('0987'), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m") + .drop(columns=['id_or_value_at_index', 'struct_id']) + # + .merge(sf_2683, how="left", left_on='location', right_on='struct_id_2683', validate="1:m") + .drop(columns=['location', 'struct_id_2683']) + ) + codeflows_1 = ( + codeflows + .drop(columns=['id_2683']) + .rename({ + 't9799_array_id': 'codeflow_id', + 't9799_idx': 'codeflow_index', + 't1597_idx': 'threadflow_index', + 't1075_locations_idx': 'location_index', + 'location_index_2685': 'artifact_index', + 'message_text_2683': 'message', + }, axis='columns') + ) + codeflows_2 = codeflows_1.astype(BaseTablesTypes.codeflows).reset_index(drop=True) + return codeflows_2 + +def joins_for_path_problem(tgraph, af_0350_location): + """ + Return table providing the `path-problem` information. + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id)) + aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id)) + + kind_pathproblem_1 = ( + aft(1768) + .merge(sft(9699), how="inner", left_on='t1768_id_or_value_at_index', right_on='t9699_struct_id', + validate="1:m") + .drop(columns=['t1768_id_or_value_at_index', 't9699_struct_id', 't1768_type_at_index']) + # + .merge(af_0350_location, how="left", left_on='t9699_locations', + right_on='m0350_location_array_id', validate="1:m") + .drop(columns=['t9699_locations', 'm0350_location_array_id']) + # + .merge(af_0350_location.rename(columns=lambda x: re.sub('m0350_location', + 'm0350_relatedLocation', + x)), + how="left", left_on='t9699_relatedLocations', + right_on='m0350_relatedLocation_array_id', validate="1:m") + .drop(columns=['t9699_relatedLocations', 'm0350_relatedLocation_array_id']) + # + .merge(sft(2774), how="left", left_on='t9699_message', right_on='t2774_struct_id') + .drop(columns=['t9699_message', 't2774_struct_id']) + .rename(columns={"t2774_text": "t9699_message_text"}) + # + .merge(sft(4199), how="left", left_on='t9699_partialFingerprints', + right_on='t4199_struct_id') + .drop(columns=['t9699_partialFingerprints', 't4199_struct_id']) + # + .merge(sft(3942), how="left", left_on='t9699_rule', + right_on='t3942_struct_id') + .drop(columns=['t9699_rule', 't3942_struct_id']) + ) + strip_colums = lambda x: re.sub('t9699_|m0350_|t4199_', '', x) + kind_pathproblem_2 = (kind_pathproblem_1 + .rename({ + 't1768_array_id' : 'results_array_id', + 't1768_value_index' : 'results_array_index', + 't9699_codeFlows' : 'codeFlows_id', + 't9699_ruleId' : 'ruleId', + 't9699_ruleIndex' : 'ruleIndex', + 't9699_message_text' : 'message_text', + 't3942_id' : 'rule_id', + 't3942_index' : 'rule_index', + }, axis='columns') + # Strip type prefix for the rest + .rename(columns = strip_colums)) + + return kind_pathproblem_2 + +def joins_for_relatedLocations(tgraph, sf_2683): + """ + Return table providing the `relatedLocations` and `locations` information. + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + # + # Form the relatedLocation dataframe via joins, starting from the union of + # relatedLocations from `kind problem` (sf(4055)) and `kind path-problem` + # (sf(9699)). + # + related_locations_1 = ( + pd.concat([sf(4055)[['relatedLocations', 'struct_id']], sf(9699)[['relatedLocations', 'struct_id']]]) + .merge(af('0350'), how="left", left_on='relatedLocations', right_on='array_id', validate="1:m") + .drop(columns=['relatedLocations', 'array_id', 'value_index', 'type_at_index']) + # + .merge(sf(2683), how="left", left_on='id_or_value_at_index', right_on='struct_id', + suffixes=("_4055_9699", "_2683"), validate="1:m") + .drop(columns=['struct_id_2683', 'id_or_value_at_index']) + # + .merge(sf(4963), how="left", left_on='physicalLocation', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'physicalLocation']) + # + .merge(sf(6299), how="left", left_on='region', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'region']) + # + .merge(sf(2685), how="left", left_on='artifactLocation', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'artifactLocation']) + # + .merge(sf(2774), how="left", left_on='message', right_on='struct_id', validate="1:m") + .drop(columns=['struct_id', 'message']) + ) + + # Keep columns of interest + related_locations_2 = (related_locations_1[['struct_id_4055_9699', 'uri', 'startLine', 'startColumn', 'endLine', 'endColumn', 'text']] + .rename({'text': 'message', 'struct_id_4055_9699': 'struct_id'}, axis='columns')) + + # Remove dummy locations previously injected by signature.fillsig + related_locations_3 = related_locations_2[related_locations_2.uri != 'scli-dyys dummy value'] + + return related_locations_3 + +def joins_for_project_single(tgraph): + """ + Return table providing the `project` information for sarif-extract-scans + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + # + project_df_temp1 = ( + sf(5521) + .rename(columns={"version": "version_5521", "struct_id": "struct_id_5521"}) + # + .merge(af('1273'), how="left", left_on='runs', right_on='array_id', + validate="1:m") + .drop(columns=['runs', 'array_id', 'type_at_index']) + .rename(columns={"value_index": "value_index_1273"}) + # + .merge(sf(9786), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m") + .drop(columns=['id_or_value_at_index', 'struct_id'])) + # + #newlines there or not - handle + if 'newlineSequences' in project_df_temp1: + project_df_temp2 = project_df_temp1.drop(columns=['newlineSequences']) + + project_df_temp2 = ( + project_df_temp1 + # + .merge(sf(1509), how="left", left_on='properties', right_on='struct_id', validate="1:m") + .drop(columns=['properties', 'struct_id']) + # + # tool - driver - rules - defaultConfiguration - ( properties - tags ) + # + .merge(sf('0032'), how="left", left_on='tool', right_on='struct_id', validate="1:m") + .drop(columns=['tool', 'struct_id']) + # + .merge(sf(7828), how="left", left_on='driver', right_on='struct_id', validate="1:m") + .drop(columns=['driver', 'struct_id']) + .rename(columns={"semanticVersion": "driver_version_7828", "name": "driver_name_7828"}) + # + #assumet to be there + .merge(af(5511), how="left", left_on='versionControlProvenance', right_on='array_id') + .drop(columns=['versionControlProvenance', 'array_id', 'type_at_index']) + .rename(columns={"value_index": "versionControl_value_index_5511"}) + # + .merge(sf(3081), how="left", left_on='id_or_value_at_index', right_on='struct_id') + .drop(columns=['id_or_value_at_index', 'struct_id']) + ) + # + + # Keep columns of interest + project_df_1 = ( + project_df_temp2 + .drop(columns=['struct_id_5521', 'versionControl_value_index_5511']) + .rename({ + 'version_5521': 'sarif_version', + 'value_index_1273': 'run_index', + 'driver_name_7828': 'driver_name', + 'driver_version_7828': 'driver_version', + }, axis='columns') + ) + return project_df_1 + +def joins_for_rules(tgraph): + """ + Return table providing the `rules` information. + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id)) + af = lambda num: tgraph.dataframes['Array' + str(num)] + aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id)) + # + rules_df = ( + aft('0147') + # + .drop(columns=['t0147_type_at_index']) + # + .merge(sft(7100), how="left", left_on='t0147_id_or_value_at_index', + right_on='t7100_struct_id', + validate="1:m") + .drop(columns=['t0147_id_or_value_at_index', 't7100_struct_id']) + # + .merge(sft(8581), how="left", left_on='t7100_defaultConfiguration', + right_on='t8581_struct_id', validate="1:m") + .drop(columns=['t7100_defaultConfiguration', 't8581_struct_id']) + # + .merge(sft(2774), how="left", left_on='t7100_fullDescription', + right_on='t2774_struct_id', validate="1:m") + .drop(columns=['t7100_fullDescription', 't2774_struct_id']) + .rename(columns={'t2774_text': "t7100_t2774_fullDescription"}) + # + .merge(sft(2774), how="left", left_on='t7100_shortDescription', + right_on='t2774_struct_id', validate="1:m") + .drop(columns=['t7100_shortDescription', 't2774_struct_id']) + .rename(columns={"t2774_text": 't7100_t2774_shortDescription'}) + # + .merge(sft(6853), how="left", left_on='t7100_properties', + right_on='t6853_struct_id', validate="1:m") + .drop(columns=['t7100_properties', 't6853_struct_id', 't6853_id']) + # + .merge(aft(7069), how="left", left_on='t6853_tags', + right_on='t7069_array_id', validate="1:m") + .drop(columns=['t6853_tags', 't7069_array_id', 't7069_type_at_index']) + ) + rules_2 = ( + rules_df + .rename({ + 't0147_array_id' : 'rules_array_id', + 't0147_value_index' : 'rules_array_index', + 't7069_value_index' : 'tag_index', + 't7069_id_or_value_at_index' : 'tag_text', + }, axis='columns') + # Strip type prefix for the rest + .rename(columns = lambda x: re.sub('t7100_t2774_|t7100_|t8581_|t6853_', '', x)) + ) + return rules_2 + +def joins_for_artifacts(tgraph): + """ + Return table providing the `artifacts` information. + """ + # Access convenience functions + sf = lambda num: tgraph.dataframes['Struct' + str(num)] + af = lambda num: tgraph.dataframes['Array' + str(num)] + # + artifacts_df = ( + af(6920) + # + .merge(sf(5277), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m") + .drop(columns=['id_or_value_at_index', 'struct_id', 'type_at_index']) + .rename(columns={"value_index": "artifact_index_6920"}) + # + .merge(sf(2685), how="left", left_on='location', right_on='struct_id', validate="1:m") + .drop(columns=['location', 'struct_id']) + ) + # Keep columns of interest and rename + df_1 = ( + artifacts_df + .rename({ + 'array_id': 'artifacts_id', + 'artifact_index_6920': 'artifacts_array_index', + }, axis='columns') + ) + + if (df_1['artifacts_array_index'] == df_1['index']).all(): + df_1 = df_1.drop(columns=['artifacts_array_index']) + + return df_1 diff --git a/sarif_cli/typegraph.py b/sarif_cli/typegraph.py index 5761943..3769fc6 100644 --- a/sarif_cli/typegraph.py +++ b/sarif_cli/typegraph.py @@ -179,13 +179,21 @@ def _destructure_dict(typegraph: Typegraph, node, tree): if specific_missing not in status_writer.input_sarif_missing["extra_info"]: status_writer.input_sarif_missing["extra_info"] += specific_missing status_writer.warning_set["input_sarif_missing"]+=1 - raise MissingFieldException( - f"(Sub)tree is missing fields required by typedef.\n" - f"Expected {type_fields}, found {tree_fields}.\n" - f"Missing {set(type_fields) - set(tree_fields)}\n" - f"Note: these fields are post-signature fill and may be more extensive than the orginal. \n" - f"Check input file for the original signature." - ) + + #special case of no longer trying other signatures + #else exception here triggers a retry - mainly needed for Struct9699 or Struct4055 + difference = set(type_fields) - set(tree_fields) + if "uriBaseId" in difference: + tree["uriBaseId"] = "default" + _destructure_dict_1(typegraph, node, tree) + else: + raise MissingFieldException( + f"(Sub)tree is missing fields required by typedef.\n" + f"Expected {type_fields}, found {tree_fields}.\n" + f"Missing {set(type_fields) - set(tree_fields)}\n" + f"Note: these fields are post-signature fill and may be more extensive than the orginal. \n" + f"Check input file for the original signature." + ) else: status_writer.unknown_sarif_parsing_shape["extra_info"] = "type fields {} do not match tree fields {}.".format(type_fields, tree_fields) From b45d868f89d618de2621647f28a26f2b57695fab Mon Sep 17 00:00:00 2001 From: Kristen Newbury Date: Tue, 13 Dec 2022 18:32:34 -0500 Subject: [PATCH 02/23] Update README for CLI usage instructions --- README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md b/README.md index 49d9706..db6f7f4 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,23 @@ The tool was implemented using Python 3.9. +# Sarif format information + + The tool operates on sarif generated by LGTM 1.27.0 (by default) or by the CodeQL CLI (enabled with the -f flag given a value of `CLI`). + + The values that the -f flag accepts are: `LGTM` and `CLI`. + + The CLI versions used against development of the CLI support were: 2.6.3, 2.9.4, and 2.11.4. + + The CLI sarif **MUST** contain one additional property `versionControlProvenance` - which needs to look like: + ``` + "versionControlProvenance": [ + { + "repositoryUri": "https://github.com/testorg/testrepo.git", + "revisionId": "testsha" + } + ``` + # Test Setup This repository includes some test data (in `data`) and uses =git lfs= for storing those test files; installation steps are at [[https://git-lfs.github.com][git-lfs]]; on a mac with homebrew, install it via From efc87d4f08d5709d73f0b433ad32505e100bcaca Mon Sep 17 00:00:00 2001 From: Kristen Newbury Date: Tue, 13 Dec 2022 18:42:45 -0500 Subject: [PATCH 03/23] Update README missing minor syntax --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index db6f7f4..6c7850e 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ "repositoryUri": "https://github.com/testorg/testrepo.git", "revisionId": "testsha" } + ] ``` # Test Setup From bbeba14dec73fa5c1f89d78bf1288df4a7b38bbd Mon Sep 17 00:00:00 2001 From: Kristen Newbury Date: Tue, 13 Dec 2022 20:13:13 -0500 Subject: [PATCH 04/23] Bugfix CLI signature merge mistake --- sarif_cli/signature_single_CLI.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sarif_cli/signature_single_CLI.py b/sarif_cli/signature_single_CLI.py index fd8dfa5..d773cf2 100644 --- a/sarif_cli/signature_single_CLI.py +++ b/sarif_cli/signature_single_CLI.py @@ -111,7 +111,6 @@ struct_graph_CLI = ( ('precision', 'String'), ('problem.severity', 'String'), ('security-severity', 'String'), - ('severity', 'String'), ('sub-severity', 'String'), ('tags', 'Array7069'))), ( 'Struct7100', From 62ec56948edbbab60fa1066a4db721a679f74ce5 Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Tue, 11 Jul 2023 10:45:15 -0700 Subject: [PATCH 05/23] WIP: debug missing field propagation for automationDetails.id Create SARIF files with and without automationDetails.id for examination. --- data/build-multiple-sarifs.sh | 120 ++++++++++++++++++++++++++++++++++ notes/README.org | 64 ++++++++++++++++++ scripts/grab.sh | 40 ++++++++++++ scripts/table-tests.sh | 9 +++ 4 files changed, 233 insertions(+) create mode 100644 data/build-multiple-sarifs.sh create mode 100644 scripts/grab.sh diff --git a/data/build-multiple-sarifs.sh b/data/build-multiple-sarifs.sh new file mode 100644 index 0000000..9b3c932 --- /dev/null +++ b/data/build-multiple-sarifs.sh @@ -0,0 +1,120 @@ +# +#* Following are the steps needed to build a codeql db and various SARIF analyses. +# +echo '$0: Interactive use only' +exit 1 + +#* Where are we? +codeql --version + +#* Get repo +git clone git@github.com:hohn/codeql-dataflow-sql-injection.git +cd codeql-dataflow-sql-injection/ + +#* Build vanilla DB +cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection +rm -fR sqlidb +codeql database create --language=cpp -s . -j 8 -v sqlidb --command='./build.sh' +ls sqlidb + + +#* Pack compatibility with CLI +# Note workaround to avoid using --additional-packs +function codeql-complib() { + if [ -z "$1" ]; then + echo "Usage: codeql-complib " + return 1 + fi + curl --silent https://raw.githubusercontent.com/github/codeql/codeql-cli/v$(codeql version --format=json | jq -r .version)/$1/ql/lib/qlpack.yml | grep version | cut -d':' -f2 | sed 's/^[ ]*//' +} + +: ' +0:$ codeql-complib cpp +0.4.6 + +Put the version into the qlpack: +... +dependencies: + codeql/cpp-all: ^0.4.6 +... + +Then + codeql pack install +followed by + codeql database analyze +without + --additional-packs $HOME/local/codeql-v2.11.6/ \ + + +Or create the qlpack file using commands: + codeql pack init foo + codeql pack add --dir=foo codeql/cpp-all@"$(codeql-complib cpp)" + +' + +#* Install packs +cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection +rm -f *lock* +codeql pack install + +#* Run the analyze command's plain version +cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection + +# Note workaround for using --additional-packs +if false +then + source ../scripts/grab.sh + grab v2.11.6 osx64 $HOME/local + + codeql database analyze \ + -v \ + --ram=14000 \ + -j12 \ + --rerun \ + --format=sarif-latest \ + --additional-packs $HOME/local/codeql-v2.11.6/ \ + --output sqlidb-0.sarif \ + -- \ + sqlidb \ + SqlInjection.ql +fi + +codeql database analyze \ + -v \ + --ram=14000 \ + -j12 \ + --rerun \ + --format=sarif-latest \ + --output sqlidb-0.sarif \ + -- \ + sqlidb \ + SqlInjection.ql + +# This field should not be there: +grep automationDetails sqlidb-0.sarif + +#* Run the analyze command with options +# but don't rerun the analysis. We just want another SARIF file. +# +cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection + +codeql database analyze \ + -v \ + --sarif-category mast-issue \ + --ram=14000 \ + -j12 \ + --format=sarif-latest \ + --output sqlidb-1.sarif \ + -- \ + sqlidb \ + SqlInjection.ql + +# Now it's present: +grep -A2 automationDetails sqlidb-1.sarif + +: ' + "automationDetails" : { + "id" : "mast-issue/" + }, +' + diff --git a/notes/README.org b/notes/README.org index 2f7e918..e716d69 100644 --- a/notes/README.org +++ b/notes/README.org @@ -4,6 +4,9 @@ Think of it as staging for [[../docs]]. + Short notes start as sections in this README. They will be moved if separate + file make more sense. + ** The typegraphs The type graph files are derived from a sarif input file, with various options controlling output. @@ -27,3 +30,64 @@ ../../../bin/sarif-to-dot -td -nuf results.sarif | dot -Tpdf > typegraph-tdnuf.pdf #+END_SRC + +** The automationDetails.id + The =automationDetails.id= entry is produced by CodeQL when using the + =--sarif-category= flag. + + Using + #+BEGIN_SRC text + 0:$ codeql --version + CodeQL command-line toolchain release 2.12.6. + #+END_SRC + + and running + #+BEGIN_SRC sh + cd ../data/codeql-dataflow-sql-injection/ && + sarif-extract-scans-runner - > /dev/null < /dev/null < Date: Tue, 11 Jul 2023 20:25:16 -0700 Subject: [PATCH 06/23] Script to add versionControlProvenance --- bin/sarif-insert-vcp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100755 bin/sarif-insert-vcp diff --git a/bin/sarif-insert-vcp b/bin/sarif-insert-vcp new file mode 100755 index 0000000..52fc772 --- /dev/null +++ b/bin/sarif-insert-vcp @@ -0,0 +1,19 @@ +#!/bin/sh +# Add the versionControlProvenance key to a SARIF file +# usage: $0 file +uri=vcp-no-uri +revid=vcp-no-revid +jq ' {"$schema" : ."$schema", + "version" : .version, + "runs" : [ .runs | .[] +| ( .versionControlProvenance |= +[ + { + "repositoryUri": "'$uri'", + "revisionId": "'$revid'" + } +] +) ] +} +' $1 + From dc8a4929fae74c8878fb236c83a0181a03c2cec8 Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Tue, 11 Jul 2023 20:26:40 -0700 Subject: [PATCH 07/23] wip: notes cleanup --- notes/README.org | 83 +++++++++++++++++------------------------------ notes/l3style.css | 28 +++++++++------- 2 files changed, 47 insertions(+), 64 deletions(-) diff --git a/notes/README.org b/notes/README.org index e716d69..ac00db4 100644 --- a/notes/README.org +++ b/notes/README.org @@ -1,3 +1,16 @@ +# -*- mode: org; org-confirm-babel-evaluate: nil; coding: utf-8 -*- +#+OPTIONS: org-confirm-babel-evaluate:nil +#+LANGUAGE: en +#+TEXT: +#+OPTIONS: ^:{} H:2 num:t \n:nil @:t ::t |:t ^:nil f:t *:t TeX:t LaTeX:t skip:nil p:nil +#+OPTIONS: toc:nil +#+HTML_HEAD: +#+HTML:
+#+TOC: headlines 2 insert TOC here, with two headline levels +#+HTML:
+# +#+HTML:
+ * The notes directory This directory is for notes that may be useful, but aren't complete enough to serve as documentation in their current state. @@ -35,59 +48,23 @@ The =automationDetails.id= entry is produced by CodeQL when using the =--sarif-category= flag. - Using - #+BEGIN_SRC text - 0:$ codeql --version - CodeQL command-line toolchain release 2.12.6. + The prerequisites for tracing its flow through the tools is started in + [[../data/build-multiple-sarifs.sh]] + + #+BEGIN_SRC sh :session shared :results output + cd ~/local/sarif-cli/ && ag -l automationDetails |cat #+END_SRC - and running - #+BEGIN_SRC sh - cd ../data/codeql-dataflow-sql-injection/ && - sarif-extract-scans-runner - > /dev/null < + diff --git a/notes/l3style.css b/notes/l3style.css index 9b71bbd..b508a59 100644 --- a/notes/l3style.css +++ b/notes/l3style.css @@ -1,3 +1,7 @@ +:root { + --margin-left: 40%; + --body-width: 60%; +} /* The sum of width and margin percentages must not exceed 100.*/ div#toc { @@ -8,30 +12,33 @@ div#toc { /* OR */ /* use a fixed-position toc */ position: fixed; - top: 80px; + top: 8px; left: 0px; /* match toc, org-content, postamble */ - width: 26%; + width: var(--margin-left); margin-right: 1%; margin-left: 1%; + + overflow-y: scroll; + height: calc(100% - 10px); + } div#org-content { float: right; - width: 70%; + width: var(--body-width); /* match toc, org-content, postamble */ - margin-left: 28%; + margin-left: var(--margin-left); } div#postamble { float: right; - width: 70%; + width: var(--body-width); /* match toc, org-content, postamble */ - margin-left: 28%; + margin-left: var(--margin-left); } - p.author { clear: both; font-size: 1em; @@ -107,9 +114,9 @@ h1 { color: #cc8c00; /* padding-top: 5px; */ border-bottom: 2px solid #aaa; - width: 70%; - /* match toc, org-content, postamble */ - margin-left: 28%; /* Align with div#content */ + width: var(--body-width); + /* match toc, org-content, postamble */ + width: var(--margin-left); /* Align with div#content */ } h2 { @@ -167,4 +174,3 @@ td, th { vertical-align: top; border: 1pt solid #ADB9CC; } - From 742392338e8bfe71f9af45d95233e6e1081af2b4 Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Tue, 11 Jul 2023 20:27:59 -0700 Subject: [PATCH 08/23] wip: finally get CSV; use script to insert versionControlProvenance --- data/build-multiple-sarifs.sh | 62 ++++++++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 5 deletions(-) diff --git a/data/build-multiple-sarifs.sh b/data/build-multiple-sarifs.sh index 9b3c932..fb49a4d 100644 --- a/data/build-multiple-sarifs.sh +++ b/data/build-multiple-sarifs.sh @@ -4,6 +4,15 @@ echo '$0: Interactive use only' exit 1 +#* What can we use? +gh codeql list-versions + +#* History +open https://github.com/github/codeql-cli-binaries/blob/HEAD/CHANGELOG.md + +#* Choose +gh codeql set-version v2.9.4 + #* Where are we? codeql --version @@ -17,7 +26,6 @@ rm -fR sqlidb codeql database create --language=cpp -s . -j 8 -v sqlidb --command='./build.sh' ls sqlidb - #* Pack compatibility with CLI # Note workaround to avoid using --additional-packs function codeql-complib() { @@ -30,15 +38,15 @@ function codeql-complib() { : ' 0:$ codeql-complib cpp -0.4.6 +0.2.3 Put the version into the qlpack: ... dependencies: - codeql/cpp-all: ^0.4.6 + codeql/cpp-all: ^0.2.3 ... -Then +Then follow the rest; that is codeql pack install followed by codeql database analyze @@ -111,10 +119,54 @@ codeql database analyze \ # Now it's present: grep -A2 automationDetails sqlidb-1.sarif - : ' "automationDetails" : { "id" : "mast-issue/" }, ' +# Follow the installation in sarif-cli/README.md. + +#* Verify versionControlProvenance location +jq '.runs | .[] | .versionControlProvenance' \ + ~/local/sarif-cli/data/treeio/test_set_1.sarif + +#* Insert versionControlProvenance +cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection +sarif-insert-vcp sqlidb-0.sarif > sqlidb-0.1.sarif + +#* Get CSV. +cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection +sarif-extract-scans-runner --input-signature CLI - > /dev/null < sqlidb-1.1.sarif + +#* Get CSV. +cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection +sarif-extract-scans-runner --input-signature CLI - > /dev/null < Date: Wed, 12 Jul 2023 17:04:23 -0700 Subject: [PATCH 09/23] wip: debug and get automationDetails into CSV output --- data/build-multiple-sarifs.sh | 3 - notes/README.org | 265 +++++++++++++++++++++++++++++++--- sarif_cli/columns.py | 5 +- sarif_cli/scan_tables.py | 8 +- sarif_cli/signature.py | 6 +- sarif_cli/table_joins_CLI.py | 4 + 6 files changed, 267 insertions(+), 24 deletions(-) diff --git a/data/build-multiple-sarifs.sh b/data/build-multiple-sarifs.sh index fb49a4d..23ad2f1 100644 --- a/data/build-multiple-sarifs.sh +++ b/data/build-multiple-sarifs.sh @@ -167,6 +167,3 @@ head -4 sqlidb-1.1.sarif.csv #* Check CSV output ls -la sqlidb-1.1* find sqlidb-1.1.sarif.scantables -print - - - diff --git a/notes/README.org b/notes/README.org index ac00db4..ef6ebd5 100644 --- a/notes/README.org +++ b/notes/README.org @@ -2,11 +2,11 @@ #+OPTIONS: org-confirm-babel-evaluate:nil #+LANGUAGE: en #+TEXT: -#+OPTIONS: ^:{} H:2 num:t \n:nil @:t ::t |:t ^:nil f:t *:t TeX:t LaTeX:t skip:nil p:nil +#+OPTIONS: ^:{} H:3 num:t \n:nil @:t ::t |:t ^:nil f:t *:t TeX:t LaTeX:t skip:nil p:nil #+OPTIONS: toc:nil #+HTML_HEAD: #+HTML:
-#+TOC: headlines 2 insert TOC here, with two headline levels +#+TOC: headlines 3 insert TOC here, with two headline levels #+HTML:
# #+HTML:
@@ -44,27 +44,258 @@ #+END_SRC -** The automationDetails.id +** Debugging the absence of automationDetails.id The =automationDetails.id= entry is produced by CodeQL when using the =--sarif-category= flag. The prerequisites for tracing its flow through the tools is started in [[../data/build-multiple-sarifs.sh]] - #+BEGIN_SRC sh :session shared :results output - cd ~/local/sarif-cli/ && ag -l automationDetails |cat + For testing the following is injected into =sqlidb-1.1.sarif=. + #+BEGIN_SRC text + : ' + "automationDetails" : { + "id" : "mast-issue/" + }, + ' + #+END_SRC + +*** Add repl as appropriate, then examine. + Make sure the input is correct + #+BEGIN_SRC sh :session shared :results output :eval never-export + cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection + grep -A2 automationDetails sqlidb-1.1.sarif + #+END_SRC - #+RESULTS: - : notes/README.org - : notes/README.html - : scripts/table-tests.sh - : sarif_cli/signature_single_CLI.py - : sarif_cli/table_joins_CLI.py - : sarif_cli/scan_tables.py - : sarif_cli/signature.py - : - : hohn@gh-hohn ~/local/sarif-cli - -#+HTML:
+ #+RESULTS: + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection + : "automationDetails" : { + : "id" : "mast-issue/" + : }, + : + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection + +*** Create the CSV + #+BEGIN_SRC sh :session shared :results output :eval never-export + source ~/local/sarif-cli/.venv/bin/activate + cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection + sarif-extract-scans-runner --input-signature CLI - > /dev/null < > (.venv) + hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection + #+end_example + + #+BEGIN_SRC sh :session shared :results output :eval never-export + cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection + ls -la sqlidb-1.1* + find sqlidb-1.1.sarif.scantables -print + #+END_SRC + + #+RESULTS: + #+begin_example + hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection + -rw-r--r-- 1 hohn staff 8.2K Jul 11 19:25 sqlidb-1.1.sarif + -rw-r--r-- 1 hohn staff 326 Jul 12 16:39 sqlidb-1.1.sarif.csv + -rw-r--r-- 1 hohn staff 72 Jul 12 16:39 sqlidb-1.1.sarif.scanspec + + sqlidb-1.1.sarif.scantables: + total 16K + drwxr-xr-x 6 hohn staff 192 Jul 12 16:39 ./ + drwxr-xr-x 43 hohn staff 1.4K Jul 12 16:39 ../ + -rw-r--r-- 1 hohn staff 622 Jul 12 16:39 codeflows.csv + -rw-r--r-- 1 hohn staff 165 Jul 12 16:39 projects.csv + -rw-r--r-- 1 hohn staff 589 Jul 12 16:39 results.csv + -rw-r--r-- 1 hohn staff 343 Jul 12 16:39 scans.csv + (.venv) + hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection + sqlidb-1.1.sarif.scantables + sqlidb-1.1.sarif.scantables/codeflows.csv + sqlidb-1.1.sarif.scantables/scans.csv + sqlidb-1.1.sarif.scantables/results.csv + sqlidb-1.1.sarif.scantables/projects.csv + (.venv) + hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection + #+end_example + +*** Check if =automationDetails= or its value is in output + #+BEGIN_SRC sh :session shared :results output :eval never-export + cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + ag automationDetails | cat + #+END_SRC + + #+RESULTS: + : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + : projects.csv:1:"id","project_name","creation_date","repo_url","primary_language","languages_analyzed","automationDetails" + : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + + #+RESULTS: + : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + + #+RESULTS: + : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + + See if the magic value is present + #+BEGIN_SRC sh :session shared :results output :eval never-export + cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + ag mast-issue |cat + #+END_SRC + + #+RESULTS: + : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + : projects.csv:2:490227419655596076,"vcp-no-uri","1970-01-01","vcp-no-uri","unknown","unknown","mast-issue/" + : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + + #+RESULTS: + : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/data/codeql-dataflow-sql-injection/sqlidb-1.1.sarif.scantables + +*** Nothing is in the output, so trace execution to see where it's dropped + #+BEGIN_SRC sh :session shared :results output :eval never-export + cd ~/local/sarif-cli/notes && ag -l automationDetails ../sarif_cli |cat + #+END_SRC + + #+RESULTS: + : ../sarif_cli/scan_tables.py + : ../sarif_cli/signature_single_CLI.py + : ../sarif_cli/table_joins_CLI.py + : ../sarif_cli/signature.py + : (.venv) + : hohn@gh-hohn ~/local/sarif-cli/notes + +*** Trace the call chain + Trace the call chain to one of + : ../sarif_cli/scan_tables.py + : ../sarif_cli/table_joins_CLI.py + : ../sarif_cli/signature.py + + Entry is + #+BEGIN_SRC sh :session shared :results output :eval never-export + sarif-extract-scans-runner --input-signature CLI - > /dev/null < diff --git a/sarif_cli/columns.py b/sarif_cli/columns.py index 71d8dda..2bbaa14 100644 --- a/sarif_cli/columns.py +++ b/sarif_cli/columns.py @@ -46,7 +46,8 @@ columns = { "creation_date", "repo_url" , "primary_language" , - "languages_analyzed" + "languages_analyzed", + "automationDetails", ], "codeflows" : [ "codeflow_id", @@ -62,4 +63,4 @@ columns = { "uriBaseId", "message" ] -} \ No newline at end of file +} diff --git a/sarif_cli/scan_tables.py b/sarif_cli/scan_tables.py index 28d02bc..69889d9 100644 --- a/sarif_cli/scan_tables.py +++ b/sarif_cli/scan_tables.py @@ -70,6 +70,7 @@ class ScanTablesTypes: "repo_url" : pd.StringDtype(), "primary_language" : pd.StringDtype(), "languages_analyzed" : pd.StringDtype(), + "automationDetails" : pd.StringDtype(), } # @@ -98,11 +99,16 @@ def joins_for_projects(basetables, external_info): "creation_date" : pd.Timestamp(0.0, unit='s'), # TODO: external info "repo_url" : repoUri, "primary_language" : b.project['semmle.sourceLanguage'][0], - "languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage'])) + "languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage'])), + "automationDetails" : extra, }, index=[0]) # Force all column types to ensure appropriate formatting res1 = res.astype(ScanTablesTypes.projects).reset_index(drop=True) + # XX: automationDetails? + import IPython + IPython.embed(header="spot 11") + # return res1 # diff --git a/sarif_cli/signature.py b/sarif_cli/signature.py index e3d3f1e..100fce7 100644 --- a/sarif_cli/signature.py +++ b/sarif_cli/signature.py @@ -256,7 +256,11 @@ def fillsig_dict(args, elem, context): if 'results' in elem.keys() and not 'automationDetails' in elem.keys(): #want this to be blank if not present- ie no submodule info added/no sarif-category used - full_elem['automationDetails'] = {'id' : ""} + full_elem['automationDetails'] = {'id' : "no-value-for-ad"} + # XX: automationDetails? + import IPython + IPython.embed(header="spot 2") + # if {'locations', 'message', 'partialFingerprints', 'ruleId', 'ruleIndex'}.issubset(elem.keys()): diff --git a/sarif_cli/table_joins_CLI.py b/sarif_cli/table_joins_CLI.py index 94f9af9..ef6cf84 100644 --- a/sarif_cli/table_joins_CLI.py +++ b/sarif_cli/table_joins_CLI.py @@ -336,6 +336,10 @@ def joins_for_project_single(tgraph): .drop(columns=['automationDetails', 'struct_id']) .rename(columns={"id": "automationDetails"})) # + # XX: automationDetails? + import IPython + IPython.embed(header="spot 3") + # #newlines there or not - handle if 'newlineSequences' in project_df_temp1: project_df_temp2 = project_df_temp1.drop(columns=['newlineSequences']) From 7d4e5026a904b2619e5185a372cc9d1297f925a2 Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Thu, 13 Jul 2023 12:52:59 -0700 Subject: [PATCH 10/23] Add note about bin/sarif-insert-vcp --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 4928713..156dd99 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,12 @@ ] ``` + The script + + bin/sarif-insert-vcp + + will add that entry to a SARIF file. + # Test Setup This repository includes some test data (in `data`) and uses =git lfs= for storing those test files; installation steps are at [[https://git-lfs.github.com][git-lfs]]; on a mac with homebrew, install it via From f1a70dd02323e119eb1d6fbae2c1a0e6b629dfd0 Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Thu, 13 Jul 2023 15:55:28 -0700 Subject: [PATCH 11/23] wip: remove extraneous slash --- .../sqlidb-1.sarif | 219 ++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 data/codeql-dataflow-sql-injection/sqlidb-1.sarif diff --git a/data/codeql-dataflow-sql-injection/sqlidb-1.sarif b/data/codeql-dataflow-sql-injection/sqlidb-1.sarif new file mode 100644 index 0000000..1e9d02c --- /dev/null +++ b/data/codeql-dataflow-sql-injection/sqlidb-1.sarif @@ -0,0 +1,219 @@ +{ + "$schema" : "https://json.schemastore.org/sarif-2.1.0.json", + "version" : "2.1.0", + "runs" : [ { + "tool" : { + "driver" : { + "name" : "CodeQL", + "organization" : "GitHub", + "semanticVersion" : "2.9.4", + "rules" : [ { + "id" : "cpp/SQLIVulnerable", + "name" : "cpp/SQLIVulnerable", + "shortDescription" : { + "text" : "SQLI Vulnerability" + }, + "fullDescription" : { + "text" : "Using untrusted strings in a sql query allows sql injection attacks." + }, + "defaultConfiguration" : { + "enabled" : true, + "level" : "warning" + }, + "properties" : { + "description" : "Using untrusted strings in a sql query allows sql injection attacks.", + "id" : "cpp/SQLIVulnerable", + "kind" : "path-problem", + "name" : "SQLI Vulnerability", + "problem.severity" : "warning" + } + } ] + }, + "extensions" : [ { + "name" : "legacy-upgrades", + "semanticVersion" : "0.0.0", + "locations" : [ { + "uri" : "file:///Users/hohn/.local/share/gh/extensions/gh-codeql/dist/release/v2.9.4/legacy-upgrades/", + "description" : { + "text" : "The QL pack root directory." + } + }, { + "uri" : "file:///Users/hohn/.local/share/gh/extensions/gh-codeql/dist/release/v2.9.4/legacy-upgrades/qlpack.yml", + "description" : { + "text" : "The QL pack definition file." + } + } ] + }, { + "name" : "sample/cpp-sql-injection", + "semanticVersion" : "0.0.1", + "locations" : [ { + "uri" : "file:///Users/hohn/local/sarif-cli/data/codeql-dataflow-sql-injection/", + "description" : { + "text" : "The QL pack root directory." + } + }, { + "uri" : "file:///Users/hohn/local/sarif-cli/data/codeql-dataflow-sql-injection/qlpack.yml", + "description" : { + "text" : "The QL pack definition file." + } + } ] + } ] + }, + "artifacts" : [ { + "location" : { + "uri" : "add-user.c", + "uriBaseId" : "%SRCROOT%", + "index" : 0 + } + } ], + "results" : [ { + "ruleId" : "cpp/SQLIVulnerable", + "ruleIndex" : 0, + "rule" : { + "id" : "cpp/SQLIVulnerable", + "index" : 0 + }, + "message" : { + "text" : "Possible SQL injection" + }, + "locations" : [ { + "physicalLocation" : { + "artifactLocation" : { + "uri" : "add-user.c", + "uriBaseId" : "%SRCROOT%", + "index" : 0 + }, + "region" : { + "startLine" : 84, + "startColumn" : 27, + "endColumn" : 32 + } + } + } ], + "partialFingerprints" : { + "primaryLocationLineHash" : "9a8bc91bbc363391:1", + "primaryLocationStartColumnFingerprint" : "22" + }, + "codeFlows" : [ { + "threadFlows" : [ { + "locations" : [ { + "location" : { + "physicalLocation" : { + "artifactLocation" : { + "uri" : "add-user.c", + "uriBaseId" : "%SRCROOT%", + "index" : 0 + }, + "region" : { + "startLine" : 52, + "startColumn" : 32, + "endColumn" : 35 + } + }, + "message" : { + "text" : "ref arg buf" + } + } + }, { + "location" : { + "physicalLocation" : { + "artifactLocation" : { + "uri" : "add-user.c", + "uriBaseId" : "%SRCROOT%", + "index" : 0 + }, + "region" : { + "startLine" : 60, + "startColumn" : 12, + "endColumn" : 15 + } + }, + "message" : { + "text" : "buf" + } + } + }, { + "location" : { + "physicalLocation" : { + "artifactLocation" : { + "uri" : "add-user.c", + "uriBaseId" : "%SRCROOT%", + "index" : 0 + }, + "region" : { + "startLine" : 93, + "startColumn" : 12, + "endColumn" : 25 + } + }, + "message" : { + "text" : "call to get_user_info" + } + } + }, { + "location" : { + "physicalLocation" : { + "artifactLocation" : { + "uri" : "add-user.c", + "uriBaseId" : "%SRCROOT%", + "index" : 0 + }, + "region" : { + "startLine" : 95, + "startColumn" : 20, + "endColumn" : 24 + } + }, + "message" : { + "text" : "info" + } + } + }, { + "location" : { + "physicalLocation" : { + "artifactLocation" : { + "uri" : "add-user.c", + "uriBaseId" : "%SRCROOT%", + "index" : 0 + }, + "region" : { + "startLine" : 68, + "startColumn" : 31, + "endColumn" : 35 + } + }, + "message" : { + "text" : "info" + } + } + }, { + "location" : { + "physicalLocation" : { + "artifactLocation" : { + "uri" : "add-user.c", + "uriBaseId" : "%SRCROOT%", + "index" : 0 + }, + "region" : { + "startLine" : 84, + "startColumn" : 27, + "endColumn" : 32 + } + }, + "message" : { + "text" : "query" + } + } + } ] + } ] + } ] + } ], + "automationDetails" : { + "id" : "mast-issue" + }, + "columnKind" : "utf16CodeUnits", + "properties" : { + "semmle.formatSpecifier" : "sarif-latest" + } + } ] +} From c299321ab8598254c1b032b58898784809806b36 Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Thu, 13 Jul 2023 16:03:01 -0700 Subject: [PATCH 12/23] Remove repls; add scripts/test-vcp.sh --- .../sqlidb-0.sarif | 246 ++++++++++++++++++ sarif_cli/scan_tables.py | 3 - sarif_cli/signature.py | 4 - sarif_cli/table_joins_CLI.py | 4 - scripts/test-vcp.sh | 48 ++++ 5 files changed, 294 insertions(+), 11 deletions(-) create mode 100644 data/codeql-dataflow-sql-injection/sqlidb-0.sarif create mode 100644 scripts/test-vcp.sh diff --git a/data/codeql-dataflow-sql-injection/sqlidb-0.sarif b/data/codeql-dataflow-sql-injection/sqlidb-0.sarif new file mode 100644 index 0000000..47053af --- /dev/null +++ b/data/codeql-dataflow-sql-injection/sqlidb-0.sarif @@ -0,0 +1,246 @@ +{ + "$schema": "https://json.schemastore.org/sarif-2.1.0.json", + "version": "2.1.0", + "runs": [ + { + "tool": { + "driver": { + "name": "CodeQL", + "organization": "GitHub", + "semanticVersion": "2.9.4", + "rules": [ + { + "id": "cpp/SQLIVulnerable", + "name": "cpp/SQLIVulnerable", + "shortDescription": { + "text": "SQLI Vulnerability" + }, + "fullDescription": { + "text": "Using untrusted strings in a sql query allows sql injection attacks." + }, + "defaultConfiguration": { + "enabled": true, + "level": "warning" + }, + "properties": { + "description": "Using untrusted strings in a sql query allows sql injection attacks.", + "id": "cpp/SQLIVulnerable", + "kind": "path-problem", + "name": "SQLI Vulnerability", + "problem.severity": "warning" + } + } + ] + }, + "extensions": [ + { + "name": "legacy-upgrades", + "semanticVersion": "0.0.0", + "locations": [ + { + "uri": "file:///Users/hohn/.local/share/gh/extensions/gh-codeql/dist/release/v2.9.4/legacy-upgrades/", + "description": { + "text": "The QL pack root directory." + } + }, + { + "uri": "file:///Users/hohn/.local/share/gh/extensions/gh-codeql/dist/release/v2.9.4/legacy-upgrades/qlpack.yml", + "description": { + "text": "The QL pack definition file." + } + } + ] + }, + { + "name": "sample/cpp-sql-injection", + "semanticVersion": "0.0.1", + "locations": [ + { + "uri": "file:///Users/hohn/local/sarif-cli/data/codeql-dataflow-sql-injection/", + "description": { + "text": "The QL pack root directory." + } + }, + { + "uri": "file:///Users/hohn/local/sarif-cli/data/codeql-dataflow-sql-injection/qlpack.yml", + "description": { + "text": "The QL pack definition file." + } + } + ] + } + ] + }, + "artifacts": [ + { + "location": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + } + } + ], + "results": [ + { + "ruleId": "cpp/SQLIVulnerable", + "ruleIndex": 0, + "rule": { + "id": "cpp/SQLIVulnerable", + "index": 0 + }, + "message": { + "text": "Possible SQL injection" + }, + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 84, + "startColumn": 27, + "endColumn": 32 + } + } + } + ], + "partialFingerprints": { + "primaryLocationLineHash": "9a8bc91bbc363391:1", + "primaryLocationStartColumnFingerprint": "22" + }, + "codeFlows": [ + { + "threadFlows": [ + { + "locations": [ + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 52, + "startColumn": 32, + "endColumn": 35 + } + }, + "message": { + "text": "ref arg buf" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 60, + "startColumn": 12, + "endColumn": 15 + } + }, + "message": { + "text": "buf" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 93, + "startColumn": 12, + "endColumn": 25 + } + }, + "message": { + "text": "call to get_user_info" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 95, + "startColumn": 20, + "endColumn": 24 + } + }, + "message": { + "text": "info" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 68, + "startColumn": 31, + "endColumn": 35 + } + }, + "message": { + "text": "info" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 84, + "startColumn": 27, + "endColumn": 32 + } + }, + "message": { + "text": "query" + } + } + } + ] + } + ] + } + ] + } + ], + "columnKind": "utf16CodeUnits", + "properties": { + "semmle.formatSpecifier": "sarif-latest" + } + } + ] +} diff --git a/sarif_cli/scan_tables.py b/sarif_cli/scan_tables.py index 69889d9..faba273 100644 --- a/sarif_cli/scan_tables.py +++ b/sarif_cli/scan_tables.py @@ -105,9 +105,6 @@ def joins_for_projects(basetables, external_info): # Force all column types to ensure appropriate formatting res1 = res.astype(ScanTablesTypes.projects).reset_index(drop=True) - # XX: automationDetails? - import IPython - IPython.embed(header="spot 11") # return res1 diff --git a/sarif_cli/signature.py b/sarif_cli/signature.py index 100fce7..7302709 100644 --- a/sarif_cli/signature.py +++ b/sarif_cli/signature.py @@ -257,10 +257,6 @@ def fillsig_dict(args, elem, context): if 'results' in elem.keys() and not 'automationDetails' in elem.keys(): #want this to be blank if not present- ie no submodule info added/no sarif-category used full_elem['automationDetails'] = {'id' : "no-value-for-ad"} - # XX: automationDetails? - import IPython - IPython.embed(header="spot 2") - # if {'locations', 'message', 'partialFingerprints', 'ruleId', 'ruleIndex'}.issubset(elem.keys()): diff --git a/sarif_cli/table_joins_CLI.py b/sarif_cli/table_joins_CLI.py index ef6cf84..3859b3e 100644 --- a/sarif_cli/table_joins_CLI.py +++ b/sarif_cli/table_joins_CLI.py @@ -335,10 +335,6 @@ def joins_for_project_single(tgraph): .merge(sf(1111), how="left", left_on='automationDetails', right_on='struct_id', validate="1:m") .drop(columns=['automationDetails', 'struct_id']) .rename(columns={"id": "automationDetails"})) - # - # XX: automationDetails? - import IPython - IPython.embed(header="spot 3") # #newlines there or not - handle if 'newlineSequences' in project_df_temp1: diff --git a/scripts/test-vcp.sh b/scripts/test-vcp.sh new file mode 100644 index 0000000..32afbae --- /dev/null +++ b/scripts/test-vcp.sh @@ -0,0 +1,48 @@ +# +# The automationDetails.id entry is produced by CodeQL when using the +# =--sarif-category= flag. +# +# This is a simple end-to-end test to ensure it appears after CSV conversion. +# + +#* Two databases, one with and one without +# --sarif-category mast-issue +cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection +ls -la sqlidb-0.sarif sqlidb-1.sarif +grep -A2 automationDetails sqlidb-0.sarif sqlidb-1.sarif + +source ~/local/sarif-cli/.venv/bin/activate + +function get-csv() { + #* Insert versionControlProvenance + sarif-insert-vcp $1.sarif > $1.1.sarif + + #* Get CSV. + cd ~/local/sarif-cli/data/codeql-dataflow-sql-injection + sarif-extract-scans-runner --input-signature CLI - > /dev/null < Date: Thu, 13 Jul 2023 16:35:33 -0700 Subject: [PATCH 13/23] Execute test-vcp with tracing --- scripts/test-vcp.sh | 1 + 1 file changed, 1 insertion(+) mode change 100644 => 100755 scripts/test-vcp.sh diff --git a/scripts/test-vcp.sh b/scripts/test-vcp.sh old mode 100644 new mode 100755 index 32afbae..f6b1d27 --- a/scripts/test-vcp.sh +++ b/scripts/test-vcp.sh @@ -1,3 +1,4 @@ +#!/bin/bash -x # # The automationDetails.id entry is produced by CodeQL when using the # =--sarif-category= flag. From 8820186152e476132358aa6c80305ed7299a1f32 Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Thu, 13 Jul 2023 16:46:24 -0700 Subject: [PATCH 14/23] Add sample output for test-vcp --- scripts/test-vcp.sample | 97 +++++++++++++++++++++++++++++++++++++++++ scripts/test-vcp.sh | 3 ++ 2 files changed, 100 insertions(+) create mode 100644 scripts/test-vcp.sample diff --git a/scripts/test-vcp.sample b/scripts/test-vcp.sample new file mode 100644 index 0000000..4404e57 --- /dev/null +++ b/scripts/test-vcp.sample @@ -0,0 +1,97 @@ ++ cd /Users/hohn/local/sarif-cli/data/codeql-dataflow-sql-injection ++ ls -la sqlidb-0.sarif sqlidb-1.sarif +-rw-r--r-- 1 hohn staff 8098 Jul 11 17:15 sqlidb-0.sarif +-rw-r--r-- 1 hohn staff 6392 Jul 13 15:54 sqlidb-1.sarif ++ grep -A2 automationDetails sqlidb-0.sarif sqlidb-1.sarif +sqlidb-1.sarif: "automationDetails" : { +sqlidb-1.sarif- "id" : "mast-issue" +sqlidb-1.sarif- }, ++ source /Users/hohn/local/sarif-cli/.venv/bin/activate +++ deactivate nondestructive +++ '[' -n '' ']' +++ '[' -n '' ']' +++ '[' -n /bin/bash -o -n '' ']' +++ hash -r +++ '[' -n '' ']' +++ unset VIRTUAL_ENV +++ '[' '!' nondestructive = nondestructive ']' +++ VIRTUAL_ENV=/Users/hohn/local/sarif-cli/.venv +++ export VIRTUAL_ENV +++ '[' -n /bin/bash -o -n '' ']' +++ hash -r ++ cd /Users/hohn/local/sarif-cli/data/codeql-dataflow-sql-injection ++ get-csv sqlidb-0 ++ sarif-insert-vcp sqlidb-0.sarif ++ cd /Users/hohn/local/sarif-cli/data/codeql-dataflow-sql-injection ++ sarif-extract-scans-runner --input-signature CLI - ++ cd /Users/hohn/local/sarif-cli/data/codeql-dataflow-sql-injection ++ head -4 sqlidb-0.1.sarif.csv +sarif_file,level,levelcode,message,extra_info +sqlidb-0.1.sarif,WARNING,4,Input sarif contains extra unneccesary properties.,"Extra properties: type fields: ['description', 'kind', 'precision', 'problem.severity', 'security-severity', 'sub-severity', 'tags', 'uri']" +sqlidb-0.1.sarif,SUCCESS,0,File successfully processed., ++ ls -la sqlidb-0.1.sarif sqlidb-0.1.sarif.csv sqlidb-0.1.sarif.scanspec sqlidb-0.1.sarif.scantables +-rw-r--r-- 1 hohn staff 8243 Jul 13 16:42 sqlidb-0.1.sarif +-rw-r--r-- 1 hohn staff 326 Jul 13 16:42 sqlidb-0.1.sarif.csv +-rw-r--r-- 1 hohn staff 72 Jul 13 16:42 sqlidb-0.1.sarif.scanspec + +sqlidb-0.1.sarif.scantables: +total 32 +drwxr-xr-x 6 hohn staff 192 Jul 13 16:31 . +drwxr-xr-x 12 hohn staff 384 Jul 13 16:31 .. +-rw-r--r-- 1 hohn staff 622 Jul 13 16:42 codeflows.csv +-rw-r--r-- 1 hohn staff 205 Jul 13 16:42 projects.csv +-rw-r--r-- 1 hohn staff 589 Jul 13 16:42 results.csv +-rw-r--r-- 1 hohn staff 345 Jul 13 16:42 scans.csv ++ find sqlidb-0.1.sarif.scantables -print +sqlidb-0.1.sarif.scantables +sqlidb-0.1.sarif.scantables/codeflows.csv +sqlidb-0.1.sarif.scantables/scans.csv +sqlidb-0.1.sarif.scantables/results.csv +sqlidb-0.1.sarif.scantables/projects.csv ++ get-csv sqlidb-1 ++ sarif-insert-vcp sqlidb-1.sarif ++ cd /Users/hohn/local/sarif-cli/data/codeql-dataflow-sql-injection ++ sarif-extract-scans-runner --input-signature CLI - ++ cd /Users/hohn/local/sarif-cli/data/codeql-dataflow-sql-injection ++ head -4 sqlidb-1.1.sarif.csv +sarif_file,level,levelcode,message,extra_info +sqlidb-1.1.sarif,WARNING,4,Input sarif contains extra unneccesary properties.,"Extra properties: type fields: ['description', 'kind', 'precision', 'problem.severity', 'security-severity', 'sub-severity', 'tags', 'uri']" +sqlidb-1.1.sarif,SUCCESS,0,File successfully processed., ++ ls -la sqlidb-1.1.sarif sqlidb-1.1.sarif.csv sqlidb-1.1.sarif.scanspec sqlidb-1.1.sarif.scantables +-rw-r--r-- 1 hohn staff 8308 Jul 13 16:42 sqlidb-1.1.sarif +-rw-r--r-- 1 hohn staff 326 Jul 13 16:42 sqlidb-1.1.sarif.csv +-rw-r--r-- 1 hohn staff 72 Jul 13 16:42 sqlidb-1.1.sarif.scanspec + +sqlidb-1.1.sarif.scantables: +total 32 +drwxr-xr-x 6 hohn staff 192 Jul 13 16:31 . +drwxr-xr-x 12 hohn staff 384 Jul 13 16:31 .. +-rw-r--r-- 1 hohn staff 622 Jul 13 16:42 codeflows.csv +-rw-r--r-- 1 hohn staff 200 Jul 13 16:42 projects.csv +-rw-r--r-- 1 hohn staff 589 Jul 13 16:42 results.csv +-rw-r--r-- 1 hohn staff 345 Jul 13 16:42 scans.csv ++ find sqlidb-1.1.sarif.scantables -print +sqlidb-1.1.sarif.scantables +sqlidb-1.1.sarif.scantables/codeflows.csv +sqlidb-1.1.sarif.scantables/scans.csv +sqlidb-1.1.sarif.scantables/results.csv +sqlidb-1.1.sarif.scantables/projects.csv ++ check-flag 'sqlidb-0*' ++ ag -C1 mast-issue sqlidb-0.1.sarif sqlidb-0.1.sarif.csv sqlidb-0.1.sarif.scanspec sqlidb-0.1.sarif.scantables sqlidb-0.sarif ++ ag -C1 automationDetails sqlidb-0.1.sarif sqlidb-0.1.sarif.csv sqlidb-0.1.sarif.scanspec sqlidb-0.1.sarif.scantables sqlidb-0.sarif +sqlidb-0.1.sarif.scantables/projects.csv:1:"id","project_name","creation_date","repo_url","primary_language","languages_analyzed","automationDetails" +sqlidb-0.1.sarif.scantables/projects.csv:2-10761451173100907203,"vcp-no-uri","1970-01-01","vcp-no-uri","unknown","unknown","no-value-for-ad" ++ check-flag 'sqlidb-1.1*' ++ ag -C1 mast-issue sqlidb-1.1.sarif sqlidb-1.1.sarif.csv sqlidb-1.1.sarif.scanspec sqlidb-1.1.sarif.scantables +sqlidb-1.1.sarif:240- "automationDetails": { +sqlidb-1.1.sarif:241: "id": "mast-issue" +sqlidb-1.1.sarif:242- }, +sqlidb-1.1.sarif.scantables/projects.csv:1-"id","project_name","creation_date","repo_url","primary_language","languages_analyzed","automationDetails" +sqlidb-1.1.sarif.scantables/projects.csv:2:16460100493790735471,"vcp-no-uri","1970-01-01","vcp-no-uri","unknown","unknown","mast-issue" +sqlidb-1.1.sarif.scantables/projects.csv:3- ++ ag -C1 automationDetails sqlidb-1.1.sarif sqlidb-1.1.sarif.csv sqlidb-1.1.sarif.scanspec sqlidb-1.1.sarif.scantables +sqlidb-1.1.sarif:239- ], +sqlidb-1.1.sarif:240: "automationDetails": { +sqlidb-1.1.sarif:241- "id": "mast-issue" +sqlidb-1.1.sarif.scantables/projects.csv:1:"id","project_name","creation_date","repo_url","primary_language","languages_analyzed","automationDetails" +sqlidb-1.1.sarif.scantables/projects.csv:2-16460100493790735471,"vcp-no-uri","1970-01-01","vcp-no-uri","unknown","unknown","mast-issue" diff --git a/scripts/test-vcp.sh b/scripts/test-vcp.sh index f6b1d27..0cb3ccd 100755 --- a/scripts/test-vcp.sh +++ b/scripts/test-vcp.sh @@ -4,7 +4,10 @@ # =--sarif-category= flag. # # This is a simple end-to-end test to ensure it appears after CSV conversion. +# Run via +# ./test-vcp.sh > test-vcp.out 2>&1 # +# An output sample -- not suitable for automatic testing yet -- is in test-vcp.sample #* Two databases, one with and one without # --sarif-category mast-issue From ebeaced0f474e7be37d17e35d206efe2358adfff Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Mon, 17 Jul 2023 10:30:35 -0700 Subject: [PATCH 15/23] Remove automationDetails from CSV output This reverses commit 68b43e05 to keep the CSV compatible with prior output --- sarif_cli/columns.py | 2 +- sarif_cli/scan_tables.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sarif_cli/columns.py b/sarif_cli/columns.py index 2bbaa14..3e37266 100644 --- a/sarif_cli/columns.py +++ b/sarif_cli/columns.py @@ -47,7 +47,7 @@ columns = { "repo_url" , "primary_language" , "languages_analyzed", - "automationDetails", + # "automationDetails", ], "codeflows" : [ "codeflow_id", diff --git a/sarif_cli/scan_tables.py b/sarif_cli/scan_tables.py index faba273..e5c382b 100644 --- a/sarif_cli/scan_tables.py +++ b/sarif_cli/scan_tables.py @@ -70,7 +70,7 @@ class ScanTablesTypes: "repo_url" : pd.StringDtype(), "primary_language" : pd.StringDtype(), "languages_analyzed" : pd.StringDtype(), - "automationDetails" : pd.StringDtype(), + # "automationDetails" : pd.StringDtype(), } # From 3dfb297612414146497d43f141cb5a068f0d8c58 Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Thu, 20 Jul 2023 22:39:10 -0700 Subject: [PATCH 16/23] Make project_name unique by adding automationDetails to it --- sarif_cli/scan_tables.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sarif_cli/scan_tables.py b/sarif_cli/scan_tables.py index e5c382b..95cea09 100644 --- a/sarif_cli/scan_tables.py +++ b/sarif_cli/scan_tables.py @@ -89,13 +89,14 @@ def joins_for_projects(basetables, external_info): # if the sarif does have versionControlProvenance if "repositoryUri" in b.project: repoUri = b.project.repositoryUri[0] + project_name = b.project.repositoryUri[0] + "-" + extra e.project_id = hash.hash_unique((repoUri+extra).encode()) else: repoUri = "unknown" res = pd.DataFrame(data={ "id" : e.project_id, - "project_name" : repoUri, + "project_name" : project_name, "creation_date" : pd.Timestamp(0.0, unit='s'), # TODO: external info "repo_url" : repoUri, "primary_language" : b.project['semmle.sourceLanguage'][0], From 5a8b4a33a3c5d935ccd027d45dde4176fdbf78cb Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Wed, 26 Jul 2023 12:13:37 -0700 Subject: [PATCH 17/23] Add script to test all steps using different codeql cli versions The script build-multiple-codeql-versions.sh is for manual testing and updating. It may be automated for testing. --- build-multiple-codeql-versions.sh | 111 ++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 build-multiple-codeql-versions.sh diff --git a/build-multiple-codeql-versions.sh b/build-multiple-codeql-versions.sh new file mode 100644 index 0000000..3783544 --- /dev/null +++ b/build-multiple-codeql-versions.sh @@ -0,0 +1,111 @@ +# +#* Following are the steps needed to build a codeql db using different versions of +# the codeql cli +# +echo '$0: Interactive use only' +exit 1 + +#* Use virtual environment. See README for setup. +source ~/local/sarif-cli/.venv/bin/activate + +#* What can we use? +gh codeql list-versions + +#* History +open https://github.com/github/codeql-cli-binaries/blob/HEAD/CHANGELOG.md + +#* Get repo +cd ~/local/sarif-cli +git clone git@github.com:hohn/codeql-dataflow-sql-injection.git +cd codeql-dataflow-sql-injection/ + +#* Choose +v2.14.0 +v2.13.5 +v2.13.4 +v2.13.3 +v2.13.1 +v2.13.0 +v2.12.7 +v2.12.6 +v2.11.6 +v2.10.5 +v2.9.4 + +CLI_VERSION=v2.9.4 +CLI_VERSION=v2.12.7 +gh codeql set-version $CLI_VERSION + +#* Build vanilla DB +cd ~/local/sarif-cli/codeql-dataflow-sql-injection +rm -fR sqlidb +codeql database create --language=cpp -s . -j 8 -v sqlidb --command='./build.sh' +cp -r sqlidb sqlidb-$CLI_VERSION + +#* Pack compatibility with CLI +function codeql-complib() { + if [ -z "$1" ]; then + echo "Usage: codeql-complib " + return 1 + fi + curl --silent https://raw.githubusercontent.com/github/codeql/codeql-cli/v$(codeql version --format=json | jq -r .version)/$1/ql/lib/qlpack.yml | grep version | cut -d':' -f2 | sed 's/^[ ]*//' +} + +# Create the qlpack file using commands: +cd ~/local/sarif-cli +#: Bug: drops the codeql- prefix +rm -fR dataflow-sql-injection +codeql pack init codeql-dataflow-sql-injection +cp -f dataflow-sql-injection/qlpack.yml codeql-dataflow-sql-injection/ +# Add correct library dependency +codeql pack add --dir=codeql-dataflow-sql-injection codeql/cpp-all@"$(codeql-complib cpp)" + +#* Install packs +cd ~/local/sarif-cli/codeql-dataflow-sql-injection +rm -f *lock* +codeql pack install + +#* Run the analyze command with options +# +cd ~/local/sarif-cli/codeql-dataflow-sql-injection +codeql database analyze \ + -v \ + --sarif-category santa-chap \ + --ram=16000 \ + -j8 \ + --format=sarif-latest \ + --output sqlidb-$CLI_VERSION.sarif \ + -- \ + sqlidb-$CLI_VERSION \ + SqlInjection.ql + +# Verify cli version in SARIF output +SAVER=`jq -r '.runs |.[] |.tool.driver.semanticVersion ' sqlidb-$CLI_VERSION.sarif` +if [ v$SAVER != $CLI_VERSION ] ; +then + echo "---: codeql version inconsistency" +fi + +# Check sarif-category flag +grep -A2 automationDetails sqlidb-$CLI_VERSION.sarif + +#* Insert versionControlProvenance +cd ~/local/sarif-cli/codeql-dataflow-sql-injection +sarif-insert-vcp sqlidb-$CLI_VERSION.sarif > sqlidb-$CLI_VERSION-1.sarif + +#* Get CSV. +cd ~/local/sarif-cli/codeql-dataflow-sql-injection +sarif-extract-scans-runner --input-signature CLI - > /dev/null < Date: Wed, 26 Jul 2023 13:30:41 -0700 Subject: [PATCH 18/23] Refine build-multiple-codeql-versions.sh; add v2.13.5 test --- build-multiple-codeql-versions.sh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/build-multiple-codeql-versions.sh b/build-multiple-codeql-versions.sh index 3783544..aeec21e 100644 --- a/build-multiple-codeql-versions.sh +++ b/build-multiple-codeql-versions.sh @@ -34,6 +34,7 @@ v2.9.4 CLI_VERSION=v2.9.4 CLI_VERSION=v2.12.7 +CLI_VERSION=v2.13.5 gh codeql set-version $CLI_VERSION #* Build vanilla DB @@ -53,7 +54,7 @@ function codeql-complib() { # Create the qlpack file using commands: cd ~/local/sarif-cli -#: Bug: drops the codeql- prefix +# Bug: drops the codeql- prefix rm -fR dataflow-sql-injection codeql pack init codeql-dataflow-sql-injection cp -f dataflow-sql-injection/qlpack.yml codeql-dataflow-sql-injection/ @@ -81,6 +82,7 @@ codeql database analyze \ # Verify cli version in SARIF output SAVER=`jq -r '.runs |.[] |.tool.driver.semanticVersion ' sqlidb-$CLI_VERSION.sarif` +echo $SAVER if [ v$SAVER != $CLI_VERSION ] ; then echo "---: codeql version inconsistency" @@ -101,7 +103,7 @@ EOF #* Check CSV messages for success cd ~/local/sarif-cli/codeql-dataflow-sql-injection -# head -4 sqlidb-$CLI_VERSION-1.sarif.csv +head -4 sqlidb-$CLI_VERSION-1.sarif.csv grep -qi success sqlidb-$CLI_VERSION-1.sarif.csv || { echo "---: sarif-cli failure: sqlidb-$CLI_VERSION-1.sarif*" } @@ -109,3 +111,11 @@ grep -qi success sqlidb-$CLI_VERSION-1.sarif.csv || { #* CSV output # ls -la sqlidb-$CLI_VERSION-1* # find sqlidb-$CLI_VERSION-1*.scantables -print + +#* Summary +cd ~/local/sarif-cli/codeql-dataflow-sql-injection +#** SARIF files +ls sqlidb-v*.sarif +#** CSV conversion info +ls sqlidb-v2.*.sarif.csv* +tail -2 sqlidb-v2.*.sarif.csv* From d386e5da450df514b6669b8254248ae01cd35bcb Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Wed, 26 Jul 2023 13:47:58 -0700 Subject: [PATCH 19/23] Add tests for 2.14.0; include versioned SARIF and CSV files in the repository --- build-multiple-codeql-versions.sh | 6 +- .../sqlidb-v2.12.7-1.sarif | 255 +++++++++++++++ .../sqlidb-v2.12.7-1.sarif.csv | 3 + .../sqlidb-v2.13.5-1.sarif | 309 ++++++++++++++++++ .../sqlidb-v2.13.5-1.sarif.csv | 3 + .../sqlidb-v2.14.0-1.sarif | 309 ++++++++++++++++++ .../sqlidb-v2.14.0-1.sarif.csv | 3 + .../sqlidb-v2.9.4-1.sarif | 255 +++++++++++++++ .../sqlidb-v2.9.4-1.sarif.csv | 3 + 9 files changed, 1145 insertions(+), 1 deletion(-) create mode 100644 data/codeql-dataflow-sql-injection/sqlidb-v2.12.7-1.sarif create mode 100644 data/codeql-dataflow-sql-injection/sqlidb-v2.12.7-1.sarif.csv create mode 100644 data/codeql-dataflow-sql-injection/sqlidb-v2.13.5-1.sarif create mode 100644 data/codeql-dataflow-sql-injection/sqlidb-v2.13.5-1.sarif.csv create mode 100644 data/codeql-dataflow-sql-injection/sqlidb-v2.14.0-1.sarif create mode 100644 data/codeql-dataflow-sql-injection/sqlidb-v2.14.0-1.sarif.csv create mode 100644 data/codeql-dataflow-sql-injection/sqlidb-v2.9.4-1.sarif create mode 100644 data/codeql-dataflow-sql-injection/sqlidb-v2.9.4-1.sarif.csv diff --git a/build-multiple-codeql-versions.sh b/build-multiple-codeql-versions.sh index aeec21e..96f974c 100644 --- a/build-multiple-codeql-versions.sh +++ b/build-multiple-codeql-versions.sh @@ -2,6 +2,8 @@ #* Following are the steps needed to build a codeql db using different versions of # the codeql cli # +# Some files from prior runs are found in ./data/codeql-dataflow-sql-injection/ +# echo '$0: Interactive use only' exit 1 @@ -35,6 +37,7 @@ v2.9.4 CLI_VERSION=v2.9.4 CLI_VERSION=v2.12.7 CLI_VERSION=v2.13.5 +CLI_VERSION=v2.14.0 gh codeql set-version $CLI_VERSION #* Build vanilla DB @@ -60,6 +63,7 @@ codeql pack init codeql-dataflow-sql-injection cp -f dataflow-sql-injection/qlpack.yml codeql-dataflow-sql-injection/ # Add correct library dependency codeql pack add --dir=codeql-dataflow-sql-injection codeql/cpp-all@"$(codeql-complib cpp)" +cat codeql-dataflow-sql-injection/qlpack.yml #* Install packs cd ~/local/sarif-cli/codeql-dataflow-sql-injection @@ -82,7 +86,7 @@ codeql database analyze \ # Verify cli version in SARIF output SAVER=`jq -r '.runs |.[] |.tool.driver.semanticVersion ' sqlidb-$CLI_VERSION.sarif` -echo $SAVER +printf "db %s\ncli %s\n" $SAVER $CLI_VERSION if [ v$SAVER != $CLI_VERSION ] ; then echo "---: codeql version inconsistency" diff --git a/data/codeql-dataflow-sql-injection/sqlidb-v2.12.7-1.sarif b/data/codeql-dataflow-sql-injection/sqlidb-v2.12.7-1.sarif new file mode 100644 index 0000000..bbb6a94 --- /dev/null +++ b/data/codeql-dataflow-sql-injection/sqlidb-v2.12.7-1.sarif @@ -0,0 +1,255 @@ +{ + "$schema": "https://json.schemastore.org/sarif-2.1.0.json", + "version": "2.1.0", + "runs": [ + { + "tool": { + "driver": { + "name": "CodeQL", + "organization": "GitHub", + "semanticVersion": "2.12.7", + "rules": [ + { + "id": "cpp/SQLIVulnerable", + "name": "cpp/SQLIVulnerable", + "shortDescription": { + "text": "SQLI Vulnerability" + }, + "fullDescription": { + "text": "Using untrusted strings in a sql query allows sql injection attacks." + }, + "defaultConfiguration": { + "enabled": true, + "level": "warning" + }, + "properties": { + "description": "Using untrusted strings in a sql query allows sql injection attacks.", + "id": "cpp/SQLIVulnerable", + "kind": "path-problem", + "name": "SQLI Vulnerability", + "problem.severity": "warning" + } + } + ] + }, + "extensions": [ + { + "name": "legacy-upgrades", + "semanticVersion": "0.0.0", + "locations": [ + { + "uri": "file:///Users/hohn/.local/share/gh/extensions/gh-codeql/dist/release/v2.12.7/legacy-upgrades/", + "description": { + "text": "The QL pack root directory." + } + }, + { + "uri": "file:///Users/hohn/.local/share/gh/extensions/gh-codeql/dist/release/v2.12.7/legacy-upgrades/qlpack.yml", + "description": { + "text": "The QL pack definition file." + } + } + ] + }, + { + "name": "codeql-dataflow-sql-injection", + "semanticVersion": "0.0.1", + "locations": [ + { + "uri": "file:///Users/hohn/local/sarif-cli/codeql-dataflow-sql-injection/", + "description": { + "text": "The QL pack root directory." + } + }, + { + "uri": "file:///Users/hohn/local/sarif-cli/codeql-dataflow-sql-injection/qlpack.yml", + "description": { + "text": "The QL pack definition file." + } + } + ] + } + ] + }, + "artifacts": [ + { + "location": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + } + } + ], + "results": [ + { + "ruleId": "cpp/SQLIVulnerable", + "ruleIndex": 0, + "rule": { + "id": "cpp/SQLIVulnerable", + "index": 0 + }, + "message": { + "text": "Possible SQL injection" + }, + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 84, + "startColumn": 27, + "endColumn": 32 + } + } + } + ], + "partialFingerprints": { + "primaryLocationLineHash": "9a8bc91bbc363391:1", + "primaryLocationStartColumnFingerprint": "22" + }, + "codeFlows": [ + { + "threadFlows": [ + { + "locations": [ + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 52, + "startColumn": 32, + "endColumn": 35 + } + }, + "message": { + "text": "ref arg buf" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 60, + "startColumn": 12, + "endColumn": 15 + } + }, + "message": { + "text": "buf" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 93, + "startColumn": 12, + "endColumn": 25 + } + }, + "message": { + "text": "call to get_user_info" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 95, + "startColumn": 20, + "endColumn": 24 + } + }, + "message": { + "text": "info" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 68, + "startColumn": 31, + "endColumn": 35 + } + }, + "message": { + "text": "info" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 84, + "startColumn": 27, + "endColumn": 32 + } + }, + "message": { + "text": "query" + } + } + } + ] + } + ] + } + ] + } + ], + "automationDetails": { + "id": "santa-chap/" + }, + "columnKind": "utf16CodeUnits", + "properties": { + "semmle.formatSpecifier": "sarif-latest" + }, + "versionControlProvenance": [ + { + "repositoryUri": "vcp-no-uri", + "revisionId": "vcp-no-revid" + } + ] + } + ] +} diff --git a/data/codeql-dataflow-sql-injection/sqlidb-v2.12.7-1.sarif.csv b/data/codeql-dataflow-sql-injection/sqlidb-v2.12.7-1.sarif.csv new file mode 100644 index 0000000..7f79aff --- /dev/null +++ b/data/codeql-dataflow-sql-injection/sqlidb-v2.12.7-1.sarif.csv @@ -0,0 +1,3 @@ +sarif_file,level,levelcode,message,extra_info +sqlidb-v2.12.7-1.sarif,WARNING,4,Input sarif contains extra unneccesary properties.,"Extra properties: type fields: ['description', 'kind', 'precision', 'problem.severity', 'security-severity', 'sub-severity', 'tags', 'uri']" +sqlidb-v2.12.7-1.sarif,SUCCESS,0,File successfully processed., diff --git a/data/codeql-dataflow-sql-injection/sqlidb-v2.13.5-1.sarif b/data/codeql-dataflow-sql-injection/sqlidb-v2.13.5-1.sarif new file mode 100644 index 0000000..9b3a99a --- /dev/null +++ b/data/codeql-dataflow-sql-injection/sqlidb-v2.13.5-1.sarif @@ -0,0 +1,309 @@ +{ + "$schema": "https://json.schemastore.org/sarif-2.1.0.json", + "version": "2.1.0", + "runs": [ + { + "tool": { + "driver": { + "name": "CodeQL", + "organization": "GitHub", + "semanticVersion": "2.13.5", + "notifications": [ + { + "id": "cpp/baseline/expected-extracted-files", + "name": "cpp/baseline/expected-extracted-files", + "shortDescription": { + "text": "Expected extracted files" + }, + "fullDescription": { + "text": "Files appearing in the source archive that are expected to be extracted." + }, + "defaultConfiguration": { + "enabled": true + }, + "properties": { + "tags": [ + "expected-extracted-files", + "telemetry" + ] + } + } + ], + "rules": [ + { + "id": "cpp/SQLIVulnerable", + "name": "cpp/SQLIVulnerable", + "shortDescription": { + "text": "SQLI Vulnerability" + }, + "fullDescription": { + "text": "Using untrusted strings in a sql query allows sql injection attacks." + }, + "defaultConfiguration": { + "enabled": true, + "level": "warning" + }, + "properties": { + "description": "Using untrusted strings in a sql query allows sql injection attacks.", + "id": "cpp/SQLIVulnerable", + "kind": "path-problem", + "name": "SQLI Vulnerability", + "problem.severity": "warning" + } + } + ] + }, + "extensions": [ + { + "name": "legacy-upgrades", + "semanticVersion": "0.0.0", + "locations": [ + { + "uri": "file:///Users/hohn/.local/share/gh/extensions/gh-codeql/dist/release/v2.13.5/legacy-upgrades/", + "description": { + "text": "The QL pack root directory." + } + }, + { + "uri": "file:///Users/hohn/.local/share/gh/extensions/gh-codeql/dist/release/v2.13.5/legacy-upgrades/qlpack.yml", + "description": { + "text": "The QL pack definition file." + } + } + ] + }, + { + "name": "codeql-dataflow-sql-injection", + "semanticVersion": "0.0.1", + "locations": [ + { + "uri": "file:///Users/hohn/local/sarif-cli/codeql-dataflow-sql-injection/", + "description": { + "text": "The QL pack root directory." + } + }, + { + "uri": "file:///Users/hohn/local/sarif-cli/codeql-dataflow-sql-injection/qlpack.yml", + "description": { + "text": "The QL pack definition file." + } + } + ] + } + ] + }, + "invocations": [ + { + "toolExecutionNotifications": [ + { + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + } + } + } + ], + "message": { + "text": "" + }, + "level": "none", + "descriptor": { + "id": "cpp/baseline/expected-extracted-files", + "index": 0 + }, + "properties": { + "formattedMessage": { + "text": "" + } + } + } + ], + "executionSuccessful": true + } + ], + "artifacts": [ + { + "location": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + } + } + ], + "results": [ + { + "ruleId": "cpp/SQLIVulnerable", + "ruleIndex": 0, + "rule": { + "id": "cpp/SQLIVulnerable", + "index": 0 + }, + "message": { + "text": "Possible SQL injection" + }, + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 84, + "startColumn": 27, + "endColumn": 32 + } + } + } + ], + "partialFingerprints": { + "primaryLocationLineHash": "9a8bc91bbc363391:1", + "primaryLocationStartColumnFingerprint": "22" + }, + "codeFlows": [ + { + "threadFlows": [ + { + "locations": [ + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 52, + "startColumn": 32, + "endColumn": 35 + } + }, + "message": { + "text": "ref arg buf" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 60, + "startColumn": 12, + "endColumn": 15 + } + }, + "message": { + "text": "buf" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 93, + "startColumn": 12, + "endColumn": 25 + } + }, + "message": { + "text": "call to get_user_info" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 95, + "startColumn": 20, + "endColumn": 24 + } + }, + "message": { + "text": "info" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 68, + "startColumn": 31, + "endColumn": 35 + } + }, + "message": { + "text": "info" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 84, + "startColumn": 27, + "endColumn": 32 + } + }, + "message": { + "text": "query" + } + } + } + ] + } + ] + } + ] + } + ], + "automationDetails": { + "id": "santa-chap/" + }, + "columnKind": "utf16CodeUnits", + "properties": { + "semmle.formatSpecifier": "sarif-latest" + }, + "versionControlProvenance": [ + { + "repositoryUri": "vcp-no-uri", + "revisionId": "vcp-no-revid" + } + ] + } + ] +} diff --git a/data/codeql-dataflow-sql-injection/sqlidb-v2.13.5-1.sarif.csv b/data/codeql-dataflow-sql-injection/sqlidb-v2.13.5-1.sarif.csv new file mode 100644 index 0000000..72e53f8 --- /dev/null +++ b/data/codeql-dataflow-sql-injection/sqlidb-v2.13.5-1.sarif.csv @@ -0,0 +1,3 @@ +sarif_file,level,levelcode,message,extra_info +sqlidb-v2.13.5-1.sarif,WARNING,4,Input sarif contains extra unneccesary properties.,"Extra properties: type fields: ['artifacts', 'automationDetails', 'columnKind', 'invocations', 'newlineSequences', 'properties', 'results', 'tool', 'versionControlProvenance']type fields: ['name', 'notifications', 'organization', 'rules', 'semanticVersion']type fields: ['description', 'kind', 'precision', 'problem.severity', 'security-severity', 'sub-severity', 'tags', 'uri']" +sqlidb-v2.13.5-1.sarif,SUCCESS,0,File successfully processed., diff --git a/data/codeql-dataflow-sql-injection/sqlidb-v2.14.0-1.sarif b/data/codeql-dataflow-sql-injection/sqlidb-v2.14.0-1.sarif new file mode 100644 index 0000000..bf3dafe --- /dev/null +++ b/data/codeql-dataflow-sql-injection/sqlidb-v2.14.0-1.sarif @@ -0,0 +1,309 @@ +{ + "$schema": "https://json.schemastore.org/sarif-2.1.0.json", + "version": "2.1.0", + "runs": [ + { + "tool": { + "driver": { + "name": "CodeQL", + "organization": "GitHub", + "semanticVersion": "2.14.0", + "notifications": [ + { + "id": "cpp/baseline/expected-extracted-files", + "name": "cpp/baseline/expected-extracted-files", + "shortDescription": { + "text": "Expected extracted files" + }, + "fullDescription": { + "text": "Files appearing in the source archive that are expected to be extracted." + }, + "defaultConfiguration": { + "enabled": true + }, + "properties": { + "tags": [ + "expected-extracted-files", + "telemetry" + ] + } + } + ], + "rules": [ + { + "id": "cpp/SQLIVulnerable", + "name": "cpp/SQLIVulnerable", + "shortDescription": { + "text": "SQLI Vulnerability" + }, + "fullDescription": { + "text": "Using untrusted strings in a sql query allows sql injection attacks." + }, + "defaultConfiguration": { + "enabled": true, + "level": "warning" + }, + "properties": { + "description": "Using untrusted strings in a sql query allows sql injection attacks.", + "id": "cpp/SQLIVulnerable", + "kind": "path-problem", + "name": "SQLI Vulnerability", + "problem.severity": "warning" + } + } + ] + }, + "extensions": [ + { + "name": "legacy-upgrades", + "semanticVersion": "0.0.0", + "locations": [ + { + "uri": "file:///Users/hohn/.local/share/gh/extensions/gh-codeql/dist/release/v2.14.0/legacy-upgrades/", + "description": { + "text": "The QL pack root directory." + } + }, + { + "uri": "file:///Users/hohn/.local/share/gh/extensions/gh-codeql/dist/release/v2.14.0/legacy-upgrades/qlpack.yml", + "description": { + "text": "The QL pack definition file." + } + } + ] + }, + { + "name": "codeql-dataflow-sql-injection", + "semanticVersion": "0.0.1", + "locations": [ + { + "uri": "file:///Users/hohn/local/sarif-cli/codeql-dataflow-sql-injection/", + "description": { + "text": "The QL pack root directory." + } + }, + { + "uri": "file:///Users/hohn/local/sarif-cli/codeql-dataflow-sql-injection/qlpack.yml", + "description": { + "text": "The QL pack definition file." + } + } + ] + } + ] + }, + "invocations": [ + { + "toolExecutionNotifications": [ + { + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + } + } + } + ], + "message": { + "text": "" + }, + "level": "none", + "descriptor": { + "id": "cpp/baseline/expected-extracted-files", + "index": 0 + }, + "properties": { + "formattedMessage": { + "text": "" + } + } + } + ], + "executionSuccessful": true + } + ], + "artifacts": [ + { + "location": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + } + } + ], + "results": [ + { + "ruleId": "cpp/SQLIVulnerable", + "ruleIndex": 0, + "rule": { + "id": "cpp/SQLIVulnerable", + "index": 0 + }, + "message": { + "text": "Possible SQL injection" + }, + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 84, + "startColumn": 27, + "endColumn": 32 + } + } + } + ], + "partialFingerprints": { + "primaryLocationLineHash": "9a8bc91bbc363391:1", + "primaryLocationStartColumnFingerprint": "22" + }, + "codeFlows": [ + { + "threadFlows": [ + { + "locations": [ + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 52, + "startColumn": 32, + "endColumn": 35 + } + }, + "message": { + "text": "ref arg buf" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 60, + "startColumn": 12, + "endColumn": 15 + } + }, + "message": { + "text": "buf" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 93, + "startColumn": 12, + "endColumn": 25 + } + }, + "message": { + "text": "call to get_user_info" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 95, + "startColumn": 20, + "endColumn": 24 + } + }, + "message": { + "text": "info" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 68, + "startColumn": 31, + "endColumn": 35 + } + }, + "message": { + "text": "info" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 84, + "startColumn": 27, + "endColumn": 32 + } + }, + "message": { + "text": "query" + } + } + } + ] + } + ] + } + ] + } + ], + "automationDetails": { + "id": "santa-chap/" + }, + "columnKind": "utf16CodeUnits", + "properties": { + "semmle.formatSpecifier": "sarif-latest" + }, + "versionControlProvenance": [ + { + "repositoryUri": "vcp-no-uri", + "revisionId": "vcp-no-revid" + } + ] + } + ] +} diff --git a/data/codeql-dataflow-sql-injection/sqlidb-v2.14.0-1.sarif.csv b/data/codeql-dataflow-sql-injection/sqlidb-v2.14.0-1.sarif.csv new file mode 100644 index 0000000..44cb5f2 --- /dev/null +++ b/data/codeql-dataflow-sql-injection/sqlidb-v2.14.0-1.sarif.csv @@ -0,0 +1,3 @@ +sarif_file,level,levelcode,message,extra_info +sqlidb-v2.14.0-1.sarif,WARNING,4,Input sarif contains extra unneccesary properties.,"Extra properties: type fields: ['artifacts', 'automationDetails', 'columnKind', 'invocations', 'newlineSequences', 'properties', 'results', 'tool', 'versionControlProvenance']type fields: ['name', 'notifications', 'organization', 'rules', 'semanticVersion']type fields: ['description', 'kind', 'precision', 'problem.severity', 'security-severity', 'sub-severity', 'tags', 'uri']" +sqlidb-v2.14.0-1.sarif,SUCCESS,0,File successfully processed., diff --git a/data/codeql-dataflow-sql-injection/sqlidb-v2.9.4-1.sarif b/data/codeql-dataflow-sql-injection/sqlidb-v2.9.4-1.sarif new file mode 100644 index 0000000..3cdbc7b --- /dev/null +++ b/data/codeql-dataflow-sql-injection/sqlidb-v2.9.4-1.sarif @@ -0,0 +1,255 @@ +{ + "$schema": "https://json.schemastore.org/sarif-2.1.0.json", + "version": "2.1.0", + "runs": [ + { + "tool": { + "driver": { + "name": "CodeQL", + "organization": "GitHub", + "semanticVersion": "2.9.4", + "rules": [ + { + "id": "cpp/SQLIVulnerable", + "name": "cpp/SQLIVulnerable", + "shortDescription": { + "text": "SQLI Vulnerability" + }, + "fullDescription": { + "text": "Using untrusted strings in a sql query allows sql injection attacks." + }, + "defaultConfiguration": { + "enabled": true, + "level": "warning" + }, + "properties": { + "description": "Using untrusted strings in a sql query allows sql injection attacks.", + "id": "cpp/SQLIVulnerable", + "kind": "path-problem", + "name": "SQLI Vulnerability", + "problem.severity": "warning" + } + } + ] + }, + "extensions": [ + { + "name": "legacy-upgrades", + "semanticVersion": "0.0.0", + "locations": [ + { + "uri": "file:///Users/hohn/.local/share/gh/extensions/gh-codeql/dist/release/v2.9.4/legacy-upgrades/", + "description": { + "text": "The QL pack root directory." + } + }, + { + "uri": "file:///Users/hohn/.local/share/gh/extensions/gh-codeql/dist/release/v2.9.4/legacy-upgrades/qlpack.yml", + "description": { + "text": "The QL pack definition file." + } + } + ] + }, + { + "name": "sample/cpp-sql-injection", + "semanticVersion": "0.0.1", + "locations": [ + { + "uri": "file:///Users/hohn/local/sarif-cli/codeql-dataflow-sql-injection/", + "description": { + "text": "The QL pack root directory." + } + }, + { + "uri": "file:///Users/hohn/local/sarif-cli/codeql-dataflow-sql-injection/qlpack.yml", + "description": { + "text": "The QL pack definition file." + } + } + ] + } + ] + }, + "artifacts": [ + { + "location": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + } + } + ], + "results": [ + { + "ruleId": "cpp/SQLIVulnerable", + "ruleIndex": 0, + "rule": { + "id": "cpp/SQLIVulnerable", + "index": 0 + }, + "message": { + "text": "Possible SQL injection" + }, + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 84, + "startColumn": 27, + "endColumn": 32 + } + } + } + ], + "partialFingerprints": { + "primaryLocationLineHash": "9a8bc91bbc363391:1", + "primaryLocationStartColumnFingerprint": "22" + }, + "codeFlows": [ + { + "threadFlows": [ + { + "locations": [ + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 52, + "startColumn": 32, + "endColumn": 35 + } + }, + "message": { + "text": "ref arg buf" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 60, + "startColumn": 12, + "endColumn": 15 + } + }, + "message": { + "text": "buf" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 93, + "startColumn": 12, + "endColumn": 25 + } + }, + "message": { + "text": "call to get_user_info" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 95, + "startColumn": 20, + "endColumn": 24 + } + }, + "message": { + "text": "info" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 68, + "startColumn": 31, + "endColumn": 35 + } + }, + "message": { + "text": "info" + } + } + }, + { + "location": { + "physicalLocation": { + "artifactLocation": { + "uri": "add-user.c", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 84, + "startColumn": 27, + "endColumn": 32 + } + }, + "message": { + "text": "query" + } + } + } + ] + } + ] + } + ] + } + ], + "automationDetails": { + "id": "santa-chap/" + }, + "columnKind": "utf16CodeUnits", + "properties": { + "semmle.formatSpecifier": "sarif-latest" + }, + "versionControlProvenance": [ + { + "repositoryUri": "vcp-no-uri", + "revisionId": "vcp-no-revid" + } + ] + } + ] +} diff --git a/data/codeql-dataflow-sql-injection/sqlidb-v2.9.4-1.sarif.csv b/data/codeql-dataflow-sql-injection/sqlidb-v2.9.4-1.sarif.csv new file mode 100644 index 0000000..a0e81ad --- /dev/null +++ b/data/codeql-dataflow-sql-injection/sqlidb-v2.9.4-1.sarif.csv @@ -0,0 +1,3 @@ +sarif_file,level,levelcode,message,extra_info +sqlidb-v2.9.4-1.sarif,WARNING,4,Input sarif contains extra unneccesary properties.,"Extra properties: type fields: ['description', 'kind', 'precision', 'problem.severity', 'security-severity', 'sub-severity', 'tags', 'uri']" +sqlidb-v2.9.4-1.sarif,SUCCESS,0,File successfully processed., From f5a850ea7b9c8d5fa0c8c00e68eec23753360d7a Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Thu, 27 Jul 2023 08:56:16 -0700 Subject: [PATCH 20/23] readme/amend the list of tested CLI versions --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 156dd99..8a0a188 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,10 @@ The CLI versions used against development of the CLI support were: 2.6.3, 2.9.4, and 2.11.4. + Minimal tests are also run against the versions in + [this build script](./build-multiple-codeql-versions.sh). Currently, those are + 2.9.4, 2.12.7, 2.13.5, 2.14.0. + The CLI sarif **MUST** contain one additional property `versionControlProvenance` - which needs to look like: ``` "versionControlProvenance": [ From a90084826864bc43f94c30975becdea67e3b88a0 Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Thu, 27 Jul 2023 09:05:27 -0700 Subject: [PATCH 21/23] Remove ipython debug call --- sarif_cli/table_joins.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sarif_cli/table_joins.py b/sarif_cli/table_joins.py index 520f0c6..7133626 100644 --- a/sarif_cli/table_joins.py +++ b/sarif_cli/table_joins.py @@ -115,9 +115,6 @@ def joins_for_problem(tgraph, af_0350_location): # # Form the message dataframe (@kind problem) via joins # - import IPython - IPython.embed(header="spot 1") - kind_problem_1 = ( aft(6343) .merge(sft(4055), how="inner", From 07ed4bf11e246487810b3d74dddd48b4eb0e8662 Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Thu, 27 Jul 2023 09:18:46 -0700 Subject: [PATCH 22/23] Add 'usage' to build-multiple-codeql-versions.sh --- build-multiple-codeql-versions.sh | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/build-multiple-codeql-versions.sh b/build-multiple-codeql-versions.sh index 96f974c..8f0872e 100644 --- a/build-multiple-codeql-versions.sh +++ b/build-multiple-codeql-versions.sh @@ -1,10 +1,21 @@ -# +#!/bin/bash -e #* Following are the steps needed to build a codeql db using different versions of -# the codeql cli +# the codeql cli. # # Some files from prior runs are found in ./data/codeql-dataflow-sql-injection/ # -echo '$0: Interactive use only' +usage=" +This script's purpose is to run the sarif-cli against SARIF files +produced by different versions of the codeql cli. + +This script is intended for interactive use only. Take one block at a time, +run it, and check results as you go. + +A (subset) of this script may be automated in the future. +" + +echo "$0: Interactive use only" +echo "$usage" exit 1 #* Use virtual environment. See README for setup. From bd9460dd611fa2653a88501e943e99cffefa9df4 Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Thu, 27 Jul 2023 09:34:17 -0700 Subject: [PATCH 23/23] Remove old comment --- sarif_cli/signature.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sarif_cli/signature.py b/sarif_cli/signature.py index 7302709..82771a5 100644 --- a/sarif_cli/signature.py +++ b/sarif_cli/signature.py @@ -56,8 +56,6 @@ def _signature_dict(args, elem, context: Context): if args.typedef_signatures: # Give every unique struct a name and use a reference to it as value. if signature not in context.sig_to_typedef: - #cannot have leading 0 hashes later in table joins so replace now - #context.sig_to_typedef[signature] = str("Struct%04d" % shorthash(signature)).replace("0", "1") context.sig_to_typedef[signature] = "Struct%04d" % shorthash(signature) typedef = context.sig_to_typedef[signature] return typedef