diff --git a/README.md b/README.md index 6c7850e..3916397 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,9 @@ # CLI tools for SARIF processing Each of these tools present a high-level command-line interface to extract a - specific subset of information from a SARIF file. The main tools are: `sarif-extract-scans-runner`,`sarif-aggregate-scans`,`sarif-create-aggregate-report` + specific subset of information from a SARIF file. The main tools are: `sarif-extract-scans-runner`,`sarif-aggregate-scans`,`sarif-create-aggregate-report`. + + Each tool can print its options and description like: `sarif-extract-scans-runner --help`. The tool was implemented using Python 3.9. diff --git a/bin/sarif-extract-scans b/bin/sarif-extract-scans index d891f71..a171e8b 100755 --- a/bin/sarif-extract-scans +++ b/bin/sarif-extract-scans @@ -130,17 +130,14 @@ scantabs = ScanTables() @dataclass class ExternalInfo: - project_id : int + project_id: pd.UInt64Dtype() scan_id : pd.UInt64Dtype() sarif_file_name : str - ql_query_id : str external_info = ExternalInfo( - scan_spec["project_id"], + pd.NA, scan_spec["scan_id"], - scan_spec["sarif_file_name"], - # TODO: Take ql_query_id from where? (git commit id of the ql query set) - 'deadbeef00', + scan_spec["sarif_file_name"] ) # diff --git a/bin/sarif-extract-scans-runner b/bin/sarif-extract-scans-runner index a069493..b323bea 100755 --- a/bin/sarif-extract-scans-runner +++ b/bin/sarif-extract-scans-runner @@ -88,9 +88,9 @@ parser = argparse.ArgumentParser(description='Run sarif-extract-scans over a dir parser.add_argument('sarif_files', metavar='sarif-files', type=str, help='File containing list of sarif files, use - for stdin') -parser.add_argument('-f','--input-signature', metavar='input-signature', type=str, default="LGTM", - help='Signature of the sarif, as in, where it was generated it may affect the signature.' 
- 'Options: LGTM, CLI' +parser.add_argument('-f','--input-signature', metavar='input-signature', type=str, default="CLI", + help='Signature of the sarif, as in, where it was generated it may affect the signature.\n' + 'Options: LGTM, CLI.\n' 'If current represented signatures are not sufficient, view signature_single.py for how to support further signatures.' ' Default: "%(default)s"') @@ -161,7 +161,6 @@ for path in paths: # Paths and components # path = path.rstrip() - project, component = path.split('/') # # Scan specification # @@ -171,30 +170,25 @@ for path in paths: scan_id = hash.hash_unique(data) scan_spec = { - # assuming sarif file names are like / - # however this will be replaced down the line with the repoURI if possible - # still, leaving here in case later versions of this tool do not rely on that property being there - # in that case this will be the best guess - "project_id": hash.hash_unique((project+"-"+component).encode()), # pd.UInt64Dtype() "scan_id": scan_id, # pd.Int64Dtype() "sarif_file_name": path, # pd.StringDtype() } # # If using outermost output directory, create project directory: - # (like //*.scantables) + # (like //*.scantables) # - try: os.mkdir(outer_dir+ project, mode=0o755) + try: os.mkdir(outer_dir+ path, mode=0o755) except FileExistsError: pass - scan_spec_file = os.path.join(outer_dir+ project, component + ".scanspec") + scan_spec_file = os.path.join(outer_dir+ path + ".scanspec") with open(scan_spec_file, 'w') as fp: json.dump(scan_spec, fp) # # Table output directory # - output_dir = os.path.join(outer_dir+ project, component + ".scantables") + output_dir = os.path.join(outer_dir+ path + ".scantables") try: os.mkdir(output_dir, mode=0o755) except FileExistsError: pass # @@ -215,8 +209,8 @@ for path in paths: with open(args.successful_runs, 'wb') as outfile: pickle.dump(successful_runs, outfile) - scan_log_file = os.path.join(outer_dir+ project, component + ".scanlog") - csv_outfile = os.path.join(outer_dir+ project, 
component) + scan_log_file = os.path.join(outer_dir+ path + ".scanlog") + csv_outfile = os.path.join(outer_dir+ path) runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, csv_outfile, "-f", args.input_signature], capture_output=True, text=True) if runstats.returncode == 0: diff --git a/sarif_cli/hash.py b/sarif_cli/hash.py index 9c107ba..f900897 100644 --- a/sarif_cli/hash.py +++ b/sarif_cli/hash.py @@ -4,4 +4,4 @@ from hashlib import blake2b def hash_unique(item_to_hash): h = blake2b(digest_size = 8) h.update(item_to_hash) - return abs(int.from_bytes(h.digest(), byteorder='big')) + return int.from_bytes(h.digest(), byteorder='big') diff --git a/sarif_cli/scan_tables.py b/sarif_cli/scan_tables.py index 716d940..0f5a980 100644 --- a/sarif_cli/scan_tables.py +++ b/sarif_cli/scan_tables.py @@ -79,43 +79,20 @@ def joins_for_projects(basetables, external_info): """ b = basetables; e = external_info - # if the sarif does not have versionControlProvenance, semmle.sourceLanguage ect - # there is no reliable way to know the project name - # and will still need to use a guess about the project id + # if the sarif does have versionControlProvenance if "repositoryUri" in b.project: - repo_url = b.project.repositoryUri[0] - # For a repository url of the form - # (git|https)://*/org/project.* - # use the org/project part as the project_name. 
- # - url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url) - if url_parts: - project_name = f"{url_parts.group(2)}-{url_parts.group(3)}" - project, component = e.sarif_file_name.rstrip().split('/') - # if the runners guess from the filename was bad, replace with real info - # and continue to use that scanspec to pass that around - if project_name != project+"-"+component: - e.project_id = hash.hash_unique(project_name.encode()) - else: - project_name = pd.NA + repoUri = b.project.repositoryUri[0] + e.project_id = hash.hash_unique(repoUri.encode()) else: - repo_url = "unknown" - project_name = pd.NA + repoUri = "unknown" - if 'semmle.sourceLanguage' in b.project: - srcLang = b.project['semmle.sourceLanguage'][0] - allLang = ",".join(list(b.project['semmle.sourceLanguage'])) - else: - srcLang = "unknown" - allLang = "unknown" - res = pd.DataFrame(data={ "id" : e.project_id, - "project_name" : project_name, + "project_name" : repoUri, "creation_date" : pd.Timestamp(0.0, unit='s'), # TODO: external info - "repo_url" : repo_url, - "primary_language" : srcLang, # TODO: external info if CLI sarif - "languages_analyzed" : allLang # TODO: external info if CLI sarif + "repo_url" : repoUri, + "primary_language" : b.project['semmle.sourceLanguage'][0], + "languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage'])) }, index=[0]) # Force all column types to ensure appropriate formatting diff --git a/sarif_cli/signature.py b/sarif_cli/signature.py index 941feb2..62b29f1 100644 --- a/sarif_cli/signature.py +++ b/sarif_cli/signature.py @@ -239,6 +239,8 @@ dummy_relatedLocations_entry = [ dummy_message_entry = {'text': 'scli-dyys dummy value'} +dummy_sourceLanguage = 'unknown' + def fillsig_dict(args, elem, context): """ Fill in the missing fields in dictionary signatures. 
""" @@ -290,6 +292,10 @@ def fillsig_dict(args, elem, context): if 'level' in elem.keys(): full_elem['enabled'] = elem.get('enabled', True) + if 'semmle.formatSpecifier' in elem.keys(): + # Ensure semmle.sourceLanguage is present at least in dummy form + full_elem['semmle.sourceLanguage'] = elem.get('semmle.sourceLanguage', dummy_sourceLanguage) + if 'versionControlProvenance' in elem.keys(): # Ensure newlineSequences is present when versionControlProvenance is full_elem['newlineSequences'] = elem.get('newlineSequences', dummy_newlineSequences) diff --git a/sarif_cli/signature_single_CLI.py b/sarif_cli/signature_single_CLI.py index d773cf2..1b6b747 100644 --- a/sarif_cli/signature_single_CLI.py +++ b/sarif_cli/signature_single_CLI.py @@ -28,7 +28,7 @@ struct_graph_CLI = ( ('Struct3497', ('struct', ('index', 'Int'), ('uri', 'String'))), ('Struct9567', ('struct', ('location', 'Struct3497'))), ('Array6920', ('array', (0, 'Struct5277'), (1, 'Struct9567'))), - ('Struct1509', ('struct', ('semmle.formatSpecifier', 'String'))), + ('Struct1509', ('struct', ('semmle.formatSpecifier', 'String'), ('semmle.sourceLanguage', 'String'))), ('Struct2774', ('struct', ('text', 'String'))), ( 'Struct6299', ( 'struct', diff --git a/sarif_cli/typegraph.py b/sarif_cli/typegraph.py index 3769fc6..4dce356 100644 --- a/sarif_cli/typegraph.py +++ b/sarif_cli/typegraph.py @@ -196,9 +196,14 @@ def _destructure_dict(typegraph: Typegraph, node, tree): ) else: - status_writer.unknown_sarif_parsing_shape["extra_info"] = "type fields {} do not match tree fields {}.".format(type_fields, tree_fields) - status_writer.csv_write(status_writer.unknown_sarif_parsing_shape) - raise Exception("typegraph: unhandled case reached: cannot match type " + # possibly looks like: (Struct9699)type_fields: [codeflows...] 
vs tree_fields: [...extra_properties] + # in that case, also try the Struct4055 signature here + if "codeFlows" in type_fields: + _destructure_dict(typegraph, "Struct4055", tree) + else: + status_writer.unknown_sarif_parsing_shape["extra_info"] = "type fields {} do not match tree fields {}.".format(type_fields, tree_fields) + status_writer.csv_write(status_writer.unknown_sarif_parsing_shape) + raise Exception("typegraph: unhandled case reached: cannot match type " "fields {} to tree fields {}. Data is invalid." .format(type_fields, tree_fields))