mirror of
https://github.com/hohn/sarif-cli.git
synced 2025-12-16 17:23:03 +01:00
Merge remote-tracking branch 'kristen/main'
This commit is contained in:
@@ -1,7 +1,9 @@
|
||||
# CLI tools for SARIF processing
|
||||
|
||||
Each of these tools presents a high-level command-line interface to extract a
|
||||
specific subset of information from a SARIF file. The main tools are: `sarif-extract-scans-runner`,`sarif-aggregate-scans`,`sarif-create-aggregate-report`
|
||||
specific subset of information from a SARIF file. The main tools are: `sarif-extract-scans-runner`, `sarif-aggregate-scans`, `sarif-create-aggregate-report`.
|
||||
|
||||
Each tool can print its options and description like: `sarif-extract-scans-runner --help`.
|
||||
|
||||
The tool was implemented using Python 3.9.
|
||||
|
||||
|
||||
@@ -130,17 +130,14 @@ scantabs = ScanTables()
|
||||
|
||||
@dataclass
|
||||
class ExternalInfo:
|
||||
project_id : int
|
||||
project_id: pd.UInt64Dtype()
|
||||
scan_id : pd.UInt64Dtype()
|
||||
sarif_file_name : str
|
||||
ql_query_id : str
|
||||
|
||||
external_info = ExternalInfo(
|
||||
scan_spec["project_id"],
|
||||
pd.NA,
|
||||
scan_spec["scan_id"],
|
||||
scan_spec["sarif_file_name"],
|
||||
# TODO: Take ql_query_id from where? (git commit id of the ql query set)
|
||||
'deadbeef00',
|
||||
scan_spec["sarif_file_name"]
|
||||
)
|
||||
|
||||
#
|
||||
|
||||
@@ -88,9 +88,9 @@ parser = argparse.ArgumentParser(description='Run sarif-extract-scans over a dir
|
||||
|
||||
parser.add_argument('sarif_files', metavar='sarif-files', type=str, help='File containing list of sarif files, use - for stdin')
|
||||
|
||||
parser.add_argument('-f','--input-signature', metavar='input-signature', type=str, default="LGTM",
|
||||
help='Signature of the sarif, as in, where it was generated it may affect the signature.'
|
||||
'Options: LGTM, CLI'
|
||||
parser.add_argument('-f','--input-signature', metavar='input-signature', type=str, default="CLI",
|
||||
help='Signature of the sarif, as in, where it was generated it may affect the signature.\n'
|
||||
'Options: LGTM, CLI.\n'
|
||||
'If current represented signatures are not sufficient, view signature_single.py for how to support further signatures.'
|
||||
' Default: "%(default)s"')
|
||||
|
||||
@@ -161,7 +161,6 @@ for path in paths:
|
||||
# Paths and components
|
||||
#
|
||||
path = path.rstrip()
|
||||
project, component = path.split('/')
|
||||
#
|
||||
# Scan specification
|
||||
#
|
||||
@@ -171,30 +170,25 @@ for path in paths:
|
||||
scan_id = hash.hash_unique(data)
|
||||
|
||||
scan_spec = {
|
||||
# assuming sarif file names are like <org>/<repo>
|
||||
# however this will be replaced down the line with the repoURI if possible
|
||||
# still, leaving here in case later versions of this tool do not rely on that property being there
|
||||
# in that case this will be the best guess
|
||||
"project_id": hash.hash_unique((project+"-"+component).encode()), # pd.UInt64Dtype()
|
||||
"scan_id": scan_id, # pd.Int64Dtype()
|
||||
"sarif_file_name": path, # pd.StringDtype()
|
||||
}
|
||||
|
||||
#
|
||||
# If using outermost output directory, create project directory:
|
||||
# (like <outer_dir>/<project>/*.scantables)
|
||||
# (like <outer_dir>/<repositoryUri>/*.scantables)
|
||||
#
|
||||
try: os.mkdir(outer_dir+ project, mode=0o755)
|
||||
try: os.mkdir(outer_dir+ path, mode=0o755)
|
||||
except FileExistsError: pass
|
||||
|
||||
scan_spec_file = os.path.join(outer_dir+ project, component + ".scanspec")
|
||||
scan_spec_file = os.path.join(outer_dir+ path + ".scanspec")
|
||||
with open(scan_spec_file, 'w') as fp:
|
||||
json.dump(scan_spec, fp)
|
||||
|
||||
#
|
||||
# Table output directory
|
||||
#
|
||||
output_dir = os.path.join(outer_dir+ project, component + ".scantables")
|
||||
output_dir = os.path.join(outer_dir+ path + ".scantables")
|
||||
try: os.mkdir(output_dir, mode=0o755)
|
||||
except FileExistsError: pass
|
||||
#
|
||||
@@ -215,8 +209,8 @@ for path in paths:
|
||||
with open(args.successful_runs, 'wb') as outfile:
|
||||
pickle.dump(successful_runs, outfile)
|
||||
|
||||
scan_log_file = os.path.join(outer_dir+ project, component + ".scanlog")
|
||||
csv_outfile = os.path.join(outer_dir+ project, component)
|
||||
scan_log_file = os.path.join(outer_dir+ path + ".scanlog")
|
||||
csv_outfile = os.path.join(outer_dir+ path)
|
||||
runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, csv_outfile, "-f", args.input_signature],
|
||||
capture_output=True, text=True)
|
||||
if runstats.returncode == 0:
|
||||
|
||||
@@ -4,4 +4,4 @@ from hashlib import blake2b
|
||||
def hash_unique(item_to_hash):
|
||||
h = blake2b(digest_size = 8)
|
||||
h.update(item_to_hash)
|
||||
return abs(int.from_bytes(h.digest(), byteorder='big'))
|
||||
return int.from_bytes(h.digest(), byteorder='big')
|
||||
|
||||
@@ -79,43 +79,20 @@ def joins_for_projects(basetables, external_info):
|
||||
"""
|
||||
b = basetables; e = external_info
|
||||
|
||||
# if the sarif does not have versionControlProvenance, semmle.sourceLanguage, etc.
|
||||
# there is no reliable way to know the project name
|
||||
# and will still need to use a guess about the project id
|
||||
# if the sarif does have versionControlProvenance
|
||||
if "repositoryUri" in b.project:
|
||||
repo_url = b.project.repositoryUri[0]
|
||||
# For a repository url of the form
|
||||
# (git|https)://*/org/project.*
|
||||
# use the org/project part as the project_name.
|
||||
#
|
||||
url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url)
|
||||
if url_parts:
|
||||
project_name = f"{url_parts.group(2)}-{url_parts.group(3)}"
|
||||
project, component = e.sarif_file_name.rstrip().split('/')
|
||||
# if the runners guess from the filename was bad, replace with real info
|
||||
# and continue to use that scanspec to pass that around
|
||||
if project_name != project+"-"+component:
|
||||
e.project_id = hash.hash_unique(project_name.encode())
|
||||
else:
|
||||
project_name = pd.NA
|
||||
repoUri = b.project.repositoryUri[0]
|
||||
e.project_id = hash.hash_unique(repoUri.encode())
|
||||
else:
|
||||
repo_url = "unknown"
|
||||
project_name = pd.NA
|
||||
|
||||
if 'semmle.sourceLanguage' in b.project:
|
||||
srcLang = b.project['semmle.sourceLanguage'][0]
|
||||
allLang = ",".join(list(b.project['semmle.sourceLanguage']))
|
||||
else:
|
||||
srcLang = "unknown"
|
||||
allLang = "unknown"
|
||||
repoUri = "unknown"
|
||||
|
||||
res = pd.DataFrame(data={
|
||||
"id" : e.project_id,
|
||||
"project_name" : project_name,
|
||||
"project_name" : repoUri,
|
||||
"creation_date" : pd.Timestamp(0.0, unit='s'), # TODO: external info
|
||||
"repo_url" : repo_url,
|
||||
"primary_language" : srcLang, # TODO: external info if CLI sarif
|
||||
"languages_analyzed" : allLang # TODO: external info if CLI sarif
|
||||
"repo_url" : repoUri,
|
||||
"primary_language" : b.project['semmle.sourceLanguage'][0],
|
||||
"languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage']))
|
||||
}, index=[0])
|
||||
|
||||
# Force all column types to ensure appropriate formatting
|
||||
|
||||
@@ -239,6 +239,8 @@ dummy_relatedLocations_entry = [
|
||||
|
||||
dummy_message_entry = {'text': 'scli-dyys dummy value'}
|
||||
|
||||
dummy_sourceLanguage = 'unknown'
|
||||
|
||||
def fillsig_dict(args, elem, context):
|
||||
""" Fill in the missing fields in dictionary signatures.
|
||||
"""
|
||||
@@ -290,6 +292,10 @@ def fillsig_dict(args, elem, context):
|
||||
if 'level' in elem.keys():
|
||||
full_elem['enabled'] = elem.get('enabled', True)
|
||||
|
||||
if 'semmle.formatSpecifier' in elem.keys():
|
||||
# Ensure semmle.sourceLanguage is present at least in dummy form
|
||||
full_elem['semmle.sourceLanguage'] = elem.get('semmle.sourceLanguage', dummy_sourceLanguage)
|
||||
|
||||
if 'versionControlProvenance' in elem.keys():
|
||||
# Ensure newlineSequences is present when versionControlProvenance is
|
||||
full_elem['newlineSequences'] = elem.get('newlineSequences', dummy_newlineSequences)
|
||||
|
||||
@@ -28,7 +28,7 @@ struct_graph_CLI = (
|
||||
('Struct3497', ('struct', ('index', 'Int'), ('uri', 'String'))),
|
||||
('Struct9567', ('struct', ('location', 'Struct3497'))),
|
||||
('Array6920', ('array', (0, 'Struct5277'), (1, 'Struct9567'))),
|
||||
('Struct1509', ('struct', ('semmle.formatSpecifier', 'String'))),
|
||||
('Struct1509', ('struct', ('semmle.formatSpecifier', 'String'), ('semmle.sourceLanguage', 'String'))),
|
||||
('Struct2774', ('struct', ('text', 'String'))),
|
||||
( 'Struct6299',
|
||||
( 'struct',
|
||||
|
||||
@@ -196,9 +196,14 @@ def _destructure_dict(typegraph: Typegraph, node, tree):
|
||||
)
|
||||
|
||||
else:
|
||||
status_writer.unknown_sarif_parsing_shape["extra_info"] = "type fields {} do not match tree fields {}.".format(type_fields, tree_fields)
|
||||
status_writer.csv_write(status_writer.unknown_sarif_parsing_shape)
|
||||
raise Exception("typegraph: unhandled case reached: cannot match type "
|
||||
# possibly looks like: (Struct9699)type_fields: [codeflows...] vs tree_fields: [...extra_properties]
|
||||
# in that case we need to also try the Struct4055 signature here
|
||||
if "codeFlows" in type_fields:
|
||||
_destructure_dict(typegraph, "Struct4055", tree)
|
||||
else:
|
||||
status_writer.unknown_sarif_parsing_shape["extra_info"] = "type fields {} do not match tree fields {}.".format(type_fields, tree_fields)
|
||||
status_writer.csv_write(status_writer.unknown_sarif_parsing_shape)
|
||||
raise Exception("typegraph: unhandled case reached: cannot match type "
|
||||
"fields {} to tree fields {}. Data is invalid."
|
||||
.format(type_fields, tree_fields))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user