mirror of
https://github.com/hohn/sarif-cli.git
synced 2025-12-16 17:23:03 +01:00
Update how project_id is generated
Previously, project_id generation relied on the assumption that the repositoryUri was named like <org>/<project>; now it just uses the full repositoryUri.
This commit is contained in:
@@ -130,17 +130,14 @@ scantabs = ScanTables()
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class ExternalInfo:
|
class ExternalInfo:
|
||||||
project_id : int
|
project_id: pd.UInt64Dtype()
|
||||||
scan_id : pd.UInt64Dtype()
|
scan_id : pd.UInt64Dtype()
|
||||||
sarif_file_name : str
|
sarif_file_name : str
|
||||||
ql_query_id : str
|
|
||||||
|
|
||||||
external_info = ExternalInfo(
|
external_info = ExternalInfo(
|
||||||
scan_spec["project_id"],
|
pd.NA,
|
||||||
scan_spec["scan_id"],
|
scan_spec["scan_id"],
|
||||||
scan_spec["sarif_file_name"],
|
scan_spec["sarif_file_name"]
|
||||||
# TODO: Take ql_query_id from where? (git commit id of the ql query set)
|
|
||||||
'deadbeef00',
|
|
||||||
)
|
)
|
||||||
|
|
||||||
#
|
#
|
||||||
|
|||||||
@@ -161,7 +161,6 @@ for path in paths:
|
|||||||
# Paths and components
|
# Paths and components
|
||||||
#
|
#
|
||||||
path = path.rstrip()
|
path = path.rstrip()
|
||||||
project, component = path.split('/')
|
|
||||||
#
|
#
|
||||||
# Scan specification
|
# Scan specification
|
||||||
#
|
#
|
||||||
@@ -171,30 +170,25 @@ for path in paths:
|
|||||||
scan_id = hash.hash_unique(data)
|
scan_id = hash.hash_unique(data)
|
||||||
|
|
||||||
scan_spec = {
|
scan_spec = {
|
||||||
# assuming sarif file names are like <org>/<repo>
|
|
||||||
# however this will be replaced down the line with the repoURI if possible
|
|
||||||
# still, leaving here in case later versions of this tool do not rely on that property being there
|
|
||||||
# in that case this will be the best guess
|
|
||||||
"project_id": hash.hash_unique((project+"-"+component).encode()), # pd.UInt64Dtype()
|
|
||||||
"scan_id": scan_id, # pd.Int64Dtype()
|
"scan_id": scan_id, # pd.Int64Dtype()
|
||||||
"sarif_file_name": path, # pd.StringDtype()
|
"sarif_file_name": path, # pd.StringDtype()
|
||||||
}
|
}
|
||||||
|
|
||||||
#
|
#
|
||||||
# If using outermost output directory, create project directory:
|
# If using outermost output directory, create project directory:
|
||||||
# (like <outer_dir>/<project>/*.scantables)
|
# (like <outer_dir>/<repositoryUri>/*.scantables)
|
||||||
#
|
#
|
||||||
try: os.mkdir(outer_dir+ project, mode=0o755)
|
try: os.mkdir(outer_dir+ path, mode=0o755)
|
||||||
except FileExistsError: pass
|
except FileExistsError: pass
|
||||||
|
|
||||||
scan_spec_file = os.path.join(outer_dir+ project, component + ".scanspec")
|
scan_spec_file = os.path.join(outer_dir+ path + ".scanspec")
|
||||||
with open(scan_spec_file, 'w') as fp:
|
with open(scan_spec_file, 'w') as fp:
|
||||||
json.dump(scan_spec, fp)
|
json.dump(scan_spec, fp)
|
||||||
|
|
||||||
#
|
#
|
||||||
# Table output directory
|
# Table output directory
|
||||||
#
|
#
|
||||||
output_dir = os.path.join(outer_dir+ project, component + ".scantables")
|
output_dir = os.path.join(outer_dir+ path + ".scantables")
|
||||||
try: os.mkdir(output_dir, mode=0o755)
|
try: os.mkdir(output_dir, mode=0o755)
|
||||||
except FileExistsError: pass
|
except FileExistsError: pass
|
||||||
#
|
#
|
||||||
@@ -215,8 +209,8 @@ for path in paths:
|
|||||||
with open(args.successful_runs, 'wb') as outfile:
|
with open(args.successful_runs, 'wb') as outfile:
|
||||||
pickle.dump(successful_runs, outfile)
|
pickle.dump(successful_runs, outfile)
|
||||||
|
|
||||||
scan_log_file = os.path.join(outer_dir+ project, component + ".scanlog")
|
scan_log_file = os.path.join(outer_dir+ path + ".scanlog")
|
||||||
csv_outfile = os.path.join(outer_dir+ project, component)
|
csv_outfile = os.path.join(outer_dir+ path)
|
||||||
runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, csv_outfile, "-f", args.input_signature],
|
runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, csv_outfile, "-f", args.input_signature],
|
||||||
capture_output=True, text=True)
|
capture_output=True, text=True)
|
||||||
if runstats.returncode == 0:
|
if runstats.returncode == 0:
|
||||||
|
|||||||
@@ -4,4 +4,4 @@ from hashlib import blake2b
|
|||||||
def hash_unique(item_to_hash):
|
def hash_unique(item_to_hash):
|
||||||
h = blake2b(digest_size = 8)
|
h = blake2b(digest_size = 8)
|
||||||
h.update(item_to_hash)
|
h.update(item_to_hash)
|
||||||
return abs(int.from_bytes(h.digest(), byteorder='big'))
|
return int.from_bytes(h.digest(), byteorder='big')
|
||||||
|
|||||||
@@ -79,34 +79,18 @@ def joins_for_projects(basetables, external_info):
|
|||||||
"""
|
"""
|
||||||
b = basetables; e = external_info
|
b = basetables; e = external_info
|
||||||
|
|
||||||
# if the sarif does not have versionControlProvenance, semmle.sourceLanguage, etc.
|
# if the sarif does have versionControlProvenance
|
||||||
# there is no reliable way to know the project name
|
|
||||||
# and will still need to use a guess about the project id
|
|
||||||
if "repositoryUri" in b.project:
|
if "repositoryUri" in b.project:
|
||||||
repo_url = b.project.repositoryUri[0]
|
repoUri = b.project.repositoryUri[0]
|
||||||
# For a repository url of the form
|
e.project_id = hash.hash_unique(repoUri.encode())
|
||||||
# (git|https)://*/org/project.*
|
|
||||||
# use the org/project part as the project_name.
|
|
||||||
#
|
|
||||||
url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url)
|
|
||||||
if url_parts:
|
|
||||||
project_name = f"{url_parts.group(2)}-{url_parts.group(3)}"
|
|
||||||
project, component = e.sarif_file_name.rstrip().split('/')
|
|
||||||
# if the runners guess from the filename was bad, replace with real info
|
|
||||||
# and continue to use that scanspec to pass that around
|
|
||||||
if project_name != project+"-"+component:
|
|
||||||
e.project_id = hash.hash_unique(project_name.encode())
|
|
||||||
else:
|
|
||||||
project_name = pd.NA
|
|
||||||
else:
|
else:
|
||||||
repo_url = "unknown"
|
repoUri = "unknown"
|
||||||
project_name = pd.NA
|
|
||||||
|
|
||||||
res = pd.DataFrame(data={
|
res = pd.DataFrame(data={
|
||||||
"id" : e.project_id,
|
"id" : e.project_id,
|
||||||
"project_name" : project_name,
|
"project_name" : repoUri,
|
||||||
"creation_date" : pd.Timestamp(0.0, unit='s'), # TODO: external info
|
"creation_date" : pd.Timestamp(0.0, unit='s'), # TODO: external info
|
||||||
"repo_url" : repo_url,
|
"repo_url" : repoUri,
|
||||||
"primary_language" : b.project['semmle.sourceLanguage'][0],
|
"primary_language" : b.project['semmle.sourceLanguage'][0],
|
||||||
"languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage']))
|
"languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage']))
|
||||||
}, index=[0])
|
}, index=[0])
|
||||||
|
|||||||
Reference in New Issue
Block a user