Update how project_id is generated

Previously this relied on the assumption that the
repositoryUri was named like <org>/<project>.
Now the full repositoryUri is used instead.
Kristen Newbury
2023-01-05 16:37:55 -05:00
parent fc2c6bac99
commit 1a915e4de8
4 changed files with 16 additions and 41 deletions
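For quick reference, a minimal sketch of the new derivation (the helper matches hash_unique in this repo; the example repositoryUri is invented):

    from hashlib import blake2b

    def hash_unique(item_to_hash):
        # 8-byte BLAKE2b digest read as an unsigned big-endian integer
        h = blake2b(digest_size=8)
        h.update(item_to_hash)
        return int.from_bytes(h.digest(), byteorder='big')

    # Invented repositoryUri, for illustration only.
    repo_uri = "https://github.com/some-org/some-project.git"

    # Before: parse <org>/<project> out of the URI and hash "org-project".
    # After: hash the full repositoryUri string directly.
    project_id = hash_unique(repo_uri.encode())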


@@ -130,17 +130,14 @@ scantabs = ScanTables()
 @dataclass
 class ExternalInfo:
-    project_id : int
+    project_id: pd.UInt64Dtype()
     scan_id : pd.UInt64Dtype()
     sarif_file_name : str
-    ql_query_id : str
 
 external_info = ExternalInfo(
-    scan_spec["project_id"],
+    pd.NA,
     scan_spec["scan_id"],
-    scan_spec["sarif_file_name"],
-    # TODO: Take ql_query_id from where? (git commit id of the ql query set)
-    'deadbeef00',
+    scan_spec["sarif_file_name"]
 )
 #
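A minimal sketch of the trimmed dataclass as it is now populated (pandas assumed as pd; the scan_id and file name below are invented):

    from dataclasses import dataclass
    import pandas as pd

    @dataclass
    class ExternalInfo:
        project_id: pd.UInt64Dtype()
        scan_id: pd.UInt64Dtype()
        sarif_file_name: str

    # project_id starts out as pd.NA and is filled in later,
    # once the repositoryUri has been read from the sarif itself.
    info = ExternalInfo(pd.NA, 1234567890, "some-org/some-repo.sarif")  # invented values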


@@ -161,7 +161,6 @@ for path in paths:
     # Paths and components
     #
     path = path.rstrip()
-    project, component = path.split('/')
     #
     # Scan specification
     #
@@ -171,30 +170,25 @@ for path in paths:
     scan_id = hash.hash_unique(data)
     scan_spec = {
-        # assuming sarif file names are like <org>/<repo>
-        # however this will be replaced down the line with the repoURI if possible
-        # still, leaving here in case later versions of this tool do not rely on that property being there
-        # in that case this will be the best guess
-        "project_id": hash.hash_unique((project+"-"+component).encode()), # pd.UInt64Dtype()
         "scan_id": scan_id, # pd.Int64Dtype()
         "sarif_file_name": path, # pd.StringDtype()
     }
     #
     # If using outermost output directory, create project directory:
-    # (like <outer_dir>/<project>/*.scantables)
+    # (like <outer_dir>/<repositoryUri>/*.scantables)
     #
-    try: os.mkdir(outer_dir+ project, mode=0o755)
+    try: os.mkdir(outer_dir+ path, mode=0o755)
     except FileExistsError: pass
-    scan_spec_file = os.path.join(outer_dir+ project, component + ".scanspec")
+    scan_spec_file = os.path.join(outer_dir+ path + ".scanspec")
     with open(scan_spec_file, 'w') as fp:
         json.dump(scan_spec, fp)
     #
     # Table output directory
     #
-    output_dir = os.path.join(outer_dir+ project, component + ".scantables")
+    output_dir = os.path.join(outer_dir+ path + ".scantables")
     try: os.mkdir(output_dir, mode=0o755)
     except FileExistsError: pass
     #
@@ -215,8 +209,8 @@ for path in paths:
     with open(args.successful_runs, 'wb') as outfile:
         pickle.dump(successful_runs, outfile)
-    scan_log_file = os.path.join(outer_dir+ project, component + ".scanlog")
-    csv_outfile = os.path.join(outer_dir+ project, component)
+    scan_log_file = os.path.join(outer_dir+ path + ".scanlog")
+    csv_outfile = os.path.join(outer_dir+ path)
     runstats = subprocess.run(['sarif-extract-scans', scan_spec_file, output_dir, csv_outfile, "-f", args.input_signature],
                               capture_output=True, text=True)
     if runstats.returncode == 0:
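With project and component gone, every per-scan artifact is now keyed by the sarif path itself; a sketch of the resulting layout (outer_dir and path values are invented):

    import os

    outer_dir = "out/"
    path = "some-org/some-repo.sarif"   # one line from the input path list

    scan_spec_file = os.path.join(outer_dir + path + ".scanspec")
    output_dir = os.path.join(outer_dir + path + ".scantables")
    scan_log_file = os.path.join(outer_dir + path + ".scanlog")
    # -> out/some-org/some-repo.sarif.scanspec
    # -> out/some-org/some-repo.sarif.scantables/
    # -> out/some-org/some-repo.sarif.scanlog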


@@ -4,4 +4,4 @@ from hashlib import blake2b
 def hash_unique(item_to_hash):
     h = blake2b(digest_size = 8)
     h.update(item_to_hash)
-    return abs(int.from_bytes(h.digest(), byteorder='big'))
+    return int.from_bytes(h.digest(), byteorder='big')
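Dropping abs() is safe here: int.from_bytes without signed=True is always non-negative, so the old wrapper was a no-op; a quick check (input chosen arbitrarily):

    from hashlib import blake2b

    h = blake2b(digest_size=8)
    h.update(b"example")               # arbitrary input, for illustration
    value = int.from_bytes(h.digest(), byteorder='big')
    assert 0 <= value < 2**64          # always fits pandas' UInt64Dtype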


@@ -79,34 +79,18 @@ def joins_for_projects(basetables, external_info):
     """
     b = basetables; e = external_info
-    # if the sarif does not have versionControlProvenance, semmle.sourceLanguage ect
-    # there is no reliable way to know the project name
-    # and will still need to use a guess about the project id
+    # if the sarif does have versionControlProvenance
     if "repositoryUri" in b.project:
-        repo_url = b.project.repositoryUri[0]
-        # For a repository url of the form
-        # (git|https)://*/org/project.*
-        # use the org/project part as the project_name.
-        #
-        url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/(.*).git', repo_url)
-        if url_parts:
-            project_name = f"{url_parts.group(2)}-{url_parts.group(3)}"
-            project, component = e.sarif_file_name.rstrip().split('/')
-            # if the runners guess from the filename was bad, replace with real info
-            # and continue to use that scanspec to pass that around
-            if project_name != project+"-"+component:
-                e.project_id = hash.hash_unique(project_name.encode())
-        else:
-            project_name = pd.NA
+        repoUri = b.project.repositoryUri[0]
+        e.project_id = hash.hash_unique(repoUri.encode())
     else:
-        repo_url = "unknown"
-        project_name = pd.NA
+        repoUri = "unknown"
     res = pd.DataFrame(data={
         "id" : e.project_id,
-        "project_name" : project_name,
+        "project_name" : repoUri,
         "creation_date" : pd.Timestamp(0.0, unit='s'), # TODO: external info
-        "repo_url" : repo_url,
+        "repo_url" : repoUri,
         "primary_language" : b.project['semmle.sourceLanguage'][0],
         "languages_analyzed" : ",".join(list(b.project['semmle.sourceLanguage']))
     }, index=[0])
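For context, repositoryUri comes from the sarif run's versionControlProvenance; a hand-written fragment (not from this repo's test data) showing the shape of the lookup:

    # Hand-written SARIF fragment, for illustration only.
    run_fragment = {
        "versionControlProvenance": [
            {"repositoryUri": "https://github.com/some-org/some-project"}
        ]
    }

    repo_uri = run_fragment["versionControlProvenance"][0]["repositoryUri"]
    # then: e.project_id = hash.hash_unique(repo_uri.encode()), as in the hunk above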