Rework project name format and project id format

This commit is contained in:
Kristen Newbury
2022-11-07 13:56:50 -05:00
parent 4121072088
commit 1caf03f5f0
4 changed files with 42 additions and 14 deletions

View File

@@ -38,6 +38,7 @@ def load(fname):
try:
content = json.load(fp)
except json.decoder.JSONDecodeError as err:
# TODO knewbury error handling
logging.error('Error reading from {}: {}: line {}, column {}'
.format(args.file, err.msg, err.lineno, err.colno))
sys.exit(1)
@@ -61,8 +62,16 @@ sarif_struct = signature.fillsig(args, sarif_struct, context)
#
# Use reference type graph (signature) to traverse sarif and attach values to tables
#
# try:
# tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
# typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)
# except json.decoder.JSONDecodeError as err:
# logging.error('Error reading from {}: {}: line {}, column {}'
# .format(args.file, err.msg, err.lineno, err.colno))
# sys.exit(1)
tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)
#
# Form output tables
#
@@ -125,9 +134,11 @@ bt.rules = tj.joins_for_rules(tgraph)
#
# Form scan tables
#
# joins for projects has to happen first as it backfills the guess about the project_id
scantabs.projects = st.joins_for_projects(bt, external_info, scantabs)
scantabs.results = st.joins_for_results(bt, external_info)
scantabs.scans = st.joins_for_scans(bt, external_info, scantabs)
scantabs.projects = st.joins_for_projects(bt, external_info, scantabs)
#

View File

@@ -80,13 +80,7 @@ import os
import sys
import pickle
from datetime import datetime
from hashlib import blake2b
def hash_unique(item_to_hash, size):
h = blake2b(digest_size = size)
h.update(item_to_hash.encode())
return abs(int.from_bytes(h.digest(), byteorder='big'))
from sarif_cli import hash
#
# Handle arguments
#
@@ -147,12 +141,21 @@ for path in paths:
#
# Scan specification
#
# scan id as hash of sarif file contents
with open(path, 'rb') as f:
data = f.read()
scan_id = hash.hash_unique(data)
scan_spec = {
"project_id": hash_unique(project, 8), # pd.UInt64Dtype()
"scan_id": hash_unique(path, 8), # pd.Int64Dtype()
# assuming sarif file names are like <org>/<repo>
# however this will be replaced down the line with the repoURI if possible
# still, leaving here in case later versions of this tool do not rely on that property being there
# in that case this will be the best guess
"project_id": hash.hash_unique((project+"-"+component).encode()), # pd.UInt64Dtype()
"scan_id": scan_id, # pd.Int64Dtype()
"sarif_file_name": path, # pd.StringDtype()
}
scan_spec_file = os.path.join(project, component + ".scanspec")
with open(scan_spec_file, 'w') as fp:
json.dump(scan_spec, fp)

7
sarif_cli/hash.py Normal file
View File

@@ -0,0 +1,7 @@
from hashlib import blake2b

def hash_unique(item_to_hash, size=8):
    """Return a deterministic non-negative integer hash of a bytes object.

    Parameters
    ----------
    item_to_hash : bytes
        The bytes to hash (callers pass e.g. file contents or an
        encoded project name).
    size : int, optional
        Digest size in bytes; defaults to 8 (a 64-bit value, suitable
        for a pd.UInt64Dtype() column).

    Returns
    -------
    int
        The big-endian integer value of the blake2b digest.  With the
        default signed=False, int.from_bytes is already non-negative,
        so no abs() is needed.
    """
    h = blake2b(digest_size=size)
    h.update(item_to_hash)
    return int.from_bytes(h.digest(), byteorder='big')

View File

@@ -7,6 +7,8 @@ import logging
import numpy
import pandas as pd
import re
import sys
from sarif_cli import hash
class ZeroResults(Exception):
pass
@@ -81,10 +83,16 @@ def joins_for_projects(basetables, external_info, scantables):
# (git|https)://*/org/project.*
# use the org/project part as the project_name.
#
# TODO knewbury error handling for if the signature is slotted out?
repo_url = b.project.repositoryUri[0]
url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/([^/.]+).*', repo_url)
if url_parts:
project_name = f"{url_parts.group(2)}/{url_parts.group(3)}"
project_name = f"{url_parts.group(2)}-{url_parts.group(3)}"
project, component = e.sarif_file_name.rstrip().split('/')
# if the runners guess from the filename was bad, replace with real info
# and continue to use that scanspec to pass that around
if project_name != project+"-"+component:
e.project_id = hash.hash_unique(project_name.encode())
else:
project_name = pd.NA
@@ -131,7 +139,6 @@ def joins_for_scans(basetables, external_info, scantables):
"results_count" : scantables.results.shape[0],
"rules_count" : len(b.rules['id'].unique()),
},index=[0])
# Force all column types to ensure correct writing and type checks on reading.
res1 = res.astype(ScanTablesTypes.scans).reset_index(drop=True)
return res1
@@ -158,7 +165,7 @@ def joins_for_results(basetables, external_info):
res = pd.concat(stack)
else:
if stack == []:
# TODO: The case of zero results should be handled at sarif read time
# TODO knewbury: add error handling here
logging.warning("Zero problem/path_problem results found in sarif "
"file but processing anyway.")
res = tables[0]