mirror of https://github.com/hohn/sarif-cli.git
synced 2025-12-16 17:23:03 +01:00
Rework project name format and project id format
@@ -38,6 +38,7 @@ def load(fname):
     try:
         content = json.load(fp)
     except json.decoder.JSONDecodeError as err:
+        # TODO knewbury error handling
         logging.error('Error reading from {}: {}: line {}, column {}'
                       .format(args.file, err.msg, err.lineno, err.colno))
         sys.exit(1)
@@ -61,8 +62,16 @@ sarif_struct = signature.fillsig(args, sarif_struct, context)
 #
 # Use reference type graph (signature) to traverse sarif and attach values to tables
 #
+# try:
+#     tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
+#     typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)
+# except json.decoder.JSONDecodeError as err:
+#     logging.error('Error reading from {}: {}: line {}, column {}'
+#                   .format(args.file, err.msg, err.lineno, err.colno))
+#     sys.exit(1)
 tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
 typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)
+
 #
 # Form output tables
 #
@@ -125,9 +134,11 @@ bt.rules = tj.joins_for_rules(tgraph)
 #
 # Form scan tables
 #
+# joins for projects has to happen first as it backfills the guess about the project_id
+scantabs.projects = st.joins_for_projects(bt, external_info, scantabs)
 scantabs.results = st.joins_for_results(bt, external_info)
 scantabs.scans = st.joins_for_scans(bt, external_info, scantabs)
-scantabs.projects = st.joins_for_projects(bt, external_info, scantabs)
 
 
+
 #
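The comment added above records an ordering constraint: joins_for_projects backfills the project_id that the runner guessed from the sarif file name, and joins_for_scans then reads the corrected id. A minimal sketch of that backfill pattern, using hypothetical stand-in functions and values rather than the project's real code:

    # Sketch: an early step corrects a shared field that later steps read,
    # so it has to run first.
    scan_spec = {"project_id": 1111}               # runner's guess from the file name

    def joins_for_projects(spec):
        spec["project_id"] = 2222                  # backfill from the repository URL

    def joins_for_scans(spec):
        return {"project_id": spec["project_id"]}  # must see the corrected id

    joins_for_projects(scan_spec)                  # run first
    assert joins_for_scans(scan_spec)["project_id"] == 2222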
@@ -80,13 +80,7 @@ import os
 import sys
 import pickle
 from datetime import datetime
-from hashlib import blake2b
-
-def hash_unique(item_to_hash, size):
-    h = blake2b(digest_size = size)
-    h.update(item_to_hash.encode())
-    return abs(int.from_bytes(h.digest(), byteorder='big'))
-
+from sarif_cli import hash
 #
 # Handle arguments
 #
@@ -147,9 +141,18 @@ for path in paths:
     #
     # Scan specification
     #
+    # scan id as hash of sarif file contents
+    with open(path, 'rb') as f:
+        data = f.read()
+    scan_id = hash.hash_unique(data)
+
     scan_spec = {
-        "project_id": hash_unique(project, 8), # pd.UInt64Dtype()
-        "scan_id": hash_unique(path, 8), # pd.Int64Dtype()
+        # assuming sarif file names are like <org>/<repo>
+        # however this will be replaced down the line with the repoURI if possible
+        # still, leaving here in case later versions of this tool do not rely on that property being there
+        # in that case this will be the best guess
+        "project_id": hash.hash_unique((project+"-"+component).encode()), # pd.UInt64Dtype()
+        "scan_id": scan_id, # pd.Int64Dtype()
         "sarif_file_name": path, # pd.StringDtype()
     }
 
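The comments in this hunk spell out the fallback: the runner guesses the project from a sarif file name shaped like <org>/<repo> and hashes "<org>-<repo>" for the project_id until the repository URL can replace it. A minimal sketch of that guess, with a hypothetical path string:

    from sarif_cli import hash

    path = "some-org/some-repo"                    # hypothetical <org>/<repo>-style name
    project, component = path.rstrip().split('/')
    # same "<org>-<repo>" hashing as in scan_spec above
    project_id = hash.hash_unique((project + "-" + component).encode())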
sarif_cli/hash.py (new file, 7 lines)
@@ -0,0 +1,7 @@
+from hashlib import blake2b
+
+# takes a bytes object and outputs an 8 byte hash
+def hash_unique(item_to_hash):
+    h = blake2b(digest_size = 8)
+    h.update(item_to_hash)
+    return abs(int.from_bytes(h.digest(), byteorder='big'))
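The shared helper hashes raw bytes, so string inputs are encoded first while file contents can be passed as read; a short usage sketch (the file name is hypothetical):

    from sarif_cli import hash

    # strings must be encoded before hashing
    project_id = hash.hash_unique("some-org-some-repo".encode())

    # file contents are already bytes, as used for the scan id above
    with open("results.sarif", 'rb') as f:
        scan_id = hash.hash_unique(f.read())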
@@ -7,6 +7,8 @@ import logging
 import numpy
 import pandas as pd
 import re
 import sys
+from sarif_cli import hash
+
 class ZeroResults(Exception):
     pass
@@ -81,10 +83,16 @@ def joins_for_projects(basetables, external_info, scantables):
     # (git|https)://*/org/project.*
     # use the org/project part as the project_name.
     #
+    # TODO knewbury error handling for if the signature is slotted out?
     repo_url = b.project.repositoryUri[0]
     url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/([^/.]+).*', repo_url)
     if url_parts:
-        project_name = f"{url_parts.group(2)}/{url_parts.group(3)}"
+        project_name = f"{url_parts.group(2)}-{url_parts.group(3)}"
+        project, component = e.sarif_file_name.rstrip().split('/')
+        # if the runners guess from the filename was bad, replace with real info
+        # and continue to use that scanspec to pass that around
+        if project_name != project+"-"+component:
+            e.project_id = hash.hash_unique(project_name.encode())
     else:
         project_name = pd.NA
 
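The regex above extracts the org and repo segments from the repository URL, and the rename joins them with "-" instead of "/" so the name matches the runner's "<org>-<repo>" guess. A worked example with a hypothetical URL:

    import re

    repo_url = "https://github.com/some-org/some-repo.git"  # hypothetical URL
    url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/([^/.]+).*', repo_url)
    # group(2) is the org, group(3) the repo name (stops at the first '.')
    project_name = f"{url_parts.group(2)}-{url_parts.group(3)}"
    assert project_name == "some-org-some-repo"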
@@ -131,7 +139,6 @@ def joins_for_scans(basetables, external_info, scantables):
         "results_count" : scantables.results.shape[0],
         "rules_count" : len(b.rules['id'].unique()),
     },index=[0])
-
     # Force all column types to ensure correct writing and type checks on reading.
     res1 = res.astype(ScanTablesTypes.scans).reset_index(drop=True)
     return res1
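The astype call pins every column to the declared schema so the tables write and re-read with stable types. A minimal sketch of that pattern, with a hypothetical dtype map standing in for ScanTablesTypes.scans:

    import pandas as pd

    # hypothetical dtype map in the style of ScanTablesTypes.scans
    scans_types = {"results_count": pd.Int64Dtype(), "rules_count": pd.Int64Dtype()}
    res = pd.DataFrame({"results_count": [10], "rules_count": [3]}, index=[0])
    res1 = res.astype(scans_types).reset_index(drop=True)
    assert res1.dtypes["results_count"] == pd.Int64Dtype()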
@@ -158,7 +165,7 @@ def joins_for_results(basetables, external_info):
         res = pd.concat(stack)
-    else:
+    if stack == []:
         # TODO: The case of zero results should be handled at sarif read time
         # TODO knewbury to error handling
         logging.warning("Zero problem/path_problem results found in sarif "
                         "file but processing anyway.")
         res = tables[0]