Include project table in output of sarif-extract-scans; add commit_id to scans table

This commit is contained in:
Michael Hohn
2022-06-02 16:45:04 -07:00
committed by =Michael Hohn
parent fd55969b76
commit 741be0cfe1
3 changed files with 40 additions and 2 deletions

View File

@@ -36,9 +36,11 @@
python3 -m venv .venv
. .venv/bin/activate
python3 -m pip install -r requirements.txt
# Or separately:
# Or separately, for development:
pip install --upgrade pip
pip install ipython pyyaml pandas jupyter pyflakes
# Or separately, for running:
pip install pandas
#+END_SRC
"Install" for local development:

View File

@@ -88,6 +88,7 @@ class ScanTables:
# project: External table with project information
scans : pd.DataFrame
results : pd.DataFrame
projects : pd.DataFrame
columns_to_reindex : dict # (name -> name list) dict
def __init__(self): pass
scantabs = ScanTables()
@@ -124,6 +125,8 @@ bt.rules = tj.joins_for_rules(tgraph)
#
scantabs.results = st.joins_for_results(bt, external_info)
scantabs.scans = st.joins_for_scans(bt, external_info, scantabs)
scantabs.projects = st.joins_for_projects(bt, external_info, scantabs)
#
# Replace the remaining internal ids with snowflake ids
@@ -142,6 +145,7 @@ bt.columns_to_reindex = {
scantabs.columns_to_reindex = {
'scans': [],
'projects' : [],
'results': ['codeFlow_id'],
}

View File

@@ -2,8 +2,40 @@
"""
import pandas as pd
import re
from . import snowflake_id
#
# Projects table
#
def _project_name_from_url(repo_url):
    """
    Return the 'org/project' part of a repository url, or pd.NA.

    Recognized urls have the form (git|https)://<host>/<org>/<project>...
    A trailing '.git' on the project component is dropped; any other dots
    in the project name (e.g. 'socket.io') are kept.  Anything that is not
    a string, or does not match the form above, yields pd.NA.
    """
    # A missing repositoryUri shows up as None/NaN; re.match would raise
    # TypeError on those, so filter them out first.
    if not isinstance(repo_url, str):
        return pd.NA
    url_parts = re.match(r'(git|https)://[^/]+/([^/]+)/([^/]+)', repo_url)
    if not url_parts:
        return pd.NA
    org = url_parts.group(2)
    project = url_parts.group(3)
    # Only the conventional clone suffix is stripped, not everything after
    # the first dot.
    if project.endswith(".git"):
        project = project[:-len(".git")]
    if not project:
        return pd.NA
    return f"{org}/{project}"


def joins_for_projects(basetables, external_info, scantables):
    """
    Form the 'projects' table for the ScanTables dataclass

    Arguments:
    basetables     -- object whose .project DataFrame provides the
                      'repositoryUri' and 'semmle.sourceLanguage' columns
    external_info  -- object providing .project_id
    scantables     -- unused here; kept for a signature parallel to the
                      other joins_for_* functions

    Returns a single-row DataFrame (index [0]) with the columns
    id, project_name, creation_date, repo_url, primary_language,
    languages_analyzed.
    """
    project = basetables.project
    # For a repository url of the form
    #     (git|https)://*/org/project[.git]
    # use the org/project part as the project_name.
    #
    repo_url = project.repositoryUri[0]
    languages = project['semmle.sourceLanguage']
    res = pd.DataFrame(data={
        "id" : external_info.project_id,
        "project_name" : _project_name_from_url(repo_url),
        "creation_date" : pd.NA,                  # TODO: external info
        "repo_url" : repo_url,
        "primary_language" : languages[0],        # TODO: external info
        "languages_analyzed" : ",".join(languages),
    }, index=[0])
    return res
#
# Scans table
#
@@ -19,7 +51,7 @@ def joins_for_scans(basetables, external_info, scantables):
"More than one driver version found for single sarif file."
res = pd.DataFrame(data={
"id" : e.scan_id,
"commit_id" : pd.NA,
"commit_id" : b.project.revisionId[0],
"project_id" : e.project_id,
#
"db_create_start" : pd.NA,