From 30e3dd3a37d9885ee1298c640a133d1c3fb6a1ad Mon Sep 17 00:00:00 2001
From: Michael Hohn
Date: Fri, 29 Apr 2022 22:39:25 -0700
Subject: [PATCH] Replace internal ids with snowflake ids before writing tables

---
 bin/sarif-extract-multi | 74 +++++++++++++++++++++++++++++++++--------
 1 file changed, 61 insertions(+), 13 deletions(-)

diff --git a/bin/sarif-extract-multi b/bin/sarif-extract-multi
index d6a6a01..404aade 100755
--- a/bin/sarif-extract-multi
+++ b/bin/sarif-extract-multi
@@ -1,15 +1,17 @@
 #!/usr/bin/env python
 """ Extract data from multiple sarif files in table form.
 """
-import argparse
-import json
-import pathlib
+from dataclasses import dataclass
 from sarif_cli import signature, signature_multi
 from sarif_cli import typegraph
-from dataclasses import dataclass
+from sarif_cli import snowflake_id
+import argparse
+import dataclasses as dc
+import json
+import pandas as pd
+import pathlib
 import sarif_cli.table_joins as tj
 import sys
-import pandas as pd
 
 #
 # Start processing
@@ -88,6 +90,57 @@ bt.project = tj.joins_for_project(tgraph) # multi-sarif only
 bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, sf_2683)
 bt.rules = tj.joins_for_rules(tgraph)
 
+#
+# Replace the remaining internal ids with snowflake ids
+#
+flakegen = snowflake_id.Snowflake(0)
+
+columns_to_reindex = {
+    # template from {field.name : [''] for field in dc.fields(bt)}
+    'artifacts': ['artifacts_id'],
+    'codeflows': ['codeflow_id'],
+    'kind_pathproblem': ['results_array_id', 'codeFlows_id'],
+    'kind_problem': ['results_array_id'],
+    'project': ['artifacts', 'results', 'rules'],
+    'relatedLocations': ['struct_id'],
+    'rules': ['rules_array_id']}
+
+_id_to_flake = {}
+def _get_flake(id):
+    flake = _id_to_flake.get(id, -1)
+    if flake == -1:
+        flake = flakegen.next()
+        _id_to_flake[id] = flake
+    return flake
+
+#
+# Cleaner, but makes far too many copies; keep the loop below
+#
+# def _reindex(table, colname):
+#     newtable = table.astype({ colname : 'uint64'}).reset_index(drop=True)
+#     for i in range(0, len(newtable)):
+#         newtable.loc[i, colname] = _get_flake(newtable.loc[i, colname])
+#     return newtable
+#
+# for field in dc.fields(bt):
+#     table_name = field.name
+#     for colname in columns_to_reindex[table_name]:
+#         setattr(bt, field.name, _reindex(getattr(bt, field.name), colname))
+#
+for field in dc.fields(bt):
+    table_name = field.name
+    table = getattr(bt, field.name)
+    # Turn all snowflake columns into uint64 and reset indexing to 0..len(table)
+    newtable = table.astype(
+        { colname : 'uint64'
+          for colname in columns_to_reindex[table_name]}
+    ).reset_index(drop=True)
+    # Swap ids for flakes
+    for colname in columns_to_reindex[table_name]:
+        for i in range(0, len(newtable)):
+            newtable.loc[i, colname] = _get_flake(newtable.loc[i, colname])
+    # Replace the table
+    setattr(bt, field.name, newtable)
 #
 # Write output
 #
@@ -96,11 +149,6 @@ p.mkdir(exist_ok=True)
 def write(path, frame):
     with p.joinpath(path + ".csv").open(mode='wb') as fh:
         frame.to_csv(fh, index=False)
-write('artifacts', bt.artifacts)
-write('codeflows', bt.codeflows)
-write('kind_pathproblem', bt.kind_pathproblem)
-write('kind_problem', bt.kind_problem)
-write('project', bt.project)
-write('relatedLocations', bt.relatedLocations)
-write('rules', bt.rules)
-
+for field in dc.fields(bt):
+    table = getattr(bt, field.name)
+    write(field.name, table)
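
Note on the id generator: the patch calls sarif_cli.snowflake_id.Snowflake(0) and
flakegen.next() but does not include that module, so only the interface is visible
here. The sketch below is a hypothetical stand-in, not the sarif_cli implementation:
it assumes the conventional snowflake layout (millisecond timestamp | machine id |
per-millisecond sequence) packed into a 64-bit integer, which is consistent with the
uint64 casts the patch applies to the flake columns.

    import threading
    import time

    class Snowflake:
        """Hypothetical stand-in for sarif_cli.snowflake_id.Snowflake."""
        def __init__(self, machine_id):
            self.machine_id = machine_id & 0x3FF  # 10-bit machine id
            self.sequence = 0                     # 12-bit per-millisecond counter
            self.last_ms = -1
            self._lock = threading.Lock()

        def next(self):
            # Pack 41 bits of timestamp, 10 bits of machine id, and 12 bits
            # of sequence into one 63-bit integer (fits in uint64).
            # Sequence overflow within a single millisecond is not handled
            # in this sketch.
            with self._lock:
                now = int(time.time() * 1000)
                if now == self.last_ms:
                    self.sequence = (self.sequence + 1) & 0xFFF
                else:
                    self.sequence = 0
                    self.last_ms = now
                return (now << 22) | (self.machine_id << 12) | self.sequence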
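
A possible follow-up, given the tradeoff the patch records next to the commented-out
_reindex helper ("cleaner, but makes far too many copies; keep the loop below"):
pandas' Series.map can replace the per-cell .loc writes with one vectorized pass per
column, while keeping the single astype/reset_index per table that the kept loop
already has. A sketch against the same bt, dc, columns_to_reindex, and _get_flake
names used in the patch:

    for field in dc.fields(bt):
        table_name = field.name
        # One dtype conversion and reindex per table, as in the patch
        newtable = getattr(bt, field.name).astype(
            {colname: 'uint64' for colname in columns_to_reindex[table_name]}
        ).reset_index(drop=True)
        for colname in columns_to_reindex[table_name]:
            # Build each replacement column in one vectorized pass
            newtable[colname] = newtable[colname].map(_get_flake)
        setattr(bt, field.name, newtable)

Whether this wins in practice depends on table sizes: map still calls the memoized
_get_flake once per row, but it avoids repeated scalar .loc indexing, and the
memoization keeps a given internal id mapped to the same flake across all tables,
so cross-table joins on these columns remain valid.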