WIP: sarif-extract-scans: back to single sarif file handling, incorporate multi-file libraries

This commit is contained in:
Michael Hohn
2022-05-10 19:01:38 -07:00
committed by =Michael Hohn
parent 675a5a4008
commit b212423907
6 changed files with 264 additions and 3 deletions

bin/sarif-extract-scans Executable file

@@ -0,0 +1,174 @@
#!/usr/bin/env python
""" Extract scan data from multiple sarif files in table form.
"""
from dataclasses import dataclass
from sarif_cli import signature, signature_single
from sarif_cli import typegraph
from sarif_cli import snowflake_id
import argparse
import dataclasses as dc
import json
import logging
import pandas as pd
import pathlib
import sarif_cli.table_joins as tj
import sarif_cli.derived_joins as derived
import sys
#
# Configure logger
#
logging.basicConfig(format='%(asctime)s %(message)s')
#
# Start processing
#
parser = argparse.ArgumentParser(description='Read a scan spec and its sarif file and produce tabular output.')
parser.add_argument('file', metavar='scan-spec.json', type=str,
help="json file containing required external scan information.")
parser.add_argument('outdir', metavar='output-dir', type=str, help='output directory')
args = parser.parse_args()
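# Example invocation (file and directory names are illustrative):
#   sarif-extract-scans scan-spec.json scan-tables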
# Load meta info
def load(fname):
    with open(fname, 'rb') if fname != '-' else sys.stdin as fp:
        try:
            content = json.load(fp)
        except json.decoder.JSONDecodeError as err:
            logging.error('Error reading from {}: {}: line {}, column {}'
                          .format(fname, err.msg, err.lineno, err.colno))
            sys.exit(1)
    return content
scan_spec = load(args.file)
sarif_struct = load(scan_spec['sarif_file_name'])
#
# Preprocess raw SARIF to get smaller signature
#
context = signature.Context(
{
"string" : "String",
"int" : "Int",
"bool" : "Bool"
}
)
sarif_struct = signature.fillsig(args, sarif_struct, context)
#
# Use reference type graph (signature) to traverse sarif and attach values to tables
#
tgraph = typegraph.Typegraph(signature_single.struct_graph_2022_02_01)
typegraph.destructure(tgraph, signature_single.start_node_2022_02_01, sarif_struct)
#
# Form output tables
#
typegraph.attach_tables(tgraph)
#
# Dataframe / table collection
#
@dataclass
class BaseTables:
artifacts : pd.DataFrame
codeflows : pd.DataFrame
kind_pathproblem : pd.DataFrame
kind_problem : pd.DataFrame
project : pd.DataFrame
relatedLocations : pd.DataFrame
rules : pd.DataFrame
def __init__(self): pass
bt = BaseTables()
@dataclass
class ScanTables:
# project: External table with project information
scans : pd.DataFrame
results : pd.DataFrame
def __init__(self): pass
scantabs = ScanTables()
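# scantabs.scans and scantabs.results are meant to be filled by the derived
# joins below, which are still disabled in this WIP commit.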
#
# Add dataframes for base tables
#
sf_2683 = tj.joins_for_sf_2683(tgraph)
af_0350_location = tj.joins_for_af_0350_location(tgraph)
bt.artifacts = tj.joins_for_artifacts(tgraph)
bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683)
bt.kind_pathproblem = tj.joins_for_path_problem(tgraph, af_0350_location)
bt.kind_problem = tj.joins_for_problem(tgraph, af_0350_location)
bt.project = tj.joins_for_project_single(tgraph)
bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, sf_2683)
bt.rules = tj.joins_for_rules(tgraph)
#
# Form derived query tables
#
# XX
# scantabs.project = derived.joins_for_project(bt)
# scantabs.scans = derived.joins_for_scans(bt)
# scantabs.results = derived.joins_for_results(bt)
#
# Replace the remaining internal ids with snowflake ids
#
flakegen = snowflake_id.Snowflake(0)
columns_to_reindex = {
# template from {field.name : [''] for field in dc.fields(bt)}
'artifacts': ['artifacts_id'],
'codeflows': ['codeflow_id'],
'kind_pathproblem': ['results_array_id', 'codeFlows_id'],
'kind_problem': ['results_array_id'],
'project': ['artifacts', 'results', 'rules'],
'relatedLocations': ['struct_id'],
'rules': ['rules_array_id']}
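# Memoize the internal-id -> snowflake mapping so that every occurrence of an
# internal id, in any table, is replaced by the same flake.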
_id_to_flake = {}
def _get_flake(id):
flake = _id_to_flake.get(id, -1)
if flake == -1:
flake = flakegen.next()
_id_to_flake[id] = flake
return flake
#
# Cleaner, but makes far too many copies; keep the loop below
#
# def _reindex(table, colname):
# newtable = table.astype({ colname : 'uint64'}).reset_index(drop=True)
# for i in range(0, len(newtable)):
# newtable.loc[i, colname] = _get_flake(newtable.loc[i, colname])
# return newtable
#
# for field in dc.fields(bt):
# table_name = field.name
# for colname in columns_to_reindex[table_name]:
# setattr(bt, field.name, _reindex(getattr(bt, field.name), colname))
#
for field in dc.fields(bt):
table_name = field.name
table = getattr(bt, field.name)
# Turn all snowflake columns into uint64 and reset indexing to 0..len(table)
newtable = table.astype(
{ colname : 'uint64'
for colname in columns_to_reindex[table_name]}
).reset_index(drop=True)
# Swap ids for flakes
for colname in columns_to_reindex[table_name]:
for i in range(0, len(newtable)):
newtable.loc[i, colname] = _get_flake(newtable.loc[i, colname])
# Replace the table
setattr(bt, field.name, newtable)
#
# Write output
#
p = pathlib.Path(args.outdir)
p.mkdir(exist_ok=True)
def write(path, frame):
with p.joinpath(path + ".csv").open(mode='wb') as fh:
frame.to_csv(fh, index=False)
for field in dc.fields(bt):
table = getattr(bt, field.name)
write(field.name, table)


@@ -0,0 +1,5 @@
{
"project_id": 13243,
"scan_id": 123457,
"sarif_file_name": "2022-02-25/results.sarif"
}


@@ -0,0 +1,5 @@
{
"project_id": 13243,
"scan_id": 123456,
"sarif_file_name": "2021-12-09/results.sarif"
}


@@ -181,7 +181,8 @@
* Tables or entries to be removed
The top of the [Mar-23-2022] =projects.csv= table, enumerated below, is ad-hoc
and included in the other tables below; the information for its fields is not
yet collected so it can be discarded.
#+BEGIN_SRC text
==> project-meta.csv <==
creation_date
@@ -196,6 +197,17 @@
tool_version
#+END_SRC
This information was used to expand the sarif tree (see Struct3452 and Array7481
in typegraph-multi-with-tables.pdf and the code). In retrospect, that was a
poor choice. All additional information needed can be represented by one or
more tables, so the sarif-extract* tools do exactly that after commit 30e3dd3a3.
The minimal information required to drive the sarif-to-table conversion is
| project_id | 13243 | |
| scan_id | 123456 | |
| sarif_file_name | "2021-12-09/results.sarif" | |
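As a concrete example (file and directory names here are illustrative), the spec
is saved as a small json file and passed to sarif-extract-scans together with an
output directory for the generated csv tables:
#+BEGIN_SRC text
sarif-extract-scans scan-spec.json scan-tables
#+END_SRC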
* New tables to be exported
This section enumerates new tables intended for reporting infrastructure.


@@ -305,7 +305,7 @@ def joins_for_relatedLocations(tgraph, sf_2683):
def joins_for_project(tgraph):
    """
    Return table providing the `project` information for sarif-extract-multi.
    """
    # Access convenience functions
    sf = lambda num: tgraph.dataframes['Struct' + str(num)]
@@ -368,6 +368,64 @@ def joins_for_project(tgraph):
    )
    return project_df_1
def joins_for_project_single(tgraph):
"""
Return table providing the `project` information for sarif-extract-scans
"""
# Access convenience functions
sf = lambda num: tgraph.dataframes['Struct' + str(num)]
af = lambda num: tgraph.dataframes['Array' + str(num)]
#
project_df = (
sf(6787)
.rename(columns={"version": "version_6787", "struct_id": "struct_id_6787"})
#
.merge(af('0177'), how="left", left_on='runs', right_on='array_id',
validate="1:m")
.drop(columns=['runs', 'array_id', 'type_at_index'])
.rename(columns={"value_index": "value_index_0177"})
#
.merge(sf(3388), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
.drop(columns=['id_or_value_at_index', 'struct_id'])
#
# .merge(af(7069), how="left", left_on='newlineSequences', right_on='array_id',
# validate="1:m")
# .drop(columns=['newlineSequences', 'array_id', 'type_at_index'])
.drop(columns=['newlineSequences'])
#
.merge(sf(9543), how="left", left_on='properties', right_on='struct_id', validate="1:m")
.drop(columns=['properties', 'struct_id'])
#
# tool - driver - rules - defaultConfiguration - ( properties - tags )
#
.merge(sf(8972), how="left", left_on='tool', right_on='struct_id', validate="1:m")
.drop(columns=['tool', 'struct_id'])
#
.merge(sf(7820), how="left", left_on='driver', right_on='struct_id', validate="1:m")
.drop(columns=['driver', 'struct_id'])
.rename(columns={"version": "driver_version_7820", "name": "driver_name_7820"})
#
.merge(af(5511), how="left", left_on='versionControlProvenance', right_on='array_id')
.drop(columns=['versionControlProvenance', 'array_id', 'type_at_index'])
.rename(columns={"value_index": "versionControl_value_index_5511"})
#
.merge(sf(3081), how="left", left_on='id_or_value_at_index', right_on='struct_id')
.drop(columns=['id_or_value_at_index', 'struct_id'])
#
)
# Keep columns of interest
project_df_1 = (
project_df
.drop(columns=['struct_id_6787', 'versionControl_value_index_5511'])
.rename({
'version_6787': 'sarif_version',
'value_index_0177': 'run_index',
'driver_name_7820': 'driver_name',
'driver_version_7820': 'driver_version',
}, axis='columns')
)
return project_df_1
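# Note: joins_for_project_single backs bt.project in bin/sarif-extract-scans;
# joins_for_project above remains the variant used by sarif-extract-multi.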
def joins_for_rules(tgraph):
    """
    Return table providing the `rules` information.


@@ -8,6 +8,7 @@ This file also contains some type graph reference values; these may be moved out
separate files at some point.
"""
from dataclasses import dataclass
import logging
from typing import Any, Dict, List, Tuple, Union
import pandas as pd
@@ -160,13 +161,19 @@ def _destructure_dict(typegraph: Typegraph, node, tree):
    elif set(tree_fields).issuperset(set(type_fields)):
        # Log a warning
        # log.warning("XX: Tree has unrecognized fields")
        logging.warning('Input tree has unrecognized fields, collecting only '
                        'known entries: {}'.format(tree))
        logging.warning('tree fields: {} type fields: {}'
                        .format(tree_fields, type_fields))
        _destructure_dict_1(typegraph, node, tree)
    elif set(tree_fields).issubset(set(type_fields)):
        raise MissingFieldException("XX: (Sub)tree is missing fields required by typedef")
    else:
        raise Exception("typegraph: unhandled case reached: cannot match type "
                        "fields {} to tree fields {}. Data is invalid."
                        .format(type_fields, tree_fields))
def _destructure_list(typegraph, node: str, tree: List):