Add overview of the base tables derived from multi-sarif input; add rules.csv

The table overview is in the Jupyter notebook scripts/multi-table-overview.ipynb, which uses some formatting customizations to make an actual overview possible. The initial `project` table had far too many entries; the `rules` part is now in a separate `rules` table.
2025-12-16 09:13:04 +01:00 · 2022-03-16 16:54:14 -07:00
parent 926e083991
commit b82c620a1e
7 changed files with 4440 additions and 47 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,8 @@
 # Temporary files
 foo*
 tmp*
+.ipynb_checkpoints
+
+# MacOS index files
+.DS_Store
+
--- a/README.org
+++ b/README.org
@@ -38,7 +38,7 @@
    python3 -m pip install -r requirements.txt
    # Or separately:
    pip install --upgrade pip
-    pip install ipython pyyaml pandas
+    pip install ipython pyyaml pandas jupyter
  #+END_SRC

  "Install" for local development:
--- a/bin/sarif-extract-multi
+++ b/bin/sarif-extract-multi
@@ -6,9 +6,10 @@ import json
 import pathlib
 from sarif_cli import signature, signature_multi
 from sarif_cli import typegraph
+from dataclasses import dataclass
 import sarif_cli.table_joins as tj
 import sys
-from collections import defaultdict
+from collections import UserDict
 import pandas as pd

 #
@@ -62,27 +63,43 @@ typegraph.attach_tables(tgraph)
 # 
 # Form dataframes originally introduced by sarif-extract-tables
 # 
+@dataclass
+class BaseTables:
+    kind_problem : pd.DataFrame
+    kind_pathproblem : pd.DataFrame
+    codeflows : pd.DataFrame
+    relatedLocations : pd.DataFrame
+    project : pd.DataFrame
+    rules : pd.DataFrame
+    artifacts : pd.DataFrame
+    def __init__(self): pass
+
+bt = BaseTables()
+
 sf_2683 = tj.joins_for_sf_2683(tgraph)
-kind_problem = tj.joins_for_problem(tgraph, sf_2683)
-kind_pathproblem = tj.joins_for_path_problem(tgraph, sf_2683)
-codeflows_9799 = tj.joins_for_codeflows(tgraph, sf_2683)
-related_locations = tj.joins_for_relatedLocations(tgraph, sf_2683)
+bt.kind_problem = tj.joins_for_problem(tgraph, sf_2683)
+bt.kind_pathproblem = tj.joins_for_path_problem(tgraph, sf_2683)
+bt.codeflows = tj.joins_for_codeflows(tgraph, sf_2683)
+bt.relatedLocations = tj.joins_for_relatedLocations(tgraph, sf_2683)
 # 
 # Form the new dataframes
 # 
-project_df = tj.joins_for_project(tgraph)
-artifacts_df = tj.joins_for_artifacts(tgraph)
+bt.project = tj.joins_for_project(tgraph)
+bt.rules = tj.joins_for_rules(tgraph)
+bt.artifacts = tj.joins_for_artifacts(tgraph)
 #
 # Write output
 #
 p = pathlib.Path(args.outdir)
 p.mkdir(exist_ok=True)
 def write(path, frame):
-    with p.joinpath(path).open(mode='wb') as fh:
+    with p.joinpath(path + ".csv").open(mode='wb') as fh:
        frame.to_csv(fh, index_label='index')
-write('problem.csv', kind_problem)
-write('path-problem.csv', kind_pathproblem)
-write('codeflows.csv', codeflows_9799)
-write('related-locations.csv', related_locations)
-write('project.csv', project_df)
-write('artifacts.csv', artifacts_df)
+write('kind_problem', bt.kind_problem)
+write('kind_pathproblem', bt.kind_pathproblem)
+write('codeflows', bt.codeflows)
+write('relatedLocations', bt.relatedLocations)
+write('project', bt.project)
+write('rules', bt.rules)
+write('artifacts', bt.artifacts)
+
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,15 +1,68 @@
 appnope==0.1.2
+argon2-cffi==21.3.0
+argon2-cffi-bindings==21.2.0
+asttokens==2.0.5
+attrs==21.4.0
 backcall==0.2.0
-decorator==5.1.0
-ipython==7.28.0
-jedi==0.18.0
+beautifulsoup4==4.10.0
+bleach==4.1.0
+cffi==1.15.0
+debugpy==1.5.1
+decorator==5.1.1
+defusedxml==0.7.1
+entrypoints==0.4
+executing==0.8.3
+ipykernel==6.9.2
+ipython==8.1.1
+ipython-genutils==0.2.0
+ipywidgets==7.6.5
+jedi==0.18.1
+Jinja2==3.0.3
+jsonschema==4.4.0
+jupyter==1.0.0
+jupyter-client==7.1.2
+jupyter-console==6.4.3
+jupyter-core==4.9.2
+jupyterlab-pygments==0.1.2
+jupyterlab-widgets==1.0.2
+MarkupSafe==2.1.1
 matplotlib-inline==0.1.3
-parso==0.8.2
+mistune==0.8.4
+nbclient==0.5.13
+nbconvert==6.4.4
+nbformat==5.2.0
+nest-asyncio==1.5.4
+notebook==6.4.10
+numpy==1.22.3
+packaging==21.3
+pandas==1.4.1
+pandocfilters==1.5.0
+parso==0.8.3
 pexpect==4.8.0
 pickleshare==0.7.5
-prompt-toolkit==3.0.20
+prometheus-client==0.13.1
+prompt-toolkit==3.0.28
+psutil==5.9.0
 ptyprocess==0.7.0
-Pygments==2.10.0
+pure-eval==0.2.2
+pycparser==2.21
+Pygments==2.11.2
+pyparsing==3.0.7
+pyrsistent==0.18.1
+python-dateutil==2.8.2
+pytz==2021.3
 PyYAML==6.0
-traitlets==5.1.0
+pyzmq==22.3.0
+qtconsole==5.2.2
+QtPy==2.0.1
+Send2Trash==1.8.0
+six==1.16.0
+soupsieve==2.3.1
+stack-data==0.2.0
+terminado==0.13.3
+testpath==0.6.0
+tornado==6.1
+traitlets==5.1.1
 wcwidth==0.2.5
+webencodings==0.5.1
+widgetsnbextension==3.5.2
--- a/sarif_cli/table_joins.py
+++ b/sarif_cli/table_joins.py
@@ -6,6 +6,7 @@
    provides those for the other tables.
 """
 import pandas as pd
+from .typegraph import tagged_array_columns, tagged_struct_columns

 def joins_for_sf_2683(tgraph):
    """ 
@@ -256,31 +257,6 @@ def joins_for_project(tgraph):
        .drop(columns=['driver', 'struct_id'])
        .rename(columns={"version": "driver_version_7820", "name": "driver_name_7820"})
        # 
-        .merge(af(8754), how="left", left_on='rules', right_on='array_id', validate="1:m")
-        .drop(columns=['rules', 'array_id', 'type_at_index'])
-        .rename(columns={"value_index": "rule_value_index_8754"}) # rule index
-        #
-        .merge(sf(6818), how="left", left_on='id_or_value_at_index', right_on='struct_id', validate="1:m")
-        .drop(columns=['id_or_value_at_index', 'struct_id'])
-        .rename(columns={"id": "rule_id_6818", "name": "rule_name_6818"})
-        # 
-        .merge(sf(8581), how="left", left_on='defaultConfiguration', right_on='struct_id', validate="1:m")
-        .drop(columns=['defaultConfiguration', 'struct_id'])
-        # 
-        .merge(sf(2774), how="left", left_on='fullDescription', right_on='struct_id', validate="1:m")
-        .drop(columns=['fullDescription', 'struct_id'])
-        .rename(columns={"text": "rule_fullDescription_6818"})
-        # 
-        .merge(sf(2774), how="left", left_on='shortDescription', right_on='struct_id', validate="1:m")
-        .drop(columns=['shortDescription', 'struct_id'])
-        .rename(columns={"text": "rule_shortDescription_6818"})
-        # 
-        .merge(sf(7849), how="left", left_on='properties', right_on='struct_id', validate="1:m")
-        .drop(columns=['properties', 'struct_id'])
-        # 
-        .merge(af(7069), how="left", left_on='tags', right_on='array_id', validate="1:m")
-        .drop(columns=['tags', 'array_id', 'type_at_index'])
-        .rename(columns={"value_index": "tag_index_7069", "id_or_value_at_index": "tag_text_7069"})
        # versionControlProvenance - repositoryUri
        # The merge with af(8754) replicates versionControlProvenance, no 1:m validation
        .merge(af(5511), how="left", left_on='versionControlProvenance', right_on='array_id')
@@ -293,6 +269,50 @@ def joins_for_project(tgraph):
    )
    return project_df

+def joins_for_rules(tgraph):
+    """ 
+    Return table providing the `rules` information.
+    """
+    # Access convenience functions
+    sf = lambda num: tgraph.dataframes['Struct' + str(num)]
+    sft = lambda id: sf(id).rename(columns = tagged_struct_columns(tgraph, id))
+    af = lambda num: tgraph.dataframes['Array' + str(num)]
+    aft = lambda id: af(id).rename(columns = tagged_array_columns(tgraph, id))
+    # 
+    rules_df = (
+        aft(8754)
+        # 
+        .drop(columns=['t8754_type_at_index'])
+        # 
+        .merge(sft(6818), how="left", left_on='t8754_id_or_value_at_index',
+               right_on='t6818_struct_id',
+               validate="1:m")
+        .drop(columns=['t8754_id_or_value_at_index', 't6818_struct_id'])
+        # 
+        .merge(sft(8581), how="left", left_on='t6818_defaultConfiguration',
+               right_on='t8581_struct_id', validate="1:m") 
+        .drop(columns=['t6818_defaultConfiguration', 't8581_struct_id'])
+        # 
+        .merge(sft(2774), how="left", left_on='t6818_fullDescription',
+               right_on='t2774_struct_id', validate="1:m") 
+        .drop(columns=['t6818_fullDescription', 't2774_struct_id'])
+        .rename(columns={'t2774_text': "t6818_t2774_fullDescription"})
+        # 
+        .merge(sft(2774), how="left", left_on='t6818_shortDescription',
+               right_on='t2774_struct_id', validate="1:m") 
+        .drop(columns=['t6818_shortDescription', 't2774_struct_id'])
+        .rename(columns={"t2774_text": 't6818_t2774_shortDescription'})
+        # 
+        .merge(sft(7849), how="left", left_on='t6818_properties',
+               right_on='t7849_struct_id', validate="1:m") 
+        .drop(columns=['t6818_properties', 't7849_struct_id'])
+        # 
+        .merge(aft(7069), how="left", left_on='t7849_tags',
+               right_on='t7069_array_id', validate="1:m")  
+        .drop(columns=['t7849_tags', 't7069_array_id', 't7069_type_at_index'])
+        )
+    return rules_df
+
 def joins_for_artifacts(tgraph):
    """ 
    Return table providing the `artifacts` information.
--- a/sarif_cli/typegraph.py
+++ b/sarif_cli/typegraph.py
@@ -251,3 +251,30 @@ def attach_tables(typegraph):
            continue            # skip String etc.
        typegraph.dataframes[typedef] = pd.DataFrame(valarray, columns = colheader)
        
+
+def tagged_array_columns(typegraph, array_id):
+    """ Return a dict mapping the array column names to versions tagged with the id.
+    
+    Example:
+    The original table headers are 
+
+        array_id  value_index type_at_index  id_or_value_at_index
+
+    the tagged versions become
+
+        t8754_array_id  t8754_value_index t8754_type_at_index  t8754_id_or_value_at_index    
+    """
+    array_id = str(array_id)
+    typedef = 'Array' + array_id
+    colheader = ('array_id', 'value_index', 'type_at_index', 'id_or_value_at_index')
+    return { header:"t{:s}_{:s}".format(array_id, header) for header in colheader}
+
+
+def tagged_struct_columns(typegraph, struct_id):
+    """ Return a dict mapping the struct column names to versions tagged with the id.
+    Covers 'struct_id' plus every field in typegraph.fields; e.g. struct_id -> t6818_struct_id.
+    """
+    struct_id = str(struct_id)
+    typedef = 'Struct' + struct_id
+    colheader = ('struct_id', *typegraph.fields[typedef])
+    return { header:"t{:s}_{:s}".format(struct_id, header) for header in colheader}
--- a/scripts/multi-table-overview.ipynb
+++ b/scripts/multi-table-overview.ipynb