From 582d93313003365250e5867c2e2039b374a6d6e8 Mon Sep 17 00:00:00 2001
From: Michael Hohn <hohn@github.com>
Date: Thu, 1 Aug 2024 14:30:40 -0700
Subject: [PATCH] Improve example data layout and README

---
 client/qldbtools/README.md                    | 116 +++++++++++++-----
 .../qldbtools/qldbtools/session-4-unique.py   |   4 +-
 .../qldbtools/session-generate-selection.py   |   2 +-
 .../qldbtools/session-populate-minio.py       |   2 +-
 .../qldbtools/session-post-refine-info.py     |   2 +-
 client/qldbtools/requirements.txt             |   3 +-
 6 files changed, 89 insertions(+), 40 deletions(-)

diff --git a/client/qldbtools/README.md b/client/qldbtools/README.md
index c923671..43625cb 100644
--- a/client/qldbtools/README.md
+++ b/client/qldbtools/README.md
@@ -1,6 +1,69 @@
-# qldbtools
+# Introduction to qldbtools
 
-qldbtools is a Python package for working with CodeQL databases
+`qldbtools` is a Python package for selecting sets of CodeQL databases to work on.
+It uses a (pandas) dataframe in the implementation, but all result sets are
+available as CSV files to provide flexibility in the tools you want to work with.
+
+The rationale is simple: When working with larger collections of CodeQL databases,
+spread over time, languages, etc., many criteria can be used to select the subset
+of interest.  This package addresses that aspect of MRVA (multi-repository
+variant analysis). 
+
+For example, consider this scenario from an enterprise.  We have 10,000
+repositories in C/C++, 5,000 in Python.  We build CodeQL databases weekly and keep
+the last 2 years' worth.
+This means for the last 2 years there are
+
+    (10000 + 5000) * 52 * 2 = 1560000
+
+databases to select from for a single MRVA run.  1.5 million rows are readily
+handled by a pandas (or R) dataframe.  
+
+The full list of criteria currently encoded via the columns is
+
+-   owner
+-   name
+-   CID
+-   cliVersion
+-   creationTime
+-   language
+-   sha -- git commit sha of the code the CodeQL database is built against
+-   baselineLinesOfCode
+-   path
+-   db_lang
+-   db_lang_displayName
+-   db_lang_file_count
+-   db_lang_linesOfCode
+-   ctime
+-   primaryLanguage
+-   finalised
+-   left_index
+-   size
+
+The minimal criteria needed to distinguish databases in the above scenario are
+
+-   cliVersion
+-   creationTime
+-   language
+-   sha
+
+These are encoded in the single custom id column 'CID'.
+
+Thus, a database can be fully specified using an (owner,name,CID) tuple and this is
+encoded in the names used by the MRVA server and clients. The selection of
+databases can of course be done using the whole table.
+
+For an example of the workflow, see [section 'command line use'](#command-line-use).
+
+
+
+A small sample of a full table:
+
+|    | owner    | name           | CID    | cliVersion   | creationTime                     | language   | sha                                      |   baselineLinesOfCode | path                                                                                                                          | db_lang     | db_lang_displayName   |   db_lang_file_count |   db_lang_linesOfCode | ctime                      | primaryLanguage   |   finalised |   left_index |     size |
+|---:|:---------|:---------------|:-------|:-------------|:---------------------------------|:-----------|:-----------------------------------------|----------------------:|:------------------------------------------------------------------------------------------------------------------------------|:------------|:----------------------|---------------------:|----------------------:|:---------------------------|:------------------|------------:|-------------:|---------:|
+|  0 | 1adrianb | face-alignment | 1f8d99 | 2.16.1       | 2024-02-08 14:18:20.983830+00:00 | python     | c94dd024b1f5410ef160ff82a8423141e2bbb6b4 |                  1839 | /Users/hohn/work-gh/mrva/mrva-open-source-download/repos/1adrianb/face-alignment/code-scanning/codeql/databases/python/db.zip | python      | Python                |                   25 |                  1839 | 2024-07-24T14:09:02.187201 | python            |           1 |         1454 | 24075001 |
+|  1 | 2shou    | TextGrocery    | 9ab87a | 2.12.1       | 2023-02-17T11:32:30.863093193Z   | cpp        | 8a4e41349a9b0175d9a73bc32a6b2eb6bfb51430 |                  3939 | /Users/hohn/work-gh/mrva/mrva-open-source-download/repos/2shou/TextGrocery/code-scanning/codeql/databases/cpp/db.zip          | no-language | no-language           |                    0 |                    -1 | 2024-07-24T06:25:55.347568 | cpp               |         nan |         1403 |  3612535 |
+|  2 | 3b1b     | manim          | 76fdc7 | 2.17.5       | 2024-06-27 17:37:20.587627+00:00 | python     | 88c7e9d2c96be1ea729b089c06cabb1bd3b2c187 |                 19905 | /Users/hohn/work-gh/mrva/mrva-open-source-download/repos/3b1b/manim/code-scanning/codeql/databases/python/db.zip              | python      | Python                |                   94 |                 19905 | 2024-07-24T13:23:04.716286 | python            |           1 |         1647 | 26407541 |
 
 ## Installation
 
@@ -17,21 +80,6 @@ qldbtools is a Python package for working with CodeQL databases
                 pip install jupyterlab pandas ipython
                 pip install lckr-jupyterlab-variableinspector
 
--   Run jupyterlab
-
-                cd ~/work-gh/mrva/mrvacommander/client
-                source venv/bin/activate
-                jupyter lab &
-               
-        The variable inspector is a right-click on an open console or notebook.
-       
-        The `jupyter` command produces output including
-       
-                Jupyter Server 2.14.1 is running at:
-                http://127.0.0.1:8888/lab?token=4c91308819786fe00a33b76e60f3321840283486457516a1
-
-        Use this to connect multiple front ends
-
 -   Local development
 
         ```bash
@@ -51,12 +99,10 @@ qldbtools is a Python package for working with CodeQL databases
 
 
 ## Use as library
+   The best way to examine the code is to start from the high-level scripts in
+   `bin/`. 
 
-```python
-import qldbtools as ql
-```
-
-## Command-line use
+## Command line use
 
    Initial information collection requires a unique file path so it can be run
    repeatedly over DB collections with the same (owner,name) but other differences
@@ -67,26 +113,28 @@ import qldbtools as ql
    - cliVersion
    - language
 
-   Those fields are collected and a single name addenum formed in
-   `bin/mc-db-refine-info`. 
+   Those fields are collected in `bin/mc-db-refine-info`. 
 
-   The command sequence, grouped by data files, is
+   An example workflow with commands grouped by data files follows.
 
-        cd ~/work-gh/mrva/mrvacommander/client/qldbtools
-        ./bin/mc-db-initial-info ~/work-gh/mrva/mrva-open-source-download > db-info-1.csv
-        ./bin/mc-db-refine-info < db-info-1.csv > db-info-2.csv
+        cd ~/work-gh/mrva/mrvacommander/client/qldbtools && mkdir -p scratch
+        ./bin/mc-db-initial-info ~/work-gh/mrva/mrva-open-source-download > scratch/db-info-1.csv
+        ./bin/mc-db-refine-info < scratch/db-info-1.csv > scratch/db-info-2.csv
        
-        ./bin/mc-db-view-info < db-info-2.csv &
-        ./bin/mc-db-unique < db-info-2.csv > db-info-3.csv
-        ./bin/mc-db-view-info < db-info-3.csv &
+        ./bin/mc-db-view-info < scratch/db-info-2.csv &
+        ./bin/mc-db-unique < scratch/db-info-2.csv > scratch/db-info-3.csv
+        ./bin/mc-db-view-info < scratch/db-info-3.csv &
 
-        ./bin/mc-db-populate-minio -n 23 < db-info-3.csv
-        ./bin/mc-db-generate-selection -n 23 vscode-selection.json gh-mrva-selection.json < db-info-3.csv 
+        ./bin/mc-db-populate-minio -n 23 < scratch/db-info-3.csv
+        ./bin/mc-db-generate-selection -n 23 \
+            scratch/vscode-selection.json \
+            scratch/gh-mrva-selection.json \
+            < scratch/db-info-3.csv 
        
        
 ## Notes
 
-   The preview-data plugin for VS Code has a bug; it displays `0` instead of
+   The `preview-data` plugin for VS Code has a bug; it displays `0` instead of
    `0e3379` for the following.  There are other entries with similar malfunction.
    
         CleverRaven,Cataclysm-DDA,0e3379,2.17.0,2024-05-08 12:13:10.038007+00:00,cpp,5ca7f4e59c2d7b0a93fb801a31138477f7b4a761,578098.0,/Users/hohn/work-gh/mrva/mrva-open-source-download/repos-2024-04-29/CleverRaven/Cataclysm-DDA/code-scanning/codeql/databases/cpp/db.zip,cpp,C/C++,1228.0,578098.0,2024-05-13T12:14:54.650648,cpp,True,4245,563435469
diff --git a/client/qldbtools/qldbtools/session-4-unique.py b/client/qldbtools/qldbtools/session-4-unique.py
index ae35e73..9ca32a6 100644
--- a/client/qldbtools/qldbtools/session-4-unique.py
+++ b/client/qldbtools/qldbtools/session-4-unique.py
@@ -5,7 +5,7 @@ import pandas as pd
 # cd ../
 
 #* Reload CSV file to continue work
-df2 = df_refined = pd.read_csv('db-info-2.csv')
+df2 = df_refined = pd.read_csv('scratch/db-info-2.csv')
 
 # Identify rows missing specific entries
 rows = ( df2['cliVersion'].isna() | 
@@ -17,7 +17,7 @@ df3 = df2[~rows]
 df3
 
 #* post-save work
-df4 = pd.read_csv('db-info-3.csv')
+df4 = pd.read_csv('scratch/db-info-3.csv')
 
 # Sort and group
 df_sorted = df4.sort_values(by=['owner', 'name', 'CID', 'creationTime'])
diff --git a/client/qldbtools/qldbtools/session-generate-selection.py b/client/qldbtools/qldbtools/session-generate-selection.py
index 9f1200e..65ac495 100644
--- a/client/qldbtools/qldbtools/session-generate-selection.py
+++ b/client/qldbtools/qldbtools/session-generate-selection.py
@@ -13,7 +13,7 @@ import numpy as np
 import importlib
 importlib.reload(utils)
 
-df0 = pd.read_csv('db-info-3.csv')
+df0 = pd.read_csv('scratch/db-info-3.csv')
 
 # Use num_entries, chosen via pseudo-random numbers
 df1 = df0.sample(n=3, random_state=np.random.RandomState(4242))
diff --git a/client/qldbtools/qldbtools/session-populate-minio.py b/client/qldbtools/qldbtools/session-populate-minio.py
index 2e1a182..df9ec45 100644
--- a/client/qldbtools/qldbtools/session-populate-minio.py
+++ b/client/qldbtools/qldbtools/session-populate-minio.py
@@ -9,7 +9,7 @@ from pathlib import Path
 #
 #* Collect the information and select subset
 #
-df = pd.read_csv('db-info-2.csv')
+df = pd.read_csv('scratch/db-info-2.csv')
 seed = 4242
 if 0:
     # Use all entries
diff --git a/client/qldbtools/qldbtools/session-post-refine-info.py b/client/qldbtools/qldbtools/session-post-refine-info.py
index 18f01df..4825678 100644
--- a/client/qldbtools/qldbtools/session-post-refine-info.py
+++ b/client/qldbtools/qldbtools/session-post-refine-info.py
@@ -4,7 +4,7 @@ import pandas as pd
 #
 #* Collect the information
 #
-df1 = pd.read_csv("db-info-2.csv")
+df1 = pd.read_csv("scratch/db-info-2.csv")
 
 # Add single uniqueness field -- CID (Cumulative ID) -- using
 # - creationTime
diff --git a/client/qldbtools/requirements.txt b/client/qldbtools/requirements.txt
index dd5d27a..1a98ef8 100644
--- a/client/qldbtools/requirements.txt
+++ b/client/qldbtools/requirements.txt
@@ -135,7 +135,7 @@ python-json-logger==2.0.7
 pytz==2024.1
 PyYAML==6.0.1
 pyzmq==26.0.3
--e git+ssh://git@github.com/advanced-security/mrvacommander.git@26dd69c9767c315a8ffb782eedf3b55eac574d45#egg=qldbtools&subdirectory=client/qldbtools
+-e git+ssh://git@github.com/advanced-security/mrvacommander.git@b7b4839fe0760287b80b8e2887c29d736c5cae33#egg=qldbtools&subdirectory=client/qldbtools
 qtstylish==0.1.5
 referencing==0.35.1
 requests==2.32.3
@@ -155,6 +155,7 @@ stack-data==0.6.3
 statsmodels==0.14.2
 strsimpy==0.2.1
 tables==3.9.2
+tabulate==0.9.0
 tenacity==8.5.0
 terminado==0.18.1
 threadpoolctl==3.5.0