From 582d93313003365250e5867c2e2039b374a6d6e8 Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Thu, 1 Aug 2024 14:30:40 -0700 Subject: [PATCH] Improve example data layout and README --- client/qldbtools/README.md | 116 +++++++++++++----- .../qldbtools/qldbtools/session-4-unique.py | 4 +- .../qldbtools/session-generate-selection.py | 2 +- .../qldbtools/session-populate-minio.py | 2 +- .../qldbtools/session-post-refine-info.py | 2 +- client/qldbtools/requirements.txt | 3 +- 6 files changed, 89 insertions(+), 40 deletions(-) diff --git a/client/qldbtools/README.md b/client/qldbtools/README.md index c923671..43625cb 100644 --- a/client/qldbtools/README.md +++ b/client/qldbtools/README.md @@ -1,6 +1,69 @@ -# qldbtools +# Introduction to qldbtools -qldbtools is a Python package for working with CodeQL databases +`qldbtools` is a Python package for selecting sets of CodeQL databases to work on. +It uses a (pandas) dataframe in the implementation, but all result sets are +available as CSV files to provide flexibility in the tools you want to work with. + +The rationale is simple: When working with larger collections of CodeQL databases, +spread over time, languages, etc., many criteria can be used to select the subset +of interest. This package addresses that aspect of MRVA (multi-repository +variant analysis). + +For example, consider this scenario from an enterprise. We have 10,000 +repositories in C/C++, 5,000 in Python. We build CodeQL databases weekly and keep +the last 2 years' worth. +This means for the last 2 years there are + + (10000 + 5000) * 52 * 2 = 1560000 + +databases to select from for a single MRVA run. 1.5 million rows are readily +handled by a pandas (or R) dataframe.
+ +The full list of criteria currently encoded via the columns is + +- owner +- name +- CID +- cliVersion +- creationTime +- language +- sha -- git commit sha of the code the CodeQL database is built against +- baselineLinesOfCode +- path +- db_lang +- db_lang_displayName +- db_lang_file_count +- db_lang_linesOfCode +- ctime +- primaryLanguage +- finalised +- left_index +- size + +The minimal criteria needed to distinguish databases in the above scenario are + +- cliVersion +- creationTime +- language +- sha + +These are encoded in the single custom id column 'CID'. + +Thus, a database can be fully specified using a (owner,name,CID) tuple and this is +encoded in the names used by the MRVA server and clients. The selection of +databases can of course be done using the whole table. + +For an example of the workflow, see [section 'command line use'](#command-line-use). + + + +A small sample of a full table: + +| | owner | name | CID | cliVersion | creationTime | language | sha | baselineLinesOfCode | path | db_lang | db_lang_displayName | db_lang_file_count | db_lang_linesOfCode | ctime | primaryLanguage | finalised | left_index | size | +|---:|:---------|:---------------|:-------|:-------------|:---------------------------------|:-----------|:-----------------------------------------|----------------------:|:------------------------------------------------------------------------------------------------------------------------------|:------------|:----------------------|---------------------:|----------------------:|:---------------------------|:------------------|------------:|-------------:|---------:| +| 0 | 1adrianb | face-alignment | 1f8d99 | 2.16.1 | 2024-02-08 14:18:20.983830+00:00 | python | c94dd024b1f5410ef160ff82a8423141e2bbb6b4 | 1839 | /Users/hohn/work-gh/mrva/mrva-open-source-download/repos/1adrianb/face-alignment/code-scanning/codeql/databases/python/db.zip | python | Python | 25 | 1839 | 2024-07-24T14:09:02.187201 | python | 1 | 1454 | 24075001 | +| 1 
| 2shou | TextGrocery | 9ab87a | 2.12.1 | 2023-02-17T11:32:30.863093193Z | cpp | 8a4e41349a9b0175d9a73bc32a6b2eb6bfb51430 | 3939 | /Users/hohn/work-gh/mrva/mrva-open-source-download/repos/2shou/TextGrocery/code-scanning/codeql/databases/cpp/db.zip | no-language | no-language | 0 | -1 | 2024-07-24T06:25:55.347568 | cpp | nan | 1403 | 3612535 | +| 2 | 3b1b | manim | 76fdc7 | 2.17.5 | 2024-06-27 17:37:20.587627+00:00 | python | 88c7e9d2c96be1ea729b089c06cabb1bd3b2c187 | 19905 | /Users/hohn/work-gh/mrva/mrva-open-source-download/repos/3b1b/manim/code-scanning/codeql/databases/python/db.zip | python | Python | 94 | 19905 | 2024-07-24T13:23:04.716286 | python | 1 | 1647 | 26407541 | ## Installation @@ -17,21 +80,6 @@ qldbtools is a Python package for working with CodeQL databases pip install jupyterlab pandas ipython pip install lckr-jupyterlab-variableinspector -- Run jupyterlab - - cd ~/work-gh/mrva/mrvacommander/client - source venv/bin/activate - jupyter lab & - - The variable inspector is a right-click on an open console or notebook. - - The `jupyter` command produces output including - - Jupyter Server 2.14.1 is running at: - http://127.0.0.1:8888/lab?token=4c91308819786fe00a33b76e60f3321840283486457516a1 - - Use this to connect multiple front ends - - Local development ```bash @@ -51,12 +99,10 @@ qldbtools is a Python package for working with CodeQL databases ## Use as library + The best way to examine the code is starting from the high-level scripts in + `bin/`. -```python -import qldbtools as ql -``` - -## Command-line use +## Command line use Initial information collection requires a unique file path so it can be run repeatedly over DB collections with the same (owner,name) but other differences @@ -67,26 +113,28 @@ import qldbtools as ql - cliVersion - language - Those fields are collected and a single name addenum formed in - `bin/mc-db-refine-info`. + Those fields are collected in `bin/mc-db-refine-info`. 
- The command sequence, grouped by data files, is + An example workflow with commands grouped by data files follows. - cd ~/work-gh/mrva/mrvacommander/client/qldbtools - ./bin/mc-db-initial-info ~/work-gh/mrva/mrva-open-source-download > db-info-1.csv - ./bin/mc-db-refine-info < db-info-1.csv > db-info-2.csv + cd ~/work-gh/mrva/mrvacommander/client/qldbtools && mkdir -p scratch + ./bin/mc-db-initial-info ~/work-gh/mrva/mrva-open-source-download > scratch/db-info-1.csv + ./bin/mc-db-refine-info < scratch/db-info-1.csv > scratch/db-info-2.csv - ./bin/mc-db-view-info < db-info-2.csv & - ./bin/mc-db-unique < db-info-2.csv > db-info-3.csv - ./bin/mc-db-view-info < db-info-3.csv & + ./bin/mc-db-view-info < scratch/db-info-2.csv & + ./bin/mc-db-unique < scratch/db-info-2.csv > scratch/db-info-3.csv + ./bin/mc-db-view-info < scratch/db-info-3.csv & - ./bin/mc-db-populate-minio -n 23 < db-info-3.csv - ./bin/mc-db-generate-selection -n 23 vscode-selection.json gh-mrva-selection.json < db-info-3.csv + ./bin/mc-db-populate-minio -n 23 < scratch/db-info-3.csv + ./bin/mc-db-generate-selection -n 23 \ + scratch/vscode-selection.json \ + scratch/gh-mrva-selection.json \ + < scratch/db-info-3.csv ## Notes - The preview-data plugin for VS Code has a bug; it displays `0` instead of + The `preview-data` plugin for VS Code has a bug; it displays `0` instead of `0e3379` for the following. There are other entries with similar malfunction. 
CleverRaven,Cataclysm-DDA,0e3379,2.17.0,2024-05-08 12:13:10.038007+00:00,cpp,5ca7f4e59c2d7b0a93fb801a31138477f7b4a761,578098.0,/Users/hohn/work-gh/mrva/mrva-open-source-download/repos-2024-04-29/CleverRaven/Cataclysm-DDA/code-scanning/codeql/databases/cpp/db.zip,cpp,C/C++,1228.0,578098.0,2024-05-13T12:14:54.650648,cpp,True,4245,563435469 diff --git a/client/qldbtools/qldbtools/session-4-unique.py b/client/qldbtools/qldbtools/session-4-unique.py index ae35e73..9ca32a6 100644 --- a/client/qldbtools/qldbtools/session-4-unique.py +++ b/client/qldbtools/qldbtools/session-4-unique.py @@ -5,7 +5,7 @@ import pandas as pd # cd ../ #* Reload CSV file to continue work -df2 = df_refined = pd.read_csv('db-info-2.csv') +df2 = df_refined = pd.read_csv('scratch/db-info-2.csv') # Identify rows missing specific entries rows = ( df2['cliVersion'].isna() | @@ -17,7 +17,7 @@ df3 = df2[~rows] df3 #* post-save work -df4 = pd.read_csv('db-info-3.csv') +df4 = pd.read_csv('scratch/db-info-3.csv') # Sort and group df_sorted = df4.sort_values(by=['owner', 'name', 'CID', 'creationTime']) diff --git a/client/qldbtools/qldbtools/session-generate-selection.py b/client/qldbtools/qldbtools/session-generate-selection.py index 9f1200e..65ac495 100644 --- a/client/qldbtools/qldbtools/session-generate-selection.py +++ b/client/qldbtools/qldbtools/session-generate-selection.py @@ -13,7 +13,7 @@ import numpy as np import importlib importlib.reload(utils) -df0 = pd.read_csv('db-info-3.csv') +df0 = pd.read_csv('scratch/db-info-3.csv') # Use num_entries, chosen via pseudo-random numbers df1 = df0.sample(n=3, random_state=np.random.RandomState(4242)) diff --git a/client/qldbtools/qldbtools/session-populate-minio.py b/client/qldbtools/qldbtools/session-populate-minio.py index 2e1a182..df9ec45 100644 --- a/client/qldbtools/qldbtools/session-populate-minio.py +++ b/client/qldbtools/qldbtools/session-populate-minio.py @@ -9,7 +9,7 @@ from pathlib import Path # #* Collect the information and select subset # -df = 
pd.read_csv('db-info-2.csv') +df = pd.read_csv('scratch/db-info-2.csv') seed = 4242 if 0: # Use all entries diff --git a/client/qldbtools/qldbtools/session-post-refine-info.py b/client/qldbtools/qldbtools/session-post-refine-info.py index 18f01df..4825678 100644 --- a/client/qldbtools/qldbtools/session-post-refine-info.py +++ b/client/qldbtools/qldbtools/session-post-refine-info.py @@ -4,7 +4,7 @@ import pandas as pd # #* Collect the information # -df1 = pd.read_csv("db-info-2.csv") +df1 = pd.read_csv("scratch/db-info-2.csv") # Add single uniqueness field -- CID (Cumulative ID) -- using # - creationTime diff --git a/client/qldbtools/requirements.txt b/client/qldbtools/requirements.txt index dd5d27a..1a98ef8 100644 --- a/client/qldbtools/requirements.txt +++ b/client/qldbtools/requirements.txt @@ -135,7 +135,7 @@ python-json-logger==2.0.7 pytz==2024.1 PyYAML==6.0.1 pyzmq==26.0.3 --e git+ssh://git@github.com/advanced-security/mrvacommander.git@26dd69c9767c315a8ffb782eedf3b55eac574d45#egg=qldbtools&subdirectory=client/qldbtools +-e git+ssh://git@github.com/advanced-security/mrvacommander.git@b7b4839fe0760287b80b8e2887c29d736c5cae33#egg=qldbtools&subdirectory=client/qldbtools qtstylish==0.1.5 referencing==0.35.1 requests==2.32.3 @@ -155,6 +155,7 @@ stack-data==0.6.3 statsmodels==0.14.2 strsimpy==0.2.1 tables==3.9.2 +tabulate==0.9.0 tenacity==8.5.0 terminado==0.18.1 threadpoolctl==3.5.0