From 18333bfdb11f3df34e4711477eaa22317a75187d Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Tue, 19 Nov 2024 15:23:20 -0800 Subject: [PATCH] Start hepc-init: the data collector for DBs on the file system --- client/qldbtools/README.md | 150 ----------------------------- client/qldbtools/README.org | 171 +++++++++++++++++++++++++++++++++ client/qldbtools/bin/hepc-init | 144 +++++++++++++++++++++++++++ 3 files changed, 315 insertions(+), 150 deletions(-) delete mode 100644 client/qldbtools/README.md create mode 100644 client/qldbtools/README.org create mode 100755 client/qldbtools/bin/hepc-init diff --git a/client/qldbtools/README.md b/client/qldbtools/README.md deleted file mode 100644 index 87babac..0000000 --- a/client/qldbtools/README.md +++ /dev/null @@ -1,150 +0,0 @@ -# Introduction to qldbtools - -`qldbtools` is a Python package for selecting sets of CodeQL databases to work on. -It uses a (pandas) dataframe in the implementation, but all results sets are -available as CSV files to provide flexibility in the tools you want to work with. - -The rationale is simple: When working with larger collections of CodeQL databases, -spread over time, languages, etc., many criteria can be used to select the subset -of interest. This package addresses that aspect of MRVA (multi repository -variant analysis). - -For example, consider this scenario from an enterprise. We have 10,000 -repositories in C/C++, 5,000 in Python. We build CodeQL dabases weekly and keep -the last 2 years worth. -This means for the last 2 years there are - - (10000 + 5000) * 52 * 2 = 1560000 - -databases to select from for a single MRVA run. 1.5 Million rows are readily -handled by a pandas (or R) dataframe. 
- -The full list of criteria currently encoded via the columns is - -- owner -- name -- CID -- cliVersion -- creationTime -- language -- sha -- git commit sha of the code the CodeQL database is built against -- baselineLinesOfCode -- path -- db_lang -- db_lang_displayName -- db_lang_file_count -- db_lang_linesOfCode -- ctime -- primaryLanguage -- finalised -- left_index -- size - -The minimal criteria needed to distinguish databases in the above scenario are - -- cliVersion -- creationTime -- language -- sha - -These are encoded in the single custom id column 'CID'. - -Thus, a database can be fully specified using a (owner,name,CID) tuple and this is -encoded in the names used by the MRVA server and clients. The selection of -databases can of course be done using the whole table. - -For an example of the workflow, see [section 'command line use'](#command-line-use). - - - -A small sample of a full table: - -| | owner | name | CID | cliVersion | creationTime | language | sha | baselineLinesOfCode | path | db_lang | db_lang_displayName | db_lang_file_count | db_lang_linesOfCode | ctime | primaryLanguage | finalised | left_index | size | -|---:|:---------|:---------------|:-------|:-------------|:---------------------------------|:-----------|:-----------------------------------------|----------------------:|:------------------------------------------------------------------------------------------------------------------------------|:------------|:----------------------|---------------------:|----------------------:|:---------------------------|:------------------|------------:|-------------:|---------:| -| 0 | 1adrianb | face-alignment | 1f8d99 | 2.16.1 | 2024-02-08 14:18:20.983830+00:00 | python | c94dd024b1f5410ef160ff82a8423141e2bbb6b4 | 1839 | /Users/hohn/work-gh/mrva/mrva-open-source-download/repos/1adrianb/face-alignment/code-scanning/codeql/databases/python/db.zip | python | Python | 25 | 1839 | 2024-07-24T14:09:02.187201 | python | 1 | 1454 | 24075001 | -| 1 
| 2shou | TextGrocery | 9ab87a | 2.12.1 | 2023-02-17T11:32:30.863093193Z | cpp | 8a4e41349a9b0175d9a73bc32a6b2eb6bfb51430 | 3939 | /Users/hohn/work-gh/mrva/mrva-open-source-download/repos/2shou/TextGrocery/code-scanning/codeql/databases/cpp/db.zip | no-language | no-language | 0 | -1 | 2024-07-24T06:25:55.347568 | cpp | nan | 1403 | 3612535 | -| 2 | 3b1b | manim | 76fdc7 | 2.17.5 | 2024-06-27 17:37:20.587627+00:00 | python | 88c7e9d2c96be1ea729b089c06cabb1bd3b2c187 | 19905 | /Users/hohn/work-gh/mrva/mrva-open-source-download/repos/3b1b/manim/code-scanning/codeql/databases/python/db.zip | python | Python | 94 | 19905 | 2024-07-24T13:23:04.716286 | python | 1 | 1647 | 26407541 | - -## Installation - -- Set up the virtual environment and install tools - - cd ~/work-gh/mrva/mrvacommander/client/qldbtools/ - python3.11 -m venv venv - source venv/bin/activate - pip install --upgrade pip - - # From requirements.txt - pip install -r requirements.txt - # Or explicitly - pip install jupyterlab pandas ipython - pip install lckr-jupyterlab-variableinspector - -- Local development - - ```bash - cd ~/work-gh/mrva/mrvacommander/client/qldbtools - source venv/bin/activate - pip install --editable . - ``` - - The `--editable` *should* use symlinks for all scripts; use `./bin/*` to be sure. - - -- Full installation - - ```bash - pip install qldbtools - ``` - - -## Use as library - The best way to examine the code is starting from the high-level scripts in - `bin/`. - -## Command line use - - Initial information collection requires a unique file path so it can be run - repeatedly over DB collections with the same (owner,name) but other differences - -- namely, in one or more of - - - creationTime - - sha - - cliVersion - - language - - Those fields are collected in `bin/mc-db-refine-info`. - - An example workflow with commands grouped by data files follows. 
- - cd ~/work-gh/mrva/mrvacommander/client/qldbtools && mkdir -p scratch - ./bin/mc-db-initial-info ~/work-gh/mrva/mrva-open-source-download > scratch/db-info-1.csv - ./bin/mc-db-refine-info < scratch/db-info-1.csv > scratch/db-info-2.csv - - ./bin/mc-db-view-info < scratch/db-info-2.csv & - ./bin/mc-db-unique cpp < scratch/db-info-2.csv > scratch/db-info-3.csv - ./bin/mc-db-view-info < scratch/db-info-3.csv & - - ./bin/mc-db-populate-minio -n 11 < scratch/db-info-3.csv - ./bin/mc-db-generate-selection -n 11 \ - scratch/vscode-selection.json \ - scratch/gh-mrva-selection.json \ - < scratch/db-info-3.csv - - - To see the full information for a selection, use `mc-rows-from-mrva-list`: - - ./bin/mc-rows-from-mrva-list scratch/gh-mrva-selection.json \ - scratch/db-info-3.csv > scratch/selection-full-info - - To check, e.g., the `language` column: - - csvcut -c language scratch/selection-full-info - -## Notes - - The `preview-data` plugin for VS Code has a bug; it displays `0` instead of - `0e3379` for the following. There are other entries with similar malfunction. 
- - CleverRaven,Cataclysm-DDA,0e3379,2.17.0,2024-05-08 12:13:10.038007+00:00,cpp,5ca7f4e59c2d7b0a93fb801a31138477f7b4a761,578098.0,/Users/hohn/work-gh/mrva/mrva-open-source-download/repos-2024-04-29/CleverRaven/Cataclysm-DDA/code-scanning/codeql/databases/cpp/db.zip,cpp,C/C++,1228.0,578098.0,2024-05-13T12:14:54.650648,cpp,True,4245,563435469 - CleverRaven,Cataclysm-DDA,3231f7,2.18.0,2024-07-18 11:13:01.673231+00:00,cpp,db3435138781937e9e0e999abbaa53f1d3afb5b7,579532.0,/Users/hohn/work-gh/mrva/mrva-open-source-download/repos/CleverRaven/Cataclysm-DDA/code-scanning/codeql/databases/cpp/db.zip,cpp,C/C++,1239.0,579532.0,2024-07-24T02:33:23.900885,cpp,True,1245,573213726 diff --git a/client/qldbtools/README.org b/client/qldbtools/README.org new file mode 100644 index 0000000..de1a241 --- /dev/null +++ b/client/qldbtools/README.org @@ -0,0 +1,171 @@ +* Introduction to hepc -- HTTP End Point for CodeQL + #+BEGIN_SRC sh + 1:$ ./bin/hepc-init --db_collection_dir db-collection --starting_path ~/work-gh/mrva/mrva-open-source-download + [2024-11-19 14:12:06] [INFO] searching for db.zip files + [2024-11-19 14:12:08] [INFO] collecting information from db.zip files + [2024-11-19 14:12:08] [INFO] Extracting from /Users/hohn/work-gh/mrva/mrva-open-source-download/repos/aircrack-ng/aircrack-ng/code-scanning/codeql/databases/cpp/db.zip + [2024-11-19 14:12:08] [INFO] Adding record to db-collection/metadata.json + #+END_SRC + +* Introduction to qldbtools +=qldbtools= is a Python package for selecting sets of CodeQL databases +to work on. It uses a (pandas) dataframe in the implementation, but all +results sets are available as CSV files to provide flexibility in the +tools you want to work with. + +The rationale is simple: When working with larger collections of CodeQL +databases, spread over time, languages, etc., many criteria can be used +to select the subset of interest. This package addresses that aspect of +MRVA (multi repository variant analysis). 
+ +For example, consider this scenario from an enterprise. We have 10,000 +repositories in C/C++, 5,000 in Python. We build CodeQL databases weekly +and keep the last 2 years' worth. This means for the last 2 years there +are + +#+begin_example +(10000 + 5000) * 52 * 2 = 1560000 +#+end_example + +databases to select from for a single MRVA run. 1.5 million rows are +readily handled by a pandas (or R) dataframe. + +The full list of criteria currently encoded via the columns is + +- owner +- name +- CID +- cliVersion +- creationTime +- language +- sha -- git commit sha of the code the CodeQL database is built against +- baselineLinesOfCode +- path +- db_lang +- db_lang_displayName +- db_lang_file_count +- db_lang_linesOfCode +- ctime +- primaryLanguage +- finalised +- left_index +- size + +The minimal criteria needed to distinguish databases in the above +scenario are + +- cliVersion +- creationTime +- language +- sha + +These are encoded in the single custom id column 'CID'. + +Thus, a database can be fully specified using an (owner,name,CID) tuple +and this is encoded in the names used by the MRVA server and clients. +The selection of databases can of course be done using the whole table. + +For an example of the workflow, see [[#command-line-use][section +'command line use']].
+ +A small sample of a full table: + +| | owner | name | CID | cliVersion | creationTime | language | sha | baselineLinesOfCode | path | db_lang | db_lang_displayName | db_lang_file_count | db_lang_linesOfCode | ctime | primaryLanguage | finalised | left_index | size | +|---+----------+----------------+--------+------------+----------------------------------+----------+------------------------------------------+---------------------+-------------------------------------------------------------------------------------------------------------------------------+-------------+---------------------+--------------------+---------------------+----------------------------+-----------------+-----------+------------+----------| +| 0 | 1adrianb | face-alignment | 1f8d99 | 2.16.1 | 2024-02-08 14:18:20.983830+00:00 | python | c94dd024b1f5410ef160ff82a8423141e2bbb6b4 | 1839 | /Users/hohn/work-gh/mrva/mrva-open-source-download/repos/1adrianb/face-alignment/code-scanning/codeql/databases/python/db.zip | python | Python | 25 | 1839 | 2024-07-24T14:09:02.187201 | python | 1 | 1454 | 24075001 | +| 1 | 2shou | TextGrocery | 9ab87a | 2.12.1 | 2023-02-17T11:32:30.863093193Z | cpp | 8a4e41349a9b0175d9a73bc32a6b2eb6bfb51430 | 3939 | /Users/hohn/work-gh/mrva/mrva-open-source-download/repos/2shou/TextGrocery/code-scanning/codeql/databases/cpp/db.zip | no-language | no-language | 0 | -1 | 2024-07-24T06:25:55.347568 | cpp | nan | 1403 | 3612535 | +| 2 | 3b1b | manim | 76fdc7 | 2.17.5 | 2024-06-27 17:37:20.587627+00:00 | python | 88c7e9d2c96be1ea729b089c06cabb1bd3b2c187 | 19905 | /Users/hohn/work-gh/mrva/mrva-open-source-download/repos/3b1b/manim/code-scanning/codeql/databases/python/db.zip | python | Python | 94 | 19905 | 2024-07-24T13:23:04.716286 | python | 1 | 1647 | 26407541 | + +** Installation +- Set up the virtual environment and install tools + + #+begin_example + cd ~/work-gh/mrva/mrvacommander/client/qldbtools/ + python3.11 -m venv venv + source venv/bin/activate + pip install 
--upgrade pip + + # From requirements.txt + pip install -r requirements.txt + # Or explicitly + pip install jupyterlab pandas ipython + pip install lckr-jupyterlab-variableinspector + #+end_example + +- Local development + + #+begin_example + ```bash + cd ~/work-gh/mrva/mrvacommander/client/qldbtools + source venv/bin/activate + pip install --editable . + ``` + + The `--editable` *should* use symlinks for all scripts; use `./bin/*` to be sure. + #+end_example + +- Full installation + + #+begin_example + ```bash + pip install qldbtools + ``` + #+end_example + +** Use as library +The best way to examine the code is starting from the high-level scripts +in =bin/=. + +** Command line use +Initial information collection requires a unique file path so it can be +run repeatedly over DB collections with the same (owner,name) but other +differences -- namely, in one or more of + +- creationTime +- sha +- cliVersion +- language + +Those fields are collected in =bin/mc-db-refine-info=. + +An example workflow with commands grouped by data files follows. 
+ +#+begin_example + cd ~/work-gh/mrva/mrvacommander/client/qldbtools && mkdir -p scratch + ./bin/mc-db-initial-info ~/work-gh/mrva/mrva-open-source-download > scratch/db-info-1.csv + ./bin/mc-db-refine-info < scratch/db-info-1.csv > scratch/db-info-2.csv + + ./bin/mc-db-view-info < scratch/db-info-2.csv & + ./bin/mc-db-unique cpp < scratch/db-info-2.csv > scratch/db-info-3.csv + ./bin/mc-db-view-info < scratch/db-info-3.csv & + + ./bin/mc-db-populate-minio -n 11 < scratch/db-info-3.csv + ./bin/mc-db-generate-selection -n 11 \ + scratch/vscode-selection.json \ + scratch/gh-mrva-selection.json \ + < scratch/db-info-3.csv +#+end_example + +To see the full information for a selection, use +=mc-rows-from-mrva-list=: + +#+begin_example + ./bin/mc-rows-from-mrva-list scratch/gh-mrva-selection.json \ + scratch/db-info-3.csv > scratch/selection-full-info +#+end_example + +To check, e.g., the =language= column: + +#+begin_example + csvcut -c language scratch/selection-full-info +#+end_example + +** Notes +The =preview-data= plugin for VS Code has a bug; it displays =0= instead +of =0e3379= for the following. There are other entries with similar +malfunction. 
+ +#+begin_example + CleverRaven,Cataclysm-DDA,0e3379,2.17.0,2024-05-08 12:13:10.038007+00:00,cpp,5ca7f4e59c2d7b0a93fb801a31138477f7b4a761,578098.0,/Users/hohn/work-gh/mrva/mrva-open-source-download/repos-2024-04-29/CleverRaven/Cataclysm-DDA/code-scanning/codeql/databases/cpp/db.zip,cpp,C/C++,1228.0,578098.0,2024-05-13T12:14:54.650648,cpp,True,4245,563435469 + CleverRaven,Cataclysm-DDA,3231f7,2.18.0,2024-07-18 11:13:01.673231+00:00,cpp,db3435138781937e9e0e999abbaa53f1d3afb5b7,579532.0,/Users/hohn/work-gh/mrva/mrva-open-source-download/repos/CleverRaven/Cataclysm-DDA/code-scanning/codeql/databases/cpp/db.zip,cpp,C/C++,1239.0,579532.0,2024-07-24T02:33:23.900885,cpp,True,1245,573213726 +#+end_example
diff --git a/client/qldbtools/bin/hepc-init b/client/qldbtools/bin/hepc-init
new file mode 100755
index 0000000..f263665
--- /dev/null
+++ b/client/qldbtools/bin/hepc-init
@@ -0,0 +1,144 @@
+#!/bin/bash
+#
+# hepc-init: collect the CodeQL databases (db.zip) found below a starting
+# path.  Each database gets a JSON record appended to
+# DB_COLLECTION_DIR/metadata.json and a symlink in DB_COLLECTION_DIR named
+# OWNER-REPO-ctsj-CID.zip.  Needs find, unzip, yq, b2sum, awk and mktemp.
+
+set -euo pipefail # exit on error, unset var, pipefail
+
+#* Utility functions
+# log LEVEL MESSAGE...: print a timestamped, colored message on stderr.
+log() {
+  local level="$1"
+  shift
+  local color_reset='\033[0m'
+  local color
+  case "$level" in
+    INFO) color='\033[1;34m' ;;
+    WARN) color='\033[1;33m' ;;
+    ERROR) color='\033[1;31m' ;;
+    *) color="$color_reset" ;;
+  esac
+  # %b expands the color escapes; %s prints the message text verbatim.
+  printf '%b[%s] [%s] %s%b\n' \
+    "$color" "$(date +"%Y-%m-%d %H:%M:%S")" "$level" "$*" "$color_reset" >&2
+}
+
+# usage [STATUS]: print help (stdout if STATUS is 0, else stderr); exit STATUS.
+usage() {
+  local status="${1:-1}" # callers that pass nothing report failure
+  local fd=$((status == 0 ? 1 : 2))
+  cat >&"$fd" <<EOF
+Usage: $0 --db_collection_dir DIR --starting_path PATH [-h]
+
+Options:
+  --db_collection_dir DIR   Specify the database collection directory.
+  --starting_path PATH      Specify the starting path.
+  -h                        Show this help message.
+EOF
+  exit "$status"
+}
+
+# die MESSAGE...: report a command line error, show the usage, exit 1.
+die() {
+  echo "Error: $*" >&2
+  usage 1
+}
+
+#* Initialize and parse arguments
+starting_dir=$(pwd)
+db_collection_dir=""
+starting_path=""
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --db_collection_dir)
+      # ${2:-}: a missing value must reach die instead of tripping 'set -u'.
+      if [[ -z "${2:-}" || "$2" == -* ]]; then
+        die "--db_collection_dir requires a directory as an argument."
+      fi
+      db_collection_dir="$2"
+      shift
+      ;;
+    --starting_path)
+      if [[ -z "${2:-}" || "$2" == -* ]]; then
+        die "--starting_path requires a path as an argument."
+      fi
+      starting_path="$2"
+      shift
+      ;;
+    -h) usage 0 ;;
+    *) die "Unknown option '$1'." ;;
+  esac
+  shift
+done
+
+# Check if required arguments were provided
+[[ -n "$db_collection_dir" ]] || die "--db_collection_dir is required."
+[[ -n "$starting_path" ]] || die "--starting_path is required."
+
+# Private scratch directory, removed on every exit path.
+work_dir=$(mktemp -d "${TMPDIR:-/tmp}/hepc.XXXXXX")
+trap 'rm -rf -- "$work_dir"' EXIT
+
+#* Find all DBs
+log INFO "searching for db.zip files"
+find "$starting_path" -type f -name "db.zip" -size +0c > "$work_dir/paths"
+
+#* Collect detailed information from the database files
+# Don't assume they are unique.
+log INFO "collecting information from db.zip files"
+mkdir -p -- "$db_collection_dir"
+# The list is read on fd 3 so nothing in the loop body can consume it.
+while IFS= read -r zip_path <&3; do
+  # Absolute path, so the symlink created below resolves from anywhere.
+  [[ "$zip_path" == /* ]] || zip_path="${starting_dir}/${zip_path}"
+  log INFO "Extracting from ${zip_path}"
+  rm -rf -- "$work_dir/zip"
+  mkdir -p -- "$work_dir/zip"
+  unzip -o -q "$zip_path" '*codeql-database.yml' -d "$work_dir/zip"
+  # The content may be LANGUAGE/codeql-database.yml
+  yml=$(find "$work_dir/zip" -type f -name codeql-database.yml -print -quit)
+  [[ -n "$yml" ]] || { log ERROR "no codeql-database.yml in ${zip_path}"; exit 1; }
+
+  #* For every database, create a metadata record from codeql-database.yml.
+  primaryLanguage=$(yq '.primaryLanguage' "$yml")
+  sha=$(yq '.creationMetadata.sha' "$yml")
+  cliVersion=$(yq '.creationMetadata.cliVersion' "$yml")
+  creationTime=$(yq '.creationMetadata.creationTime' "$yml")
+  sourceLocationPrefix=$(yq '.sourceLocationPrefix' "$yml")
+  repo=${sourceLocationPrefix##*/}   # keep only last component
+  owner=${sourceLocationPrefix%/*}   # sourceLocationPrefix[-2]: strip last ...
+  owner=${owner##*/}                 # ... then keep only last component
+
+  # cid for repository / db
+  cid=$(echo "${cliVersion} ${creationTime} ${primaryLanguage} ${sha}" \
+    | b2sum | awk '{print substr($1, 1, 6)}')
+
+  # Name including the cid and field separator ctsj
+  new_db_fname="${owner}-${repo}-ctsj-${cid}.zip"
+  link_path="${db_collection_dir}/${new_db_fname}"
+  if [[ -L "$link_path" ]]; then
+    log INFO "Skipping ${new_db_fname}: already in the collection"
+    continue
+  fi
+
+  log INFO "Adding record to ${db_collection_dir}/metadata.json"
+  cat >> "${db_collection_dir}/metadata.json" <<EOF
+{
+  "git_branch": "HEAD",
+  "git_commit_id": "${sha}",
+  "git_repo": "${repo}",
+  "ingestion_datetime_utc": "${creationTime}",
+  "result_url": "http://hepc/${db_collection_dir}/${new_db_fname}",
+  "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4",
+  "tool_name": "codeql-${primaryLanguage}",
+  "tool_version": "${cliVersion}",
+  "projname": "${owner}/${repo}"
+}
+EOF
+
+  #* Link original file path to collection directory for serving.
+  ln -s -- "$zip_path" "$link_path"
+done 3< "$work_dir/paths"