diff --git a/client/qldbtools/.vscode/launch.json b/client/qldbtools/.vscode/launch.json new file mode 100644 index 0000000..bfb877f --- /dev/null +++ b/client/qldbtools/.vscode/launch.json @@ -0,0 +1,24 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + + { + "name": "Python Debugger: Current File with Arguments", + "type": "debugpy", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal", + "args": [ + "--db_collection_dir", + "db-collection-py", + "--starting_path", + "$HOME/work-gh/mrva/mrva-open-source-download" + ], + "justMyCode": true, + "stopOnEntry": false + } + ] +} diff --git a/client/qldbtools/bin/hepc-init b/client/qldbtools/bin/hepc-init index f263665..7399358 100755 --- a/client/qldbtools/bin/hepc-init +++ b/client/qldbtools/bin/hepc-init @@ -1,144 +1,115 @@ -#!/bin/bash +#!/usr/bin/env python3 -#* Utility functions -log() { - local level="$1" - shift - local color_reset="\033[0m" - local color_info="\033[1;34m" - local color_warn="\033[1;33m" - local color_error="\033[1;31m" +import json +import hashlib +import yaml +import sys +from plumbum import cli, local +from plumbum.cmd import find, mkdir, ln, rm, mktemp, unzip, date, env - local color - case "$level" in - INFO) color="$color_info" ;; - WARN) color="$color_warn" ;; - ERROR) color="$color_error" ;; - *) color="$color_reset" ;; - esac - - echo -e "${color}[$(date +"%Y-%m-%d %H:%M:%S")] [$level] $*${color_reset}" >&2 -} -usage() { - echo "Usage: $0 --db_collection_dir --starting_path [-h]" - echo - echo "Options:" - echo " --db_collection_dir Specify the database collection directory." - echo " --starting_path Specify the starting path." - echo " -h Show this help message." 
- exit 1 -} - - -#* Initialize and parse arguments -set -euo pipefail # exit on error, unset var, pipefail -trap 'rm -fR /tmp/hepc.$$-*' EXIT - -starting_dir=$(pwd) -db_collection_dir="" -starting_path="" - -# Parse arguments -while [[ $# -gt 0 ]]; do - case "$1" in - --db_collection_dir) - shift - if [[ -z "$1" || "$1" == -* ]]; then - echo "Error: --db_collection_dir requires a directory as an argument." - usage - fi - db_collection_dir="$1" - ;; - --starting_path) - shift - if [[ -z "$1" || "$1" == -* ]]; then - echo "Error: --starting_path requires a path as an argument." - usage - fi - starting_path="$1" - ;; - -h) - usage - ;; - *) - echo "Error: Unknown option '$1'." - usage - ;; - esac - shift -done - -# Check if required arguments were provided -if [[ -z "$db_collection_dir" ]]; then - echo "Error: --db_collection_dir is required." - usage -fi - -if [[ -z "$starting_path" ]]; then - echo "Error: --starting_path is required." - usage -fi - -#* Find all DBs -log INFO "searching for db.zip files" -find ${starting_path} -type f -name "db.zip" -size +0c > /tmp/hepc.$$-paths - -#* Collect detailed information from the database files -# Don't assume they are unique. -log INFO "collecting information from db.zip files" -mkdir -p $db_collection_dir -cat /tmp/hepc.$$-paths | while read -r zip_path -do - log INFO "Extracting from ${zip_path}" - zip_dir=$(dirname ${zip_path}) - zip_file=$(basename ${zip_path}) - unzip -o -q ${zip_path} '*codeql-database.yml' -d /tmp/hepc.$$-zip - # The content may be LANGUAGE/codeql-database.yml - - #* For every database, create a metadata record. 
- mkdir -p /tmp/hepc.$$-zip - cd /tmp/hepc.$$-zip/* - - # Information from codeql-database.yml - primaryLanguage=$(yq '.primaryLanguage' codeql-database.yml) - sha=$(yq '.creationMetadata.sha' codeql-database.yml) - cliVersion=$(yq '.creationMetadata.cliVersion' codeql-database.yml) - creationTime=$(yq '.creationMetadata.creationTime' codeql-database.yml) - sourceLocationPrefix=$(yq '.sourceLocationPrefix' codeql-database.yml) - repo=${sourceLocationPrefix##*/} # keep only last component - # Get sourceLocationPrefix[-2] - owner="${sourceLocationPrefix%/*}" # strip last component - owner="${owner##*/}" # keep only last component - - # cid for repository / db - cid=$(echo "${cliVersion} ${creationTime} ${primaryLanguage} ${sha}" | b2sum |\ - awk '{print substr($1, 1, 6)}') - - # Prepare the metadata record for this DB. - new_db_fname="${owner}-${repo}-ctsj-${cid}.zip" - result_url="http://hepc/${db_collection_dir}/${new_db_fname}" - record=' - { - "git_branch": "HEAD", - "git_commit_id": "'${sha}'", - "git_repo": "'${repo}'", - "ingestion_datetime_utc": "'${creationTime}'", - "result_url": "'${result_url}'", - "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", - "tool_name": "codeql-'${primaryLanguage}'", - "tool_version": "'${cliVersion}'", - "projname": "'${owner}/${repo}'" +# Logging function: timestamped, colorized message on stderr; unknown levels fall back to the RESET color +def log(level, message): + colors = { + "INFO": "\033[1;34m", + "WARN": "\033[1;33m", + "ERROR": "\033[1;31m", + "RESET": "\033[0m", } -' - cd "$starting_dir" - rm -fR /tmp/hepc.$$-zip - echo "$record" >> $db_collection_dir/metadata.json + timestamp = date("+%Y-%m-%d %H:%M:%S").strip() + print(f"{colors.get(level, colors['RESET'])}[{timestamp}] [{level}] {message}{colors['RESET']}", file=sys.stderr) - #* Link original file path to collection directory for serving. 
Use name including - # the cid and field separator ctsj - cd ${db_collection_dir} - [ -L ${new_db_fname} ] || ln -s ${zip_path} ${new_db_fname} +# Generate a CID +def generate_cid(cli_version, creation_time, primary_language, sha): + hash_input = f"{cli_version} {creation_time} {primary_language} {sha}".encode() + return hashlib.sha256(hash_input).hexdigest()[:6] - # Interim cleanup - rm -fR "/tmp/hepc.$$-*" -done +# Expand environment variables in paths +def expand_path(path): + return local.env.expand(path) + +# Process a single db.zip file +def process_db_file(zip_path, db_collection_dir): + temp_dir = mktemp("-d").strip() + try: + unzip("-o", "-q", zip_path, "*codeql-database.yml", "-d", temp_dir) + + # Locate the YAML file regardless of its depth + yaml_files = list(local.path(temp_dir).walk( + filter=lambda p: p.name == "codeql-database.yml")) + if not yaml_files: + log("WARN", f"No codeql-database.yml found in {zip_path}") + return + + yaml_path = yaml_files[0] + with yaml_path.open("r") as f: + yaml_data = yaml.safe_load(f) + + primary_language = yaml_data["primaryLanguage"] + creation_metadata = yaml_data["creationMetadata"] + sha = creation_metadata["sha"] + cli_version = creation_metadata["cliVersion"] + creation_time = creation_metadata["creationTime"] + source_location_prefix = local.path(yaml_data["sourceLocationPrefix"]) + + repo = source_location_prefix.name + owner = source_location_prefix.parent.name + cid = generate_cid(cli_version, creation_time, primary_language, sha) + new_db_fname = f"{owner}-{repo}-ctsj-{cid}.zip" + result_url = f"http://hepc/{db_collection_dir}/{new_db_fname}" + + metadata = { + "git_branch": "HEAD", + "git_commit_id": sha, + "git_repo": repo, + "ingestion_datetime_utc": str(creation_time), + "result_url": result_url, + "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", + "tool_name": f"codeql-{primary_language}", + "tool_version": cli_version, + "projname": f"{owner}/{repo}", + } + + metadata_file = 
local.path(db_collection_dir) / "metadata.json" + with metadata_file.open("a") as f: + json.dump(metadata, f) + f.write("\n") + + link_path = local.path(db_collection_dir) / new_db_fname + if not link_path.exists(): + ln("-sf", local.path(zip_path), link_path) # absolute target; a relative one would dangle when resolved from db_collection_dir + + except Exception as e: + log("WARN", f"Error processing {zip_path}: {e}") + finally: + rm("-rf", temp_dir) + +# Main application class +class DBProcessor(cli.Application): + db_collection_dir = cli.SwitchAttr( + "--db_collection_dir", str, mandatory=True, help="Specify the database collection directory" + ) + starting_path = cli.SwitchAttr( + "--starting_path", str, mandatory=True, help="Specify the starting path" + ) + + def main(self): + db_collection_dir = expand_path(self.db_collection_dir) + starting_path = expand_path(self.starting_path) + + mkdir("-p", db_collection_dir) + log("INFO", f"Searching for db.zip files in {starting_path}") + + db_files = find(starting_path, "-type", "f", "-name", "db.zip", + "-size", "+0c").splitlines() + + if not db_files: + log("WARN", "No db.zip files found in the specified starting path.") + return + + for zip_path in db_files: + process_db_file(zip_path, db_collection_dir) + + log("INFO", "Processing completed.") + +if __name__ == "__main__": + DBProcessor.run() diff --git a/client/qldbtools/bin/hepc-init.sh b/client/qldbtools/bin/hepc-init.sh new file mode 100755 index 0000000..f263665 --- /dev/null +++ b/client/qldbtools/bin/hepc-init.sh @@ -0,0 +1,144 @@ +#!/bin/bash + +#* Utility functions +log() { + local level="$1" + shift + local color_reset="\033[0m" + local color_info="\033[1;34m" + local color_warn="\033[1;33m" + local color_error="\033[1;31m" + + local color + case "$level" in + INFO) color="$color_info" ;; + WARN) color="$color_warn" ;; + ERROR) color="$color_error" ;; + *) color="$color_reset" ;; + esac + + echo -e "${color}[$(date +"%Y-%m-%d %H:%M:%S")] [$level] $*${color_reset}" >&2 +} +usage() { + echo "Usage: $0 --db_collection_dir --starting_path [-h]" + echo 
+ echo "Options:" + echo " --db_collection_dir Specify the database collection directory." + echo " --starting_path Specify the starting path." + echo " -h Show this help message." + exit 1 +} + + +#* Initialize and parse arguments +set -euo pipefail # exit on error, unset var, pipefail +trap 'rm -fR /tmp/hepc.$$-*' EXIT + +starting_dir=$(pwd) +db_collection_dir="" +starting_path="" + +# Parse arguments +while [[ $# -gt 0 ]]; do + case "$1" in + --db_collection_dir) + shift + if [[ -z "$1" || "$1" == -* ]]; then + echo "Error: --db_collection_dir requires a directory as an argument." + usage + fi + db_collection_dir="$1" + ;; + --starting_path) + shift + if [[ -z "$1" || "$1" == -* ]]; then + echo "Error: --starting_path requires a path as an argument." + usage + fi + starting_path="$1" + ;; + -h) + usage + ;; + *) + echo "Error: Unknown option '$1'." + usage + ;; + esac + shift +done + +# Check if required arguments were provided +if [[ -z "$db_collection_dir" ]]; then + echo "Error: --db_collection_dir is required." + usage +fi + +if [[ -z "$starting_path" ]]; then + echo "Error: --starting_path is required." + usage +fi + +#* Find all DBs +log INFO "searching for db.zip files" +find ${starting_path} -type f -name "db.zip" -size +0c > /tmp/hepc.$$-paths + +#* Collect detailed information from the database files +# Don't assume they are unique. +log INFO "collecting information from db.zip files" +mkdir -p $db_collection_dir +cat /tmp/hepc.$$-paths | while read -r zip_path +do + log INFO "Extracting from ${zip_path}" + zip_dir=$(dirname ${zip_path}) + zip_file=$(basename ${zip_path}) + unzip -o -q ${zip_path} '*codeql-database.yml' -d /tmp/hepc.$$-zip + # The content may be LANGUAGE/codeql-database.yml + + #* For every database, create a metadata record. 
+ mkdir -p /tmp/hepc.$$-zip + cd /tmp/hepc.$$-zip/* + + # Information from codeql-database.yml + primaryLanguage=$(yq '.primaryLanguage' codeql-database.yml) + sha=$(yq '.creationMetadata.sha' codeql-database.yml) + cliVersion=$(yq '.creationMetadata.cliVersion' codeql-database.yml) + creationTime=$(yq '.creationMetadata.creationTime' codeql-database.yml) + sourceLocationPrefix=$(yq '.sourceLocationPrefix' codeql-database.yml) + repo=${sourceLocationPrefix##*/} # keep only last component + # Get sourceLocationPrefix[-2] + owner="${sourceLocationPrefix%/*}" # strip last component + owner="${owner##*/}" # keep only last component + + # cid for repository / db + cid=$(echo "${cliVersion} ${creationTime} ${primaryLanguage} ${sha}" | b2sum |\ + awk '{print substr($1, 1, 6)}') + + # Prepare the metadata record for this DB. + new_db_fname="${owner}-${repo}-ctsj-${cid}.zip" + result_url="http://hepc/${db_collection_dir}/${new_db_fname}" + record=' + { + "git_branch": "HEAD", + "git_commit_id": "'${sha}'", + "git_repo": "'${repo}'", + "ingestion_datetime_utc": "'${creationTime}'", + "result_url": "'${result_url}'", + "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", + "tool_name": "codeql-'${primaryLanguage}'", + "tool_version": "'${cliVersion}'", + "projname": "'${owner}/${repo}'" + } +' + cd "$starting_dir" + rm -fR /tmp/hepc.$$-zip + echo "$record" >> $db_collection_dir/metadata.json + + #* Link original file path to collection directory for serving. Use name including + # the cid and field separator ctsj + cd ${db_collection_dir} + [ -L ${new_db_fname} ] || ln -s ${zip_path} ${new_db_fname} + + # Interim cleanup + rm -fR "/tmp/hepc.$$-*" +done diff --git a/client/qldbtools/qldbtools.code-workspace b/client/qldbtools/qldbtools.code-workspace new file mode 100644 index 0000000..87966b3 --- /dev/null +++ b/client/qldbtools/qldbtools.code-workspace @@ -0,0 +1,11 @@ +{ + "folders": [ + { + "path": "." 
+ } + ], + "settings": { + "git.ignoreLimitWarning": true, + "makefile.configureOnOpen": false + } +} \ No newline at end of file