9 Commits

Author SHA1 Message Date
Michael Hohn
9e44c8dfe1 d 2024-06-18 14:05:07 -07:00
Michael Hohn
1a009ccde0 test lfs 2024-06-18 13:55:32 -07:00
Michael Hohn
5dfca00fa5 Add instructions to test server from the host 2024-06-18 13:29:41 -07:00
Michael Hohn
46052cd20f Fix simple SIGSEV 2024-06-18 13:29:14 -07:00
Michael Hohn
8f318c114f Add CommanderContainer and CommonState
Use statically distinct types for each mrvacommander configuration
2024-06-18 12:54:59 -07:00
Michael Hohn
1633245444 wip: make server compile post-merge 2024-06-18 10:07:47 -07:00
Nicolas Will
02acf3eeaf Remove storage, add state and store pkgs, refactor 2024-06-18 17:41:28 +02:00
Nicolas Will
30f2d22a71 Format comments in pkg/server/server.go 2024-06-17 15:01:23 +02:00
Nicolas Will
95e42ae85a Fix docker-compose.yml agent depends_on 2024-06-17 13:16:24 +02:00
153 changed files with 1679 additions and 12868 deletions

View File

@@ -1,9 +0,0 @@
# Excludes
/dbstore-data
/qpstore-data
/test-data
/venv
/client
/cmd/server/var
/.git

View File

@@ -1,12 +0,0 @@
MRVA_RABBITMQ_HOST=rabbitmq
MRVA_RABBITMQ_PORT=5672
MRVA_RABBITMQ_USER=user
MRVA_RABBITMQ_PASSWORD=password
MINIO_ROOT_USER=user
MINIO_ROOT_PASSWORD=mmusty8432
ARTIFACT_MINIO_ENDPOINT=artifactstore:9000
ARTIFACT_MINIO_ID=${MINIO_ROOT_USER}
ARTIFACT_MINIO_SECRET=${MINIO_ROOT_PASSWORD}
QLDB_MINIO_ENDPOINT=dbstore:9000
QLDB_MINIO_ID=${MINIO_ROOT_USER}
QLDB_MINIO_SECRET=${MINIO_ROOT_PASSWORD}

1
.gitattributes vendored
View File

@@ -1,3 +1,2 @@
*.zip filter=lfs diff=lfs merge=lfs -text
*.blob filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text

26
.gitignore vendored
View File

@@ -4,9 +4,6 @@ cmd/server/var/
# vscode project dir
.vscode/
# idea project dir
.idea/
# Compiled binary
cmd/server/server
cmd/agent/agent
@@ -44,26 +41,3 @@ go.work.sum
# env file
.env
/artifactstore-data/.minio.sys
/qldbminio/qldb
.ipynb_checkpoints/
venv/
venv-*/
*.egg-info
__pycache__
README.html
ChangeLog
notes/*.html
# Make timestamp files
mk.*
demo/containers/dbsdata/data/
demo/containers/dbsdata/tmp.dbsdata_backup.tar
client/qldbtools/db-collection-py-1/
mrva-overview.aux
mrva-overview.log
mrva-overview.synctex.gz
mrva-overview.toc
auto/

View File

@@ -1,29 +0,0 @@
linters:
enable:
- staticcheck
- unused
- decorder
- errchkjson
- exhaustruct
- gochecknoinits
- gochecksumtype
- goconst
- gocritic
- godox
- lll
- loggercheck
- revive
- sloglint
- tagalign
- unparam
linters-settings:
revive:
config: .revive.toml
staticcheck:
checks:
- "SA"
issues:
format: "format: {{.FromLinter}}: {{.Text}}"

View File

@@ -1,13 +0,0 @@
ignoreGeneratedHeader = true
[rule.blank-imports]
Arguments = [true]
[[rule]]
name = "max-public-identifier-length"
arguments = [15] # Maximum length for public identifiers
[[rule]]
name = "max-private-identifier-length"
arguments = [15] # Maximum length for private identifiers

View File

@@ -1,55 +0,0 @@
# Image name:tag for the qldbtools client container; used by every dbt-* target.
DBT_TARGET := client-qldbtools-container:0.1.24

# Default goal: cross-compile both Go binaries.
all: server agent

# These targets name commands, not files. The special target is case-sensitive:
# a lower-case `.phony` is just an ordinary target and protects nothing.
.PHONY: all view html dbt dbt-run dbt-check dbt-push \
        server agent fullbuild sendsubmit lint deps depa

# Render the README to HTML and open it.
view: README.html
	open $<

html: README.html

# Convert any markdown file to a standalone HTML page with a table of contents.
%.html: %.md
	pandoc --toc=true --standalone $< --out $@

# Build the qldbtools container image
dbt: mk.client-qldbtools-container

# Stamp file: its existence records that the image has been built.
mk.client-qldbtools-container:
	cd client/containers/qldbtools && \
	  docker build -t $(DBT_TARGET) .
	touch $@

# Run a shell in the container with the qldbtools
dbt-run: mk.client-qldbtools-container
	docker run --rm -it $(DBT_TARGET) /bin/bash

# Run one of the scripts in the container as check
dbt-check: mk.client-qldbtools-container
	docker run --rm -it $(DBT_TARGET) mc-db-initial-info

dbt-push: mk.dbt-push

# Stamp file: tag the local image for ghcr.io and push it once.
mk.dbt-push: mk.client-qldbtools-container
	docker tag $(DBT_TARGET) ghcr.io/hohn/$(DBT_TARGET)
	docker push ghcr.io/hohn/$(DBT_TARGET)
	touch $@

# Cross-compile the server for linux/arm64.
server:
	cd cmd/server && GOOS=linux GOARCH=arm64 go build

# Cross-compile the agent for linux/arm64.
agent:
	cd cmd/agent && GOOS=linux GOARCH=arm64 go build

# Same as `server`, but `-a` forces all packages to be rebuilt.
fullbuild:
	cd cmd/server && GOOS=linux GOARCH=arm64 go build -a

# Submit a request using the curl script in tools/.
sendsubmit:
	cd tools && sh ./submit-request.curl

# Requires
#       go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest
lint:
	golangci-lint run cmd/... pkg/...

# Render the server's package dependency graph to a PDF and open it.
deps:
	godepgraph -maxlevel 4 -nostdlib -i github.com/minio/minio-go ./cmd/server | dot -Tpdf > deps-server.pdf && open deps-server.pdf

# Render the agent's package dependency graph to a PDF and open it.
depa:
	godepgraph -maxlevel 4 -nostdlib -i github.com/minio/minio-go ./cmd/agent | dot -Tpdf > deps-agent.pdf && open deps-agent.pdf

View File

@@ -6,52 +6,6 @@ TODO Style notes
- NO package init() functions
- Dynamic behaviour must be explicit
## Client CodeQL Database Selector
Separate from the server's downloading of databases, a client-side interface is needed to generate the `databases.json` file. This
1. must be usable from the shell
2. must be interactive (Python, Jupyter)
3. is session based to allow iterations on selection / narrowing
4. must be queryable. There is no need to reinvent sql / dataframes
Python with dataframes is ideal for this; the project is in `client/`.
## Reverse proxy
For testing, replay flows using mitmweb. This is faster and simpler than using
gh-mrva or the VS Code plugin.
- Set up the virtual environment and install tools
python3.11 -m venv venv
source venv/bin/activate
pip install mitmproxy
For intercepting requests:
1. Start mitmproxy to listen on port 8080 and forward requests to port 8081, with
web interface
mitmweb --mode reverse:http://localhost:8081 -p 8080
1. Change `server` ports in `docker-compose.yml` to
ports:
- "8081:8080" # host:container
1. Start the containers.
1. Submit requests.
3. Save the flows for later replay.
One such session is in `tools/mitmweb-flows`; it can be loaded to replay the
requests:
1. start `mitmweb --mode reverse:http://localhost:8081 -p 8080`
2. `file` > `open` > `tools/mitmweb-flows`
3. replay at least the submit, status, and download requests
## Cross-compile server on host, run it in container
These are simple steps using a single container.
@@ -77,10 +31,7 @@ These are simple steps using a single container.
cd /mrva/mrvacommander/cmd/server/ && ./server
## Using docker-compose
### Steps to build and run the server
Steps to build and run the server in a multi-container environment set up by
docker-compose.
### Steps to build and run the server in a multi-container environment set up by docker-compose.
1. Build the server-image, as described above
@@ -107,23 +58,6 @@ docker-compose.
cd ~/work-gh/mrva/mrvacommander/tools
sh ./request_16-Jun-2024_11-33-16.curl
1. Follow server logging via
cd ~/work-gh/mrva/mrvacommander
docker-compose up -d
docker-compose logs -f server
1. Completely rebuild all containers. Useful when running into docker errors
cd ~/work-gh/mrva/mrvacommander
docker-compose up --build
1. Start the server containers and the desktop/demo containers
cd ~/work-gh/mrva/mrvacommander/
docker-compose down --remove-orphans
docker-compose -f docker-compose-demo.yml up -d
1. Test server via remote client by following the steps in [gh-mrva](https://github.com/hohn/gh-mrva/blob/connection-redirect/README.org#compacted-edit-run-debug-cycle)
### Some general docker-compose commands

File diff suppressed because one or more lines are too long

View File

@@ -1,64 +0,0 @@
# ######################
# Stage "builder": compile gh-mrva from source using an official Golang image
FROM golang:1.22 AS builder

# Set the working directory inside the container
WORKDIR /work-gh/mrva/gh-mrva

# Clone the repository and check out the hohn-0.1.24-demo ref
RUN git clone https://github.com/hohn/gh-mrva.git . && \
    git checkout hohn-0.1.24-demo

# Download dependencies
RUN go mod download

# Build the Go binary
RUN go build .

# ######################
# Stage "runner": provide codeql and java
#
FROM ubuntu:24.10 AS runner

ENV DEBIAN_FRONTEND=noninteractive

# Build argument for CodeQL version, defaulting to the latest release
ARG CODEQL_VERSION=latest

# Install packages. The apt package lists are removed in the same layer so
# they do not end up in the image.
RUN apt-get update && apt-get install --no-install-recommends --assume-yes \
        unzip \
        curl \
        ca-certificates \
        default-jdk \
    && rm -rf /var/lib/apt/lists/*

# If the version is 'latest', get the latest release version from GitHub, unzip
# the bundle into /opt, and delete the archive.
# `curl -f` makes the download fail on an HTTP error instead of saving the
# error page as codeql.zip.
RUN if [ "$CODEQL_VERSION" = "latest" ]; then \
        CODEQL_VERSION=$(curl -s https://api.github.com/repos/github/codeql-cli-binaries/releases/latest | grep '"tag_name"' | sed -E 's/.*"([^"]+)".*/\1/'); \
    fi && \
    echo "Using CodeQL version $CODEQL_VERSION" && \
    curl -fL "https://github.com/github/codeql-cli-binaries/releases/download/$CODEQL_VERSION/codeql-linux64.zip" -o /tmp/codeql.zip && \
    unzip /tmp/codeql.zip -d /opt && \
    rm /tmp/codeql.zip && \
    chmod -R +x /opt/codeql

# Set environment variables for CodeQL
ENV CODEQL_CLI_PATH=/opt/codeql/codeql

# Set environment variable for CodeQL for `codeql database analyze` support on ARM
# This env var has no functional effect on CodeQL when running on x86_64 linux
ENV CODEQL_JAVA_HOME=/usr

# ######################
# Set the working directory inside the final image
WORKDIR /app

# Copy the binary from the builder stage
COPY --from=builder /work-gh/mrva/gh-mrva/gh-mrva /usr/local/bin/gh-mrva

# Put CodeQL on the PATH
ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/codeql

# Run forever
CMD ["tail", "-f", "/dev/null"]

View File

@@ -1,13 +0,0 @@
# Image name:tag for the gh-mrva client container; used by every ghm-* target.
GHM_TARGET := client-ghmrva-container:0.1.24

# These targets name commands, not files.
.PHONY: ghm ghm-push ghm-run

# Build the gh-mrva client container image
ghm: mk.client-ghmrva-container

# Stamp file: its existence records that the image has been built.
mk.client-ghmrva-container:
	docker build -t $(GHM_TARGET) .
	touch $@

ghm-push: mk.ghm-push

# Stamp file: tag the local image for ghcr.io and push it once.
mk.ghm-push: mk.client-ghmrva-container
	docker tag $(GHM_TARGET) ghcr.io/hohn/$(GHM_TARGET)
	docker push ghcr.io/hohn/$(GHM_TARGET)
	touch $@

# Run a shell in the registry (ghcr.io) image, not the locally built one.
ghm-run:
	docker run --rm -it ghcr.io/hohn/$(GHM_TARGET) /bin/bash

View File

@@ -1,16 +0,0 @@
* MRVA cli tools container
Set up / run:
#+BEGIN_SRC sh
# Build
cd ~/work-gh/mrva/mrvacommander/client/containers/ghmrva/
make ghm
# Run
docker run -ti client-ghmrva-container:0.1.24 /bin/bash
# In the container
gh-mrva -h
codeql -h
# Push
make ghm-push
#+END_SRC

View File

@@ -1,30 +0,0 @@
# Use a Python 3.11 image as the base
FROM python:3.11-slim

# Install git. The apt package lists are removed in the same layer so they do
# not end up in the image.
RUN apt-get update && apt-get install -y git \
    && rm -rf /var/lib/apt/lists/*

# Create the required directory structure
RUN mkdir -p /work-gh/mrva/

# Change to the directory and clone the repository
WORKDIR /work-gh/mrva/
RUN git clone https://github.com/hohn/mrvacommander.git && \
    cd mrvacommander && \
    git checkout hohn-0.1.24-demo

# Change to the client directory
WORKDIR /work-gh/mrva/mrvacommander/client/qldbtools/

# We're in a container, so use pip globally -- no virtual env
RUN pip install --upgrade pip

# Install the required Python packages from requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Install qldbtools
RUN pip install .

# Run forever
CMD ["tail", "-f", "/dev/null"]

View File

@@ -1,23 +0,0 @@
{"git_branch": "HEAD", "git_commit_id": "2b41915dac8966e95f9e63638d30769b0d69ad68", "git_repo": "aircrack-ng", "ingestion_datetime_utc": "2024-06-07 16:57:47.683012+00:00", "result_url": "http://hepc/db-collection-py/aircrack-ng-aircrack-ng-ctsj-41ebbe.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-cpp", "tool_version": "2.17.4", "projname": "aircrack-ng/aircrack-ng"}
{"git_branch": "HEAD", "git_commit_id": "8b399e9f51701b34f2f3c9375e637e6fffc642b7", "git_repo": "Serial-Studio", "ingestion_datetime_utc": "2023-10-01T15:18:43.503672671Z", "result_url": "http://hepc/db-collection-py/Serial-Studio-Serial-Studio-ctsj-2b2721.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-cpp", "tool_version": "2.12.0", "projname": "Serial-Studio/Serial-Studio"}
{"git_branch": "HEAD", "git_commit_id": "9a9308fd5477d2a44f4e491d5a712546d4a2b3e4", "git_repo": "bulk-builder", "ingestion_datetime_utc": "2024-07-22 13:30:21.681180+00:00", "result_url": "http://hepc/db-collection-py/bulk-builder-bulk-builder-ctsj-0189aa.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-cpp", "tool_version": "2.18.0", "projname": "bulk-builder/bulk-builder"}
{"git_branch": "HEAD", "git_commit_id": "34412555665923bc07d43ce970e9d81be3795de7", "git_repo": "UEFITool", "ingestion_datetime_utc": "2024-07-04 19:00:38.543297+00:00", "result_url": "http://hepc/db-collection-py/UEFITool-UEFITool-ctsj-ee2d3c.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-cpp", "tool_version": "2.17.6", "projname": "UEFITool/UEFITool"}
{"git_branch": "HEAD", "git_commit_id": "00aa56f5257060304d41f09651c6ab58ee6104d6", "git_repo": "bulk-builder", "ingestion_datetime_utc": "2024-07-18 14:12:52.904410+00:00", "result_url": "http://hepc/db-collection-py/bulk-builder-bulk-builder-ctsj-0c6575.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-python", "tool_version": "2.18.0", "projname": "bulk-builder/bulk-builder"}
{"git_branch": "HEAD", "git_commit_id": "e4bffa0a7450e1abd9f4df9565728ae18d86cfd2", "git_repo": "attrs", "ingestion_datetime_utc": "2024-07-18 22:34:57.795427+00:00", "result_url": "http://hepc/db-collection-py/attrs-attrs-ctsj-e2c939.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-python", "tool_version": "2.18.0", "projname": "attrs/attrs"}
{"git_branch": "HEAD", "git_commit_id": "9620901afce56f720e856aca600951c9b61a9460", "git_repo": "apprise", "ingestion_datetime_utc": "2024-07-22 22:26:48.720348+00:00", "result_url": "http://hepc/db-collection-py/apprise-apprise-ctsj-3f4a4e.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-python", "tool_version": "2.18.0", "projname": "apprise/apprise"}
{"git_branch": "HEAD", "git_commit_id": "c38e6c8cfba28980aea8f895c71b376e8a5155d5", "git_repo": "bulk-builder", "ingestion_datetime_utc": "2022-04-16T12:45:56.739003883Z", "result_url": "http://hepc/db-collection-py/bulk-builder-bulk-builder-ctsj-0d6cf6.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-cpp", "tool_version": "2.8.5", "projname": "bulk-builder/bulk-builder"}
{"git_branch": "HEAD", "git_commit_id": "18f6be580b12dc406ef356b2cd65f47c24fce63e", "git_repo": "bulk-builder", "ingestion_datetime_utc": "2024-07-19 05:46:23.392157+00:00", "result_url": "http://hepc/db-collection-py/bulk-builder-bulk-builder-ctsj-0d667f.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-python", "tool_version": "2.18.0", "projname": "bulk-builder/bulk-builder"}
{"git_branch": "HEAD", "git_commit_id": "a587921bac074b1bd1b0a0a5536587660a9b954e", "git_repo": "bulk-builder", "ingestion_datetime_utc": "2024-07-19 16:13:39.094478+00:00", "result_url": "http://hepc/db-collection-py/bulk-builder-bulk-builder-ctsj-0a6352.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-java", "tool_version": "2.18.0", "projname": "bulk-builder/bulk-builder"}
{"git_branch": "HEAD", "git_commit_id": "9b361c7ff497d57651856650667aece8230fab6d", "git_repo": "BentoML", "ingestion_datetime_utc": "2024-07-24 02:17:07.095690+00:00", "result_url": "http://hepc/db-collection-py/BentoML-BentoML-ctsj-d6963d.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-python", "tool_version": "2.18.0", "projname": "BentoML/BentoML"}
{"git_branch": "HEAD", "git_commit_id": "8b399e9f51701b34f2f3c9375e637e6fffc642b7", "git_repo": "Serial-Studio", "ingestion_datetime_utc": "2023-10-01T15:18:43.503672671Z", "result_url": "http://hepc/db-collection-py/Serial-Studio-Serial-Studio-ctsj-2b2721.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-cpp", "tool_version": "2.12.0", "projname": "Serial-Studio/Serial-Studio"}
{"git_branch": "HEAD", "git_commit_id": "53ad2da1a8e6e79e0986ddfa3a45e1db6fdd491c", "git_repo": "bulk-builder", "ingestion_datetime_utc": "2024-05-14 02:24:19.208812+00:00", "result_url": "http://hepc/db-collection-py/bulk-builder-bulk-builder-ctsj-01864e.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-python", "tool_version": "2.17.0", "projname": "bulk-builder/bulk-builder"}
{"git_branch": "HEAD", "git_commit_id": "db8f1a7930c6b5826357646746337dafc983f953", "git_repo": "bulk-builder", "ingestion_datetime_utc": "2023-11-22 01:18:25.079473+00:00", "result_url": "http://hepc/db-collection-py/bulk-builder-bulk-builder-ctsj-099796.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-python", "tool_version": "2.15.2", "projname": "bulk-builder/bulk-builder"}
{"git_branch": "HEAD", "git_commit_id": "f8df9dd749a549dec20aa286a7639ba04190faab", "git_repo": "bulk-builder", "ingestion_datetime_utc": "2024-05-12 16:39:28.854142+00:00", "result_url": "http://hepc/db-collection-py/bulk-builder-bulk-builder-ctsj-0d7b69.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-cpp", "tool_version": "2.17.0", "projname": "bulk-builder/bulk-builder"}
{"git_branch": "HEAD", "git_commit_id": "b5274976cb0a792d05d541a749c0adcd9d20062d", "git_repo": "behave", "ingestion_datetime_utc": "2024-05-11 19:20:51.916333+00:00", "result_url": "http://hepc/db-collection-py/behave-behave-ctsj-b297b5.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-python", "tool_version": "2.17.2", "projname": "behave/behave"}
{"git_branch": "HEAD", "git_commit_id": "4c825c198df470506b0f84da0b25b3b385150dcb", "git_repo": "bulk-builder", "ingestion_datetime_utc": "2024-04-25 03:26:03.986270+00:00", "result_url": "http://hepc/db-collection-py/bulk-builder-bulk-builder-ctsj-035849.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-cpp", "tool_version": "2.17.0", "projname": "bulk-builder/bulk-builder"}
{"git_branch": "HEAD", "git_commit_id": "a8b8ff0acc6fcc629d08a3a9952f83be56a9a3c3", "git_repo": "bulk-builder", "ingestion_datetime_utc": "2024-05-03 13:30:48.829134+00:00", "result_url": "http://hepc/db-collection-py/bulk-builder-bulk-builder-ctsj-051a5c.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-java", "tool_version": "2.17.0", "projname": "bulk-builder/bulk-builder"}
{"git_branch": "HEAD", "git_commit_id": "9ef05731e7c6cbad2e897faa7c526558eed3ceaa", "git_repo": "aws-sam-cli", "ingestion_datetime_utc": "2024-05-14 01:03:18.130142+00:00", "result_url": "http://hepc/db-collection-py/aws-sam-cli-aws-sam-cli-ctsj-b7f561.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-python", "tool_version": "2.17.2", "projname": "aws-sam-cli/aws-sam-cli"}
{"git_branch": "HEAD", "git_commit_id": "16865390a653ceaeabe354df1b37e4a775161a70", "git_repo": "aws-sdk-pandas", "ingestion_datetime_utc": "2024-05-13 15:13:31.853042+00:00", "result_url": "http://hepc/db-collection-py/aws-sdk-pandas-aws-sdk-pandas-ctsj-2b7750.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-python", "tool_version": "2.17.2", "projname": "aws-sdk-pandas/aws-sdk-pandas"}
{"git_branch": "HEAD", "git_commit_id": "093856995af0811d3ebbe8c179b8febf4ae706f0", "git_repo": "bulk-builder", "ingestion_datetime_utc": "2024-03-20 14:18:02.500590+00:00", "result_url": "http://hepc/db-collection-py/bulk-builder-bulk-builder-ctsj-103a8a.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-python", "tool_version": "2.16.4", "projname": "bulk-builder/bulk-builder"}
{"git_branch": "HEAD", "git_commit_id": "0573e6f96637f08fb4cb85e0552f0622d36827d4", "git_repo": "bulk-builder", "ingestion_datetime_utc": "2024-01-24 09:21:05.977294+00:00", "result_url": "http://hepc/db-collection-py/bulk-builder-bulk-builder-ctsj-0cdf2f.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-python", "tool_version": "2.15.5", "projname": "bulk-builder/bulk-builder"}
{"git_branch": "HEAD", "git_commit_id": "93314995a5ee2217d58c3d9cbcbdef5df6c34566", "git_repo": "bulk-builder", "ingestion_datetime_utc": "2024-05-09 05:29:25.243273+00:00", "result_url": "http://hepc/db-collection-py/bulk-builder-bulk-builder-ctsj-0a35a1.zip", "tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4", "tool_name": "codeql-cpp", "tool_version": "2.17.0", "projname": "bulk-builder/bulk-builder"}

View File

@@ -1,30 +0,0 @@
# Use a Python 3.11 image as the base
FROM python:3.11-slim

# Install git. The apt package lists are removed in the same layer so they do
# not end up in the image.
RUN apt-get update && apt-get install -y git \
    && rm -rf /var/lib/apt/lists/*

# Create the required directory structure
RUN mkdir -p /work-gh/mrva/

# Change to the directory and clone the repository
WORKDIR /work-gh/mrva/
RUN git clone https://github.com/hohn/mrvacommander.git && \
    cd mrvacommander && \
    git checkout hohn-0.1.24-demo

# Change to the client directory
WORKDIR /work-gh/mrva/mrvacommander/client/qldbtools/

# We're in a container, so use pip globally -- no virtual env
RUN pip install --upgrade pip

# Install the required Python packages from requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Install qldbtools
RUN pip install .

# Run forever
CMD ["tail", "-f", "/dev/null"]

View File

@@ -1,25 +0,0 @@
# Image name:tag for the qldbtools client container; used by every dbt-* target.
DBT_TARGET := client-qldbtools-container:0.1.24

# These targets name commands, not files.
.PHONY: dbt dbt-run dbt-check dbt-push dbt-test

# Build the qldbtools container image
dbt: mk.client-qldbtools-container

# Stamp file: its existence records that the image has been built.
mk.client-qldbtools-container:
	docker build -t ${DBT_TARGET} .
	touch $@

# Run a shell in the container with the qldbtools
dbt-run: dbt
	docker run --rm -it ${DBT_TARGET} /bin/bash

# Run one of the scripts in the container as check.  Should exit with error.
dbt-check: dbt
	docker run --rm -it ${DBT_TARGET} mc-db-initial-info

dbt-push: mk.dbt-push

# Depend on the build stamp rather than the `dbt` alias: `dbt` is never a file,
# so using it as a prerequisite would force a re-tag and re-push on every run
# and make this stamp pointless.
mk.dbt-push: mk.client-qldbtools-container
	docker tag ${DBT_TARGET} ghcr.io/hohn/${DBT_TARGET}
	docker push ghcr.io/hohn/${DBT_TARGET}
	touch $@

# Pull the pushed image from the registry and start a shell in it.
dbt-test:
	docker pull ghcr.io/hohn/${DBT_TARGET}
	docker run --rm -it --name test-dbt-server ghcr.io/hohn/${DBT_TARGET} sh

View File

@@ -1,13 +0,0 @@
* MRVA python tools container
Set up Docker image with python 3.11 and pip and the qldbtools. The targets are
in the =Makefile=; most important are
#+BEGIN_SRC sh
# Build
make dbt
# Check
make dbt-check
#+END_SRC

View File

@@ -1,67 +0,0 @@
FROM codercom/code-server:4.92.2-debian

# ======================
# Pre-install a custom JDK for this platform and redirect CodeQL to it
USER root

ENV DEBIAN_FRONTEND=noninteractive

# Install packages. The apt package lists are removed in the same layer so
# they do not end up in the image.
RUN apt-get update && apt-get install --no-install-recommends --assume-yes \
        ca-certificates \
        curl \
        default-jdk \
        git \
        libcurl4-openssl-dev \
        libssl-dev \
        python3 \
        python3-dev \
        unzip \
    && rm -rf /var/lib/apt/lists/*

# Build argument for CodeQL version, defaulting to the latest release
ARG CODEQL_VERSION=latest

# If the version is 'latest', get the latest release version from GitHub, unzip
# the bundle into /opt, and delete the archive
RUN if [ "$CODEQL_VERSION" = "latest" ]; then \
        CODEQL_VERSION=$(curl -s https://api.github.com/repos/github/codeql-cli-binaries/releases/latest | grep '"tag_name"' | sed -E 's/.*"([^"]+)".*/\1/'); \
    fi && \
    echo "Using CodeQL version $CODEQL_VERSION" && \
    curl -L "https://github.com/github/codeql-cli-binaries/releases/download/$CODEQL_VERSION/codeql-linux64.zip" -o /tmp/codeql.zip && \
    unzip /tmp/codeql.zip -d /opt && \
    rm /tmp/codeql.zip && \
    chmod -R +x /opt/codeql

# ======================
# Configure code-server (already provided by the base image) as user coder
USER coder

# Set environment variables
# NOTE(review): the password is baked into the image; presumably acceptable
# only because this is a demo container -- confirm before wider use.
ENV PASSWORD=mrva

# Install VS Code extensions while running as user coder
RUN code-server --install-extension ms-python.python \
    && code-server --install-extension esbenp.prettier-vscode \
    && code-server --install-extension GitHub.vscode-codeql

# Expose the port that Code Server runs on
EXPOSE 9080

# Point CodeQL to the java binary for this platform
ENV CODEQL_JAVA_HOME=/usr

# Add
#     codeQl.cli.executablePath
# to user settings.
# This is in addition to the environment variable CODEQL_JAVA_HOME which has no
# effect on the plugin
USER root
COPY ./settings.json /home/coder/.local/share/code-server/User/
RUN chown -R coder:coder /home/coder/.local/share/code-server/

# Start Code Server
ENTRYPOINT ["dumb-init", "code-server", "--bind-addr", "0.0.0.0:9080", "."]

# Run as the coder user
USER coder

View File

@@ -1,119 +0,0 @@
* MRVA VS Code server container
On the host:
#+BEGIN_SRC sh
# Build the container via
cd ~/work-gh/mrva/mrvacommander/client/containers/vscode/
docker build -t code-server-initialized:0.1.24 .
# Run the container in standalone mode via
cd ~/work-gh/mrva/mrvacommander/client/containers/vscode/
docker run -v ~/work-gh/mrva/vscode-codeql:/work-gh/mrva/vscode-codeql \
-d -p 9080:9080 code-server-initialized:0.1.24
#+END_SRC
- Connect to it at http://localhost:9080/?folder=/home/coder, password is =mrva=.
Inside the container:
- Setup inside the container
#+BEGIN_SRC shell
cd
export PATH=/opt/codeql:$PATH
codeql pack init qldemo
cd qldemo
codeql pack add codeql/python-all@1.0.6
#+END_SRC
- Create a new file =qldemo/simple.ql= with this query. Open it in VS Code.
The plugin will download the CodeQL binaries (but never use them -- the
configuration redirects)
#+BEGIN_SRC sh
cd
cat > qldemo/simple.ql <<eof
import python
select 42
eof
#+END_SRC
- Create database.
#+BEGIN_SRC sh
cd ~/qldemo
cat > short.py <<EOF
print('hello world')
EOF
export PATH=/opt/codeql:$PATH
codeql database create --language=python -s . -v short-db
#+END_SRC
- Set the database as default and run the query =simple.ql=
- Add the customized VS Code plugin
On the host
#+BEGIN_SRC sh
cd ~/work-gh/mrva/vscode-codeql
git checkout mrva-standalone
# Install nvm
curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.7/install.sh | bash
# Install correct node version
cd ./extensions/ql-vscode
nvm install
# Build the extension
cd ~/work-gh/mrva/vscode-codeql/extensions/ql-vscode
npm install
npm run build
#+END_SRC
In the container
#+BEGIN_SRC sh
# Install extension
cd /work-gh/mrva/vscode-codeql/dist
/bin/code-server --force --install-extension vscode-codeql-*.vsix
#+END_SRC
- Capture the state of this container and create a new image from it
#+BEGIN_SRC sh
docker ps
# Check id column. Use it below.
docker commit 2df5732c1850 code-server-initialized:0.1.24
# Keep the sha
# sha256:87c8260146e28aed25b094d023a30a015a958f829c09e66cb50ccca2c4a2a000
docker kill 2df5732c1850
# Make sure the image tag matches the sha
docker inspect code-server-initialized:0.1.24 |grep Id
# Run the image and check
docker run --rm -d -p 9080:9080 --name test-code-server-codeql \
code-server-initialized:0.1.24
#+END_SRC
Again connect to it at http://localhost:9080/?folder=/home/coder, password is =mrva=.
- Push this container
#+BEGIN_SRC sh
# Common
export CSI_TARGET=code-server-initialized:0.1.24
# Push container
docker tag ${CSI_TARGET} ghcr.io/hohn/${CSI_TARGET}
docker push ghcr.io/hohn/${CSI_TARGET}
#+END_SRC
- Test the registry image
#+BEGIN_SRC sh
# Test pushed container
docker pull ghcr.io/hohn/${CSI_TARGET}
docker run --rm -d -p 9080:9080 --name test-code-server-codeql\
ghcr.io/hohn/${CSI_TARGET}
#+END_SRC
In the container, inside the running vs code:
- Check the plugin version number via the command
: codeql: copy version information

View File

@@ -1,4 +0,0 @@
{
"codeQL.runningQueries.numberOfThreads": 2,
"codeQL.cli.executablePath": "/opt/codeql/codeql"
}

View File

@@ -1,24 +0,0 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File with Arguments",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"args": [
"--db_collection_dir",
"db-collection-py",
"--starting_path",
"$HOME/work-gh/mrva/mrva-open-source-download"
],
"justMyCode": true,
"stopOnEntry": false
}
]
}

View File

@@ -1,2 +0,0 @@
# `doc` names a command, not a file; without this a file called `doc` would
# make the target look up to date.
.PHONY: doc

# Render README.md to a standalone HTML page (foo.html) and open it.
doc:
	pandoc -s --css=./gfm.css README.md > foo.html && open foo.html

View File

@@ -1,171 +0,0 @@
* Introduction to hepc -- HTTP End Point for CodeQL
#+BEGIN_SRC sh
1:$ ./bin/hepc-init --db_collection_dir db-collection --starting_path ~/work-gh/mrva/mrva-open-source-download
[2024-11-19 14:12:06] [INFO] searching for db.zip files
[2024-11-19 14:12:08] [INFO] collecting information from db.zip files
[2024-11-19 14:12:08] [INFO] Extracting from /Users/hohn/work-gh/mrva/mrva-open-source-download/repos/aircrack-ng/aircrack-ng/code-scanning/codeql/databases/cpp/db.zip
[2024-11-19 14:12:08] [INFO] Adding record to db-collection/metadata.json
#+END_SRC
* Introduction to qldbtools
=qldbtools= is a Python package for selecting sets of CodeQL databases
to work on. It uses a (pandas) dataframe in the implementation, but all
result sets are available as CSV files to provide flexibility in the
tools you want to work with.
The rationale is simple: When working with larger collections of CodeQL
databases, spread over time, languages, etc., many criteria can be used
to select the subset of interest. This package addresses that aspect of
MRVA (multi repository variant analysis).
For example, consider this scenario from an enterprise. We have 10,000
repositories in C/C++, 5,000 in Python. We build CodeQL databases weekly
and keep the last 2 years' worth. This means for the last 2 years there
are
#+begin_example
(10000 + 5000) * 52 * 2 = 1560000
#+end_example
databases to select from for a single MRVA run. 1.5 Million rows are
readily handled by a pandas (or R) dataframe.
The full list of criteria currently encoded via the columns is
- owner
- name
- CID
- cliVersion
- creationTime
- language
- sha -- git commit sha of the code the CodeQL database is built against
- baselineLinesOfCode
- path
- db_lang
- db_lang_displayName
- db_lang_file_count
- db_lang_linesOfCode
- ctime
- primaryLanguage
- finalised
- left_index
- size
The minimal criteria needed to distinguish databases in the above
scenario are
- cliVersion
- creationTime
- language
- sha
These are encoded in the single custom id column 'CID'.
Thus, a database can be fully specified using a (owner,name,CID) tuple
and this is encoded in the names used by the MRVA server and clients.
The selection of databases can of course be done using the whole table.
For an example of the workflow, see [[#command-line-use][section
'command line use']].
A small sample of a full table:
| | owner | name | CID | cliVersion | creationTime | language | sha | baselineLinesOfCode | path | db_lang | db_lang_displayName | db_lang_file_count | db_lang_linesOfCode | ctime | primaryLanguage | finalised | left_index | size |
|---+----------+----------------+--------+------------+----------------------------------+----------+------------------------------------------+---------------------+-------------------------------------------------------------------------------------------------------------------------------+-------------+---------------------+--------------------+---------------------+----------------------------+-----------------+-----------+------------+----------|
| 0 | 1adrianb | face-alignment | 1f8d99 | 2.16.1 | 2024-02-08 14:18:20.983830+00:00 | python | c94dd024b1f5410ef160ff82a8423141e2bbb6b4 | 1839 | /Users/hohn/work-gh/mrva/mrva-open-source-download/repos/1adrianb/face-alignment/code-scanning/codeql/databases/python/db.zip | python | Python | 25 | 1839 | 2024-07-24T14:09:02.187201 | python | 1 | 1454 | 24075001 |
| 1 | 2shou | TextGrocery | 9ab87a | 2.12.1 | 2023-02-17T11:32:30.863093193Z | cpp | 8a4e41349a9b0175d9a73bc32a6b2eb6bfb51430 | 3939 | /Users/hohn/work-gh/mrva/mrva-open-source-download/repos/2shou/TextGrocery/code-scanning/codeql/databases/cpp/db.zip | no-language | no-language | 0 | -1 | 2024-07-24T06:25:55.347568 | cpp | nan | 1403 | 3612535 |
| 2 | 3b1b | manim | 76fdc7 | 2.17.5 | 2024-06-27 17:37:20.587627+00:00 | python | 88c7e9d2c96be1ea729b089c06cabb1bd3b2c187 | 19905 | /Users/hohn/work-gh/mrva/mrva-open-source-download/repos/3b1b/manim/code-scanning/codeql/databases/python/db.zip | python | Python | 94 | 19905 | 2024-07-24T13:23:04.716286 | python | 1 | 1647 | 26407541 |
** Installation
- Set up the virtual environment and install tools
#+begin_example
cd ~/work-gh/mrva/mrvacommander/client/qldbtools/
python3.11 -m venv venv
source venv/bin/activate
pip install --upgrade pip
# From requirements.txt
pip install -r requirements.txt
# Or explicitly
pip install jupyterlab pandas ipython
pip install lckr-jupyterlab-variableinspector
#+end_example
- Local development
#+begin_example
cd ~/work-gh/mrva/mrvacommander/client/qldbtools
source venv/bin/activate
pip install --editable .
#+end_example
The =--editable= /should/ use symlinks for all scripts; use =./bin/*= to be sure.
- Full installation
#+begin_example
pip install qldbtools
#+end_example
** Use as library
The best way to examine the code is starting from the high-level scripts
in =bin/=.
** Command line use
Initial information collection requires a unique file path so it can be
run repeatedly over DB collections with the same (owner,name) but other
differences -- namely, in one or more of
- creationTime
- sha
- cliVersion
- language
Those fields are collected in =bin/mc-db-refine-info=.
An example workflow with commands grouped by data files follows.
#+begin_example
cd ~/work-gh/mrva/mrvacommander/client/qldbtools && mkdir -p scratch
./bin/mc-db-initial-info ~/work-gh/mrva/mrva-open-source-download > scratch/db-info-1.csv
./bin/mc-db-refine-info < scratch/db-info-1.csv > scratch/db-info-2.csv
./bin/mc-db-view-info < scratch/db-info-2.csv &
./bin/mc-db-unique cpp < scratch/db-info-2.csv > scratch/db-info-3.csv
./bin/mc-db-view-info < scratch/db-info-3.csv &
./bin/mc-db-populate-minio -n 11 < scratch/db-info-3.csv
./bin/mc-db-generate-selection -n 11 \
scratch/vscode-selection.json \
scratch/gh-mrva-selection.json \
< scratch/db-info-3.csv
#+end_example
To see the full information for a selection, use
=mc-rows-from-mrva-list=:
#+begin_example
./bin/mc-rows-from-mrva-list scratch/gh-mrva-selection.json \
scratch/db-info-3.csv > scratch/selection-full-info
#+end_example
To check, e.g., the =language= column:
#+begin_example
csvcut -c language scratch/selection-full-info
#+end_example
** Notes
The =preview-data= plugin for VS Code has a bug; it displays =0= instead
of =0e3379= for the following. There are other entries with similar
malfunction.
#+begin_example
CleverRaven,Cataclysm-DDA,0e3379,2.17.0,2024-05-08 12:13:10.038007+00:00,cpp,5ca7f4e59c2d7b0a93fb801a31138477f7b4a761,578098.0,/Users/hohn/work-gh/mrva/mrva-open-source-download/repos-2024-04-29/CleverRaven/Cataclysm-DDA/code-scanning/codeql/databases/cpp/db.zip,cpp,C/C++,1228.0,578098.0,2024-05-13T12:14:54.650648,cpp,True,4245,563435469
CleverRaven,Cataclysm-DDA,3231f7,2.18.0,2024-07-18 11:13:01.673231+00:00,cpp,db3435138781937e9e0e999abbaa53f1d3afb5b7,579532.0,/Users/hohn/work-gh/mrva/mrva-open-source-download/repos/CleverRaven/Cataclysm-DDA/code-scanning/codeql/databases/cpp/db.zip,cpp,C/C++,1239.0,579532.0,2024-07-24T02:33:23.900885,cpp,True,1245,573213726
#+end_example

View File

@@ -1,144 +0,0 @@
#!/bin/bash
#* Utility functions
# log LEVEL MESSAGE...
#   Write a timestamped, colorized message to stderr.
#   LEVEL selects the color (INFO, WARN, ERROR); any other value is printed
#   without color.
log() {
    local severity="$1"
    shift
    local plain="\033[0m"
    local tint
    if [[ "$severity" == "INFO" ]]; then
        tint="\033[1;34m"
    elif [[ "$severity" == "WARN" ]]; then
        tint="\033[1;33m"
    elif [[ "$severity" == "ERROR" ]]; then
        tint="\033[1;31m"
    else
        tint="$plain"
    fi
    echo -e "${tint}[$(date +"%Y-%m-%d %H:%M:%S")] [$severity] $*${plain}" >&2
}
# usage
#   Print the command-line synopsis to stdout and terminate with status 1.
usage() {
    cat <<EOF
Usage: $0 --db_collection_dir <directory> --starting_path <path> [-h]

Options:
 --db_collection_dir <directory> Specify the database collection directory.
 --starting_path <path> Specify the starting path.
 -h Show this help message.
EOF
    exit 1
}
#* Initialize and parse arguments
set -euo pipefail # exit on error, unset var, pipefail
# All scratch files of this run are named /tmp/hepc.<pid>-*; remove them on exit.
trap 'rm -fR /tmp/hepc.$$-*' EXIT
# Remember where we started so relative paths can be resolved later.
starting_dir=$(pwd)
db_collection_dir=""
starting_path=""
# Parse arguments
while [[ $# -gt 0 ]]; do
    case "$1" in
        --db_collection_dir)
            shift
            # ${1:-} instead of $1: with `set -u`, a flag given as the last
            # word would otherwise abort with "unbound variable" before the
            # usage message can be shown.
            if [[ -z "${1:-}" || "$1" == -* ]]; then
                echo "Error: --db_collection_dir requires a directory as an argument."
                usage
            fi
            db_collection_dir="$1"
            ;;
        --starting_path)
            shift
            if [[ -z "${1:-}" || "$1" == -* ]]; then
                echo "Error: --starting_path requires a path as an argument."
                usage
            fi
            starting_path="$1"
            ;;
        -h)
            usage
            ;;
        *)
            echo "Error: Unknown option '$1'."
            usage
            ;;
    esac
    shift
done
# Check if required arguments were provided
if [[ -z "$db_collection_dir" ]]; then
    echo "Error: --db_collection_dir is required."
    usage
fi
if [[ -z "$starting_path" ]]; then
    echo "Error: --starting_path is required."
    usage
fi
#* Find all DBs
log INFO "searching for db.zip files"
find "${starting_path}" -type f -name "db.zip" -size +0c > /tmp/hepc.$$-paths
#* Collect detailed information from the database files
# Don't assume they are unique.
log INFO "collecting information from db.zip files"
mkdir -p "$db_collection_dir"
# Every iteration starts and ends in $starting_dir, so relative values of
# --starting_path and --db_collection_dir keep working for all entries.
while IFS= read -r zip_path
do
    log INFO "Extracting from ${zip_path}"
    # Create the scratch directory before extracting into it.
    mkdir -p /tmp/hepc.$$-zip
    unzip -o -q "${zip_path}" '*codeql-database.yml' -d /tmp/hepc.$$-zip
    # The content may be LANGUAGE/codeql-database.yml
    #* For every database, create a metadata record.
    cd /tmp/hepc.$$-zip/*
    # Information from codeql-database.yml
    primaryLanguage=$(yq '.primaryLanguage' codeql-database.yml)
    sha=$(yq '.creationMetadata.sha' codeql-database.yml)
    cliVersion=$(yq '.creationMetadata.cliVersion' codeql-database.yml)
    creationTime=$(yq '.creationMetadata.creationTime' codeql-database.yml)
    sourceLocationPrefix=$(yq '.sourceLocationPrefix' codeql-database.yml)
    repo=${sourceLocationPrefix##*/} # keep only last component
    # Get sourceLocationPrefix[-2]
    owner="${sourceLocationPrefix%/*}" # strip last component
    owner="${owner##*/}" # keep only last component
    # cid for repository / db: first 6 hex digits of the BLAKE2 digest
    cid=$(echo "${cliVersion} ${creationTime} ${primaryLanguage} ${sha}" | b2sum |\
        awk '{print substr($1, 1, 6)}')
    # Prepare the metadata record for this DB.
    new_db_fname="${owner}-${repo}-ctsj-${cid}.zip"
    result_url="http://hepc/${db_collection_dir}/${new_db_fname}"
    record='
{
"git_branch": "HEAD",
"git_commit_id": "'${sha}'",
"git_repo": "'${repo}'",
"ingestion_datetime_utc": "'${creationTime}'",
"result_url": "'${result_url}'",
"tool_id": "9f2f9642-febb-4435-9204-fb50bbd43de4",
"tool_name": "codeql-'${primaryLanguage}'",
"tool_version": "'${cliVersion}'",
"projname": "'${owner}/${repo}'"
}
'
    cd "$starting_dir"
    rm -fR /tmp/hepc.$$-zip
    echo "$record" >> "$db_collection_dir/metadata.json"
    #* Link original file path to collection directory for serving. Use name including
    # the cid and field separator ctsj
    # A symlink target is interpreted relative to the link's own directory, so
    # make the target absolute; otherwise a relative zip_path would dangle.
    case "$zip_path" in
        /*) link_target="$zip_path" ;;
        *)  link_target="${starting_dir}/${zip_path}" ;;
    esac
    link_name="${db_collection_dir}/${new_db_fname}"
    [ -L "$link_name" ] || ln -s "$link_target" "$link_name"
    # The remaining /tmp/hepc.$$-* scratch files are removed by the EXIT trap.
done < /tmp/hepc.$$-paths

View File

@@ -1,104 +0,0 @@
/*
dependencies
go get -u golang.org/x/exp/slog
on-the-fly
go run bin/hepc-serve.go --codeql-db-dir db-collection-py-1
compiled
cd ~/work-gh/mrva/mrvacommander/client/qldbtools/
go build -o ./bin/hepc-serve.bin ./bin/hepc-serve.go
test
curl http://127.0.0.1:8080/api/v1/latest_results/codeql-all -o foo
curl $(head -1 foo | jq -r ".result_url" |sed 's|hepc|127.0.0.1:8080/db|g;') -o foo.zip
*/
package main
import (
"flag"
"fmt"
"net/http"
"os"
"path/filepath"
"golang.org/x/exp/slog"
)
var dbDir string
func serveFile(w http.ResponseWriter, r *http.Request) {
fullPath := r.URL.Path[len("/db/"):]
resolvedPath, err := filepath.EvalSymlinks(fullPath)
if err != nil {
slog.Warn("failed to resolve symlink", slog.String("fullPath", fullPath),
slog.String("error", err.Error()))
http.Error(w, "File not found", http.StatusNotFound)
return
}
if fileInfo, err := os.Stat(resolvedPath); err != nil || fileInfo.IsDir() {
slog.Warn("file not found or is a directory", slog.String("resolvedPath", resolvedPath))
http.Error(w, "File not found", http.StatusNotFound)
return
}
slog.Info("serving file", slog.String("resolvedPath", resolvedPath))
http.ServeFile(w, r, resolvedPath)
}
// serveMetadata answers with the metadata.json file stored in dbDir.
// A missing file, or a directory of that name, yields a 404 response.
func serveMetadata(w http.ResponseWriter, r *http.Request) {
	metadataPath := filepath.Join(dbDir, "metadata.json")

	info, statErr := os.Stat(metadataPath)
	isRegularEntry := statErr == nil && !info.IsDir()
	if isRegularEntry {
		slog.Info("serving metadata.json", slog.String("metadataPath", metadataPath))
		http.ServeFile(w, r, metadataPath)
		return
	}

	slog.Warn("metadata.json not found", slog.String("metadataPath", metadataPath))
	http.Error(w, "metadata.json not found", http.StatusNotFound)
}
// logMiddleware wraps next so that the method and URL path of every incoming
// request are logged before the request is passed on.
func logMiddleware(next http.Handler) http.Handler {
	logThenForward := func(w http.ResponseWriter, r *http.Request) {
		slog.Info("incoming request", slog.String("method", r.Method), slog.String("url", r.URL.Path))
		next.ServeHTTP(w, r)
	}
	return http.HandlerFunc(logThenForward)
}
// main parses the command line, validates the database directory, registers
// the HTTP routes and serves until the listener fails. The process exits with
// status 1 on invalid arguments and on server failure.
func main() {
	var host string
	var port int
	flag.StringVar(&dbDir, "codeql-db-dir", "", "Directory containing CodeQL database files (required)")
	flag.StringVar(&host, "host", "127.0.0.1", "Host address for the HTTP server")
	flag.IntVar(&port, "port", 8080, "Port for the HTTP server")
	flag.Parse()
	if dbDir == "" {
		slog.Error("missing required flag", slog.String("flag", "--codeql-db-dir"))
		os.Exit(1)
	}
	// Reject anything that cannot be stat'ed (missing, permission denied, ...)
	// and anything that is not a directory.
	if info, err := os.Stat(dbDir); err != nil || !info.IsDir() {
		slog.Error("invalid directory", slog.String("dbDir", dbDir))
		os.Exit(1)
	}
	slog.Info("starting server", slog.String("host", host), slog.Int("port", port), slog.String("dbDir", dbDir))
	mux := http.NewServeMux()
	mux.HandleFunc("/db/", serveFile)
	mux.HandleFunc("/index", serveMetadata)
	mux.HandleFunc("/api/v1/latest_results/codeql-all", serveMetadata)
	loggedHandler := logMiddleware(mux)
	addr := fmt.Sprintf("%s:%d", host, port)
	slog.Info("server listening", slog.String("address", addr))
	if err := http.ListenAndServe(addr, loggedHandler); err != nil {
		slog.Error("server error", slog.String("error", err.Error()))
		// Report the failure (e.g. port already in use) to the caller.
		os.Exit(1)
	}
}

View File

@@ -1,108 +0,0 @@
#!/usr/bin/env python
""" Read a table of CodeQL DB information
and generate the selection files for
1. the VS Code CodeQL plugin
2. the gh-mrva command-line client
"""
import argparse
import logging
from argparse import Namespace
from typing import List
from pandas import DataFrame
import qldbtools.utils as utils
import numpy as np
#
#* Configure logger
#
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
# Overwrite log level set by minio
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
#
#* Process command line
#
parser = argparse.ArgumentParser(
    description=""" Read a table of CodeQL DB information
and generate the selection files for
1. the VS Code CodeQL plugin
2. the gh-mrva command-line client
""",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('vscode_selection', type=str,
                    help='VS Code selection file to generate')
parser.add_argument('gh_mrva_selection', type=str,
                    help='gh-mrva cli selection file to generate')
parser.add_argument('-n', '--num-entries', type=int,
                    help='Only use N entries',
                    default=None)
parser.add_argument('-s', '--seed', type=int,
                    help='Random number seed',
                    default=4242)
parser.add_argument('-l', '--list-name', type=str,
                    help='Name of the repository list',
                    default='mirva-list')
args: Namespace = parser.parse_args()
#
#* Load the information
#
import json
import sys
import pandas as pd
# CID is an opaque hex string. Without an explicit dtype, a table whose CIDs
# all happen to look numeric (e.g. "123456" or "0e3379") is parsed as numbers
# and the generated repository names are wrong.
df0: DataFrame = pd.read_csv(sys.stdin, dtype={'CID': str})
if args.num_entries is None:
    # Use all entries
    df1: DataFrame = df0
else:
    # Use num_entries, chosen via pseudo-random numbers; the fixed seed makes
    # the selection reproducible.
    df1 = df0.sample(n=args.num_entries,
                     random_state=np.random.RandomState(args.seed))
#
#* Form and save structures
#
# One request name per selected row, built from (owner, name, CID).
repos: list[str] = [
    utils.form_db_req_name(row['owner'], row['name'], row['CID'])
    for _, row in df1[['owner', 'name', 'CID']].iterrows()
]
repo_list_name: str = args.list_name
# Selection file for the VS Code CodeQL plugin
vsc = {
    "version": 1,
    "databases": {
        "variantAnalysis": {
            "repositoryLists": [
                {
                    "name": repo_list_name,
                    "repositories": repos,
                }
            ],
            "owners": [],
            "repositories": []
        }
    },
    "selected": {
        "kind": "variantAnalysisUserDefinedList",
        "listName": repo_list_name
    }
}
# Selection file for the gh-mrva command-line client
gh = {
    repo_list_name: repos
}
with open(args.vscode_selection, "w") as fc:
    json.dump(vsc, fc, indent=4)
with open(args.gh_mrva_selection, "w") as fc:
    json.dump(gh, fc, indent=4)
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -1,48 +0,0 @@
#!/usr/bin/env python
""" Collect information about CodeQL databases from the file system and write out
a table in CSV format.
"""
from argparse import ArgumentParser
from typing import List
from pandas import DataFrame
import qldbtools.utils as utils
import argparse
import logging
import sys
import pandas as pd
from qldbtools.utils import DBInfo
#
#* Configure logger
#
logging.basicConfig(format='%(asctime)s %(message)s')
#
#* Process command line
#
parser: ArgumentParser = argparse.ArgumentParser(
    description="""Find all CodeQL DBs in and below starting_dir and export a CSV
file with relevant data.""")
parser.add_argument(
    'starting_dir',
    type=str,
    help='The starting directory to search for codeql.',
)
args = parser.parse_args()
#
#* Collect info
#
# One table row per database found below the starting directory; the columns
# are the attributes of each DBInfo record.
db_base: str = args.starting_dir
dbs: list[DBInfo] = [info for info in utils.collect_dbs(db_base)]
dbdf: DataFrame = pd.DataFrame([vars(info) for info in dbs])
#
#* Write info out
#
dbdf.to_csv(sys.stdout, index=False)
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -1,93 +0,0 @@
#!/usr/bin/env python
""" Read a table of CodeQL DB information (like those produced by
mc-db-refine-info) and push the databases it lists to the mrvacommander minio
DB.
"""
# /// script
# dependencies = [
# "pandas",
# "numpy",
# "minio",
# ]
# ///
import argparse
import qldbtools.utils as utils
import logging
import pandas as pd
import numpy as np
import sys
from minio import Minio
from minio.error import S3Error
from pathlib import Path
#
#* Configure logger
#
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
# Overwrite log level set by minio
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
#
#* Process command line
#
parser = argparse.ArgumentParser(
    description=""" Read a table of CodeQL DB information (like those produced by
mc-db-refine-info) and push the databases it lists to the mrvacommander minio
DB. """)
parser.add_argument('-n', '--num-entries', type=int,
                    help='Only use N entries',
                    default=None)
parser.add_argument('-s', '--seed', type=int,
                    help='Random number seed',
                    default=4242)
args = parser.parse_args()
#
#* Collect the information and select subset
#
# CID is an opaque hex string; forcing str keeps read_csv from turning values
# such as "123456" or "0e3379" into numbers.
df = pd.read_csv(sys.stdin, dtype={'CID': str})
if args.num_entries is None:
    # Use all entries
    entries = df
else:
    # Use num_entries, chosen via pseudo-random numbers; the fixed seed makes
    # the selection reproducible.
    entries = df.sample(n=args.num_entries,
                        random_state=np.random.RandomState(args.seed))
#
#* Push the DBs
#
# Configuration
# NOTE(review): endpoint and credentials are hard-coded; consider reading them
# from the environment instead.
MINIO_URL = "http://localhost:9000"
MINIO_ROOT_USER = "user"
MINIO_ROOT_PASSWORD = "mmusty8432"
QL_DB_BUCKET_NAME = "qldb"
# Initialize MinIO client; the client takes host:port, without the URL scheme
client = Minio(
    MINIO_URL.replace("http://", "").replace("https://", ""),
    access_key=MINIO_ROOT_USER,
    secret_key=MINIO_ROOT_PASSWORD,
    secure=False
)
# Create the bucket if it doesn't exist
try:
    if not client.bucket_exists(QL_DB_BUCKET_NAME):
        client.make_bucket(QL_DB_BUCKET_NAME)
    else:
        logging.info(f"Bucket '{QL_DB_BUCKET_NAME}' already exists.")
except S3Error as err:
    logging.error(f"Error creating bucket: {err}")
# Get info from dataframe and push the files
for index, row in entries[['owner', 'name', 'CID', 'path']].iterrows():
    owner, name, CID, path = row
    new_name = utils.form_db_bucket_name(owner, name, CID)
    try:
        client.fput_object(QL_DB_BUCKET_NAME, new_name, path)
        logging.info(f"Uploaded {path} as {new_name} to bucket {QL_DB_BUCKET_NAME}")
    except S3Error as err:
        # Log and continue with the next database (best effort).
        logging.error(f"Error uploading file {path}: {err}")
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -1,60 +0,0 @@
#!/usr/bin/env python
""" Read an initial table of CodeQL DB information, produced by
mc-db-initial-info, and collect more detailed information from the database
files. Write out an extended table in CSV format.
"""
from argparse import ArgumentParser
from typing import List
from pandas import DataFrame
import qldbtools.utils as utils
import argparse
import logging
import pandas as pd
import sys
#
#* Configure logger
#
logging.basicConfig(format='%(asctime)s %(message)s')
#
#* Process command line
#
parser: ArgumentParser = argparse.ArgumentParser(
    description="""Read an initial table of CodeQL DB information, produced by
mc-db-initial-info, and collect more detailed information from the database
files. Write out an extended table in CSV format. """)
args = parser.parse_args()
#
#* Collect the information
# This step is time-intensive so we save the results right after.
d: DataFrame = pd.read_csv(sys.stdin)
joiners: list[DataFrame] = []
# Visit every row, including the last one: range(len(d)) covers 0 .. len(d)-1.
for left_index in range(len(d)):
    try:
        metac: object
        cqlc: object
        cqlc, metac = utils.extract_metadata(d.path[left_index])
    except (utils.ExtractNotZipfile, utils.ExtractNoCQLDB):
        # Not a usable database archive; keep the row but add no details.
        continue
    try:
        detail_df: DataFrame = utils.metadata_details(left_index, cqlc, metac)
    except utils.DetailsMissing:
        continue
    joiners.append(detail_df)
joiners_df: DataFrame = pd.concat(joiners, axis=0)
# Outer merge: rows of d without details are kept, with empty detail columns.
full_df: DataFrame = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer')
#
#* Save results
#
full_df.to_csv(sys.stdout, index=False)
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -1,122 +0,0 @@
#!/usr/bin/env python
""" Read a table of CodeQL DB information and produce a table with unique entries
adding the Cumulative ID (CID) column.
To make this happen:
- Group entries by (owner,name,CID),
sort each group by creationTime,
and keep only the top (newest) element.
- Drop rows that don't have the
| cliVersion |
| creationTime |
| language |
| sha |
columns. There are very few (16 out of 6000 on recent tests) and their DBs
are questionable.
"""
import argparse
import logging
from argparse import Namespace
from typing import Any
from pandas import DataFrame, Series
#
#* Configure logger
#
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
# Overwrite log level set by minio
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
#
#* Process command line
#
parser = argparse.ArgumentParser(
    description=""" Read a table of CodeQL DB information,
narrow to <language>,
group entries by (owner,name,CID), sort each group by
creationTime and keep only the top (newest) element.
""")
parser.add_argument('language', type=str,
                    help='The language to be analyzed.')
args: Namespace = parser.parse_args()
#
#* Collect the information and select subset
#
import pandas as pd
import sys
import qldbtools.utils as utils
df2: DataFrame = pd.read_csv(sys.stdin)
#
#* Add single uniqueness field -- CID (Cumulative ID)
#
# The CID is derived from the four fields that distinguish database versions.
df2['CID'] = df2.apply(lambda row:
                       utils.cid_hash((
                           row['cliVersion'],
                           row['creationTime'],
                           row['language'],
                           row['sha'],
                       )), axis=1)
#
#* Re-order the dataframe columns by importance
# - Much of the data
# 1. Is only conditionally present
# 2. Is extra info, not for the DB proper
# 3. May have various names
#
# - The essential columns are
# | owner |
# | name |
# | language |
# | size |
# | cliVersion |
# | creationTime |
# | sha |
# | baselineLinesOfCode |
# | path |
#
# - The rest are useful; put them last
# | db_lang |
# | db_lang_displayName |
# | db_lang_file_count |
# | db_lang_linesOfCode |
# | left_index |
# | ctime |
# | primaryLanguage |
# | finalised |
df3: DataFrame = df2.reindex(columns=['owner', 'name', 'cliVersion', 'creationTime',
                                      'language', 'sha', 'CID',
                                      'baselineLinesOfCode', 'path', 'db_lang',
                                      'db_lang_displayName', 'db_lang_file_count',
                                      'db_lang_linesOfCode', 'ctime',
                                      'primaryLanguage', 'finalised', 'left_index',
                                      'size'])
# Identify rows missing specific entries
rows = (df3['cliVersion'].isna() |
        df3['creationTime'].isna() |
        df3['language'].isna() |
        df3['sha'].isna())
df4: DataFrame = df3[~rows]
# Limit to one language
df5 = df4[df4['language'] == args.language]
# Sort and group. creationTime is sorted in descending order so that the entry
# picked by first() for each (owner, name, CID) group is the newest one.
df_sorted: DataFrame = df5.sort_values(
    by=['owner', 'name', 'CID', 'creationTime'],
    ascending=[True, True, True, False])
df_unique: DataFrame = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index()
# Write output
df_unique.to_csv(sys.stdout, index=False)
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -1,35 +0,0 @@
#!/usr/bin/env python
""" Read a table of CodeQL DB information and display it using pandasui
"""
import argparse
import logging
import sys
#
#* Configure logger
#
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
# Force the root logger to INFO
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
#
#* Process command line
#
# No options are defined; parsing still provides --help and rejects stray
# arguments.
parser = argparse.ArgumentParser(
    description="Read a table of CodeQL DB information and display it using pandasgui")
args = parser.parse_args()
#
#* Collect the information display
#
import pandas as pd
df = pd.read_csv(sys.stdin)
import os
# Set before pandasgui is imported, as the value indicates.
os.environ['APPDATA'] = "needed-for-pandasgui"
from pandasgui import show
show(df)
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -1,120 +0,0 @@
#!/usr/bin/env python3
import json
import hashlib
import yaml
import sys
from plumbum import cli, local
from plumbum.cmd import find, mkdir, ln, rm, mktemp, unzip, date, env
# Logging function
def log(level, message):
    """Print a timestamped, colorized log line to stderr.

    level   -- "INFO", "WARN" or "ERROR"; any other value is printed without
               a color instead of raising KeyError.
    message -- text to print.
    """
    # Function-scope import: the timestamp comes from the standard library
    # instead of spawning an external `date` process for every message.
    import time

    colors = {
        "INFO": "\033[1;34m",
        "WARN": "\033[1;33m",
        "ERROR": "\033[1;31m",
        "RESET": "\033[0m",
    }
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    color = colors.get(level, colors["RESET"])
    print(f"{color}[{timestamp}] [{level}] {message}{colors['RESET']}", file=sys.stderr)
# Generate a CID (cumulative id)
def generate_cid(cli_version, creation_time, primary_language, sha):
    """Return the CID: the first 6 hex digits of the SHA-256 digest of the
    four arguments joined by single spaces."""
    joined = "{} {} {} {}".format(cli_version, creation_time, primary_language, sha)
    digest = hashlib.sha256(joined.encode())
    return digest.hexdigest()[:6]
# Expand environment variables in paths
def expand_path(path):
    """Return `path` after expansion by plumbum's `local.env.expand`."""
    expanded = local.env.expand(path)
    return expanded
# Process a single db.zip file
def process_db_file(zip_path, db_collection_dir):
    """Register one db.zip file in the collection directory.

    Reads codeql-database.yml from the archive, appends one JSON record (one
    line) to <db_collection_dir>/metadata.json and creates the symlink
    <owner>-<repo>-ctsj-<cid>.zip in the collection directory pointing at the
    archive.

    zip_path          -- path of the db.zip archive.
    db_collection_dir -- directory receiving metadata.json and the symlink.

    Failures are logged as warnings and do not propagate, so one bad archive
    does not stop the run.
    """
    # Function-scope import keeps this block self-contained.
    import os

    temp_dir = mktemp("-d").strip()
    try:
        unzip("-o", "-q", zip_path, "*codeql-database.yml", "-d", temp_dir)
        # Locate the YAML file regardless of its depth
        yaml_files = list(local.path(temp_dir).walk(
            filter=lambda p: p.name == "codeql-database.yml"))
        if not yaml_files:
            log("WARN", f"No codeql-database.yml found in {zip_path}")
            return
        yaml_path = yaml_files[0]
        with yaml_path.open("r") as f:
            yaml_data = yaml.safe_load(f)
        primary_language = yaml_data["primaryLanguage"]
        creation_metadata = yaml_data["creationMetadata"]
        sha = creation_metadata["sha"]
        cli_version = creation_metadata["cliVersion"]
        creation_time = creation_metadata["creationTime"]
        # owner and repo are the last two components of sourceLocationPrefix
        source_location_prefix = local.path(yaml_data["sourceLocationPrefix"])
        repo = source_location_prefix.name
        owner = source_location_prefix.parent.name
        cid = generate_cid(cli_version, creation_time, primary_language, sha)
        new_db_fname = f"{owner}-{repo}-ctsj-{cid}.zip"
        result_url = f"http://hepc/{db_collection_dir}/{new_db_fname}"
        metadata = {
            "git_branch" : "HEAD",
            "git_commit_id" : sha,
            "git_repo" : repo,
            # str(): the YAML loader may return a datetime object here
            "ingestion_datetime_utc" : str(creation_time),
            "result_url" : result_url,
            "tool_id" : "9f2f9642-febb-4435-9204-fb50bbd43de4",
            "tool_name" : f"codeql-{primary_language}",
            "tool_version" : cli_version,
            "projname" : f"{owner}/{repo}",
        }
        metadata_file = local.path(db_collection_dir) / "metadata.json"
        with metadata_file.open("a") as f:
            json.dump(metadata, f)
            f.write("\n")
        link_path = local.path(db_collection_dir) / new_db_fname
        if not link_path.exists():
            # A symlink target is interpreted relative to the link's own
            # directory, so a relative zip_path would dangle; link to the
            # absolute path instead.
            ln("-sf", os.path.abspath(str(zip_path)), link_path)
    except Exception as e:
        log("WARN", f"Error processing {zip_path}: {e}")
    finally:
        rm("-rf", temp_dir)
# Main application class
class DBProcessor(cli.Application):
    """
    DBProcessor processes db.zip files found in a starting directory,
    symlinks updated names in a collection directory,
    and adds a metadata information file "metadata.json" to the directory.
    """

    db_collection_dir = cli.SwitchAttr(
        "--db_collection_dir",
        str,
        mandatory=True,
        help="Specify the database collection directory",
    )
    starting_path = cli.SwitchAttr(
        "--starting_path",
        str,
        mandatory=True,
        help="Specify the starting path",
    )

    def main(self):
        # Expand both locations, then make sure the collection directory exists.
        collection = expand_path(self.db_collection_dir)
        search_root = expand_path(self.starting_path)
        mkdir("-p", collection)

        log("INFO", f"Searching for db.zip files in {search_root}")
        # Only non-empty regular files named db.zip are considered.
        found = find(search_root, "-type", "f", "-name", "db.zip",
                     "-size", "+0c").splitlines()
        if found:
            for archive in found:
                process_db_file(archive, collection)
            log("INFO", "Processing completed.")
        else:
            log("WARN", "No db.zip files found in the specified starting path.")


if __name__ == "__main__":
    DBProcessor.run()

View File

@@ -1,89 +0,0 @@
#!/usr/bin/env python3
import logging
from pathlib import Path
from plumbum import cli
from fastapi import FastAPI, HTTPException
from fastapi.responses import FileResponse
import uvicorn
# Logging configuration
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=[logging.StreamHandler()]
)
logger = logging.getLogger(__name__)
# FastAPI application
app = FastAPI()
db_dir = None # This will be set by the CLI application
@app.get("/db/{file_path:path}")
def serve_file(file_path: str):
    """
    Serve files from the database directory, such as .zip files or metadata.json.

    `file_path` is interpreted relative to the server's working directory and
    must lie inside `db_dir`; symlinks are resolved before the file is sent.
    Anything else is answered with 404.
    """
    # Function-scope import keeps this block self-contained.
    import os

    logger.info(f"Requested file: {file_path}")
    # Confine requests to the database directory. The check is lexical and is
    # done before symlink resolution, because the entries in db_dir are
    # symlinks that deliberately point at files stored elsewhere.
    requested = Path(os.path.abspath(file_path))
    root = Path(os.path.abspath(db_dir))
    if not requested.is_relative_to(root):
        logger.error(f"Request outside of database directory: {file_path}")
        raise HTTPException(status_code=404, detail=f"{file_path} not found")
    # Resolve symlink; strict resolution raises for missing or unresolvable
    # entries, which must become a 404 rather than a server error.
    try:
        resolved_path = requested.resolve(strict=True)
    except (OSError, RuntimeError):
        logger.error(f"File not found: {file_path}")
        raise HTTPException(status_code=404, detail=f"{file_path} not found")
    logger.info(f"file resolved to: {resolved_path}")
    if not resolved_path.is_file():
        logger.error(f"File not found: {resolved_path}")
        raise HTTPException(status_code=404, detail=f"{file_path} not found")
    return FileResponse(resolved_path)
@app.get("/index")
@app.get("/api/v1/latest_results/codeql-all")
def serve_metadata_json():
    """
    Return metadata.json from the database directory; both routes above map
    to this handler. Responds with 404 when the file does not exist.
    """
    metadata_path = Path(db_dir) / "metadata.json"
    logger.info(f"Requested metadata.json at: {metadata_path}")
    if metadata_path.exists():
        logger.info(f"Serving metadata.json from: {metadata_path}")
        return FileResponse(metadata_path)
    logger.error("metadata.json not found.")
    raise HTTPException(status_code=404, detail="metadata.json not found")
@app.middleware("http")
async def log_request(request, call_next):
    """Log method and URL of each incoming request, then pass it on."""
    logger.info(f"Incoming request: {request.method} {request.url}")
    return await call_next(request)
class DBService(cli.Application):
    """
    DBService serves:
    1. CodeQL database .zip files symlinked in the --codeql-db-dir
    2. Metadata for those zip files, contained in metadata.json in the same
    directory.
    The HTTP endpoints are:
    1. /db/{filename}
    2. /index
    3. /api/v1/latest_results/codeql-all
    """

    codeql_db_dir = cli.SwitchAttr(
        "--codeql-db-dir",
        str,
        mandatory=True,
        help="Directory containing CodeQL database files",
    )
    host = cli.SwitchAttr(
        "--host",
        str,
        default="127.0.0.1",
        help="Host address for the HTTP server",
    )
    port = cli.SwitchAttr(
        "--port",
        int,
        default=8080,
        help="Port for the HTTP server",
    )

    def main(self):
        # The request handlers read the module-level db_dir, so publish the
        # command-line value there before the server starts.
        global db_dir
        db_dir = Path(self.codeql_db_dir)
        if db_dir.is_dir():
            logger.info(f"Starting server at {self.host}:{self.port}")
            logger.info(f"Serving files from directory: {db_dir}")
            # Run the FastAPI server using Uvicorn
            uvicorn.run(app, host=self.host, port=self.port)
        else:
            logger.error(f"Invalid directory: {db_dir}")
            return 1


if __name__ == "__main__":
    DBService.run()

View File

@@ -1,67 +0,0 @@
#!/usr/bin/env python
"""
Script to list full details for a mrva-list file
1. reads files containing
{
"mirva-list": [
"NLPchina/elasticsearch-sqlctsj168cc4",
"LMAX-Exchange/disruptorctsj3e75ec",
"justauth/JustAuthctsj8a6177",
"FasterXML/jackson-modules-basectsj2fe248",
"ionic-team/capacitor-pluginsctsj38d457",
"PaddlePaddle/PaddleOCRctsj60e555",
"elastic/apm-agent-pythonctsj21dc64",
"flipkart-incubator/zjsonpatchctsjc4db35",
"stephane/libmodbusctsj54237e",
"wso2/carbon-kernelctsj5a8a6e",
"apache/servicecomb-packctsj4d98f5"
]
}
2. reads a pandas dataframe stored in a csv file
3. selects all rows from 2. that
- contain the 'owner' column matching the string before the slash from 1. and
- the 'name' column matching the string between the slash and the marker
'ctsj' and
- the 'CID' column matching the string after the marker 'ctsj'
"""
import argparse
import json
import sys
#
#* Process command line
#
parser = argparse.ArgumentParser(
    description="""Script to list full details for a mrva-list file""")
parser.add_argument('mrva_list', type=str,
                    help='The JSON file containing the mrva-list')
parser.add_argument('info_csv', type=str,
                    help='The CSV file containing the full information')
args = parser.parse_args()
#* Step 1: Read the JSON file containing the "mirva-list"
with open(args.mrva_list, 'r') as f:
    data = json.load(f)
# Extract and parse the "mirva-list". If the file uses another list name, use
# the entries of all lists it contains.
if 'mirva-list' in data:
    mirva_list = data['mirva-list']
else:
    mirva_list = [item for repo_list in data.values() for item in repo_list]
# A set gives constant-time membership tests in step 3.
parsed_mirva_list = set()
for item in mirva_list:
    # Entries look like OWNER/NAMEctsjCID. Split at the first slash and at the
    # LAST marker, so a repository name containing 'ctsj' is handled.
    owner_name, slash, rest = item.partition('/')
    repo_name, marker, cid = rest.rpartition('ctsj')
    if not slash or not marker:
        raise ValueError(f"Malformed mrva-list entry: '{item}'")
    parsed_mirva_list.add((owner_name, repo_name, cid))
#* Step 2: Read the CSV file into a pandas dataframe
import pandas as pd
# CID is an opaque hex string; forcing str keeps values such as "123456" or
# "0e3379" from being parsed as numbers, which would never match the list.
df = pd.read_csv(args.info_csv, dtype={'CID': str})
#* Step 3: Filter the dataframe based on the parsed "mirva-list"
selected = pd.Series(
    [key in parsed_mirva_list for key in zip(df['owner'], df['name'], df['CID'])],
    index=df.index, dtype=bool)
filtered_df = df[selected]
# Write the matching rows as CSV to standard output
filtered_df.to_csv(sys.stdout, index=False)

File diff suppressed because it is too large Load Diff

View File

@@ -1,138 +0,0 @@
[project]
name = "qldbtools"
version = "0.1.0"
description = "A Python package for selecting sets of CodeQL databases to work on"
authors = [
{name = "Michael Hohn", email = "hohn@github.com"}
]
readme = {file = "README.org", content-type = "text/plain"}
requires-python = ">=3.11"
dependencies = [
"annotated-types>=0.7.0",
"anyio>=4.4.0",
"appnope>=0.1.4",
"argon2-cffi>=23.1.0",
"argon2-cffi-bindings>=21.2.0",
"arrow>=1.3.0",
"asttokens>=2.4.1",
"async-lru>=2.0.4",
"attrs>=24.2.0",
"babel>=2.16.0",
"beautifulsoup4>=4.12.3",
"bleach>=6.1.0",
"blinker>=1.9.0",
"certifi>=2024.7.4",
"cffi>=1.17.0",
"charset-normalizer>=3.3.2",
"click>=8.1.7",
"comm>=0.2.2",
"debugpy>=1.8.5",
"decorator>=5.1.1",
"defusedxml>=0.7.1",
"executing>=2.0.1",
"fastapi>=0.115.5",
"fastjsonschema>=2.20.0",
"flask>=3.1.0",
"fqdn>=1.5.1",
"h11>=0.14.0",
"httpcore>=1.0.5",
"httpx>=0.27.0",
"idna>=3.7",
"ipykernel>=6.29.5",
"ipython>=8.26.0",
"isoduration>=20.11.0",
"itsdangerous>=2.2.0",
"jedi>=0.19.1",
"jinja2>=3.1.4",
"json5>=0.9.25",
"jsonpointer>=3.0.0",
"jsonschema>=4.23.0",
"jsonschema-specifications>=2023.12.1",
"jupyter-events>=0.10.0",
"jupyter-lsp>=2.2.5",
"jupyter-client>=8.6.2",
"jupyter-core>=5.7.2",
"jupyter-server>=2.14.2",
"jupyter-server-terminals>=0.5.3",
"jupyterlab>=4.2.4",
"jupyterlab-pygments>=0.3.0",
"jupyterlab-server>=2.27.3",
"lckr-jupyterlab-variableinspector",
"markupsafe>=2.1.5",
"matplotlib-inline>=0.1.7",
"minio==7.2.8",
"mistune>=3.0.2",
"nbclient>=0.10.0",
"nbconvert>=7.16.4",
"nbformat>=5.10.4",
"nest-asyncio>=1.6.0",
"notebook-shim>=0.2.4",
"numpy>=2.1.0",
"overrides>=7.7.0",
"packaging>=24.1",
"pandas>=2.2.2",
"pandocfilters>=1.5.1",
"parso>=0.8.4",
"pexpect>=4.9.0",
"platformdirs>=4.2.2",
"plumbum>=1.9.0",
"prometheus-client>=0.20.0",
"prompt-toolkit>=3.0.47",
"psutil>=6.0.0",
"ptyprocess>=0.7.0",
"pure-eval>=0.2.3",
"pycparser>=2.22",
"pycryptodome>=3.20.0",
"pydantic>=2.10.2",
"pydantic-core>=2.27.1",
"pygments>=2.18.0",
"python-dateutil>=2.9.0.post0",
"python-json-logger>=2.0.7",
"pytz>=2024.1",
"pyyaml>=6.0.2",
"pyzmq>=26.1.1",
"referencing>=0.35.1",
"requests>=2.32.3",
"rfc3339-validator>=0.1.4",
"rfc3986-validator>=0.1.1",
"rpds-py>=0.20.0",
"send2trash>=1.8.3",
"six>=1.16.0",
"sniffio>=1.3.1",
"soupsieve>=2.6",
"stack-data>=0.6.3",
"starlette>=0.41.3",
"terminado>=0.18.1",
"tinycss2>=1.3.0",
"tornado>=6.4.1",
"traitlets>=5.14.3",
"types-python-dateutil>=2.9.0.20240821",
"typing-extensions>=4.12.2",
"tzdata>=2024.1",
"uri-template>=1.3.0",
"urllib3>=2.2.2",
"uvicorn>=0.32.1",
"wcwidth>=0.2.13",
"webcolors>=24.8.0",
"webencodings>=0.5.1",
"websocket-client>=1.8.0",
"werkzeug>=3.1.3",
]
[build-system]
requires = ["setuptools>=75.5.0", "wheel"]
build-backend = "setuptools.build_meta"
[tool.setuptools]
packages = ["qldbtools"]
script-files = [
"bin/mc-db-generate-selection",
"bin/mc-db-initial-info",
"bin/mc-db-populate-minio",
"bin/mc-db-refine-info",
"bin/mc-db-unique",
"bin/mc-db-view-info",
"bin/mc-hepc-init",
"bin/mc-hepc-serve",
"bin/mc-rows-from-mrva-list",
]

View File

@@ -1,11 +0,0 @@
{
"folders": [
{
"path": "."
}
],
"settings": {
"git.ignoreLimitWarning": true,
"makefile.configureOnOpen": false
}
}

View File

@@ -1,2 +0,0 @@
from . import utils

View File

@@ -1,205 +0,0 @@
""" This module supports the selection of CodeQL databases based on various
criteria.
"""
#* Imports
from dataclasses import dataclass
from pathlib import Path
import datetime
import json
import logging
import os
from typing import List, Dict, Any
import pandas as pd
import time
import yaml
import zipfile
from pandas import DataFrame
#* Setup
logging.basicConfig(
level=logging.DEBUG,
format='%(asctime)s [%(levelname)s] %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
#* Utility functions
def log_and_raise(message):
    """Log `message` at ERROR level, then raise `Exception(message)`."""
    failure = Exception(message)
    logging.error(message)
    raise failure
def log_and_raise_e(message, exception):
    """Log `message` at ERROR level, then raise `exception(message)`.

    `exception` is the exception class (or any callable returning an
    exception instance) to raise.
    """
    logging.error(message)
    failure = exception(message)
    raise failure
def traverse_tree(root: str) -> Path:
    """Generate the Path of every regular file below `root` (recursively).

    `root` may start with `~`. If it does not exist or is not a directory,
    the error is logged and an Exception is raised when iteration starts.
    """
    base = Path(os.path.expanduser(root))
    if not (base.exists() and base.is_dir()):
        log_and_raise(f"The specified root path '{root}' does not exist or "
                      "is not a directory.")
    # Directories themselves are not reported, only the files within them.
    yield from (entry for entry in base.rglob('*') if entry.is_file())
@dataclass
class DBInfo:
    # Record describing one db.zip file. The default values are sample data
    # and document the expected form of each field.
    ctime: str = '2024-05-13T12:04:01.593586'   # ISO-format timestamp string
    language: str = 'cpp'
    name: str = 'nanobind'
    owner: str = 'wjakob'
    path: Path = Path('/Users/.../db.zip')
    size: int = 63083064
def collect_dbs(db_base: str) -> DBInfo:
    """Generate one DBInfo record for every file named db.zip below `db_base`.

    owner, name and language are taken from fixed positions counted from the
    end of the file path; size and ctime come from the file's stat data.
    """
    for zip_path in traverse_tree(db_base):
        if zip_path.name != "db.zip":
            continue
        # Work from the end of the path:
        #   .../OWNER/NAME/x/x/x/LANGUAGE/db.zip
        *_, owner, name, _, _, _, language, _ = zip_path.parts
        stat_data = zip_path.stat()
        created = datetime.datetime.fromtimestamp(stat_data.st_ctime)
        yield DBInfo(
            ctime=created.isoformat(),
            language=language,
            name=name,
            owner=owner,
            path=zip_path,
            size=stat_data.st_size,
        )
def extract_metadata(zipfile_path: str) -> tuple[object,object]:
    """
    extract_metadata(zipfile_path)

    Read the zip archive at `zipfile_path` in memory and return the tuple
    (codeql_content, meta_content):

      codeql_content  parsed contents of the member whose name ends in
                      'codeql-database.yml'
      meta_content    parsed contents of the member whose name ends in
                      'baseline-info.json', or a placeholder with a single
                      'no-language' entry when the archive has no such member

    Raises
      ExtractNotZipfile  the file is not a valid zip archive
      ExtractNoCQLDB     no (non-empty) codeql-database.yml was found
    """
    codeql_content = None
    meta_content = None
    try:
        with zipfile.ZipFile(zipfile_path, 'r') as z:
            for file_info in z.infolist():
                # Filenames seen
                #     java/codeql-database.yml
                #     codeql_db/codeql-database.yml
                if file_info.filename.endswith('codeql-database.yml'):
                    with z.open(file_info) as f:
                        codeql_content = yaml.safe_load(f)
                # And
                #     java/baseline-info.json
                #     codeql_db/baseline-info.json
                elif file_info.filename.endswith('baseline-info.json'):
                    with z.open(file_info) as f:
                        meta_content = json.load(f)
    except zipfile.BadZipFile:
        log_and_raise_e(f"Not a zipfile: '{zipfile_path}'", ExtractNotZipfile)
    # The baseline-info is only available in more recent CodeQL versions
    if not meta_content:
        meta_content = {'languages':
                        {'no-language': {'displayName': 'no-language',
                                         'files': [],
                                         'linesOfCode': -1,
                                         'name': 'nolang'},
                         }}
    if not codeql_content:
        # The archive is a valid zip, but it holds no CodeQL database description.
        log_and_raise_e(f"No codeql-database.yml found in: '{zipfile_path}'",
                        ExtractNoCQLDB)
    return codeql_content, meta_content
# Raised by extract_metadata() when the given file is not a valid zip archive.
class ExtractNotZipfile(Exception): pass
# Raised by extract_metadata() when the archive yields no codeql-database.yml content.
class ExtractNoCQLDB(Exception): pass
def metadata_details(left_index: int, codeql_content: object, meta_content: object) -> pd.DataFrame:
    """
    metadata_details(left_index, codeql_content, meta_content)

    Build the dataframe of metadata used in DB selection. The result has one
    row per language listed in meta_content['languages']; the database-level
    values from codeql_content are repeated on every row, and `left_index`
    is stored in each row so the result can be joined back to its source
    table. Raises DetailsMissing (via log_and_raise_e) when meta_content has
    no 'languages' key.

    Example, cropped to fit:

        full_df.T
        Out[535]:
                                             0                  1
        left_index                           0                  0
        baselineLinesOfCode              17990              17990
        primaryLanguage                    cpp                cpp
        sha                  288920efc079766f4  282c20efc079766f4
        cliVersion                      2.17.0             2.17.0
        creationTime             .325253+00:00    51.325253+00:00
        finalised                         True               True
        db_lang                            cpp             python
        db_lang_displayName              C/C++             Python
        db_lang_file_count                 102                 27
        db_lang_linesOfCode              17990               5586
    """
    creation = codeql_content['creationMetadata']
    db_level = {
        'left_index': left_index,
        'baselineLinesOfCode': codeql_content['baselineLinesOfCode'],
        'primaryLanguage': codeql_content['primaryLanguage'],
        'sha': creation.get('sha', 'abcde0123'),
        'cliVersion': creation['cliVersion'],
        'creationTime': creation['creationTime'],
        'finalised': codeql_content.get('finalised', pd.NA),
    }
    db_frame = pd.DataFrame(db_level, index=[0])

    if 'languages' not in meta_content:
        log_and_raise_e("Missing 'languages' in metadata", DetailsMissing)

    # Metadata property -> output column; other properties are ignored.
    column_for = {
        'files': 'db_lang_file_count',
        'linesOfCode': 'db_lang_linesOfCode',
        'displayName': 'db_lang_displayName',
    }
    per_language: list[dict[str, int | Any]] = []
    for lang, lang_cont in meta_content['languages'].items():
        entry: dict[str, int | Any] = {'left_index': left_index,
                                       'db_lang': lang}
        for prop, val in lang_cont.items():
            column = column_for.get(prop)
            if column is None:
                continue
            # Only the number of files is kept, not the file list itself.
            entry[column] = len(val) if prop == 'files' else val
        per_language.append(entry)

    lang_frame: DataFrame = pd.DataFrame(per_language)
    return pd.merge(db_frame, lang_frame, on='left_index', how='outer')
# Raised by metadata_details() when the metadata has no 'languages' entry.
class DetailsMissing(Exception): pass
from hashlib import blake2b
def cid_hash(row_tuple: tuple):
    """
    cid_hash(row_tuple)

    Return a short identifier for `row_tuple`: the 3-byte BLAKE2b digest of
    the encoded str() form of the tuple, as a 6-character hex string.
    """
    encoded = str(row_tuple).encode()
    return blake2b(encoded, digest_size=3).hexdigest()
def form_db_bucket_name(owner, name, CID):
    """
    form_db_bucket_name(owner, name, CID)

    Return the object name to use in minio storage. The function is trivial;
    it exists so the naming stays consistent everywhere.

    The literal 'ctsj' separating the name from the CID is a random, unique
    key used to identify the information.
    """
    stem = f'{owner}${name}ctsj{CID}'
    return f'{stem}.zip'
def form_db_req_name(owner: str, name: str, CID: str) -> str:
    """
    form_db_req_name(owner, name, CID)

    Return the repository name to use in mrva requests. The function is
    trivial; it exists so the naming stays consistent everywhere.

    The literal 'ctsj' separating the name from the CID is a random, unique
    key used to identify the information.
    """
    repository = f'{owner}/{name}'
    return f'{repository}ctsj{CID}'
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -1,109 +0,0 @@
annotated-types==0.7.0
anyio==4.4.0
appnope==0.1.4
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==2.4.1
async-lru==2.0.4
attrs==24.2.0
babel==2.16.0
beautifulsoup4==4.12.3
bleach==6.1.0
blinker==1.9.0
certifi==2024.7.4
cffi==1.17.0
charset-normalizer==3.3.2
click==8.1.7
comm==0.2.2
debugpy==1.8.5
decorator==5.1.1
defusedxml==0.7.1
executing==2.0.1
fastapi==0.115.5
fastjsonschema==2.20.0
Flask==3.1.0
fqdn==1.5.1
h11==0.14.0
httpcore==1.0.5
httpx==0.27.0
idna==3.7
ipykernel==6.29.5
ipython==8.26.0
isoduration==20.11.0
itsdangerous==2.2.0
jedi==0.19.1
Jinja2==3.1.4
json5==0.9.25
jsonpointer==3.0.0
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
jupyter-events==0.10.0
jupyter-lsp==2.2.5
jupyter_client==8.6.2
jupyter_core==5.7.2
jupyter_server==2.14.2
jupyter_server_terminals==0.5.3
jupyterlab==4.2.4
jupyterlab_pygments==0.3.0
jupyterlab_server==2.27.3
MarkupSafe==2.1.5
matplotlib-inline==0.1.7
minio==7.2.8
mistune==3.0.2
nbclient==0.10.0
nbconvert==7.16.4
nbformat==5.10.4
nest-asyncio==1.6.0
notebook_shim==0.2.4
numpy==2.1.0
overrides==7.7.0
packaging==24.1
pandas==2.2.2
pandocfilters==1.5.1
parso==0.8.4
pexpect==4.9.0
platformdirs==4.2.2
plumbum==1.9.0
prometheus_client==0.20.0
prompt_toolkit==3.0.47
psutil==6.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
pycparser==2.22
pycryptodome==3.20.0
pydantic==2.10.2
pydantic_core==2.27.1
Pygments==2.18.0
python-dateutil==2.9.0.post0
python-json-logger==2.0.7
pytz==2024.1
PyYAML==6.0.2
pyzmq==26.1.1
referencing==0.35.1
requests==2.32.3
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rpds-py==0.20.0
Send2Trash==1.8.3
setuptools==75.5.0
six==1.16.0
sniffio==1.3.1
soupsieve==2.6
stack-data==0.6.3
starlette==0.41.3
terminado==0.18.1
tinycss2==1.3.0
tornado==6.4.1
traitlets==5.14.3
types-python-dateutil==2.9.0.20240821
typing_extensions==4.12.2
tzdata==2024.1
uri-template==1.3.0
urllib3==2.2.2
uvicorn==0.32.1
wcwidth==0.2.13
webcolors==24.8.0
webencodings==0.5.1
websocket-client==1.8.0
Werkzeug==3.1.3

View File

@@ -1,61 +0,0 @@
""" Read a table of CodeQL DB information
and generate the selection files for
1. the VS Code CodeQL plugin
2. the gh-mrva command-line client
"""
#
#* Collect the information and write files
#
import pandas as pd
import sys
import qldbtools.utils as utils
import numpy as np
import importlib
importlib.reload(utils)
df0 = pd.read_csv('scratch/db-info-3.csv')
# Use num_entries, chosen via pseudo-random numbers
df1 = df0.sample(n=3, random_state=np.random.RandomState(4242))
repos = []
for index, row in df1[['owner', 'name', 'CID', 'path']].iterrows():
owner, name, CID, path = row
repos.append(utils.form_db_req_name(owner, name, CID))
repo_list_name = "mirva-list"
vsc = {
"version": 1,
"databases": {
"variantAnalysis": {
"repositoryLists": [
{
"name": repo_list_name,
"repositories": repos,
}
],
"owners": [],
"repositories": []
}
},
"selected": {
"kind": "variantAnalysisUserDefinedList",
"listName": repo_list_name
}
}
gh = {
repo_list_name: repos
}
# write the files
import json
with open("tmp-selection-vsc.json", "w") as fc:
json.dump(vsc, fc, indent=4)
with open("tmp-selection-gh.json", "w") as fc:
json.dump(gh, fc, indent=4)
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -1,59 +0,0 @@
#* Experimental work with utils.py, to be merged into it.
# The rest of this interactive script is available as cli script in
# mc-db-initial-info
from utils import *
#* Data collection
# Get the db information in list of DBInfo form
db_base = "~/work-gh/mrva/mrva-open-source-download/"
dbs = list(collect_dbs(db_base))
# Inspect:
from pprint import pprint
pprint(["len", len(dbs)])
pprint(["dbs[0]", dbs[0].__dict__])
pprint(["dbs[-1]", dbs[-1].__dict__])
#
# Get a dataframe
dbdf = pd.DataFrame([d.__dict__ for d in dbs])
#
#* Experiments with on-disk format
# Continue use of raw information in separate session.
#
# PosixPath is a problem for json and parquet
#
dbdf['path'] = dbdf['path'].astype(str)
#
dbdf.to_csv('dbdf.csv')
#
dbdf.to_csv('dbdf.csv.gz', compression='gzip', index=False)
#
dbdf.to_json('dbdf.json')
#
# dbdf.to_hdf('dbdf.h5', key='dbdf', mode='w')
#
# fast, binary
dbdf.to_parquet('dbdf.parquet')
#
# fast
import sqlite3
conn = sqlite3.connect('dbdf.db')
dbdf.to_sql('qldbs', conn, if_exists='replace', index=False)
conn.close()
#
# Sizes:
# ls -laSr dbdf.*
# -rw-r--r--@ 1 hohn staff 101390 Jul 12 14:17 dbdf.csv.gz
# -rw-r--r--@ 1 hohn staff 202712 Jul 12 14:17 dbdf.parquet
# -rw-r--r--@ 1 hohn staff 560623 Jul 12 14:17 dbdf.csv
# -rw-r--r--@ 1 hohn staff 610304 Jul 12 14:17 dbdf.db
# -rw-r--r--@ 1 hohn staff 735097 Jul 12 14:17 dbdf.json
#
# parquet has many libraries, including go: xitongsys/parquet-go
# https://parquet.apache.org/
#
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -1,65 +0,0 @@
import qldbtools.utils as utils
import pandas as pd
import numpy as np
import sys
from minio import Minio
from minio.error import S3Error
from pathlib import Path
#
#* Collect the information and select subset
#
df = pd.read_csv('scratch/db-info-2.csv')
seed = 4242
if 0:
# Use all entries
entries = df
else:
# Use num_entries, chosen via pseudo-random numbers
entries = df.sample(n=3,
random_state=np.random.RandomState(seed))
#
#* Push the DBs
#
# Configuration
MINIO_URL = "http://localhost:9000"
MINIO_ROOT_USER = "user"
MINIO_ROOT_PASSWORD = "mmusty8432"
QL_DB_BUCKET_NAME = "qldb"
# Initialize MinIO client
client = Minio(
MINIO_URL.replace("http://", "").replace("https://", ""),
access_key=MINIO_ROOT_USER,
secret_key=MINIO_ROOT_PASSWORD,
secure=False
)
# Create the bucket if it doesn't exist
try:
if not client.bucket_exists(QL_DB_BUCKET_NAME):
client.make_bucket(QL_DB_BUCKET_NAME)
else:
print(f"Bucket '{QL_DB_BUCKET_NAME}' already exists.")
except S3Error as err:
print(f"Error creating bucket: {err}")
# (test) File paths and new names
files_to_upload = {
"cmd/server/codeql/dbs/google/flatbuffers/google_flatbuffers_db.zip": "google$flatbuffers.zip",
"cmd/server/codeql/dbs/psycopg/psycopg2/psycopg_psycopg2_db.zip": "psycopg$psycopg2.zip"
}
# (test) Push the files
prefix = Path('/Users/hohn/work-gh/mrva/mrvacommander')
for local_path, new_name in files_to_upload.items():
try:
client.fput_object(QL_DB_BUCKET_NAME, new_name, prefix / Path(local_path))
print(f"Uploaded {local_path} as {new_name} to bucket {QL_DB_BUCKET_NAME}")
except S3Error as err:
print(f"Error uploading file {local_path}: {err}")
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -1,46 +0,0 @@
# Session around bin/mc-db-unique
import qldbtools.utils as utils
import pandas as pd
#
#* Collect the information
#
df1 = pd.read_csv("scratch/db-info-2.csv")
# Add single uniqueness field -- CID (Cumulative ID) -- using
# - creationTime
# - sha
# - cliVersion
# - language
from hashlib import blake2b
def cid_hash(row_tuple: tuple):
    """
    cid_hash(row_tuple)
    Take a tuple and return the 3-byte BLAKE2b hash of its str() form as a
    hex string
    """
    h = blake2b(digest_size = 3)
    h.update(str(row_tuple).encode())
    # return int.from_bytes(h.digest(), byteorder='big')
    return h.hexdigest()
# Apply the cid_hash function to the specified columns and create the 'CID' column
df1['CID'] = df1.apply(lambda row: cid_hash( (row['creationTime'],
                                              row['sha'],
                                              row['cliVersion'],
                                              row['language'])
                                            ), axis=1)
df2 = df1.reindex(columns=['owner', 'name', 'cliVersion', 'creationTime',
                           'language', 'sha','CID', 'baselineLinesOfCode', 'path',
                           'db_lang', 'db_lang_displayName', 'db_lang_file_count',
                           'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
                           'finalised', 'left_index', 'size'])
# Inspect the new column; the column name is upper-case 'CID' (a lower-case
# 'cid' lookup raises KeyError).
df1['CID']
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -1,118 +0,0 @@
# Experimental work to be merged with bin/mc-db-refine-info
from utils import *
from pprint import pprint
#* Reload gzipped CSV file to continue work
dbdf_1 = pd.read_csv('dbdf.csv.gz', compression='gzip')
#
# (old) Consistency check:
# dbdf_1.columns == dbdf.columns
# dbmask = (dbdf_1 != dbdf)
# dbdf_1[dbmask]
# dbdf_1[dbmask].dropna(how='all')
# ctime_raw is different in places, so don't use it.
#
#* Interact with/visualize the dataframe
# Using pandasgui -- qt
from pandasgui import show
os.environ['APPDATA'] = "needed-for-pandasgui"
show(dbdf_1)
# Using dtale -- web
import dtale
dtale.show(dbdf_1)
#
#
#* Collect metadata from DB zip files
#
#** A manual sample
#
d = dbdf_1
left_index = 0
d.path[0]
cqlc, metac = extract_metadata(d.path[0])
cqlc['baselineLinesOfCode']
cqlc['primaryLanguage']
cqlc['creationMetadata']['sha']
cqlc['creationMetadata']['cliVersion']
cqlc['creationMetadata']['creationTime'].isoformat()
cqlc['finalised']
for lang, lang_cont in metac['languages'].items():
print(lang)
indent = " "
for prop, val in lang_cont.items():
if prop == 'files':
print("%sfiles count %d" % (indent, len(val)))
elif prop == 'linesOfCode':
print("%slinesOfCode %d" % (indent, val))
elif prop == 'displayName':
print("%sdisplayName %s" % (indent, val))
#** Automated for all entries
# The rest of this interactive script is available as cli script in
# mc-db-refine-info
d = dbdf_1
joiners = []
# Visit EVERY row: range(len(d)) includes the last entry (the earlier
# range(0, len(d)-1) silently skipped it).
for left_index in range(len(d)):
    # Entries whose zip file is unreadable or has no CodeQL database
    # description are skipped; they end up without metadata in the outer
    # merge below.
    try:
        cqlc, metac = extract_metadata(d.path[left_index])
    except (ExtractNotZipfile, ExtractNoCQLDB):
        continue
    try:
        detail_df = metadata_details(left_index, cqlc, metac)
    except DetailsMissing:
        continue
    joiners.append(detail_df)
joiners_df = pd.concat(joiners, axis=0)
# Join on d's row index, which is what left_index was taken from
full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer')
#** View the full dataframe with metadata
from pandasgui import show
os.environ['APPDATA'] = "needed-for-pandasgui"
show(full_df)
#** Re-order the dataframe columns by importance
# - Much of the data
# 1. Is only conditionally present
# 2. Is extra info, not for the DB proper
# 3. May have various names
# - The essential columns are
# | owner |
# | name |
# | language |
# | size |
# | cliVersion |
# | creationTime |
# | sha |
# | baselineLinesOfCode |
# | path |
# - The rest are useful; put them last
# | db_lang |
# | db_lang_displayName |
# | db_lang_file_count |
# | db_lang_linesOfCode |
# | left_index |
# | ctime |
# | primaryLanguage |
# | finalised |
final_df = full_df.reindex(columns=['owner', 'name', 'language', 'size', 'cliVersion',
'creationTime', 'sha', 'baselineLinesOfCode', 'path',
'db_lang', 'db_lang_displayName', 'db_lang_file_count',
'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
'finalised', 'left_index'])
final_df.to_csv('all-info-table.csv.gz', compression='gzip', index=False)
#
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
#

View File

@@ -1,41 +0,0 @@
# Experimental work for ../bin/mc-db-unique, to be merged into it.
import qldbtools.utils as utils
from pprint import pprint
import pandas as pd
# cd ../
#* Reload CSV file to continue work
df2 = df_refined = pd.read_csv('scratch/db-info-2.csv')
# Identify rows missing specific entries
rows = ( df2['cliVersion'].isna() |
df2['creationTime'].isna() |
df2['language'].isna() |
df2['sha'].isna() )
df2[rows]
df3 = df2[~rows]
df3
#* post-save work
df4 = pd.read_csv('scratch/db-info-3.csv')
# Sort and group
df_sorted = df4.sort_values(by=['owner', 'name', 'CID', 'creationTime'])
df_unique = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index()
# Find duplicates
df_dups = df_unique[df_unique['CID'].duplicated(keep=False)]
len(df_dups)
df_dups['CID']
# Set display options
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 140)
#
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
#

View File

@@ -1,46 +0,0 @@
# Session around bin/mc-db-unique
import qldbtools.utils as utils
import pandas as pd
#
#* Collect the information
#
df1 = pd.read_csv("scratch/db-info-2.csv")
# Add single uniqueness field -- CID (Cumulative ID) -- using
# - creationTime
# - sha
# - cliVersion
# - language
from hashlib import blake2b
def cid_hash(row_tuple: tuple):
    """
    cid_hash(row_tuple)
    Take a tuple and return the 3-byte BLAKE2b hash of its str() form as a
    hex string
    """
    h = blake2b(digest_size = 3)
    h.update(str(row_tuple).encode())
    # return int.from_bytes(h.digest(), byteorder='big')
    return h.hexdigest()
# Apply the cid_hash function to the specified columns and create the 'CID' column
df1['CID'] = df1.apply(lambda row: cid_hash( (row['creationTime'],
                                              row['sha'],
                                              row['cliVersion'],
                                              row['language'])
                                            ), axis=1)
df2 = df1.reindex(columns=['owner', 'name', 'cliVersion', 'creationTime',
                           'language', 'sha','CID', 'baselineLinesOfCode', 'path',
                           'db_lang', 'db_lang_displayName', 'db_lang_file_count',
                           'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
                           'finalised', 'left_index', 'size'])
# Inspect the new column; the column name is upper-case 'CID' (a lower-case
# 'cid' lookup raises KeyError).
df1['CID']
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:

View File

@@ -1,13 +0,0 @@
# Packaging script for the qldbtools package.
from setuptools import setup, find_packages
import glob
setup(
    name='qldbtools',
    version='0.1.0',
    description='A Python package for working with CodeQL databases',
    author='Michael Hohn',
    author_email='hohn@github.com',
    packages=['qldbtools'],
    # NOTE(review): no runtime dependencies are declared here; the package's
    # utils module imports pandas and yaml -- confirm they are installed by
    # other means (e.g. a requirements file).
    install_requires=[],
    # Every bin/mc-* file present when this script runs is installed as a
    # command-line script.
    scripts=glob.glob("bin/mc-*"),
)

2278
client/qldbtools/uv.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -23,8 +23,7 @@ ARG CODEQL_VERSION=latest
RUN apt-get update && apt-get install --no-install-recommends --assume-yes \
unzip \
curl \
ca-certificates \
default-jdk
ca-certificates
# If the version is 'latest', get the latest release version from GitHub, unzip the bundle into /opt, and delete the archive
RUN if [ "$CODEQL_VERSION" = "latest" ]; then \
@@ -33,15 +32,14 @@ RUN if [ "$CODEQL_VERSION" = "latest" ]; then \
echo "Using CodeQL version $CODEQL_VERSION" && \
curl -L "https://github.com/github/codeql-cli-binaries/releases/download/$CODEQL_VERSION/codeql-linux64.zip" -o /tmp/codeql.zip && \
unzip /tmp/codeql.zip -d /opt && \
rm /tmp/codeql.zip && \
chmod -R +x /opt/codeql
rm /tmp/codeql.zip
# Set environment variables for CodeQL
ENV CODEQL_CLI_PATH=/opt/codeql/codeql
ENV CODEQL_CLI_PATH=/opt/codeql
# Set environment variable for CodeQL for `codeql database analyze` support on ARM
# This env var has no functional effect on CodeQL when running on x86_64 linux
ENV CODEQL_JAVA_HOME=/usr
ENV CODEQL_JAVA_HOME=/usr/
# Copy built agent binary from the builder stage
WORKDIR /app

View File

@@ -1,23 +0,0 @@
# Build, run and publish the mrva-agent Docker image.
#
# The mk.* files are timestamp (stamp) files: each one records that the
# docker step of its rule completed, so that step is not repeated on every
# run.  `make clean` removes them so the next `make` rebuilds.

MAI_TARGET := mrva-agent:0.1.24

# Command names, not files
.PHONY: all mai mrva-agent mai-serve mai-push clean

all: mrva-agent

mai: mk.mrva-agent

mrva-agent: mk.mrva-agent

# Build the image; the build context is the repository root
mk.mrva-agent:
	cd ../../ && docker build -t ${MAI_TARGET} -f cmd/agent/Dockerfile .
	touch $@

# Interactive shell in the freshly built image
mai-serve: mai
	docker run --rm -it ${MAI_TARGET} /bin/bash

# The image may not exist (yet); ignore a failing rmi
clean:
	-docker rmi -f ${MAI_TARGET}
	$(RM) mk.mrva-agent mk.mai-push

mai-push: mk.mai-push

# Depend on the build stamp (a real file), not on the phony alias `mai`,
# so an up-to-date image is not pushed again on every invocation.
mk.mai-push: mk.mrva-agent
	docker tag ${MAI_TARGET} ghcr.io/hohn/${MAI_TARGET}
	docker push ghcr.io/hohn/${MAI_TARGET}
	touch $@

173
cmd/agent/main.go Normal file
View File

@@ -0,0 +1,173 @@
package main
import (
"context"
"flag"
"os"
"os/signal"
"runtime"
"strconv"
"sync"
"syscall"
"time"
"github.com/elastic/go-sysinfo"
"golang.org/x/exp/slog"
"mrvacommander/pkg/agent"
"mrvacommander/pkg/queue"
)
const (
workerMemoryMB = 2048 // 2 GB
monitorIntervalSec = 10 // Monitor every 10 seconds
)
// calculateWorkers returns how many workers to run on this host: one worker
// per workerMemoryMB of currently available memory, never fewer than one and
// never more than the number of CPUs. The process exits with status 1 when
// host or memory information cannot be read.
func calculateWorkers() int {
	host, err := sysinfo.Host()
	if err != nil {
		slog.Error("failed to get host info", "error", err)
		os.Exit(1)
	}

	mem, err := host.Memory()
	if err != nil {
		slog.Error("failed to get memory info", "error", err)
		os.Exit(1)
	}

	// Available (not total) memory, in MB
	availableMB := mem.Available / (1024 * 1024)

	// Memory-based count, with a floor of one worker
	workers := max(int(availableMB/workerMemoryMB), 1)

	// Cap at the CPU count
	if cpus := runtime.NumCPU(); workers > cpus {
		workers = max(cpus, 1)
	}
	return workers
}
// startAndMonitorWorkers launches agent workers that consume from queue.
//
// When desiredWorkerCount is non-zero, that many workers are started once and
// the function returns; those workers are handed ctx, and their stop channels
// are never closed here.
//
// When desiredWorkerCount is zero, the function keeps running until ctx is
// cancelled: every monitorIntervalSec seconds it recomputes the worker count
// with calculateWorkers and starts or stops workers to match. On
// cancellation it closes every remaining stop channel and returns.
//
// wg is incremented once per started worker.
// NOTE(review): agent.RunWorker is presumably responsible for the matching
// wg.Done -- confirm.
func startAndMonitorWorkers(ctx context.Context, queue queue.Queue, desiredWorkerCount int, wg *sync.WaitGroup) {
	stopChans := make([]chan struct{}, 0)

	// spawn starts one worker and records its stop channel.
	spawn := func() {
		stopChan := make(chan struct{})
		stopChans = append(stopChans, stopChan)
		wg.Add(1)
		go agent.RunWorker(ctx, stopChan, queue, wg)
	}

	if desiredWorkerCount != 0 {
		slog.Info("Starting workers", slog.Int("count", desiredWorkerCount))
		for i := 0; i < desiredWorkerCount; i++ {
			spawn()
		}
		return
	}

	slog.Info("Worker count not specified, managing based on available memory and CPU")

	currentWorkerCount := 0
	for {
		if ctx.Err() != nil {
			// signal all workers to stop
			for _, stopChan := range stopChans {
				close(stopChan)
			}
			return
		}

		newWorkerCount := calculateWorkers()
		if newWorkerCount != currentWorkerCount {
			slog.Info(
				"Modifying worker count",
				slog.Int("current", currentWorkerCount),
				slog.Int("new", newWorkerCount))
		}

		switch {
		case newWorkerCount > currentWorkerCount:
			for i := currentWorkerCount; i < newWorkerCount; i++ {
				spawn()
			}
		case newWorkerCount < currentWorkerCount:
			// Stop the most recently started workers
			for i := newWorkerCount; i < currentWorkerCount; i++ {
				close(stopChans[i])
			}
			stopChans = stopChans[:newWorkerCount]
		}
		currentWorkerCount = newWorkerCount

		// Wait for the next monitoring tick, but wake up immediately on
		// cancellation instead of sleeping through it.
		select {
		case <-ctx.Done():
		case <-time.After(monitorIntervalSec * time.Second):
		}
	}
}
// main is the agent entry point: it validates the environment, connects to
// RabbitMQ, starts the workers, and then blocks until SIGINT/SIGTERM before
// shutting down.
func main() {
	slog.Info("Starting agent")
	// 0 (the default) selects automatic worker-count management in
	// startAndMonitorWorkers.
	workerCount := flag.Int("workers", 0, "number of workers")
	flag.Parse()

	// Fail fast when any of these is unset. The CODEQL_* variables are only
	// checked for presence here; this function does not read their values.
	requiredEnvVars := []string{
		"MRVA_RABBITMQ_HOST",
		"MRVA_RABBITMQ_PORT",
		"MRVA_RABBITMQ_USER",
		"MRVA_RABBITMQ_PASSWORD",
		"CODEQL_JAVA_HOME",
		"CODEQL_CLI_PATH",
	}
	for _, envVar := range requiredEnvVars {
		if _, ok := os.LookupEnv(envVar); !ok {
			slog.Error("Missing required environment variable", "key", envVar)
			os.Exit(1)
		}
	}

	rmqHost := os.Getenv("MRVA_RABBITMQ_HOST")
	rmqPort := os.Getenv("MRVA_RABBITMQ_PORT")
	rmqUser := os.Getenv("MRVA_RABBITMQ_USER")
	rmqPass := os.Getenv("MRVA_RABBITMQ_PASSWORD")

	// bitSize 16 means a signed 16-bit value: ports above 32767 are rejected
	// here with a range error.
	rmqPortAsInt, err := strconv.ParseInt(rmqPort, 10, 16)
	if err != nil {
		slog.Error("Failed to parse RabbitMQ port", slog.Any("error", err))
		os.Exit(1)
	}

	slog.Info("Initializing RabbitMQ queue")
	rabbitMQQueue, err := queue.NewRabbitMQQueue(rmqHost, int16(rmqPortAsInt), rmqUser, rmqPass, false)
	if err != nil {
		slog.Error("failed to initialize RabbitMQ", slog.Any("error", err))
		os.Exit(1)
	}
	defer rabbitMQQueue.Close()

	var wg sync.WaitGroup
	ctx, cancel := context.WithCancel(context.Background())

	// Runs in the background; the workers it starts are tracked through wg.
	go startAndMonitorWorkers(ctx, rabbitMQQueue, *workerCount, &wg)
	slog.Info("Agent started")

	// Gracefully exit on SIGINT/SIGTERM
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
	<-sigChan
	slog.Info("Shutting down agent")

	// TODO: fix this to gracefully terminate agent workers during jobs
	cancel()
	// Block until the wait group's counter returns to zero.
	wg.Wait()
	slog.Info("Agent shutdown complete")
}

View File

@@ -1,56 +1,38 @@
FROM golang:1.22 AS builder
# Use the ubuntu 24.10 base image
FROM ubuntu:24.10
# Copy the entire project
WORKDIR /app
COPY . .
# Set architecture to arm64
ARG ARCH=arm64
ARG AARCH=aarch64
# Download dependencies
RUN go mod download
# Set the working directory to the cmd/server subproject
WORKDIR /app/cmd/server
# Build the server
RUN go build -o /bin/mrva_server ./main.go
FROM ubuntu:24.10 as runner
# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV CODEQL_VERSION=codeql-bundle-v2.17.5
ENV CODEQL_DOWNLOAD_URL=https://github.com/github/codeql-action/releases/download/${CODEQL_VERSION}/codeql-bundle-linux64.tar.gz
ENV JDK_VERSION=22.0.1
ENV JDK_DOWNLOAD_URL=https://download.oracle.com/java/21/latest/jdk-${JDK_VERSION}_linux-${AARCH}_bin.tar.gz
ENV JDK_DOWNLOAD_URL=https://download.java.net/java/GA/jdk${JDK_VERSION}/c7ec1332f7bb44aeba2eb341ae18aca4/8/GPL/openjdk-${JDK_VERSION}_linux-${AARCH}_bin.tar.gz
# Build argument for CodeQL version, defaulting to the latest release
ARG CODEQL_VERSION=latest
ENV CODEQL_JAVA_HOME=/usr/local/jdk-${JDK_VERSION}
# Install packages
RUN apt-get update && apt-get install --no-install-recommends --assume-yes \
unzip \
curl \
ca-certificates \
default-jdk
# Install necessary tools
RUN apt-get update && \
apt-get install -y curl tar && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# If the version is 'latest', get the latest release version from GitHub, unzip the bundle into /opt, and delete the archive
RUN if [ "$CODEQL_VERSION" = "latest" ]; then \
CODEQL_VERSION=$(curl -s https://api.github.com/repos/github/codeql-cli-binaries/releases/latest | grep '"tag_name"' | sed -E 's/.*"([^"]+)".*/\1/'); \
fi && \
echo "Using CodeQL version $CODEQL_VERSION" && \
curl -L "https://github.com/github/codeql-cli-binaries/releases/download/$CODEQL_VERSION/codeql-linux64.zip" -o /tmp/codeql.zip && \
unzip /tmp/codeql.zip -d /opt && \
rm /tmp/codeql.zip && \
chmod -R +x /opt/codeql
# Add and extract the CodeQL bundle
RUN curl -L $CODEQL_DOWNLOAD_URL -o /tmp/${CODEQL_VERSION}.tar.gz && \
tar -xzf /tmp/${CODEQL_VERSION}.tar.gz -C /opt && \
rm /tmp/${CODEQL_VERSION}.tar.gz
# Set environment variables for CodeQL
ENV CODEQL_CLI_PATH=/opt/codeql/codeql
# Add and extract the JDK
RUN curl -L $JDK_DOWNLOAD_URL -o /tmp/jdk-${JDK_VERSION}.tar.gz && \
tar -xzf /tmp/jdk-${JDK_VERSION}.tar.gz -C /usr/local && \
rm /tmp/jdk-${JDK_VERSION}.tar.gz
# Set environment variable for CodeQL for `codeql database analyze` support on ARM
# This env var has no functional effect on CodeQL when running on x86_64 linux
ENV CODEQL_JAVA_HOME=/usr
# Set PATH
ENV PATH=/opt/codeql:"$PATH"
# Set working directory to /app
# Copy built server binary from the builder stage
COPY --from=builder /bin/mrva_server ./mrva_server
# Copy the CodeQL database directory from the builder stage (for standalone mode)
COPY --from=builder /app/cmd/server/codeql ./codeql
# Run the server with the default mode set to container
ENTRYPOINT ["./mrva_server"]
CMD ["--mode=container"]
# Prepare host mount point
RUN mkdir /mrva

View File

@@ -1,26 +0,0 @@
# Build, run and publish the mrva-server Docker image.
#
# The mk.* files are timestamp (stamp) files: each one records that the
# docker step of its rule completed, so that step is not repeated on every
# run.  `make clean` removes them so the next `make` rebuilds.

MSI_TARGET := mrva-server:0.1.24

# Command names, not files
.PHONY: all msi mrva-server msi-serve msi-push msi-test clean

all: mrva-server

msi: mk.mrva-server

mrva-server: mk.mrva-server

# Build the image; the build context is the repository root
mk.mrva-server:
	cd ../../ && docker build -t ${MSI_TARGET} -f cmd/server/Dockerfile .
	touch $@

# Interactive shell in the freshly built image
msi-serve: msi
	docker run --rm -it ${MSI_TARGET} /bin/bash

# The image may not exist (yet); ignore a failing rmi
clean:
	-docker rmi -f ${MSI_TARGET}
	$(RM) mk.mrva-server mk.msi-push

msi-push: mk.msi-push

mk.msi-push: mk.mrva-server
	docker tag ${MSI_TARGET} ghcr.io/hohn/${MSI_TARGET}
	docker push ghcr.io/hohn/${MSI_TARGET}
	touch $@

# Pull the published image and open a shell in it
msi-test:
	docker pull ghcr.io/hohn/${MSI_TARGET}
	docker run --rm -it --name test-mrva-server-codeql ghcr.io/hohn/${MSI_TARGET} sh

Binary file not shown.

140
cmd/server/main.go Normal file
View File

@@ -0,0 +1,140 @@
// Copyright © 2024 github
// Licensed under the Apache License, Version 2.0 (the "License").
package main
import (
"flag"
"log"
"log/slog"
"os"
"strconv"
"mrvacommander/config/mcc"
"mrvacommander/pkg/agent"
"mrvacommander/pkg/artifactstore"
"mrvacommander/pkg/qldbstore"
"mrvacommander/pkg/queue"
"mrvacommander/pkg/server"
"mrvacommander/pkg/state"
)
// main is the server entry point. It parses the command-line flags, applies
// the log level, loads mcconfig.toml, and assembles the commander for the
// selected --mode:
//
//	standalone  in-process queue, state and stores, plus an in-process agent
//	container   RabbitMQ queue (configured via MRVA_RABBITMQ_* environment
//	            variables) and MinIO-backed stores
//	cluster     accepted, but nothing is assembled yet
//
// Any initialization failure is logged together with its cause and ends the
// process with status 1.
func main() {
	// Define flags
	helpFlag := flag.Bool("help", false, "Display help message")
	logLevel := flag.String("loglevel", "info", "Set log level: debug, info, warn, error")
	mode := flag.String("mode", "standalone", "Set mode: standalone, container, cluster")

	// Custom usage function for the help flag
	flag.Usage = func() {
		log.Printf("Usage of %s:\n", os.Args[0])
		flag.PrintDefaults()
		log.Println("\nExamples:")
		log.Println(" go run main.go --loglevel=debug --mode=container")
	}

	// Parse the flags
	flag.Parse()

	// Handle the help flag
	if *helpFlag {
		flag.Usage()
		return
	}

	// Apply 'loglevel' flag
	switch *logLevel {
	case "debug":
		slog.SetLogLoggerLevel(slog.LevelDebug)
	case "info":
		slog.SetLogLoggerLevel(slog.LevelInfo)
	case "warn":
		slog.SetLogLoggerLevel(slog.LevelWarn)
	case "error":
		slog.SetLogLoggerLevel(slog.LevelError)
	default:
		log.Printf("Invalid logging verbosity level: %s", *logLevel)
		os.Exit(1)
	}

	// Read configuration
	config := mcc.LoadConfig("mcconfig.toml")

	// Output configuration summary
	log.Printf("Help: %t\n", *helpFlag)
	log.Printf("Log Level: %s\n", *logLevel)
	log.Printf("Mode: %s\n", *mode)

	// Apply 'mode' flag
	switch *mode {
	case "standalone":
		// Assemble single-process version
		sq := queue.NewQueueSingle(2)
		ss := state.NewLocalState(config.Storage.StartingID)
		as := artifactstore.NewInMemoryArtifactStore()
		ql := qldbstore.NewLocalFilesystemCodeQLDatabaseStore("")

		server.NewCommanderSingle(&server.Visibles{
			Queue:         sq,
			State:         ss,
			Artifacts:     as,
			CodeQLDBStore: ql,
		})

		// FIXME take value from configuration
		agent.NewAgentSingle(2, &agent.Visibles{
			Queue:         sq,
			Artifacts:     as,
			CodeQLDBStore: ql,
		})

	case "container":
		rmqHost := os.Getenv("MRVA_RABBITMQ_HOST")
		rmqPort := os.Getenv("MRVA_RABBITMQ_PORT")
		rmqUser := os.Getenv("MRVA_RABBITMQ_USER")
		rmqPass := os.Getenv("MRVA_RABBITMQ_PASSWORD")

		// An unset MRVA_RABBITMQ_PORT is an empty string and fails here too.
		rmqPortAsInt, err := strconv.ParseInt(rmqPort, 10, 16)
		if err != nil {
			slog.Error("Failed to parse RabbitMQ port", slog.Any("error", err))
			os.Exit(1)
		}

		sq, err := queue.NewRabbitMQQueue(rmqHost, int16(rmqPortAsInt), rmqUser, rmqPass, false)
		if err != nil {
			slog.Error("Unable to initialize RabbitMQ queue", slog.Any("error", err))
			os.Exit(1)
		}

		ss := state.NewContainerState(config.Storage.StartingID)

		// TODO: add arguments
		as, err := artifactstore.NewMinIOArtifactStore("", "", "")
		if err != nil {
			slog.Error("Unable to initialize artifact store", slog.Any("error", err))
			os.Exit(1)
		}

		// TODO: add arguments
		ql, err := qldbstore.NewMinIOCodeQLDatabaseStore("", "", "", "")
		if err != nil {
			slog.Error("Unable to initialize ql database storage", slog.Any("error", err))
			os.Exit(1)
		}

		server.NewCommanderContainer(&server.Visibles{
			Queue:         sq,
			State:         ss,
			Artifacts:     as,
			CodeQLDBStore: ql,
		})

	case "cluster":
		// Assemble cluster version

	default:
		slog.Error("Invalid value for --mode. Allowed values are: standalone, container, cluster\n")
		os.Exit(1)
	}
}

View File

@@ -17,15 +17,15 @@ type System struct {
func LoadConfig(fname string) *System {
if _, err := os.Stat(fname); err != nil {
slog.Warn("Configuration file not found", "name", fname)
return &System{}
slog.Error("Configuration file %s not found", fname)
os.Exit(1)
}
var config System
_, err := toml.DecodeFile(fname, &config)
if err != nil {
slog.Error("Error decoding configuration file", err)
slog.Error("", err)
os.Exit(1)
}

View File

@@ -1,7 +0,0 @@
# Use a minimal base image
FROM busybox
ADD dbsdata_backup.tar /
# Just run sh if this container is ever started
CMD ["sh"]

View File

@@ -1,77 +0,0 @@
* MRVA cli tools container
Set up / run:
#+BEGIN_SRC sh
# Run the raw container assembly
cd ~/work-gh/mrva/mrvacommander/
docker-compose -f docker-compose-demo-build.yml up -d
# Use the following commands to populate the mrvacommander database storage
cd ~/work-gh/mrva/mrvacommander/client/qldbtools
mkdir -p scratch
source venv/bin/activate
./bin/mc-db-initial-info ~/work-gh/mrva/mrva-open-source-download > scratch/db-info-1.csv
./bin/mc-db-refine-info < scratch/db-info-1.csv > scratch/db-info-2.csv
./bin/mc-db-unique cpp < scratch/db-info-2.csv > scratch/db-info-3.csv
./bin/mc-db-generate-selection -n 11 \
scratch/vscode-selection.json \
scratch/gh-mrva-selection.json \
< scratch/db-info-3.csv
# Several seconds start-up time; fast db population
./bin/mc-db-populate-minio -n 11 < scratch/db-info-3.csv
# While the containers are running, this will show minio's storage. The zip files
# are split into part.* and xl.meta by minio. Use the web interface to see real
# names.
docker exec dbstore ls -R /data/mrvacommander/
# Open browser to see the file listing
open http://localhost:9001/browser/qldb
# list the volumes
docker volume ls |grep dbs
docker volume inspect mrvacommander_dbsdata
# Persist volume using container
cd ~/work-gh/mrva/mrvacommander/demo/containers/dbsdata
# Use mrvacommander_dbsdata to access the compose cluster
# EITHER
# Get the data as tar file from the image using container
rm -f dbsdata_backup.tar
docker run --rm \
-v mrvacommander_dbsdata:/data \
-v $(pwd):/backup \
busybox sh -c "tar cf /backup/dbsdata_backup.tar /data"
# OR
# Use gnu tar on host. The macos tar adds extended attributes
# brew install gnu-tar
rm -f dbsdata_backup.tar && gtar cf dbsdata_backup.tar data/
# Build container with the tarball
cd ~/work-gh/mrva/mrvacommander/demo/containers/dbsdata
docker build -t dbsdata-container:0.1.24 .
docker image ls | grep dbs
# check container contents
docker run -it dbsdata-container:0.1.24 /bin/sh
docker run -it dbsdata-container:0.1.24 ls data/qldb
# Tag the dbstore backing container
docker inspect dbsdata-container:0.1.24 |grep Id
docker tag dbsdata-container:0.1.24 ghcr.io/hohn/dbsdata-container:0.1.24
# Push the pre-populated image
docker push ghcr.io/hohn/dbsdata-container:0.1.24
# Check the tagged image
docker run -it ghcr.io/hohn/dbsdata-container:0.1.24 \
ls data/qldb
# Shut down the container assembly
docker-compose -f docker-compose-demo-build.yml down
#+END_SRC

BIN
demo/containers/dbsdata/dbsdata_backup.tar (Stored with Git LFS)

Binary file not shown.

View File

@@ -1,11 +0,0 @@
## The doc/ directory
The `doc/` directory serves as home for documentation. This is the place to
put refined documentation after it has gone through `notes/`. The contents of
this directory should be accessible to a broad audience including prospective
users, active users, and developers. Highly technical material is written for
1. The note authors and
2. Developers of the project
It need not be meaningful to casual users.

View File

@@ -1,101 +0,0 @@
* MRVA for CodeQL: A Business View
** Introduction
The companion documents in this directory are mostly technical. The purpose of
this document is to explain, from a business perspective, what MRVA is and why
it matters.
To illustrate its impact, consider two real-world cases:
*** Case 1: Preventing Costly Security Failures
One of our customers faced a significant lawsuit due to inadequate security.
The root cause? Unaddressed technical risks in their code. The work we do
directly prevents similar vulnerabilities from reaching this stage.
While lawsuits of this scale are rare, security failures are not. More common
consequences include:
- Compliance violations (e.g., GDPR, SOC2 penalties)
- Security breaches leading to reputation damage
- Productivity loss from disruptive technical failures
Lawsuits may be exceptional, but code security failures occur daily. Our role
isn't just about preventing catastrophic losses—it's about avoiding the small,
accumulating failures that erode security, compliance, and trust over time.
*** Case 2: Identifying Hidden Risks at Scale
Another customer manages a massive software portfolio of 120,000+ distinct
codebases—a scale at which traditional security tools and manual review
processes become impractical.
- A few known vulnerabilities had already been identified and patched.
- Our analysis uncovered 30 additional high-risk instances, previously undetected.
These findings were critical because:
- Traditional security tools break down at scale. Most solutions work well for
isolated codebases but lack the capability to analyze patterns across
120,000 repositories.
- Complexity hides risk. Identifying these vulnerabilities required specialized
techniques beyond simple scanning—capable of handling variations,
context, and subtle exploit paths.
- Existing security processes failed to detect these vulnerabilities. Without
proactive intervention, these risks would have remained undetected until
a potential breach occurred.
This case highlights a critical gap in standard security practices. By leveraging
advanced, scalable analysis, we identified and mitigated risks that would have
otherwise gone unnoticed—demonstrating the value of proactive security
at scale.
** Why This Matters
These examples, along with others, reinforce the importance of proactive
security—especially in the context of MRVA. Security risks don't just exist
in theory; they have tangible business consequences.
MRVA provides a scalable, systematic approach to identifying and addressing
risks before they escalate—ensuring that security is a strategic advantage, not
just a cost.
** What is MRVA?
MRVA stands for /Multi-Repository Variant Analysis/. The concept is straightforward:
1. A /problem/ is identified in one codebase.
2. Variations of this problem (/variants/) can be defined.
3. The organization manages many code repositories (/multi-repository/).
4. A systematic /analysis/ is required to detect these variants across all repositories.
In practice:
- Steps 1 & 2: Defined through CodeQL queries, often custom-written for this purpose.
- Steps 3 & 4: Can be done manually but come with significant challenges.
*** Challenges of Manual Execution
Manually searching for these variants across multiple repositories is possible
but inefficient and error-prone due to:
- /High bookkeeping overhead/: Tracking thousands of repositories is cumbersome.
- /Heavy scripting requirements/: Expert /Unix scripting skills/ are necessary.
- /Scaling limitations/: Analyzing /thousands of repositories sequentially/ is slow, and manual parallelization is impractical.
- /Cumbersome review process/: Results are stored as /raw text files/, requiring multiple processing steps for meaningful analysis.
*** MRVA: A Streamlined, Integrated Solution
Instead of relying on manual effort, MRVA is designed to /automate and
integrate/ the process.
- The system is designed to be /machine-driven/ and integrated into an
automated pipeline.
- Once incorporated, MRVA leverages the /CodeQL VS Code plugin/ to provide a
/seamless user experience/.
- How it works:
- Users submit queries through the UI.
- Results are retrieved and displayed dynamically as they become available.
- The entire workflow is automated, scalable, and significantly more
efficient than manual methods.
By eliminating manual inefficiencies, MRVA enables organizations to identify
and resolve security issues across massive codebases at scale, ensuring both
accuracy and speed in vulnerability detection.

View File

@@ -1,331 +0,0 @@
\documentclass[11pt]{article}
% Load the geometry package to set margins
\usepackage[lmargin=2cm,rmargin=2cm,tmargin=1.8cm,bmargin=1.8cm]{geometry}
% increase nesting depth
\usepackage{enumitem}
\setlistdepth{9}
%
\renewlist{itemize}{itemize}{9}
\setlist[itemize,1]{label=\textbullet}
\setlist[itemize,2]{label=--}
\setlist[itemize,3]{label=*}
\setlist[itemize,4]{label=•}
\setlist[itemize,5]{label=$\diamond$}
\setlist[itemize,6]{label=>}
\setlist[itemize,7]{label=»}
\setlist[itemize,8]{label=$\circ$}
\setlist[itemize,9]{label=·}
%
\renewlist{enumerate}{enumerate}{9}
\setlist[enumerate,1]{label=\arabic*.,ref=\arabic*}
\setlist[enumerate,2]{label=\alph*.),ref=\theenumi\alph*}
\setlist[enumerate,3]{label=\roman*.),ref=\theenumii\roman*}
\setlist[enumerate,4]{label=\Alph*.),ref=\theenumiii\Alph*}
\setlist[enumerate,5]{label=\Roman*.),ref=\theenumiv\Roman*}
\setlist[enumerate,6]{label=\arabic*),ref=\theenumv\arabic*}
\setlist[enumerate,7]{label=\alph*),ref=\theenumvi\alph*}
\setlist[enumerate,8]{label=\roman*),ref=\theenumvii\roman*}
\setlist[enumerate,9]{label=\Alph*),ref=\theenumviii\Alph*}
% Load CM Bright for math
\usepackage{amsmath} % Standard math package
\usepackage{amssymb} % Additional math symbols
\usepackage{cmbright} % Sans-serif math font that complements Fira Sans
\usepackage{fourier}
% Font configuration
% \usepackage{bera}
% or
% Load Fira Sans for text
\usepackage{fontspec}
\setmainfont{Fira Sans} % System-installed Fira Sans
\renewcommand{\familydefault}{\sfdefault} % Set sans-serif as default
% pseudo-code with math
\usepackage{listings}
\usepackage{float}
\usepackage{xcolor}
\usepackage{colortbl}
% Set TT font
% \usepackage{inconsolata}
% or
\setmonofont{IBMPlexMono-Light}
% Define custom settings for listings
\lstset{
language=Python,
basicstyle=\ttfamily\small, % Monospaced font
commentstyle=\itshape\color{gray}, % Italic and gray for comments
keywordstyle=\color{blue}, % Keywords in blue
stringstyle=\color{red}, % Strings in red
mathescape=true, % Enable math in comments
breaklines=true, % Break long lines
numbers=left, % Add line numbers
numberstyle=\tiny\color{gray}, % Style for line numbers
frame=single, % Add a frame around the code
}
\usepackage{newfloat} % Allows creating custom float types
% Define 'listing' as a floating environment
\DeclareFloatingEnvironment[
fileext=lol,
listname=List of Listings,
name=Listing
]{listing}
% To prevent floats from moving past a section boundary but still allow some floating:
\usepackage{placeins}
% used with \FloatBarrier
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{graphicx}
\usepackage{longtable}
\usepackage{wrapfig}
\usepackage{rotating}
\usepackage[normalem]{ulem}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{capt-of}
\usepackage{hyperref}
\usepackage{algorithm}
\usepackage{algpseudocode}
% Title, Author, and Date (or Report Number)
\title{MRVA component interconnections}
\author{Michael Hohn}
\date{Technical Report 20250524}
\hypersetup{
pdfauthor={Michael Hohn},
pdftitle={MRVA component interconnections},
pdfkeywords={},
pdfsubject={},
pdfcreator={Emacs 29.1},
pdflang={English}}
\begin{document}
\maketitle
\tableofcontents
\section{Overview}
\label{sec:overview}
The MRVA system is organized as a collection of services. On the server side, the
system is containerized using Docker and comprises several key components:
\begin{itemize}
\item {\textbf{Server}}: Acts as the central coordinator.
\item \textbf{Agents}: One or more agents that execute tasks.
\item \textbf{RabbitMQ}: Handles messaging between components.
\item \textbf{MinIO}: Provides storage for both queries and results.
\item \textbf{HEPC}: An HTTP endpoint that hosts and serves CodeQL databases.
\end{itemize}
The execution process follows a structured workflow:
\begin{enumerate}
\item A client submits a set of queries $\mathcal{Q}$ targeting a repository
set $\mathcal{R}$.
\item The server enqueues jobs and distributes them to available agents.
\item Each agent retrieves a job, executes queries against its assigned repository, and accumulates results.
\item The agent sends results back to the server, which then forwards them to the client.
\end{enumerate}
This full round-trip can be expressed as:
\begin{equation}
\text{Client} \xrightarrow{\mathcal{Q}} \text{Server}
\xrightarrow{\text{enqueue}}
\text{Queue} \xrightarrow{\text{dispatch}} \text{Agent}
\xrightarrow{\mathcal{Q}(\mathcal{R}_i)}
\text{Server} \xrightarrow{\mathcal{Q}(\mathcal{R}_i)} \text{Client}
\end{equation}
\section{Symbols and Notation}
\label{sec:orgb695d5a}
We define the following symbols for entities in the system:
\begin{center}
\begin{tabular}{lll}
Concept & Symbol & Description \\[0pt]
\hline
Client & \(C\) & The source of the query submission \\[0pt]
Server & \(S\) & Manages job queue and communicates results back to the client \\[0pt]
Job Queue & \(Q\) & Queue for managing submitted jobs \\[0pt]
Agent & \(\alpha\) & Independently polls, executes jobs, and accumulates results \\[0pt]
Agent Set & \(A\) & The set of all available agents \\[0pt]
Query Suite & \(\mathcal{Q}\) & Collection of queries submitted by the client \\[0pt]
Repository List & \(\mathcal{R}\) & Collection of repositories \\[0pt]
\(i\)-th Repository & \(\mathcal{R}_i\) & Specific repository indexed by \(i\) \\[0pt]
\(j\)-th Query & \(\mathcal{Q}_j\) & Specific query from the suite indexed by \(j\) \\[0pt]
Query Result & \(r_{i,j,k_{i,j}}\) & \(k_{i,j}\)-th result from query \(j\) executed on repository \(i\) \\[0pt]
Query Result Set & \(\mathcal{R}_i^{\mathcal{Q}_j}\) & Set of all results for query \(j\) on repository \(i\) \\[0pt]
Accumulated Results & \(\mathcal{R}_i^{\mathcal{Q}}\) & All results from executing all queries on \(\mathcal{R}_i\) \\[0pt]
\end{tabular}
\end{center}
\section{Full Round-Trip Representation}
\label{sec:full-round-trip}
The full round-trip execution, from query submission to result delivery, can be summarized as:
\[
C \xrightarrow{\mathcal{Q}} S \xrightarrow{\text{enqueue}} Q
\xrightarrow{\text{poll}}
\alpha \xrightarrow{\mathcal{Q}(\mathcal{R}_i)} S \xrightarrow{\mathcal{R}_i^{\mathcal{Q}}} C
\]
\begin{itemize}
\item \(C \to S\): Client submits a query suite \(\mathcal{Q}\) to the server.
\item \(S \to Q\): Server enqueues the query suite \((\mathcal{Q}, \mathcal{R}_i)\) for each repository.
\item \(Q \to \alpha\): Agent \(\alpha\) polls the queue and retrieves a job.
\item \(\alpha \to S\): Agent executes the queries and returns the accumulated results \(\mathcal{R}_i^{\mathcal{Q}}\) to the server.
\item \(S \to C\): Server sends the complete result set \(\mathcal{R}_i^{\mathcal{Q}}\) for each repository back to the client.
\end{itemize}
\section{Result Representation}
For the complete collection of results across all repositories and queries:
\[
\mathcal{R}^{\mathcal{Q}} = \bigcup_{i=1}^{N} \bigcup_{j=1}^{M}
\left\{ r_{i,j,1}, r_{i,j,2}, \dots, r_{i,j,k_{i,j}} \right\}
\]
where:
\begin{itemize}
\item \(N\) is the total number of repositories.
\item \(M\) is the total number of queries in \(\mathcal{Q}\).
\item \(k_{i,j}\) is the number of results from executing query
\(\mathcal{Q}_j\)
on repository \(\mathcal{R}_i\).
\end{itemize}
The \(k\)-th individual result of the \(j\)-th query on the \(i\)-th repository is:
\[
r_{i,j,k}
\]
It reaches the client through the same round trip:
\[
C \xrightarrow{\mathcal{Q}} S \xrightarrow{\text{enqueue}} Q \xrightarrow{\text{dispatch}} \alpha \xrightarrow{\mathcal{Q}(\mathcal{R}_i)} S \xrightarrow{r_{i,j,k}} C
\]
Each result can be further indexed to track multiple repositories and result sets.
\section{Graph Extraction from Log Table}
Assume we have a structured event log represented as a set of tuples.
\subsection*{Event Log Structure}
Let
\[
\mathcal{T} = \{ t_1, t_2, \dots, t_n \}
\]
be the set of all events, where each event
\[
t_i = (\mathit{id}_i, \tau_i, a_i, e_i, q_i, r_i, c_i)
\]
consists of:
\begin{itemize}
\item \(\mathit{id}_i\): unique event ID
\item \(\tau_i\): timestamp
\item \(a_i\): actor (e.g., ``agent\_alpha1'')
\item \(e_i\): event type (e.g., ``enqueue'', ``execute'')
\item \(q_i\): query ID
\item \(r_i\): repository ID
\item \(c_i\): result count (may be \(\bot\) if not applicable)
\end{itemize}
Let
\[
\mathcal{G} = (V, E)
\]
be a directed graph constructed from \(\mathcal{T}\), with vertices \(V\) and edges \(E\).
\subsection*{Graph Definition}
\begin{align*}
V &= \{ \mathit{id}_i \mid t_i \in \mathcal{T} \} \\
E &\subseteq V \times V
\end{align*}
Edges capture temporal or semantic relationships between events.
\subsection*{Construction Steps}
\paragraph{1. Partition by Job Identity}
Define the set of job identifiers:
\[
J = \{ (q, r) \mid \exists i: q_i = q \land r_i = r \}
\]
Then for each \((q, r) \in J\), define:
\[
\mathcal{T}_{q,r} = \{ t_i \in \mathcal{T} \mid q_i = q \land r_i = r \}
\]
\paragraph{2. Sort by Time}
Order each \(\mathcal{T}_{q,r}\) as a list:
\[
\mathcal{T}_{q,r} = [ t_{i_1}, t_{i_2}, \dots, t_{i_k} ]
\quad \text{such that } \tau_{i_j} < \tau_{i_{j+1}}
\]
\paragraph{3. Causal Edges}
Define within-job edges:
\[
E_{q,r} = \{ (\mathit{id}_{i_j}, \mathit{id}_{i_{j+1}}) \mid 1 \leq j < k \}
\]
\paragraph{4. Global Causal Graph}
Take the union:
\[
E_{\text{causal}} = \bigcup_{(q, r) \in J} E_{q,r}
\]
\paragraph{5. Semantic Edges (Optional)}
Define semantic predicates such as:
\[
\mathsf{pulls}(i, j) \iff e_i = \text{enqueue} \land e_j = \text{pull} \land
q_i = q_j \land r_i = r_j \land \tau_i < \tau_j \land a_i = \text{server} \land a_j = \text{agent}
\]
Then:
\[
E_{\text{semantic}} = \{ (\mathit{id}_i, \mathit{id}_j) \mid \mathsf{pulls}(i, j) \}
\]
\subsection*{Final Graph}
\begin{align*}
V &= \{ \mathit{id}_i \mid t_i \in \mathcal{T} \} \\
E &= E_{\text{causal}} \cup E_{\text{semantic}}
\end{align*}
\subsection*{Notes}
\begin{itemize}
\item This construction is generic: the log store \(\mathcal{T}\) may come from a database, file, or tuple-indexed dictionary.
\item Each semantic edge rule corresponds to a logical filter/join over \(\mathcal{T}\).
\item The construction is schema-free on the graph side and can be recomputed on demand with different edge logic.
\end{itemize}
\end{document}
%%% Local Variables:
%%% mode: LaTeX
%%% TeX-master: nil
%%% TeX-engine: luatex
%%% TeX-command-extra-options: "-synctex=1 -shell-escape -interaction=nonstopmode"
%%% End:

Binary file not shown.

Binary file not shown.

View File

@@ -1,605 +0,0 @@
\documentclass[11pt]{article}
% Load the geometry package to set margins
\usepackage[lmargin=2cm,rmargin=2cm,tmargin=1.8cm,bmargin=1.8cm]{geometry}
% increase nesting depth
\usepackage{enumitem}
\setlistdepth{9}
%
\renewlist{itemize}{itemize}{9}
\setlist[itemize,1]{label=\textbullet}
\setlist[itemize,2]{label=--}
\setlist[itemize,3]{label=*}
\setlist[itemize,4]{label=•}
\setlist[itemize,5]{label=$\diamond$}
\setlist[itemize,6]{label=>}
\setlist[itemize,7]{label=»}
\setlist[itemize,8]{label=$\circ$}
\setlist[itemize,9]{label=·}
%
\renewlist{enumerate}{enumerate}{9}
\setlist[enumerate,1]{label=\arabic*.,ref=\arabic*}
\setlist[enumerate,2]{label=\alph*.),ref=\theenumi\alph*}
\setlist[enumerate,3]{label=\roman*.),ref=\theenumii\roman*}
\setlist[enumerate,4]{label=\Alph*.),ref=\theenumiii\Alph*}
\setlist[enumerate,5]{label=\Roman*.),ref=\theenumiv\Roman*}
\setlist[enumerate,6]{label=\arabic*),ref=\theenumv\arabic*}
\setlist[enumerate,7]{label=\alph*),ref=\theenumvi\alph*}
\setlist[enumerate,8]{label=\roman*),ref=\theenumvii\roman*}
\setlist[enumerate,9]{label=\Alph*),ref=\theenumviii\Alph*}
% Load CM Bright for math
\usepackage{amsmath} % Standard math package
\usepackage{amssymb} % Additional math symbols
\usepackage{cmbright} % Sans-serif math font that complements Fira Sans
\usepackage{fourier}
% Font configuration
% \usepackage{bera}
% or
% Load Fira Sans for text
\usepackage{fontspec}
\setmainfont{Fira Sans} % System-installed Fira Sans
\renewcommand{\familydefault}{\sfdefault} % Set sans-serif as default
% pseudo-code with math
\usepackage{listings}
\usepackage{float}
\usepackage{xcolor}
\usepackage{colortbl}
% Set TT font
% \usepackage{inconsolata}
% or
\setmonofont{IBMPlexMono-Light}
% Define custom settings for listings
\lstset{
language=Python,
basicstyle=\ttfamily\small, % Monospaced font
commentstyle=\itshape\color{gray}, % Italic and gray for comments
keywordstyle=\color{blue}, % Keywords in blue
stringstyle=\color{red}, % Strings in red
mathescape=true, % Enable math in comments
breaklines=true, % Break long lines
numbers=left, % Add line numbers
numberstyle=\tiny\color{gray}, % Style for line numbers
frame=single, % Add a frame around the code
}
\usepackage{newfloat} % Allows creating custom float types
% Define 'listing' as a floating environment
\DeclareFloatingEnvironment[
fileext=lol,
listname=List of Listings,
name=Listing
]{listing}
% To prevent floats from moving past a section boundary but still allow some floating:
\usepackage{placeins}
% used with \FloatBarrier
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{graphicx}
\usepackage{longtable}
\usepackage{wrapfig}
\usepackage{rotating}
\usepackage[normalem]{ulem}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{capt-of}
\usepackage{hyperref}
\usepackage{algorithm}
\usepackage{algpseudocode}
% Title, Author, and Date (or Report Number)
\title{MRVA for CodeQL}
\author{Michael Hohn}
\date{Technical Report 20250224}
\hypersetup{
pdfauthor={Michael Hohn},
pdftitle={MRVA for CodeQL},
pdfkeywords={},
pdfsubject={},
pdfcreator={Emacs 29.1},
pdflang={English}}
\begin{document}
\maketitle
\tableofcontents
\section{MRVA System Architecture Summary}
The MRVA system is organized as a collection of services. On the server side, the
system is containerized using Docker and comprises several key components:
\begin{itemize}
\item {\textbf{Server}}: Acts as the central coordinator.
\item \textbf{Agents}: One or more agents that execute tasks.
\item \textbf{RabbitMQ}: Handles messaging between components.
\item \textbf{MinIO}: Provides storage for both queries and results.
\item \textbf{HEPC}: An HTTP endpoint that hosts and serves CodeQL databases.
\end{itemize}
On the client side, users can interact with the system in two ways:
\begin{itemize}
\item {\textbf{VSCode-CodeQL}}: A graphical interface integrated with Visual Studio Code.
\item \textbf{gh-mrva CLI}: A command-line interface that connects to the server in a similar way.
\end{itemize}
This architecture enables a robust and flexible workflow for code analysis, combining a containerized back-end with both graphical and CLI front-end tools.
The full system details can be seen in the source code. This document provides an
overview.
\section{Distributed Query Execution in MRVA}
\subsection{Execution Overview}
The \textit{MRVA system} is a distributed platform for executing \textit{CodeQL
queries} across multiple repositories using a set of worker agents. The system is
{containerized} and built around a set of core services:
\begin{itemize}
\item \textbf{Server}: Coordinates job distribution and result aggregation.
\item \textbf{Agents}: Execute queries independently and return results.
\item \textbf{RabbitMQ}: Handles messaging between system components.
\item \textbf{MinIO}: Stores query inputs and execution results.
\item \textbf{HEPC}: Serves CodeQL databases over HTTP.
\end{itemize}
Clients interact with MRVA via \texttt{VSCode-CodeQL} (a graphical interface) or
\texttt{gh-mrva CLI} (a command-line tool), both of which submit queries to the
server.
The execution process follows a structured workflow:
\begin{enumerate}
\item A client submits a set of queries $\mathcal{Q}$ targeting a repository
set $\mathcal{R}$.
\item The server enqueues jobs and distributes them to available agents.
\item Each agent retrieves a job, executes queries against its assigned repository, and accumulates results.
\item The agent sends results back to the server, which then forwards them to the client.
\end{enumerate}
This full round-trip can be expressed as:
\begin{equation}
\text{Client} \xrightarrow{\mathcal{Q}} \text{Server}
\xrightarrow{\text{enqueue}}
\text{Queue} \xrightarrow{\text{dispatch}} \text{Agent}
\xrightarrow{\mathcal{Q}(\mathcal{R}_i)}
\text{Server} \xrightarrow{\mathcal{Q}(\mathcal{R}_i)} \text{Client}
\end{equation}
where the Client submits queries to the Server, which enqueues jobs in the
Queue. Agents execute the queries, returning results $\mathcal{Q}(\mathcal{R}_i)$
to the Server and ultimately back to the Client.
A more rigorous description of this is in section \ref{sec:full-round-trip}.
\subsection{System Structure Overview}
This design allows for scalable and efficient query execution across multiple
repositories, whether on a single machine or a distributed cluster. The key idea
is that both setups follow the same structural approach:
\begin{itemize}
\item \textbf{Single machine setup:}
\begin{itemize}
\item Uses \textit{at least 5 Docker containers} to manage different
components of the system.
\item The number of \textit{agent containers} (responsible for executing
queries) is constrained by the available \textit{RAM and CPU cores}.
\end{itemize}
\item \textbf{Cluster setup:}
\begin{itemize}
\item Uses \textit{at least 5 virtual machines (VMs) and / or Docker containers}.
\item The number of \textit{agent VMs} is limited by \textit{network bandwidth
and available resources} (e.g., distributed storage and inter-node communication
overhead).
\end{itemize}
\end{itemize}
Thus:
\begin{itemize}
\item The {functional architecture is identical} between the single-machine and cluster setups.
\item The {primary difference} is in \textit{scale}:
\begin{itemize}
\item A single machine is limited by \textit{local CPU and RAM}.
\item A cluster is constrained by \textit{network and inter-node coordination overhead} but allows for higher overall compute capacity.
\end{itemize}
\end{itemize}
\subsection{Messages and their Types}
\label{sec:msg-types}
The following table enumerates the types (messages) passed from Client to Server.
\begin{longtable}{|p{5cm}|p{5cm}|p{5cm}|}
\hline
\rowcolor{gray!20} \textbf{Type Name} & \textbf{Field} & \textbf{Type} \\
\hline
\endfirsthead
\hline
\rowcolor{gray!20} \textbf{Type Name} & \textbf{Field} & \textbf{Type} \\
\hline
\endhead
\hline
\endfoot
\hline
\endlastfoot
ServerState & NextID & () $\rightarrow$ int \\
& GetResult & JobSpec $\rightarrow$ IO (Either Error AnalyzeResult) \\
& GetJobSpecByRepoId & (int, int) $\rightarrow$ IO (Either Error JobSpec) \\
& SetResult & (JobSpec, AnalyzeResult) $\rightarrow$ IO () \\
& GetJobList & int $\rightarrow$ IO (Either Error \textbf{[AnalyzeJob]}) \\
& GetJobInfo & JobSpec $\rightarrow$ IO (Either Error JobInfo) \\
& SetJobInfo & (JobSpec, JobInfo) $\rightarrow$ IO () \\
& GetStatus & JobSpec $\rightarrow$ IO (Either Error Status) \\
& SetStatus & (JobSpec, Status) $\rightarrow$ IO () \\
& AddJob & AnalyzeJob $\rightarrow$ IO () \\
\hline
JobSpec & sessionID & int \\
& nameWithOwner & string \\
\hline
AnalyzeResult & spec & JobSpec \\
& status & Status \\
& resultCount & int \\
& resultLocation & ArtifactLocation \\
& sourceLocationPrefix & string \\
& databaseSHA & string \\
\hline
ArtifactLocation & Key & string \\
& Bucket & string \\
\hline
AnalyzeJob & Spec & JobSpec \\
& QueryPackLocation & ArtifactLocation \\
& QueryLanguage & QueryLanguage \\
\hline
QueryLanguage & & string \\
\hline
JobInfo & QueryLanguage & string \\
& CreatedAt & string \\
& UpdatedAt & string \\
& SkippedRepositories & SkippedRepositories \\
\hline
SkippedRepositories & AccessMismatchRepos & AccessMismatchRepos \\
& NotFoundRepos & NotFoundRepos \\
& NoCodeqlDBRepos & NoCodeqlDBRepos \\
& OverLimitRepos & OverLimitRepos \\
\hline
AccessMismatchRepos & RepositoryCount & int \\
& Repositories & \textbf{[Repository]} \\
\hline
NotFoundRepos & RepositoryCount & int \\
& RepositoryFullNames & \textbf{[string]} \\
\hline
Repository & ID & int \\
& Name & string \\
& FullName & string \\
& Private & bool \\
& StargazersCount & int \\
& UpdatedAt & string \\
\end{longtable}
\section{Symbols and Notation}
\label{sec:orgb695d5a}
We define the following symbols for entities in the system:
\begin{center}
\begin{tabular}{lll}
Concept & Symbol & Description \\[0pt]
\hline
\href{vscode://file//Users/hohn/work-gh/mrva/gh-mrva/README.org:39:1}{Client} & \(C\) & The source of the query submission \\[0pt]
Server & \(S\) & Manages job queue and communicates results back to the client \\[0pt]
Job Queue & \(Q\) & Queue for managing submitted jobs \\[0pt]
Agent & \(\alpha\) & Independently polls, executes jobs, and accumulates results \\[0pt]
Agent Set & \(A\) & The set of all available agents \\[0pt]
Query Suite & \(\mathcal{Q}\) & Collection of queries submitted by the client \\[0pt]
Repository List & \(\mathcal{R}\) & Collection of repositories \\[0pt]
\(i\)-th Repository & \(\mathcal{R}_i\) & Specific repository indexed by \(i\) \\[0pt]
\(j\)-th Query & \(\mathcal{Q}_j\) & Specific query from the suite indexed by \(j\) \\[0pt]
Query Result & \(r_{i,j,k_{i,j}}\) & \(k_{i,j}\)-th result from query \(j\) executed on repository \(i\) \\[0pt]
Query Result Set & \(\mathcal{R}_i^{\mathcal{Q}_j}\) & Set of all results for query \(j\) on repository \(i\) \\[0pt]
Accumulated Results & \(\mathcal{R}_i^{\mathcal{Q}}\) & All results from executing all queries on \(\mathcal{R}_i\) \\[0pt]
\end{tabular}
\end{center}
\section{Full Round-Trip Representation}
\label{sec:full-round-trip}
The full round-trip execution, from query submission to result delivery, can be summarized as:
\[
C \xrightarrow{\mathcal{Q}} S \xrightarrow{\text{enqueue}} Q
\xrightarrow{\text{poll}}
\alpha \xrightarrow{\mathcal{Q}(\mathcal{R}_i)} S \xrightarrow{\mathcal{R}_i^{\mathcal{Q}}} C
\]
\begin{itemize}
\item \(C \to S\): Client submits a query suite \(\mathcal{Q}\) to the server.
\item \(S \to Q\): Server enqueues the query suite \((\mathcal{Q}, \mathcal{R}_i)\) for each repository.
\item \(Q \to \alpha\): Agent \(\alpha\) polls the queue and retrieves a job.
\item \(\alpha \to S\): Agent executes the queries and returns the accumulated results \(\mathcal{R}_i^{\mathcal{Q}}\) to the server.
\item \(S \to C\): Server sends the complete result set \(\mathcal{R}_i^{\mathcal{Q}}\) for each repository back to the client.
\end{itemize}
\section{Result Representation}
For the complete collection of results across all repositories and queries:
\[
\mathcal{R}^{\mathcal{Q}} = \bigcup_{i=1}^{N} \bigcup_{j=1}^{M}
\left\{ r_{i,j,1}, r_{i,j,2}, \dots, r_{i,j,k_{i,j}} \right\}
\]
where:
\begin{itemize}
\item \(N\) is the total number of repositories.
\item \(M\) is the total number of queries in \(\mathcal{Q}\).
\item \(k_{i,j}\) is the number of results from executing query
\(\mathcal{Q}_j\)
on repository \(\mathcal{R}_i\).
\end{itemize}
The \(k\)-th individual result of the \(j\)-th query on the \(i\)-th repository is:
\[
r_{i,j,k}
\]
It reaches the client through the same round trip:
\[
C \xrightarrow{\mathcal{Q}} S \xrightarrow{\text{enqueue}} Q \xrightarrow{\text{dispatch}} \alpha \xrightarrow{\mathcal{Q}(\mathcal{R}_i)} S \xrightarrow{r_{i,j,k}} C
\]
Each result can be further indexed to track multiple repositories and result sets.
\section{Execution Loop in Pseudo-Code}
\begin{listing}[H] % h = here, t = top, b = bottom, p = page of floats
\caption{Distributed Query Execution Algorithm}
\begin{lstlisting}[language=Python]
# Distributed Query Execution with Agent Polling and Accumulated Results
# Initialization
$\mathcal{R}$ = set() # Repository list
$Q$ = [] # Job queue
$A$ = set() # Set of agents
$\mathcal{R}_i^{\mathcal{Q}}$ = {} # Result storage for each repository
# Initialize result sets for each repository
for $R_i$ in $\mathcal{R}$:
$\mathcal{R}_i^{\mathcal{Q}} = \{\}$ # Initialize empty result set
# Enqueue the entire query suite for all repositories
for $R_i$ in $\mathcal{R}$:
$Q$.append(($\mathcal{Q}$, $R_i$)) # Enqueue $(\mathcal{Q}, \mathcal{R}_i)$ pair
# Processing loop while there are jobs in the queue
while $Q \neq \emptyset$:
# Agents autonomously poll the queue
for $\alpha$ in $A$:
if $\alpha$.is_available():
$(\mathcal{Q}, \mathcal{R}_i)$ = $Q$.pop(0) # Agent polls a job
# Agent execution begins
$\mathcal{R}_i^{\mathcal{Q}} = \{\}$ # Initialize results for repository $R_i$
for $\mathcal{Q}_j$ in $\mathcal{Q}$:
# Execute query $\mathcal{Q}_j$ on repository $\mathcal{R}_i$
$r_{i,j,1}, \dots, r_{i,j,k_{i,j}}$ = $\alpha$.execute($\mathcal{Q}_j$, $R_i$)
# Store results for query $j$
$\mathcal{R}_i^{\mathcal{Q}_j} = \{r_{i,j,1}, \dots, r_{i,j,k_{i,j}}\}$
# Accumulate results
$\mathcal{R}_i^{\mathcal{Q}} = \mathcal{R}_i^{\mathcal{Q}} \cup \mathcal{R}_i^{\mathcal{Q}_j}$
# Send all accumulated results back to the server
$\alpha$.send_results($S$, ($\mathcal{Q}$, $R_i$, $\mathcal{R}_i^{\mathcal{Q}}$))
# Server sends results for $(\mathcal{Q}, \mathcal{R}_i)$ back to the client
$S$.send_results_to_client($C$, ($\mathcal{Q}$, $R_i$, $\mathcal{R}_i^{\mathcal{Q}}$))
\end{lstlisting}
\end{listing}
\FloatBarrier
\section{Execution Loop in Pseudo-Code, declarative}
\begin{listing}[H] % h = here, t = top, b = bottom, p = page of floats
\caption{Distributed Query Execution Algorithm}
\begin{lstlisting}[language=Python]
# Distributed Query Execution with Agent Polling and Accumulated Results
# Define initial state
$\mathcal{R}$: set # Set of repositories
$\mathcal{Q}$: set # Set of queries
A: set # Set of agents
Q: list # Queue of $(\mathcal{Q}, \mathcal{R}_i)$ pairs
$\mathcal{R}_{\text{results}}$: dict = {} # Mapping of repositories to their accumulated query results
# Initialize result sets for each repository
$\mathcal{R}_{\text{results}}$ = {$\mathcal{R}_i$: set() for $\mathcal{R}_i$ in $\mathcal{R}$}
# Define job queue as an immutable mapping
Q = [($\mathcal{Q}$, $\mathcal{R}_i$) for $\mathcal{R}_i$ in $\mathcal{R}$]
# Processing as a declarative iteration over the job queue
def execute_queries(agents, job_queue, repository_results):
def available_agents():
return {$\alpha$ for $\alpha$ in agents if $\alpha$.is_available()}
def process_job($\mathcal{Q}$, $\mathcal{R}_i$, $\alpha$):
results = {$\mathcal{Q}_j$: $\alpha$.execute($\mathcal{Q}_j$, $\mathcal{R}_i$) for $\mathcal{Q}_j$ in $\mathcal{Q}$}
return $\mathcal{R}_i$, results
def accumulate_results($\mathcal{R}_{\text{results}}$, $\mathcal{R}_i$, query_results):
return {**$\mathcal{R}_{\text{results}}$, $\mathcal{R}_i$: $\mathcal{R}_{\text{results}}$[$\mathcal{R}_i$] | set().union(*query_results.values())}
while job_queue:
active_agents = available_agents()
for $\alpha$ in active_agents:
$\mathcal{Q}$, $\mathcal{R}_i$ = job_queue[0] # Peek at the first job
_, query_results = process_job($\mathcal{Q}$, $\mathcal{R}_i$, $\alpha$)
repository_results = accumulate_results(repository_results, $\mathcal{R}_i$, query_results)
$\alpha$.send_results(S, ($\mathcal{Q}$, $\mathcal{R}_i$, repository_results[$\mathcal{R}_i$]))
S.send_results_to_client(C, ($\mathcal{Q}$, $\mathcal{R}_i$, repository_results[$\mathcal{R}_i$]))
job_queue = job_queue[1:] # Move to the next job
return repository_results
# Execute the distributed query process
$\mathcal{R}_{\text{results}}$ = execute_queries(A, Q, $\mathcal{R}_{\text{results}}$)
\end{lstlisting}
\end{listing}
\FloatBarrier
\newpage{}
\section{Execution Loop in Pseudo-Code, algorithmic}
\begin{algorithm}
\caption{Distribute a set of queries $\mathcal{Q}$ across repositories
$\mathcal{R}$ using agents $A$}
\begin{algorithmic}[1] % Line numbering enabled
\Procedure{DistributedQueryExecution}{$\mathcal{Q}, \mathcal{R}, A$}
\ForAll{$\mathcal{R}_i \in \mathcal{R}$}
\Comment{Initialize result sets for each repository and query}
\State $\mathcal{R}_i^{\mathcal{Q}} \gets \left\{ \, \right\}$
\EndFor
\State $Q \gets \left\{ \, \right\}$ \Comment{Initialize empty job queue}
\ForAll{$\mathcal{R}_i \in \mathcal{R}$}
\Comment{Enqueue the entire query suite across all repositories}
\State $S \xrightarrow{\text{enqueue}(\mathcal{Q}, \mathcal{R}_i)} Q$
\EndFor
\While{$Q \neq \emptyset$}
\Comment{Agents poll the queue for available jobs}
\ForAll{$\alpha \in A$ \textbf{where} $\alpha$ \text{is available}}
\State $\alpha \xleftarrow{\text{poll}(Q)}$ \Comment{Agent autonomously retrieves a job}
% --- Begin Agent Execution Block ---
\State \textbf{\raisebox{0.5ex}{\rule{25em}{0.7pt}}} \Comment{Agent Execution Begins}
\State $\mathcal{R}_i^{\mathcal{Q}} \gets \left\{ \, \right\}$ \Comment{Initialize result set for this repository}
\ForAll{$\mathcal{Q}_j \in \mathcal{Q}$}
\State $\mathcal{R}_i^{\mathcal{Q}_j} \gets \left\{ r_{i,j,1}, r_{i,j,2}, \dots, r_{i,j,k_{i,j}} \right\}$
\Comment{Collect results for query $j$ on repository $i$}
\State $\mathcal{R}_i^{\mathcal{Q}} \gets \mathcal{R}_i^{\mathcal{Q}}
\cup \mathcal{R}_i^{\mathcal{Q}_j}$
\Comment{Accumulate results}
\EndFor
\State $\alpha \xrightarrow{(\mathcal{Q}, \mathcal{R}_i, \mathcal{R}_i^{\mathcal{Q}})} S$
\Comment{Agent sends all accumulated results back to server}
\State \textbf{\raisebox{0.5ex}{\rule{25em}{0.7pt}}} \Comment{Agent
Execution Ends}
% --- End Agent Execution Block ---
\State $S \xrightarrow{(\mathcal{Q}, \mathcal{R}_i, \mathcal{R}_i^{\mathcal{Q}})} C$
\Comment{Server sends results for repository $i$ back to the client}
\EndFor
\EndWhile
\EndProcedure
\end{algorithmic}
\end{algorithm}
\FloatBarrier
\section{Execution Loop in Pseudo-Code, hybrid}
\label{sec:orgb767ab2}
{\textbf{Algorithm:} Distribute a set of queries \(\mathcal{Q}\) across repositories \(\mathcal{R}\) using agents \(A\)}
\begin{enumerate}
\item \textbf{\textbf{Initialization}}
\begin{itemize}
\item For each repository \(\mathcal{R}_i \in \mathcal{R}\):
\begin{itemize}
\item Initialize result sets: \(\mathcal{R}_i^{\mathcal{Q}} \gets \{\}\).
\end{itemize}
\item Initialize an empty job queue: \(Q \gets \{\}\).
\end{itemize}
\item \textbf{\textbf{Enqueue Queries}}
\begin{itemize}
\item For each repository \(\mathcal{R}_i \in \mathcal{R}\):
\begin{itemize}
\item Enqueue the entire query suite: \(S \xrightarrow{\text{enqueue}(\mathcal{Q}, \mathcal{R}_i)} Q\).
\end{itemize}
\end{itemize}
\item \textbf{\textbf{Execution Loop}}
\begin{itemize}
\item While \(Q \neq \emptyset\): (agents poll the queue for available jobs)
\begin{itemize}
\item For each available agent \(\alpha \in A\):
\begin{itemize}
\item Agent autonomously retrieves a job: \(\alpha \xleftarrow{\text{poll}(Q)}\).
\item \textbf{\textbf{Agent Execution Block}}
\begin{itemize}
\item Initialize result set for this repository: \(\mathcal{R}_i^{\mathcal{Q}} \gets \{\}\).
\item For each query \(\mathcal{Q}_j \in \mathcal{Q}\):
\begin{itemize}
\item Collect results:
\(\mathcal{R}_i^{\mathcal{Q}_j} \gets \{ r_{i,j,1}, r_{i,j,2}, \dots, r_{i,j,k_{i,j}} \}\).
\item Accumulate results:
\(\mathcal{R}_i^{\mathcal{Q}} \gets \mathcal{R}_i^{\mathcal{Q}} \cup \mathcal{R}_i^{\mathcal{Q}_j}\).
\end{itemize}
\item Agent sends all accumulated results back to the server:
\(\alpha \xrightarrow{(\mathcal{Q}, \mathcal{R}_i, \mathcal{R}_i^{\mathcal{Q}})} S\).
\end{itemize}
\end{itemize}
\end{itemize}
\end{itemize}
\item \textbf{\textbf{Server Sends Results}}
\begin{itemize}
\item Server sends results for repository \(i\) back to the client:
\(S \xrightarrow{(\mathcal{Q}, \mathcal{R}_i, \mathcal{R}_i^{\mathcal{Q}})} C\).
\end{itemize}
\end{enumerate}
\end{document}
%%% Local Variables:
%%% mode: LaTeX
%%% TeX-master: t
%%% TeX-engine: luatex
%%% TeX-command-extra-options: "-synctex=1 -shell-escape -interaction=nonstopmode"
%%% End:

View File

@@ -1,56 +0,0 @@
digraph mrvacommander {
rankdir=LR;
node [shape=box style=filled fillcolor=lightgrey fontname="monospace"];
// Entry points
cmd_server [label="cmd/server\nmain()", fillcolor=lightblue];
cmd_agent [label="cmd/agent\nmain()", fillcolor=lightblue];
// Config
config [label="config/mcc\nparseEnv()", shape=ellipse, fillcolor=lightyellow];
// Server-side
server [label="pkg/server\nServer.Run()"];
deploy [label="pkg/deploy\nInit()"];
qldbstore [label="pkg/qldbstore\nQLDB Store"];
artifactstore [label="pkg/artifactstore\nArtifact Store"];
queue [label="pkg/queue\nQueue Interface"];
// Agent-side
agent [label="pkg/agent\nAgent.Run()"];
state [label="pkg/state\nState"];
codeql [label="pkg/codeql\nrunCodeQL()"];
// Common
common [label="pkg/common\nTypes, MinIO, Jobs"];
utils [label="utils\nDownload, Archive"];
// Edges: config used by both
cmd_server -> config;
cmd_agent -> config;
// Server wiring
cmd_server -> server;
server -> queue;
server -> artifactstore;
server -> qldbstore;
// Agent wiring
cmd_agent -> agent;
agent -> queue;
agent -> codeql;
agent -> artifactstore;
agent -> state;
// Shared deps
server -> common;
agent -> common;
codeql -> common;
qldbstore -> common;
artifactstore -> common;
// Utils used by backends
qldbstore -> utils;
artifactstore -> utils;
codeql -> utils;
}

View File

@@ -1,84 +0,0 @@
.TH MRVACOMMANDER 7 "April 2025" "MRVA Project" "System Overview"
.SH NAME
mrvacommander \- distributed CodeQL task queue and execution system
.SH SYNOPSIS
.B server
.RI [ environment ]
.br
.B agent
.RI [ environment ]
.SH DESCRIPTION
mrvacommander coordinates analysis jobs over multiple worker nodes using queues, pluggable storage, and CodeQL execution. It consists of multiple interacting packages and entry points.
.SH STRUCTURE
.TP
.B cmd/server
Entry point. Loads configuration, initializes dependencies, runs queue subscriber with a dispatcher.
.TP
.B cmd/agent
Entry point. Loads configuration, runs a processing loop: receive job, execute query, save result, update state.
.SH CONFIGURATION
.TP
.B config/mcc
Parses environment variables into structured configuration. Modules include:
.IR queue ,
.IR storage ,
.IR logger ,
.IR commander .
.SH SERVER SIDE MODULES
.TP
.B pkg/server
Initializes:
queue backend
QLDB store
artifact store
Subscribes to queue and dispatches jobs to handler.
.TP
.B pkg/deploy
Deployment helpers: validate environment variables, bootstrap key services.
.SH AGENT SIDE MODULES
.TP
.B pkg/agent
Receives jobs, executes CodeQL queries, stores outputs, marks completion.
.TP
.B pkg/state
Tracks which jobs have been completed. Local file-backed.
.SH SHARED MODULES
.TP
.B pkg/common
Core types: Job, JobOutput, NameWithOwner, Query.
Includes MinIO wrappers, external API access, and job spec parsing.
.TP
.B pkg/codeql
Defines query structure and executes CodeQL against a database.
.TP
.B pkg/qldbstore
Provides read-only access to CodeQL databases via:
- MinIO (S3)
- HTTP (hepc)
- Filesystem
.TP
.B pkg/artifactstore
Persists job results. Implementations:
- MinIO
- Memory
.TP
.B pkg/queue
Job queue interface. Implementations:
- RabbitMQ
- In-memory single-node
.TP
.B utils
Generic helpers:
- HTTP download
- tar.gz extraction
.SH SEE ALSO
.BR codeql (1),
.BR rabbitmq-server (1),
.BR minio (1)

Binary file not shown.

View File

@@ -1,129 +0,0 @@
# This is the compose configuration used to build / prepopulate the containers for
# a demo.
services:
dbssvc:
## image: ghcr.io/hohn/dbsdata-container:0.1.24
build:
context: ./demo/containers/dbsdata
dockerfile: Dockerfile
container_name: dbssvc
volumes:
- dbsdata:/data/mrvacommander/dbstore-data
networks:
- backend
dbstore:
image: minio/minio:RELEASE.2024-06-11T03-13-30Z
container_name: dbstore
ports:
- "9000:9000"
- "9001:9001"
env_file:
- path: .env.container
required: true
command: server /data/mrvacommander/dbstore-data --console-address ":9001"
depends_on:
- dbssvc
volumes:
- dbsdata:/data/mrvacommander/dbstore-data
networks:
- backend
client-ghmrva:
## image: ghcr.io/hohn/client-ghmrva-container:0.1.24
build:
context: .
dockerfile: ./client/containers/ghmrva/Dockerfile
network_mode: "service:server" # Share the 'server' network namespace
environment:
- SERVER_URL=http://localhost:8080 # 'localhost' now refers to 'server'
code-server:
## image: ghcr.io/hohn/code-server-initialized:0.1.24
build:
context: ./client/containers/vscode
dockerfile: Dockerfile
ports:
- "9080:9080"
environment:
- PASSWORD=mrva
rabbitmq:
image: rabbitmq:3-management
hostname: rabbitmq
container_name: rabbitmq
volumes:
- ./init/rabbitmq/rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf:ro
- ./init/rabbitmq/definitions.json:/etc/rabbitmq/definitions.json:ro
ports:
- "5672:5672"
- "15672:15672"
healthcheck:
test: rabbitmq-diagnostics check_port_connectivity
interval: 30s
timeout: 30s
retries: 10
networks:
- backend
server:
build:
context: .
dockerfile: ./cmd/server/Dockerfile
command: [ '--mode=container', '--loglevel=debug' ]
container_name: server
stop_grace_period: 1s
ports:
# - "8081:8080" # host:container for proxy
- "8080:8080" # host:container
depends_on:
- rabbitmq
- dbstore
- artifactstore
env_file:
- path: ./.env.container
required: true
networks:
- backend
artifactstore:
image: minio/minio:RELEASE.2024-06-11T03-13-30Z
container_name: artifactstore
ports:
- "19000:9000" # host:container
- "19001:9001"
env_file:
- path: ./.env.container
required: true
command: server /data --console-address ":9001"
volumes:
# The artifactstore is only populated at runtime so there is no need
# for Docker storage; a directory is fine.
- ./qpstore-data:/data
networks:
- backend
agent:
## image: ghcr.io/hohn/mrva-agent:0.1.24
build:
context: .
dockerfile: ./cmd/agent/Dockerfile
command: [ '--loglevel=debug' ]
container_name: agent
depends_on:
- rabbitmq
- dbstore
- artifactstore
env_file:
- path: ./.env.container
required: true
networks:
- backend
networks:
backend:
driver: bridge
volumes:
dbsdata:

View File

@@ -1,116 +0,0 @@
services:
dbssvc:
# dbsdata-container:0.1.24
image: ghcr.io/hohn/dbsdata-container:0.1.24
command: tail -f /dev/null # Keep the container running
# volumes:
# - /qldb # Directory inside the container that contains the data
volumes:
- dbsdata:/data
container_name: dbssvc
networks:
- backend
dbstore:
image: minio/minio:RELEASE.2024-06-11T03-13-30Z
container_name: dbstore
ports:
- "9000:9000"
- "9001:9001"
env_file:
- path: .env.container
required: true
command: server /data/mrvacommander/dbstore-data --console-address ":9001"
depends_on:
- dbssvc
# volumes_from:
# - dbsdata # Use the volumes from dbsdata container
volumes:
- dbsdata:/data/mrvacommander/dbstore-data
networks:
- backend
client-ghmrva:
image: ghcr.io/hohn/client-ghmrva-container:0.1.24
network_mode: "service:server" # Share the 'server' network namespace
environment:
- SERVER_URL=http://localhost:8080 # 'localhost' now refers to 'server'
code-server:
image: ghcr.io/hohn/code-server-initialized:0.1.24
ports:
- "9080:9080"
# XX: Include codeql binary in code-server (if it's not there already)
environment:
- PASSWORD=mrva
rabbitmq:
image: rabbitmq:3-management
hostname: rabbitmq
container_name: rabbitmq
volumes:
- ./init/rabbitmq/rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf:ro
- ./init/rabbitmq/definitions.json:/etc/rabbitmq/definitions.json:ro
ports:
- "5672:5672"
- "15672:15672"
healthcheck:
test: rabbitmq-diagnostics check_port_connectivity
interval: 30s
timeout: 30s
retries: 10
networks:
- backend
server:
image: ghcr.io/hohn/mrva-server:0.1.24
command: [ '--mode=container', '--loglevel=debug' ]
container_name: server
stop_grace_period: 1s
depends_on:
- rabbitmq
- dbstore
- artifactstore
env_file:
- path: ./.env.container
required: true
networks:
- backend
artifactstore:
image: minio/minio:RELEASE.2024-06-11T03-13-30Z
container_name: artifactstore
ports:
- "19000:9000" # host:container
- "19001:9001"
env_file:
- path: ./.env.container
required: true
command: server /data --console-address ":9001"
volumes:
# The artifactstore is only populated at runtime so there is no need
# for Docker storage; a directory is fine.
- ./qpstore-data:/data
networks:
- backend
agent:
image: ghcr.io/hohn/mrva-agent:0.1.24
command: [ '--loglevel=debug' ]
container_name: agent
depends_on:
- rabbitmq
- dbstore
- artifactstore
env_file:
- path: ./.env.container
required: true
networks:
- backend
networks:
backend:
driver: bridge
volumes:
dbsdata:

View File

@@ -7,36 +7,37 @@ services:
volumes:
- ./init/rabbitmq/rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf:ro
- ./init/rabbitmq/definitions.json:/etc/rabbitmq/definitions.json:ro
expose:
- "5672"
- "15672"
ports:
- "5672:5672"
- "15672:15672"
networks:
- backend
healthcheck:
test: rabbitmq-diagnostics check_port_connectivity
interval: 30s
timeout: 30s
retries: 10
test: [ "CMD", "nc", "-z", "localhost", "5672" ]
interval: 5s
timeout: 15s
retries: 1
server:
build:
context: .
dockerfile: ./cmd/server/Dockerfile
command: [ '--mode=container', '--loglevel=debug' ]
context: ./cmd/server
dockerfile: Dockerfile
container_name: server
stop_grace_period: 1s
stop_grace_period: 1s # Reduce the timeout period for testing
environment:
- MRVA_SERVER_ROOT=/mrva/mrvacommander/cmd/server
command: sh -c "tail -f /dev/null"
ports:
# - "8081:8080" # host:container for proxy
- "8080:8080" # host:container
- "8080:8080"
volumes:
- ./:/mrva/mrvacommander
depends_on:
- rabbitmq
- dbstore
- artifactstore
networks:
- backend
env_file:
- path: ./.env.container
required: true
dbstore:
image: minio/minio:RELEASE.2024-06-11T03-13-30Z
@@ -44,46 +45,49 @@ services:
ports:
- "9000:9000"
- "9001:9001"
env_file:
- path: .env.container
required: true
environment:
MINIO_ROOT_USER: user
MINIO_ROOT_PASSWORD: mmusty8432
command: server /data --console-address ":9001"
volumes:
- ./dbstore-data:/data
networks:
- backend
artifactstore:
qpstore:
image: minio/minio:RELEASE.2024-06-11T03-13-30Z
container_name: artifactstore
container_name: qpstore
ports:
- "19000:9000" # host:container
- "19001:9001"
env_file:
- path: ./.env.container
required: true
environment:
MINIO_ROOT_USER: user
MINIO_ROOT_PASSWORD: mmusty8432
command: server /data --console-address ":9001"
volumes:
- ./qpstore-data:/data
networks:
- backend
agent:
build:
context: .
dockerfile: ./cmd/agent/Dockerfile
command: [ '--loglevel=debug' ]
container_name: agent
depends_on:
- rabbitmq
- dbstore
- artifactstore
env_file:
- path: ./.env.container
required: true
- qpstore
environment:
MRVA_RABBITMQ_HOST: rabbitmq
MRVA_RABBITMQ_PORT: 5672
MRVA_RABBITMQ_USER: user
MRVA_RABBITMQ_PASSWORD: password
networks:
- backend
networks:
backend:
driver: bridge
# Remove named volumes to use bind mounts
# volumes:
# minio-data:

View File

@@ -1,22 +0,0 @@
* tuple hashing functions across languages
There are three parallel implementations of a hash for every entry of a tuple
list. The functions produce identical results across 3 languages and can be
used across agent / server / client.
#+BEGIN_SRC sh
hohn@ghm3 ~/work-gh/mrva/mrvacommander/experimental/qldb-specification
0:$ node tuple-hash.js
[
'91b80a9933218ff5bc62df8ff71f1252',
'b0934b29293e91aefaac73c99fc75e94'
]
hohn@ghm3 ~/work-gh/mrva/mrvacommander/experimental/qldb-specification
0:$ python3 tuple-hash.py
['91b80a9933218ff5bc62df8ff71f1252', 'b0934b29293e91aefaac73c99fc75e94']
hohn@ghm3 ~/work-gh/mrva/mrvacommander/experimental/qldb-specification
0:$ go run tuple-hash.go
[91b80a9933218ff5bc62df8ff71f1252 b0934b29293e91aefaac73c99fc75e94]
#+END_SRC

View File

@@ -1,28 +0,0 @@
package main
import (
	"bytes"
	"crypto/md5"
	"encoding/hex"
	"encoding/json"
	"fmt"
)
// main hashes every entry of a tuple list: each tuple is serialized to
// compact JSON, the MD5 digest of those bytes is hex-encoded, and the list of
// digests is printed.
func main() {
	atl_L := [][2]interface{}{
		{1, "s1"},
		{2, "str"},
	}

	var sl_hash []string
	for _, item := range atl_L {
		// json.Marshal rewrites '<', '>' and '&' as \u003c, \u003e and
		// \u0026. Other JSON serializers leave them literal, so HTML escaping
		// is switched off to keep the hashed bytes comparable.
		var buf bytes.Buffer
		enc := json.NewEncoder(&buf)
		enc.SetEscapeHTML(false)
		if err := enc.Encode(item); err != nil {
			panic(err)
		}
		// Encode appends a newline; it must not be part of the hashed bytes.
		jsonBytes := bytes.TrimSuffix(buf.Bytes(), []byte("\n"))

		sum := md5.Sum(jsonBytes)
		sl_hash = append(sl_hash, hex.EncodeToString(sum[:]))
	}
	fmt.Println(sl_hash)
}

View File

@@ -1,9 +0,0 @@
const crypto = require("crypto");

// Tuple list whose entries are hashed one by one.
const atl_L = [[1, "s1"], [2, "str"]];

// Hex MD5 digest of each entry's JSON.stringify() text, in input order.
const sl_hash = [];
for (const item of atl_L) {
  const hasher = crypto.createHash("md5");
  hasher.update(JSON.stringify(item));
  sl_hash.push(hasher.digest("hex"));
}

console.log(sl_hash);

View File

@@ -1,12 +0,0 @@
import hashlib
import json


def tuple_hash(item):
    """Return the hex MD5 digest of ``item`` serialized as compact JSON.

    The JSON text uses no whitespace after separators and is hashed as UTF-8.
    ``ensure_ascii=False`` keeps non-ASCII characters as themselves instead of
    turning them into ASCII escape sequences, so the hashed bytes are the
    UTF-8 text itself.

    NOTE: a string containing an unpaired surrogate cannot be encoded to
    UTF-8 and raises UnicodeEncodeError.
    """
    encoded = json.dumps(
        item, separators=(',', ':'), ensure_ascii=False
    ).encode("utf-8")
    return hashlib.md5(encoded).hexdigest()


# Tuple list whose entries are hashed one by one.
atl_L = [(1, "s1"), (2, "str")]

# One digest per tuple, in input order.
sl_hash = [tuple_hash(item) for item in atl_L]

print(sl_hash)

17
go.mod
View File

@@ -1,34 +1,43 @@
module github.com/hohn/mrvacommander
module mrvacommander
go 1.22.0
require (
github.com/BurntSushi/toml v1.4.0
github.com/elastic/go-sysinfo v1.14.0
github.com/google/uuid v1.6.0
github.com/gorilla/mux v1.8.1
github.com/jackc/pgx/v5 v5.6.0
github.com/minio/minio-go/v7 v7.0.71
github.com/rabbitmq/amqp091-go v1.10.0
golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f
golang.org/x/exp v0.0.0-20240613232115-7f521ea00fb8
gopkg.in/yaml.v3 v3.0.1
gorm.io/driver/postgres v1.5.9
gorm.io/gorm v1.25.10
)
require (
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/elastic/go-windows v1.0.1 // indirect
github.com/goccy/go-json v0.10.2 // indirect
github.com/jackc/pgpassfile v1.0.0 // indirect
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
github.com/jackc/pgx/v5 v5.6.0 // indirect
github.com/jackc/puddle/v2 v2.2.1 // indirect
github.com/jinzhu/inflection v1.0.0 // indirect
github.com/jinzhu/now v1.1.5 // indirect
github.com/klauspost/compress v1.17.6 // indirect
github.com/klauspost/cpuid/v2 v2.2.6 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/minio/md5-simd v1.1.2 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
github.com/rogpeppe/go-internal v1.12.0 // indirect
github.com/rs/xid v1.5.0 // indirect
golang.org/x/crypto v0.24.0 // indirect
golang.org/x/net v0.23.0 // indirect
golang.org/x/sync v0.9.0 // indirect
golang.org/x/sync v0.7.0 // indirect
golang.org/x/sys v0.21.0 // indirect
golang.org/x/text v0.16.0 // indirect
gopkg.in/ini.v1 v1.67.0 // indirect
howett.net/plist v1.0.1 // indirect
)

32
go.sum
View File

@@ -6,8 +6,14 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/elastic/go-sysinfo v1.14.0 h1:dQRtiqLycoOOla7IflZg3aN213vqJmP0lpVpKQ9lUEY=
github.com/elastic/go-sysinfo v1.14.0/go.mod h1:FKUXnZWhnYI0ueO7jhsGV3uQJ5hiz8OqM5b3oGyaRr8=
github.com/elastic/go-windows v1.0.1 h1:AlYZOldA+UJ0/2nBuqWdo90GFCgG9xuyw9SYzGUtJm0=
github.com/elastic/go-windows v1.0.1/go.mod h1:FoVvqWSun28vaDQPbj2Elfc0JahhPB7WQEGa3c814Ss=
github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
@@ -20,6 +26,11 @@ github.com/jackc/pgx/v5 v5.6.0 h1:SWJzexBzPL5jb0GEsrPMLIsi/3jOo7RHlzTjcAeDrPY=
github.com/jackc/pgx/v5 v5.6.0/go.mod h1:DNZ/vlrUnhWCoFGxHAG8U2ljioxukquj7utPDgtQdTw=
github.com/jackc/puddle/v2 v2.2.1 h1:RhxXJtFG022u4ibrCSMSiu5aOq1i77R3OHKNJj77OAk=
github.com/jackc/puddle/v2 v2.2.1/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI=
github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E=
github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc=
github.com/jinzhu/now v1.1.5 h1:/o9tlHleP7gOFmsnYNz3RGnqzefHA47wQpKrrdTIwXQ=
github.com/jinzhu/now v1.1.5/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8=
github.com/klauspost/compress v1.17.6 h1:60eq2E/jlfwQXtvZEeBUYADs+BwKBWURIY+Gj2eRGjI=
github.com/klauspost/compress v1.17.6/go.mod h1:/dCuZOvVtNoHsyb+cuJD3itjs3NbnF6KH9zAO4BDxPM=
github.com/klauspost/cpuid/v2 v2.0.1/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
@@ -33,8 +44,13 @@ github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34=
github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM=
github.com/minio/minio-go/v7 v7.0.71 h1:No9XfOKTYi6i0GnBj+WZwD8WP5GZfL7n7GOjRqCdAjA=
github.com/minio/minio-go/v7 v7.0.71/go.mod h1:4yBA8v80xGA30cfM3fz0DKYMXunWl/AV/6tWEs9ryzo=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
github.com/rabbitmq/amqp091-go v1.10.0 h1:STpn5XsHlHGcecLmMFCtg7mqq0RnD+zFr4uzukfVhBw=
github.com/rabbitmq/amqp091-go v1.10.0/go.mod h1:Hy4jKW5kQART1u+JkDTF9YYOQUHXqMuhrgxOEeS7G4o=
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
@@ -50,12 +66,13 @@ go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
golang.org/x/crypto v0.24.0 h1:mnl8DM0o513X8fdIkmyFE/5hTYxbwYOjDS/+rK6qpRI=
golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM=
golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f h1:XdNn9LlyWAhLVp6P/i8QYBW+hlyhrhei9uErw2B5GJo=
golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f/go.mod h1:D5SMRVC3C2/4+F/DB1wZsLRnSNimn2Sp/NPsCrsv8ak=
golang.org/x/exp v0.0.0-20240613232115-7f521ea00fb8 h1:yixxcjnhBmY0nkL253HFVIm0JsFHwrHdT3Yh6szTnfY=
golang.org/x/exp v0.0.0-20240613232115-7f521ea00fb8/go.mod h1:jj3sYF3dwk5D+ghuXyeI3r5MFf+NT2An6/9dOA95KSI=
golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs=
golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg=
golang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ=
golang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M=
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws=
golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
@@ -66,6 +83,13 @@ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntN
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA=
gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
gopkg.in/yaml.v1 v1.0.0-20140924161607-9f9df34309c0/go.mod h1:WDnlLJ4WF5VGsH/HVa3CI79GS0ol3YnhVnKP89i0kNg=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gorm.io/driver/postgres v1.5.9 h1:DkegyItji119OlcaLjqN11kHoUgZ/j13E0jkJZgD6A8=
gorm.io/driver/postgres v1.5.9/go.mod h1:DX3GReXH+3FPWGrrgffdvCk3DQ1dwDPdmbenSkweRGI=
gorm.io/gorm v1.25.10 h1:dQpO+33KalOA+aFYGlK+EfxcI5MbO7EP2yYygwh9h+s=
gorm.io/gorm v1.25.10/go.mod h1:hbnx/Oo0ChWMn1BIhpy1oYozzpM15i4YPuHDmfYtwg8=
howett.net/plist v1.0.1 h1:37GdZ8tP09Q35o9ych3ehygcsL+HqKSwzctveSlarvM=
howett.net/plist v1.0.1/go.mod h1:lqaXoTrLY4hg8tnEzNru53gicrbv7rrk+2xJA/7hw9g=

Some files were not shown because too many files have changed in this diff Show More