From 242ba3fc1ed07d5ab9c604d75482c2d3e7ea65e9 Mon Sep 17 00:00:00 2001
From: Michael Hohn
Date: Thu, 25 Jul 2024 15:14:37 -0700
Subject: [PATCH] Add script to populate minio using dataframe previously chosen

---
 client/qldbtools/README.md                    |  6 +-
 client/qldbtools/bin/mc-db-populate-minio     | 86 +++++++++++++++++++
 .../qldbtools/session-populate-minio.py       | 65 ++++++++++++++
 client/qldbtools/requirements.txt             |  3 +
 4 files changed, 158 insertions(+), 2 deletions(-)
 create mode 100755 client/qldbtools/bin/mc-db-populate-minio
 create mode 100644 client/qldbtools/qldbtools/session-populate-minio.py

diff --git a/client/qldbtools/README.md b/client/qldbtools/README.md
index bf19f97..a4e3b5f 100644
--- a/client/qldbtools/README.md
+++ b/client/qldbtools/README.md
@@ -59,8 +59,10 @@ import qldbtools as ql
 ## Command-line use
 
     cd ~/work-gh/mrva/mrvacommander/client/qldbtools
-    ./bin/mc-db-initial-info ~/work-gh/mrva/mrva-open-source-download | gzip > db-info-1.csv.gz
+    ./bin/mc-db-initial-info ~/work-gh/mrva/mrva-open-source-download > db-info-1.csv
 
-    gunzip < db-info-1.csv.gz | ./bin/mc-db-refine-info | gzip > db-info-2.csv.gz
+    ./bin/mc-db-refine-info < db-info-1.csv > db-info-2.csv
 
+    ./bin/mc-db-populate-minio < db-info-2.csv -n 3
+
 
diff --git a/client/qldbtools/bin/mc-db-populate-minio b/client/qldbtools/bin/mc-db-populate-minio
new file mode 100755
index 0000000..175803a
--- /dev/null
+++ b/client/qldbtools/bin/mc-db-populate-minio
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+""" Read a table of CodeQL DB information (like those produced by
+    mc-db-refine-info) and push the databases it lists to the mrvacommander minio
+    DB.
+"""
+import argparse
+import qldbtools.utils as utils
+import logging
+import pandas as pd
+import numpy as np
+import sys
+from minio import Minio
+from minio.error import S3Error
+from pathlib import Path
+#
+#* Configure logger
+#
+logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
+# Overwrite log level set by minio
+root_logger = logging.getLogger()
+root_logger.setLevel(logging.INFO)
+
+#
+#* Process command line
+#
+parser = argparse.ArgumentParser(
+    description=""" Read a table of CodeQL DB information (like those produced by
+    mc-db-refine-info) and push the databases it lists to the mrvacommander minio
+    DB. """)
+parser.add_argument('-n', '--num-entries', type=int,
+                    help='Only use N entries',
+                    default=None)
+parser.add_argument('-s', '--seed', type=int,
+                    help='Random number seed',
+                    default=4242)
+args = parser.parse_args()
+#
+#* Collect the information and select subset
+#
+df = pd.read_csv(sys.stdin)
+if args.num_entries is None:
+    # Use all entries
+    entries = df
+else:
+    # Use num_entries, chosen via pseudo-random numbers
+    entries = df.sample(n=args.num_entries,
+                        random_state=np.random.RandomState(args.seed))
+#
+#* Push the DBs
+#
+# Configuration
+MINIO_URL = "http://localhost:9000"
+MINIO_ROOT_USER = "user"
+MINIO_ROOT_PASSWORD = "mmusty8432"
+QL_DB_BUCKET_NAME = "qldb"
+
+# Initialize MinIO client
+client = Minio(
+    MINIO_URL.replace("http://", "").replace("https://", ""),
+    access_key=MINIO_ROOT_USER,
+    secret_key=MINIO_ROOT_PASSWORD,
+    secure=False
+)
+
+# Create the bucket if it doesn't exist
+try:
+    if not client.bucket_exists(QL_DB_BUCKET_NAME):
+        client.make_bucket(QL_DB_BUCKET_NAME)
+    else:
+        logging.info(f"Bucket '{QL_DB_BUCKET_NAME}' already exists.")
+except S3Error as err:
+    logging.error(f"Error creating bucket: {err}")
+
+# Upload each selected DB as '<owner>$<name>.zip'; log S3 errors and continue
+for index, row in entries[['owner', 'name', 'path']].iterrows():
+    owner, name, path = row
+    new_name = f'{owner}${name}.zip'
+    try:
+        client.fput_object(QL_DB_BUCKET_NAME, new_name, path)
+        logging.info(f"Uploaded {path} as {new_name} to bucket {QL_DB_BUCKET_NAME}")
+    except S3Error as err:
+        logging.error(f"Error uploading file {path}: {err}")
+
+# Local Variables:
+# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
+# End:
diff --git a/client/qldbtools/qldbtools/session-populate-minio.py b/client/qldbtools/qldbtools/session-populate-minio.py
new file mode 100644
index 0000000..2e1a182
--- /dev/null
+++ b/client/qldbtools/qldbtools/session-populate-minio.py
@@ -0,0 +1,65 @@
+import qldbtools.utils as utils
+import pandas as pd
+import numpy as np
+import sys
+from minio import Minio
+from minio.error import S3Error
+from pathlib import Path
+
+#
+#* Collect the information and select subset
+#
+df = pd.read_csv('db-info-2.csv')
+seed = 4242
+if 0:
+    # Use all entries
+    entries = df
+else:
+    # Use num_entries, chosen via pseudo-random numbers
+    entries = df.sample(n=3,
+                        random_state=np.random.RandomState(seed))
+#
+#* Push the DBs
+#
+# Configuration
+MINIO_URL = "http://localhost:9000"
+MINIO_ROOT_USER = "user"
+MINIO_ROOT_PASSWORD = "mmusty8432"
+QL_DB_BUCKET_NAME = "qldb"
+
+# Initialize MinIO client
+client = Minio(
+    MINIO_URL.replace("http://", "").replace("https://", ""),
+    access_key=MINIO_ROOT_USER,
+    secret_key=MINIO_ROOT_PASSWORD,
+    secure=False
+)
+
+# Create the bucket if it doesn't exist
+try:
+    if not client.bucket_exists(QL_DB_BUCKET_NAME):
+        client.make_bucket(QL_DB_BUCKET_NAME)
+    else:
+        print(f"Bucket '{QL_DB_BUCKET_NAME}' already exists.")
+except S3Error as err:
+    print(f"Error creating bucket: {err}")
+
+# (test) File paths and new names
+files_to_upload = {
+    "cmd/server/codeql/dbs/google/flatbuffers/google_flatbuffers_db.zip": "google$flatbuffers.zip",
+    "cmd/server/codeql/dbs/psycopg/psycopg2/psycopg_psycopg2_db.zip": "psycopg$psycopg2.zip"
+}
+
+# (test) Push the files
+prefix = Path('/Users/hohn/work-gh/mrva/mrvacommander')
+for local_path, new_name in files_to_upload.items():
+    try:
+        client.fput_object(QL_DB_BUCKET_NAME, new_name, prefix / Path(local_path))
+        print(f"Uploaded {local_path} as {new_name} to bucket {QL_DB_BUCKET_NAME}")
+    except S3Error as err:
+        print(f"Error uploading file {local_path}: {err}")
+
+
+# Local Variables:
+# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
+# End:
diff --git a/client/qldbtools/requirements.txt b/client/qldbtools/requirements.txt
index 30a0e60..dd5d27a 100644
--- a/client/qldbtools/requirements.txt
+++ b/client/qldbtools/requirements.txt
@@ -80,6 +80,7 @@ lz4==4.3.3
 MarkupSafe==2.1.5
 matplotlib==3.9.1
 matplotlib-inline==0.1.7
+minio==7.2.7
 missingno==0.5.2
 mistune==3.0.2
 msgpack==1.0.8
@@ -115,6 +116,7 @@ pure-eval==0.2.2
 py-cpuinfo==9.0.0
 pyarrow==16.1.0
 pycparser==2.22
+pycryptodome==3.20.0
 Pygments==2.18.0
 pynput==1.7.7
 pyobjc-core==10.3.1
@@ -133,6 +135,7 @@ python-json-logger==2.0.7
 pytz==2024.1
 PyYAML==6.0.1
 pyzmq==26.0.3
+-e git+ssh://git@github.com/advanced-security/mrvacommander.git@26dd69c9767c315a8ffb782eedf3b55eac574d45#egg=qldbtools&subdirectory=client/qldbtools
 qtstylish==0.1.5
 referencing==0.35.1
 requests==2.32.3