commit fadc66958695f30fd2b0185f677086f16512444d Author: Michael Hohn Date: Tue Dec 17 21:29:27 2024 -0800 initial mrvahepc commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a1e7574 --- /dev/null +++ b/.gitignore @@ -0,0 +1,45 @@ +# vscode project dir +.vscode/ + +# idea project dir +.idea/ + +# Scratch space +scratch/ + +# +# Binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib + +# Test binary, built with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out + +# Go workspace file +go.work +go.work.sum + +# env file +.env +venv/ +venv-*/ +*.egg-info +__pycache__ +README.html +ChangeLog +notes/*.html + +# Make timestamp files +mk.* + +# temporaries +*.tmp + +# Mac OS +.DS_Store diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.org b/README.org new file mode 100644 index 0000000..049f23f --- /dev/null +++ b/README.org @@ -0,0 +1,59 @@ +* Introduction to hepc -- HTTP End Point for CodeQL +** Usage Sample + #+BEGIN_SRC sh + # Collect DBs from filesystem + cd ~/work-gh/mrva/mrvahepc + ./bin/mc-hepc-init --db_collection_dir db-collection.tmp \ + --starting_path ~/work-gh/mrva/mrva-open-source-download + + # Serve collected DBs plus metadata + ./bin/mc-hepc-serve --codeql-db-dir db-collection.tmp + + # Test server + curl 127.0.0.1:8070/index -o - 2>/dev/null | wc -l + + curl 127.0.0.1:8070/api/v1/latest_results/codeql-all \ + -o - 2>/dev/null | wc -l + + url=$(curl 127.0.0.1:8070/api/v1/latest_results/codeql-all \ + -o - 2>/dev/null | head -1 | jq -r .result_url) + # http://hepc/db-collection.tmp/aircrack-ng-aircrack-ng-ctsj-41ebbe.zip + + wget $(echo $url|sed 's|http://hepc|http://127.0.0.1:8070|g;') + + + #+END_SRC + +** Installation + - Set up the virtual environment and install tools + #+begin_example + cd ~/work-gh/mrva/mrvahepc + python3.11 -m venv venv + source venv/bin/activate + pip install --upgrade pip + + # From requirements.txt + pip install -r requirements.txt + # Or explicitly + pip install ipython + #+end_example + + - Local development + #+begin_example + cd ~/work-gh/mrva/mrvahepc + source venv/bin/activate + pip install --editable . 
# Logging function
def log(level, message):
    """Print a colorized, timestamped log line to stderr.

    Args:
        level: Severity label, normally "INFO", "WARN" or "ERROR".  Any other
            label is printed without a color instead of raising ``KeyError``.
        message: Text to print after the timestamp and level.
    """
    # Function-scope import keeps this helper self-contained.
    from datetime import datetime

    colors = {
        "INFO": "\033[1;34m",
        "WARN": "\033[1;33m",
        "ERROR": "\033[1;31m",
        "RESET": "\033[0m",
    }
    # Local time in the same format the external `date` call produced, but
    # without spawning a process for every log line.
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    color = colors.get(level, "")
    print(f"{color}[{timestamp}] [{level}] {message}{colors['RESET']}", file=sys.stderr)


# Generate a CID (cumulative id)
def generate_cid(cli_version, creation_time, primary_language, sha):
    """Return a 6-hex-digit id derived from the four database properties.

    The values are joined with single spaces, hashed with SHA-256, and the
    first six characters of the hex digest are returned.
    """
    hash_input = f"{cli_version} {creation_time} {primary_language} {sha}".encode()
    return hashlib.sha256(hash_input).hexdigest()[:6]


# Expand environment variables in paths
def expand_path(path):
    """Return *path* with environment variables expanded by plumbum."""
    return local.env.expand(path)


# Process a single db.zip file
def process_db_file(zip_path, db_collection_dir):
    """Add one CodeQL database zip to the collection directory.

    Reads ``codeql-database.yml`` from the archive, derives the collection
    file name ``{owner}-{repo}-ctsj-{cid}.zip`` from it, copies the archive
    under that name into *db_collection_dir* (unless it is already there) and
    appends one JSON line describing it to ``metadata.json`` in the same
    directory.

    Any failure is logged as a warning and the archive is skipped, so one bad
    archive does not abort a whole collection run.

    Args:
        zip_path: Path of the ``db.zip`` archive to process.
        db_collection_dir: Directory receiving the renamed copy and the
            metadata record.
    """
    import tempfile

    try:
        # The scratch directory is removed automatically, even on early
        # return or error.
        with tempfile.TemporaryDirectory() as temp_dir:
            unzip("-o", "-q", zip_path, "*codeql-database.yml", "-d", temp_dir)

            # Locate the YAML file regardless of its depth
            yaml_files = list(local.path(temp_dir).walk(
                filter=lambda p: p.name == "codeql-database.yml"))
            if not yaml_files:
                log("WARN", f"No codeql-database.yml found in {zip_path}")
                return

            with yaml_files[0].open("r") as f:
                yaml_data = yaml.safe_load(f)

        primary_language = yaml_data["primaryLanguage"]
        creation_metadata = yaml_data["creationMetadata"]
        sha = creation_metadata["sha"]
        cli_version = creation_metadata["cliVersion"]
        creation_time = creation_metadata["creationTime"]
        # The last two components of sourceLocationPrefix are used as
        # owner and repository name.
        source_location_prefix = local.path(yaml_data["sourceLocationPrefix"])
        repo = source_location_prefix.name
        owner = source_location_prefix.parent.name
        cid = generate_cid(cli_version, creation_time, primary_language, sha)
        new_db_fname = f"{owner}-{repo}-ctsj-{cid}.zip"
        result_url = f"http://hepc/{db_collection_dir}/{new_db_fname}"

        metadata = {
            "git_branch" : "HEAD",
            "git_commit_id" : sha,
            "git_repo" : repo,
            "ingestion_datetime_utc" : str(creation_time),
            "result_url" : result_url,
            "tool_id" : "9f2f9642-febb-4435-9204-fb50bbd43de4",
            "tool_name" : f"codeql-{primary_language}",
            "tool_version" : cli_version,
            "projname" : f"{owner}/{repo}",
        }

        # Copy first: if the copy fails, no metadata record is written that
        # would point at a file missing from the collection.
        copy_path = local.path(db_collection_dir) / new_db_fname
        if not copy_path.exists():
            cp(zip_path, copy_path)

        # Append the record only if an identical line is not already there,
        # so re-running the collection does not duplicate entries.
        metadata_file = local.path(db_collection_dir) / "metadata.json"
        record = json.dumps(metadata) + "\n"
        already_recorded = False
        if metadata_file.exists():
            with metadata_file.open("r") as f:
                already_recorded = any(line == record for line in f)
        if not already_recorded:
            with metadata_file.open("a") as f:
                f.write(record)

    except Exception as e:
        # Deliberate best-effort: report and continue with the next archive.
        log("WARN", f"Error processing {zip_path}: {e}")
# NOTE: the fixed metadata routes are registered BEFORE the catch-all file
# route.  Routes are matched in registration order, so a catch-all declared
# first would also capture "/index" and "/api/v1/latest_results/codeql-all".
@app.get("/index")
@app.get("/api/v1/latest_results/codeql-all")
def serve_metadata_json():
    """
    Serve the metadata.json file for multiple routes.

    Raises:
        HTTPException: 404 when metadata.json is missing from the database
            directory.
    """
    metadata_path = Path(db_dir) / "metadata.json"
    logger.info(f"Requested metadata.json at: {metadata_path}")
    if not metadata_path.exists():
        logger.error("metadata.json not found.")
        raise HTTPException(status_code=404, detail="metadata.json not found")
    logger.info(f"Serving metadata.json from: {metadata_path}")
    return FileResponse(metadata_path)


@app.get("/{file_path:path}")
def serve_file(file_path: str):
    """
    Serve files from the database directory, such as .zip files or metadata.json.

    The request path is tried first relative to the current working directory
    (the form used in the generated ``result_url`` values, which include the
    collection directory name) and then relative to the database directory
    itself.  A candidate is served only if it is a regular file located inside
    the database directory after symlinks and ``..`` segments are resolved.

    Raises:
        HTTPException: 404 when no candidate is an existing file inside the
            database directory.
    """
    logger.info(f"Requested file: {file_path}")
    root = Path(db_dir).resolve()
    for candidate in (Path(file_path), root / file_path):
        try:
            # strict=True raises for missing paths; treat that as "not found"
            # rather than letting it surface as a 500.
            resolved_path = candidate.resolve(strict=True)
        except (OSError, RuntimeError, ValueError):
            continue
        # Containment check: refuse anything outside the database directory
        # (e.g. "../../etc/passwd" or an absolute path), and refuse
        # directories, which FileResponse cannot send.
        if resolved_path.is_file() and root in resolved_path.parents:
            logger.info(f"file resolved to: {resolved_path}")
            return FileResponse(resolved_path)
    logger.error(f"File not found: {file_path}")
    raise HTTPException(status_code=404, detail=f"{file_path} not found")


@app.middleware("http")
async def log_request(request, call_next):
    """Log the method and URL of every incoming request, then pass it on."""
    logger.info(f"Incoming request: {request.method} {request.url}")
    response = await call_next(request)
    return response
from setuptools import setup, find_packages
import glob

# Packaging metadata for mrvahepc.
setup(
    name='mrvahepc',
    version='0.1.0',
    description='A Python package for serving CodeQL databases',
    author='Michael Hohn',
    author_email='hohn@github.com',
    packages=['mrvahepc'],
    # Runtime dependencies imported by the installed bin/mc-* scripts:
    # mc-hepc-init uses plumbum and yaml (PyYAML); mc-hepc-serve uses
    # plumbum, fastapi and uvicorn.  Without these a plain
    # `pip install mrvahepc` yields scripts that fail on import.
    install_requires=[
        'fastapi',
        'plumbum',
        'PyYAML',
        'uvicorn',
    ],
    # sorted() makes the script list independent of directory listing order.
    scripts=sorted(glob.glob("bin/mc-*")),
)