Mirror of https://github.com/github/codeql.git, synced 2025-12-16 16:53:25 +01:00
Merge branch 'main' into redsun82/rust-mad
misc/scripts/models-as-data/bulk_generate_mad.py | 207 (Normal file → Executable file)
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 """
 Experimental script for bulk generation of MaD models based on a list of projects.
 
@@ -7,15 +8,31 @@ Note: This file must be formatted using the Black Python formatter.
 import os.path
 import subprocess
 import sys
-from typing import NotRequired, TypedDict, List
+from typing import Required, TypedDict, List, Callable, Optional
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import time
 import argparse
 import json
-import requests
 import zipfile
 import tarfile
 from functools import cmp_to_key
 import shutil
 
 
+def missing_module(module_name: str) -> None:
+    print(
+        f"ERROR: {module_name} is not installed. Please install it with 'pip install {module_name}'."
+    )
+    sys.exit(1)
+
+
+try:
+    import yaml
+except ImportError:
+    missing_module("pyyaml")
+
+try:
+    import requests
+except ImportError:
+    missing_module("requests")
+
 import generate_mad as mad
@@ -28,22 +45,18 @@ build_dir = os.path.join(gitroot, "mad-generation-build")
 
 
 # A project to generate models for
-class Project(TypedDict):
-    """
-    Type definition for projects (acquired via a GitHub repo) to model.
-
-    Attributes:
-        name: The name of the project
-        git_repo: URL to the git repository
-        git_tag: Optional Git tag to check out
-    """
-
-    name: str
-    git_repo: NotRequired[str]
-    git_tag: NotRequired[str]
-    with_sinks: NotRequired[bool]
-    with_sources: NotRequired[bool]
-    with_summaries: NotRequired[bool]
+Project = TypedDict(
+    "Project",
+    {
+        "name": Required[str],
+        "git-repo": str,
+        "git-tag": str,
+        "with-sinks": bool,
+        "with-sources": bool,
+        "with-summaries": bool,
+    },
+    total=False,
+)
 
 
 def should_generate_sinks(project: Project) -> bool:
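
The move from the class-based declaration to the functional TypedDict call above is forced by the new key names: git-repo, git-tag, with-sinks and friends contain hyphens, which are not valid Python identifiers and therefore cannot be spelled as class attributes. A minimal sketch of the pattern (the names here are illustrative, not from the script):

from typing import Required, TypedDict

# Functional syntax accepts keys that are not valid identifiers.
# With total=False, every key is optional unless wrapped in Required.
Example = TypedDict("Example", {"name": Required[str], "some-key": str}, total=False)

value: Example = {"name": "demo", "some-key": "x"}  # read back as value["some-key"]
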
@@ -63,14 +76,14 @@ def clone_project(project: Project) -> str:
     Shallow clone a project into the build directory.
 
     Args:
-        project: A dictionary containing project information with 'name', 'git_repo', and optional 'git_tag' keys.
+        project: A dictionary containing project information with 'name', 'git-repo', and optional 'git-tag' keys.
 
     Returns:
         The path to the cloned project directory.
     """
     name = project["name"]
-    repo_url = project["git_repo"]
-    git_tag = project.get("git_tag")
+    repo_url = project["git-repo"]
+    git_tag = project.get("git-tag")
 
     # Determine target directory
     target_dir = os.path.join(build_dir, name)
@@ -103,6 +116,39 @@ def clone_project(project: Project) -> str:
     return target_dir
 
 
+def run_in_parallel[
+    T, U
+](
+    func: Callable[[T], U],
+    items: List[T],
+    *,
+    on_error=lambda item, exc: None,
+    error_summary=lambda failures: None,
+    max_workers=8,
+) -> List[Optional[U]]:
+    if not items:
+        return []
+    max_workers = min(max_workers, len(items))
+    results = [None for _ in range(len(items))]
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        # Start cloning tasks and keep track of them
+        futures = {
+            executor.submit(func, item): index for index, item in enumerate(items)
+        }
+        # Process results as they complete
+        for future in as_completed(futures):
+            index = futures[future]
+            try:
+                results[index] = future.result()
+            except Exception as e:
+                on_error(items[index], e)
+    failed = [item for item, result in zip(items, results) if result is None]
+    if failed:
+        error_summary(failed)
+        sys.exit(1)
+    return results
+
+
 def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
     """
     Clone all projects in parallel.
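
Note that run_in_parallel is declared with PEP 695 type-parameter syntax (def run_in_parallel[T, U](...)), which requires Python 3.12 or newer. On older interpreters the same signature would be spelled with explicit TypeVars; a sketch of that equivalent, assuming otherwise identical semantics:

from typing import Callable, List, Optional, TypeVar

T = TypeVar("T")
U = TypeVar("U")

def run_in_parallel(
    func: Callable[[T], U],
    items: List[T],
    *,
    max_workers: int = 8,
) -> List[Optional[U]]:
    ...  # body unchanged from the version above
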
@@ -114,40 +160,19 @@ def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
         List of (project, project_dir) pairs in the same order as the input projects
     """
     start_time = time.time()
-    max_workers = min(8, len(projects))  # Use at most 8 threads
-    project_dirs_map = {}  # Map to store results by project name
-
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        # Start cloning tasks and keep track of them
-        future_to_project = {
-            executor.submit(clone_project, project): project for project in projects
-        }
-
-        # Process results as they complete
-        for future in as_completed(future_to_project):
-            project = future_to_project[future]
-            try:
-                project_dir = future.result()
-                project_dirs_map[project["name"]] = (project, project_dir)
-            except Exception as e:
-                print(f"ERROR: Failed to clone {project['name']}: {e}")
-
-    if len(project_dirs_map) != len(projects):
-        failed_projects = [
-            project["name"]
-            for project in projects
-            if project["name"] not in project_dirs_map
-        ]
-        print(
-            f"ERROR: Only {len(project_dirs_map)} out of {len(projects)} projects were cloned successfully. Failed projects: {', '.join(failed_projects)}"
-        )
-        sys.exit(1)
-
-    project_dirs = [project_dirs_map[project["name"]] for project in projects]
-
+    dirs = run_in_parallel(
+        clone_project,
+        projects,
+        on_error=lambda project, exc: print(
+            f"ERROR: Failed to clone project {project['name']}: {exc}"
+        ),
+        error_summary=lambda failures: print(
+            f"ERROR: Failed to clone {len(failures)} projects: {', '.join(p['name'] for p in failures)}"
+        ),
+    )
     clone_time = time.time() - start_time
     print(f"Cloning completed in {clone_time:.2f} seconds")
-    return project_dirs
+    return list(zip(projects, dirs))
 
 
 def build_database(
@@ -159,7 +184,7 @@ def build_database(
     Args:
         language: The language for which to build the database (e.g., "rust").
         extractor_options: Additional options for the extractor.
-        project: A dictionary containing project information with 'name' and 'git_repo' keys.
+        project: A dictionary containing project information with 'name' and 'git-repo' keys.
         project_dir: Path to the CodeQL database.
 
     Returns:
@@ -307,7 +332,10 @@ def pretty_name_from_artifact_name(artifact_name: str) -> str:
 
 
 def download_dca_databases(
-    experiment_name: str, pat: str, projects: List[Project]
+    language: str,
+    experiment_name: str,
+    pat: str,
+    projects: List[Project],
 ) -> List[tuple[Project, str | None]]:
     """
     Download databases from a DCA experiment.
@@ -318,7 +346,6 @@ def download_dca_databases(
     Returns:
         List of (project_name, database_dir) pairs, where database_dir is None if the download failed.
     """
-    database_results = {}
     print("\n=== Finding projects ===")
     response = get_json_from_github(
         f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
@@ -326,6 +353,7 @@ def download_dca_databases(
     )
     targets = response["targets"]
     project_map = {project["name"]: project for project in projects}
+    analyzed_databases = {}
     for data in targets.values():
         downloads = data["downloads"]
         analyzed_database = downloads["analyzed_database"]
@@ -336,6 +364,15 @@ def download_dca_databases(
             print(f"Skipping {pretty_name} as it is not in the list of projects")
             continue
 
+        if pretty_name in analyzed_databases:
+            print(
+                f"Skipping previous database {analyzed_databases[pretty_name]['artifact_name']} for {pretty_name}"
+            )
+
+        analyzed_databases[pretty_name] = analyzed_database
+
+    def download_and_decompress(analyzed_database: dict) -> str:
         artifact_name = analyzed_database["artifact_name"]
         repository = analyzed_database["repository"]
         run_id = analyzed_database["run_id"]
         print(f"=== Finding artifact: {artifact_name} ===")
@@ -351,27 +388,40 @@ def download_dca_databases(
         artifact_zip_location = download_artifact(
             archive_download_url, artifact_name, pat
         )
-        print(f"=== Extracting artifact: {artifact_name} ===")
+        print(f"=== Decompressing artifact: {artifact_name} ===")
         # The database is in a zip file, which contains a tar.gz file with the DB
         # First we open the zip file
         with zipfile.ZipFile(artifact_zip_location, "r") as zip_ref:
             artifact_unzipped_location = os.path.join(build_dir, artifact_name)
+            # clean up any remnants of previous runs
+            shutil.rmtree(artifact_unzipped_location, ignore_errors=True)
             # And then we extract it to build_dir/artifact_name
             zip_ref.extractall(artifact_unzipped_location)
-            # And then we iterate over the contents of the extracted directory
-            # and extract the tar.gz files inside it
-            for entry in os.listdir(artifact_unzipped_location):
-                artifact_tar_location = os.path.join(artifact_unzipped_location, entry)
-                with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
-                    # And we just untar it to the same directory as the zip file
-                    tar_ref.extractall(artifact_unzipped_location)
-        database_results[pretty_name] = os.path.join(
-            artifact_unzipped_location, remove_extension(entry)
-        )
+            # And then we extract the language tar.gz file inside it
+            artifact_tar_location = os.path.join(
+                artifact_unzipped_location, f"{language}.tar.gz"
+            )
+            with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
+                # And we just untar it to the same directory as the zip file
+                tar_ref.extractall(artifact_unzipped_location)
+        ret = os.path.join(artifact_unzipped_location, language)
+        print(f"Decompression complete: {ret}")
+        return ret
 
-    print(f"\n=== Extracted {len(database_results)} databases ===")
+    results = run_in_parallel(
+        download_and_decompress,
+        list(analyzed_databases.values()),
+        on_error=lambda db, exc: print(
+            f"ERROR: Failed to download and decompress {db["artifact_name"]}: {exc}"
+        ),
+        error_summary=lambda failures: print(
+            f"ERROR: Failed to download {len(failures)} databases: {', '.join(item[0] for item in failures)}"
+        ),
+    )
 
-    return [(project, database_results[project["name"]]) for project in projects]
+    print(f"\n=== Fetched {len(results)} databases ===")
+
+    return [(project_map[n], r) for n, r in zip(analyzed_databases, results)]
 
 
 def get_mad_destination_for_project(config, name: str) -> str:
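
The added f-string f"ERROR: Failed to download and decompress {db["artifact_name"]}: {exc}" reuses double quotes inside a double-quoted f-string, which is only valid syntax from Python 3.12 onward (PEP 701); like the generic syntax above, this pins the script to 3.12+. On earlier versions the quote style inside the braces must differ:

db = {"artifact_name": "example"}
print(f"fetching {db['artifact_name']}")    # valid on all supported Python versions
print(f"fetching {db["artifact_name"]}")    # valid only on Python 3.12+ (PEP 701)
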
@@ -422,7 +472,9 @@ To avoid loss of data, please commit your changes."""
         case "repo":
             extractor_options = config.get("extractor_options", [])
             database_results = build_databases_from_projects(
-                language, extractor_options, projects
+                language,
+                extractor_options,
+                projects,
             )
         case "dca":
             experiment_name = args.dca
@@ -439,7 +491,10 @@ To avoid loss of data, please commit your changes."""
             with open(args.pat, "r") as f:
                 pat = f.read().strip()
             database_results = download_dca_databases(
-                experiment_name, pat, projects
+                language,
+                experiment_name,
+                pat,
+                projects,
             )
 
     # Generate models for all projects
@@ -492,9 +547,9 @@ if __name__ == "__main__":
         sys.exit(1)
     try:
         with open(args.config, "r") as f:
-            config = json.load(f)
-    except json.JSONDecodeError as e:
-        print(f"ERROR: Failed to parse JSON file {args.config}: {e}")
+            config = yaml.safe_load(f)
+    except yaml.YAMLError as e:
+        print(f"ERROR: Failed to parse YAML file {args.config}: {e}")
         sys.exit(1)
 
     main(config, args)
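
With the final hunk the config file is parsed by yaml.safe_load rather than json.load, so configs can now be written in YAML (existing JSON configs keep parsing, since YAML 1.2 is a superset of JSON). A hedged sketch of what such a config might look like; only extractor_options and the per-project keys are confirmed by the diff, and the top-level projects list is an assumption:

# hypothetical config sketch; top-level layout is a guess
extractor_options: []
projects:
  - name: example-project          # required
    git-repo: https://github.com/example/example-project
    git-tag: v1.2.3                # optional
    with-sources: true
    with-sinks: true
    with-summaries: false
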