Merge branch 'main' into redsun82/rust-mad

This commit is contained in:
Paolo Tranquilli
2025-06-11 16:39:10 +02:00
1104 changed files with 33004 additions and 12030 deletions

misc/scripts/models-as-data/bulk_generate_mad.py Normal file → Executable file

@@ -1,3 +1,4 @@
#!/usr/bin/env python3
"""
Experimental script for bulk generation of MaD models based on a list of projects.
@@ -7,15 +8,31 @@ Note: This file must be formatted using the Black Python formatter.
import os.path
import subprocess
import sys
from typing import NotRequired, TypedDict, List
from typing import Required, TypedDict, List, Callable, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import argparse
import json
import requests
import zipfile
import tarfile
from functools import cmp_to_key
import shutil
def missing_module(module_name: str) -> None:
print(
f"ERROR: {module_name} is not installed. Please install it with 'pip install {module_name}'."
)
sys.exit(1)
try:
import yaml
except ImportError:
missing_module("pyyaml")
try:
import requests
except ImportError:
missing_module("requests")
import generate_mad as mad
@@ -28,22 +45,18 @@ build_dir = os.path.join(gitroot, "mad-generation-build")
# A project to generate models for
class Project(TypedDict):
"""
Type definition for projects (acquired via a GitHub repo) to model.
Attributes:
name: The name of the project
git_repo: URL to the git repository
git_tag: Optional Git tag to check out
"""
name: str
git_repo: NotRequired[str]
git_tag: NotRequired[str]
    with_sinks: NotRequired[bool]
    with_sources: NotRequired[bool]
with_summaries: NotRequired[bool]
Project = TypedDict(
"Project",
{
"name": Required[str],
"git-repo": str,
"git-tag": str,
"with-sinks": bool,
"with-sources": bool,
"with-summaries": bool,
},
total=False,
)
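
The move to the functional `TypedDict` form is what makes the hyphenated keys (`git-repo`, `with-sinks`, ...) possible: they are not valid Python identifiers, so they cannot be declared as class attributes. A minimal sketch of constructing and reading such a project entry (the values are made up):

# Hypothetical project entry; hyphenated keys force subscript access.
example: Project = {
    "name": "example-project",
    "git-repo": "https://github.com/example/example-project",
    "with-sinks": True,
}
repo_url = example["git-repo"]    # attribute access (example.git_repo) is impossible
git_tag = example.get("git-tag")  # None when absent, since total=False
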
def should_generate_sinks(project: Project) -> bool:
@@ -63,14 +76,14 @@ def clone_project(project: Project) -> str:
Shallow clone a project into the build directory.
Args:
project: A dictionary containing project information with 'name', 'git_repo', and optional 'git_tag' keys.
project: A dictionary containing project information with 'name', 'git-repo', and optional 'git-tag' keys.
Returns:
The path to the cloned project directory.
"""
name = project["name"]
repo_url = project["git_repo"]
git_tag = project.get("git_tag")
repo_url = project["git-repo"]
git_tag = project.get("git-tag")
# Determine target directory
target_dir = os.path.join(build_dir, name)
@@ -103,6 +116,39 @@ def clone_project(project: Project) -> str:
return target_dir
def run_in_parallel[
T, U
](
func: Callable[[T], U],
items: List[T],
*,
on_error=lambda item, exc: None,
error_summary=lambda failures: None,
max_workers=8,
) -> List[Optional[U]]:
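    """
    Apply func to every item in parallel (at most max_workers threads) and
    return the results in input order. on_error(item, exc) is invoked for each
    item that raised; if any result is still missing at the end, error_summary
    is given the failed items and the process exits with code 1.
    """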
if not items:
return []
max_workers = min(max_workers, len(items))
results = [None for _ in range(len(items))]
with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit tasks and keep track of them
futures = {
executor.submit(func, item): index for index, item in enumerate(items)
}
# Process results as they complete
for future in as_completed(futures):
index = futures[future]
try:
results[index] = future.result()
except Exception as e:
on_error(items[index], e)
failed = [item for item, result in zip(items, results) if result is None]
if failed:
error_summary(failed)
sys.exit(1)
return results
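
For reference, a hedged usage sketch of the new helper (the `fetch` function and URLs are illustrative, not part of the script):

# Hypothetical usage of run_in_parallel; fetch and the URLs are made up.
def fetch(url: str) -> int:
    return len(requests.get(url).content)

sizes = run_in_parallel(
    fetch,
    ["https://example.com/a", "https://example.com/b"],
    on_error=lambda url, exc: print(f"ERROR: fetching {url} failed: {exc}"),
    error_summary=lambda failed: print(f"ERROR: {len(failed)} fetches failed"),
)
# sizes[i] lines up with the i-th URL; any failure terminates the process.
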
def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
"""
Clone all projects in parallel.
@@ -114,40 +160,19 @@ def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
List of (project, project_dir) pairs in the same order as the input projects
"""
start_time = time.time()
max_workers = min(8, len(projects)) # Use at most 8 threads
project_dirs_map = {} # Map to store results by project name
with ThreadPoolExecutor(max_workers=max_workers) as executor:
# Start cloning tasks and keep track of them
future_to_project = {
executor.submit(clone_project, project): project for project in projects
}
# Process results as they complete
for future in as_completed(future_to_project):
project = future_to_project[future]
try:
project_dir = future.result()
project_dirs_map[project["name"]] = (project, project_dir)
except Exception as e:
print(f"ERROR: Failed to clone {project['name']}: {e}")
if len(project_dirs_map) != len(projects):
failed_projects = [
project["name"]
for project in projects
if project["name"] not in project_dirs_map
]
print(
f"ERROR: Only {len(project_dirs_map)} out of {len(projects)} projects were cloned successfully. Failed projects: {', '.join(failed_projects)}"
)
sys.exit(1)
project_dirs = [project_dirs_map[project["name"]] for project in projects]
dirs = run_in_parallel(
clone_project,
projects,
on_error=lambda project, exc: print(
f"ERROR: Failed to clone project {project['name']}: {exc}"
),
error_summary=lambda failures: print(
f"ERROR: Failed to clone {len(failures)} projects: {', '.join(p['name'] for p in failures)}"
),
)
clone_time = time.time() - start_time
print(f"Cloning completed in {clone_time:.2f} seconds")
return project_dirs
return list(zip(projects, dirs))
def build_database(
@@ -159,7 +184,7 @@ def build_database(
Args:
language: The language for which to build the database (e.g., "rust").
extractor_options: Additional options for the extractor.
project: A dictionary containing project information with 'name' and 'git_repo' keys.
project: A dictionary containing project information with 'name' and 'git-repo' keys.
project_dir: Path to the CodeQL database.
Returns:
@@ -307,7 +332,10 @@ def pretty_name_from_artifact_name(artifact_name: str) -> str:
def download_dca_databases(
experiment_name: str, pat: str, projects: List[Project]
language: str,
experiment_name: str,
pat: str,
projects: List[Project],
) -> List[tuple[Project, str | None]]:
"""
Download databases from a DCA experiment.
@@ -318,7 +346,6 @@ def download_dca_databases(
Returns:
    List of (project, database_dir) pairs, where database_dir is None if the download failed.
"""
database_results = {}
print("\n=== Finding projects ===")
response = get_json_from_github(
f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
@@ -326,6 +353,7 @@ def download_dca_databases(
)
targets = response["targets"]
project_map = {project["name"]: project for project in projects}
analyzed_databases = {}
for data in targets.values():
downloads = data["downloads"]
analyzed_database = downloads["analyzed_database"]
@@ -336,6 +364,15 @@ def download_dca_databases(
print(f"Skipping {pretty_name} as it is not in the list of projects")
continue
if pretty_name in analyzed_databases:
print(
f"Skipping previous database {analyzed_databases[pretty_name]['artifact_name']} for {pretty_name}"
)
analyzed_databases[pretty_name] = analyzed_database
def download_and_decompress(analyzed_database: dict) -> str:
artifact_name = analyzed_database["artifact_name"]
repository = analyzed_database["repository"]
run_id = analyzed_database["run_id"]
print(f"=== Finding artifact: {artifact_name} ===")
@@ -351,27 +388,40 @@ def download_dca_databases(
artifact_zip_location = download_artifact(
archive_download_url, artifact_name, pat
)
print(f"=== Extracting artifact: {artifact_name} ===")
print(f"=== Decompressing artifact: {artifact_name} ===")
# The database is in a zip file, which contains a tar.gz file with the DB
# First we open the zip file
with zipfile.ZipFile(artifact_zip_location, "r") as zip_ref:
artifact_unzipped_location = os.path.join(build_dir, artifact_name)
# clean up any remnants of previous runs
shutil.rmtree(artifact_unzipped_location, ignore_errors=True)
# And then we extract it to build_dir/artifact_name
zip_ref.extractall(artifact_unzipped_location)
# And then we iterate over the contents of the extracted directory
# and extract the tar.gz files inside it
for entry in os.listdir(artifact_unzipped_location):
artifact_tar_location = os.path.join(artifact_unzipped_location, entry)
with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
# And we just untar it to the same directory as the zip file
tar_ref.extractall(artifact_unzipped_location)
database_results[pretty_name] = os.path.join(
artifact_unzipped_location, remove_extension(entry)
)
# And then we extract the language tar.gz file inside it
artifact_tar_location = os.path.join(
artifact_unzipped_location, f"{language}.tar.gz"
)
with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
# And we just untar it to the same directory as the zip file
tar_ref.extractall(artifact_unzipped_location)
ret = os.path.join(artifact_unzipped_location, language)
print(f"Decompression complete: {ret}")
return ret
print(f"\n=== Extracted {len(database_results)} databases ===")
results = run_in_parallel(
download_and_decompress,
list(analyzed_databases.values()),
on_error=lambda db, exc: print(
f"ERROR: Failed to download and decompress {db["artifact_name"]}: {exc}"
),
error_summary=lambda failures: print(
f"ERROR: Failed to download {len(failures)} databases: {', '.join(item[0] for item in failures)}"
),
)
return [(project, database_results[project["name"]]) for project in projects]
print(f"\n=== Fetched {len(results)} databases ===")
return [(project_map[n], r) for n, r in zip(analyzed_databases, results)]
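
The nesting unpacked by `download_and_decompress` is: an Actions artifact zip containing `<language>.tar.gz`, which in turn contains the `<language>` database directory. The same two-step extraction in isolation (file and directory names are illustrative):

# Illustrative two-step extraction: artifact.zip -> rust.tar.gz -> rust/
with zipfile.ZipFile("artifact.zip", "r") as zip_ref:
    zip_ref.extractall("out")
with tarfile.open(os.path.join("out", "rust.tar.gz"), "r:gz") as tar_ref:
    tar_ref.extractall("out")
database_dir = os.path.join("out", "rust")  # the CodeQL database directory
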
def get_mad_destination_for_project(config, name: str) -> str:
@@ -422,7 +472,9 @@ To avoid loss of data, please commit your changes."""
case "repo":
extractor_options = config.get("extractor_options", [])
database_results = build_databases_from_projects(
language, extractor_options, projects
language,
extractor_options,
projects,
)
case "dca":
experiment_name = args.dca
@@ -439,7 +491,10 @@ To avoid loss of data, please commit your changes."""
with open(args.pat, "r") as f:
pat = f.read().strip()
database_results = download_dca_databases(
experiment_name, pat, projects
language,
experiment_name,
pat,
projects,
)
# Generate models for all projects
@@ -492,9 +547,9 @@ if __name__ == "__main__":
sys.exit(1)
try:
with open(args.config, "r") as f:
config = json.load(f)
except json.JSONDecodeError as e:
print(f"ERROR: Failed to parse JSON file {args.config}: {e}")
config = yaml.safe_load(f)
except yaml.YAMLError as e:
print(f"ERROR: Failed to parse YAML file {args.config}: {e}")
sys.exit(1)
main(config, args)
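
Since the config file is now parsed as YAML rather than JSON, an input along these lines should load; the top-level layout here is an assumption, only the per-project fields follow the `Project` definition above:

# Hypothetical YAML config; the top-level keys are assumptions, the
# per-project keys match the Project TypedDict.
sample = """
language: rust
projects:
  - name: example-project
    git-repo: https://github.com/example/example-project
    git-tag: v1.2.3
    with-sources: true
"""
parsed = yaml.safe_load(sample)
print(parsed["projects"][0]["git-repo"])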