Merge branch 'main' into redsun82/rust-mad

This commit is contained in:
Paolo Tranquilli
2025-06-11 16:39:10 +02:00
1104 changed files with 33004 additions and 12030 deletions

misc/scripts/models-as-data/bulk_generate_mad.py Normal file → Executable file

@@ -1,3 +1,4 @@
#!/usr/bin/env python3
"""
Experimental script for bulk generation of MaD models based on a list of projects.
@@ -7,15 +8,31 @@ Note: This file must be formatted using the Black Python formatter.
import os.path
import subprocess
import sys
from typing import NotRequired, TypedDict, List
from typing import Required, TypedDict, List, Callable, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import argparse
import json
import requests
import zipfile
import tarfile
from functools import cmp_to_key
import shutil
def missing_module(module_name: str) -> None:
print(
f"ERROR: {module_name} is not installed. Please install it with 'pip install {module_name}'."
)
sys.exit(1)
try:
import yaml
except ImportError:
missing_module("pyyaml")
try:
import requests
except ImportError:
missing_module("requests")
import generate_mad as mad
@@ -28,22 +45,18 @@ build_dir = os.path.join(gitroot, "mad-generation-build")
# A project to generate models for
class Project(TypedDict):
"""
Type definition for projects (acquired via a GitHub repo) to model.
Attributes:
name: The name of the project
git_repo: URL to the git repository
git_tag: Optional Git tag to check out
"""
name: str
git_repo: NotRequired[str]
git_tag: NotRequired[str]
    with_sinks: NotRequired[bool]
    with_sources: NotRequired[bool]
with_summaries: NotRequired[bool]
Project = TypedDict(
"Project",
{
"name": Required[str],
"git-repo": str,
"git-tag": str,
"with-sinks": bool,
"with-sources": bool,
"with-summaries": bool,
},
total=False,
)
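
The move to the functional `TypedDict` form is what makes the hyphenated keys (`git-repo`, `with-sinks`, ...) possible: they are not valid Python identifiers, so they cannot be declared as class attributes. A minimal sketch of constructing and reading such a project entry (the values are made up):

# Hypothetical project entry; hyphenated keys force subscript access.
example: Project = {
    "name": "example-project",
    "git-repo": "https://github.com/example/example-project",
    "with-sinks": True,
}
repo_url = example["git-repo"]    # attribute access (example.git_repo) is impossible
git_tag = example.get("git-tag")  # None when absent, since total=False
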
def should_generate_sinks(project: Project) -> bool:
@@ -63,14 +76,14 @@ def clone_project(project: Project) -> str:
Shallow clone a project into the build directory.
Args:
project: A dictionary containing project information with 'name', 'git_repo', and optional 'git_tag' keys.
project: A dictionary containing project information with 'name', 'git-repo', and optional 'git-tag' keys.
Returns:
The path to the cloned project directory.
"""
name = project["name"]
repo_url = project["git_repo"]
git_tag = project.get("git_tag")
repo_url = project["git-repo"]
git_tag = project.get("git-tag")
# Determine target directory
target_dir = os.path.join(build_dir, name)
@@ -103,6 +116,39 @@ def clone_project(project: Project) -> str:
return target_dir
def run_in_parallel[
T, U
](
func: Callable[[T], U],
items: List[T],
*,
on_error=lambda item, exc: None,
error_summary=lambda failures: None,
max_workers=8,
) -> List[Optional[U]]:
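    """
    Apply func to every item in parallel (at most max_workers threads) and
    return the results in input order. on_error(item, exc) is invoked for each
    item that raised; if any result is still missing at the end, error_summary
    is given the failed items and the process exits with code 1.
    """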
if not items:
return []
max_workers = min(max_workers, len(items))
results = [None for _ in range(len(items))]
with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit tasks and keep track of them
futures = {
executor.submit(func, item): index for index, item in enumerate(items)
}
# Process results as they complete
for future in as_completed(futures):
index = futures[future]
try:
results[index] = future.result()
except Exception as e:
on_error(items[index], e)
failed = [item for item, result in zip(items, results) if result is None]
if failed:
error_summary(failed)
sys.exit(1)
return results
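
For reference, a hedged usage sketch of the new helper (the `fetch` function and URLs are illustrative, not part of the script):

# Hypothetical usage of run_in_parallel; fetch and the URLs are made up.
def fetch(url: str) -> int:
    return len(requests.get(url).content)

sizes = run_in_parallel(
    fetch,
    ["https://example.com/a", "https://example.com/b"],
    on_error=lambda url, exc: print(f"ERROR: fetching {url} failed: {exc}"),
    error_summary=lambda failed: print(f"ERROR: {len(failed)} fetches failed"),
)
# sizes[i] lines up with the i-th URL; any failure terminates the process.
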
def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
"""
Clone all projects in parallel.
@@ -114,40 +160,19 @@ def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
List of (project, project_dir) pairs in the same order as the input projects
"""
start_time = time.time()
max_workers = min(8, len(projects)) # Use at most 8 threads
project_dirs_map = {} # Map to store results by project name
with ThreadPoolExecutor(max_workers=max_workers) as executor:
# Start cloning tasks and keep track of them
future_to_project = {
executor.submit(clone_project, project): project for project in projects
}
# Process results as they complete
for future in as_completed(future_to_project):
project = future_to_project[future]
try:
project_dir = future.result()
project_dirs_map[project["name"]] = (project, project_dir)
except Exception as e:
print(f"ERROR: Failed to clone {project['name']}: {e}")
if len(project_dirs_map) != len(projects):
failed_projects = [
project["name"]
for project in projects
if project["name"] not in project_dirs_map
]
print(
f"ERROR: Only {len(project_dirs_map)} out of {len(projects)} projects were cloned successfully. Failed projects: {', '.join(failed_projects)}"
)
sys.exit(1)
project_dirs = [project_dirs_map[project["name"]] for project in projects]
dirs = run_in_parallel(
clone_project,
projects,
on_error=lambda project, exc: print(
f"ERROR: Failed to clone project {project['name']}: {exc}"
),
error_summary=lambda failures: print(
f"ERROR: Failed to clone {len(failures)} projects: {', '.join(p['name'] for p in failures)}"
),
)
clone_time = time.time() - start_time
print(f"Cloning completed in {clone_time:.2f} seconds")
return project_dirs
return list(zip(projects, dirs))
def build_database(
@@ -159,7 +184,7 @@ def build_database(
Args:
language: The language for which to build the database (e.g., "rust").
extractor_options: Additional options for the extractor.
project: A dictionary containing project information with 'name' and 'git_repo' keys.
project: A dictionary containing project information with 'name' and 'git-repo' keys.
project_dir: Path to the CodeQL database.
Returns:
@@ -307,7 +332,10 @@ def pretty_name_from_artifact_name(artifact_name: str) -> str:
def download_dca_databases(
experiment_name: str, pat: str, projects: List[Project]
language: str,
experiment_name: str,
pat: str,
projects: List[Project],
) -> List[tuple[Project, str | None]]:
"""
Download databases from a DCA experiment.
@@ -318,7 +346,6 @@ def download_dca_databases(
Returns:
    List of (project, database_dir) pairs, where database_dir is None if the download failed.
"""
database_results = {}
print("\n=== Finding projects ===")
response = get_json_from_github(
f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
@@ -326,6 +353,7 @@ def download_dca_databases(
)
targets = response["targets"]
project_map = {project["name"]: project for project in projects}
analyzed_databases = {}
for data in targets.values():
downloads = data["downloads"]
analyzed_database = downloads["analyzed_database"]
@@ -336,6 +364,15 @@ def download_dca_databases(
print(f"Skipping {pretty_name} as it is not in the list of projects")
continue
if pretty_name in analyzed_databases:
print(
f"Skipping previous database {analyzed_databases[pretty_name]['artifact_name']} for {pretty_name}"
)
analyzed_databases[pretty_name] = analyzed_database
def download_and_decompress(analyzed_database: dict) -> str:
artifact_name = analyzed_database["artifact_name"]
repository = analyzed_database["repository"]
run_id = analyzed_database["run_id"]
print(f"=== Finding artifact: {artifact_name} ===")
@@ -351,27 +388,40 @@ def download_dca_databases(
artifact_zip_location = download_artifact(
archive_download_url, artifact_name, pat
)
print(f"=== Extracting artifact: {artifact_name} ===")
print(f"=== Decompressing artifact: {artifact_name} ===")
# The database is in a zip file, which contains a tar.gz file with the DB
# First we open the zip file
with zipfile.ZipFile(artifact_zip_location, "r") as zip_ref:
artifact_unzipped_location = os.path.join(build_dir, artifact_name)
# clean up any remnants of previous runs
shutil.rmtree(artifact_unzipped_location, ignore_errors=True)
# And then we extract it to build_dir/artifact_name
zip_ref.extractall(artifact_unzipped_location)
# And then we iterate over the contents of the extracted directory
# and extract the tar.gz files inside it
for entry in os.listdir(artifact_unzipped_location):
artifact_tar_location = os.path.join(artifact_unzipped_location, entry)
with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
# And we just untar it to the same directory as the zip file
tar_ref.extractall(artifact_unzipped_location)
database_results[pretty_name] = os.path.join(
artifact_unzipped_location, remove_extension(entry)
)
# And then we extract the language tar.gz file inside it
artifact_tar_location = os.path.join(
artifact_unzipped_location, f"{language}.tar.gz"
)
with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
# And we just untar it to the same directory as the zip file
tar_ref.extractall(artifact_unzipped_location)
ret = os.path.join(artifact_unzipped_location, language)
print(f"Decompression complete: {ret}")
return ret
print(f"\n=== Extracted {len(database_results)} databases ===")
results = run_in_parallel(
download_and_decompress,
list(analyzed_databases.values()),
on_error=lambda db, exc: print(
f"ERROR: Failed to download and decompress {db["artifact_name"]}: {exc}"
),
error_summary=lambda failures: print(
f"ERROR: Failed to download {len(failures)} databases: {', '.join(item[0] for item in failures)}"
),
)
return [(project, database_results[project["name"]]) for project in projects]
print(f"\n=== Fetched {len(results)} databases ===")
return [(project_map[n], r) for n, r in zip(analyzed_databases, results)]
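
The nesting unpacked by `download_and_decompress` is: an Actions artifact zip containing `<language>.tar.gz`, which in turn contains the `<language>` database directory. The same two-step extraction in isolation (file and directory names are illustrative):

# Illustrative two-step extraction: artifact.zip -> rust.tar.gz -> rust/
with zipfile.ZipFile("artifact.zip", "r") as zip_ref:
    zip_ref.extractall("out")
with tarfile.open(os.path.join("out", "rust.tar.gz"), "r:gz") as tar_ref:
    tar_ref.extractall("out")
database_dir = os.path.join("out", "rust")  # the CodeQL database directory
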
def get_mad_destination_for_project(config, name: str) -> str:
@@ -422,7 +472,9 @@ To avoid loss of data, please commit your changes."""
case "repo":
extractor_options = config.get("extractor_options", [])
database_results = build_databases_from_projects(
language, extractor_options, projects
language,
extractor_options,
projects,
)
case "dca":
experiment_name = args.dca
@@ -439,7 +491,10 @@ To avoid loss of data, please commit your changes."""
with open(args.pat, "r") as f:
pat = f.read().strip()
database_results = download_dca_databases(
experiment_name, pat, projects
language,
experiment_name,
pat,
projects,
)
# Generate models for all projects
@@ -492,9 +547,9 @@ if __name__ == "__main__":
sys.exit(1)
try:
with open(args.config, "r") as f:
config = json.load(f)
except json.JSONDecodeError as e:
print(f"ERROR: Failed to parse JSON file {args.config}: {e}")
config = yaml.safe_load(f)
except yaml.YAMLError as e:
print(f"ERROR: Failed to parse YAML file {args.config}: {e}")
sys.exit(1)
main(config, args)
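
Since the config file is now parsed as YAML rather than JSON, an input along these lines should load; the top-level layout here is an assumption, only the per-project fields follow the `Project` definition above:

# Hypothetical YAML config; the top-level keys are assumptions, the
# per-project keys match the Project TypedDict.
sample = """
language: rust
projects:
  - name: example-project
    git-repo: https://github.com/example/example-project
    git-tag: v1.2.3
    with-sources: true
"""
parsed = yaml.safe_load(sample)
print(parsed["projects"][0]["git-repo"])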