#!/usr/bin/env python3

"""
Experimental script for bulk generation of MaD models based on a list of projects.

Note: This file must be formatted using the Black Python formatter.
"""

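# Example invocations (paths are illustrative; the flags are defined by the
# argparse setup at the bottom of this file):
#
#   python3 bulk_generate_mad.py --config config.yml
#   python3 bulk_generate_mad.py --config config.yml --dca <experiment-name> --pat pat.txt
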
import pathlib
import subprocess
import sys
from typing import Required, TypedDict, List, Callable, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import argparse
import zipfile
import tarfile
import shutil


def missing_module(module_name: str) -> None:
    print(
        f"ERROR: {module_name} is not installed. Please install it with 'pip install {module_name}'."
    )
    sys.exit(1)


try:
    import yaml
except ImportError:
    missing_module("pyyaml")

try:
    import requests
except ImportError:
    missing_module("requests")

import generate_mad as mad

gitroot = (
    subprocess.check_output(["git", "rev-parse", "--show-toplevel"])
    .decode("utf-8")
    .strip()
)
build_dir = pathlib.Path(gitroot, "mad-generation-build")


# A project to generate models for
Project = TypedDict(
    "Project",
    {
        "name": Required[str],
        "git-repo": str,
        "git-tag": str,
        "with-sinks": bool,
        "with-sources": bool,
        "with-summaries": bool,
    },
    total=False,
)
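
# For illustration, a hypothetical `targets` entry in the YAML configuration
# that maps onto this TypedDict (the project name, URL, and tag are made up):
#
#   targets:
#     - name: example-project
#       git-repo: https://github.com/example/example-project.git
#       git-tag: v1.2.3
#       with-summaries: false

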
def should_generate_sinks(project: Project) -> bool:
    return project.get("with-sinks", True)


def should_generate_sources(project: Project) -> bool:
    return project.get("with-sources", True)


def should_generate_summaries(project: Project) -> bool:
    return project.get("with-summaries", True)


def clone_project(project: Project) -> pathlib.Path:
    """
    Shallow clone a project into the build directory.

    Args:
        project: A dictionary containing project information with 'name', 'git-repo', and optional 'git-tag' keys.

    Returns:
        The path to the cloned project directory.
    """
    name = project["name"]
    repo_url = project["git-repo"]
    git_tag = project.get("git-tag")

    # Determine target directory
    target_dir = build_dir / name

    # Clone only if directory doesn't already exist
    if not target_dir.exists():
        if git_tag:
            print(f"Cloning {name} from {repo_url} at tag {git_tag}")
        else:
            print(f"Cloning {name} from {repo_url}")

        subprocess.check_call(
            [
                "git",
                "clone",
                "--quiet",
                "--depth",
                "1",  # Shallow clone
                *(
                    ["--branch", git_tag] if git_tag else []
                ),  # Add branch if tag is provided
                repo_url,
                target_dir,
            ]
        )
        print(f"Completed cloning {name}")
    else:
        print(f"Skipping cloning {name} as it already exists at {target_dir}")

    return target_dir
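
# For a project with a tag, the clone in clone_project is equivalent to
# running, e.g. (repository URL and tag are illustrative):
#
#   git clone --quiet --depth 1 --branch v1.2.3 \
#       https://github.com/example/example-project.git mad-generation-build/example-project

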
def run_in_parallel[T, U](
    func: Callable[[T], U],
    items: List[T],
    *,
    on_error=lambda item, exc: None,
    error_summary=lambda failures: None,
    max_workers=8,
) -> List[Optional[U]]:
    if not items:
        return []
    max_workers = min(max_workers, len(items))
    results = [None for _ in range(len(items))]
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Start tasks and keep track of them
        futures = {
            executor.submit(func, item): index for index, item in enumerate(items)
        }
        # Process results as they complete
        for future in as_completed(futures):
            index = futures[future]
            try:
                results[index] = future.result()
            except Exception as e:
                on_error(items[index], e)
    failed = [item for item, result in zip(items, results) if result is None]
    if failed:
        error_summary(failed)
        sys.exit(1)
    return results
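
# Example usage of run_in_parallel (the callable and items are hypothetical):
# map a function over a list on a thread pool, exiting the script if any item
# fails:
#
#   results = run_in_parallel(
#       fetch_one,  # any Callable[[T], U]
#       items,
#       on_error=lambda item, exc: print(f"ERROR: {item} failed: {exc}"),
#       error_summary=lambda failures: print(f"{len(failures)} items failed"),
#   )

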
def clone_projects(projects: List[Project]) -> List[tuple[Project, pathlib.Path]]:
    """
    Clone all projects in parallel.

    Args:
        projects: List of projects to clone

    Returns:
        List of (project, project_dir) pairs in the same order as the input projects
    """
    start_time = time.time()
    dirs = run_in_parallel(
        clone_project,
        projects,
        on_error=lambda project, exc: print(
            f"ERROR: Failed to clone project {project['name']}: {exc}"
        ),
        error_summary=lambda failures: print(
            f"ERROR: Failed to clone {len(failures)} projects: {', '.join(p['name'] for p in failures)}"
        ),
    )
    clone_time = time.time() - start_time
    print(f"Cloning completed in {clone_time:.2f} seconds")
    return list(zip(projects, dirs))


def build_database(
    language: str, extractor_options, project: Project, project_dir: pathlib.Path
) -> pathlib.Path | None:
    """
    Build a CodeQL database for a project.

    Args:
        language: The language for which to build the database (e.g., "rust").
        extractor_options: Additional options for the extractor.
        project: A dictionary containing project information with 'name' and 'git-repo' keys.
        project_dir: Path to the project checkout to build the database from.

    Returns:
        The path to the created database directory, or None if the build failed.
    """
    name = project["name"]

    # Create database directory path
    database_dir = build_dir / f"{name}-db"

    # Only build the database if it doesn't already exist
    if not database_dir.exists():
        print(f"Building CodeQL database for {name}...")
        extractor_options = [option for x in extractor_options for option in ("-O", x)]
        try:
            subprocess.check_call(
                [
                    "codeql",
                    "database",
                    "create",
                    f"--language={language}",
                    f"--source-root={project_dir}",
                    "--overwrite",
                    *extractor_options,
                    "--",
                    database_dir,
                ]
            )
            print(f"Successfully created database at {database_dir}")
        except subprocess.CalledProcessError as e:
            print(f"Failed to create database for {name}: {e}")
            return None
    else:
        print(
            f"Skipping database creation for {name} as it already exists at {database_dir}"
        )

    return database_dir
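
# The `codeql database create` call in build_database expands to something
# like (paths and extractor options are illustrative):
#
#   codeql database create --language=rust \
#       --source-root=mad-generation-build/example-project --overwrite \
#       -O option1=value -- mad-generation-build/example-project-db

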
def generate_models(
    config, args, project: Project, database_dir: pathlib.Path
) -> None:
    """
    Generate models for a project.

    Args:
        config: Configuration dictionary containing project details and other settings.
        args: Command line arguments passed to this script.
        project: The project to generate models for.
        database_dir: Path to the CodeQL database.
    """
    name = project["name"]
    language = config["language"]

    generator = mad.Generator(language)
    generator.with_sinks = should_generate_sinks(project)
    generator.with_sources = should_generate_sources(project)
    generator.with_summaries = should_generate_summaries(project)
    generator.threads = args.codeql_threads
    generator.ram = args.codeql_ram
    if config.get("single-file", False):
        generator.single_file = name
    else:
        generator.folder = name
    generator.setenvironment(database=database_dir)
    generator.run()


def build_databases_from_projects(
    language: str, extractor_options, projects: List[Project]
) -> List[tuple[Project, pathlib.Path | None]]:
    """
    Build databases for all projects in parallel.

    Args:
        language: The language for which to build the databases (e.g., "rust").
        extractor_options: Additional options for the extractor.
        projects: List of projects to build databases for.

    Returns:
        List of (project, database_dir) pairs, where database_dir is None if the build failed.
    """
    # Clone projects in parallel
    print("=== Cloning projects ===")
    project_dirs = clone_projects(projects)

    # Build databases for all projects
    print("\n=== Building databases ===")
    database_results = [
        (
            project,
            build_database(language, extractor_options, project, project_dir),
        )
        for project, project_dir in project_dirs
    ]
    return database_results


def get_json_from_github(
    url: str, pat: str, extra_headers: dict[str, str] = {}
) -> dict:
    """
    Download a JSON file from GitHub using a personal access token (PAT).

    Args:
        url: The URL to download the JSON file from.
        pat: Personal Access Token for GitHub API authentication.
        extra_headers: Additional headers to include in the request.

    Returns:
        The JSON response as a dictionary.
    """
    headers = {"Authorization": f"token {pat}"} | extra_headers
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to download JSON: {response.status_code} {response.text}")
        sys.exit(1)
    else:
        return response.json()


def download_artifact(url: str, artifact_name: str, pat: str) -> pathlib.Path:
    """
    Download a GitHub Actions artifact from a given URL.

    Args:
        url: The URL to download the artifact from.
        artifact_name: The name of the artifact (used for naming the downloaded file).
        pat: Personal Access Token for GitHub API authentication.

    Returns:
        The path to the downloaded artifact file.
    """
    headers = {"Authorization": f"token {pat}", "Accept": "application/vnd.github+json"}
    response = requests.get(url, stream=True, headers=headers)
    zip_name = artifact_name + ".zip"
    if response.status_code != 200:
        print(f"Failed to download file. Status code: {response.status_code}")
        sys.exit(1)
    target_zip = build_dir / zip_name
    with open(target_zip, "wb") as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)
    print(f"Download complete: {target_zip}")
    return target_zip


def pretty_name_from_artifact_name(artifact_name: str) -> str:
    return artifact_name.split("___")[1]
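
# Based on the split above, artifact names are assumed to be of the form
# "<prefix>___<pretty-name>", e.g.:
#
#   pretty_name_from_artifact_name("some-prefix___example-project")
#   # -> "example-project"

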
def download_dca_databases(
    language: str,
    experiment_names: list[str],
    pat: str,
    projects: List[Project],
) -> List[tuple[Project, pathlib.Path | None]]:
    """
    Download databases from a DCA experiment.

    Args:
        language: The language of the databases to download (e.g., "rust").
        experiment_names: The names of the DCA experiments to download databases from.
        pat: Personal Access Token for GitHub API authentication.
        projects: List of projects to download databases for.

    Returns:
        List of (project, database_dir) pairs, where database_dir is None if the download failed.
    """
    print("\n=== Finding projects ===")
    project_map = {project["name"]: project for project in projects}
    analyzed_databases = {n: None for n in project_map}
    for experiment_name in experiment_names:
        response = get_json_from_github(
            f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
            pat,
        )
        targets = response["targets"]
        for data in targets.values():
            downloads = data["downloads"]
            analyzed_database = downloads["analyzed_database"]
            artifact_name = analyzed_database["artifact_name"]
            pretty_name = pretty_name_from_artifact_name(artifact_name)

            if pretty_name not in analyzed_databases:
                print(f"Skipping {pretty_name} as it is not in the list of projects")
                continue

            if analyzed_databases[pretty_name] is not None:
                print(
                    f"Skipping previous database {analyzed_databases[pretty_name]['artifact_name']} for {pretty_name}"
                )

            analyzed_databases[pretty_name] = analyzed_database

    not_found = [name for name, db in analyzed_databases.items() if db is None]
    if not_found:
        print(
            f"ERROR: The following projects were not found in the DCA experiments: {', '.join(not_found)}"
        )
        sys.exit(1)

    def download_and_decompress(analyzed_database: dict) -> pathlib.Path:
        artifact_name = analyzed_database["artifact_name"]
        repository = analyzed_database["repository"]
        run_id = analyzed_database["run_id"]
        print(f"=== Finding artifact: {artifact_name} ===")
        response = get_json_from_github(
            f"https://api.github.com/repos/{repository}/actions/runs/{run_id}/artifacts",
            pat,
            {"Accept": "application/vnd.github+json"},
        )
        artifacts = response["artifacts"]
        artifact_map = {artifact["name"]: artifact for artifact in artifacts}
        print(f"=== Downloading artifact: {artifact_name} ===")
        archive_download_url = artifact_map[artifact_name]["archive_download_url"]
        artifact_zip_location = download_artifact(
            archive_download_url, artifact_name, pat
        )
        print(f"=== Decompressing artifact: {artifact_name} ===")
        # The database is in a zip file, which contains a tar.gz file with the DB
        # First we open the zip file
        with zipfile.ZipFile(artifact_zip_location, "r") as zip_ref:
            artifact_unzipped_location = build_dir / artifact_name
            # clean up any remnants of previous runs
            shutil.rmtree(artifact_unzipped_location, ignore_errors=True)
            # And then we extract it to build_dir/artifact_name
            zip_ref.extractall(artifact_unzipped_location)
            # And then we extract the language tar.gz file inside it
            artifact_tar_location = artifact_unzipped_location / f"{language}.tar.gz"
            with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
                # And we just untar it to the same directory as the zip file
                tar_ref.extractall(artifact_unzipped_location)
        ret = artifact_unzipped_location / language
        print(f"Decompression complete: {ret}")
        return ret

    results = run_in_parallel(
        download_and_decompress,
        list(analyzed_databases.values()),
        on_error=lambda db, exc: print(
            f"ERROR: Failed to download and decompress {db['artifact_name']}: {exc}"
        ),
        error_summary=lambda failures: print(
            f"ERROR: Failed to download {len(failures)} databases: {', '.join(db['artifact_name'] for db in failures)}"
        ),
    )

    print(f"\n=== Fetched {len(results)} databases ===")

    return [(project_map[n], r) for n, r in zip(analyzed_databases, results)]
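
# For reference, the downloads.json report consumed by download_dca_databases
# is assumed to contain at least the following structure, based on the keys
# accessed above:
#
#   {
#     "targets": {
#       "<target>": {
#         "downloads": {
#           "analyzed_database": {
#             "artifact_name": "...",
#             "repository": "...",
#             "run_id": "..."
#           }
#         }
#       }
#     }
#   }

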
def clean_up_mad_destination_for_project(config, name: str):
    target = pathlib.Path(config["destination"], name)
    if config.get("single-file", False):
        target = target.with_suffix(".model.yml")
        if target.exists():
            print(f"Deleting existing MaD file at {target}")
            target.unlink()
    elif target.exists():
        print(f"Deleting existing MaD directory at {target}")
        shutil.rmtree(target, ignore_errors=True)


def get_strategy(config) -> str:
    return config["strategy"].lower()
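
# The strategy is read from the YAML configuration; the match statement in
# main() below handles, e.g.:
#
#   strategy: repo   # clone the projects and build databases locally
#   strategy: dca    # download prebuilt databases from a DCA experiment

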
def main(config, args) -> None:
    """
    Main function to handle the bulk generation of MaD models.

    Args:
        config: Configuration dictionary containing project details and other settings.
        args: Command line arguments passed to this script.
    """

    projects = config["targets"]
    if "language" not in config:
        print("ERROR: 'language' key is missing in the configuration file.")
        sys.exit(1)
    language = config["language"]

    # Create build directory if it doesn't exist
    build_dir.mkdir(parents=True, exist_ok=True)

    database_results = []
    match get_strategy(config):
        case "repo":
            extractor_options = config.get("extractor_options", [])
            database_results = build_databases_from_projects(
                language,
                extractor_options,
                projects,
            )
        case "dca":
            experiment_names = args.dca
            if experiment_names is None:
                print("ERROR: --dca argument is required for DCA strategy")
                sys.exit(1)

            if args.pat is None:
                print("ERROR: --pat argument is required for DCA strategy")
                sys.exit(1)
            if not args.pat.exists():
                print(
                    f"ERROR: Personal Access Token file '{args.pat}' does not exist."
                )
                sys.exit(1)
            with open(args.pat, "r") as f:
                pat = f.read().strip()
            database_results = download_dca_databases(
                language,
                experiment_names,
                pat,
                projects,
            )

    # Generate models for all projects
    print("\n=== Generating models ===")

    failed_builds = [
        project["name"] for project, db_dir in database_results if db_dir is None
    ]
    if failed_builds:
        print(
            f"ERROR: {len(failed_builds)} database builds failed: {', '.join(failed_builds)}"
        )
        sys.exit(1)

    # clean up existing MaD data for the projects
    for project, _ in database_results:
        clean_up_mad_destination_for_project(config, project["name"])

    for project, database_dir in database_results:
        if database_dir is not None:
            generate_models(config, args, project, database_dir)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config",
        type=pathlib.Path,
        help="Path to the configuration file.",
        required=True,
    )
    parser.add_argument(
        "--dca",
        type=str,
        help="Name of a DCA run that built all the projects. Can be repeated, with sources taken from all provided runs, "
        "the last provided ones having priority",
        action="append",
    )
    parser.add_argument(
        "--pat",
        type=pathlib.Path,
        help="Path to a file containing the PAT token required to grab DCA databases (the same as the one you use for DCA)",
    )
    parser.add_argument(
        "--codeql-ram",
        type=int,
        help="What `--ram` value to pass to `codeql` while generating models (by default 2048 MB per thread)",
        default=None,
    )
    parser.add_argument(
        "--codeql-threads",
        type=int,
        help="What `--threads` value to pass to `codeql` (default %(default)s)",
        default=0,
    )
    args = parser.parse_args()

    # Load config file
    config = {}
    if not args.config.exists():
        print(f"ERROR: Config file '{args.config}' does not exist.")
        sys.exit(1)
    try:
        with open(args.config, "r") as f:
            config = yaml.safe_load(f)
    except yaml.YAMLError as e:
        print(f"ERROR: Failed to parse YAML file {args.config}: {e}")
        sys.exit(1)

    main(config, args)