#!/usr/bin/env python3

"""
Experimental script for bulk generation of MaD models based on a list of projects.

Note: This file must be formatted using the Black Python formatter.
"""

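# Example invocations (paths are illustrative; the flags are defined by the
# argparse setup at the bottom of this file):
#
#   python3 bulk_generate_mad.py --config config.yml
#   python3 bulk_generate_mad.py --config config.yml --dca <experiment-name> --pat pat.txt
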
import pathlib
import subprocess
import sys
from typing import Required, TypedDict, List, Callable, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import argparse
import zipfile
import tarfile
import shutil


def missing_module(module_name: str) -> None:
    print(
        f"ERROR: {module_name} is not installed. Please install it with 'pip install {module_name}'."
    )
    sys.exit(1)


try:
    import yaml
except ImportError:
    missing_module("pyyaml")

try:
    import requests
except ImportError:
    missing_module("requests")

import generate_mad as mad

gitroot = (
    subprocess.check_output(["git", "rev-parse", "--show-toplevel"])
    .decode("utf-8")
    .strip()
)
build_dir = pathlib.Path(gitroot, "mad-generation-build")


# A project to generate models for
Project = TypedDict(
    "Project",
    {
        "name": Required[str],
        "git-repo": str,
        "git-tag": str,
        "with-sinks": bool,
        "with-sources": bool,
        "with-summaries": bool,
    },
    total=False,
)
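
# For illustration, a hypothetical `targets` entry in the YAML configuration
# that maps onto this TypedDict (the project name, URL, and tag are made up):
#
#   targets:
#     - name: example-project
#       git-repo: https://github.com/example/example-project.git
#       git-tag: v1.2.3
#       with-summaries: false

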
def should_generate_sinks(project: Project) -> bool:
    return project.get("with-sinks", True)


def should_generate_sources(project: Project) -> bool:
    return project.get("with-sources", True)


def should_generate_summaries(project: Project) -> bool:
    return project.get("with-summaries", True)


def clone_project(project: Project) -> pathlib.Path:
    """
    Shallow clone a project into the build directory.

    Args:
        project: A dictionary containing project information with 'name', 'git-repo', and optional 'git-tag' keys.

    Returns:
        The path to the cloned project directory.
    """
    name = project["name"]
    repo_url = project["git-repo"]
    git_tag = project.get("git-tag")

    # Determine target directory
    target_dir = build_dir / name

    # Clone only if directory doesn't already exist
    if not target_dir.exists():
        if git_tag:
            print(f"Cloning {name} from {repo_url} at tag {git_tag}")
        else:
            print(f"Cloning {name} from {repo_url}")

        subprocess.check_call(
            [
                "git",
                "clone",
                "--quiet",
                "--depth",
                "1",  # Shallow clone
                *(
                    ["--branch", git_tag] if git_tag else []
                ),  # Add branch if tag is provided
                repo_url,
                target_dir,
            ]
        )
        print(f"Completed cloning {name}")
    else:
        print(f"Skipping cloning {name} as it already exists at {target_dir}")

    return target_dir
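
# For a project with a tag, the clone in clone_project is equivalent to
# running, e.g. (repository URL and tag are illustrative):
#
#   git clone --quiet --depth 1 --branch v1.2.3 \
#       https://github.com/example/example-project.git mad-generation-build/example-project

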
def run_in_parallel[T, U](
    func: Callable[[T], U],
    items: List[T],
    *,
    on_error=lambda item, exc: None,
    error_summary=lambda failures: None,
    max_workers=8,
) -> List[Optional[U]]:
    if not items:
        return []
    max_workers = min(max_workers, len(items))
    results = [None for _ in range(len(items))]
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Start tasks and keep track of them
        futures = {
            executor.submit(func, item): index for index, item in enumerate(items)
        }
        # Process results as they complete
        for future in as_completed(futures):
            index = futures[future]
            try:
                results[index] = future.result()
            except Exception as e:
                on_error(items[index], e)
    failed = [item for item, result in zip(items, results) if result is None]
    if failed:
        error_summary(failed)
        sys.exit(1)
    return results
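
# Example usage of run_in_parallel (the callable and items are hypothetical):
# map a function over a list on a thread pool, exiting the script if any item
# fails:
#
#   results = run_in_parallel(
#       fetch_one,  # any Callable[[T], U]
#       items,
#       on_error=lambda item, exc: print(f"ERROR: {item} failed: {exc}"),
#       error_summary=lambda failures: print(f"{len(failures)} items failed"),
#   )

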
def clone_projects(projects: List[Project]) -> List[tuple[Project, pathlib.Path]]:
    """
    Clone all projects in parallel.

    Args:
        projects: List of projects to clone

    Returns:
        List of (project, project_dir) pairs in the same order as the input projects
    """
    start_time = time.time()
    dirs = run_in_parallel(
        clone_project,
        projects,
        on_error=lambda project, exc: print(
            f"ERROR: Failed to clone project {project['name']}: {exc}"
        ),
        error_summary=lambda failures: print(
            f"ERROR: Failed to clone {len(failures)} projects: {', '.join(p['name'] for p in failures)}"
        ),
    )
    clone_time = time.time() - start_time
    print(f"Cloning completed in {clone_time:.2f} seconds")
    return list(zip(projects, dirs))


def build_database(
    language: str, extractor_options, project: Project, project_dir: pathlib.Path
) -> pathlib.Path | None:
    """
    Build a CodeQL database for a project.

    Args:
        language: The language for which to build the database (e.g., "rust").
        extractor_options: Additional options for the extractor.
        project: A dictionary containing project information with 'name' and 'git-repo' keys.
        project_dir: Path to the project checkout to build the database from.

    Returns:
        The path to the created database directory, or None if the build failed.
    """
    name = project["name"]

    # Create database directory path
    database_dir = build_dir / f"{name}-db"

    # Only build the database if it doesn't already exist
    if not database_dir.exists():
        print(f"Building CodeQL database for {name}...")
        extractor_options = [option for x in extractor_options for option in ("-O", x)]
        try:
            subprocess.check_call(
                [
                    "codeql",
                    "database",
                    "create",
                    f"--language={language}",
                    f"--source-root={project_dir}",
                    "--overwrite",
                    *extractor_options,
                    "--",
                    database_dir,
                ]
            )
            print(f"Successfully created database at {database_dir}")
        except subprocess.CalledProcessError as e:
            print(f"Failed to create database for {name}: {e}")
            return None
    else:
        print(
            f"Skipping database creation for {name} as it already exists at {database_dir}"
        )

    return database_dir
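
# The `codeql database create` call in build_database expands to something
# like (paths and extractor options are illustrative):
#
#   codeql database create --language=rust \
#       --source-root=mad-generation-build/example-project --overwrite \
#       -O option1=value -- mad-generation-build/example-project-db

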
def generate_models(
    config, args, project: Project, database_dir: pathlib.Path
) -> None:
    """
    Generate models for a project.

    Args:
        config: Configuration dictionary containing project details and other settings.
        args: Command line arguments passed to this script.
        project: The project to generate models for.
        database_dir: Path to the CodeQL database.
    """
    name = project["name"]
    language = config["language"]

    generator = mad.Generator(language)
    generator.with_sinks = should_generate_sinks(project)
    generator.with_sources = should_generate_sources(project)
    generator.with_summaries = should_generate_summaries(project)
    generator.threads = args.codeql_threads
    generator.ram = args.codeql_ram
    if config.get("single-file", False):
        generator.single_file = name
    else:
        generator.folder = name
    generator.setenvironment(database=database_dir)
    generator.run()


def build_databases_from_projects(
    language: str, extractor_options, projects: List[Project]
) -> List[tuple[Project, pathlib.Path | None]]:
    """
    Build databases for all projects in parallel.

    Args:
        language: The language for which to build the databases (e.g., "rust").
        extractor_options: Additional options for the extractor.
        projects: List of projects to build databases for.

    Returns:
        List of (project, database_dir) pairs, where database_dir is None if the build failed.
    """
    # Clone projects in parallel
    print("=== Cloning projects ===")
    project_dirs = clone_projects(projects)

    # Build databases for all projects
    print("\n=== Building databases ===")
    database_results = [
        (
            project,
            build_database(language, extractor_options, project, project_dir),
        )
        for project, project_dir in project_dirs
    ]
    return database_results


def get_json_from_github(
    url: str, pat: str, extra_headers: dict[str, str] = {}
) -> dict:
    """
    Download a JSON file from GitHub using a personal access token (PAT).

    Args:
        url: The URL to download the JSON file from.
        pat: Personal Access Token for GitHub API authentication.
        extra_headers: Additional headers to include in the request.

    Returns:
        The JSON response as a dictionary.
    """
    headers = {"Authorization": f"token {pat}"} | extra_headers
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to download JSON: {response.status_code} {response.text}")
        sys.exit(1)
    else:
        return response.json()


def download_artifact(url: str, artifact_name: str, pat: str) -> pathlib.Path:
    """
    Download a GitHub Actions artifact from a given URL.

    Args:
        url: The URL to download the artifact from.
        artifact_name: The name of the artifact (used for naming the downloaded file).
        pat: Personal Access Token for GitHub API authentication.

    Returns:
        The path to the downloaded artifact file.
    """
    headers = {"Authorization": f"token {pat}", "Accept": "application/vnd.github+json"}
    response = requests.get(url, stream=True, headers=headers)
    zip_name = artifact_name + ".zip"
    if response.status_code != 200:
        print(f"Failed to download file. Status code: {response.status_code}")
        sys.exit(1)
    target_zip = build_dir / zip_name
    with open(target_zip, "wb") as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)
    print(f"Download complete: {target_zip}")
    return target_zip


def pretty_name_from_artifact_name(artifact_name: str) -> str:
    return artifact_name.split("___")[1]
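
# Based on the split above, artifact names are assumed to be of the form
# "<prefix>___<pretty-name>", e.g.:
#
#   pretty_name_from_artifact_name("some-prefix___example-project")
#   # -> "example-project"

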
def download_dca_databases(
    language: str,
    experiment_names: list[str],
    pat: str,
    projects: List[Project],
) -> List[tuple[Project, pathlib.Path | None]]:
    """
    Download databases from a DCA experiment.

    Args:
        language: The language of the databases to download (e.g., "rust").
        experiment_names: The names of the DCA experiments to download databases from.
        pat: Personal Access Token for GitHub API authentication.
        projects: List of projects to download databases for.

    Returns:
        List of (project, database_dir) pairs, where database_dir is None if the download failed.
    """
    print("\n=== Finding projects ===")
    project_map = {project["name"]: project for project in projects}
    analyzed_databases = {n: None for n in project_map}
    for experiment_name in experiment_names:
        response = get_json_from_github(
            f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
            pat,
        )
        targets = response["targets"]
        for data in targets.values():
            downloads = data["downloads"]
            analyzed_database = downloads["analyzed_database"]
            artifact_name = analyzed_database["artifact_name"]
            pretty_name = pretty_name_from_artifact_name(artifact_name)

            if pretty_name not in analyzed_databases:
                print(f"Skipping {pretty_name} as it is not in the list of projects")
                continue

            if analyzed_databases[pretty_name] is not None:
                print(
                    f"Skipping previous database {analyzed_databases[pretty_name]['artifact_name']} for {pretty_name}"
                )

            analyzed_databases[pretty_name] = analyzed_database

    not_found = [name for name, db in analyzed_databases.items() if db is None]
    if not_found:
        print(
            f"ERROR: The following projects were not found in the DCA experiments: {', '.join(not_found)}"
        )
        sys.exit(1)

    def download_and_decompress(analyzed_database: dict) -> pathlib.Path:
        artifact_name = analyzed_database["artifact_name"]
        repository = analyzed_database["repository"]
        run_id = analyzed_database["run_id"]
        print(f"=== Finding artifact: {artifact_name} ===")
        response = get_json_from_github(
            f"https://api.github.com/repos/{repository}/actions/runs/{run_id}/artifacts",
            pat,
            {"Accept": "application/vnd.github+json"},
        )
        artifacts = response["artifacts"]
        artifact_map = {artifact["name"]: artifact for artifact in artifacts}
        print(f"=== Downloading artifact: {artifact_name} ===")
        archive_download_url = artifact_map[artifact_name]["archive_download_url"]
        artifact_zip_location = download_artifact(
            archive_download_url, artifact_name, pat
        )
        print(f"=== Decompressing artifact: {artifact_name} ===")
        # The database is in a zip file, which contains a tar.gz file with the DB
        # First we open the zip file
        with zipfile.ZipFile(artifact_zip_location, "r") as zip_ref:
            artifact_unzipped_location = build_dir / artifact_name
            # clean up any remnants of previous runs
            shutil.rmtree(artifact_unzipped_location, ignore_errors=True)
            # And then we extract it to build_dir/artifact_name
            zip_ref.extractall(artifact_unzipped_location)
            # And then we extract the language tar.gz file inside it
            artifact_tar_location = artifact_unzipped_location / f"{language}.tar.gz"
            with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
                # And we just untar it to the same directory as the zip file
                tar_ref.extractall(artifact_unzipped_location)
        ret = artifact_unzipped_location / language
        print(f"Decompression complete: {ret}")
        return ret

    results = run_in_parallel(
        download_and_decompress,
        list(analyzed_databases.values()),
        on_error=lambda db, exc: print(
            f"ERROR: Failed to download and decompress {db['artifact_name']}: {exc}"
        ),
        error_summary=lambda failures: print(
            f"ERROR: Failed to download {len(failures)} databases: {', '.join(db['artifact_name'] for db in failures)}"
        ),
    )

    print(f"\n=== Fetched {len(results)} databases ===")

    return [(project_map[n], r) for n, r in zip(analyzed_databases, results)]
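
# For reference, the downloads.json report consumed by download_dca_databases
# is assumed to contain at least the following structure, based on the keys
# accessed above:
#
#   {
#     "targets": {
#       "<target>": {
#         "downloads": {
#           "analyzed_database": {
#             "artifact_name": "...",
#             "repository": "...",
#             "run_id": "..."
#           }
#         }
#       }
#     }
#   }

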
def clean_up_mad_destination_for_project(config, name: str):
    target = pathlib.Path(config["destination"], name)
    if config.get("single-file", False):
        target = target.with_suffix(".model.yml")
        if target.exists():
            print(f"Deleting existing MaD file at {target}")
            target.unlink()
    elif target.exists():
        print(f"Deleting existing MaD directory at {target}")
        shutil.rmtree(target, ignore_errors=True)


def get_strategy(config) -> str:
    return config["strategy"].lower()
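
# The strategy is read from the YAML configuration; the match statement in
# main() below handles, e.g.:
#
#   strategy: repo   # clone the projects and build databases locally
#   strategy: dca    # download prebuilt databases from a DCA experiment

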
def main(config, args) -> None:
    """
    Main function to handle the bulk generation of MaD models.

    Args:
        config: Configuration dictionary containing project details and other settings.
        args: Command line arguments passed to this script.
    """

    projects = config["targets"]
    if "language" not in config:
        print("ERROR: 'language' key is missing in the configuration file.")
        sys.exit(1)
    language = config["language"]

    # Create build directory if it doesn't exist
    build_dir.mkdir(parents=True, exist_ok=True)

    database_results = []
    match get_strategy(config):
        case "repo":
            extractor_options = config.get("extractor_options", [])
            database_results = build_databases_from_projects(
                language,
                extractor_options,
                projects,
            )
        case "dca":
            experiment_names = args.dca
            if experiment_names is None:
                print("ERROR: --dca argument is required for DCA strategy")
                sys.exit(1)

            if args.pat is None:
                print("ERROR: --pat argument is required for DCA strategy")
                sys.exit(1)
            if not args.pat.exists():
                print(
                    f"ERROR: Personal Access Token file '{args.pat}' does not exist."
                )
                sys.exit(1)
            with open(args.pat, "r") as f:
                pat = f.read().strip()
            database_results = download_dca_databases(
                language,
                experiment_names,
                pat,
                projects,
            )

    # Generate models for all projects
    print("\n=== Generating models ===")

    failed_builds = [
        project["name"] for project, db_dir in database_results if db_dir is None
    ]
    if failed_builds:
        print(
            f"ERROR: {len(failed_builds)} database builds failed: {', '.join(failed_builds)}"
        )
        sys.exit(1)

    # clean up existing MaD data for the projects
    for project, _ in database_results:
        clean_up_mad_destination_for_project(config, project["name"])

    for project, database_dir in database_results:
        if database_dir is not None:
            generate_models(config, args, project, database_dir)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config",
        type=pathlib.Path,
        help="Path to the configuration file.",
        required=True,
    )
    parser.add_argument(
        "--dca",
        type=str,
        help="Name of a DCA run that built all the projects. Can be repeated, with sources taken from all provided runs, "
        "the last provided ones having priority",
        action="append",
    )
    parser.add_argument(
        "--pat",
        type=pathlib.Path,
        help="Path to a file containing the PAT token required to grab DCA databases (the same as the one you use for DCA)",
    )
    parser.add_argument(
        "--codeql-ram",
        type=int,
        help="What `--ram` value to pass to `codeql` while generating models (by default 2048 MB per thread)",
        default=None,
    )
    parser.add_argument(
        "--codeql-threads",
        type=int,
        help="What `--threads` value to pass to `codeql` (default %(default)s)",
        default=0,
    )
    args = parser.parse_args()

    # Load config file
    config = {}
    if not args.config.exists():
        print(f"ERROR: Config file '{args.config}' does not exist.")
        sys.exit(1)
    try:
        with open(args.config, "r") as f:
            config = yaml.safe_load(f)
    except yaml.YAMLError as e:
        print(f"ERROR: Failed to parse YAML file {args.config}: {e}")
        sys.exit(1)

    main(config, args)