mirror of
https://github.com/github/codeql.git
synced 2025-12-17 01:03:14 +01:00
MaD generator: tweak the scripts
* fix a bug where the order of model generation was determined by the order in the `downloads.json` file of the experiment rather than the order in the config file
* allow configuring `--ram` and `--threads` in the MaD generator scripts
* by default, pass no `--ram` and use `--threads=0` in the bulk generator (the defaults of the single generator are left unchanged)
* allow passing `--dca` multiple times, with DBs from experiments listed later taking priority over earlier ones. This makes it possible to run a subset of the sources in a "fixup" experiment and use it to "patch" a previous run without rerunning everything.
This commit is contained in:
@@ -225,7 +225,7 @@ def build_database(
|
|||||||
return database_dir
|
return database_dir
|
||||||
|
|
||||||
|
|
||||||
def generate_models(config, project: Project, database_dir: str) -> None:
|
def generate_models(config, args, project: Project, database_dir: str) -> None:
|
||||||
"""
|
"""
|
||||||
Generate models for a project.
|
Generate models for a project.
|
||||||
|
|
||||||
@@ -243,6 +243,8 @@ def generate_models(config, project: Project, database_dir: str) -> None:
|
|||||||
generator.generateSources = should_generate_sources(project)
|
generator.generateSources = should_generate_sources(project)
|
||||||
generator.generateSummaries = should_generate_summaries(project)
|
generator.generateSummaries = should_generate_summaries(project)
|
||||||
generator.setenvironment(database=database_dir, folder=name)
|
generator.setenvironment(database=database_dir, folder=name)
|
||||||
|
generator.threads = args.codeql_threads
|
||||||
|
generator.ram = args.codeql_ram
|
||||||
generator.run()
|
generator.run()
|
||||||
|
|
||||||
|
|
||||||
@@ -333,43 +335,44 @@ def pretty_name_from_artifact_name(artifact_name: str) -> str:
|
|||||||
|
|
||||||
def download_dca_databases(
|
def download_dca_databases(
|
||||||
language: str,
|
language: str,
|
||||||
experiment_name: str,
|
experiment_names: list[str],
|
||||||
pat: str,
|
pat: str,
|
||||||
projects: List[Project],
|
projects: List[Project],
|
||||||
) -> List[tuple[Project, str | None]]:
|
) -> List[tuple[Project, str | None]]:
|
||||||
"""
|
"""
|
||||||
Download databases from a DCA experiment.
|
Download databases from a DCA experiment.
|
||||||
Args:
|
Args:
|
||||||
experiment_name: The name of the DCA experiment to download databases from.
|
experiment_names: The names of the DCA experiments to download databases from.
|
||||||
pat: Personal Access Token for GitHub API authentication.
|
pat: Personal Access Token for GitHub API authentication.
|
||||||
projects: List of projects to download databases for.
|
projects: List of projects to download databases for.
|
||||||
Returns:
|
Returns:
|
||||||
List of (project_name, database_dir) pairs, where database_dir is None if the download failed.
|
List of (project_name, database_dir) pairs, where database_dir is None if the download failed.
|
||||||
"""
|
"""
|
||||||
print("\n=== Finding projects ===")
|
print("\n=== Finding projects ===")
|
||||||
response = get_json_from_github(
|
|
||||||
f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
|
|
||||||
pat,
|
|
||||||
)
|
|
||||||
targets = response["targets"]
|
|
||||||
project_map = {project["name"]: project for project in projects}
|
project_map = {project["name"]: project for project in projects}
|
||||||
analyzed_databases = {}
|
analyzed_databases = {}
|
||||||
for data in targets.values():
|
for experiment_name in experiment_names:
|
||||||
downloads = data["downloads"]
|
response = get_json_from_github(
|
||||||
analyzed_database = downloads["analyzed_database"]
|
f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
|
||||||
artifact_name = analyzed_database["artifact_name"]
|
pat,
|
||||||
pretty_name = pretty_name_from_artifact_name(artifact_name)
|
)
|
||||||
|
targets = response["targets"]
|
||||||
|
for data in targets.values():
|
||||||
|
downloads = data["downloads"]
|
||||||
|
analyzed_database = downloads["analyzed_database"]
|
||||||
|
artifact_name = analyzed_database["artifact_name"]
|
||||||
|
pretty_name = pretty_name_from_artifact_name(artifact_name)
|
||||||
|
|
||||||
if not pretty_name in project_map:
|
if not pretty_name in project_map:
|
||||||
print(f"Skipping {pretty_name} as it is not in the list of projects")
|
print(f"Skipping {pretty_name} as it is not in the list of projects")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if pretty_name in analyzed_databases:
|
if pretty_name in analyzed_databases:
|
||||||
print(
|
print(
|
||||||
f"Skipping previous database {analyzed_databases[pretty_name]['artifact_name']} for {pretty_name}"
|
f"Skipping previous database {analyzed_databases[pretty_name]['artifact_name']} for {pretty_name}"
|
||||||
)
|
)
|
||||||
|
|
||||||
analyzed_databases[pretty_name] = analyzed_database
|
analyzed_databases[pretty_name] = analyzed_database
|
||||||
|
|
||||||
def download_and_decompress(analyzed_database: dict) -> str:
|
def download_and_decompress(analyzed_database: dict) -> str:
|
||||||
artifact_name = analyzed_database["artifact_name"]
|
artifact_name = analyzed_database["artifact_name"]
|
||||||
@@ -450,23 +453,6 @@ def main(config, args) -> None:
|
|||||||
if not os.path.exists(build_dir):
|
if not os.path.exists(build_dir):
|
||||||
os.makedirs(build_dir)
|
os.makedirs(build_dir)
|
||||||
|
|
||||||
# Check if any of the MaD directories contain working directory changes in git
|
|
||||||
for project in projects:
|
|
||||||
mad_dir = get_mad_destination_for_project(config, project["name"])
|
|
||||||
if os.path.exists(mad_dir):
|
|
||||||
git_status_output = subprocess.check_output(
|
|
||||||
["git", "status", "-s", mad_dir], text=True
|
|
||||||
).strip()
|
|
||||||
if git_status_output:
|
|
||||||
print(
|
|
||||||
f"""ERROR: Working directory changes detected in {mad_dir}.
|
|
||||||
|
|
||||||
Before generating new models, the existing models are deleted.
|
|
||||||
|
|
||||||
To avoid loss of data, please commit your changes."""
|
|
||||||
)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
database_results = []
|
database_results = []
|
||||||
match get_strategy(config):
|
match get_strategy(config):
|
||||||
case "repo":
|
case "repo":
|
||||||
@@ -477,8 +463,8 @@ To avoid loss of data, please commit your changes."""
|
|||||||
projects,
|
projects,
|
||||||
)
|
)
|
||||||
case "dca":
|
case "dca":
|
||||||
experiment_name = args.dca
|
experiment_names = args.dca
|
||||||
if experiment_name is None:
|
if experiment_names is None:
|
||||||
print("ERROR: --dca argument is required for DCA strategy")
|
print("ERROR: --dca argument is required for DCA strategy")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
@@ -492,7 +478,7 @@ To avoid loss of data, please commit your changes."""
|
|||||||
pat = f.read().strip()
|
pat = f.read().strip()
|
||||||
database_results = download_dca_databases(
|
database_results = download_dca_databases(
|
||||||
language,
|
language,
|
||||||
experiment_name,
|
experiment_names,
|
||||||
pat,
|
pat,
|
||||||
projects,
|
projects,
|
||||||
)
|
)
|
||||||
@@ -518,7 +504,7 @@ To avoid loss of data, please commit your changes."""
|
|||||||
|
|
||||||
for project, database_dir in database_results:
|
for project, database_dir in database_results:
|
||||||
if database_dir is not None:
|
if database_dir is not None:
|
||||||
generate_models(config, project, database_dir)
|
generate_models(config, args, project, database_dir)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@@ -529,14 +515,26 @@ if __name__ == "__main__":
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--dca",
|
"--dca",
|
||||||
type=str,
|
type=str,
|
||||||
help="Name of a DCA run that built all the projects",
|
help="Name of a DCA run that built all the projects. Can be repeated, with sources taken from all provided runs, "
|
||||||
required=False,
|
"the last provided ones having priority",
|
||||||
|
action="append",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--pat",
|
"--pat",
|
||||||
type=str,
|
type=str,
|
||||||
help="Path to a file containing the PAT token required to grab DCA databases (the same as the one you use for DCA)",
|
help="Path to a file containing the PAT token required to grab DCA databases (the same as the one you use for DCA)",
|
||||||
required=False,
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--codeql-ram",
|
||||||
|
type=int,
|
||||||
|
help="What `--ram` value to pass to `codeql` while generating models (by default the flag is not passed)",
|
||||||
|
default=None,
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--codeql-threads",
|
||||||
|
type=int,
|
||||||
|
help="What `--threads` value to pass to `codeql` (default %(default)s)",
|
||||||
|
default=0,
|
||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
|||||||
@@ -62,6 +62,8 @@ class Generator:
|
|||||||
self.generateTypeBasedSummaries = False
|
self.generateTypeBasedSummaries = False
|
||||||
self.dryRun = False
|
self.dryRun = False
|
||||||
self.dirname = "modelgenerator"
|
self.dirname = "modelgenerator"
|
||||||
|
self.ram = 2**15
|
||||||
|
self.threads = 8
|
||||||
|
|
||||||
|
|
||||||
def setenvironment(self, database, folder):
|
def setenvironment(self, database, folder):
|
||||||
@@ -138,8 +140,12 @@ class Generator:
|
|||||||
queryFile = os.path.join(self.codeQlRoot, f"{self.language}/ql/src/utils/{self.dirname}", query)
|
queryFile = os.path.join(self.codeQlRoot, f"{self.language}/ql/src/utils/{self.dirname}", query)
|
||||||
resultBqrs = os.path.join(self.workDir, "out.bqrs")
|
resultBqrs = os.path.join(self.workDir, "out.bqrs")
|
||||||
|
|
||||||
helpers.run_cmd(['codeql', 'query', 'run', queryFile, '--database',
|
cmd = ['codeql', 'query', 'run', queryFile, '--database', self.database, '--output', resultBqrs]
|
||||||
self.database, '--output', resultBqrs, '--threads', '8', '--ram', '32768'], "Failed to generate " + query)
|
if self.threads is not None:
|
||||||
|
cmd += ["--threads", str(self.threads)]
|
||||||
|
if self.ram is not None:
|
||||||
|
cmd += ["--ram", str(self.ram)]
|
||||||
|
helpers.run_cmd(cmd, "Failed to generate " + query)
|
||||||
|
|
||||||
return helpers.readData(self.workDir, resultBqrs)
|
return helpers.readData(self.workDir, resultBqrs)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user