mirror of
https://github.com/github/codeql.git
synced 2025-12-16 16:53:25 +01:00
Merge branch 'main' into redsun82/kotlin
This commit is contained in:
@@ -42,7 +42,7 @@ endmacro()
|
||||
macro(generate_and_include)
|
||||
file(REMOVE "${BAZEL_WORKSPACE}/.bazel-cmake/BUILD.bazel")
|
||||
# use aquery to only get targets compatible with the current platform
|
||||
bazel_even_if_failing(aquery "kind(\"cc_test|cc_binary\", ${ARGN})" ${BAZEL_BUILD_OPTIONS} --output=jsonproto OUTPUT_VARIABLE BAZEL_AQUERY_RESULT)
|
||||
bazel_even_if_failing(aquery "kind(\"cc_test|cc_binary\",${ARGN})" ${BAZEL_BUILD_OPTIONS} --output=jsonproto OUTPUT_VARIABLE BAZEL_AQUERY_RESULT)
|
||||
string(JSON BAZEL_JSON_TARGETS GET "${BAZEL_AQUERY_RESULT}" targets)
|
||||
string(JSON LAST_IDX LENGTH "${BAZEL_JSON_TARGETS}")
|
||||
math(EXPR LAST_IDX "${LAST_IDX} - 1")
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
load("@rules_dotnet//dotnet:defs.bzl", "csharp_binary", "csharp_library", "csharp_test", "publish_binary")
|
||||
load("@rules_pkg//pkg:mappings.bzl", "strip_prefix")
|
||||
load("@semmle_code//:dist.bzl", "pack_zip")
|
||||
load("//:defs.bzl", "codeql_platform")
|
||||
load("//misc/bazel:pkg.bzl", "codeql_pkg_files")
|
||||
|
||||
TARGET_FRAMEWORK = "net8.0"
|
||||
|
||||
@@ -33,7 +32,7 @@ def codeql_xunit_test(name, **kwargs):
|
||||
**kwargs
|
||||
)
|
||||
|
||||
def codeql_csharp_binary(name, language_prefix = "csharp", **kwargs):
|
||||
def codeql_csharp_binary(name, **kwargs):
|
||||
kwargs.setdefault("nullable", "enable")
|
||||
kwargs.setdefault("target_frameworks", [TARGET_FRAMEWORK])
|
||||
|
||||
@@ -60,10 +59,10 @@ def codeql_csharp_binary(name, language_prefix = "csharp", **kwargs):
|
||||
),
|
||||
)
|
||||
|
||||
pack_zip(
|
||||
codeql_pkg_files(
|
||||
name = name,
|
||||
srcs = [publish_binary_target],
|
||||
prefix = language_prefix + "/tools/" + codeql_platform,
|
||||
exes = [publish_binary_target],
|
||||
prefix = "tools/{CODEQL_PLATFORM}",
|
||||
strip_prefix = strip_prefix.files_only(),
|
||||
visibility = visibility,
|
||||
)
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
exports_files(["install.py"])
|
||||
|
||||
@@ -57,10 +57,18 @@ def git(*args, **kwargs):
|
||||
|
||||
|
||||
def get_endpoint():
|
||||
lfs_env = get_env(subprocess.check_output(["git", "lfs", "env"], text=True, cwd=source_dir))
|
||||
endpoint = next(v for k, v in lfs_env.items() if k.startswith('Endpoint'))
|
||||
lfs_env_items = iter(get_env(subprocess.check_output(["git", "lfs", "env"], text=True, cwd=source_dir)).items())
|
||||
endpoint = next(v for k, v in lfs_env_items if k.startswith('Endpoint'))
|
||||
endpoint, _, _ = endpoint.partition(' ')
|
||||
ssh_endpoint = lfs_env.get(" SSH")
|
||||
# only take the ssh endpoint if it follows directly after the first endpoint we found
|
||||
# in a situation like
|
||||
# Endpoint (a)=...
|
||||
# Endpoint (b)=...
|
||||
# SSH=...
|
||||
# we want to ignore the SSH endpoint, as it's not linked to the default (a) endpoint
|
||||
following_key, following_value = next(lfs_env_items, (None, None))
|
||||
ssh_endpoint = following_value if following_key == " SSH" else None
|
||||
|
||||
endpoint = Endpoint(endpoint, {
|
||||
"Content-Type": "application/vnd.git-lfs+json",
|
||||
"Accept": "application/vnd.git-lfs+json",
|
||||
|
||||
56
misc/bazel/internal/install.py
Normal file
56
misc/bazel/internal/install.py
Normal file
@@ -0,0 +1,56 @@
|
||||
"""
|
||||
Helper script for installing `codeql_pack` targets.
|
||||
|
||||
This mainly wraps around a `pkg_install` script from `rules_pkg` adding:
|
||||
* resolving destination directory with respect to a provided `--build-file`
|
||||
* clean-up of target destination directory before a reinstall
|
||||
* installing imported zip files using a provided `--ripunzip`
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import pathlib
|
||||
import shutil
|
||||
import subprocess
|
||||
from python.runfiles import runfiles
|
||||
|
||||
runfiles = runfiles.Create()
|
||||
assert runfiles, "Installer should be run with `bazel run`"
|
||||
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--destdir", type=pathlib.Path, required=True,
|
||||
help="Desination directory, relative to `--build-file`")
|
||||
parser.add_argument("--pkg-install-script", required=True,
|
||||
help="The wrapped `pkg_install` installation script rlocation")
|
||||
parser.add_argument("--build-file", required=True,
|
||||
help="BUILD.bazel rlocation relative to which the installation should take place")
|
||||
parser.add_argument("--ripunzip",
|
||||
help="ripunzip executable rlocation. Must be provided if `--zip-manifest` is.")
|
||||
parser.add_argument("--zip-manifest",
|
||||
help="The rlocation of a file containing newline-separated `prefix:zip_file` entries")
|
||||
parser.add_argument("--cleanup", action=argparse.BooleanOptionalAction, default=True,
|
||||
help="Whether to wipe the destination directory before installing (true by default)")
|
||||
opts = parser.parse_args()
|
||||
if opts.zip_manifest and not opts.ripunzip:
|
||||
parser.error("Provide `--ripunzip` when specifying `--zip-manifest`")
|
||||
|
||||
build_file = runfiles.Rlocation(opts.build_file)
|
||||
script = runfiles.Rlocation(opts.pkg_install_script)
|
||||
destdir = pathlib.Path(build_file).resolve().parent / opts.destdir
|
||||
|
||||
if destdir.exists() and opts.cleanup:
|
||||
shutil.rmtree(destdir)
|
||||
|
||||
destdir.mkdir(parents=True, exist_ok=True)
|
||||
subprocess.run([script, "--destdir", destdir], check=True)
|
||||
|
||||
if opts.zip_manifest:
|
||||
ripunzip = runfiles.Rlocation(opts.ripunzip)
|
||||
zip_manifest = runfiles.Rlocation(opts.zip_manifest)
|
||||
with open(zip_manifest) as manifest:
|
||||
for line in manifest:
|
||||
prefix, _, zip = line.partition(":")
|
||||
assert zip, f"missing prefix for {prefix}, you should use prefix:zip format"
|
||||
zip = zip.strip()
|
||||
dest = destdir / prefix
|
||||
dest.mkdir(parents=True, exist_ok=True)
|
||||
subprocess.run([ripunzip, "unzip-file", zip, "-d", dest], check=True)
|
||||
8
misc/bazel/internal/ripunzip/BUILD.bazel
Normal file
8
misc/bazel/internal/ripunzip/BUILD.bazel
Normal file
@@ -0,0 +1,8 @@
|
||||
load("@bazel_skylib//rules:native_binary.bzl", "native_binary")
|
||||
|
||||
native_binary(
|
||||
name = "ripunzip",
|
||||
src = select({"@platforms//os:" + os: "@ripunzip-" + os for os in ("linux", "windows", "macos")}),
|
||||
out = "ripunzip.exe",
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
||||
236
misc/bazel/internal/ripunzip/LICENSE.txt
Normal file
236
misc/bazel/internal/ripunzip/LICENSE.txt
Normal file
@@ -0,0 +1,236 @@
|
||||
This software is distributed under the terms of both the MIT license and the
|
||||
Apache License (Version 2.0).
|
||||
|
||||
|
||||
MIT license
|
||||
|
||||
Copyright 2022 Google LLC
|
||||
|
||||
Permission is hereby granted, free of charge, to any
|
||||
person obtaining a copy of this software and associated
|
||||
documentation files (the "Software"), to deal in the
|
||||
Software without restriction, including without
|
||||
limitation the rights to use, copy, modify, merge,
|
||||
publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software
|
||||
is furnished to do so, subject to the following
|
||||
conditions:
|
||||
|
||||
The above copyright notice and this permission notice
|
||||
shall be included in all copies or substantial portions
|
||||
of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
||||
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
||||
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
||||
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
||||
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
|
||||
|
||||
Apache 2 license
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
2
misc/bazel/internal/ripunzip/README.md
Normal file
2
misc/bazel/internal/ripunzip/README.md
Normal file
@@ -0,0 +1,2 @@
|
||||
These LFS files are distributions of [ripunzip](https://github.com/google/ripunzip), compiled with this [workflow](https://github.com/github/codeql/actions/workflows/build-ripunzip.yml).
|
||||
A [copy](./LICENSE.txt) of the ripunzip license is included.
|
||||
BIN
misc/bazel/internal/ripunzip/ripunzip-linux
(Stored with Git LFS)
Executable file
BIN
misc/bazel/internal/ripunzip/ripunzip-linux
(Stored with Git LFS)
Executable file
Binary file not shown.
BIN
misc/bazel/internal/ripunzip/ripunzip-macos
(Stored with Git LFS)
Executable file
BIN
misc/bazel/internal/ripunzip/ripunzip-macos
(Stored with Git LFS)
Executable file
Binary file not shown.
BIN
misc/bazel/internal/ripunzip/ripunzip-windows.exe
(Stored with Git LFS)
Executable file
BIN
misc/bazel/internal/ripunzip/ripunzip-windows.exe
(Stored with Git LFS)
Executable file
Binary file not shown.
7
misc/bazel/internal/zipmerge/.clang-format
Normal file
7
misc/bazel/internal/zipmerge/.clang-format
Normal file
@@ -0,0 +1,7 @@
|
||||
BasedOnStyle: Chromium
|
||||
ColumnLimit: 100
|
||||
IndentWidth: 2
|
||||
SortIncludes: false
|
||||
AllowShortIfStatementsOnASingleLine: WithoutElse
|
||||
AlwaysBreakBeforeMultilineStrings: false
|
||||
Standard: c++20
|
||||
34
misc/bazel/internal/zipmerge/BUILD.bazel
Normal file
34
misc/bazel/internal/zipmerge/BUILD.bazel
Normal file
@@ -0,0 +1,34 @@
|
||||
cc_library(
|
||||
name = "lib",
|
||||
srcs = [
|
||||
"zipmerge.cpp",
|
||||
],
|
||||
hdrs = ["zipmerge.h"],
|
||||
copts = ["-std=c++20"],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "zipmerge",
|
||||
srcs = [
|
||||
"zipmerge_main.cpp",
|
||||
],
|
||||
copts = ["-std=c++20"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
":lib",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "test",
|
||||
size = "small",
|
||||
srcs = ["zipmerge_test.cpp"],
|
||||
copts = ["-std=c++20"],
|
||||
data = glob(["test-files/*"]),
|
||||
linkstatic = True, # required to build the test in the internal repo
|
||||
deps = [
|
||||
":lib",
|
||||
"@bazel_tools//tools/cpp/runfiles",
|
||||
"@googletest//:gtest_main",
|
||||
],
|
||||
)
|
||||
BIN
misc/bazel/internal/zipmerge/test-files/almost-minimal.zip
Normal file
BIN
misc/bazel/internal/zipmerge/test-files/almost-minimal.zip
Normal file
Binary file not shown.
BIN
misc/bazel/internal/zipmerge/test-files/directory-partial.zip
Normal file
BIN
misc/bazel/internal/zipmerge/test-files/directory-partial.zip
Normal file
Binary file not shown.
BIN
misc/bazel/internal/zipmerge/test-files/directory.zip
Normal file
BIN
misc/bazel/internal/zipmerge/test-files/directory.zip
Normal file
Binary file not shown.
BIN
misc/bazel/internal/zipmerge/test-files/empty.zip
Normal file
BIN
misc/bazel/internal/zipmerge/test-files/empty.zip
Normal file
Binary file not shown.
BIN
misc/bazel/internal/zipmerge/test-files/footers.jar
Normal file
BIN
misc/bazel/internal/zipmerge/test-files/footers.jar
Normal file
Binary file not shown.
BIN
misc/bazel/internal/zipmerge/test-files/minimal-x3.zip
Normal file
BIN
misc/bazel/internal/zipmerge/test-files/minimal-x3.zip
Normal file
Binary file not shown.
BIN
misc/bazel/internal/zipmerge/test-files/minimal.zip
Normal file
BIN
misc/bazel/internal/zipmerge/test-files/minimal.zip
Normal file
Binary file not shown.
BIN
misc/bazel/internal/zipmerge/test-files/no-footers.jar
Normal file
BIN
misc/bazel/internal/zipmerge/test-files/no-footers.jar
Normal file
Binary file not shown.
529
misc/bazel/internal/zipmerge/zipmerge.cpp
Normal file
529
misc/bazel/internal/zipmerge/zipmerge.cpp
Normal file
@@ -0,0 +1,529 @@
|
||||
/*
|
||||
Utility for munging zip files.
|
||||
|
||||
The high-level pseudo-code is:
|
||||
for each input zip Z:
|
||||
for each file F in Z:
|
||||
F.name = adjust(F.name)
|
||||
if F.name should be included:
|
||||
write F to the output zip
|
||||
|
||||
File inclusion testing consists of two parts:
|
||||
1. Don't include anything matching an explicit removal list.
|
||||
2. If the same filename occurs in multiple input zips, only include the file from the last input
|
||||
zip.
|
||||
|
||||
Filename adjustment consists of optionally prepending a prefix to the filename.
|
||||
*/
|
||||
|
||||
#include "misc/bazel/internal/zipmerge/zipmerge.h"
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#ifdef _WIN32
|
||||
#include <Windows.h>
|
||||
#define unlink(s) DeleteFileA(s)
|
||||
#else
|
||||
#include <fcntl.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include <string_view>
|
||||
|
||||
namespace {
|
||||
struct {
|
||||
FILE* file;
|
||||
uint32_t num_bytes_written;
|
||||
uint16_t num_files_written;
|
||||
} output_zip{}; // The zip file being written.
|
||||
|
||||
struct {
|
||||
uint8_t* bytes;
|
||||
uint16_t length;
|
||||
} filename_prefix{}; // A string to prepend to all filenames added to the output file.
|
||||
|
||||
constexpr size_t maximum_input_files = 1000;
|
||||
struct {
|
||||
int count;
|
||||
struct {
|
||||
const char* prefix;
|
||||
const char* name;
|
||||
} entries[maximum_input_files];
|
||||
} input_files; // A list of input zip files.
|
||||
|
||||
static bool verbose; // If true, more things are written to stdout.
|
||||
static const char* output_file_name; // The name of the output zip file.
|
||||
static const char*
|
||||
current_input_file_name; // The name of the current input zip file (used for diagnostics).
|
||||
|
||||
constexpr size_t filename_hash_table_size = 0x20000;
|
||||
typedef struct {
|
||||
uint32_t hash;
|
||||
uint32_t len;
|
||||
const uint8_t* data;
|
||||
} hash_entry_t;
|
||||
|
||||
// A hash set containing the name of everything so far written to the output file.
|
||||
static hash_entry_t filename_hash_table[filename_hash_table_size];
|
||||
|
||||
constexpr size_t maximum_removals = 1000;
|
||||
struct removal_entry {
|
||||
// A removal entry can either be a literal string, or a wildcard containing a single "*".
|
||||
// In the former case, the literal string is called the head. In the latter case, the
|
||||
// segment before the "*" is called the head, and the segment after the "*" is called the tail.
|
||||
uint32_t head_len;
|
||||
uint32_t tail_len; // zero for literal removals, possibly zero for wildcard removals
|
||||
const uint8_t* head;
|
||||
const uint8_t* tail; // NULL for literal removals, non-NULL for wildcard removals
|
||||
};
|
||||
|
||||
struct {
|
||||
int count;
|
||||
removal_entry entries[maximum_removals];
|
||||
} removals; // A list of files and directories to ignore in input files.
|
||||
|
||||
// Sizes and signatures of zip file structures (central-directory, local-file-header,
|
||||
// end-of-central-directory).
|
||||
constexpr size_t cd_size = 46;
|
||||
constexpr std::string_view cd_signature = "\x50\x4b\x01\x02";
|
||||
constexpr size_t lfh_size = 30;
|
||||
constexpr std::string_view lfh_signature = "\x50\x4b\x03\x04";
|
||||
constexpr size_t eocd_size = 22;
|
||||
|
||||
// Write the bytes [src, src + len) to the output file.
|
||||
void append_data(const uint8_t* src, uint32_t len) {
|
||||
if (fwrite(src, 1, len, output_zip.file) != len) {
|
||||
printf("Error: Could not write %lu bytes to output file.\n", (unsigned long)len);
|
||||
exit(1);
|
||||
}
|
||||
uint32_t new_output_size = output_zip.num_bytes_written + len;
|
||||
if (new_output_size < output_zip.num_bytes_written) {
|
||||
printf("Error: Output zip file exceeds 4 gigabytes.\n");
|
||||
exit(1);
|
||||
}
|
||||
output_zip.num_bytes_written = new_output_size;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void append_cd(const uint8_t* src, uint32_t len) {
|
||||
if ((output_cd.capacity - output_cd.length) < len) {
|
||||
uint32_t new_capacity;
|
||||
uint8_t* new_data;
|
||||
|
||||
new_capacity = output_cd.capacity + (output_cd.capacity >> 1);
|
||||
if (new_capacity < output_cd.length + len) new_capacity = output_cd.length + len;
|
||||
new_data = (uint8_t*)realloc(output_cd.bytes, new_capacity);
|
||||
if (!new_data) {
|
||||
printf("Error: Could not grow central-directory buffer from %lu bytes to %lu bytes.\n",
|
||||
(unsigned long)output_cd.capacity, (unsigned long)new_capacity);
|
||||
exit(1);
|
||||
}
|
||||
output_cd.bytes = new_data;
|
||||
output_cd.capacity = new_capacity;
|
||||
}
|
||||
memcpy(output_cd.bytes + output_cd.length, src, len);
|
||||
output_cd.length += len;
|
||||
}
|
||||
|
||||
namespace {
|
||||
// Copy a local-file-header and accompanying file data from an input file to the output file.
|
||||
// The input file is [input_file, input_file + input_file_len).
|
||||
// The offset within the input file of the local-file-header is given by lfh_offset.
|
||||
// The central-directory entry corresponding to the file is given by cd.
|
||||
void copy_file_data(const uint8_t* input_file,
|
||||
size_t lfh_offset,
|
||||
const uint8_t* cd,
|
||||
size_t input_file_len) {
|
||||
if (lfh_offset >= input_file_len || (size_t)(input_file_len - lfh_offset) < lfh_size) {
|
||||
printf("Error: %s is invalid; central-directory references local-file-header at offset %llu, "
|
||||
"but file is only %llu bytes.\n",
|
||||
current_input_file_name, (unsigned long long)lfh_offset,
|
||||
(unsigned long long)input_file_len);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
const uint8_t* lfh = input_file + lfh_offset;
|
||||
if (memcmp(lfh, lfh_signature.data(), lfh_signature.size()) != 0) {
|
||||
printf("Error: Expected local-file-header signature at offset %llu of %s, but instead got %02x "
|
||||
"%02x %02x %02x.\n",
|
||||
(unsigned long long)lfh_offset, current_input_file_name, lfh[0], lfh[1], lfh[2], lfh[3]);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
size_t data_offset = lfh_offset + lfh_size;
|
||||
uint16_t name_len = read2(lfh + 26);
|
||||
uint16_t extra_len = read2(lfh + 28);
|
||||
uint32_t data_len = read4(cd + 20);
|
||||
append_data(lfh, 6); // signature, version
|
||||
// flags, compression, mod-time, mod-date, crc-32, compressed-size, uncompressed-size, name-len
|
||||
append_data(cd + 8, 22);
|
||||
append_data(lfh + 28, 2); // extra-len
|
||||
|
||||
size_t total_variable_len = (size_t)name_len + (size_t)extra_len + (size_t)data_len;
|
||||
if ((size_t)(input_file_len - data_offset) < total_variable_len) {
|
||||
printf(
|
||||
"Error: %s is invalid; starting at offset %llu, reading a filename of %u bytes, extra data "
|
||||
"of %u bytes, and %lu bytes of compressed data would exceed file size of %llu bytes.\n",
|
||||
current_input_file_name, (unsigned long long)data_offset, (unsigned)name_len,
|
||||
(unsigned)extra_len, (unsigned long)data_len, (unsigned long long)input_file_len);
|
||||
exit(1);
|
||||
}
|
||||
append_data(filename_prefix.bytes, filename_prefix.length);
|
||||
append_data(input_file + data_offset, (uint32_t)total_variable_len);
|
||||
}
|
||||
|
||||
bool removal_entry_matches(const struct removal_entry* re, const uint8_t* full_name, uint32_t len) {
|
||||
if (len < re->head_len + re->tail_len) {
|
||||
return false;
|
||||
}
|
||||
if (memcmp(full_name, re->head, re->head_len) != 0) {
|
||||
return false;
|
||||
}
|
||||
if (re->tail) {
|
||||
for (uint32_t i = re->head_len + re->tail_len;; ++i) {
|
||||
if (len == i || full_name[i] == '/') {
|
||||
if (memcmp(full_name + i - re->tail_len, re->tail, re->tail_len) == 0) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if (len == i || full_name[i - re->tail_len] == '/') {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return len == re->head_len || full_name[re->head_len] == '/';
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Decide whether the file named `name` (`len` bytes, prefixed with the current
// filename_prefix) should be copied to the output zip. Returns false if the full
// name matches any --remove entry, or if the same full name was already accepted
// by an earlier call (first caller wins; duplicates are skipped).
bool should_include_filename_now(const uint8_t* name, uint32_t len) {
  // Build prefix + name into a fresh buffer. NOTE(review): malloc result is not
  // checked — consistent with the rest of this tool, which exits on other errors.
  uint8_t* full_name = (uint8_t*)malloc(filename_prefix.length + len + 1);
  memcpy(full_name, filename_prefix.bytes, filename_prefix.length);
  memcpy(full_name + filename_prefix.length, name, len);
  len += filename_prefix.length;

  // Reject anything matching a --remove pattern.
  for (int i = 0; i < removals.count; ++i) {
    if (removal_entry_matches(&removals.entries[i], full_name, len)) {
      free(full_name);
      return false;
    }
  }

  // djb2-style hash (seed 5381, multiplier 33, xor variant).
  uint32_t hash = 5381;
  for (uint32_t i = 0; i < len; ++i)
    hash = hash * 33 ^ full_name[i];

  // Open-addressing hash table with linear probing. The mask assumes
  // filename_hash_table_size is a power of two, and the loop assumes the table
  // never fills up — TODO confirm both at the table's definition site.
  for (uint32_t idx = hash;; ++idx) {
    hash_entry_t* e = filename_hash_table + (idx & (filename_hash_table_size - 1));
    if (e->hash == hash && e->len == len && memcmp(e->data, full_name, len) == 0) {
      // Already seen: skip the duplicate.
      free(full_name);
      return false;
    } else if (e->data == NULL) {
      // Empty slot: record the name. Ownership of `full_name` transfers to the table.
      e->hash = hash;
      e->len = len;
      e->data = full_name;
      return true;
    }
  }
}
|
||||
|
||||
// Try to find the end-of-central-directory record in a zip file.
|
||||
const uint8_t* find_eocd(const uint8_t* input_file, size_t input_file_len) {
|
||||
for (size_t i = eocd_size; i < 1024 + eocd_size && i <= input_file_len; ++i) {
|
||||
const uint8_t* candidate = input_file + input_file_len - i;
|
||||
if (memcmp(candidate, eocd_signature.data(), eocd_signature.size()) == 0) {
|
||||
return candidate;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
namespace {
|
||||
// Copy all appropriate files from an input zip to the output zip.
// Walks the input's central directory; for each entry that passes the removal
// filter and deduplication, appends the (prefix-adjusted) central-directory
// record to output_cd and the local file data to the output file. Exits with
// an error message for multi-disk archives, zip64 archives, or corrupt input.
void process_input_file(const uint8_t* input_file, size_t input_file_len) {
  const uint8_t* eocd = find_eocd(input_file, input_file_len);
  if (!eocd) {
    printf("Error: Could not find end-of-central-directory in %s.\n", current_input_file_name);
    exit(1);
  }
  // Disk numbers (offsets 4 and 6) must both be zero: no multi-disk support.
  if (read2(eocd + 4) != 0 || read2(eocd + 6) != 0) {
    printf("Error: %s is split over multiple disks, which is not supported.\n",
           current_input_file_name);
    exit(1);
  }
  // All-ones values in the EOCD counts/sizes signal zip64, which is unsupported.
  if (!(uint16_t)~read2(eocd + 8) || !(uint16_t)~read2(eocd + 10) || !~read4(eocd + 12) ||
      !~read4(eocd + 16)) {
    printf("Error: %s is zip64, which is not supported.\n", current_input_file_name);
    exit(1);
  }
  uint16_t num_entries = read2(eocd + 10);  // total central-directory entry count
  size_t cd_offset = read4(eocd + 16);      // offset of first central-directory record

  for (uint16_t i = 0; i < num_entries; ++i) {
    uint8_t cd[cd_size];  // fixed-size part of one central-directory record
    if (cd_offset >= input_file_len || (size_t)(input_file_len - cd_offset) < sizeof(cd)) {
      printf("Error: %s is invalid; central-directory %u/%u would start at offset %llu, but file "
             "is only %llu bytes.\n",
             current_input_file_name, (unsigned)i, (unsigned)num_entries,
             (unsigned long long)cd_offset, (unsigned long long)input_file_len);
      exit(1);
    }

    memcpy(cd, input_file + cd_offset, sizeof(cd));
    if (memcmp(cd, cd_signature.data(), cd_signature.size()) != 0) {
      printf("Error: Expected central-directory signature at offset %llu of %s, but instead got "
             "%02x %02x %02x %02x.\n",
             (unsigned long long)cd_offset, current_input_file_name, cd[0], cd[1], cd[2], cd[3]);
      exit(1);
    }
    cd[8] &= 0xF7; // Clear the bit indicating that a local-file-footer follows the file data
    cd_offset += sizeof(cd);  // cd_offset now points at the entry's variable-length fields

    uint16_t name_len = read2(cd + 28);
    // The record stores the name length in 16 bits; reject overflow after prefixing.
    if (((uint32_t)name_len + (uint32_t)filename_prefix.length) > 0xFFFFU) {
      printf("Error: Combining prefix of %.*s with filename of %.*s results in a filename which is "
             "too long.\n",
             (int)filename_prefix.length, (const char*)filename_prefix.bytes, (int)name_len,
             (const char*)(input_file + cd_offset));
      exit(1);
    }
    // Patch the record: new (prefixed) name length and the entry's new offset
    // within the output file.
    write2(cd + 28, name_len + filename_prefix.length);
    uint16_t extra_len = read2(cd + 30);
    uint16_t comment_len = read2(cd + 32);
    uint32_t offset = read4(cd + 42);  // original local-file-header offset
    write4(cd + 42, output_zip.num_bytes_written);
    // All-ones offset or compressed size again signal zip64.
    if (!~offset || !~read4(cd + 20)) {
      printf("Error: %s is zip64 (because of %.*s), which is not supported.\n",
             current_input_file_name, (int)name_len, (const char*)(input_file + cd_offset));
      exit(1);
    }

    size_t total_variable_len = (size_t)name_len + (size_t)extra_len + (size_t)comment_len;
    if ((size_t)(input_file_len - cd_offset) < total_variable_len) {
      printf("Error: %s is invalid; starting at offset %llu, reading a filename of %u bytes, extra "
             "data of %u bytes, and comment of %u bytes exceed file size of %llu bytes.\n",
             current_input_file_name, (unsigned long long)offset, (unsigned)name_len,
             (unsigned)extra_len, (unsigned)comment_len, (unsigned long long)input_file_len);
      exit(1);
    }

    // Removal filter + first-wins deduplication (see should_include_filename_now).
    bool should_include = should_include_filename_now(input_file + cd_offset, name_len);
    if (verbose) {
      printf("%s %.*s from %s\n", should_include ? "Using" : "Skipping", (int)name_len,
             (const char*)(input_file + cd_offset), current_input_file_name);
    }
    if (should_include) {
      // Accumulate the patched record, the prefix, and the original
      // name/extra/comment bytes, then copy the file's local data.
      append_cd(cd, sizeof(cd));
      append_cd(filename_prefix.bytes, filename_prefix.length);
      append_cd(input_file + cd_offset, (uint32_t)total_variable_len);
      copy_file_data(input_file, offset, cd, input_file_len);
      // The classic zip format only has a 16-bit file count.
      if (output_zip.num_files_written == 0xFFFFU) {
        printf("Error: Too many files in output zip.\n");
        exit(1);
      }
      ++output_zip.num_files_written;
    }
    cd_offset += total_variable_len;
  }
}
|
||||
|
||||
// Read a file into memory and pass it to process_input_file.
// The file is memory-mapped read-only (Win32 file mappings on Windows, mmap on
// POSIX) and unmapped/closed again afterwards; any failure exits the process.
void read_and_process_input_file(const char* filename) {
#ifdef _WIN32
  HANDLE file = CreateFileA(filename, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING,
                            FILE_ATTRIBUTE_NORMAL, NULL);
  if (file == INVALID_HANDLE_VALUE) {
    printf("Error: Cannot open %s for reading.\n", filename);
    exit(1);
  }
  LARGE_INTEGER size;
  if (!GetFileSizeEx(file, &size)) {
    printf("Error: Cannot determine size of %s.\n", filename);
    exit(1);
  }
  // Only the low 32 bits are mapped, so >4GB inputs are rejected outright.
  if (size.HighPart != 0) {
    printf("Error: Input file %s exceeds 4 gigabytes.\n", filename);
    exit(1);
  }
  // CreateFileMappingA cannot map a zero-length file.
  if (size.LowPart == 0) {
    printf("Error: Input file %s is empty.\n", filename);
    exit(1);
  }
  HANDLE mapping = CreateFileMappingA(file, NULL, PAGE_READONLY, 0, size.LowPart, NULL);
  if (mapping == NULL) {
    printf("Error: Cannot mmap %s (CreateFileMapping).\n", filename);
    exit(1);
  }
  void* data = MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, size.LowPart);
  if (data == NULL) {
    printf("Error: Cannot mmap %s (MapViewOfFile).\n", filename);
    exit(1);
  }
  process_input_file((uint8_t*)data, size.LowPart);
  UnmapViewOfFile(data);
  CloseHandle(mapping);
  CloseHandle(file);
#else
  int file = open(filename, O_RDONLY);
  if (file == -1) {
    printf("Error: Cannot open %s for reading.\n", filename);
    exit(1);
  }
  struct stat st;
  if (fstat(file, &st) == -1) {
    printf("Error: Cannot stat %s.\n", filename);
    exit(1);
  }
  // NOTE(review): unlike the Windows branch, this path does not explicitly
  // reject empty or >4GB files; mmap of length 0 would fail here — confirm
  // whether the asymmetry is intentional.
  void* data = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, file, 0);
  if (data == MAP_FAILED) {
    printf("Error: Cannot mmap %s.\n", filename);
    exit(1);
  }
  process_input_file((uint8_t*)data, st.st_size);
  munmap(data, st.st_size);
  close(file);
#endif
}
|
||||
|
||||
// Print usage information for the tool and terminate with exit status 1.
void usage_and_exit(const char** argv) {
  const char* self = argv[0];
  printf("Usage: %s [-v|--verbose] [--remove=FILE] outfile.zip [--prefix=PREFIX] infile1.zip "
         "[--prefix=PREFIX] infile2.zip ...\n",
         self);
  exit(1);
}
|
||||
|
||||
// Set filename_prefix based on a string from the command line.
|
||||
void set_filename_prefix(const char* prefix) {
|
||||
free(filename_prefix.bytes);
|
||||
filename_prefix.bytes = NULL;
|
||||
filename_prefix.length = 0;
|
||||
|
||||
if (prefix == NULL) {
|
||||
return;
|
||||
}
|
||||
if (*prefix == '/' || *prefix == '\\') {
|
||||
++prefix;
|
||||
}
|
||||
size_t len = strlen(prefix);
|
||||
if (len == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
filename_prefix.bytes = (uint8_t*)malloc(len + 1);
|
||||
memcpy(filename_prefix.bytes, prefix, len);
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
if (filename_prefix.bytes[i] == '\\') filename_prefix.bytes[i] = '/';
|
||||
}
|
||||
filename_prefix.bytes[len] = '/';
|
||||
filename_prefix.length = (uint16_t)(len + 1);
|
||||
}
|
||||
|
||||
// Set various global variables based on the command line.
|
||||
void parse_command_line(int argc, const char** argv) {
|
||||
int i = 1;
|
||||
for (; i < argc; ++i) {
|
||||
const char* arg = argv[i];
|
||||
if (strcmp(arg, "-v") == 0 || strcmp(arg, "--verbose") == 0) {
|
||||
verbose = true;
|
||||
} else if (strncmp(arg, "--remove=", 9) == 0) {
|
||||
arg += 9;
|
||||
if (*arg == '/' || *arg == '\\') ++arg;
|
||||
if (removals.count == maximum_removals) {
|
||||
printf("Error: Too many --remove flags.\n");
|
||||
exit(1);
|
||||
}
|
||||
const char* star = strchr(arg, '*');
|
||||
struct removal_entry* re = &removals.entries[removals.count++];
|
||||
if (star == NULL) {
|
||||
re->head_len = (uint32_t)strlen(arg);
|
||||
re->tail_len = 0;
|
||||
re->head = (const uint8_t*)arg;
|
||||
re->tail = NULL;
|
||||
} else {
|
||||
if (strchr(star + 1, '*')) {
|
||||
printf("Error: At most one * is permitted per removal (%s).\n", arg);
|
||||
exit(1);
|
||||
}
|
||||
re->head_len = (uint32_t)(star - arg);
|
||||
re->tail_len = (uint32_t)strlen(star + 1);
|
||||
re->head = (const uint8_t*)arg;
|
||||
re->tail = (const uint8_t*)(star + 1);
|
||||
}
|
||||
++removals.count;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (i == argc) {
|
||||
printf("Error: Missing output file name.\n");
|
||||
usage_and_exit(argv);
|
||||
}
|
||||
output_file_name = argv[i];
|
||||
++i;
|
||||
|
||||
const char* prefix = NULL;
|
||||
for (; i < argc; ++i) {
|
||||
const char* arg = argv[i];
|
||||
if (strncmp(arg, "--prefix=", 9) == 0) {
|
||||
prefix = arg + 9;
|
||||
} else {
|
||||
if (input_files.count == maximum_input_files) {
|
||||
printf("Error: Too many input files.\n");
|
||||
exit(1);
|
||||
}
|
||||
input_files.entries[input_files.count].prefix = prefix;
|
||||
input_files.entries[input_files.count].name = arg;
|
||||
++input_files.count;
|
||||
}
|
||||
}
|
||||
|
||||
if (input_files.count <= 0) {
|
||||
printf("Error: Missing input file names.\n");
|
||||
usage_and_exit(argv);
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Entry point for the merge: parse arguments, copy the selected entries from
// every input zip into the output file, then write the accumulated
// central directory followed by the end-of-central-directory record.
// Returns 0 on success; exits or returns 1 on error.
int zipmerge_main(int argc, const char** argv) {
  parse_command_line(argc, argv);

  output_zip.file = fopen(output_file_name, "wb");
  if (!output_zip.file) {
    printf("Error: Cannot open %s for writing.\n", output_file_name);
    return 1;
  }

  // Process inputs in reverse command-line order. Combined with the
  // first-wins deduplication in should_include_filename_now, this makes a
  // later-listed input override earlier ones (see the InputFileOrder test).
  for (int i = input_files.count - 1; i >= 0; --i) {
    set_filename_prefix(input_files.entries[i].prefix);
    current_input_file_name = input_files.entries[i].name;
    read_and_process_input_file(current_input_file_name);
  }

  // Build and emit the EOCD record: file counts, central-directory size, and
  // the central directory's offset (= bytes of file data written so far).
  uint8_t eocd[eocd_size] = {0};
  memcpy(eocd, eocd_signature.data(), eocd_signature.size());
  write2(eocd + 8, output_zip.num_files_written);
  write2(eocd + 10, output_zip.num_files_written);
  write4(eocd + 12, output_cd.length);
  write4(eocd + 16, output_zip.num_bytes_written);
  append_data(output_cd.bytes, output_cd.length);
  append_data(eocd, sizeof(eocd));
  fclose(output_zip.file);
  return 0;
}
|
||||
|
||||
// Reset all global state so zipmerge_main can be invoked repeatedly from the
// same process (used by the tests).
// NOTE(review): heap buffers reachable from this state (filename_prefix.bytes,
// output_cd.bytes, names stored in the hash table) are not freed here — the
// zeroing drops the pointers. Also confirm filename_hash_table is an array,
// not a pointer, so sizeof covers the whole table.
void reset() {
  memset(&output_zip, 0, sizeof(output_zip));
  memset(&filename_prefix, 0, sizeof(filename_prefix));
  memset(&output_cd, 0, sizeof(output_cd));
  memset(&input_files, 0, sizeof(input_files));
  memset(&filename_hash_table, 0, sizeof(filename_hash_table));
  memset(&removals, 0, sizeof(removals));
}
|
||||
37
misc/bazel/internal/zipmerge/zipmerge.h
Normal file
37
misc/bazel/internal/zipmerge/zipmerge.h
Normal file
@@ -0,0 +1,37 @@
|
||||
#pragma once

#include <cstdlib>
#include <cstdint>
#include <string_view>

// Growable byte buffer holding central-directory records.
struct output_cd_t {
  uint8_t* bytes;     // heap-allocated record bytes
  uint32_t length;    // bytes currently used
  uint32_t capacity;  // bytes allocated
};

inline output_cd_t output_cd{};  // An in-memory buffer in which the central-directory records for
                                 // the output file are accumulated.

// Read and write little-endian integers (as the only supported host platforms are little-endian,
// and all host platforms support unaligned memory accesses, these macros are currently very
// simple).
#define read2(ptr) (*(uint16_t*)(ptr))
#define read4(ptr) (*(uint32_t*)(ptr))
#define write2(ptr, val) (*(uint16_t*)(ptr) = (val))
#define write4(ptr, val) (*(uint32_t*)(ptr) = (val))

// Add the bytes [src, src + len) to the output's central-directory.
void append_cd(const uint8_t* src, uint32_t len);

// Test whether a given filename should be included in the output zip.
// Note that if a call returns true for a given filename, all future calls with the same filename
// will return false.
bool should_include_filename_now(const uint8_t* name, uint32_t len);

// The 4-byte end-of-central-directory record signature ("PK\x05\x06").
inline constexpr std::string_view eocd_signature = "\x50\x4b\x05\x06";

// Locate the EOCD record near the end of a mapped zip file; NULL if absent.
const uint8_t* find_eocd(const uint8_t* input_file, size_t input_file_len);

// Run the whole merge; same contract as a main() function.
int zipmerge_main(int argc, const char** argv);

// Clear all global state so zipmerge_main can be called again (test support).
void reset();
|
||||
5
misc/bazel/internal/zipmerge/zipmerge_main.cpp
Normal file
5
misc/bazel/internal/zipmerge/zipmerge_main.cpp
Normal file
@@ -0,0 +1,5 @@
|
||||
#include "misc/bazel/internal/zipmerge/zipmerge.h"

// Thin executable wrapper: the real work lives in zipmerge_main so that the
// tests can call it in-process without spawning a subprocess.
int main(int argc, const char** argv) {
  return zipmerge_main(argc, argv);
}
|
||||
166
misc/bazel/internal/zipmerge/zipmerge_test.cpp
Normal file
166
misc/bazel/internal/zipmerge/zipmerge_test.cpp
Normal file
@@ -0,0 +1,166 @@
|
||||
#include "misc/bazel/internal/zipmerge/zipmerge.h"
|
||||
|
||||
#include <array>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include <filesystem>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <gmock/gmock.h>
|
||||
#include "tools/cpp/runfiles/runfiles.h"
|
||||
|
||||
using bazel::tools::cpp::runfiles::Runfiles;
|
||||
using namespace std::string_literals;
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
namespace codeql_testing {
|
||||
|
||||
// Check the little-endian accessor macros round-trip correctly.
TEST(Zipmerge, ReadAndWrite) {
  char buf[7] = {0};
  write2(buf + 1, 0xF2F1U);
  write4(buf + 3, 0xF6F5F4F3UL);
  // Bug fix: EXPECT_STREQ compared C strings, but buf[0] is NUL and the
  // expected literal starts with "\x00", so both sides were empty strings and
  // the assertion was vacuous. Compare the raw bytes instead.
  static const char expected[7] = {'\x00', '\xF1', '\xF2', '\xF3', '\xF4', '\xF5', '\xF6'};
  EXPECT_EQ(memcmp(buf, expected, sizeof buf), 0);
  EXPECT_EQ(read2(buf + 1), 0xF2F1U);
  EXPECT_EQ(read4(buf + 3), 0xF6F5F4F3UL);
}
|
||||
|
||||
// append_cd must concatenate chunks into the global output_cd buffer,
// growing it as needed across multiple calls.
TEST(Zipmerge, AppendCd) {
  output_cd.length = 0;  // start from an empty buffer (capacity may be reused)
  append_cd((const uint8_t*)"a", 1);
  append_cd((const uint8_t*)"bcd", 3);
  append_cd((const uint8_t*)"efghijklmno", 11);
  EXPECT_EQ(output_cd.length, 15);
  std::string_view bytes{reinterpret_cast<char*>(output_cd.bytes), 15};
  EXPECT_EQ(bytes, "abcdefghijklmno");
}

// First call for a name returns true, every repeat returns false, and
// distinct names (including same-prefix, different-length ones) are tracked
// independently. Relies on global hash-table state persisting across calls.
TEST(Zipmerge, ShouldIncludeFilenameNow) {
  EXPECT_TRUE(should_include_filename_now((const uint8_t*)"x", 1));
  EXPECT_FALSE(should_include_filename_now((const uint8_t*)"x", 1));
  EXPECT_TRUE(should_include_filename_now((const uint8_t*)"y", 1));
  EXPECT_TRUE(should_include_filename_now((const uint8_t*)"yy", 2));
  EXPECT_FALSE(should_include_filename_now((const uint8_t*)"x", 1));
  EXPECT_FALSE(should_include_filename_now((const uint8_t*)"yy", 2));
}
|
||||
|
||||
// find_eocd must return the signature match closest to the end of the buffer.
TEST(Zipmerge, FindEocd) {
  uint8_t buf[500] = {0};
  auto i = 0u;
  // Bug fix: the counter was never incremented, so the "pattern" was all
  // zeros. An incrementing byte sequence can never accidentally contain the
  // non-consecutive 4-byte EOCD signature, so the planted copies below remain
  // the only matches.
  for (auto& b : buf) {
    b = i++ % 256;
  }
  memcpy(buf + 17, eocd_signature.data(), eocd_signature.size());
  memcpy(buf + 101, eocd_signature.data(), eocd_signature.size());
  // Of the two planted signatures, the later one (nearer the end) must win.
  EXPECT_EQ(find_eocd(buf, sizeof(buf)), buf + 101);
}
|
||||
|
||||
// Read the entire contents of `filename` in binary mode.
// A failure to open is reported via EXPECT and yields an empty string.
std::string read_file(const std::string& filename) {
  std::ifstream f(filename, std::ios::binary);
  EXPECT_TRUE(f) << "Could not open '" << filename << "' (" << std::strerror(errno) << ")";
  if (!f) {
    // Bug fix: this previously did `return 0;`, which constructs std::string
    // from a null char pointer — undefined behavior. Return an empty string.
    return "";
  }
  std::stringstream contents;
  contents << f.rdbuf();
  return contents.str();
}
|
||||
|
||||
// Resolve a test-data file name to an absolute path via Bazel runfiles.
// Reports a test failure and returns "" if the file cannot be found.
std::string get_file(const char* name) {
  // Runfiles lookup is initialised once and deliberately leaked (static).
  static auto runfiles = [] {
    std::string error;
    auto ret = Runfiles::CreateForTest(&error);
    EXPECT_TRUE(ret) << error;
    return ret;
  }();
  // this works from both `codeql` and the internal repository
  for (auto prefix : {"_main", "codeql~"}) {
    auto ret = runfiles->Rlocation(prefix + "/misc/bazel/internal/zipmerge/test-files/"s + name);
    if (fs::exists(ret)) {
      return ret;
    }
  }
  EXPECT_TRUE(false) << "test file " << name << " not found";
  return "";
}

// Assert that the produced file `actual` is byte-identical to the reference
// test-data file `expected`. The produced file is deleted after being read.
void expect_same_file(const char* actual, const char* expected) {
  auto expected_file = get_file(expected);
  auto actual_contents = read_file(actual);
  unlink(actual); // If tests start failing, you might want to comment out this unlink in order to
                  // inspect the output.
  ASSERT_EQ(actual_contents, read_file(expected_file))
      << "contents of " << actual << " do not match contents of " << expected_file;
}
|
||||
|
||||
// Run zipmerge_main in-process with the given arguments and return the output
// file name. Arguments are: leading "-" flags, then the output file, then
// input file names (resolved through get_file) optionally interleaved with
// more "-" flags. Asserts that the merge succeeds.
template <typename... Args>
const char* zipmerge(Args*... inputs) {
  reset();  // clear global state left over from previous invocations
  const char* output = nullptr;
  std::vector<std::string> args{"self"};  // argv[0] placeholder
  std::array<const char*, sizeof...(Args)> flags{{inputs...}};
  auto i = 0u;
  // Leading flags are passed through verbatim.
  for (; i < flags.size() && std::string_view{flags[i]}.starts_with("-"); ++i) {
    args.push_back(flags[i]);
  }
  // NOTE(review): assumes at least one non-flag argument follows the flags;
  // an all-flags call would index past the array.
  output = flags[i];
  args.push_back(output);
  ++i;
  // Remaining arguments: flags verbatim, file names resolved via runfiles.
  for (; i < flags.size(); ++i) {
    args.push_back(std::string_view{flags[i]}.starts_with("-") ? flags[i] : get_file(flags[i]));
  }
  // Build the char* argv array only after `args` is complete, so c_str()
  // pointers stay valid.
  std::vector<const char*> argv;
  std::transform(args.begin(), args.end(), std::back_inserter(argv),
                 [](const std::string& s) { return s.c_str(); });
  EXPECT_EQ(zipmerge_main(argv.size(), argv.data()), 0);
  return output;
}
|
||||
|
||||
// Merging a single zip must reproduce it exactly.
TEST(Zipmerge, Identity) {
  expect_same_file(zipmerge("out.zip", "directory.zip"), "directory.zip");
}

// Duplicate entries from repeated inputs are dropped, so repeating one input
// still reproduces it exactly.
TEST(Zipmerge, Idempotent) {
  expect_same_file(zipmerge("out.zip", "directory.zip", "directory.zip", "directory.zip"),
                   "directory.zip");
}

// Removing a directory path removes everything beneath it.
TEST(Zipmerge, RemoveEverything) {
  expect_same_file(zipmerge("--remove=directory", "out.zip", "directory.zip"), "empty.zip");
}

// A '*' wildcard may match the directory name itself.
TEST(Zipmerge, RemoveEverythingWildcard) {
  expect_same_file(zipmerge("--remove=*ory", "out.zip", "directory.zip"), "empty.zip");
}

// --remove patterns apply to the final, prefixed names.
TEST(Zipmerge, RemovePrefixedPaths) {
  expect_same_file(zipmerge("--remove=My/directory", "out.zip", "--prefix=My", "directory.zip"),
                   "empty.zip");
}

// Individual files can be removed while the rest of the archive is kept.
TEST(Zipmerge, RemoveSome) {
  expect_same_file(
      zipmerge("--remove=directory/b.txt", "--remove=directory/c.txt", "out.zip", "directory.zip"),
      "directory-partial.zip");
}

// Wildcard removals select individual files; a pattern whose tail would cross
// a '/' (dir*t) must not match anything extra.
TEST(Zipmerge, RemoveSomeWildcard) {
  expect_same_file(zipmerge("--remove=directory/b*t", "--remove=directory/c*", "--remove=dir*t",
                            "out.zip", "directory.zip"),
                   "directory-partial.zip");
}

// The same input can be merged several times under different prefixes.
TEST(Zipmerge, Prefix) {
  expect_same_file(
      zipmerge("out.zip", "minimal.zip", "--prefix=a", "minimal.zip", "--prefix=b", "minimal.zip"),
      "minimal-x3.zip");
}

// When two inputs contain the same entry name, the later-listed input wins.
TEST(Zipmerge, InputFileOrder) {
  expect_same_file(zipmerge("out.zip", "minimal.zip", "almost-minimal.zip"), "almost-minimal.zip");
}

// Local-file footers (data descriptors) in the input are stripped on output.
TEST(Zipmerge, LocalFileFooters) {
  expect_same_file(zipmerge("out.jar", "footers.jar"), "no-footers.jar");
}
} // namespace codeql_testing
|
||||
@@ -1,4 +1,4 @@
|
||||
def lfs_smudge(repository_ctx, srcs, extract = False, stripPrefix = None):
|
||||
def lfs_smudge(repository_ctx, srcs, *, extract = False, stripPrefix = None, executable = False):
|
||||
python = repository_ctx.which("python3") or repository_ctx.which("python")
|
||||
if not python:
|
||||
fail("Neither python3 nor python executables found")
|
||||
@@ -25,7 +25,7 @@ def lfs_smudge(repository_ctx, srcs, extract = False, stripPrefix = None):
|
||||
repository_ctx.symlink(src, src.basename)
|
||||
else:
|
||||
repository_ctx.report_progress("trying cache for remote %s" % src.basename)
|
||||
res = repository_ctx.download([], src.basename, sha256 = info, allow_fail = True)
|
||||
res = repository_ctx.download([], src.basename, sha256 = info, allow_fail = True, executable = executable)
|
||||
if not res.success:
|
||||
remote.append(src)
|
||||
if remote:
|
||||
@@ -33,7 +33,7 @@ def lfs_smudge(repository_ctx, srcs, extract = False, stripPrefix = None):
|
||||
for src, info in zip(remote, infos):
|
||||
sha256, _, url = info.partition(" ")
|
||||
repository_ctx.report_progress("downloading remote %s" % src.basename)
|
||||
repository_ctx.download(url, src.basename, sha256 = sha256)
|
||||
repository_ctx.download(url, src.basename, sha256 = sha256, executable = executable)
|
||||
if extract:
|
||||
for src in srcs:
|
||||
repository_ctx.report_progress("extracting %s" % src.basename)
|
||||
@@ -62,19 +62,20 @@ def _download_lfs(repository_ctx):
|
||||
if not dir.is_dir:
|
||||
fail("`dir` not a directory in @%s" % repository_ctx.name)
|
||||
srcs = [f for f in dir.readdir() if not f.is_dir]
|
||||
lfs_smudge(repository_ctx, srcs)
|
||||
lfs_smudge(repository_ctx, srcs, executable = repository_ctx.attr.executable)
|
||||
|
||||
# with bzlmod the name is qualified with `~` separators, and we want the base name here
|
||||
name = repository_ctx.name.split("~")[-1]
|
||||
repository_ctx.file("BUILD.bazel", """
|
||||
exports_files({files})
|
||||
basenames = [src.basename for src in srcs]
|
||||
build = "exports_files(%s)\n" % repr(basenames)
|
||||
|
||||
filegroup(
|
||||
name = "{name}",
|
||||
srcs = {files},
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
||||
""".format(name = name, files = repr([src.basename for src in srcs])))
|
||||
# add a main `name` filegroup only if it doesn't conflict with existing exported files
|
||||
if name not in basenames:
|
||||
build += 'filegroup(name = "%s", srcs = %s, visibility = ["//visibility:public"])\n' % (
|
||||
name,
|
||||
basenames,
|
||||
)
|
||||
repository_ctx.file("BUILD.bazel", build)
|
||||
|
||||
lfs_archive = repository_rule(
|
||||
doc = "Export the contents from an on-demand LFS archive. The corresponding path should be added to be ignored " +
|
||||
@@ -98,5 +99,6 @@ lfs_files = repository_rule(
|
||||
"srcs": attr.label_list(doc = "Local paths to the LFS files to export."),
|
||||
"dir": attr.label(doc = "Local path to a directory containing LFS files to export. Only the direct contents " +
|
||||
"of the directory are exported"),
|
||||
"executable": attr.bool(doc = "Whether files should be marked as executable"),
|
||||
},
|
||||
)
|
||||
|
||||
38
misc/bazel/os.bzl
Normal file
38
misc/bazel/os.bzl
Normal file
@@ -0,0 +1,38 @@
|
||||
""" Os detection facilities. """
|
||||
|
||||
def os_select(
        ctx = None,
        *,
        linux = None,
        windows = None,
        macos = None,
        default = None):
    """
    This can work both in a macro and a rule context to choose something based on the current OS.

    If used in a rule implementation, you need to pass `ctx` and add `OS_DETECTION_ATTRS` to the
    rule attributes. Without `ctx`, a configurable `select()` is returned instead.
    An OS left as `None` falls back to `default`; if that is also `None`, the OS is
    unsupported (omitted from the `select()`, or `fail()`s in rule context).
    """

    # Use explicit None comparisons rather than `x or default` so that
    # deliberately falsy per-OS values (e.g. "" or []) are not silently
    # replaced by `default`.
    choices = {
        "linux": linux if linux != None else default,
        "windows": windows if windows != None else default,
        "macos": macos if macos != None else default,
    }
    if not ctx:
        # Macro context: let Bazel resolve the choice at analysis time.
        return select({
            "@platforms//os:%s" % os: v
            for os, v in choices.items()
            if v != None
        })

    # Rule context: resolve eagerly against the target platform constraints
    # provided by OS_DETECTION_ATTRS.
    for os, v in choices.items():
        if ctx.target_platform_has_constraint(getattr(ctx.attr, "_%s_constraint" % os)[platform_common.ConstraintValueInfo]):
            if v == None:
                fail("%s not supported by %s" % (os, ctx.label))
            return v
    fail("Unknown OS detected")
|
||||
|
||||
# Implicit attributes that a rule must include for `os_select(ctx, ...)` to
# work: private labels resolving to the OS constraint values that
# `ctx.target_platform_has_constraint` is checked against.
OS_DETECTION_ATTRS = {
    "_windows_constraint": attr.label(default = "@platforms//os:windows"),
    "_macos_constraint": attr.label(default = "@platforms//os:macos"),
    "_linux_constraint": attr.label(default = "@platforms//os:linux"),
}
|
||||
@@ -1,4 +1,424 @@
|
||||
"""
|
||||
Wrappers and helpers around `rules_pkg` to build codeql packs.
|
||||
"""
|
||||
|
||||
load("@rules_pkg//pkg:install.bzl", "pkg_install")
|
||||
load("@rules_pkg//pkg:mappings.bzl", "pkg_attributes", "pkg_filegroup", "pkg_files", _strip_prefix = "strip_prefix")
|
||||
load("@rules_pkg//pkg:pkg.bzl", "pkg_zip")
|
||||
load("@rules_pkg//pkg:providers.bzl", "PackageFilegroupInfo", "PackageFilesInfo")
|
||||
load("@rules_python//python:defs.bzl", "py_binary")
|
||||
load("//misc/bazel:os.bzl", "OS_DETECTION_ATTRS", "os_select")
|
||||
|
||||
def _make_internal(name):
|
||||
def internal(suffix = "internal", *args):
|
||||
args = (name, suffix) + args
|
||||
return "-".join(args)
|
||||
|
||||
return internal
|
||||
|
||||
_PLAT_PLACEHOLDER = "{CODEQL_PLATFORM}"
|
||||
|
||||
def _expand_path(path, platform):
|
||||
if _PLAT_PLACEHOLDER in path:
|
||||
path = path.replace(_PLAT_PLACEHOLDER, platform)
|
||||
return ("arch", path)
|
||||
return ("generic", path)
|
||||
|
||||
def _detect_platform(ctx = None):
    # Map the current OS to the codeql platform directory name. Works both in
    # macro context (ctx = None, yields a select()) and in rule context.
    return os_select(ctx, linux = "linux64", macos = "osx64", windows = "win64")
|
||||
|
||||
def codeql_pkg_files(
        *,
        name,
        srcs = None,
        exes = None,
        visibility = None,
        **kwargs):
    """ Wrapper around `pkg_files` adding a distinction between `srcs` and `exes`, where the
    latter will get executable permissions (mode 755).

    Do not pass `attributes`; executability is the only supported attribute and is
    controlled via `exes`. Remaining `kwargs` are forwarded to `pkg_files`.
    """

    internal = _make_internal(name)
    if "attributes" in kwargs:
        fail("do not use attributes with codeql_pkg_* rules. Use `exes` to mark executable files.")
    internal_srcs = []
    if srcs and exes:
        # Mixed case: package the two groups separately (only `exes` get mode
        # 755) and re-expose them under `name` as a single pkg_filegroup.
        pkg_files(
            name = internal("srcs"),
            srcs = srcs,
            visibility = ["//visibility:private"],
            **kwargs
        )
        pkg_files(
            name = internal("exes"),
            srcs = exes,
            visibility = ["//visibility:private"],
            attributes = pkg_attributes(mode = "755"),
            **kwargs
        )
        pkg_filegroup(
            name = name,
            srcs = [internal("srcs"), internal("exes")],
            visibility = visibility,
        )
    else:
        # Only one of the two groups is present: a single pkg_files suffices.
        # NOTE(review): if both `srcs` and `exes` are None this creates a
        # pkg_files with srcs = None — confirm callers always pass one.
        pkg_files(
            name = name,
            srcs = srcs or exes,
            visibility = visibility,
            attributes = pkg_attributes(mode = "755") if exes else None,
            **kwargs
        )
||||
|
||||
def _extract_pkg_filegroup_impl(ctx):
    # Split a PackageFilegroupInfo into its "arch" or "generic" subset (see the
    # rule doc below), expanding {CODEQL_PLATFORM} in destination paths.
    src = ctx.attr.src[PackageFilegroupInfo]
    arch_overrides = ctx.attr.arch_overrides
    platform = _detect_platform(ctx)

    if src.pkg_dirs or src.pkg_symlinks:
        fail("`pkg_dirs` and `pkg_symlinks` are not supported for codeql packaging rules")

    pkg_files = []
    for pfi, origin in src.pkg_files:
        dest_src_map = {}
        for dest, file in pfi.dest_src_map.items():
            file_kind, expanded_dest = _expand_path(dest, platform)
            # Files listed in arch_overrides are forced into the arch subset
            # even though their install path carries no platform placeholder.
            if file_kind == "generic" and dest in arch_overrides:
                file_kind = "arch"
            if file_kind == ctx.attr.kind:
                dest_src_map[expanded_dest] = file

        # Keep per-origin attributes (e.g. file modes) intact on the subset.
        if dest_src_map:
            pkg_files.append((PackageFilesInfo(dest_src_map = dest_src_map, attributes = pfi.attributes), origin))

    files = [depset(pfi.dest_src_map.values()) for pfi, _ in pkg_files]
    return [
        PackageFilegroupInfo(pkg_files = pkg_files, pkg_dirs = [], pkg_symlinks = []),
        DefaultInfo(files = depset(transitive = files)),
    ]

_extract_pkg_filegroup = rule(
    implementation = _extract_pkg_filegroup_impl,
    doc = """
    This internal rule extracts the arch or generic part of a `PackageFilegroupInfo` source, returning a
    `PackageFilegroupInfo` that is a subset of the provided `src`, while expanding `{CODEQL_PLATFORM}` in
    destination paths to the relevant codeql platform (linux64, win64 or osx64).
    The distinction between generic and arch contents is given on a per-file basis depending on the install path
    containing {CODEQL_PLATFORM}, which will typically have been added by a `prefix` attribute to a `pkg_*` rule.
    Files that are arch-specific, but outside of the `CODEQL_PLATFORM` path can be specified in `arch_overrides`.
    No `pkg_dirs` or `pkg_symlink` must have been used for assembling the source mapping information: we could
    easily add support for that, but we don't require it for now.
    """,
    attrs = {
        "src": attr.label(providers = [PackageFilegroupInfo, DefaultInfo]),
        "kind": attr.string(doc = "What part to extract", values = ["generic", "arch"]),
        "arch_overrides": attr.string_list(doc = "A list of files that should be included in the arch package regardless of the path"),
    } | OS_DETECTION_ATTRS,
)
|
||||
|
||||
# Carries the mapping from each .zip artifact to the install prefix it should
# be extracted under; passed between the _zip_info* rules below.
_ZipInfo = provider(fields = {"zips_to_prefixes": "mapping of zip files to prefixes"})

def _zip_info_impl(ctx):
    # Flatten all files of each srcs target into one zip -> prefix mapping.
    zips = {}
    for zip_target, prefix in ctx.attr.srcs.items():
        for zip in zip_target.files.to_list():
            zips[zip] = prefix
    return [
        _ZipInfo(zips_to_prefixes = zips),
    ]

_zip_info = rule(
    implementation = _zip_info_impl,
    doc = """
    This internal rule simply instantiates a _ZipInfo provider out of `zips`.
    """,
    attrs = {
        "srcs": attr.label_keyed_string_dict(
            doc = "mapping from zip files to install prefixes",
            allow_files = [".zip"],
        ),
    },
)
|
||||
|
||||
def _zip_info_filter_impl(ctx):
    # Keep only the zips whose prefix matches the requested kind, expanding
    # {CODEQL_PLATFORM} in the prefixes that carry it.
    platform = _detect_platform(ctx)
    filtered_zips = {}
    for zip_info in ctx.attr.srcs:
        for zip, prefix in zip_info[_ZipInfo].zips_to_prefixes.items():
            zip_kind, expanded_prefix = _expand_path(prefix, platform)
            if zip_kind == ctx.attr.kind:
                filtered_zips[zip] = expanded_prefix
    return [
        _ZipInfo(zips_to_prefixes = filtered_zips),
    ]

_zip_info_filter = rule(
    implementation = _zip_info_filter_impl,
    doc = """
    This internal rule transforms a _ZipInfo provider so that:
    * only zips matching `kind` are included
    * a kind of a zip is given by its prefix: if it contains {CODEQL_PLATFORM} it is arch, otherwise it's generic
    * in the former case, {CODEQL_PLATFORM} is expanded
    """,
    attrs = {
        "srcs": attr.label_list(doc = "_ZipInfos to transform", providers = [_ZipInfo]),
        "kind": attr.string(doc = "Which zip kind to consider", values = ["generic", "arch"]),
    } | OS_DETECTION_ATTRS,
)
|
||||
|
||||
def _imported_zips_manifest_impl(ctx):
    """Write a `prefix:zip_path` manifest file and expose the zips as runfiles."""
    manifest_lines = []
    zip_files = []
    for src in ctx.attr.srcs:
        info = src[_ZipInfo]
        for zip_file, prefix in info.zips_to_prefixes.items():
            manifest_lines.append("%s:%s" % (prefix, zip_file.short_path))
            zip_files.append(zip_file)

    manifest = ctx.actions.declare_file(ctx.label.name + ".params")
    ctx.actions.write(manifest, "\n".join(manifest_lines))
    return DefaultInfo(
        files = depset([manifest]),
        # The zips themselves ride along as runfiles so the installer can read them.
        runfiles = ctx.runfiles(zip_files),
    )
|
||||
|
||||
# Internal rule producing the `.params` manifest consumed by the installer
# script (see implementation above).
_imported_zips_manifest = rule(
    implementation = _imported_zips_manifest_impl,
    doc = """
    This internal rule prints a zip manifest file that `misc/bazel/internal/install.py` understands.
    {CODEQL_PLATFORM} can be used as zip prefixes and will be expanded to the relevant codeql platform.
    """,
    attrs = {
        "srcs": attr.label_list(
            doc = "mappings from zip files to install prefixes in _ZipInfo format",
            providers = [_ZipInfo],
        ),
    },
)
|
||||
|
||||
def _zipmerge_impl(ctx):
    """Merge the `srcs` zips into a single `out` archive, mounted under `prefix`."""
    direct_inputs = []
    transitive_inputs = []
    merged = ctx.actions.declare_file(ctx.attr.out)
    # zipmerge command line: output path first, then (--prefix, zip...) groups.
    cmdline = [merged.path]
    for src in ctx.attr.srcs:
        if _ZipInfo not in src:
            # Plain `.zip` targets: all mounted directly under the common prefix.
            files = src.files.to_list()
            for f in files:
                if f.extension != "zip":
                    fail("%s file found while expecting a .zip file " % f.short_path)
            cmdline.append("--prefix=%s" % ctx.attr.prefix)
            cmdline.extend([f.path for f in files])
            transitive_inputs.append(src.files)
        else:
            # _ZipInfo sources carry a per-zip prefix, nested under the common one.
            for zip_file, zip_prefix in src[_ZipInfo].zips_to_prefixes.items():
                cmdline.append("--prefix=%s/%s" % (ctx.attr.prefix, zip_prefix.rstrip("/")))
                cmdline.append(zip_file.path)
                direct_inputs.append(zip_file)
    ctx.actions.run(
        outputs = [merged],
        executable = ctx.executable._zipmerge,
        inputs = depset(direct_inputs, transitive = transitive_inputs),
        arguments = cmdline,
    )

    return [DefaultInfo(files = depset([merged]))]
|
||||
|
||||
# Internal rule invoking the //misc/bazel/internal/zipmerge tool to combine
# plain `.zip` targets and _ZipInfo-provided zips into one archive.
_zipmerge = rule(
    implementation = _zipmerge_impl,
    doc = """
    This internal rule merges a zip files together
    """,
    attrs = {
        "srcs": attr.label_list(doc = "Zip file to include, either as straight up `.zip` files or `_ZipInfo` data"),
        "out": attr.string(doc = "output file name"),
        "prefix": attr.string(doc = "Prefix posix path to add to the zip contents in the archive"),
        "_zipmerge": attr.label(default = "//misc/bazel/internal/zipmerge", executable = True, cfg = "exec"),
    },
)
|
||||
|
||||
def _get_zip_filename(name_prefix, kind):
    """Return the archive file name for `kind` ("generic" or "arch")."""
    if kind != "arch":
        return "%s-generic.zip" % name_prefix
    return name_prefix + "-" + _detect_platform() + ".zip"  # using + because there's a select
|
||||
|
||||
def codeql_pack(
        *,
        name,
        srcs = None,
        zips = None,
        zip_filename = None,
        visibility = None,
        install_dest = "extractor-pack",
        compression_level = None,
        arch_overrides = None,
        zip_prefix = None,
        **kwargs):
    """
    Define a codeql pack. This macro accepts `pkg_files`, `pkg_filegroup` or their `codeql_*` counterparts as `srcs`.
    `zips` is a map from `.zip` files to prefixes to import.
    * defines a `<name>-generic-zip` target creating a `<zip_filename>-generic.zip` archive with the generic bits,
      prefixed with `zip_prefix`
    * defines a `<name>-arch-zip` target creating a `<zip_filename>-<codeql_platform>.zip` archive with the
      arch-specific bits, prefixed with `zip_prefix`
    * defines a runnable `<name>-installer` target that will install the pack in `install_dest`, relative to where the
      rule is used. The install destination can be overridden appending `-- --destdir=...` to the `bazel run`
      invocation. This installation _does not_ prefix the contents with `zip_prefix`.
    The prefix for the zip files can be set with `zip_prefix`, it is `name` by default.

    The distinction between arch-specific and generic contents is made based on whether the paths (including possible
    prefixes added by rules) contain the special `{CODEQL_PLATFORM}` placeholder, which in case it is present will also
    be replaced by the appropriate platform (`linux64`, `win64` or `osx64`).
    Specific file paths can be placed in the arch-specific package by adding them to `arch_overrides`, even if their
    path doesn't contain the `CODEQL_PLATFORM` placeholder.

    `compression_level` can be used to tweak the compression level used when creating archives. Consider that this
    does not affect the contents of `zips`, only `srcs`.
    """
    internal = _make_internal(name)
    # BUGFIX: `zip_filename` was previously normalized but never used — the zip
    # names were always derived from `name`, contradicting the docstring. It is
    # now actually passed to `_get_zip_filename` (default is still `name`).
    zip_filename = zip_filename or name
    zips = zips or {}
    if zip_prefix == None:
        zip_prefix = name

    # Gather all srcs into one group; the generic/arch split happens below.
    pkg_filegroup(
        name = internal("all"),
        srcs = srcs,
        visibility = ["//visibility:private"],
        **kwargs
    )
    if zips:
        _zip_info(
            name = internal("zip-info"),
            srcs = zips,
            visibility = ["//visibility:private"],
        )
    for kind in ("generic", "arch"):
        # Split `srcs` by kind based on the {CODEQL_PLATFORM} placeholder.
        _extract_pkg_filegroup(
            name = internal(kind),
            src = internal("all"),
            kind = kind,
            arch_overrides = arch_overrides,
            visibility = ["//visibility:private"],
        )
        if zips:
            # With imported zips: zip the srcs first, then merge the matching
            # imported zips into the final archive with _zipmerge.
            pkg_zip(
                name = internal(kind, "zip-base"),
                srcs = [internal(kind)],
                visibility = ["//visibility:private"],
                compression_level = compression_level,
            )
            _zip_info_filter(
                name = internal(kind, "zip-info"),
                kind = kind,
                srcs = [internal("zip-info")],
                visibility = ["//visibility:private"],
            )
            _zipmerge(
                name = internal(kind, "zip"),
                srcs = [internal(kind, "zip-base"), internal(kind, "zip-info")],
                out = _get_zip_filename(zip_filename, kind),
                prefix = zip_prefix,
                visibility = visibility,
            )
        else:
            # No imported zips: a single pkg_zip produces the final archive.
            pkg_zip(
                name = internal(kind, "zip"),
                srcs = [internal(kind)],
                visibility = visibility,
                package_dir = zip_prefix,
                package_file_name = _get_zip_filename(zip_filename, kind),
                compression_level = compression_level,
            )
    if zips:
        _imported_zips_manifest(
            name = internal("zip-manifest"),
            srcs = [internal("generic-zip-info"), internal("arch-zip-info")],
            visibility = ["//visibility:private"],
        )

    # Install script + runnable installer (`bazel run <name>-installer`).
    pkg_install(
        name = internal("script"),
        srcs = [internal("generic"), internal("arch")],
        visibility = ["//visibility:private"],
    )
    native.filegroup(
        # used to locate current src directory
        name = internal("build-file"),
        srcs = ["BUILD.bazel"],
        visibility = ["//visibility:private"],
    )
    py_binary(
        name = internal("installer"),
        srcs = [Label("//misc/bazel/internal:install.py")],
        main = Label("//misc/bazel/internal:install.py"),
        data = [
            internal("build-file"),
            internal("script"),
        ] + ([
            internal("zip-manifest"),
            Label("//misc/bazel/internal/ripunzip"),
        ] if zips else []),
        deps = ["@rules_python//python/runfiles"],
        args = [
            "--build-file=$(rlocationpath %s)" % internal("build-file"),
            "--pkg-install-script=$(rlocationpath %s)" % internal("script"),
            "--destdir",
            install_dest,
        ] + ([
            "--ripunzip=$(rlocationpath %s)" % Label("//misc/bazel/internal/ripunzip"),
            "--zip-manifest=$(rlocationpath %s)" % internal("zip-manifest"),
        ] if zips else []),
        visibility = visibility,
    )
    # `<name>` bundles both final archives.
    native.filegroup(
        name = name,
        srcs = [internal("generic-zip"), internal("arch-zip")],
    )
|
||||
|
||||
# Re-export the privately-loaded `_strip_prefix` so users of this module can
# refer to `strip_prefix` directly.
strip_prefix = _strip_prefix
|
||||
|
||||
def _runfiles_group_impl(ctx):
    """Expose the default runfiles of all `srcs` as this target's files."""
    runfile_depsets = []
    for target in ctx.attr.srcs:
        runfiles = target[DefaultInfo].default_runfiles
        if runfiles != None:
            runfile_depsets.append(runfiles.files)
    return [DefaultInfo(files = depset(transitive = runfile_depsets))]
|
||||
|
||||
# Internal rule flattening the default runfiles of `srcs` into plain files;
# used by codeql_pkg_runfiles below.
_runfiles_group = rule(
    implementation = _runfiles_group_impl,
    attrs = {
        "srcs": attr.label_list(),
    },
)
|
||||
|
||||
def codeql_pkg_runfiles(*, name, exes, **kwargs):
    """
    Create a `codeql_pkg_files` with all runfiles from files in `exes`, flattened together.
    """
    # Materialize the flattened runfiles under an internal target name, then
    # package that as executables.
    runfiles_target = _make_internal(name)("runfiles")
    _runfiles_group(
        name = runfiles_target,
        srcs = exes,
        visibility = ["//visibility:private"],
    )
    codeql_pkg_files(
        name = name,
        exes = [runfiles_target],
        **kwargs
    )
|
||||
|
||||
def _pkg_overlay_impl(ctx):
|
||||
destinations = {}
|
||||
|
||||
@@ -1,33 +0,0 @@
|
||||
load("@rules_pkg//pkg:mappings.bzl", "pkg_attributes", "pkg_files")
|
||||
|
||||
def _runfiles_group_impl(ctx):
|
||||
files = []
|
||||
for src in ctx.attr.srcs:
|
||||
rf = src[DefaultInfo].default_runfiles
|
||||
if rf != None:
|
||||
files.append(rf.files)
|
||||
return [
|
||||
DefaultInfo(
|
||||
files = depset(transitive = files),
|
||||
),
|
||||
]
|
||||
|
||||
_runfiles_group = rule(
|
||||
implementation = _runfiles_group_impl,
|
||||
attrs = {
|
||||
"srcs": attr.label_list(),
|
||||
},
|
||||
)
|
||||
|
||||
def pkg_runfiles(*, name, srcs, **kwargs):
|
||||
internal_name = "_%s_runfiles" % name
|
||||
_runfiles_group(
|
||||
name = internal_name,
|
||||
srcs = srcs,
|
||||
)
|
||||
kwargs.setdefault("attributes", pkg_attributes(mode = "0755"))
|
||||
pkg_files(
|
||||
name = name,
|
||||
srcs = [internal_name],
|
||||
**kwargs
|
||||
)
|
||||
Reference in New Issue
Block a user