Compare commits

...

1 Commits

Author SHA1 Message Date
Paolo Tranquilli
fc956eeed9 CI: use git-lfs fork for git_lfs_probe.py 2024-11-12 17:02:28 +01:00
3 changed files with 92 additions and 173 deletions

View File

@@ -166,6 +166,21 @@ go_deps = use_extension("@gazelle//:extensions.bzl", "go_deps")
go_deps.from_file(go_mod = "//go/extractor:go.mod")
use_repo(go_deps, "org_golang_x_mod", "org_golang_x_tools")
git_lfs_binary = use_repo_rule("//misc/bazel:lfs.bzl", "git_lfs_binary")
# To update: check out dsp-testing/codeql-git-lfs, make your changes there, and push a tag with
# `git tag $(git describe)-ls-urls && git push --tags`.
# Then wait for the resulting Actions run (like https://github.com/dsp-testing/codeql-git-lfs/actions/runs/11800398535) to finish,
# and copy the version and checksums below from https://github.com/dsp-testing/codeql-git-lfs/releases/latest
# Pinned git-lfs build: `version` and the per-platform checksums must be updated together.
git_lfs_binary(
    name = "git-lfs",
    version = "v3.5.0-179-gfd031ea1",
    sha256_linux = "08b75033a98f77f7e60b0928e160a6f0a5c5cd9d91b8605537969eec6980219a",
    sha256_macos_arm64 = "8a17c488c975dbd050610a0b2692567064dbfef33b6c58ee89ea02f649cc0114",
    sha256_macos_x86 = "9fc7265c5345901ca5cb83707ed5374fc6dfbf7ed45d2c047d5929bfe0b5f64a",
    sha256_windows = "ef2f5794667584b155786291d4f839c59bfe10fcc5f870902c64f3063ffd9923",
)
lfs_files = use_repo_rule("//misc/bazel:lfs.bzl", "lfs_files")
lfs_files(

View File

@@ -24,39 +24,19 @@ from dataclasses import dataclass
import argparse
def options():
def resolved_path(path):
return pathlib.Path(path).expanduser().resolve()
p = argparse.ArgumentParser(description=__doc__)
p.add_argument("--hash-only", action="store_true")
p.add_argument("sources", type=pathlib.Path, nargs="+")
return p.parse_args()
TIMEOUT = 20
def warn(message: str) -> None:
    """Write ``message`` to standard error as a single ``WARNING: ``-prefixed line."""
    text = f"WARNING: {message}"
    print(text, file=sys.stderr)
@dataclass
class Endpoint:
    """A named LFS endpoint: its address, optional SSH target and request headers."""

    name: str
    href: str
    ssh: typing.Optional[str] = None
    # every instance gets its own, initially empty, header mapping
    headers: typing.Dict[str, str] = dataclasses.field(default_factory=dict)

    def update_headers(self, d: typing.Iterable[typing.Tuple[str, str]]):
        """Merge ``(name, value)`` pairs into ``headers``.

        Each name goes through ``str.capitalize`` first, so differently-cased
        spellings of the same header end up under a single key.
        """
        for key, value in d:
            self.headers[key.capitalize()] = value
class NoEndpointsFound(Exception):
    """Raised when none of the LFS endpoints produced a usable response."""
opts = options()
sources = [p.resolve() for p in opts.sources]
source_dir = pathlib.Path(os.path.commonpath(src.parent for src in sources))
source_dir = subprocess.check_output(
["git", "rev-parse", "--show-toplevel"], cwd=source_dir, text=True
).strip()
excl = p.add_mutually_exclusive_group(required=True)
excl.add_argument("--hash-only", action="store_true")
excl.add_argument("--git-lfs", type=resolved_path)
p.add_argument("sources", type=resolved_path, nargs="+")
opts = p.parse_args()
source_dir = pathlib.Path(os.path.commonpath(src.parent for src in opts.sources))
opts.source_dir = subprocess.check_output(
["git", "rev-parse", "--show-toplevel"], cwd=source_dir, text=True
).strip()
return opts
def get_env(s: str, sep: str = "=") -> typing.Iterable[typing.Tuple[str, str]]:
@@ -64,161 +44,37 @@ def get_env(s: str, sep: str = "=") -> typing.Iterable[typing.Tuple[str, str]]:
yield m.groups()
def git(*args, **kwargs):
    """Run ``git`` with ``args`` inside ``source_dir``.

    Extra keyword arguments are forwarded to ``subprocess.run``.

    Returns the command's stripped standard output, or ``None`` when git
    exits with a non-zero status.
    """
    command = ("git", *args)
    completed = subprocess.run(
        command, stdout=subprocess.PIPE, text=True, cwd=source_dir, **kwargs
    )
    if completed.returncode != 0:
        return None
    return completed.stdout.strip()
# matches the `Endpoint` / `Endpoint (<name>)` keys, capturing the optional name
endpoint_re = re.compile(r"^Endpoint(?: \((.*)\))?$")


def get_endpoint_addresses() -> typing.Iterable[Endpoint]:
    """Yield one ``Endpoint`` per endpoint entry listed by ``git lfs env``.

    An SSH entry that follows an endpoint entry is attached to that endpoint
    before it is yielded. Unnamed endpoints are called ``default``.
    """
    raw_env = subprocess.check_output(["git", "lfs", "env"], text=True, cwd=source_dir)
    pending = None
    for key, value in get_env(raw_env):
        matched = endpoint_re.match(key)
        if matched is None:
            if key == " SSH" and pending:
                pending.ssh = value
            continue
        # a new endpoint entry starts: the previous one is complete, hand it out
        if pending:
            yield pending
        # only the text before the first space of the value is the address
        address = value.partition(" ")[0]
        pending = Endpoint(name=matched[1] or "default", href=address)
    if pending:
        yield pending
def get_endpoints() -> typing.Iterable[Endpoint]:
    """Yield the LFS endpoints for which request headers could be prepared.

    For every endpoint reported by ``get_endpoint_addresses`` this sets the
    LFS JSON content headers and then looks for authorization, in order:

    1. for SSH endpoints, the ``git-lfs-authenticate`` response obtained over
       ssh (which may also replace the endpoint address);
    2. the ``http.<scheme>://<host>/.extraheader`` git configuration entry;
    3. the ``GITHUB_TOKEN`` environment variable, which overrides any
       ``Authorization`` header found so far;
    4. ``git credential fill``, used only when no ``Authorization`` header
       has been found yet.

    Endpoints whose ssh authentication fails or times out, or for which no
    authorization can be found, are skipped with a warning.
    """
    for endpoint in get_endpoint_addresses():
        endpoint.headers = {
            "Content-Type": "application/vnd.git-lfs+json",
            "Accept": "application/vnd.git-lfs+json",
        }
        if endpoint.ssh:
            # see https://github.com/git-lfs/git-lfs/blob/main/docs/api/authentication.md
            server, _, path = endpoint.ssh.partition(":")
            ssh_command = shutil.which(
                os.environ.get("GIT_SSH", os.environ.get("GIT_SSH_COMMAND", "ssh"))
            )
            assert ssh_command, "no ssh command found"
            cmd = [
                ssh_command,
                "-oStrictHostKeyChecking=accept-new",
                server,
                "git-lfs-authenticate",
                path,
                "download",
            ]
            try:
                res = subprocess.run(cmd, stdout=subprocess.PIPE, timeout=TIMEOUT)
            except subprocess.TimeoutExpired:
                warn(f"ssh timed out when connecting to {server}, ignoring {endpoint.name} endpoint")
                continue
            if res.returncode != 0:
                warn(f"ssh failed when connecting to {server}, ignoring {endpoint.name} endpoint")
                continue
            ssh_resp = json.loads(res.stdout)
            # keep the configured address when the response carries no "href"
            # (the fallback must be the address string, not the Endpoint object)
            endpoint.href = ssh_resp.get("href", endpoint.href)
            endpoint.update_headers(ssh_resp.get("header", {}).items())
        url = urlparse(endpoint.href)
        # this is how actions/checkout persist credentials
        # see https://github.com/actions/checkout/blob/44c2b7a8a4ea60a981eaca3cf939b5f4305c123b/src/git-auth-helper.ts#L56-L63
        auth = git("config", f"http.{url.scheme}://{url.netloc}/.extraheader") or ""
        endpoint.update_headers(get_env(auth, sep=": "))
        if os.environ.get("GITHUB_TOKEN"):
            endpoint.headers["Authorization"] = f"token {os.environ['GITHUB_TOKEN']}"
        if "Authorization" not in endpoint.headers:
            # last chance: use git credentials (possibly backed by a credential helper like the one installed by gh)
            # see https://git-scm.com/docs/git-credential
            # no `check=True` here: it would make a failing `git credential fill` raise
            # CalledProcessError instead of returning None, so the skip-with-warning
            # branch below could never run
            credentials = git(
                "credential",
                "fill",
                # drop leading / from url.path
                input=f"protocol={url.scheme}\nhost={url.netloc}\npath={url.path[1:]}\n",
            )
            if credentials is None:
                warn(f"no authorization method found, ignoring {endpoint.name} endpoint")
                continue
            credentials = dict(get_env(credentials))
            auth = base64.b64encode(
                f'{credentials["username"]}:{credentials["password"]}'.encode()
            ).decode("ascii")
            endpoint.headers["Authorization"] = f"Basic {auth}"
        yield endpoint
# see https://github.com/git-lfs/git-lfs/blob/310d1b4a7d01e8d9d884447df4635c7a9c7642c2/docs/api/basic-transfers.md
# Maps each entry of `objects` to one output string: "local" for falsy entries, otherwise
# the object's oid, optionally followed by a download URL.
# NOTE(review): this listing carries two `def get_locations` headers and two alternative
# lookup bodies (an HTTP batch request and a `git-lfs ls-urls` call); it looks like the
# removed and added sides of a diff shown without +/- markers — confirm against the real file.
def get_locations(objects):
def get_locations(objects, opts):
# every slot starts as "local"; only slots of truthy objects are overwritten below
ret = ["local" for _ in objects]
indexes = [i for i, o in enumerate(objects) if o]
if not indexes:
# all objects are local, do not send an empty request as that would be an error
return ret
# hash-only mode: report just the oid, without looking up any URL
if opts.hash_only:
for i in indexes:
ret[i] = objects[i]["oid"]
return ret
# payload of a "download" batch request covering all non-local objects
data = {
"operation": "download",
"transfers": ["basic"],
"objects": [objects[i] for i in indexes],
"hash_algo": "sha256",
}
# try the endpoints in turn and return on the first one that gives a complete answer
for endpoint in get_endpoints():
req = urllib.request.Request(
f"{endpoint.href}/objects/batch",
headers=endpoint.headers,
data=json.dumps(data).encode("ascii"),
)
try:
with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
# NOTE(review): `data` is rebound from the request payload to the response here, so after
# a malformed response the next endpoint would be sent that response as its request body,
# and `len(data)` in the assertion message below counts top-level keys, not objects
data = json.load(resp)
assert len(data["objects"]) == len(
indexes
), f"received {len(data)} objects, expected {len(indexes)}"
for i, resp in zip(indexes, data["objects"]):
ret[i] = f'{resp["oid"]} {resp["actions"]["download"]["href"]}'
return ret
except urllib.error.URLError as e:
warn(f"encountered {type(e).__name__} {e}, ignoring endpoint {endpoint.name}")
continue
except KeyError:
warn(f"encountered malformed response, ignoring endpoint {endpoint.name}:\n{json.dumps(data, indent=2)}")
continue
# no endpoint produced a usable response
raise NoEndpointsFound
else:
# ask the `opts.git_lfs` executable for the URLs of all non-local paths in one call
cmd = [opts.git_lfs, "ls-urls", "--json"]
cmd.extend(objects[i]["path"] for i in indexes)
data = json.loads(subprocess.check_output(cmd, cwd=opts.source_dir))
# assumes "files" comes back in the same order as the paths passed — TODO confirm
for i, f in zip(indexes, data["files"]):
ret[i] = f'{f["oid"]} {f["url"]}'
return ret
# Reads `path` as a git-lfs pointer file.
# Returns None when the file does not start with the LFS spec header; otherwise a dict
# describing the pointed-to object, with the "sha256:" prefix removed from its oid.
def get_lfs_object(path):
with open(path, "rb") as fileobj:
lfs_header = "version https://git-lfs.github.com/spec".encode()
# read exactly as many bytes as the expected header has
actual_header = fileobj.read(len(lfs_header))
sha256 = size = None
# no LFS header: the file is not a pointer
if lfs_header != actual_header:
return None
# the rest of the file is handed to get_env with a space separator and turned into a dict
# (presumably one key/value pair per line; get_env's body is not fully visible here)
data = dict(get_env(fileobj.read().decode("ascii"), sep=" "))
assert data["oid"].startswith("sha256:"), f"unknown oid type: {data['oid']}"
_, _, sha256 = data["oid"].partition(":")
size = int(data["size"])
# NOTE(review): two consecutive returns with different shapes (oid+size vs path+oid); this
# looks like the removed and added sides of a diff shown without +/- markers — confirm
# which one the current file uses
return {"oid": sha256, "size": size}
return {"path": path, "oid": sha256}
# Script entry: prints one line per source file, as produced by get_locations.
# NOTE(review): a top-level `try:` using the global `sources` and a `def main()` using
# `opts.sources` appear side by side; this looks like the removed and added sides of a diff
# shown without +/- markers — confirm against the real file.
try:
objects = [get_lfs_object(src) for src in sources]
for resp in get_locations(objects):
def main():
opts = options()
objects = [get_lfs_object(src) for src in opts.sources]
for resp in get_locations(objects, opts):
print(resp)
# no endpoint worked: print manual recovery instructions to stderr and exit with status 1
except NoEndpointsFound as e:
print("""\
ERROR: no valid endpoints found, your git authentication method might be currently unsupported by this script.
You can bypass this error by running from semmle-code (this might take a while):
git config lfs.fetchexclude ""
git -C ql config lfs.fetchinclude \\*
git lfs fetch && git lfs checkout
cd ql
git lfs fetch && git lfs checkout""", file=sys.stderr)
sys.exit(1)
# run main() only when executed as a script
if __name__ == "__main__":
main()

View File

@@ -2,13 +2,16 @@ def lfs_smudge(repository_ctx, srcs, *, extract = False, stripPrefix = None, exe
python = repository_ctx.which("python3") or repository_ctx.which("python")
if not python:
fail("Neither python3 nor python executables found")
script = Label("//misc/bazel/internal:git_lfs_probe.py")
script = repository_ctx.path(Label("//misc/bazel/internal:git_lfs_probe.py"))
git_lfs_binary = repository_ctx.path(Label("@git-lfs"))
def probe(srcs, hash_only = False):
repository_ctx.report_progress("querying LFS url(s) for: %s" % ", ".join([src.basename for src in srcs]))
cmd = [python, script]
if hash_only:
cmd.append("--hash-only")
else:
cmd += ["--git-lfs", git_lfs_binary]
cmd.extend(srcs)
res = repository_ctx.execute(cmd, quiet = True)
if res.return_code != 0:
@@ -102,3 +105,48 @@ lfs_files = repository_rule(
"executable": attr.bool(doc = "Whether files should be marked as executable"),
},
)
def _lfs_binary_impl(repository_ctx):
    """Downloads the git-lfs release executable matching the host into this repository.

    The platform is chosen from `repository_ctx.os`: Windows and macOS are recognized by
    name prefix, and any other OS is treated as linux-amd64. The download is verified
    against the matching `sha256_*` attribute, marked executable and exposed through a
    generated `BUILD.bazel`.

    Args:
        repository_ctx: the repository context of the rule being instantiated.
    """
    os_name = repository_ctx.os.name
    suffix = ""
    if os_name.startswith("windows"):
        arch = "windows-amd64"
        sha256 = repository_ctx.attr.sha256_windows
        suffix = ".exe"
    elif os_name.startswith("mac"):
        # `os.arch` is the lower-cased Java `os.arch` property, which is "x86_64" on Intel
        # Macs; comparing against "x86" alone would send them the arm64 binary
        if repository_ctx.os.arch in ("x86", "x86_64", "amd64"):
            arch = "darwin-amd64"
            sha256 = repository_ctx.attr.sha256_macos_x86
        else:
            arch = "darwin-arm64"
            sha256 = repository_ctx.attr.sha256_macos_arm64
    else:
        arch = "linux-amd64"
        sha256 = repository_ctx.attr.sha256_linux
    url = "https://github.com/dsp-testing/codeql-git-lfs/releases/download/%s/git-lfs-%s%s" % (
        repository_ctx.attr.version,
        arch,
        suffix,
    )
    exe = "git-lfs" + suffix
    repository_ctx.download(
        url = url,
        output = exe,
        sha256 = sha256,
        executable = True,
    )

    # keep only the part of the repository name after the last "+"
    name = repository_ctx.name.split("+")[-1]
    if suffix:
        # the file carries a suffix here, so a filegroup named after the repository wraps it
        # (presumably so the same label works on every platform — confirm with callers)
        repository_ctx.file("BUILD.bazel", "filegroup(name = %r, srcs = [%r], visibility = ['//visibility:public'])" % (name, exe))
    else:
        repository_ctx.file("BUILD.bazel", "exports_files([%r])" % exe)
# Repository rule fetching a prebuilt git-lfs executable for the host platform.
# `version` selects the release; each `sha256_*` is the checksum of one platform's binary.
git_lfs_binary = repository_rule(
    implementation = _lfs_binary_impl,
    attrs = {
        "version": attr.string(mandatory = True),
        "sha256_linux": attr.string(mandatory = True),
        "sha256_macos_x86": attr.string(mandatory = True),
        "sha256_macos_arm64": attr.string(mandatory = True),
        "sha256_windows": attr.string(mandatory = True),
    },
)