Merge pull request #21797 from github/tausbn/yeast-desugaring-tool

Shared: Add YEAST desugaring library
This commit is contained in:
Taus
2026-05-07 13:48:12 +02:00
committed by GitHub
50 changed files with 5132 additions and 61 deletions

46
Cargo.lock generated
View File

@@ -240,9 +240,9 @@ dependencies = [
[[package]]
name = "cc"
version = "1.2.37"
version = "1.2.61"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "65193589c6404eb80b450d618eaf9a2cafaaafd57ecce47370519ef674a7bd44"
checksum = "d16d90359e986641506914ba71350897565610e87ce0ad9e6f28569db3dd5c6d"
dependencies = [
"find-msvc-tools",
"jobserver",
@@ -416,6 +416,7 @@ dependencies = [
"tree-sitter",
"tree-sitter-json",
"tree-sitter-ql",
"yeast",
"zstd",
]
@@ -754,9 +755,9 @@ dependencies = [
[[package]]
name = "find-msvc-tools"
version = "0.1.1"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7fd99930f64d146689264c637b5af2f0233a933bef0d8570e2526bf9e083192d"
checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
[[package]]
name = "fixedbitset"
@@ -2853,9 +2854,9 @@ dependencies = [
[[package]]
name = "tree-sitter"
version = "0.25.9"
version = "0.26.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccd2a058a86cfece0bf96f7cce1021efef9c8ed0e892ab74639173e5ed7a34fa"
checksum = "887bd495d0582c5e3e0d8ece2233666169fa56a9644d172fc22ad179ab2d0538"
dependencies = [
"cc",
"regex",
@@ -2891,6 +2892,16 @@ version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4013970217383f67b18aef68f6fb2e8d409bc5755227092d32efb0422ba24b8"
[[package]]
name = "tree-sitter-python"
version = "0.23.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d065aaa27f3aaceaf60c1f0e0ac09e1cb9eb8ed28e7bcdaa52129cffc7f4b04"
dependencies = [
"cc",
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-ql"
version = "0.23.1"
@@ -3367,6 +3378,29 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049"
[[package]]
name = "yeast"
version = "0.1.0"
dependencies = [
"clap",
"serde",
"serde_json",
"serde_yaml",
"tree-sitter",
"tree-sitter-python",
"tree-sitter-ruby",
"yeast-macros",
]
[[package]]
name = "yeast-macros"
version = "0.1.0"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "yoke"
version = "0.8.0"

View File

@@ -4,6 +4,8 @@
resolver = "2"
members = [
"shared/tree-sitter-extractor",
"shared/yeast",
"shared/yeast-macros",
"ruby/extractor",
"rust/extractor",
"rust/extractor/macros",

View File

@@ -141,14 +141,16 @@ use_repo(
"vendor_ts__serde-1.0.228",
"vendor_ts__serde_json-1.0.145",
"vendor_ts__serde_with-3.14.1",
"vendor_ts__serde_yaml-0.9.34-deprecated",
"vendor_ts__syn-2.0.106",
"vendor_ts__toml-0.9.7",
"vendor_ts__tracing-0.1.41",
"vendor_ts__tracing-flame-0.2.0",
"vendor_ts__tracing-subscriber-0.3.20",
"vendor_ts__tree-sitter-0.25.9",
"vendor_ts__tree-sitter-0.26.8",
"vendor_ts__tree-sitter-embedded-template-0.25.0",
"vendor_ts__tree-sitter-json-0.24.8",
"vendor_ts__tree-sitter-python-0.23.6",
"vendor_ts__tree-sitter-ql-0.23.1",
"vendor_ts__tree-sitter-ruby-0.23.1",
"vendor_ts__triomphe-0.1.14",

View File

@@ -529,6 +529,18 @@ alias(
tags = ["manual"],
)
alias(
name = "serde_yaml-0.9.34+deprecated",
actual = "@vendor_ts__serde_yaml-0.9.34-deprecated//:serde_yaml",
tags = ["manual"],
)
alias(
name = "serde_yaml",
actual = "@vendor_ts__serde_yaml-0.9.34-deprecated//:serde_yaml",
tags = ["manual"],
)
alias(
name = "syn-2.0.106",
actual = "@vendor_ts__syn-2.0.106//:syn",
@@ -590,14 +602,14 @@ alias(
)
alias(
name = "tree-sitter-0.25.9",
actual = "@vendor_ts__tree-sitter-0.25.9//:tree_sitter",
name = "tree-sitter-0.26.8",
actual = "@vendor_ts__tree-sitter-0.26.8//:tree_sitter",
tags = ["manual"],
)
alias(
name = "tree-sitter",
actual = "@vendor_ts__tree-sitter-0.25.9//:tree_sitter",
actual = "@vendor_ts__tree-sitter-0.26.8//:tree_sitter",
tags = ["manual"],
)
@@ -625,6 +637,18 @@ alias(
tags = ["manual"],
)
alias(
name = "tree-sitter-python-0.23.6",
actual = "@vendor_ts__tree-sitter-python-0.23.6//:tree_sitter_python",
tags = ["manual"],
)
alias(
name = "tree-sitter-python",
actual = "@vendor_ts__tree-sitter-python-0.23.6//:tree_sitter_python",
tags = ["manual"],
)
alias(
name = "tree-sitter-ql-0.23.1",
actual = "@vendor_ts__tree-sitter-ql-0.23.1//:tree_sitter_ql",

View File

@@ -96,9 +96,9 @@ rust_library(
"@rules_rust//rust/platform:x86_64-unknown-uefi": [],
"//conditions:default": ["@platforms//:incompatible"],
}),
version = "1.2.37",
version = "1.2.61",
deps = [
"@vendor_ts__find-msvc-tools-0.1.1//:find_msvc_tools",
"@vendor_ts__find-msvc-tools-0.1.9//:find_msvc_tools",
"@vendor_ts__jobserver-0.1.34//:jobserver",
"@vendor_ts__shlex-1.3.0//:shlex",
] + select({

View File

@@ -93,5 +93,5 @@ rust_library(
"@rules_rust//rust/platform:x86_64-unknown-uefi": [],
"//conditions:default": ["@platforms//:incompatible"],
}),
version = "0.1.1",
version = "0.1.9",
)

View File

@@ -154,7 +154,7 @@ cargo_build_script(
version = "0.1.2",
visibility = ["//visibility:private"],
deps = [
"@vendor_ts__cc-1.2.37//:cc",
"@vendor_ts__cc-1.2.61//:cc",
],
)

View File

@@ -101,12 +101,12 @@ rust_library(
"@rules_rust//rust/platform:x86_64-unknown-uefi": [],
"//conditions:default": ["@platforms//:incompatible"],
}),
version = "0.25.9",
version = "0.26.8",
deps = [
"@vendor_ts__regex-1.11.3//:regex",
"@vendor_ts__regex-syntax-0.8.6//:regex_syntax",
"@vendor_ts__streaming-iterator-0.1.9//:streaming_iterator",
"@vendor_ts__tree-sitter-0.25.9//:build_script_build",
"@vendor_ts__tree-sitter-0.26.8//:build_script_build",
"@vendor_ts__tree-sitter-language-0.1.5//:tree_sitter_language",
],
)
@@ -164,10 +164,10 @@ cargo_build_script(
"noclippy",
"norustfmt",
],
version = "0.25.9",
version = "0.26.8",
visibility = ["//visibility:private"],
deps = [
"@vendor_ts__cc-1.2.37//:cc",
"@vendor_ts__cc-1.2.61//:cc",
"@vendor_ts__serde_json-1.0.145//:serde_json",
],
)

View File

@@ -155,7 +155,7 @@ cargo_build_script(
version = "0.25.0",
visibility = ["//visibility:private"],
deps = [
"@vendor_ts__cc-1.2.37//:cc",
"@vendor_ts__cc-1.2.61//:cc",
],
)

View File

@@ -155,7 +155,7 @@ cargo_build_script(
version = "0.24.8",
visibility = ["//visibility:private"],
deps = [
"@vendor_ts__cc-1.2.37//:cc",
"@vendor_ts__cc-1.2.61//:cc",
],
)

View File

@@ -0,0 +1,166 @@
###############################################################################
# @generated
# DO NOT MODIFY: This file is auto-generated by a crate_universe tool. To
# regenerate this file, run the following:
#
# bazel run @@//misc/bazel/3rdparty:vendor_tree_sitter_extractors
###############################################################################
load(
"@rules_rust//cargo:defs.bzl",
"cargo_build_script",
"cargo_toml_env_vars",
)
load("@rules_rust//rust:defs.bzl", "rust_library")
package(default_visibility = ["//visibility:public"])
cargo_toml_env_vars(
name = "cargo_toml_env_vars",
src = "Cargo.toml",
)
rust_library(
name = "tree_sitter_python",
srcs = glob(
include = ["**/*.rs"],
allow_empty = True,
),
compile_data = glob(
include = ["**"],
allow_empty = True,
exclude = [
"**/* *",
".tmp_git_root/**/*",
"BUILD",
"BUILD.bazel",
"WORKSPACE",
"WORKSPACE.bazel",
],
),
crate_root = "bindings/rust/lib.rs",
edition = "2021",
rustc_env_files = [
":cargo_toml_env_vars",
],
rustc_flags = [
"--cap-lints=allow",
],
tags = [
"cargo-bazel",
"crate-name=tree-sitter-python",
"manual",
"noclippy",
"norustfmt",
],
target_compatible_with = select({
"@rules_rust//rust/platform:aarch64-apple-darwin": [],
"@rules_rust//rust/platform:aarch64-apple-ios": [],
"@rules_rust//rust/platform:aarch64-apple-ios-sim": [],
"@rules_rust//rust/platform:aarch64-linux-android": [],
"@rules_rust//rust/platform:aarch64-pc-windows-msvc": [],
"@rules_rust//rust/platform:aarch64-unknown-fuchsia": [],
"@rules_rust//rust/platform:aarch64-unknown-linux-gnu": [],
"@rules_rust//rust/platform:aarch64-unknown-nixos-gnu": [],
"@rules_rust//rust/platform:aarch64-unknown-nto-qnx710": [],
"@rules_rust//rust/platform:aarch64-unknown-uefi": [],
"@rules_rust//rust/platform:arm-unknown-linux-gnueabi": [],
"@rules_rust//rust/platform:arm-unknown-linux-musleabi": [],
"@rules_rust//rust/platform:armv7-linux-androideabi": [],
"@rules_rust//rust/platform:armv7-unknown-linux-gnueabi": [],
"@rules_rust//rust/platform:i686-apple-darwin": [],
"@rules_rust//rust/platform:i686-linux-android": [],
"@rules_rust//rust/platform:i686-pc-windows-msvc": [],
"@rules_rust//rust/platform:i686-unknown-freebsd": [],
"@rules_rust//rust/platform:i686-unknown-linux-gnu": [],
"@rules_rust//rust/platform:powerpc-unknown-linux-gnu": [],
"@rules_rust//rust/platform:riscv32imc-unknown-none-elf": [],
"@rules_rust//rust/platform:riscv64gc-unknown-linux-gnu": [],
"@rules_rust//rust/platform:riscv64gc-unknown-none-elf": [],
"@rules_rust//rust/platform:s390x-unknown-linux-gnu": [],
"@rules_rust//rust/platform:thumbv7em-none-eabi": [],
"@rules_rust//rust/platform:thumbv8m.main-none-eabi": [],
"@rules_rust//rust/platform:wasm32-unknown-emscripten": [],
"@rules_rust//rust/platform:wasm32-unknown-unknown": [],
"@rules_rust//rust/platform:wasm32-wasip1": [],
"@rules_rust//rust/platform:wasm32-wasip1-threads": [],
"@rules_rust//rust/platform:wasm32-wasip2": [],
"@rules_rust//rust/platform:x86_64-apple-darwin": [],
"@rules_rust//rust/platform:x86_64-apple-ios": [],
"@rules_rust//rust/platform:x86_64-linux-android": [],
"@rules_rust//rust/platform:x86_64-pc-windows-msvc": [],
"@rules_rust//rust/platform:x86_64-unknown-freebsd": [],
"@rules_rust//rust/platform:x86_64-unknown-fuchsia": [],
"@rules_rust//rust/platform:x86_64-unknown-linux-gnu": [],
"@rules_rust//rust/platform:x86_64-unknown-nixos-gnu": [],
"@rules_rust//rust/platform:x86_64-unknown-none": [],
"@rules_rust//rust/platform:x86_64-unknown-uefi": [],
"//conditions:default": ["@platforms//:incompatible"],
}),
version = "0.23.6",
deps = [
"@vendor_ts__tree-sitter-language-0.1.5//:tree_sitter_language",
"@vendor_ts__tree-sitter-python-0.23.6//:build_script_build",
],
)
cargo_build_script(
name = "_bs",
srcs = glob(
include = ["**/*.rs"],
allow_empty = True,
),
compile_data = glob(
include = ["**"],
allow_empty = True,
exclude = [
"**/* *",
"**/*.rs",
".tmp_git_root/**/*",
"BUILD",
"BUILD.bazel",
"WORKSPACE",
"WORKSPACE.bazel",
],
),
crate_name = "build_script_build",
crate_root = "bindings/rust/build.rs",
data = glob(
include = ["**"],
allow_empty = True,
exclude = [
"**/* *",
".tmp_git_root/**/*",
"BUILD",
"BUILD.bazel",
"WORKSPACE",
"WORKSPACE.bazel",
],
),
edition = "2021",
pkg_name = "tree-sitter-python",
rustc_env_files = [
":cargo_toml_env_vars",
],
rustc_flags = [
"--cap-lints=allow",
],
tags = [
"cargo-bazel",
"crate-name=tree-sitter-python",
"manual",
"noclippy",
"norustfmt",
],
version = "0.23.6",
visibility = ["//visibility:private"],
deps = [
"@vendor_ts__cc-1.2.61//:cc",
],
)
alias(
name = "build_script_build",
actual = ":_bs",
tags = ["manual"],
)

View File

@@ -155,7 +155,7 @@ cargo_build_script(
version = "0.23.1",
visibility = ["//visibility:private"],
deps = [
"@vendor_ts__cc-1.2.37//:cc",
"@vendor_ts__cc-1.2.61//:cc",
],
)

View File

@@ -155,7 +155,7 @@ cargo_build_script(
version = "0.23.1",
visibility = ["//visibility:private"],
deps = [
"@vendor_ts__cc-1.2.37//:cc",
"@vendor_ts__cc-1.2.61//:cc",
],
)

View File

@@ -165,7 +165,7 @@ cargo_build_script(
version = "2.0.16+zstd.1.5.7",
visibility = ["//visibility:private"],
deps = [
"@vendor_ts__cc-1.2.37//:cc",
"@vendor_ts__cc-1.2.61//:cc",
"@vendor_ts__pkg-config-0.3.32//:pkg_config",
],
)

View File

@@ -303,7 +303,7 @@ _NORMAL_DEPENDENCIES = {
"serde_json": Label("@vendor_ts__serde_json-1.0.145//:serde_json"),
"tracing": Label("@vendor_ts__tracing-0.1.41//:tracing"),
"tracing-subscriber": Label("@vendor_ts__tracing-subscriber-0.3.20//:tracing_subscriber"),
"tree-sitter": Label("@vendor_ts__tree-sitter-0.25.9//:tree_sitter"),
"tree-sitter": Label("@vendor_ts__tree-sitter-0.26.8//:tree_sitter"),
"tree-sitter-embedded-template": Label("@vendor_ts__tree-sitter-embedded-template-0.25.0//:tree_sitter_embedded_template"),
"tree-sitter-ruby": Label("@vendor_ts__tree-sitter-ruby-0.23.1//:tree_sitter_ruby"),
},
@@ -381,10 +381,28 @@ _NORMAL_DEPENDENCIES = {
"serde_json": Label("@vendor_ts__serde_json-1.0.145//:serde_json"),
"tracing": Label("@vendor_ts__tracing-0.1.41//:tracing"),
"tracing-subscriber": Label("@vendor_ts__tracing-subscriber-0.3.20//:tracing_subscriber"),
"tree-sitter": Label("@vendor_ts__tree-sitter-0.25.9//:tree_sitter"),
"tree-sitter": Label("@vendor_ts__tree-sitter-0.26.8//:tree_sitter"),
"zstd": Label("@vendor_ts__zstd-0.13.3//:zstd"),
},
},
"shared/yeast": {
_COMMON_CONDITION: {
"clap": Label("@vendor_ts__clap-4.5.48//:clap"),
"serde": Label("@vendor_ts__serde-1.0.228//:serde"),
"serde_json": Label("@vendor_ts__serde_json-1.0.145//:serde_json"),
"serde_yaml": Label("@vendor_ts__serde_yaml-0.9.34-deprecated//:serde_yaml"),
"tree-sitter": Label("@vendor_ts__tree-sitter-0.26.8//:tree_sitter"),
"tree-sitter-python": Label("@vendor_ts__tree-sitter-python-0.23.6//:tree_sitter_python"),
"tree-sitter-ruby": Label("@vendor_ts__tree-sitter-ruby-0.23.1//:tree_sitter_ruby"),
},
},
"shared/yeast-macros": {
_COMMON_CONDITION: {
"proc-macro2": Label("@vendor_ts__proc-macro2-1.0.101//:proc_macro2"),
"quote": Label("@vendor_ts__quote-1.0.41//:quote"),
"syn": Label("@vendor_ts__syn-2.0.106//:syn"),
},
},
}
_NORMAL_ALIASES = {
@@ -411,6 +429,14 @@ _NORMAL_ALIASES = {
_COMMON_CONDITION: {
},
},
"shared/yeast": {
_COMMON_CONDITION: {
},
},
"shared/yeast-macros": {
_COMMON_CONDITION: {
},
},
}
_NORMAL_DEV_DEPENDENCIES = {
@@ -431,6 +457,10 @@ _NORMAL_DEV_DEPENDENCIES = {
"tree-sitter-ql": Label("@vendor_ts__tree-sitter-ql-0.23.1//:tree_sitter_ql"),
},
},
"shared/yeast": {
},
"shared/yeast-macros": {
},
}
_NORMAL_DEV_ALIASES = {
@@ -448,6 +478,10 @@ _NORMAL_DEV_ALIASES = {
_COMMON_CONDITION: {
},
},
"shared/yeast": {
},
"shared/yeast-macros": {
},
}
_PROC_MACRO_DEPENDENCIES = {
@@ -463,6 +497,10 @@ _PROC_MACRO_DEPENDENCIES = {
},
"shared/tree-sitter-extractor": {
},
"shared/yeast": {
},
"shared/yeast-macros": {
},
}
_PROC_MACRO_ALIASES = {
@@ -478,6 +516,10 @@ _PROC_MACRO_ALIASES = {
},
"shared/tree-sitter-extractor": {
},
"shared/yeast": {
},
"shared/yeast-macros": {
},
}
_PROC_MACRO_DEV_DEPENDENCIES = {
@@ -493,6 +535,10 @@ _PROC_MACRO_DEV_DEPENDENCIES = {
},
"shared/tree-sitter-extractor": {
},
"shared/yeast": {
},
"shared/yeast-macros": {
},
}
_PROC_MACRO_DEV_ALIASES = {
@@ -510,6 +556,10 @@ _PROC_MACRO_DEV_ALIASES = {
_COMMON_CONDITION: {
},
},
"shared/yeast": {
},
"shared/yeast-macros": {
},
}
_BUILD_DEPENDENCIES = {
@@ -525,6 +575,10 @@ _BUILD_DEPENDENCIES = {
},
"shared/tree-sitter-extractor": {
},
"shared/yeast": {
},
"shared/yeast-macros": {
},
}
_BUILD_ALIASES = {
@@ -540,6 +594,10 @@ _BUILD_ALIASES = {
},
"shared/tree-sitter-extractor": {
},
"shared/yeast": {
},
"shared/yeast-macros": {
},
}
_BUILD_PROC_MACRO_DEPENDENCIES = {
@@ -555,6 +613,10 @@ _BUILD_PROC_MACRO_DEPENDENCIES = {
},
"shared/tree-sitter-extractor": {
},
"shared/yeast": {
},
"shared/yeast-macros": {
},
}
_BUILD_PROC_MACRO_ALIASES = {
@@ -570,6 +632,10 @@ _BUILD_PROC_MACRO_ALIASES = {
},
"shared/tree-sitter-extractor": {
},
"shared/yeast": {
},
"shared/yeast-macros": {
},
}
_CONDITIONS = {
@@ -923,12 +989,12 @@ def crate_repositories():
maybe(
http_archive,
name = "vendor_ts__cc-1.2.37",
sha256 = "65193589c6404eb80b450d618eaf9a2cafaaafd57ecce47370519ef674a7bd44",
name = "vendor_ts__cc-1.2.61",
sha256 = "d16d90359e986641506914ba71350897565610e87ce0ad9e6f28569db3dd5c6d",
type = "tar.gz",
urls = ["https://static.crates.io/crates/cc/1.2.37/download"],
strip_prefix = "cc-1.2.37",
build_file = Label("//misc/bazel/3rdparty/tree_sitter_extractors_deps:BUILD.cc-1.2.37.bazel"),
urls = ["https://static.crates.io/crates/cc/1.2.61/download"],
strip_prefix = "cc-1.2.61",
build_file = Label("//misc/bazel/3rdparty/tree_sitter_extractors_deps:BUILD.cc-1.2.61.bazel"),
)
maybe(
@@ -1373,12 +1439,12 @@ def crate_repositories():
maybe(
http_archive,
name = "vendor_ts__find-msvc-tools-0.1.1",
sha256 = "7fd99930f64d146689264c637b5af2f0233a933bef0d8570e2526bf9e083192d",
name = "vendor_ts__find-msvc-tools-0.1.9",
sha256 = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582",
type = "tar.gz",
urls = ["https://static.crates.io/crates/find-msvc-tools/0.1.1/download"],
strip_prefix = "find-msvc-tools-0.1.1",
build_file = Label("//misc/bazel/3rdparty/tree_sitter_extractors_deps:BUILD.find-msvc-tools-0.1.1.bazel"),
urls = ["https://static.crates.io/crates/find-msvc-tools/0.1.9/download"],
strip_prefix = "find-msvc-tools-0.1.9",
build_file = Label("//misc/bazel/3rdparty/tree_sitter_extractors_deps:BUILD.find-msvc-tools-0.1.9.bazel"),
)
maybe(
@@ -3363,12 +3429,12 @@ def crate_repositories():
maybe(
http_archive,
name = "vendor_ts__tree-sitter-0.25.9",
sha256 = "ccd2a058a86cfece0bf96f7cce1021efef9c8ed0e892ab74639173e5ed7a34fa",
name = "vendor_ts__tree-sitter-0.26.8",
sha256 = "887bd495d0582c5e3e0d8ece2233666169fa56a9644d172fc22ad179ab2d0538",
type = "tar.gz",
urls = ["https://static.crates.io/crates/tree-sitter/0.25.9/download"],
strip_prefix = "tree-sitter-0.25.9",
build_file = Label("//misc/bazel/3rdparty/tree_sitter_extractors_deps:BUILD.tree-sitter-0.25.9.bazel"),
urls = ["https://static.crates.io/crates/tree-sitter/0.26.8/download"],
strip_prefix = "tree-sitter-0.26.8",
build_file = Label("//misc/bazel/3rdparty/tree_sitter_extractors_deps:BUILD.tree-sitter-0.26.8.bazel"),
)
maybe(
@@ -3401,6 +3467,16 @@ def crate_repositories():
build_file = Label("//misc/bazel/3rdparty/tree_sitter_extractors_deps:BUILD.tree-sitter-language-0.1.5.bazel"),
)
maybe(
http_archive,
name = "vendor_ts__tree-sitter-python-0.23.6",
sha256 = "3d065aaa27f3aaceaf60c1f0e0ac09e1cb9eb8ed28e7bcdaa52129cffc7f4b04",
type = "tar.gz",
urls = ["https://static.crates.io/crates/tree-sitter-python/0.23.6/download"],
strip_prefix = "tree-sitter-python-0.23.6",
build_file = Label("//misc/bazel/3rdparty/tree_sitter_extractors_deps:BUILD.tree-sitter-python-0.23.6.bazel"),
)
maybe(
http_archive,
name = "vendor_ts__tree-sitter-ql-0.23.1",
@@ -4152,13 +4228,15 @@ def crate_repositories():
struct(repo = "vendor_ts__serde-1.0.228", is_dev_dep = False),
struct(repo = "vendor_ts__serde_json-1.0.145", is_dev_dep = False),
struct(repo = "vendor_ts__serde_with-3.14.1", is_dev_dep = False),
struct(repo = "vendor_ts__serde_yaml-0.9.34-deprecated", is_dev_dep = False),
struct(repo = "vendor_ts__syn-2.0.106", is_dev_dep = False),
struct(repo = "vendor_ts__toml-0.9.7", is_dev_dep = False),
struct(repo = "vendor_ts__tracing-0.1.41", is_dev_dep = False),
struct(repo = "vendor_ts__tracing-flame-0.2.0", is_dev_dep = False),
struct(repo = "vendor_ts__tracing-subscriber-0.3.20", is_dev_dep = False),
struct(repo = "vendor_ts__tree-sitter-0.25.9", is_dev_dep = False),
struct(repo = "vendor_ts__tree-sitter-0.26.8", is_dev_dep = False),
struct(repo = "vendor_ts__tree-sitter-embedded-template-0.25.0", is_dev_dep = False),
struct(repo = "vendor_ts__tree-sitter-python-0.23.6", is_dev_dep = False),
struct(repo = "vendor_ts__tree-sitter-ruby-0.23.1", is_dev_dep = False),
struct(repo = "vendor_ts__triomphe-0.1.14", is_dev_dep = False),
struct(repo = "vendor_ts__ungrammar-1.16.1", is_dev_dep = False),

View File

@@ -29,24 +29,28 @@ pub fn run(options: Options) -> std::io::Result<()> {
prefix: "ql",
ts_language: tree_sitter_ql::LANGUAGE.into(),
node_types: tree_sitter_ql::NODE_TYPES,
desugar: None,
file_globs: vec!["*.ql".into(), "*.qll".into()],
},
simple::LanguageSpec {
prefix: "dbscheme",
ts_language: tree_sitter_ql_dbscheme::LANGUAGE.into(),
node_types: tree_sitter_ql_dbscheme::NODE_TYPES,
desugar: None,
file_globs: vec!["*.dbscheme".into()],
},
simple::LanguageSpec {
prefix: "json",
ts_language: tree_sitter_json::LANGUAGE.into(),
node_types: tree_sitter_json::NODE_TYPES,
desugar: None,
file_globs: vec!["*.json".into(), "*.jsonl".into(), "*.jsonc".into()],
},
simple::LanguageSpec {
prefix: "blame",
ts_language: tree_sitter_blame::LANGUAGE.into(),
node_types: tree_sitter_blame::NODE_TYPES,
desugar: None,
file_globs: vec!["*.blame".into()],
},
],

View File

@@ -21,18 +21,22 @@ pub fn run(options: Options) -> std::io::Result<()> {
Language {
name: "QL".to_owned(),
node_types: tree_sitter_ql::NODE_TYPES,
desugar: None,
},
Language {
name: "Dbscheme".to_owned(),
node_types: tree_sitter_ql_dbscheme::NODE_TYPES,
desugar: None,
},
Language {
name: "Blame".to_owned(),
node_types: tree_sitter_blame::NODE_TYPES,
desugar: None,
},
Language {
name: "JSON".to_owned(),
node_types: tree_sitter_json::NODE_TYPES,
desugar: None,
},
];

View File

@@ -123,6 +123,7 @@ pub fn run(options: Options) -> std::io::Result<()> {
&path,
&source,
&[],
None,
);
let (ranges, line_breaks) = scan_erb(
@@ -211,6 +212,7 @@ pub fn run(options: Options) -> std::io::Result<()> {
&path,
&source,
&code_ranges,
None,
);
std::fs::create_dir_all(src_archive_file.parent().unwrap())?;
if needs_conversion {

View File

@@ -21,10 +21,12 @@ pub fn run(options: Options) -> std::io::Result<()> {
Language {
name: "Ruby".to_owned(),
node_types: tree_sitter_ruby::NODE_TYPES,
desugar: None,
},
Language {
name: "Erb".to_owned(),
node_types: tree_sitter_embedded_template::NODE_TYPES,
desugar: None,
},
];

View File

@@ -12,7 +12,9 @@ rust_library(
compile_data = [
"src/generator/prefix.dbscheme",
],
deps = all_crate_deps(),
deps = all_crate_deps() + [
"//shared/yeast",
],
)
alias(

View File

@@ -20,6 +20,7 @@ serde_json = "1.0"
chrono = { version = "0.4.42", features = ["serde"] }
num_cpus = "1.17.0"
zstd = "0.13.3"
yeast = { path = "../yeast" }
[dev-dependencies]
tree-sitter-ql = "0.23.1"

View File

@@ -18,6 +18,82 @@ use tree_sitter::{Language, Node, Parser, Range, Tree};
pub mod simple;
/// Trait abstracting over tree-sitter and yeast node types for extraction.
trait AstNode {
fn kind(&self) -> &str;
fn is_named(&self) -> bool;
fn is_missing(&self) -> bool;
fn is_error(&self) -> bool;
fn is_extra(&self) -> bool;
fn start_position(&self) -> tree_sitter::Point;
fn end_position(&self) -> tree_sitter::Point;
fn byte_range(&self) -> std::ops::Range<usize>;
fn end_byte(&self) -> usize {
self.byte_range().end
}
/// For yeast nodes with synthetic content, return it. Otherwise None.
fn opt_string_content(&self) -> Option<String> {
None
}
}
impl<'a> AstNode for Node<'a> {
fn kind(&self) -> &str {
Node::kind(self)
}
fn is_named(&self) -> bool {
Node::is_named(self)
}
fn is_missing(&self) -> bool {
Node::is_missing(self)
}
fn is_error(&self) -> bool {
Node::is_error(self)
}
fn is_extra(&self) -> bool {
Node::is_extra(self)
}
fn start_position(&self) -> tree_sitter::Point {
Node::start_position(self)
}
fn end_position(&self) -> tree_sitter::Point {
Node::end_position(self)
}
fn byte_range(&self) -> std::ops::Range<usize> {
Node::byte_range(self)
}
}
impl AstNode for yeast::Node {
fn kind(&self) -> &str {
yeast::Node::kind(self)
}
fn is_named(&self) -> bool {
yeast::Node::is_named(self)
}
fn is_missing(&self) -> bool {
yeast::Node::is_missing(self)
}
fn is_error(&self) -> bool {
yeast::Node::is_error(self)
}
fn is_extra(&self) -> bool {
yeast::Node::is_extra(self)
}
fn start_position(&self) -> tree_sitter::Point {
yeast::Node::start_position(self)
}
fn end_position(&self) -> tree_sitter::Point {
yeast::Node::end_position(self)
}
fn byte_range(&self) -> std::ops::Range<usize> {
yeast::Node::byte_range(self)
}
fn opt_string_content(&self) -> Option<String> {
yeast::Node::opt_string_content(self)
}
}
/// Sets the tracing level based on the environment variables
/// `RUST_LOG` and `CODEQL_VERBOSITY` (prioritized in that order),
/// falling back to `warn` if neither is set.
@@ -204,6 +280,11 @@ pub fn location_label(writer: &mut trap::Writer, location: trap::Location) -> tr
}
/// Extracts the source file at `path`, which is assumed to be canonicalized.
/// When `yeast_runner` is `Some`, the parsed tree is first transformed
/// through the supplied yeast `Runner` before TRAP extraction. Building the
/// `Runner` (which parses YAML and constructs the schema) is the caller's
/// responsibility, allowing it to be done once and shared across files.
#[allow(clippy::too_many_arguments)]
pub fn extract(
language: &Language,
language_prefix: &str,
@@ -214,6 +295,7 @@ pub fn extract(
path: &Path,
source: &[u8],
ranges: &[Range],
yeast_runner: Option<&yeast::Runner<'_>>,
) {
let path_str = file_paths::normalize_and_transform_path(path, transformer);
let span = tracing::span!(
@@ -236,13 +318,20 @@ pub fn extract(
source,
diagnostics_writer,
trap_writer,
// TODO: should we handle path strings that are not valid UTF8 better?
&path_str,
file_label,
language_prefix,
schema,
);
traverse(&tree, &mut visitor);
if let Some(yeast_runner) = yeast_runner {
let ast = yeast_runner
.run_from_tree(&tree)
.unwrap_or_else(|e| panic!("Desugaring failed for {path_str}: {e}"));
traverse_yeast(&ast, &mut visitor);
} else {
traverse(&tree, &mut visitor);
}
parser.reset();
}
@@ -329,11 +418,11 @@ impl<'a> Visitor<'a> {
);
}
fn record_parse_error_for_node(
fn record_parse_error_for_node<N: AstNode>(
&mut self,
message: &str,
args: &[diagnostics::MessageArg],
node: Node,
node: &N,
status_page: bool,
) {
let loc = location_for(self, self.file_label, node);
@@ -357,7 +446,7 @@ impl<'a> Visitor<'a> {
self.record_parse_error(loc_label, &mesg);
}
fn enter_node(&mut self, node: Node) -> bool {
fn enter_node<N: AstNode>(&mut self, node: &N) -> bool {
if node.is_missing() {
self.record_parse_error_for_node(
"A parse error occurred (expected {} symbol). Check the syntax of the file. If the file is invalid, correct the error or {} the file from analysis.",
@@ -383,7 +472,7 @@ impl<'a> Visitor<'a> {
true
}
fn leave_node(&mut self, field_name: Option<&'static str>, node: Node) {
fn leave_node<N: AstNode>(&mut self, field_name: Option<&'static str>, node: &N) {
if node.is_error() || node.is_missing() {
return;
}
@@ -434,7 +523,7 @@ impl<'a> Visitor<'a> {
fields,
name: table_name,
} => {
if let Some(args) = self.complex_node(&node, fields, &child_nodes, id) {
if let Some(args) = self.complex_node(node, fields, &child_nodes, id) {
self.trap_writer.add_tuple(
&self.ast_node_location_table_name,
vec![trap::Arg::Label(id), trap::Arg::Label(loc_label)],
@@ -495,9 +584,9 @@ impl<'a> Visitor<'a> {
}
}
fn complex_node(
fn complex_node<N: AstNode>(
&mut self,
node: &Node,
node: &N,
fields: &[Field],
child_nodes: &[ChildNode],
parent_id: trap::Label,
@@ -529,7 +618,7 @@ impl<'a> Visitor<'a> {
diagnostics::MessageArg::Code(&format!("{:?}", child_node.type_name)),
diagnostics::MessageArg::Code(&format!("{:?}", field.type_info)),
],
*node,
node,
false,
);
}
@@ -541,7 +630,7 @@ impl<'a> Visitor<'a> {
diagnostics::MessageArg::Code(child_node.field_name.unwrap_or("child")),
diagnostics::MessageArg::Code(&format!("{:?}", child_node.type_name)),
],
*node,
node,
false,
);
}
@@ -566,7 +655,7 @@ impl<'a> Visitor<'a> {
node.kind(),
column_name
);
self.record_parse_error_for_node(&error_message, &[], *node, false);
self.record_parse_error_for_node(&error_message, &[], node, false);
}
}
Storage::Table {
@@ -582,7 +671,7 @@ impl<'a> Visitor<'a> {
diagnostics::MessageArg::Code(node.kind()),
diagnostics::MessageArg::Code(table_name),
],
*node,
node,
false,
);
break;
@@ -639,15 +728,21 @@ impl<'a> Visitor<'a> {
}
// Emit a slice of a source file as an Arg.
fn sliced_source_arg(source: &[u8], n: Node) -> trap::Arg {
let range = n.byte_range();
trap::Arg::String(String::from_utf8_lossy(&source[range.start..range.end]).into_owned())
fn sliced_source_arg<N: AstNode>(source: &[u8], n: &N) -> trap::Arg {
trap::Arg::String(n.opt_string_content().unwrap_or_else(|| {
let range = n.byte_range();
String::from_utf8_lossy(&source[range.start..range.end]).into_owned()
}))
}
// Emit a pair of `TrapEntry`s for the provided node, appropriately calibrated.
// The first is the location and label definition, and the second is the
// 'Located' entry.
fn location_for(visitor: &mut Visitor, file_label: trap::Label, n: Node) -> trap::Location {
fn location_for<N: AstNode>(
visitor: &mut Visitor,
file_label: trap::Label,
n: &N,
) -> trap::Location {
// Tree-sitter row, column values are 0-based while CodeQL starts
// counting at 1. In addition Tree-sitter's row and column for the
// end position are exclusive while CodeQL's end positions are inclusive.
@@ -715,6 +810,28 @@ fn location_for(visitor: &mut Visitor, file_label: trap::Label, n: Node) -> trap
fn traverse(tree: &Tree, visitor: &mut Visitor) {
let cursor = &mut tree.walk();
visitor.enter_node(&cursor.node());
let mut recurse = true;
loop {
if recurse && cursor.goto_first_child() {
recurse = visitor.enter_node(&cursor.node());
} else {
visitor.leave_node(cursor.field_name(), &cursor.node());
if cursor.goto_next_sibling() {
recurse = visitor.enter_node(&cursor.node());
} else if cursor.goto_parent() {
recurse = false;
} else {
break;
}
}
}
}
fn traverse_yeast(tree: &yeast::Ast, visitor: &mut Visitor) {
use yeast::Cursor;
let mut cursor = tree.walk();
visitor.enter_node(cursor.node());
let mut recurse = true;
loop {

View File

@@ -7,11 +7,17 @@ use std::path::{Path, PathBuf};
use crate::diagnostics;
use crate::node_types;
use yeast;
pub struct LanguageSpec {
pub prefix: &'static str,
pub ts_language: tree_sitter::Language,
pub node_types: &'static str,
/// Optional yeast desugaring configuration. When set, the parsed
/// tree is rewritten through yeast before TRAP extraction. The
/// config's `output_node_types_yaml` (if set) provides the schema
/// used both at runtime (for the rewriter) and for TRAP validation.
pub desugar: Option<yeast::DesugaringConfig>,
pub file_globs: Vec<String>,
}
@@ -85,9 +91,35 @@ impl Extractor {
.collect();
let mut schemas = vec![];
let mut yeast_runners = Vec::new();
for lang in &self.languages {
let schema = node_types::read_node_types_str(lang.prefix, lang.node_types)?;
let effective_node_types: String =
match lang.desugar.as_ref().and_then(|c| c.output_node_types_yaml) {
Some(yaml) => yeast::node_types_yaml::convert(yaml).map_err(|e| {
std::io::Error::other(format!(
"Failed to convert YAML node-types to JSON for {}: {e}",
lang.prefix
))
})?,
None => lang.node_types.to_string(),
};
let schema = node_types::read_node_types_str(lang.prefix, &effective_node_types)?;
schemas.push(schema);
// Build the yeast runner once per language so the YAML schema
// isn't re-parsed for every file.
let yeast_runner = lang
.desugar
.as_ref()
.map(|config| yeast::Runner::from_config(lang.ts_language.clone(), config))
.transpose()
.map_err(|e| {
std::io::Error::other(format!(
"Failed to build desugaring runner for {}: {e}",
lang.prefix
))
})?;
yeast_runners.push(yeast_runner);
}
// Construct a single globset containing all language globs,
@@ -162,6 +194,7 @@ impl Extractor {
&path,
&source,
&[],
yeast_runners[i].as_ref(),
);
std::fs::create_dir_all(src_archive_file.parent().unwrap())?;
std::fs::copy(&path, &src_archive_file)?;

View File

@@ -1,4 +1,9 @@
pub struct Language {
pub name: String,
pub node_types: &'static str,
/// Optional yeast desugaring configuration. When set with an
/// `output_node_types_yaml`, the generator uses that YAML for the
/// dbscheme/QL library instead of `node_types`. The `rules` field is
/// unused at code-generation time; only the schema matters.
pub desugar: Option<yeast::DesugaringConfig>,
}

View File

@@ -6,6 +6,7 @@ use std::io::Write;
use std::path::PathBuf;
use crate::node_types;
use yeast;
pub mod dbscheme;
pub mod language;
@@ -68,7 +69,20 @@ pub fn generate(
let token_name = format!("{}_token", &prefix);
let tokeninfo_name = format!("{}_tokeninfo", &prefix);
let reserved_word_name = format!("{}_reserved_word", &prefix);
// Desugaring configs may supply their own node-types schema as YAML;
// convert it to JSON on the fly so the rest of the generator sees a
// single format. (The old `let nodes = read_node_types_str(&prefix,
// language.node_types)?` line was dead: it parsed the raw schema and
// was immediately shadowed by the effective one below.)
let effective_node_types: String = match language
    .desugar
    .as_ref()
    .and_then(|c| c.output_node_types_yaml)
{
    Some(yaml) => yeast::node_types_yaml::convert(yaml).map_err(|e| {
        std::io::Error::other(format!(
            "Failed to convert YAML node-types to JSON for {}: {e}",
            language.name
        ))
    })?,
    None => language.node_types.to_string(),
};
let nodes = node_types::read_node_types_str(&prefix, &effective_node_types)?;
let (dbscheme_entries, mut ast_node_members, token_kinds) = convert_nodes(&nodes);
ast_node_members.insert(&token_name);
writeln!(&mut dbscheme_writer, "/*- {} dbscheme -*/", language.name)?;

View File

@@ -13,6 +13,7 @@ fn simple_extractor() {
prefix: "ql",
ts_language: tree_sitter_ql::LANGUAGE.into(),
node_types: tree_sitter_ql::NODE_TYPES,
desugar: None,
file_globs: vec!["*.qll".into()],
};

View File

@@ -13,12 +13,14 @@ fn multiple_language_extractor() {
prefix: "ql",
ts_language: tree_sitter_ql::LANGUAGE.into(),
node_types: tree_sitter_ql::NODE_TYPES,
desugar: None,
file_globs: vec!["*.qll".into()],
};
let lang_json = simple::LanguageSpec {
prefix: "json",
ts_language: tree_sitter_json::LANGUAGE.into(),
node_types: tree_sitter_json::NODE_TYPES,
desugar: None,
file_globs: vec!["*.json".into(), "*Jsonfile".into()],
};

View File

@@ -0,0 +1,12 @@
# Bazel build for the `yeast-macros` proc-macro crate.
load("@rules_rust//rust:defs.bzl", "rust_proc_macro")
load("//misc/bazel/3rdparty/tree_sitter_extractors_deps:defs.bzl", "aliases", "all_crate_deps")

exports_files(["Cargo.toml"])

rust_proc_macro(
    name = "yeast-macros",
    srcs = glob(["src/**/*.rs"]),
    aliases = aliases(),
    visibility = ["//visibility:public"],
    # Third-party dependencies are resolved from the vendored crate universe.
    deps = all_crate_deps(),
)

View File

@@ -0,0 +1,12 @@
# Proc-macro crate providing the `query!`, `tree!`, `trees!`, and `rule!`
# macros used by the `yeast` desugaring library.
[package]
name = "yeast-macros"
version = "0.1.0"
edition = "2021"

[lib]
proc-macro = true

[dependencies]
proc-macro2 = "1.0"
quote = "1.0"
syn = "2.0"

View File

@@ -0,0 +1,105 @@
use proc_macro::TokenStream;
use proc_macro2::TokenStream as TokenStream2;
mod parse;
/// Proc macro for constructing a `QueryNode` from a tree-sitter-inspired pattern.
///
/// # Syntax
///
/// ```text
/// (_) - match any named node (skips unnamed tokens)
/// (kind) - match a named node of the given kind
/// ("literal") - match an unnamed token by its text
/// (kind field: (pattern)) - match with named field
/// (kind (pat) (pat)...) - match unnamed children (after all fields)
/// (pattern) @capture - capture the matched node
/// (pattern)* @capture - capture each repeated match
/// (pattern)? - zero or one
/// ```
#[proc_macro]
pub fn query(input: TokenStream) -> TokenStream {
let input2: TokenStream2 = input.into();
match parse::parse_query_top(input2) {
Ok(output) => output.into(),
Err(err) => err.to_compile_error().into(),
}
}
/// Build a single AST node from a template, returning its `Id`.
///
/// # Template syntax
///
/// ```text
/// (kind "literal") - leaf with static content
/// (kind #{expr}) - leaf with computed content (expr.to_string())
/// (kind $fresh) - leaf with auto-generated unique name
/// {expr} - embed a Rust expression returning Id
/// {..expr} - splice an iterable of Id (in child/field position)
/// field: {..expr} - splice into a named field
/// ```
///
/// Can be called with an explicit context or using the implicit context
/// from an enclosing `rule!`:
///
/// ```text
/// tree!(ctx, (kind ...)) // explicit BuildCtx
/// tree!((kind ...)) // implicit context from rule!
/// ```
#[proc_macro]
pub fn tree(input: TokenStream) -> TokenStream {
let input2: TokenStream2 = input.into();
match parse::parse_tree_top(input2) {
Ok(output) => output.into(),
Err(err) => err.to_compile_error().into(),
}
}
/// Build a list of AST nodes from a template, returning `Vec<Id>`.
///
/// Like `tree!` but returns `Vec<Id>` and supports multiple top-level
/// elements. All syntax from `tree!` is available.
///
/// Can be called with an explicit context or using the implicit context
/// from an enclosing `rule!`:
///
/// ```text
/// trees!(ctx, (node1 ...) (node2 ...)) // explicit BuildCtx
/// trees!((node1 ...) (node2 ...)) // implicit context from rule!
/// ```
#[proc_macro]
pub fn trees(input: TokenStream) -> TokenStream {
let input2: TokenStream2 = input.into();
match parse::parse_trees_top(input2) {
Ok(output) => output.into(),
Err(err) => err.to_compile_error().into(),
}
}
/// Define a desugaring rule with query and transform in one declaration.
///
/// ```text
/// rule!(
/// (query_pattern field: (_) @name (kind)* @repeated (_)? @optional)
/// =>
/// (output_template field: {name} {..repeated})
/// )
///
/// // Shorthand: captures become fields on the output node
/// rule!((query ...) => output_kind)
/// ```
///
/// Captures become Rust variables automatically:
/// - `@name` (no quantifier) → `name: Id`
/// - `@name` (after `*`/`+`) → `name: Vec<Id>`
/// - `@name` (after `?`) → `name: Option<Id>`
///
/// `tree!` and `trees!` can be used without explicit context inside `{...}`.
#[proc_macro]
pub fn rule(input: TokenStream) -> TokenStream {
let input2: TokenStream2 = input.into();
match parse::parse_rule_top(input2) {
Ok(output) => output.into(),
Err(err) => err.to_compile_error().into(),
}
}

View File

@@ -0,0 +1,838 @@
use proc_macro2::{Delimiter, Ident, Literal, Span, TokenStream, TokenTree};
use quote::quote;
use std::iter::Peekable;
type Tokens = Peekable<proc_macro2::token_stream::IntoIter>;
type Result<T> = std::result::Result<T, syn::Error>;
// ---------------------------------------------------------------------------
// Query parsing
// ---------------------------------------------------------------------------
/// Top-level entry: parse a single query node from the full input.
pub fn parse_query_top(input: TokenStream) -> Result<TokenStream> {
let mut tokens = input.into_iter().peekable();
let result = parse_query_node(&mut tokens)?;
if let Some(tok) = tokens.next() {
return Err(syn::Error::new_spanned(tok, "unexpected token after query"));
}
Ok(result)
}
/// Parse a single query node (possibly with a trailing `@capture`).
fn parse_query_node(tokens: &mut Tokens) -> Result<TokenStream> {
    let base = parse_query_atom(tokens)?;
    if !peek_is_at(tokens) {
        return Ok(base);
    }
    // Trailing `@name`: wrap the atom in a Capture node.
    tokens.next(); // discard the `@`
    let name = expect_ident(tokens, "expected capture name after @")?.to_string();
    Ok(quote! {
        yeast::query::QueryNode::Capture {
            capture: #name,
            node: Box::new(#base),
        }
    })
}
/// Parse a query atom: `(kind fields...)` or `(kind fields... bare_children...)`.
/// Does not handle `@capture` — that's handled by the caller as a postfix.
fn parse_query_atom(tokens: &mut Tokens) -> Result<TokenStream> {
    match tokens.peek() {
        Some(TokenTree::Group(g)) if g.delimiter() == Delimiter::Parenthesis => {
            let group = expect_group(tokens, Delimiter::Parenthesis)?;
            let mut inner = group.stream().into_iter().peekable();
            let parsed = parse_query_node_inner(&mut inner)?;
            // Everything inside the parentheses must belong to this node.
            match inner.next() {
                None => Ok(parsed),
                Some(tok) => Err(syn::Error::new_spanned(
                    tok,
                    "unexpected token in query node",
                )),
            }
        }
        Some(tok) => Err(syn::Error::new_spanned(
            tok.clone(),
            "expected `(` in query; use `(_) @name` to capture a wildcard",
        )),
        None => Err(syn::Error::new(
            Span::call_site(),
            "unexpected end of query",
        )),
    }
}
/// Parse the inside of a parenthesized query node: `kind fields...` or `_` or `"lit"`.
fn parse_query_node_inner(tokens: &mut Tokens) -> Result<TokenStream> {
match tokens.peek() {
None => Err(syn::Error::new(
Span::call_site(),
"empty parenthesized group in query",
)),
Some(TokenTree::Ident(id)) if *id == "_" => {
tokens.next();
Ok(quote! { yeast::query::QueryNode::Any() })
}
Some(TokenTree::Literal(_)) => {
let lit = expect_literal(tokens)?;
Ok(quote! { yeast::query::QueryNode::UnnamedNode { kind: #lit } })
}
Some(TokenTree::Ident(_)) => {
let kind = expect_ident(tokens, "expected node kind")?;
let kind_str = kind.to_string();
let fields = parse_query_fields(tokens)?;
Ok(quote! {
yeast::query::QueryNode::Node {
kind: #kind_str,
children: vec![#(#fields),*],
}
})
}
Some(tok) => Err(syn::Error::new_spanned(
tok.clone(),
"expected node kind, `_`, or string literal",
)),
}
}
/// Parse zero or more field specifications and trailing bare patterns.
/// Named fields: `name: pattern` or `name*: (list...)`.
/// Bare patterns (no field name) become implicit `child` field entries.
fn parse_query_fields(tokens: &mut Tokens) -> Result<Vec<TokenStream>> {
    let mut out = Vec::new();
    // Named fields come first: `name: pattern`.
    while peek_is_field(tokens) {
        let name = expect_ident(tokens, "expected field name")?.to_string();
        expect_punct(tokens, ':', "expected `:` after field name")?;
        let pattern = parse_query_node(tokens)?;
        out.push(quote! {
            (#name, vec![yeast::query::QueryListElem::SingleNode(#pattern)])
        });
    }
    // Whatever remains is a run of bare patterns, gathered under "child".
    if tokens.peek().is_some() {
        let bare = parse_query_list(tokens)?;
        if !bare.is_empty() {
            out.push(quote! {
                ("child", vec![#(#bare),*])
            });
        }
    }
    Ok(out)
}
/// Parse a list of query elements (bare children).
/// Each element is a node pattern, possibly followed by `*`, `+`, `?`.
fn parse_query_list(tokens: &mut Tokens) -> Result<Vec<TokenStream>> {
    let mut elems = Vec::new();
    while tokens.peek().is_some() {
        // Check for parenthesized group
        if peek_is_group(tokens, Delimiter::Parenthesis) {
            let group = expect_group(tokens, Delimiter::Parenthesis)?;
            let mut inner = group.stream().into_iter().peekable();
            // Check for repetition after the group
            if peek_is_repetition(tokens) {
                let rep = expect_repetition(tokens)?;
                // Determine if the group is a single node pattern or a list
                // of patterns. If it starts with an identifier (node kind) or
                // `_`, treat it as a single repeated node. Otherwise, parse
                // as a repeated list of sub-patterns.
                let is_single_node = matches!(inner.peek(), Some(TokenTree::Ident(_)));
                if is_single_node {
                    let node = parse_query_node_inner(&mut inner)?;
                    let elem = quote! {
                        yeast::query::QueryListElem::Repeated {
                            children: vec![yeast::query::QueryListElem::SingleNode(#node)],
                            rep: #rep,
                        }
                    };
                    // `(pat)* @name` — the capture distributes over each match.
                    let elem = maybe_wrap_list_capture(tokens, elem)?;
                    elems.push(elem);
                } else {
                    let sub_elems = parse_query_list(&mut inner)?;
                    let elem = quote! {
                        yeast::query::QueryListElem::Repeated {
                            children: vec![#(#sub_elems),*],
                            rep: #rep,
                        }
                    };
                    let elem = maybe_wrap_list_capture(tokens, elem)?;
                    elems.push(elem);
                }
            } else {
                // Single parenthesized node, possibly followed by @capture
                let node = parse_query_node_inner(&mut inner)?;
                let node = maybe_wrap_capture(tokens, node)?;
                elems.push(quote! {
                    yeast::query::QueryListElem::SingleNode(#node)
                });
            }
            continue;
        }
        // Check for string literal (unnamed node)
        if peek_is_literal(tokens) {
            let lit = expect_literal(tokens)?;
            let node = quote! { yeast::query::QueryNode::UnnamedNode { kind: #lit } };
            let elem = maybe_wrap_repetition(
                tokens,
                quote! {
                    yeast::query::QueryListElem::SingleNode(#node)
                },
            )?;
            elems.push(elem);
            continue;
        }
        // Check for bare _ (wildcard), possibly followed by @capture
        if peek_is_underscore(tokens) {
            tokens.next();
            let node = quote! { yeast::query::QueryNode::Any() };
            let node = maybe_wrap_capture(tokens, node)?;
            let elem = maybe_wrap_repetition(
                tokens,
                quote! {
                    yeast::query::QueryListElem::SingleNode(#node)
                },
            )?;
            elems.push(elem);
            continue;
        }
        // Anything else ends the list; leftovers are reported by the caller.
        break;
    }
    Ok(elems)
}
// ---------------------------------------------------------------------------
// tree! / trees! parsing — direct code generation against BuildCtx
// ---------------------------------------------------------------------------
const IMPLICIT_CTX: &str = "__yeast_ctx";
/// Determine the context identifier: either explicit `ctx,` or the implicit
/// `__yeast_ctx` from an enclosing `rule!`.
fn parse_ctx_or_implicit(tokens: &mut Tokens) -> Ident {
    // An explicit context looks like `ident ,` at the very start.
    let mut probe = tokens.clone();
    let explicit = matches!(probe.next(), Some(TokenTree::Ident(_)))
        && matches!(probe.next(), Some(TokenTree::Punct(p)) if p.as_char() == ',');
    if !explicit {
        return Ident::new(IMPLICIT_CTX, Span::call_site());
    }
    let ctx = expect_ident(tokens, "").unwrap();
    tokens.next(); // drop the comma
    ctx
}
/// Parse `tree!(ctx, (template))` or `tree!((template))` — returns single `Id`.
pub fn parse_tree_top(input: TokenStream) -> Result<TokenStream> {
let mut tokens = input.into_iter().peekable();
let ctx = parse_ctx_or_implicit(&mut tokens);
let first = parse_direct_node(&mut tokens, &ctx)?;
if let Some(tok) = tokens.next() {
return Err(syn::Error::new_spanned(
tok,
"unexpected tokens after tree! template; use trees! for multiple nodes",
));
}
Ok(quote! { { #first } })
}
/// Parse `trees!(ctx, ...)` or `trees!(...)` — returns `Vec<Id>`.
pub fn parse_trees_top(input: TokenStream) -> Result<TokenStream> {
let mut tokens = input.into_iter().peekable();
let ctx = parse_ctx_or_implicit(&mut tokens);
let items = parse_direct_list(&mut tokens, &ctx)?;
if let Some(tok) = tokens.next() {
return Err(syn::Error::new_spanned(
tok,
"unexpected token after trees! template",
));
}
Ok(quote! {
{
let mut __nodes: Vec<usize> = Vec::new();
#(#items)*
__nodes
}
})
}
/// Parse a single node template and generate code that returns an `Id`.
/// Handles: `(kind fields... children...)` and `{expr}`.
fn parse_direct_node(tokens: &mut Tokens, ctx: &Ident) -> Result<TokenStream> {
match tokens.peek() {
Some(TokenTree::Group(g)) if g.delimiter() == Delimiter::Brace => {
let group = expect_group(tokens, Delimiter::Brace)?;
let expr = group.stream();
Ok(quote! { #expr })
}
Some(TokenTree::Group(g)) if g.delimiter() == Delimiter::Parenthesis => {
let group = expect_group(tokens, Delimiter::Parenthesis)?;
let mut inner = group.stream().into_iter().peekable();
parse_direct_node_inner(&mut inner, ctx)
}
Some(tok) => Err(syn::Error::new_spanned(
tok.clone(),
"expected `(` or `{` in tree template",
)),
None => Err(syn::Error::new(
Span::call_site(),
"unexpected end of tree template",
)),
}
}
/// Parse the inside of a parenthesized node: `kind fields... children...`
/// or `kind "literal"` or `kind $fresh`.
fn parse_direct_node_inner(tokens: &mut Tokens, ctx: &Ident) -> Result<TokenStream> {
    let kind = expect_ident(tokens, "expected node kind")?;
    let kind_str = kind.to_string();
    // Check for (kind "literal")
    if peek_is_literal(tokens) {
        let lit = expect_literal(tokens)?;
        return Ok(quote! { #ctx.literal(#kind_str, #lit) });
    }
    // Check for (kind #{expr}) — computed literal, expr converted via .to_string()
    if peek_is_hash(tokens) {
        tokens.next(); // consume #
        let group = expect_group(tokens, Delimiter::Brace)?;
        let expr = group.stream();
        return Ok(quote! { #ctx.literal(#kind_str, &(#expr).to_string()) });
    }
    // Check for (kind $fresh)
    if peek_is_dollar(tokens) {
        tokens.next();
        let name = expect_ident(tokens, "expected fresh variable name after $")?;
        let name_str = name.to_string();
        return Ok(quote! { #ctx.fresh(#kind_str, #name_str) });
    }
    // Parse named fields
    let mut stmts = Vec::new();
    let mut field_args = Vec::new();
    // Counter makes each temp name unique even when a field name repeats.
    let mut field_counter = 0usize;
    // Named fields — compute each value into a temp, then reference it.
    // Evaluating into temps first preserves left-to-right evaluation order
    // of the field expressions.
    while peek_is_field(tokens) {
        let field_name = expect_ident(tokens, "expected field name")?;
        let field_str = field_name.to_string();
        expect_punct(tokens, ':', "expected `:` after field name")?;
        let temp = Ident::new(
            &format!("__field_{field_str}_{field_counter}"),
            Span::call_site(),
        );
        field_counter += 1;
        // Check for field: {..expr} — splice a Vec<Id> into the field.
        // We look inside the brace group for a leading `..` before consuming.
        if peek_is_group(tokens, Delimiter::Brace) {
            let group_clone = tokens.clone().next().unwrap();
            if let TokenTree::Group(g) = &group_clone {
                let mut inner_check = g.stream().into_iter();
                let is_splice = matches!(inner_check.next(), Some(TokenTree::Punct(p)) if p.as_char() == '.')
                    && matches!(inner_check.next(), Some(TokenTree::Punct(p)) if p.as_char() == '.');
                if is_splice {
                    let group = expect_group(tokens, Delimiter::Brace)?;
                    let mut inner = group.stream().into_iter().peekable();
                    inner.next(); // consume first .
                    inner.next(); // consume second .
                    let expr: proc_macro2::TokenStream = inner.collect();
                    stmts.push(quote! { let #temp: Vec<usize> = #expr; });
                    field_args.push(quote! { (#field_str, #temp) });
                    continue;
                }
            }
        }
        let value = parse_direct_node(tokens, ctx)?;
        stmts.push(quote! { let #temp = #value; });
        field_args.push(quote! { (#field_str, vec![#temp]) });
    }
    // After all named fields, no other tokens are allowed.
    // Output templates require all children to be in named fields.
    if let Some(tok) = tokens.peek() {
        return Err(syn::Error::new_spanned(
            tok.clone(),
            "expected named field (`name:`) or end of node template; \
             output templates do not support unnamed children",
        ));
    }
    Ok(quote! {
        {
            #(#stmts)*
            #ctx.node(#kind_str, vec![#(#field_args),*])
        }
    })
}
/// Parse the top-level list of a `trees!` template.
/// Each item is a node template or `{expr}` splice.
/// Generated items push into a `__nodes` vector supplied by the caller's
/// surrounding code.
fn parse_direct_list(tokens: &mut Tokens, ctx: &Ident) -> Result<Vec<TokenStream>> {
    let mut items = Vec::new();
    while tokens.peek().is_some() {
        if peek_is_group(tokens, Delimiter::Parenthesis) {
            let group = expect_group(tokens, Delimiter::Parenthesis)?;
            let mut inner = group.stream().into_iter().peekable();
            // Regular node
            let node = parse_direct_node_inner(&mut inner, ctx)?;
            items.push(quote! { __nodes.push(#node); });
            continue;
        }
        // {expr} or {..expr} — single node or splice
        if peek_is_group(tokens, Delimiter::Brace) {
            let group = expect_group(tokens, Delimiter::Brace)?;
            let mut inner = group.stream().into_iter().peekable();
            if peek_is_dotdot(&inner) {
                inner.next(); // consume first .
                inner.next(); // consume second .
                let expr: TokenStream = inner.collect();
                items.push(quote! { __nodes.extend(#expr); });
            } else {
                let expr = group.stream();
                items.push(quote! { __nodes.push(#expr); });
            }
            continue;
        }
        // Any other token ends the list; the caller reports leftovers.
        break;
    }
    Ok(items)
}
// ---------------------------------------------------------------------------
// rule! parsing
// ---------------------------------------------------------------------------
/// A captured variable from a query pattern.
struct CaptureInfo {
    /// Capture name as written after `@` in the query.
    name: String,
    /// How many nodes the capture can bind, derived from the quantifiers
    /// surrounding it in the pattern.
    multiplicity: CaptureMultiplicity,
}
#[derive(Clone, Copy, PartialEq)]
enum CaptureMultiplicity {
    /// Exactly one match (bare pattern or after no quantifier)
    Single,
    /// Zero or one match (after `?`)
    Optional,
    /// Zero or more matches (after `*` or `+`, or inside a repeated group)
    Repeated,
}
/// Walk a token stream and extract all `@name` captures, noting whether
/// they appear after `*` or `+` (repeated) or not.
fn extract_captures(stream: &TokenStream) -> Vec<CaptureInfo> {
    let mut found = Vec::new();
    let mut toks = stream.clone().into_iter().peekable();
    // The outermost level has no surrounding quantifier, so start Single.
    extract_captures_inner(&mut toks, &mut found, CaptureMultiplicity::Single);
    found
}
// Recursive worker for `extract_captures`. `parent_mult` is the multiplicity
// implied by any quantified group enclosing this token stream.
fn extract_captures_inner(
    tokens: &mut Tokens,
    captures: &mut Vec<CaptureInfo>,
    parent_mult: CaptureMultiplicity,
) {
    // Multiplicity set by the most recently seen quantified group; an `@name`
    // written directly after `(...)*` / `(...)?` inherits it.
    let mut last_mult = CaptureMultiplicity::Single;
    while let Some(tok) = tokens.next() {
        match tok {
            TokenTree::Group(g) => {
                let mut inner = g.stream().into_iter().peekable();
                // Peek past the group for a quantifier that applies to it.
                let group_mult = match tokens.peek() {
                    Some(TokenTree::Punct(p)) if p.as_char() == '*' || p.as_char() == '+' => {
                        CaptureMultiplicity::Repeated
                    }
                    Some(TokenTree::Punct(p)) if p.as_char() == '?' => {
                        CaptureMultiplicity::Optional
                    }
                    _ => CaptureMultiplicity::Single,
                };
                last_mult = group_mult;
                // Repeated dominates Optional, which dominates Single.
                let child_mult = if parent_mult == CaptureMultiplicity::Repeated
                    || group_mult == CaptureMultiplicity::Repeated
                {
                    CaptureMultiplicity::Repeated
                } else if parent_mult == CaptureMultiplicity::Optional
                    || group_mult == CaptureMultiplicity::Optional
                {
                    CaptureMultiplicity::Optional
                } else {
                    CaptureMultiplicity::Single
                };
                extract_captures_inner(&mut inner, captures, child_mult);
            }
            TokenTree::Punct(p) if p.as_char() == '@' => {
                if let Some(TokenTree::Ident(name)) = tokens.next() {
                    let mult = if parent_mult == CaptureMultiplicity::Repeated
                        || last_mult == CaptureMultiplicity::Repeated
                    {
                        CaptureMultiplicity::Repeated
                    } else if parent_mult == CaptureMultiplicity::Optional
                        || last_mult == CaptureMultiplicity::Optional
                    {
                        CaptureMultiplicity::Optional
                    } else {
                        CaptureMultiplicity::Single
                    };
                    captures.push(CaptureInfo {
                        name: name.to_string(),
                        multiplicity: mult,
                    });
                }
                last_mult = CaptureMultiplicity::Single;
            }
            TokenTree::Punct(p) if matches!(p.as_char(), '*' | '+' | '?') => {
                // Keep last_mult — the @capture follows
            }
            _ => {
                // Any other token resets the pending quantifier state.
                last_mult = CaptureMultiplicity::Single;
            }
        }
    }
}
/// Parse `rule!( query => transform )`.
///
/// Generates a `yeast::Rule` whose closure binds each query capture to a
/// local Rust variable (typed by its multiplicity) before evaluating the
/// transform.
pub fn parse_rule_top(input: TokenStream) -> Result<TokenStream> {
    let mut tokens = input.into_iter().peekable();
    // Collect query tokens up to `=>`
    let mut query_tokens = Vec::new();
    loop {
        match tokens.peek() {
            None => return Err(syn::Error::new(Span::call_site(), "expected `=>` in rule!")),
            Some(TokenTree::Punct(p)) if p.as_char() == '=' => {
                let eq = tokens.next().unwrap();
                match tokens.peek() {
                    Some(TokenTree::Punct(p)) if p.as_char() == '>' => {
                        tokens.next(); // consume >
                        break;
                    }
                    _ => {
                        // A lone `=` belongs to the query; keep scanning.
                        query_tokens.push(eq);
                        continue;
                    }
                }
            }
            _ => {
                query_tokens.push(tokens.next().unwrap());
            }
        }
    }
    let query_stream: TokenStream = query_tokens.into_iter().collect();
    // Extract captures from query
    let captures = extract_captures(&query_stream);
    // Parse query
    let query_code = parse_query_top(query_stream.clone())?;
    // Generate capture bindings
    let ctx_ident = Ident::new(IMPLICIT_CTX, Span::call_site());
    let bindings: Vec<TokenStream> = captures
        .iter()
        .map(|cap| {
            let name = Ident::new(&cap.name, Span::call_site());
            let name_str = &cap.name;
            // The binding's type mirrors the capture's multiplicity.
            match cap.multiplicity {
                CaptureMultiplicity::Repeated => {
                    quote! { let #name: Vec<usize> = __captures.get_all(#name_str); }
                }
                CaptureMultiplicity::Optional => {
                    quote! { let #name: Option<usize> = __captures.get_opt(#name_str); }
                }
                CaptureMultiplicity::Single => {
                    quote! { let #name: usize = __captures.get_var(#name_str).unwrap(); }
                }
            }
        })
        .collect();
    // Parse transform: either shorthand `=> kind_name` or full `=> (template ...)`
    let transform_body = if peek_is_field(&mut tokens) && {
        // Shorthand form: bare identifier = output node kind.
        // Auto-generate template from captures.
        let mut lookahead = tokens.clone();
        lookahead.next(); // skip ident
        lookahead.peek().is_none() // nothing after = shorthand
    } {
        let output_kind = expect_ident(&mut tokens, "expected output node kind")?;
        let output_kind_str = output_kind.to_string();
        // Generate field assignments from captures; each capture becomes a
        // field of the same name on the output node.
        let field_stmts: Vec<TokenStream> = captures
            .iter()
            .map(|cap| {
                let name = Ident::new(&cap.name, Span::call_site());
                let name_str = &cap.name;
                match cap.multiplicity {
                    CaptureMultiplicity::Repeated => quote! {
                        let __field_id = #ctx_ident.ast.field_id_for_name(#name_str)
                            .unwrap_or_else(|| panic!("field '{}' not found", #name_str));
                        __fields.insert(__field_id, #name);
                    },
                    CaptureMultiplicity::Optional => quote! {
                        let __field_id = #ctx_ident.ast.field_id_for_name(#name_str)
                            .unwrap_or_else(|| panic!("field '{}' not found", #name_str));
                        if let Some(__id) = #name {
                            __fields.entry(__field_id).or_insert_with(Vec::new).push(__id);
                        }
                    },
                    CaptureMultiplicity::Single => quote! {
                        let __field_id = #ctx_ident.ast.field_id_for_name(#name_str)
                            .unwrap_or_else(|| panic!("field '{}' not found", #name_str));
                        __fields.entry(__field_id).or_insert_with(Vec::new).push(#name);
                    },
                }
            })
            .collect();
        quote! {
            let __kind = #ctx_ident.ast.id_for_node_kind(#output_kind_str)
                .unwrap_or_else(|| panic!("node kind '{}' not found", #output_kind_str));
            let mut __fields = std::collections::BTreeMap::new();
            #(#field_stmts)*
            let __id = #ctx_ident.ast.create_node_with_range(
                __kind,
                yeast::NodeContent::DynamicString(String::new()),
                __fields,
                true,
                __source_range,
            );
            vec![__id]
        }
    } else {
        // Full template form
        let transform_items = parse_direct_list(&mut tokens, &ctx_ident)?;
        if let Some(tok) = tokens.next() {
            return Err(syn::Error::new_spanned(
                tok,
                "unexpected token after rule! transform",
            ));
        }
        quote! {
            let mut __nodes: Vec<usize> = Vec::new();
            #(#transform_items)*
            __nodes
        }
    };
    Ok(quote! {
        {
            let __query = #query_code;
            yeast::Rule::new(__query, Box::new(|__ast: &mut yeast::Ast, __captures: yeast::captures::Captures, __fresh: &yeast::tree_builder::FreshScope, __source_range: Option<tree_sitter::Range>| {
                #(#bindings)*
                let mut #ctx_ident = yeast::build::BuildCtx::with_source_range(__ast, &__captures, __fresh, __source_range);
                #transform_body
            }))
        }
    })
}
// ---------------------------------------------------------------------------
// Token utilities
// ---------------------------------------------------------------------------
/// True if the next token is the `@` that introduces a capture.
fn peek_is_at(tokens: &mut Tokens) -> bool {
    matches!(tokens.peek(), Some(TokenTree::Punct(p)) if p.as_char() == '@')
}
/// True if the next token is a literal (e.g. a quoted string).
fn peek_is_literal(tokens: &mut Tokens) -> bool {
    matches!(tokens.peek(), Some(TokenTree::Literal(_)))
}
/// True if the next token is `$` (the fresh-name marker in templates).
fn peek_is_dollar(tokens: &mut Tokens) -> bool {
    matches!(tokens.peek(), Some(TokenTree::Punct(p)) if p.as_char() == '$')
}
/// True if the next token is `#` (the computed-literal marker in templates).
fn peek_is_hash(tokens: &mut Tokens) -> bool {
    matches!(tokens.peek(), Some(TokenTree::Punct(p)) if p.as_char() == '#')
}
/// Check for `..` (two consecutive dot punctuation tokens).
fn peek_is_dotdot(tokens: &Tokens) -> bool {
    let mut lookahead = tokens.clone();
    matches!(lookahead.next(), Some(TokenTree::Punct(p)) if p.as_char() == '.')
        && matches!(lookahead.next(), Some(TokenTree::Punct(p)) if p.as_char() == '.')
}
/// True if the next token is the wildcard identifier `_`.
fn peek_is_underscore(tokens: &mut Tokens) -> bool {
    matches!(tokens.peek(), Some(TokenTree::Ident(id)) if *id == "_")
}
/// Check if the next tokens form a field specification (ident followed by `:` or `*:`).
/// A bare identifier (other than `_`) at this position is always a field name, since
/// bare child patterns must start with `(`, `@`, `"literal"`, or `_`.
fn peek_is_field(tokens: &mut Tokens) -> bool {
    matches!(tokens.peek(), Some(TokenTree::Ident(id)) if *id != "_")
}
/// True if the next token is a group with the given delimiter.
fn peek_is_group(tokens: &mut Tokens, delim: Delimiter) -> bool {
    matches!(tokens.peek(), Some(TokenTree::Group(g)) if g.delimiter() == delim)
}
/// True if the next token is a repetition quantifier (`*`, `+`, or `?`).
fn peek_is_repetition(tokens: &mut Tokens) -> bool {
    matches!(tokens.peek(), Some(TokenTree::Punct(p)) if matches!(p.as_char(), '*' | '+' | '?'))
}
/// Consume the next token, requiring an identifier; `msg` is the error text.
fn expect_ident(tokens: &mut Tokens, msg: &str) -> Result<Ident> {
    match tokens.next() {
        Some(TokenTree::Ident(id)) => Ok(id),
        Some(tok) => Err(syn::Error::new_spanned(tok, msg)),
        None => Err(syn::Error::new(Span::call_site(), msg)),
    }
}
/// Consume the next token, requiring a literal.
fn expect_literal(tokens: &mut Tokens) -> Result<Literal> {
    match tokens.next() {
        Some(TokenTree::Literal(lit)) => Ok(lit),
        Some(tok) => Err(syn::Error::new_spanned(tok, "expected string literal")),
        None => Err(syn::Error::new(
            Span::call_site(),
            "expected string literal",
        )),
    }
}
/// Consume the next token, requiring the punctuation character `ch`.
fn expect_punct(tokens: &mut Tokens, ch: char, msg: &str) -> Result<()> {
    match tokens.next() {
        Some(TokenTree::Punct(p)) if p.as_char() == ch => Ok(()),
        Some(tok) => Err(syn::Error::new_spanned(tok, msg)),
        None => Err(syn::Error::new(Span::call_site(), msg)),
    }
}
/// Consume the next token, requiring a group with the given delimiter.
fn expect_group(tokens: &mut Tokens, delim: Delimiter) -> Result<proc_macro2::Group> {
    match tokens.next() {
        Some(TokenTree::Group(g)) if g.delimiter() == delim => Ok(g),
        Some(tok) => Err(syn::Error::new_spanned(
            tok,
            format!("expected {delim:?} group"),
        )),
        None => Err(syn::Error::new(
            Span::call_site(),
            format!("expected {delim:?} group"),
        )),
    }
}
/// Consume a repetition quantifier and map it to the runtime `Rep` variant.
fn expect_repetition(tokens: &mut Tokens) -> Result<TokenStream> {
    match tokens.next() {
        Some(TokenTree::Punct(p)) => match p.as_char() {
            '*' => Ok(quote! { yeast::query::Rep::ZeroOrMore }),
            '+' => Ok(quote! { yeast::query::Rep::OneOrMore }),
            '?' => Ok(quote! { yeast::query::Rep::ZeroOrOne }),
            _ => Err(syn::Error::new(p.span(), "expected `*`, `+`, or `?`")),
        },
        Some(tok) => Err(syn::Error::new_spanned(
            tok,
            "expected repetition quantifier",
        )),
        None => Err(syn::Error::new(
            Span::call_site(),
            "expected repetition quantifier",
        )),
    }
}
/// Wrap `base` in a `Capture` if the next tokens are `@name`; otherwise
/// pass it through unchanged.
fn maybe_wrap_capture(tokens: &mut Tokens, base: TokenStream) -> Result<TokenStream> {
    if !peek_is_at(tokens) {
        return Ok(base);
    }
    tokens.next(); // discard the `@`
    let name = expect_ident(tokens, "expected capture name after @")?.to_string();
    Ok(quote! {
        yeast::query::QueryNode::Capture {
            capture: #name,
            node: Box::new(#base),
        }
    })
}
/// If a `*`, `+`, or `?` follows, wrap `single` in a `Repeated` element;
/// otherwise pass it through unchanged.
fn maybe_wrap_repetition(tokens: &mut Tokens, single: TokenStream) -> Result<TokenStream> {
    if !peek_is_repetition(tokens) {
        return Ok(single);
    }
    let rep = expect_repetition(tokens)?;
    Ok(quote! {
        yeast::query::QueryListElem::Repeated {
            children: vec![#single],
            rep: #rep,
        }
    })
}
/// If `@name` follows a Repeated list element, wrap each child SingleNode
/// inside the repetition with a Capture. This matches tree-sitter semantics
/// where `(_)* @name` captures each matched node.
fn maybe_wrap_list_capture(tokens: &mut Tokens, elem: TokenStream) -> Result<TokenStream> {
    if peek_is_at(tokens) {
        tokens.next();
        let name = expect_ident(tokens, "expected capture name after @")?;
        let name_str = name.to_string();
        // `elem` is an already-generated token stream, so rather than
        // re-parsing it we emit code that rebuilds the Repeated value at
        // runtime with each SingleNode child wrapped in a Capture.
        // Non-SingleNode children (and non-Repeated elements) are passed
        // through unchanged.
        Ok(quote! {
            {
                let __rep = #elem;
                match __rep {
                    yeast::query::QueryListElem::Repeated { children, rep } => {
                        yeast::query::QueryListElem::Repeated {
                            children: children.into_iter().map(|child| {
                                match child {
                                    yeast::query::QueryListElem::SingleNode(node) => {
                                        yeast::query::QueryListElem::SingleNode(
                                            yeast::query::QueryNode::Capture {
                                                capture: #name_str,
                                                node: Box::new(node),
                                            }
                                        )
                                    }
                                    other => other,
                                }
                            }).collect(),
                            rep,
                        }
                    }
                    other => other,
                }
            }
        })
    } else {
        Ok(elem)
    }
}

18
shared/yeast/BUILD.bazel Normal file
View File

@@ -0,0 +1,18 @@
# Bazel build for the `yeast` desugaring library crate.
load("@rules_rust//rust:defs.bzl", "rust_library")
load("//misc/bazel/3rdparty/tree_sitter_extractors_deps:defs.bzl", "aliases", "all_crate_deps")

exports_files(["Cargo.toml"])

rust_library(
    name = "yeast",
    srcs = glob(
        ["src/**/*.rs"],
        # Binary entry points are not part of the library target.
        exclude = ["src/bin/**"],
    ),
    aliases = aliases(),
    proc_macro_deps = [
        "//shared/yeast-macros",
    ],
    visibility = ["//visibility:public"],
    # Third-party dependencies are resolved from the vendored crate universe.
    deps = all_crate_deps(),
)

15
shared/yeast/Cargo.toml Normal file
View File

@@ -0,0 +1,15 @@
# The yeast tree desugaring library.
[package]
name = "yeast"
version = "0.1.0"
edition = "2021"

[dependencies]
clap = { version = "4.4.10", features = ["derive"] }
serde = { version = "1.0.193", features = ["derive"] }
serde_json = "1.0.108"
serde_yaml = "0.9"
# Loose lower bound so the crate can link against whichever tree-sitter
# version the workspace pins.
tree-sitter = ">= 0.23.0"
yeast-macros = { path = "../yeast-macros" }
# Bundled tree-sitter grammars (presumably for tests/binaries — confirm).
tree-sitter-ruby = "0.23"
tree-sitter-python = "0.23"

View File

@@ -0,0 +1,241 @@
# YAML Node Types Format
The YAML node-types format is a human-friendly alternative to tree-sitter's
`node-types.json`. It can be converted to and from JSON using the
`node_types_yaml` tool.
## Overview
A YAML node-types file has three top-level sections:
```yaml
supertypes:
# Abstract union types
named:
# Concrete AST nodes and leaf tokens
unnamed:
# Punctuation and keyword tokens
```
All three sections are optional. If omitted, they default to empty.
## Supertypes
Supertypes are abstract groupings of node types (unions). Each supertype maps
to a list of its members:
```yaml
supertypes:
_expression:
- assignment
- binary
- identifier
- call
```
This corresponds to the following JSON:
```json
{
"type": "_expression",
"named": true,
"subtypes": [
{ "type": "assignment", "named": true },
{ "type": "binary", "named": true },
{ "type": "identifier", "named": true },
{ "type": "call", "named": true }
]
}
```
Members are resolved as named or unnamed using the
[type reference rules](#type-references) described below.
## Named nodes
Named nodes are concrete AST node types. Each entry is a node kind mapping to
its fields. A node with no fields (a leaf token like `identifier`) uses an
empty value:
```yaml
named:
identifier:
constant:
```
```json
{"type": "identifier", "named": true, "fields": {}},
{"type": "constant", "named": true, "fields": {}}
```
### Fields
Each field has a name, a multiplicity suffix, and a list of allowed types.
| Suffix | Meaning | JSON `multiple` | JSON `required` |
| ------ | ------------ | --------------- | --------------- |
| (none) | exactly one | `false` | `true` |
| `?` | zero or one | `false` | `false` |
| `+` | one or more | `true` | `true` |
| `*` | zero or more | `true` | `false` |
Example:
```yaml
named:
assignment:
left: _lhs
right: _expression
```
```json
{
"type": "assignment",
"named": true,
"fields": {
"left": {
"multiple": false,
"required": true,
"types": [{ "type": "_lhs", "named": true }]
},
"right": {
"multiple": false,
"required": true,
"types": [{ "type": "_expression", "named": true }]
}
}
}
```
A field with multiple allowed types uses a list:
```yaml
named:
binary:
left: [_expression, _simple_numeric]
operator: ["!=", "+", "&&"]
right: _expression
```
A singleton list can be written as a bare value (as shown with `right` above).
### Unnamed children
Unnamed children (nodes that appear as children without a field name) are
specified using the special `$children` field name, with the same suffixes:
```yaml
named:
argument_list:
$children*: [_expression, block_argument, splat_argument]
```
```json
{
"type": "argument_list",
"named": true,
"fields": {},
"children": {
"multiple": true,
"required": false,
"types": [
{ "type": "_expression", "named": true },
{ "type": "block_argument", "named": true },
{ "type": "splat_argument", "named": true }
]
}
}
```
## Unnamed tokens
Unnamed tokens are punctuation, operators, and keywords that appear in the
parse tree but don't have their own AST node type. They are listed as simple
strings:
```yaml
unnamed:
- "="
- "end"
- "+"
- "&&"
```
```json
{"type": "=", "named": false},
{"type": "end", "named": false},
{"type": "+", "named": false},
{"type": "&&", "named": false}
```
When converting to YAML, unnamed tokens are always wrapped in quotes for
visual clarity. This is purely cosmetic — YAML treats `end` and `"end"` as
the same string.
## Type references
When a type name appears in a field's type list or a supertype's member list,
it needs to be resolved as either named or unnamed. The rules are:
1. If the name only appears in `named` or `supertypes`, it is **named**.
2. If the name only appears in `unnamed`, it is **unnamed**.
3. If the name appears in both, it defaults to **named**.
4. To explicitly reference an unnamed type in the ambiguous case, use the
map form:
```yaml
named:
example:
field: { unnamed: foo }
```
In practice, ambiguity is rare — names like `end`, `+`, `if` are almost
always only unnamed, while names like `identifier`, `assignment` are only
named.
## Complete example
```yaml
supertypes:
_expression:
- assignment
- binary
- identifier
named:
assignment:
left: _expression
right?: _expression
binary:
left: [_expression, _simple_numeric]
operator: ["!=", "+"]
right: _expression
argument_list:
$children*: [_expression, block_argument]
identifier:
constant:
unnamed:
- "!="
- "+"
- "="
- "end"
```
## CLI usage
Convert YAML to JSON:
```
node_types_yaml input.yaml > node-types.json
```
Convert JSON to YAML:
```
node_types_yaml --from-json node-types.json > node-types.yaml
```
Both commands also accept input from stdin if no file argument is given.

329
shared/yeast/doc/yeast.md Normal file
View File

@@ -0,0 +1,329 @@
# YEAST — YEAST Elaborates Abstract Syntax Trees
YEAST is a framework for transforming tree-sitter parse trees before they are
extracted into a CodeQL database. It sits between the tree-sitter parser and
the TRAP extractor, rewriting parts of the AST according to declarative rules.
## Motivation
Tree-sitter grammars describe the **concrete syntax** of a language — every
keyword, operator, and punctuation token appears in the parse tree. CodeQL
analyses often prefer a **simplified abstract syntax** where syntactic sugar
has been removed. YEAST bridges this gap by desugaring the tree-sitter output
into a cleaner form before extraction.
For example, Ruby's `for x in list do ... end` is syntactic sugar for
`list.each { |x| ... }`. A YEAST rule can rewrite the former into the latter
so that CodeQL queries only need to reason about the `.each` form.
## Architecture
```
Source code
┌──────────────┐
│ tree-sitter │ Parse source into a concrete syntax tree
│ parser │
└──────┬───────┘
│ tree_sitter::Tree
┌──────────────┐
│ YEAST │ Apply desugaring rules, producing a new AST
│ Runner │
└──────┬───────┘
│ yeast::Ast
┌──────────────┐
│ TRAP │ Walk the (possibly rewritten) AST and emit TRAP tuples
│ extractor │
└──────────────┘
```
The entry point is `extract()` in the shared tree-sitter extractor. When
called with a non-empty `rules` vector, the parsed tree is run through the
YEAST `Runner` before TRAP extraction; with an empty `rules` vector the
tree is extracted unchanged.
## How desugaring works
A YEAST `Rule` has two parts:
1. A **query** that matches nodes in the AST using a tree-sitter-inspired
pattern language.
2. A **transform** that produces replacement nodes from the match captures.
The `Runner` applies rules by walking the tree top-down. At each node, it
tries each rule in order. If a rule's query matches, the node is replaced by
the transform's output, and the rules are re-applied to the result. If no
rule matches, the node is kept and its children are processed recursively.
A rule can replace one node with zero nodes (deletion), one node (rewriting),
or multiple nodes (expansion).
## Query language
Queries use a syntax inspired by
[tree-sitter queries](https://tree-sitter.github.io/tree-sitter/using-parsers/queries/index.html),
written inside the `yeast::query!()` proc macro.
### Node patterns
```rust
// Match any named node
(_)
// Match a node of a specific kind
(assignment)
// Match an unnamed token by its text
("end")
```
### Fields
```rust
// Match a node with specific fields
(assignment
left: (identifier) @lhs
right: (_) @rhs
)
```
Fields are matched by name. Unmentioned fields are ignored — the pattern
`(assignment left: (_) @x)` matches any `assignment` node regardless of
what's in `right`.
### Captures
Captures bind matched nodes to names for use in the transform. A capture
`@name` always follows the pattern it captures:
```rust
(identifier) @name // capture an identifier node
(_) @value // capture any named node
(identifier)* @items // capture each repeated match
```
### Unnamed children
Patterns that appear after all named fields match unnamed (positional)
children. Named node patterns like `(_)` automatically skip unnamed tokens
(keywords, operators, punctuation), matching tree-sitter semantics:
```rust
(for
pattern: (_) @pat // named field
value: (in (_) @val) // "in" token is skipped automatically
body: (do (_)* @body) // "do" and "end" tokens skipped
)
```
### Repetitions
```rust
(_)* // zero or more
(_)+ // one or more
(_)? // zero or one
(identifier)* @names // capture each repeated match
```
## Template language
Templates construct new AST nodes using the `tree!` and `trees!` macros.
All children in a template must be in named fields — output AST nodes are
always fully fielded.
When used inside a `rule!` macro, the context is implicit — no explicit
`BuildCtx` argument is needed. When used standalone, they take a `BuildCtx`
as the first argument:
```rust
// Inside rule! — implicit context, captures are Rust variables
yeast::rule!(
(assignment left: (_) @left right: (_) @right)
=>
(assignment left: {right} right: {left})
);
// Standalone — explicit context
let fresh = yeast::tree_builder::FreshScope::new();
let mut ctx = BuildCtx::new(ast, &captures, &fresh);
let id = yeast::tree!(ctx,
(assignment
left: {ctx.capture("lhs")}
right: {ctx.capture("rhs")}
)
);
```
### `tree!` — build a single node
`tree!(...)` returns a single node `Id`:
```rust
yeast::tree!(ctx,
(assignment
left: {ctx.capture("lhs")}
right: {ctx.capture("rhs")}
)
)
```
### `trees!` — build multiple nodes
`trees!(...)` returns `Vec<Id>`:
```rust
yeast::trees!(ctx,
(assignment left: {tmp} right: {right})
{..body}
)
```
### Literal nodes
`(kind "text")` creates a leaf node with fixed text content:
```rust
(identifier "each") // an identifier node whose text is "each"
```
### Computed literals
`(kind #{expr})` creates a leaf node whose content is `expr.to_string()`:
```rust
(integer #{i}) // an integer node with the value of i
(identifier #{name}) // an identifier from a Rust variable
```
### Fresh identifiers
`(kind $name)` creates a leaf node with an auto-generated unique name. All
occurrences of the same `$name` within one `BuildCtx` share the same value:
```rust
(block
parameters: (block_parameters
(identifier $tmp) // generates e.g. "$tmp-0"
)
body: (block_body
(assignment
left: {pat}
right: (identifier $tmp) // same "$tmp-0" value
)
)
)
```
### Embedded Rust expressions
`{expr}` embeds a Rust expression that returns a single node `Id`:
```rust
(assignment
left: {some_node_id} // insert a pre-built node
right: {rhs} // insert a captured value (inside rule!)
)
```
`{..expr}` splices a `Vec<Id>` (or any iterable of `Id`):
```rust
yeast::trees!(ctx,
(assignment left: {tmp} right: {right})
{..extra_nodes} // splice a Vec<Id>
)
```
Inside `rule!`, captures are Rust variables, so `{name}` inserts a
single capture (`Id`) and `{..name}` splices a repeated capture
(`Vec<Id>`).
## Complete example: for-loop desugaring
This rule rewrites Ruby's `for pat in val do body end` into
`val.each { |tmp| pat = tmp; body }`:
```rust
let for_rule = yeast::rule!(
(for
pattern: (_) @pat
value: (in (_) @val)
body: (do (_)* @body)
)
=>
(call
receiver: {val}
method: (identifier "each")
block: (block
parameters: (block_parameters
(identifier $tmp)
)
body: (block_body
(assignment
left: {pat}
right: (identifier $tmp)
)
{..body}
)
)
)
);
```
Captures from the query (`@pat`, `@val`, `@body`) become Rust variables
automatically: single captures bind as `Id`, repeated captures (after
`*` or `+`) as `Vec<Id>`, and optional captures (after `?`) as
`Option<Id>`.
## The `rule!` macro
`rule!` combines a query and a transform into a single declaration:
```rust
// Full template form
yeast::rule!(
(query_pattern field: (_) @capture)
=>
(output_template field: {capture})
)
// Shorthand form — captures become fields on the output node
yeast::rule!(
(query_pattern field: (_) @capture)
=> output_kind
)
```
The shorthand `=> kind` form auto-generates the template, mapping each
capture name to a field of the same name on the output node.
## Integration with the extractor
A YEAST desugaring pass is configured with a [`DesugaringConfig`], which
carries the rules and an optional output node-types schema (in YAML
format). Attach it to a language spec to enable rewriting:
```rust
let desugar = yeast::DesugaringConfig::new(my_rules)
.with_output_node_types_yaml(include_str!("output-node-types.yml"));
let lang = simple::LanguageSpec {
prefix: "ruby",
ts_language: tree_sitter_ruby::LANGUAGE.into(),
node_types: tree_sitter_ruby::NODE_TYPES,
desugar: Some(desugar),
file_globs: vec!["*.rb".into()],
};
```
The same YAML node-types file is used both for the runtime yeast `Schema`
(so rules can refer to output-only kinds and fields) and for TRAP
validation (for which it is converted to JSON internally).
For the dbscheme/QL code generator, set `Language::desugar` to a
`DesugaringConfig` carrying the same YAML; the generator converts it to
JSON for downstream code generation. The `rules` field of the config is
unused at code-generation time.

View File

@@ -0,0 +1,26 @@
use clap::Parser;
// Command-line interface for the yeast debugging tool: parse one file and
// print the desugared AST. (Plain `//` comments here on purpose — doc
// comments on clap-derive fields would change the generated --help text.)
#[derive(Parser)]
#[clap(name = "yeast", about = "yeast elaborates abstract syntax trees")]
struct Cli {
    // Path of the source file to parse.
    file: String,
    // Grammar to parse with; see get_language for the supported values.
    #[clap(default_value = "ruby")]
    language: String,
}
/// Resolve a language name to its tree-sitter grammar.
///
/// Panics on any name other than "ruby" or "python"; acceptable for this
/// debugging binary.
fn get_language(name: &str) -> tree_sitter::Language {
    if name == "ruby" {
        tree_sitter_ruby::LANGUAGE.into()
    } else if name == "python" {
        tree_sitter_python::LANGUAGE.into()
    } else {
        panic!("Unsupported language: {name}")
    }
}
/// Entry point: parse the given file with the selected grammar, run the
/// (currently empty) yeast rule set over it, and print the resulting AST
/// as JSON.
fn main() {
    let args = Cli::parse();
    let language = get_language(&args.language);
    // Fail with a readable message on I/O errors instead of a panic
    // backtrace — consistent with the node-types-yaml binary.
    let source = std::fs::read_to_string(&args.file).unwrap_or_else(|e| {
        eprintln!("Error reading {}: {e}", args.file);
        std::process::exit(1);
    });
    let runner = yeast::Runner::new(language, &[]);
    // A desugaring failure indicates a bug or unsupported input; a panic
    // with context is acceptable for this debugging tool.
    let ast = runner.run(&source).expect("failed to run yeast on input");
    println!("{}", ast.print(&source, ast.get_root()));
}

View File

@@ -0,0 +1,51 @@
use clap::Parser;
use std::io::Read;
// Command-line interface for the node-types converter. The `///` comments
// on the fields below double as the clap --help text and are therefore
// left exactly as-is.
#[derive(Parser)]
#[clap(
    name = "node-types-yaml",
    about = "Convert between YAML and JSON node-types formats"
)]
struct Cli {
    /// Input file (reads from stdin if not provided)
    input: Option<String>,
    /// Convert from JSON to YAML (default is YAML to JSON)
    #[arg(long)]
    from_json: bool,
}
/// Entry point: read node-types from a file or stdin, convert between the
/// YAML and JSON representations, and write the result to stdout. Any
/// failure is reported on stderr with exit code 1.
fn main() {
    let args = Cli::parse();
    // Pull the whole input, either from the named file or from stdin.
    let input = if let Some(path) = &args.input {
        std::fs::read_to_string(path).unwrap_or_else(|e| {
            eprintln!("Error reading {path}: {e}");
            std::process::exit(1);
        })
    } else {
        let mut buf = String::new();
        std::io::stdin()
            .read_to_string(&mut buf)
            .unwrap_or_else(|e| {
                eprintln!("Error reading stdin: {e}");
                std::process::exit(1);
            });
        buf
    };
    // Choose the conversion direction from the flag.
    let conversion = if args.from_json {
        yeast::node_types_yaml::convert_from_json(&input)
    } else {
        yeast::node_types_yaml::convert(&input)
    };
    let output = conversion.unwrap_or_else(|e| {
        eprintln!("Error: {e}");
        std::process::exit(1);
    });
    print!("{output}");
}

91
shared/yeast/src/build.rs Normal file
View File

@@ -0,0 +1,91 @@
use std::collections::BTreeMap;
use crate::captures::Captures;
use crate::tree_builder::FreshScope;
use crate::{Ast, FieldId, Id, NodeContent};
/// Context for building new AST nodes during a transformation.
///
/// Used by the `tree!` and `trees!` macros. Holds a mutable reference to the
/// AST, a reference to the captures from a query match, and a `FreshScope` for
/// generating unique identifiers.
pub struct BuildCtx<'a> {
    /// The AST being extended with newly built nodes.
    pub ast: &'a mut Ast,
    /// Captures from the query match driving this transformation.
    pub captures: &'a Captures,
    /// Generator for unique identifier names (see `BuildCtx::fresh`).
    pub fresh: &'a FreshScope,
    /// Source range of the matched node, inherited by synthetic nodes.
    pub source_range: Option<tree_sitter::Range>,
}
impl<'a> BuildCtx<'a> {
    /// Create a context whose synthetic nodes carry no source range.
    pub fn new(ast: &'a mut Ast, captures: &'a Captures, fresh: &'a FreshScope) -> Self {
        Self::with_source_range(ast, captures, fresh, None)
    }

    /// Create a context whose synthetic nodes inherit `source_range`.
    pub fn with_source_range(
        ast: &'a mut Ast,
        captures: &'a Captures,
        fresh: &'a FreshScope,
        source_range: Option<tree_sitter::Range>,
    ) -> Self {
        Self {
            ast,
            captures,
            fresh,
            source_range,
        }
    }

    /// Look up a capture variable, returning its node Id.
    ///
    /// Panics if the capture is absent or matched a number of nodes
    /// other than one.
    pub fn capture(&self, name: &str) -> Id {
        match self.captures.get_var(name) {
            Ok(id) => id,
            Err(e) => panic!("build: {e}"),
        }
    }

    /// Get all values of a repeated capture variable.
    pub fn capture_all(&self, name: &str) -> Vec<Id> {
        self.captures.get_all(name)
    }

    /// Create a named AST node with the given kind and fields.
    ///
    /// Panics if the kind or any field name is unknown to the schema.
    pub fn node(&mut self, kind: &str, fields: Vec<(&str, Vec<Id>)>) -> Id {
        let kind_id = match self.ast.id_for_node_kind(kind) {
            Some(id) => id,
            None => panic!("build: node kind '{kind}' not found"),
        };
        let mut field_map: BTreeMap<FieldId, Vec<Id>> = BTreeMap::new();
        for (name, ids) in fields {
            let field_id = match self.ast.field_id_for_name(name) {
                Some(id) => id,
                None => panic!("build: field '{name}' not found"),
            };
            // Repeated mentions of a field accumulate their children.
            field_map.entry(field_id).or_default().extend(ids);
        }
        self.ast.create_node_with_range(
            kind_id,
            NodeContent::DynamicString(String::new()),
            field_map,
            true,
            self.source_range,
        )
    }

    /// Create a leaf node with a fixed string content.
    pub fn literal(&mut self, kind: &'static str, value: &str) -> Id {
        let content = value.to_string();
        self.ast
            .create_named_token_with_range(kind, content, self.source_range)
    }

    /// Create a leaf node with an auto-generated unique name.
    pub fn fresh(&mut self, kind: &'static str, name: &str) -> Id {
        let generated = self.fresh.resolve(name);
        self.ast
            .create_named_token_with_range(kind, generated, self.source_range)
    }
}

View File

@@ -0,0 +1,105 @@
use std::collections::{BTreeMap, BTreeSet};
use crate::Id;
/// The bindings produced by a query match: a map from capture name (as
/// written in the query, e.g. `@lhs`) to the node Ids it matched. A name
/// maps to several Ids when it occurs under a repetition.
#[derive(Debug, Clone)]
pub struct Captures {
    captures: BTreeMap<&'static str, Vec<Id>>,
}
impl Default for Captures {
fn default() -> Self {
Self::new()
}
}
impl Captures {
    /// Create an empty capture set.
    pub fn new() -> Self {
        Captures {
            captures: BTreeMap::new(),
        }
    }
    /// Look up a capture expected to have matched exactly once, returning
    /// its node Id. Errors if the name is unknown or if it matched a
    /// number of nodes other than one.
    pub fn get_var(&self, key: &str) -> Result<Id, String> {
        let ids = self.captures.get(key);
        if let Some(ids) = ids {
            if ids.len() == 1 {
                Ok(ids[0])
            } else {
                Err(format!(
                    "Variable {} has {} matches, use * to allow repetition",
                    key,
                    ids.len()
                ))
            }
        } else {
            Err(format!("No variable named {key}"))
        }
    }
    /// Get all values of a capture variable (for repeated captures).
    /// Unknown names yield an empty vector rather than an error.
    pub fn get_all(&self, key: &str) -> Vec<Id> {
        self.captures.get(key).cloned().unwrap_or_default()
    }
    /// Get an optional capture variable. Returns None if unmatched,
    /// Some(id) if matched exactly once.
    /// Note: also returns None when the name matched more than once.
    pub fn get_opt(&self, key: &str) -> Option<Id> {
        self.captures
            .get(key)
            .and_then(|ids| if ids.len() == 1 { Some(ids[0]) } else { None })
    }
    /// Record a match for `key`; repeated inserts for the same name
    /// accumulate in order.
    pub fn insert(&mut self, key: &'static str, id: Id) {
        self.captures.entry(key).or_default().push(id);
    }
    /// Rewrite every Id captured under `kind` in place using `f`.
    pub fn map_captures(&mut self, kind: &str, f: &mut impl FnMut(Id) -> Id) {
        if let Some(ids) = self.captures.get_mut(kind) {
            for id in ids {
                *id = f(*id);
            }
        }
    }
    /// Map the Ids captured under `from` through `f` and store the result
    /// under `to`, replacing any previous value of `to`. Does nothing if
    /// `from` is absent.
    pub fn map_captures_to(&mut self, from: &str, to: &'static str, f: &mut impl FnMut(Id) -> Id) {
        if let Some(from_ids) = self.captures.get(from) {
            let new_values = from_ids.iter().copied().map(f).collect();
            self.captures.insert(to, new_values);
        }
    }
    /// Merge `other` into `self`, appending to any existing entries.
    pub fn merge(&mut self, other: &Captures) {
        for (key, ids) in &other.captures {
            self.captures.entry(key).or_default().extend(ids);
        }
    }
    /// Split captures gathered under a repetition into one `Captures` per
    /// iteration: element `i` of the returned iterator holds the `i`-th
    /// match of every name in `children`. Errors if `children` is empty or
    /// if the names did not all match the same number of times.
    pub fn un_star<'a>(
        &'a self,
        children: &'a BTreeSet<&'static str>,
    ) -> Result<impl Iterator<Item = Captures> + 'a, String> {
        let mut id_iter = children.iter();
        if let Some(fst) = id_iter.next() {
            // The first name's match count is the reference all others
            // must agree with.
            let repeats = self
                .captures
                .get(fst)
                .ok_or_else(|| format!("No variable named {fst}"))?
                .len();
            // TODO: better error on missing capture
            if id_iter.any(|id| self.captures.get(id).map(Vec::len).unwrap_or(0) != repeats) {
                return Err("Repeated captures must have the same number of matches".to_string());
            }
            Ok((0..repeats).map(move |iter| {
                let mut new_vars: Captures = Captures::new();
                for id in children {
                    let child_capture = self.captures.get(id).unwrap()[iter];
                    new_vars.captures.insert(id, vec![child_capture]);
                }
                new_vars
            }))
        } else {
            Err("Repeated captures must have at least one capture".to_string())
        }
    }
}

View File

@@ -0,0 +1,8 @@
/// A tree cursor abstracted over the node type `N` and field-id type `F`
/// (`T` identifies the owning tree type). Implemented for the yeast AST by
/// `AstCursor`.
pub trait Cursor<'a, T, N, F> {
    /// The node the cursor currently points at.
    fn node(&self) -> &'a N;
    /// The field id under which the current node sits in its parent, if
    /// the cursor is not at the root.
    fn field_id(&self) -> Option<F>;
    /// The name of that field, or `None` for unnamed children or the root.
    fn field_name(&self) -> Option<&'static str>;
    /// Move to the first child; returns `false` if there is none.
    fn goto_first_child(&mut self) -> bool;
    /// Move to the next sibling; returns `false` if there is none.
    fn goto_next_sibling(&mut self) -> bool;
    /// Move back up to the parent; returns `false` at the root.
    fn goto_parent(&mut self) -> bool;
}

181
shared/yeast/src/dump.rs Normal file
View File

@@ -0,0 +1,181 @@
use std::fmt::Write;
use crate::{Ast, Node, NodeContent, CHILD_FIELD};
/// Options for controlling AST dump output.
pub struct DumpOptions {
    /// Whether to include source locations in the output.
    /// Defaults to `false`.
    pub show_locations: bool,
    /// Whether to include source text for leaf nodes.
    /// Defaults to `true`.
    pub show_content: bool,
}
impl Default for DumpOptions {
fn default() -> Self {
Self {
show_locations: false,
show_content: true,
}
}
}
/// Dump a yeast AST as a human-readable indented text format, using the
/// default [`DumpOptions`] (content shown, no locations).
///
/// Output format:
/// ```text
/// program
///   assignment
///     left:
///       left_assignment_list
///         identifier "x"
///         identifier "y"
///     right:
///       call
///         method:
///           identifier "foo"
/// ```
pub fn dump_ast(ast: &Ast, root: usize, source: &str) -> String {
    let options = DumpOptions::default();
    dump_ast_with_options(ast, root, source, &options)
}
/// Dump a yeast AST as indented text, with explicit formatting options.
pub fn dump_ast_with_options(
    ast: &Ast,
    root: usize,
    source: &str,
    options: &DumpOptions,
) -> String {
    // Accumulate everything into one buffer; dump_node recurses from the root.
    let mut buffer = String::new();
    dump_node(ast, root, source, options, 0, &mut buffer);
    buffer
}
/// Recursively write one node and its subtree to `out` at the given indent
/// level. Named fields are printed first; unnamed children come last, with
/// unnamed tokens (keywords, punctuation) skipped entirely.
fn dump_node(
    ast: &Ast,
    id: usize,
    source: &str,
    options: &DumpOptions,
    indent: usize,
    out: &mut String,
) {
    let node = match ast.get_node(id) {
        Some(n) => n,
        // Dangling ids are silently skipped rather than panicking.
        None => return,
    };
    let prefix = " ".repeat(indent);
    // Node kind
    write!(out, "{}{}", prefix, node.kind_name()).unwrap();
    // Location (1-based rows/columns)
    if options.show_locations {
        let start = node.start_position();
        let end = node.end_position();
        write!(
            out,
            " [{},{}]-[{},{}]",
            start.row + 1,
            start.column + 1,
            end.row + 1,
            end.column + 1
        )
        .unwrap();
    }
    // Content for leaf nodes
    if options.show_content && node.is_named() && is_leaf(node) {
        let content = node_content(node, source);
        if !content.is_empty() {
            write!(out, " {content:?}").unwrap();
        }
    }
    writeln!(out).unwrap();
    // Named fields first
    for (&field_id, children) in &node.fields {
        if field_id == CHILD_FIELD {
            continue; // Handle unnamed children last
        }
        let field_name = ast.field_name_for_id(field_id).unwrap_or("?");
        if children.len() == 1 {
            write!(out, "{prefix} {field_name}:").unwrap();
            // A single leaf child is rendered inline on the field's line;
            // anything else goes on its own indented line.
            let child = ast.get_node(children[0]);
            if child.is_some_and(is_leaf) {
                write!(out, " ").unwrap();
                dump_node_inline(ast, children[0], source, options, out);
            } else {
                writeln!(out).unwrap();
                dump_node(ast, children[0], source, options, indent + 2, out);
            }
        } else {
            writeln!(out, "{prefix} {field_name}:").unwrap();
            for &child_id in children {
                dump_node(ast, child_id, source, options, indent + 2, out);
            }
        }
    }
    // Unnamed children — skip unnamed tokens (keywords, punctuation).
    // Note: these are indented one level less deep than field children,
    // since they have no field-name line of their own.
    if let Some(children) = node.fields.get(&CHILD_FIELD) {
        for &child_id in children {
            if let Some(child) = ast.get_node(child_id) {
                if child.is_named() {
                    dump_node(ast, child_id, source, options, indent + 1, out);
                }
            }
        }
    }
}
/// Dump a leaf node inline (no newline prefix, caller provides context).
/// Emits the kind, optional location, and optional content, then the
/// terminating newline — mirroring the header part of `dump_node`.
fn dump_node_inline(ast: &Ast, id: usize, source: &str, options: &DumpOptions, out: &mut String) {
    let node = match ast.get_node(id) {
        Some(n) => n,
        // Dangling ids are silently skipped rather than panicking.
        None => return,
    };
    write!(out, "{}", node.kind_name()).unwrap();
    if options.show_locations {
        let start = node.start_position();
        let end = node.end_position();
        write!(
            out,
            " [{},{}]-[{},{}]",
            start.row + 1,
            start.column + 1,
            end.row + 1,
            end.column + 1
        )
        .unwrap();
    }
    if options.show_content && node.is_named() {
        let content = node_content(node, source);
        if !content.is_empty() {
            write!(out, " {content:?}").unwrap();
        }
    }
    writeln!(out).unwrap();
}
/// A node with no fields at all (and hence no children) is a leaf.
fn is_leaf(node: &Node) -> bool {
    node.fields.keys().next().is_none()
}
/// Best-effort textual content of a node: synthesized nodes with a
/// non-empty `DynamicString` carry their own text; otherwise the node's
/// byte range is looked up in `source`.
///
/// Returns an empty string when the range is out of bounds, inverted, or
/// does not fall on UTF-8 character boundaries (e.g. synthetic nodes with
/// no source range at all).
fn node_content(node: &Node, source: &str) -> String {
    match &node.content {
        NodeContent::DynamicString(s) if !s.is_empty() => s.clone(),
        // NOTE(review): `NodeContent::String` falls through to the range
        // lookup and its static text is ignored — confirm that is intended.
        _ => {
            // `str::get` returns None for out-of-range, inverted, or
            // non-char-boundary ranges, where direct indexing would panic.
            source
                .get(node.byte_range())
                .map(str::to_string)
                .unwrap_or_default()
        }
    }
}

727
shared/yeast/src/lib.rs Normal file
View File

@@ -0,0 +1,727 @@
use std::collections::BTreeMap;
extern crate self as yeast;
use serde::Serialize;
use serde_json::{json, Value};
pub mod build;
pub mod captures;
pub mod cursor;
pub mod dump;
pub mod node_types_yaml;
pub mod query;
mod range;
pub mod schema;
pub mod tree_builder;
mod visitor;
pub use yeast_macros::{query, rule, tree, trees};
use captures::Captures;
pub use cursor::Cursor;
use query::QueryNode;
/// Node ids are indexes into the arena
type Id = usize;
/// Field and Kind ids are provided by tree-sitter
type FieldId = u16;
type KindId = u16;
/// Sentinel field id grouping unnamed (positional) children in a node's
/// field map; assumed not to collide with any real tree-sitter field id.
pub const CHILD_FIELD: u16 = u16::MAX;
/// A [`Cursor`] over a yeast [`Ast`], keeping an explicit stack of
/// ancestors so it can navigate back up the tree.
#[derive(Debug)]
pub struct AstCursor<'a> {
    ast: &'a Ast,
    /// A stack of parents, along with iterators for their children
    parents: Vec<(&'a Node, ChildrenIter<'a>)>,
    /// The node the cursor currently points at.
    node: &'a Node,
}
impl<'a> AstCursor<'a> {
pub fn new(ast: &'a Ast) -> Self {
// TODO: handle non-zero root
let node = ast.get_node(ast.root).unwrap();
Self {
ast,
parents: vec![],
node,
}
}
fn goto_next_sibling_opt(&mut self) -> Option<()> {
self.node = self.parents.last_mut()?.1.next()?;
Some(())
}
fn goto_first_child_opt(&mut self) -> Option<()> {
let parent = self.node;
let mut children = ChildrenIter::new(self.ast, parent);
let first_child = children.next()?;
self.node = first_child;
self.parents.push((parent, children));
Some(())
}
fn goto_parent_opt(&mut self) -> Option<()> {
self.node = self.parents.pop()?.0;
Some(())
}
}
impl<'a> Cursor<'a, Ast, Node, FieldId> for AstCursor<'a> {
    fn node(&self) -> &'a Node {
        self.node
    }
    fn field_id(&self) -> Option<FieldId> {
        // The field id comes from the parent's child iterator, if any.
        self.parents
            .last()
            .and_then(|(_, children)| children.current_field())
    }
    fn field_name(&self) -> Option<&'static str> {
        match self.field_id() {
            // Unnamed children have no field name.
            Some(CHILD_FIELD) => None,
            Some(id) => self.ast.field_name_for_id(id),
            None => None,
        }
    }
    fn goto_first_child(&mut self) -> bool {
        self.goto_first_child_opt().is_some()
    }
    fn goto_next_sibling(&mut self) -> bool {
        self.goto_next_sibling_opt().is_some()
    }
    fn goto_parent(&mut self) -> bool {
        self.goto_parent_opt().is_some()
    }
}
/// An iterator over all the child nodes of a node.
#[derive(Debug)]
struct ChildrenIter<'a> {
    ast: &'a Ast,
    /// Field id of the field currently being yielded, if any.
    current_field: Option<FieldId>,
    /// Remaining (field, children) entries still to be visited.
    fields: std::collections::btree_map::Iter<'a, FieldId, Vec<Id>>,
    /// Children of the field currently being yielded.
    field_children: Option<std::slice::Iter<'a, Id>>,
}
impl<'a> ChildrenIter<'a> {
fn new(ast: &'a Ast, node: &'a Node) -> Self {
Self {
ast,
current_field: None,
fields: node.fields.iter(),
field_children: None,
}
}
fn get_node(&self, id: Id) -> &'a Node {
self.ast.get_node(id).unwrap()
}
fn current_field(&self) -> Option<FieldId> {
self.current_field
}
}
impl<'a> Iterator for ChildrenIter<'a> {
    type Item = &'a Node;
    /// Yield children field by field: whenever the current field's child
    /// list is exhausted (or iteration has not started), advance to the
    /// next field entry; stop when the field map is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if let Some(children) = self.field_children.as_mut() {
                if let Some(&child_id) = children.next() {
                    return Some(self.get_node(child_id));
                }
            }
            let (field, children) = self.fields.next()?;
            self.current_field = Some(*field);
            self.field_children = Some(children.iter());
        }
    }
}
/// Our AST
///
/// Nodes live in a flat arena (`nodes`) and refer to each other by index;
/// `schema` maps kind and field ids to their names.
pub struct Ast {
    root: Id,
    nodes: Vec<Node>,
    schema: schema::Schema,
}
impl std::fmt::Debug for Ast {
    /// Compact debug form: root id and node count only — dumping the whole
    /// arena would be far too noisy.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut dbg = f.debug_struct("Ast");
        dbg.field("root", &self.root);
        dbg.field("nodes", &self.nodes.len());
        dbg.finish()
    }
}
impl Ast {
    /// Construct an AST from a TS tree
    pub fn from_tree(language: tree_sitter::Language, tree: &tree_sitter::Tree) -> Self {
        let schema = schema::Schema::from_language(&language);
        Self::from_tree_with_schema(schema, tree, &language)
    }
    /// Construct an AST from a TS tree using a caller-supplied schema
    /// (which may include kinds and fields beyond the language's own).
    pub fn from_tree_with_schema(
        schema: schema::Schema,
        tree: &tree_sitter::Tree,
        language: &tree_sitter::Language,
    ) -> Self {
        let mut visitor = visitor::Visitor::new(language.clone());
        visitor.visit(tree);
        visitor.build_with_schema(schema)
    }
    /// A cursor positioned at this AST's root node.
    pub fn walk(&self) -> AstCursor {
        AstCursor::new(self)
    }
    /// All nodes in the arena, indexed by their `Id`.
    pub fn nodes(&self) -> &[Node] {
        &self.nodes
    }
    pub fn get_root(&self) -> Id {
        self.root
    }
    pub fn set_root(&mut self, root: Id) {
        self.root = root;
    }
    /// Look up a node by id; `None` for out-of-range ids.
    pub fn get_node(&self, id: Id) -> Option<&Node> {
        self.nodes.get(id)
    }
    /// Render the subtree rooted at `root_id` as a JSON value, for
    /// debugging. Panics on an invalid `root_id`.
    pub fn print(&self, source: &str, root_id: Id) -> Value {
        let root = &self.nodes()[root_id];
        self.print_node(root, source)
    }
    /// Append a new node to the arena and return its id.
    pub fn create_node(
        &mut self,
        kind: KindId,
        content: NodeContent,
        fields: BTreeMap<FieldId, Vec<Id>>,
        is_named: bool,
    ) -> Id {
        self.create_node_with_range(kind, content, fields, is_named, None)
    }
    /// Append a new node carrying an inherited source range (used by
    /// synthetic nodes so they still have location information).
    /// Panics if `kind` is unknown to the schema.
    pub fn create_node_with_range(
        &mut self,
        kind: KindId,
        content: NodeContent,
        fields: BTreeMap<FieldId, Vec<Id>>,
        is_named: bool,
        source_range: Option<tree_sitter::Range>,
    ) -> Id {
        let id = self.nodes.len();
        self.nodes.push(Node {
            id,
            kind,
            kind_name: self.schema.node_kind_for_id(kind).unwrap(),
            fields,
            content,
            is_missing: false,
            is_error: false,
            is_extra: false,
            is_named,
            source_range,
        });
        id
    }
    /// Append a synthetic leaf token of the given kind and text.
    pub fn create_named_token(&mut self, kind: &'static str, content: String) -> Id {
        self.create_named_token_with_range(kind, content, None)
    }
    /// Append a synthetic leaf token, optionally carrying a source range.
    /// Panics if `kind` is not present in the schema.
    pub fn create_named_token_with_range(
        &mut self,
        kind: &'static str,
        content: String,
        source_range: Option<tree_sitter::Range>,
    ) -> Id {
        let kind_id = self.schema.id_for_node_kind(kind).unwrap_or_else(|| {
            panic!("create_named_token: node kind '{kind}' not found in schema")
        });
        let id = self.nodes.len();
        self.nodes.push(Node {
            id,
            kind: kind_id,
            kind_name: kind,
            is_named: true,
            is_missing: false,
            is_error: false,
            source_range,
            is_extra: false,
            fields: BTreeMap::new(),
            content: NodeContent::DynamicString(content),
        });
        id
    }
    pub fn field_name_for_id(&self, id: FieldId) -> Option<&'static str> {
        self.schema.field_name_for_id(id)
    }
    pub fn field_id_for_name(&self, name: &str) -> Option<FieldId> {
        self.schema.field_id_for_name(name)
    }
    /// Print a node for debugging
    fn print_node(&self, node: &Node, source: &str) -> Value {
        let fields: BTreeMap<&'static str, Vec<Value>> = node
            .fields
            .iter()
            .map(|(field_id, nodes)| {
                // Unnamed children are grouped under the pseudo-field "rest".
                let field_name = if field_id == &CHILD_FIELD {
                    "rest"
                } else {
                    self.field_name_for_id(*field_id).unwrap()
                };
                let nodes: Vec<Value> = nodes
                    .iter()
                    .map(|id| self.print_node(self.get_node(*id).unwrap(), source))
                    .collect();
                (field_name, nodes)
            })
            .collect();
        let mut value = BTreeMap::new();
        let kind = self.schema.node_kind_for_id(node.kind).unwrap();
        let content = match &node.content {
            NodeContent::Range(range) => source[range.start_byte..range.end_byte].to_string(),
            NodeContent::String(s) => s.to_string(),
            NodeContent::DynamicString(s) => s.clone(),
        };
        if fields.is_empty() {
            value.insert(kind, json!(content));
        } else {
            let mut fields: BTreeMap<_, _> =
                fields.into_iter().map(|(k, v)| (k, json!(v))).collect();
            fields.insert("content", json!(content));
            value.insert(kind, json!(fields));
        }
        json!(value)
    }
    /// Resolve a named node kind to its id. Id 0 is also treated as
    /// not-found — presumably mirroring tree-sitter's use of 0 as the
    /// invalid kind; TODO confirm.
    pub fn id_for_node_kind(&self, kind: &str) -> Option<KindId> {
        let id = self.schema.id_for_node_kind(kind).unwrap_or(0);
        if id == 0 {
            None
        } else {
            Some(id)
        }
    }
    /// Resolve an unnamed (token) node kind to its id; same 0-as-missing
    /// convention as `id_for_node_kind`.
    fn id_for_unnamed_node_kind(&self, kind: &str) -> Option<KindId> {
        let id = self.schema.id_for_unnamed_node_kind(kind).unwrap_or(0);
        if id == 0 {
            None
        } else {
            Some(id)
        }
    }
}
/// A node in our AST
#[derive(PartialEq, Eq, Debug, Clone, Serialize)]
pub struct Node {
    /// This node's own index in the arena.
    id: Id,
    kind: KindId,
    /// Cached name for `kind`, resolved from the schema at creation time.
    kind_name: &'static str,
    /// Children grouped by field id; unnamed children live under `CHILD_FIELD`.
    pub(crate) fields: BTreeMap<FieldId, Vec<Id>>,
    pub(crate) content: NodeContent,
    /// For synthetic nodes, the source range of the original node they
    /// were desugared from. Used for location information in TRAP output.
    #[serde(skip)]
    source_range: Option<tree_sitter::Range>,
    is_named: bool,
    is_missing: bool,
    is_extra: bool,
    is_error: bool,
}
impl Node {
    /// This node's index in the owning `Ast`'s arena.
    pub fn id(&self) -> Id {
        self.id
    }
    /// The node's kind name (alias of [`Node::kind_name`]).
    pub fn kind(&self) -> &'static str {
        self.kind_name
    }
    pub fn kind_name(&self) -> &'static str {
        self.kind_name
    }
    pub fn is_named(&self) -> bool {
        self.is_named
    }
    pub fn is_missing(&self) -> bool {
        self.is_missing
    }
    pub fn is_extra(&self) -> bool {
        self.is_extra
    }
    pub fn is_error(&self) -> bool {
        self.is_error
    }
    /// Placeholder position for synthetic nodes that carry no range at all.
    fn fake_point(&self) -> tree_sitter::Point {
        tree_sitter::Point { row: 0, column: 0 }
    }
    /// The range this node covers: its own content range for original
    /// nodes, the inherited `source_range` for synthetic ones, if any.
    fn effective_range(&self) -> Option<tree_sitter::Range> {
        if let NodeContent::Range(range) = self.content {
            Some(range)
        } else {
            self.source_range
        }
    }
    pub fn start_position(&self) -> tree_sitter::Point {
        self.effective_range()
            .map_or_else(|| self.fake_point(), |r| r.start_point)
    }
    pub fn end_position(&self) -> tree_sitter::Point {
        self.effective_range()
            .map_or_else(|| self.fake_point(), |r| r.end_point)
    }
    pub fn start_byte(&self) -> usize {
        self.effective_range().map_or(0, |r| r.start_byte)
    }
    pub fn end_byte(&self) -> usize {
        self.effective_range().map_or(0, |r| r.end_byte)
    }
    pub fn byte_range(&self) -> std::ops::Range<usize> {
        self.start_byte()..self.end_byte()
    }
    /// The node's directly carried text (synthetic nodes); `None` when the
    /// content is a source range.
    pub fn opt_string_content(&self) -> Option<String> {
        match &self.content {
            NodeContent::Range(_) => None,
            NodeContent::String(s) => Some(s.to_string()),
            NodeContent::DynamicString(s) => Some(s.to_string()),
        }
    }
}
/// The contents of a node is either a range in the original source file,
/// or a new string if the node is synthesized.
#[derive(PartialEq, Eq, Debug, Clone, Serialize)]
pub enum NodeContent {
    /// A span of the original source; serialized via the `range` mirror type.
    Range(#[serde(with = "range::Range")] tree_sitter::Range),
    /// Synthesized content known at compile time (e.g. a fixed token).
    String(&'static str),
    /// Synthesized content computed at runtime (e.g. a fresh name).
    DynamicString(String),
}
impl From<&'static str> for NodeContent {
fn from(value: &'static str) -> Self {
NodeContent::String(value)
}
}
impl From<tree_sitter::Range> for NodeContent {
    /// Wrap a source range as node content.
    fn from(value: tree_sitter::Range) -> Self {
        Self::Range(value)
    }
}
/// The transform function for a rule: takes the AST, captured variables, a
/// fresh-name scope, and the source range of the matched node, and returns
/// the IDs of the replacement nodes.
// Boxed trait object because each rule carries a distinct closure type;
// `Send + Sync` so rule sets can be shared across threads.
pub type Transform = Box<
    dyn Fn(&mut Ast, Captures, &tree_builder::FreshScope, Option<tree_sitter::Range>) -> Vec<Id>
        + Send
        + Sync,
>;
/// A single desugaring rule: a query pattern to match against AST nodes,
/// plus the transform to run on every match.
pub struct Rule {
    query: QueryNode,
    transform: Transform,
}
impl Rule {
    /// Create a rule from a query pattern and a transform callback.
    pub fn new(query: QueryNode, transform: Transform) -> Self {
        Self { query, transform }
    }
    /// Try to apply this rule to `node`.
    ///
    /// Returns `Ok(None)` if the query does not match, `Ok(Some(ids))` with
    /// the replacement node ids if it does, and `Err` if matching itself
    /// failed (e.g. the query names an unknown kind or field).
    fn try_rule(
        &self,
        ast: &mut Ast,
        node: Id,
        fresh: &tree_builder::FreshScope,
    ) -> Result<Option<Vec<Id>>, String> {
        let mut captures = Captures::new();
        if self.query.do_match(ast, node, &mut captures)? {
            // Each successful match gets a fresh name scope so synthetic
            // names from different matches cannot collide.
            fresh.next_scope();
            // Prefer the matched node's own source range; for synthetic
            // nodes fall back to the range they were desugared from.
            let source_range = ast.get_node(node).and_then(|n| match n.content {
                NodeContent::Range(r) => Some(r),
                _ => n.source_range,
            });
            Ok(Some((self.transform)(ast, captures, fresh, source_range)))
        } else {
            Ok(None)
        }
    }
}
// Bound on how many times a node produced by one rule may itself be
// rewritten; exceeding it is treated as a non-terminating rule cycle.
const MAX_REWRITE_DEPTH: usize = 100;
/// Index of rules by their root query kind for fast lookup.
struct RuleIndex<'a> {
    /// Rules indexed by root node kind name.
    by_kind: BTreeMap<&'static str, Vec<&'a Rule>>,
    /// Rules with wildcard queries (Any) that apply to all nodes.
    wildcard: Vec<&'a Rule>,
}
impl<'a> RuleIndex<'a> {
    /// Build an index over `rules`, grouping them by their root query kind;
    /// rules whose query is a wildcard go into a separate bucket.
    fn new(rules: &'a [Rule]) -> Self {
        let mut index = Self {
            by_kind: BTreeMap::new(),
            wildcard: Vec::new(),
        };
        for rule in rules {
            if let Some(kind) = rule.query.root_kind() {
                index.by_kind.entry(kind).or_default().push(rule);
            } else {
                index.wildcard.push(rule);
            }
        }
        index
    }
    /// All rules that may match a node of `kind`: the kind-specific rules
    /// first, followed by the wildcard rules.
    fn rules_for_kind(&self, kind: &str) -> impl Iterator<Item = &&'a Rule> {
        let specific = self.by_kind.get(kind).map(|v| v.as_slice()).unwrap_or(&[]);
        specific.iter().chain(self.wildcard.iter())
    }
}
/// Apply `rules` to the subtree rooted at `id`, returning the replacement
/// node ids. Builds the kind index once, then recurses with depth 0.
fn apply_rules(
    rules: &[Rule],
    ast: &mut Ast,
    id: Id,
    fresh: &tree_builder::FreshScope,
) -> Result<Vec<Id>, String> {
    apply_rules_inner(&RuleIndex::new(rules), ast, id, fresh, 0)
}
/// Recursively apply the indexed rules to `id`, rewriting top-down.
///
/// When a rule fires, rules are re-applied to each replacement node with an
/// incremented `rewrite_depth` (bounded by `MAX_REWRITE_DEPTH`). Descending
/// into children does not increase the depth. Returns the ids of the nodes
/// that replace `id` — just `[id]` when nothing in the subtree changed.
fn apply_rules_inner(
    index: &RuleIndex,
    ast: &mut Ast,
    id: Id,
    fresh: &tree_builder::FreshScope,
    rewrite_depth: usize,
) -> Result<Vec<Id>, String> {
    if rewrite_depth > MAX_REWRITE_DEPTH {
        return Err(format!(
            "Desugaring exceeded maximum rewrite depth ({MAX_REWRITE_DEPTH}). \
            This likely indicates a non-terminating rule cycle."
        ));
    }
    // Only the first matching rule fires; its results are then rewritten.
    let node_kind = ast.get_node(id).map(|n| n.kind()).unwrap_or("");
    for rule in index.rules_for_kind(node_kind) {
        if let Some(result_node) = rule.try_rule(ast, id, fresh)? {
            let mut results = Vec::new();
            for node in result_node {
                results.extend(apply_rules_inner(
                    index,
                    ast,
                    node,
                    fresh,
                    rewrite_depth + 1,
                )?);
            }
            return Ok(results);
        }
    }
    // Collect fields before recursing (avoids borrowing ast immutably during mutation)
    let field_entries: Vec<(FieldId, Vec<Id>)> = ast.nodes[id]
        .fields
        .iter()
        .map(|(&fid, children)| (fid, children.clone()))
        .collect();
    // recursively descend into all the fields
    // Child traversal does not increment rewrite depth
    let mut changed = false;
    let mut new_fields = BTreeMap::new();
    for (field_id, children) in field_entries {
        let mut new_children = Vec::new();
        for child_id in children {
            let result = apply_rules_inner(index, ast, child_id, fresh, rewrite_depth)?;
            // A child changed if it was replaced by a different node, or by
            // zero or several nodes.
            if result.len() != 1 || result[0] != child_id {
                changed = true;
            }
            new_children.extend(result);
        }
        new_fields.insert(field_id, new_children);
    }
    // Unchanged subtree: reuse the existing node as-is.
    if !changed {
        return Ok(vec![id]);
    }
    // Changed subtree: allocate a new node (copy-on-write) with the
    // rewritten fields; the old node stays in the arena untouched.
    let mut node = ast.nodes[id].clone();
    node.fields = new_fields;
    node.id = ast.nodes.len();
    ast.nodes.push(node);
    Ok(vec![ast.nodes.len() - 1])
}
/// Configuration for a desugaring pass: a set of rules and an optional
/// output node-types schema (in YAML format).
///
/// When attached to a `LanguageSpec` (in the shared tree-sitter extractor),
/// enables yeast-based AST rewriting before TRAP extraction. The same YAML
/// is used both to validate TRAP output (via JSON conversion) and to
/// resolve output-only node kinds and fields at runtime.
pub struct DesugaringConfig {
    /// Rules to apply during desugaring.
    pub rules: Vec<Rule>,
    /// Output node-types in YAML format. If `None`, the input grammar's
    /// node types are used (i.e. the desugared AST has the same node types
    /// as the tree-sitter grammar).
    // `&'static str` so configs can embed the YAML via `include_str!`.
    pub output_node_types_yaml: Option<&'static str>,
}
impl DesugaringConfig {
    /// Create a config with the given rules and no output-schema override.
    pub fn new(rules: Vec<Rule>) -> Self {
        Self {
            rules,
            output_node_types_yaml: None,
        }
    }
    /// Builder-style setter for the output node-types YAML.
    pub fn with_output_node_types_yaml(mut self, yaml: &'static str) -> Self {
        self.output_node_types_yaml = Some(yaml);
        self
    }
    /// Build the yeast `Schema` for this config, given the input language.
    /// If `output_node_types_yaml` is `None`, returns the schema derived from
    /// the input grammar.
    pub fn build_schema(&self, language: &tree_sitter::Language) -> Result<schema::Schema, String> {
        if let Some(yaml) = self.output_node_types_yaml {
            node_types_yaml::schema_from_yaml_with_language(yaml, language)
        } else {
            Ok(schema::Schema::from_language(language))
        }
    }
}
/// Drives a desugaring pass: parses (or accepts) a tree-sitter tree,
/// applies the rules, and returns the rewritten [`Ast`].
pub struct Runner<'a> {
    // Input grammar used for parsing.
    language: tree_sitter::Language,
    // Output schema (may differ from the input grammar's schema).
    schema: schema::Schema,
    // Rules applied on each run; borrowed so rule sets can be shared.
    rules: &'a [Rule],
}
impl<'a> Runner<'a> {
    /// Create a runner using the input grammar's schema for output.
    pub fn new(language: tree_sitter::Language, rules: &'a [Rule]) -> Self {
        let schema = schema::Schema::from_language(&language);
        Self {
            language,
            schema,
            rules,
        }
    }
    /// Create a runner with separate input language and output schema.
    pub fn with_schema(
        language: tree_sitter::Language,
        schema: &schema::Schema,
        rules: &'a [Rule],
    ) -> Self {
        Self {
            language,
            schema: schema.clone(),
            rules,
        }
    }
    /// Create a runner from a [`DesugaringConfig`].
    ///
    /// # Errors
    /// Returns an error if the config's output node-types YAML fails to
    /// produce a schema.
    pub fn from_config(
        language: tree_sitter::Language,
        config: &'a DesugaringConfig,
    ) -> Result<Self, String> {
        let schema = config.build_schema(&language)?;
        Ok(Self {
            language,
            schema,
            rules: &config.rules,
        })
    }
    /// Desugar an already-parsed tree: apply all rules starting at the root
    /// and install the single resulting node as the new root.
    ///
    /// # Errors
    /// Returns an error if rule application fails, or if the rules replace
    /// the root with anything other than exactly one node.
    pub fn run_from_tree(&self, tree: &tree_sitter::Tree) -> Result<Ast, String> {
        let fresh = tree_builder::FreshScope::new();
        let mut ast = Ast::from_tree_with_schema(self.schema.clone(), tree, &self.language);
        let root = ast.get_root();
        let res = apply_rules(self.rules, &mut ast, root, &fresh)?;
        if res.len() != 1 {
            return Err(format!(
                "Expected exactly one result node, got {}",
                res.len()
            ));
        }
        ast.set_root(res[0]);
        Ok(ast)
    }
    /// Parse `input` with the configured language, then desugar it.
    /// Delegates to [`Runner::run_from_tree`] so the rule-application and
    /// root-validation logic lives in one place (previously duplicated).
    ///
    /// # Errors
    /// Returns an error if the language cannot be set, parsing fails, or
    /// desugaring fails (see [`Runner::run_from_tree`]).
    pub fn run(&self, input: &str) -> Result<Ast, String> {
        let mut parser = tree_sitter::Parser::new();
        parser
            .set_language(&self.language)
            .map_err(|e| format!("Failed to set language: {e}"))?;
        let tree = parser
            .parse(input, None)
            .ok_or_else(|| "Failed to parse input".to_string())?;
        self.run_from_tree(&tree)
    }
}

View File

@@ -0,0 +1,722 @@
/// Converts a YAML node-types file to the tree-sitter `node-types.json` format.
///
/// # YAML format
///
/// ```yaml
/// supertypes:
/// _expression:
/// - assignment
/// - binary
///
/// named:
/// assignment:
/// left: _lhs
/// right: _expression
/// identifier:
///
/// unnamed:
/// - "+"
/// - "end"
/// ```
///
/// See the crate-level docs for the full format specification.
use std::collections::{BTreeMap, BTreeSet};
use std::fmt::Write;
use serde::Deserialize;
use serde_json::json;
/// Top-level YAML structure.
#[derive(Deserialize, Default)]
struct YamlNodeTypes {
    // Supertype name -> its member types.
    #[serde(default)]
    supertypes: BTreeMap<String, Vec<TypeRef>>,
    // Named node kind -> its fields; `None` marks a leaf token with no fields.
    #[serde(default)]
    named: BTreeMap<String, Option<BTreeMap<String, TypeRefOrList>>>,
    // Unnamed (anonymous) token texts.
    #[serde(default)]
    unnamed: Vec<String>,
}
/// A reference to a node type. Can be:
/// - a plain string (resolved by looking up named vs unnamed)
/// - a map `{unnamed: "name"}` to force unnamed interpretation
#[derive(Deserialize, Debug, Clone)]
#[serde(untagged)]
enum TypeRef {
    Name(String),
    Explicit { unnamed: String },
}
/// A field value: either a single type ref or a list of them.
#[derive(Deserialize, Debug, Clone)]
#[serde(untagged)]
enum TypeRefOrList {
    Single(TypeRef),
    List(Vec<TypeRef>),
}
impl TypeRefOrList {
    /// Normalize to a list: a single ref becomes a one-element vector.
    fn into_vec(self) -> Vec<TypeRef> {
        match self {
            TypeRefOrList::Single(t) => vec![t],
            TypeRefOrList::List(v) => v,
        }
    }
}
/// Parsed field name: base name + multiplicity markers.
struct FieldSpec {
    name: Option<String>, // None for $children
    multiple: bool,
    required: bool,
}

/// Parse a raw YAML field key into its base name and multiplicity.
/// A trailing `?` means optional-single, `*` optional-multiple, `+`
/// required-multiple; a bare name is required-single. The special key
/// `$children` (with optional marker) has no field name.
fn parse_field_name(raw: &str) -> FieldSpec {
    let (multiple, required) = match raw.chars().last() {
        Some('?') => (false, false),
        Some('*') => (true, false),
        Some('+') => (true, true),
        // No multiplicity marker: required, single.
        _ => (false, true),
    };
    let name = match raw {
        "$children" | "$children?" | "$children*" | "$children+" => None,
        _ => Some(raw.trim_end_matches(['?', '*', '+']).to_string()),
    };
    FieldSpec {
        name,
        multiple,
        required,
    }
}
/// Resolve a TypeRef to a (type, named) pair, given the sets of known named
/// and unnamed types.
///
/// A plain name resolves to unnamed only when it is known *exclusively* as
/// an unnamed token; ambiguous or unknown names default to named.
fn resolve_type_ref(
    type_ref: &TypeRef,
    named_types: &BTreeSet<String>,
    unnamed_types: &BTreeSet<String>,
) -> serde_json::Value {
    let (name, named) = match type_ref {
        // `{unnamed: "x"}` forces the unnamed interpretation.
        TypeRef::Explicit { unnamed } => (unnamed, false),
        TypeRef::Name(name) => {
            let unnamed_only = unnamed_types.contains(name) && !named_types.contains(name);
            (name, !unnamed_only)
        }
    };
    json!({"type": name, "named": named})
}
/// Convert YAML string to node-types JSON string.
///
/// Output order: supertypes first, then named nodes, then unnamed tokens.
/// Named entries always carry a `fields` object; a `children` entry is
/// emitted only when the YAML declared `$children`.
///
/// # Errors
/// Returns an error if the YAML cannot be parsed or the result cannot be
/// serialized to JSON.
pub fn convert(yaml_input: &str) -> Result<String, String> {
    let yaml: YamlNodeTypes =
        serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?;
    // Build the sets of known named and unnamed types for resolution.
    let mut named_types = BTreeSet::new();
    for name in yaml.supertypes.keys() {
        named_types.insert(name.clone());
    }
    for name in yaml.named.keys() {
        named_types.insert(name.clone());
    }
    let unnamed_types: BTreeSet<String> = yaml.unnamed.iter().cloned().collect();
    let mut output = Vec::new();
    // 1. Supertypes
    for (name, members) in &yaml.supertypes {
        let subtypes: Vec<_> = members
            .iter()
            .map(|m| resolve_type_ref(m, &named_types, &unnamed_types))
            .collect();
        output.push(json!({
            "type": name,
            "named": true,
            "subtypes": subtypes,
        }));
    }
    // 2. Named nodes
    for (name, fields_opt) in &yaml.named {
        let fields_map = match fields_opt {
            None => {
                // Leaf token: no fields, no children, no subtypes
                output.push(json!({
                    "type": name,
                    "named": true,
                    "fields": {},
                }));
                continue;
            }
            // An explicitly empty field map is emitted the same as a leaf.
            Some(m) if m.is_empty() => {
                output.push(json!({
                    "type": name,
                    "named": true,
                    "fields": {},
                }));
                continue;
            }
            Some(m) => m,
        };
        let mut json_fields = serde_json::Map::new();
        let mut json_children: Option<serde_json::Value> = None;
        for (raw_field_name, type_refs) in fields_map {
            // Field key carries the multiplicity suffix (`?`, `*`, `+`).
            let spec = parse_field_name(raw_field_name);
            let types: Vec<_> = type_refs
                .clone()
                .into_vec()
                .iter()
                .map(|t| resolve_type_ref(t, &named_types, &unnamed_types))
                .collect();
            // Cloning to make the borrow checker happy
            let field_info = json!({
                "multiple": spec.multiple,
                "required": spec.required,
                "types": types,
            });
            if spec.name.is_none() {
                // $children
                json_children = Some(field_info);
            } else {
                json_fields.insert(spec.name.unwrap(), field_info);
            }
        }
        let mut entry = json!({
            "type": name,
            "named": true,
            "fields": json_fields,
        });
        // `children` is only present in the JSON when `$children` was used.
        if let Some(children) = json_children {
            entry
                .as_object_mut()
                .unwrap()
                .insert("children".to_string(), children);
        }
        output.push(entry);
    }
    // 3. Unnamed tokens
    for name in &yaml.unnamed {
        output.push(json!({
            "type": name,
            "named": false,
        }));
    }
    serde_json::to_string_pretty(&output).map_err(|e| format!("Failed to serialize JSON: {e}"))
}
/// Register all node kinds and field names declared in `yaml` into `schema`.
/// Shared helper for [`schema_from_yaml`] and
/// [`schema_from_yaml_with_language`] (the logic was previously duplicated).
fn register_yaml_declarations(yaml: &YamlNodeTypes, schema: &mut crate::schema::Schema) {
    // Supertypes are node kinds in their own right.
    for name in yaml.supertypes.keys() {
        schema.register_kind(name);
    }
    // Named node kinds plus every field they declare.
    for (name, fields_opt) in &yaml.named {
        schema.register_kind(name);
        if let Some(fields) = fields_opt {
            for raw_field_name in fields.keys() {
                let spec = parse_field_name(raw_field_name);
                // `$children` has no field name and registers nothing.
                if let Some(field_name) = &spec.name {
                    schema.register_field(field_name);
                }
            }
        }
    }
    // Unnamed tokens use the separate unnamed-kind namespace.
    for name in &yaml.unnamed {
        schema.register_unnamed_kind(name);
    }
}
/// Build a Schema from a YAML node-types string.
/// Registers all node kinds and field names found in the YAML.
///
/// # Errors
/// Returns an error if the YAML cannot be parsed.
pub fn schema_from_yaml(yaml_input: &str) -> Result<crate::schema::Schema, String> {
    let yaml: YamlNodeTypes =
        serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?;
    let mut schema = crate::schema::Schema::new();
    register_yaml_declarations(&yaml, &mut schema);
    Ok(schema)
}
/// Build a Schema from a YAML string, extending a tree-sitter Language.
/// The Schema inherits all field/kind names from the Language, plus any
/// additional ones defined in the YAML.
///
/// # Errors
/// Returns an error if the YAML cannot be parsed.
pub fn schema_from_yaml_with_language(
    yaml_input: &str,
    language: &tree_sitter::Language,
) -> Result<crate::schema::Schema, String> {
    let yaml: YamlNodeTypes =
        serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?;
    let mut schema = crate::schema::Schema::from_language(language);
    register_yaml_declarations(&yaml, &mut schema);
    Ok(schema)
}
// ---------------------------------------------------------------------------
// JSON → YAML conversion
// ---------------------------------------------------------------------------
/// JSON node-types structures (mirrors tree-sitter's format).
#[derive(Deserialize)]
struct JsonNodeInfo {
    // Node kind text; "type" in the JSON.
    #[serde(rename = "type")]
    kind: String,
    named: bool,
    // Field name -> field info; absent means no fields.
    #[serde(default)]
    fields: BTreeMap<String, JsonFieldInfo>,
    // Positional (non-field) children, if any.
    children: Option<JsonFieldInfo>,
    // Non-empty only for supertype entries.
    #[serde(default)]
    subtypes: Vec<JsonNodeType>,
}
/// A (kind, named) pair as it appears in `types`/`subtypes` lists.
#[derive(Deserialize)]
struct JsonNodeType {
    #[serde(rename = "type")]
    kind: String,
    named: bool,
}
/// Multiplicity and allowed types of a field or of the `children` slot.
#[derive(Deserialize)]
struct JsonFieldInfo {
    multiple: bool,
    required: bool,
    types: Vec<JsonNodeType>,
}
/// Convert a tree-sitter node-types.json string to the YAML format.
///
/// Emits three sections in order — `supertypes:`, `named:`, `unnamed:` —
/// omitting any section that would be empty. Field multiplicity is encoded
/// as a key suffix (`?`, `*`, `+`) and positional children become the
/// special `$children` key.
///
/// # Errors
/// Returns an error if the JSON cannot be parsed.
pub fn convert_from_json(json_input: &str) -> Result<String, String> {
    let nodes: Vec<JsonNodeInfo> =
        serde_json::from_str(json_input).map_err(|e| format!("Failed to parse JSON: {e}"))?;
    // Collect all named and unnamed types for disambiguation decisions.
    let mut all_named: BTreeSet<String> = BTreeSet::new();
    let mut all_unnamed: BTreeSet<String> = BTreeSet::new();
    for node in &nodes {
        if node.named {
            all_named.insert(node.kind.clone());
        } else {
            all_unnamed.insert(node.kind.clone());
        }
    }
    // Bucket every node into one of the three output sections.
    let mut supertypes: BTreeMap<String, Vec<JsonNodeType>> = BTreeMap::new();
    let mut named: BTreeMap<String, Option<BTreeMap<String, JsonFieldInfo>>> = BTreeMap::new();
    let mut unnamed: Vec<String> = Vec::new();
    for node in nodes {
        if !node.named {
            unnamed.push(node.kind);
            continue;
        }
        if !node.subtypes.is_empty() {
            supertypes.insert(node.kind, node.subtypes);
            continue;
        }
        if node.fields.is_empty() && node.children.is_none() {
            // Leaf token
            named.insert(node.kind, None);
        } else {
            let mut fields = BTreeMap::new();
            for (name, info) in node.fields {
                fields.insert(name, info);
            }
            // Positional children are represented by the `$children` key.
            if let Some(children) = node.children {
                fields.insert("$children".to_string(), children);
            }
            named.insert(node.kind, Some(fields));
        }
    }
    // Now emit YAML
    let mut out = String::new();
    // Supertypes
    if !supertypes.is_empty() {
        writeln!(out, "supertypes:").unwrap();
        for (name, members) in &supertypes {
            writeln!(out, "  {name}:").unwrap();
            for member in members {
                let ref_str = format_type_ref(&member.kind, member.named, &all_named, &all_unnamed);
                writeln!(out, "    - {ref_str}").unwrap();
            }
        }
        writeln!(out).unwrap();
    }
    // Named
    if !named.is_empty() {
        writeln!(out, "named:").unwrap();
        for (name, fields_opt) in &named {
            match fields_opt {
                None => {
                    // Leaf token: key with no value.
                    writeln!(out, "  {name}:").unwrap();
                }
                Some(fields) => {
                    writeln!(out, "  {name}:").unwrap();
                    for (field_name, info) in fields {
                        // Multiplicity goes back into the key suffix.
                        let suffix = field_suffix(info.multiple, info.required);
                        let yaml_name = if field_name == "$children" {
                            format!("$children{suffix}")
                        } else {
                            format!("{field_name}{suffix}")
                        };
                        let type_refs: Vec<String> = info
                            .types
                            .iter()
                            .map(|t| format_type_ref(&t.kind, t.named, &all_named, &all_unnamed))
                            .collect();
                        // Single type: scalar value; several: flow list.
                        if type_refs.len() == 1 {
                            writeln!(out, "    {yaml_name}: {}", type_refs[0]).unwrap();
                        } else {
                            let list = type_refs
                                .iter()
                                .map(|s| s.as_str())
                                .collect::<Vec<_>>()
                                .join(", ");
                            writeln!(out, "    {yaml_name}: [{list}]").unwrap();
                        }
                    }
                }
            }
        }
        writeln!(out).unwrap();
    }
    // Unnamed
    if !unnamed.is_empty() {
        writeln!(out, "unnamed:").unwrap();
        for name in &unnamed {
            writeln!(out, "  - {}", force_quote(name)).unwrap();
        }
    }
    Ok(out)
}
/// The YAML key suffix encoding a field's multiplicity:
/// required-single has none, `?` optional-single, `+` required-multiple,
/// `*` optional-multiple.
fn field_suffix(multiple: bool, required: bool) -> &'static str {
    if multiple {
        if required { "+" } else { "*" }
    } else if required {
        ""
    } else {
        "?"
    }
}
/// Format a type reference for YAML output. Uses the disambiguation rule:
/// plain string if unambiguous, `{unnamed: name}` if the name exists as both
/// named and unnamed and we need the unnamed interpretation.
fn format_type_ref(
    kind: &str,
    named: bool,
    all_named: &BTreeSet<String>,
    _all_unnamed: &BTreeSet<String>,
) -> String {
    if named {
        return quote_yaml(kind);
    }
    // Unnamed reference: always force-quoted; additionally wrapped in
    // `{unnamed: …}` when the same name also exists as a named kind and a
    // plain string would resolve to it.
    if all_named.contains(kind) {
        format!("{{unnamed: {}}}", force_quote(kind))
    } else {
        force_quote(kind)
    }
}
/// Always wrap in double quotes. Used for unnamed node references so they're
/// visually distinct from named ones — YAML treats both forms as equivalent strings.
fn force_quote(s: &str) -> String {
    let mut quoted = String::with_capacity(s.len() + 2);
    quoted.push('"');
    for c in s.chars() {
        // Backslashes and double quotes need escaping inside "…".
        if c == '\\' || c == '"' {
            quoted.push('\\');
        }
        quoted.push(c);
    }
    quoted.push('"');
    quoted
}
/// Quote a YAML string value if it contains special characters or could be
/// misinterpreted as a non-string scalar (boolean, null, or number).
fn quote_yaml(s: &str) -> String {
    // YAML 1.1 resolves these words to booleans/null in *any* capitalisation
    // ("True", "OFF", …), so match case-insensitively. The previous
    // case-sensitive check let e.g. "True" through unquoted, which a YAML
    // parser would read back as a boolean rather than a string.
    const RESERVED: [&str; 7] = ["true", "false", "null", "yes", "no", "on", "off"];
    let needs_quoting = s.is_empty()
        || s.contains(|c: char| {
            matches!(
                c,
                ':' | '{'
                    | '}'
                    | '['
                    | ']'
                    | ','
                    | '&'
                    | '*'
                    | '#'
                    | '?'
                    | '|'
                    | '-'
                    | '<'
                    | '>'
                    | '='
                    | '!'
                    | '%'
                    | '@'
                    | '`'
                    | '"'
                    | '\''
            )
        })
        || s.starts_with(' ')
        || s.ends_with(' ')
        // "~" is the YAML shorthand for null.
        || s == "~"
        || RESERVED.iter().any(|r| s.eq_ignore_ascii_case(r))
        // Anything that parses as a float (incl. "nan"/"inf") is numeric-ish.
        || s.parse::<f64>().is_ok();
    if needs_quoting {
        format!("\"{}\"", s.replace('\\', "\\\\").replace('"', "\\\""))
    } else {
        s.to_string()
    }
}
// Unit tests for YAML↔JSON node-types conversion.
#[cfg(test)]
mod tests {
    use super::*;
    // YAML → JSON: supertypes, fields, $children, leaves, unnamed tokens.
    #[test]
    fn test_basic_conversion() {
        let yaml = r#"
supertypes:
  _expression:
    - assignment
    - binary

named:
  assignment:
    left: _lhs
    right: _expression
  binary:
    left: [_expression, _simple_numeric]
    operator: ["!=", "+"]
    right: _expression
  argument_list:
    $children*: [_expression, block_argument]
  identifier:

unnamed:
  - "!="
  - "+"
  - "end"
"#;
        let json_str = convert(yaml).unwrap();
        let result: Vec<serde_json::Value> = serde_json::from_str(&json_str).unwrap();
        // Check supertype
        let expr = &result[0];
        assert_eq!(expr["type"], "_expression");
        assert_eq!(expr["named"], true);
        assert_eq!(expr["subtypes"].as_array().unwrap().len(), 2);
        // Check assignment
        let assign = result.iter().find(|n| n["type"] == "assignment").unwrap();
        assert_eq!(assign["fields"]["left"]["required"], true);
        assert_eq!(assign["fields"]["left"]["multiple"], false);
        assert_eq!(assign["fields"]["left"]["types"][0]["type"], "_lhs");
        assert_eq!(assign["fields"]["left"]["types"][0]["named"], true);
        // Check binary.operator — "!=" and "+" should resolve to unnamed
        let binary = result.iter().find(|n| n["type"] == "binary").unwrap();
        let op_types = binary["fields"]["operator"]["types"].as_array().unwrap();
        assert_eq!(op_types[0]["type"], "!=");
        assert_eq!(op_types[0]["named"], false);
        assert_eq!(op_types[1]["type"], "+");
        assert_eq!(op_types[1]["named"], false);
        // Check argument_list has children, not a field
        let arg_list = result
            .iter()
            .find(|n| n["type"] == "argument_list")
            .unwrap();
        assert!(arg_list.get("children").is_some());
        assert_eq!(arg_list["children"]["multiple"], true);
        assert_eq!(arg_list["children"]["required"], false);
        // Check identifier is a leaf
        let ident = result.iter().find(|n| n["type"] == "identifier").unwrap();
        assert_eq!(ident["fields"].as_object().unwrap().len(), 0);
        // Check unnamed tokens
        let end = result.iter().find(|n| n["type"] == "end").unwrap();
        assert_eq!(end["named"], false);
    }
    // `{unnamed: …}` forces the unnamed interpretation of a type reference.
    #[test]
    fn test_explicit_unnamed_disambiguation() {
        let yaml = r#"
named:
  foo:
    field: [{unnamed: bar}]
unnamed:
  - bar
"#;
        let json_str = convert(yaml).unwrap();
        let result: Vec<serde_json::Value> = serde_json::from_str(&json_str).unwrap();
        let foo = result.iter().find(|n| n["type"] == "foo").unwrap();
        assert_eq!(foo["fields"]["field"]["types"][0]["named"], false);
    }
    // Key suffixes (`?`, `*`, `+`) map to the multiple/required flags.
    #[test]
    fn test_field_suffixes() {
        let yaml = r#"
named:
  test_node:
    required_single: foo
    optional_single?: foo
    required_multiple+: foo
    optional_multiple*: foo
"#;
        let json_str = convert(yaml).unwrap();
        let result: Vec<serde_json::Value> = serde_json::from_str(&json_str).unwrap();
        let node = result.iter().find(|n| n["type"] == "test_node").unwrap();
        let fields = node["fields"].as_object().unwrap();
        assert_eq!(fields["required_single"]["required"], true);
        assert_eq!(fields["required_single"]["multiple"], false);
        assert_eq!(fields["optional_single"]["required"], false);
        assert_eq!(fields["optional_single"]["multiple"], false);
        assert_eq!(fields["required_multiple"]["required"], true);
        assert_eq!(fields["required_multiple"]["multiple"], true);
        assert_eq!(fields["optional_multiple"]["required"], false);
        assert_eq!(fields["optional_multiple"]["multiple"], true);
    }
    // JSON → YAML emits all three sections with expected keys.
    #[test]
    fn test_json_to_yaml() {
        let json = r#"[
            {"type": "_expression", "named": true, "subtypes": [
                {"type": "assignment", "named": true},
                {"type": "identifier", "named": true}
            ]},
            {"type": "assignment", "named": true, "fields": {
                "left": {"multiple": false, "required": true, "types": [
                    {"type": "_expression", "named": true}
                ]},
                "right": {"multiple": false, "required": false, "types": [
                    {"type": "_expression", "named": true}
                ]}
            }, "children": {
                "multiple": true, "required": false, "types": [
                    {"type": "identifier", "named": true}
                ]
            }},
            {"type": "identifier", "named": true, "fields": {}},
            {"type": "=", "named": false},
            {"type": "end", "named": false}
        ]"#;
        let yaml = convert_from_json(json).unwrap();
        // Verify key structures are present
        assert!(yaml.contains("supertypes:"));
        assert!(yaml.contains("_expression:"));
        assert!(yaml.contains("named:"));
        assert!(yaml.contains("assignment:"));
        assert!(yaml.contains("left:"));
        assert!(yaml.contains("right?:"));
        assert!(yaml.contains("$children*:"));
        assert!(yaml.contains("identifier:"));
        assert!(yaml.contains("unnamed:"));
        assert!(yaml.contains("\"=\""));
        assert!(yaml.contains("end"));
    }
    // YAML → JSON → YAML → JSON must be a fixed point at the JSON level.
    #[test]
    fn test_round_trip() {
        let yaml_input = r#"
supertypes:
  _expression:
    - assignment
    - identifier

named:
  assignment:
    left: _expression
    right?: _expression
    $children*: identifier
  identifier:

unnamed:
  - "="
  - end
"#;
        // YAML → JSON → YAML
        let json = convert(yaml_input).unwrap();
        let yaml_output = convert_from_json(&json).unwrap();
        // YAML → JSON again (should be identical)
        let json2 = convert(&yaml_output).unwrap();
        let v1: serde_json::Value = serde_json::from_str(&json).unwrap();
        let v2: serde_json::Value = serde_json::from_str(&json2).unwrap();
        assert_eq!(v1, v2);
    }
}

228
shared/yeast/src/query.rs Normal file
View File

@@ -0,0 +1,228 @@
use crate::{captures::Captures, Ast, Id};
/// A query pattern matched against AST nodes.
#[derive(Debug, Clone)]
pub enum QueryNode {
    /// Wildcard: matches any (named) node.
    Any(),
    /// A named node of a specific kind, with per-field child patterns.
    Node {
        kind: &'static str,
        // (field name, patterns for that field's children)
        children: Vec<(&'static str, Vec<QueryListElem>)>,
    },
    /// An unnamed token of a specific kind.
    UnnamedNode {
        kind: &'static str,
    },
    /// Wraps a sub-pattern and records the matched node under `capture`.
    Capture {
        capture: &'static str,
        node: Box<QueryNode>,
    },
}
impl QueryNode {
    /// Returns the root node kind this query matches, if it's specific.
    /// Returns None for wildcards (Any) and captures wrapping wildcards.
    pub fn root_kind(&self) -> Option<&'static str> {
        match self {
            QueryNode::Any() => None,
            QueryNode::Node { kind, .. } | QueryNode::UnnamedNode { kind } => Some(kind),
            QueryNode::Capture { node, .. } => node.root_kind(),
        }
    }
}
/// One element of a child-list pattern: either a repetition group or a
/// single node pattern.
#[derive(Debug, Clone)]
pub enum QueryListElem {
    /// A repeated group of patterns with a repetition operator.
    Repeated {
        children: Vec<QueryListElem>,
        rep: Rep,
    },
    /// A single node pattern consuming one child.
    SingleNode(QueryNode),
}
/// Repetition operators, mirroring tree-sitter's `*`, `+`, and `?`.
#[derive(Debug, PartialEq, Eq, Copy, Clone)]
pub enum Rep {
    ZeroOrMore,
    OneOrMore,
    ZeroOrOne,
}
impl QueryNode {
    /// Returns true if this query only matches named nodes (not unnamed tokens).
    /// Used to skip unnamed children in positional matching, matching tree-sitter
    /// semantics where `(_)` only matches named nodes.
    fn matches_named_only(&self) -> bool {
        match self {
            QueryNode::Any() => true,
            QueryNode::Node { .. } => true,
            QueryNode::UnnamedNode { .. } => false,
            QueryNode::Capture { node, .. } => node.matches_named_only(),
        }
    }
    /// Match this query against `node`, recording captures in `matches`.
    ///
    /// Returns `Ok(true)`/`Ok(false)` for match/no-match, and `Err` when the
    /// query references a kind or field the language doesn't know.
    /// NOTE(review): captures recorded by partially-successful sub-matches
    /// are not rolled back here; backtracking is handled by the caller in
    /// `QueryListElem::do_match` — confirm this is intended for top level.
    pub fn do_match(&self, ast: &Ast, node: Id, matches: &mut Captures) -> Result<bool, String> {
        match self {
            QueryNode::Any() => Ok(true),
            QueryNode::Node { kind, children } => {
                let node = ast.get_node(node).unwrap();
                let target_kind = ast
                    .id_for_node_kind(kind)
                    .ok_or_else(|| format!("Node kind {kind} not found in language"))?;
                if node.kind != target_kind {
                    return Ok(false);
                }
                // Every listed field must match; fields the query does not
                // mention are unconstrained.
                for (field, field_children) in children {
                    let field_id = ast
                        .field_id_for_name(field)
                        .ok_or_else(|| format!("Field {field} not found in language"))?;
                    // A missing field behaves like an empty child list.
                    let empty = Vec::new();
                    let mut child_iter =
                        node.fields.get(&field_id).unwrap_or(&empty).iter().cloned();
                    if !match_children(field_children.iter(), ast, &mut child_iter, matches)? {
                        return Ok(false);
                    }
                }
                Ok(true)
            }
            QueryNode::UnnamedNode { kind } => {
                let node = ast.get_node(node).unwrap();
                let target_kind = ast
                    .id_for_unnamed_node_kind(kind)
                    .ok_or_else(|| format!("unnamed Node kind {kind} not found in language"))?;
                Ok(node.kind == target_kind)
            }
            QueryNode::Capture {
                capture,
                node: sub_query,
            } => {
                // Record the capture only when the wrapped pattern matched.
                let matched = sub_query.do_match(ast, node, matches)?;
                if matched {
                    matches.insert(capture, node);
                }
                Ok(matched)
            }
        }
    }
}
/// Match a sequence of child patterns against children drawn from
/// `remaining_children`, left to right. Every matcher must succeed;
/// captures are accumulated into `matches`.
fn match_children<'a>(
    child_matchers: impl Iterator<Item = &'a QueryListElem>,
    ast: &Ast,
    remaining_children: &mut (impl Iterator<Item = Id> + Clone),
    matches: &mut Captures,
) -> Result<bool, String> {
    for matcher in child_matchers {
        let matched = matcher.do_match(ast, remaining_children, matches)?;
        if !matched {
            // First failure aborts the whole sequence.
            return Ok(false);
        }
    }
    Ok(true)
}
impl QueryListElem {
    /// Match this list element against children drawn from
    /// `remaining_children`. On failure of a repetition iteration, both the
    /// iterator and the captures are restored to their pre-iteration state.
    fn do_match(
        &self,
        ast: &Ast,
        remaining_children: &mut (impl Iterator<Item = Id> + Clone),
        matches: &mut Captures,
    ) -> Result<bool, String> {
        match self {
            QueryListElem::Repeated { children, rep } => {
                if children.is_empty() {
                    // Empty repetition always succeeds without consuming
                    return Ok(*rep != Rep::OneOrMore);
                }
                let mut iters = 0;
                loop {
                    // Snapshot captures and iterator so a failed iteration
                    // can be rolled back (greedy matching with backtracking
                    // of the last, failed attempt only).
                    let matches_initial = matches.clone();
                    let start = remaining_children.clone();
                    let start_next = start.clone().next();
                    if !match_children(children.iter(), ast, remaining_children, matches)? {
                        *remaining_children = start;
                        *matches = matches_initial;
                        break;
                    }
                    // Guard against zero-width matches: if the iterator
                    // didn't advance, break to avoid infinite looping.
                    // NOTE(review): this break happens before `iters` is
                    // incremented, so a successful zero-width match under
                    // OneOrMore still reports failure — confirm intended.
                    let current_next = remaining_children.clone().next();
                    if start_next == current_next {
                        break;
                    }
                    iters += 1;
                    if *rep == Rep::ZeroOrOne {
                        break;
                    }
                }
                if *rep == Rep::OneOrMore && iters == 0 {
                    // We didn't match any children but we were supposed to
                    Ok(false)
                } else {
                    Ok(true)
                }
            }
            QueryListElem::SingleNode(sub_query) => {
                if sub_query.matches_named_only() {
                    // Skip unnamed children, matching tree-sitter semantics
                    // where (_) only matches named nodes.
                    loop {
                        match remaining_children.next() {
                            Some(child) => {
                                let node = ast.get_node(child).unwrap();
                                if node.is_named() {
                                    return sub_query.do_match(ast, child, matches);
                                }
                                // Skip unnamed child, continue to next
                            }
                            None => return Ok(false),
                        }
                    }
                } else if let Some(child) = remaining_children.next() {
                    sub_query.do_match(ast, child, matches)
                } else {
                    // Pattern expects a child but none remain.
                    Ok(false)
                }
            }
        }
    }
}
// Smoke tests: the `yeast::query!` macro expands to the expected
// `QueryNode` shapes (wildcards, kinds, fields, repetition, captures).
#[cfg(test)]
mod tests {
    use crate::query::*;
    #[test]
    fn it_works() {
        let query1: QueryNode = yeast::query!((_));
        println!("{query1:?}");
        let query2 = yeast::query!((foo));
        println!("{query2:?}");
        let query3 = yeast::query!((foo child: (_)));
        println!("{query3:?}");
        let query4 = yeast::query!((foo (_)*));
        println!("{query4:?}");
        let query5: QueryNode = yeast::query!((foo (_)*));
        println!("{query5:?}");
        let query6: QueryNode = yeast::query!((_) @bar);
        println!("{query6:?}");
        let query7: QueryNode = yeast::query!((foo child: (_) @bar));
        println!("{query7:?}");
        // Nested fields and multiple captures in one pattern.
        let query8: QueryNode = yeast::query!(
            (assignment
                left: (element_reference
                    object: (_) @obj
                    (_) @index
                )
                right: (_) @rhs
            )
        );
        println!("{query8:?}");
        let query9 = yeast::query!(
            (program
                child: (assignment
                    left: (_) @left
                    right: (_) @right
                )
            )
        );
        println!("{query9:?}");
    }
}

21
shared/yeast/src/range.rs Normal file
View File

@@ -0,0 +1,21 @@
//! (de)-serialize helpers for tree_sitter::Range
use serde::{Deserialize, Serialize};
/// Serde mirror of `tree_sitter::Point` (row/column pair). Declared with
/// `#[serde(remote = ...)]` because the upstream type does not implement
/// serde traits itself.
#[derive(Serialize, Deserialize)]
#[serde(remote = "tree_sitter::Point")]
pub struct Point {
    pub row: usize,
    pub column: usize,
}
/// Serde mirror of `tree_sitter::Range`; used via `#[serde(with = ...)]`
/// on fields holding a range (e.g. `NodeContent::Range`).
#[derive(Serialize, Deserialize)]
#[serde(remote = "tree_sitter::Range")]
pub struct Range {
    pub start_byte: usize,
    pub end_byte: usize,
    #[serde(with = "Point")]
    pub start_point: tree_sitter::Point,
    #[serde(with = "Point")]
    pub end_point: tree_sitter::Point,
}

167
shared/yeast/src/schema.rs Normal file
View File

@@ -0,0 +1,167 @@
use std::collections::BTreeMap;
use crate::{FieldId, KindId, CHILD_FIELD};
/// A schema defining node kinds and field names for the output AST.
/// Built from a node-types.yml file, independent of any tree-sitter grammar.
///
/// # Memory management
///
/// `register_field`/`register_kind`/`register_unnamed_kind` use `Box::leak`
/// to obtain `&'static str` names. This is intentional: the `&'static str`
/// names appear pervasively in `Node`, `AstCursor`, query patterns, and the
/// extractor's TRAP output, where adding a lifetime would propagate widely.
///
/// The leak is bounded by the number of distinct kind/field names registered.
/// Schemas are expected to be constructed once per process (e.g. at extractor
/// startup) and reused. Repeated construction in long-running processes will
/// leak memory unboundedly and should be avoided.
#[derive(Clone)]
pub struct Schema {
    // Field name -> ID. "child" is not stored here; it maps to the
    // reserved CHILD_FIELD sentinel (see field_id_for_name).
    field_ids: BTreeMap<String, FieldId>,
    // Field ID -> leaked &'static name (reverse of `field_ids`).
    field_names: BTreeMap<FieldId, &'static str>,
    // Next ID handed out by `register_field`; starts at 1 (0 is reserved).
    next_field_id: FieldId,
    // Named node kind -> ID.
    kind_ids: BTreeMap<String, KindId>,
    // Unnamed token kind (e.g. "=", "end") -> ID, tracked separately
    // from named kinds.
    unnamed_kind_ids: BTreeMap<String, KindId>,
    // Kind ID -> leaked &'static name, covering both named and unnamed kinds.
    kind_names: BTreeMap<KindId, &'static str>,
    // Next ID handed out by the register_*_kind methods; starts at 1 (0 is reserved).
    next_kind_id: KindId,
}
impl Default for Schema {
fn default() -> Self {
Self::new()
}
}
impl Schema {
    /// Create an empty schema with no registered fields or kinds.
    pub fn new() -> Self {
        Self {
            field_ids: BTreeMap::new(),
            field_names: BTreeMap::new(),
            next_field_id: 1, // 0 is reserved
            kind_ids: BTreeMap::new(),
            unnamed_kind_ids: BTreeMap::new(),
            kind_names: BTreeMap::new(),
            next_kind_id: 1, // 0 is reserved
        }
    }
    /// Create a schema from a tree-sitter language, importing all its
    /// known field and kind names.
    pub fn from_language(language: &tree_sitter::Language) -> Self {
        let mut schema = Self::new();
        // Import all field names, preserving tree-sitter's IDs
        // (tree-sitter field IDs are 1-based).
        for id in 1..=language.field_count() as u16 {
            if let Some(name) = language.field_name_for_id(id) {
                schema.field_ids.insert(name.to_string(), id);
                schema.field_names.insert(id, name);
                // Keep the allocator ahead of every imported ID so later
                // register_field calls cannot collide with imported ones.
                if id >= schema.next_field_id {
                    schema.next_field_id = id + 1;
                }
            }
        }
        // Import all node kind names, preserving tree-sitter's IDs.
        // Track named and unnamed variants separately.
        // For named kinds, use the canonical ID from id_for_node_kind(name, true)
        // since some languages have multiple IDs for the same named kind.
        for id in 0..language.node_kind_count() as u16 {
            if let Some(name) = language.node_kind_for_id(id) {
                if !name.is_empty() {
                    let is_named = language.node_kind_is_named(id);
                    if is_named {
                        let canonical_id = language.id_for_node_kind(name, true);
                        // canonical_id == 0 means the lookup failed; skip.
                        if canonical_id != 0 && !schema.kind_ids.contains_key(name) {
                            schema.kind_ids.insert(name.to_string(), canonical_id);
                            schema.kind_names.insert(canonical_id, name);
                        }
                    } else {
                        // For unnamed kinds, only insert if we don't already have one
                        // (some languages have multiple unnamed IDs for the same text)
                        schema
                            .unnamed_kind_ids
                            .entry(name.to_string())
                            .or_insert(id);
                    }
                    // Always track the name for any ID we encounter
                    schema.kind_names.entry(id).or_insert(name);
                    if id >= schema.next_kind_id {
                        schema.next_kind_id = id + 1;
                    }
                }
            }
        }
        schema
    }
    /// Register a field name, returning its ID.
    /// If already registered, returns the existing ID.
    pub fn register_field(&mut self, name: &str) -> FieldId {
        // "child" always maps to the reserved sentinel, never a fresh ID.
        if name == "child" {
            return CHILD_FIELD;
        }
        if let Some(&id) = self.field_ids.get(name) {
            return id;
        }
        let id = self.next_field_id;
        // Fresh field IDs must stay strictly below the CHILD_FIELD sentinel.
        assert!(id < CHILD_FIELD, "too many fields");
        self.next_field_id += 1;
        // Intentional leak to obtain &'static str; see the type-level docs.
        let leaked: &'static str = Box::leak(name.to_string().into_boxed_str());
        self.field_ids.insert(name.to_string(), id);
        self.field_names.insert(id, leaked);
        id
    }
    /// Register a named node kind name, returning its ID.
    /// If already registered, returns the existing ID.
    pub fn register_kind(&mut self, name: &str) -> KindId {
        if let Some(&id) = self.kind_ids.get(name) {
            return id;
        }
        let id = self.next_kind_id;
        self.next_kind_id += 1;
        // Intentional leak to obtain &'static str; see the type-level docs.
        let leaked: &'static str = Box::leak(name.to_string().into_boxed_str());
        self.kind_ids.insert(name.to_string(), id);
        self.kind_names.insert(id, leaked);
        id
    }
    /// Register an unnamed token kind (e.g. `"="`, `"end"`), returning its ID.
    /// If already registered, returns the existing ID.
    pub fn register_unnamed_kind(&mut self, name: &str) -> KindId {
        if let Some(&id) = self.unnamed_kind_ids.get(name) {
            return id;
        }
        let id = self.next_kind_id;
        self.next_kind_id += 1;
        // Intentional leak to obtain &'static str; see the type-level docs.
        let leaked: &'static str = Box::leak(name.to_string().into_boxed_str());
        self.unnamed_kind_ids.insert(name.to_string(), id);
        self.kind_names.insert(id, leaked);
        id
    }
    /// Look up a field ID by name; `"child"` maps to the reserved
    /// `CHILD_FIELD` sentinel rather than a stored entry.
    pub fn field_id_for_name(&self, name: &str) -> Option<FieldId> {
        if name == "child" {
            return Some(CHILD_FIELD);
        }
        self.field_ids.get(name).copied()
    }
    /// Reverse of [`Self::field_id_for_name`].
    pub fn field_name_for_id(&self, id: FieldId) -> Option<&'static str> {
        if id == CHILD_FIELD {
            return Some("child");
        }
        self.field_names.get(&id).copied()
    }
    /// Look up the ID of a named node kind.
    pub fn id_for_node_kind(&self, kind: &str) -> Option<KindId> {
        self.kind_ids.get(kind).copied()
    }
    /// Look up the ID of an unnamed token kind.
    pub fn id_for_unnamed_node_kind(&self, kind: &str) -> Option<KindId> {
        self.unnamed_kind_ids.get(kind).copied()
    }
    /// Look up the name for a kind ID (named or unnamed).
    pub fn node_kind_for_id(&self, id: KindId) -> Option<&'static str> {
        self.kind_names.get(&id).copied()
    }
}

View File

@@ -0,0 +1,43 @@
use std::cell::Cell;
use std::collections::BTreeMap;
/// Tracks fresh identifier generation during a single tree-building operation.
/// All occurrences of the same `$name` within one build share the same generated value.
pub struct FreshScope {
    counter: Cell<u32>,
    resolved: std::cell::RefCell<BTreeMap<String, String>>,
}

impl Default for FreshScope {
    fn default() -> Self {
        FreshScope::new()
    }
}

impl FreshScope {
    /// Create a scope with the counter at zero and no resolved names.
    pub fn new() -> Self {
        FreshScope {
            counter: Cell::new(0),
            resolved: std::cell::RefCell::new(BTreeMap::new()),
        }
    }
    /// Return the generated value for `$name`, minting a new `$name-<n>`
    /// from the counter on first use within the current scope.
    pub fn resolve(&self, name: &str) -> String {
        let mut resolved = self.resolved.borrow_mut();
        if let Some(existing) = resolved.get(name) {
            return existing.clone();
        }
        let id = self.counter.get();
        self.counter.set(id + 1);
        let fresh = format!("${name}-{id}");
        resolved.insert(name.to_string(), fresh.clone());
        fresh
    }
    /// Clear resolved names but keep the counter. Called between rule
    /// applications so that `$tmp` in different rules gets different values
    /// while the counter increases monotonically.
    pub fn next_scope(&self) {
        self.resolved.borrow_mut().clear();
    }
}

111
shared/yeast/src/visitor.rs Normal file
View File

@@ -0,0 +1,111 @@
use std::collections::BTreeMap;
use tree_sitter::{Language, Tree};
use crate::{Ast, Id, Node, NodeContent, CHILD_FIELD};
/// A node under construction, paired with its parent id so that
/// `leave_node` can attach it to the parent's fields once its own
/// subtree has been fully visited.
#[derive(Debug)]
struct VisitorNode {
    inner: Node,
    parent: Option<Id>,
}
/// A type that can walk a TS tree and produce an `Ast`.
#[derive(Debug)]
pub(crate) struct Visitor {
    // All nodes created so far; a node's `Id` is its index in this Vec.
    nodes: Vec<VisitorNode>,
    // The node whose subtree is currently being visited; `None` before
    // the root is entered (and again once the walk has finished).
    current: Option<Id>,
    // The source language, used to map kind/field names to tree-sitter IDs.
    language: Language,
}
impl Visitor {
    /// Create an empty visitor for `language`.
    pub fn new(language: Language) -> Self {
        Self {
            nodes: Vec::new(),
            current: None,
            language,
        }
    }
    /// Walk `tree` depth-first with a tree-sitter cursor, creating one
    /// node per tree-sitter node. `recurse` is false while backtracking
    /// so a node whose children were already visited is not entered twice.
    pub fn visit(&mut self, tree: &Tree) {
        let cursor = &mut tree.walk();
        self.enter_node(cursor.node());
        let mut recurse = true;
        loop {
            if recurse && cursor.goto_first_child() {
                recurse = self.enter_node(cursor.node());
            } else {
                // Done with this node's subtree: attach it to its parent.
                self.leave_node(cursor.field_name(), cursor.node());
                if cursor.goto_next_sibling() {
                    recurse = self.enter_node(cursor.node());
                } else if cursor.goto_parent() {
                    recurse = false;
                } else {
                    // Back at the root with no siblings left: walk complete.
                    break;
                }
            }
        }
    }
    /// Consume the visitor and produce an `Ast` with the given output
    /// schema, rooted at the first node created (the tree root).
    pub fn build_with_schema(self, schema: crate::schema::Schema) -> Ast {
        Ast {
            root: self.nodes[0].inner.id,
            schema,
            nodes: self.nodes.into_iter().map(|n| n.inner).collect(),
        }
    }
    /// Append a new node for `n` under the current parent; the returned
    /// `Id` is the node's index in `self.nodes`. `is_named` selects the
    /// named vs. anonymous tree-sitter kind-ID lookup.
    fn add_node(&mut self, n: tree_sitter::Node<'_>, content: NodeContent, is_named: bool) -> Id {
        let id = self.nodes.len();
        self.nodes.push(VisitorNode {
            inner: Node {
                id,
                kind: self.language.id_for_node_kind(n.kind(), is_named),
                kind_name: n.kind(),
                content,
                fields: BTreeMap::new(),
                is_missing: n.is_missing(),
                is_named: n.is_named(),
                is_extra: n.is_extra(),
                is_error: n.is_error(),
                source_range: None,
            },
            parent: self.current,
        });
        id
    }
    /// Record `node` and make it the current parent. Always returns true
    /// (i.e. always recurse into children).
    fn enter_node(&mut self, node: tree_sitter::Node<'_>) -> bool {
        let id = self.add_node(node, node.range().into(), node.is_named());
        self.current = Some(id);
        true
    }
    /// Pop the current node, appending its id to the parent's field list:
    /// either the tree-sitter field it occupies, or the catch-all
    /// CHILD_FIELD when it has no field name.
    fn leave_node(&mut self, field_name: Option<&'static str>, _node: tree_sitter::Node<'_>) {
        let node = self.current.map(|i| &self.nodes[i]).unwrap();
        let node_id = node.inner.id;
        let node_parent = node.parent;
        if let Some(parent_id) = node.parent {
            let parent = self.nodes.get_mut(parent_id).unwrap();
            if let Some(field) = field_name {
                let field_id = self.language.field_id_for_name(field).unwrap().get();
                parent
                    .inner
                    .fields
                    .entry(field_id)
                    .or_default()
                    .push(node_id);
            } else {
                parent
                    .inner
                    .fields
                    .entry(CHILD_FIELD)
                    .or_default()
                    .push(node_id);
            }
        }
        // The root has no parent: nothing to attach, just pop to None.
        self.current = node_parent;
    }
}

View File

@@ -0,0 +1,73 @@
# Output node types for yeast test rules.
# Inspired by tree-sitter-ruby, but with all children in named fields
# (no unnamed children). This represents the desugared output schema.
named:
program:
stmt*: [assignment, call, identifier, for, first_node, second_node]
assignment:
left: [identifier, left_assignment_list]
right: [identifier, integer, call, element_reference]
left_assignment_list:
item*: identifier
element_reference:
object: identifier
index: [integer, identifier]
for:
pattern: [identifier, left_assignment_list]
value: in
body: do
in:
value: [identifier, call]
do:
stmt*: [assignment, identifier, call]
call:
receiver: [identifier, call]
method: identifier
arguments?: argument_list
block?: block
argument_list:
argument*: [identifier, integer, call]
block:
parameters: block_parameters
body: block_body
block_parameters:
parameter*: identifier
block_body:
stmt*: [assignment, identifier, call]
identifier:
integer:
# Output-only kinds, used by tests of chained desugaring rules.
# Neither exists in the input tree-sitter-ruby grammar.
first_node:
left: [identifier, integer]
right: [identifier, integer]
second_node:
left: [identifier, integer]
right: [identifier, integer]
unnamed:
- "="
- ","
- "("
- ")"
- "for"
- "in"
- "do"
- "end"
- "|"
- "."

454
shared/yeast/tests/test.rs Normal file
View File

@@ -0,0 +1,454 @@
#![cfg(test)]
use yeast::dump::dump_ast;
use yeast::*;
const OUTPUT_SCHEMA_YAML: &str = include_str!("node-types.yml");
/// Helper: parse Ruby source with no rules, return dump.
fn parse_and_dump(input: &str) -> String {
    let ast = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[])
        .run(input)
        .unwrap();
    let root = ast.get_root();
    dump_ast(&ast, root, input)
}
/// Helper: parse Ruby source with a custom output schema and rules, return dump.
fn run_and_dump(input: &str, rules: Vec<Rule>) -> String {
    let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into();
    let schema = yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang)
        .unwrap();
    let runner = Runner::with_schema(lang, &schema, &rules);
    let desugared = runner.run(input).unwrap();
    dump_ast(&desugared, desugared.get_root(), input)
}
/// Assert that a dump equals the expected string, treating the expected
/// string as an indented multiline literal: leading/trailing blank lines
/// are stripped, and the common leading indentation is removed from every
/// line. This lets test assertions place the first line at the same
/// indentation as the rest of the body.
#[track_caller]
fn assert_dump_eq(actual: &str, expected: &str) {
    // Smallest leading-whitespace width (in bytes) over all non-blank lines.
    let min_indent = expected
        .lines()
        .filter(|l| !l.trim().is_empty())
        .map(|l| l.len() - l.trim_start().len())
        .min()
        .unwrap_or(0);
    // Strip that common prefix from every line. `str::get` is used instead
    // of `&l[min_indent..]` so the slice cannot panic: it returns None both
    // for lines shorter than `min_indent` (blank lines) and when the cut
    // would fall inside a multibyte whitespace char (`trim_start` trims
    // Unicode whitespace, so `min_indent` need not be a char boundary);
    // such lines are kept verbatim.
    let dedented: String = expected
        .lines()
        .map(|l| l.get(min_indent..).unwrap_or(l))
        .collect::<Vec<_>>()
        .join("\n");
    assert_eq!(actual.trim(), dedented.trim());
}
// ---- Parsing tests ----
// Rule-free parse: assignment children land in named left/right fields.
#[test]
fn test_parse_assignment() {
let dump = parse_and_dump("x = 1");
assert_dump_eq(
&dump,
r#"
program
assignment
left: identifier "x"
right: integer "1"
"#,
);
}
// Rule-free parse of a multiple assignment: the left side is a
// left_assignment_list and the right side is a call.
#[test]
fn test_parse_multiple_assignment() {
let dump = parse_and_dump("x, y = foo()");
assert_dump_eq(
&dump,
r#"
program
assignment
left:
left_assignment_list
identifier "x"
identifier "y"
right:
call
arguments:
argument_list
method: identifier "foo"
"#,
);
}
// Rule-free parse of a for loop into body/pattern/value fields; note the
// dump lists fields in its own order, not source order.
#[test]
fn test_parse_for_loop() {
let dump = parse_and_dump("for x in list do\n y\nend");
assert_dump_eq(
&dump,
r#"
program
for
body:
do
identifier "y"
pattern: identifier "x"
value:
in
identifier "list"
"#,
);
}
// ---- Query tests ----
// A query over named fields matches at the root and binds both captures.
#[test]
fn test_query_match() {
    let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]);
    let ast = runner.run("x = 1").unwrap();
    let query = yeast::query!(
        (program
            child: (assignment
                left: (_) @left
                right: (_) @right
            )
        )
    );
    let mut captures = yeast::captures::Captures::new();
    let matched = query.do_match(&ast, ast.get_root(), &mut captures).unwrap();
    assert!(matched);
    // Both capture names were bound by the successful match.
    assert!(captures.get_var("left").is_ok());
    assert!(captures.get_var("right").is_ok());
}
// A query for a node kind not present in the tree (call) does not match.
#[test]
fn test_query_no_match() {
    let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]);
    let ast = runner.run("x = 1").unwrap();
    let query = yeast::query!(
        (program
            child: (call
                method: (_) @m
            )
        )
    );
    let mut captures = yeast::captures::Captures::new();
    let matched = query.do_match(&ast, ast.get_root(), &mut captures).unwrap();
    assert!(!matched);
}
// A starred capture collects every matching child: the three identifiers
// on the left-hand side of `x, y, z = 1`.
#[test]
fn test_query_repeated_capture() {
    let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]);
    let ast = runner.run("x, y, z = 1").unwrap();
    let query = yeast::query!(
        (assignment
            left: (left_assignment_list
                (identifier)* @names
            )
        )
    );
    // Match against the assignment node (first named child of program)
    let mut cursor = AstCursor::new(&ast);
    cursor.goto_first_child();
    let assignment_id = cursor.node().id();
    let mut captures = yeast::captures::Captures::new();
    let matched = query.do_match(&ast, assignment_id, &mut captures).unwrap();
    assert!(matched);
    assert_eq!(captures.get_all("names").len(), 3);
}
// ---- Tree builder tests ----
// Matches `x = 1`, then rebuilds a replacement tree with yeast::tree!,
// swapping the captured `left` and `right` subtrees, and checks the dump.
#[test]
fn test_tree_builder() {
let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]);
let mut ast = runner.run("x = 1").unwrap();
let input = "x = 1";
let query = yeast::query!(
(program
child: (assignment
left: (_) @left
right: (_) @right
)
)
);
let mut captures = yeast::captures::Captures::new();
query.do_match(&ast, ast.get_root(), &mut captures).unwrap();
// Swap left and right
let fresh = yeast::tree_builder::FreshScope::new();
let mut ctx = yeast::build::BuildCtx::new(&mut ast, &captures, &fresh);
let new_id = yeast::tree!(ctx,
(program
child: (assignment
left: {ctx.capture("right")}
right: {ctx.capture("left")}
)
)
);
let dump = dump_ast(ctx.ast, new_id, input);
assert_dump_eq(
&dump,
r#"
program
assignment
left: integer "1"
right: identifier "x"
"#,
);
}
// ---- Rule tests ----
// These rules use field names from node-types.yml, which extends the
// tree-sitter-ruby grammar with named fields for nodes that only have
// unnamed children in tree-sitter (e.g. block_body.stmt, block_parameters.parameter).
/// Rules shared by the desugaring tests:
/// - `assign_rule`: `a, b = e` becomes `$tmp = e` followed by one
///   indexed `element_reference` assignment per left-hand side.
/// - `for_rule`: a `for` loop becomes an `.each` call whose block first
///   rebinds the loop pattern from the fresh block parameter, then
///   splices in the original body statements.
fn ruby_rules() -> Vec<Rule> {
    let assign_rule = yeast::rule!(
        (assignment
            left: (left_assignment_list
                (identifier)* @left
            )
            right: (_) @right
        )
        =>
        (assignment
            left: (identifier $tmp)
            right: {right}
        )
        // Splice: one extra assignment per captured left-hand identifier,
        // indexing the temporary with the identifier's position.
        {..left.iter().enumerate().map(|(i, &lhs)|
            yeast::tree!(
                (assignment
                    left: {lhs}
                    right: (element_reference
                        object: (identifier $tmp)
                        index: (integer #{i})
                    )
                )
            )
        )}
    );
    let for_rule = yeast::rule!(
        (for
            pattern: (_) @pat
            value: (in (_) @val)
            body: (do (_)* @body)
        )
        =>
        (call
            receiver: {val}
            method: (identifier "each")
            block: (block
                parameters: (block_parameters
                    parameter: (identifier $tmp)
                )
                body: (block_body
                    stmt: (assignment
                        left: {pat}
                        right: (identifier $tmp)
                    )
                    stmt: {..body}
                )
            )
        )
    );
    vec![assign_rule, for_rule]
}
// The multiple-assignment rule introduces `$tmp-0` and one indexed
// element_reference assignment per left-hand side.
#[test]
fn test_desugar_multiple_assignment() {
let dump = run_and_dump("x, y = e", ruby_rules());
assert_dump_eq(
&dump,
r#"
program
assignment
left: identifier "$tmp-0"
right: identifier "e"
assignment
left: identifier "x"
right:
element_reference
object: identifier "$tmp-0"
index: integer "0"
assignment
left: identifier "y"
right:
element_reference
object: identifier "$tmp-0"
index: integer "1"
"#,
);
}
// The for-loop rule rewrites to an `.each` call with a block; the
// original body statement follows the pattern-rebinding assignment.
#[test]
fn test_desugar_for_loop() {
let dump = run_and_dump("for x in list do\n y\nend", ruby_rules());
assert_dump_eq(
&dump,
r#"
program
call
block:
block
body:
block_body
stmt:
assignment
left: identifier "x"
right: identifier "$tmp-0"
identifier "y"
parameters:
block_parameters
parameter: identifier "$tmp-0"
method: identifier "each"
receiver: identifier "list"
"#,
);
}
// A shorthand rule body (`=> call`) relabels the matched node's kind
// while keeping the captured fields.
#[test]
fn test_shorthand_rule() {
let rule = yeast::rule!(
(assignment
left: (_) @method
right: (_) @receiver
)
=> call
);
let dump = run_and_dump("x = 1", vec![rule]);
assert_dump_eq(
&dump,
r#"
program
call
method: identifier "x"
receiver: integer "1"
"#,
);
}
// Two-step rule chaining through a kind that only the output schema knows.
#[test]
fn test_chained_rules_output_only_kind() {
// Exercise rule chaining where an intermediate kind exists only in the
// output schema (not in the input tree-sitter grammar):
// assignment → first_node (input → output-only)
// first_node → second_node (output-only → output-only)
// The matcher must look up `first_node` against the schema, which only
// knows about it via the YAML node-types file.
let assignment_to_first = yeast::rule!(
(assignment
left: (_) @left
right: (_) @right
)
=> first_node
);
let first_to_second = yeast::rule!(
(first_node
left: (_) @left
right: (_) @right
)
=> second_node
);
let dump = run_and_dump("x = 1", vec![assignment_to_first, first_to_second]);
assert_dump_eq(
&dump,
r#"
program
second_node
left: identifier "x"
right: integer "1"
"#,
);
}
// ---- Cursor tests ----
// AstCursor walks down, across, and back up the tree for `x = 1`.
#[test]
fn test_cursor_navigation() {
    let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]);
    let ast = runner.run("x = 1").unwrap();
    let mut cursor = AstCursor::new(&ast);
    // Start at root
    assert_eq!(cursor.node().kind(), "program");
    // Go to first child (assignment)
    assert!(cursor.goto_first_child());
    assert_eq!(cursor.node().kind(), "assignment");
    // No sibling
    assert!(!cursor.goto_next_sibling());
    // Go to first child of assignment
    assert!(cursor.goto_first_child());
    assert!(cursor.node().is_named());
    // Go back up
    assert!(cursor.goto_parent());
    assert_eq!(cursor.node().kind(), "assignment");
    assert!(cursor.goto_parent());
    assert_eq!(cursor.node().kind(), "program");
    // Can't go further up
    assert!(!cursor.goto_parent());
}
// Chained desugaring: the for-loop rule fires first (minting $tmp-0 for
// the block parameter), then the multiple-assignment rule ($tmp-1) —
// showing that FreshScope keeps counting across rule applications.
#[test]
fn test_desugar_for_with_multiple_assignment() {
let dump = run_and_dump("for a, b in list do\n x\nend", ruby_rules());
assert_dump_eq(
&dump,
r#"
program
call
block:
block
body:
block_body
stmt:
assignment
left: identifier "$tmp-1"
right: identifier "$tmp-0"
assignment
left: identifier "a"
right:
element_reference
object: identifier "$tmp-1"
index: integer "0"
assignment
left: identifier "b"
right:
element_reference
object: identifier "$tmp-1"
index: integer "1"
identifier "x"
parameters:
block_parameters
parameter: identifier "$tmp-0"
method: identifier "each"
receiver: identifier "list"
"#,
);
}