Yeast: AST desugaring framework (rebased from hackathon-desugaring)

Add the yeast crate (Yet another Elaborator for Abstract Syntax Trees),
a framework for tree-sitter AST transformations/desugaring. Integrate it
into the shared tree-sitter extractor.

Key components:
- shared/yeast/: New crate with query/match/transform pipeline for
  tree-sitter ASTs, with Ruby desugaring rules as an example
- shared/tree-sitter-extractor: Pass parsed trees through yeast before
  TRAP extraction, applying language-specific desugaring rules

Updated from the original hackathon branch to work with tree-sitter 0.24
and current main dependencies.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Taus
2026-04-14 16:03:02 +00:00
parent 154d213fd2
commit 7efb03d4cc
27 changed files with 2475 additions and 19 deletions

29
Cargo.lock generated
View File

@@ -416,6 +416,7 @@ dependencies = [
"tree-sitter",
"tree-sitter-json",
"tree-sitter-ql",
"yeast",
"zstd",
]
@@ -2470,7 +2471,6 @@ version = "1.0.145"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c"
dependencies = [
"indexmap 2.11.4",
"itoa",
"memchr",
"ryu",
@@ -2853,14 +2853,13 @@ dependencies = [
[[package]]
name = "tree-sitter"
version = "0.25.9"
version = "0.24.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccd2a058a86cfece0bf96f7cce1021efef9c8ed0e892ab74639173e5ed7a34fa"
checksum = "a5387dffa7ffc7d2dae12b50c6f7aab8ff79d6210147c6613561fc3d474c6f75"
dependencies = [
"cc",
"regex",
"regex-syntax",
"serde_json",
"streaming-iterator",
"tree-sitter-language",
]
@@ -2891,6 +2890,16 @@ version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4013970217383f67b18aef68f6fb2e8d409bc5755227092d32efb0422ba24b8"
[[package]]
name = "tree-sitter-python"
version = "0.23.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d065aaa27f3aaceaf60c1f0e0ac09e1cb9eb8ed28e7bcdaa52129cffc7f4b04"
dependencies = [
"cc",
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-ql"
version = "0.23.1"
@@ -3367,6 +3376,18 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049"
[[package]]
name = "yeast"
version = "0.1.0"
dependencies = [
"clap",
"serde",
"serde_json",
"tree-sitter",
"tree-sitter-python",
"tree-sitter-ruby",
]
[[package]]
name = "yoke"
version = "0.8.0"

View File

@@ -4,6 +4,7 @@
resolver = "2"
members = [
"shared/tree-sitter-extractor",
"shared/yeast",
"ruby/extractor",
"rust/extractor",
"rust/extractor/macros",

View File

@@ -20,6 +20,7 @@ serde_json = "1.0"
chrono = { version = "0.4.42", features = ["serde"] }
num_cpus = "1.17.0"
zstd = "0.13.3"
yeast = { path = "../yeast" }
[dev-dependencies]
tree-sitter-ql = "0.23.1"

View File

@@ -14,7 +14,8 @@ use tracing_subscriber::fmt::format::DefaultFields;
use tracing_subscriber::fmt::format::Format;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::util::SubscriberInitExt;
use tree_sitter::{Language, Node, Parser, Range, Tree};
use tree_sitter::{Language, Parser, Range};
use yeast::{Cursor, Node};
pub mod simple;
@@ -242,7 +243,16 @@ pub fn extract(
language_prefix,
schema,
);
traverse(&tree, &mut visitor);
// HACK: Pass the tree through yeast
let rules = if language_prefix == "ruby" {
yeast::rules::rules()
} else {
vec![]
};
let runner = yeast::Runner::new(language.clone(), rules);
let ast = runner.run_from_tree(&tree);
traverse(&ast, &mut visitor);
parser.reset();
}
@@ -333,7 +343,7 @@ impl<'a> Visitor<'a> {
&mut self,
message: &str,
args: &[diagnostics::MessageArg],
node: Node,
node: &Node,
status_page: bool,
) {
let loc = location_for(self, self.file_label, node);
@@ -357,7 +367,7 @@ impl<'a> Visitor<'a> {
self.record_parse_error(loc_label, &mesg);
}
fn enter_node(&mut self, node: Node) -> bool {
fn enter_node(&mut self, node: &Node) -> bool {
if node.is_missing() {
self.record_parse_error_for_node(
"A parse error occurred (expected {} symbol). Check the syntax of the file. If the file is invalid, correct the error or {} the file from analysis.",
@@ -383,7 +393,7 @@ impl<'a> Visitor<'a> {
true
}
fn leave_node(&mut self, field_name: Option<&'static str>, node: Node) {
fn leave_node(&mut self, field_name: Option<&'static str>, node: &Node) {
if node.is_error() || node.is_missing() {
return;
}
@@ -529,7 +539,7 @@ impl<'a> Visitor<'a> {
diagnostics::MessageArg::Code(&format!("{:?}", child_node.type_name)),
diagnostics::MessageArg::Code(&format!("{:?}", field.type_info)),
],
*node,
node,
false,
);
}
@@ -541,7 +551,7 @@ impl<'a> Visitor<'a> {
diagnostics::MessageArg::Code(child_node.field_name.unwrap_or("child")),
diagnostics::MessageArg::Code(&format!("{:?}", child_node.type_name)),
],
*node,
node,
false,
);
}
@@ -566,7 +576,7 @@ impl<'a> Visitor<'a> {
node.kind(),
column_name
);
self.record_parse_error_for_node(&error_message, &[], *node, false);
self.record_parse_error_for_node(&error_message, &[], node, false);
}
}
Storage::Table {
@@ -582,7 +592,7 @@ impl<'a> Visitor<'a> {
diagnostics::MessageArg::Code(node.kind()),
diagnostics::MessageArg::Code(table_name),
],
*node,
node,
false,
);
break;
@@ -639,15 +649,17 @@ impl<'a> Visitor<'a> {
}
// Emit a slice of a source file as an Arg.
fn sliced_source_arg(source: &[u8], n: Node) -> trap::Arg {
let range = n.byte_range();
trap::Arg::String(String::from_utf8_lossy(&source[range.start..range.end]).into_owned())
fn sliced_source_arg(source: &[u8], n: &Node) -> trap::Arg {
trap::Arg::String(n.opt_string_content().unwrap_or_else(|| {
let range = n.byte_range();
String::from_utf8_lossy(&source[range.start..range.end]).into_owned()
}))
}
// Emit a pair of `TrapEntry`s for the provided node, appropriately calibrated.
// The first is the location and label definition, and the second is the
// 'Located' entry.
fn location_for(visitor: &mut Visitor, file_label: trap::Label, n: Node) -> trap::Location {
fn location_for(visitor: &mut Visitor, file_label: trap::Label, n: &Node) -> trap::Location {
// Tree-sitter row, column values are 0-based while CodeQL starts
// counting at 1. In addition Tree-sitter's row and column for the
// end position are exclusive while CodeQL's end positions are inclusive.
@@ -713,8 +725,8 @@ fn location_for(visitor: &mut Visitor, file_label: trap::Label, n: Node) -> trap
}
}
fn traverse(tree: &Tree, visitor: &mut Visitor) {
let cursor = &mut tree.walk();
fn traverse(tree: &yeast::Ast, visitor: &mut Visitor) {
let mut cursor = tree.walk();
visitor.enter_node(cursor.node());
let mut recurse = true;
loop {

1
shared/yeast/.envrc Normal file
View File

@@ -0,0 +1 @@
use flake

1
shared/yeast/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
/target

0
shared/yeast/.gitkeep Normal file
View File

357
shared/yeast/Cargo.lock generated Normal file
View File

@@ -0,0 +1,357 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "aho-corasick"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
dependencies = [
"memchr",
]
[[package]]
name = "anstream"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
"is_terminal_polyfill",
"utf8parse",
]
[[package]]
name = "anstyle"
version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000"
[[package]]
name = "anstyle-parse"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
dependencies = [
"windows-sys",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
dependencies = [
"anstyle",
"once_cell_polyfill",
"windows-sys",
]
[[package]]
name = "cc"
version = "1.2.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43c5703da9466b66a946814e1adf53ea2c90f10063b86290cc9eb67ce3478a20"
dependencies = [
"find-msvc-tools",
"shlex",
]
[[package]]
name = "clap"
version = "4.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351"
dependencies = [
"clap_builder",
"clap_derive",
]
[[package]]
name = "clap_builder"
version = "4.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f"
dependencies = [
"anstream",
"anstyle",
"clap_lex",
"strsim",
]
[[package]]
name = "clap_derive"
version = "4.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "clap_lex"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
[[package]]
name = "colorchoice"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
[[package]]
name = "find-msvc-tools"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
[[package]]
name = "heck"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "is_terminal_polyfill"
version = "1.70.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
[[package]]
name = "itoa"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
[[package]]
name = "memchr"
version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
[[package]]
name = "once_cell_polyfill"
version = "1.70.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
[[package]]
name = "proc-macro2"
version = "1.0.106"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.45"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
dependencies = [
"proc-macro2",
]
[[package]]
name = "regex"
version = "1.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
[[package]]
name = "serde"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
dependencies = [
"serde_core",
"serde_derive",
]
[[package]]
name = "serde_core"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.149"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
dependencies = [
"itoa",
"memchr",
"serde",
"serde_core",
"zmij",
]
[[package]]
name = "shlex"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
[[package]]
name = "streaming-iterator"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520"
[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "syn"
version = "2.0.117"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "tree-sitter"
version = "0.24.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a5387dffa7ffc7d2dae12b50c6f7aab8ff79d6210147c6613561fc3d474c6f75"
dependencies = [
"cc",
"regex",
"regex-syntax",
"streaming-iterator",
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-language"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "009994f150cc0cd50ff54917d5bc8bffe8cad10ca10d81c34da2ec421ae61782"
[[package]]
name = "tree-sitter-python"
version = "0.23.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d065aaa27f3aaceaf60c1f0e0ac09e1cb9eb8ed28e7bcdaa52129cffc7f4b04"
dependencies = [
"cc",
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-ruby"
version = "0.23.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be0484ea4ef6bb9c575b4fdabde7e31340a8d2dbc7d52b321ac83da703249f95"
dependencies = [
"cc",
"tree-sitter-language",
]
[[package]]
name = "unicode-ident"
version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
[[package]]
name = "utf8parse"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "windows-link"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
[[package]]
name = "windows-sys"
version = "0.61.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
dependencies = [
"windows-link",
]
[[package]]
name = "yeast"
version = "0.1.0"
dependencies = [
"clap",
"serde",
"serde_json",
"tree-sitter",
"tree-sitter-python",
"tree-sitter-ruby",
]
[[package]]
name = "zmij"
version = "1.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"

13
shared/yeast/Cargo.toml Normal file
View File

@@ -0,0 +1,13 @@
[package]
name = "yeast"
version = "0.1.0"
edition = "2021"
[dependencies]
clap = { version = "4.4.10", features = ["derive"] }
serde = { version = "1.0.193", features = ["derive"] }
serde_json = "1.0.108"
tree-sitter = "0.24"
tree-sitter-ruby = "0.23"
tree-sitter-python = "0.23"

85
shared/yeast/flake.lock generated Normal file
View File

@@ -0,0 +1,85 @@
{
"nodes": {
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1694529238,
"narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1697730408,
"narHash": "sha256-Ww//zzukdTrwTrCUkaJA/NsaLEfUfQpWZXBdXBYfhak=",
"owner": "nixos",
"repo": "nixpkgs",
"rev": "ff0a5a776b56e0ca32d47a4a47695452ec7f7d80",
"type": "github"
},
"original": {
"owner": "nixos",
"ref": "nixpkgs-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs",
"rust-overlay": "rust-overlay"
}
},
"rust-overlay": {
"inputs": {
"flake-utils": [
"flake-utils"
],
"nixpkgs": [
"nixpkgs"
]
},
"locked": {
"lastModified": 1697767917,
"narHash": "sha256-9+FjCVE1Y7iUKohBF43yD05KoQB+FPcw/XL2rlKkjqY=",
"owner": "oxalica",
"repo": "rust-overlay",
"rev": "679ea0878edc749f23516ea6d7ffa974c6304bf5",
"type": "github"
},
"original": {
"owner": "oxalica",
"repo": "rust-overlay",
"type": "github"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

31
shared/yeast/flake.nix Normal file
View File

@@ -0,0 +1,31 @@
{
description = "YEAST elaborates abstract syntax trees";
inputs = {
nixpkgs.url = "github:nixos/nixpkgs/nixpkgs-unstable";
flake-utils.url = "github:numtide/flake-utils";
rust-overlay = {
url = "github:oxalica/rust-overlay";
inputs.nixpkgs.follows = "nixpkgs";
inputs.flake-utils.follows = "flake-utils";
};
};
outputs = { self, nixpkgs, flake-utils, rust-overlay }:
flake-utils.lib.eachDefaultSystem (system:
let
pkgs = nixpkgs.legacyPackages.${system};
rust = rust-overlay.packages.${system}.rust;
in
{
devShells.default = pkgs.mkShell {
buildInputs = with pkgs; [
pkgs.tree-sitter
rust
rust-analyzer
libiconv
];
};
});
}

View File

@@ -0,0 +1,26 @@
use clap::Parser;
// Command-line arguments of the `yeast` binary, parsed with clap's derive API.
// Plain `//` comments are used here on purpose: clap's derive macro turns `///`
// doc comments on the struct and its fields into help text.
#[derive(Parser)]
#[clap(name = "yeast", about = "yeast elaborates abstract syntax trees")]
struct Cli {
// Path of the source file; `main` reads it with `std::fs::read_to_string`.
file: String,
// Language name handed to `get_language`; defaults to "ruby" when omitted.
#[clap(default_value = "ruby")]
language: String,
}
/// Resolve a language name from the command line to its tree-sitter grammar.
///
/// # Panics
///
/// Panics with `Unsupported language: <name>` for any name other than
/// `"ruby"` or `"python"`.
fn get_language(language: &str) -> tree_sitter::Language {
    if language == "ruby" {
        return tree_sitter_ruby::LANGUAGE.into();
    }
    if language == "python" {
        return tree_sitter_python::LANGUAGE.into();
    }
    panic!("Unsupported language: {}", language)
}
/// Entry point: read the file named on the command line, parse it with the
/// selected grammar through a `yeast::Runner`, and print the resulting AST as
/// JSON on stdout.
fn main() {
    let args = Cli::parse();
    let language = get_language(&args.language);
    // An unreadable path is a user error rather than a bug, so report it and
    // exit with a failure status instead of panicking with a backtrace.
    let source = match std::fs::read_to_string(&args.file) {
        Ok(source) => source,
        Err(err) => {
            eprintln!("yeast: cannot read {}: {}", args.file, err);
            std::process::exit(1);
        }
    };
    // The runner is given an empty rule list, so no rewrite rule can match.
    let runner = yeast::Runner::new(language, vec![]);
    let ast = runner.run(&source);
    println!("{}", ast.print(&source, ast.get_root()));
}

View File

@@ -0,0 +1,92 @@
use std::collections::{BTreeMap, BTreeSet};
use crate::Id;
/// Capture bindings collected while matching a query: every capture name maps
/// to the list of node ids bound to it, in the order they were inserted.
#[derive(Debug, Clone)]
pub struct Captures {
    captures: BTreeMap<&'static str, Vec<Id>>,
}

impl Default for Captures {
    fn default() -> Self {
        Self::new()
    }
}

impl Captures {
    /// Create an empty set of captures.
    pub fn new() -> Self {
        Self {
            captures: BTreeMap::new(),
        }
    }

    /// Return the single id bound to `key`.
    ///
    /// Fails when `key` is unknown or when it is bound to any number of ids
    /// other than exactly one.
    pub fn get_var(&self, key: &str) -> Result<Id, String> {
        match self.captures.get(key).map(Vec::as_slice) {
            None => Err(format!("No variable named {}", key)),
            Some([only]) => Ok(*only),
            Some(bound) => Err(format!(
                "Variable {} has {} matches, use * to allow repetition",
                key,
                bound.len()
            )),
        }
    }

    /// Append `id` to the ids bound to `key`, creating the entry if needed.
    pub fn insert(&mut self, key: &'static str, id: Id) {
        let bucket = self.captures.entry(key).or_default();
        bucket.push(id);
    }

    /// Replace every id bound to `kind` by `f(id)`; unknown names are ignored.
    pub fn map_captures(&mut self, kind: &str, f: &mut impl FnMut(Id) -> Id) {
        let Some(ids) = self.captures.get_mut(kind) else {
            return;
        };
        ids.iter_mut().for_each(|id| *id = f(*id));
    }

    /// Bind `to` to the ids of `from` mapped through `f`, replacing whatever
    /// `to` was bound to before. Does nothing when `from` is unknown.
    pub fn map_captures_to(&mut self, from: &str, to: &'static str, f: &mut impl FnMut(Id) -> Id) {
        let mapped: Option<Vec<Id>> = self
            .captures
            .get(from)
            .map(|source_ids| source_ids.iter().map(|&id| f(id)).collect());
        if let Some(mapped) = mapped {
            self.captures.insert(to, mapped);
        }
    }

    /// Append all bindings of `other` to the bindings of `self`, name by name.
    pub fn merge(&mut self, other: &Captures) {
        for (&key, ids) in &other.captures {
            self.captures
                .entry(key)
                .or_default()
                .extend(ids.iter().copied());
        }
    }

    /// Split repeated captures into one `Captures` per repetition.
    ///
    /// The i-th item of the returned iterator binds each name in `children`
    /// to the i-th id captured under that name. Fails when `children` is
    /// empty, when its first name is unknown, or when the names do not all
    /// have the same number of matches.
    pub fn un_star<'a>(
        &'a self,
        children: &'a BTreeSet<&'static str>,
    ) -> Result<impl Iterator<Item = Captures> + 'a, String> {
        let mut names = children.iter();
        let Some(first) = names.next() else {
            return Err("Repeated captures must have at least one capture".to_string());
        };
        let repeats = match self.captures.get(first) {
            Some(ids) => ids.len(),
            None => return Err(format!("No variable named {}", first)),
        };
        // TODO: better error on missing capture
        let mismatch = names.any(|name| {
            let count = self.captures.get(name).map_or(0, Vec::len);
            count != repeats
        });
        if mismatch {
            return Err("Repeated captures must have the same number of matches".to_string());
        }
        Ok((0..repeats).map(move |index| {
            let mut single = Captures::new();
            for &name in children {
                let picked = self.captures.get(name).unwrap()[index];
                single.captures.insert(name, vec![picked]);
            }
            single
        }))
    }
}

View File

@@ -0,0 +1,8 @@
/// Abstract cursor over a tree, with a tree-sitter-style navigation interface.
///
/// `N` is the node type handed out by `node`, and `F` is the field-id type.
/// `T` is not used by any method signature here; the implementation in this
/// crate (`AstCursor`) instantiates the trait as `Cursor<'a, Ast, Node, FieldId>`.
pub trait Cursor<'a, T, N, F> {
/// The node the cursor currently points at.
fn node(&self) -> &'a N;
/// Id of the field through which the current node is attached to its parent, if any.
fn field_id(&self) -> Option<F>;
/// Name of the field through which the current node is attached to its parent, if any.
fn field_name(&self) -> Option<&'static str>;
/// Move to the first child of the current node.
/// NOTE(review): the boolean presumably reports whether the cursor moved, as it
/// does in `AstCursor` — confirm for other implementors.
fn goto_first_child(&mut self) -> bool;
/// Move to the next sibling of the current node (same return convention as above).
fn goto_next_sibling(&mut self) -> bool;
/// Move to the parent of the current node (same return convention as above).
fn goto_parent(&mut self) -> bool;
}

575
shared/yeast/src/lib.rs Normal file
View File

@@ -0,0 +1,575 @@
use std::{collections::BTreeMap, mem};
use serde::Serialize;
use serde_json::{json, Value};
pub mod captures;
pub mod cursor;
pub mod print;
pub mod query;
mod range;
pub mod rules;
pub mod tree_builder;
mod visitor;
use captures::Captures;
pub use cursor::Cursor;
use query::QueryNode;
/// Node ids are indexes into the arena
type Id = usize;
/// Field and Kind ids are provided by tree-sitter
type FieldId = u16;
type KindId = u16;
/// Reserved field id that does not come from the grammar: `Ast::field_name_for_id`
/// and `Ast::field_id_for_name` translate it to and from `CHILD_FIELD_NAME`, and
/// `AstCursor::field_name` reports it as "no field name".
/// NOTE(review): presumably used for children that have no named field — confirm
/// against the visitor that builds the `Ast`.
pub const CHILD_FIELD: u16 = u16::MAX;
/// Field name that stands for `CHILD_FIELD`.
const CHILD_FIELD_NAME: &str = "child";
/// A cursor for walking an [`Ast`], starting at its root node.
#[derive(Debug)]
pub struct AstCursor<'a> {
    ast: &'a Ast,
    /// A stack of parents, along with iterators for their children
    parents: Vec<(&'a Node, ChildrenIter<'a>)>,
    node: &'a Node,
}

impl<'a> AstCursor<'a> {
    /// Create a cursor positioned on the root of `ast`.
    ///
    /// Panics if the root id of `ast` does not refer to an existing node.
    pub fn new(ast: &'a Ast) -> Self {
        // TODO: handle non-zero root
        let root = ast.get_node(ast.root).unwrap();
        AstCursor {
            ast,
            parents: Vec::new(),
            node: root,
        }
    }

    /// Advance to the next child of the innermost parent; `None` when there is
    /// no parent on the stack or its children are exhausted.
    fn goto_next_sibling_opt(&mut self) -> Option<()> {
        let (_, siblings) = self.parents.last_mut()?;
        let next = siblings.next()?;
        self.node = next;
        Some(())
    }

    /// Descend to the first child of the current node; `None` (and no change)
    /// when the current node has no children.
    fn goto_first_child_opt(&mut self) -> Option<()> {
        let parent = self.node;
        let mut children = ChildrenIter::new(self.ast, parent);
        self.node = children.next()?;
        self.parents.push((parent, children));
        Some(())
    }

    /// Return to the innermost parent; `None` when already at the start node.
    fn goto_parent_opt(&mut self) -> Option<()> {
        let (parent, _) = self.parents.pop()?;
        self.node = parent;
        Some(())
    }
}
impl<'a> Cursor<'a, Ast, Node, FieldId> for AstCursor<'a> {
fn node(&self) -> &'a Node {
self.node
}
fn field_id(&self) -> Option<FieldId> {
let (_, children) = self.parents.last()?;
children.current_field()
}
fn field_name(&self) -> Option<&'static str> {
if self.field_id() == Some(CHILD_FIELD) {
None
} else {
self.field_id()
.and_then(|id| self.ast.field_name_for_id(id))
}
}
fn goto_first_child(&mut self) -> bool {
self.goto_first_child_opt().is_some()
}
fn goto_next_sibling(&mut self) -> bool {
self.goto_next_sibling_opt().is_some()
}
fn goto_parent(&mut self) -> bool {
self.goto_parent_opt().is_some()
}
}
/// Iterates over every child of a node, field by field in field-id order,
/// remembering which field the most recently visited field list belongs to.
#[derive(Debug)]
struct ChildrenIter<'a> {
    ast: &'a Ast,
    current_field: Option<FieldId>,
    fields: std::collections::btree_map::Iter<'a, FieldId, Vec<Id>>,
    field_children: Option<std::slice::Iter<'a, Id>>,
}

impl<'a> ChildrenIter<'a> {
    /// Start iterating over the children of `node`, resolving ids through `ast`.
    fn new(ast: &'a Ast, node: &'a Node) -> Self {
        ChildrenIter {
            ast,
            current_field: None,
            fields: node.fields.iter(),
            field_children: None,
        }
    }

    /// Look up a child by id; panics if the id is not present in the arena.
    fn get_node(&self, id: Id) -> &'a Node {
        self.ast.get_node(id).unwrap()
    }

    /// Field id of the field list currently being iterated, if any.
    fn current_field(&self) -> Option<FieldId> {
        self.current_field
    }
}

impl<'a> Iterator for ChildrenIter<'a> {
    type Item = &'a Node;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // Drain the field list we are currently in, if there is one.
            let pending = self.field_children.as_mut().and_then(|ids| ids.next());
            if let Some(child_id) = pending {
                return Some(self.get_node(*child_id));
            }
            // Otherwise move on to the next field; stop once none are left.
            let (field, children) = self.fields.next()?;
            self.current_field = Some(*field);
            self.field_children = Some(children.iter());
        }
    }
}
/// Our AST
///
/// Nodes are stored in a flat arena (`nodes`) and refer to each other by index.
#[derive(PartialEq, Eq, Debug)]
pub struct Ast {
/// Id of the current root node (read by `get_root`, changed by `set_root`).
root: Id,
/// Arena of all nodes; a node id is an index into this vector.
nodes: Vec<Node>,
/// Tree-sitter language used to translate kind and field ids to and from names.
language: tree_sitter::Language,
}
impl Ast {
    /// Construct an AST from a TS tree
    pub fn from_tree(language: tree_sitter::Language, tree: &tree_sitter::Tree) -> Self {
        let mut visitor = visitor::Visitor::new(language);
        visitor.visit(tree);
        visitor.build()
    }

    /// Create a cursor positioned on the root node.
    pub fn walk(&self) -> AstCursor<'_> {
        AstCursor::new(self)
    }

    /// All nodes in the arena, indexed by node id.
    pub fn nodes(&self) -> &[Node] {
        &self.nodes
    }

    /// Id of the current root node.
    pub fn get_root(&self) -> Id {
        self.root
    }

    /// Make `root` the root node. The id is not validated here.
    pub fn set_root(&mut self, root: Id) {
        self.root = root;
    }

    /// Look up a node by id; `None` when the id is outside the arena.
    pub fn get_node(&self, id: Id) -> Option<&Node> {
        self.nodes.get(id)
    }

    /// Render the subtree rooted at `root_id` as a JSON value, resolving
    /// range-backed node contents against `source`.
    ///
    /// # Panics
    ///
    /// Panics if `root_id` is not a valid node id.
    pub fn print(&self, source: &str, root_id: Id) -> Value {
        self.print_node(&self.nodes[root_id], source)
    }

    /// Append a node of the given kind to the arena and return its id. The
    /// kind name is looked up from the language.
    ///
    /// # Panics
    ///
    /// Panics if the language has no name for `kind`.
    fn create_node(
        &mut self,
        kind: KindId,
        content: NodeContent,
        fields: BTreeMap<FieldId, Vec<Id>>,
        is_named: bool,
    ) -> Id {
        let id = self.nodes.len();
        let kind_name = self.language.node_kind_for_id(kind).unwrap();
        self.nodes.push(Node {
            id,
            kind,
            kind_name,
            fields,
            content,
            is_missing: false,
            is_error: false,
            is_extra: false,
            is_named,
        });
        id
    }

    /// Append a synthesized, named leaf node whose text is `content` and
    /// return its id. The kind id is whatever the language reports for `kind`.
    pub fn create_named_token(&mut self, kind: &'static str, content: String) -> Id {
        let kind_id = self.language.id_for_node_kind(kind, true);
        let id = self.nodes.len();
        self.nodes.push(Node {
            id,
            kind: kind_id,
            kind_name: kind,
            is_named: true,
            is_missing: false,
            is_error: false,
            is_extra: false,
            fields: BTreeMap::new(),
            content: NodeContent::DynamicString(content),
        });
        id
    }

    /// Translate a field id to its name; `CHILD_FIELD` maps to `CHILD_FIELD_NAME`.
    fn field_name_for_id(&self, id: FieldId) -> Option<&'static str> {
        if id == CHILD_FIELD {
            Some(CHILD_FIELD_NAME)
        } else {
            self.language.field_name_for_id(id)
        }
    }

    /// Translate a field name to its id; `CHILD_FIELD_NAME` maps to `CHILD_FIELD`.
    fn field_id_for_name(&self, name: &str) -> Option<FieldId> {
        if name == CHILD_FIELD_NAME {
            Some(CHILD_FIELD)
        } else {
            self.language.field_id_for_name(name).map(|id| id.get())
        }
    }

    /// Print a node for debugging
    ///
    /// A node without fields becomes `{kind: content}`. Otherwise it becomes
    /// `{kind: {field: [children...], ..., "content": content}}`, with the
    /// `CHILD_FIELD` children listed under `"rest"`.
    fn print_node(&self, node: &Node, source: &str) -> Value {
        let mut fields: BTreeMap<&'static str, Value> = node
            .fields
            .iter()
            .map(|(field_id, children)| {
                let field_name = if *field_id == CHILD_FIELD {
                    "rest"
                } else {
                    self.field_name_for_id(*field_id).unwrap()
                };
                let children: Vec<Value> = children
                    .iter()
                    .map(|id| self.print_node(self.get_node(*id).unwrap(), source))
                    .collect();
                (field_name, json!(children))
            })
            .collect();

        let kind = self.language.node_kind_for_id(node.kind).unwrap();
        let content = match &node.content {
            // Decode the slice as UTF-8 rather than mapping each byte to a
            // `char`, which would turn every multi-byte character into
            // several unrelated Latin-1 characters.
            NodeContent::Range(range) => {
                String::from_utf8_lossy(&source.as_bytes()[range.start_byte..range.end_byte])
                    .into_owned()
            }
            NodeContent::String(s) => s.to_string(),
            NodeContent::DynamicString(s) => s.clone(),
        };

        let mut value = BTreeMap::new();
        if fields.is_empty() {
            value.insert(kind, json!(content));
        } else {
            fields.insert("content", json!(content));
            value.insert(kind, json!(fields));
        }
        json!(value)
    }

    /// Return an example AST, for testing and to fill implementation gaps
    ///
    /// NOTE(review): the kind and field ids below are hard-coded; they
    /// presumably correspond to one specific grammar (the node names look like
    /// Ruby) — confirm before using this with another `language`.
    pub fn example(language: tree_sitter::Language) -> Self {
        // x = 1
        Self {
            root: 0,
            language,
            nodes: vec![
                // assignment
                Node {
                    id: 0,
                    kind: 276,
                    kind_name: "assignment",
                    fields: {
                        let mut map = BTreeMap::new();
                        map.insert(18, vec![1]);
                        map.insert(28, vec![3]);
                        map
                    },
                    content: NodeContent::String("x = 1"),
                    is_missing: false,
                    is_error: false,
                    is_extra: false,
                    is_named: true,
                },
                // identifier
                Node {
                    id: 1,
                    kind: 1,
                    kind_name: "identifier",
                    fields: BTreeMap::new(),
                    content: NodeContent::String("x"),
                    is_missing: false,
                    is_error: false,
                    is_extra: false,
                    is_named: true,
                },
                // "="
                Node {
                    id: 2,
                    kind: 17,
                    kind_name: "=",
                    fields: BTreeMap::new(),
                    content: NodeContent::String("="),
                    is_missing: false,
                    is_error: false,
                    is_extra: false,
                    is_named: false,
                },
                // integer
                Node {
                    id: 3,
                    kind: 110,
                    kind_name: "integer",
                    fields: BTreeMap::new(),
                    content: NodeContent::String("1"),
                    is_missing: false,
                    is_error: false,
                    is_extra: false,
                    is_named: true,
                },
            ],
        }
    }

    /// Id of the named node kind `kind`, or `None` when the language reports id 0.
    fn id_for_node_kind(&self, kind: &str) -> Option<KindId> {
        let id = self.language.id_for_node_kind(kind, true);
        (id != 0).then_some(id)
    }

    /// Id of the unnamed node kind `kind`, or `None` when the language reports id 0.
    fn id_for_unnamed_node_kind(&self, kind: &str) -> Option<KindId> {
        let id = self.language.id_for_node_kind(kind, false);
        (id != 0).then_some(id)
    }
}
/// A node in our AST
#[derive(PartialEq, Eq, Debug, Clone, Serialize)]
pub struct Node {
/// Index of this node in the owning `Ast`'s arena.
id: Id,
/// Tree-sitter kind id of the node.
kind: KindId,
/// Kind name returned by `kind()`.
kind_name: &'static str,
/// Child node ids, grouped by field id.
fields: BTreeMap<FieldId, Vec<Id>>,
/// Where the node's text comes from: a source range or a synthesized string.
content: NodeContent,
/// Flags exposed through the accessors of the same name.
is_named: bool,
is_missing: bool,
is_extra: bool,
is_error: bool,
}
impl Node {
    /// Index of this node in the arena.
    pub fn id(&self) -> Id {
        self.id
    }

    /// Kind name of this node.
    pub fn kind(&self) -> &'static str {
        self.kind_name
    }

    pub fn is_named(&self) -> bool {
        self.is_named
    }

    pub fn is_missing(&self) -> bool {
        self.is_missing
    }

    pub fn is_extra(&self) -> bool {
        self.is_extra
    }

    pub fn is_error(&self) -> bool {
        self.is_error
    }

    /// An all-zero placeholder range.
    fn fake_range(&self) -> tree_sitter::Range {
        let origin = tree_sitter::Point { row: 0, column: 0 };
        tree_sitter::Range {
            start_byte: 0,
            end_byte: 0,
            start_point: origin,
            end_point: origin,
        }
    }

    /// Placeholder position (row 0, column 0) for nodes without a source range.
    fn fake_point(&self) -> tree_sitter::Point {
        tree_sitter::Point { row: 0, column: 0 }
    }

    /// Start position of a range-backed node; the placeholder point otherwise.
    pub fn start_position(&self) -> tree_sitter::Point {
        if let NodeContent::Range(range) = &self.content {
            range.start_point
        } else {
            self.fake_point()
        }
    }

    /// End position of a range-backed node; the placeholder point otherwise.
    pub fn end_position(&self) -> tree_sitter::Point {
        if let NodeContent::Range(range) = &self.content {
            range.end_point
        } else {
            self.fake_point()
        }
    }

    /// Start byte of a range-backed node; 0 otherwise.
    pub fn start_byte(&self) -> usize {
        if let NodeContent::Range(range) = &self.content {
            range.start_byte
        } else {
            0
        }
    }

    /// End byte of a range-backed node; 0 otherwise.
    pub fn end_byte(&self) -> usize {
        if let NodeContent::Range(range) = &self.content {
            range.end_byte
        } else {
            0
        }
    }

    /// `start_byte()..end_byte()`; empty (`0..0`) for synthesized nodes.
    pub fn byte_range(&self) -> std::ops::Range<usize> {
        let start = self.start_byte();
        let end = self.end_byte();
        start..end
    }

    /// The node's own text when it is synthesized, `None` when the text has
    /// to be read from the source file via the node's range.
    pub fn opt_string_content(&self) -> Option<String> {
        match &self.content {
            NodeContent::Range(_) => None,
            NodeContent::String(text) => Some((*text).to_owned()),
            NodeContent::DynamicString(text) => Some(text.clone()),
        }
    }
}
/// The contents of a node is either a range in the original source file,
/// or a new string if the node is synthesized.
#[derive(PartialEq, Eq, Debug, Clone, Serialize)]
pub enum NodeContent {
/// A range in the original source; serialized through the crate's `range::Range` helper.
Range(#[serde(with = "range::Range")] tree_sitter::Range),
/// Synthesized text with a `'static` lifetime.
String(&'static str),
/// Synthesized text owned by the node (used by `Ast::create_named_token`).
DynamicString(String),
}
/// A static string converts to `NodeContent::String`.
impl From<&'static str> for NodeContent {
fn from(value: &'static str) -> Self {
NodeContent::String(value)
}
}
/// A tree-sitter range converts to `NodeContent::Range`.
impl From<tree_sitter::Range> for NodeContent {
fn from(value: tree_sitter::Range) -> Self {
NodeContent::Range(value)
}
}
/// A rewrite rule: a query that selects nodes, plus a transform that builds
/// the replacement node ids from the query's captures.
pub struct Rule {
    query: QueryNode,
    transform: Box<dyn Fn(&mut Ast, Captures) -> Vec<Id>>,
}

impl Rule {
    /// Pair a query with the transform to run on each match.
    pub fn new(query: QueryNode, transform: Box<dyn Fn(&mut Ast, Captures) -> Vec<Id>>) -> Self {
        Rule { query, transform }
    }

    /// Try this rule on `node`: `Some(replacement ids)` when the query matches,
    /// `None` otherwise.
    ///
    /// Panics if the query itself reports an error.
    fn try_rule(&self, ast: &mut Ast, node: Id) -> Option<Vec<Id>> {
        let mut bindings = Captures::new();
        let matched = self.query.do_match(ast, node, &mut bindings).unwrap();
        if !matched {
            return None;
        }
        Some((self.transform)(ast, bindings))
    }
}
/// Rewrites the subtree rooted at `id` and returns the ids of the nodes that
/// replace it.
///
/// The first rule whose query matches `id` wins: its transform output is
/// rewritten recursively and returned, so one node may become zero, one or
/// several nodes. When no rule matches, the node is cloned, every field's
/// children are rewritten recursively, and the clone is appended to
/// `ast.nodes` under a fresh id; the original node is left in place.
fn apply_rules(rules: &[Rule], ast: &mut Ast, id: Id) -> Vec<Id> {
    // Apply the transformation rules on this node.
    for rule in rules {
        if let Some(replacements) = rule.try_rule(ast, id) {
            // We transformed it, so now recurse into the result.
            return replacements
                .iter()
                .flat_map(|node| apply_rules(rules, ast, *node))
                .collect();
        }
    }

    // No rule matched: work on a copy of the current node.
    let mut node = ast.nodes[id].clone();

    // Recursively descend into all the fields, replacing each child list by
    // its rewritten counterpart.
    for children in node.fields.values_mut() {
        let old = mem::take(children);
        *children = old
            .iter()
            .flat_map(|child| apply_rules(rules, ast, *child))
            .collect();
    }

    // The copy is stored under the next free index, which becomes its id.
    let new_id = ast.nodes.len();
    node.id = new_id;
    ast.nodes.push(node);
    vec![new_id]
}
/// Bundles a tree-sitter language with the rewrite rules to apply to trees
/// of that language.
pub struct Runner {
// Language used to parse input and to build the `Ast`.
language: tree_sitter::Language,
// Rules handed to `apply_rules`.
rules: Vec<Rule>,
}
impl Runner {
    /// Creates a runner that applies `rules` to trees of `language`.
    pub fn new(language: tree_sitter::Language, rules: Vec<Rule>) -> Self {
        Self { language, rules }
    }

    /// Converts an already parsed `tree` into an [`Ast`], applies the rules
    /// starting from node `0`, and makes the rewritten node the AST's root.
    ///
    /// # Panics
    /// Panics if rewriting node `0` does not yield exactly one node.
    pub fn run_from_tree(&self, tree: &tree_sitter::Tree) -> Ast {
        let mut ast = Ast::from_tree(self.language.clone(), tree);
        let res = apply_rules(&self.rules, &mut ast, 0);
        match res.as_slice() {
            [root] => ast.set_root(*root),
            _ => panic!("Expected exactly one result node, got {}", res.len()),
        }
        ast
    }

    /// Parses `input` with the runner's language and rewrites the resulting
    /// tree via [`Runner::run_from_tree`].
    ///
    /// # Panics
    /// Panics if the language cannot be loaded into the parser, if parsing
    /// yields no tree, or under the conditions of [`Runner::run_from_tree`].
    pub fn run(&self, input: &str) -> Ast {
        // Parse the input into a tree-sitter tree.
        let mut parser = tree_sitter::Parser::new();
        parser
            .set_language(&self.language)
            .expect("failed to load the language into the tree-sitter parser");
        let tree = parser
            .parse(input, None)
            .expect("tree-sitter parser returned no tree");
        self.run_from_tree(&tree)
    }
}

34
shared/yeast/src/print.rs Normal file
View File

@@ -0,0 +1,34 @@
use crate::{cursor::Cursor, AstCursor, Node};
/// Stateless visitor that prints every node it enters and leaves to stdout.
pub struct Printer {}
impl Printer {
    /// Walks the tree under `cursor`, calling `enter_node` when a node is
    /// reached and `leave_node` once the walk is done with it.
    pub fn visit(&mut self, mut cursor: AstCursor<'_>) {
        self.enter_node(cursor.node());
        let mut descend = true;
        loop {
            // Go down first whenever the last entered node allows it.
            if descend && cursor.goto_first_child() {
                descend = self.enter_node(cursor.node());
                continue;
            }
            // Nothing (more) below: finish this node, then move sideways or up.
            self.leave_node(cursor.node());
            if cursor.goto_next_sibling() {
                descend = self.enter_node(cursor.node());
            } else if cursor.goto_parent() {
                // The parent's children are done; do not descend into it again.
                descend = false;
            } else {
                break;
            }
        }
    }

    /// Prints the node's debug form; the returned `true` tells `visit` to
    /// descend into the node's children.
    pub fn enter_node(&mut self, node: &Node) -> bool {
        println!("enter_node: {:?}", node);
        true
    }

    /// Prints the node's debug form; always returns `true`.
    pub fn leave_node(&mut self, node: &Node) -> bool {
        println!("leave_node: {:?}", node);
        true
    }
}

243
shared/yeast/src/query.rs Normal file
View File

@@ -0,0 +1,243 @@
use crate::{captures::Captures, Ast, Id};
/// A pattern that is matched against a single AST node.
#[derive(Debug, Clone)]
pub enum QueryNode {
/// Matches every node.
Any(),
/// Matches a node of the named kind `kind` whose listed fields match the
/// given element lists (`(field name, elements)` pairs).
Node {
kind: &'static str,
children: Vec<(&'static str, Vec<QueryListElem>)>,
},
/// Matches a node of the unnamed kind `kind`.
UnnamedNode {
kind: &'static str,
},
/// Records the node under the name `capture` and then matches it against `node`.
Capture {
capture: &'static str,
node: Box<QueryNode>,
},
}
/// One element of the pattern list matched against the children of a field.
#[derive(Debug, Clone)]
pub enum QueryListElem {
/// A sub-list `children` matched repeatedly, as governed by `rep`.
Repeated { children: Vec<QueryListElem>, rep: Rep },
/// A pattern that consumes and matches exactly one child.
SingleNode(QueryNode),
}
/// Repetition mode of a `QueryListElem::Repeated` element.
#[derive(Debug, PartialEq, Eq, Copy, Clone)]
pub enum Rep {
/// Any number of repetitions, including none.
ZeroOrMore,
/// At least one repetition is required.
OneOrMore,
/// At most one repetition is attempted.
ZeroOrOne,
}
impl QueryNode {
    /// Matches this pattern against the node `node` of `ast`, recording
    /// captures in `matches`.
    ///
    /// Returns `Ok(true)` on a match and `Ok(false)` otherwise. A capture is
    /// recorded before its sub-pattern is tried, so `matches` may contain
    /// entries from a failed attempt.
    ///
    /// # Errors
    /// Returns `Err` when a node kind or field name used by the pattern is
    /// not known to the AST's language.
    ///
    /// # Panics
    /// Panics if `node` cannot be looked up in `ast`.
    pub fn do_match(&self, ast: &Ast, node: Id, matches: &mut Captures) -> Result<bool, String> {
        match self {
            QueryNode::Any() => Ok(true),
            QueryNode::UnnamedNode { kind } => {
                let candidate = ast.get_node(node).unwrap();
                match ast.id_for_unnamed_node_kind(kind) {
                    Some(wanted) => Ok(candidate.kind == wanted),
                    None => Err(format!("unnamed Node kind {} not found in language", kind)),
                }
            }
            QueryNode::Capture {
                capture,
                node: inner,
            } => {
                matches.insert(capture, node);
                inner.do_match(ast, node, matches)
            }
            QueryNode::Node { kind, children } => {
                let candidate = ast.get_node(node).unwrap();
                let wanted = match ast.id_for_node_kind(kind) {
                    Some(kind_id) => kind_id,
                    None => return Err(format!("Node kind {} not found in language", kind)),
                };
                if candidate.kind != wanted {
                    return Ok(false);
                }
                // Every listed field must match; fields absent from the node
                // are treated as having no children.
                for (field, elems) in children {
                    let field_id = match ast.field_id_for_name(field) {
                        Some(field_id) => field_id,
                        None => return Err(format!("Field {} not found in language", field)),
                    };
                    let no_children = Vec::new();
                    let mut pending = candidate
                        .fields
                        .get(&field_id)
                        .unwrap_or(&no_children)
                        .iter()
                        .cloned();
                    let field_matched = match_children(elems.iter(), ast, &mut pending, matches)?;
                    if !field_matched {
                        return Ok(false);
                    }
                }
                Ok(true)
            }
        }
    }
}
/// Matches the list elements `child_matchers` in order against the children
/// still available in `remaining_children`.
///
/// Stops with `Ok(false)` at the first element that does not match, and
/// returns `Ok(true)` when all of them match. Children left over afterwards
/// are not inspected here.
fn match_children<'a>(
    child_matchers: impl Iterator<Item = &'a QueryListElem>,
    ast: &Ast,
    remaining_children: &mut (impl Iterator<Item = Id> + Clone),
    matches: &mut Captures,
) -> Result<bool, String> {
    for matcher in child_matchers {
        let matched = matcher.do_match(ast, remaining_children, matches)?;
        if !matched {
            return Ok(false);
        }
    }
    Ok(true)
}
impl QueryListElem {
    /// Matches this list element against the front of `remaining_children`.
    ///
    /// `SingleNode` consumes one child and matches it; it fails when no child
    /// is left. `Repeated` keeps matching its sub-list until an attempt
    /// fails, at which point the child iterator and the captures are rolled
    /// back to their state before that attempt.
    fn do_match(
        &self,
        ast: &Ast,
        remaining_children: &mut (impl Iterator<Item = Id> + Clone),
        matches: &mut Captures,
    ) -> Result<bool, String> {
        match self {
            QueryListElem::SingleNode(sub_query) => match remaining_children.next() {
                Some(child) => sub_query.do_match(ast, child, matches),
                None => Ok(false),
            },
            QueryListElem::Repeated { children, rep } => {
                let mut successes = 0;
                loop {
                    // Snapshot the state so a failed attempt can be undone.
                    let saved_captures = matches.clone();
                    let saved_children = remaining_children.clone();
                    let matched =
                        match_children(children.iter(), ast, remaining_children, matches)?;
                    if !matched {
                        *remaining_children = saved_children;
                        *matches = saved_captures;
                        break;
                    }
                    successes += 1;
                    // An optional element is attempted only once.
                    if *rep == Rep::ZeroOrOne {
                        break;
                    }
                }
                // Only `OneOrMore` can fail, namely when nothing matched at all.
                let missing_required = *rep == Rep::OneOrMore && successes == 0;
                Ok(!missing_required)
            }
        }
    }
}
/// Builds a `QueryNode` from a compact pattern syntax.
///
/// Supported forms, one per arm below: `_` (wildcard), a parenthesised
/// pattern, `kind field: pattern ...` (named node with fields), a string
/// literal (unnamed node), `pattern @ name` (capture) and `@ name`
/// (capture of a wildcard).
#[macro_export]
macro_rules! query {
// `_`: matches any node
(_) => { $crate::query::QueryNode::Any()};
// Parens: unwrap and parse the contents
(($($child:tt)*)) => { query!($($child)*)};
// Match a node of a given kind; the remaining tokens are parsed as fields
($node_id:ident $($rest:tt)*) => { $crate::query::QueryNode::Node{ kind: stringify!($node_id), children: query_fields!($($rest)*)}};
// Match an unnamed node of a given kind (using a string literal)
($node_id:literal) => { $crate::query::QueryNode::UnnamedNode{ kind: $node_id}};
// Capture: a single-token pattern followed by `@ name`
($child:tt @ $capture_id:ident) => { $crate::query::QueryNode::Capture{ capture: stringify!($capture_id), node: Box::new(query!($child))}};
// Capture only (implicit _)
(@ $capture_id:ident) => { $crate::query::QueryNode::Capture{ capture: stringify!($capture_id), node: Box::new($crate::query::QueryNode::Any())}};
}
// We use an accumulator to build up the list of children incrementally, so this
// entry point starts the tail recursion with an empty accumulator.
/// Parses a list of child patterns into a `Vec` of `QueryListElem`s by
/// delegating to `_query_list!`.
#[macro_export]
macro_rules! query_list {
($($rest:tt)*) => { _query_list!( @ACC [] $($rest)* )};
}
/// Parses a sequence of `field: pattern` / `field*: (patterns)` entries into a
/// `Vec` of `(field name, elements)` pairs by delegating to `_query_fields!`
/// with an empty accumulator.
#[macro_export]
macro_rules! query_fields {
($($rest:tt)*) => { _query_fields!( @ACC [] $($rest)* )};
}
/// Accumulator-style helper behind `query_fields!`; not meant to be invoked directly.
#[macro_export]
macro_rules! _query_fields {
// vec! allows a trailing comma, so we assume that the accumulator is either empty or ends in a comma
// Base case: no more tokens, so return the accumulator
(@ACC [$($acc:tt)*]) => { vec![$($acc)*]};
// Parse field * : (nodeList) -- the field's children are matched by a list of elements
(@ACC [$($acc:tt)*] $field_name:ident * : ($($sub_node:tt)*) $($rest:tt)*) => { _query_fields!( @ACC [ $($acc)* (stringify!($field_name), query_list!($($sub_node)*)),] $($rest)*)};
// Parse field : node -- the field is matched by a single-node element
(@ACC [$($acc:tt)*] $field_name:ident : $sub_node:tt $($rest:tt)*) => { _query_fields!( @ACC [ $($acc)* (stringify!($field_name), vec![$crate::query::QueryListElem::SingleNode(query!($sub_node))]),] $($rest)* )};
}
/// Accumulator-style helper behind `query_list!`; not meant to be invoked directly.
#[macro_export]
macro_rules! _query_list {
// vec! allows a trailing comma, so we assume that the accumulator is either empty or ends in a comma
// Base case: no more tokens, so return the accumulator
(@ACC [$($acc:tt)*]) => { vec![$($acc)*]};
// Parse (nodeList)* -- zero or more repetitions
(@ACC [$($acc:tt)*] ($($sub_node:tt)*) * $($rest:tt)*) => { _query_list!( @ACC [ $($acc)* $crate::query::QueryListElem::Repeated{children: query_list!($($sub_node)*), rep: $crate::query::Rep::ZeroOrMore},] $($rest)*)};
// Parse (nodeList)+ -- one or more repetitions
(@ACC [$($acc:tt)*] ($($sub_node:tt)*) + $($rest:tt)*) => { _query_list!( @ACC [ $($acc)* $crate::query::QueryListElem::Repeated{children: query_list!($($sub_node)*), rep: $crate::query::Rep::OneOrMore},] $($rest)*)};
// Parse (nodeList)? -- zero or one repetition
(@ACC [$($acc:tt)*] ($($sub_node:tt)*) ? $($rest:tt)*) => { _query_list!( @ACC [ $($acc)* $crate::query::QueryListElem::Repeated{children: query_list!($($sub_node)*), rep: $crate::query::Rep::ZeroOrOne},] $($rest)*)};
// Parse node (treating @cap as a single node)
(@ACC [$($acc:tt)*] @ $sub_node:tt $($rest:tt)*) => { _query_list!( @ACC [ $($acc)* $crate::query::QueryListElem::SingleNode(query!(@$sub_node)),] $($rest)*)};
// Parse node (this must be last as it only applies if the earlier cases don't match)
(@ACC [$($acc:tt)*] $sub_node:tt $($rest:tt)*) => { _query_list!( @ACC [ $($acc)* $crate::query::QueryListElem::SingleNode(query!($sub_node)),] $($rest)*)};
}
pub use query;
pub use query_list;
#[cfg(test)]
mod tests {
use crate::query::*;
// Smoke test for the `query!` macro: each pattern form only has to expand and
// compile; the resulting queries are printed, not asserted on.
#[test]
fn it_works() {
// Wildcard.
let query1: QueryNode = query!(_);
println!("{:?}", query1);
// Named node without fields.
let query2 = query!(foo);
println!("{:?}", query2);
// Named node with a single-node field.
let query3 = query!(foo child: (_));
println!("{:?}", query3);
// Named node with a list field holding a repetition.
let query4 = query!(foo child*:((_)*));
println!("{:?}", query4);
let query5: QueryNode = query!(foo child*:((_)*));
println!("{:?}", query5);
// Captures, at top level and inside a field.
let query6: QueryNode = query!(_ @ bar);
println!("{:?}", query6);
let query7: QueryNode = query!(foo child:(_ @ bar));
println!("{:?}", query7);
// NOTE(review): this shadows the previous `query7`; probably meant to have its own name.
let query7: QueryNode = query!(foo child:(@ bar));
println!("{:?}", query7);
// Nested node patterns with captures.
let query8: QueryNode = query!((assignment
left: (element_reference
object: (@ obj)
child: (_ @ index)
)
right: (_ @ rhs)
));
println!("{:?}", query8);
// Same shape, with a repeated capture in a list field.
let query9: QueryNode = query!((assignment
left: (element_reference
object * : ((@ obj)*)
child: (_ @ index)
)
right: (_ @ rhs)
));
println!("{:?}", query9);
// Unparenthesised top-level node with a nested node pattern.
let query10 = query!(
program
child: (assignment
left: (@left)
right: (@right))
);
println!("{:?}", query10);
}
}

21
shared/yeast/src/range.rs Normal file
View File

@@ -0,0 +1,21 @@
//! (de)-serialize helpers for tree_sitter::Range
use serde::{Deserialize, Serialize};
/// Serde "remote" definition for `tree_sitter::Point`; use it via
/// `#[serde(with = "Point")]`.
#[derive(Serialize, Deserialize)]
#[serde(remote = "tree_sitter::Point")]
pub struct Point {
pub row: usize,
pub column: usize,
}
/// Serde "remote" definition for `tree_sitter::Range`; its two points are
/// (de)serialized through the `Point` helper of this module.
#[derive(Serialize, Deserialize)]
#[serde(remote = "tree_sitter::Range")]
pub struct Range {
pub start_byte: usize,
pub end_byte: usize,
#[serde(with = "Point")]
pub start_point: tree_sitter::Point,
#[serde(with = "Point")]
pub end_point: tree_sitter::Point,
}

133
shared/yeast/src/rules.rs Normal file
View File

@@ -0,0 +1,133 @@
use std::cell::Cell;
use std::rc::Rc;
use crate::{captures::Captures, *};
/// Builds the desugaring rules defined in this module, in application order:
///
/// 1. an `assignment` whose left side is a `left_assignment_list` becomes an
///    assignment of the right side to a fresh temporary, followed by one
///    assignment per captured identifier reading
///    `element_reference(tmp, index)`;
/// 2. a `for` loop becomes a `call` of `each` on the iterated value, with a
///    block whose parameter is a fresh temporary that is assigned to the loop
///    pattern before the original body;
/// 3. every unnamed `"end"` token is removed.
///
/// Both transforms that need temporaries draw from one shared counter, so the
/// generated `tmp-N` names do not collide between the two rules.
///
/// # Panics
/// The returned transforms panic if building the replacement trees fails.
pub fn rules() -> Vec<Rule> {
    // One counter, shared by the two closures below through `Rc<Cell<_>>`.
    let fresh_ids = Rc::new(Cell::new(0));
    let fresh_ids2: Rc<Cell<i32>> = fresh_ids.clone();

    let assign_query = query!(
        (assignment
            left: (
                left_assignment_list child*: ((((identifier) @ left) (",")?)*)
            )
            right: (@right)
        )
    );
    let assign_transform = move |ast: &mut Ast, mut match_: Captures| {
        let fresh = fresh_ids.get();
        fresh_ids.set(fresh + 1);
        let new_ident = format!("tmp-{}", fresh);
        match_.insert(
            "tmp_lhs",
            ast.create_named_token("identifier", new_ident.clone()),
        );

        // Turn every captured `left` identifier into `lhs = tmp[index]` and
        // collect the results under the `assigns` capture.
        let mut i = 0;
        match_.map_captures_to("left", "assigns", &mut |old_id| {
            let mut local_capture = Captures::new();
            local_capture.insert("lhs", old_id);
            local_capture.insert(
                "tmp",
                ast.create_named_token("identifier", new_ident.clone()),
            );
            let index: i32 = i;
            i += 1;
            local_capture.insert(
                "index",
                ast.create_named_token("integer", index.to_string()),
            );
            tree_builder!(
                (assignment
                    left: (@lhs)
                    right: (
                        element_reference
                        object: (@tmp)
                        child: (@index)
                    )
                )
            )
            .build_tree(ast, &local_capture)
            .unwrap()
        });

        // `tmp = rhs`, followed by the per-identifier assignments built above.
        trees_builder!(
            (assignment
                left: (@tmp_lhs)
                right: (@right)
            )
            (
                @assigns
            )*
        )
        .build_trees(ast, &match_)
        .unwrap()
    };
    let assign_rule = Rule::new(assign_query, Box::new(assign_transform));

    // TODO: There is a spurious end token
    let for_query = query!(
        (for
            pattern: (@pat)
            value: (in child*: ("in" @val))
            body: (do child*: (("do")? (@body)*))
        )
    );
    let for_transform = move |ast: &mut Ast, mut match_: Captures| {
        let fresh = fresh_ids2.get();
        fresh_ids2.set(fresh + 1);
        let new_ident = format!("tmp-{}", fresh);
        // The temporary appears twice (block parameter and right-hand side of
        // the pattern assignment), so two separate tokens are created for it.
        match_.insert(
            "tmp_rhs",
            ast.create_named_token("identifier", new_ident.clone()),
        );
        match_.insert(
            "tmp_param",
            ast.create_named_token("identifier", new_ident),
        );
        match_.insert(
            "each",
            ast.create_named_token("identifier", "each".to_string()),
        );
        trees_builder!(
            (call
                receiver: (@val)
                method: (@each)
                block: (block
                    parameters: (
                        block_parameters
                        child: (@tmp_param)
                    )
                    body: (block_body
                        child*: (
                            (assignment
                                left: (@pat)
                                right: (@tmp_rhs)
                            )
                            (@body)*
                        )
                    )
                )
            )
        )
        .build_trees(ast, &match_)
        .unwrap()
    };
    let for_rule = Rule::new(for_query, Box::new(for_transform));

    // Just get rid of all end tokens as they aren't needed
    let end_query = query!(("end"));
    let end_transform = |_ast: &mut Ast, _match: Captures| vec![];
    let end_rule = Rule::new(end_query, Box::new(end_transform));

    vec![assign_rule, for_rule, end_rule]
}

View File

@@ -0,0 +1,170 @@
use crate::{captures::Captures, Ast, Id};
use std::collections::BTreeSet;
/// A template for a single AST node.
#[derive(Debug, Clone)]
pub enum TreeBuilder {
/// Creates a new node of kind `kind`; `children` lists, per field name, the
/// child templates used to fill that field.
Node {
kind: &'static str,
children: Vec<(&'static str, Vec<TreeChildBuilder>)>,
},
/// Stands for the node bound to the capture named `capture`.
Capture {
capture: &'static str,
},
}
/// One entry in a list of child templates.
#[derive(Debug, Clone)]
pub enum TreeChildBuilder {
/// Builds `child` once per capture set produced by un-starring the
/// captures it mentions.
Repeated {
child: TreeBuilder,
},
/// Builds exactly one node.
SingleNode(TreeBuilder),
}
impl TreeChildBuilder {
    /// Names of all captures mentioned by the wrapped template.
    fn get_opt_contained(&self) -> BTreeSet<&'static str> {
        let inner = match self {
            TreeChildBuilder::Repeated { child } => child,
            TreeChildBuilder::SingleNode(node) => node,
        };
        inner.get_opt_contained()
    }

    /// Builds this entry in `target` and appends the resulting node ids to
    /// `child_ids`.
    ///
    /// A `SingleNode` appends one id. A `Repeated` entry appends one id per
    /// capture set returned by `vars.un_star` for the captures it mentions.
    ///
    /// # Errors
    /// Propagates errors from `un_star` and from building the templates.
    fn build_tree(
        &self,
        target: &mut Ast,
        vars: &Captures,
        child_ids: &mut Vec<Id>,
    ) -> Result<(), String> {
        match self {
            TreeChildBuilder::SingleNode(node) => {
                let id = node.build_tree(target, vars)?;
                child_ids.push(id);
            }
            TreeChildBuilder::Repeated { child } => {
                let repeated_ids = self.get_opt_contained();
                for sub_captures in vars.un_star(&repeated_ids)? {
                    let id = child.build_tree(target, &sub_captures)?;
                    child_ids.push(id);
                }
            }
        }
        Ok(())
    }
}
impl TreeBuilder {
    /// Names of all captures mentioned anywhere in this template.
    fn get_opt_contained(&self) -> BTreeSet<&'static str> {
        match self {
            TreeBuilder::Node { kind: _, children } => {
                let mut contained = BTreeSet::new();
                for (_, children) in children {
                    for child in children {
                        contained.extend(child.get_opt_contained());
                    }
                }
                contained
            }
            TreeBuilder::Capture { capture } => {
                let mut contained = BTreeSet::new();
                contained.insert(*capture);
                contained
            }
        }
    }

    /// Instantiates this template in `target`, resolving captures through
    /// `vars`, and returns the id of the resulting node.
    ///
    /// A `Capture` resolves to the node bound in `vars`. A `Node` builds the
    /// children of each field in order and then creates a new node of the
    /// requested kind with empty string content.
    ///
    /// # Errors
    /// Returns `Err` when the node kind or a field name is unknown to the
    /// target's language, or when building a child or resolving a capture
    /// fails.
    pub fn build_tree(&self, target: &mut Ast, vars: &Captures) -> Result<Id, String> {
        match self {
            TreeBuilder::Capture { capture } => vars.get_var(capture),
            TreeBuilder::Node { kind, children } => {
                let ast_kind = target
                    .id_for_node_kind(kind)
                    .ok_or_else(|| format!("Node kind {} does not exist in language", kind))?;
                let child_vars = children
                    .iter()
                    .map(|(field, children)| {
                        let mut child_ids = Vec::new();
                        for child in children {
                            child.build_tree(target, vars, &mut child_ids)?;
                        }
                        // The field is looked up after its children are built,
                        // and the message is only formatted on failure.
                        let field_id = target
                            .field_id_for_name(field)
                            .ok_or_else(|| format!("Field {} does not exist in language", field))?;
                        Ok((field_id, child_ids))
                    })
                    .collect::<Result<_, String>>()?;
                Ok(target.create_node(ast_kind, "".into(), child_vars, true))
            }
        }
    }
}
/// Builds a `TreeBuilder` template from a compact syntax: a parenthesised
/// template, `kind field: template ...` (new node), or `@ name` (captured node).
#[macro_export]
macro_rules! tree_builder {
// Parens: unwrap and parse the contents
(($($child:tt)*)) => { tree_builder!($($child)*)};
// Build a node of a given kind; the remaining tokens are parsed as fields
($node_id:ident $($rest:tt)*) => { $crate::tree_builder::TreeBuilder::Node{ kind: stringify!($node_id), children: tree_builder_fields!($($rest)*)}};
// Refer to a captured node
(@ $capture_id:ident) => { $crate::tree_builder::TreeBuilder::Capture{ capture: stringify!($capture_id)}};
}
// We use an accumulator to build up the list of children incrementally, so this
// entry point starts the tail recursion with an empty accumulator.
/// Parses a list of child templates into a `Vec` of `TreeChildBuilder`s; an
/// empty input yields an empty `Vec`.
#[macro_export]
macro_rules! tree_builder_child {
() => { Vec::new()};
($($rest:tt)*) => { _tree_builder_child!( @ACC [] $($rest)* )};
}
/// Accumulator-style helper behind `tree_builder_child!` and `trees_builder!`;
/// not meant to be invoked directly.
#[macro_export]
macro_rules! _tree_builder_child {
// vec! allows a trailing comma, so we assume that the accumulator is either empty or ends in a comma
// Base case: no more tokens, so return the accumulator
(@ACC [$($acc:tt)*]) => { vec![$($acc)*]};
// NOTE(review): the next two arms expand to `TreeChildBuilder::Field { .. }`, but the
// `TreeChildBuilder` enum in this file only has `Repeated` and `SingleNode`. They look
// like leftovers; any input that reaches them should fail to compile -- confirm and remove.
// Parse field* : node
(@ACC [$($acc:tt)*] $field_name:ident * : ($($sub_node:tt)*) $($rest:tt)*) => { _tree_builder_child!( @ACC [ $($acc)* $crate::tree_builder::TreeChildBuilder::Field{field_name: stringify!($field_name), node: tree_builder_child!($($sub_node)*)},] $($rest)*)};
// Parse field : node
(@ACC [$($acc:tt)*] $field_name:ident : $sub_node:tt $($rest:tt)*) => { _tree_builder_child!( @ACC [ $($acc)* $crate::tree_builder::TreeChildBuilder::Field{field_name: stringify!($field_name), node: vec![$crate::tree_builder::TreeChildBuilder::SingleNode(tree_builder!($sub_node))]},] $($rest)*)};
// Parse (node)* -- a template repeated once per capture set
(@ACC [$($acc:tt)*] $sub_node:tt * $($rest:tt)*) => { _tree_builder_child!( @ACC [ $($acc)* $crate::tree_builder::TreeChildBuilder::Repeated{child: tree_builder!($sub_node)},] $($rest)*)};
// Parse node (this must be last as it only applies if the earlier cases don't match)
(@ACC [$($acc:tt)*] $sub_node:tt $($rest:tt)*) => { _tree_builder_child!( @ACC [ $($acc)* $crate::tree_builder::TreeChildBuilder::SingleNode(tree_builder!($sub_node)),] $($rest)*)};
}
/// Accumulator-style helper behind `tree_builder_fields!`; not meant to be
/// invoked directly.
#[macro_export]
macro_rules! _tree_builder_fields {
// vec! allows a trailing comma, so we assume that the accumulator is either empty or ends in a comma
// Base case: no more tokens, so return the accumulator
(@ACC [$($acc:tt)*]) => { vec![$($acc)*]};
// Parse field* : (nodes) -- the field is filled from a list of child templates
(@ACC [$($acc:tt)*] $field_name:ident * : ($($sub_node:tt)*) $($rest:tt)*) => { _tree_builder_fields!( @ACC [ $($acc)* (stringify!($field_name), tree_builder_child!($($sub_node)*)),] $($rest)*)};
// Parse field : node -- the field is filled with a single node
(@ACC [$($acc:tt)*] $field_name:ident : $sub_node:tt $($rest:tt)*) => { _tree_builder_fields!( @ACC [ $($acc)* (stringify!($field_name), vec![$crate::tree_builder::TreeChildBuilder::SingleNode(tree_builder!($sub_node))]),] $($rest)*)};
}
/// Parses `field: template` / `field*: (templates)` entries into a `Vec` of
/// `(field name, child templates)` pairs by delegating to
/// `_tree_builder_fields!` with an empty accumulator.
#[macro_export]
macro_rules! tree_builder_fields {
($($all:tt)*) => { _tree_builder_fields!( @ACC [] $($all)*)};
}
/// A template for a sequence of sibling trees.
pub struct TreesBuilder {
/// Child templates, built in order by `build_trees`.
pub children: Vec<TreeChildBuilder>,
}
impl TreesBuilder {
    /// Builds every child template in order inside `target`, resolving
    /// captures through `vars`, and returns the ids of all nodes produced.
    ///
    /// # Errors
    /// Stops at, and returns, the first error reported by a child template.
    pub fn build_trees(&self, target: &mut Ast, vars: &Captures) -> Result<Vec<Id>, String> {
        let mut roots = Vec::new();
        self.children
            .iter()
            .try_for_each(|child| child.build_tree(target, vars, &mut roots))?;
        Ok(roots)
    }
}
/// Builds a `TreesBuilder` from a list of child templates; an empty input
/// yields a builder without children.
#[macro_export]
macro_rules! trees_builder {
() => { $crate::tree_builder::TreesBuilder { children: Vec::new()}};
($($rest:tt)*) => {$crate::tree_builder::TreesBuilder { children: _tree_builder_child!( @ACC [] $($rest)* )}};
}
pub use tree_builder;
pub use tree_builder_child;
pub use trees_builder;

110
shared/yeast/src/visitor.rs Normal file
View File

@@ -0,0 +1,110 @@
use std::collections::BTreeMap;
use tree_sitter::{Language, Tree};
use crate::{Ast, Id, Node, NodeContent, CHILD_FIELD};
/// A node under construction together with the id of its parent, if any.
#[derive(Debug)]
struct VisitorNode {
// The node that ends up in the final `Ast`.
inner: Node,
// The node that was current when this one was added; `None` for the first node.
parent: Option<Id>,
}
/// A type that can walk a TS tree and produce an `Ast`.
#[derive(Debug)]
pub(crate) struct Visitor {
// Nodes in the order they were entered; a node's id is its index here.
nodes: Vec<VisitorNode>,
// The node currently being visited, if any.
current: Option<Id>,
// Language used to resolve node kind ids and field ids.
language: Language,
}
impl Visitor {
pub fn new(language: Language) -> Self {
Self {
nodes: Vec::new(),
current: None,
language,
}
}
pub fn visit(&mut self, tree: &Tree) {
let cursor = &mut tree.walk();
self.enter_node(cursor.node());
let mut recurse = true;
loop {
if recurse && cursor.goto_first_child() {
recurse = self.enter_node(cursor.node());
} else {
self.leave_node(cursor.field_name(), cursor.node());
if cursor.goto_next_sibling() {
recurse = self.enter_node(cursor.node());
} else if cursor.goto_parent() {
recurse = false;
} else {
break;
}
}
}
}
pub fn build(self) -> Ast {
Ast {
root: self.nodes[0].inner.id, // this is likely always just 0
language: self.language,
nodes: self.nodes.into_iter().map(|n| n.inner).collect(),
}
}
fn add_node(&mut self, n: tree_sitter::Node<'_>, content: NodeContent, is_named: bool) -> Id {
let id = self.nodes.len();
self.nodes.push(VisitorNode {
inner: Node {
id,
kind: self.language.id_for_node_kind(n.kind(), is_named),
kind_name: n.kind(),
content,
fields: BTreeMap::new(),
is_missing: n.is_missing(),
is_named: n.is_named(),
is_extra: n.is_extra(),
is_error: n.is_error(),
},
parent: self.current,
});
id
}
fn enter_node(&mut self, node: tree_sitter::Node<'_>) -> bool {
let id = self.add_node(node, node.range().into(), node.is_named());
self.current = Some(id);
true
}
fn leave_node(&mut self, field_name: Option<&'static str>, _node: tree_sitter::Node<'_>) {
let node = self.current.map(|i| &self.nodes[i]).unwrap();
let node_id = node.inner.id;
let node_parent = node.parent;
if let Some(parent_id) = node.parent {
let parent = self.nodes.get_mut(parent_id).unwrap();
if let Some(field) = field_name {
let field_id = self.language.field_id_for_name(field).unwrap().get();
parent
.inner
.fields
.entry(field_id)
.or_default()
.push(node_id);
} else {
parent
.inner
.fields
.entry(CHILD_FIELD)
.or_default()
.push(node_id);
}
}
self.current = node_parent;
}
}

View File

@@ -0,0 +1,68 @@
{
"program": {
"content": "x, y, z = foo()\n",
"rest": [
{
"assignment": {
"content": "x, y, z = foo()",
"left": [
{
"left_assignment_list": {
"content": "x, y, z",
"rest": [
{
"identifier": "x"
},
{
",": ","
},
{
"identifier": "y"
},
{
",": ","
},
{
"identifier": "z"
}
]
}
}
],
"rest": [
{
"=": "="
}
],
"right": [
{
"call": {
"arguments": [
{
"argument_list": {
"content": "()",
"rest": [
{
"(": "("
},
{
")": ")"
}
]
}
}
],
"content": "foo()",
"method": [
{
"identifier": "foo"
}
]
}
}
]
}
}
]
}
}

1
shared/yeast/tests/fixtures/1.rb vendored Normal file
View File

@@ -0,0 +1 @@
x, y, z = foo()

View File

@@ -0,0 +1,68 @@
{
"program": {
"content": "",
"rest": [
{
"assignment": {
"content": "",
"left": [
{
"call": {
"arguments": [
{
"argument_list": {
"content": "()",
"rest": [
{
"(": "("
},
{
")": ")"
}
]
}
}
],
"content": "foo()",
"method": [
{
"identifier": "foo"
}
]
}
}
],
"rest": [
{
"=": "="
}
],
"right": [
{
"left_assignment_list": {
"content": "x, y, z",
"rest": [
{
"identifier": "x"
},
{
",": ","
},
{
"identifier": "y"
},
{
",": ","
},
{
"identifier": "z"
}
]
}
}
]
}
}
]
}
}

View File

@@ -0,0 +1,85 @@
{
"program": {
"content": "for a, b in pairs_list do\n x=y\nend",
"rest": [
{
"for": {
"body": [
{
"do": {
"content": "do\n x=y\nend",
"rest": [
{
"do": "do"
},
{
"assignment": {
"content": "x=y",
"left": [
{
"identifier": "x"
}
],
"rest": [
{
"=": "="
}
],
"right": [
{
"identifier": "y"
}
]
}
},
{
"end": "end"
}
]
}
}
],
"content": "for a, b in pairs_list do\n x=y\nend",
"pattern": [
{
"left_assignment_list": {
"content": "a, b",
"rest": [
{
"identifier": "a"
},
{
",": ","
},
{
"identifier": "b"
}
]
}
}
],
"rest": [
{
"for": "for"
}
],
"value": [
{
"in": {
"content": "in pairs_list",
"rest": [
{
"in": "in"
},
{
"identifier": "pairs_list"
}
]
}
}
]
}
}
]
}
}

View File

@@ -0,0 +1,139 @@
{
"program": {
"content": "for a, b in pairs_list do\n x=y\nend",
"rest": [
{
"call": {
"block": [
{
"block": {
"body": [
{
"block_body": {
"content": "",
"rest": [
{
"assignment": {
"content": "",
"left": [
{
"identifier": "tmp-1"
}
],
"right": [
{
"identifier": "tmp-0"
}
]
}
},
{
"assignment": {
"content": "",
"left": [
{
"identifier": "a"
}
],
"right": [
{
"element_reference": {
"content": "",
"object": [
{
"identifier": "tmp-1"
}
],
"rest": [
{
"integer": "0"
}
]
}
}
]
}
},
{
"assignment": {
"content": "",
"left": [
{
"identifier": "b"
}
],
"right": [
{
"element_reference": {
"content": "",
"object": [
{
"identifier": "tmp-1"
}
],
"rest": [
{
"integer": "1"
}
]
}
}
]
}
},
{
"assignment": {
"content": "x=y",
"left": [
{
"identifier": "x"
}
],
"rest": [
{
"=": "="
}
],
"right": [
{
"identifier": "y"
}
]
}
}
]
}
}
],
"content": "",
"parameters": [
{
"block_parameters": {
"content": "",
"rest": [
{
"identifier": "tmp-0"
}
]
}
}
]
}
}
],
"content": "",
"method": [
{
"identifier": "each"
}
],
"receiver": [
{
"identifier": "pairs_list"
}
]
}
}
]
}
}

160
shared/yeast/tests/test.rs Normal file
View File

@@ -0,0 +1,160 @@
#![cfg(test)]
use std::cell::Cell;
use std::fs::read_to_string;
use std::path::Path;
use std::rc::Rc;
use yeast::{captures::Captures, print::Printer, *, rules::rules};
#[test]
fn test_ruby_multiple_assignment() {
    // The input is a `for` loop with a multiple-assignment pattern, so the
    // Ruby rules from `rules()` are exercised together: the `for` rule turns
    //
    //   for a, b in pairs_list do ... end
    //
    // into a call of `each` on `pairs_list` with a block, and the assignment
    // rule then rewrites an assignment to a list such as
    //
    //   x, y, z = e
    //
    // into
    //
    //   tmp = e
    //   x = tmp[0]
    //   y = tmp[1]
    //   z = tmp[2]
    let input = "for a, b in pairs_list do\n x=y\nend";

    // Construct the thing that runs our desugaring process
    let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), rules());

    // Node 0 is the root of the tree as parsed, before any rewriting.
    let old_root = 0;

    // Run it on our example
    let ast = runner.run(input);
    let new_root = ast.get_root();

    let formatted_input = serde_json::to_string_pretty(&ast.print(input, old_root)).unwrap();
    let formatted_output = serde_json::to_string_pretty(&ast.print(input, new_root)).unwrap();
    println!("before transformation: {}", formatted_input);
    println!("after transformation: {}", formatted_output);

    assert_eq!(
        formatted_input,
        read_to_string("tests/fixtures/multiple_assignment.input.json").unwrap()
    );
    assert_eq!(
        formatted_output,
        read_to_string("tests/fixtures/multiple_assignment.output.json").unwrap()
    );
}
/// Parsing `1.rb` without any rules must print exactly the `1.parsed.json` fixture.
#[test]
fn test_parse_input() {
    let source = read_to_string("tests/fixtures/1.rb").unwrap();
    let expected_json = read_to_string("tests/fixtures/1.parsed.json").unwrap();

    // No rules: the AST is only copied, not rewritten.
    let ast = Runner::new(tree_sitter_ruby::LANGUAGE.into(), vec![]).run(&source);

    let printed = ast.print(&source, ast.get_root());
    let actual_json = serde_json::to_string_pretty(&printed).unwrap();
    assert_eq!(actual_json, expected_json);
}
/// Matches a query against the parsed `1.rb`, rebuilds the program with the
/// two sides of the assignment swapped, and compares the printed result with
/// the `1.rewritten.json` fixture.
#[test]
fn test_query_input() {
    let input = read_to_string("tests/fixtures/1.rb").unwrap();
    let rewritten_expected = read_to_string("tests/fixtures/1.rewritten.json").unwrap();
    let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), vec![]);
    let mut ast = runner.run(&input);

    let query = yeast::query::query!(
        program child:(
            (assignment
                left: (@left)
                right: (@right)
                child*: ((@rest)*)
            )
        )
    );
    print!("query: {:?}", query);

    let mut matches = Captures::new();
    if query.do_match(&ast, ast.get_root(), &mut matches).unwrap() {
        println!("match: {:?}", matches);
    } else {
        println!("no match");
    }

    // Same shape as the query, with `left` and `right` swapped.
    let builder = yeast::tree_builder::tree_builder!(
        program child:
            (assignment
                left: (@right)
                right: (@left)
                child*:((@rest)*)
            )
    );
    let new_id = builder.build_tree(&mut ast, &matches).unwrap();
    let rewritten_actual = serde_json::to_string_pretty(&ast.print(&input, new_id)).unwrap();

    // The fixture is deliberately NOT written here: overwriting it with the
    // actual output on every run would make this comparison meaningless.
    // To regenerate it, temporarily call
    // `write_expected("tests/fixtures/1.rewritten.json", &rewritten_actual)`.
    assert_eq!(rewritten_actual, rewritten_expected);
}
/// Writes `content` to `file`, creating the file or replacing its previous
/// contents. Handy for refreshing a fixture:
/// ```
/// write_expected("tests/fixtures/1.parsed.json", &parsed_actual);
/// ```
///
/// # Panics
/// Panics if the file cannot be created or written.
fn write_expected<P: AsRef<Path>>(file: P, content: &str) {
    std::fs::write(file, content.as_bytes()).unwrap();
}
// Walks the AST of `1.rb` with an `AstCursor` and checks which node each move
// lands on, then prints the whole tree with `Printer`.
// NOTE(review): the node ids (26, 19, 14, 15) and the field id 18 are hard-coded;
// presumably they depend on the order in which nodes are created and on the Ruby
// grammar's field numbering -- expect to update them if either changes.
#[test]
fn test_cursor() {
let input = read_to_string("tests/fixtures/1.rb").unwrap();
let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), vec![]);
let ast = runner.run(&input);
let mut cursor = AstCursor::new(&ast);
// A fresh cursor starts at the root and reports no field.
assert_eq!(cursor.node().id(), ast.get_root());
assert_eq!(cursor.field_id(), None);
assert!(cursor.goto_first_child());
assert_eq!(cursor.node().id(), 26);
// A failed move leaves the cursor where it was.
assert!(!cursor.goto_next_sibling());
assert_eq!(cursor.node().id(), 26);
assert!(cursor.goto_first_child());
assert_eq!(cursor.node().id(), 19);
assert!(cursor.goto_first_child());
assert_eq!(cursor.node().id(), 14);
// Node 14 has no children, so the cursor stays on it.
assert!(!cursor.goto_first_child());
assert_eq!(cursor.node().id(), 14);
assert!(cursor.goto_next_sibling());
assert_eq!(cursor.node().id(), 15);
assert_eq!(cursor.field_id(), Some(CHILD_FIELD));
// Moving up returns to node 19 and reports the field it sits in.
assert!(cursor.goto_parent());
assert_eq!(cursor.node().id(), 19);
assert_eq!(cursor.field_id(), Some(18));
// Finally, exercise the printer on a fresh cursor over the same AST.
let cursor = AstCursor::new(&ast);
let mut printer = Printer {};
printer.visit(cursor);
}