From 356fd97b4ebfff70450b4b1696676ff253108570 Mon Sep 17 00:00:00 2001 From: Taus Date: Mon, 4 May 2026 13:13:08 +0000 Subject: [PATCH] Add yeast crate: AST desugaring framework with proc-macro DSL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit YEAST (Yet another Elaborator for Abstract Syntax Trees) is a framework for transforming tree-sitter parse trees before CodeQL extraction. Core components: - shared/yeast/ — Ast, Node, Schema, query matching engine, captures, FreshScope, BuildCtx - shared/yeast-macros/ — proc macros: query!, tree!, trees!, rule! The query language is inspired by tree-sitter queries: (assignment left: (_) @lhs right: (_) @rhs) Templates support embedded Rust ({expr}), splicing ({..expr}), computed literals (#{expr}), fresh identifiers ($name), and captures (@name). The rule! macro combines query and transform: rule!((for pattern: (_) @pat ...) => (call receiver: {val} ...)) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Cargo.lock | 40 +- Cargo.toml | 2 + shared/yeast-macros/Cargo.toml | 12 + shared/yeast-macros/src/lib.rs | 105 +++++ shared/yeast-macros/src/parse.rs | 772 +++++++++++++++++++++++++++++++ shared/yeast/.envrc | 1 + shared/yeast/.gitignore | 1 + shared/yeast/.gitkeep | 0 shared/yeast/Cargo.lock | 357 ++++++++++++++ shared/yeast/Cargo.toml | 15 + shared/yeast/src/bin/main.rs | 26 ++ shared/yeast/src/build.rs | 79 ++++ shared/yeast/src/captures.rs | 105 +++++ shared/yeast/src/cursor.rs | 8 + shared/yeast/src/lib.rs | 691 +++++++++++++++++++++++++++ shared/yeast/src/print.rs | 34 ++ shared/yeast/src/query.rs | 230 +++++++++ shared/yeast/src/range.rs | 21 + shared/yeast/src/schema.rs | 132 ++++++ shared/yeast/src/tree_builder.rs | 37 ++ shared/yeast/src/visitor.rs | 111 +++++ 21 files changed, 2775 insertions(+), 4 deletions(-) create mode 100644 shared/yeast-macros/Cargo.toml create mode 100644 shared/yeast-macros/src/lib.rs create mode 100644 
shared/yeast-macros/src/parse.rs create mode 100644 shared/yeast/.envrc create mode 100644 shared/yeast/.gitignore create mode 100644 shared/yeast/.gitkeep create mode 100644 shared/yeast/Cargo.lock create mode 100644 shared/yeast/Cargo.toml create mode 100644 shared/yeast/src/bin/main.rs create mode 100644 shared/yeast/src/build.rs create mode 100644 shared/yeast/src/captures.rs create mode 100644 shared/yeast/src/cursor.rs create mode 100644 shared/yeast/src/lib.rs create mode 100644 shared/yeast/src/print.rs create mode 100644 shared/yeast/src/query.rs create mode 100644 shared/yeast/src/range.rs create mode 100644 shared/yeast/src/schema.rs create mode 100644 shared/yeast/src/tree_builder.rs create mode 100644 shared/yeast/src/visitor.rs diff --git a/Cargo.lock b/Cargo.lock index b6456c84106..38a3c806fb2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -416,6 +416,7 @@ dependencies = [ "tree-sitter", "tree-sitter-json", "tree-sitter-ql", + "yeast", "zstd", ] @@ -2470,7 +2471,6 @@ version = "1.0.145" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" dependencies = [ - "indexmap 2.11.4", "itoa", "memchr", "ryu", @@ -2853,14 +2853,13 @@ dependencies = [ [[package]] name = "tree-sitter" -version = "0.25.9" +version = "0.24.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccd2a058a86cfece0bf96f7cce1021efef9c8ed0e892ab74639173e5ed7a34fa" +checksum = "a5387dffa7ffc7d2dae12b50c6f7aab8ff79d6210147c6613561fc3d474c6f75" dependencies = [ "cc", "regex", "regex-syntax", - "serde_json", "streaming-iterator", "tree-sitter-language", ] @@ -2891,6 +2890,16 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4013970217383f67b18aef68f6fb2e8d409bc5755227092d32efb0422ba24b8" +[[package]] +name = "tree-sitter-python" +version = "0.23.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3d065aaa27f3aaceaf60c1f0e0ac09e1cb9eb8ed28e7bcdaa52129cffc7f4b04" +dependencies = [ + "cc", + "tree-sitter-language", +] + [[package]] name = "tree-sitter-ql" version = "0.23.1" @@ -3367,6 +3376,29 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" +[[package]] +name = "yeast" +version = "0.1.0" +dependencies = [ + "clap", + "serde", + "serde_json", + "serde_yaml", + "tree-sitter", + "tree-sitter-python", + "tree-sitter-ruby", + "yeast-macros", +] + +[[package]] +name = "yeast-macros" +version = "0.1.0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "yoke" version = "0.8.0" diff --git a/Cargo.toml b/Cargo.toml index 58a755340b9..1e2be0d9ca5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,8 @@ resolver = "2" members = [ "shared/tree-sitter-extractor", + "shared/yeast", + "shared/yeast-macros", "ruby/extractor", "rust/extractor", "rust/extractor/macros", diff --git a/shared/yeast-macros/Cargo.toml b/shared/yeast-macros/Cargo.toml new file mode 100644 index 00000000000..30c82d03b6e --- /dev/null +++ b/shared/yeast-macros/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "yeast-macros" +version = "0.1.0" +edition = "2021" + +[lib] +proc-macro = true + +[dependencies] +proc-macro2 = "1.0" +quote = "1.0" +syn = "2.0" diff --git a/shared/yeast-macros/src/lib.rs b/shared/yeast-macros/src/lib.rs new file mode 100644 index 00000000000..0c264ee13c8 --- /dev/null +++ b/shared/yeast-macros/src/lib.rs @@ -0,0 +1,105 @@ +use proc_macro::TokenStream; +use proc_macro2::TokenStream as TokenStream2; + +mod parse; + +/// Proc macro for constructing a `QueryNode` from a tree-sitter-inspired pattern. 
+///
+/// # Syntax
+///
+/// ```text
+/// (_)                      - match any named node (skips unnamed tokens)
+/// (kind)                   - match a named node of the given kind
+/// ("literal")              - match an unnamed token by its text
+/// (kind field: (pattern))  - match with named field
+/// (kind (pat) (pat)...)    - match unnamed children (after all fields)
+/// (pattern) @capture       - capture the matched node
+/// (pattern)* @capture      - capture each repeated match
+/// (pattern)+               - one or more
+/// (pattern)?               - zero or one
+/// ```
+#[proc_macro]
+pub fn query(input: TokenStream) -> TokenStream {
+    expand(input, parse::parse_query_top)
+}
+
+/// Build a single AST node from a template, returning its `Id`.
+///
+/// # Template syntax
+///
+/// ```text
+/// (kind "literal")       - leaf with static content
+/// (kind #{expr})         - leaf with computed content (expr.to_string())
+/// (kind $fresh)          - leaf with auto-generated unique name
+/// {expr}                 - embed a Rust expression returning Id
+/// {..expr}               - splice an iterable of Id (top level of `trees!`/`rule!`)
+/// field: {..expr}        - splice into a named field
+/// ```
+///
+/// Node templates only accept named fields; unnamed children are rejected.
+///
+/// Can be called with an explicit context or using the implicit context
+/// from an enclosing `rule!`:
+///
+/// ```text
+/// tree!(ctx, (kind ...))     // explicit BuildCtx
+/// tree!((kind ...))          // implicit context from rule!
+/// ```
+#[proc_macro]
+pub fn tree(input: TokenStream) -> TokenStream {
+    expand(input, parse::parse_tree_top)
+}
+
+/// Build a list of AST nodes from a template, returning `Vec<Id>`.
+///
+/// Like `tree!` but returns `Vec<Id>` and supports multiple top-level
+/// elements. All syntax from `tree!` is available.
+///
+/// Can be called with an explicit context or using the implicit context
+/// from an enclosing `rule!`:
+///
+/// ```text
+/// trees!(ctx, (node1 ...) (node2 ...))   // explicit BuildCtx
+/// trees!((node1 ...) (node2 ...))        // implicit context from rule!
+/// ```
+#[proc_macro]
+pub fn trees(input: TokenStream) -> TokenStream {
+    expand(input, parse::parse_trees_top)
+}
+
+/// Define a desugaring rule with query and transform in one declaration.
+///
+/// ```text
+/// rule!(
+///     (query_pattern field: (_) @name (kind)* @repeated (_)? @optional)
+///     =>
+///     (output_template field: {name} items: {..repeated})
+/// )
+///
+/// // Shorthand: captures become fields on the output node
+/// rule!((query ...) => output_kind)
+/// ```
+///
+/// Captures become Rust variables automatically:
+/// - `@name` (no quantifier) → `name: Id`
+/// - `@name` (after `*`/`+`) → `name: Vec<Id>`
+/// - `@name` (after `?`) → `name: Option<Id>`
+///
+/// `tree!` and `trees!` can be used without explicit context inside `{...}`.
+#[proc_macro]
+pub fn rule(input: TokenStream) -> TokenStream {
+    expand(input, parse::parse_rule_top)
+}
+
+/// Run `parser` over the macro input and convert the outcome back into a
+/// compiler token stream. A parse error becomes a `compile_error!`
+/// invocation carrying the error's span, so it is reported at the
+/// offending token rather than aborting the macro.
+fn expand(
+    input: TokenStream,
+    parser: fn(TokenStream2) -> syn::Result<TokenStream2>,
+) -> TokenStream {
+    parser(input.into())
+        .unwrap_or_else(|err| err.to_compile_error())
+        .into()
+}
diff --git a/shared/yeast-macros/src/parse.rs b/shared/yeast-macros/src/parse.rs
new file mode 100644
index 00000000000..05d44841f40
--- /dev/null
+++ b/shared/yeast-macros/src/parse.rs
@@ -0,0 +1,772 @@
+use proc_macro2::{Delimiter, Ident, Literal, Span, TokenStream, TokenTree};
+use quote::quote;
+use std::iter::Peekable;
+
+/// Token cursor shared by all parsers in this module.
+type Tokens = Peekable<proc_macro2::token_stream::IntoIter>;
+/// Parser result; errors are `syn::Error` so they carry a span.
+type Result<T> = std::result::Result<T, syn::Error>;
+
+// ---------------------------------------------------------------------------
+// Query parsing
+// ---------------------------------------------------------------------------
+
+/// Top-level entry: parse a single query node from the full input.
+pub fn parse_query_top(input: TokenStream) -> Result<TokenStream> {
+    let mut tokens = input.into_iter().peekable();
+    let result = parse_query_node(&mut tokens)?;
+    // A query is exactly one node; anything after it is a mistake.
+    if let Some(tok) = tokens.next() {
+        return Err(syn::Error::new_spanned(tok, "unexpected token after query"));
+    }
+    Ok(result)
+}
+
+/// Parse a single query node (possibly with a trailing `@capture`).
+fn parse_query_node(tokens: &mut Tokens) -> Result<TokenStream> {
+    let base = parse_query_atom(tokens)?;
+    // The postfix `@name` handling is shared with the list parser.
+    maybe_wrap_capture(tokens, base)
+}
+
+/// Parse a query atom: `(kind fields...)` or `(kind fields... bare_children...)`.
+/// Does not handle `@capture` — that's handled by the caller as a postfix.
+///
+/// Returns an error if the next token is not a parenthesized group, or if
+/// the group contains tokens after a complete node pattern.
+fn parse_query_atom(tokens: &mut Tokens) -> Result<TokenStream> {
+    match tokens.next() {
+        None => Err(syn::Error::new(Span::call_site(), "unexpected end of query")),
+        Some(TokenTree::Group(group)) if group.delimiter() == Delimiter::Parenthesis => {
+            let mut inner = group.stream().into_iter().peekable();
+            let result = parse_query_node_inner(&mut inner)?;
+            if let Some(tok) = inner.next() {
+                return Err(syn::Error::new_spanned(tok, "unexpected token in query node"));
+            }
+            Ok(result)
+        }
+        Some(tok) => Err(syn::Error::new_spanned(
+            tok,
+            "expected `(` in query; use `(_) @name` to capture a wildcard",
+        )),
+    }
+}
+
+/// Parse the inside of a parenthesized query node: `kind fields...` or `_` or `"lit"`.
+///
+/// Tokens that follow a complete pattern are left in `tokens`; callers
+/// decide whether leftovers are an error.
+fn parse_query_node_inner(tokens: &mut Tokens) -> Result<TokenStream> {
+    match tokens.peek() {
+        None => Err(syn::Error::new(Span::call_site(), "empty parenthesized group in query")),
+        // `_` arrives as an identifier token; compare without allocating.
+        Some(TokenTree::Ident(id)) if *id == "_" => {
+            tokens.next();
+            Ok(quote! { yeast::query::QueryNode::Any() })
+        }
+        Some(TokenTree::Literal(_)) => {
+            let lit = expect_literal(tokens)?;
+            Ok(quote! { yeast::query::QueryNode::UnnamedNode { kind: #lit } })
+        }
+        Some(TokenTree::Ident(_)) => {
+            let kind = expect_ident(tokens, "expected node kind")?;
+            let kind_str = kind.to_string();
+            let fields = parse_query_fields(tokens)?;
+            Ok(quote! {
+                yeast::query::QueryNode::Node {
+                    kind: #kind_str,
+                    children: vec![#(#fields),*],
+                }
+            })
+        }
+        Some(tok) => Err(syn::Error::new_spanned(
+            tok.clone(),
+            "expected node kind, `_`, or string literal",
+        )),
+    }
+}
+
+/// Parse zero or more field specifications and trailing bare patterns.
+/// Named fields have the form `name: pattern` and hold exactly one pattern.
+/// Bare patterns (no field name) become implicit `child` field entries and
+/// must come after all named fields: parsing stops after the bare list.
+fn parse_query_fields(tokens: &mut Tokens) -> Result<Vec<TokenStream>> {
+    let mut fields = Vec::new();
+    while tokens.peek().is_some() {
+        if peek_is_field(tokens) {
+            let field_name = expect_ident(tokens, "expected field name")?;
+            let field_str = field_name.to_string();
+
+            expect_punct(tokens, ':', "expected `:` after field name")?;
+
+            let child = parse_query_node(tokens)?;
+            fields.push(quote! {
+                (#field_str, vec![yeast::query::QueryListElem::SingleNode(#child)])
+            });
+        } else {
+            // Bare patterns — collect as implicit `child` field
+            let elems = parse_query_list(tokens)?;
+            if !elems.is_empty() {
+                fields.push(quote! {
+                    ("child", vec![#(#elems),*])
+                });
+            }
+            break;
+        }
+    }
+    Ok(fields)
+}
+
+/// Parse a list of query elements (bare children).
+/// Each element is a node pattern, possibly followed by `*`, `+`, `?`.
+fn parse_query_list(tokens: &mut Tokens) -> Result<Vec<TokenStream>> {
+    let mut elems = Vec::new();
+    while tokens.peek().is_some() {
+        // Check for parenthesized group
+        if peek_is_group(tokens, Delimiter::Parenthesis) {
+            let group = expect_group(tokens, Delimiter::Parenthesis)?;
+            let mut inner = group.stream().into_iter().peekable();
+
+            // Check for repetition after the group
+            if peek_is_repetition(tokens) {
+                let rep = expect_repetition(tokens)?;
+                // If the group starts with an identifier (a node kind or
+                // `_`) it is a single repeated node; otherwise it is a
+                // repeated list of sub-patterns.
+                let is_single_node = matches!(inner.peek(), Some(TokenTree::Ident(_)));
+                let children = if is_single_node {
+                    let node = parse_query_node_inner(&mut inner)?;
+                    vec![quote! { yeast::query::QueryListElem::SingleNode(#node) }]
+                } else {
+                    parse_query_list(&mut inner)?
+                };
+                expect_end(&mut inner, "unexpected token in query node")?;
+                let elem = quote! {
+                    yeast::query::QueryListElem::Repeated {
+                        children: vec![#(#children),*],
+                        rep: #rep,
+                    }
+                };
+                elems.push(maybe_wrap_list_capture(tokens, elem)?);
+            } else {
+                // Single parenthesized node, possibly followed by @capture
+                let node = parse_query_node_inner(&mut inner)?;
+                expect_end(&mut inner, "unexpected token in query node")?;
+                let node = maybe_wrap_capture(tokens, node)?;
+                elems.push(quote! {
+                    yeast::query::QueryListElem::SingleNode(#node)
+                });
+            }
+            continue;
+        }
+
+        // Check for string literal (unnamed node)
+        if peek_is_literal(tokens) {
+            let lit = expect_literal(tokens)?;
+            let node = quote! { yeast::query::QueryNode::UnnamedNode { kind: #lit } };
+            let elem = maybe_wrap_repetition(tokens, quote! {
+                yeast::query::QueryListElem::SingleNode(#node)
+            })?;
+            elems.push(elem);
+            continue;
+        }
+
+        // Check for bare _ (wildcard), possibly followed by @capture
+        if peek_is_underscore(tokens) {
+            tokens.next();
+            let node = quote! { yeast::query::QueryNode::Any() };
+            let node = maybe_wrap_capture(tokens, node)?;
+            let elem = maybe_wrap_repetition(tokens, quote! {
+                yeast::query::QueryListElem::SingleNode(#node)
+            })?;
+            elems.push(elem);
+            continue;
+        }
+
+        // Anything else ends the list; the caller reports leftovers.
+        break;
+    }
+    Ok(elems)
+}
+
+/// Fail with `msg` if `tokens` still has input. Used after parsing the
+/// contents of a parenthesized group so that trailing tokens are reported
+/// at their span instead of being silently dropped.
+fn expect_end(tokens: &mut Tokens, msg: &str) -> Result<()> {
+    match tokens.next() {
+        Some(tok) => Err(syn::Error::new_spanned(tok, msg)),
+        None => Ok(()),
+    }
+}
+
+// ---------------------------------------------------------------------------
+// tree! / trees! parsing — direct code generation against BuildCtx
+// ---------------------------------------------------------------------------
+
+/// Name of the context variable that `rule!` binds for nested `tree!`/`trees!`.
+const IMPLICIT_CTX: &str = "__yeast_ctx";
+
+/// Determine the context identifier: either explicit `ctx,` or the implicit
+/// `__yeast_ctx` from an enclosing `rule!`.
+///
+/// The explicit form is recognised by an identifier immediately followed by
+/// a comma; both tokens are consumed. Otherwise `tokens` is left untouched.
+fn parse_ctx_or_implicit(tokens: &mut Tokens) -> Ident {
+    // Look ahead on a clone so nothing is consumed unless the form matches.
+    let mut lookahead = tokens.clone();
+    match (lookahead.next(), lookahead.next()) {
+        (Some(TokenTree::Ident(ctx)), Some(TokenTree::Punct(p))) if p.as_char() == ',' => {
+            tokens.next(); // consume the context identifier
+            tokens.next(); // consume comma
+            ctx
+        }
+        _ => Ident::new(IMPLICIT_CTX, Span::call_site()),
+    }
+}
+
+/// Parse `tree!(ctx, (template))` or `tree!((template))` — returns single `Id`.
+pub fn parse_tree_top(input: TokenStream) -> Result<TokenStream> {
+    let mut tokens = input.into_iter().peekable();
+    let ctx = parse_ctx_or_implicit(&mut tokens);
+
+    let first = parse_direct_node(&mut tokens, &ctx)?;
+
+    if let Some(tok) = tokens.next() {
+        return Err(syn::Error::new_spanned(tok, "unexpected tokens after tree! template; use trees! for multiple nodes"));
+    }
+
+    // Wrap in a block so the expansion is a single expression.
+    Ok(quote! { { #first } })
+}
+
+/// Parse `trees!(ctx, ...)` or `trees!(...)` — returns `Vec<Id>`.
+pub fn parse_trees_top(input: TokenStream) -> Result<TokenStream> {
+    let mut tokens = input.into_iter().peekable();
+    let ctx = parse_ctx_or_implicit(&mut tokens);
+    let items = parse_direct_list(&mut tokens, &ctx)?;
+    if let Some(tok) = tokens.next() {
+        return Err(syn::Error::new_spanned(tok, "unexpected token after trees! template"));
+    }
+    Ok(quote! {
+        {
+            let mut __nodes: Vec<usize> = Vec::new();
+            #(#items)*
+            __nodes
+        }
+    })
+}
+
+/// Parse a single node template and generate code that returns an `Id`.
+/// Handles: `(kind fields...)` and `{expr}`.
+fn parse_direct_node(tokens: &mut Tokens, ctx: &Ident) -> Result<TokenStream> {
+    match tokens.peek() {
+        Some(TokenTree::Group(g)) if g.delimiter() == Delimiter::Brace => {
+            // `{expr}`: the embedded Rust expression is emitted verbatim.
+            let group = expect_group(tokens, Delimiter::Brace)?;
+            let expr = group.stream();
+            Ok(quote! { #expr })
+        }
+        Some(TokenTree::Group(g)) if g.delimiter() == Delimiter::Parenthesis => {
+            let group = expect_group(tokens, Delimiter::Parenthesis)?;
+            let mut inner = group.stream().into_iter().peekable();
+            parse_direct_node_inner(&mut inner, ctx)
+        }
+        Some(tok) => Err(syn::Error::new_spanned(tok.clone(), "expected `(` or `{` in tree template")),
+        None => Err(syn::Error::new(Span::call_site(), "unexpected end of tree template")),
+    }
+}
+
+/// Parse the inside of a parenthesized node: `kind fields...`
+/// or `kind "literal"`, `kind #{expr}` or `kind $fresh`.
+///
+/// Every field value is first bound to a temporary, in source order, and
+/// the node is built from those temporaries afterwards. The temporaries
+/// are numbered rather than named after the field, so a field that occurs
+/// more than once keeps each of its values, and raw identifiers such as
+/// `r#type` cannot produce an invalid identifier.
+fn parse_direct_node_inner(tokens: &mut Tokens, ctx: &Ident) -> Result<TokenStream> {
+    let kind = expect_ident(tokens, "expected node kind")?;
+    let kind_str = kind.to_string();
+
+    // Check for (kind "literal")
+    if peek_is_literal(tokens) {
+        let lit = expect_literal(tokens)?;
+        return Ok(quote! { #ctx.literal(#kind_str, #lit) });
+    }
+
+    // Check for (kind #{expr}) — computed literal, expr converted via .to_string()
+    if peek_is_hash(tokens) {
+        tokens.next(); // consume #
+        let group = expect_group(tokens, Delimiter::Brace)?;
+        let expr = group.stream();
+        return Ok(quote! { #ctx.literal(#kind_str, &(#expr).to_string()) });
+    }
+
+    // Check for (kind $fresh)
+    if peek_is_dollar(tokens) {
+        tokens.next(); // consume $
+        let name = expect_ident(tokens, "expected fresh variable name after $")?;
+        let name_str = name.to_string();
+        return Ok(quote! { #ctx.fresh(#kind_str, #name_str) });
+    }
+
+    let mut stmts = Vec::new();
+    let mut field_args = Vec::new();
+
+    // Named fields — compute each value into a temp, then reference it
+    while peek_is_field(tokens) {
+        let field_name = expect_ident(tokens, "expected field name")?;
+        let field_str = field_name.to_string();
+        expect_punct(tokens, ':', "expected `:` after field name")?;
+
+        // One temporary per field occurrence; the index keeps them distinct.
+        let temp = Ident::new(&format!("__field_{}", field_args.len()), Span::call_site());
+
+        // Check for field: {..expr} — splice a Vec into the field
+        let splice_expr = match tokens.peek() {
+            Some(TokenTree::Group(g)) if g.delimiter() == Delimiter::Brace => {
+                let mut inner = g.stream().into_iter().peekable();
+                if peek_is_dotdot(&inner) {
+                    inner.next(); // consume first .
+                    inner.next(); // consume second .
+                    Some(inner.collect::<TokenStream>())
+                } else {
+                    None
+                }
+            }
+            _ => None,
+        };
+
+        if let Some(expr) = splice_expr {
+            tokens.next(); // consume the `{..expr}` group inspected above
+            stmts.push(quote! { let #temp: Vec<usize> = #expr; });
+            field_args.push(quote! { (#field_str, #temp) });
+        } else {
+            let value = parse_direct_node(tokens, ctx)?;
+            stmts.push(quote! { let #temp = #value; });
+            field_args.push(quote! { (#field_str, vec![#temp]) });
+        }
+    }
+
+    // After all named fields, no other tokens are allowed.
+    // Output templates require all children to be in named fields.
+    if let Some(tok) = tokens.peek() {
+        return Err(syn::Error::new_spanned(
+            tok.clone(),
+            "expected named field (`name:`) or end of node template; \
+             output templates do not support unnamed children",
+        ));
+    }
+
+    Ok(quote! {
+        {
+            #(#stmts)*
+            #ctx.node(#kind_str, vec![#(#field_args),*])
+        }
+    })
+}
+
+/// Parse the top-level list of a `trees!` template.
+/// Each item is a node template, an `{expr}` node or an `{..expr}` splice.
+/// The generated statements append to a `__nodes` vector that the caller
+/// declares. Parsing stops at the first token that starts none of these
+/// forms; the caller reports it.
+fn parse_direct_list(tokens: &mut Tokens, ctx: &Ident) -> Result<Vec<TokenStream>> {
+    let mut items = Vec::new();
+    while tokens.peek().is_some() {
+        if peek_is_group(tokens, Delimiter::Parenthesis) {
+            let group = expect_group(tokens, Delimiter::Parenthesis)?;
+            let mut inner = group.stream().into_iter().peekable();
+
+            // Regular node
+            let node = parse_direct_node_inner(&mut inner, ctx)?;
+            items.push(quote! { __nodes.push(#node); });
+            continue;
+        }
+
+        // {expr} or {..expr} — single node or splice
+        if peek_is_group(tokens, Delimiter::Brace) {
+            let group = expect_group(tokens, Delimiter::Brace)?;
+            let mut inner = group.stream().into_iter().peekable();
+            if peek_is_dotdot(&inner) {
+                inner.next(); // consume first .
+                inner.next(); // consume second .
+                let expr: TokenStream = inner.collect();
+                items.push(quote! { __nodes.extend(#expr); });
+            } else {
+                let expr = group.stream();
+                items.push(quote! { __nodes.push(#expr); });
+            }
+            continue;
+        }
+
+        break;
+    }
+    Ok(items)
+}
+
+// ---------------------------------------------------------------------------
+// rule! parsing
+// ---------------------------------------------------------------------------
+
+/// A captured variable from a query pattern.
+struct CaptureInfo {
+    /// Capture name without the leading `@`.
+    name: String,
+    /// How many nodes the capture can bind; selects the Rust binding type.
+    multiplicity: CaptureMultiplicity,
+}
+
+/// How many nodes a capture can bind.
+///
+/// Variants are declared from least to most permissive so that `Ord::max`
+/// combines the multiplicity of a pattern with that of its enclosing groups.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+enum CaptureMultiplicity {
+    /// Exactly one match (bare pattern or after no quantifier)
+    Single,
+    /// Zero or one match (after `?`)
+    Optional,
+    /// Zero or more matches (after `*` or `+`, or inside a repeated group)
+    Repeated,
+}
+
+impl CaptureMultiplicity {
+    /// Multiplicity implied by `tok` when read as a quantifier: `*`/`+` are
+    /// repeated, `?` is optional, and any other token (or none) is single.
+    fn of_quantifier(tok: Option<&TokenTree>) -> Self {
+        match tok {
+            Some(TokenTree::Punct(p)) => match p.as_char() {
+                '*' | '+' => Self::Repeated,
+                '?' => Self::Optional,
+                _ => Self::Single,
+            },
+            _ => Self::Single,
+        }
+    }
+}
+
+/// Walk a token stream and extract all `@name` captures, noting for each
+/// whether it is repeated (`*`/`+`), optional (`?`) or single. Quantifiers
+/// on enclosing groups apply to the captures inside them.
+fn extract_captures(stream: &TokenStream) -> Vec<CaptureInfo> {
+    let mut captures = Vec::new();
+    extract_captures_inner(
+        &mut stream.clone().into_iter().peekable(),
+        &mut captures,
+        CaptureMultiplicity::Single,
+    );
+    captures
+}
+
+/// Recursive worker for [`extract_captures`].
+///
+/// `parent_mult` is the combined multiplicity of all enclosing groups.
+/// `last_mult` tracks the quantifier of the most recent group at this level
+/// so that a following `@name` (as in `(pat)* @name`) inherits it.
+fn extract_captures_inner(
+    tokens: &mut Tokens,
+    captures: &mut Vec<CaptureInfo>,
+    parent_mult: CaptureMultiplicity,
+) {
+    let mut last_mult = CaptureMultiplicity::Single;
+    while let Some(tok) = tokens.next() {
+        match tok {
+            TokenTree::Group(g) => {
+                let mut inner = g.stream().into_iter().peekable();
+                // The quantifier, if any, directly follows the group.
+                let group_mult = CaptureMultiplicity::of_quantifier(tokens.peek());
+                last_mult = group_mult;
+                extract_captures_inner(&mut inner, captures, parent_mult.max(group_mult));
+            }
+            TokenTree::Punct(p) if p.as_char() == '@' => {
+                if let Some(TokenTree::Ident(name)) = tokens.next() {
+                    // The list parser also accepts a quantifier *after* a
+                    // bare-wildcard capture (`_ @name*`), which places the
+                    // capture inside the repetition; account for it here so
+                    // the generated binding has the matching type.
+                    let trailing_mult = CaptureMultiplicity::of_quantifier(tokens.peek());
+                    captures.push(CaptureInfo {
+                        name: name.to_string(),
+                        multiplicity: parent_mult.max(last_mult).max(trailing_mult),
+                    });
+                }
+                last_mult = CaptureMultiplicity::Single;
+            }
+            TokenTree::Punct(p) if matches!(p.as_char(), '*' | '+' | '?') => {
+                // Keep last_mult — the @capture follows
+            }
+            _ => {
+                last_mult = CaptureMultiplicity::Single;
+            }
+        }
+    }
+}
+
+/// Parse `rule!( query => transform )`.
+pub fn parse_rule_top(input: TokenStream) -> Result {
+    // NOTE(review): generic arguments in this function (`Result`, `Vec`,
+    // `Option`) appear to have been stripped from this copy of the patch;
+    // restore them from the original commit before applying. They are left
+    // exactly as found here.
+    let mut tokens = input.into_iter().peekable();
+
+    // Collect query tokens up to `=>`. A lone `=` that is not followed by
+    // `>` is kept as part of the query.
+    let mut query_tokens = Vec::new();
+    loop {
+        match tokens.peek() {
+            None => return Err(syn::Error::new(Span::call_site(), "expected `=>` in rule!")),
+            Some(TokenTree::Punct(p)) if p.as_char() == '=' => {
+                let eq = tokens.next().unwrap();
+                match tokens.peek() {
+                    Some(TokenTree::Punct(p)) if p.as_char() == '>' => {
+                        tokens.next(); // consume >
+                        break;
+                    }
+                    _ => {
+                        query_tokens.push(eq);
+                        continue;
+                    }
+                }
+            }
+            _ => {
+                query_tokens.push(tokens.next().unwrap());
+            }
+        }
+    }
+
+    let query_stream: TokenStream = query_tokens.into_iter().collect();
+
+    // Extract captures from query
+    let captures = extract_captures(&query_stream);
+
+    // Parse query
+    let query_code = parse_query_top(query_stream.clone())?;
+
+    // Generate capture bindings: one `let` per capture, typed by multiplicity
+    let ctx_ident = Ident::new(IMPLICIT_CTX, Span::call_site());
+    let bindings: Vec = captures.iter().map(|cap| {
+        let name = Ident::new(&cap.name, Span::call_site());
+        let name_str = &cap.name;
+        match cap.multiplicity {
+            CaptureMultiplicity::Repeated => {
+                quote! { let #name: Vec = __captures.get_all(#name_str); }
+            }
+            CaptureMultiplicity::Optional => {
+                quote! { let #name: Option = __captures.get_opt(#name_str); }
+            }
+            CaptureMultiplicity::Single => {
+                quote! { let #name: usize = __captures.get_var(#name_str).unwrap(); }
+            }
+        }
+    }).collect();
+
+    // Parse transform: either shorthand `=> kind_name` or full `=> (template ...)`
+    let transform_body = if peek_is_field(&mut tokens) && {
+        // Shorthand form: bare identifier = output node kind.
+        // Auto-generate template from captures.
+        let mut lookahead = tokens.clone();
+        lookahead.next(); // skip ident
+        lookahead.peek().is_none() // nothing after = shorthand
+    } {
+        let output_kind = expect_ident(&mut tokens, "expected output node kind")?;
+        let output_kind_str = output_kind.to_string();
+
+        // Generate field assignments from captures: each capture name is
+        // looked up as a field name on the output node.
+        let field_stmts: Vec = captures.iter().map(|cap| {
+            let name = Ident::new(&cap.name, Span::call_site());
+            let name_str = &cap.name;
+            match cap.multiplicity {
+                CaptureMultiplicity::Repeated => quote! {
+                    let __field_id = #ctx_ident.ast.field_id_for_name(#name_str)
+                        .unwrap_or_else(|| panic!("field '{}' not found", #name_str));
+                    __fields.insert(__field_id, #name);
+                },
+                CaptureMultiplicity::Optional => quote! {
+                    let __field_id = #ctx_ident.ast.field_id_for_name(#name_str)
+                        .unwrap_or_else(|| panic!("field '{}' not found", #name_str));
+                    if let Some(__id) = #name {
+                        __fields.entry(__field_id).or_insert_with(Vec::new).push(__id);
+                    }
+                },
+                CaptureMultiplicity::Single => quote! {
+                    let __field_id = #ctx_ident.ast.field_id_for_name(#name_str)
+                        .unwrap_or_else(|| panic!("field '{}' not found", #name_str));
+                    __fields.entry(__field_id).or_insert_with(Vec::new).push(#name);
+                },
+            }
+        }).collect();
+
+        quote! {
+            let __kind = #ctx_ident.ast.id_for_node_kind(#output_kind_str)
+                .unwrap_or_else(|| panic!("node kind '{}' not found", #output_kind_str));
+            let mut __fields = std::collections::BTreeMap::new();
+            #(#field_stmts)*
+            let __id = #ctx_ident.ast.create_node_with_range(
+                __kind,
+                yeast::NodeContent::DynamicString(String::new()),
+                __fields,
+                true,
+                __source_range,
+            );
+            vec![__id]
+        }
+    } else {
+        // Full template form
+        let transform_items = parse_direct_list(&mut tokens, &ctx_ident)?;
+
+        if let Some(tok) = tokens.next() {
+            return Err(syn::Error::new_spanned(tok, "unexpected token after rule! transform"));
+        }
+
+        quote! {
+            let mut __nodes: Vec = Vec::new();
+            #(#transform_items)*
+            __nodes
+        }
+    };
+
+    // The expansion is a block that builds the query and wraps the transform
+    // in a boxed closure. Captures are bound to locals first, then the build
+    // context is created under the implicit name used by `tree!`/`trees!`.
+    Ok(quote! {
+        {
+            let __query = #query_code;
+            yeast::Rule::new(__query, Box::new(|__ast: &mut yeast::Ast, __captures: yeast::captures::Captures, __fresh: &yeast::tree_builder::FreshScope, __source_range: Option| {
+                #(#bindings)*
+                let mut #ctx_ident = yeast::build::BuildCtx::with_source_range(__ast, &__captures, __fresh, __source_range);
+                #transform_body
+            }))
+        }
+    })
+}
+
+// ---------------------------------------------------------------------------
+// Token utilities
+// ---------------------------------------------------------------------------
+
+/// True if the next token is the `@` that introduces a capture name.
+fn peek_is_at(tokens: &mut Tokens) -> bool {
+    matches!(tokens.peek(), Some(TokenTree::Punct(p)) if p.as_char() == '@')
+}
+
+/// True if the next token is a literal of any kind.
+fn peek_is_literal(tokens: &mut Tokens) -> bool {
+    matches!(tokens.peek(), Some(TokenTree::Literal(_)))
+}
+
+/// True if the next token is `$` (prefix of a fresh-name template).
+fn peek_is_dollar(tokens: &mut Tokens) -> bool {
+    matches!(tokens.peek(), Some(TokenTree::Punct(p)) if p.as_char() == '$')
+}
+
+/// True if the next token is `#` (prefix of a computed-literal template).
+fn peek_is_hash(tokens: &mut Tokens) -> bool {
+    matches!(tokens.peek(), Some(TokenTree::Punct(p)) if p.as_char() == '#')
+}
+
+/// Check for `..` (two consecutive dot punctuation tokens).
+/// Works on a clone of the cursor, so nothing is consumed.
+fn peek_is_dotdot(tokens: &Tokens) -> bool {
+    let mut lookahead = tokens.clone();
+    matches!(lookahead.next(), Some(TokenTree::Punct(p)) if p.as_char() == '.')
+        && matches!(lookahead.next(), Some(TokenTree::Punct(p)) if p.as_char() == '.')
+}
+
+/// True if the next token is the wildcard `_`, which arrives as an identifier.
+fn peek_is_underscore(tokens: &mut Tokens) -> bool {
+    matches!(tokens.peek(), Some(TokenTree::Ident(id)) if id.to_string() == "_")
+}
+
+/// Check if the next token starts a field specification, i.e. is an
+/// identifier other than `_`. Only that one token is inspected; the `:` that
+/// must follow is checked by the caller. A bare identifier at this position
+/// is always a field name, since bare child patterns must start with `(`,
+/// `"literal"`, or `_`.
+fn peek_is_field(tokens: &mut Tokens) -> bool {
+    matches!(tokens.peek(), Some(TokenTree::Ident(id)) if *id != "_")
+}
+
+/// True if the next token is a group with the given delimiter.
+fn peek_is_group(tokens: &mut Tokens, delim: Delimiter) -> bool {
+    matches!(tokens.peek(), Some(TokenTree::Group(g)) if g.delimiter() == delim)
+}
+
+/// True if the next token is one of the quantifiers `*`, `+`, `?`.
+fn peek_is_repetition(tokens: &mut Tokens) -> bool {
+    matches!(tokens.peek(), Some(TokenTree::Punct(p)) if matches!(p.as_char(), '*' | '+' | '?'))
+}
+
+/// Consume the next token, which must be an identifier; otherwise fail with
+/// `msg`, spanned at the offending token (or the call site at end of input).
+fn expect_ident(tokens: &mut Tokens, msg: &str) -> Result<Ident> {
+    match tokens.next() {
+        Some(TokenTree::Ident(id)) => Ok(id),
+        Some(tok) => Err(syn::Error::new_spanned(tok, msg)),
+        None => Err(syn::Error::new(Span::call_site(), msg)),
+    }
+}
+
+/// Consume the next token, which must be a literal. The literal's kind is
+/// not inspected here; non-string literals are accepted by this function.
+fn expect_literal(tokens: &mut Tokens) -> Result<Literal> {
+    match tokens.next() {
+        Some(TokenTree::Literal(lit)) => Ok(lit),
+        Some(tok) => Err(syn::Error::new_spanned(tok, "expected string literal")),
+        None => Err(syn::Error::new(Span::call_site(), "expected string literal")),
+    }
+}
+
+/// Consume the next token, which must be the punctuation character `ch`;
+/// otherwise fail with `msg`.
+fn expect_punct(tokens: &mut Tokens, ch: char, msg: &str) -> Result<()> {
+    match tokens.next() {
+        Some(TokenTree::Punct(p)) if p.as_char() == ch => Ok(()),
+        Some(tok) => Err(syn::Error::new_spanned(tok, msg)),
+        None => Err(syn::Error::new(Span::call_site(), msg)),
+    }
+}
+
+/// Consume the next token, which must be a group delimited by `delim`.
+fn expect_group(tokens: &mut Tokens, delim: Delimiter) -> Result<proc_macro2::Group> {
+    match tokens.next() {
+        Some(TokenTree::Group(g)) if g.delimiter() == delim => Ok(g),
+        Some(tok) => Err(syn::Error::new_spanned(
+            tok,
+            format!("expected {delim:?} group"),
+        )),
+        None => Err(syn::Error::new(
+            Span::call_site(),
+            format!("expected {delim:?} group"),
+        )),
+    }
+}
+
+/// Consume a quantifier token and return code naming the matching
+/// `yeast::query::Rep` variant.
+fn expect_repetition(tokens: &mut Tokens) -> Result<TokenStream> {
+    match tokens.next() {
+        Some(TokenTree::Punct(p)) => match p.as_char() {
+            '*' => Ok(quote! { yeast::query::Rep::ZeroOrMore }),
+            '+' => Ok(quote! { yeast::query::Rep::OneOrMore }),
+            '?' => Ok(quote! { yeast::query::Rep::ZeroOrOne }),
+            _ => Err(syn::Error::new(p.span(), "expected `*`, `+`, or `?`")),
+        },
+        Some(tok) => Err(syn::Error::new_spanned(tok, "expected repetition quantifier")),
+        None => Err(syn::Error::new(Span::call_site(), "expected repetition quantifier")),
+    }
+}
+
+/// If `@name` follows, consume it and wrap `base` (code for a `QueryNode`)
+/// in a `QueryNode::Capture`; otherwise return `base` unchanged.
+fn maybe_wrap_capture(tokens: &mut Tokens, base: TokenStream) -> Result<TokenStream> {
+    if peek_is_at(tokens) {
+        tokens.next(); // consume @
+        let name = expect_ident(tokens, "expected capture name after @")?;
+        let name_str = name.to_string();
+        Ok(quote! {
+            yeast::query::QueryNode::Capture {
+                capture: #name_str,
+                node: Box::new(#base),
+            }
+        })
+    } else {
+        Ok(base)
+    }
+}
+
+/// If a quantifier follows, consume it and wrap `single` (code for a
+/// `QueryListElem`) in a `QueryListElem::Repeated`; otherwise return
+/// `single` unchanged.
+fn maybe_wrap_repetition(tokens: &mut Tokens, single: TokenStream) -> Result<TokenStream> {
+    if peek_is_repetition(tokens) {
+        let rep = expect_repetition(tokens)?;
+        Ok(quote! {
+            yeast::query::QueryListElem::Repeated {
+                children: vec![#single],
+                rep: #rep,
+            }
+        })
+    } else {
+        Ok(single)
+    }
+}
+
+/// If `@name` follows a Repeated list element, wrap each child SingleNode
+/// inside the repetition with a Capture. This matches tree-sitter semantics
+/// where `(_)* @name` captures each matched node.
+///
+/// `elem` is already generated code, so the capture cannot be inserted at
+/// parse time. Instead the expansion rebuilds the `Repeated` value when the
+/// query is constructed, wrapping its direct `SingleNode` children. Nested
+/// repetitions and non-`Repeated` elements are passed through unchanged.
+fn maybe_wrap_list_capture(tokens: &mut Tokens, elem: TokenStream) -> Result<TokenStream> {
+    if peek_is_at(tokens) {
+        tokens.next(); // consume @
+        let name = expect_ident(tokens, "expected capture name after @")?;
+        let name_str = name.to_string();
+        Ok(quote! {
+            {
+                let __rep = #elem;
+                match __rep {
+                    yeast::query::QueryListElem::Repeated { children, rep } => {
+                        yeast::query::QueryListElem::Repeated {
+                            children: children.into_iter().map(|child| {
+                                match child {
+                                    yeast::query::QueryListElem::SingleNode(node) => {
+                                        yeast::query::QueryListElem::SingleNode(
+                                            yeast::query::QueryNode::Capture {
+                                                capture: #name_str,
+                                                node: Box::new(node),
+                                            }
+                                        )
+                                    }
+                                    other => other,
+                                }
+                            }).collect(),
+                            rep,
+                        }
+                    }
+                    other => other,
+                }
+            }
+        })
+    } else {
+        Ok(elem)
+    }
+}
diff --git a/shared/yeast/.envrc b/shared/yeast/.envrc
new file mode 100644
index 00000000000..3550a30f2de
--- /dev/null
+++ b/shared/yeast/.envrc
@@ -0,0 +1 @@
+use flake
diff --git a/shared/yeast/.gitignore b/shared/yeast/.gitignore
new file mode 100644
index 00000000000..ea8c4bf7f35
--- /dev/null
+++ b/shared/yeast/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/shared/yeast/.gitkeep b/shared/yeast/.gitkeep
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/shared/yeast/Cargo.lock b/shared/yeast/Cargo.lock
new file mode 100644
index 00000000000..01fc0da60da
--- /dev/null
+++ b/shared/yeast/Cargo.lock
@@ -0,0 +1,357 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys", +] + +[[package]] +name = "cc" +version = "1.2.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43c5703da9466b66a946814e1adf53ea2c90f10063b86290cc9eb67ce3478a20" +dependencies = [ + "find-msvc-tools", + "shlex", +] + +[[package]] +name = "clap" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" 
+dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + 
"serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tree-sitter" +version = "0.24.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5387dffa7ffc7d2dae12b50c6f7aab8ff79d6210147c6613561fc3d474c6f75" +dependencies = [ + "cc", + "regex", + "regex-syntax", + "streaming-iterator", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-language" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "009994f150cc0cd50ff54917d5bc8bffe8cad10ca10d81c34da2ec421ae61782" + +[[package]] +name = "tree-sitter-python" +version 
= "0.23.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d065aaa27f3aaceaf60c1f0e0ac09e1cb9eb8ed28e7bcdaa52129cffc7f4b04" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-ruby" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be0484ea4ef6bb9c575b4fdabde7e31340a8d2dbc7d52b321ac83da703249f95" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "yeast" +version = "0.1.0" +dependencies = [ + "clap", + "serde", + "serde_json", + "tree-sitter", + "tree-sitter-python", + "tree-sitter-ruby", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/shared/yeast/Cargo.toml b/shared/yeast/Cargo.toml new file mode 100644 index 00000000000..0dba2d5a53c --- /dev/null +++ b/shared/yeast/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "yeast" +version = "0.1.0" +edition = "2021" + +[dependencies] +clap = { version = "4.4.10", features = ["derive"] } 
+serde = { version = "1.0.193", features = ["derive"] } +serde_json = "1.0.108" +serde_yaml = "0.9" +tree-sitter = "0.24" +yeast-macros = { path = "../yeast-macros" } + +tree-sitter-ruby = "0.23" +tree-sitter-python = "0.23" diff --git a/shared/yeast/src/bin/main.rs b/shared/yeast/src/bin/main.rs new file mode 100644 index 00000000000..e9ee1f38078 --- /dev/null +++ b/shared/yeast/src/bin/main.rs @@ -0,0 +1,26 @@ +use clap::Parser; + +#[derive(Parser)] +#[clap(name = "yeast", about = "yeast elaborates abstract syntax trees")] +struct Cli { + file: String, + #[clap(default_value = "ruby")] + language: String, +} + +fn get_language(language: &str) -> tree_sitter::Language { + match language { + "ruby" => tree_sitter_ruby::LANGUAGE.into(), + "python" => tree_sitter_python::LANGUAGE.into(), + _ => panic!("Unsupported language: {}", language), + } +} + +fn main() { + let args = Cli::parse(); + let language = get_language(&args.language); + let source = std::fs::read_to_string(&args.file).unwrap(); + let runner = yeast::Runner::new(language, vec![]); + let ast = runner.run(&source).unwrap(); + println!("{}", ast.print(&source, ast.get_root())); +} diff --git a/shared/yeast/src/build.rs b/shared/yeast/src/build.rs new file mode 100644 index 00000000000..d9b9f0ab3a9 --- /dev/null +++ b/shared/yeast/src/build.rs @@ -0,0 +1,79 @@ +use std::collections::BTreeMap; + +use crate::captures::Captures; +use crate::tree_builder::FreshScope; +use crate::{Ast, FieldId, Id, NodeContent}; + +/// Context for building new AST nodes during a transformation. +/// +/// Used by the `tree!` and `trees!` macros. Holds a mutable reference to the +/// AST, a reference to the captures from a query match, and a `FreshScope` for +/// generating unique identifiers. +pub struct BuildCtx<'a> { + pub ast: &'a mut Ast, + pub captures: &'a Captures, + pub fresh: &'a FreshScope, + /// Source range of the matched node, inherited by synthetic nodes. 
+ pub source_range: Option, +} + +impl<'a> BuildCtx<'a> { + pub fn new(ast: &'a mut Ast, captures: &'a Captures, fresh: &'a FreshScope) -> Self { + Self { + ast, + captures, + fresh, + source_range: None, + } + } + + pub fn with_source_range(ast: &'a mut Ast, captures: &'a Captures, fresh: &'a FreshScope, source_range: Option) -> Self { + Self { + ast, + captures, + fresh, + source_range, + } + } + + /// Look up a capture variable, returning its node Id. + pub fn capture(&self, name: &str) -> Id { + self.captures + .get_var(name) + .unwrap_or_else(|e| panic!("build: {e}")) + } + + /// Get all values of a repeated capture variable. + pub fn capture_all(&self, name: &str) -> Vec { + self.captures.get_all(name) + } + + /// Create a named AST node with the given kind and fields. + pub fn node(&mut self, kind: &str, fields: Vec<(&str, Vec)>) -> Id { + let kind_id = self + .ast + .id_for_node_kind(kind) + .unwrap_or_else(|| panic!("build: node kind '{kind}' not found")); + let mut field_map: BTreeMap> = BTreeMap::new(); + for (name, ids) in fields { + let field_id = self + .ast + .field_id_for_name(name) + .unwrap_or_else(|| panic!("build: field '{name}' not found")); + field_map.entry(field_id).or_default().extend(ids); + } + self.ast + .create_node_with_range(kind_id, NodeContent::DynamicString(String::new()), field_map, true, self.source_range) + } + + /// Create a leaf node with a fixed string content. + pub fn literal(&mut self, kind: &'static str, value: &str) -> Id { + self.ast.create_named_token_with_range(kind, value.to_string(), self.source_range) + } + + /// Create a leaf node with an auto-generated unique name. 
+ pub fn fresh(&mut self, kind: &'static str, name: &str) -> Id { + let generated = self.fresh.resolve(name); + self.ast.create_named_token_with_range(kind, generated, self.source_range) + } +} diff --git a/shared/yeast/src/captures.rs b/shared/yeast/src/captures.rs new file mode 100644 index 00000000000..8982b2f88be --- /dev/null +++ b/shared/yeast/src/captures.rs @@ -0,0 +1,105 @@ +use std::collections::{BTreeMap, BTreeSet}; + +use crate::Id; + +#[derive(Debug, Clone)] +pub struct Captures { + captures: BTreeMap<&'static str, Vec>, +} + +impl Default for Captures { + fn default() -> Self { + Self::new() + } +} + +impl Captures { + pub fn new() -> Self { + Captures { + captures: BTreeMap::new(), + } + } + + pub fn get_var(&self, key: &str) -> Result { + let ids = self.captures.get(key); + if let Some(ids) = ids { + if ids.len() == 1 { + Ok(ids[0]) + } else { + Err(format!( + "Variable {} has {} matches, use * to allow repetition", + key, + ids.len() + )) + } + } else { + Err(format!("No variable named {}", key)) + } + } + + /// Get all values of a capture variable (for repeated captures). + pub fn get_all(&self, key: &str) -> Vec { + self.captures.get(key).cloned().unwrap_or_default() + } + + /// Get an optional capture variable. Returns None if unmatched, + /// Some(id) if matched exactly once. 
+ pub fn get_opt(&self, key: &str) -> Option { + self.captures.get(key).and_then(|ids| { + if ids.len() == 1 { Some(ids[0]) } else { None } + }) + } + + pub fn insert(&mut self, key: &'static str, id: Id) { + self.captures.entry(key).or_default().push(id); + } + + pub fn map_captures(&mut self, kind: &str, f: &mut impl FnMut(Id) -> Id) { + if let Some(ids) = self.captures.get_mut(kind) { + for id in ids { + *id = f(*id); + } + } + } + pub fn map_captures_to(&mut self, from: &str, to: &'static str, f: &mut impl FnMut(Id) -> Id) { + if let Some(from_ids) = self.captures.get(from) { + let new_values = from_ids.iter().copied().map(f).collect(); + self.captures.insert(to, new_values); + } + } + + pub fn merge(&mut self, other: &Captures) { + for (key, ids) in &other.captures { + self.captures.entry(key).or_default().extend(ids); + } + } + + pub fn un_star<'a>( + &'a self, + children: &'a BTreeSet<&'static str>, + ) -> Result + 'a, String> { + let mut id_iter = children.iter(); + + if let Some(fst) = id_iter.next() { + let repeats = self + .captures + .get(fst) + .ok_or_else(|| format!("No variable named {}", fst))? 
/// A cursor for walking a tree whose nodes have type `N` and whose children
/// are grouped under field ids of type `F`.
///
/// `T` does not appear in any method signature; the implementation in this
/// crate supplies the tree type for it.
pub trait Cursor<'a, T, N, F> {
    /// The node the cursor is currently positioned on.
    fn node(&self) -> &'a N;
    /// The id of the field the current node is stored under, if any.
    fn field_id(&self) -> Option<F>;
    /// The name of the field the current node is stored under, if any.
    fn field_name(&self) -> Option<&'static str>;
    /// Move to the first child of the current node; returns whether the
    /// cursor moved.
    fn goto_first_child(&mut self) -> bool;
    /// Move to the next sibling of the current node; returns whether the
    /// cursor moved.
    fn goto_next_sibling(&mut self) -> bool;
    /// Move to the parent of the current node; returns whether the cursor
    /// moved.
    fn goto_parent(&mut self) -> bool;
}
ChildrenIter<'a>)>, + node: &'a Node, +} + +impl<'a> AstCursor<'a> { + pub fn new(ast: &'a Ast) -> Self { + // TODO: handle non-zero root + let node = ast.get_node(ast.root).unwrap(); + Self { + ast, + parents: vec![], + node, + } + } + + fn goto_next_sibling_opt(&mut self) -> Option<()> { + self.node = self.parents.last_mut()?.1.next()?; + Some(()) + } + + fn goto_first_child_opt(&mut self) -> Option<()> { + let parent = self.node; + let mut children = ChildrenIter::new(self.ast, parent); + let first_child = children.next()?; + self.node = first_child; + self.parents.push((parent, children)); + Some(()) + } + + fn goto_parent_opt(&mut self) -> Option<()> { + self.node = self.parents.pop()?.0; + Some(()) + } +} +impl<'a> Cursor<'a, Ast, Node, FieldId> for AstCursor<'a> { + fn node(&self) -> &'a Node { + self.node + } + + fn field_id(&self) -> Option { + let (_, children) = self.parents.last()?; + children.current_field() + } + + fn field_name(&self) -> Option<&'static str> { + if self.field_id() == Some(CHILD_FIELD) { + None + } else { + self.field_id() + .and_then(|id| self.ast.field_name_for_id(id)) + } + } + + fn goto_first_child(&mut self) -> bool { + self.goto_first_child_opt().is_some() + } + + fn goto_next_sibling(&mut self) -> bool { + self.goto_next_sibling_opt().is_some() + } + + fn goto_parent(&mut self) -> bool { + self.goto_parent_opt().is_some() + } +} + +/// An iterator over all the child nodes of a node. 
+#[derive(Debug)] +struct ChildrenIter<'a> { + ast: &'a Ast, + current_field: Option, + fields: std::collections::btree_map::Iter<'a, FieldId, Vec>, + field_children: Option>, +} + +impl<'a> ChildrenIter<'a> { + fn new(ast: &'a Ast, node: &'a Node) -> Self { + Self { + ast, + current_field: None, + fields: node.fields.iter(), + field_children: None, + } + } + + fn get_node(&self, id: Id) -> &'a Node { + self.ast.get_node(id).unwrap() + } + + fn current_field(&self) -> Option { + self.current_field + } +} + +impl<'a> Iterator for ChildrenIter<'a> { + type Item = &'a Node; + + fn next(&mut self) -> Option { + match self.field_children.as_mut() { + None => match self.fields.next() { + Some((field, children)) => { + self.current_field = Some(*field); + self.field_children = Some(children.iter()); + self.next() + } + None => None, + }, + Some(children) => match children.next() { + None => match self.fields.next() { + None => None, + Some((field, children)) => { + self.current_field = Some(*field); + self.field_children = Some(children.iter()); + self.next() + } + }, + Some(child_id) => Some(self.get_node(*child_id)), + }, + } + } +} + +/// Our AST +pub struct Ast { + root: Id, + nodes: Vec, + schema: schema::Schema, +} + +impl std::fmt::Debug for Ast { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Ast") + .field("root", &self.root) + .field("nodes", &self.nodes.len()) + .finish() + } +} + +impl Ast { + /// Construct an AST from a TS tree + pub fn from_tree(language: tree_sitter::Language, tree: &tree_sitter::Tree) -> Self { + let schema = schema::Schema::from_language(&language); + Self::from_tree_with_schema(schema, tree, &language) + } + + pub fn from_tree_with_schema(schema: schema::Schema, tree: &tree_sitter::Tree, language: &tree_sitter::Language) -> Self { + let mut visitor = visitor::Visitor::new(language.clone()); + visitor.visit(tree); + let ast = visitor.build_with_schema(schema); + ast + } + + pub fn walk(&self) -> 
AstCursor { + AstCursor::new(self) + } + + pub fn nodes(&self) -> &[Node] { + &self.nodes + } + + pub fn get_root(&self) -> Id { + self.root + } + + pub fn set_root(&mut self, root: Id) { + self.root = root; + } + + pub fn get_node(&self, id: Id) -> Option<&Node> { + self.nodes.get(id) + } + + pub fn print(&self, source: &str, root_id: Id) -> Value { + let root = &self.nodes()[root_id]; + self.print_node(root, source) + } + + pub fn create_node( + &mut self, + kind: KindId, + content: NodeContent, + fields: BTreeMap>, + is_named: bool, + ) -> Id { + self.create_node_with_range(kind, content, fields, is_named, None) + } + + pub fn create_node_with_range( + &mut self, + kind: KindId, + content: NodeContent, + fields: BTreeMap>, + is_named: bool, + source_range: Option, + ) -> Id { + let id = self.nodes.len(); + self.nodes.push(Node { + id, + kind, + kind_name: self.schema.node_kind_for_id(kind).unwrap(), + fields, + content, + is_missing: false, + is_error: false, + is_extra: false, + is_named, + source_range, + }); + id + } + + pub fn create_named_token(&mut self, kind: &'static str, content: String) -> Id { + self.create_named_token_with_range(kind, content, None) + } + + pub fn create_named_token_with_range(&mut self, kind: &'static str, content: String, source_range: Option) -> Id { + let kind_id = self.schema.id_for_node_kind(kind) + .unwrap_or_else(|| panic!("create_named_token: node kind '{kind}' not found in schema")); + let id = self.nodes.len(); + self.nodes.push(Node { + id, + kind: kind_id, + kind_name: kind, + is_named: true, + is_missing: false, + is_error: false, + source_range, + is_extra: false, + fields: BTreeMap::new(), + content: NodeContent::DynamicString(content), + }); + id + } + + pub fn field_name_for_id(&self, id: FieldId) -> Option<&'static str> { + self.schema.field_name_for_id(id) + } + + pub fn field_id_for_name(&self, name: &str) -> Option { + self.schema.field_id_for_name(name) + } + + /// Print a node for debugging + fn 
print_node(&self, node: &Node, source: &str) -> Value { + let fields: BTreeMap<&'static str, Vec> = node + .fields + .iter() + .map(|(field_id, nodes)| { + let field_name = if field_id == &CHILD_FIELD { + "rest" + } else { + self.field_name_for_id(*field_id).unwrap() + }; + let nodes: Vec = nodes + .iter() + .map(|id| self.print_node(self.get_node(*id).unwrap(), source)) + .collect(); + (field_name, nodes) + }) + .collect(); + let mut value = BTreeMap::new(); + let kind = self.schema.node_kind_for_id(node.kind).unwrap(); + let content = match &node.content { + NodeContent::Range(range) => { + let end = range.end_byte; + std::str::from_utf8(&source.as_bytes()[range.start_byte..end]) + .unwrap_or("") + .to_string() + } + NodeContent::String(s) => s.to_string(), + NodeContent::DynamicString(s) => s.clone(), + }; + if fields.is_empty() { + value.insert(kind, json!(content)); + } else { + let mut fields: BTreeMap<_, _> = + fields.into_iter().map(|(k, v)| (k, json!(v))).collect(); + fields.insert("content", json!(content)); + value.insert(kind, json!(fields)); + } + json!(value) + } + + /// Return an example AST, for testing and to fill implementation gaps + pub fn example(language: tree_sitter::Language) -> Self { + // x = 1 + Self { + root: 0, + schema: schema::Schema::from_language(&language), + nodes: vec![ + // assignment + Node { + id: 0, + kind: 276, + kind_name: "assignment", + fields: { + let mut map = BTreeMap::new(); + map.insert(18, vec![1]); + map.insert(28, vec![3]); + map + }, + content: NodeContent::String("x = 1"), + is_missing: false, + is_error: false, + source_range: None, + is_extra: false, + is_named: true, + }, + // identifier + Node { + id: 1, + kind: 1, + kind_name: "identifier", + fields: BTreeMap::new(), + content: NodeContent::String("x"), + is_missing: false, + is_error: false, + source_range: None, + is_extra: false, + is_named: true, + }, + // "=" + Node { + id: 2, + kind: 17, + kind_name: "=", + fields: BTreeMap::new(), + content: 
NodeContent::String("="), + is_missing: false, + is_error: false, + source_range: None, + is_extra: false, + is_named: false, + }, + // integer + Node { + id: 3, + kind: 110, + kind_name: "integer", + fields: BTreeMap::new(), + content: NodeContent::String("1"), + is_missing: false, + is_error: false, + source_range: None, + is_extra: false, + is_named: true, + }, + ], + } + } + + pub fn id_for_node_kind(&self, kind: &str) -> Option { + let id = self.schema.id_for_node_kind(kind).unwrap_or(0); + if id == 0 { + None + } else { + Some(id) + } + } + + fn id_for_unnamed_node_kind(&self, kind: &str) -> Option { + let id = self.schema.id_for_unnamed_node_kind(kind).unwrap_or(0); + if id == 0 { + None + } else { + Some(id) + } + } +} + +/// A node in our AST +#[derive(PartialEq, Eq, Debug, Clone, Serialize)] +pub struct Node { + id: Id, + kind: KindId, + kind_name: &'static str, + pub(crate) fields: BTreeMap>, + pub(crate) content: NodeContent, + /// For synthetic nodes, the source range of the original node they + /// were desugared from. Used for location information in TRAP output. 
+ #[serde(skip)] + source_range: Option, + is_named: bool, + is_missing: bool, + is_extra: bool, + is_error: bool, +} + +impl Node { + pub fn id(&self) -> Id { + self.id + } + + pub fn kind(&self) -> &'static str { + self.kind_name + } + + pub fn kind_name(&self) -> &'static str { + self.kind_name + } + + pub fn is_named(&self) -> bool { + self.is_named + } + + pub fn is_missing(&self) -> bool { + self.is_missing + } + + pub fn is_extra(&self) -> bool { + self.is_extra + } + + pub fn is_error(&self) -> bool { + self.is_error + } + + fn fake_point(&self) -> tree_sitter::Point { + tree_sitter::Point { row: 0, column: 0 } + } + + pub fn start_position(&self) -> tree_sitter::Point { + match self.content { + NodeContent::Range(range) => range.start_point, + _ => self.source_range.map_or_else( + || self.fake_point(), + |r| r.start_point, + ), + } + } + + pub fn end_position(&self) -> tree_sitter::Point { + match self.content { + NodeContent::Range(range) => range.end_point, + _ => self.source_range.map_or_else( + || self.fake_point(), + |r| r.end_point, + ), + } + } + + pub fn start_byte(&self) -> usize { + match self.content { + NodeContent::Range(range) => range.start_byte, + _ => self.source_range.map_or(0, |r| r.start_byte), + } + } + + pub fn end_byte(&self) -> usize { + match self.content { + NodeContent::Range(range) => range.end_byte, + _ => self.source_range.map_or(0, |r| r.end_byte), + } + } + + pub fn byte_range(&self) -> std::ops::Range { + self.start_byte()..self.end_byte() + } + + pub fn opt_string_content(&self) -> Option { + match &self.content { + NodeContent::Range(_range) => None, + NodeContent::String(s) => Some(s.to_string()), + NodeContent::DynamicString(s) => Some(s.to_string()), + } + } +} + +/// The contents of a node is either a range in the original source file, +/// or a new string if the node is synthesized. 
+#[derive(PartialEq, Eq, Debug, Clone, Serialize)] +pub enum NodeContent { + Range(#[serde(with = "range::Range")] tree_sitter::Range), + String(&'static str), + DynamicString(String), +} + +impl From<&'static str> for NodeContent { + fn from(value: &'static str) -> Self { + NodeContent::String(value) + } +} + +impl From for NodeContent { + fn from(value: tree_sitter::Range) -> Self { + NodeContent::Range(value) + } +} + +pub struct Rule { + query: QueryNode, + transform: Box) -> Vec>, +} + +impl Rule { + pub fn new(query: QueryNode, transform: Box) -> Vec>) -> Self { + Self { query, transform } + } + + fn try_rule(&self, ast: &mut Ast, node: Id, fresh: &tree_builder::FreshScope) -> Result>, String> { + let mut captures = Captures::new(); + if self.query.do_match(ast, node, &mut captures)? { + fresh.next_scope(); + let source_range = ast.get_node(node).and_then(|n| { + match n.content { + NodeContent::Range(r) => Some(r), + _ => n.source_range, + } + }); + Ok(Some((self.transform)(ast, captures, fresh, source_range))) + } else { + Ok(None) + } + } +} + +const MAX_REWRITE_DEPTH: usize = 100; + +/// Index of rules by their root query kind for fast lookup. +struct RuleIndex<'a> { + /// Rules indexed by root node kind name. + by_kind: BTreeMap<&'static str, Vec<&'a Rule>>, + /// Rules with wildcard queries (Any) that apply to all nodes. 
+ wildcard: Vec<&'a Rule>, +} + +impl<'a> RuleIndex<'a> { + fn new(rules: &'a [Rule]) -> Self { + let mut by_kind: BTreeMap<&'static str, Vec<&'a Rule>> = BTreeMap::new(); + let mut wildcard = Vec::new(); + for rule in rules { + match rule.query.root_kind() { + Some(kind) => by_kind.entry(kind).or_default().push(rule), + None => wildcard.push(rule), + } + } + Self { by_kind, wildcard } + } + + fn rules_for_kind(&self, kind: &str) -> impl Iterator { + self.by_kind + .get(kind) + .into_iter() + .flat_map(|v| v.iter()) + .chain(self.wildcard.iter()) + } +} + +fn apply_rules(rules: &[Rule], ast: &mut Ast, id: Id, fresh: &tree_builder::FreshScope) -> Result, String> { + let index = RuleIndex::new(rules); + apply_rules_inner(&index, ast, id, fresh, 0) +} + +fn apply_rules_inner(index: &RuleIndex, ast: &mut Ast, id: Id, fresh: &tree_builder::FreshScope, rewrite_depth: usize) -> Result, String> { + if rewrite_depth > MAX_REWRITE_DEPTH { + return Err(format!( + "Desugaring exceeded maximum rewrite depth ({MAX_REWRITE_DEPTH}). \ + This likely indicates a non-terminating rule cycle." + )); + } + + let node_kind = ast.get_node(id).map(|n| n.kind()).unwrap_or(""); + for rule in index.rules_for_kind(node_kind) { + if let Some(result_node) = rule.try_rule(ast, id, fresh)? 
{ + let mut results = Vec::new(); + for node in result_node { + results.extend(apply_rules_inner(index, ast, node, fresh, rewrite_depth + 1)?); + } + return Ok(results); + } + } + + // Collect fields before recursing (avoids borrowing ast immutably during mutation) + let field_entries: Vec<(FieldId, Vec)> = ast.nodes[id] + .fields + .iter() + .map(|(&fid, children)| (fid, children.clone())) + .collect(); + + // recursively descend into all the fields + // Child traversal does not increment rewrite depth + let mut changed = false; + let mut new_fields = BTreeMap::new(); + for (field_id, children) in field_entries { + let mut new_children = Vec::new(); + for child_id in children { + let result = apply_rules_inner(index, ast, child_id, fresh, rewrite_depth)?; + if result.len() != 1 || result[0] != child_id { + changed = true; + } + new_children.extend(result); + } + new_fields.insert(field_id, new_children); + } + + if !changed { + return Ok(vec![id]); + } + + let mut node = ast.nodes[id].clone(); + node.fields = new_fields; + node.id = ast.nodes.len(); + ast.nodes.push(node); + Ok(vec![ast.nodes.len() - 1]) +} + +pub struct Runner { + language: tree_sitter::Language, + schema: schema::Schema, + rules: Vec, +} + +impl Runner { + /// Create a runner using the input grammar's schema for output. + pub fn new(language: tree_sitter::Language, rules: Vec) -> Self { + let schema = schema::Schema::from_language(&language); + Self { language, schema, rules } + } + + /// Create a runner with separate input language and output schema. 
+ pub fn with_schema(language: tree_sitter::Language, schema: schema::Schema, rules: Vec) -> Self { + Self { language, schema, rules } + } + + pub fn run_from_tree(&self, tree: &tree_sitter::Tree) -> Result { + let fresh = tree_builder::FreshScope::new(); + let mut ast = Ast::from_tree_with_schema(self.schema.clone(), tree, &self.language); + let root = ast.get_root(); + let res = apply_rules(&self.rules, &mut ast, root, &fresh)?; + if res.len() != 1 { + return Err(format!("Expected exactly one result node, got {}", res.len())); + } + ast.set_root(res[0]); + Ok(ast) + } + + pub fn run(&self, input: &str) -> Result { + let fresh = tree_builder::FreshScope::new(); + let mut parser = tree_sitter::Parser::new(); + parser.set_language(&self.language) + .map_err(|e| format!("Failed to set language: {e}"))?; + let tree = parser.parse(input, None) + .ok_or_else(|| "Failed to parse input".to_string())?; + let mut ast = Ast::from_tree_with_schema(self.schema.clone(), &tree, &self.language); + let root = ast.get_root(); + let res = apply_rules(&self.rules, &mut ast, root, &fresh)?; + if res.len() != 1 { + return Err(format!("Expected exactly one result node, got {}", res.len())); + } + ast.set_root(res[0]); + Ok(ast) + } +} diff --git a/shared/yeast/src/print.rs b/shared/yeast/src/print.rs new file mode 100644 index 00000000000..6bf10bcfe20 --- /dev/null +++ b/shared/yeast/src/print.rs @@ -0,0 +1,34 @@ +use crate::{cursor::Cursor, AstCursor, Node}; + +pub struct Printer {} + +impl Printer { + pub fn visit(&mut self, mut cursor: AstCursor<'_>) { + self.enter_node(cursor.node()); + let mut recurse = true; + loop { + if recurse && cursor.goto_first_child() { + recurse = self.enter_node(cursor.node()); + } else { + self.leave_node(cursor.node()); + + if cursor.goto_next_sibling() { + recurse = self.enter_node(cursor.node()); + } else if cursor.goto_parent() { + recurse = false; + } else { + break; + } + } + } + } + + pub fn enter_node(&mut self, node: &Node) -> bool { + 
println!("enter_node: {:?}", node); + true + } + pub fn leave_node(&mut self, node: &Node) -> bool { + println!("leave_node: {:?}", node); + true + } +} diff --git a/shared/yeast/src/query.rs b/shared/yeast/src/query.rs new file mode 100644 index 00000000000..b09b7d9b89b --- /dev/null +++ b/shared/yeast/src/query.rs @@ -0,0 +1,230 @@ + +use crate::{captures::Captures, Ast, Id}; + +#[derive(Debug, Clone)] +pub enum QueryNode { + Any(), + Node { + kind: &'static str, + children: Vec<(&'static str, Vec)>, + }, + UnnamedNode { + kind: &'static str, + }, + Capture { + capture: &'static str, + node: Box, + }, +} + +impl QueryNode { + /// Returns the root node kind this query matches, if it's specific. + /// Returns None for wildcards (Any) and captures wrapping wildcards. + pub fn root_kind(&self) -> Option<&'static str> { + match self { + QueryNode::Node { kind, .. } => Some(kind), + QueryNode::UnnamedNode { kind } => Some(kind), + QueryNode::Capture { node, .. } => node.root_kind(), + QueryNode::Any() => None, + } + } +} + +#[derive(Debug, Clone)] +pub enum QueryListElem { + Repeated { children: Vec, rep: Rep }, + SingleNode(QueryNode), +} + +#[derive(Debug, PartialEq, Eq, Copy, Clone)] +pub enum Rep { + ZeroOrMore, + OneOrMore, + ZeroOrOne, +} + +impl QueryNode { + /// Returns true if this query only matches named nodes (not unnamed tokens). + /// Used to skip unnamed children in positional matching, matching tree-sitter + /// semantics where `(_)` only matches named nodes. + fn matches_named_only(&self) -> bool { + match self { + QueryNode::Any() => true, + QueryNode::Node { .. } => true, + QueryNode::UnnamedNode { .. } => false, + QueryNode::Capture { node, .. 
} => node.matches_named_only(), + } + } + + pub fn do_match(&self, ast: &Ast, node: Id, matches: &mut Captures) -> Result { + match self { + QueryNode::Any() => Ok(true), + QueryNode::Node { kind, children } => { + let node = ast.get_node(node).unwrap(); + let target_kind = ast.id_for_node_kind(kind).ok_or_else(|| { + format!("Node kind {} not found in language", kind) + })?; + if node.kind != target_kind { + return Ok(false); + } + for (field, field_children) in children { + let field_id = ast + .field_id_for_name(field) + .ok_or_else(|| format!("Field {} not found in language", field))?; + let empty = Vec::new(); + let mut child_iter = node + .fields + .get(&field_id) + .unwrap_or(&empty) + .iter() + .cloned(); + if !match_children(field_children.iter(), ast, &mut child_iter, matches)? { + return Ok(false); + } + } + Ok(true) + } + QueryNode::UnnamedNode { kind } => { + let node = ast.get_node(node).unwrap(); + let target_kind = ast.id_for_unnamed_node_kind(kind).ok_or_else(|| { + format!("unnamed Node kind {} not found in language", kind) + })?; + Ok(node.kind == target_kind) + } + QueryNode::Capture { + capture, + node: sub_query, + } => { + let matched = sub_query.do_match(ast, node, matches)?; + if matched { + matches.insert(capture, node); + } + Ok(matched) + } + } + } +} + +fn match_children<'a>( + child_matchers: impl Iterator, + ast: &Ast, + remaining_children: &mut (impl Iterator + Clone), + matches: &mut Captures, +) -> Result { + for child in child_matchers { + if !child.do_match(ast, remaining_children, matches)? 
{ + return Ok(false); + } + } + Ok(true) +} + +impl QueryListElem { + fn do_match( + &self, + ast: &Ast, + remaining_children: &mut (impl Iterator + Clone), + matches: &mut Captures, + ) -> Result { + match self { + QueryListElem::Repeated { children, rep } => { + if children.is_empty() { + // Empty repetition always succeeds without consuming + return Ok(*rep != Rep::OneOrMore); + } + + let mut iters = 0; + + loop { + let matches_initial = matches.clone(); + let start = remaining_children.clone(); + let start_next = start.clone().next(); + if !match_children(children.iter(), ast, remaining_children, matches)? { + *remaining_children = start; + *matches = matches_initial; + break; + } + // Guard against zero-width matches: if the iterator + // didn't advance, break to avoid infinite looping. + let current_next = remaining_children.clone().next(); + if start_next == current_next { + break; + } + iters += 1; + if *rep == Rep::ZeroOrOne { + break; + } + } + if *rep == Rep::OneOrMore && iters == 0 { + // We didn't match any children but we were supposed to + Ok(false) + } else { + Ok(true) + } + } + QueryListElem::SingleNode(sub_query) => { + if sub_query.matches_named_only() { + // Skip unnamed children, matching tree-sitter semantics + // where (_) only matches named nodes. 
+ loop { + match remaining_children.next() { + Some(child) => { + let node = ast.get_node(child).unwrap(); + if node.is_named() { + return sub_query.do_match(ast, child, matches); + } + // Skip unnamed child, continue to next + } + None => return Ok(false), + } + } + } else if let Some(child) = remaining_children.next() { + sub_query.do_match(ast, child, matches) + } else { + Ok(false) + } + } + } + } +} + +#[cfg(test)] +mod tests { + use crate::query::*; + #[test] + fn it_works() { + let query1: QueryNode = yeast::query!((_)); + println!("{:?}", query1); + let query2 = yeast::query!((foo)); + println!("{:?}", query2); + let query3 = yeast::query!((foo child: (_))); + println!("{:?}", query3); + let query4 = yeast::query!((foo (_)*)); + println!("{:?}", query4); + let query5: QueryNode = yeast::query!((foo (_)*)); + println!("{:?}", query5); + let query6: QueryNode = yeast::query!((_) @bar); + println!("{:?}", query6); + let query7: QueryNode = yeast::query!((foo child: (_) @bar)); + println!("{:?}", query7); + let query8: QueryNode = yeast::query!( + (assignment + left: (element_reference + object: (_) @obj + (_) @index + ) + right: (_) @rhs + ) + ); + println!("{:?}", query8); + let query9 = yeast::query!( + (program + child: (assignment + left: (_) @left + right: (_) @right + ) + ) + ); + println!("{:?}", query9); + } +} diff --git a/shared/yeast/src/range.rs b/shared/yeast/src/range.rs new file mode 100644 index 00000000000..ec670b438d5 --- /dev/null +++ b/shared/yeast/src/range.rs @@ -0,0 +1,21 @@ +//! 
(de)-serialize helpers for tree_sitter::Range + +use serde::{Deserialize, Serialize}; + +#[derive(Serialize, Deserialize)] +#[serde(remote = "tree_sitter::Point")] +pub struct Point { + pub row: usize, + pub column: usize, +} + +#[derive(Serialize, Deserialize)] +#[serde(remote = "tree_sitter::Range")] +pub struct Range { + pub start_byte: usize, + pub end_byte: usize, + #[serde(with = "Point")] + pub start_point: tree_sitter::Point, + #[serde(with = "Point")] + pub end_point: tree_sitter::Point, +} diff --git a/shared/yeast/src/schema.rs b/shared/yeast/src/schema.rs new file mode 100644 index 00000000000..6993c979320 --- /dev/null +++ b/shared/yeast/src/schema.rs @@ -0,0 +1,132 @@ +use std::collections::BTreeMap; + +use crate::{FieldId, KindId, CHILD_FIELD}; + +/// A schema defining node kinds and field names for the output AST. +/// Built from a node-types.yml file, independent of any tree-sitter grammar. +#[derive(Clone)] +pub struct Schema { + field_ids: BTreeMap, + field_names: BTreeMap, + next_field_id: FieldId, + kind_ids: BTreeMap, + unnamed_kind_ids: BTreeMap, + kind_names: BTreeMap, + next_kind_id: KindId, +} + +impl Schema { + pub fn new() -> Self { + Self { + field_ids: BTreeMap::new(), + field_names: BTreeMap::new(), + next_field_id: 1, // 0 is reserved + kind_ids: BTreeMap::new(), + unnamed_kind_ids: BTreeMap::new(), + kind_names: BTreeMap::new(), + next_kind_id: 1, // 0 is reserved + } + } + + /// Create a schema from a tree-sitter language, importing all its + /// known field and kind names. 
+ pub fn from_language(language: &tree_sitter::Language) -> Self { + let mut schema = Self::new(); + // Import all field names, preserving tree-sitter's IDs + for id in 1..=language.field_count() as u16 { + if let Some(name) = language.field_name_for_id(id) { + schema.field_ids.insert(name.to_string(), id); + schema.field_names.insert(id, name); + if id >= schema.next_field_id { + schema.next_field_id = id + 1; + } + } + } + // Import all node kind names, preserving tree-sitter's IDs. + // Track named and unnamed variants separately. + // For named kinds, use the canonical ID from id_for_node_kind(name, true) + // since some languages have multiple IDs for the same named kind. + for id in 0..language.node_kind_count() as u16 { + if let Some(name) = language.node_kind_for_id(id) { + if !name.is_empty() { + let is_named = language.node_kind_is_named(id); + if is_named { + let canonical_id = language.id_for_node_kind(name, true); + if canonical_id != 0 && !schema.kind_ids.contains_key(name) { + schema.kind_ids.insert(name.to_string(), canonical_id); + schema.kind_names.insert(canonical_id, name); + } + } else { + // For unnamed kinds, only insert if we don't already have one + // (some languages have multiple unnamed IDs for the same text) + schema.unnamed_kind_ids.entry(name.to_string()).or_insert(id); + } + // Always track the name for any ID we encounter + schema.kind_names.entry(id).or_insert(name); + if id >= schema.next_kind_id { + schema.next_kind_id = id + 1; + } + } + } + } + schema + } + + /// Register a field name, returning its ID. + /// If already registered, returns the existing ID. 
+ pub fn register_field(&mut self, name: &str) -> FieldId { + if name == "child" { + return CHILD_FIELD; + } + if let Some(&id) = self.field_ids.get(name) { + return id; + } + let id = self.next_field_id; + assert!(id < CHILD_FIELD, "too many fields"); + self.next_field_id += 1; + let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); + self.field_ids.insert(name.to_string(), id); + self.field_names.insert(id, leaked); + id + } + + /// Register a node kind name, returning its ID. + /// If already registered, returns the existing ID. + pub fn register_kind(&mut self, name: &str) -> KindId { + if let Some(&id) = self.kind_ids.get(name) { + return id; + } + let id = self.next_kind_id; + self.next_kind_id += 1; + let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); + self.kind_ids.insert(name.to_string(), id); + self.kind_names.insert(id, leaked); + id + } + + pub fn field_id_for_name(&self, name: &str) -> Option { + if name == "child" { + return Some(CHILD_FIELD); + } + self.field_ids.get(name).copied() + } + + pub fn field_name_for_id(&self, id: FieldId) -> Option<&'static str> { + if id == CHILD_FIELD { + return Some("child"); + } + self.field_names.get(&id).copied() + } + + pub fn id_for_node_kind(&self, kind: &str) -> Option { + self.kind_ids.get(kind).copied() + } + + pub fn id_for_unnamed_node_kind(&self, kind: &str) -> Option { + self.unnamed_kind_ids.get(kind).copied() + } + + pub fn node_kind_for_id(&self, id: KindId) -> Option<&'static str> { + self.kind_names.get(&id).copied() + } +} diff --git a/shared/yeast/src/tree_builder.rs b/shared/yeast/src/tree_builder.rs new file mode 100644 index 00000000000..b3b4c832c4f --- /dev/null +++ b/shared/yeast/src/tree_builder.rs @@ -0,0 +1,37 @@ +use std::collections::BTreeMap; +use std::cell::Cell; + +/// Tracks fresh identifier generation during a single tree-building operation. +/// All occurrences of the same `$name` within one build share the same generated value. 
/// Generates fresh identifiers for `$name` placeholders.
///
/// Within one scope every occurrence of the same name resolves to the same
/// generated value; [`FreshScope::next_scope`] starts a new scope while the
/// numbering keeps increasing.
pub struct FreshScope {
    /// Number used for the next generated identifier.
    counter: Cell<usize>,
    /// Names already resolved in the current scope, with their generated values.
    resolved: std::cell::RefCell<BTreeMap<String, String>>,
}

impl Default for FreshScope {
    fn default() -> Self {
        Self::new()
    }
}

impl FreshScope {
    /// Creates a scope with no resolved names and the counter at 0.
    pub fn new() -> Self {
        Self {
            counter: Cell::new(0),
            resolved: std::cell::RefCell::new(BTreeMap::new()),
        }
    }

    /// Returns the generated identifier for `name`, of the form
    /// `$<name>-<counter>`. The first call for a name in the current scope
    /// allocates the next counter value; later calls return the same string.
    pub fn resolve(&self, name: &str) -> String {
        let mut resolved = self.resolved.borrow_mut();
        // Look up by `&str` first so a cache hit does not allocate a key.
        if let Some(existing) = resolved.get(name) {
            return existing.clone();
        }
        let id = self.counter.get();
        self.counter.set(id + 1);
        let generated = format!("${name}-{id}");
        resolved.insert(name.to_string(), generated.clone());
        generated
    }

    /// Clear resolved names but keep the counter. Called between rule
    /// applications so that `$tmp` in different rules gets different values
    /// while the counter increases monotonically.
    pub fn next_scope(&self) {
        self.resolved.borrow_mut().clear();
    }
}
+#[derive(Debug)] +pub(crate) struct Visitor { + nodes: Vec, + current: Option, + language: Language, +} + +impl Visitor { + pub fn new(language: Language) -> Self { + Self { + nodes: Vec::new(), + current: None, + language, + } + } + + pub fn visit(&mut self, tree: &Tree) { + let cursor = &mut tree.walk(); + self.enter_node(cursor.node()); + let mut recurse = true; + loop { + if recurse && cursor.goto_first_child() { + recurse = self.enter_node(cursor.node()); + } else { + self.leave_node(cursor.field_name(), cursor.node()); + + if cursor.goto_next_sibling() { + recurse = self.enter_node(cursor.node()); + } else if cursor.goto_parent() { + recurse = false; + } else { + break; + } + } + } + } + + pub fn build_with_schema(self, schema: crate::schema::Schema) -> Ast { + Ast { + root: self.nodes[0].inner.id, + schema, + nodes: self.nodes.into_iter().map(|n| n.inner).collect(), + } + } + + fn add_node(&mut self, n: tree_sitter::Node<'_>, content: NodeContent, is_named: bool) -> Id { + let id = self.nodes.len(); + self.nodes.push(VisitorNode { + inner: Node { + id, + kind: self.language.id_for_node_kind(n.kind(), is_named), + kind_name: n.kind(), + content, + fields: BTreeMap::new(), + is_missing: n.is_missing(), + is_named: n.is_named(), + is_extra: n.is_extra(), + is_error: n.is_error(), + source_range: None, + }, + parent: self.current, + }); + id + } + + fn enter_node(&mut self, node: tree_sitter::Node<'_>) -> bool { + let id = self.add_node(node, node.range().into(), node.is_named()); + self.current = Some(id); + true + } + + fn leave_node(&mut self, field_name: Option<&'static str>, _node: tree_sitter::Node<'_>) { + let node = self.current.map(|i| &self.nodes[i]).unwrap(); + let node_id = node.inner.id; + let node_parent = node.parent; + + if let Some(parent_id) = node.parent { + let parent = self.nodes.get_mut(parent_id).unwrap(); + if let Some(field) = field_name { + let field_id = self.language.field_id_for_name(field).unwrap().get(); + parent + .inner + 
.fields + .entry(field_id) + .or_default() + .push(node_id); + } else { + parent + .inner + .fields + .entry(CHILD_FIELD) + .or_default() + .push(node_id); + } + } + + self.current = node_parent; + } +}