yeast: Support capturing unnamed nodes in queries

Three improvements to the query parser, all aimed at allowing query
patterns to refer to unnamed tokens:

1. Bare-literal capture: `"=" @op` now captures the unnamed `=` token,
   matching the parenthesized form `("=") @op`. Previously the literal
   branch in parse_query_list skipped the maybe_wrap_capture call, so
   the `@op` was left unconsumed and caused a parse error.

2. Bare `_` matches any node, named or unnamed. Previously bare `_` and
   `(_)` both produced QueryNode::Any with the same matches_named_only
   behaviour, so bare `_` would skip unnamed children. Now Any carries a
   match_unnamed flag: false for `(_)` (named-only, tree-sitter default)
   and true for bare `_` (any node).

3. Named fields and bare child patterns may be intermixed in any order.
   Previously, once parse_query_fields saw a bare pattern it would stop
   accepting named fields. The fix accumulates bare patterns into the
   implicit `child` field and keeps parsing.

Each named field independently selects its target field for matching, so
the source-order of fields in the query is purely cosmetic and intermixing
is safe.

Add tests covering parenthesized capture, bare-literal capture, the
named-vs-any distinction between `(_)` and bare `_`, and bare `_` / bare
literals in field position. Update query-syntax docs to reflect all three.
This commit is contained in:
Taus
2026-05-07 12:18:04 +00:00
parent a0a0e9e9a7
commit a4df96aad6
5 changed files with 200 additions and 27 deletions

View File

@@ -9,14 +9,21 @@ mod parse;
///
/// ```text
/// (_) - match any named node (skips unnamed tokens)
/// _ - match any node, named or unnamed
/// (kind) - match a named node of the given kind
/// ("literal") - match an unnamed token by its text
/// "literal" - shorthand for `("literal")`
/// (kind field: (pattern)) - match with named field
/// (kind (pat) (pat)...) - match unnamed children (after all fields)
/// (kind field: _) - bare `_` and bare literals work in field position too
/// (kind (pat) (pat)...) - match unnamed children
/// (pattern) @capture - capture the matched node
/// "literal" @capture - capture an unnamed token
/// _ @capture - capture any node
/// (pattern)* @capture - capture each repeated match
/// (pattern)? - zero or one
/// ```
///
/// Named fields and bare child patterns may be intermixed in any order.
#[proc_macro]
pub fn query(input: TokenStream) -> TokenStream {
let input2: TokenStream2 = input.into();

View File

@@ -38,7 +38,8 @@ fn parse_query_node(tokens: &mut Tokens) -> Result<TokenStream> {
}
}
/// Parse a query atom: `(kind fields...)` or `(kind fields... bare_children...)`.
/// Parse a query atom: a parenthesized node, a bare `_` (any node), or a
/// bare string literal (unnamed token).
/// Does not handle `@capture` — that's handled by the caller as a postfix.
fn parse_query_atom(tokens: &mut Tokens) -> Result<TokenStream> {
match tokens.peek() {
@@ -58,9 +59,17 @@ fn parse_query_atom(tokens: &mut Tokens) -> Result<TokenStream> {
}
Ok(result)
}
Some(TokenTree::Ident(id)) if *id == "_" => {
tokens.next();
Ok(quote! { yeast::query::QueryNode::Any { match_unnamed: true } })
}
Some(TokenTree::Literal(_)) => {
let lit = expect_literal(tokens)?;
Ok(quote! { yeast::query::QueryNode::UnnamedNode { kind: #lit } })
}
Some(tok) => Err(syn::Error::new_spanned(
tok.clone(),
"expected `(` in query; use `(_) @name` to capture a wildcard",
"expected `(`, `_`, or string literal in query",
)),
}
}
@@ -74,7 +83,7 @@ fn parse_query_node_inner(tokens: &mut Tokens) -> Result<TokenStream> {
)),
Some(TokenTree::Ident(id)) if *id == "_" => {
tokens.next();
Ok(quote! { yeast::query::QueryNode::Any() })
Ok(quote! { yeast::query::QueryNode::Any { match_unnamed: false } })
}
Some(TokenTree::Literal(_)) => {
let lit = expect_literal(tokens)?;
@@ -98,11 +107,14 @@ fn parse_query_node_inner(tokens: &mut Tokens) -> Result<TokenStream> {
}
}
/// Parse zero or more field specifications and trailing bare patterns.
/// Named fields: `name: pattern` or `name*: (list...)`.
/// Bare patterns (no field name) become implicit `child` field entries.
/// Parse zero or more field specifications and bare patterns.
/// Named fields: `name: pattern`. Bare patterns (no field name) become
/// implicit `child` field entries. Named fields and bare patterns may
/// appear in any order; bare patterns are accumulated and emitted as a
/// single `("child", ...)` entry.
fn parse_query_fields(tokens: &mut Tokens) -> Result<Vec<TokenStream>> {
let mut fields = Vec::new();
let mut bare_children: Vec<TokenStream> = Vec::new();
while tokens.peek().is_some() {
if peek_is_field(tokens) {
let field_name = expect_ident(tokens, "expected field name")?;
@@ -115,16 +127,21 @@ fn parse_query_fields(tokens: &mut Tokens) -> Result<Vec<TokenStream>> {
(#field_str, vec![yeast::query::QueryListElem::SingleNode(#child)])
});
} else {
// Bare patterns — collect as implicit `child` field
// Bare patterns — accumulate into the implicit `child` field.
// We don't break here, so we can interleave with named fields.
let elems = parse_query_list(tokens)?;
if !elems.is_empty() {
fields.push(quote! {
("child", vec![#(#elems),*])
});
if elems.is_empty() {
// Nothing more we can parse at this level.
break;
}
break;
bare_children.extend(elems);
}
}
if !bare_children.is_empty() {
fields.push(quote! {
("child", vec![#(#bare_children),*])
});
}
Ok(fields)
}
@@ -178,10 +195,11 @@ fn parse_query_list(tokens: &mut Tokens) -> Result<Vec<TokenStream>> {
continue;
}
// Check for string literal (unnamed node)
// Check for string literal (unnamed node), optionally followed by @capture
if peek_is_literal(tokens) {
let lit = expect_literal(tokens)?;
let node = quote! { yeast::query::QueryNode::UnnamedNode { kind: #lit } };
let node = maybe_wrap_capture(tokens, node)?;
let elem = maybe_wrap_repetition(
tokens,
quote! {
@@ -192,10 +210,12 @@ fn parse_query_list(tokens: &mut Tokens) -> Result<Vec<TokenStream>> {
continue;
}
// Check for bare _ (wildcard), possibly followed by @capture
// Check for bare `_` (any node, named or unnamed), possibly followed by @capture.
// Distinct from `(_)` which only matches named nodes — this matches
// tree-sitter query semantics.
if peek_is_underscore(tokens) {
tokens.next();
let node = quote! { yeast::query::QueryNode::Any() };
let node = quote! { yeast::query::QueryNode::Any { match_unnamed: true } };
let node = maybe_wrap_capture(tokens, node)?;
let elem = maybe_wrap_repetition(
tokens,

View File

@@ -103,19 +103,30 @@ Captures bind matched nodes to names for use in the transform. A capture
(identifier) @name // capture an identifier node
(_) @value // capture any named node
(identifier)* @items // capture each repeated match
("=") @op // capture an unnamed token by its text
"=" @op // shorthand for the line above
_ @anything // capture any node, named or unnamed
```
### Unnamed children
### Named vs unnamed children
Patterns that appear after all named fields match unnamed (positional)
children. Named node patterns like `(_)` automatically skip unnamed tokens
(keywords, operators, punctuation), matching tree-sitter semantics:
The two wildcard forms `(_)` and bare `_` differ:
- `(_)` matches only **named** nodes. When used as a positional pattern,
unnamed children (keywords, operators, punctuation) are skipped over to
find the next named child.
- Bare `_` matches **any** node, named or unnamed, taking whatever is next
in the child list.
Similarly, named-kind patterns like `(call ...)` skip unnamed children;
unnamed-kind patterns like `("end")` or `"end"` consume the next child
unconditionally:
```rust
(for
pattern: (_) @pat // named field
value: (in (_) @val) // "in" token is skipped automatically
body: (do (_)* @body) // "do" and "end" tokens skipped
pattern: (_) @pat // named field, captures any named node
value: (in (_) @val) // "in" wrapper is a named node here
body: (do (_)* @body) // "do" and "end" tokens skipped by (_)
)
```

View File

@@ -2,7 +2,13 @@ use crate::{captures::Captures, Ast, Id};
#[derive(Debug, Clone)]
pub enum QueryNode {
Any(),
/// A wildcard. With `match_unnamed = false` (the default for `(_)`),
/// only matches named nodes when used positionally — unnamed children
/// are skipped over. With `match_unnamed = true` (for bare `_`), the
/// wildcard consumes whatever the next child is, named or unnamed.
Any {
match_unnamed: bool,
},
Node {
kind: &'static str,
children: Vec<(&'static str, Vec<QueryListElem>)>,
@@ -24,7 +30,7 @@ impl QueryNode {
QueryNode::Node { kind, .. } => Some(kind),
QueryNode::UnnamedNode { kind } => Some(kind),
QueryNode::Capture { node, .. } => node.root_kind(),
QueryNode::Any() => None,
QueryNode::Any { .. } => None,
}
}
}
@@ -51,7 +57,7 @@ impl QueryNode {
/// semantics where `(_)` only matches named nodes.
fn matches_named_only(&self) -> bool {
match self {
QueryNode::Any() => true,
QueryNode::Any { match_unnamed } => !match_unnamed,
QueryNode::Node { .. } => true,
QueryNode::UnnamedNode { .. } => false,
QueryNode::Capture { node, .. } => node.matches_named_only(),
@@ -60,7 +66,7 @@ impl QueryNode {
pub fn do_match(&self, ast: &Ast, node: Id, matches: &mut Captures) -> Result<bool, String> {
match self {
QueryNode::Any() => Ok(true),
QueryNode::Any { .. } => Ok(true),
QueryNode::Node { kind, children } => {
let node = ast.get_node(node).unwrap();
let target_kind = ast

View File

@@ -170,6 +170,135 @@ fn test_query_repeated_capture() {
assert_eq!(captures.get_all("names").len(), 3);
}
#[test]
fn test_capture_unnamed_node_parenthesized() {
    // The parenthesized literal form `("=") @op` should bind the unnamed
    // `=` token that sits between the `left` and `right` fields.
    let ruby = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]);
    let ast = ruby.run("x = 1").unwrap();

    // Step down to the first child, which is the node the query is run on.
    let mut walker = AstCursor::new(&ast);
    walker.goto_first_child();
    let target = walker.node().id();

    let pattern = yeast::query!(
        (assignment
            left: (_) @lhs
            ("=") @op
            right: (_) @rhs
        )
    );

    let mut bindings = yeast::captures::Captures::new();
    let matched = pattern.do_match(&ast, target, &mut bindings).unwrap();
    assert!(matched);

    // The capture must be the operator token itself, and it must be unnamed.
    let op_node = ast.get_node(bindings.get_var("op").unwrap()).unwrap();
    assert_eq!(op_node.kind(), "=");
    assert!(!op_node.is_named());
}
#[test]
fn test_capture_unnamed_node_bare_literal() {
    // Dropping the parentheses must not change the result: `"=" @op` binds
    // the same unnamed `=` token that `("=") @op` does.
    let ruby = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]);
    let ast = ruby.run("x = 1").unwrap();

    // Step down to the first child, which is the node the query is run on.
    let mut walker = AstCursor::new(&ast);
    walker.goto_first_child();
    let target = walker.node().id();

    let pattern = yeast::query!(
        (assignment
            left: (_) @lhs
            "=" @op
            right: (_) @rhs
        )
    );

    let mut bindings = yeast::captures::Captures::new();
    let matched = pattern.do_match(&ast, target, &mut bindings).unwrap();
    assert!(matched);

    // The capture must be the operator token itself, and it must be unnamed.
    let op_node = ast.get_node(bindings.get_var("op").unwrap()).unwrap();
    assert_eq!(op_node.kind(), "=");
    assert!(!op_node.is_named());
}
#[test]
fn test_bare_underscore_matches_unnamed() {
    // Contrast the two wildcard spellings on the implicit `child` field of
    // an `assignment`: `(_)` accepts named nodes only, whereas bare `_`
    // accepts any node, including the unnamed `=` token.
    let ruby = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]);
    let ast = ruby.run("x = 1").unwrap();

    let mut walker = AstCursor::new(&ast);
    walker.goto_first_child();
    let target = walker.node().id();

    // Named-only wildcard: the assignment's sole unfielded child is the
    // unnamed `=`, which `(_)` skips, so the pattern cannot match.
    let named_only = yeast::query!((assignment (_) @any));
    let mut named_bindings = yeast::captures::Captures::new();
    let matched = named_only
        .do_match(&ast, target, &mut named_bindings)
        .unwrap();
    assert!(
        !matched,
        "(_) should skip the unnamed `=` and fail to match"
    );

    // Any-node wildcard: bare `_` takes the next child regardless of
    // whether it is named, so it lands on the `=` token.
    let any_node_query = yeast::query!((assignment _ @any));
    let mut any_bindings = yeast::captures::Captures::new();
    let matched = any_node_query
        .do_match(&ast, target, &mut any_bindings)
        .unwrap();
    assert!(matched, "_ should match the unnamed `=`");

    let captured_id = any_bindings.get_var("any").unwrap();
    let any_node = ast.get_node(captured_id).unwrap();
    assert_eq!(any_node.kind(), "=");
    assert!(!any_node.is_named());
}
#[test]
fn test_bare_forms_in_field_position() {
// The bare `_` and bare-literal forms should be accepted as a
// field's value, not just in the bare-children position. A bare
// literal is shorthand for `("…")`. Bare `_` is NOT shorthand for
// `(_)`: it is the any-node wildcard (named or unnamed), whereas
// `(_)` only admits named nodes.
let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]);
let ast = runner.run("x = 1").unwrap();
// Move to the first child, which is the node the queries below run on.
let mut cursor = AstCursor::new(&ast);
cursor.goto_first_child();
let assignment_id = cursor.node().id();
// Bare `_` in field position. Captures the named `identifier "x"`
// child of the `left` field — bare `_` admits unnamed too, but the
// first child of `left` happens to be named.
let query = yeast::query!((assignment left: _ @lhs));
let mut captures = yeast::captures::Captures::new();
let matched = query.do_match(&ast, assignment_id, &mut captures).unwrap();
assert!(matched);
assert_eq!(
ast.get_node(captures.get_var("lhs").unwrap())
.unwrap()
.kind(),
"identifier"
);
// Bare literal in field position, here the explicitly named `child`
// field. Equivalent to `child: ("=") @op`.
let query = yeast::query!((assignment child: "=" @op));
let mut captures = yeast::captures::Captures::new();
let matched = query.do_match(&ast, assignment_id, &mut captures).unwrap();
assert!(matched);
// The captured node must be the unnamed `=` operator token.
let op = ast.get_node(captures.get_var("op").unwrap()).unwrap();
assert_eq!(op.kind(), "=");
assert!(!op.is_named());
}
// ---- Tree builder tests ----
#[test]