mirror of
https://github.com/github/codeql.git
synced 2026-05-14 11:19:27 +02:00
Merge pull request #21810 from github/tausbn/yeast-forward-scan-queries
yeast: Align query semantics more closely with tree-sitter
This commit is contained in:
@@ -9,14 +9,21 @@ mod parse;
|
||||
///
|
||||
/// ```text
|
||||
/// (_) - match any named node (skips unnamed tokens)
|
||||
/// _ - match any node, named or unnamed
|
||||
/// (kind) - match a named node of the given kind
|
||||
/// ("literal") - match an unnamed token by its text
|
||||
/// "literal" - shorthand for `("literal")`
|
||||
/// (kind field: (pattern)) - match with named field
|
||||
/// (kind (pat) (pat)...) - match unnamed children (after all fields)
|
||||
/// (kind field: _) - bare `_` and bare literals work in field position too
|
||||
/// (kind (pat) (pat)...) - match unnamed children
|
||||
/// (pattern) @capture - capture the matched node
|
||||
/// "literal" @capture - capture an unnamed token
|
||||
/// _ @capture - capture any node
|
||||
/// (pattern)* @capture - capture each repeated match
|
||||
/// (pattern)? - zero or one
|
||||
/// ```
|
||||
///
|
||||
/// Named fields and bare child patterns may be intermixed in any order.
|
||||
#[proc_macro]
|
||||
pub fn query(input: TokenStream) -> TokenStream {
|
||||
let input2: TokenStream2 = input.into();
|
||||
|
||||
@@ -38,7 +38,8 @@ fn parse_query_node(tokens: &mut Tokens) -> Result<TokenStream> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse a query atom: `(kind fields...)` or `(kind fields... bare_children...)`.
|
||||
/// Parse a query atom: a parenthesized node, a bare `_` (any node), or a
|
||||
/// bare string literal (unnamed token).
|
||||
/// Does not handle `@capture` — that's handled by the caller as a postfix.
|
||||
fn parse_query_atom(tokens: &mut Tokens) -> Result<TokenStream> {
|
||||
match tokens.peek() {
|
||||
@@ -58,9 +59,17 @@ fn parse_query_atom(tokens: &mut Tokens) -> Result<TokenStream> {
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
Some(TokenTree::Ident(id)) if *id == "_" => {
|
||||
tokens.next();
|
||||
Ok(quote! { yeast::query::QueryNode::Any { match_unnamed: true } })
|
||||
}
|
||||
Some(TokenTree::Literal(_)) => {
|
||||
let lit = expect_literal(tokens)?;
|
||||
Ok(quote! { yeast::query::QueryNode::UnnamedNode { kind: #lit } })
|
||||
}
|
||||
Some(tok) => Err(syn::Error::new_spanned(
|
||||
tok.clone(),
|
||||
"expected `(` in query; use `(_) @name` to capture a wildcard",
|
||||
"expected `(`, `_`, or string literal in query",
|
||||
)),
|
||||
}
|
||||
}
|
||||
@@ -74,7 +83,7 @@ fn parse_query_node_inner(tokens: &mut Tokens) -> Result<TokenStream> {
|
||||
)),
|
||||
Some(TokenTree::Ident(id)) if *id == "_" => {
|
||||
tokens.next();
|
||||
Ok(quote! { yeast::query::QueryNode::Any() })
|
||||
Ok(quote! { yeast::query::QueryNode::Any { match_unnamed: false } })
|
||||
}
|
||||
Some(TokenTree::Literal(_)) => {
|
||||
let lit = expect_literal(tokens)?;
|
||||
@@ -98,11 +107,14 @@ fn parse_query_node_inner(tokens: &mut Tokens) -> Result<TokenStream> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse zero or more field specifications and trailing bare patterns.
|
||||
/// Named fields: `name: pattern` or `name*: (list...)`.
|
||||
/// Bare patterns (no field name) become implicit `child` field entries.
|
||||
/// Parse zero or more field specifications and bare patterns.
|
||||
/// Named fields: `name: pattern`. Bare patterns (no field name) become
|
||||
/// implicit `child` field entries. Named fields and bare patterns may
|
||||
/// appear in any order; bare patterns are accumulated and emitted as a
|
||||
/// single `("child", ...)` entry.
|
||||
fn parse_query_fields(tokens: &mut Tokens) -> Result<Vec<TokenStream>> {
|
||||
let mut fields = Vec::new();
|
||||
let mut bare_children: Vec<TokenStream> = Vec::new();
|
||||
while tokens.peek().is_some() {
|
||||
if peek_is_field(tokens) {
|
||||
let field_name = expect_ident(tokens, "expected field name")?;
|
||||
@@ -115,16 +127,21 @@ fn parse_query_fields(tokens: &mut Tokens) -> Result<Vec<TokenStream>> {
|
||||
(#field_str, vec![yeast::query::QueryListElem::SingleNode(#child)])
|
||||
});
|
||||
} else {
|
||||
// Bare patterns — collect as implicit `child` field
|
||||
// Bare patterns — accumulate into the implicit `child` field.
|
||||
// We don't break here, so we can interleave with named fields.
|
||||
let elems = parse_query_list(tokens)?;
|
||||
if !elems.is_empty() {
|
||||
fields.push(quote! {
|
||||
("child", vec![#(#elems),*])
|
||||
});
|
||||
if elems.is_empty() {
|
||||
// Nothing more we can parse at this level.
|
||||
break;
|
||||
}
|
||||
break;
|
||||
bare_children.extend(elems);
|
||||
}
|
||||
}
|
||||
if !bare_children.is_empty() {
|
||||
fields.push(quote! {
|
||||
("child", vec![#(#bare_children),*])
|
||||
});
|
||||
}
|
||||
Ok(fields)
|
||||
}
|
||||
|
||||
@@ -178,10 +195,11 @@ fn parse_query_list(tokens: &mut Tokens) -> Result<Vec<TokenStream>> {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check for string literal (unnamed node)
|
||||
// Check for string literal (unnamed node), optionally followed by @capture
|
||||
if peek_is_literal(tokens) {
|
||||
let lit = expect_literal(tokens)?;
|
||||
let node = quote! { yeast::query::QueryNode::UnnamedNode { kind: #lit } };
|
||||
let node = maybe_wrap_capture(tokens, node)?;
|
||||
let elem = maybe_wrap_repetition(
|
||||
tokens,
|
||||
quote! {
|
||||
@@ -192,10 +210,12 @@ fn parse_query_list(tokens: &mut Tokens) -> Result<Vec<TokenStream>> {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check for bare _ (wildcard), possibly followed by @capture
|
||||
// Check for bare `_` (any node, named or unnamed), possibly followed by @capture.
|
||||
// Distinct from `(_)` which only matches named nodes — this matches
|
||||
// tree-sitter query semantics.
|
||||
if peek_is_underscore(tokens) {
|
||||
tokens.next();
|
||||
let node = quote! { yeast::query::QueryNode::Any() };
|
||||
let node = quote! { yeast::query::QueryNode::Any { match_unnamed: true } };
|
||||
let node = maybe_wrap_capture(tokens, node)?;
|
||||
let elem = maybe_wrap_repetition(
|
||||
tokens,
|
||||
|
||||
@@ -119,19 +119,40 @@ Captures bind matched nodes to names for use in the transform. A capture
|
||||
(identifier) @name // capture an identifier node
|
||||
(_) @value // capture any named node
|
||||
(identifier)* @items // capture each repeated match
|
||||
("=") @op // capture an unnamed token by its text
|
||||
"=" @op // shorthand for the line above
|
||||
_ @anything // capture any node, named or unnamed
|
||||
```
|
||||
|
||||
### Unnamed children
|
||||
### Named vs unnamed children
|
||||
|
||||
Patterns that appear after all named fields match unnamed (positional)
|
||||
children. Named node patterns like `(_)` automatically skip unnamed tokens
|
||||
(keywords, operators, punctuation), matching tree-sitter semantics:
|
||||
The two wildcard forms `(_)` and bare `_` differ:
|
||||
|
||||
- `(_)` matches only **named** nodes. When used as a positional pattern,
|
||||
unnamed children (keywords, operators, punctuation) are skipped over.
|
||||
- Bare `_` matches **any** node, named or unnamed, taking whatever is next
|
||||
in the child list.
|
||||
|
||||
Bare child patterns are matched **forward-scan**: each pattern advances
|
||||
through the iterator until it finds a child that matches, skipping
|
||||
non-matching children along the way. So `(foo ("baz"))` against a `foo`
|
||||
whose children are `[bar, baz]` succeeds — the matcher scans past `bar`
|
||||
and matches `baz`. The iterator advances as it goes, so subsequent
|
||||
patterns can never match children that appear earlier in source order
|
||||
than already-matched ones.
|
||||
|
||||
For named-only patterns (`(_)`, `(some_kind ...)`), the scan additionally
|
||||
skips past unnamed tokens without trying to match them, since they can
|
||||
never match anyway.
|
||||
|
||||
Anchors (`.`) for forcing immediate adjacency, like in tree-sitter
|
||||
queries, are not supported.
|
||||
|
||||
```rust
|
||||
(for
|
||||
pattern: (_) @pat // named field
|
||||
value: (in (_) @val) // "in" token is skipped automatically
|
||||
body: (do (_)* @body) // "do" and "end" tokens skipped
|
||||
pattern: (_) @pat // named field, captures any named node
|
||||
value: (in (_) @val) // "in" wrapper is a named node here
|
||||
body: (do (_)* @body) // "do" and "end" tokens skipped by (_)
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
@@ -2,7 +2,13 @@ use crate::{captures::Captures, Ast, Id};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum QueryNode {
|
||||
Any(),
|
||||
/// A wildcard. With `match_unnamed = false` (the default for `(_)`),
|
||||
/// only matches named nodes when used positionally — unnamed children
|
||||
/// are skipped over. With `match_unnamed = true` (for bare `_`), the
|
||||
/// wildcard consumes whatever the next child is, named or unnamed.
|
||||
Any {
|
||||
match_unnamed: bool,
|
||||
},
|
||||
Node {
|
||||
kind: &'static str,
|
||||
children: Vec<(&'static str, Vec<QueryListElem>)>,
|
||||
@@ -24,7 +30,7 @@ impl QueryNode {
|
||||
QueryNode::Node { kind, .. } => Some(kind),
|
||||
QueryNode::UnnamedNode { kind } => Some(kind),
|
||||
QueryNode::Capture { node, .. } => node.root_kind(),
|
||||
QueryNode::Any() => None,
|
||||
QueryNode::Any { .. } => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -51,7 +57,7 @@ impl QueryNode {
|
||||
/// semantics where `(_)` only matches named nodes.
|
||||
fn matches_named_only(&self) -> bool {
|
||||
match self {
|
||||
QueryNode::Any() => true,
|
||||
QueryNode::Any { match_unnamed } => !match_unnamed,
|
||||
QueryNode::Node { .. } => true,
|
||||
QueryNode::UnnamedNode { .. } => false,
|
||||
QueryNode::Capture { node, .. } => node.matches_named_only(),
|
||||
@@ -60,7 +66,7 @@ impl QueryNode {
|
||||
|
||||
pub fn do_match(&self, ast: &Ast, node: Id, matches: &mut Captures) -> Result<bool, String> {
|
||||
match self {
|
||||
QueryNode::Any() => Ok(true),
|
||||
QueryNode::Any { .. } => Ok(true),
|
||||
QueryNode::Node { kind, children } => {
|
||||
let node = ast.get_node(node).unwrap();
|
||||
let target_kind = ast
|
||||
@@ -161,25 +167,28 @@ impl QueryListElem {
|
||||
}
|
||||
}
|
||||
QueryListElem::SingleNode(sub_query) => {
|
||||
if sub_query.matches_named_only() {
|
||||
// Skip unnamed children, matching tree-sitter semantics
|
||||
// where (_) only matches named nodes.
|
||||
loop {
|
||||
match remaining_children.next() {
|
||||
Some(child) => {
|
||||
let node = ast.get_node(child).unwrap();
|
||||
if node.is_named() {
|
||||
return sub_query.do_match(ast, child, matches);
|
||||
}
|
||||
// Skip unnamed child, continue to next
|
||||
}
|
||||
None => return Ok(false),
|
||||
// Forward-scan semantics: advance through the iterator until
|
||||
// we find a child that matches `sub_query`. Skip ahead past
|
||||
// unnamed children when the sub-query is named-only (so they
|
||||
// can never match anyway). On a match attempt that fails,
|
||||
// restore the captures so partial captures from a complex
|
||||
// sub-query don't leak.
|
||||
let skip_unnamed = sub_query.matches_named_only();
|
||||
loop {
|
||||
let Some(child) = remaining_children.next() else {
|
||||
return Ok(false);
|
||||
};
|
||||
if skip_unnamed {
|
||||
let node = ast.get_node(child).unwrap();
|
||||
if !node.is_named() {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} else if let Some(child) = remaining_children.next() {
|
||||
sub_query.do_match(ast, child, matches)
|
||||
} else {
|
||||
Ok(false)
|
||||
let snapshot = matches.clone();
|
||||
if sub_query.do_match(ast, child, matches)? {
|
||||
return Ok(true);
|
||||
}
|
||||
*matches = snapshot;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -61,9 +61,10 @@ impl Schema {
|
||||
}
|
||||
}
|
||||
// Import all node kind names, preserving tree-sitter's IDs.
|
||||
// Track named and unnamed variants separately.
|
||||
// For named kinds, use the canonical ID from id_for_node_kind(name, true)
|
||||
// since some languages have multiple IDs for the same named kind.
|
||||
// Track named and unnamed variants separately. For both named and
|
||||
// unnamed kinds, use the canonical ID from id_for_node_kind, since
|
||||
// some languages have multiple IDs for the same name (e.g., the
|
||||
// reserved error token at ID 0 may share a name with a real token).
|
||||
for id in 0..language.node_kind_count() as u16 {
|
||||
if let Some(name) = language.node_kind_for_id(id) {
|
||||
if !name.is_empty() {
|
||||
@@ -75,12 +76,13 @@ impl Schema {
|
||||
schema.kind_names.insert(canonical_id, name);
|
||||
}
|
||||
} else {
|
||||
// For unnamed kinds, only insert if we don't already have one
|
||||
// (some languages have multiple unnamed IDs for the same text)
|
||||
schema
|
||||
.unnamed_kind_ids
|
||||
.entry(name.to_string())
|
||||
.or_insert(id);
|
||||
let canonical_id = language.id_for_node_kind(name, false);
|
||||
if canonical_id != 0 && !schema.unnamed_kind_ids.contains_key(name) {
|
||||
schema
|
||||
.unnamed_kind_ids
|
||||
.insert(name.to_string(), canonical_id);
|
||||
schema.kind_names.insert(canonical_id, name);
|
||||
}
|
||||
}
|
||||
// Always track the name for any ID we encounter
|
||||
schema.kind_names.entry(id).or_insert(name);
|
||||
|
||||
@@ -190,6 +190,187 @@ fn test_query_repeated_capture() {
|
||||
assert_eq!(captures.get_all("names").len(), 3);
|
||||
}
|
||||
|
||||
#[test]
fn test_capture_unnamed_node_parenthesized() {
    // The parenthesized literal form `("=") @op`, placed between the `left`
    // and `right` fields, must capture the unnamed `=` token.
    let pattern = yeast::query!(
        (assignment
            left: (_) @lhs
            ("=") @op
            right: (_) @rhs
        )
    );

    let ruby = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]);
    let ast = ruby.run("x = 1").unwrap();

    // Step from the root down to its first child and match against that node.
    let mut walker = AstCursor::new(&ast);
    walker.goto_first_child();
    let target = walker.node().id();

    let mut bound = yeast::captures::Captures::new();
    let matched = pattern.do_match(&ast, target, &mut bound).unwrap();
    assert!(matched);

    // The captured operator is the `=` token, and it is an unnamed node.
    let op_node = ast.get_node(bound.get_var("op").unwrap()).unwrap();
    assert_eq!(op_node.kind(), "=");
    assert!(!op_node.is_named());
}
|
||||
|
||||
#[test]
fn test_capture_unnamed_node_bare_literal() {
    // A literal written without parentheses (`"=" @op`) must behave exactly
    // like the parenthesized form `("=") @op`.
    let pattern = yeast::query!(
        (assignment
            left: (_) @lhs
            "=" @op
            right: (_) @rhs
        )
    );

    let ruby = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]);
    let ast = ruby.run("x = 1").unwrap();

    // Step from the root down to its first child and match against that node.
    let mut walker = AstCursor::new(&ast);
    walker.goto_first_child();
    let target = walker.node().id();

    let mut bound = yeast::captures::Captures::new();
    let matched = pattern.do_match(&ast, target, &mut bound).unwrap();
    assert!(matched);

    // The captured operator is the `=` token, and it is an unnamed node.
    let op_node = ast.get_node(bound.get_var("op").unwrap()).unwrap();
    assert_eq!(op_node.kind(), "=");
    assert!(!op_node.is_named());
}
|
||||
|
||||
#[test]
fn test_bare_underscore_matches_unnamed() {
    // Contrast the two wildcard spellings on the same `assignment` node:
    // `(_)` accepts only named nodes, whereas bare `_` accepts any node.
    // The unnamed `=` token in the implicit `child` field is the probe.
    let ruby = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]);
    let ast = ruby.run("x = 1").unwrap();

    let mut walker = AstCursor::new(&ast);
    walker.goto_first_child();
    let target = walker.node().id();

    // Named-only wildcard: the sole unfielded child is the unnamed `=`, which
    // `(_)` skips, so there is nothing left for it to match.
    let named_only = yeast::query!((assignment (_) @any));
    let mut named_captures = yeast::captures::Captures::new();
    let named_matched = named_only
        .do_match(&ast, target, &mut named_captures)
        .unwrap();
    assert!(
        !named_matched,
        "(_) should skip the unnamed `=` and fail to match"
    );

    // Any-node wildcard: bare `_` takes the next child regardless of whether
    // it is named, so it binds the unnamed `=` token.
    let any_node_query = yeast::query!((assignment _ @any));
    let mut any_captures = yeast::captures::Captures::new();
    let any_matched = any_node_query
        .do_match(&ast, target, &mut any_captures)
        .unwrap();
    assert!(any_matched, "_ should match the unnamed `=`");

    let captured_id = any_captures.get_var("any").unwrap();
    let captured = ast.get_node(captured_id).unwrap();
    assert_eq!(captured.kind(), "=");
    assert!(!captured.is_named());
}
|
||||
|
||||
#[test]
fn test_bare_forms_in_field_position() {
    // Bare `_` and bare string literals are accepted as a field's value,
    // not just among the bare children. A bare literal is shorthand for the
    // parenthesized form `("…")`. Bare `_` is NOT shorthand for `(_)`: the
    // macro emits `Any { match_unnamed: true }` for `_` and
    // `Any { match_unnamed: false }` for `(_)`, so `_` also admits unnamed
    // nodes.
    let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]);
    let ast = runner.run("x = 1").unwrap();

    let mut cursor = AstCursor::new(&ast);
    cursor.goto_first_child();
    let assignment_id = cursor.node().id();

    // Bare `_` in field position. It may bind a named or an unnamed node;
    // here the first child of `left` is the named `identifier` for `x`, so
    // that is what gets captured.
    let query = yeast::query!((assignment left: _ @lhs));
    let mut captures = yeast::captures::Captures::new();
    let matched = query.do_match(&ast, assignment_id, &mut captures).unwrap();
    assert!(matched);
    let lhs = ast.get_node(captures.get_var("lhs").unwrap()).unwrap();
    assert_eq!(lhs.kind(), "identifier");

    // Bare literal in field position, addressing the implicit `child` field
    // by name. Equivalent to `child: ("=") @op`.
    let query = yeast::query!((assignment child: "=" @op));
    let mut captures = yeast::captures::Captures::new();
    let matched = query.do_match(&ast, assignment_id, &mut captures).unwrap();
    assert!(matched);
    let op = ast.get_node(captures.get_var("op").unwrap()).unwrap();
    assert_eq!(op.kind(), "=");
    assert!(!op.is_named());
}
|
||||
|
||||
#[test]
fn test_forward_scan_finds_unnamed_token_late() {
    // Inside the named `do` wrapper the implicit `child` field holds, in
    // source order, the unnamed `do` keyword, the body identifier and the
    // unnamed `end` keyword. A lone `("end")` pattern therefore only matches
    // if the matcher scans forward past the first two children; a matcher
    // that unconditionally takes the first child would fail here.
    let ruby = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]);
    let ast = ruby.run("for x in list do\n y\nend").unwrap();

    // Descend program > for, then walk the siblings until the *named* `do`
    // node (the body wrapper, not the keyword token) is reached.
    let mut walker = AstCursor::new(&ast);
    walker.goto_first_child(); // for
    walker.goto_first_child(); // first child of `for`
    loop {
        if walker.node().kind() == "do" && walker.node().is_named() {
            break;
        }
        assert!(walker.goto_next_sibling(), "expected to find named `do`");
    }
    let body_id = walker.node().id();

    let pattern = yeast::query!((do ("end") @kw));
    let mut bound = yeast::captures::Captures::new();
    let matched = pattern.do_match(&ast, body_id, &mut bound).unwrap();
    assert!(matched, "forward-scan should find the `end` keyword");

    let kw_id = bound.get_var("kw").unwrap();
    let kw_node = ast.get_node(kw_id).unwrap();
    assert_eq!(kw_node.kind(), "end");
    assert!(!kw_node.is_named());
}
|
||||
|
||||
#[test]
fn test_forward_scan_preserves_order() {
    // Bare patterns consume children strictly left to right. Asking for
    // `("end")` followed by `("do")` must fail: `do` precedes `end` in the
    // source, and once the scan has consumed `end` no children remain.
    let ruby = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]);
    let ast = ruby.run("for x in list do\n y\nend").unwrap();

    // Descend two levels, then walk siblings until the named `do` node.
    let mut walker = AstCursor::new(&ast);
    walker.goto_first_child();
    walker.goto_first_child();
    loop {
        if walker.node().kind() == "do" && walker.node().is_named() {
            break;
        }
        assert!(walker.goto_next_sibling(), "expected to find named `do`");
    }
    let body_id = walker.node().id();

    let pattern = yeast::query!((do ("end") @first ("do") @second));
    let mut bound = yeast::captures::Captures::new();
    let matched = pattern.do_match(&ast, body_id, &mut bound).unwrap();
    assert!(!matched, "scan must not go backwards");
}
|
||||
|
||||
// ---- Tree builder tests ----
|
||||
|
||||
#[test]
|
||||
|
||||
Reference in New Issue
Block a user