Merge pull request #21809 from github/tausbn/yeast-add-support-for-desugaring-phases

Yeast: Two small improvements
2026-05-14 03:09:26 +02:00 · 2026-05-07 19:00:44 +02:00
parent f9e42ac443 e0d663f79b
commit b027ac3658
3 changed files with 267 additions and 45 deletions
--- a/shared/yeast/doc/yeast.md
+++ b/shared/yeast/doc/yeast.md
@@ -61,6 +61,22 @@ rule matches, the node is kept and its children are processed recursively.
 A rule can replace one node with zero nodes (deletion), one node (rewriting),
 or multiple nodes (expansion).

+By default a rule fires **at most once on a given node**: after firing, the
+engine will not re-try that same rule on the result root. Other rules may
+still fire on the result, and the rule may still fire on different nodes
+(including the result's children). To opt into iterative behaviour — when a
+rule's output is intentionally re-matched by the same rule — call
+`.repeated()` on the constructed `Rule`:
+
+```rust
+let r = yeast::rule!((foo ...) => (foo ...)).repeated();
+```
+
+Without `.repeated()`, a rule whose output happens to match its own query
+simply fires once and stops. With `.repeated()`, the rule is allowed to
+re-match indefinitely; the runner still enforces a global rewrite-depth
+limit (currently 100) as a safety net against accidental cycles.
+
 ## Query language

 Queries use a syntax inspired by
@@ -303,11 +319,17 @@ capture name to a field of the same name on the output node.
 ## Integration with the extractor

 A YEAST desugaring pass is configured with a [`DesugaringConfig`], which
-carries the rules and an optional output node-types schema (in YAML
-format). Attach it to a language spec to enable rewriting:
+carries one or more named [`Phase`]s of rules and an optional output
+node-types schema (in YAML format). Each phase is a complete traversal
+that runs to completion before the next phase starts; only the current
+phase's rules are considered during that traversal. Attach the config to
+a language spec
+to enable rewriting:

 ```rust
-let desugar = yeast::DesugaringConfig::new(my_rules)
+let desugar = yeast::DesugaringConfig::new()
+    .add_phase("cleanup", cleanup_rules())
+    .add_phase("desugar", desugar_rules())
    .with_output_node_types_yaml(include_str!("output-node-types.yml"));

 let lang = simple::LanguageSpec {
@@ -319,11 +341,14 @@ let lang = simple::LanguageSpec {
 };
 ```

+A single-phase config is just `.add_phase(...)` called once. Phase names
+appear in error messages so you can tell which phase failed.
+
 The same YAML node-types is used for both the runtime yeast `Schema` (so
 rules can refer to output-only kinds and fields) and TRAP validation (it
 is converted to JSON internally).

 For the dbscheme/QL code generator, set `Language::desugar` to a
 `DesugaringConfig` carrying the same YAML; the generator converts it to
-JSON for downstream code generation. The `rules` field of the config is
+JSON for downstream code generation. The `phases` field of the config is
 unused at code-generation time.
--- a/shared/yeast/src/lib.rs
+++ b/shared/yeast/src/lib.rs
@@ -471,11 +471,29 @@ pub type Transform = Box<
 pub struct Rule {
    query: QueryNode,
    transform: Transform,
+    /// If true, after this rule fires on a node the engine will try to
+    /// re-apply this same rule on the result root. Defaults to false:
+    /// each rule fires at most once on a given node, which prevents
+    /// accidental loops where a rule's output matches its own query.
+    repeated: bool,
 }

 impl Rule {
    pub fn new(query: QueryNode, transform: Transform) -> Self {
-        Self { query, transform }
+        Self {
+            query,
+            transform,
+            repeated: false,
+        }
+    }
+
+    /// Mark this rule as allowed to fire multiple times on the same node.
+    /// Use when the rule is intentionally iterative (its output may match
+    /// its own query). Without this, a rule fires at most once per node;
+    /// other rules can still fire on the result.
+    pub fn repeated(mut self) -> Self {
+        self.repeated = true;
+        self
    }

    fn try_rule(
@@ -537,7 +555,7 @@ fn apply_rules(
    fresh: &tree_builder::FreshScope,
 ) -> Result<Vec<Id>, String> {
    let index = RuleIndex::new(rules);
-    apply_rules_inner(&index, ast, id, fresh, 0)
+    apply_rules_inner(&index, ast, id, fresh, 0, None)
 }

 fn apply_rules_inner(
@@ -546,6 +564,7 @@ fn apply_rules_inner(
    id: Id,
    fresh: &tree_builder::FreshScope,
    rewrite_depth: usize,
+    skip_rule: Option<*const Rule>,
 ) -> Result<Vec<Id>, String> {
    if rewrite_depth > MAX_REWRITE_DEPTH {
        return Err(format!(
@@ -556,7 +575,16 @@ fn apply_rules_inner(

    let node_kind = ast.get_node(id).map(|n| n.kind()).unwrap_or("");
    for rule in index.rules_for_kind(node_kind) {
+        let rule_ptr = *rule as *const Rule;
+        if Some(rule_ptr) == skip_rule {
+            continue;
+        }
        if let Some(result_node) = rule.try_rule(ast, id, fresh)? {
+            // For non-repeated rules, suppress further application of *this*
+            // rule on the result root, so a rule whose output matches its own
+            // query doesn't loop. Other rules and child traversal are
+            // unaffected.
+            let next_skip = if rule.repeated { None } else { Some(rule_ptr) };
            let mut results = Vec::new();
            for node in result_node {
                results.extend(apply_rules_inner(
@@ -565,6 +593,7 @@ fn apply_rules_inner(
                    node,
                    fresh,
                    rewrite_depth + 1,
+                    next_skip,
                )?);
            }
            return Ok(results);
@@ -579,13 +608,14 @@ fn apply_rules_inner(
        .collect();

    // recursively descend into all the fields
-    // Child traversal does not increment rewrite depth
+    // Child traversal does not increment rewrite depth and starts fresh
+    // (no rule is skipped on child subtrees).
    let mut changed = false;
    let mut new_fields = BTreeMap::new();
    for (field_id, children) in field_entries {
        let mut new_children = Vec::new();
        for child_id in children {
-            let result = apply_rules_inner(index, ast, child_id, fresh, rewrite_depth)?;
+            let result = apply_rules_inner(index, ast, child_id, fresh, rewrite_depth, None)?;
            if result.len() != 1 || result[0] != child_id {
                changed = true;
            }
@@ -605,16 +635,47 @@ fn apply_rules_inner(
    Ok(vec![ast.nodes.len() - 1])
 }

-/// Configuration for a desugaring pass: a set of rules and an optional
-/// output node-types schema (in YAML format).
+/// One phase of a desugaring pass: a named bundle of rules that runs to
+/// completion (a full traversal applying its rules) before the next phase
+/// starts. Rules within a phase compete for matches as usual; rules in
+/// different phases never compete because each traversal only considers the
+/// current phase's rules.
+pub struct Phase {
+    /// Name used in error messages.
+    pub name: String,
+    pub rules: Vec<Rule>,
+}
+
+impl Phase {
+    pub fn new(name: impl Into<String>, rules: Vec<Rule>) -> Self {
+        Self {
+            name: name.into(),
+            rules,
+        }
+    }
+}
+
+/// Configuration for a desugaring pass: an ordered list of [`Phase`]s and
+/// an optional output node-types schema (in YAML format).
 ///
 /// When attached to a `LanguageSpec` (in the shared tree-sitter extractor),
 /// enables yeast-based AST rewriting before TRAP extraction. The same YAML
 /// is used both to validate TRAP output (via JSON conversion) and to
 /// resolve output-only node kinds and fields at runtime.
+///
+/// Construct with `DesugaringConfig::new()` and add phases via
+/// `add_phase`:
+///
+/// ```ignore
+/// let config = yeast::DesugaringConfig::new()
+///     .add_phase("cleanup", cleanup_rules)
+///     .add_phase("desugar", desugar_rules)
+///     .with_output_node_types_yaml(yaml);
+/// ```
+#[derive(Default)]
 pub struct DesugaringConfig {
-    /// Rules to apply during desugaring.
-    pub rules: Vec<Rule>,
+    /// Phases of rule application, applied in order.
+    pub phases: Vec<Phase>,
    /// Output node-types in YAML format. If `None`, the input grammar's
    /// node types are used (i.e. the desugared AST has the same node types
    /// as the tree-sitter grammar).
@@ -622,11 +683,16 @@ pub struct DesugaringConfig {
 }

 impl DesugaringConfig {
-    pub fn new(rules: Vec<Rule>) -> Self {
-        Self {
-            rules,
-            output_node_types_yaml: None,
-        }
+    /// Create an empty configuration. Add phases via [`add_phase`] and an
+    /// optional output schema via [`with_output_node_types_yaml`].
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Append a new phase with the given name and rules.
+    pub fn add_phase(mut self, name: impl Into<String>, rules: Vec<Rule>) -> Self {
+        self.phases.push(Phase::new(name, rules));
+        self
    }

    pub fn with_output_node_types_yaml(mut self, yaml: &'static str) -> Self {
@@ -648,17 +714,17 @@ impl DesugaringConfig {
 pub struct Runner<'a> {
    language: tree_sitter::Language,
    schema: schema::Schema,
-    rules: &'a [Rule],
+    phases: &'a [Phase],
 }

 impl<'a> Runner<'a> {
    /// Create a runner using the input grammar's schema for output.
-    pub fn new(language: tree_sitter::Language, rules: &'a [Rule]) -> Self {
+    pub fn new(language: tree_sitter::Language, phases: &'a [Phase]) -> Self {
        let schema = schema::Schema::from_language(&language);
        Self {
            language,
            schema,
-            rules,
+            phases,
        }
    }

@@ -666,12 +732,12 @@ impl<'a> Runner<'a> {
    pub fn with_schema(
        language: tree_sitter::Language,
        schema: &schema::Schema,
-        rules: &'a [Rule],
+        phases: &'a [Phase],
    ) -> Self {
        Self {
            language,
            schema: schema.clone(),
-            rules,
+            phases,
        }
    }

@@ -684,27 +750,17 @@ impl<'a> Runner<'a> {
        Ok(Self {
            language,
            schema,
-            rules: &config.rules,
+            phases: &config.phases,
        })
    }

    pub fn run_from_tree(&self, tree: &tree_sitter::Tree) -> Result<Ast, String> {
-        let fresh = tree_builder::FreshScope::new();
        let mut ast = Ast::from_tree_with_schema(self.schema.clone(), tree, &self.language);
-        let root = ast.get_root();
-        let res = apply_rules(self.rules, &mut ast, root, &fresh)?;
-        if res.len() != 1 {
-            return Err(format!(
-                "Expected exactly one result node, got {}",
-                res.len()
-            ));
-        }
-        ast.set_root(res[0]);
+        self.run_phases(&mut ast)?;
        Ok(ast)
    }

    pub fn run(&self, input: &str) -> Result<Ast, String> {
-        let fresh = tree_builder::FreshScope::new();
        let mut parser = tree_sitter::Parser::new();
        parser
            .set_language(&self.language)
@@ -713,15 +769,29 @@ impl<'a> Runner<'a> {
            .parse(input, None)
            .ok_or_else(|| "Failed to parse input".to_string())?;
        let mut ast = Ast::from_tree_with_schema(self.schema.clone(), &tree, &self.language);
-        let root = ast.get_root();
-        let res = apply_rules(self.rules, &mut ast, root, &fresh)?;
-        if res.len() != 1 {
-            return Err(format!(
-                "Expected exactly one result node, got {}",
-                res.len()
-            ));
-        }
-        ast.set_root(res[0]);
+        self.run_phases(&mut ast)?;
        Ok(ast)
    }
+
+    /// Apply each phase in turn to the AST, threading the root through.
+    /// A single `FreshScope` is shared across phases so that fresh
+    /// identifiers generated in different phases don't collide.
+    fn run_phases(&self, ast: &mut Ast) -> Result<(), String> {
+        let fresh = tree_builder::FreshScope::new();
+        let mut root = ast.get_root();
+        for phase in self.phases {
+            let res = apply_rules(&phase.rules, ast, root, &fresh)
+                .map_err(|e| format!("Phase `{}`: {e}", phase.name))?;
+            if res.len() != 1 {
+                return Err(format!(
+                    "Phase `{}`: expected exactly one result node, got {}",
+                    phase.name,
+                    res.len()
+                ));
+            }
+            root = res[0];
+        }
+        ast.set_root(root);
+        Ok(())
+    }
 }
--- a/shared/yeast/tests/test.rs
+++ b/shared/yeast/tests/test.rs
@@ -12,16 +12,36 @@ fn parse_and_dump(input: &str) -> String {
    dump_ast(&ast, ast.get_root(), input)
 }

-/// Helper: parse Ruby source with a custom output schema and rules, return dump.
+/// Helper: parse Ruby source with a custom output schema and a single
+/// phase of rules, return dump.
 fn run_and_dump(input: &str, rules: Vec<Rule>) -> String {
+    run_phased_and_dump(input, vec![Phase::new("test", rules)])
+}
+
+/// Helper: parse Ruby source with a custom output schema and multiple
+/// rule phases, return dump.
+fn run_phased_and_dump(input: &str, phases: Vec<Phase>) -> String {
    let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into();
    let schema =
        yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap();
-    let runner = Runner::with_schema(lang, &schema, &rules);
+    let runner = Runner::with_schema(lang, &schema, &phases);
    let ast = runner.run(input).unwrap();
    dump_ast(&ast, ast.get_root(), input)
 }

+/// Helper: like `run_and_dump`, but returns the runner error (if any)
+/// instead of unwrapping.
+fn run_and_get_error(input: &str, rules: Vec<Rule>) -> String {
+    let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into();
+    let schema =
+        yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap();
+    let phases = vec![Phase::new("test", rules)];
+    let runner = Runner::with_schema(lang, &schema, &phases);
+    runner
+        .run(input)
+        .expect_err("expected runner to return an error")
+}
+
 /// Assert that a dump equals the expected string, treating the expected
 /// string as an indented multiline literal: leading/trailing blank lines
 /// are stripped, and the common leading indentation is removed from every
@@ -382,6 +402,113 @@ fn test_chained_rules_output_only_kind() {
    );
 }

+// A rule that swaps `assignment.left` and `assignment.right`. Each
+// application produces another `assignment` whose query the rule
+// matches again, so without the once-per-node default it would loop.
+fn swap_assignment_rule() -> Rule {
+    yeast::rule!(
+        (assignment
+            left: (_) @left
+            right: (_) @right
+        )
+        =>
+        (assignment
+            left: {right}
+            right: {left}
+        )
+    )
+}
+
+#[test]
+fn test_repeated_rule_hits_depth_limit() {
+    // With `.repeated()` the rule is allowed to fire on its own output,
+    // which cycles forever and trips the rewrite-depth safety net.
+    let err = run_and_get_error("x = 1", vec![swap_assignment_rule().repeated()]);
+    assert!(
+        err.contains("exceeded maximum rewrite depth"),
+        "expected depth-limit error, got: {err}"
+    );
+}
+
+#[test]
+fn test_default_rule_fires_at_most_once_per_node() {
+    // Without `.repeated()` (the default), a rule fires at most once on a
+    // given node. The swap therefore happens exactly once and the desugaring
+    // terminates cleanly.
+    let dump = run_and_dump("x = 1", vec![swap_assignment_rule()]);
+    assert_dump_eq(
+        &dump,
+        r#"
+        program
+          assignment
+            left: integer "1"
+            right: identifier "x"
+    "#,
+    );
+}
+
+// ---- Phase tests ----
+
+#[test]
+fn test_phased_desugaring() {
+    // Two phases that could equally have been a single one with chained
+    // rules. Splitting them makes the intent (cleanup, then desugar)
+    // explicit and provides per-phase error messages.
+    let cleanup = vec![yeast::rule!(
+        (assignment
+            left: (_) @left
+            right: (_) @right
+        )
+        => first_node
+    )];
+    let desugar = vec![yeast::rule!(
+        (first_node
+            left: (_) @left
+            right: (_) @right
+        )
+        => second_node
+    )];
+
+    let dump = run_phased_and_dump(
+        "x = 1",
+        vec![
+            Phase::new("cleanup", cleanup),
+            Phase::new("desugar", desugar),
+        ],
+    );
+    assert_dump_eq(
+        &dump,
+        r#"
+        program
+          second_node
+            left: identifier "x"
+            right: integer "1"
+    "#,
+    );
+}
+
+#[test]
+fn test_phase_error_includes_phase_name() {
+    // A repeated rule that loops; the error message should identify the
+    // phase that tripped the depth limit.
+    let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into();
+    let schema =
+        yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap();
+    let phases = vec![Phase::new("buggy", vec![swap_assignment_rule().repeated()])];
+    let runner = Runner::with_schema(lang, &schema, &phases);
+    let err = runner
+        .run("x = 1")
+        .expect_err("expected runner to return an error");
+    assert!(
+        err.contains("Phase `buggy`"),
+        "error should mention the failing phase, got: {err}"
+    );
+    assert!(
+        err.contains("exceeded maximum rewrite depth"),
+        "error should mention the depth limit, got: {err}"
+    );
+}
+
 // ---- Cursor tests ----

 #[test]