Merge pull request #21809 from github/tausbn/yeast-add-support-for-desugaring-phases

Yeast: Two small improvements
This commit is contained in:
Taus
2026-05-07 19:00:44 +02:00
committed by GitHub
3 changed files with 267 additions and 45 deletions

View File

@@ -61,6 +61,22 @@ rule matches, the node is kept and its children are processed recursively.
A rule can replace one node with zero nodes (deletion), one node (rewriting),
or multiple nodes (expansion).
By default a rule fires **at most once on a given node**: after firing, the
engine will not re-try that same rule on the result root. Other rules may
still fire on the result, and the rule may still fire on different nodes
(including the result's children). To opt into iterative behaviour — when a
rule's output is intentionally re-matched by the same rule — call
`.repeated()` on the constructed `Rule`:
```rust
let r = yeast::rule!((foo ...) => (foo ...)).repeated();
```
Without `.repeated()`, a rule whose output happens to match its own query
simply fires once and stops. With `.repeated()`, the rule is allowed to
re-match indefinitely; the runner still enforces a global rewrite-depth
limit (currently 100) as a safety net against accidental cycles.
## Query language
Queries use a syntax inspired by
@@ -303,11 +319,17 @@ capture name to a field of the same name on the output node.
## Integration with the extractor
A YEAST desugaring pass is configured with a [`DesugaringConfig`], which
carries the rules and an optional output node-types schema (in YAML
format). Attach it to a language spec to enable rewriting:
carries one or more named [`Phase`]s of rules and an optional output
node-types schema (in YAML format). Each phase is a complete traversal
that runs to completion before the next phase starts; only the current
phase's rules are considered during that traversal. Attach the config to
a language spec
to enable rewriting:
```rust
let desugar = yeast::DesugaringConfig::new(my_rules)
let desugar = yeast::DesugaringConfig::new()
.add_phase("cleanup", cleanup_rules())
.add_phase("desugar", desugar_rules())
.with_output_node_types_yaml(include_str!("output-node-types.yml"));
let lang = simple::LanguageSpec {
@@ -319,11 +341,14 @@ let lang = simple::LanguageSpec {
};
```
A single-phase config is just `.add_phase(...)` called once. Phase names
appear in error messages so you can tell which phase failed.
The same YAML node-types schema is used for both the runtime yeast `Schema`
(so rules can refer to output-only kinds and fields) and TRAP validation
(for which it is converted to JSON internally).
For the dbscheme/QL code generator, set `Language::desugar` to a
`DesugaringConfig` carrying the same YAML; the generator converts it to
JSON for downstream code generation. The `rules` field of the config is
JSON for downstream code generation. The `phases` field of the config is
unused at code-generation time.

View File

@@ -471,11 +471,29 @@ pub type Transform = Box<
pub struct Rule {
/// Pattern a node must match for this rule to fire.
query: QueryNode,
/// The `Transform` callback stored alongside the query.
/// NOTE(review): presumably builds the replacement node(s) for a
/// successful match — confirm against `try_rule`.
transform: Transform,
/// If true, after this rule fires on a node the engine will try to
/// re-apply this same rule on the result root. Defaults to false:
/// each rule fires at most once on a given node, which prevents
/// accidental loops where a rule's output matches its own query.
repeated: bool,
}
impl Rule {
/// Build a rule from a query and a transform.
///
/// The rule starts out non-repeated (`repeated: false`), i.e. it fires
/// at most once on a given node.
pub fn new(query: QueryNode, transform: Transform) -> Self {
Self { query, transform }
Self {
query,
transform,
repeated: false,
}
}
/// Mark this rule as allowed to fire multiple times on the same node.
///
/// Use when the rule is intentionally iterative (its output may match
/// its own query). Without this, a rule fires at most once per node;
/// other rules can still fire on the result.
///
/// This consumes the rule and hands it back with the flag set, so the
/// return value has to be kept: a bare `rule.repeated();` statement drops
/// the rule entirely.
#[must_use = "`repeated` consumes the rule and returns the updated one"]
pub fn repeated(mut self) -> Self {
    self.repeated = true;
    self
}
fn try_rule(
@@ -537,7 +555,7 @@ fn apply_rules(
fresh: &tree_builder::FreshScope,
) -> Result<Vec<Id>, String> {
let index = RuleIndex::new(rules);
apply_rules_inner(&index, ast, id, fresh, 0)
apply_rules_inner(&index, ast, id, fresh, 0, None)
}
fn apply_rules_inner(
@@ -546,6 +564,7 @@ fn apply_rules_inner(
id: Id,
fresh: &tree_builder::FreshScope,
rewrite_depth: usize,
skip_rule: Option<*const Rule>,
) -> Result<Vec<Id>, String> {
if rewrite_depth > MAX_REWRITE_DEPTH {
return Err(format!(
@@ -556,7 +575,16 @@ fn apply_rules_inner(
let node_kind = ast.get_node(id).map(|n| n.kind()).unwrap_or("");
for rule in index.rules_for_kind(node_kind) {
let rule_ptr = *rule as *const Rule;
if Some(rule_ptr) == skip_rule {
continue;
}
if let Some(result_node) = rule.try_rule(ast, id, fresh)? {
// For non-repeated rules, suppress further application of *this*
// rule on the result root, so a rule whose output matches its own
// query doesn't loop. Other rules and child traversal are
// unaffected.
let next_skip = if rule.repeated { None } else { Some(rule_ptr) };
let mut results = Vec::new();
for node in result_node {
results.extend(apply_rules_inner(
@@ -565,6 +593,7 @@ fn apply_rules_inner(
node,
fresh,
rewrite_depth + 1,
next_skip,
)?);
}
return Ok(results);
@@ -579,13 +608,14 @@ fn apply_rules_inner(
.collect();
// recursively descend into all the fields
// Child traversal does not increment rewrite depth
// Child traversal does not increment rewrite depth and starts fresh
// (no rule is skipped on child subtrees).
let mut changed = false;
let mut new_fields = BTreeMap::new();
for (field_id, children) in field_entries {
let mut new_children = Vec::new();
for child_id in children {
let result = apply_rules_inner(index, ast, child_id, fresh, rewrite_depth)?;
let result = apply_rules_inner(index, ast, child_id, fresh, rewrite_depth, None)?;
if result.len() != 1 || result[0] != child_id {
changed = true;
}
@@ -605,16 +635,47 @@ fn apply_rules_inner(
Ok(vec![ast.nodes.len() - 1])
}
/// Configuration for a desugaring pass: a set of rules and an optional
/// output node-types schema (in YAML format).
/// A single phase of a desugaring pass: a named group of rules.
///
/// Each phase is one full traversal that applies its rules and finishes
/// before the following phase begins. Within a phase, rules compete for
/// matches as usual; rules belonging to different phases never compete,
/// since a traversal only looks at the rules of the phase being run.
pub struct Phase {
    /// Name used in error messages.
    pub name: String,
    /// The rules considered while this phase's traversal runs.
    pub rules: Vec<Rule>,
}

impl Phase {
    /// Build a phase from a name (anything convertible into a `String`)
    /// and the rules it should apply.
    pub fn new(name: impl Into<String>, rules: Vec<Rule>) -> Self {
        let name = name.into();
        Self { name, rules }
    }
}
/// Configuration for a desugaring pass: an ordered list of [`Phase`]s and
/// an optional output node-types schema (in YAML format).
///
/// When attached to a `LanguageSpec` (in the shared tree-sitter extractor),
/// enables yeast-based AST rewriting before TRAP extraction. The same YAML
/// is used both to validate TRAP output (via JSON conversion) and to
/// resolve output-only node kinds and fields at runtime.
///
/// Construct with `DesugaringConfig::new()` and add phases via
/// `add_phase`:
///
/// ```ignore
/// let config = yeast::DesugaringConfig::new()
/// .add_phase("cleanup", cleanup_rules)
/// .add_phase("desugar", desugar_rules)
/// .with_output_node_types_yaml(yaml);
/// ```
#[derive(Default)]
pub struct DesugaringConfig {
/// Rules to apply during desugaring.
pub rules: Vec<Rule>,
/// Phases of rule application, applied in order.
pub phases: Vec<Phase>,
/// Output node-types in YAML format. If `None`, the input grammar's
/// node types are used (i.e. the desugared AST has the same node types
/// as the tree-sitter grammar).
@@ -622,11 +683,16 @@ pub struct DesugaringConfig {
}
impl DesugaringConfig {
pub fn new(rules: Vec<Rule>) -> Self {
Self {
rules,
output_node_types_yaml: None,
}
/// Create an empty configuration: no phases and no output schema.
/// Add phases via [`Self::add_phase`] and an optional output schema via
/// [`Self::with_output_node_types_yaml`].
pub fn new() -> Self {
Self::default()
}
/// Append a new phase with the given name and rules.
///
/// Phases are stored, and later applied, in the order they are added.
/// This is a consuming builder method: it takes the configuration by
/// value and returns it with the phase appended, so the return value has
/// to be kept.
#[must_use = "`add_phase` consumes the config and returns the updated one"]
pub fn add_phase(mut self, name: impl Into<String>, rules: Vec<Rule>) -> Self {
    self.phases.push(Phase::new(name, rules));
    self
}
pub fn with_output_node_types_yaml(mut self, yaml: &'static str) -> Self {
@@ -648,17 +714,17 @@ impl DesugaringConfig {
pub struct Runner<'a> {
/// Grammar handed to the parser and to the AST builder.
language: tree_sitter::Language,
/// Schema cloned into each AST the runner builds.
schema: schema::Schema,
rules: &'a [Rule],
/// Borrowed phases, applied in order by `run_phases`.
phases: &'a [Phase],
}
impl<'a> Runner<'a> {
/// Create a runner using the input grammar's schema for output.
///
/// The schema is derived from `language` via `Schema::from_language`;
/// `phases` is borrowed for the runner's lifetime `'a`, not copied.
pub fn new(language: tree_sitter::Language, rules: &'a [Rule]) -> Self {
pub fn new(language: tree_sitter::Language, phases: &'a [Phase]) -> Self {
let schema = schema::Schema::from_language(&language);
Self {
language,
schema,
rules,
phases,
}
}
@@ -666,12 +732,12 @@ impl<'a> Runner<'a> {
pub fn with_schema(
language: tree_sitter::Language,
schema: &schema::Schema,
rules: &'a [Rule],
phases: &'a [Phase],
) -> Self {
Self {
language,
schema: schema.clone(),
rules,
phases,
}
}
@@ -684,27 +750,17 @@ impl<'a> Runner<'a> {
Ok(Self {
language,
schema,
rules: &config.rules,
phases: &config.phases,
})
}
/// Build an AST from an already-parsed tree-sitter tree, using the
/// runner's schema and language, then run the phases over it.
///
/// Returns the rewritten AST, or the error message of the phase that
/// failed.
pub fn run_from_tree(&self, tree: &tree_sitter::Tree) -> Result<Ast, String> {
let fresh = tree_builder::FreshScope::new();
let mut ast = Ast::from_tree_with_schema(self.schema.clone(), tree, &self.language);
let root = ast.get_root();
let res = apply_rules(self.rules, &mut ast, root, &fresh)?;
if res.len() != 1 {
return Err(format!(
"Expected exactly one result node, got {}",
res.len()
));
}
ast.set_root(res[0]);
self.run_phases(&mut ast)?;
Ok(ast)
}
pub fn run(&self, input: &str) -> Result<Ast, String> {
let fresh = tree_builder::FreshScope::new();
let mut parser = tree_sitter::Parser::new();
parser
.set_language(&self.language)
@@ -713,15 +769,29 @@ impl<'a> Runner<'a> {
.parse(input, None)
.ok_or_else(|| "Failed to parse input".to_string())?;
let mut ast = Ast::from_tree_with_schema(self.schema.clone(), &tree, &self.language);
let root = ast.get_root();
let res = apply_rules(self.rules, &mut ast, root, &fresh)?;
if res.len() != 1 {
return Err(format!(
"Expected exactly one result node, got {}",
res.len()
));
}
ast.set_root(res[0]);
self.run_phases(&mut ast)?;
Ok(ast)
}
/// Apply each phase in turn to the AST, threading the root through.
/// A single `FreshScope` is shared across phases so that fresh
/// identifiers generated in different phases don't collide.
fn run_phases(&self, ast: &mut Ast) -> Result<(), String> {
let fresh = tree_builder::FreshScope::new();
let mut root = ast.get_root();
for phase in self.phases {
let res = apply_rules(&phase.rules, ast, root, &fresh)
.map_err(|e| format!("Phase `{}`: {e}", phase.name))?;
if res.len() != 1 {
return Err(format!(
"Phase `{}`: expected exactly one result node, got {}",
phase.name,
res.len()
));
}
root = res[0];
}
ast.set_root(root);
Ok(())
}
}

View File

@@ -12,16 +12,36 @@ fn parse_and_dump(input: &str) -> String {
dump_ast(&ast, ast.get_root(), input)
}
/// Helper: parse Ruby source with a custom output schema and rules, return dump.
/// Helper: run one phase of rules (named "test") over Ruby source via
/// `run_phased_and_dump` and return the resulting dump.
fn run_and_dump(input: &str, rules: Vec<Rule>) -> String {
    let single_phase = vec![Phase::new("test", rules)];
    run_phased_and_dump(input, single_phase)
}
/// Helper: parse Ruby source with a custom output schema and multiple
/// rule phases, return dump.
///
/// Panics (via `unwrap`) if the schema cannot be built from the YAML or
/// if the runner returns an error.
fn run_phased_and_dump(input: &str, phases: Vec<Phase>) -> String {
let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into();
// Build the schema from the test file's output node-types YAML.
let schema =
yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap();
let runner = Runner::with_schema(lang, &schema, &rules);
let runner = Runner::with_schema(lang, &schema, &phases);
let ast = runner.run(input).unwrap();
dump_ast(&ast, ast.get_root(), input)
}
/// Helper: the failing-run counterpart of `run_and_dump`. Runs a single
/// phase of rules (named "test") and returns the runner's error message;
/// panics if the run unexpectedly succeeds.
fn run_and_get_error(input: &str, rules: Vec<Rule>) -> String {
    let language: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into();
    let output_schema =
        yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &language)
            .unwrap();
    let single_phase = vec![Phase::new("test", rules)];
    let runner = Runner::with_schema(language, &output_schema, &single_phase);
    let outcome = runner.run(input);
    outcome.expect_err("expected runner to return an error")
}
/// Assert that a dump equals the expected string, treating the expected
/// string as an indented multiline literal: leading/trailing blank lines
/// are stripped, and the common leading indentation is removed from every
@@ -382,6 +402,113 @@ fn test_chained_rules_output_only_kind() {
);
}
/// Test helper: a rule that swaps `assignment.left` and `assignment.right`.
///
/// The output is itself an `assignment` with both fields set, so it
/// matches the rule's own query again. Under the default once-per-node
/// behaviour the swap happens exactly once; with `.repeated()` the rule
/// keeps re-matching until the rewrite-depth limit is hit.
fn swap_assignment_rule() -> Rule {
yeast::rule!(
(assignment
left: (_) @left
right: (_) @right
)
=>
(assignment
left: {right}
right: {left}
)
)
}
#[test]
fn test_repeated_rule_hits_depth_limit() {
    // Opting in with `.repeated()` lets the swap rule re-match its own
    // output; the resulting endless cycle has to be stopped by the
    // rewrite-depth safety net.
    let looping_rules = vec![swap_assignment_rule().repeated()];
    let err = run_and_get_error("x = 1", looping_rules);
    let hit_depth_limit = err.contains("exceeded maximum rewrite depth");
    assert!(hit_depth_limit, "expected depth-limit error, got: {err}");
}
#[test]
fn test_default_rule_fires_at_most_once_per_node() {
// Without `.repeated()` (the default), a rule fires at most once on a
// given node. The swap therefore happens exactly once and the desugaring
// terminates cleanly.
let dump = run_and_dump("x = 1", vec![swap_assignment_rule()]);
// Expected: one swap relative to the source `x = 1`, i.e. the integer
// ends up in `left` and the identifier in `right`.
assert_dump_eq(
&dump,
r#"
program
assignment
left: integer "1"
right: identifier "x"
"#,
);
}
// ---- Phase tests ----
#[test]
fn test_phased_desugaring() {
// Two phases that could equally have been a single one with chained
// rules. Splitting them makes the intent (cleanup, then desugar)
// explicit and provides per-phase error messages.
// Phase 1: rewrite `assignment` into `first_node`.
let cleanup = vec![yeast::rule!(
(assignment
left: (_) @left
right: (_) @right
)
=> first_node
)];
// Phase 2: rewrite the `first_node` produced by phase 1 into
// `second_node`.
let desugar = vec![yeast::rule!(
(first_node
left: (_) @left
right: (_) @right
)
=> second_node
)];
let dump = run_phased_and_dump(
"x = 1",
vec![
Phase::new("cleanup", cleanup),
Phase::new("desugar", desugar),
],
);
// Only the result of the last phase is visible in the dump.
assert_dump_eq(
&dump,
r#"
program
second_node
left: identifier "x"
right: integer "1"
"#,
);
}
#[test]
fn test_phase_error_includes_phase_name() {
    // Put a looping (repeated) rule into a phase named "buggy" and check
    // that the depth-limit error is attributed to that phase.
    let language: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into();
    let output_schema =
        yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &language)
            .unwrap();
    let looping_rule = swap_assignment_rule().repeated();
    let phases = vec![Phase::new("buggy", vec![looping_rule])];
    let runner = Runner::with_schema(language, &output_schema, &phases);
    let outcome = runner.run("x = 1");
    let err = outcome.expect_err("expected runner to return an error");

    let names_phase = err.contains("Phase `buggy`");
    assert!(
        names_phase,
        "error should mention the failing phase, got: {err}"
    );
    let names_depth_limit = err.contains("exceeded maximum rewrite depth");
    assert!(
        names_depth_limit,
        "error should mention the depth limit, got: {err}"
    );
}
// ---- Cursor tests ----
#[test]