mirror of
https://github.com/github/codeql.git
synced 2026-05-14 03:09:26 +02:00
Merge pull request #21809 from github/tausbn/yeast-add-support-for-desugaring-phases
Yeast: Two small improvements
This commit is contained in:
@@ -61,6 +61,22 @@ rule matches, the node is kept and its children are processed recursively.
|
||||
A rule can replace one node with zero nodes (deletion), one node (rewriting),
|
||||
or multiple nodes (expansion).
|
||||
|
||||
By default a rule fires **at most once on a given node**: after firing, the
|
||||
engine will not re-try that same rule on the result root. Other rules may
|
||||
still fire on the result, and the rule may still fire on different nodes
|
||||
(including the result's children). To opt into iterative behaviour — when a
|
||||
rule's output is intentionally re-matched by the same rule — call
|
||||
`.repeated()` on the constructed `Rule`:
|
||||
|
||||
```rust
|
||||
let r = yeast::rule!((foo ...) => (foo ...)).repeated();
|
||||
```
|
||||
|
||||
Without `.repeated()`, a rule whose output happens to match its own query
|
||||
simply fires once and stops. With `.repeated()`, the rule is allowed to
|
||||
re-match indefinitely; the runner still enforces a global rewrite-depth
|
||||
limit (currently 100) as a safety net against accidental cycles.
|
||||
|
||||
## Query language
|
||||
|
||||
Queries use a syntax inspired by
|
||||
@@ -303,11 +319,17 @@ capture name to a field of the same name on the output node.
|
||||
## Integration with the extractor
|
||||
|
||||
A YEAST desugaring pass is configured with a [`DesugaringConfig`], which
|
||||
carries the rules and an optional output node-types schema (in YAML
|
||||
format). Attach it to a language spec to enable rewriting:
|
||||
carries one or more named [`Phase`]s of rules and an optional output
|
||||
node-types schema (in YAML format). Each phase is a complete traversal
|
||||
that runs to completion before the next phase starts; only the current
|
||||
phase's rules are considered during that traversal. Attach the config to
|
||||
a language spec
|
||||
to enable rewriting:
|
||||
|
||||
```rust
|
||||
let desugar = yeast::DesugaringConfig::new(my_rules)
|
||||
let desugar = yeast::DesugaringConfig::new()
|
||||
.add_phase("cleanup", cleanup_rules())
|
||||
.add_phase("desugar", desugar_rules())
|
||||
.with_output_node_types_yaml(include_str!("output-node-types.yml"));
|
||||
|
||||
let lang = simple::LanguageSpec {
|
||||
@@ -319,11 +341,14 @@ let lang = simple::LanguageSpec {
|
||||
};
|
||||
```
|
||||
|
||||
A single-phase config is just `.add_phase(...)` called once. Phase names
|
||||
appear in error messages so you can tell which phase failed.
|
||||
|
||||
The same YAML node-types schema is used for both the runtime yeast `Schema` (so
|
||||
rules can refer to output-only kinds and fields) and TRAP validation (it
|
||||
is converted to JSON internally).
|
||||
|
||||
For the dbscheme/QL code generator, set `Language::desugar` to a
|
||||
`DesugaringConfig` carrying the same YAML; the generator converts it to
|
||||
JSON for downstream code generation. The `rules` field of the config is
|
||||
JSON for downstream code generation. The `phases` field of the config is
|
||||
unused at code-generation time.
|
||||
|
||||
@@ -471,11 +471,29 @@ pub type Transform = Box<
|
||||
pub struct Rule {
|
||||
query: QueryNode,
|
||||
transform: Transform,
|
||||
/// If true, after this rule fires on a node the engine will try to
|
||||
/// re-apply this same rule on the result root. Defaults to false:
|
||||
/// each rule fires at most once on a given node, which prevents
|
||||
/// accidental loops where a rule's output matches its own query.
|
||||
repeated: bool,
|
||||
}
|
||||
|
||||
impl Rule {
|
||||
pub fn new(query: QueryNode, transform: Transform) -> Self {
|
||||
Self { query, transform }
|
||||
Self {
|
||||
query,
|
||||
transform,
|
||||
repeated: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Mark this rule as allowed to fire multiple times on the same node.
|
||||
/// Use when the rule is intentionally iterative (its output may match
|
||||
/// its own query). Without this, a rule fires at most once per node;
|
||||
/// other rules can still fire on the result.
|
||||
pub fn repeated(mut self) -> Self {
|
||||
self.repeated = true;
|
||||
self
|
||||
}
|
||||
|
||||
fn try_rule(
|
||||
@@ -537,7 +555,7 @@ fn apply_rules(
|
||||
fresh: &tree_builder::FreshScope,
|
||||
) -> Result<Vec<Id>, String> {
|
||||
let index = RuleIndex::new(rules);
|
||||
apply_rules_inner(&index, ast, id, fresh, 0)
|
||||
apply_rules_inner(&index, ast, id, fresh, 0, None)
|
||||
}
|
||||
|
||||
fn apply_rules_inner(
|
||||
@@ -546,6 +564,7 @@ fn apply_rules_inner(
|
||||
id: Id,
|
||||
fresh: &tree_builder::FreshScope,
|
||||
rewrite_depth: usize,
|
||||
skip_rule: Option<*const Rule>,
|
||||
) -> Result<Vec<Id>, String> {
|
||||
if rewrite_depth > MAX_REWRITE_DEPTH {
|
||||
return Err(format!(
|
||||
@@ -556,7 +575,16 @@ fn apply_rules_inner(
|
||||
|
||||
let node_kind = ast.get_node(id).map(|n| n.kind()).unwrap_or("");
|
||||
for rule in index.rules_for_kind(node_kind) {
|
||||
let rule_ptr = *rule as *const Rule;
|
||||
if Some(rule_ptr) == skip_rule {
|
||||
continue;
|
||||
}
|
||||
if let Some(result_node) = rule.try_rule(ast, id, fresh)? {
|
||||
// For non-repeated rules, suppress further application of *this*
|
||||
// rule on the result root, so a rule whose output matches its own
|
||||
// query doesn't loop. Other rules and child traversal are
|
||||
// unaffected.
|
||||
let next_skip = if rule.repeated { None } else { Some(rule_ptr) };
|
||||
let mut results = Vec::new();
|
||||
for node in result_node {
|
||||
results.extend(apply_rules_inner(
|
||||
@@ -565,6 +593,7 @@ fn apply_rules_inner(
|
||||
node,
|
||||
fresh,
|
||||
rewrite_depth + 1,
|
||||
next_skip,
|
||||
)?);
|
||||
}
|
||||
return Ok(results);
|
||||
@@ -579,13 +608,14 @@ fn apply_rules_inner(
|
||||
.collect();
|
||||
|
||||
// recursively descend into all the fields
|
||||
// Child traversal does not increment rewrite depth
|
||||
// Child traversal does not increment rewrite depth and starts fresh
|
||||
// (no rule is skipped on child subtrees).
|
||||
let mut changed = false;
|
||||
let mut new_fields = BTreeMap::new();
|
||||
for (field_id, children) in field_entries {
|
||||
let mut new_children = Vec::new();
|
||||
for child_id in children {
|
||||
let result = apply_rules_inner(index, ast, child_id, fresh, rewrite_depth)?;
|
||||
let result = apply_rules_inner(index, ast, child_id, fresh, rewrite_depth, None)?;
|
||||
if result.len() != 1 || result[0] != child_id {
|
||||
changed = true;
|
||||
}
|
||||
@@ -605,16 +635,47 @@ fn apply_rules_inner(
|
||||
Ok(vec![ast.nodes.len() - 1])
|
||||
}
|
||||
|
||||
/// Configuration for a desugaring pass: a set of rules and an optional
|
||||
/// output node-types schema (in YAML format).
|
||||
/// One phase of a desugaring pass: a named bundle of rules that runs to
|
||||
/// completion (a full traversal applying its rules) before the next phase
|
||||
/// starts. Rules within a phase compete for matches as usual; rules in
|
||||
/// different phases never compete because each traversal only considers the
|
||||
/// current phase's rules.
|
||||
pub struct Phase {
|
||||
/// Name used in error messages.
|
||||
pub name: String,
|
||||
pub rules: Vec<Rule>,
|
||||
}
|
||||
|
||||
impl Phase {
|
||||
pub fn new(name: impl Into<String>, rules: Vec<Rule>) -> Self {
|
||||
Self {
|
||||
name: name.into(),
|
||||
rules,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Configuration for a desugaring pass: an ordered list of [`Phase`]s and
|
||||
/// an optional output node-types schema (in YAML format).
|
||||
///
|
||||
/// When attached to a `LanguageSpec` (in the shared tree-sitter extractor),
|
||||
/// enables yeast-based AST rewriting before TRAP extraction. The same YAML
|
||||
/// is used both to validate TRAP output (via JSON conversion) and to
|
||||
/// resolve output-only node kinds and fields at runtime.
|
||||
///
|
||||
/// Construct with `DesugaringConfig::new()` and add phases via
|
||||
/// `add_phase`:
|
||||
///
|
||||
/// ```ignore
|
||||
/// let config = yeast::DesugaringConfig::new()
|
||||
/// .add_phase("cleanup", cleanup_rules)
|
||||
/// .add_phase("desugar", desugar_rules)
|
||||
/// .with_output_node_types_yaml(yaml);
|
||||
/// ```
|
||||
#[derive(Default)]
|
||||
pub struct DesugaringConfig {
|
||||
/// Rules to apply during desugaring.
|
||||
pub rules: Vec<Rule>,
|
||||
/// Phases of rule application, applied in order.
|
||||
pub phases: Vec<Phase>,
|
||||
/// Output node-types in YAML format. If `None`, the input grammar's
|
||||
/// node types are used (i.e. the desugared AST has the same node types
|
||||
/// as the tree-sitter grammar).
|
||||
@@ -622,11 +683,16 @@ pub struct DesugaringConfig {
|
||||
}
|
||||
|
||||
impl DesugaringConfig {
|
||||
pub fn new(rules: Vec<Rule>) -> Self {
|
||||
Self {
|
||||
rules,
|
||||
output_node_types_yaml: None,
|
||||
}
|
||||
/// Create an empty configuration. Add phases via [`Self::add_phase`] and an
|
||||
/// optional output schema via [`Self::with_output_node_types_yaml`].
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Append a new phase with the given name and rules.
|
||||
pub fn add_phase(mut self, name: impl Into<String>, rules: Vec<Rule>) -> Self {
|
||||
self.phases.push(Phase::new(name, rules));
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_output_node_types_yaml(mut self, yaml: &'static str) -> Self {
|
||||
@@ -648,17 +714,17 @@ impl DesugaringConfig {
|
||||
pub struct Runner<'a> {
|
||||
language: tree_sitter::Language,
|
||||
schema: schema::Schema,
|
||||
rules: &'a [Rule],
|
||||
phases: &'a [Phase],
|
||||
}
|
||||
|
||||
impl<'a> Runner<'a> {
|
||||
/// Create a runner using the input grammar's schema for output.
|
||||
pub fn new(language: tree_sitter::Language, rules: &'a [Rule]) -> Self {
|
||||
pub fn new(language: tree_sitter::Language, phases: &'a [Phase]) -> Self {
|
||||
let schema = schema::Schema::from_language(&language);
|
||||
Self {
|
||||
language,
|
||||
schema,
|
||||
rules,
|
||||
phases,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -666,12 +732,12 @@ impl<'a> Runner<'a> {
|
||||
pub fn with_schema(
|
||||
language: tree_sitter::Language,
|
||||
schema: &schema::Schema,
|
||||
rules: &'a [Rule],
|
||||
phases: &'a [Phase],
|
||||
) -> Self {
|
||||
Self {
|
||||
language,
|
||||
schema: schema.clone(),
|
||||
rules,
|
||||
phases,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -684,27 +750,17 @@ impl<'a> Runner<'a> {
|
||||
Ok(Self {
|
||||
language,
|
||||
schema,
|
||||
rules: &config.rules,
|
||||
phases: &config.phases,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn run_from_tree(&self, tree: &tree_sitter::Tree) -> Result<Ast, String> {
|
||||
let fresh = tree_builder::FreshScope::new();
|
||||
let mut ast = Ast::from_tree_with_schema(self.schema.clone(), tree, &self.language);
|
||||
let root = ast.get_root();
|
||||
let res = apply_rules(self.rules, &mut ast, root, &fresh)?;
|
||||
if res.len() != 1 {
|
||||
return Err(format!(
|
||||
"Expected exactly one result node, got {}",
|
||||
res.len()
|
||||
));
|
||||
}
|
||||
ast.set_root(res[0]);
|
||||
self.run_phases(&mut ast)?;
|
||||
Ok(ast)
|
||||
}
|
||||
|
||||
pub fn run(&self, input: &str) -> Result<Ast, String> {
|
||||
let fresh = tree_builder::FreshScope::new();
|
||||
let mut parser = tree_sitter::Parser::new();
|
||||
parser
|
||||
.set_language(&self.language)
|
||||
@@ -713,15 +769,29 @@ impl<'a> Runner<'a> {
|
||||
.parse(input, None)
|
||||
.ok_or_else(|| "Failed to parse input".to_string())?;
|
||||
let mut ast = Ast::from_tree_with_schema(self.schema.clone(), &tree, &self.language);
|
||||
let root = ast.get_root();
|
||||
let res = apply_rules(self.rules, &mut ast, root, &fresh)?;
|
||||
if res.len() != 1 {
|
||||
return Err(format!(
|
||||
"Expected exactly one result node, got {}",
|
||||
res.len()
|
||||
));
|
||||
}
|
||||
ast.set_root(res[0]);
|
||||
self.run_phases(&mut ast)?;
|
||||
Ok(ast)
|
||||
}
|
||||
|
||||
/// Apply each phase in turn to the AST, threading the root through.
|
||||
/// A single `FreshScope` is shared across phases so that fresh
|
||||
/// identifiers generated in different phases don't collide.
|
||||
fn run_phases(&self, ast: &mut Ast) -> Result<(), String> {
|
||||
let fresh = tree_builder::FreshScope::new();
|
||||
let mut root = ast.get_root();
|
||||
for phase in self.phases {
|
||||
let res = apply_rules(&phase.rules, ast, root, &fresh)
|
||||
.map_err(|e| format!("Phase `{}`: {e}", phase.name))?;
|
||||
if res.len() != 1 {
|
||||
return Err(format!(
|
||||
"Phase `{}`: expected exactly one result node, got {}",
|
||||
phase.name,
|
||||
res.len()
|
||||
));
|
||||
}
|
||||
root = res[0];
|
||||
}
|
||||
ast.set_root(root);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,16 +12,36 @@ fn parse_and_dump(input: &str) -> String {
|
||||
dump_ast(&ast, ast.get_root(), input)
|
||||
}
|
||||
|
||||
/// Helper: parse Ruby source with a custom output schema and rules, return dump.
|
||||
/// Helper: parse Ruby source with a custom output schema and a single
|
||||
/// phase of rules, return dump.
|
||||
fn run_and_dump(input: &str, rules: Vec<Rule>) -> String {
|
||||
run_phased_and_dump(input, vec![Phase::new("test", rules)])
|
||||
}
|
||||
|
||||
/// Helper: parse Ruby source with a custom output schema and multiple
|
||||
/// rule phases, return dump.
|
||||
fn run_phased_and_dump(input: &str, phases: Vec<Phase>) -> String {
|
||||
let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into();
|
||||
let schema =
|
||||
yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap();
|
||||
let runner = Runner::with_schema(lang, &schema, &rules);
|
||||
let runner = Runner::with_schema(lang, &schema, &phases);
|
||||
let ast = runner.run(input).unwrap();
|
||||
dump_ast(&ast, ast.get_root(), input)
|
||||
}
|
||||
|
||||
/// Helper: like `run_and_dump`, but returns the runner error (if any)
|
||||
/// instead of unwrapping.
|
||||
fn run_and_get_error(input: &str, rules: Vec<Rule>) -> String {
|
||||
let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into();
|
||||
let schema =
|
||||
yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap();
|
||||
let phases = vec![Phase::new("test", rules)];
|
||||
let runner = Runner::with_schema(lang, &schema, &phases);
|
||||
runner
|
||||
.run(input)
|
||||
.expect_err("expected runner to return an error")
|
||||
}
|
||||
|
||||
/// Assert that a dump equals the expected string, treating the expected
|
||||
/// string as an indented multiline literal: leading/trailing blank lines
|
||||
/// are stripped, and the common leading indentation is removed from every
|
||||
@@ -382,6 +402,113 @@ fn test_chained_rules_output_only_kind() {
|
||||
);
|
||||
}
|
||||
|
||||
// A rule that swaps `assignment.left` and `assignment.right`. Each
|
||||
// application produces another `assignment` whose query the rule
|
||||
// matches again, so without the once-per-node default it would loop.
|
||||
fn swap_assignment_rule() -> Rule {
|
||||
yeast::rule!(
|
||||
(assignment
|
||||
left: (_) @left
|
||||
right: (_) @right
|
||||
)
|
||||
=>
|
||||
(assignment
|
||||
left: {right}
|
||||
right: {left}
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
fn test_repeated_rule_hits_depth_limit() {
    // A `.repeated()` rule may fire on its own output. The swap rule
    // therefore cycles forever and must trip the rewrite-depth safety net.
    let looping_rule = swap_assignment_rule().repeated();
    let message = run_and_get_error("x = 1", vec![looping_rule]);

    let hit_depth_limit = message.contains("exceeded maximum rewrite depth");
    assert!(
        hit_depth_limit,
        "expected depth-limit error, got: {message}"
    );
}
|
||||
|
||||
#[test]
|
||||
fn test_default_rule_fires_at_most_once_per_node() {
|
||||
// Without `.repeated()` (the default), a rule fires at most once on a
|
||||
// given node. The swap therefore happens exactly once and the desugaring
|
||||
// terminates cleanly.
|
||||
let dump = run_and_dump("x = 1", vec![swap_assignment_rule()]);
|
||||
assert_dump_eq(
|
||||
&dump,
|
||||
r#"
|
||||
program
|
||||
assignment
|
||||
left: integer "1"
|
||||
right: identifier "x"
|
||||
"#,
|
||||
);
|
||||
}
|
||||
|
||||
// ---- Phase tests ----
|
||||
|
||||
#[test]
|
||||
fn test_phased_desugaring() {
|
||||
// Two phases that could equally have been a single one with chained
|
||||
// rules. Splitting them makes the intent (cleanup, then desugar)
|
||||
// explicit and provides per-phase error messages.
|
||||
let cleanup = vec![yeast::rule!(
|
||||
(assignment
|
||||
left: (_) @left
|
||||
right: (_) @right
|
||||
)
|
||||
=> first_node
|
||||
)];
|
||||
let desugar = vec![yeast::rule!(
|
||||
(first_node
|
||||
left: (_) @left
|
||||
right: (_) @right
|
||||
)
|
||||
=> second_node
|
||||
)];
|
||||
|
||||
let dump = run_phased_and_dump(
|
||||
"x = 1",
|
||||
vec![
|
||||
Phase::new("cleanup", cleanup),
|
||||
Phase::new("desugar", desugar),
|
||||
],
|
||||
);
|
||||
assert_dump_eq(
|
||||
&dump,
|
||||
r#"
|
||||
program
|
||||
second_node
|
||||
left: identifier "x"
|
||||
right: integer "1"
|
||||
"#,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn test_phase_error_includes_phase_name() {
    // The single phase holds a repeated rule that loops; the resulting
    // error must name that phase as well as the depth limit it tripped.
    let language: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into();
    let output_schema =
        yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &language)
            .unwrap();
    let looping_rule = swap_assignment_rule().repeated();
    let phases = vec![Phase::new("buggy", vec![looping_rule])];

    let failure = Runner::with_schema(language, &output_schema, &phases)
        .run("x = 1")
        .expect_err("expected runner to return an error");

    // Each entry pairs a required fragment with the complaint shown when
    // that fragment is missing.
    for (needle, complaint) in [
        ("Phase `buggy`", "error should mention the failing phase"),
        (
            "exceeded maximum rewrite depth",
            "error should mention the depth limit",
        ),
    ] {
        assert!(failure.contains(needle), "{complaint}, got: {failure}");
    }
}
|
||||
|
||||
// ---- Cursor tests ----
|
||||
|
||||
#[test]
|
||||
|
||||
Reference in New Issue
Block a user