Merge pull request #21981 from asgerf/yeast/comments

Yeast/Unified: Extract comments
This commit is contained in:
Asger F
2026-06-15 15:25:35 +02:00
committed by GitHub
10 changed files with 208 additions and 20 deletions

View File

@@ -333,6 +333,9 @@ pub fn extract(
.run_from_tree(&tree, source)
.unwrap_or_else(|e| panic!("Desugaring failed for {path_str}: {e}"));
traverse_yeast(&ast, &mut visitor);
// Comments and other `extra` nodes are not represented in the desugared
// AST, so recover them directly from the original parse tree.
traverse_extras(&tree, &mut visitor);
} else {
traverse(&tree, &mut visitor);
}
@@ -365,6 +368,8 @@ struct Visitor<'a> {
ast_node_parent_table_name: String,
/// Language-specific name of the tokeninfo table
tokeninfo_table_name: String,
/// Language-specific name of the trivia tokeninfo table
trivia_tokeninfo_table_name: String,
/// A lookup table from type name to node types
schema: &'a NodeTypeMap,
/// A stack for gathering information from child nodes. Whenever a node is
@@ -395,11 +400,33 @@ impl<'a> Visitor<'a> {
ast_node_location_table_name: format!("{language_prefix}_ast_node_location"),
ast_node_parent_table_name: format!("{language_prefix}_ast_node_parent"),
tokeninfo_table_name: format!("{language_prefix}_tokeninfo"),
trivia_tokeninfo_table_name: format!("{language_prefix}_trivia_tokeninfo"),
schema,
stack: Vec::new(),
}
}
/// Records one `extra` node (for example a comment) taken from the original
/// parse tree as a `TriviaToken`.
///
/// Two tuples are written for a freshly allocated id: one in the AST node
/// location table, and one in the trivia tokeninfo table holding the node's
/// kind id and its source text. No parent tuple is written, so the token is
/// not attached to any node of the (possibly desugared) AST.
fn emit_trivia_token(&mut self, node: &Node) {
    let token_id = self.trap_writer.fresh_id();

    // Resolve the node's location and write the location tuple first.
    let location = location_for(self, self.file_label, node);
    let location_id = location_label(self.trap_writer, location);
    let location_row = vec![trap::Arg::Label(token_id), trap::Arg::Label(location_id)];
    self.trap_writer
        .add_tuple(&self.ast_node_location_table_name, location_row);

    // Then write the (id, kind, value) tuple describing the token itself.
    let kind = trap::Arg::Int(node.kind_id() as usize);
    let text = sliced_source_arg(self.source, node);
    self.trap_writer.add_tuple(
        &self.trivia_tokeninfo_table_name,
        vec![trap::Arg::Label(token_id), kind, text],
    );
}
fn record_parse_error(&mut self, loc: trap::Label, mesg: &diagnostics::DiagnosticMessage) {
self.diagnostics_writer.write(mesg);
let id = self.trap_writer.fresh_id();
@@ -835,6 +862,24 @@ fn traverse(tree: &Tree, visitor: &mut Visitor) {
}
}
/// Emits a `TriviaToken` for each `extra` node (e.g. a comment) found in the
/// original tree-sitter tree, starting the search at the tree's root node.
/// This keeps comments available even when a desugaring pass has rewritten
/// the tree and dropped them.
fn traverse_extras(tree: &Tree, visitor: &mut Visitor) {
    let root = tree.root_node();
    emit_extras_in(visitor, root);
}
/// Emits a `TriviaToken` for every `extra` node strictly below `node`,
/// visiting nodes in pre-order (document order).
///
/// An `extra` node is emitted as a whole; its own children are not visited.
/// Non-extra nodes are descended into. `node` itself is never emitted.
///
/// The walk is driven by a single `TreeCursor` rather than by recursion, so
/// the native call stack stays flat no matter how deeply the parse tree is
/// nested.
fn emit_extras_in(visitor: &mut Visitor, node: Node<'_>) {
    let mut cursor = node.walk();
    if !cursor.goto_first_child() {
        // `node` has no children, so there is nothing to emit.
        return;
    }
    loop {
        let current = cursor.node();
        if current.is_extra() {
            visitor.emit_trivia_token(&current);
        } else if cursor.goto_first_child() {
            // Descend into a non-extra node that has children.
            continue;
        }
        // Advance to the next node in pre-order: the next sibling if there is
        // one, otherwise the next sibling of the closest ancestor that has
        // one. The cursor is rooted at `node`, so `goto_parent` fails once we
        // have climbed back to it, which ends the walk.
        while !cursor.goto_next_sibling() {
            if !cursor.goto_parent() {
                return;
            }
        }
    }
}
fn traverse_yeast(tree: &yeast::Ast, visitor: &mut Visitor) {
use yeast::Cursor;
let mut cursor = tree.walk();

View File

@@ -68,7 +68,12 @@ pub fn generate(
let node_parent_table_name = format!("{}_ast_node_parent", &prefix);
let token_name = format!("{}_token", &prefix);
let tokeninfo_name = format!("{}_tokeninfo", &prefix);
let trivia_token_name = format!("{}_trivia_token", &prefix);
let trivia_tokeninfo_name = format!("{}_trivia_tokeninfo", &prefix);
let reserved_word_name = format!("{}_reserved_word", &prefix);
// When a desugaring is configured, comments and other `extra` nodes are
// preserved from the original parse tree as `TriviaToken`s.
let has_trivia_tokens = language.desugar.is_some();
let effective_node_types: String = match language
.desugar
.as_ref()
@@ -85,28 +90,35 @@ pub fn generate(
let nodes = node_types::read_node_types_str(&prefix, &effective_node_types)?;
let (dbscheme_entries, mut ast_node_members, token_kinds) = convert_nodes(&nodes);
ast_node_members.insert(&token_name);
if has_trivia_tokens {
ast_node_members.insert(&trivia_token_name);
}
writeln!(&mut dbscheme_writer, "/*- {} dbscheme -*/", language.name)?;
dbscheme::write(&mut dbscheme_writer, &dbscheme_entries)?;
let token_case = create_token_case(&token_name, token_kinds);
dbscheme::write(
&mut dbscheme_writer,
&[
dbscheme::Entry::Table(create_tokeninfo(&tokeninfo_name, &token_name)),
dbscheme::Entry::Case(token_case),
dbscheme::Entry::Union(dbscheme::Union {
name: &ast_node_name,
members: ast_node_members,
}),
dbscheme::Entry::Table(create_ast_node_location_table(
&node_location_table_name,
&ast_node_name,
)),
dbscheme::Entry::Table(create_ast_node_parent_table(
&node_parent_table_name,
&ast_node_name,
)),
],
)?;
let mut dbscheme_tail = vec![
dbscheme::Entry::Table(create_tokeninfo(&tokeninfo_name, &token_name)),
dbscheme::Entry::Case(token_case),
];
if has_trivia_tokens {
dbscheme_tail.push(dbscheme::Entry::Table(create_tokeninfo(
&trivia_tokeninfo_name,
&trivia_token_name,
)));
}
dbscheme_tail.push(dbscheme::Entry::Union(dbscheme::Union {
name: &ast_node_name,
members: ast_node_members,
}));
dbscheme_tail.push(dbscheme::Entry::Table(create_ast_node_location_table(
&node_location_table_name,
&ast_node_name,
)));
dbscheme_tail.push(dbscheme::Entry::Table(create_ast_node_parent_table(
&node_parent_table_name,
&ast_node_name,
)));
dbscheme::write(&mut dbscheme_writer, &dbscheme_tail)?;
let mut body = vec![
ql::TopLevel::Class(ql_gen::create_ast_node_class(
@@ -116,6 +128,12 @@ pub fn generate(
)),
ql::TopLevel::Class(ql_gen::create_token_class(&token_name, &tokeninfo_name)),
];
if has_trivia_tokens {
body.push(ql::TopLevel::Class(ql_gen::create_trivia_token_class(
&trivia_token_name,
&trivia_tokeninfo_name,
)));
}
// Only emit the ReservedWord class when there are actually unnamed token
// types in the schema (i.e., @{prefix}_reserved_word exists in the dbscheme).
// When converting from a YEAST YAML schema that has no unnamed tokens, this

View File

@@ -199,6 +199,70 @@ pub fn create_token_class<'a>(token_type: &'a str, tokeninfo: &'a str) -> ql::Cl
}
}
/// Builds the QL `TriviaToken` class.
///
/// Trivia tokens (such as comments) are `extra` nodes carried over from the
/// original parse tree, even when a desugaring pass has rewritten that tree.
/// They are kept apart from the regular `Token` hierarchy because they are
/// not part of the (possibly desugared) output schema.
///
/// `trivia_token_type` is the database type the class extends and
/// `trivia_tokeninfo` is the table that `getValue` reads from.
pub fn create_trivia_token_class<'a>(
    trivia_token_type: &'a str,
    trivia_tokeninfo: &'a str,
) -> ql::Class<'a> {
    // The trivia tokeninfo table has three columns: id, kind, value.
    let arity = 3;

    // `result = this.getValue()`, used as the body of `toString`.
    let value_of_this = ql::Expression::Dot(
        Box::new(ql::Expression::Var("this")),
        "getValue",
        vec![],
    );
    let to_string_body = ql::Expression::Equals(
        Box::new(ql::Expression::Var("result")),
        Box::new(value_of_this),
    );

    let predicates = vec![
        ql::Predicate {
            qldoc: Some("Gets the source text of this trivia token.".to_owned()),
            name: "getValue",
            overridden: false,
            is_private: false,
            is_final: true,
            return_type: Some(ql::Type::String),
            formal_parameters: vec![],
            body: create_get_field_expr_for_column_storage("result", trivia_tokeninfo, 1, arity),
            overlay: None,
        },
        ql::Predicate {
            qldoc: Some("Gets a string representation of this element.".to_owned()),
            name: "toString",
            overridden: true,
            is_private: false,
            is_final: true,
            return_type: Some(ql::Type::String),
            formal_parameters: vec![],
            body: to_string_body,
            overlay: None,
        },
        create_get_a_primary_ql_class("TriviaToken", false),
    ];

    ql::Class {
        qldoc: Some(
            "A trivia token, such as a comment, preserved from the original parse tree."
                .to_owned(),
        ),
        name: "TriviaToken",
        is_abstract: false,
        supertypes: vec![ql::Type::At(trivia_token_type), ql::Type::Normal("AstNode")]
            .into_iter()
            .collect(),
        characteristic_predicate: None,
        predicates,
    }
}
// Creates the `ReservedWord` class.
pub fn create_reserved_word_class(db_name: &str) -> ql::Class<'_> {
let class_name = "ReservedWord";

View File

@@ -61,6 +61,18 @@ module Unified {
override string getAPrimaryQlClass() { result = "Token" }
}
/** A trivia token, such as a comment, preserved from the original parse tree. */
class TriviaToken extends @unified_trivia_token, AstNode {
/** Gets the source text of this trivia token. */
final string getValue() { unified_trivia_tokeninfo(this, _, result) }
/** Gets a string representation of this element. */
final override string toString() { result = this.getValue() }
/** Gets the name of the primary QL class for this element. */
override string getAPrimaryQlClass() { result = "TriviaToken" }
}
/** Gets the file containing the given `node`. */
private @file getNodeFile(@unified_ast_node node) {
exists(@location_default loc | unified_ast_node_location(node, loc) |

View File

@@ -0,0 +1,18 @@
/** Provides classes for working with comments. */
private import unified
/**
 * A comment appearing in the source code.
 *
 * Every `TriviaToken` is treated as a comment by this class.
 */
class Comment extends TriviaToken {
// At the moment, comments are the only type of trivia token we extract
/**
 * Gets the text inside this comment, not counting the delimiters.
 *
 * For a `//` comment this is everything after the leading `//`.
 * For a `/* ... */` comment this is everything between the opening and
 * closing delimiters, including any line breaks.
 */
string getCommentText() {
result = this.getValue().regexpCapture("//(.*)", 1)
or
result = this.getValue().regexpCapture("(?s)/\\*(.*)\\*/", 1)
}
}

View File

@@ -334,7 +334,13 @@ case @unified_token.kind of
;
@unified_ast_node = @unified_apply_pattern | @unified_binary_expr | @unified_block_stmt | @unified_call_expr | @unified_expr_condition | @unified_expr_stmt | @unified_guard_if_stmt | @unified_if_stmt | @unified_lambda_expr | @unified_let_pattern_condition | @unified_member_access_expr | @unified_name_expr | @unified_parameter | @unified_sequence_condition | @unified_token | @unified_top_level | @unified_tuple_pattern | @unified_unary_expr | @unified_var_pattern | @unified_variable_declaration_stmt | @unified_variable_declarator
unified_trivia_tokeninfo(
unique int id: @unified_trivia_token,
int kind: int ref,
string value: string ref
);
@unified_ast_node = @unified_apply_pattern | @unified_binary_expr | @unified_block_stmt | @unified_call_expr | @unified_expr_condition | @unified_expr_stmt | @unified_guard_if_stmt | @unified_if_stmt | @unified_lambda_expr | @unified_let_pattern_condition | @unified_member_access_expr | @unified_name_expr | @unified_parameter | @unified_sequence_condition | @unified_token | @unified_top_level | @unified_trivia_token | @unified_tuple_pattern | @unified_unary_expr | @unified_var_pattern | @unified_variable_declaration_stmt | @unified_variable_declarator
unified_ast_node_location(
unique int node: @unified_ast_node ref,

View File

@@ -0,0 +1,8 @@
/**
* Provides classes for working with the AST, as well as files and locations.
*/
import codeql.Locations
import codeql.files.FileSystem
import codeql.unified.Ast::Unified
import codeql.unified.Comments

View File

@@ -0,0 +1,3 @@
| comments.swift:1:1:1:22 | // Hello this is swift | Hello this is swift |
| comments.swift:3:1:6:3 | /*\n * This is a multi-line comment\n * It should be ignored by the parser\n */ | \n * This is a multi-line comment\n * It should be ignored by the parser\n |
| comments.swift:9:5:9:36 | // This is a single-line comment | This is a single-line comment |

View File

@@ -0,0 +1,3 @@
import unified
/** Holds if `c` is a comment and `text` is its text as given by `getCommentText`. */
query predicate comments(Comment c, string text) { text = c.getCommentText() }

View File

@@ -0,0 +1,11 @@
// Hello this is swift
/*
* This is a multi-line comment
* It should be ignored by the parser
*/
func hello() {
// This is a single-line comment
print("Hello, world!")
}