Rust: generate schema.py and extractor from ungrammar

Arthur Baars
2024-09-13 16:57:54 +02:00
parent 61ac8d66f5
commit 1f30d5f41b
8 changed files with 539 additions and 84 deletions
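
In outline, the new generate-schema binary parses rust-analyzer's rust.ungram with the ungrammar crate, lowers it to an AST description (an apparently vendored copy of rust-analyzer's xtask codegen), and prints both the schema.py classes and the extractor's emit_* methods from that single source of truth. As a minimal sketch of the ungrammar API this builds on (the toy grammar below is illustrative, not taken from rust.ungram):

use ungrammar::Grammar;

fn main() {
    // Two toy productions in ungrammar notation: quoted atoms are tokens,
    // bare names are nodes, and `label:` names a field.
    let src = "Path = (qualifier:Path '::')? segment:PathSegment\nPathSegment = 'ident'";
    let grammar: Grammar = src.parse().expect("valid ungrammar");
    for node in grammar.iter() {
        // Indexing by the node handle yields its name and rule tree.
        let data = &grammar[node];
        println!("{} = {:?}", data.name, data.rule);
    }
}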

Cargo.lock (generated)
View File

@@ -382,6 +382,7 @@ dependencies = [
  "ra_ap_hir_def",
  "ra_ap_ide_db",
  "ra_ap_load-cargo",
+ "ra_ap_parser",
  "ra_ap_paths",
  "ra_ap_project_model",
  "ra_ap_syntax",

View File

@@ -61,6 +61,7 @@ r.from_cargo(
         "//ruby/extractor:Cargo.toml",
         "//rust/extractor:Cargo.toml",
         "//rust/extractor/macros:Cargo.toml",
+        "//rust/generate-schema:Cargo.toml",
         "//shared/tree-sitter-extractor:Cargo.toml",
     ],
 )

View File

@@ -18,6 +18,7 @@ ra_ap_paths = "0.0.232"
 ra_ap_project_model = "0.0.232"
 ra_ap_syntax = "0.0.232"
 ra_ap_vfs = "0.0.232"
+ra_ap_parser = "0.0.232"
 serde = "1.0.209"
 serde_with = "3.9.0"
 stderrlog = "0.6.0"

View File

@@ -1,37 +1,11 @@
use crate::trap::TrapId;
use anyhow::Context; use anyhow::Context;
use itertools::Itertools; use ra_ap_ide_db::line_index::LineIndex;
use log::info;
use ra_ap_hir::db::DefDatabase;
use ra_ap_hir::Crate;
use ra_ap_load_cargo::{load_workspace_at, LoadCargoConfig, ProcMacroServerChoice};
use ra_ap_project_model::CargoConfig;
use ra_ap_project_model::RustLibSource;
use ra_ap_vfs::AbsPathBuf;
use std::path::PathBuf;
mod archive; mod archive;
mod config; mod config;
pub mod generated; pub mod generated;
mod translate; mod translate;
pub mod trap; pub mod trap;
fn find_project_manifests(
files: &[PathBuf],
) -> anyhow::Result<Vec<ra_ap_project_model::ProjectManifest>> {
let current = std::env::current_dir()?;
let abs_files: Vec<_> = files
.iter()
.map(|path| AbsPathBuf::assert_utf8(current.join(path)))
.collect();
let ret = ra_ap_project_model::ProjectManifest::discover_all(&abs_files);
info!(
"found manifests: {}",
ret.iter().map(|m| format!("{m}")).join(", ")
);
Ok(ret)
}
fn main() -> anyhow::Result<()> { fn main() -> anyhow::Result<()> {
let cfg = config::Config::extract().context("failed to load configuration")?; let cfg = config::Config::extract().context("failed to load configuration")?;
stderrlog::new() stderrlog::new()
@@ -43,52 +17,20 @@ fn main() -> anyhow::Result<()> {
let archiver = archive::Archiver { let archiver = archive::Archiver {
root: cfg.source_archive_dir, root: cfg.source_archive_dir,
}; };
for file in cfg.inputs {
let config = CargoConfig { let file = std::path::absolute(&file).unwrap_or(file);
sysroot: Some(RustLibSource::Discover), let file = std::fs::canonicalize(&file).unwrap_or(file);
target_dir: ra_ap_paths::Utf8PathBuf::from_path_buf(cfg.scratch_dir) archiver.archive(&file);
.map(|x| x.join("target")) let input = std::fs::read(&file)?;
.ok(), let input = String::from_utf8(input)?;
..Default::default() let line_index = LineIndex::new(&input);
}; let display_path = file.to_string_lossy();
let progress = |t| (log::info!("progress: {}", t)); let mut trap = traps.create("source", &file);
let load_config = LoadCargoConfig { let label = trap.emit_file(&file);
load_out_dirs_from_check: true, translate::SourceFileTranslator::new(trap, label, line_index)
with_proc_macro_server: ProcMacroServerChoice::Sysroot, .extract(&display_path, &input)
prefill_caches: false, .context("writing trap file")?;
};
let projects = find_project_manifests(&cfg.inputs).context("loading inputs")?;
for project in projects {
let (db, vfs, _macro_server) = load_workspace_at(
project.manifest_path().as_ref(),
&config,
&load_config,
&progress,
)?;
let crates = <dyn DefDatabase>::crate_graph(&db);
for crate_id in crates.iter() {
let krate = Crate::from(crate_id);
if !cfg.extract_dependencies && !krate.origin(&db).is_local() {
continue;
}
let name = krate.display_name(&db);
let crate_name = name
.as_ref()
.map(|n| n.canonical_name().as_str())
.unwrap_or("");
let trap = traps.create(
"crates",
&PathBuf::from(format!(
"/{}_{}",
crate_name,
crate_id.into_raw().into_u32()
)),
);
translate::CrateTranslator::new(&db, trap, &krate, &vfs, &archiver)
.emit_crate()
.context("writing trap file")?;
}
} }
Ok(()) Ok(())
} }
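
The rewritten main() drops the cargo/rust-analyzer workspace loading and instead walks cfg.inputs directly: each file is archived, read, and handed to the new SourceFileTranslator together with a LineIndex. The LineIndex is what turns the parser's byte offsets into line/column pairs for TRAP locations; a minimal sketch of that conversion, assuming ra_ap_ide_db::line_index follows rust-analyzer's usual line-index API:

use ra_ap_ide_db::line_index::LineIndex;
use ra_ap_syntax::TextSize;

fn demo() {
    let text = "fn main() {\n    println!(\"hi\");\n}\n";
    let index = LineIndex::new(text);
    // Byte offset 16 is the `p` of `println!` on the second line.
    let pos = index.line_col(TextSize::from(16));
    // LineCol is zero-based: line 1, column 4.
    assert_eq!((pos.line, pos.col), (1, 4));
}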

View File

@@ -0,0 +1,14 @@
load("//misc/bazel:rust.bzl", "codeql_rust_binary")
codeql_rust_binary(
name = "generate-schema",
srcs = glob(["src/**/*.rs"]),
aliases = aliases(),
proc_macro_deps = all_crate_deps(
proc_macro = True,
),
visibility = ["//rust:__subpackages__"],
deps = all_crate_deps(
normal = True,
),
)
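
Assuming the standard Bazel workflow used by the sibling extractor targets, the generator would presumably be invoked as bazel run //rust/generate-schema (the label follows from name = "generate-schema" above), with its stdout redirected into the generated file.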

View File

@@ -1,5 +1,4 @@
 pub mod grammar;
 
 pub fn reformat(x: String) -> String {
     x
 }

View File

@@ -21,7 +21,7 @@ use crate::{
     project_root,
 };
 
-mod ast_src;
+pub mod ast_src;
 use self::ast_src::{AstEnumSrc, AstNodeSrc, AstSrc, Cardinality, Field, KindsSrc};
 
 pub(crate) fn generate(check: bool) {
@@ -621,10 +621,16 @@ fn pluralize(s: &str) -> String {
 }
 
 impl Field {
-    fn is_many(&self) -> bool {
-        matches!(self, Field::Node { cardinality: Cardinality::Many, .. })
+    pub fn is_many(&self) -> bool {
+        matches!(
+            self,
+            Field::Node {
+                cardinality: Cardinality::Many,
+                ..
+            }
+        )
     }
-    fn token_kind(&self) -> Option<proc_macro2::TokenStream> {
+    pub fn token_kind(&self) -> Option<proc_macro2::TokenStream> {
         match self {
             Field::Token(token) => {
                 let token: proc_macro2::TokenStream = token.parse().unwrap();
@@ -633,7 +639,7 @@ impl Field {
             _ => None,
         }
     }
-    fn method_name(&self) -> String {
+    pub fn method_name(&self) -> String {
         match self {
             Field::Token(name) => {
                 let name = match name.as_str() {
@@ -679,7 +685,7 @@ impl Field {
             }
         }
     }
-    fn ty(&self) -> proc_macro2::Ident {
+    pub fn ty(&self) -> proc_macro2::Ident {
         match self {
             Field::Token(_) => format_ident!("SyntaxToken"),
             Field::Node { ty, .. } => format_ident!("{}", ty),
@@ -696,7 +702,7 @@ fn clean_token_name(name: &str) -> String {
     }
 }
 
-fn lower(grammar: &Grammar) -> AstSrc {
+pub(crate) fn lower(grammar: &Grammar) -> AstSrc {
     let mut res = AstSrc {
         tokens:
             "Whitespace Comment String ByteString CString IntNumber FloatNumber Char Byte Ident"

View File
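
(The diff below is the generator's main program. For orientation, the lowered grammar types it consumes look roughly like this, mirroring rust-analyzer's xtask ast_src module from which codegen/grammar.rs is taken; treat the exact field shapes as assumptions based on how main.rs uses them.)

pub struct AstSrc {
    pub tokens: Vec<String>,    // token kinds, e.g. "Whitespace", "Ident"
    pub nodes: Vec<AstNodeSrc>, // one entry per grammar node, e.g. IfExpr
    pub enums: Vec<AstEnumSrc>, // one entry per alternation, e.g. Expr
}

pub struct AstNodeSrc {
    pub name: String,        // "IfExpr"
    pub traits: Vec<String>, // e.g. ["HasAttrs"]
    pub fields: Vec<Field>,
}

pub struct AstEnumSrc {
    pub name: String,          // "Expr"
    pub traits: Vec<String>,
    pub variants: Vec<String>, // ["ArrayExpr", "IfExpr", ...]
}

pub enum Field {
    Token(String),
    Node { name: String, ty: String, cardinality: Cardinality },
}

pub enum Cardinality {
    Optional,
    Many,
}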

@@ -1,12 +1,503 @@
-use std::path::PathBuf;
-mod codegen;
+use std::{fs, path::PathBuf};
+
+pub mod codegen;
 mod flags;
+
+use codegen::grammar::ast_src::{AstNodeSrc, AstSrc};
+use std::collections::{BTreeMap, BTreeSet};
 use std::env;
+use ungrammar::Grammar;
 
 fn project_root() -> PathBuf {
     let dir =
         env::var("CARGO_MANIFEST_DIR").unwrap_or_else(|_| env!("CARGO_MANIFEST_DIR").to_owned());
     PathBuf::from(dir).parent().unwrap().to_owned()
 }
 
-fn main() {}
+fn class_name(type_name: &String) -> String {
+    match type_name.as_str() {
+        "BinExpr" => "BinaryExpr".to_owned(),
+        "ElseBranch" => "Expr".to_owned(),
+        "Fn" => "Function".to_owned(),
+        "Literal" => "LiteralExpr".to_owned(),
+        "Type" => "TypeRef".to_owned(),
+        _ => type_name.to_owned(),
+    }
+}
+
+fn property_name(type_name: &String, field_name: &String) -> String {
+    match (type_name.as_str(), field_name.as_str()) {
+        ("Path", "segment") => "part".to_owned(),
+        (_, "then_branch") => "then".to_owned(),
+        (_, "else_branch") => "else_".to_owned(),
+        _ => field_name.to_owned(),
+    }
+}
+
+fn to_lower_snake_case(s: &str) -> String {
+    let mut buf = String::with_capacity(s.len());
+    let mut prev = false;
+    for c in s.chars() {
+        if c.is_ascii_uppercase() && prev {
+            buf.push('_')
+        }
+        prev = true;
+        buf.push(c.to_ascii_lowercase());
+    }
+    buf
+}
+
+fn print_schema(grammar: &AstSrc, super_types: BTreeMap<String, BTreeSet<String>>) {
+    for node in &grammar.enums {
+        let super_classes = if let Some(cls) = super_types.get(&node.name) {
+            let super_classes: Vec<String> = cls.iter().map(|x| class_name(x)).collect();
+            super_classes.join(",")
+        } else {
+            "AstNode".to_owned()
+        };
+        println!("class {}({}):", class_name(&node.name), super_classes);
+        println!("    pass");
+        println!("");
+    }
+    for node in &grammar.nodes {
+        let super_classes = if let Some(cls) = super_types.get(&node.name) {
+            let super_classes: Vec<String> = cls.iter().map(|x| class_name(x)).collect();
+            super_classes.join(",")
+        } else {
+            "AstNode".to_owned()
+        };
+        println!("class {}({}):", class_name(&node.name), super_classes);
+        let mut empty = true;
+        for field in get_fields(node) {
+            if field.tp == "SyntaxToken" {
+                continue;
+            }
+            empty = false;
+            if field.tp == "string" {
+                println!(
+                    "    {}: optional[string]",
+                    property_name(&node.name, &field.name),
+                );
+            } else {
+                let list = field.is_many;
+                let (o, c) = if list {
+                    ("list[", "]")
+                } else {
+                    ("optional[", "]")
+                };
+                println!(
+                    "    {}: {}\"{}\"{} | child",
+                    property_name(&node.name, &field.name),
+                    o,
+                    class_name(&field.tp),
+                    c
+                );
+            };
+        }
+        if empty {
+            println!("    pass");
+        }
+        println!("");
+    }
+}
+
+struct FieldInfo {
+    name: String,
+    tp: String,
+    is_many: bool,
+}
+
+fn get_fields(node: &AstNodeSrc) -> Vec<FieldInfo> {
+    let mut result = Vec::new();
+    match node.name.as_str() {
+        "Name" | "NameRef" | "Lifetime" => {
+            result.push(FieldInfo {
+                name: "text".to_string(),
+                tp: "string".to_string(),
+                is_many: false,
+            });
+        }
+        "Abi" => {
+            result.push(FieldInfo {
+                name: "abi_string".to_string(),
+                tp: "string".to_string(),
+                is_many: false,
+            });
+        }
+        "Literal" => {
+            result.push(FieldInfo {
+                name: "text_value".to_string(),
+                tp: "string".to_string(),
+                is_many: false,
+            });
+        }
+        "PrefixExpr" => {
+            result.push(FieldInfo {
+                name: "operator_name".to_string(),
+                tp: "string".to_string(),
+                is_many: false,
+            });
+        }
+        "BinExpr" => {
+            result.push(FieldInfo {
+                name: "lhs".to_string(),
+                tp: "Expr".to_string(),
+                is_many: false,
+            });
+            result.push(FieldInfo {
+                name: "rhs".to_string(),
+                tp: "Expr".to_string(),
+                is_many: false,
+            });
+            result.push(FieldInfo {
+                name: "operator_name".to_string(),
+                tp: "string".to_string(),
+                is_many: false,
+            });
+        }
+        "IfExpr" => {
+            result.push(FieldInfo {
+                name: "then_branch".to_string(),
+                tp: "BlockExpr".to_string(),
+                is_many: false,
+            });
+            result.push(FieldInfo {
+                name: "else_branch".to_string(),
+                tp: "ElseBranch".to_string(),
+                is_many: false,
+            });
+            result.push(FieldInfo {
+                name: "condition".to_string(),
+                tp: "Expr".to_string(),
+                is_many: false,
+            });
+        }
+        "RangeExpr" => {
+            result.push(FieldInfo {
+                name: "start".to_string(),
+                tp: "Expr".to_string(),
+                is_many: false,
+            });
+            result.push(FieldInfo {
+                name: "end".to_string(),
+                tp: "Expr".to_string(),
+                is_many: false,
+            });
+            result.push(FieldInfo {
+                name: "operator_name".to_string(),
+                tp: "string".to_string(),
+                is_many: false,
+            });
+        }
+        "RangePat" => {
+            result.push(FieldInfo {
+                name: "start".to_string(),
+                tp: "Pat".to_string(),
+                is_many: false,
+            });
+            result.push(FieldInfo {
+                name: "end".to_string(),
+                tp: "Pat".to_string(),
+                is_many: false,
+            });
+            result.push(FieldInfo {
+                name: "operator_name".to_string(),
+                tp: "string".to_string(),
+                is_many: false,
+            });
+        }
+        "IndexExpr" => {
+            result.push(FieldInfo {
+                name: "index".to_string(),
+                tp: "Expr".to_string(),
+                is_many: false,
+            });
+            result.push(FieldInfo {
+                name: "base".to_string(),
+                tp: "Expr".to_string(),
+                is_many: false,
+            });
+        }
+        "Impl" => {
+            result.push(FieldInfo {
+                name: "trait_".to_string(),
+                tp: "Type".to_string(),
+                is_many: false,
+            });
+            result.push(FieldInfo {
+                name: "self_ty".to_string(),
+                tp: "Type".to_string(),
+                is_many: false,
+            });
+        }
+        "ForExpr" => {
+            result.push(FieldInfo {
+                name: "iterable".to_string(),
+                tp: "Expr".to_string(),
+                is_many: false,
+            });
+        }
+        "WhileExpr" => {
+            result.push(FieldInfo {
+                name: "condition".to_string(),
+                tp: "Expr".to_string(),
+                is_many: false,
+            });
+        }
+        "MatchGuard" => {
+            result.push(FieldInfo {
+                name: "condition".to_string(),
+                tp: "Expr".to_string(),
+                is_many: false,
+            });
+        }
+        "MacroDef" => {
+            result.push(FieldInfo {
+                name: "args".to_string(),
+                tp: "TokenTree".to_string(),
+                is_many: false,
+            });
+            result.push(FieldInfo {
+                name: "body".to_string(),
+                tp: "TokenTree".to_string(),
+                is_many: false,
+            });
+        }
+        "FormatArgsExpr" => {
+            result.push(FieldInfo {
+                name: "args".to_string(),
+                tp: "FormatArgsArg".to_string(),
+                is_many: true,
+            });
+        }
+        "ArgList" => {
+            result.push(FieldInfo {
+                name: "args".to_string(),
+                tp: "Expr".to_string(),
+                is_many: true,
+            });
+        }
+        "Fn" => {
+            result.push(FieldInfo {
+                name: "body".to_string(),
+                tp: "BlockExpr".to_string(),
+                is_many: false,
+            });
+        }
+        "Const" => {
+            result.push(FieldInfo {
+                name: "body".to_string(),
+                tp: "Expr".to_string(),
+                is_many: false,
+            });
+        }
+        "Static" => {
+            result.push(FieldInfo {
+                name: "body".to_string(),
+                tp: "Expr".to_string(),
+                is_many: false,
+            });
+        }
+        "ClosureExpr" => {
+            result.push(FieldInfo {
+                name: "body".to_string(),
+                tp: "Expr".to_string(),
+                is_many: false,
+            });
+        }
+        _ => {}
+    }
+    for field in &node.fields {
+        // The ArrayExpr type also has an 'exprs' field
+        if node.name == "ArrayExpr" && field.method_name() == "expr" {
+            continue;
+        }
+        result.push(FieldInfo {
+            name: field.method_name(),
+            tp: field.ty().to_string(),
+            is_many: field.is_many(),
+        });
+    }
+    for trait_ in &node.traits {
+        match trait_.as_str() {
+            "HasAttrs" => result.push(FieldInfo {
+                name: "attrs".to_owned(),
+                tp: "Attr".to_owned(),
+                is_many: true,
+            }),
+            "HasName" => result.push(FieldInfo {
+                name: "name".to_owned(),
+                tp: "Name".to_owned(),
+                is_many: false,
+            }),
+            "HasVisibility" => result.push(FieldInfo {
+                name: "visibility".to_owned(),
+                tp: "Visibility".to_owned(),
+                is_many: false,
+            }),
+            "HasGenericParams" => {
+                result.push(FieldInfo {
+                    name: "generic_param_list".to_owned(),
+                    tp: "GenericParamList".to_owned(),
+                    is_many: false,
+                });
+                result.push(FieldInfo {
+                    name: "where_clause".to_owned(),
+                    tp: "WhereClause".to_owned(),
+                    is_many: false,
+                })
+            }
+            "HasGenericArgs" => result.push(FieldInfo {
+                name: "generic_arg_list".to_owned(),
+                tp: "GenericArgList".to_owned(),
+                is_many: false,
+            }),
+            "HasTypeBounds" => result.push(FieldInfo {
+                name: "type_bound_list".to_owned(),
+                tp: "TypeBoundList".to_owned(),
+                is_many: false,
+            }),
+            "HasModuleItem" => result.push(FieldInfo {
+                name: "items".to_owned(),
+                tp: "Item".to_owned(),
+                is_many: true,
+            }),
+            "HasLoopBody" => {
+                result.push(FieldInfo {
+                    name: "label".to_owned(),
+                    tp: "Label".to_owned(),
+                    is_many: false,
+                });
+                result.push(FieldInfo {
+                    name: "loop_body".to_owned(),
+                    tp: "BlockExpr".to_owned(),
+                    is_many: false,
+                })
+            }
+            "HasArgList" => result.push(FieldInfo {
+                name: "arg_list".to_owned(),
+                tp: "ArgList".to_owned(),
+                is_many: false,
+            }),
+            "HasDocComments" => {}
+            _ => panic!("Unknown trait {}", trait_),
+        };
+    }
+    result.sort_by(|x, y| x.name.cmp(&y.name));
+    result
+}
+
+fn print_extractor(grammar: &AstSrc) {
+    for node in &grammar.enums {
+        let type_name = &node.name;
+        let class_name = class_name(&node.name);
+        println!(
+            "    fn emit_{}(&mut self, node: ast::{}) -> Label<generated::{}> {{",
+            to_lower_snake_case(type_name),
+            type_name,
+            class_name
+        );
+        println!("        match node {{");
+        for variant in &node.variants {
+            println!(
+                "            ast::{}::{}(inner) => self.emit_{}(inner).into(),",
+                type_name,
+                variant,
+                to_lower_snake_case(variant)
+            );
+        }
+        println!("        }}");
+        println!("    }}\n");
+    }
+    for node in &grammar.nodes {
+        let type_name = &node.name;
+        let class_name = class_name(&node.name);
+        println!(
+            "    fn emit_{}(&mut self, node: ast::{}) -> Label<generated::{}> {{",
+            to_lower_snake_case(type_name),
+            type_name,
+            class_name
+        );
+        for field in get_fields(&node) {
+            if &field.tp == "SyntaxToken" {
+                continue;
+            }
+            let type_name = &field.tp;
+            let struct_field_name = &field.name;
+            let class_field_name = property_name(&node.name, &field.name);
+            if field.tp == "string" {
+                println!("        let {} = node.try_get_text();", class_field_name,);
+            } else if field.is_many {
+                println!(
+                    "        let {} = node.{}().map(|x| self.emit_{}(x)).collect();",
+                    class_field_name,
+                    struct_field_name,
+                    to_lower_snake_case(type_name)
+                );
+            } else {
+                println!(
+                    "        let {} = node.{}().map(|x| self.emit_{}(x));",
+                    class_field_name,
+                    struct_field_name,
+                    to_lower_snake_case(type_name)
+                );
+            }
+        }
+        println!(
+            "        let label = self.trap.emit(generated::{} {{",
+            class_name
+        );
+        println!("            id: TrapId::Star,");
+        for field in get_fields(&node) {
+            if field.tp == "SyntaxToken" {
+                continue;
+            }
+            let class_field_name: String = property_name(&node.name, &field.name);
+            println!("            {},", class_field_name);
+        }
+        println!("        }});");
+        println!("        self.emit_location(label, node);");
+        println!("        label");
+        println!("    }}\n");
+    }
+}
+
+fn main() {
+    let grammar: Grammar = fs::read_to_string(project_root().join("generate-schema/rust.ungram"))
+        .unwrap()
+        .parse()
+        .unwrap();
+    let mut grammar = codegen::grammar::lower(&grammar);
+    grammar
+        .nodes
+        .retain(|x| x.name != "MacroStmts" && x.name != "MacroItems");
+    grammar.enums.retain(|x| x.name != "Adt");
+    let mut super_types: BTreeMap<String, BTreeSet<String>> = BTreeMap::new();
+    for node in &grammar.enums {
+        for variant in &node.variants {
+            let set = super_types
+                .entry(variant.to_owned())
+                .or_insert_with(|| BTreeSet::new());
+            set.insert(node.name.to_owned());
+        }
+    }
+    // sort things while ensuring super classes are defined before they are used
+    grammar.enums.sort_by(|x, y| {
+        let super_class_x = super_types.get(&x.name).into_iter().flatten().max();
+        let super_class_y = super_types.get(&y.name).into_iter().flatten().max();
+        super_class_x.cmp(&super_class_y).then(x.name.cmp(&y.name))
+    });
+    //print_schema(&grammar, super_types);
+    print_extractor(&grammar);
+}
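
Tracing print_extractor by hand for one node shows the shape of the generated code. For IfExpr (a variant of the Expr enum, with condition/then_branch/else_branch added manually in get_fields and attrs contributed by the HasAttrs trait), the output should look roughly like this; the exact accessor set depends on the lowered grammar, so treat it as an illustration:

    fn emit_if_expr(&mut self, node: ast::IfExpr) -> Label<generated::IfExpr> {
        // get_fields sorts by name: attrs, condition, else_branch, then_branch;
        // property_name renames the last two to `else_` and `then`.
        let attrs = node.attrs().map(|x| self.emit_attr(x)).collect();
        let condition = node.condition().map(|x| self.emit_expr(x));
        let else_ = node.else_branch().map(|x| self.emit_else_branch(x));
        let then = node.then_branch().map(|x| self.emit_block_expr(x));
        let label = self.trap.emit(generated::IfExpr {
            id: TrapId::Star,
            attrs,
            condition,
            else_,
            then,
        });
        self.emit_location(label, node);
        label
    }

The commented-out print_schema call would emit the matching Python-style declaration (class IfExpr(Expr) with the same four properties), which is how schema.py and the extractor are kept in sync from the one grammar.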