Rust: fetch ungram and rust-analyzer code instead of checking it in

* The ungram file is now taken from the rust-analyzer dependencies
  pulled in by bazel
* the grammar parsing code is not published, so it must be taken
  directly from rust-analyzer code. That part should be less prone to being
  updated than the ungram file, so it does not necessarily need to be
  in sync with the rust-analyzer version used elsewhere.
* both need some patches. The former is patched during build, the latter
  during loading in `MODULE.bazel`.
This commit is contained in:
Paolo Tranquilli
2024-12-18 16:25:30 +01:00
parent 023f48ff1c
commit 290a1043b1
141 changed files with 968 additions and 2902 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -1,287 +0,0 @@
//! Defines input for code generation process.
use quote::ToTokens;
use crate::codegen::grammar::to_upper_snake_case;
/// Input to the syntax-kind code generation step: the complete inventory of
/// token and node kinds the generated code must cover.
///
/// All slices are `'static`; the data is either a compile-time constant table
/// or intentionally leaked in `generate_kind_src` (generation is a one-shot job).
#[derive(Copy, Clone, Debug)]
pub(crate) struct KindsSrc {
    /// Punctuation tokens as `(source text, SCREAMING_SNAKE_CASE kind name)` pairs.
    pub(crate) punct: &'static [(&'static str, &'static str)],
    /// Unconditional keywords (contextual and edition-dependent ones excluded).
    pub(crate) keywords: &'static [&'static str],
    /// Words that act as keywords only in specific parse contexts.
    pub(crate) contextual_keywords: &'static [&'static str],
    /// Literal kinds (derived from `@`-prefixed grammar tokens).
    pub(crate) literals: &'static [&'static str],
    /// Plain token kinds (trivia plus `#`-prefixed grammar tokens).
    pub(crate) tokens: &'static [&'static str],
    /// Syntax-node kind names, upper-snake-cased.
    pub(crate) nodes: &'static [&'static str],
    /// Keywords paired with the first edition in which they become keywords.
    pub(crate) edition_dependent_keywords: &'static [(&'static str, Edition)],
}
/// Rust language editions, used to tag edition-dependent keywords.
///
/// Ordering matters for the `sort`/`dedup` done in `generate_kind_src`,
/// hence the `PartialOrd`/`Ord` derives.
#[allow(dead_code)]
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub(super) enum Edition {
    Edition2015,
    Edition2018,
    Edition2021,
    Edition2024,
}
impl ToTokens for Edition {
    /// Emits the matching `Edition::EditionYYYY` path into the output stream,
    /// so generated code can reference the edition by name.
    fn to_tokens(&self, tokens: &mut proc_macro2::TokenStream) {
        let path = match self {
            Edition::Edition2015 => quote::quote! { Edition::Edition2015 },
            Edition::Edition2018 => quote::quote! { Edition::Edition2018 },
            Edition::Edition2021 => quote::quote! { Edition::Edition2021 },
            Edition::Edition2024 => quote::quote! { Edition::Edition2024 },
        };
        tokens.extend(path);
    }
}
/// The punctuations of the language.
///
/// Each entry is `(source text, SCREAMING_SNAKE_CASE kind name)`.
const PUNCT: &[(&str, &str)] = &[
    // Keep "$" first: `generate_kind_src` unconditionally marks index 0 as
    // used, since "$" never appears in the grammar itself.
    ("$", "DOLLAR"),
    (";", "SEMICOLON"),
    (",", "COMMA"),
    ("(", "L_PAREN"),
    (")", "R_PAREN"),
    ("{", "L_CURLY"),
    ("}", "R_CURLY"),
    ("[", "L_BRACK"),
    ("]", "R_BRACK"),
    ("<", "L_ANGLE"),
    (">", "R_ANGLE"),
    ("@", "AT"),
    ("#", "POUND"),
    ("~", "TILDE"),
    ("?", "QUESTION"),
    ("&", "AMP"),
    ("|", "PIPE"),
    ("+", "PLUS"),
    ("*", "STAR"),
    ("/", "SLASH"),
    ("^", "CARET"),
    ("%", "PERCENT"),
    ("_", "UNDERSCORE"),
    (".", "DOT"),
    ("..", "DOT2"),
    ("...", "DOT3"),
    ("..=", "DOT2EQ"),
    (":", "COLON"),
    ("::", "COLON2"),
    ("=", "EQ"),
    ("==", "EQ2"),
    ("=>", "FAT_ARROW"),
    ("!", "BANG"),
    ("!=", "NEQ"),
    ("-", "MINUS"),
    ("->", "THIN_ARROW"),
    ("<=", "LTEQ"),
    (">=", "GTEQ"),
    ("+=", "PLUSEQ"),
    ("-=", "MINUSEQ"),
    ("|=", "PIPEEQ"),
    ("&=", "AMPEQ"),
    ("^=", "CARETEQ"),
    ("/=", "SLASHEQ"),
    ("*=", "STAREQ"),
    ("%=", "PERCENTEQ"),
    ("&&", "AMP2"),
    ("||", "PIPE2"),
    ("<<", "SHL"),
    (">>", "SHR"),
    ("<<=", "SHLEQ"),
    (">>=", "SHREQ"),
];
/// Token kinds that exist outside the grammar proper (trivia and error recovery).
const TOKENS: &[&str] = &["ERROR", "WHITESPACE", "NEWLINE", "COMMENT"];
// &["ERROR", "IDENT", "WHITESPACE", "LIFETIME_IDENT", "COMMENT", "SHEBANG"],;
/// Sentinel token name in the grammar; skipped entirely by `generate_kind_src`.
const EOF: &str = "EOF";
/// Words reserved by the language but not currently in use; they are merged
/// into the plain keyword list by `generate_kind_src`.
const RESERVED: &[&str] = &[
    "abstract", "become", "box", "do", "final", "macro", "override", "priv", "typeof", "unsized",
    "virtual", "yield",
];
// keywords that are keywords only in specific parse contexts
#[doc(alias = "WEAK_KEYWORDS")]
const CONTEXTUAL_KEYWORDS: &[&str] = &[
    "macro_rules",
    "union",
    "default",
    "raw",
    "dyn",
    "auto",
    "yeet",
    "safe",
];
// keywords we use for special macro expansions (e.g. asm! operands);
// commented-out entries overlap with other tables and are handled there
const CONTEXTUAL_BUILTIN_KEYWORDS: &[&str] = &[
    "asm",
    "att_syntax",
    "builtin",
    "clobber_abi",
    "format_args",
    // "in",
    "inlateout",
    "inout",
    "label",
    "lateout",
    "may_unwind",
    "nomem",
    "noreturn",
    "nostack",
    "offset_of",
    "options",
    "out",
    "preserves_flags",
    "pure",
    // "raw",
    "readonly",
    "sym",
];
// keywords that are keywords depending on the edition; each is paired with
// the first edition in which it becomes a keyword. NOTE(review): "dyn" also
// appears in CONTEXTUAL_KEYWORDS — generate_kind_src keeps both lists and
// only removes overlaps from the unconditional keyword list.
const EDITION_DEPENDENT_KEYWORDS: &[(&str, Edition)] = &[
    ("try", Edition::Edition2018),
    ("dyn", Edition::Edition2018),
    ("async", Edition::Edition2018),
    ("await", Edition::Edition2018),
    ("gen", Edition::Edition2024),
];
/// Builds the [`KindsSrc`] input for code generation by walking every token of
/// the ungrammar and classifying it into literals, tokens, keywords, or
/// punctuation, then merging in the static keyword tables.
///
/// Panics if the grammar references punctuation missing from [`PUNCT`], or if
/// an entry of [`PUNCT`] is never referenced by the grammar.
pub(crate) fn generate_kind_src(
    nodes: &[AstNodeSrc],
    enums: &[AstEnumSrc],
    grammar: &ungrammar::Grammar,
) -> KindsSrc {
    // Builtin macro keywords are folded into the contextual list so the
    // classification loop below treats both tables uniformly.
    let mut contextual_keywords: Vec<&_> =
        CONTEXTUAL_KEYWORDS.iter().chain(CONTEXTUAL_BUILTIN_KEYWORDS).copied().collect();
    let mut keywords: Vec<&_> = Vec::new();
    let mut literals: Vec<&_> = Vec::new();
    let mut tokens: Vec<&_> = TOKENS.to_vec();
    // Tracks which PUNCT entries the grammar actually references.
    let mut seen_punct = vec![false; PUNCT.len()];
    // "$" never occurs in the grammar; treat it as used up front.
    seen_punct[0] = true;
    for token in grammar.tokens() {
        let name = &*grammar[token].name;
        if name == EOF {
            continue;
        }
        let (sigil, rest) = name.split_at(1);
        if sigil == "@" && !rest.is_empty() {
            // "@"-prefixed grammar tokens denote literal kinds.
            literals.push(String::leak(to_upper_snake_case(rest)));
        } else if sigil == "#" && !rest.is_empty() {
            // "#"-prefixed grammar tokens denote plain token kinds.
            tokens.push(String::leak(to_upper_snake_case(rest)));
        } else if contextual_keywords.contains(&name) {
            // Contextual keywords are already accounted for by the tables.
        } else if name.chars().all(char::is_alphabetic) {
            keywords.push(String::leak(name.to_owned()));
        } else {
            let idx = PUNCT
                .iter()
                .position(|(punct, _)| punct == &name)
                .unwrap_or_else(|| panic!("Grammar references unknown punctuation {name:?}"));
            seen_punct[idx] = true;
        }
    }
    // Every entry of the static punctuation table must be used by the grammar.
    for (idx, (punct, _)) in PUNCT.iter().enumerate() {
        if !seen_punct[idx] {
            panic!("Punctuation {punct:?} is not used in grammar");
        }
    }
    keywords.extend(RESERVED.iter().copied());
    keywords.sort();
    keywords.dedup();
    contextual_keywords.sort();
    contextual_keywords.dedup();
    let mut edition_dependent_keywords: Vec<(&_, _)> = EDITION_DEPENDENT_KEYWORDS.to_vec();
    edition_dependent_keywords.sort();
    edition_dependent_keywords.dedup();
    // A word that is contextual or edition-dependent must not also appear in
    // the unconditional keyword list.
    keywords.retain(|&kw| !contextual_keywords.contains(&kw));
    keywords.retain(|&kw| !edition_dependent_keywords.iter().any(|&(dep, _)| dep == kw));
    // Everything below is leaked to 'static for simplicity, so we don't have
    // to deal with lifetimes; generation is a one-shot job, so that's fine.
    let mut node_names: Vec<&str> = Vec::new();
    for name in nodes.iter().map(|it| &it.name).chain(enums.iter().map(|it| &it.name)) {
        node_names.push(String::leak(to_upper_snake_case(name)));
    }
    let nodes = Vec::leak(node_names);
    nodes.sort();
    let keywords = Vec::leak(keywords);
    let contextual_keywords = Vec::leak(contextual_keywords);
    let edition_dependent_keywords = Vec::leak(edition_dependent_keywords);
    let literals = Vec::leak(literals);
    literals.sort();
    let tokens = Vec::leak(tokens);
    tokens.sort();
    KindsSrc {
        punct: PUNCT,
        nodes,
        keywords,
        contextual_keywords,
        edition_dependent_keywords,
        literals,
        tokens,
    }
}
/// The lowered AST description: every token, node, and enum extracted from
/// the grammar.
#[derive(Default, Debug)]
pub(crate) struct AstSrc {
    /// Token names referenced by the AST.
    pub(crate) tokens: Vec<String>,
    /// Concrete AST node descriptions.
    pub(crate) nodes: Vec<AstNodeSrc>,
    /// AST enum (node alternation) descriptions.
    pub(crate) enums: Vec<AstEnumSrc>,
}
/// Description of a single concrete AST node.
#[derive(Debug)]
pub(crate) struct AstNodeSrc {
    /// Documentation lines attached to the node.
    pub(crate) doc: Vec<String>,
    /// Node type name (e.g. `Fn`, `Struct`).
    pub(crate) name: String,
    /// Names of traits this node implements.
    pub(crate) traits: Vec<String>,
    /// Accessor fields the node exposes.
    pub(crate) fields: Vec<Field>,
}
/// A single accessor on an AST node: either a token or a child node.
#[derive(Debug, Eq, PartialEq)]
pub(crate) enum Field {
    /// A token child, identified by its token text/name.
    Token(String),
    /// A child node with a named accessor.
    Node {
        /// Accessor name.
        name: String,
        /// Type of the child node.
        ty: String,
        /// Whether the child is optional or repeated.
        cardinality: Cardinality,
    },
}
/// How many times a child node can occur in its parent.
#[derive(Debug, Eq, PartialEq)]
pub(crate) enum Cardinality {
    /// Zero or one occurrence.
    Optional,
    /// Zero or more occurrences.
    Many,
}
/// Description of an AST enum: a node that is an alternation of other nodes.
#[derive(Debug)]
pub(crate) struct AstEnumSrc {
    /// Documentation lines attached to the enum.
    pub(crate) doc: Vec<String>,
    /// Enum type name.
    pub(crate) name: String,
    /// Names of traits this enum implements.
    pub(crate) traits: Vec<String>,
    /// Names of the variant node types.
    pub(crate) variants: Vec<String>,
}

View File

@@ -10,7 +10,7 @@ use ungrammar::Grammar;
fn project_root() -> PathBuf {
let dir =
env::var("CARGO_MANIFEST_DIR").unwrap_or_else(|_| env!("CARGO_MANIFEST_DIR").to_owned());
env::var("CARGO_MANIFEST_DIR").unwrap().to_owned();
PathBuf::from(dir).parent().unwrap().to_owned()
}
@@ -591,10 +591,11 @@ impl Translator<'_> {{
}
fn main() -> std::io::Result<()> {
let grammar: Grammar = fs::read_to_string(project_root().join("ast-generator/rust.ungram"))
.unwrap()
let grammar = PathBuf::from("..").join(env::args().nth(1).expect("grammar file path required"));
let grammar: Grammar = fs::read_to_string(&grammar)
.expect(&format!("Failed to parse grammar file: {}", grammar.display()))
.parse()
.unwrap();
.expect("Failed to parse grammar");
let mut grammar = codegen::grammar::lower(&grammar);
grammar.enums.retain(|x| x.name != "Adt");