Files
codeql/unified/extractor/tests/corpus_tests.rs

294 lines
9.5 KiB
Rust

use std::fs;
use std::path::Path;
use codeql_extractor::extractor::simple;
use yeast::{dump::dump_ast, dump::dump_ast_with_type_errors, Runner};
#[path = "../src/languages/mod.rs"]
mod languages;
#[derive(Debug)]
struct CorpusCase {
name: String,
input: String,
raw: String,
expected: String,
}
fn update_mode_enabled() -> bool {
std::env::var("UNIFIED_UPDATE_CORPUS")
.map(|v| matches!(v.to_ascii_lowercase().as_str(), "1" | "true" | "yes" | "on"))
.unwrap_or(false)
}
fn is_header_rule(line: &str) -> bool {
let trimmed = line.trim();
trimmed.len() >= 3 && trimmed.chars().all(|c| c == '=')
}
fn is_next_case_header(lines: &[&str], i: usize) -> bool {
is_header_rule(lines[i])
&& i + 2 < lines.len()
&& !lines[i + 1].trim().is_empty()
&& is_header_rule(lines[i + 2])
}
fn parse_corpus(content: &str) -> Vec<CorpusCase> {
let lines: Vec<&str> = content.lines().collect();
let mut i = 0;
let mut cases = Vec::new();
while i < lines.len() {
while i < lines.len() && lines[i].trim().is_empty() {
i += 1;
}
if i >= lines.len() {
break;
}
assert!(
is_header_rule(lines[i]),
"Expected header delimiter at line {}",
i + 1
);
i += 1;
assert!(i < lines.len(), "Missing test name at line {}", i + 1);
let name = lines[i].trim().to_string();
i += 1;
assert!(
i < lines.len() && is_header_rule(lines[i]),
"Missing closing header delimiter for case {name}"
);
i += 1;
let input_start = i;
while i < lines.len() && lines[i].trim() != "---" {
if is_next_case_header(&lines, i) {
break;
}
i += 1;
}
let input = lines[input_start..i].join("\n").trim_end().to_string();
let raw;
let expected;
if i >= lines.len() || lines[i].trim() != "---" {
// No `---` separator before next case (or EOF). Treat the
// remaining sections as empty.
raw = String::new();
expected = String::new();
} else {
i += 1;
// Raw tree-sitter parse section. New-format files have a second
// `---` separator between the raw tree and the mapped AST. Legacy
// files (with only one separator) have no raw section — in that
// case `raw` stays empty and update mode will populate it.
let raw_start = i;
let mut next_sep = i;
while next_sep < lines.len() && lines[next_sep].trim() != "---" {
if is_next_case_header(&lines, next_sep) {
break;
}
next_sep += 1;
}
raw = if next_sep < lines.len() && lines[next_sep].trim() == "---" {
let raw_text = lines[raw_start..next_sep].join("\n").trim().to_string();
i = next_sep + 1;
raw_text
} else {
String::new()
};
let expected_start = i;
while i < lines.len() {
if is_next_case_header(&lines, i) {
break;
}
i += 1;
}
expected = lines[expected_start..i].join("\n").trim().to_string();
}
cases.push(CorpusCase {
name,
input,
raw,
expected,
});
}
cases
}
fn render_corpus(cases: &[CorpusCase]) -> String {
let mut out = String::new();
for (idx, case) in cases.iter().enumerate() {
if idx > 0 {
// Blank line between cases.
out.push('\n');
}
out.push_str("===\n");
out.push_str(case.name.trim());
out.push_str("\n===\n\n");
out.push_str(case.input.trim());
out.push_str("\n\n---\n\n");
out.push_str(case.raw.trim());
out.push_str("\n\n---\n\n");
out.push_str(case.expected.trim());
// Single trailing newline per case; the inter-case blank line is
// added by the prefix above, and the file ends with exactly one `\n`.
out.push('\n');
}
out
}
fn run_desugaring(
lang: &simple::LanguageSpec,
input: &str,
) -> Result<yeast::Ast, String> {
let runner = match lang.desugar.as_ref() {
Some(config) => Runner::from_config(lang.ts_language.clone(), config)
.map_err(|e| format!("Failed to create yeast runner: {e}"))?,
None => Runner::new(lang.ts_language.clone(), &[]),
};
runner
.run(input)
.map_err(|e| format!("Failed to parse input: {e}"))
}
/// Produce the raw tree-sitter parse tree dump for `input`, with no
/// desugaring rules applied. Uses a `Runner` with an empty phase list and
/// the input grammar's own schema.
fn dump_raw_parse(
lang: &simple::LanguageSpec,
input: &str,
) -> Result<String, String> {
let runner = Runner::new(lang.ts_language.clone(), &[]);
let ast = runner
.run(input)
.map_err(|e| format!("Failed to parse input: {e}"))?;
Ok(dump_ast(&ast, ast.get_root(), input))
}
#[test]
fn test_corpus() {
let update_mode = update_mode_enabled();
let all_languages = languages::all_language_specs();
let corpus_dir = Path::new("tests/corpus");
for lang in all_languages {
let output_schema = yeast::node_types_yaml::schema_from_yaml_with_language(
languages::OUTPUT_AST_SCHEMA,
&lang.ts_language,
)
.expect("Failed to parse OUTPUT_AST_SCHEMA YAML");
let lang_corpus_dir = corpus_dir.join(&lang.prefix);
if !lang_corpus_dir.exists() {
continue;
}
let mut corpus_files: Vec<_> = fs::read_dir(&lang_corpus_dir)
.unwrap_or_else(|e| {
panic!(
"Failed to read corpus directory {}: {e}",
lang_corpus_dir.display()
)
})
.map(|entry| entry.expect("Failed to read corpus entry").path())
.filter(|path| path.extension().is_some_and(|ext| ext == "txt"))
.collect();
corpus_files.sort();
for corpus_path in corpus_files {
let content = fs::read_to_string(&corpus_path)
.unwrap_or_else(|e| panic!("Failed to read {}: {e}", corpus_path.display()));
let mut cases = parse_corpus(&content);
let mut failures = Vec::new();
assert!(
!cases.is_empty(),
"No corpus cases found in {}",
corpus_path.display()
);
for case in &mut cases {
match dump_raw_parse(&lang, &case.input) {
Err(e) => {
failures.push(format!(
"Raw parse failed for {} in {}: {}",
case.name,
corpus_path.display(),
e
));
}
Ok(actual_raw) => {
if update_mode {
case.raw = actual_raw.trim().to_string();
} else if case.raw.trim() != actual_raw.trim() {
failures.push(format!(
"Raw parse mismatch in {}: \"{}\"\nEXPECTED:\n\n{}\n\nACTUAL:\n\n{}",
corpus_path.display(),
case.name,
case.raw.trim(),
actual_raw.trim()
));
}
}
}
match run_desugaring(&lang, &case.input) {
Err(e) => {
failures.push(format!(
"Desugaring failed for {} in {}: {}",
case.name,
corpus_path.display(),
e
));
}
Ok(actual) => {
let actual_dump = dump_ast_with_type_errors(
&actual,
actual.get_root(),
&case.input,
&output_schema,
);
if update_mode {
case.expected = actual_dump.trim().to_string();
} else if case.expected.trim() != actual_dump.trim() {
failures.push(format!(
"Test failed in {}: \"{}\"\nEXPECTED:\n\n{}\n\nACTUAL:\n\n{}",
corpus_path.display(),
case.name,
case.expected.trim(),
actual_dump.trim()
));
}
}
}
}
assert!(
failures.is_empty(),
"{}",
failures.join("\n\n") + "\n\n"
);
if update_mode {
let updated = render_corpus(&cases);
let write_result = fs::write(&corpus_path, updated);
assert!(
write_result.is_ok(),
"Failed to update corpus file {}: {}",
corpus_path.display(),
write_result.err().map_or_else(String::new, |e| e.to_string())
);
}
}
}
}