From cad2b7413764b67fe18e4653e325081e8f55be12 Mon Sep 17 00:00:00 2001 From: Arthur Baars Date: Tue, 24 Sep 2024 10:12:55 +0200 Subject: [PATCH] Rust: integrate Rust Analyzer's Semantic module into extractor --- Cargo.lock | 2 + rust/ast-generator/src/main.rs | 2 +- rust/extractor/Cargo.toml | 2 + rust/extractor/src/main.rs | 71 +++-------- rust/extractor/src/rust_analyzer.rs | 144 ++++++++++++++++++++++ rust/extractor/src/translate/base.rs | 15 ++- rust/extractor/src/translate/generated.rs | 2 +- 7 files changed, 176 insertions(+), 62 deletions(-) create mode 100644 rust/extractor/src/rust_analyzer.rs diff --git a/Cargo.lock b/Cargo.lock index 07176705783..327f62e7935 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -390,11 +390,13 @@ dependencies = [ "ra_ap_base_db", "ra_ap_hir", "ra_ap_hir_def", + "ra_ap_hir_expand", "ra_ap_ide_db", "ra_ap_load-cargo", "ra_ap_parser", "ra_ap_paths", "ra_ap_project_model", + "ra_ap_span", "ra_ap_syntax", "ra_ap_vfs", "rust-extractor-macros", diff --git a/rust/ast-generator/src/main.rs b/rust/ast-generator/src/main.rs index 043848e902c..3a248df4a19 100644 --- a/rust/ast-generator/src/main.rs +++ b/rust/ast-generator/src/main.rs @@ -428,7 +428,7 @@ use ra_ap_syntax::ast::{{ }}; use ra_ap_syntax::{{ast, AstNode}}; -impl Translator {{ +impl Translator<'_> {{ fn emit_else_branch(&mut self, node: ast::ElseBranch) -> Label {{ match node {{ ast::ElseBranch::IfExpr(inner) => self.emit_if_expr(inner).into(), diff --git a/rust/extractor/Cargo.toml b/rust/extractor/Cargo.toml index 71ad6d8ac45..2a711291258 100644 --- a/rust/extractor/Cargo.toml +++ b/rust/extractor/Cargo.toml @@ -13,12 +13,14 @@ ra_ap_base_db = "0.0.232" ra_ap_hir = "0.0.232" ra_ap_hir_def = "0.0.232" ra_ap_ide_db = "0.0.232" +ra_ap_hir_expand = "0.0.232" ra_ap_load-cargo = "0.0.232" ra_ap_paths = "0.0.232" ra_ap_project_model = "0.0.232" ra_ap_syntax = "0.0.232" ra_ap_vfs = "0.0.232" ra_ap_parser = "0.0.232" +ra_ap_span = "0.0.232" serde = "1.0.209" serde_with = "3.9.0" stderrlog = "0.6.0" diff --git a/rust/extractor/src/main.rs b/rust/extractor/src/main.rs index 81032636ceb..2d79fedebf3 100644 --- a/rust/extractor/src/main.rs +++ b/rust/extractor/src/main.rs @@ -1,76 +1,28 @@ use anyhow::Context; use ra_ap_ide_db::line_index::LineIndex; -use ra_ap_parser::Edition; -use std::borrow::Cow; mod archive; mod config; pub mod generated; +mod rust_analyzer; mod translate; pub mod trap; -use ra_ap_syntax::ast::SourceFile; -use ra_ap_syntax::{AstNode, SyntaxError, TextRange, TextSize}; - -fn from_utf8_lossy(v: &[u8]) -> (Cow<'_, str>, Option) { - let mut iter = v.utf8_chunks(); - let (first_valid, first_invalid) = if let Some(chunk) = iter.next() { - let valid = chunk.valid(); - let invalid = chunk.invalid(); - if invalid.is_empty() { - debug_assert_eq!(valid.len(), v.len()); - return (Cow::Borrowed(valid), None); - } - (valid, invalid) - } else { - return (Cow::Borrowed(""), None); - }; - - const REPLACEMENT: &str = "\u{FFFD}"; - let error_start = first_valid.len() as u32; - let error_end = error_start + first_invalid.len() as u32; - let error_range = TextRange::new(TextSize::new(error_start), TextSize::new(error_end)); - let error = SyntaxError::new("invalid utf-8 sequence".to_owned(), error_range); - let mut res = String::with_capacity(v.len()); - res.push_str(first_valid); - - res.push_str(REPLACEMENT); - - for chunk in iter { - res.push_str(chunk.valid()); - if !chunk.invalid().is_empty() { - res.push_str(REPLACEMENT); - } - } - - (Cow::Owned(res), Some(error)) -} fn extract( - archiver: &archive::Archiver, + rust_analyzer: &rust_analyzer::RustAnalyzer, traps: &trap::TrapFileProvider, file: std::path::PathBuf, ) -> anyhow::Result<()> { - let file = std::path::absolute(&file).unwrap_or(file); - let file = std::fs::canonicalize(&file).unwrap_or(file); - archiver.archive(&file); - let input = std::fs::read(&file)?; - let (input, err) = from_utf8_lossy(&input); - let line_index = LineIndex::new(&input); + let (ast, input, parse_errors, semi) = rust_analyzer.parse(&file); + let line_index = LineIndex::new(input.as_ref()); let display_path = file.to_string_lossy(); let mut trap = traps.create("source", &file); let label = trap.emit_file(&file); - let mut translator = translate::Translator::new(trap, label, line_index); - if let Some(err) = err { + let mut translator = translate::Translator::new(trap, label, line_index, semi); + + for err in parse_errors { translator.emit_parse_error(display_path.as_ref(), err); } - let parse = ra_ap_syntax::ast::SourceFile::parse(&input, Edition::CURRENT); - for err in parse.errors() { - translator.emit_parse_error(display_path.as_ref(), err); - } - if let Some(ast) = SourceFile::cast(parse.syntax_node()) { - translator.emit_source_file(ast); - } else { - log::warn!("Skipped {}", display_path); - } + translator.emit_source_file(ast); translator.trap.commit()?; Ok(()) } @@ -81,12 +33,17 @@ fn main() -> anyhow::Result<()> { .verbosity(2 + cfg.verbose as usize) .init()?; log::info!("{cfg:?}"); + let rust_analyzer = rust_analyzer::RustAnalyzer::new(&cfg)?; + let traps = trap::TrapFileProvider::new(&cfg).context("failed to set up trap files")?; let archiver = archive::Archiver { root: cfg.source_archive_dir, }; for file in cfg.inputs { - extract(&archiver, &traps, file)?; + let file = std::path::absolute(&file).unwrap_or(file); + let file = std::fs::canonicalize(&file).unwrap_or(file); + archiver.archive(&file); + extract(&rust_analyzer, &traps, file)?; } Ok(()) diff --git a/rust/extractor/src/rust_analyzer.rs b/rust/extractor/src/rust_analyzer.rs new file mode 100644 index 00000000000..e1d818dc380 --- /dev/null +++ b/rust/extractor/src/rust_analyzer.rs @@ -0,0 +1,144 @@ +use crate::config::Config; +use anyhow::Context; +use itertools::Itertools; +use log::info; +use ra_ap_base_db::SourceDatabase; +use ra_ap_hir::Semantics; +use ra_ap_ide_db::RootDatabase; +use ra_ap_load_cargo::{load_workspace_at, LoadCargoConfig, ProcMacroServerChoice}; +use ra_ap_paths::Utf8PathBuf; +use ra_ap_project_model::CargoConfig; +use ra_ap_project_model::RustLibSource; +use ra_ap_span::Edition; +use ra_ap_span::EditionedFileId; +use ra_ap_span::TextRange; +use ra_ap_span::TextSize; +use ra_ap_syntax::SourceFile; +use ra_ap_syntax::SyntaxError; +use ra_ap_vfs::AbsPathBuf; +use ra_ap_vfs::Vfs; +use ra_ap_vfs::VfsPath; +use std::borrow::Cow; +use std::collections::HashMap; +use std::path::{Path, PathBuf}; +use triomphe::Arc; +pub struct RustAnalyzer { + workspace: HashMap, +} + +impl RustAnalyzer { + pub fn new(cfg: &Config) -> anyhow::Result { + let mut workspace = HashMap::new(); + let config = CargoConfig { + sysroot: Some(RustLibSource::Discover), + target_dir: ra_ap_paths::Utf8PathBuf::from_path_buf(cfg.scratch_dir.to_path_buf()) + .map(|x| x.join("target")) + .ok(), + ..Default::default() + }; + let progress = |t| (log::info!("progress: {}", t)); + let load_config = LoadCargoConfig { + load_out_dirs_from_check: true, + with_proc_macro_server: ProcMacroServerChoice::Sysroot, + prefill_caches: false, + }; + let projects = find_project_manifests(&cfg.inputs).context("loading inputs")?; + for project in projects { + let manifest = project.manifest_path(); + let (db, vfs, _macro_server) = + load_workspace_at(manifest.as_ref(), &config, &load_config, &progress)?; + let path: &Path = manifest.parent().as_ref(); + workspace.insert(path.to_path_buf(), (vfs, db)); + } + Ok(RustAnalyzer { workspace }) + } + pub fn parse( + &self, + path: &PathBuf, + ) -> ( + SourceFile, + Arc, + Vec, + Option>, + ) { + let mut p = path.as_path(); + while let Some(parent) = p.parent() { + p = parent; + if let Some((vfs, db)) = self.workspace.get(parent) { + if let Some(file_id) = Utf8PathBuf::from_path_buf(path.to_path_buf()) + .ok() + .and_then(|x| AbsPathBuf::try_from(x).ok()) + .map(VfsPath::from) + .and_then(|x| vfs.file_id(&x)) + { + let semi = Semantics::new(db); + let file_id = EditionedFileId::current_edition(file_id); + + return ( + semi.parse(file_id), + db.file_text(file_id.into()), + db.parse_errors(file_id) + .map(|x| x.to_vec()) + .unwrap_or_default(), + Some(semi), + ); + } + } + } + let input = std::fs::read(&path).unwrap(); + let (input, err) = from_utf8_lossy(&input); + let parse = ra_ap_syntax::ast::SourceFile::parse(&input, Edition::CURRENT); + let mut errors = parse.errors(); + errors.extend(err.into_iter()); + (parse.tree(), input.as_ref().into(), errors, None) + } +} + +fn find_project_manifests( + files: &[PathBuf], +) -> anyhow::Result> { + let current = std::env::current_dir()?; + let abs_files: Vec<_> = files + .iter() + .map(|path| AbsPathBuf::assert_utf8(current.join(path))) + .collect(); + let ret = ra_ap_project_model::ProjectManifest::discover_all(&abs_files); + info!( + "found manifests: {}", + ret.iter().map(|m| format!("{m}")).join(", ") + ); + Ok(ret) +} +fn from_utf8_lossy(v: &[u8]) -> (Cow<'_, str>, Option) { + let mut iter = v.utf8_chunks(); + let (first_valid, first_invalid) = if let Some(chunk) = iter.next() { + let valid = chunk.valid(); + let invalid = chunk.invalid(); + if invalid.is_empty() { + debug_assert_eq!(valid.len(), v.len()); + return (Cow::Borrowed(valid), None); + } + (valid, invalid) + } else { + return (Cow::Borrowed(""), None); + }; + + const REPLACEMENT: &str = "\u{FFFD}"; + let error_start = first_valid.len() as u32; + let error_end = error_start + first_invalid.len() as u32; + let error_range = TextRange::new(TextSize::new(error_start), TextSize::new(error_end)); + let error = SyntaxError::new("invalid utf-8 sequence".to_owned(), error_range); + let mut res = String::with_capacity(v.len()); + res.push_str(first_valid); + + res.push_str(REPLACEMENT); + + for chunk in iter { + res.push_str(chunk.valid()); + if !chunk.invalid().is_empty() { + res.push_str(REPLACEMENT); + } + } + + (Cow::Owned(res), Some(error)) +} diff --git a/rust/extractor/src/translate/base.rs b/rust/extractor/src/translate/base.rs index 0c10150882e..96d6cb0b007 100644 --- a/rust/extractor/src/translate/base.rs +++ b/rust/extractor/src/translate/base.rs @@ -2,7 +2,9 @@ use crate::generated::{self, AstNode}; use crate::trap::{DiagnosticSeverity, TrapFile, TrapId}; use crate::trap::{Label, TrapClass}; use codeql_extractor::trap::{self}; +use ra_ap_hir::Semantics; use ra_ap_ide_db::line_index::{LineCol, LineIndex}; +use ra_ap_ide_db::RootDatabase; use ra_ap_parser::SyntaxKind; use ra_ap_syntax::ast::RangeItem; use ra_ap_syntax::{ast, NodeOrToken, SyntaxElementChildren, SyntaxError, SyntaxToken, TextRange}; @@ -56,18 +58,25 @@ impl TextValue for ast::RangePat { self.op_token().map(|x| x.text().to_string()) } } -pub struct Translator { +pub struct Translator<'a> { pub trap: TrapFile, label: trap::Label, line_index: LineIndex, + semi: Option>, } -impl Translator { - pub fn new(trap: TrapFile, label: trap::Label, line_index: LineIndex) -> Translator { +impl Translator<'_> { + pub fn new( + trap: TrapFile, + label: trap::Label, + line_index: LineIndex, + semi: Option>, + ) -> Translator { Translator { trap, label, line_index, + semi, } } pub fn location(&self, range: TextRange) -> (LineCol, LineCol) { diff --git a/rust/extractor/src/translate/generated.rs b/rust/extractor/src/translate/generated.rs index 9725bbfa6b6..45d1c540602 100644 --- a/rust/extractor/src/translate/generated.rs +++ b/rust/extractor/src/translate/generated.rs @@ -11,7 +11,7 @@ use ra_ap_syntax::ast::{ }; use ra_ap_syntax::{ast, AstNode}; -impl Translator { +impl Translator<'_> { fn emit_else_branch(&mut self, node: ast::ElseBranch) -> Label { match node { ast::ElseBranch::IfExpr(inner) => self.emit_if_expr(inner).into(),