diff --git a/Cargo.lock b/Cargo.lock index 4fab55a6444..171f4b52c42 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -456,6 +456,7 @@ dependencies = [ "tree-sitter", "tree-sitter-json", "tree-sitter-ql", + "url", "yeast", "zstd", ] diff --git a/ql/Cargo.lock b/ql/Cargo.lock index 6632bf162ee..ba31581cc23 100644 Binary files a/ql/Cargo.lock and b/ql/Cargo.lock differ diff --git a/ruby/extractor/src/extractor.rs b/ruby/extractor/src/extractor.rs index 4849f473ccb..817db97c0ef 100644 --- a/ruby/extractor/src/extractor.rs +++ b/ruby/extractor/src/extractor.rs @@ -94,11 +94,14 @@ pub fn run(options: Options) -> std::io::Result<()> { node_types::read_node_types_str("erb", tree_sitter_embedded_template::NODE_TYPES)?; let lines: std::io::Result> = std::io::BufReader::new(file_list).lines().collect(); let lines = lines?; + let source_root = std::env::current_dir().ok(); lines .par_iter() .try_for_each(|line| { let mut diagnostics_writer = diagnostics.logger(); let path = PathBuf::from(line).canonicalize()?; + let diagnostics_file_path = + file_paths::relativize_for_diagnostic(&path, source_root.as_deref()); match &overlay_changed_files { Some(changed_files) if !changed_files.contains(&path) => { // We are extracting an overlay and this file is not in the list of changes files, so we should skip it. @@ -165,7 +168,7 @@ pub fn run(options: Options) -> std::io::Result<()> { "character-decoding-error", "Character decoding error", ) - .file(&file_paths::normalize_and_transform_path(&path, path_transformer.as_ref())) + .file(&diagnostics_file_path) .message( "Could not decode the file contents as {}: {}. The contents of the file must match the character encoding specified in the {} {}.", &[ @@ -185,7 +188,7 @@ pub fn run(options: Options) -> std::io::Result<()> { diagnostics_writer.write( diagnostics_writer .new_entry("unknown-character-encoding", "Could not process some files due to an unknown character encoding") - .file(&file_paths::normalize_and_transform_path(&path, path_transformer.as_ref())) + .file(&diagnostics_file_path) .message( "Unknown character encoding {} in {} {}.", &[ diff --git a/ruby/ql/integration-tests/diagnostics/syntax-error/diagnostics.expected b/ruby/ql/integration-tests/diagnostics/syntax-error/diagnostics.expected index d9ae8e1b617..b688f22e39a 100644 --- a/ruby/ql/integration-tests/diagnostics/syntax-error/diagnostics.expected +++ b/ruby/ql/integration-tests/diagnostics/syntax-error/diagnostics.expected @@ -5,7 +5,7 @@ "location": { "endColumn": 5, "endLine": 1, - "file": "/bad.rb", + "file": "bad.rb", "startColumn": 4, "startLine": 1 }, @@ -28,7 +28,7 @@ "location": { "endColumn": 7, "endLine": 3, - "file": "/bad.rb", + "file": "bad.rb", "startColumn": 8, "startLine": 3 }, diff --git a/ruby/ql/integration-tests/diagnostics/unknown-encoding/diagnostics.expected b/ruby/ql/integration-tests/diagnostics/unknown-encoding/diagnostics.expected index 1c9caa49824..2470d930430 100644 --- a/ruby/ql/integration-tests/diagnostics/unknown-encoding/diagnostics.expected +++ b/ruby/ql/integration-tests/diagnostics/unknown-encoding/diagnostics.expected @@ -3,7 +3,7 @@ "https://docs.ruby-lang.org/en/master/syntax/comments_rdoc.html#label-encoding+Directive" ], "location": { - "file": "/encoding.rb" + "file": "encoding.rb" }, "markdownMessage": "Unknown character encoding `silly` in `#encoding:` [directive](https://docs.ruby-lang.org/en/master/syntax/comments_rdoc.html#label-encoding+Directive).", "plaintextMessage": "Unknown character encoding silly in #encoding: directive.", diff --git a/shared/tree-sitter-extractor/Cargo.toml b/shared/tree-sitter-extractor/Cargo.toml index 1ad18a6df5a..10b1d8db35c 100644 --- a/shared/tree-sitter-extractor/Cargo.toml +++ b/shared/tree-sitter-extractor/Cargo.toml @@ -18,6 +18,7 @@ lazy_static = "1.5.0" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" chrono = { version = "0.4.42", features = ["serde"] } +url = "2.5" num_cpus = "1.17.0" zstd = "0.13.3" yeast = { path = "../yeast" } diff --git a/shared/tree-sitter-extractor/src/extractor/mod.rs b/shared/tree-sitter-extractor/src/extractor/mod.rs index 0c3e1366081..9eaa1f93cc7 100644 --- a/shared/tree-sitter-extractor/src/extractor/mod.rs +++ b/shared/tree-sitter-extractor/src/extractor/mod.rs @@ -298,6 +298,9 @@ pub fn extract( yeast_runner: Option<&yeast::Runner<'_>>, ) { let path_str = file_paths::normalize_and_transform_path(path, transformer); + let source_root = std::env::current_dir().ok(); + let diagnostics_path = + file_paths::relativize_for_diagnostic(path, source_root.as_deref()); let span = tracing::span!( tracing::Level::TRACE, "extract", @@ -318,7 +321,7 @@ pub fn extract( source, diagnostics_writer, trap_writer, - &path_str, + &diagnostics_path, file_label, language_prefix, schema, @@ -343,8 +346,9 @@ struct ChildNode { } struct Visitor<'a> { - /// The file path of the source code (as string) - path: &'a str, + /// A path suitable for diagnostic locations: relative to the source root if possible, + /// otherwise a file: URI + diagnostics_path: &'a str, /// The label to use whenever we need to refer to the `@file` entity of this /// source file. file_label: trap::Label, @@ -376,13 +380,13 @@ impl<'a> Visitor<'a> { source: &'a [u8], diagnostics_writer: &'a mut diagnostics::LogWriter, trap_writer: &'a mut trap::Writer, - path: &'a str, + diagnostics_path: &'a str, file_label: trap::Label, language_prefix: &str, schema: &'a NodeTypeMap, ) -> Visitor<'a> { Visitor { - path, + diagnostics_path, file_label, source, diagnostics_writer, @@ -433,7 +437,7 @@ impl<'a> Visitor<'a> { ); mesg.severity(diagnostics::Severity::Warning) .location( - self.path, + self.diagnostics_path, loc.start_line, loc.start_column, loc.end_line, @@ -553,7 +557,7 @@ impl<'a> Visitor<'a> { ) .severity(diagnostics::Severity::Warning) .location( - self.path, + self.diagnostics_path, loc.start_line, loc.start_column, loc.end_line, diff --git a/shared/tree-sitter-extractor/src/file_paths.rs b/shared/tree-sitter-extractor/src/file_paths.rs index bdb9dd035f0..2728f10a5f8 100644 --- a/shared/tree-sitter-extractor/src/file_paths.rs +++ b/shared/tree-sitter-extractor/src/file_paths.rs @@ -3,6 +3,25 @@ use std::{ path::{Path, PathBuf}, }; +/// Given an absolute path, returns a relative path if it's under `source_root`, +/// otherwise a properly-encoded `file:` URI. This is used for diagnostic locations, which +/// should use relative paths per the CodeQL diagnostic message format spec. +pub fn relativize_for_diagnostic(path: &Path, source_root: Option<&Path>) -> String { + source_root + .and_then(|root| path.strip_prefix(root).ok()) + .and_then(|rel| rel.to_str()) + .map(|s| s.to_owned()) + .unwrap_or_else(|| path_to_file_uri(path)) +} + +/// Convert a path to a `file:` URI, using the `url` crate for proper percent-encoding. +/// Falls back to a simple `file://` prefix if the `url` crate can't handle the path. +fn path_to_file_uri(path: &Path) -> String { + url::Url::from_file_path(path) + .map(|u| u.to_string()) + .unwrap_or_else(|()| format!("file://{}", path.display())) +} + /// This represents the minimum supported path transformation that is needed to support extracting /// overlay databases. Specifically, it represents a transformer where one path prefix is replaced /// with a different prefix. @@ -224,3 +243,44 @@ pub fn path_for( } result } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn relativize_under_source_root() { + let path = Path::new("/home/runner/work/repo/src/foo.rb"); + let result = + relativize_for_diagnostic(path, Some(Path::new("/home/runner/work/repo"))); + assert_eq!(result, "src/foo.rb"); + } + + #[test] + fn relativize_outside_source_root_produces_file_uri() { + let path = Path::new("/other/location/foo.rb"); + let result = + relativize_for_diagnostic(path, Some(Path::new("/home/runner/work/repo"))); + assert_eq!(result, "file:///other/location/foo.rb"); + } + + #[test] + fn relativize_no_source_root_produces_file_uri() { + let path = Path::new("/home/runner/work/repo/src/foo.rb"); + let result = relativize_for_diagnostic(path, None); + assert_eq!(result, "file:///home/runner/work/repo/src/foo.rb"); + } + + #[test] + fn path_to_file_uri_encodes_spaces() { + let result = path_to_file_uri(Path::new("/home/user/my project/foo.rb")); + assert_eq!(result, "file:///home/user/my%20project/foo.rb"); + } + + #[test] + fn relativize_exact_root_path() { + let path = Path::new("/repo/foo.rb"); + let result = relativize_for_diagnostic(path, Some(Path::new("/repo"))); + assert_eq!(result, "foo.rb"); + } +}