From d16bc36e83155c01ad8d82795fd1815ee92a6b55 Mon Sep 17 00:00:00 2001 From: Paolo Tranquilli Date: Wed, 13 May 2026 09:32:11 +0200 Subject: [PATCH] Use relative paths in tree-sitter extractor diagnostics Diagnostic `location.file` entries were using absolute paths (e.g. `/home/runner/work/...`), causing broken links in the GitHub UI. Now relativize against CWD (the source root during extraction), falling back to a properly percent-encoded `file:` URI for paths outside it. Fixes https://github.com/github/codeql/issues/21802 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Cargo.lock | 1 + ql/Cargo.lock | Bin 25823 -> 33983 bytes ruby/extractor/src/extractor.rs | 7 +- .../syntax-error/diagnostics.expected | 4 +- .../unknown-encoding/diagnostics.expected | 2 +- shared/tree-sitter-extractor/Cargo.toml | 1 + .../src/extractor/mod.rs | 18 ++++-- .../tree-sitter-extractor/src/file_paths.rs | 60 ++++++++++++++++++ 8 files changed, 81 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4fab55a6444..171f4b52c42 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -456,6 +456,7 @@ dependencies = [ "tree-sitter", "tree-sitter-json", "tree-sitter-ql", + "url", "yeast", "zstd", ] diff --git a/ql/Cargo.lock b/ql/Cargo.lock index 6632bf162eecd1f860ea3cbd2c50ad60d2884680..ba31581cc2336306b0a725a0939526ddc6f0d7cd 100644 GIT binary patch delta 6409 zcmZ{o%a5PeS;sXK*G`-^aW1|#No^-;3du~){ZfHuBAT=%1)QL=NQB&TO@@wVoSCs3 zcZb9VP^H3)#D+hDWOm(T1FB%tmWu>JDCv?-n(PoP!som*wr9p>VR`2Fd*?mpJm>p- zFV9E+u<+rZF8=Mai@X1|_{u|1{?TUMZd*6o*5wyZUc9`%mdoWArx(8X-wS3+r+#mx zIIo4bNj1@IH%ZpolrJsT)?`b@+E9G&&6&`xcSeV#MBy=&N-=r5^u(Fn|M=3wUpRO1 z@{O(ad5v2wZSTxK{cwAyPM|x!XqBtgRkdOO3TwOEx7dV}%Y@x^8>0%KP9g z2TG>J|ZIHe6>D?2jPhD?Yg~Ke(3$1OuJ^jUr3)7{`pFB3# zd8@tM-dozfwXu`l8{a(jPTGDaZ*BfySxT>EOPHj0>C3o)shjj^s-Q>yLY z!ADtrD~TVK5-MlP6HBqe^rm{u!KCTsH_uM5KKri=c0OY-qPIR?VsfHTHASAb98stu zf-L;bfs9l!G$U))z4)eGmNf+v%lx_vUwvfvpPqYZ@tJbtYS~<0Z)FFFY#%y;0xXxr z>nps+R9UKYB?Mi4)HcT^d^1LPSxX8vO5dwht#F-SilZfwZPHyvIbA<-<~s-QI9lWS z<_;BJOY2u}?DV;;{2#8a?d-SCtn6{cpAV zZagX_sk_R$scdQpsaPpv4^fyRMGxJ{+eJ8K)aws-b={Rdlok@EW8UGH!7nX%Cgpqtj$e#q$#cH@45SIYoIYcW>Wbk#aj z&thw^#)j;KfH9ObLMT~^R;D!Wu~yN%D!uD$25ReA=ip6_G9O<1YE3(7-ud|a`qm~u z-&t$h2S2*GR`;)9n;oxnh@+K%_I@wD{w(z!tv-nXQ+>81kmB2||rdf+`SLROh- z_gqWqCa7q11Mn)Rs;8IBxd&Kl`gHN!1Gg)cThrNj&Ymn+RFV}!8|fph#kEG)gA=B) zT8_2$=#&UaXibwhFKQc=YU2R+tJFa_-LBrlY;K)J{uq_;eVC7l>F#3?B#%QiB+}cl*=Az$OR%v(u|gVYX4IAwsoIXR-}%_ zln$YXY`dnDn~c#I*LshIwN-7U7u5yitV*a}c4u?zC{s3S`o&9U-a66_W6h&wJKPHY ze<$CsV#n6%Mw-H#M=DS&%qS5Gwp4u8)u{^Is2YV2p&Fl?%(9ui zn|^M*?P{v&`fXl01nND?nLd8}>8B1oc(8U74%`zK7qr(!R0pOS$Q8B)YeBcMO@=J0 z5S7l&!uM>oYDODnVEG)A=Y{~@#Zj#TRB#W*rXQd9^7*xm+TI%!b8s7KnO6)huEx0* zjWu0B?kP$MPJ(bnnTFuyBvC-A=qvn$H=2&%pFr$85?=?Uojah{V+(imYw+p^&;R+t zg*z_h;1&?HWj+1!_>~nCDyo8eXxSy-8|oq~8X;1$I%Mu7;a7Tuo;-e{mlB&5bW}*A!*Z+BX_FHePRLN_SN@`dvgFN*TL(Wk`P}Wi|#S_8`kFr^-V?*Rh z@GjfxERKi5P4B$;$f(ioU%&X1g=1Dwjv?jm#ZgQ?{EA>`A=ERGTF-&;G(7dyAr$$dV z*gC$)S7%x9(ZK-%m2gU+j%MA-`t&43e^rb^KVfx?2v?}nVS(b)s7g0JU zIf@otbj|{e2+v1N!CMlx1PiZa7Uh<4n0(~sK0Gn_dS9NW75zEx{N1H1t44N(3*}Y7 zz){nnDz<@oIm9a$0Nn6YKHzH{O5L=FkGl5MFGn@M8dLnp^j9a&P7m4B3$uG#?O8|e z+j$NSqER`k4gL*|A9vtFH!Q2+jD`&<;JP94J$JlFRIbkV`SKq*v7*$>HC+gZ zPm5}l7Fj_=y?Y#3hjoBeF z+Jh(vbnJz4>QV+qdaZ-*1ej(OE76|jyH16zdN(aBpP4?sd}{j8pTECFo*oTPA2Wl( z#(J)idvK?!7il9bDv>k71*GOFJMz{`@;Lwm72sNL5|DLRAhe)`eqg=Fj^Fj+EOLDJ z-@g6_$DVInwOzfHuB{(|W?a_hh1#xf0m+?K_coJg#w3Yk8R#1!?Xa*7corH^1>a>7 zl1gj5iJ+f(NF$N90yoMrZs)N(lsz;0&SshxR!*NHq-|`|!b{}9v74DXmR8cfFq&5L4$#FxS zq8lx=`P&g38Q+W?A$owlp*!O3jix5FS@QdbzE@vT@W~B8YUp&O_LMJWNV2J zA;B9)J6R(Cd;W~>ok9B1t6@6(jZe?*+u|dXzR#8=Xr;9S{;pkW%9-VP~v#=7bWI}X!pbkA3EmbagMTRyauZtQGcBPV?D+uvPy0PjEj_~p~P+pqm-@!0iSJMT>5J5Np@ z*k@UpaPm zFJ>Ry6{DJE{3T^ZfB^Mi-QGzgq+n)KHiFg0_OLXO=s-qb@4?1oI~o*WhEa8wXVu>Q z`IUb;zWdgj#}<&7z22dL86Y8E^aQL*QzxewNr3k;d?^8Z7sQN#2LpxHTlCd2B*Ija zdz#(-?5$58oPO)gGZ*f}?1S4E%MNxKrlWV*O5%c0N9H7f85)r=Xp@>ws3IH?G+kW6 zFg5~&rUbHv@FY{_9QB2Hq&A&;LQH@2`1zB}^Y+zb`slSwFN`MG+?eaTv(?(t_Sz2F zVrfS1sQ&!NOV?7_+8l2F;e)@w|HA3%nTMYkgR>{+SakRdd!`o)gz$lE99X69x^@`= zSqqs=TUt%z42Td1oLkilfs94JOlZ!L;$~)eX?y1$iJ3RYRAmX>S-W{Rs*d5xP^K@B z9o_!&ejBSKb5qb^hCn7D+7f@?I<`x}QN9*ee1-^7MG)ggfW=35MTnCDKHTTOi;rBq zd{B8?xI ziK!{AshC*Lm5&J%&rHWl2?>Z8FpQyK8<8SX(sTDLdvJnu;07-3@8tfVlOt`;c2U_1 z`B`{TFl%(Rz?NDv^K{K|Ikhk;ND*orbEd2qjb+#E|1N{d-UqUTO?VA$Wv@{py|rMlOx{IHj%;A{!hzgnf)1IBjRV<>`8cTce$hjWwnp zop|QCy~S=Fo!@e=?uGFk+1*Z`Jvq`PGoJ=#QiTJ9CNMDMZyeGydxUlZ<70-lT2+89 d2j(GpLwf7EzI+HY0 z8$U3PUMYAH_dj4QUJ8aoZv+uU5WVx`LcGyGKn%GML9qCoGqKnxdNVNRyysmX&-47& zPro(&<1Z&adSZHPefP^%c4h7G7u!oWuI_A~ID2BvY|sXcSL?;6bIy(mTh}dFD#jEg ztIXAemeltiIaDgaQWA}=a11%eQi|)Xn=vs*=kJ|9_x5q?9e*9yKXhS65_52Ys79)t zh_z6(rgLi{T6SH#tEOuRCWw~Oa_P+kJjyQEsAF&Skuf{HvU0!easBj>`KfpQ~grVggqq^i= z#(Im3+P!-3({k*yXWq4f#vzSC=f~&m%#SBi7skDni?bLdQSeZsCAR3*RcPRy&+IrD zV~s10#W!Ks8fTXprDoWc9*ZQmC~#y47I<+->T+GLT-}yyR~LoGpU?h%vR}qgpspog z8(dvwlSq10E6yj+HAOGR)@p5(PMMFYj3`_K^k|VjCTw`jE9kpa!wG0K>QPFYoSr0eJX-IXXUW zoj*9fcJAnY8%TOkwx&jlJk_LBvSJRh@H5PhIkhz_2HsvgP`q}6uY6B6A{1m7b3Jn8 zfy{;t&Pzw`7dtlA`h}SduwCkGIl4$(yg`6c%N5dfLs*tqO(jK%T8pdLFJKd}0ZysP zz9;wRJ$mp>Ff0ao>>(5c8MdAdtfAHj{-vCtZFPpphzgtH3V@YMfD&TGvD9BW>i zv7B4)AW>1A%d$AsWkJLqdI?fsmsrN$p+`S(zubTA=dDXZ!53yoQe%cMsTyF2S|hCs zi5Yq|FCIsHfg_h-Q^i$p>R7u{hBuP!O9Fp=cjdJ^+mq!93{|y*2Nul2&G)td;-Oj% nzSVBnFm9n(-`J?uZX42RoNbsEQ#bps7mn|TBlF#BKRNVItHYWn diff --git a/ruby/extractor/src/extractor.rs b/ruby/extractor/src/extractor.rs index 4849f473ccb..817db97c0ef 100644 --- a/ruby/extractor/src/extractor.rs +++ b/ruby/extractor/src/extractor.rs @@ -94,11 +94,14 @@ pub fn run(options: Options) -> std::io::Result<()> { node_types::read_node_types_str("erb", tree_sitter_embedded_template::NODE_TYPES)?; let lines: std::io::Result> = std::io::BufReader::new(file_list).lines().collect(); let lines = lines?; + let source_root = std::env::current_dir().ok(); lines .par_iter() .try_for_each(|line| { let mut diagnostics_writer = diagnostics.logger(); let path = PathBuf::from(line).canonicalize()?; + let diagnostics_file_path = + file_paths::relativize_for_diagnostic(&path, source_root.as_deref()); match &overlay_changed_files { Some(changed_files) if !changed_files.contains(&path) => { // We are extracting an overlay and this file is not in the list of changes files, so we should skip it. @@ -165,7 +168,7 @@ pub fn run(options: Options) -> std::io::Result<()> { "character-decoding-error", "Character decoding error", ) - .file(&file_paths::normalize_and_transform_path(&path, path_transformer.as_ref())) + .file(&diagnostics_file_path) .message( "Could not decode the file contents as {}: {}. The contents of the file must match the character encoding specified in the {} {}.", &[ @@ -185,7 +188,7 @@ pub fn run(options: Options) -> std::io::Result<()> { diagnostics_writer.write( diagnostics_writer .new_entry("unknown-character-encoding", "Could not process some files due to an unknown character encoding") - .file(&file_paths::normalize_and_transform_path(&path, path_transformer.as_ref())) + .file(&diagnostics_file_path) .message( "Unknown character encoding {} in {} {}.", &[ diff --git a/ruby/ql/integration-tests/diagnostics/syntax-error/diagnostics.expected b/ruby/ql/integration-tests/diagnostics/syntax-error/diagnostics.expected index d9ae8e1b617..b688f22e39a 100644 --- a/ruby/ql/integration-tests/diagnostics/syntax-error/diagnostics.expected +++ b/ruby/ql/integration-tests/diagnostics/syntax-error/diagnostics.expected @@ -5,7 +5,7 @@ "location": { "endColumn": 5, "endLine": 1, - "file": "/bad.rb", + "file": "bad.rb", "startColumn": 4, "startLine": 1 }, @@ -28,7 +28,7 @@ "location": { "endColumn": 7, "endLine": 3, - "file": "/bad.rb", + "file": "bad.rb", "startColumn": 8, "startLine": 3 }, diff --git a/ruby/ql/integration-tests/diagnostics/unknown-encoding/diagnostics.expected b/ruby/ql/integration-tests/diagnostics/unknown-encoding/diagnostics.expected index 1c9caa49824..2470d930430 100644 --- a/ruby/ql/integration-tests/diagnostics/unknown-encoding/diagnostics.expected +++ b/ruby/ql/integration-tests/diagnostics/unknown-encoding/diagnostics.expected @@ -3,7 +3,7 @@ "https://docs.ruby-lang.org/en/master/syntax/comments_rdoc.html#label-encoding+Directive" ], "location": { - "file": "/encoding.rb" + "file": "encoding.rb" }, "markdownMessage": "Unknown character encoding `silly` in `#encoding:` [directive](https://docs.ruby-lang.org/en/master/syntax/comments_rdoc.html#label-encoding+Directive).", "plaintextMessage": "Unknown character encoding silly in #encoding: directive.", diff --git a/shared/tree-sitter-extractor/Cargo.toml b/shared/tree-sitter-extractor/Cargo.toml index 1ad18a6df5a..10b1d8db35c 100644 --- a/shared/tree-sitter-extractor/Cargo.toml +++ b/shared/tree-sitter-extractor/Cargo.toml @@ -18,6 +18,7 @@ lazy_static = "1.5.0" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" chrono = { version = "0.4.42", features = ["serde"] } +url = "2.5" num_cpus = "1.17.0" zstd = "0.13.3" yeast = { path = "../yeast" } diff --git a/shared/tree-sitter-extractor/src/extractor/mod.rs b/shared/tree-sitter-extractor/src/extractor/mod.rs index 0c3e1366081..9eaa1f93cc7 100644 --- a/shared/tree-sitter-extractor/src/extractor/mod.rs +++ b/shared/tree-sitter-extractor/src/extractor/mod.rs @@ -298,6 +298,9 @@ pub fn extract( yeast_runner: Option<&yeast::Runner<'_>>, ) { let path_str = file_paths::normalize_and_transform_path(path, transformer); + let source_root = std::env::current_dir().ok(); + let diagnostics_path = + file_paths::relativize_for_diagnostic(path, source_root.as_deref()); let span = tracing::span!( tracing::Level::TRACE, "extract", @@ -318,7 +321,7 @@ pub fn extract( source, diagnostics_writer, trap_writer, - &path_str, + &diagnostics_path, file_label, language_prefix, schema, @@ -343,8 +346,9 @@ struct ChildNode { } struct Visitor<'a> { - /// The file path of the source code (as string) - path: &'a str, + /// A path suitable for diagnostic locations: relative to the source root if possible, + /// otherwise a file: URI + diagnostics_path: &'a str, /// The label to use whenever we need to refer to the `@file` entity of this /// source file. file_label: trap::Label, @@ -376,13 +380,13 @@ impl<'a> Visitor<'a> { source: &'a [u8], diagnostics_writer: &'a mut diagnostics::LogWriter, trap_writer: &'a mut trap::Writer, - path: &'a str, + diagnostics_path: &'a str, file_label: trap::Label, language_prefix: &str, schema: &'a NodeTypeMap, ) -> Visitor<'a> { Visitor { - path, + diagnostics_path, file_label, source, diagnostics_writer, @@ -433,7 +437,7 @@ impl<'a> Visitor<'a> { ); mesg.severity(diagnostics::Severity::Warning) .location( - self.path, + self.diagnostics_path, loc.start_line, loc.start_column, loc.end_line, @@ -553,7 +557,7 @@ impl<'a> Visitor<'a> { ) .severity(diagnostics::Severity::Warning) .location( - self.path, + self.diagnostics_path, loc.start_line, loc.start_column, loc.end_line, diff --git a/shared/tree-sitter-extractor/src/file_paths.rs b/shared/tree-sitter-extractor/src/file_paths.rs index bdb9dd035f0..2728f10a5f8 100644 --- a/shared/tree-sitter-extractor/src/file_paths.rs +++ b/shared/tree-sitter-extractor/src/file_paths.rs @@ -3,6 +3,25 @@ use std::{ path::{Path, PathBuf}, }; +/// Given an absolute path, returns a relative path if it's under `source_root`, +/// otherwise a properly-encoded `file:` URI. This is used for diagnostic locations, which +/// should use relative paths per the CodeQL diagnostic message format spec. +pub fn relativize_for_diagnostic(path: &Path, source_root: Option<&Path>) -> String { + source_root + .and_then(|root| path.strip_prefix(root).ok()) + .and_then(|rel| rel.to_str()) + .map(|s| s.to_owned()) + .unwrap_or_else(|| path_to_file_uri(path)) +} + +/// Convert a path to a `file:` URI, using the `url` crate for proper percent-encoding. +/// Falls back to a simple `file://` prefix if the `url` crate can't handle the path. +fn path_to_file_uri(path: &Path) -> String { + url::Url::from_file_path(path) + .map(|u| u.to_string()) + .unwrap_or_else(|()| format!("file://{}", path.display())) +} + /// This represents the minimum supported path transformation that is needed to support extracting /// overlay databases. Specifically, it represents a transformer where one path prefix is replaced /// with a different prefix. @@ -224,3 +243,44 @@ pub fn path_for( } result } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn relativize_under_source_root() { + let path = Path::new("/home/runner/work/repo/src/foo.rb"); + let result = + relativize_for_diagnostic(path, Some(Path::new("/home/runner/work/repo"))); + assert_eq!(result, "src/foo.rb"); + } + + #[test] + fn relativize_outside_source_root_produces_file_uri() { + let path = Path::new("/other/location/foo.rb"); + let result = + relativize_for_diagnostic(path, Some(Path::new("/home/runner/work/repo"))); + assert_eq!(result, "file:///other/location/foo.rb"); + } + + #[test] + fn relativize_no_source_root_produces_file_uri() { + let path = Path::new("/home/runner/work/repo/src/foo.rb"); + let result = relativize_for_diagnostic(path, None); + assert_eq!(result, "file:///home/runner/work/repo/src/foo.rb"); + } + + #[test] + fn path_to_file_uri_encodes_spaces() { + let result = path_to_file_uri(Path::new("/home/user/my project/foo.rb")); + assert_eq!(result, "file:///home/user/my%20project/foo.rb"); + } + + #[test] + fn relativize_exact_root_path() { + let path = Path::new("/repo/foo.rb"); + let result = relativize_for_diagnostic(path, Some(Path::new("/repo"))); + assert_eq!(result, "foo.rb"); + } +}