Merge pull request #17552 from github/aibaars/diagnostics

Rust: extract parse errors as diagnostics
2025-12-17 01:03:14 +01:00 · 2024-09-25 13:15:24 +02:00
parent 0ae10ece39 5714811071
commit f57dd0a596
14 changed files with 282 additions and 20 deletions
--- a/config/identical-files.json
+++ b/config/identical-files.json
@@ -355,5 +355,9 @@
  "Python model summaries test extension": [
    "python/ql/test/library-tests/dataflow/model-summaries/InlineTaintTest.ext.yml",
    "python/ql/test/library-tests/dataflow/model-summaries/NormalDataflowTest.ext.yml"
+  ],
+  "Diagnostics.qll": [
+    "ruby/ql/lib/codeql/ruby/Diagnostics.qll",
+    "rust/ql/lib/codeql/rust/Diagnostics.qll"
  ]
 }
--- a/ruby/ql/lib/codeql/ruby/Diagnostics.qll
+++ b/ruby/ql/lib/codeql/ruby/Diagnostics.qll
@@ -1,3 +1,5 @@
+/** Provides classes relating to extraction diagnostics. */
+
 private import codeql.Locations

 /** A diagnostic emitted during extraction, such as a parse error */
--- a/rust/extractor/src/main.rs
+++ b/rust/extractor/src/main.rs
@@ -1,13 +1,48 @@
 use anyhow::Context;
 use ra_ap_ide_db::line_index::LineIndex;
 use ra_ap_parser::Edition;
+use std::borrow::Cow;
 mod archive;
 mod config;
 pub mod generated;
 mod translate;
 pub mod trap;
 use ra_ap_syntax::ast::SourceFile;
-use ra_ap_syntax::AstNode;
+use ra_ap_syntax::{AstNode, SyntaxError, TextRange, TextSize};
+
+/// Decodes `v` as UTF-8, replacing each invalid byte sequence with U+FFFD.
+///
+/// Returns the decoded text and, if anything had to be replaced, a `SyntaxError`
+/// whose range spans the byte offsets of the first invalid sequence in `v`.
+/// Fully valid (or empty) input is returned borrowed with no error; otherwise a
+/// new `String` is built.
+fn from_utf8_lossy(v: &[u8]) -> (Cow<'_, str>, Option<SyntaxError>) {
+    const REPLACEMENT: &str = "\u{FFFD}";
+
+    let mut chunks = v.utf8_chunks();
+    let Some(head) = chunks.next() else {
+        return (Cow::Borrowed(""), None);
+    };
+    let (prefix, bad_bytes) = (head.valid(), head.invalid());
+    if bad_bytes.is_empty() {
+        // A first chunk without an invalid part covers the whole input.
+        debug_assert_eq!(prefix.len(), v.len());
+        return (Cow::Borrowed(prefix), None);
+    }
+
+    // Only the first invalid sequence is reported; later ones are just replaced.
+    let bad_start = prefix.len() as u32;
+    let bad_end = bad_start + bad_bytes.len() as u32;
+    let error = SyntaxError::new(
+        "invalid utf-8 sequence".to_owned(),
+        TextRange::new(TextSize::new(bad_start), TextSize::new(bad_end)),
+    );
+
+    let mut repaired = String::with_capacity(v.len());
+    repaired.push_str(prefix);
+    repaired.push_str(REPLACEMENT);
+    for chunk in chunks {
+        repaired.push_str(chunk.valid());
+        if !chunk.invalid().is_empty() {
+            repaired.push_str(REPLACEMENT);
+        }
+    }
+
+    (Cow::Owned(repaired), Some(error))
+}

 fn extract(
    archiver: &archive::Archiver,
@@ -18,24 +53,25 @@ fn extract(
    let file = std::fs::canonicalize(&file).unwrap_or(file);
    archiver.archive(&file);
    let input = std::fs::read(&file)?;
-    let input = String::from_utf8(input)?;
+    let (input, err) = from_utf8_lossy(&input);
    let line_index = LineIndex::new(&input);
    let display_path = file.to_string_lossy();
    let mut trap = traps.create("source", &file);
    let label = trap.emit_file(&file);
    let mut translator = translate::Translator::new(trap, label, line_index);
-
+    if let Some(err) = err {
+        translator.emit_parse_error(display_path.as_ref(), err);
+    }
    let parse = ra_ap_syntax::ast::SourceFile::parse(&input, Edition::CURRENT);
    for err in parse.errors() {
-        let (start, _) = translator.location(err.range());
-        log::warn!("{}:{}:{}: {}", display_path, start.line, start.col, err);
+        translator.emit_parse_error(display_path.as_ref(), err);
    }
    if let Some(ast) = SourceFile::cast(parse.syntax_node()) {
        translator.emit_source_file(ast);
-        translator.trap.commit()?
    } else {
        log::warn!("Skipped {}", display_path);
    }
+    translator.trap.commit()?;
    Ok(())
 }
 fn main() -> anyhow::Result<()> {
--- a/rust/extractor/src/translate/base.rs
+++ b/rust/extractor/src/translate/base.rs
@@ -1,10 +1,9 @@
-use crate::trap::TrapFile;
+use crate::trap::{DiagnosticSeverity, TrapFile};
 use crate::trap::{Label, TrapClass};
 use codeql_extractor::trap::{self};
 use ra_ap_ide_db::line_index::{LineCol, LineIndex};
 use ra_ap_syntax::ast::RangeItem;
-use ra_ap_syntax::TextSize;
-use ra_ap_syntax::{ast, TextRange};
+use ra_ap_syntax::{ast, SyntaxError, TextRange};
 pub trait TextValue {
    fn try_get_text(&self) -> Option<String>;
 }
@@ -71,16 +70,38 @@ impl Translator {
    }
    pub fn location(&self, range: TextRange) -> (LineCol, LineCol) {
        let start = self.line_index.line_col(range.start());
-        let end = self.line_index.line_col(
-            range
-                .end()
-                .checked_sub(TextSize::new(1))
-                .unwrap_or(range.end()),
-        );
+        let range_end = range.end();
+        // QL end positions are inclusive, while TextRange offsets are exclusive and point at the position
+        // right after the last character of the range. We need to shift the end offset one character to the left to
+        // get the right inclusive QL position. Unfortunately, simply subtracting `1` from the end-offset may cause
+        // the offset to point in the middle of a mult-byte character, resulting in a `panic`. Therefore we use `try_line_col`
+        // with decreasing offsets to find the start of the last character included in the range.
+        for i in 1..4 {
+            if let Some(end) = range_end
+                .checked_sub(i.into())
+                .and_then(|x| self.line_index.try_line_col(x))
+            {
+                return (start, end);
+            }
+        }
+        let end = self.line_index.line_col(range_end);
        (start, end)
    }
    pub fn emit_location<T: TrapClass>(&mut self, label: Label<T>, node: impl ast::AstNode) {
        let (start, end) = self.location(node.syntax().text_range());
        self.trap.emit_location(self.label, label, start, end)
    }
+    pub fn emit_parse_error(&mut self, path: &str, err: SyntaxError) {
+        let (start, end) = self.location(err.range());
+        log::warn!("{}:{}:{}: {}", path, start.line + 1, start.col + 1, err);
+        let message = err.to_string();
+        let location = self.trap.emit_location_label(self.label, start, end);
+        self.trap.emit_diagnostic(
+            DiagnosticSeverity::Warning,
+            "parse_error".to_owned(),
+            message.clone(),
+            message,
+            location,
+        );
+    }
 }
--- a/rust/extractor/src/trap.rs
+++ b/rust/extractor/src/trap.rs
@@ -128,19 +128,25 @@ pub struct TrapFile {
    compression: Compression,
 }

+/// Severity level of an extraction diagnostic.
+///
+/// Each variant has an explicit integer discriminant (10, 20, 30, 40), so
+/// casting a variant to an integer yields that value.
+#[derive(Copy, Clone)]
+pub enum DiagnosticSeverity {
+    Debug = 10,
+    Info = 20,
+    Warning = 30,
+    Error = 40,
+}
 impl TrapFile {
-    pub fn emit_location<E: TrapClass>(
+    pub fn emit_location_label(
        &mut self,
        file_label: UntypedLabel,
-        entity_label: Label<E>,
        start: LineCol,
        end: LineCol,
-    ) {
+    ) -> UntypedLabel {
        let start_line = 1 + start.line as usize;
        let start_column = 1 + start.col as usize;
        let end_line = 1 + end.line as usize;
        let end_column = 1 + end.col as usize;
-        let location_label = extractor::location_label(
+        extractor::location_label(
            &mut self.writer,
            trap::Location {
                file_label,
@@ -149,13 +155,43 @@ impl TrapFile {
                end_line,
                end_column,
            },
-        );
+        )
+    }
+    /// Attaches a source location to `entity_label`.
+    ///
+    /// Obtains a location label for `start`..`end` in `file_label` via
+    /// `emit_location_label`, then writes a `locatable_locations` tuple linking
+    /// the entity to that location.
+    pub fn emit_location<E: TrapClass>(
+        &mut self,
+        file_label: UntypedLabel,
+        entity_label: Label<E>,
+        start: LineCol,
+        end: LineCol,
+    ) {
+        let location_label = self.emit_location_label(file_label, start, end);
        self.writer.add_tuple(
            "locatable_locations",
            vec![entity_label.into(), location_label.into()],
        );
    }

+    /// Writes one `diagnostics` tuple to the trap file.
+    ///
+    /// A fresh label identifies the diagnostic; the remaining columns are, in
+    /// order, the severity as an integer, the tag, the short message, the full
+    /// message and the location label.
+    pub fn emit_diagnostic(
+        &mut self,
+        severity: DiagnosticSeverity,
+        error_tag: String,
+        error_message: String,
+        full_error_message: String,
+        location: UntypedLabel,
+    ) {
+        let id = self.writer.fresh_id();
+        let row = vec![
+            trap::Arg::Label(id),
+            trap::Arg::Int(severity as usize),
+            trap::Arg::String(error_tag),
+            trap::Arg::String(error_message),
+            trap::Arg::String(full_error_message),
+            trap::Arg::Label(location),
+        ];
+        self.writer.add_tuple("diagnostics", row);
+    }
    pub fn emit_file(&mut self, absolute_path: &Path) -> trap::Label {
        extractor::populate_file(&mut self.writer, absolute_path)
    }
--- a/rust/ql/lib/codeql/rust/Diagnostics.qll
+++ b/rust/ql/lib/codeql/rust/Diagnostics.qll
@@ -0,0 +1,54 @@
+/** Provides classes relating to extraction diagnostics. */
+
+private import codeql.Locations
+
+/** A diagnostic emitted during extraction, such as a parse error */
+class Diagnostic extends @diagnostic {
+  int severity;
+  string tag;
+  string message;
+  string fullMessage;
+  Location location;
+
+  Diagnostic() { diagnostics(this, severity, tag, message, fullMessage, location) }
+
+  /**
+   * Gets the numerical severity level associated with this diagnostic.
+   */
+  int getSeverity() { result = severity }
+
+  /** Gets a string representation of the severity of this diagnostic. */
+  string getSeverityText() {
+    severity = 10 and result = "Debug"
+    or
+    severity = 20 and result = "Info"
+    or
+    severity = 30 and result = "Warning"
+    or
+    severity = 40 and result = "Error"
+  }
+
+  /** Gets the error code associated with this diagnostic, e.g. parse_error. */
+  string getTag() { result = tag }
+
+  /**
+   * Gets the error message text associated with this diagnostic.
+   */
+  string getMessage() { result = message }
+
+  /**
+   * Gets the full error message text associated with this diagnostic.
+   */
+  string getFullMessage() { result = fullMessage }
+
+  /** Gets the source location of this diagnostic. */
+  Location getLocation() { result = location }
+
+  /** Gets a textual representation of this diagnostic. */
+  string toString() { result = this.getMessage() }
+}
+
+/** A diagnostic relating to a particular error in extracting a file. */
+class ExtractionError extends Diagnostic {
+  ExtractionError() { this.getTag() = "parse_error" }
+}
--- a/rust/ql/src/queries/diagnostics/ExtractionErrors.ql
+++ b/rust/ql/src/queries/diagnostics/ExtractionErrors.ql
@@ -0,0 +1,18 @@
+/**
+ * @name Extraction errors
+ * @description List all extraction errors for files in the source code directory.
+ * @kind diagnostic
+ * @id rust/diagnostics/extraction-errors
+ */
+
+import codeql.rust.Diagnostics
+import codeql.files.FileSystem
+
+/** Gets the SARIF severity to associate with an extraction error; this is always `2`. */
+int getSeverity() { result = 2 }
+
+// Report every extraction error located in a file that has a relative path,
+// together with a message naming the file and the error, and the severity.
+from ExtractionError error, File f
+where
+  f = error.getLocation().getFile() and
+  exists(f.getRelativePath())
+select error, "Extraction failed in " + f + " with error " + error.getMessage(), getSeverity()
--- a/rust/ql/src/queries/summary/NumberOfFilesExtractedWithErrors.ql
+++ b/rust/ql/src/queries/summary/NumberOfFilesExtractedWithErrors.ql
@@ -0,0 +1,15 @@
+/**
+ * @id rust/summary/number-of-files-extracted-with-errors
+ * @name Total number of Rust files that were extracted with errors
+ * @description The total number of Rust files in the source code directory that
+ *  were extracted, but where at least one extraction error occurred in the process.
+ * @kind metric
+ * @tags summary
+ */
+
+import codeql.files.FileSystem
+import codeql.rust.Diagnostics
+
+// Count the files that have a relative path and at least one extraction error located in them.
+select count(File f |
+    exists(ExtractionError e | e.getLocation().getFile() = f) and exists(f.getRelativePath())
+  )
--- a/rust/ql/src/queries/summary/NumberOfSuccessfullyExtractedFiles.ql
+++ b/rust/ql/src/queries/summary/NumberOfSuccessfullyExtractedFiles.ql
@@ -0,0 +1,15 @@
+/**
+ * @id rust/summary/number-of-successfully-extracted-files
+ * @name Total number of Rust files that were extracted without error
+ * @description The total number of Rust files in the source code directory that
+ *  were extracted without encountering any extraction errors.
+ * @kind metric
+ * @tags summary
+ */
+
+import codeql.rust.Diagnostics
+import codeql.files.FileSystem
+
+// Count the files that have a relative path and no extraction error located in them.
+select count(File f |
+    not exists(ExtractionError e | e.getLocation().getFile() = f) and exists(f.getRelativePath())
+  )
--- a/rust/ql/test/extractor-tests/utf8/ast.expected
+++ b/rust/ql/test/extractor-tests/utf8/ast.expected
@@ -0,0 +1,39 @@
+| lib.rs:1:1:3:22 | SourceFile |
+| lib.rs:2:1:2:8 | Module |
+| lib.rs:2:5:2:7 | Name |
+| lib.rs:3:1:3:8 | Module |
+| lib.rs:3:5:3:8 | Name |
+| lib.rs:3:10:3:20 | NameRef |
+| lib.rs:3:10:3:20 | Path |
+| lib.rs:3:10:3:20 | PathSegment |
+| lib.rs:3:10:3:21 | MacroCall |
+| utf8-identifiers.rs:1:1:4:6 | foo |
+| utf8-identifiers.rs:1:1:12:2 | SourceFile |
+| utf8-identifiers.rs:1:4:1:6 | Name |
+| utf8-identifiers.rs:1:7:4:1 | GenericParamList |
+| utf8-identifiers.rs:2:5:2:6 | Lifetime |
+| utf8-identifiers.rs:2:5:2:6 | LifetimeParam |
+| utf8-identifiers.rs:3:5:3:5 | Name |
+| utf8-identifiers.rs:3:5:3:5 | TypeParam |
+| utf8-identifiers.rs:4:2:4:3 | ParamList |
+| utf8-identifiers.rs:4:5:4:6 | BlockExpr |
+| utf8-identifiers.rs:4:5:4:6 | StmtList |
+| utf8-identifiers.rs:6:1:8:1 | Struct |
+| utf8-identifiers.rs:6:8:6:8 | Name |
+| utf8-identifiers.rs:6:10:8:1 | RecordFieldList |
+| utf8-identifiers.rs:7:5:7:5 | Name |
+| utf8-identifiers.rs:7:5:7:13 | RecordField |
+| utf8-identifiers.rs:7:9:7:13 | NameRef |
+| utf8-identifiers.rs:7:9:7:13 | Path |
+| utf8-identifiers.rs:7:9:7:13 | PathSegment |
+| utf8-identifiers.rs:7:9:7:13 | PathType |
+| utf8-identifiers.rs:10:1:10:3 | Visibility |
+| utf8-identifiers.rs:10:1:12:1 | main |
+| utf8-identifiers.rs:10:8:10:11 | Name |
+| utf8-identifiers.rs:10:12:10:13 | ParamList |
+| utf8-identifiers.rs:10:15:12:1 | BlockExpr |
+| utf8-identifiers.rs:10:15:12:1 | StmtList |
+| utf8-identifiers.rs:11:5:11:24 | LetStmt |
+| utf8-identifiers.rs:11:9:11:9 | IdentPat |
+| utf8-identifiers.rs:11:9:11:9 | Name |
+| utf8-identifiers.rs:11:14:11:23 | LiteralExpr |
--- a/rust/ql/test/extractor-tests/utf8/ast.ql
+++ b/rust/ql/test/extractor-tests/utf8/ast.ql
@@ -0,0 +1,3 @@
+import codeql.rust.elements
+
+select any(AstNode n)
--- a/rust/ql/test/extractor-tests/utf8/utf8-identifiers.rs
+++ b/rust/ql/test/extractor-tests/utf8/utf8-identifiers.rs
@@ -0,0 +1,12 @@
+fn foo<
+    'β,
+    γ
+>() {}
+
+struct X {
+    δ: usize
+}
+
+pub fn main() {
+    let α = 0.00001f64;
+}
--- a/rust/ql/test/query-tests/diagnostics/ExtractionErrors.expected
+++ b/rust/ql/test/query-tests/diagnostics/ExtractionErrors.expected
@@ -0,0 +1,6 @@
+| does_not_compile.rs:2:6:2:5 | expected SEMICOLON | Extraction failed in does_not_compile.rs with error expected SEMICOLON | 2 |
+| does_not_compile.rs:2:9:2:8 | expected SEMICOLON | Extraction failed in does_not_compile.rs with error expected SEMICOLON | 2 |
+| does_not_compile.rs:2:13:2:12 | expected SEMICOLON | Extraction failed in does_not_compile.rs with error expected SEMICOLON | 2 |
+| does_not_compile.rs:2:21:2:20 | expected SEMICOLON | Extraction failed in does_not_compile.rs with error expected SEMICOLON | 2 |
+| does_not_compile.rs:2:26:2:25 | expected SEMICOLON | Extraction failed in does_not_compile.rs with error expected SEMICOLON | 2 |
+| does_not_compile.rs:2:32:2:31 | expected field name or number | Extraction failed in does_not_compile.rs with error expected field name or number | 2 |
--- a/rust/ql/test/query-tests/diagnostics/ExtractionErrors.qlref
+++ b/rust/ql/test/query-tests/diagnostics/ExtractionErrors.qlref
@@ -0,0 +1 @@
+queries/diagnostics/ExtractionErrors.ql