Merge branch 'main' into encoding

Arthur Baars authored 2022-07-21 21:21:17 +02:00, committed by GitHub
20 changed files with 694 additions and 486 deletions

View File

@@ -77,7 +77,7 @@ class ExecState extends DataFlow::FlowState {
ExecState() {
this =
"ExecState (" + fst.getLocation() + " | " + fst + ", " + snd.getLocation() + " | " + snd + ")" and
interestingConcatenation(fst, snd)
interestingConcatenation(pragma[only_bind_into](fst), pragma[only_bind_into](snd))
}
DataFlow::Node getFstNode() { result = fst }

View File

@@ -2,7 +2,54 @@ function RegisterExtractorPack(id)
local extractor = GetPlatformToolsDirectory() ..
'Semmle.Extraction.CSharp.Driver'
if OperatingSystem == 'windows' then extractor = extractor .. '.exe' end
function DotnetMatcherBuild(compilerName, compilerPath, compilerArguments,
_languageId)
if compilerName ~= 'dotnet' and compilerName ~= 'dotnet.exe' then
return nil
end
-- The dotnet CLI has the following usage instructions:
-- dotnet [sdk-options] [command] [command-options] [arguments]
-- we are interested in dotnet build, which has the following usage instructions:
-- dotnet [options] build [<PROJECT | SOLUTION>...]
-- For now, parse the command line as follows:
-- Everything that starts with `-` (or `/`) will be ignored.
-- The first non-option argument is treated as the command.
-- if that's `build`, we append `/p:UseSharedCompilation=false` to the command line,
-- otherwise we do nothing.
local match = false
local argv = compilerArguments.argv
if OperatingSystem == 'windows' then
-- let's hope that this split matches the escaping rules `dotnet` applies to command line arguments
-- or, at least, that it is close enough
argv =
NativeArgumentsToArgv(compilerArguments.nativeArgumentPointer)
end
for i, arg in ipairs(argv) do
-- dotnet options start with either - or / (both are legal)
local firstCharacter = string.sub(arg, 1, 1)
if not (firstCharacter == '-') and not (firstCharacter == '/') then
Log(1, 'Dotnet subcommand detected: %s', arg)
if arg == 'build' then match = true end
break
end
end
if match then
return {
order = ORDER_REPLACE,
invocation = BuildExtractorInvocation(id, compilerPath,
compilerPath,
compilerArguments, nil, {
'/p:UseSharedCompilation=false'
})
}
end
return nil
end
local windowsMatchers = {
DotnetMatcherBuild,
CreatePatternMatcher({'^dotnet%.exe$'}, MatchCompilerName, extractor, {
prepend = {'--dotnetexec', '--cil'},
order = ORDER_BEFORE
@@ -10,22 +57,21 @@ function RegisterExtractorPack(id)
CreatePatternMatcher({'^csc.*%.exe$'}, MatchCompilerName, extractor, {
prepend = {'--compiler', '"${compiler}"', '--cil'},
order = ORDER_BEFORE
}),
CreatePatternMatcher({'^fakes.*%.exe$', 'moles.*%.exe'},
MatchCompilerName, nil, {trace = false})
}
local posixMatchers = {
CreatePatternMatcher({'^mcs%.exe$', '^csc%.exe$'}, MatchCompilerName,
extractor, {
prepend = {'--compiler', '"${compiler}"', '--cil'},
order = ORDER_BEFORE
}),
DotnetMatcherBuild,
CreatePatternMatcher({'^mono', '^dotnet$'}, MatchCompilerName,
extractor, {
prepend = {'--dotnetexec', '--cil'},
order = ORDER_BEFORE
}),
CreatePatternMatcher({'^mcs%.exe$', '^csc%.exe$'}, MatchCompilerName,
extractor, {
prepend = {'--compiler', '"${compiler}"', '--cil'},
order = ORDER_BEFORE
}), function(compilerName, compilerPath, compilerArguments, _languageId)
if MatchCompilerName('^msbuild$', compilerName, compilerPath,
compilerArguments) or
@@ -49,7 +95,6 @@ function RegisterExtractorPack(id)
else
return posixMatchers
end
end
-- Return a list of minimum supported versions of the configuration file format
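The comment block in DotnetMatcherBuild above describes the command-line scan: every argument starting with `-` or `/` is skipped, the first remaining argument is taken as the dotnet subcommand, and only `build` triggers appending `/p:UseSharedCompilation=false`. A minimal Rust sketch of that same rule, purely illustrative (the real matcher is the Lua function above; the helper name is made up):

// Returns the first argument that is not an option, i.e. the presumed dotnet
// subcommand; None means the invocation was options-only.
fn dotnet_subcommand<'a>(argv: &[&'a str]) -> Option<&'a str> {
    argv.iter()
        .copied()
        .find(|arg| !arg.starts_with('-') && !arg.starts_with('/'))
}

fn main() {
    // Only a `build` subcommand should lead to /p:UseSharedCompilation=false being appended.
    assert_eq!(dotnet_subcommand(&["--verbosity", "m", "build", "My.sln"]), Some("build"));
    assert_eq!(dotnet_subcommand(&["/p:Foo=1", "run"]), Some("run"));
    assert_eq!(dotnet_subcommand(&["--info"]), None);
}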

View File

@@ -11,14 +11,17 @@ CodeQL.
Languages and compilers
#######################
CodeQL supports the following languages and compilers.
The current versions of the CodeQL CLI (`changelog <https://github.com/github/codeql-cli-binaries/blob/main/CHANGELOG.md>`__, `releases <https://github.com/github/codeql-cli-binaries/releases>`__),
CodeQL library packs (`source <https://github.com/github/codeql/tree/codeql-cli/latest>`__),
and CodeQL bundle (`releases <https://github.com/github/codeql-action/releases>`__)
support the following languages and compilers.
.. include:: ../support/reusables/versions-compilers.rst
Frameworks and libraries
########################
The libraries and queries in the current version of CodeQL have been explicitly checked against the libraries and frameworks listed below.
The current versions of the CodeQL library and query packs (`source <https://github.com/github/codeql/tree/codeql-cli/latest>`__) have been explicitly checked against the libraries and frameworks listed below.
.. pull-quote::

View File

@@ -1,6 +1,10 @@
C and C++ built-in support
================================
Provided by the current versions of the
CodeQL query pack ``codeql/cpp-queries`` (`changelog <https://github.com/github/codeql/tree/codeql-cli/latest/cpp/ql/src/CHANGELOG.md>`__, `source <https://github.com/github/codeql/tree/codeql-cli/latest/cpp/ql/src>`__)
and the CodeQL library pack ``codeql/cpp-all`` (`changelog <https://github.com/github/codeql/tree/codeql-cli/latest/cpp/ql/lib/CHANGELOG.md>`__, `source <https://github.com/github/codeql/tree/codeql-cli/latest/cpp/ql/lib>`__).
.. csv-table::
:header-rows: 1
:class: fullWidthTable
@@ -14,6 +18,10 @@ C and C++ built-in support
C# built-in support
================================
Provided by the current versions of the
CodeQL query pack ``codeql/csharp-queries`` (`changelog <https://github.com/github/codeql/tree/codeql-cli/latest/csharp/ql/src/CHANGELOG.md>`__, `source <https://github.com/github/codeql/tree/codeql-cli/latest/csharp/ql/src>`__)
and the CodeQL library pack ``codeql/csharp-all`` (`changelog <https://github.com/github/codeql/tree/codeql-cli/latest/csharp/ql/lib/CHANGELOG.md>`__, `source <https://github.com/github/codeql/tree/codeql-cli/latest/csharp/ql/lib>`__).
.. csv-table::
:header-rows: 1
:class: fullWidthTable
@@ -33,6 +41,10 @@ C# built-in support
Go built-in support
================================
Provided by the current versions of the
CodeQL query pack ``codeql/go-queries`` (`changelog <https://github.com/github/codeql/tree/codeql-cli/latest/go/ql/src/CHANGELOG.md>`__, `source <https://github.com/github/codeql/tree/codeql-cli/latest/go/ql/src>`__)
and the CodeQL library pack ``codeql/go-all`` (`changelog <https://github.com/github/codeql/tree/codeql-cli/latest/go/ql/lib/CHANGELOG.md>`__, `source <https://github.com/github/codeql/tree/codeql-cli/latest/go/ql/lib>`__).
.. csv-table::
:header-rows: 1
:class: fullWidthTable
@@ -84,6 +96,10 @@ Go built-in support
Java built-in support
==================================
Provided by the current versions of the
CodeQL query pack ``codeql/java-queries`` (`changelog <https://github.com/github/codeql/tree/codeql-cli/latest/java/ql/src/CHANGELOG.md>`__, `source <https://github.com/github/codeql/tree/codeql-cli/latest/java/ql/src>`__)
and the CodeQL library pack ``codeql/java-all`` (`changelog <https://github.com/github/codeql/tree/codeql-cli/latest/java/ql/lib/CHANGELOG.md>`__, `source <https://github.com/github/codeql/tree/codeql-cli/latest/java/ql/lib>`__).
.. csv-table::
:header-rows: 1
:class: fullWidthTable
@@ -113,6 +129,10 @@ Java built-in support
JavaScript and TypeScript built-in support
=======================================================
Provided by the current versions of the
CodeQL query pack ``codeql/javascript-queries`` (`changelog <https://github.com/github/codeql/tree/codeql-cli/latest/javascript/ql/src/CHANGELOG.md>`__, `source <https://github.com/github/codeql/tree/codeql-cli/latest/javascript/ql/src>`__)
and the CodeQL library pack ``codeql/javascript-all`` (`changelog <https://github.com/github/codeql/tree/codeql-cli/latest/javascript/ql/lib/CHANGELOG.md>`__, `source <https://github.com/github/codeql/tree/codeql-cli/latest/javascript/ql/lib>`__).
.. csv-table::
:header-rows: 1
:class: fullWidthTable
@@ -156,6 +176,10 @@ JavaScript and TypeScript built-in support
Python built-in support
====================================
Provided by the current versions of the
CodeQL query pack ``codeql/python-queries`` (`changelog <https://github.com/github/codeql/tree/codeql-cli/latest/python/ql/src/CHANGELOG.md>`__, `source <https://github.com/github/codeql/tree/codeql-cli/latest/python/ql/src>`__)
and the CodeQL library pack ``codeql/python-all`` (`changelog <https://github.com/github/codeql/tree/codeql-cli/latest/python/ql/lib/CHANGELOG.md>`__, `source <https://github.com/github/codeql/tree/codeql-cli/latest/python/ql/lib>`__).
.. csv-table::
:header-rows: 1
:class: fullWidthTable

ruby/Cargo.lock (generated; binary file not shown)

View File

@@ -11,7 +11,7 @@ flate2 = "1.0"
node-types = { path = "../node-types" }
tree-sitter = "0.19"
tree-sitter-embedded-template = { git = "https://github.com/tree-sitter/tree-sitter-embedded-template.git", rev = "1a538da253d73f896b9f6c0c7d79cda58791ac5c" }
tree-sitter-ruby = { git = "https://github.com/tree-sitter/tree-sitter-ruby.git", rev = "5b305c3cd32db10494cedd2743de6bbe32f1a573" }
tree-sitter-ruby = { git = "https://github.com/tree-sitter/tree-sitter-ruby.git", rev = "e75d04404c9dd71ad68850d5c672b226d5e694f3" }
clap = "3.0"
tracing = "0.1"
tracing-subscriber = { version = "0.3.3", features = ["env-filter"] }

View File

@@ -1,161 +1,112 @@
use crate::trap;
use node_types::{EntryKind, Field, NodeTypeMap, Storage, TypeName};
use std::borrow::Cow;
use std::collections::BTreeMap as Map;
use std::collections::BTreeSet as Set;
use std::fmt;
use std::io::Write;
use std::path::Path;
use tracing::{error, info, span, Level};
use tree_sitter::{Language, Node, Parser, Range, Tree};
pub struct TrapWriter {
/// The accumulated trap entries
trap_output: Vec<TrapEntry>,
/// A counter for generating fresh labels
counter: u32,
/// cache of global keys
global_keys: std::collections::HashMap<String, Label>,
pub fn populate_file(writer: &mut trap::Writer, absolute_path: &Path) -> trap::Label {
let (file_label, fresh) =
writer.global_id(&trap::full_id_for_file(&normalize_path(absolute_path)));
if fresh {
writer.add_tuple(
"files",
vec![
trap::Arg::Label(file_label),
trap::Arg::String(normalize_path(absolute_path)),
],
);
populate_parent_folders(writer, file_label, absolute_path.parent());
}
file_label
}
pub fn new_trap_writer() -> TrapWriter {
TrapWriter {
counter: 0,
trap_output: Vec::new(),
global_keys: std::collections::HashMap::new(),
fn populate_empty_file(writer: &mut trap::Writer) -> trap::Label {
let (file_label, fresh) = writer.global_id("empty;sourcefile");
if fresh {
writer.add_tuple(
"files",
vec![
trap::Arg::Label(file_label),
trap::Arg::String("".to_string()),
],
);
}
file_label
}
impl TrapWriter {
/// Gets a label that will hold the unique ID of the passed string at import time.
/// This can be used for incrementally importable TRAP files -- use globally unique
/// strings to compute a unique ID for table tuples.
///
/// Note: You probably want to make sure that the key strings that you use are disjoint
/// for disjoint column types; the standard way of doing this is to prefix (or append)
/// the column type name to the ID. Thus, you might identify methods in Java by the
/// full ID "methods_com.method.package.DeclaringClass.method(argumentList)".
pub fn populate_empty_location(writer: &mut trap::Writer) {
let file_label = populate_empty_file(writer);
location(writer, file_label, 0, 0, 0, 0);
}
fn fresh_id(&mut self) -> Label {
let label = Label(self.counter);
self.counter += 1;
self.trap_output.push(TrapEntry::FreshId(label));
label
}
fn global_id(&mut self, key: &str) -> (Label, bool) {
if let Some(label) = self.global_keys.get(key) {
return (*label, false);
}
let label = Label(self.counter);
self.counter += 1;
self.global_keys.insert(key.to_owned(), label);
self.trap_output
.push(TrapEntry::MapLabelToKey(label, key.to_owned()));
(label, true)
}
fn add_tuple(&mut self, table_name: &str, args: Vec<Arg>) {
self.trap_output
.push(TrapEntry::GenericTuple(table_name.to_owned(), args))
}
fn populate_file(&mut self, absolute_path: &Path) -> Label {
let (file_label, fresh) = self.global_id(&full_id_for_file(absolute_path));
if fresh {
self.add_tuple(
"files",
vec![
Arg::Label(file_label),
Arg::String(normalize_path(absolute_path)),
],
);
self.populate_parent_folders(file_label, absolute_path.parent());
}
file_label
}
fn populate_empty_file(&mut self) -> Label {
let (file_label, fresh) = self.global_id("empty;sourcefile");
if fresh {
self.add_tuple(
"files",
vec![Arg::Label(file_label), Arg::String("".to_string())],
);
}
file_label
}
pub fn populate_empty_location(&mut self) {
let file_label = self.populate_empty_file();
self.location(file_label, 0, 0, 0, 0);
}
fn populate_parent_folders(&mut self, child_label: Label, path: Option<&Path>) {
let mut path = path;
let mut child_label = child_label;
loop {
match path {
None => break,
Some(folder) => {
let (folder_label, fresh) = self.global_id(&full_id_for_folder(folder));
self.add_tuple(
"containerparent",
vec![Arg::Label(folder_label), Arg::Label(child_label)],
pub fn populate_parent_folders(
writer: &mut trap::Writer,
child_label: trap::Label,
path: Option<&Path>,
) {
let mut path = path;
let mut child_label = child_label;
loop {
match path {
None => break,
Some(folder) => {
let (folder_label, fresh) =
writer.global_id(&trap::full_id_for_folder(&normalize_path(folder)));
writer.add_tuple(
"containerparent",
vec![
trap::Arg::Label(folder_label),
trap::Arg::Label(child_label),
],
);
if fresh {
writer.add_tuple(
"folders",
vec![
trap::Arg::Label(folder_label),
trap::Arg::String(normalize_path(folder)),
],
);
if fresh {
self.add_tuple(
"folders",
vec![
Arg::Label(folder_label),
Arg::String(normalize_path(folder)),
],
);
path = folder.parent();
child_label = folder_label;
} else {
break;
}
path = folder.parent();
child_label = folder_label;
} else {
break;
}
}
}
}
}
fn location(
&mut self,
file_label: Label,
start_line: usize,
start_column: usize,
end_line: usize,
end_column: usize,
) -> Label {
let (loc_label, fresh) = self.global_id(&format!(
"loc,{{{}}},{},{},{},{}",
file_label, start_line, start_column, end_line, end_column
));
if fresh {
self.add_tuple(
"locations_default",
vec![
Arg::Label(loc_label),
Arg::Label(file_label),
Arg::Int(start_line),
Arg::Int(start_column),
Arg::Int(end_line),
Arg::Int(end_column),
],
);
}
loc_label
}
fn comment(&mut self, text: String) {
self.trap_output.push(TrapEntry::Comment(text));
}
pub fn output(self, writer: &mut dyn Write) -> std::io::Result<()> {
write!(writer, "{}", Program(self.trap_output))
fn location(
writer: &mut trap::Writer,
file_label: trap::Label,
start_line: usize,
start_column: usize,
end_line: usize,
end_column: usize,
) -> trap::Label {
let (loc_label, fresh) = writer.global_id(&format!(
"loc,{{{}}},{},{},{},{}",
file_label, start_line, start_column, end_line, end_column
));
if fresh {
writer.add_tuple(
"locations_default",
vec![
trap::Arg::Label(loc_label),
trap::Arg::Label(file_label),
trap::Arg::Int(start_line),
trap::Arg::Int(start_column),
trap::Arg::Int(end_line),
trap::Arg::Int(end_column),
],
);
}
loc_label
}
/// Extracts the source file at `path`, which is assumed to be canonicalized.
@@ -163,71 +114,43 @@ pub fn extract(
language: Language,
language_prefix: &str,
schema: &NodeTypeMap,
trap_writer: &mut TrapWriter,
trap_writer: &mut trap::Writer,
path: &Path,
source: &[u8],
ranges: &[Range],
) -> std::io::Result<()> {
let path_str = format!("{}", path.display());
let span = span!(
Level::TRACE,
"extract",
file = %path.display()
file = %path_str
);
let _enter = span.enter();
info!("extracting: {}", path.display());
info!("extracting: {}", path_str);
let mut parser = Parser::new();
parser.set_language(language).unwrap();
parser.set_included_ranges(ranges).unwrap();
let tree = parser.parse(&source, None).expect("Failed to parse file");
trap_writer.comment(format!("Auto-generated TRAP file for {}", path.display()));
let file_label = &trap_writer.populate_file(path);
let mut visitor = Visitor {
trap_writer.comment(format!("Auto-generated TRAP file for {}", path_str));
let file_label = populate_file(trap_writer, path);
let mut visitor = Visitor::new(
source,
trap_writer,
// TODO: should we handle path strings that are not valid UTF8 better?
path: format!("{}", path.display()),
file_label: *file_label,
toplevel_child_counter: 0,
stack: Vec::new(),
&path_str,
file_label,
language_prefix,
schema,
};
);
traverse(&tree, &mut visitor);
parser.reset();
Ok(())
}
/// Escapes a string for use in a TRAP key, by replacing special characters with
/// HTML entities.
fn escape_key<'a, S: Into<Cow<'a, str>>>(key: S) -> Cow<'a, str> {
fn needs_escaping(c: char) -> bool {
matches!(c, '&' | '{' | '}' | '"' | '@' | '#')
}
let key = key.into();
if key.contains(needs_escaping) {
let mut escaped = String::with_capacity(2 * key.len());
for c in key.chars() {
match c {
'&' => escaped.push_str("&amp;"),
'{' => escaped.push_str("&lbrace;"),
'}' => escaped.push_str("&rbrace;"),
'"' => escaped.push_str("&quot;"),
'@' => escaped.push_str("&commat;"),
'#' => escaped.push_str("&num;"),
_ => escaped.push(c),
}
}
Cow::Owned(escaped)
} else {
key
}
}
/// Normalizes the path according to the common CodeQL specification. Assumes that
/// `path` has already been canonicalized using `std::fs::canonicalize`.
fn normalize_path(path: &Path) -> String {
@@ -267,34 +190,28 @@ fn normalize_path(path: &Path) -> String {
}
}
fn full_id_for_file(path: &Path) -> String {
format!("{};sourcefile", escape_key(&normalize_path(path)))
}
fn full_id_for_folder(path: &Path) -> String {
format!("{};folder", escape_key(&normalize_path(path)))
}
struct ChildNode {
field_name: Option<&'static str>,
label: Label,
label: trap::Label,
type_name: TypeName,
}
struct Visitor<'a> {
/// The file path of the source code (as string)
path: String,
path: &'a str,
/// The label to use whenever we need to refer to the `@file` entity of this
/// source file.
file_label: Label,
file_label: trap::Label,
/// The source code as a UTF-8 byte array
source: &'a [u8],
/// A TrapWriter to accumulate trap entries
trap_writer: &'a mut TrapWriter,
/// A trap::Writer to accumulate trap entries
trap_writer: &'a mut trap::Writer,
/// A counter for top-level child nodes
toplevel_child_counter: usize,
/// Language prefix
language_prefix: &'a str,
/// Language-specific name of the AST info table
ast_node_info_table_name: String,
/// Language-specific name of the tokeninfo table
tokeninfo_table_name: String,
/// A lookup table from type name to node types
schema: &'a NodeTypeMap,
/// A stack for gathering information from child nodes. Whenever a node is
@@ -303,27 +220,48 @@ struct Visitor<'a> {
/// node the list containing the child data is popped from the stack and
/// matched against the dbscheme for the node. If the expectations are met
/// the corresponding row definitions are added to the trap_output.
stack: Vec<(Label, usize, Vec<ChildNode>)>,
stack: Vec<(trap::Label, usize, Vec<ChildNode>)>,
}
impl Visitor<'_> {
impl<'a> Visitor<'a> {
fn new(
source: &'a [u8],
trap_writer: &'a mut trap::Writer,
path: &'a str,
file_label: trap::Label,
language_prefix: &str,
schema: &'a NodeTypeMap,
) -> Visitor<'a> {
Visitor {
path,
file_label,
source,
trap_writer,
toplevel_child_counter: 0,
ast_node_info_table_name: format!("{}_ast_node_info", language_prefix),
tokeninfo_table_name: format!("{}_tokeninfo", language_prefix),
schema,
stack: Vec::new(),
}
}
fn record_parse_error(
&mut self,
error_message: String,
full_error_message: String,
loc: Label,
loc: trap::Label,
) {
error!("{}", full_error_message);
let id = self.trap_writer.fresh_id();
self.trap_writer.add_tuple(
"diagnostics",
vec![
Arg::Label(id),
Arg::Int(40), // severity 40 = error
Arg::String("parse_error".to_string()),
Arg::String(error_message),
Arg::String(full_error_message),
Arg::Label(loc),
trap::Arg::Label(id),
trap::Arg::Int(40), // severity 40 = error
trap::Arg::String("parse_error".to_string()),
trap::Arg::String(error_message),
trap::Arg::String(full_error_message),
trap::Arg::Label(loc),
],
);
}
@@ -335,7 +273,8 @@ impl Visitor<'_> {
node: Node,
) {
let (start_line, start_column, end_line, end_column) = location_for(self.source, node);
let loc = self.trap_writer.location(
let loc = location(
self.trap_writer,
self.file_label,
start_line,
start_column,
@@ -374,7 +313,8 @@ impl Visitor<'_> {
}
let (id, _, child_nodes) = self.stack.pop().expect("Visitor: empty stack");
let (start_line, start_column, end_line, end_column) = location_for(self.source, node);
let loc = self.trap_writer.location(
let loc = location(
self.trap_writer,
self.file_label,
start_line,
start_column,
@@ -402,19 +342,19 @@ impl Visitor<'_> {
match &table.kind {
EntryKind::Token { kind_id, .. } => {
self.trap_writer.add_tuple(
&format!("{}_ast_node_info", self.language_prefix),
&self.ast_node_info_table_name,
vec![
Arg::Label(id),
Arg::Label(parent_id),
Arg::Int(parent_index),
Arg::Label(loc),
trap::Arg::Label(id),
trap::Arg::Label(parent_id),
trap::Arg::Int(parent_index),
trap::Arg::Label(loc),
],
);
self.trap_writer.add_tuple(
&format!("{}_tokeninfo", self.language_prefix),
&self.tokeninfo_table_name,
vec![
Arg::Label(id),
Arg::Int(*kind_id),
trap::Arg::Label(id),
trap::Arg::Int(*kind_id),
sliced_source_arg(self.source, node),
],
);
@@ -425,15 +365,15 @@ impl Visitor<'_> {
} => {
if let Some(args) = self.complex_node(&node, fields, &child_nodes, id) {
self.trap_writer.add_tuple(
&format!("{}_ast_node_info", self.language_prefix),
&self.ast_node_info_table_name,
vec![
Arg::Label(id),
Arg::Label(parent_id),
Arg::Int(parent_index),
Arg::Label(loc),
trap::Arg::Label(id),
trap::Arg::Label(parent_id),
trap::Arg::Int(parent_index),
trap::Arg::Label(loc),
],
);
let mut all_args = vec![Arg::Label(id)];
let mut all_args = vec![trap::Arg::Label(id)];
all_args.extend(args);
self.trap_writer.add_tuple(table_name, all_args);
}
@@ -472,9 +412,9 @@ impl Visitor<'_> {
node: &Node,
fields: &[Field],
child_nodes: &[ChildNode],
parent_id: Label,
) -> Option<Vec<Arg>> {
let mut map: Map<&Option<String>, (&Field, Vec<Arg>)> = Map::new();
parent_id: trap::Label,
) -> Option<Vec<trap::Arg>> {
let mut map: Map<&Option<String>, (&Field, Vec<trap::Arg>)> = Map::new();
for field in fields {
map.insert(&field.name, (field, Vec::new()));
}
@@ -488,9 +428,9 @@ impl Visitor<'_> {
{
// We can safely unwrap because type_matches checks the key is in the map.
let (int_value, _) = int_mapping.get(&child_node.type_name.kind).unwrap();
values.push(Arg::Int(*int_value));
values.push(trap::Arg::Int(*int_value));
} else {
values.push(Arg::Label(child_node.label));
values.push(trap::Arg::Label(child_node.label));
}
} else if field.name.is_some() {
let error_message = format!(
@@ -569,9 +509,9 @@ impl Visitor<'_> {
);
break;
}
let mut args = vec![Arg::Label(parent_id)];
let mut args = vec![trap::Arg::Label(parent_id)];
if *has_index {
args.push(Arg::Int(index))
args.push(trap::Arg::Int(index))
}
args.push(child_value.clone());
self.trap_writer.add_tuple(table_name, args);
@@ -625,9 +565,9 @@ impl Visitor<'_> {
}
// Emit a slice of a source file as an Arg.
fn sliced_source_arg(source: &[u8], n: Node) -> Arg {
fn sliced_source_arg(source: &[u8], n: Node) -> trap::Arg {
let range = n.byte_range();
Arg::String(String::from_utf8_lossy(&source[range.start..range.end]).into_owned())
trap::Arg::String(String::from_utf8_lossy(&source[range.start..range.end]).into_owned())
}
// Emit a pair of `TrapEntry`s for the provided node, appropriately calibrated.
@@ -699,59 +639,6 @@ fn traverse(tree: &Tree, visitor: &mut Visitor) {
}
}
pub struct Program(Vec<TrapEntry>);
impl fmt::Display for Program {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut text = String::new();
for trap_entry in &self.0 {
text.push_str(&format!("{}\n", trap_entry));
}
write!(f, "{}", text)
}
}
enum TrapEntry {
/// Maps the label to a fresh id, e.g. `#123=*`.
FreshId(Label),
/// Maps the label to a key, e.g. `#7=@"foo"`.
MapLabelToKey(Label, String),
/// foo_bar(arg*)
GenericTuple(String, Vec<Arg>),
Comment(String),
}
impl fmt::Display for TrapEntry {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
TrapEntry::FreshId(label) => write!(f, "{}=*", label),
TrapEntry::MapLabelToKey(label, key) => {
write!(f, "{}=@\"{}\"", label, key.replace("\"", "\"\""))
}
TrapEntry::GenericTuple(name, args) => {
write!(f, "{}(", name)?;
for (index, arg) in args.iter().enumerate() {
if index > 0 {
write!(f, ",")?;
}
write!(f, "{}", arg)?;
}
write!(f, ")")
}
TrapEntry::Comment(line) => write!(f, "// {}", line),
}
}
}
#[derive(Debug, Copy, Clone)]
// Identifiers of the form #0, #1...
struct Label(u32);
impl fmt::Display for Label {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "#{:x}", self.0)
}
}
// Numeric indices.
#[derive(Debug, Copy, Clone)]
struct Index(usize);
@@ -761,69 +648,3 @@ impl fmt::Display for Index {
write!(f, "{}", self.0)
}
}
// Some untyped argument to a TrapEntry.
#[derive(Debug, Clone)]
enum Arg {
Label(Label),
Int(usize),
String(String),
}
const MAX_STRLEN: usize = 1048576;
impl fmt::Display for Arg {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Arg::Label(x) => write!(f, "{}", x),
Arg::Int(x) => write!(f, "{}", x),
Arg::String(x) => write!(
f,
"\"{}\"",
limit_string(x, MAX_STRLEN).replace("\"", "\"\"")
),
}
}
}
/// Limit the length (in bytes) of a string. If the string's length in bytes is
/// less than or equal to the limit then the entire string is returned. Otherwise
/// the string is sliced at the provided limit. If there is a multi-byte character
/// at the limit then the returned slice will be slightly shorter than the limit to
/// avoid splitting that multi-byte character.
fn limit_string(string: &str, max_size: usize) -> &str {
if string.len() <= max_size {
return string;
}
let p = string.as_bytes();
let mut index = max_size;
// We want to clip the string at [max_size]; however, the character at that position
// may span several bytes. We need to find the first byte of the character. In UTF-8
// encoded data any byte that matches the bit pattern 10XXXXXX is not a start byte.
// Therefore we decrement the index as long as there are bytes matching this pattern.
// This ensures we cut the string at the border between one character and another.
while index > 0 && (p[index] & 0b11000000) == 0b10000000 {
index -= 1;
}
&string[0..index]
}
#[test]
fn limit_string_test() {
assert_eq!("hello", limit_string(&"hello world".to_owned(), 5));
assert_eq!("hi ☹", limit_string(&"hi ☹☹".to_owned(), 6));
assert_eq!("hi ", limit_string(&"hi ☹☹".to_owned(), 5));
}
#[test]
fn escape_key_test() {
assert_eq!("foo!", escape_key("foo!"));
assert_eq!("foo&lbrace;&rbrace;", escape_key("foo{}"));
assert_eq!("&lbrace;&rbrace;", escape_key("{}"));
assert_eq!("", escape_key(""));
assert_eq!("/path/to/foo.rb", escape_key("/path/to/foo.rb"));
assert_eq!(
"/path/to/foo&amp;&lbrace;&rbrace;&quot;&commat;&num;.rb",
escape_key("/path/to/foo&{}\"@#.rb")
);
}

View File

@@ -1,4 +1,5 @@
mod extractor;
mod trap;
#[macro_use]
extern crate lazy_static;
@@ -6,50 +7,13 @@ extern crate num_cpus;
use clap::arg;
use encoding::{self};
use flate2::write::GzEncoder;
use rayon::prelude::*;
use std::borrow::Cow;
use std::fs;
use std::io::{BufRead, BufWriter};
use std::io::BufRead;
use std::path::{Path, PathBuf};
use tree_sitter::{Language, Parser, Range};
enum TrapCompression {
None,
Gzip,
}
impl TrapCompression {
fn from_env() -> TrapCompression {
match std::env::var("CODEQL_RUBY_TRAP_COMPRESSION") {
Ok(method) => match TrapCompression::from_string(&method) {
Some(c) => c,
None => {
tracing::error!("Unknown compression method '{}'; using gzip.", &method);
TrapCompression::Gzip
}
},
// Default compression method if the env var isn't set:
Err(_) => TrapCompression::Gzip,
}
}
fn from_string(s: &str) -> Option<TrapCompression> {
match s.to_lowercase().as_ref() {
"none" => Some(TrapCompression::None),
"gzip" => Some(TrapCompression::Gzip),
_ => None,
}
}
fn extension(&self) -> &str {
match self {
TrapCompression::None => "trap",
TrapCompression::Gzip => "trap.gz",
}
}
}
/**
* Gets the number of threads the extractor should use, by reading the
* CODEQL_THREADS environment variable and using it as described in the
@@ -141,7 +105,7 @@ fn main() -> std::io::Result<()> {
.value_of("output-dir")
.expect("missing --output-dir");
let trap_dir = PathBuf::from(trap_dir);
let trap_compression = TrapCompression::from_env();
let trap_compression = trap::Compression::from_env("CODEQL_RUBY_TRAP_COMPRESSION");
let file_list = matches.value_of("file-list").expect("missing --file-list");
let file_list = fs::File::open(file_list)?;
@@ -165,7 +129,7 @@ fn main() -> std::io::Result<()> {
let mut source = std::fs::read(&path)?;
let mut needs_conversion = false;
let code_ranges;
let mut trap_writer = extractor::new_trap_writer();
let mut trap_writer = trap::Writer::new();
if path.extension().map_or(false, |x| x == "erb") {
tracing::info!("scanning: {}", path.display());
extractor::extract(
@@ -246,33 +210,25 @@ fn main() -> std::io::Result<()> {
} else {
std::fs::copy(&path, &src_archive_file)?;
}
write_trap(&trap_dir, path, trap_writer, &trap_compression)
write_trap(&trap_dir, path, &trap_writer, trap_compression)
})
.expect("failed to extract files");
let path = PathBuf::from("extras");
let mut trap_writer = extractor::new_trap_writer();
trap_writer.populate_empty_location();
write_trap(&trap_dir, path, trap_writer, &trap_compression)
let mut trap_writer = trap::Writer::new();
extractor::populate_empty_location(&mut trap_writer);
write_trap(&trap_dir, path, &trap_writer, trap_compression)
}
fn write_trap(
trap_dir: &Path,
path: PathBuf,
trap_writer: extractor::TrapWriter,
trap_compression: &TrapCompression,
trap_writer: &trap::Writer,
trap_compression: trap::Compression,
) -> std::io::Result<()> {
let trap_file = path_for(trap_dir, &path, trap_compression.extension());
std::fs::create_dir_all(&trap_file.parent().unwrap())?;
let trap_file = std::fs::File::create(&trap_file)?;
let mut trap_file = BufWriter::new(trap_file);
match trap_compression {
TrapCompression::None => trap_writer.output(&mut trap_file),
TrapCompression::Gzip => {
let mut compressed_writer = GzEncoder::new(trap_file, flate2::Compression::fast());
trap_writer.output(&mut compressed_writer)
}
}
trap_writer.write_to_file(&trap_file, trap_compression)
}
fn scan_erb(

ruby/extractor/src/trap.rs (new file, 272 lines)
View File

@@ -0,0 +1,272 @@
use std::borrow::Cow;
use std::fmt;
use std::io::{BufWriter, Write};
use std::path::Path;
use flate2::write::GzEncoder;
pub struct Writer {
/// The accumulated trap entries
trap_output: Vec<Entry>,
/// A counter for generating fresh labels
counter: u32,
/// cache of global keys
global_keys: std::collections::HashMap<String, Label>,
}
impl Writer {
pub fn new() -> Writer {
Writer {
counter: 0,
trap_output: Vec::new(),
global_keys: std::collections::HashMap::new(),
}
}
pub fn fresh_id(&mut self) -> Label {
let label = Label(self.counter);
self.counter += 1;
self.trap_output.push(Entry::FreshId(label));
label
}
/// Gets a label that will hold the unique ID of the passed string at import time.
/// This can be used for incrementally importable TRAP files -- use globally unique
/// strings to compute a unique ID for table tuples.
///
/// Note: You probably want to make sure that the key strings that you use are disjoint
/// for disjoint column types; the standard way of doing this is to prefix (or append)
/// the column type name to the ID. Thus, you might identify methods in Java by the
/// full ID "methods_com.method.package.DeclaringClass.method(argumentList)".
pub fn global_id(&mut self, key: &str) -> (Label, bool) {
if let Some(label) = self.global_keys.get(key) {
return (*label, false);
}
let label = Label(self.counter);
self.counter += 1;
self.global_keys.insert(key.to_owned(), label);
self.trap_output
.push(Entry::MapLabelToKey(label, key.to_owned()));
(label, true)
}
pub fn add_tuple(&mut self, table_name: &str, args: Vec<Arg>) {
self.trap_output
.push(Entry::GenericTuple(table_name.to_owned(), args))
}
pub fn comment(&mut self, text: String) {
self.trap_output.push(Entry::Comment(text));
}
pub fn write_to_file(&self, path: &Path, compression: Compression) -> std::io::Result<()> {
let trap_file = std::fs::File::create(path)?;
let mut trap_file = BufWriter::new(trap_file);
match compression {
Compression::None => self.write_trap_entries(&mut trap_file),
Compression::Gzip => {
let mut compressed_writer = GzEncoder::new(trap_file, flate2::Compression::fast());
self.write_trap_entries(&mut compressed_writer)
}
}
}
fn write_trap_entries<W: Write>(&self, file: &mut W) -> std::io::Result<()> {
for trap_entry in &self.trap_output {
writeln!(file, "{}", trap_entry)?;
}
std::io::Result::Ok(())
}
}
pub enum Entry {
/// Maps the label to a fresh id, e.g. `#123=*`.
FreshId(Label),
/// Maps the label to a key, e.g. `#7=@"foo"`.
MapLabelToKey(Label, String),
/// foo_bar(arg*)
GenericTuple(String, Vec<Arg>),
Comment(String),
}
impl fmt::Display for Entry {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Entry::FreshId(label) => write!(f, "{}=*", label),
Entry::MapLabelToKey(label, key) => {
write!(f, "{}=@\"{}\"", label, key.replace("\"", "\"\""))
}
Entry::GenericTuple(name, args) => {
write!(f, "{}(", name)?;
for (index, arg) in args.iter().enumerate() {
if index > 0 {
write!(f, ",")?;
}
write!(f, "{}", arg)?;
}
write!(f, ")")
}
Entry::Comment(line) => write!(f, "// {}", line),
}
}
}
#[derive(Debug, Copy, Clone)]
// Identifiers of the form #0, #1...
pub struct Label(u32);
impl fmt::Display for Label {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "#{:x}", self.0)
}
}
// Some untyped argument to a TrapEntry.
#[derive(Debug, Clone)]
pub enum Arg {
Label(Label),
Int(usize),
String(String),
}
const MAX_STRLEN: usize = 1048576;
impl fmt::Display for Arg {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Arg::Label(x) => write!(f, "{}", x),
Arg::Int(x) => write!(f, "{}", x),
Arg::String(x) => write!(
f,
"\"{}\"",
limit_string(x, MAX_STRLEN).replace("\"", "\"\"")
),
}
}
}
pub struct Program(Vec<Entry>);
impl fmt::Display for Program {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut text = String::new();
for trap_entry in &self.0 {
text.push_str(&format!("{}\n", trap_entry));
}
write!(f, "{}", text)
}
}
pub fn full_id_for_file(normalized_path: &str) -> String {
format!("{};sourcefile", escape_key(normalized_path))
}
pub fn full_id_for_folder(normalized_path: &str) -> String {
format!("{};folder", escape_key(normalized_path))
}
/// Escapes a string for use in a TRAP key, by replacing special characters with
/// HTML entities.
fn escape_key<'a, S: Into<Cow<'a, str>>>(key: S) -> Cow<'a, str> {
fn needs_escaping(c: char) -> bool {
matches!(c, '&' | '{' | '}' | '"' | '@' | '#')
}
let key = key.into();
if key.contains(needs_escaping) {
let mut escaped = String::with_capacity(2 * key.len());
for c in key.chars() {
match c {
'&' => escaped.push_str("&amp;"),
'{' => escaped.push_str("&lbrace;"),
'}' => escaped.push_str("&rbrace;"),
'"' => escaped.push_str("&quot;"),
'@' => escaped.push_str("&commat;"),
'#' => escaped.push_str("&num;"),
_ => escaped.push(c),
}
}
Cow::Owned(escaped)
} else {
key
}
}
/// Limit the length (in bytes) of a string. If the string's length in bytes is
/// less than or equal to the limit then the entire string is returned. Otherwise
/// the string is sliced at the provided limit. If there is a multi-byte character
/// at the limit then the returned slice will be slightly shorter than the limit to
/// avoid splitting that multi-byte character.
fn limit_string(string: &str, max_size: usize) -> &str {
if string.len() <= max_size {
return string;
}
let p = string.as_bytes();
let mut index = max_size;
// We want to clip the string at [max_size]; however, the character at that position
// may span several bytes. We need to find the first byte of the character. In UTF-8
// encoded data any byte that matches the bit pattern 10XXXXXX is not a start byte.
// Therefore we decrement the index as long as there are bytes matching this pattern.
// This ensures we cut the string at the border between one character and another.
while index > 0 && (p[index] & 0b11000000) == 0b10000000 {
index -= 1;
}
&string[0..index]
}
#[derive(Clone, Copy)]
pub enum Compression {
None,
Gzip,
}
impl Compression {
pub fn from_env(var_name: &str) -> Compression {
match std::env::var(var_name) {
Ok(method) => match Compression::from_string(&method) {
Some(c) => c,
None => {
tracing::error!("Unknown compression method '{}'; using gzip.", &method);
Compression::Gzip
}
},
// Default compression method if the env var isn't set:
Err(_) => Compression::Gzip,
}
}
pub fn from_string(s: &str) -> Option<Compression> {
match s.to_lowercase().as_ref() {
"none" => Some(Compression::None),
"gzip" => Some(Compression::Gzip),
_ => None,
}
}
pub fn extension(&self) -> &str {
match self {
Compression::None => "trap",
Compression::Gzip => "trap.gz",
}
}
}
#[test]
fn limit_string_test() {
assert_eq!("hello", limit_string(&"hello world".to_owned(), 5));
assert_eq!("hi ☹", limit_string(&"hi ☹☹".to_owned(), 6));
assert_eq!("hi ", limit_string(&"hi ☹☹".to_owned(), 5));
}
#[test]
fn escape_key_test() {
assert_eq!("foo!", escape_key("foo!"));
assert_eq!("foo&lbrace;&rbrace;", escape_key("foo{}"));
assert_eq!("&lbrace;&rbrace;", escape_key("{}"));
assert_eq!("", escape_key(""));
assert_eq!("/path/to/foo.rb", escape_key("/path/to/foo.rb"));
assert_eq!(
"/path/to/foo&amp;&lbrace;&rbrace;&quot;&commat;&num;.rb",
escape_key("/path/to/foo&{}\"@#.rb")
);
}
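To tie the pieces of the new trap module together, here is a minimal usage sketch in the spirit of the calls made from extractor.rs and main.rs; the function name and the output path are illustrative assumptions, not part of the commit, and the snippet presumes it lives in the same crate as `mod trap;`:

use std::path::Path;

// Hypothetical driver showing the Writer lifecycle: allocate labels, emit
// tuples, then serialize with the chosen compression.
fn write_example_trap() -> std::io::Result<()> {
    let mut writer = trap::Writer::new();
    writer.comment("example TRAP file".to_string());

    // Global keys are cached, so the same key always resolves to the same label;
    // `fresh` is true only the first time a key is seen.
    let (file_label, fresh) = writer.global_id(&trap::full_id_for_file("/src/foo.rb"));
    if fresh {
        writer.add_tuple(
            "files",
            vec![
                trap::Arg::Label(file_label),
                trap::Arg::String("/src/foo.rb".to_string()),
            ],
        );
    }

    // In the extractor the compression comes from CODEQL_RUBY_TRAP_COMPRESSION;
    // Compression::None keeps the output human-readable here.
    writer.write_to_file(Path::new("example.trap"), trap::Compression::None)
}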

View File

@@ -12,4 +12,4 @@ node-types = { path = "../node-types" }
tracing = "0.1"
tracing-subscriber = { version = "0.3.3", features = ["env-filter"] }
tree-sitter-embedded-template = { git = "https://github.com/tree-sitter/tree-sitter-embedded-template.git", rev = "1a538da253d73f896b9f6c0c7d79cda58791ac5c" }
tree-sitter-ruby = { git = "https://github.com/tree-sitter/tree-sitter-ruby.git", rev = "5b305c3cd32db10494cedd2743de6bbe32f1a573" }
tree-sitter-ruby = { git = "https://github.com/tree-sitter/tree-sitter-ruby.git", rev = "e75d04404c9dd71ad68850d5c672b226d5e694f3" }

View File

@@ -0,0 +1,5 @@
---
category: minorAnalysis
---
- Calls to `ActiveRecord::Relation#annotate` are now recognized as `SqlExecution`s, so they are considered sinks for queries like rb/sql-injection.

View File

@@ -133,6 +133,11 @@ private Expr sqlFragmentArgument(MethodCall call) {
or
methodName = "reload" and
result = call.getKeywordArgument("lock")
or
// Calls to `annotate` can be used to add block comments to SQL queries. These are potentially vulnerable to
// SQLi if user-supplied input is passed in as an argument.
methodName = "annotate" and
result = call.getArgument(_)
)
)
}

View File

@@ -2,6 +2,7 @@ actionControllerControllerClasses
| ActiveRecord.rb:23:1:39:3 | FooController |
| ActiveRecord.rb:41:1:64:3 | BarController |
| ActiveRecord.rb:66:1:70:3 | BazController |
| ActiveRecord.rb:72:1:80:3 | AnnotatedController |
| app/controllers/comments_controller.rb:1:1:7:3 | CommentsController |
| app/controllers/foo/bars_controller.rb:3:1:39:3 | BarsController |
| app/controllers/photos_controller.rb:1:1:4:3 | PhotosController |
@@ -12,6 +13,8 @@ actionControllerActionMethods
| ActiveRecord.rb:42:3:47:5 | some_other_request_handler |
| ActiveRecord.rb:49:3:63:5 | safe_paths |
| ActiveRecord.rb:67:3:69:5 | yet_another_handler |
| ActiveRecord.rb:73:3:75:5 | index |
| ActiveRecord.rb:77:3:79:5 | unsafe_action |
| app/controllers/comments_controller.rb:2:3:3:5 | index |
| app/controllers/comments_controller.rb:5:3:6:5 | show |
| app/controllers/foo/bars_controller.rb:5:3:7:5 | index |
@@ -38,6 +41,7 @@ paramsCalls
| ActiveRecord.rb:59:12:59:17 | call to params |
| ActiveRecord.rb:62:15:62:20 | call to params |
| ActiveRecord.rb:68:21:68:26 | call to params |
| ActiveRecord.rb:78:59:78:64 | call to params |
| app/controllers/foo/bars_controller.rb:13:21:13:26 | call to params |
| app/controllers/foo/bars_controller.rb:14:10:14:15 | call to params |
| app/controllers/foo/bars_controller.rb:21:21:21:26 | call to params |
@@ -57,6 +61,7 @@ paramsSources
| ActiveRecord.rb:59:12:59:17 | call to params |
| ActiveRecord.rb:62:15:62:20 | call to params |
| ActiveRecord.rb:68:21:68:26 | call to params |
| ActiveRecord.rb:78:59:78:64 | call to params |
| app/controllers/foo/bars_controller.rb:13:21:13:26 | call to params |
| app/controllers/foo/bars_controller.rb:14:10:14:15 | call to params |
| app/controllers/foo/bars_controller.rb:21:21:21:26 | call to params |

View File

@@ -22,6 +22,7 @@ activeRecordSqlExecutionRanges
| ActiveRecord.rb:46:20:46:32 | ... + ... |
| ActiveRecord.rb:52:16:52:28 | "name #{...}" |
| ActiveRecord.rb:56:20:56:39 | "username = #{...}" |
| ActiveRecord.rb:78:27:78:76 | "this is an unsafe annotation:..." |
activeRecordModelClassMethodCalls
| ActiveRecord.rb:2:3:2:17 | call to has_many |
| ActiveRecord.rb:6:3:6:24 | call to belongs_to |
@@ -44,6 +45,8 @@ activeRecordModelClassMethodCalls
| ActiveRecord.rb:60:5:60:33 | call to find_by |
| ActiveRecord.rb:62:5:62:34 | call to find |
| ActiveRecord.rb:68:5:68:45 | call to delete_by |
| ActiveRecord.rb:74:13:74:54 | call to annotate |
| ActiveRecord.rb:78:13:78:77 | call to annotate |
potentiallyUnsafeSqlExecutingMethodCall
| ActiveRecord.rb:9:5:9:68 | call to find |
| ActiveRecord.rb:19:5:19:25 | call to destroy_by |
@@ -55,6 +58,7 @@ potentiallyUnsafeSqlExecutingMethodCall
| ActiveRecord.rb:46:5:46:33 | call to delete_by |
| ActiveRecord.rb:52:5:52:29 | call to order |
| ActiveRecord.rb:56:7:56:40 | call to find_by |
| ActiveRecord.rb:78:13:78:77 | call to annotate |
activeRecordModelInstantiations
| ActiveRecord.rb:9:5:9:68 | call to find | ActiveRecord.rb:5:1:15:3 | User |
| ActiveRecord.rb:13:5:13:40 | call to find_by | ActiveRecord.rb:1:1:3:3 | UserGroup |

View File

@@ -68,3 +68,13 @@ class BazController < BarController
Admin.delete_by(params[:admin_condition])
end
end
class AnnotatedController < ActionController::Base
def index
users = User.annotate("this is a safe annotation")
end
def unsafe_action
users = User.annotate("this is an unsafe annotation:#{params[:comment]}")
end
end

View File

@@ -137,3 +137,17 @@ class BazController < BarController
Admin.delete_by(params[:admin_condition])
end
end
class AnnotatedController < ActionController::Base
def index
name = params[:user_name]
# GOOD: string literal arguments not controlled by the user are safe for annotations
users = User.annotate("this is a safe annotation").find_by(user_name: name)
end
def unsafe_action
name = params[:user_name]
# BAD: user input passed into annotations is vulnerable to SQLi
users = User.annotate("this is an unsafe annotation:#{params[:comment]}").find_by(user_name: name)
end
end

View File

@@ -31,6 +31,8 @@ edges
| ActiveRecordInjection.rb:99:11:99:17 | ...[...] : | ActiveRecordInjection.rb:104:20:104:32 | ... + ... |
| ActiveRecordInjection.rb:137:21:137:26 | call to params : | ActiveRecordInjection.rb:137:21:137:44 | ...[...] : |
| ActiveRecordInjection.rb:137:21:137:44 | ...[...] : | ActiveRecordInjection.rb:20:22:20:30 | condition : |
| ActiveRecordInjection.rb:151:59:151:64 | call to params : | ActiveRecordInjection.rb:151:59:151:74 | ...[...] : |
| ActiveRecordInjection.rb:151:59:151:74 | ...[...] : | ActiveRecordInjection.rb:151:27:151:76 | "this is an unsafe annotation:..." |
nodes
| ActiveRecordInjection.rb:8:25:8:28 | name : | semmle.label | name : |
| ActiveRecordInjection.rb:8:31:8:34 | pass : | semmle.label | pass : |
@@ -80,6 +82,9 @@ nodes
| ActiveRecordInjection.rb:104:20:104:32 | ... + ... | semmle.label | ... + ... |
| ActiveRecordInjection.rb:137:21:137:26 | call to params : | semmle.label | call to params : |
| ActiveRecordInjection.rb:137:21:137:44 | ...[...] : | semmle.label | ...[...] : |
| ActiveRecordInjection.rb:151:27:151:76 | "this is an unsafe annotation:..." | semmle.label | "this is an unsafe annotation:..." |
| ActiveRecordInjection.rb:151:59:151:64 | call to params : | semmle.label | call to params : |
| ActiveRecordInjection.rb:151:59:151:74 | ...[...] : | semmle.label | ...[...] : |
subpaths
#select
| ActiveRecordInjection.rb:10:33:10:67 | "name='#{...}' and pass='#{...}'" | ActiveRecordInjection.rb:70:23:70:28 | call to params : | ActiveRecordInjection.rb:10:33:10:67 | "name='#{...}' and pass='#{...}'" | This SQL query depends on $@. | ActiveRecordInjection.rb:70:23:70:28 | call to params | a user-provided value |
@@ -99,3 +104,4 @@ subpaths
| ActiveRecordInjection.rb:88:18:88:35 | ...[...] | ActiveRecordInjection.rb:88:18:88:23 | call to params : | ActiveRecordInjection.rb:88:18:88:35 | ...[...] | This SQL query depends on $@. | ActiveRecordInjection.rb:88:18:88:23 | call to params | a user-provided value |
| ActiveRecordInjection.rb:92:21:92:35 | ...[...] | ActiveRecordInjection.rb:92:21:92:26 | call to params : | ActiveRecordInjection.rb:92:21:92:35 | ...[...] | This SQL query depends on $@. | ActiveRecordInjection.rb:92:21:92:26 | call to params | a user-provided value |
| ActiveRecordInjection.rb:104:20:104:32 | ... + ... | ActiveRecordInjection.rb:98:10:98:15 | call to params : | ActiveRecordInjection.rb:104:20:104:32 | ... + ... | This SQL query depends on $@. | ActiveRecordInjection.rb:98:10:98:15 | call to params | a user-provided value |
| ActiveRecordInjection.rb:151:27:151:76 | "this is an unsafe annotation:..." | ActiveRecordInjection.rb:151:59:151:64 | call to params : | ActiveRecordInjection.rb:151:27:151:76 | "this is an unsafe annotation:..." | This SQL query depends on $@. | ActiveRecordInjection.rb:151:59:151:64 | call to params | a user-provided value |

View File

@@ -41,88 +41,80 @@ class StringLengthConflationConfiguration extends DataFlow::Configuration {
}
override predicate isSink(DataFlow::Node node, string flowstate) {
// arguments to method calls...
exists(
string className, string methodName, string paramName, ClassDecl c, AbstractFunctionDecl f,
CallExpr call, int arg
AbstractFunctionDecl funcDecl, CallExpr call, string funcName, string paramName, int arg
|
(
// `NSRange.init`
className = "NSRange" and
methodName = "init(location:length:)" and
paramName = ["location", "length"]
// arguments to method calls...
exists(string className, ClassDecl c |
(
// `NSRange.init`
className = "NSRange" and
funcName = "init(location:length:)" and
paramName = ["location", "length"]
or
// `NSString.character`
className = ["NSString", "NSMutableString"] and
funcName = "character(at:)" and
paramName = "at"
or
// `NSString.substring`
className = ["NSString", "NSMutableString"] and
funcName = "substring(from:)" and
paramName = "from"
or
// `NSString.substring`
className = ["NSString", "NSMutableString"] and
funcName = "substring(to:)" and
paramName = "to"
or
// `NSMutableString.insert`
className = "NSMutableString" and
funcName = "insert(_:at:)" and
paramName = "at"
) and
c.getName() = className and
c.getAMember() = funcDecl and
call.getFunction().(ApplyExpr).getStaticTarget() = funcDecl and
flowstate = "String" // `String` length flowing into `NSString`
)
or
// `NSString.character`
className = ["NSString", "NSMutableString"] and
methodName = "character(at:)" and
paramName = "at"
// arguments to function calls...
// `NSMakeRange`
funcName = "NSMakeRange(_:_:)" and
paramName = ["loc", "len"] and
call.getStaticTarget() = funcDecl and
flowstate = "String" // `String` length flowing into `NSString`
or
// `NSString.character`
className = ["NSString", "NSMutableString"] and
methodName = "substring(from:)" and
paramName = "from"
or
// `NSString.character`
className = ["NSString", "NSMutableString"] and
methodName = "substring(to:)" and
paramName = "to"
or
// `NSMutableString.insert`
className = "NSMutableString" and
methodName = "insert(_:at:)" and
paramName = "at"
// arguments to method calls...
(
// `String.dropFirst`, `String.dropLast`, `String.removeFirst`, `String.removeLast`
funcName = ["dropFirst(_:)", "dropLast(_:)", "removeFirst(_:)", "removeLast(_:)"] and
paramName = "k"
or
// `String.prefix`, `String.suffix`
funcName = ["prefix(_:)", "suffix(_:)"] and
paramName = "maxLength"
or
// `String.Index.init`
funcName = "init(encodedOffset:)" and
paramName = "offset"
or
// `String.index`
funcName = ["index(_:offsetBy:)", "index(_:offsetBy:limitBy:)"] and
paramName = "n"
or
// `String.formIndex`
funcName = ["formIndex(_:offsetBy:)", "formIndex(_:offsetBy:limitBy:)"] and
paramName = "distance"
) and
call.getFunction().(ApplyExpr).getStaticTarget() = funcDecl and
flowstate = "NSString" // `NSString` length flowing into `String`
) and
c.getName() = className and
c.getAMember() = f and // TODO: will this even work if its defined in a parent class?
call.getFunction().(ApplyExpr).getStaticTarget() = f and
f.getName() = methodName and
f.getParam(pragma[only_bind_into](arg)).getName() = paramName and
call.getArgument(pragma[only_bind_into](arg)).getExpr() = node.asExpr() and
flowstate = "String" // `String` length flowing into `NSString`
)
or
// arguments to function calls...
exists(string funcName, string paramName, CallExpr call, int arg |
// `NSMakeRange`
funcName = "NSMakeRange(_:_:)" and
paramName = ["loc", "len"] and
call.getStaticTarget().getName() = funcName and
call.getStaticTarget().getParam(pragma[only_bind_into](arg)).getName() = paramName and
call.getArgument(pragma[only_bind_into](arg)).getExpr() = node.asExpr() and
flowstate = "String" // `String` length flowing into `NSString`
)
or
// arguments to function calls...
exists(string funcName, string paramName, CallExpr call, int arg |
(
// `String.dropFirst`, `String.dropLast`, `String.removeFirst`, `String.removeLast`
funcName = ["dropFirst(_:)", "dropLast(_:)", "removeFirst(_:)", "removeLast(_:)"] and
paramName = "k"
or
// `String.prefix`, `String.suffix`
funcName = ["prefix(_:)", "suffix(_:)"] and
paramName = "maxLength"
or
// `String.Index.init`
funcName = "init(encodedOffset:)" and
paramName = "offset"
or
// `String.index`
funcName = ["index(_:offsetBy:)", "index(_:offsetBy:limitBy:)"] and
paramName = "n"
or
// `String.formIndex`
funcName = ["formIndex(_:offsetBy:)", "formIndex(_:offsetBy:limitBy:)"] and
paramName = "distance"
) and
call.getFunction().(ApplyExpr).getStaticTarget().getName() = funcName and
call.getFunction()
.(ApplyExpr)
.getStaticTarget()
.getParam(pragma[only_bind_into](arg))
.getName() = paramName and
call.getArgument(pragma[only_bind_into](arg)).getExpr() = node.asExpr() and
flowstate = "NSString" // `NSString` length flowing into `String`
// match up `funcName`, `paramName`, `arg`, `node`.
funcDecl.getName() = funcName and
funcDecl.getParam(pragma[only_bind_into](arg)).getName() = paramName and
call.getArgument(pragma[only_bind_into](arg)).getExpr() = node.asExpr()
)
}

View File

@@ -1,4 +1,5 @@
edges
| StringLengthConflation2.swift:37:34:37:36 | .count : | StringLengthConflation2.swift:37:34:37:44 | ... call to -(_:_:) ... |
| StringLengthConflation.swift:60:47:60:50 | .length : | StringLengthConflation.swift:60:47:60:59 | ... call to /(_:_:) ... |
| StringLengthConflation.swift:66:33:66:36 | .length : | StringLengthConflation.swift:66:33:66:45 | ... call to /(_:_:) ... |
| StringLengthConflation.swift:93:28:93:31 | .length : | StringLengthConflation.swift:93:28:93:40 | ... call to -(_:_:) ... |
@@ -15,6 +16,8 @@ edges
| StringLengthConflation.swift:135:36:135:38 | .count : | StringLengthConflation.swift:135:36:135:46 | ... call to -(_:_:) ... |
| StringLengthConflation.swift:141:28:141:30 | .count : | StringLengthConflation.swift:141:28:141:38 | ... call to -(_:_:) ... |
nodes
| StringLengthConflation2.swift:37:34:37:36 | .count : | semmle.label | .count : |
| StringLengthConflation2.swift:37:34:37:44 | ... call to -(_:_:) ... | semmle.label | ... call to -(_:_:) ... |
| StringLengthConflation.swift:53:43:53:46 | .length | semmle.label | .length |
| StringLengthConflation.swift:60:47:60:50 | .length : | semmle.label | .length : |
| StringLengthConflation.swift:60:47:60:59 | ... call to /(_:_:) ... | semmle.label | ... call to /(_:_:) ... |
@@ -50,6 +53,7 @@ nodes
| StringLengthConflation.swift:141:28:141:38 | ... call to -(_:_:) ... | semmle.label | ... call to -(_:_:) ... |
subpaths
#select
| StringLengthConflation2.swift:37:34:37:44 | ... call to -(_:_:) ... | StringLengthConflation2.swift:37:34:37:36 | .count : | StringLengthConflation2.swift:37:34:37:44 | ... call to -(_:_:) ... | This String length is used in an NSString, but it may not be equivalent. |
| StringLengthConflation.swift:53:43:53:46 | .length | StringLengthConflation.swift:53:43:53:46 | .length | StringLengthConflation.swift:53:43:53:46 | .length | This NSString length is used in a String, but it may not be equivalent. |
| StringLengthConflation.swift:60:47:60:59 | ... call to /(_:_:) ... | StringLengthConflation.swift:60:47:60:50 | .length : | StringLengthConflation.swift:60:47:60:59 | ... call to /(_:_:) ... | This NSString length is used in a String, but it may not be equivalent. |
| StringLengthConflation.swift:66:33:66:45 | ... call to /(_:_:) ... | StringLengthConflation.swift:66:33:66:36 | .length : | StringLengthConflation.swift:66:33:66:45 | ... call to /(_:_:) ... | This NSString length is used in a String, but it may not be equivalent. |

View File

@@ -0,0 +1,42 @@
// This test is in a separate file because we want to test with a slightly different library
// implementation. In this version, some of the functions of `NSString` are in fact implemented
// in a base class `NSStringBase`.
// --- stubs ---
func print(_ items: Any...) {}
typealias unichar = UInt16
class NSObject
{
}
class NSStringBase : NSObject
{
func substring(from: Int) -> String { return "" }
}
class NSString : NSStringBase
{
init(string: String) { length = string.count }
func substring(to: Int) -> String { return "" }
private(set) var length: Int
}
// --- tests ---
func test(s: String) {
let ns = NSString(string: s)
let nstr1 = ns.substring(from: ns.length - 1) // GOOD
let nstr2 = ns.substring(from: s.count - 1) // BAD: String length used in NSString [NOT DETECTED]
let nstr3 = ns.substring(to: ns.length - 1) // GOOD
let nstr4 = ns.substring(to: s.count - 1) // BAD: String length used in NSString
print("substrings '\(nstr1)' '\(nstr2)' / '\(nstr3)' '\(nstr4)'")
}
// `begin :thumbsup: end`, with thumbs up emoji and skin tone modifier
test(s: "begin \u{0001F44D}\u{0001F3FF} end")