Merge branch 'main' into experimental-strong-params

2026-04-27 01:35:13 +02:00 · 2022-07-22 20:41:33 -04:00
parent 0c0ba925a7 0a35f97074
commit 6cfde70898
28 changed files with 1298 additions and 814 deletions
--- a/cpp/ql/lib/change-notes/2022-06-24-unique-variable.md
+++ b/cpp/ql/lib/change-notes/2022-06-24-unique-variable.md
@@ -0,0 +1,4 @@
+---
+category: fix
+---
+* Under certain circumstances, a variable declaration that is not also a definition could be associated with a `Variable` that did not have the definition as a `VariableDeclarationEntry`. This is now fixed, and a unique `Variable` now exists that has both the declaration and the definition as a `VariableDeclarationEntry`.
--- a/cpp/ql/lib/semmle/code/cpp/Element.qll
+++ b/cpp/ql/lib/semmle/code/cpp/Element.qll
@@ -6,6 +6,7 @@
 import semmle.code.cpp.Location
 private import semmle.code.cpp.Enclosing
 private import semmle.code.cpp.internal.ResolveClass
+private import semmle.code.cpp.internal.ResolveGlobalVariable

 /**
 * Get the `Element` that represents this `@element`.
@@ -28,9 +29,12 @@ Element mkElement(@element e) { unresolveElement(result) = e }
 pragma[inline]
@element unresolveElement(Element e) {
  not result instanceof @usertype and
+  not result instanceof @variable and
  result = e
  or
  e = resolveClass(result)
+  or
+  e = resolveGlobalVariable(result)
 }

 /**
--- a/cpp/ql/lib/semmle/code/cpp/Variable.qll
+++ b/cpp/ql/lib/semmle/code/cpp/Variable.qll
@@ -6,6 +6,7 @@ import semmle.code.cpp.Element
 import semmle.code.cpp.exprs.Access
 import semmle.code.cpp.Initializer
 private import semmle.code.cpp.internal.ResolveClass
+private import semmle.code.cpp.internal.ResolveGlobalVariable

 /**
 * A C/C++ variable. For example, in the following code there are four
@@ -32,6 +33,8 @@ private import semmle.code.cpp.internal.ResolveClass
 * can have multiple declarations.
 */
 class Variable extends Declaration, @variable {
+  Variable() { isVariable(underlyingElement(this)) }
+
  override string getAPrimaryQlClass() { result = "Variable" }

  /** Gets the initializer of this variable, if any. */
--- a/cpp/ql/lib/semmle/code/cpp/internal/ResolveGlobalVariable.qll
+++ b/cpp/ql/lib/semmle/code/cpp/internal/ResolveGlobalVariable.qll
@@ -0,0 +1,60 @@
+private predicate hasDefinition(@globalvariable g) {
+  exists(@var_decl vd | var_decls(vd, g, _, _, _) | var_def(vd))
+}
+
+pragma[noinline]
+private predicate onlyOneCompleteGlobalVariableExistsWithMangledName(@mangledname name) {
+  strictcount(@globalvariable g | hasDefinition(g) and mangled_name(g, name)) = 1
+}
+
+/** Holds if `g` is the only global variable with mangled name `name` that has a definition. */
+pragma[noinline]
+private predicate isGlobalWithMangledNameAndWithDefinition(@mangledname name, @globalvariable g) {
+  hasDefinition(g) and
+  mangled_name(g, name) and
+  onlyOneCompleteGlobalVariableExistsWithMangledName(name)
+}
+
+/** Holds if `g` is a global variable with mangled name `name` that has no definition. */
+pragma[noinline]
+private predicate isGlobalWithMangledNameAndWithoutDefinition(@mangledname name, @globalvariable g) {
+  not hasDefinition(g) and
+  mangled_name(g, name)
+}
+
+/**
+ * Holds if `incomplete` is a global variable without a definition, and `complete` is the
+ * unique global variable with the same mangled name that does have a definition.
+ */
+private predicate hasTwinWithDefinition(@globalvariable incomplete, @globalvariable complete) {
+  exists(@mangledname name |
+    not variable_instantiation(incomplete, complete) and
+    isGlobalWithMangledNameAndWithoutDefinition(name, incomplete) and
+    isGlobalWithMangledNameAndWithDefinition(name, complete)
+  )
+}
+
+import Cached
+
+cached
+private module Cached {
+  /**
+   * If `v` is a global variable without a definition, and there exists a unique
+   * global variable with the same mangled name that does have a definition, then
+   * the result is that unique global variable. Otherwise, the result is `v`.
+   */
+  cached
+  @variable resolveGlobalVariable(@variable v) {
+    hasTwinWithDefinition(v, result)
+    or
+    not hasTwinWithDefinition(v, _) and
+    result = v
+  }
+
+  cached
+  predicate isVariable(@variable v) {
+    not v instanceof @globalvariable
+    or
+    v = resolveGlobalVariable(_)
+  }
+}
--- a/Management/ReturnStackAllocatedMemory.ql
+++ b/Management/ReturnStackAllocatedMemory.ql
@@ -74,13 +74,12 @@ class ReturnStackAllocatedMemoryConfig extends MustFlowConfiguration {

 from
  MustFlowPathNode source, MustFlowPathNode sink, VariableAddressInstruction var,
-  ReturnStackAllocatedMemoryConfig conf, Function f
+  ReturnStackAllocatedMemoryConfig conf
 where
-  conf.hasFlowPath(source, sink) and
+  conf.hasFlowPath(pragma[only_bind_into](source), pragma[only_bind_into](sink)) and
  source.getNode().asInstruction() = var and
  // Only raise an alert if we're returning from the _same_ callable as the on that
  // declared the stack variable.
-  var.getEnclosingFunction() = pragma[only_bind_into](f) and
-  sink.getNode().getEnclosingCallable() = pragma[only_bind_into](f)
+  var.getEnclosingFunction() = sink.getNode().getEnclosingCallable()
 select sink.getNode(), source, sink, "May return stack-allocated memory from $@.", var.getAst(),
  var.getAst().toString()
--- a/cpp/ql/src/Security/CWE/CWE-078/ExecTainted.ql
+++ b/cpp/ql/src/Security/CWE/CWE-078/ExecTainted.ql
@@ -77,7 +77,7 @@ class ExecState extends DataFlow::FlowState {
  ExecState() {
    this =
      "ExecState (" + fst.getLocation() + " | " + fst + ", " + snd.getLocation() + " | " + snd + ")" and
-    interestingConcatenation(fst, snd)
+    interestingConcatenation(pragma[only_bind_into](fst), pragma[only_bind_into](snd))
  }

  DataFlow::Node getFstNode() { result = fst }
--- a/cpp/ql/test/library-tests/variables/global/variables.expected
+++ b/cpp/ql/test/library-tests/variables/global/variables.expected
@@ -4,11 +4,7 @@
 | c.c:6:5:6:6 | ls | array of 4 {int} | 1 |
 | c.c:8:5:8:7 | iss | array of 4 {array of 2 {int}} | 1 |
 | c.c:12:11:12:11 | i | typedef {int} as "int_alias" | 1 |
-| c.h:4:12:4:13 | ks | array of {int} | 1 |
-| c.h:8:12:8:14 | iss | array of {array of 2 {int}} | 1 |
-| c.h:10:12:10:12 | i | int | 1 |
 | d.cpp:3:7:3:8 | xs | array of {int} | 1 |
-| d.h:3:14:3:15 | xs | array of 2 {int} | 1 |
 | file://:0:0:0:0 | (unnamed parameter 0) | reference to {const {struct __va_list_tag}} | 1 |
 | file://:0:0:0:0 | (unnamed parameter 0) | rvalue reference to {struct __va_list_tag} | 1 |
 | file://:0:0:0:0 | fp_offset | unsigned int | 1 |
--- a/csharp/tools/tracing-config.lua
+++ b/csharp/tools/tracing-config.lua
@@ -2,7 +2,54 @@ function RegisterExtractorPack(id)
    local extractor = GetPlatformToolsDirectory() ..
                          'Semmle.Extraction.CSharp.Driver'
    if OperatingSystem == 'windows' then extractor = extractor .. '.exe' end
+
+    function DotnetMatcherBuild(compilerName, compilerPath, compilerArguments,
+                                _languageId)
+        if compilerName ~= 'dotnet' and compilerName ~= 'dotnet.exe' then
+            return nil
+        end
+
+        -- The dotnet CLI has the following usage instructions:
+        --   dotnet [sdk-options] [command] [command-options] [arguments]
+        -- We are interested in `dotnet build`, which has the following usage instructions:
+        --   dotnet [options] build [<PROJECT | SOLUTION>...]
+        -- For now, the command line is parsed as follows:
+        -- Everything that starts with `-` (or `/`) is ignored.
+        -- The first non-option argument is treated as the command.
+        -- If that command is `build`, we append `/p:UseSharedCompilation=false` to the
+        -- command line; otherwise we do nothing.
+        local match = false
+        local argv = compilerArguments.argv
+        if OperatingSystem == 'windows' then
+            -- NOTE: this split is assumed to match the escaping rules `dotnet` applies to
+            -- its command line arguments, or at least to be close enough to them.
+            argv =
+                NativeArgumentsToArgv(compilerArguments.nativeArgumentPointer)
+        end
+        for i, arg in ipairs(argv) do
+            -- dotnet options start with either - or / (both are legal)
+            local firstCharacter = string.sub(arg, 1, 1)
+            if not (firstCharacter == '-') and not (firstCharacter == '/') then
+                Log(1, 'Dotnet subcommand detected: %s', arg)
+                if arg == 'build' then match = true end
+                break
+            end
+        end
+        if match then
+            return {
+                order = ORDER_REPLACE,
+                invocation = BuildExtractorInvocation(id, compilerPath,
+                                                      compilerPath,
+                                                      compilerArguments, nil, {
+                    '/p:UseSharedCompilation=false'
+                })
+            }
+        end
+        return nil
+    end
+
    local windowsMatchers = {
+        DotnetMatcherBuild,
        CreatePatternMatcher({'^dotnet%.exe$'}, MatchCompilerName, extractor, {
            prepend = {'--dotnetexec', '--cil'},
            order = ORDER_BEFORE
@@ -10,22 +57,21 @@ function RegisterExtractorPack(id)
        CreatePatternMatcher({'^csc.*%.exe$'}, MatchCompilerName, extractor, {
            prepend = {'--compiler', '"${compiler}"', '--cil'},
            order = ORDER_BEFORE
-
        }),
        CreatePatternMatcher({'^fakes.*%.exe$', 'moles.*%.exe'},
                             MatchCompilerName, nil, {trace = false})
    }
    local posixMatchers = {
-        CreatePatternMatcher({'^mcs%.exe$', '^csc%.exe$'}, MatchCompilerName,
-                             extractor, {
-            prepend = {'--compiler', '"${compiler}"', '--cil'},
-            order = ORDER_BEFORE
-
-        }),
+        DotnetMatcherBuild,
        CreatePatternMatcher({'^mono', '^dotnet$'}, MatchCompilerName,
                             extractor, {
            prepend = {'--dotnetexec', '--cil'},
            order = ORDER_BEFORE
+        }),
+        CreatePatternMatcher({'^mcs%.exe$', '^csc%.exe$'}, MatchCompilerName,
+                             extractor, {
+            prepend = {'--compiler', '"${compiler}"', '--cil'},
+            order = ORDER_BEFORE
        }), function(compilerName, compilerPath, compilerArguments, _languageId)
            if MatchCompilerName('^msbuild$', compilerName, compilerPath,
                                 compilerArguments) or
@@ -49,7 +95,6 @@ function RegisterExtractorPack(id)
    else
        return posixMatchers
    end
-
 end

 -- Return a list of minimum supported versions of the configuration file format
--- a/ql/extractor/src/extractor.rs
+++ b/ql/extractor/src/extractor.rs
@@ -1,161 +1,112 @@
+use crate::trap;
 use node_types::{EntryKind, Field, NodeTypeMap, Storage, TypeName};
-use std::borrow::Cow;
 use std::collections::BTreeMap as Map;
 use std::collections::BTreeSet as Set;
 use std::fmt;
-use std::io::Write;
 use std::path::Path;

 use tracing::{error, info, span, Level};
 use tree_sitter::{Language, Node, Parser, Range, Tree};

-pub struct TrapWriter {
-    /// The accumulated trap entries
-    trap_output: Vec<TrapEntry>,
-    /// A counter for generating fresh labels
-    counter: u32,
-    /// cache of global keys
-    global_keys: std::collections::HashMap<String, Label>,
+pub fn populate_file(writer: &mut trap::Writer, absolute_path: &Path) -> trap::Label {
+    let (file_label, fresh) =
+        writer.global_id(&trap::full_id_for_file(&normalize_path(absolute_path)));
+    if fresh {
+        writer.add_tuple(
+            "files",
+            vec![
+                trap::Arg::Label(file_label),
+                trap::Arg::String(normalize_path(absolute_path)),
+            ],
+        );
+        populate_parent_folders(writer, file_label, absolute_path.parent());
+    }
+    file_label
 }

-pub fn new_trap_writer() -> TrapWriter {
-    TrapWriter {
-        counter: 0,
-        trap_output: Vec::new(),
-        global_keys: std::collections::HashMap::new(),
+fn populate_empty_file(writer: &mut trap::Writer) -> trap::Label {
+    let (file_label, fresh) = writer.global_id("empty;sourcefile");
+    if fresh {
+        writer.add_tuple(
+            "files",
+            vec![
+                trap::Arg::Label(file_label),
+                trap::Arg::String("".to_string()),
+            ],
+        );
    }
+    file_label
 }

-impl TrapWriter {
-    ///  Gets a label that will hold the unique ID of the passed string at import time.
-    ///  This can be used for incrementally importable TRAP files -- use globally unique
-    ///  strings to compute a unique ID for table tuples.
-    ///
-    ///  Note: You probably want to make sure that the key strings that you use are disjoint
-    ///  for disjoint column types; the standard way of doing this is to prefix (or append)
-    ///  the column type name to the ID. Thus, you might identify methods in Java by the
-    ///  full ID "methods_com.method.package.DeclaringClass.method(argumentList)".
+pub fn populate_empty_location(writer: &mut trap::Writer) {
+    let file_label = populate_empty_file(writer);
+    location(writer, file_label, 0, 0, 0, 0);
+}

-    fn fresh_id(&mut self) -> Label {
-        let label = Label(self.counter);
-        self.counter += 1;
-        self.trap_output.push(TrapEntry::FreshId(label));
-        label
-    }
-
-    fn global_id(&mut self, key: &str) -> (Label, bool) {
-        if let Some(label) = self.global_keys.get(key) {
-            return (*label, false);
-        }
-        let label = Label(self.counter);
-        self.counter += 1;
-        self.global_keys.insert(key.to_owned(), label);
-        self.trap_output
-            .push(TrapEntry::MapLabelToKey(label, key.to_owned()));
-        (label, true)
-    }
-
-    fn add_tuple(&mut self, table_name: &str, args: Vec<Arg>) {
-        self.trap_output
-            .push(TrapEntry::GenericTuple(table_name.to_owned(), args))
-    }
-
-    fn populate_file(&mut self, absolute_path: &Path) -> Label {
-        let (file_label, fresh) = self.global_id(&full_id_for_file(absolute_path));
-        if fresh {
-            self.add_tuple(
-                "files",
-                vec![
-                    Arg::Label(file_label),
-                    Arg::String(normalize_path(absolute_path)),
-                ],
-            );
-            self.populate_parent_folders(file_label, absolute_path.parent());
-        }
-        file_label
-    }
-
-    fn populate_empty_file(&mut self) -> Label {
-        let (file_label, fresh) = self.global_id("empty;sourcefile");
-        if fresh {
-            self.add_tuple(
-                "files",
-                vec![Arg::Label(file_label), Arg::String("".to_string())],
-            );
-        }
-        file_label
-    }
-
-    pub fn populate_empty_location(&mut self) {
-        let file_label = self.populate_empty_file();
-        self.location(file_label, 0, 0, 0, 0);
-    }
-
-    fn populate_parent_folders(&mut self, child_label: Label, path: Option<&Path>) {
-        let mut path = path;
-        let mut child_label = child_label;
-        loop {
-            match path {
-                None => break,
-                Some(folder) => {
-                    let (folder_label, fresh) = self.global_id(&full_id_for_folder(folder));
-                    self.add_tuple(
-                        "containerparent",
-                        vec![Arg::Label(folder_label), Arg::Label(child_label)],
+pub fn populate_parent_folders(
+    writer: &mut trap::Writer,
+    child_label: trap::Label,
+    path: Option<&Path>,
+) {
+    let mut path = path;
+    let mut child_label = child_label;
+    loop {
+        match path {
+            None => break,
+            Some(folder) => {
+                let (folder_label, fresh) =
+                    writer.global_id(&trap::full_id_for_folder(&normalize_path(folder)));
+                writer.add_tuple(
+                    "containerparent",
+                    vec![
+                        trap::Arg::Label(folder_label),
+                        trap::Arg::Label(child_label),
+                    ],
+                );
+                if fresh {
+                    writer.add_tuple(
+                        "folders",
+                        vec![
+                            trap::Arg::Label(folder_label),
+                            trap::Arg::String(normalize_path(folder)),
+                        ],
                    );
-                    if fresh {
-                        self.add_tuple(
-                            "folders",
-                            vec![
-                                Arg::Label(folder_label),
-                                Arg::String(normalize_path(folder)),
-                            ],
-                        );
-                        path = folder.parent();
-                        child_label = folder_label;
-                    } else {
-                        break;
-                    }
+                    path = folder.parent();
+                    child_label = folder_label;
+                } else {
+                    break;
                }
            }
        }
    }
+}

-    fn location(
-        &mut self,
-        file_label: Label,
-        start_line: usize,
-        start_column: usize,
-        end_line: usize,
-        end_column: usize,
-    ) -> Label {
-        let (loc_label, fresh) = self.global_id(&format!(
-            "loc,{{{}}},{},{},{},{}",
-            file_label, start_line, start_column, end_line, end_column
-        ));
-        if fresh {
-            self.add_tuple(
-                "locations_default",
-                vec![
-                    Arg::Label(loc_label),
-                    Arg::Label(file_label),
-                    Arg::Int(start_line),
-                    Arg::Int(start_column),
-                    Arg::Int(end_line),
-                    Arg::Int(end_column),
-                ],
-            );
-        }
-        loc_label
-    }
-
-    fn comment(&mut self, text: String) {
-        self.trap_output.push(TrapEntry::Comment(text));
-    }
-
-    pub fn output(self, writer: &mut dyn Write) -> std::io::Result<()> {
-        write!(writer, "{}", Program(self.trap_output))
+fn location(
+    writer: &mut trap::Writer,
+    file_label: trap::Label,
+    start_line: usize,
+    start_column: usize,
+    end_line: usize,
+    end_column: usize,
+) -> trap::Label {
+    let (loc_label, fresh) = writer.global_id(&format!(
+        "loc,{{{}}},{},{},{},{}",
+        file_label, start_line, start_column, end_line, end_column
+    ));
+    if fresh {
+        writer.add_tuple(
+            "locations_default",
+            vec![
+                trap::Arg::Label(loc_label),
+                trap::Arg::Label(file_label),
+                trap::Arg::Int(start_line),
+                trap::Arg::Int(start_column),
+                trap::Arg::Int(end_line),
+                trap::Arg::Int(end_column),
+            ],
+        );
    }
+    loc_label
 }

 /// Extracts the source file at `path`, which is assumed to be canonicalized.
@@ -163,71 +114,43 @@ pub fn extract(
    language: Language,
    language_prefix: &str,
    schema: &NodeTypeMap,
-    trap_writer: &mut TrapWriter,
+    trap_writer: &mut trap::Writer,
    path: &Path,
    source: &[u8],
    ranges: &[Range],
 ) -> std::io::Result<()> {
+    let path_str = format!("{}", path.display());
    let span = span!(
        Level::TRACE,
        "extract",
-        file = %path.display()
+        file = %path_str
    );

    let _enter = span.enter();

-    info!("extracting: {}", path.display());
+    info!("extracting: {}", path_str);

    let mut parser = Parser::new();
    parser.set_language(language).unwrap();
    parser.set_included_ranges(ranges).unwrap();
    let tree = parser.parse(&source, None).expect("Failed to parse file");
-    trap_writer.comment(format!("Auto-generated TRAP file for {}", path.display()));
-    let file_label = &trap_writer.populate_file(path);
-    let mut visitor = Visitor {
+    trap_writer.comment(format!("Auto-generated TRAP file for {}", path_str));
+    let file_label = populate_file(trap_writer, path);
+    let mut visitor = Visitor::new(
        source,
        trap_writer,
        // TODO: should we handle path strings that are not valid UTF8 better?
-        path: format!("{}", path.display()),
-        file_label: *file_label,
-        toplevel_child_counter: 0,
-        stack: Vec::new(),
+        &path_str,
+        file_label,
        language_prefix,
        schema,
-    };
+    );
    traverse(&tree, &mut visitor);

    parser.reset();
    Ok(())
 }

-/// Escapes a string for use in a TRAP key, by replacing special characters with
-/// HTML entities.
-fn escape_key<'a, S: Into<Cow<'a, str>>>(key: S) -> Cow<'a, str> {
-    fn needs_escaping(c: char) -> bool {
-        matches!(c, '&' | '{' | '}' | '"' | '@' | '#')
-    }
-
-    let key = key.into();
-    if key.contains(needs_escaping) {
-        let mut escaped = String::with_capacity(2 * key.len());
-        for c in key.chars() {
-            match c {
-                '&' => escaped.push_str("&amp;"),
-                '{' => escaped.push_str("&lbrace;"),
-                '}' => escaped.push_str("&rbrace;"),
-                '"' => escaped.push_str("&quot;"),
-                '@' => escaped.push_str("&commat;"),
-                '#' => escaped.push_str("&num;"),
-                _ => escaped.push(c),
-            }
-        }
-        Cow::Owned(escaped)
-    } else {
-        key
-    }
-}
-
 /// Normalizes the path according the common CodeQL specification. Assumes that
 /// `path` has already been canonicalized using `std::fs::canonicalize`.
 fn normalize_path(path: &Path) -> String {
@@ -267,34 +190,28 @@ fn normalize_path(path: &Path) -> String {
    }
 }

-fn full_id_for_file(path: &Path) -> String {
-    format!("{};sourcefile", escape_key(&normalize_path(path)))
-}
-
-fn full_id_for_folder(path: &Path) -> String {
-    format!("{};folder", escape_key(&normalize_path(path)))
-}
-
 struct ChildNode {
    field_name: Option<&'static str>,
-    label: Label,
+    label: trap::Label,
    type_name: TypeName,
 }

 struct Visitor<'a> {
    /// The file path of the source code (as string)
-    path: String,
+    path: &'a str,
    /// The label to use whenever we need to refer to the `@file` entity of this
    /// source file.
-    file_label: Label,
+    file_label: trap::Label,
    /// The source code as a UTF-8 byte array
    source: &'a [u8],
-    /// A TrapWriter to accumulate trap entries
-    trap_writer: &'a mut TrapWriter,
+    /// A trap::Writer to accumulate trap entries
+    trap_writer: &'a mut trap::Writer,
    /// A counter for top-level child nodes
    toplevel_child_counter: usize,
-    /// Language prefix
-    language_prefix: &'a str,
+    /// Language-specific name of the AST info table
+    ast_node_info_table_name: String,
+    /// Language-specific name of the tokeninfo table
+    tokeninfo_table_name: String,
    /// A lookup table from type name to node types
    schema: &'a NodeTypeMap,
    /// A stack for gathering information from child nodes. Whenever a node is
@@ -303,27 +220,48 @@ struct Visitor<'a> {
    /// node the list containing the child data is popped from the stack and
    /// matched against the dbscheme for the node. If the expectations are met
    /// the corresponding row definitions are added to the trap_output.
-    stack: Vec<(Label, usize, Vec<ChildNode>)>,
+    stack: Vec<(trap::Label, usize, Vec<ChildNode>)>,
 }

-impl Visitor<'_> {
+impl<'a> Visitor<'a> {
+    fn new(
+        source: &'a [u8],
+        trap_writer: &'a mut trap::Writer,
+        path: &'a str,
+        file_label: trap::Label,
+        language_prefix: &str,
+        schema: &'a NodeTypeMap,
+    ) -> Visitor<'a> {
+        Visitor {
+            path,
+            file_label,
+            source,
+            trap_writer,
+            toplevel_child_counter: 0,
+            ast_node_info_table_name: format!("{}_ast_node_info", language_prefix),
+            tokeninfo_table_name: format!("{}_tokeninfo", language_prefix),
+            schema,
+            stack: Vec::new(),
+        }
+    }
+
    fn record_parse_error(
        &mut self,
        error_message: String,
        full_error_message: String,
-        loc: Label,
+        loc: trap::Label,
    ) {
        error!("{}", full_error_message);
        let id = self.trap_writer.fresh_id();
        self.trap_writer.add_tuple(
            "diagnostics",
            vec![
-                Arg::Label(id),
-                Arg::Int(40), // severity 40 = error
-                Arg::String("parse_error".to_string()),
-                Arg::String(error_message),
-                Arg::String(full_error_message),
-                Arg::Label(loc),
+                trap::Arg::Label(id),
+                trap::Arg::Int(40), // severity 40 = error
+                trap::Arg::String("parse_error".to_string()),
+                trap::Arg::String(error_message),
+                trap::Arg::String(full_error_message),
+                trap::Arg::Label(loc),
            ],
        );
    }
@@ -335,7 +273,8 @@ impl Visitor<'_> {
        node: Node,
    ) {
        let (start_line, start_column, end_line, end_column) = location_for(self.source, node);
-        let loc = self.trap_writer.location(
+        let loc = location(
+            self.trap_writer,
            self.file_label,
            start_line,
            start_column,
@@ -374,7 +313,8 @@ impl Visitor<'_> {
        }
        let (id, _, child_nodes) = self.stack.pop().expect("Vistor: empty stack");
        let (start_line, start_column, end_line, end_column) = location_for(self.source, node);
-        let loc = self.trap_writer.location(
+        let loc = location(
+            self.trap_writer,
            self.file_label,
            start_line,
            start_column,
@@ -402,19 +342,19 @@ impl Visitor<'_> {
        match &table.kind {
            EntryKind::Token { kind_id, .. } => {
                self.trap_writer.add_tuple(
-                    &format!("{}_ast_node_info", self.language_prefix),
+                    &self.ast_node_info_table_name,
                    vec![
-                        Arg::Label(id),
-                        Arg::Label(parent_id),
-                        Arg::Int(parent_index),
-                        Arg::Label(loc),
+                        trap::Arg::Label(id),
+                        trap::Arg::Label(parent_id),
+                        trap::Arg::Int(parent_index),
+                        trap::Arg::Label(loc),
                    ],
                );
                self.trap_writer.add_tuple(
-                    &format!("{}_tokeninfo", self.language_prefix),
+                    &self.tokeninfo_table_name,
                    vec![
-                        Arg::Label(id),
-                        Arg::Int(*kind_id),
+                        trap::Arg::Label(id),
+                        trap::Arg::Int(*kind_id),
                        sliced_source_arg(self.source, node),
                    ],
                );
@@ -425,15 +365,15 @@ impl Visitor<'_> {
            } => {
                if let Some(args) = self.complex_node(&node, fields, &child_nodes, id) {
                    self.trap_writer.add_tuple(
-                        &format!("{}_ast_node_info", self.language_prefix),
+                        &self.ast_node_info_table_name,
                        vec![
-                            Arg::Label(id),
-                            Arg::Label(parent_id),
-                            Arg::Int(parent_index),
-                            Arg::Label(loc),
+                            trap::Arg::Label(id),
+                            trap::Arg::Label(parent_id),
+                            trap::Arg::Int(parent_index),
+                            trap::Arg::Label(loc),
                        ],
                    );
-                    let mut all_args = vec![Arg::Label(id)];
+                    let mut all_args = vec![trap::Arg::Label(id)];
                    all_args.extend(args);
                    self.trap_writer.add_tuple(table_name, all_args);
                }
@@ -472,9 +412,9 @@ impl Visitor<'_> {
        node: &Node,
        fields: &[Field],
        child_nodes: &[ChildNode],
-        parent_id: Label,
-    ) -> Option<Vec<Arg>> {
-        let mut map: Map<&Option<String>, (&Field, Vec<Arg>)> = Map::new();
+        parent_id: trap::Label,
+    ) -> Option<Vec<trap::Arg>> {
+        let mut map: Map<&Option<String>, (&Field, Vec<trap::Arg>)> = Map::new();
        for field in fields {
            map.insert(&field.name, (field, Vec::new()));
        }
@@ -488,9 +428,9 @@ impl Visitor<'_> {
                    {
                        // We can safely unwrap because type_matches checks the key is in the map.
                        let (int_value, _) = int_mapping.get(&child_node.type_name.kind).unwrap();
-                        values.push(Arg::Int(*int_value));
+                        values.push(trap::Arg::Int(*int_value));
                    } else {
-                        values.push(Arg::Label(child_node.label));
+                        values.push(trap::Arg::Label(child_node.label));
                    }
                } else if field.name.is_some() {
                    let error_message = format!(
@@ -569,9 +509,9 @@ impl Visitor<'_> {
                            );
                            break;
                        }
-                        let mut args = vec![Arg::Label(parent_id)];
+                        let mut args = vec![trap::Arg::Label(parent_id)];
                        if *has_index {
-                            args.push(Arg::Int(index))
+                            args.push(trap::Arg::Int(index))
                        }
                        args.push(child_value.clone());
                        self.trap_writer.add_tuple(table_name, args);
@@ -625,9 +565,9 @@ impl Visitor<'_> {
 }

 // Emit a slice of a source file as an Arg.
-fn sliced_source_arg(source: &[u8], n: Node) -> Arg {
+fn sliced_source_arg(source: &[u8], n: Node) -> trap::Arg {
    let range = n.byte_range();
-    Arg::String(String::from_utf8_lossy(&source[range.start..range.end]).into_owned())
+    trap::Arg::String(String::from_utf8_lossy(&source[range.start..range.end]).into_owned())
 }

 // Emit a pair of `TrapEntry`s for the provided node, appropriately calibrated.
@@ -699,59 +639,6 @@ fn traverse(tree: &Tree, visitor: &mut Visitor) {
    }
 }

-pub struct Program(Vec<TrapEntry>);
-
-impl fmt::Display for Program {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let mut text = String::new();
-        for trap_entry in &self.0 {
-            text.push_str(&format!("{}\n", trap_entry));
-        }
-        write!(f, "{}", text)
-    }
-}
-
-enum TrapEntry {
-    /// Maps the label to a fresh id, e.g. `#123=*`.
-    FreshId(Label),
-    /// Maps the label to a key, e.g. `#7=@"foo"`.
-    MapLabelToKey(Label, String),
-    /// foo_bar(arg*)
-    GenericTuple(String, Vec<Arg>),
-    Comment(String),
-}
-impl fmt::Display for TrapEntry {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        match self {
-            TrapEntry::FreshId(label) => write!(f, "{}=*", label),
-            TrapEntry::MapLabelToKey(label, key) => {
-                write!(f, "{}=@\"{}\"", label, key.replace("\"", "\"\""))
-            }
-            TrapEntry::GenericTuple(name, args) => {
-                write!(f, "{}(", name)?;
-                for (index, arg) in args.iter().enumerate() {
-                    if index > 0 {
-                        write!(f, ",")?;
-                    }
-                    write!(f, "{}", arg)?;
-                }
-                write!(f, ")")
-            }
-            TrapEntry::Comment(line) => write!(f, "// {}", line),
-        }
-    }
-}
-
-#[derive(Debug, Copy, Clone)]
-// Identifiers of the form #0, #1...
-struct Label(u32);
-
-impl fmt::Display for Label {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "#{:x}", self.0)
-    }
-}
-
 // Numeric indices.
 #[derive(Debug, Copy, Clone)]
 struct Index(usize);
@@ -761,69 +648,3 @@ impl fmt::Display for Index {
        write!(f, "{}", self.0)
    }
 }
-
-// Some untyped argument to a TrapEntry.
-#[derive(Debug, Clone)]
-enum Arg {
-    Label(Label),
-    Int(usize),
-    String(String),
-}
-
-const MAX_STRLEN: usize = 1048576;
-
-impl fmt::Display for Arg {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        match self {
-            Arg::Label(x) => write!(f, "{}", x),
-            Arg::Int(x) => write!(f, "{}", x),
-            Arg::String(x) => write!(
-                f,
-                "\"{}\"",
-                limit_string(x, MAX_STRLEN).replace("\"", "\"\"")
-            ),
-        }
-    }
-}
-
-/// Limit the length (in bytes) of a string. If the string's length in bytes is
-/// less than or equal to the limit then the entire string is returned. Otherwise
-/// the string is sliced at the provided limit. If there is a multi-byte character
-/// at the limit then the returned slice will be slightly shorter than the limit to
-/// avoid splitting that multi-byte character.
-fn limit_string(string: &str, max_size: usize) -> &str {
-    if string.len() <= max_size {
-        return string;
-    }
-    let p = string.as_bytes();
-    let mut index = max_size;
-    // We want to clip the string at [max_size]; however, the character at that position
-    // may span several bytes. We need to find the first byte of the character. In UTF-8
-    // encoded data any byte that matches the bit pattern 10XXXXXX is not a start byte.
-    // Therefore we decrement the index as long as there are bytes matching this pattern.
-    // This ensures we cut the string at the border between one character and another.
-    while index > 0 && (p[index] & 0b11000000) == 0b10000000 {
-        index -= 1;
-    }
-    &string[0..index]
-}
-
-#[test]
-fn limit_string_test() {
-    assert_eq!("hello", limit_string(&"hello world".to_owned(), 5));
-    assert_eq!("hi ☹", limit_string(&"hi ☹☹".to_owned(), 6));
-    assert_eq!("hi ", limit_string(&"hi ☹☹".to_owned(), 5));
-}
-
-#[test]
-fn escape_key_test() {
-    assert_eq!("foo!", escape_key("foo!"));
-    assert_eq!("foo&lbrace;&rbrace;", escape_key("foo{}"));
-    assert_eq!("&lbrace;&rbrace;", escape_key("{}"));
-    assert_eq!("", escape_key(""));
-    assert_eq!("/path/to/foo.rb", escape_key("/path/to/foo.rb"));
-    assert_eq!(
-        "/path/to/foo&amp;&lbrace;&rbrace;&quot;&commat;&num;.rb",
-        escape_key("/path/to/foo&{}\"@#.rb")
-    );
-}
--- a/ql/extractor/src/main.rs
+++ b/ql/extractor/src/main.rs
@@ -1,49 +1,13 @@
 mod extractor;
+mod trap;

 extern crate num_cpus;

-use flate2::write::GzEncoder;
 use rayon::prelude::*;
 use std::fs;
-use std::io::{BufRead, BufWriter};
+use std::io::BufRead;
 use std::path::{Path, PathBuf};

-enum TrapCompression {
-    None,
-    Gzip,
-}
-
-impl TrapCompression {
-    fn from_env() -> TrapCompression {
-        match std::env::var("CODEQL_QL_TRAP_COMPRESSION") {
-            Ok(method) => match TrapCompression::from_string(&method) {
-                Some(c) => c,
-                None => {
-                    tracing::error!("Unknown compression method '{}'; using gzip.", &method);
-                    TrapCompression::Gzip
-                }
-            },
-            // Default compression method if the env var isn't set:
-            Err(_) => TrapCompression::Gzip,
-        }
-    }
-
-    fn from_string(s: &str) -> Option<TrapCompression> {
-        match s.to_lowercase().as_ref() {
-            "none" => Some(TrapCompression::None),
-            "gzip" => Some(TrapCompression::Gzip),
-            _ => None,
-        }
-    }
-
-    fn extension(&self) -> &str {
-        match self {
-            TrapCompression::None => "trap",
-            TrapCompression::Gzip => "trap.gz",
-        }
-    }
-}
-
 /**
 * Gets the number of threads the extractor should use, by reading the
 * CODEQL_THREADS environment variable and using it as described in the
@@ -115,7 +79,7 @@ fn main() -> std::io::Result<()> {
        .value_of("output-dir")
        .expect("missing --output-dir");
    let trap_dir = PathBuf::from(trap_dir);
-    let trap_compression = TrapCompression::from_env();
+    let trap_compression = trap::Compression::from_env("CODEQL_QL_TRAP_COMPRESSION");

    let file_list = matches.value_of("file-list").expect("missing --file-list");
    let file_list = fs::File::open(file_list)?;
@@ -140,7 +104,7 @@ fn main() -> std::io::Result<()> {
            let src_archive_file = path_for(&src_archive_dir, &path, "");
            let source = std::fs::read(&path)?;
            let code_ranges = vec![];
-            let mut trap_writer = extractor::new_trap_writer();
+            let mut trap_writer = trap::Writer::new();
            extractor::extract(
                language,
                "ql",
@@ -152,33 +116,25 @@ fn main() -> std::io::Result<()> {
            )?;
            std::fs::create_dir_all(&src_archive_file.parent().unwrap())?;
            std::fs::copy(&path, &src_archive_file)?;
-            write_trap(&trap_dir, path, trap_writer, &trap_compression)
+            write_trap(&trap_dir, path, &trap_writer, trap_compression)
        })
        .expect("failed to extract files");

    let path = PathBuf::from("extras");
-    let mut trap_writer = extractor::new_trap_writer();
-    trap_writer.populate_empty_location();
-    write_trap(&trap_dir, path, trap_writer, &trap_compression)
+    let mut trap_writer = trap::Writer::new();
+    extractor::populate_empty_location(&mut trap_writer);
+    write_trap(&trap_dir, path, &trap_writer, trap_compression)
 }

 fn write_trap(
    trap_dir: &Path,
    path: PathBuf,
-    trap_writer: extractor::TrapWriter,
-    trap_compression: &TrapCompression,
+    trap_writer: &trap::Writer,
+    trap_compression: trap::Compression,
 ) -> std::io::Result<()> {
    let trap_file = path_for(trap_dir, &path, trap_compression.extension());
    std::fs::create_dir_all(&trap_file.parent().unwrap())?;
-    let trap_file = std::fs::File::create(&trap_file)?;
-    let mut trap_file = BufWriter::new(trap_file);
-    match trap_compression {
-        TrapCompression::None => trap_writer.output(&mut trap_file),
-        TrapCompression::Gzip => {
-            let mut compressed_writer = GzEncoder::new(trap_file, flate2::Compression::fast());
-            trap_writer.output(&mut compressed_writer)
-        }
-    }
+    trap_writer.write_to_file(&trap_file, trap_compression)
 }

 fn path_for(dir: &Path, path: &Path, ext: &str) -> PathBuf {
--- a/ql/extractor/src/trap.rs
+++ b/ql/extractor/src/trap.rs
@@ -0,0 +1,272 @@
+use std::borrow::Cow;
+use std::fmt;
+use std::io::{BufWriter, Write};
+use std::path::Path;
+
+use flate2::write::GzEncoder;
+
+pub struct Writer {
+    /// The accumulated trap entries
+    trap_output: Vec<Entry>,
+    /// A counter for generating fresh labels
+    counter: u32,
+    /// cache of global keys
+    global_keys: std::collections::HashMap<String, Label>,
+}
+
+impl Writer {
+    /// Creates an empty writer whose label counter starts at zero.
+    pub fn new() -> Writer {
+        Writer {
+            counter: 0,
+            trap_output: Vec::new(),
+            global_keys: std::collections::HashMap::new(),
+        }
+    }
+
+    /// Allocates the next label and records a fresh-id entry (`#n=*`) for it.
+    pub fn fresh_id(&mut self) -> Label {
+        let label = Label(self.counter);
+        self.counter += 1;
+        self.trap_output.push(Entry::FreshId(label));
+        label
+    }
+
+    /// Gets a label that will hold the unique ID of the passed string at import time.
+    /// Keys should be globally unique and disjoint across column types, e.g. by prefixing
+    /// the type name. The flag is `true` only the first time this writer sees `key`.
+    pub fn global_id(&mut self, key: &str) -> (Label, bool) {
+        if let Some(label) = self.global_keys.get(key) {
+            return (*label, false);
+        }
+        let label = Label(self.counter);
+        self.counter += 1;
+        self.global_keys.insert(key.to_owned(), label);
+        self.trap_output.push(Entry::MapLabelToKey(label, key.to_owned()));
+        (label, true)
+    }
+
+    /// Records the tuple `table_name(args...)`.
+    pub fn add_tuple(&mut self, table_name: &str, args: Vec<Arg>) {
+        self.trap_output.push(Entry::GenericTuple(table_name.to_owned(), args))
+    }
+
+    /// Records a `// text` comment entry.
+    pub fn comment(&mut self, text: String) {
+        self.trap_output.push(Entry::Comment(text));
+    }
+
+    /// Writes all recorded entries to a new file at `path`, gzip-compressed on request.
+    /// The gzip stream and the buffer are finished/flushed explicitly so late I/O errors surface.
+    pub fn write_to_file(&self, path: &Path, compression: Compression) -> std::io::Result<()> {
+        let mut trap_file = BufWriter::new(std::fs::File::create(path)?);
+        match compression {
+            Compression::None => self.write_trap_entries(&mut trap_file)?,
+            Compression::Gzip => {
+                let mut gz = GzEncoder::new(&mut trap_file, flate2::Compression::fast());
+                self.write_trap_entries(&mut gz)?;
+                gz.finish()?;
+            }
+        }
+        trap_file.flush()
+    }
+
+    fn write_trap_entries<W: Write>(&self, file: &mut W) -> std::io::Result<()> {
+        for trap_entry in &self.trap_output {
+            writeln!(file, "{}", trap_entry)?;
+        }
+        Ok(())
+    }
+}
+
+pub enum Entry {
+    /// Maps the label to a fresh id, e.g. `#123=*`.
+    FreshId(Label),
+    /// Maps the label to a key, e.g. `#7=@"foo"`.
+    MapLabelToKey(Label, String),
+    /// foo_bar(arg*)
+    GenericTuple(String, Vec<Arg>),
+    Comment(String),
+}
+
+impl fmt::Display for Entry {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            Entry::FreshId(label) => write!(f, "{}=*", label),
+            Entry::MapLabelToKey(label, key) => {
+                write!(f, "{}=@\"{}\"", label, key.replace("\"", "\"\""))
+            }
+            Entry::GenericTuple(name, args) => {
+                write!(f, "{}(", name)?;
+                for (index, arg) in args.iter().enumerate() {
+                    if index > 0 {
+                        write!(f, ",")?;
+                    }
+                    write!(f, "{}", arg)?;
+                }
+                write!(f, ")")
+            }
+            Entry::Comment(line) => write!(f, "// {}", line),
+        }
+    }
+}
+
+#[derive(Debug, Copy, Clone)]
+// Identifiers of the form #0, #1, ...; the number is printed in hexadecimal (e.g. #1f).
+pub struct Label(u32);
+
+impl fmt::Display for Label {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "#{:x}", self.0)
+    }
+}
+
+// Some untyped argument to an `Entry::GenericTuple`.
+#[derive(Debug, Clone)]
+pub enum Arg {
+    Label(Label),
+    Int(usize),
+    String(String),
+}
+
+const MAX_STRLEN: usize = 1048576;
+
+impl fmt::Display for Arg {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            Arg::Label(x) => write!(f, "{}", x),
+            Arg::Int(x) => write!(f, "{}", x),
+            Arg::String(x) => write!(
+                f,
+                "\"{}\"",
+                limit_string(x, MAX_STRLEN).replace("\"", "\"\"")
+            ),
+        }
+    }
+}
+
+pub struct Program(Vec<Entry>);
+
+impl fmt::Display for Program {
+    /// Writes each entry followed by a newline, directly into the formatter.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        for trap_entry in &self.0 {
+            writeln!(f, "{}", trap_entry)?;
+        }
+        Ok(())
+    }
+}
+
+pub fn full_id_for_file(normalized_path: &str) -> String {
+    format!("{};sourcefile", escape_key(normalized_path))
+}
+
+pub fn full_id_for_folder(normalized_path: &str) -> String {
+    format!("{};folder", escape_key(normalized_path))
+}
+
+/// Escapes a string for use in a TRAP key by replacing special characters with HTML entities.
+fn escape_key<'a, S: Into<Cow<'a, str>>>(key: S) -> Cow<'a, str> {
+    fn entity_for(c: char) -> Option<&'static str> {
+        match c {
+            '&' => Some("&amp;"),
+            '{' => Some("&lbrace;"),
+            '}' => Some("&rbrace;"),
+            '"' => Some("&quot;"),
+            '@' => Some("&commat;"),
+            '#' => Some("&num;"),
+            _ => None,
+        }
+    }
+    let key = key.into();
+    if !key.contains(|c: char| entity_for(c).is_some()) {
+        return key;
+    }
+    let mut escaped = String::with_capacity(2 * key.len());
+    for c in key.chars() {
+        match entity_for(c) {
+            Some(entity) => escaped.push_str(entity),
+            None => escaped.push(c),
+        }
+    }
+    Cow::Owned(escaped)
+}
+
+/// Truncates `string` to at most `max_size` bytes without splitting a character.
+///
+/// A string whose byte length is already within the limit is returned unchanged.
+/// Otherwise the cut is made at `max_size`, or at the closest earlier character
+/// boundary when `max_size` falls inside a multi-byte UTF-8 sequence, so the
+/// result may be slightly shorter than the limit.
+///
+/// For example, limiting `"hi ☹☹"` to 5 bytes yields `"hi "`.
+fn limit_string(string: &str, max_size: usize) -> &str {
+    if string.len() <= max_size {
+        return string;
+    }
+    // `max_size` is a valid offset here because the string is longer than the limit.
+    // Walk back from it until we reach the first byte of a character; offset 0 is
+    // always a boundary, so the loop terminates.
+    let mut cut = max_size;
+    while !string.is_char_boundary(cut) {
+        cut -= 1;
+    }
+    &string[..cut]
+}
+
+#[derive(Clone, Copy)]
+pub enum Compression {
+    None,
+    Gzip,
+}
+
+impl Compression {
+    /// Reads the method from env var `var_name`; defaults to gzip if unset or unrecognised.
+    pub fn from_env(var_name: &str) -> Compression {
+        match std::env::var(var_name) {
+            Ok(method) => Compression::from_string(&method).unwrap_or_else(|| {
+                tracing::error!("Unknown compression method '{}'; using gzip.", &method);
+                Compression::Gzip
+            }),
+            // Default compression method if the env var isn't set:
+            Err(_) => Compression::Gzip,
+        }
+    }
+
+    /// Parses a method name case-insensitively; returns `None` if it is not recognised.
+    pub fn from_string(s: &str) -> Option<Compression> {
+        match s.to_lowercase().as_ref() {
+            "none" => Some(Compression::None),
+            "gzip" => Some(Compression::Gzip),
+            _ => None,
+        }
+    }
+
+    /// File extension (without a leading dot) for TRAP files written with this method.
+    pub fn extension(&self) -> &'static str {
+        match self {
+            Compression::None => "trap",
+            Compression::Gzip => "trap.gz",
+        }
+    }
+}
+
+#[test]
+fn limit_string_test() {
+    assert_eq!("hello", limit_string(&"hello world".to_owned(), 5));
+    assert_eq!("hi ☹", limit_string(&"hi ☹☹".to_owned(), 6));
+    assert_eq!("hi ", limit_string(&"hi ☹☹".to_owned(), 5));
+}
+
+#[test]
+fn escape_key_test() {
+    assert_eq!("foo!", escape_key("foo!"));
+    assert_eq!("foo&lbrace;&rbrace;", escape_key("foo{}"));
+    assert_eq!("&lbrace;&rbrace;", escape_key("{}"));
+    assert_eq!("", escape_key(""));
+    assert_eq!("/path/to/foo.rb", escape_key("/path/to/foo.rb"));
+    assert_eq!(
+        "/path/to/foo&amp;&lbrace;&rbrace;&quot;&commat;&num;.rb",
+        escape_key("/path/to/foo&{}\"@#.rb")
+    );
+}
--- a/ruby/Cargo.lock
+++ b/ruby/Cargo.lock
--- a/ruby/extractor/Cargo.toml
+++ b/ruby/extractor/Cargo.toml
@@ -11,10 +11,12 @@ flate2 = "1.0"
 node-types = { path = "../node-types" }
 tree-sitter = "0.19"
 tree-sitter-embedded-template = { git = "https://github.com/tree-sitter/tree-sitter-embedded-template.git", rev = "1a538da253d73f896b9f6c0c7d79cda58791ac5c" }
-tree-sitter-ruby = { git = "https://github.com/tree-sitter/tree-sitter-ruby.git", rev = "5b305c3cd32db10494cedd2743de6bbe32f1a573" }
+tree-sitter-ruby = { git = "https://github.com/tree-sitter/tree-sitter-ruby.git", rev = "e75d04404c9dd71ad68850d5c672b226d5e694f3" }
 clap = "3.0"
 tracing = "0.1"
 tracing-subscriber = { version = "0.3.3", features = ["env-filter"] }
 rayon = "1.5.0"
 num_cpus = "1.13.0"
 regex = "1.5.5"
+encoding = "0.2"
+lazy_static = "1.4.0"
--- a/ruby/extractor/src/extractor.rs
+++ b/ruby/extractor/src/extractor.rs
@@ -1,161 +1,112 @@
+use crate::trap;
 use node_types::{EntryKind, Field, NodeTypeMap, Storage, TypeName};
-use std::borrow::Cow;
 use std::collections::BTreeMap as Map;
 use std::collections::BTreeSet as Set;
 use std::fmt;
-use std::io::Write;
 use std::path::Path;

 use tracing::{error, info, span, Level};
 use tree_sitter::{Language, Node, Parser, Range, Tree};

-pub struct TrapWriter {
-    /// The accumulated trap entries
-    trap_output: Vec<TrapEntry>,
-    /// A counter for generating fresh labels
-    counter: u32,
-    /// cache of global keys
-    global_keys: std::collections::HashMap<String, Label>,
+pub fn populate_file(writer: &mut trap::Writer, absolute_path: &Path) -> trap::Label {
+    let (file_label, fresh) =
+        writer.global_id(&trap::full_id_for_file(&normalize_path(absolute_path)));
+    if fresh {
+        writer.add_tuple(
+            "files",
+            vec![
+                trap::Arg::Label(file_label),
+                trap::Arg::String(normalize_path(absolute_path)),
+            ],
+        );
+        populate_parent_folders(writer, file_label, absolute_path.parent());
+    }
+    file_label
 }

-pub fn new_trap_writer() -> TrapWriter {
-    TrapWriter {
-        counter: 0,
-        trap_output: Vec::new(),
-        global_keys: std::collections::HashMap::new(),
+fn populate_empty_file(writer: &mut trap::Writer) -> trap::Label {
+    let (file_label, fresh) = writer.global_id("empty;sourcefile");
+    if fresh {
+        writer.add_tuple(
+            "files",
+            vec![
+                trap::Arg::Label(file_label),
+                trap::Arg::String("".to_string()),
+            ],
+        );
    }
+    file_label
 }

-impl TrapWriter {
-    ///  Gets a label that will hold the unique ID of the passed string at import time.
-    ///  This can be used for incrementally importable TRAP files -- use globally unique
-    ///  strings to compute a unique ID for table tuples.
-    ///
-    ///  Note: You probably want to make sure that the key strings that you use are disjoint
-    ///  for disjoint column types; the standard way of doing this is to prefix (or append)
-    ///  the column type name to the ID. Thus, you might identify methods in Java by the
-    ///  full ID "methods_com.method.package.DeclaringClass.method(argumentList)".
+pub fn populate_empty_location(writer: &mut trap::Writer) {
+    let file_label = populate_empty_file(writer);
+    location(writer, file_label, 0, 0, 0, 0);
+}

-    fn fresh_id(&mut self) -> Label {
-        let label = Label(self.counter);
-        self.counter += 1;
-        self.trap_output.push(TrapEntry::FreshId(label));
-        label
-    }
-
-    fn global_id(&mut self, key: &str) -> (Label, bool) {
-        if let Some(label) = self.global_keys.get(key) {
-            return (*label, false);
-        }
-        let label = Label(self.counter);
-        self.counter += 1;
-        self.global_keys.insert(key.to_owned(), label);
-        self.trap_output
-            .push(TrapEntry::MapLabelToKey(label, key.to_owned()));
-        (label, true)
-    }
-
-    fn add_tuple(&mut self, table_name: &str, args: Vec<Arg>) {
-        self.trap_output
-            .push(TrapEntry::GenericTuple(table_name.to_owned(), args))
-    }
-
-    fn populate_file(&mut self, absolute_path: &Path) -> Label {
-        let (file_label, fresh) = self.global_id(&full_id_for_file(absolute_path));
-        if fresh {
-            self.add_tuple(
-                "files",
-                vec![
-                    Arg::Label(file_label),
-                    Arg::String(normalize_path(absolute_path)),
-                ],
-            );
-            self.populate_parent_folders(file_label, absolute_path.parent());
-        }
-        file_label
-    }
-
-    fn populate_empty_file(&mut self) -> Label {
-        let (file_label, fresh) = self.global_id("empty;sourcefile");
-        if fresh {
-            self.add_tuple(
-                "files",
-                vec![Arg::Label(file_label), Arg::String("".to_string())],
-            );
-        }
-        file_label
-    }
-
-    pub fn populate_empty_location(&mut self) {
-        let file_label = self.populate_empty_file();
-        self.location(file_label, 0, 0, 0, 0);
-    }
-
-    fn populate_parent_folders(&mut self, child_label: Label, path: Option<&Path>) {
-        let mut path = path;
-        let mut child_label = child_label;
-        loop {
-            match path {
-                None => break,
-                Some(folder) => {
-                    let (folder_label, fresh) = self.global_id(&full_id_for_folder(folder));
-                    self.add_tuple(
-                        "containerparent",
-                        vec![Arg::Label(folder_label), Arg::Label(child_label)],
+pub fn populate_parent_folders(
+    writer: &mut trap::Writer,
+    child_label: trap::Label,
+    path: Option<&Path>,
+) {
+    let mut path = path;
+    let mut child_label = child_label;
+    loop {
+        match path {
+            None => break,
+            Some(folder) => {
+                let (folder_label, fresh) =
+                    writer.global_id(&trap::full_id_for_folder(&normalize_path(folder)));
+                writer.add_tuple(
+                    "containerparent",
+                    vec![
+                        trap::Arg::Label(folder_label),
+                        trap::Arg::Label(child_label),
+                    ],
+                );
+                if fresh {
+                    writer.add_tuple(
+                        "folders",
+                        vec![
+                            trap::Arg::Label(folder_label),
+                            trap::Arg::String(normalize_path(folder)),
+                        ],
                    );
-                    if fresh {
-                        self.add_tuple(
-                            "folders",
-                            vec![
-                                Arg::Label(folder_label),
-                                Arg::String(normalize_path(folder)),
-                            ],
-                        );
-                        path = folder.parent();
-                        child_label = folder_label;
-                    } else {
-                        break;
-                    }
+                    path = folder.parent();
+                    child_label = folder_label;
+                } else {
+                    break;
                }
            }
        }
    }
+}

-    fn location(
-        &mut self,
-        file_label: Label,
-        start_line: usize,
-        start_column: usize,
-        end_line: usize,
-        end_column: usize,
-    ) -> Label {
-        let (loc_label, fresh) = self.global_id(&format!(
-            "loc,{{{}}},{},{},{},{}",
-            file_label, start_line, start_column, end_line, end_column
-        ));
-        if fresh {
-            self.add_tuple(
-                "locations_default",
-                vec![
-                    Arg::Label(loc_label),
-                    Arg::Label(file_label),
-                    Arg::Int(start_line),
-                    Arg::Int(start_column),
-                    Arg::Int(end_line),
-                    Arg::Int(end_column),
-                ],
-            );
-        }
-        loc_label
-    }
-
-    fn comment(&mut self, text: String) {
-        self.trap_output.push(TrapEntry::Comment(text));
-    }
-
-    pub fn output(self, writer: &mut dyn Write) -> std::io::Result<()> {
-        write!(writer, "{}", Program(self.trap_output))
+fn location(
+    writer: &mut trap::Writer,
+    file_label: trap::Label,
+    start_line: usize,
+    start_column: usize,
+    end_line: usize,
+    end_column: usize,
+) -> trap::Label {
+    let (loc_label, fresh) = writer.global_id(&format!(
+        "loc,{{{}}},{},{},{},{}",
+        file_label, start_line, start_column, end_line, end_column
+    ));
+    if fresh {
+        writer.add_tuple(
+            "locations_default",
+            vec![
+                trap::Arg::Label(loc_label),
+                trap::Arg::Label(file_label),
+                trap::Arg::Int(start_line),
+                trap::Arg::Int(start_column),
+                trap::Arg::Int(end_line),
+                trap::Arg::Int(end_column),
+            ],
+        );
    }
+    loc_label
 }

 /// Extracts the source file at `path`, which is assumed to be canonicalized.
@@ -163,71 +114,43 @@ pub fn extract(
    language: Language,
    language_prefix: &str,
    schema: &NodeTypeMap,
-    trap_writer: &mut TrapWriter,
+    trap_writer: &mut trap::Writer,
    path: &Path,
    source: &[u8],
    ranges: &[Range],
 ) -> std::io::Result<()> {
+    let path_str = format!("{}", path.display());
    let span = span!(
        Level::TRACE,
        "extract",
-        file = %path.display()
+        file = %path_str
    );

    let _enter = span.enter();

-    info!("extracting: {}", path.display());
+    info!("extracting: {}", path_str);

    let mut parser = Parser::new();
    parser.set_language(language).unwrap();
    parser.set_included_ranges(ranges).unwrap();
    let tree = parser.parse(&source, None).expect("Failed to parse file");
-    trap_writer.comment(format!("Auto-generated TRAP file for {}", path.display()));
-    let file_label = &trap_writer.populate_file(path);
-    let mut visitor = Visitor {
+    trap_writer.comment(format!("Auto-generated TRAP file for {}", path_str));
+    let file_label = populate_file(trap_writer, path);
+    let mut visitor = Visitor::new(
        source,
        trap_writer,
        // TODO: should we handle path strings that are not valid UTF8 better?
-        path: format!("{}", path.display()),
-        file_label: *file_label,
-        toplevel_child_counter: 0,
-        stack: Vec::new(),
+        &path_str,
+        file_label,
        language_prefix,
        schema,
-    };
+    );
    traverse(&tree, &mut visitor);

    parser.reset();
    Ok(())
 }

-/// Escapes a string for use in a TRAP key, by replacing special characters with
-/// HTML entities.
-fn escape_key<'a, S: Into<Cow<'a, str>>>(key: S) -> Cow<'a, str> {
-    fn needs_escaping(c: char) -> bool {
-        matches!(c, '&' | '{' | '}' | '"' | '@' | '#')
-    }
-
-    let key = key.into();
-    if key.contains(needs_escaping) {
-        let mut escaped = String::with_capacity(2 * key.len());
-        for c in key.chars() {
-            match c {
-                '&' => escaped.push_str("&amp;"),
-                '{' => escaped.push_str("&lbrace;"),
-                '}' => escaped.push_str("&rbrace;"),
-                '"' => escaped.push_str("&quot;"),
-                '@' => escaped.push_str("&commat;"),
-                '#' => escaped.push_str("&num;"),
-                _ => escaped.push(c),
-            }
-        }
-        Cow::Owned(escaped)
-    } else {
-        key
-    }
-}
-
 /// Normalizes the path according the common CodeQL specification. Assumes that
 /// `path` has already been canonicalized using `std::fs::canonicalize`.
 fn normalize_path(path: &Path) -> String {
@@ -267,34 +190,28 @@ fn normalize_path(path: &Path) -> String {
    }
 }

-fn full_id_for_file(path: &Path) -> String {
-    format!("{};sourcefile", escape_key(&normalize_path(path)))
-}
-
-fn full_id_for_folder(path: &Path) -> String {
-    format!("{};folder", escape_key(&normalize_path(path)))
-}
-
 struct ChildNode {
    field_name: Option<&'static str>,
-    label: Label,
+    label: trap::Label,
    type_name: TypeName,
 }

 struct Visitor<'a> {
    /// The file path of the source code (as string)
-    path: String,
+    path: &'a str,
    /// The label to use whenever we need to refer to the `@file` entity of this
    /// source file.
-    file_label: Label,
+    file_label: trap::Label,
    /// The source code as a UTF-8 byte array
    source: &'a [u8],
-    /// A TrapWriter to accumulate trap entries
-    trap_writer: &'a mut TrapWriter,
+    /// A trap::Writer to accumulate trap entries
+    trap_writer: &'a mut trap::Writer,
    /// A counter for top-level child nodes
    toplevel_child_counter: usize,
-    /// Language prefix
-    language_prefix: &'a str,
+    /// Language-specific name of the AST info table
+    ast_node_info_table_name: String,
+    /// Language-specific name of the tokeninfo table
+    tokeninfo_table_name: String,
    /// A lookup table from type name to node types
    schema: &'a NodeTypeMap,
    /// A stack for gathering information from child nodes. Whenever a node is
@@ -303,27 +220,48 @@ struct Visitor<'a> {
    /// node the list containing the child data is popped from the stack and
    /// matched against the dbscheme for the node. If the expectations are met
    /// the corresponding row definitions are added to the trap_output.
-    stack: Vec<(Label, usize, Vec<ChildNode>)>,
+    stack: Vec<(trap::Label, usize, Vec<ChildNode>)>,
 }

-impl Visitor<'_> {
+impl<'a> Visitor<'a> {
+    fn new(
+        source: &'a [u8],
+        trap_writer: &'a mut trap::Writer,
+        path: &'a str,
+        file_label: trap::Label,
+        language_prefix: &str,
+        schema: &'a NodeTypeMap,
+    ) -> Visitor<'a> {
+        Visitor {
+            path,
+            file_label,
+            source,
+            trap_writer,
+            toplevel_child_counter: 0,
+            ast_node_info_table_name: format!("{}_ast_node_info", language_prefix),
+            tokeninfo_table_name: format!("{}_tokeninfo", language_prefix),
+            schema,
+            stack: Vec::new(),
+        }
+    }
+
    fn record_parse_error(
        &mut self,
        error_message: String,
        full_error_message: String,
-        loc: Label,
+        loc: trap::Label,
    ) {
        error!("{}", full_error_message);
        let id = self.trap_writer.fresh_id();
        self.trap_writer.add_tuple(
            "diagnostics",
            vec![
-                Arg::Label(id),
-                Arg::Int(40), // severity 40 = error
-                Arg::String("parse_error".to_string()),
-                Arg::String(error_message),
-                Arg::String(full_error_message),
-                Arg::Label(loc),
+                trap::Arg::Label(id),
+                trap::Arg::Int(40), // severity 40 = error
+                trap::Arg::String("parse_error".to_string()),
+                trap::Arg::String(error_message),
+                trap::Arg::String(full_error_message),
+                trap::Arg::Label(loc),
            ],
        );
    }
@@ -335,7 +273,8 @@ impl Visitor<'_> {
        node: Node,
    ) {
        let (start_line, start_column, end_line, end_column) = location_for(self.source, node);
-        let loc = self.trap_writer.location(
+        let loc = location(
+            self.trap_writer,
            self.file_label,
            start_line,
            start_column,
@@ -374,7 +313,8 @@ impl Visitor<'_> {
        }
        let (id, _, child_nodes) = self.stack.pop().expect("Vistor: empty stack");
        let (start_line, start_column, end_line, end_column) = location_for(self.source, node);
-        let loc = self.trap_writer.location(
+        let loc = location(
+            self.trap_writer,
            self.file_label,
            start_line,
            start_column,
@@ -402,19 +342,19 @@ impl Visitor<'_> {
        match &table.kind {
            EntryKind::Token { kind_id, .. } => {
                self.trap_writer.add_tuple(
-                    &format!("{}_ast_node_info", self.language_prefix),
+                    &self.ast_node_info_table_name,
                    vec![
-                        Arg::Label(id),
-                        Arg::Label(parent_id),
-                        Arg::Int(parent_index),
-                        Arg::Label(loc),
+                        trap::Arg::Label(id),
+                        trap::Arg::Label(parent_id),
+                        trap::Arg::Int(parent_index),
+                        trap::Arg::Label(loc),
                    ],
                );
                self.trap_writer.add_tuple(
-                    &format!("{}_tokeninfo", self.language_prefix),
+                    &self.tokeninfo_table_name,
                    vec![
-                        Arg::Label(id),
-                        Arg::Int(*kind_id),
+                        trap::Arg::Label(id),
+                        trap::Arg::Int(*kind_id),
                        sliced_source_arg(self.source, node),
                    ],
                );
@@ -425,15 +365,15 @@ impl Visitor<'_> {
            } => {
                if let Some(args) = self.complex_node(&node, fields, &child_nodes, id) {
                    self.trap_writer.add_tuple(
-                        &format!("{}_ast_node_info", self.language_prefix),
+                        &self.ast_node_info_table_name,
                        vec![
-                            Arg::Label(id),
-                            Arg::Label(parent_id),
-                            Arg::Int(parent_index),
-                            Arg::Label(loc),
+                            trap::Arg::Label(id),
+                            trap::Arg::Label(parent_id),
+                            trap::Arg::Int(parent_index),
+                            trap::Arg::Label(loc),
                        ],
                    );
-                    let mut all_args = vec![Arg::Label(id)];
+                    let mut all_args = vec![trap::Arg::Label(id)];
                    all_args.extend(args);
                    self.trap_writer.add_tuple(table_name, all_args);
                }
@@ -472,9 +412,9 @@ impl Visitor<'_> {
        node: &Node,
        fields: &[Field],
        child_nodes: &[ChildNode],
-        parent_id: Label,
-    ) -> Option<Vec<Arg>> {
-        let mut map: Map<&Option<String>, (&Field, Vec<Arg>)> = Map::new();
+        parent_id: trap::Label,
+    ) -> Option<Vec<trap::Arg>> {
+        let mut map: Map<&Option<String>, (&Field, Vec<trap::Arg>)> = Map::new();
        for field in fields {
            map.insert(&field.name, (field, Vec::new()));
        }
@@ -488,9 +428,9 @@ impl Visitor<'_> {
                    {
                        // We can safely unwrap because type_matches checks the key is in the map.
                        let (int_value, _) = int_mapping.get(&child_node.type_name.kind).unwrap();
-                        values.push(Arg::Int(*int_value));
+                        values.push(trap::Arg::Int(*int_value));
                    } else {
-                        values.push(Arg::Label(child_node.label));
+                        values.push(trap::Arg::Label(child_node.label));
                    }
                } else if field.name.is_some() {
                    let error_message = format!(
@@ -569,9 +509,9 @@ impl Visitor<'_> {
                            );
                            break;
                        }
-                        let mut args = vec![Arg::Label(parent_id)];
+                        let mut args = vec![trap::Arg::Label(parent_id)];
                        if *has_index {
-                            args.push(Arg::Int(index))
+                            args.push(trap::Arg::Int(index))
                        }
                        args.push(child_value.clone());
                        self.trap_writer.add_tuple(table_name, args);
@@ -625,9 +565,9 @@ impl Visitor<'_> {
 }

 // Emit a slice of a source file as an Arg.
-fn sliced_source_arg(source: &[u8], n: Node) -> Arg {
+fn sliced_source_arg(source: &[u8], n: Node) -> trap::Arg {
    let range = n.byte_range();
-    Arg::String(String::from_utf8_lossy(&source[range.start..range.end]).into_owned())
+    trap::Arg::String(String::from_utf8_lossy(&source[range.start..range.end]).into_owned())
 }

 // Emit a pair of `TrapEntry`s for the provided node, appropriately calibrated.
@@ -699,59 +639,6 @@ fn traverse(tree: &Tree, visitor: &mut Visitor) {
    }
 }

-pub struct Program(Vec<TrapEntry>);
-
-impl fmt::Display for Program {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let mut text = String::new();
-        for trap_entry in &self.0 {
-            text.push_str(&format!("{}\n", trap_entry));
-        }
-        write!(f, "{}", text)
-    }
-}
-
-enum TrapEntry {
-    /// Maps the label to a fresh id, e.g. `#123=*`.
-    FreshId(Label),
-    /// Maps the label to a key, e.g. `#7=@"foo"`.
-    MapLabelToKey(Label, String),
-    /// foo_bar(arg*)
-    GenericTuple(String, Vec<Arg>),
-    Comment(String),
-}
-impl fmt::Display for TrapEntry {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        match self {
-            TrapEntry::FreshId(label) => write!(f, "{}=*", label),
-            TrapEntry::MapLabelToKey(label, key) => {
-                write!(f, "{}=@\"{}\"", label, key.replace("\"", "\"\""))
-            }
-            TrapEntry::GenericTuple(name, args) => {
-                write!(f, "{}(", name)?;
-                for (index, arg) in args.iter().enumerate() {
-                    if index > 0 {
-                        write!(f, ",")?;
-                    }
-                    write!(f, "{}", arg)?;
-                }
-                write!(f, ")")
-            }
-            TrapEntry::Comment(line) => write!(f, "// {}", line),
-        }
-    }
-}
-
-#[derive(Debug, Copy, Clone)]
-// Identifiers of the form #0, #1...
-struct Label(u32);
-
-impl fmt::Display for Label {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "#{:x}", self.0)
-    }
-}
-
 // Numeric indices.
 #[derive(Debug, Copy, Clone)]
 struct Index(usize);
@@ -761,69 +648,3 @@ impl fmt::Display for Index {
        write!(f, "{}", self.0)
    }
 }
-
-// Some untyped argument to a TrapEntry.
-#[derive(Debug, Clone)]
-enum Arg {
-    Label(Label),
-    Int(usize),
-    String(String),
-}
-
-const MAX_STRLEN: usize = 1048576;
-
-impl fmt::Display for Arg {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        match self {
-            Arg::Label(x) => write!(f, "{}", x),
-            Arg::Int(x) => write!(f, "{}", x),
-            Arg::String(x) => write!(
-                f,
-                "\"{}\"",
-                limit_string(x, MAX_STRLEN).replace("\"", "\"\"")
-            ),
-        }
-    }
-}
-
-/// Limit the length (in bytes) of a string. If the string's length in bytes is
-/// less than or equal to the limit then the entire string is returned. Otherwise
-/// the string is sliced at the provided limit. If there is a multi-byte character
-/// at the limit then the returned slice will be slightly shorter than the limit to
-/// avoid splitting that multi-byte character.
-fn limit_string(string: &str, max_size: usize) -> &str {
-    if string.len() <= max_size {
-        return string;
-    }
-    let p = string.as_bytes();
-    let mut index = max_size;
-    // We want to clip the string at [max_size]; however, the character at that position
-    // may span several bytes. We need to find the first byte of the character. In UTF-8
-    // encoded data any byte that matches the bit pattern 10XXXXXX is not a start byte.
-    // Therefore we decrement the index as long as there are bytes matching this pattern.
-    // This ensures we cut the string at the border between one character and another.
-    while index > 0 && (p[index] & 0b11000000) == 0b10000000 {
-        index -= 1;
-    }
-    &string[0..index]
-}
-
-#[test]
-fn limit_string_test() {
-    assert_eq!("hello", limit_string(&"hello world".to_owned(), 5));
-    assert_eq!("hi ☹", limit_string(&"hi ☹☹".to_owned(), 6));
-    assert_eq!("hi ", limit_string(&"hi ☹☹".to_owned(), 5));
-}
-
-#[test]
-fn escape_key_test() {
-    assert_eq!("foo!", escape_key("foo!"));
-    assert_eq!("foo&lbrace;&rbrace;", escape_key("foo{}"));
-    assert_eq!("&lbrace;&rbrace;", escape_key("{}"));
-    assert_eq!("", escape_key(""));
-    assert_eq!("/path/to/foo.rb", escape_key("/path/to/foo.rb"));
-    assert_eq!(
-        "/path/to/foo&amp;&lbrace;&rbrace;&quot;&commat;&num;.rb",
-        escape_key("/path/to/foo&{}\"@#.rb")
-    );
-}
--- a/ruby/extractor/src/main.rs
+++ b/ruby/extractor/src/main.rs
@@ -1,51 +1,19 @@
 mod extractor;
+mod trap;

+#[macro_use]
+extern crate lazy_static;
 extern crate num_cpus;

 use clap::arg;
-use flate2::write::GzEncoder;
+use encoding::{self};
 use rayon::prelude::*;
+use std::borrow::Cow;
 use std::fs;
-use std::io::{BufRead, BufWriter};
+use std::io::BufRead;
 use std::path::{Path, PathBuf};
 use tree_sitter::{Language, Parser, Range};

-enum TrapCompression {
-    None,
-    Gzip,
-}
-
-impl TrapCompression {
-    fn from_env() -> TrapCompression {
-        match std::env::var("CODEQL_RUBY_TRAP_COMPRESSION") {
-            Ok(method) => match TrapCompression::from_string(&method) {
-                Some(c) => c,
-                None => {
-                    tracing::error!("Unknown compression method '{}'; using gzip.", &method);
-                    TrapCompression::Gzip
-                }
-            },
-            // Default compression method if the env var isn't set:
-            Err(_) => TrapCompression::Gzip,
-        }
-    }
-
-    fn from_string(s: &str) -> Option<TrapCompression> {
-        match s.to_lowercase().as_ref() {
-            "none" => Some(TrapCompression::None),
-            "gzip" => Some(TrapCompression::Gzip),
-            _ => None,
-        }
-    }
-
-    fn extension(&self) -> &str {
-        match self {
-            TrapCompression::None => "trap",
-            TrapCompression::Gzip => "trap.gz",
-        }
-    }
-}
-
 /**
 * Gets the number of threads the extractor should use, by reading the
 * CODEQL_THREADS environment variable and using it as described in the
@@ -75,6 +43,21 @@ fn num_codeql_threads() -> usize {
    }
 }

+lazy_static! {
+    static ref CP_NUMBER: regex::Regex = regex::Regex::new("cp([0-9]+)").unwrap();
+}
+
+/// Resolves an encoding name as a WHATWG label or, failing that, as a Windows code page
+/// written `cpNNN`; returns `None` when neither lookup succeeds.
+fn encoding_from_name(encoding_name: &str) -> Option<&(dyn encoding::Encoding + Send + Sync)> {
+    encoding::label::encoding_from_whatwg_label(encoding_name).or_else(|| {
+        // The digits come from file content; a number too large for `usize` is treated as
+        // an unknown encoding instead of panicking in `unwrap`.
+        let digits = CP_NUMBER.captures(encoding_name)?.get(1)?.as_str();
+        encoding::label::encoding_from_windows_code_page(digits.parse::<usize>().ok()?)
+    })
+}
+
 fn main() -> std::io::Result<()> {
    tracing_subscriber::fmt()
        .with_target(false)
@@ -118,7 +101,7 @@ fn main() -> std::io::Result<()> {
        .value_of("output-dir")
        .expect("missing --output-dir");
    let trap_dir = PathBuf::from(trap_dir);
-    let trap_compression = TrapCompression::from_env();
+    let trap_compression = trap::Compression::from_env("CODEQL_RUBY_TRAP_COMPRESSION");

    let file_list = matches.value_of("file-list").expect("missing --file-list");
    let file_list = fs::File::open(file_list)?;
@@ -140,8 +123,9 @@ fn main() -> std::io::Result<()> {
            let path = PathBuf::from(line).canonicalize()?;
            let src_archive_file = path_for(&src_archive_dir, &path, "");
            let mut source = std::fs::read(&path)?;
+            let mut needs_conversion = false;
            let code_ranges;
-            let mut trap_writer = extractor::new_trap_writer();
+            let mut trap_writer = trap::Writer::new();
            if path.extension().map_or(false, |x| x == "erb") {
                tracing::info!("scanning: {}", path.display());
                extractor::extract(
@@ -168,6 +152,43 @@ fn main() -> std::io::Result<()> {
                }
                code_ranges = ranges;
            } else {
+                if let Some(encoding_name) = scan_coding_comment(&source) {
+                    // If the input is already UTF-8 then there is no need to recode the source
+                    // If the declared encoding is 'binary' or 'ascii-8bit' then it is not clear how
+                    // to interpret characters. In this case it is probably best to leave the input
+                    // unchanged.
+                    if !encoding_name.eq_ignore_ascii_case("utf-8")
+                        && !encoding_name.eq_ignore_ascii_case("ascii-8bit")
+                        && !encoding_name.eq_ignore_ascii_case("binary")
+                    {
+                        if let Some(encoding) = encoding_from_name(&encoding_name) {
+                            needs_conversion =
+                                encoding.whatwg_name().unwrap_or_default() != "utf-8";
+                            if needs_conversion {
+                                match encoding
+                                    .decode(&source, encoding::types::DecoderTrap::Replace)
+                                {
+                                    Ok(str) => source = str.as_bytes().to_owned(),
+                                    Err(msg) => {
+                                        needs_conversion = false;
+                                        tracing::warn!(
+                                            "{}: character decoding failure: {} ({})",
+                                            &path.to_string_lossy(),
+                                            msg,
+                                            &encoding_name
+                                        );
+                                    }
+                                }
+                            }
+                        } else {
+                            tracing::warn!(
+                                "{}: unknown character encoding: '{}'",
+                                &path.to_string_lossy(),
+                                &encoding_name
+                            );
+                        }
+                    }
+                }
                code_ranges = vec![];
            }
            extractor::extract(
@@ -180,34 +201,30 @@ fn main() -> std::io::Result<()> {
                &code_ranges,
            )?;
            std::fs::create_dir_all(&src_archive_file.parent().unwrap())?;
-            std::fs::copy(&path, &src_archive_file)?;
-            write_trap(&trap_dir, path, trap_writer, &trap_compression)
+            if needs_conversion {
+                std::fs::write(&src_archive_file, &source)?;
+            } else {
+                std::fs::copy(&path, &src_archive_file)?;
+            }
+            write_trap(&trap_dir, path, &trap_writer, trap_compression)
        })
        .expect("failed to extract files");

    let path = PathBuf::from("extras");
-    let mut trap_writer = extractor::new_trap_writer();
-    trap_writer.populate_empty_location();
-    write_trap(&trap_dir, path, trap_writer, &trap_compression)
+    let mut trap_writer = trap::Writer::new();
+    extractor::populate_empty_location(&mut trap_writer);
+    write_trap(&trap_dir, path, &trap_writer, trap_compression)
 }

 fn write_trap(
    trap_dir: &Path,
    path: PathBuf,
-    trap_writer: extractor::TrapWriter,
-    trap_compression: &TrapCompression,
+    trap_writer: &trap::Writer,
+    trap_compression: trap::Compression,
 ) -> std::io::Result<()> {
    let trap_file = path_for(trap_dir, &path, trap_compression.extension());
    std::fs::create_dir_all(&trap_file.parent().unwrap())?;
-    let trap_file = std::fs::File::create(&trap_file)?;
-    let mut trap_file = BufWriter::new(trap_file);
-    match trap_compression {
-        TrapCompression::None => trap_writer.output(&mut trap_file),
-        TrapCompression::Gzip => {
-            let mut compressed_writer = GzEncoder::new(trap_file, flate2::Compression::fast());
-            trap_writer.output(&mut compressed_writer)
-        }
-    }
+    trap_writer.write_to_file(&trap_file, trap_compression)
 }

 fn scan_erb(
@@ -299,3 +316,143 @@ fn path_for(dir: &Path, path: &Path, ext: &str) -> PathBuf {
    }
    result
 }
+
+/// Returns the position of the first byte at or after `index` that is not horizontal
+/// white space. Space, tab, vertical tab, form feed and carriage return are skipped;
+/// a line feed is not, so the scan never moves past the end of the current line.
+fn skip_space(content: &[u8], index: usize) -> usize {
+    // Every byte in `\t..=\r` except `\n` (0x0a) counts as white space, as does ' '.
+    let is_blank = |byte: &&u8| matches!(**byte, b' ' | b'\t' | 0x0b..=0x0d);
+    // An `index` past the end leaves nothing to scan and is returned unchanged.
+    let rest = match content.get(index..) {
+        Some(tail) => tail,
+        None => return index,
+    };
+    index + rest.iter().take_while(is_blank).count()
+}
+
+/// Scans the start of a Ruby source file for a `coding` comment and returns the
+/// encoding name it declares.
+///
+/// An optional UTF-8 byte order mark and an optional `#!` line are skipped first. What
+/// follows (after optional white space) must be a comment containing `coding` in any
+/// letter case, possibly as the tail of a longer word such as `encoding`, then `:` or `=`
+/// and a name made of ASCII letters, digits, `-` and `_`. Otherwise `None` is returned.
+fn scan_coding_comment(content: &[u8]) -> std::option::Option<Cow<str>> {
+    let mut index = 0;
+    // skip UTF-8 BOM marker if there is one
+    if content.starts_with(&[0xef, 0xbb, 0xbf]) {
+        index += 3;
+    }
+    // skip #! line if there is one
+    if content[index..].starts_with(b"#!") {
+        index += 2;
+        while index < content.len() && content[index] != b'\n' {
+            index += 1
+        }
+        index += 1
+    }
+    index = skip_space(content, index);
+
+    if index >= content.len() || content[index] != b'#' {
+        return None;
+    }
+    index += 1;
+
+    // Search the rest of the line for `coding`; `matched` is the number of its
+    // letters recognised so far.
+    const CODING: &[u8] = b"coding";
+    let mut matched = 0;
+    while index < content.len() && matched < CODING.len() && content[index] != b'\n' {
+        matched = match content[index].to_ascii_lowercase() {
+            letter if letter == CODING[matched] => matched + 1,
+            // A mismatching `c` may itself start the word (as in `ccoding`), so the
+            // search restarts on this byte instead of after it.
+            b'c' => 1,
+            _ => 0,
+        };
+        index += 1;
+    }
+    if matched < CODING.len() {
+        return None;
+    }
+    index = skip_space(content, index);
+
+    // The word must be followed by a `:` or `=` separator.
+    if index >= content.len() || (content[index] != b':' && content[index] != b'=') {
+        return None;
+    }
+    index = skip_space(content, index + 1);
+
+    // The name is the longest run of ASCII letters, digits, `-` and `_`.
+    let is_name_byte = |b: &&u8| matches!(**b, b'-' | b'_') || b.is_ascii_alphanumeric();
+    let name_len = content[index..].iter().take_while(is_name_byte).count();
+    if name_len == 0 {
+        return None;
+    }
+    Some(String::from_utf8_lossy(&content[index..index + name_len]))
+}
+
+#[test]
+fn test_scan_coding_comment() {
+    let text = "# encoding: utf-8";
+    let result = scan_coding_comment(text.as_bytes());
+    assert_eq!(result, Some("utf-8".into()));
+
+    let text = "#coding:utf-8";
+    let result = scan_coding_comment(&text.as_bytes());
+    assert_eq!(result, Some("utf-8".into()));
+
+    let text = "# foo\n# encoding: utf-8";
+    let result = scan_coding_comment(&text.as_bytes());
+    assert_eq!(result, None);
+
+    let text = "# encoding: latin1 encoding: utf-8";
+    let result = scan_coding_comment(&text.as_bytes());
+    assert_eq!(result, Some("latin1".into()));
+
+    let text = "# encoding: nonsense";
+    let result = scan_coding_comment(&text.as_bytes());
+    assert_eq!(result, Some("nonsense".into()));
+
+    let text = "# coding = utf-8";
+    let result = scan_coding_comment(&text.as_bytes());
+    assert_eq!(result, Some("utf-8".into()));
+
+    let text = "# CODING = utf-8";
+    let result = scan_coding_comment(&text.as_bytes());
+    assert_eq!(result, Some("utf-8".into()));
+
+    let text = "# CoDiNg = utf-8";
+    let result = scan_coding_comment(&text.as_bytes());
+    assert_eq!(result, Some("utf-8".into()));
+
+    let text = "# blah blahblahcoding = utf-8";
+    let result = scan_coding_comment(&text.as_bytes());
+    assert_eq!(result, Some("utf-8".into()));
+
+    // unicode BOM is ignored
+    let text = "\u{FEFF}# encoding: utf-8";
+    let result = scan_coding_comment(&text.as_bytes());
+    assert_eq!(result, Some("utf-8".into()));
+
+    let text = "\u{FEFF} # encoding: utf-8";
+    let result = scan_coding_comment(&text.as_bytes());
+    assert_eq!(result, Some("utf-8".into()));
+
+    let text = "#! /usr/bin/env ruby\n # encoding: utf-8";
+    let result = scan_coding_comment(&text.as_bytes());
+    assert_eq!(result, Some("utf-8".into()));
+
+    let text = "\u{FEFF}#! /usr/bin/env ruby\n # encoding: utf-8";
+    let result = scan_coding_comment(&text.as_bytes());
+    assert_eq!(result, Some("utf-8".into()));
+
+    // A #! must be the first thing on a line, otherwise it's a normal comment
+    let text = " #! /usr/bin/env ruby encoding = utf-8";
+    let result = scan_coding_comment(&text.as_bytes());
+    assert_eq!(result, Some("utf-8".into()));
+    let text = " #! /usr/bin/env ruby \n # encoding = utf-8";
+    let result = scan_coding_comment(&text.as_bytes());
+    assert_eq!(result, None);
+}
--- a/ruby/extractor/src/trap.rs
+++ b/ruby/extractor/src/trap.rs
@@ -0,0 +1,272 @@
+use std::borrow::Cow;
+use std::fmt;
+use std::io::{BufWriter, Write};
+use std::path::Path;
+
+use flate2::write::GzEncoder;
+
+pub struct Writer {
+    /// The accumulated trap entries
+    trap_output: Vec<Entry>,
+    /// A counter for generating fresh labels
+    counter: u32,
+    /// cache of global keys
+    global_keys: std::collections::HashMap<String, Label>,
+}
+
+impl Writer {
+    /// Creates a writer with no entries and a label counter starting at zero.
+    pub fn new() -> Writer {
+        Writer {
+            counter: 0,
+            trap_output: Vec::new(),
+            global_keys: std::collections::HashMap::new(),
+        }
+    }
+
+    /// Allocates the next label and records a fresh-id entry (`#n=*`) for it.
+    pub fn fresh_id(&mut self) -> Label {
+        let label = Label(self.counter);
+        self.counter += 1;
+        self.trap_output.push(Entry::FreshId(label));
+        label
+    }
+
+    /// Returns the label that will hold the unique ID of `key` at import time, and whether
+    /// this call created it. Use globally unique keys, kept disjoint across column types
+    /// (e.g. "methods_com.pkg.Class.method(args)"), for incrementally importable TRAP files.
+    pub fn global_id(&mut self, key: &str) -> (Label, bool) {
+        if let Some(label) = self.global_keys.get(key) {
+            return (*label, false);
+        }
+        let label = Label(self.counter);
+        self.counter += 1;
+        self.global_keys.insert(key.to_owned(), label);
+        self.trap_output
+            .push(Entry::MapLabelToKey(label, key.to_owned()));
+        (label, true)
+    }
+
+    /// Appends a tuple for table `table_name` with the given column values.
+    pub fn add_tuple(&mut self, table_name: &str, args: Vec<Arg>) {
+        self.trap_output
+            .push(Entry::GenericTuple(table_name.to_owned(), args))
+    }
+
+    /// Appends a comment entry.
+    pub fn comment(&mut self, text: String) {
+        self.trap_output.push(Entry::Comment(text));
+    }
+
+    /// Writes every entry to `path`; buffer-flush and gzip-finish errors are returned.
+    pub fn write_to_file(&self, path: &Path, compression: Compression) -> std::io::Result<()> {
+        let file = BufWriter::new(std::fs::File::create(path)?);
+        match compression {
+            Compression::None => self.write_trap_entries(file)?.flush(),
+            Compression::Gzip => {
+                let gz = GzEncoder::new(file, flate2::Compression::fast());
+                self.write_trap_entries(gz)?.finish()?.flush()
+            }
+        }
+    }
+
+    /// Writes one entry per line into `file` and hands the writer back for finalisation.
+    fn write_trap_entries<W: Write>(&self, mut file: W) -> std::io::Result<W> {
+        for trap_entry in &self.trap_output {
+            writeln!(file, "{}", trap_entry)?;
+        }
+        Ok(file)
+    }
+}
+
+pub enum Entry {
+    /// Maps the label to a fresh id, e.g. `#123=*`.
+    FreshId(Label),
+    /// Maps the label to a key, e.g. `#7=@"foo"`.
+    MapLabelToKey(Label, String),
+    /// foo_bar(arg*)
+    GenericTuple(String, Vec<Arg>),
+    Comment(String),
+}
+
+impl fmt::Display for Entry {
+    /// Renders the entry in TRAP text syntax, without a trailing newline.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            Entry::Comment(line) => write!(f, "// {}", line),
+            Entry::FreshId(label) => write!(f, "{}=*", label),
+            Entry::MapLabelToKey(label, key) => {
+                write!(f, "{}=@\"{}\"", label, key.replace("\"", "\"\""))
+            }
+            Entry::GenericTuple(name, args) => {
+                write!(f, "{}(", name)?;
+                let mut separator = "";
+                for arg in args {
+                    write!(f, "{}{}", separator, arg)?;
+                    separator = ",";
+                }
+                write!(f, ")")
+            }
+        }
+    }
+}
+
+#[derive(Debug, Copy, Clone)]
+// Identifiers of the form #0, #1...
+pub struct Label(u32);
+
+impl fmt::Display for Label {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "#{:x}", self.0)
+    }
+}
+
+// Some untyped argument to a TrapEntry.
+#[derive(Debug, Clone)]
+pub enum Arg {
+    Label(Label),
+    Int(usize),
+    String(String),
+}
+
+const MAX_STRLEN: usize = 1048576;
+
+impl fmt::Display for Arg {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            Arg::Label(x) => write!(f, "{}", x),
+            Arg::Int(x) => write!(f, "{}", x),
+            Arg::String(x) => write!(
+                f,
+                "\"{}\"",
+                limit_string(x, MAX_STRLEN).replace("\"", "\"\"")
+            ),
+        }
+    }
+}
+
+/// An ordered sequence of TRAP entries, printable as text.
+pub struct Program(Vec<Entry>);
+
+impl fmt::Display for Program {
+    /// Renders the entries in order, one per line, each followed by `\n`.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let lines = self.0.iter().map(|entry| format!("{}\n", entry));
+        let rendered: String = lines.collect();
+        write!(f, "{}", rendered)
+    }
+}
+
+pub fn full_id_for_file(normalized_path: &str) -> String {
+    format!("{};sourcefile", escape_key(normalized_path))
+}
+
+pub fn full_id_for_folder(normalized_path: &str) -> String {
+    format!("{};folder", escape_key(normalized_path))
+}
+
+/// Escapes a string for use in a TRAP key: each special character is replaced by the
+/// matching HTML entity. Input that needs no escaping is returned unchanged.
+fn escape_key<'a, S: Into<Cow<'a, str>>>(key: S) -> Cow<'a, str> {
+    const ENTITIES: [(char, &str); 6] = [
+        ('&', "&amp;"),
+        ('{', "&lbrace;"),
+        ('}', "&rbrace;"),
+        ('"', "&quot;"),
+        ('@', "&commat;"),
+        ('#', "&num;"),
+    ];
+    let entity_for = |c: char| ENTITIES.iter().find(|(raw, _)| *raw == c).map(|(_, e)| *e);
+
+    let key = key.into();
+    if !key.contains(|c: char| entity_for(c).is_some()) {
+        return key;
+    }
+    let mut escaped = String::with_capacity(2 * key.len());
+    for c in key.chars() {
+        match entity_for(c) {
+            Some(entity) => escaped.push_str(entity),
+            None => escaped.push(c),
+        }
+    }
+    Cow::Owned(escaped)
+}
+
+/// Truncates `string` to at most `max_size` bytes without splitting a character.
+///
+/// A string that already fits is returned whole. Otherwise the cut is made at
+/// `max_size`, or at the nearest earlier character boundary when `max_size` falls
+/// inside a multi-byte character, so the result can be slightly shorter than the limit.
+fn limit_string(string: &str, max_size: usize) -> &str {
+    if string.len() <= max_size {
+        return string;
+    }
+    // The string is longer than the limit, so every position up to `max_size` is a
+    // valid index. Walk back from the limit to the closest position at which a UTF-8
+    // character starts, i.e. whose byte is not a `10XXXXXX` continuation byte.
+    let boundary = (0..=max_size)
+        .rev()
+        .find(|&position| string.is_char_boundary(position));
+    match boundary {
+        Some(cut) => &string[..cut],
+        // Position 0 is always a boundary, so the search cannot come up empty.
+        None => "",
+    }
+}
+
+#[derive(Clone, Copy)]
+pub enum Compression {
+    None,
+    Gzip,
+}
+
+impl Compression {
+    /// Reads the method from environment variable `var_name`, defaulting to gzip.
+    pub fn from_env(var_name: &str) -> Compression {
+        let method = match std::env::var(var_name) {
+            Ok(method) => method,
+            Err(_) => return Compression::Gzip,
+        };
+        Compression::from_string(&method).unwrap_or_else(|| {
+            tracing::error!("Unknown compression method '{}'; using gzip.", &method);
+            Compression::Gzip
+        })
+    }
+
+    /// Parses a method name case-insensitively; `None` means it is not recognised.
+    pub fn from_string(s: &str) -> Option<Compression> {
+        match s.to_lowercase().as_str() {
+            "gzip" => Some(Compression::Gzip),
+            "none" => Some(Compression::None),
+            _ => None,
+        }
+    }
+
+    /// The file extension used for TRAP files written with this compression.
+    pub fn extension(&self) -> &str {
+        match *self {
+            Compression::Gzip => "trap.gz",
+            Compression::None => "trap",
+        }
+    }
+}
+
+#[test]
+fn limit_string_test() {
+    assert_eq!("hello", limit_string(&"hello world".to_owned(), 5));
+    assert_eq!("hi ☹", limit_string(&"hi ☹☹".to_owned(), 6));
+    assert_eq!("hi ", limit_string(&"hi ☹☹".to_owned(), 5));
+}
+
+#[test]
+fn escape_key_test() {
+    assert_eq!("foo!", escape_key("foo!"));
+    assert_eq!("foo&lbrace;&rbrace;", escape_key("foo{}"));
+    assert_eq!("&lbrace;&rbrace;", escape_key("{}"));
+    assert_eq!("", escape_key(""));
+    assert_eq!("/path/to/foo.rb", escape_key("/path/to/foo.rb"));
+    assert_eq!(
+        "/path/to/foo&amp;&lbrace;&rbrace;&quot;&commat;&num;.rb",
+        escape_key("/path/to/foo&{}\"@#.rb")
+    );
+}
--- a/ruby/generator/Cargo.toml
+++ b/ruby/generator/Cargo.toml
@@ -12,4 +12,4 @@ node-types = { path = "../node-types" }
 tracing = "0.1"
 tracing-subscriber = { version = "0.3.3", features = ["env-filter"] }
 tree-sitter-embedded-template = { git = "https://github.com/tree-sitter/tree-sitter-embedded-template.git", rev = "1a538da253d73f896b9f6c0c7d79cda58791ac5c" }
-tree-sitter-ruby = { git = "https://github.com/tree-sitter/tree-sitter-ruby.git", rev = "5b305c3cd32db10494cedd2743de6bbe32f1a573" }
+tree-sitter-ruby = { git = "https://github.com/tree-sitter/tree-sitter-ruby.git", rev = "e75d04404c9dd71ad68850d5c672b226d5e694f3" }
--- a/ruby/ql/lib/change-notes/2022-07-18-sqli-in-activerecord-relation-annotate.md
+++ b/ruby/ql/lib/change-notes/2022-07-18-sqli-in-activerecord-relation-annotate.md
@@ -0,0 +1,5 @@
+---
+category: minorAnalysis
+---
+
+- Calls to `ActiveRecord::Relation#annotate` are now recognized as `SqlExecution`s, so they are considered sinks for queries such as `rb/sql-injection`.
--- a/ruby/ql/lib/codeql/ruby/frameworks/ActiveRecord.qll
+++ b/ruby/ql/lib/codeql/ruby/frameworks/ActiveRecord.qll
@@ -133,6 +133,11 @@ private Expr sqlFragmentArgument(MethodCall call) {
      or
      methodName = "reload" and
      result = call.getKeywordArgument("lock")
+      or
+      // Calls to `annotate` can be used to add block comments to SQL queries. These are potentially vulnerable to
+      // SQLi if user supplied input is passed in as an argument.
+      methodName = "annotate" and
+      result = call.getArgument(_)
    )
  )
 }
--- a/ruby/ql/test/library-tests/ast/Ast.expected
+++ b/ruby/ql/test/library-tests/ast/Ast.expected
@@ -1762,6 +1762,12 @@ escape_sequences/escapes.rb:
 #   93|   getStmt: [SymbolLiteral] :"\C-?"
 #   93|     getComponent: [StringEscapeSequenceComponent] \C
 #   93|     getComponent: [StringTextComponent] -?
+misc/iso-8859-15.rb:
+#    1| [Toplevel] iso-8859-15.rb
+#    4|   getStmt: [MethodCall] call to print
+#    4|     getReceiver: [SelfVariableAccess] self
+#    4|     getArgument: [StringLiteral] "EUR = €"
+#    4|       getComponent: [StringTextComponent] EUR = €
 literals/literals.rb:
 #    1| [Toplevel] literals.rb
 #    2|   getStmt: [NilLiteral] nil
--- a/ruby/ql/test/library-tests/ast/TreeSitter.expected
+++ b/ruby/ql/test/library-tests/ast/TreeSitter.expected
@@ -4604,6 +4604,17 @@ literals/literals.rb:
 #  193|    cat file.txt
 #  193|   
 #  195|   1: [HeredocEnd] SCRIPT
+misc/iso-8859-15.rb:
+#    1| [Program] Program
+#    4|   0: [Call] Call
+#    4|     0: [Identifier] print
+#    4|     1: [ArgumentList] ArgumentList
+#    4|       0: [String] String
+#    4|         0: [ReservedWord] "
+#    4|         1: [StringContent] EUR = €
+#    4|         2: [ReservedWord] "
+#    1| [Comment] #! /usr/bin/ruby
+#    2| [Comment] # coding: iso-8859-15
 misc/misc.erb:
 #    2| [Program] Program
 #    2|   0: [Call] Call
--- a/ruby/ql/test/library-tests/ast/ValueText.expected
+++ b/ruby/ql/test/library-tests/ast/ValueText.expected
@@ -717,6 +717,7 @@ exprValue
 | literals/literals.rb:198:8:198:8 | 5 | 5 | int |
 | literals/literals.rb:199:2:199:2 | :y | :y | symbol |
 | literals/literals.rb:199:7:199:7 | :Z | :Z | symbol |
+| misc/iso-8859-15.rb:4:7:4:17 | "EUR = \u20ac" | EUR = \u20ac | string |
 | misc/misc.erb:2:15:2:37 | "main_include_admin.js" | main_include_admin.js | string |
 | misc/misc.rb:1:7:1:11 | "bar" | bar | string |
 | misc/misc.rb:3:7:3:9 | foo | foo | string |
@@ -1592,6 +1593,7 @@ exprCfgNodeValue
 | literals/literals.rb:198:8:198:8 | 5 | 5 | int |
 | literals/literals.rb:199:2:199:2 | :y | :y | symbol |
 | literals/literals.rb:199:7:199:7 | :Z | :Z | symbol |
+| misc/iso-8859-15.rb:4:7:4:17 | "EUR = \u20ac" | EUR = \u20ac | string |
 | misc/misc.erb:2:15:2:37 | "main_include_admin.js" | main_include_admin.js | string |
 | misc/misc.rb:1:7:1:11 | "bar" | bar | string |
 | misc/misc.rb:3:7:3:9 | foo | foo | string |
--- a/ruby/ql/test/library-tests/ast/misc/iso-8859-15.rb
+++ b/ruby/ql/test/library-tests/ast/misc/iso-8859-15.rb
@@ -0,0 +1,4 @@
+#! /usr/bin/ruby
+# coding: iso-8859-15
+
+print "EUR = <20>"
--- a/ruby/ql/test/library-tests/frameworks/ActionController.expected
+++ b/ruby/ql/test/library-tests/frameworks/ActionController.expected
@@ -2,6 +2,7 @@ actionControllerControllerClasses
 | ActiveRecord.rb:23:1:39:3 | FooController |
 | ActiveRecord.rb:41:1:64:3 | BarController |
 | ActiveRecord.rb:66:1:70:3 | BazController |
+| ActiveRecord.rb:72:1:80:3 | AnnotatedController |
 | app/controllers/comments_controller.rb:1:1:7:3 | CommentsController |
 | app/controllers/foo/bars_controller.rb:3:1:39:3 | BarsController |
 | app/controllers/photos_controller.rb:1:1:4:3 | PhotosController |
@@ -12,6 +13,8 @@ actionControllerActionMethods
 | ActiveRecord.rb:42:3:47:5 | some_other_request_handler |
 | ActiveRecord.rb:49:3:63:5 | safe_paths |
 | ActiveRecord.rb:67:3:69:5 | yet_another_handler |
+| ActiveRecord.rb:73:3:75:5 | index |
+| ActiveRecord.rb:77:3:79:5 | unsafe_action |
 | app/controllers/comments_controller.rb:2:3:3:5 | index |
 | app/controllers/comments_controller.rb:5:3:6:5 | show |
 | app/controllers/foo/bars_controller.rb:5:3:7:5 | index |
@@ -38,6 +41,7 @@ paramsCalls
 | ActiveRecord.rb:59:12:59:17 | call to params |
 | ActiveRecord.rb:62:15:62:20 | call to params |
 | ActiveRecord.rb:68:21:68:26 | call to params |
+| ActiveRecord.rb:78:59:78:64 | call to params |
 | app/controllers/foo/bars_controller.rb:13:21:13:26 | call to params |
 | app/controllers/foo/bars_controller.rb:14:10:14:15 | call to params |
 | app/controllers/foo/bars_controller.rb:21:21:21:26 | call to params |
@@ -57,6 +61,7 @@ paramsSources
 | ActiveRecord.rb:59:12:59:17 | call to params |
 | ActiveRecord.rb:62:15:62:20 | call to params |
 | ActiveRecord.rb:68:21:68:26 | call to params |
+| ActiveRecord.rb:78:59:78:64 | call to params |
 | app/controllers/foo/bars_controller.rb:13:21:13:26 | call to params |
 | app/controllers/foo/bars_controller.rb:14:10:14:15 | call to params |
 | app/controllers/foo/bars_controller.rb:21:21:21:26 | call to params |
--- a/ruby/ql/test/library-tests/frameworks/ActiveRecord.expected
+++ b/ruby/ql/test/library-tests/frameworks/ActiveRecord.expected
@@ -22,6 +22,7 @@ activeRecordSqlExecutionRanges
 | ActiveRecord.rb:46:20:46:32 | ... + ... |
 | ActiveRecord.rb:52:16:52:28 | "name #{...}" |
 | ActiveRecord.rb:56:20:56:39 | "username = #{...}" |
+| ActiveRecord.rb:78:27:78:76 | "this is an unsafe annotation:..." |
 activeRecordModelClassMethodCalls
 | ActiveRecord.rb:2:3:2:17 | call to has_many |
 | ActiveRecord.rb:6:3:6:24 | call to belongs_to |
@@ -44,6 +45,8 @@ activeRecordModelClassMethodCalls
 | ActiveRecord.rb:60:5:60:33 | call to find_by |
 | ActiveRecord.rb:62:5:62:34 | call to find |
 | ActiveRecord.rb:68:5:68:45 | call to delete_by |
+| ActiveRecord.rb:74:13:74:54 | call to annotate |
+| ActiveRecord.rb:78:13:78:77 | call to annotate |
 potentiallyUnsafeSqlExecutingMethodCall
 | ActiveRecord.rb:9:5:9:68 | call to find |
 | ActiveRecord.rb:19:5:19:25 | call to destroy_by |
@@ -55,6 +58,7 @@ potentiallyUnsafeSqlExecutingMethodCall
 | ActiveRecord.rb:46:5:46:33 | call to delete_by |
 | ActiveRecord.rb:52:5:52:29 | call to order |
 | ActiveRecord.rb:56:7:56:40 | call to find_by |
+| ActiveRecord.rb:78:13:78:77 | call to annotate |
 activeRecordModelInstantiations
 | ActiveRecord.rb:9:5:9:68 | call to find | ActiveRecord.rb:5:1:15:3 | User |
 | ActiveRecord.rb:13:5:13:40 | call to find_by | ActiveRecord.rb:1:1:3:3 | UserGroup |
--- a/ruby/ql/test/library-tests/frameworks/ActiveRecord.rb
+++ b/ruby/ql/test/library-tests/frameworks/ActiveRecord.rb
@@ -68,3 +68,13 @@ class BazController < BarController
    Admin.delete_by(params[:admin_condition])
  end
 end
+
+class AnnotatedController < ActionController::Base
+  def index
+    users = User.annotate("this is a safe annotation")
+  end
+
+  def unsafe_action
+    users = User.annotate("this is an unsafe annotation:#{params[:comment]}")
+  end
+end
--- a/ruby/ql/test/query-tests/security/cwe-089/ActiveRecordInjection.rb
+++ b/ruby/ql/test/query-tests/security/cwe-089/ActiveRecordInjection.rb
@@ -137,3 +137,17 @@ class BazController < BarController
    Admin.delete_by(params[:admin_condition])
  end
 end
+
+class AnnotatedController < ActionController::Base
+  def index
+    name = params[:user_name]
+    # GOOD: string literal arguments not controlled by user are safe for annotations
+    users = User.annotate("this is a safe annotation").find_by(user_name: name)
+  end
+
+  def unsafe_action
+    name = params[:user_name]
+    # BAD: user input passed into an annotation is vulnerable to SQLi
+    users = User.annotate("this is an unsafe annotation:#{params[:comment]}").find_by(user_name: name)
+  end
+end
--- a/ruby/ql/test/query-tests/security/cwe-089/SqlInjection.expected
+++ b/ruby/ql/test/query-tests/security/cwe-089/SqlInjection.expected
@@ -31,6 +31,8 @@ edges
 | ActiveRecordInjection.rb:99:11:99:17 | ...[...] :  | ActiveRecordInjection.rb:104:20:104:32 | ... + ... |
 | ActiveRecordInjection.rb:137:21:137:26 | call to params :  | ActiveRecordInjection.rb:137:21:137:44 | ...[...] :  |
 | ActiveRecordInjection.rb:137:21:137:44 | ...[...] :  | ActiveRecordInjection.rb:20:22:20:30 | condition :  |
+| ActiveRecordInjection.rb:151:59:151:64 | call to params :  | ActiveRecordInjection.rb:151:59:151:74 | ...[...] :  |
+| ActiveRecordInjection.rb:151:59:151:74 | ...[...] :  | ActiveRecordInjection.rb:151:27:151:76 | "this is an unsafe annotation:..." |
 nodes
 | ActiveRecordInjection.rb:8:25:8:28 | name :  | semmle.label | name :  |
 | ActiveRecordInjection.rb:8:31:8:34 | pass :  | semmle.label | pass :  |
@@ -80,6 +82,9 @@ nodes
 | ActiveRecordInjection.rb:104:20:104:32 | ... + ... | semmle.label | ... + ... |
 | ActiveRecordInjection.rb:137:21:137:26 | call to params :  | semmle.label | call to params :  |
 | ActiveRecordInjection.rb:137:21:137:44 | ...[...] :  | semmle.label | ...[...] :  |
+| ActiveRecordInjection.rb:151:27:151:76 | "this is an unsafe annotation:..." | semmle.label | "this is an unsafe annotation:..." |
+| ActiveRecordInjection.rb:151:59:151:64 | call to params :  | semmle.label | call to params :  |
+| ActiveRecordInjection.rb:151:59:151:74 | ...[...] :  | semmle.label | ...[...] :  |
 subpaths
 #select
 | ActiveRecordInjection.rb:10:33:10:67 | "name='#{...}' and pass='#{...}'" | ActiveRecordInjection.rb:70:23:70:28 | call to params :  | ActiveRecordInjection.rb:10:33:10:67 | "name='#{...}' and pass='#{...}'" | This SQL query depends on $@. | ActiveRecordInjection.rb:70:23:70:28 | call to params | a user-provided value |
@@ -99,3 +104,4 @@ subpaths
 | ActiveRecordInjection.rb:88:18:88:35 | ...[...] | ActiveRecordInjection.rb:88:18:88:23 | call to params :  | ActiveRecordInjection.rb:88:18:88:35 | ...[...] | This SQL query depends on $@. | ActiveRecordInjection.rb:88:18:88:23 | call to params | a user-provided value |
 | ActiveRecordInjection.rb:92:21:92:35 | ...[...] | ActiveRecordInjection.rb:92:21:92:26 | call to params :  | ActiveRecordInjection.rb:92:21:92:35 | ...[...] | This SQL query depends on $@. | ActiveRecordInjection.rb:92:21:92:26 | call to params | a user-provided value |
 | ActiveRecordInjection.rb:104:20:104:32 | ... + ... | ActiveRecordInjection.rb:98:10:98:15 | call to params :  | ActiveRecordInjection.rb:104:20:104:32 | ... + ... | This SQL query depends on $@. | ActiveRecordInjection.rb:98:10:98:15 | call to params | a user-provided value |
+| ActiveRecordInjection.rb:151:27:151:76 | "this is an unsafe annotation:..." | ActiveRecordInjection.rb:151:59:151:64 | call to params :  | ActiveRecordInjection.rb:151:27:151:76 | "this is an unsafe annotation:..." | This SQL query depends on $@. | ActiveRecordInjection.rb:151:59:151:64 | call to params | a user-provided value |