Share IncompleteMultiCharacterSanitization JS/Ruby

Most of the classes and predicates in this query can be shared between the two languages. There are just a few language-specific things that we place in IncompleteMultiCharacterSanitizationSpecific.
2025-12-17 01:03:14 +01:00 · 2022-07-28 15:05:29 +12:00
parent 3179c60a1e
commit b7d9bf4066
8 changed files with 370 additions and 340 deletions
--- a/config/identical-files.json
+++ b/config/identical-files.json
@@ -597,5 +597,9 @@
  "Swift patterns test file": [
    "swift/ql/test/extractor-tests/patterns/patterns.swift",
    "swift/ql/test/library-tests/parent/patterns.swift"
+  ],
+  "IncompleteMultiCharacterSanitization JS/Ruby": [
+    "javascript/ql/lib/semmle/javascript/security/IncompleteMultiCharacterSanitization.qll",
+    "ruby/ql/lib/codeql/ruby/security/IncompleteMultiCharacterSanitization.qll"
  ]
 }
--- a/javascript/ql/lib/semmle/javascript/security/IncompleteMultiCharacterSanitization.qll
+++ b/javascript/ql/lib/semmle/javascript/security/IncompleteMultiCharacterSanitization.qll
@@ -0,0 +1,159 @@
+/**
+ * Provides shared predicates for reasoning about improper multi-character sanitization.
+ */
+
+import IncompleteMultiCharacterSanitizationSpecific
+
+/**
+ * A prefix that may be dangerous to sanitize explicitly.
+ *
+ * Note that this class exists solely as a (necessary) optimization for this query.
+ */
+private class DangerousPrefix extends string {
+  DangerousPrefix() {
+    this = ["/..", "../"] or
+    this = "<!--" or
+    this = "<" + ["iframe", "script", "cript", "scrip", "style"]
+  }
+}
+
+/**
+ * A substring of a prefix that may be dangerous to sanitize explicitly.
+ */
+private class DangerousPrefixSubstring extends string {
+  DangerousPrefixSubstring() {
+    exists(DangerousPrefix s | this = s.substring([0 .. s.length()], [0 .. s.length()]))
+  }
+}
+
+/**
+ * Gets a char from a dangerous prefix that is matched by `t`.
+ */
+pragma[noinline]
+private DangerousPrefixSubstring getADangerousMatchedChar(EmptyReplaceRegExpTerm t) {
+  t.isNullable() and result = ""
+  or
+  result = t.getAMatchedString()
+  or
+  // A substring matched by some character class. This is only used to match the "word" part of a HTML tag (e.g. "iframe" in "<iframe").
+  exists(ReDoSUtil::CharacterClass cc |
+    cc = ReDoSUtil::getCanonicalCharClass(t) and
+    cc.matches(result) and
+    result.regexpMatch("\\w") and
+    // excluding character classes that match ">" (e.g. /<[^<]*>/), as these might consume nested HTML tags, and thus prevent the dangerous pattern this query is looking for.
+    not cc.matches(">")
+  )
+  or
+  t instanceof RegExpDot and
+  result.length() = 1
+  or
+  (
+    t instanceof RegExpOpt or
+    t instanceof RegExpStar or
+    t instanceof RegExpPlus or
+    t instanceof RegExpGroup or
+    t instanceof RegExpAlt
+  ) and
+  result = getADangerousMatchedChar(t.getAChild())
+}
+
+/**
+ * Gets a dangerous prefix that is in the prefix language of `t`.
+ */
+private DangerousPrefix getADangerousMatchedPrefix(EmptyReplaceRegExpTerm t) {
+  result = getADangerousMatchedPrefixSubstring(t) and
+  not exists(EmptyReplaceRegExpTerm pred | pred = t.getPredecessor+() and not pred.isNullable())
+}
+
+/**
+ * Gets a substring of a dangerous prefix that is in the language starting at `t` (ignoring lookarounds).
+ *
+ * Note that the language of `t` is slightly restricted as not all RegExpTerm types are supported.
+ */
+private DangerousPrefixSubstring getADangerousMatchedPrefixSubstring(EmptyReplaceRegExpTerm t) {
+  result = getADangerousMatchedChar(t) + getADangerousMatchedPrefixSubstring(t.getSuccessor())
+  or
+  result = getADangerousMatchedChar(t)
+  or
+  // loop around for repetitions (only considering alphanumeric characters in the repetition)
+  exists(RepetitionMatcher repetition | t = repetition |
+    result = getADangerousMatchedPrefixSubstring(repetition) + repetition.getAChar()
+  )
+}
+
+private class RepetitionMatcher extends EmptyReplaceRegExpTerm {
+  string char;
+
+  pragma[noinline]
+  RepetitionMatcher() {
+    (this instanceof RegExpPlus or this instanceof RegExpStar) and
+    char = getADangerousMatchedChar(this.getAChild()) and
+    char.regexpMatch("\\w")
+  }
+
+  pragma[noinline]
+  string getAChar() { result = char }
+}
+
+/**
+ * Holds if `t` may match the dangerous `prefix` and some suffix, indicating intent to prevent a vulnerability of kind `kind`.
+ */
+predicate matchesDangerousPrefix(EmptyReplaceRegExpTerm t, string prefix, string kind) {
+  prefix = getADangerousMatchedPrefix(t) and
+  (
+    kind = "path injection" and
+    prefix = ["/..", "../"] and
+    // If the regex is matching explicit path components, it is unlikely that it's being used as a sanitizer.
+    not t.getSuccessor*().getAMatchedString().regexpMatch("(?is).*[a-z0-9_-].*")
+    or
+    kind = "HTML element injection" and
+    (
+      // comments
+      prefix = "<!--" and
+      // If the regex is matching explicit textual content of an HTML comment, it is unlikely that it's being used as a sanitizer.
+      not t.getSuccessor*().getAMatchedString().regexpMatch("(?is).*[a-z0-9_].*")
+      or
+      // specific tags
+      // the `cript|scrip` case has been observed in the wild several times
+      prefix = "<" + ["iframe", "script", "cript", "scrip", "style"]
+    )
+  )
+  or
+  kind = "HTML attribute injection" and
+  prefix =
+    [
+      // ordinary event handler prefix
+      "on",
+      // angular prefixes
+      "ng-", "ng:", "data-ng-", "x-ng-"
+    ] and
+  (
+    // explicit matching: `onclick` and `ng-bind`
+    t.getAMatchedString().regexpMatch("(?i)" + prefix + "[a-z]+")
+    or
+    // regexp-based matching: `on[a-z]+`
+    exists(EmptyReplaceRegExpTerm start | start = t.getAChild() |
+      start.getAMatchedString().regexpMatch("(?i)[^a-z]*" + prefix) and
+      isCommonWordMatcher(start.getSuccessor())
+    )
+  )
+}
+
+/**
+ * Holds if `t` is a common pattern for matching words
+ */
+private predicate isCommonWordMatcher(RegExpTerm t) {
+  exists(RegExpTerm quantified | quantified = t.(RegExpQuantifier).getChild(0) |
+    // [a-z]+ and similar
+    quantified
+        .(RegExpCharacterClass)
+        .getAChild()
+        .(RegExpCharacterRange)
+        .isRange(["a", "A"], ["z", "Z"])
+    or
+    // \w+ or [\w]+
+    [quantified, quantified.(RegExpCharacterClass).getAChild()]
+        .(RegExpCharacterClassEscape)
+        .getValue() = "w"
+  )
+}
--- a/javascript/ql/lib/semmle/javascript/security/IncompleteMultiCharacterSanitizationSpecific.qll
+++ b/javascript/ql/lib/semmle/javascript/security/IncompleteMultiCharacterSanitizationSpecific.qll
@@ -0,0 +1,18 @@
+/**
+ * Provides language-specific predicates for reasoning about improper multi-character sanitization.
+ */
+
+import javascript
+import semmle.javascript.security.performance.ReDoSUtil as ReDoSUtil
+
+/**
+ * A regexp term that matches substrings that should be replaced with the empty string.
+ */
+class EmptyReplaceRegExpTerm extends RegExpTerm {
+  EmptyReplaceRegExpTerm() {
+    exists(StringReplaceCall replace |
+      [replace.getRawReplacement(), replace.getCallback(1).getAReturn()].mayHaveStringValue("") and
+      this = replace.getRegExp().getRoot().getAChild*()
+    )
+  }
+}
--- a/javascript/ql/src/Security/CWE-116/IncompleteMultiCharacterSanitization.ql
+++ b/javascript/ql/src/Security/CWE-116/IncompleteMultiCharacterSanitization.ql
@@ -14,172 +14,7 @@
 */

 import javascript
-
-/**
- * A regexp term that matches substrings that should be replaced with the empty string.
- */
-class EmptyReplaceRegExpTerm extends RegExpTerm {
-  EmptyReplaceRegExpTerm() {
-    exists(StringReplaceCall replace |
-      [replace.getRawReplacement(), replace.getCallback(1).getAReturn()].mayHaveStringValue("") and
-      this = replace.getRegExp().getRoot().getAChild*()
-    )
-  }
-}
-
-/**
- * A prefix that may be dangerous to sanitize explicitly.
- *
- * Note that this class exists solely as a (necessary) optimization for this query.
- */
-class DangerousPrefix extends string {
-  DangerousPrefix() {
-    this = ["/..", "../"] or
-    this = "<!--" or
-    this = "<" + ["iframe", "script", "cript", "scrip", "style"]
-  }
-}
-
-/**
- * A substring of a prefix that may be dangerous to sanitize explicitly.
- */
-class DangerousPrefixSubstring extends string {
-  DangerousPrefixSubstring() {
-    exists(DangerousPrefix s | this = s.substring([0 .. s.length()], [0 .. s.length()]))
-  }
-}
-
-/**
- * Gets a dangerous prefix that is in the prefix language of `t`.
- */
-DangerousPrefix getADangerousMatchedPrefix(EmptyReplaceRegExpTerm t) {
-  result = getADangerousMatchedPrefixSubstring(t) and
-  not exists(EmptyReplaceRegExpTerm pred | pred = t.getPredecessor+() and not pred.isNullable())
-}
-
-private import semmle.javascript.security.regexp.NfaUtils as NfaUtils
-
-/**
- * Gets a char from a dangerous prefix that is matched by `t`.
- */
-pragma[noinline]
-DangerousPrefixSubstring getADangerousMatchedChar(EmptyReplaceRegExpTerm t) {
-  t.isNullable() and result = ""
-  or
-  t.getAMatchedString() = result
-  or
-  // A substring matched by some character class. This is only used to match the "word" part of a HTML tag (e.g. "iframe" in "<iframe").
-  exists(NfaUtils::CharacterClass cc |
-    cc = NfaUtils::getCanonicalCharClass(t) and
-    cc.matches(result) and
-    result.regexpMatch("\\w") and
-    // excluding character classes that match ">" (e.g. /<[^<]*>/), as these might consume nested HTML tags, and thus prevent the dangerous pattern this query is looking for.
-    not cc.matches(">")
-  )
-  or
-  t instanceof RegExpDot and
-  result.length() = 1
-  or
-  (
-    t instanceof RegExpOpt or
-    t instanceof RegExpStar or
-    t instanceof RegExpPlus or
-    t instanceof RegExpGroup or
-    t instanceof RegExpAlt
-  ) and
-  result = getADangerousMatchedChar(t.getAChild())
-}
-
-/**
- * Gets a substring of a dangerous prefix that is in the language starting at `t` (ignoring lookarounds).
- *
- * Note that the language of `t` is slightly restricted as not all RegExpTerm types are supported.
- */
-DangerousPrefixSubstring getADangerousMatchedPrefixSubstring(EmptyReplaceRegExpTerm t) {
-  result = getADangerousMatchedChar(t) + getADangerousMatchedPrefixSubstring(t.getSuccessor())
-  or
-  result = getADangerousMatchedChar(t)
-  or
-  // loop around for repetitions (only considering alphanumeric characters in the repetition)
-  exists(RepetitionMatcher repetition | t = repetition |
-    result = getADangerousMatchedPrefixSubstring(repetition) + repetition.getAChar()
-  )
-}
-
-class RepetitionMatcher extends EmptyReplaceRegExpTerm {
-  string char;
-
-  pragma[noinline]
-  RepetitionMatcher() {
-    (this instanceof RegExpPlus or this instanceof RegExpStar) and
-    char = getADangerousMatchedChar(this.getAChild()) and
-    char.regexpMatch("\\w")
-  }
-
-  pragma[noinline]
-  string getAChar() { result = char }
-}
-
-/**
- * Holds if `t` may match the dangerous `prefix` and some suffix, indicating intent to prevent a vulnerablity of kind `kind`.
- */
-predicate matchesDangerousPrefix(EmptyReplaceRegExpTerm t, string prefix, string kind) {
-  prefix = getADangerousMatchedPrefix(t) and
-  (
-    kind = "path injection" and
-    // upwards navigation
-    prefix = ["/..", "../"] and
-    not t.getSuccessor*().getAMatchedString().regexpMatch("(?is).*[a-z0-9_-].*") // explicit path name mentions make this an unlikely sanitizer
-    or
-    kind = "HTML element injection" and
-    (
-      // comments
-      prefix = "<!--" and
-      not t.getSuccessor*().getAMatchedString().regexpMatch("(?is).*[a-z0-9_].*") // explicit comment content mentions make this an unlikely sanitizer
-      or
-      // specific tags
-      prefix = "<" + ["iframe", "script", "cript", "scrip", "style"] // the `cript|scrip` case has been observed in the wild several times
-    )
-  )
-  or
-  kind = "HTML attribute injection" and
-  prefix =
-    [
-      // ordinary event handler prefix
-      "on",
-      // angular prefixes
-      "ng-", "ng:", "data-ng-", "x-ng-"
-    ] and
-  (
-    // explicit matching: `onclick` and `ng-bind`
-    t.getAMatchedString().regexpMatch("(?i)" + prefix + "[a-z]+")
-    or
-    // regexp-based matching: `on[a-z]+`
-    exists(EmptyReplaceRegExpTerm start | start = t.getAChild() |
-      start.getConstantValue().regexpMatch("(?i)[^a-z]*" + prefix) and
-      isCommonWordMatcher(start.getSuccessor())
-    )
-  )
-}
-
-/**
- * Holds if `t` is a common pattern for matching words
- */
-predicate isCommonWordMatcher(RegExpTerm t) {
-  exists(RegExpTerm quantified | quantified = t.(RegExpQuantifier).getChild(0) |
-    // [a-z]+ and similar
-    quantified
-        .(RegExpCharacterClass)
-        .getAChild()
-        .(RegExpCharacterRange)
-        .isRange(["a", "A"], ["z", "Z"])
-    or
-    // \w+ or [\w]+
-    [quantified, quantified.(RegExpCharacterClass).getAChild()]
-        .(RegExpCharacterClassEscape)
-        .getValue() = "w"
-  )
-}
+private import semmle.javascript.security.IncompleteMultiCharacterSanitization

 from
  StringReplaceCall replace, EmptyReplaceRegExpTerm regexp, EmptyReplaceRegExpTerm dangerous,
--- a/ruby/ql/lib/codeql/ruby/security/IncompleteMultiCharacterSanitization.qll
+++ b/ruby/ql/lib/codeql/ruby/security/IncompleteMultiCharacterSanitization.qll
@@ -0,0 +1,159 @@
+/**
+ * Provides shared predicates for reasoning about improper multi-character sanitization.
+ */
+
+import IncompleteMultiCharacterSanitizationSpecific
+
+/**
+ * A prefix that may be dangerous to sanitize explicitly.
+ *
+ * Note that this class exists solely as a (necessary) optimization for this query.
+ */
+private class DangerousPrefix extends string {
+  DangerousPrefix() {
+    this = ["/..", "../"] or
+    this = "<!--" or
+    this = "<" + ["iframe", "script", "cript", "scrip", "style"]
+  }
+}
+
+/**
+ * A substring of a prefix that may be dangerous to sanitize explicitly.
+ */
+private class DangerousPrefixSubstring extends string {
+  DangerousPrefixSubstring() {
+    exists(DangerousPrefix s | this = s.substring([0 .. s.length()], [0 .. s.length()]))
+  }
+}
+
+/**
+ * Gets a char from a dangerous prefix that is matched by `t`.
+ */
+pragma[noinline]
+private DangerousPrefixSubstring getADangerousMatchedChar(EmptyReplaceRegExpTerm t) {
+  t.isNullable() and result = ""
+  or
+  result = t.getAMatchedString()
+  or
+  // A substring matched by some character class. This is only used to match the "word" part of a HTML tag (e.g. "iframe" in "<iframe").
+  exists(ReDoSUtil::CharacterClass cc |
+    cc = ReDoSUtil::getCanonicalCharClass(t) and
+    cc.matches(result) and
+    result.regexpMatch("\\w") and
+    // excluding character classes that match ">" (e.g. /<[^<]*>/), as these might consume nested HTML tags, and thus prevent the dangerous pattern this query is looking for.
+    not cc.matches(">")
+  )
+  or
+  t instanceof RegExpDot and
+  result.length() = 1
+  or
+  (
+    t instanceof RegExpOpt or
+    t instanceof RegExpStar or
+    t instanceof RegExpPlus or
+    t instanceof RegExpGroup or
+    t instanceof RegExpAlt
+  ) and
+  result = getADangerousMatchedChar(t.getAChild())
+}
+
+/**
+ * Gets a dangerous prefix that is in the prefix language of `t`.
+ */
+private DangerousPrefix getADangerousMatchedPrefix(EmptyReplaceRegExpTerm t) {
+  result = getADangerousMatchedPrefixSubstring(t) and
+  not exists(EmptyReplaceRegExpTerm pred | pred = t.getPredecessor+() and not pred.isNullable())
+}
+
+/**
+ * Gets a substring of a dangerous prefix that is in the language starting at `t` (ignoring lookarounds).
+ *
+ * Note that the language of `t` is slightly restricted as not all RegExpTerm types are supported.
+ */
+private DangerousPrefixSubstring getADangerousMatchedPrefixSubstring(EmptyReplaceRegExpTerm t) {
+  result = getADangerousMatchedChar(t) + getADangerousMatchedPrefixSubstring(t.getSuccessor())
+  or
+  result = getADangerousMatchedChar(t)
+  or
+  // loop around for repetitions (only considering alphanumeric characters in the repetition)
+  exists(RepetitionMatcher repetition | t = repetition |
+    result = getADangerousMatchedPrefixSubstring(repetition) + repetition.getAChar()
+  )
+}
+
+private class RepetitionMatcher extends EmptyReplaceRegExpTerm {
+  string char;
+
+  pragma[noinline]
+  RepetitionMatcher() {
+    (this instanceof RegExpPlus or this instanceof RegExpStar) and
+    char = getADangerousMatchedChar(this.getAChild()) and
+    char.regexpMatch("\\w")
+  }
+
+  pragma[noinline]
+  string getAChar() { result = char }
+}
+
+/**
+ * Holds if `t` may match the dangerous `prefix` and some suffix, indicating intent to prevent a vulnerability of kind `kind`.
+ */
+predicate matchesDangerousPrefix(EmptyReplaceRegExpTerm t, string prefix, string kind) {
+  prefix = getADangerousMatchedPrefix(t) and
+  (
+    kind = "path injection" and
+    prefix = ["/..", "../"] and
+    // If the regex is matching explicit path components, it is unlikely that it's being used as a sanitizer.
+    not t.getSuccessor*().getAMatchedString().regexpMatch("(?is).*[a-z0-9_-].*")
+    or
+    kind = "HTML element injection" and
+    (
+      // comments
+      prefix = "<!--" and
+      // If the regex is matching explicit textual content of an HTML comment, it is unlikely that it's being used as a sanitizer.
+      not t.getSuccessor*().getAMatchedString().regexpMatch("(?is).*[a-z0-9_].*")
+      or
+      // specific tags
+      // the `cript|scrip` case has been observed in the wild several times
+      prefix = "<" + ["iframe", "script", "cript", "scrip", "style"]
+    )
+  )
+  or
+  kind = "HTML attribute injection" and
+  prefix =
+    [
+      // ordinary event handler prefix
+      "on",
+      // angular prefixes
+      "ng-", "ng:", "data-ng-", "x-ng-"
+    ] and
+  (
+    // explicit matching: `onclick` and `ng-bind`
+    t.getAMatchedString().regexpMatch("(?i)" + prefix + "[a-z]+")
+    or
+    // regexp-based matching: `on[a-z]+`
+    exists(EmptyReplaceRegExpTerm start | start = t.getAChild() |
+      start.getAMatchedString().regexpMatch("(?i)[^a-z]*" + prefix) and
+      isCommonWordMatcher(start.getSuccessor())
+    )
+  )
+}
+
+/**
+ * Holds if `t` is a common pattern for matching words
+ */
+private predicate isCommonWordMatcher(RegExpTerm t) {
+  exists(RegExpTerm quantified | quantified = t.(RegExpQuantifier).getChild(0) |
+    // [a-z]+ and similar
+    quantified
+        .(RegExpCharacterClass)
+        .getAChild()
+        .(RegExpCharacterRange)
+        .isRange(["a", "A"], ["z", "Z"])
+    or
+    // \w+ or [\w]+
+    [quantified, quantified.(RegExpCharacterClass).getAChild()]
+        .(RegExpCharacterClassEscape)
+        .getValue() = "w"
+  )
+}
--- a/ruby/ql/lib/codeql/ruby/security/IncompleteMultiCharacterSanitizationQuery.qll
+++ b/ruby/ql/lib/codeql/ruby/security/IncompleteMultiCharacterSanitizationQuery.qll
@@ -2,180 +2,10 @@
 * Provides predicates for reasoning about improper multi-character sanitization.
 */

-private import ruby
-private import codeql.ruby.regexp.RegExpTreeView as RETV
-private import codeql.ruby.security.performance.ReDoSUtil as ReDoSUtil
 private import codeql.ruby.DataFlow
 private import codeql.ruby.frameworks.core.String
 private import codeql.ruby.dataflow.internal.DataFlowDispatch
-
-/**
- * A regexp term that matches substrings that should be replaced with the empty string.
- */
-class EmptyReplaceRegExpTerm extends RETV::RegExpTerm {
-  private StringSubstitutionCall call;
-
-  EmptyReplaceRegExpTerm() {
-    call.getReplacementString() = "" and
-    this = call.getPatternRegExp().getRegExpTerm().getAChild*()
-  }
-
-  /**
-   * Get the substitution call that uses this regexp term.
-   */
-  StringSubstitutionCall getCall() { result = call }
-}
-
-/**
- * A prefix that may be dangerous to sanitize explicitly.
- *
- * Note that this class exists solely as a (necessary) optimization for this query.
- */
-private class DangerousPrefix extends string {
-  DangerousPrefix() {
-    this = ["/..", "../"] or
-    this = "<!--" or
-    this = "<" + ["iframe", "script", "cript", "scrip", "style"]
-  }
-}
-
-/**
- * A substring of a prefix that may be dangerous to sanitize explicitly.
- */
-private class DangerousPrefixSubstring extends string {
-  DangerousPrefixSubstring() {
-    exists(DangerousPrefix s | this = s.substring([0 .. s.length()], [0 .. s.length()]))
-  }
-}
-
-/**
- * Gets a dangerous prefix that is in the prefix language of `t`.
- */
-private DangerousPrefix getADangerousMatchedPrefix(EmptyReplaceRegExpTerm t) {
-  result = getADangerousMatchedPrefixSubstring(t) and
-  not exists(EmptyReplaceRegExpTerm pred | pred = t.getPredecessor+() and not pred.isNullable())
-}
-
-pragma[noinline]
-private DangerousPrefixSubstring getADangerousMatchedChar(EmptyReplaceRegExpTerm t) {
-  t.isNullable() and result = ""
-  or
-  result = t.getAMatchedString()
-  or
-  // A substring matched by some character class. This is only used to match the "word" part of a HTML tag (e.g. "iframe" in "<iframe").
-  exists(ReDoSUtil::CharacterClass cc |
-    cc = ReDoSUtil::getCanonicalCharClass(t) and
-    cc.matches(result) and
-    result.regexpMatch("\\w") and
-    // excluding character classes that match ">" (e.g. /<[^<]*>/), as these might consume nested HTML tags, and thus prevent the dangerous pattern this query is looking for.
-    not cc.matches(">")
-  )
-  or
-  t instanceof RETV::RegExpDot and
-  result.length() = 1
-  or
-  (
-    t instanceof RETV::RegExpOpt or
-    t instanceof RETV::RegExpStar or
-    t instanceof RETV::RegExpPlus or
-    t instanceof RETV::RegExpGroup or
-    t instanceof RETV::RegExpAlt
-  ) and
-  result = getADangerousMatchedChar(t.getAChild())
-}
-
-/**
- * Gets a substring of a dangerous prefix that is in the language starting at `t` (ignoring lookarounds).
- *
- * Note that the language of `t` is slightly restricted as not all RegExpTerm types are supported.
- */
-private DangerousPrefixSubstring getADangerousMatchedPrefixSubstring(EmptyReplaceRegExpTerm t) {
-  result = getADangerousMatchedChar(t) + getADangerousMatchedPrefixSubstring(t.getSuccessor())
-  or
-  result = getADangerousMatchedChar(t)
-  or
-  // loop around for repetitions (only considering alphanumeric characters in the repetition)
-  exists(RepetitionMatcher repetition | t = repetition |
-    result = getADangerousMatchedPrefixSubstring(repetition) + repetition.getAChar()
-  )
-}
-
-private class RepetitionMatcher extends EmptyReplaceRegExpTerm {
-  string char;
-
-  pragma[noinline]
-  RepetitionMatcher() {
-    (this instanceof RETV::RegExpPlus or this instanceof RETV::RegExpStar) and
-    char = getADangerousMatchedChar(this.getAChild()) and
-    char.regexpMatch("\\w")
-  }
-
-  pragma[noinline]
-  string getAChar() { result = char }
-}
-
-/**
- * Holds if `t` may match the dangerous `prefix` and some suffix, indicating intent to prevent a vulnerability of kind `kind`.
- */
-private predicate matchesDangerousPrefix(EmptyReplaceRegExpTerm t, string prefix, string kind) {
-  prefix = getADangerousMatchedPrefix(t) and
-  (
-    kind = "path injection" and
-    prefix = ["/..", "../"] and
-    // If the regex is matching explicit path components, it is unlikely that it's being used as a sanitizer.
-    not t.getSuccessor*().getAMatchedString().regexpMatch("(?is).*[a-z0-9_-].*")
-    or
-    kind = "HTML element injection" and
-    (
-      // comments
-      prefix = "<!--" and
-      // If the regex is matching explicit textual content of an HTML comment, it is unlikely that it's being used as a sanitizer.
-      not t.getSuccessor*().getAMatchedString().regexpMatch("(?is).*[a-z0-9_].*")
-      or
-      // specific tags
-      // the `cript|scrip` case has been observed in the wild several times
-      prefix = "<" + ["iframe", "script", "cript", "scrip", "style"]
-    )
-  )
-  or
-  kind = "HTML attribute injection" and
-  prefix =
-    [
-      // ordinary event handler prefix
-      "on",
-      // angular prefixes
-      "ng-", "ng:", "data-ng-", "x-ng-"
-    ] and
-  (
-    // explicit matching: `onclick` and `ng-bind`
-    t.getAMatchedString().regexpMatch("(?i)" + prefix + "[a-z]+")
-    or
-    // regexp-based matching: `on[a-z]+`
-    exists(EmptyReplaceRegExpTerm start | start = t.getAChild() |
-      start.getAMatchedString().regexpMatch("(?i)[^a-z]*" + prefix) and
-      isCommonWordMatcher(start.getSuccessor())
-    )
-  )
-}
-
-/**
- * Holds if `t` is a common pattern for matching words
- */
-private predicate isCommonWordMatcher(RETV::RegExpTerm t) {
-  exists(RETV::RegExpTerm quantified | quantified = t.(RETV::RegExpQuantifier).getChild(0) |
-    // [a-z]+ and similar
-    quantified
-        .(RETV::RegExpCharacterClass)
-        .getAChild()
-        .(RETV::RegExpCharacterRange)
-        .isRange(["a", "A"], ["z", "Z"])
-    or
-    // \w+ or [\w]+
-    [quantified, quantified.(RETV::RegExpCharacterClass).getAChild()]
-        .(RETV::RegExpCharacterClassEscape)
-        .getValue() = "w"
-  )
-}
+private import codeql.ruby.security.IncompleteMultiCharacterSanitization

 /**
 * Holds if `replace` has a pattern argument containing a regular expression
@@ -199,8 +29,8 @@ predicate hasResult(
      matchesDangerousPrefix(other, _, kind) and
      not other.isNullable()
    ) and
-    not exists(RETV::RegExpCaret c | regexp = c.getRootTerm()) and
-    not exists(RETV::RegExpDollar d | regexp = d.getRootTerm()) and
+    not exists(RegExpCaret c | regexp = c.getRootTerm()) and
+    not exists(RegExpDollar d | regexp = d.getRootTerm()) and
    // Don't flag replace operations that are called repeatedly in a loop, as they can actually work correctly.
    not replace.flowsTo(replace.getReceiver+())
  )
--- a/ruby/ql/lib/codeql/ruby/security/IncompleteMultiCharacterSanitizationSpecific.qll
+++ b/ruby/ql/lib/codeql/ruby/security/IncompleteMultiCharacterSanitizationSpecific.qll
@@ -0,0 +1,24 @@
+/**
+ * Provides language-specific predicates for reasoning about improper multi-character sanitization.
+ */
+
+private import codeql.ruby.frameworks.core.String
+import codeql.ruby.regexp.RegExpTreeView
+import codeql.ruby.security.performance.ReDoSUtil as ReDoSUtil
+
+/**
+ * A regexp term that matches substrings that should be replaced with the empty string.
+ */
+class EmptyReplaceRegExpTerm extends RegExpTerm {
+  private StringSubstitutionCall call;
+
+  EmptyReplaceRegExpTerm() {
+    call.getReplacementString() = "" and
+    this = call.getPatternRegExp().getRegExpTerm().getAChild*()
+  }
+
+  /**
+   * Gets the substitution call that uses this regexp term.
+   */
+  StringSubstitutionCall getCall() { result = call }
+}
--- a/ruby/ql/test/query-tests/security/cwe-116/IncompleteMultiCharacterSanitization/IncompleteMultiCharacterSanitization.ql
+++ b/ruby/ql/test/query-tests/security/cwe-116/IncompleteMultiCharacterSanitization/IncompleteMultiCharacterSanitization.ql
@@ -4,6 +4,7 @@

 import ruby
 import codeql.ruby.security.IncompleteMultiCharacterSanitizationQuery as Query
+import codeql.ruby.security.IncompleteMultiCharacterSanitizationSpecific as Specific
 import codeql.ruby.frameworks.core.String
 import TestUtilities.InlineExpectationsTest

@@ -20,7 +21,7 @@ class Test extends InlineExpectationsTest {

 predicate hasResult(Location location, string element, string value) {
  exists(
-    StringSubstitutionCall replace, Query::EmptyReplaceRegExpTerm dangerous, string prefix,
+    StringSubstitutionCall replace, Specific::EmptyReplaceRegExpTerm dangerous, string prefix,
    string kind
  |
    replace.getLocation() = location and