Merge pull request #8142 from github/hmac/incomplete-multi-char-sanitization

2026-07-20 18:58:36 +02:00 · 2022-08-18 10:02:39 +12:00
parent e93ff8672c 1f4dad4167
commit 70ec70940a
15 changed files with 724 additions and 198 deletions
--- a/config/identical-files.json
+++ b/config/identical-files.json
@@ -597,5 +597,9 @@
  "Swift patterns test file": [
    "swift/ql/test/extractor-tests/patterns/patterns.swift",
    "swift/ql/test/library-tests/parent/patterns.swift"
+  ],
+  "IncompleteMultiCharacterSanitization JS/Ruby": [
+    "javascript/ql/lib/semmle/javascript/security/IncompleteMultiCharacterSanitizationQuery.qll",
+    "ruby/ql/lib/codeql/ruby/security/IncompleteMultiCharacterSanitizationQuery.qll"
  ]
 }
--- a/javascript/ql/lib/semmle/javascript/security/IncompleteMultiCharacterSanitizationQuery.qll
+++ b/javascript/ql/lib/semmle/javascript/security/IncompleteMultiCharacterSanitizationQuery.qll
@@ -0,0 +1,202 @@
+/**
+ * Provides shared predicates for reasoning about improper multi-character sanitization.
+ */
+
+import IncompleteMultiCharacterSanitizationSpecific
+
+/**
+ * A prefix that may be dangerous to sanitize explicitly.
+ *
+ * Note that this class exists solely as a (necessary) optimization for this query.
+ */
+private class DangerousPrefix extends string {
+  DangerousPrefix() {
+    this = ["/..", "../"] or
+    this = "<!--" or
+    this = "<" + ["iframe", "script", "cript", "scrip", "style"]
+  }
+}
+
+/**
+ * A substring of a prefix that may be dangerous to sanitize explicitly.
+ */
+private class DangerousPrefixSubstring extends string {
+  DangerousPrefixSubstring() {
+    exists(DangerousPrefix s | this = s.substring([0 .. s.length()], [0 .. s.length()]))
+  }
+}
+
+/**
+ * Gets a char from a dangerous prefix that is matched by `t`.
+ */
+pragma[noinline]
+private DangerousPrefixSubstring getADangerousMatchedChar(EmptyReplaceRegExpTerm t) {
+  t.isNullable() and result = ""
+  or
+  result = t.getAMatchedString()
+  or
+  // A substring matched by some character class. This is only used to match the "word" part of a HTML tag (e.g. "iframe" in "<iframe").
+  exists(NfaUtils::CharacterClass cc |
+    cc = NfaUtils::getCanonicalCharClass(t) and
+    cc.matches(result) and
+    result.regexpMatch("\\w") and
+    // excluding character classes that match ">" (e.g. /<[^<]*>/), as these might consume nested HTML tags, and thus prevent the dangerous pattern this query is looking for.
+    not cc.matches(">")
+  )
+  or
+  t instanceof RegExpDot and
+  result.length() = 1
+  or
+  (
+    t instanceof RegExpOpt or
+    t instanceof RegExpStar or
+    t instanceof RegExpPlus or
+    t instanceof RegExpGroup or
+    t instanceof RegExpAlt
+  ) and
+  result = getADangerousMatchedChar(t.getAChild())
+}
+
+/**
+ * Gets a dangerous prefix that is in the prefix language of `t`.
+ */
+private DangerousPrefix getADangerousMatchedPrefix(EmptyReplaceRegExpTerm t) {
+  result = getADangerousMatchedPrefixSubstring(t) and
+  not exists(EmptyReplaceRegExpTerm pred | pred = t.getPredecessor+() and not pred.isNullable())
+}
+
+/**
+ * Gets a substring of a dangerous prefix that is in the language starting at `t` (ignoring lookarounds).
+ *
+ * Note that the language of `t` is slightly restricted as not all RegExpTerm types are supported.
+ */
+private DangerousPrefixSubstring getADangerousMatchedPrefixSubstring(EmptyReplaceRegExpTerm t) {
+  result = getADangerousMatchedChar(t) + getADangerousMatchedPrefixSubstring(t.getSuccessor())
+  or
+  result = getADangerousMatchedChar(t)
+  or
+  // loop around for repetitions (only considering alphanumeric characters in the repetition)
+  exists(RepetitionMatcher repetition | t = repetition |
+    result = getADangerousMatchedPrefixSubstring(repetition) + repetition.getAChar()
+  )
+}
+
+private class RepetitionMatcher extends EmptyReplaceRegExpTerm {
+  string char;
+
+  pragma[noinline]
+  RepetitionMatcher() {
+    (this instanceof RegExpPlus or this instanceof RegExpStar) and
+    char = getADangerousMatchedChar(this.getAChild()) and
+    char.regexpMatch("\\w")
+  }
+
+  pragma[noinline]
+  string getAChar() { result = char }
+}
+
+/**
+ * Holds if `t` may match the dangerous `prefix` and some suffix, indicating intent to prevent a vulnerability of kind `kind`.
+ */
+predicate matchesDangerousPrefix(EmptyReplaceRegExpTerm t, string prefix, string kind) {
+  prefix = getADangerousMatchedPrefix(t) and
+  (
+    kind = "path injection" and
+    prefix = ["/..", "../"] and
+    // If the regex is matching explicit path components, it is unlikely that it's being used as a sanitizer.
+    not t.getSuccessor*().getAMatchedString().regexpMatch("(?is).*[a-z0-9_-].*")
+    or
+    kind = "HTML element injection" and
+    (
+      // comments
+      prefix = "<!--" and
+      // If the regex is matching explicit textual content of an HTML comment, it is unlikely that it's being used as a sanitizer.
+      not t.getSuccessor*().getAMatchedString().regexpMatch("(?is).*[a-z0-9_].*")
+      or
+      // specific tags
+      // the `cript|scrip` case has been observed in the wild several times
+      prefix = "<" + ["iframe", "script", "cript", "scrip", "style"]
+    )
+  )
+  or
+  kind = "HTML attribute injection" and
+  prefix =
+    [
+      // ordinary event handler prefix
+      "on",
+      // angular prefixes
+      "ng-", "ng:", "data-ng-", "x-ng-"
+    ] and
+  (
+    // explicit matching: `onclick` and `ng-bind`
+    t.getAMatchedString().regexpMatch("(?i)" + prefix + "[a-z]+")
+    or
+    // regexp-based matching: `on[a-z]+`
+    exists(EmptyReplaceRegExpTerm start | start = t.getAChild() |
+      start.getAMatchedString().regexpMatch("(?i)[^a-z]*" + prefix) and
+      isCommonWordMatcher(start.getSuccessor())
+    )
+  )
+}
+
+/**
+ * Holds if `t` is a common pattern for matching words
+ */
+private predicate isCommonWordMatcher(RegExpTerm t) {
+  exists(RegExpTerm quantified | quantified = t.(RegExpQuantifier).getChild(0) |
+    // [a-z]+ and similar
+    quantified
+        .(RegExpCharacterClass)
+        .getAChild()
+        .(RegExpCharacterRange)
+        .isRange(["a", "A"], ["z", "Z"])
+    or
+    // \w+ or [\w]+
+    [quantified, quantified.(RegExpCharacterClass).getAChild()]
+        .(RegExpCharacterClassEscape)
+        .getValue() = "w"
+  )
+}
+
+/**
+ * Holds if `replace` has a pattern argument containing a regular expression
+ * `dangerous` which matches a dangerous string beginning with `prefix`, in an
+ * attempt to avoid a vulnerability of kind `kind`.
+ */
+predicate isResult(
+  StringSubstitutionCall replace, EmptyReplaceRegExpTerm dangerous, string prefix, string kind
+) {
+  exists(EmptyReplaceRegExpTerm regexp |
+    replace = regexp.getCall() and
+    dangerous.getRootTerm() = regexp and
+    // skip leading optional elements
+    not dangerous.isNullable() and
+    // only warn about the longest match
+    prefix = max(string m | matchesDangerousPrefix(dangerous, m, kind) | m order by m.length(), m) and
+    // only warn once per kind
+    not exists(EmptyReplaceRegExpTerm other |
+      other = dangerous.getAChild+() or other = dangerous.getPredecessor+()
+    |
+      matchesDangerousPrefix(other, _, kind) and
+      not other.isNullable()
+    ) and
+    // avoid anchored terms
+    not exists(RegExpAnchor a | regexp = a.getRootTerm()) and
+    // Don't flag replace operations that are called repeatedly in a loop, as they can actually work correctly.
+    not replace.flowsTo(replace.getReceiver+())
+  )
+}
+
+/**
+ * Holds if `replace` has a pattern argument containing a regular expression
+ * `dangerous` which matches a dangerous string beginning with `prefix`. `msg`
+ * is the alert we report.
+ */
+query predicate problems(
+  StringSubstitutionCall replace, string msg, EmptyReplaceRegExpTerm dangerous, string prefix
+) {
+  exists(string kind |
+    isResult(replace, dangerous, prefix, kind) and
+    msg = "This string may still contain $@, which may cause a " + kind + " vulnerability."
+  )
+}
--- a/javascript/ql/lib/semmle/javascript/security/IncompleteMultiCharacterSanitizationSpecific.qll
+++ b/javascript/ql/lib/semmle/javascript/security/IncompleteMultiCharacterSanitizationSpecific.qll
@@ -0,0 +1,25 @@
+/**
+ * Provides language-specific predicates for reasoning about improper multi-character sanitization.
+ */
+
+import javascript
+import semmle.javascript.security.regexp.NfaUtils as NfaUtils
+
+class StringSubstitutionCall = StringReplaceCall;
+
+/**
+ * A term of a regexp used in a string replace call whose replacement (or replacement callback's return value) may be the empty string.
+ */
+class EmptyReplaceRegExpTerm extends RegExpTerm {
+  EmptyReplaceRegExpTerm() {
+    exists(StringReplaceCall replace |
+      [replace.getRawReplacement(), replace.getCallback(1).getAReturn()].mayHaveStringValue("") and
+      this = replace.getRegExp().getRoot().getAChild*()
+    )
+  }
+
+  /**
+   * Gets a substitution call whose regexp has this term as its root (non-root terms have no result).
+   */
+  StringSubstitutionCall getCall() { this = result.getRegExp().getRoot() }
+}
--- a/javascript/ql/src/Security/CWE-116/IncompleteMultiCharacterSanitization.ql
+++ b/javascript/ql/src/Security/CWE-116/IncompleteMultiCharacterSanitization.ql
@@ -13,194 +13,4 @@
 *       external/cwe/cwe-116
 */

-import javascript
-
-/**
- * A regexp term that matches substrings that should be replaced with the empty string.
- */
-class EmptyReplaceRegExpTerm extends RegExpTerm {
-  EmptyReplaceRegExpTerm() {
-    exists(StringReplaceCall replace |
-      [replace.getRawReplacement(), replace.getCallback(1).getAReturn()].mayHaveStringValue("") and
-      this = replace.getRegExp().getRoot().getAChild*()
-    )
-  }
-}
-
-/**
- * A prefix that may be dangerous to sanitize explicitly.
- *
- * Note that this class exists solely as a (necessary) optimization for this query.
- */
-class DangerousPrefix extends string {
-  DangerousPrefix() {
-    this = ["/..", "../"] or
-    this = "<!--" or
-    this = "<" + ["iframe", "script", "cript", "scrip", "style"]
-  }
-}
-
-/**
- * A substring of a prefix that may be dangerous to sanitize explicitly.
- */
-class DangerousPrefixSubstring extends string {
-  DangerousPrefixSubstring() {
-    exists(DangerousPrefix s | this = s.substring([0 .. s.length()], [0 .. s.length()]))
-  }
-}
-
-/**
- * Gets a dangerous prefix that is in the prefix language of `t`.
- */
-DangerousPrefix getADangerousMatchedPrefix(EmptyReplaceRegExpTerm t) {
-  result = getADangerousMatchedPrefixSubstring(t) and
-  not exists(EmptyReplaceRegExpTerm pred | pred = t.getPredecessor+() and not pred.isNullable())
-}
-
-private import semmle.javascript.security.regexp.NfaUtils as NfaUtils
-
-/**
- * Gets a char from a dangerous prefix that is matched by `t`.
- */
-pragma[noinline]
-DangerousPrefixSubstring getADangerousMatchedChar(EmptyReplaceRegExpTerm t) {
-  t.isNullable() and result = ""
-  or
-  t.getAMatchedString() = result
-  or
-  // A substring matched by some character class. This is only used to match the "word" part of a HTML tag (e.g. "iframe" in "<iframe").
-  exists(NfaUtils::CharacterClass cc |
-    cc = NfaUtils::getCanonicalCharClass(t) and
-    cc.matches(result) and
-    result.regexpMatch("\\w") and
-    // excluding character classes that match ">" (e.g. /<[^<]*>/), as these might consume nested HTML tags, and thus prevent the dangerous pattern this query is looking for.
-    not cc.matches(">")
-  )
-  or
-  t instanceof RegExpDot and
-  result.length() = 1
-  or
-  (
-    t instanceof RegExpOpt or
-    t instanceof RegExpStar or
-    t instanceof RegExpPlus or
-    t instanceof RegExpGroup or
-    t instanceof RegExpAlt
-  ) and
-  result = getADangerousMatchedChar(t.getAChild())
-}
-
-/**
- * Gets a substring of a dangerous prefix that is in the language starting at `t` (ignoring lookarounds).
- *
- * Note that the language of `t` is slightly restricted as not all RegExpTerm types are supported.
- */
-DangerousPrefixSubstring getADangerousMatchedPrefixSubstring(EmptyReplaceRegExpTerm t) {
-  result = getADangerousMatchedChar(t) + getADangerousMatchedPrefixSubstring(t.getSuccessor())
-  or
-  result = getADangerousMatchedChar(t)
-  or
-  // loop around for repetitions (only considering alphanumeric characters in the repetition)
-  exists(RepetitionMatcher repetition | t = repetition |
-    result = getADangerousMatchedPrefixSubstring(repetition) + repetition.getAChar()
-  )
-}
-
-class RepetitionMatcher extends EmptyReplaceRegExpTerm {
-  string char;
-
-  pragma[noinline]
-  RepetitionMatcher() {
-    (this instanceof RegExpPlus or this instanceof RegExpStar) and
-    char = getADangerousMatchedChar(this.getAChild()) and
-    char.regexpMatch("\\w")
-  }
-
-  pragma[noinline]
-  string getAChar() { result = char }
-}
-
-/**
- * Holds if `t` may match the dangerous `prefix` and some suffix, indicating intent to prevent a vulnerablity of kind `kind`.
- */
-predicate matchesDangerousPrefix(EmptyReplaceRegExpTerm t, string prefix, string kind) {
-  prefix = getADangerousMatchedPrefix(t) and
-  (
-    kind = "path injection" and
-    // upwards navigation
-    prefix = ["/..", "../"] and
-    not t.getSuccessor*().getAMatchedString().regexpMatch("(?is).*[a-z0-9_-].*") // explicit path name mentions make this an unlikely sanitizer
-    or
-    kind = "HTML element injection" and
-    (
-      // comments
-      prefix = "<!--" and
-      not t.getSuccessor*().getAMatchedString().regexpMatch("(?is).*[a-z0-9_].*") // explicit comment content mentions make this an unlikely sanitizer
-      or
-      // specific tags
-      prefix = "<" + ["iframe", "script", "cript", "scrip", "style"] // the `cript|scrip` case has been observed in the wild several times
-    )
-  )
-  or
-  kind = "HTML attribute injection" and
-  prefix =
-    [
-      // ordinary event handler prefix
-      "on",
-      // angular prefixes
-      "ng-", "ng:", "data-ng-", "x-ng-"
-    ] and
-  (
-    // explicit matching: `onclick` and `ng-bind`
-    t.getAMatchedString().regexpMatch("(?i)" + prefix + "[a-z]+")
-    or
-    // regexp-based matching: `on[a-z]+`
-    exists(EmptyReplaceRegExpTerm start | start = t.getAChild() |
-      start.getConstantValue().regexpMatch("(?i)[^a-z]*" + prefix) and
-      isCommonWordMatcher(start.getSuccessor())
-    )
-  )
-}
-
-/**
- * Holds if `t` is a common pattern for matching words
- */
-predicate isCommonWordMatcher(RegExpTerm t) {
-  exists(RegExpTerm quantified | quantified = t.(RegExpQuantifier).getChild(0) |
-    // [a-z]+ and similar
-    quantified
-        .(RegExpCharacterClass)
-        .getAChild()
-        .(RegExpCharacterRange)
-        .isRange(["a", "A"], ["z", "Z"])
-    or
-    // \w+ or [\w]+
-    [quantified, quantified.(RegExpCharacterClass).getAChild()]
-        .(RegExpCharacterClassEscape)
-        .getValue() = "w"
-  )
-}
-
-from
-  StringReplaceCall replace, EmptyReplaceRegExpTerm regexp, EmptyReplaceRegExpTerm dangerous,
-  string prefix, string kind
-where
-  regexp = replace.getRegExp().getRoot() and
-  dangerous.getRootTerm() = regexp and
-  // skip leading optional elements
-  not dangerous.isNullable() and
-  // only warn about the longest match (presumably the most descriptive)
-  prefix = max(string m | matchesDangerousPrefix(dangerous, m, kind) | m order by m.length(), m) and
-  // only warn once per kind
-  not exists(EmptyReplaceRegExpTerm other |
-    other = dangerous.getAChild+() or other = dangerous.getPredecessor+()
-  |
-    matchesDangerousPrefix(other, _, kind) and
-    not other.isNullable()
-  ) and
-  // don't flag replace operations in a loop
-  not replace.getAMethodCall*().flowsTo(replace.getReceiver()) and
-  // avoid anchored terms
-  not exists(RegExpAnchor a | regexp = a.getRootTerm())
-select replace, "This string may still contain $@, which may cause a " + kind + " vulnerability.",
-  dangerous, prefix
+import semmle.javascript.security.IncompleteMultiCharacterSanitizationQuery
--- a/javascript/ql/test/query-tests/Security/CWE-116/IncompleteSanitization/IncompleteMultiCharacterSanitization.qlref
+++ b/javascript/ql/test/query-tests/Security/CWE-116/IncompleteSanitization/IncompleteMultiCharacterSanitization.qlref
@@ -1 +1 @@
-Security/CWE-116/IncompleteMultiCharacterSanitization.ql
+Security/CWE-116/IncompleteMultiCharacterSanitization.ql
--- a/ruby/ql/lib/codeql/ruby/frameworks/core/String.qll
+++ b/ruby/ql/lib/codeql/ruby/frameworks/core/String.qll
@@ -20,6 +20,12 @@ class StringSubstitutionCall extends DataFlow::CallNode {
    this.getMethodName() = ["sub", "sub!", "gsub", "gsub!"] and
    exists(this.getReceiver()) and
-    this.getNumberOfArguments() = 2
+    // `and` binds tighter than `or`, so the two accepted call shapes must be grouped;
+    // otherwise any call with one argument and a block would match, whatever its name.
+    (
+      this.getNumberOfArguments() = 2
+      or
+      this.getNumberOfArguments() = 1 and exists(this.getBlock())
+    )
  }

  /**
@@ -45,9 +47,10 @@ class StringSubstitutionCall extends DataFlow::CallNode {
   * call, if any.
   */
  RE::RegExpPatternSource getPatternRegExp() {
-    // TODO: using local flow means we miss regexps defined as constants outside
-    // of the function scope.
    result.(DataFlow::LocalSourceNode).flowsTo(this.getPatternArgument())
+    or
+    result.asExpr().getExpr() =
+      this.getPatternArgument().asExpr().getExpr().(ConstantReadAccess).getValue()
  }

  /**
@@ -59,11 +62,19 @@ class StringSubstitutionCall extends DataFlow::CallNode {
  }

  /**
-   * Gets the string value passed as the second (replacement) argument in this
-   * call, if any.
+   * Gets the string value used to replace instances of the pattern, if any.
+   * This includes values passed explicitly as the second argument and values
+   * returned from the block, if one is given.
   */
  string getReplacementString() {
    result = this.getReplacementArgument().asExpr().getConstantValue().getString()
+    or
+    exists(DataFlow::Node blockReturnNode, DataFlow::LocalSourceNode stringNode |
+      exprNodeReturnedFrom(blockReturnNode, this.getBlock().asExpr().getExpr())
+    |
+      stringNode.flowsTo(blockReturnNode) and
+      result = stringNode.asExpr().getConstantValue().getString()
+    )
  }

  /** Gets a string that is being replaced by this call. */
@@ -77,7 +88,6 @@ class StringSubstitutionCall extends DataFlow::CallNode {
  predicate replaces(string old, string new) {
    old = this.getAReplacedString() and
    new = this.getReplacementString()
-    // TODO: handle block-variant of the call
  }
 }

--- a/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll
+++ b/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll
@@ -268,6 +268,9 @@ class RegExpTerm extends RegExpParent {

  /** Gets the primary QL class for this term. */
  override string getAPrimaryQlClass() { result = "RegExpTerm" }
+
+  /** Holds if this regular expression term can match the empty string. */
+  predicate isNullable() { none() }
 }

 /**
@@ -326,6 +329,8 @@ class RegExpStar extends InfiniteRepetitionQuantifier {
  RegExpStar() { this.getQualifier().charAt(0) = "*" }

  override string getAPrimaryQlClass() { result = "RegExpStar" }
+
+  override predicate isNullable() { any() }
 }

 /**
@@ -341,6 +346,8 @@ class RegExpPlus extends InfiniteRepetitionQuantifier {
  RegExpPlus() { this.getQualifier().charAt(0) = "+" }

  override string getAPrimaryQlClass() { result = "RegExpPlus" }
+
+  override predicate isNullable() { this.getAChild().isNullable() }
 }

 /**
@@ -356,6 +363,8 @@ class RegExpOpt extends RegExpQuantifier {
  RegExpOpt() { this.getQualifier().charAt(0) = "?" }

  override string getAPrimaryQlClass() { result = "RegExpOpt" }
+
+  override predicate isNullable() { any() }
 }

 /**
@@ -375,6 +384,8 @@ class RegExpRange extends RegExpQuantifier {

  RegExpRange() { re.multiples(part_end, end, lower, upper) }

+  override string getAPrimaryQlClass() { result = "RegExpRange" }
+
  /** Gets the string defining the upper bound of this range, if any. */
  string getUpper() { result = upper }

@@ -393,7 +404,7 @@ class RegExpRange extends RegExpQuantifier {
  /** Gets the lower bound of the range. */
  int getLowerBound() { result = this.getLower().toInt() }

-  override string getAPrimaryQlClass() { result = "RegExpRange" }
+  override predicate isNullable() { this.getAChild().isNullable() or this.getLowerBound() = 0 }
 }

 /**
@@ -440,6 +451,10 @@ class RegExpSequence extends RegExpTerm, TRegExpSequence {
  }

  override string getAPrimaryQlClass() { result = "RegExpSequence" }
+
+  override predicate isNullable() {
+    forall(RegExpTerm child | child = this.getAChild() | child.isNullable())
+  }
 }

 pragma[nomagic]
@@ -505,6 +520,8 @@ class RegExpAlt extends RegExpTerm, TRegExpAlt {
  override string getAMatchedString() { result = this.getAlternative().getAMatchedString() }

  override string getAPrimaryQlClass() { result = "RegExpAlt" }
+
+  override predicate isNullable() { this.getAChild().isNullable() }
 }

 class RegExpCharEscape = RegExpEscape;
@@ -579,6 +596,8 @@ class RegExpEscape extends RegExpNormalChar {
 */
 class RegExpWordBoundary extends RegExpSpecialChar {
  RegExpWordBoundary() { this.getChar() = "\\b" }
+
+  override predicate isNullable() { none() }
 }

 /**
@@ -607,6 +626,8 @@ class RegExpCharacterClassEscape extends RegExpEscape {
  override RegExpTerm getChild(int i) { none() }

  override string getAPrimaryQlClass() { result = "RegExpCharacterClassEscape" }
+
+  override predicate isNullable() { none() }
 }

 /**
@@ -663,6 +684,8 @@ class RegExpCharacterClass extends RegExpTerm, TRegExpCharacterClass {
  }

  override string getAPrimaryQlClass() { result = "RegExpCharacterClass" }
+
+  override predicate isNullable() { none() }
 }

 /**
@@ -702,6 +725,8 @@ class RegExpCharacterRange extends RegExpTerm, TRegExpCharacterRange {
  }

  override string getAPrimaryQlClass() { result = "RegExpCharacterRange" }
+
+  override predicate isNullable() { none() }
 }

 /**
@@ -773,6 +798,8 @@ class RegExpConstant extends RegExpTerm {
  override string getConstantValue() { result = this.getValue() }

  override string getAPrimaryQlClass() { result = "RegExpConstant" }
+
+  override predicate isNullable() { none() }
 }

 /**
@@ -820,6 +847,8 @@ class RegExpGroup extends RegExpTerm, TRegExpGroup {
  override string getAMatchedString() { result = this.getAChild().getAMatchedString() }

  override string getAPrimaryQlClass() { result = "RegExpGroup" }
+
+  override predicate isNullable() { this.getAChild().isNullable() }
 }

 /**
@@ -867,6 +896,8 @@ class RegExpDot extends RegExpSpecialChar {
  RegExpDot() { this.getChar() = "." }

  override string getAPrimaryQlClass() { result = "RegExpDot" }
+
+  override predicate isNullable() { none() }
 }

 /**
@@ -897,6 +928,8 @@ class RegExpDollar extends RegExpAnchor {
  RegExpDollar() { this.getChar() = ["$", "\\Z", "\\z"] }

  override string getAPrimaryQlClass() { result = "RegExpDollar" }
+
+  override predicate isNullable() { any() }
 }

 /**
@@ -912,6 +945,8 @@ class RegExpCaret extends RegExpAnchor {
  RegExpCaret() { this.getChar() = ["^", "\\A"] }

  override string getAPrimaryQlClass() { result = "RegExpCaret" }
+
+  override predicate isNullable() { any() }
 }

 /**
@@ -929,6 +964,8 @@ class RegExpZeroWidthMatch extends RegExpGroup {
  override RegExpTerm getChild(int i) { none() }

  override string getAPrimaryQlClass() { result = "RegExpZeroWidthMatch" }
+
+  override predicate isNullable() { any() }
 }

 /**
@@ -954,6 +991,8 @@ class RegExpSubPattern extends RegExpZeroWidthMatch {
      result.getEnd() = in_end
    )
  }
+
+  override predicate isNullable() { any() }
 }

 /**
@@ -981,6 +1020,8 @@ class RegExpPositiveLookahead extends RegExpLookahead {
  RegExpPositiveLookahead() { re.positiveLookaheadAssertionGroup(start, end) }

  override string getAPrimaryQlClass() { result = "RegExpPositiveLookahead" }
+
+  override predicate isNullable() { any() }
 }

 /**
@@ -1076,6 +1117,8 @@ class RegExpBackRef extends RegExpTerm, TRegExpBackRef {
  override RegExpTerm getChild(int i) { none() }

  override string getAPrimaryQlClass() { result = "RegExpBackRef" }
+
+  override predicate isNullable() { this.getGroup().isNullable() }
 }

 /**
--- a/ruby/ql/lib/codeql/ruby/security/IncompleteMultiCharacterSanitizationQuery.qll
+++ b/ruby/ql/lib/codeql/ruby/security/IncompleteMultiCharacterSanitizationQuery.qll
@@ -0,0 +1,202 @@
+/**
+ * Provides shared predicates for reasoning about improper multi-character sanitization.
+ */
+
+import IncompleteMultiCharacterSanitizationSpecific
+
+/**
+ * A prefix that may be dangerous to sanitize explicitly.
+ *
+ * Note that this class exists solely as a (necessary) optimization for this query.
+ */
+private class DangerousPrefix extends string {
+  DangerousPrefix() {
+    this = ["/..", "../"] or
+    this = "<!--" or
+    this = "<" + ["iframe", "script", "cript", "scrip", "style"]
+  }
+}
+
+/**
+ * A substring of a prefix that may be dangerous to sanitize explicitly.
+ */
+private class DangerousPrefixSubstring extends string {
+  DangerousPrefixSubstring() {
+    exists(DangerousPrefix s | this = s.substring([0 .. s.length()], [0 .. s.length()]))
+  }
+}
+
+/**
+ * Gets a char from a dangerous prefix that is matched by `t`.
+ */
+pragma[noinline]
+private DangerousPrefixSubstring getADangerousMatchedChar(EmptyReplaceRegExpTerm t) {
+  t.isNullable() and result = ""
+  or
+  result = t.getAMatchedString()
+  or
+  // A substring matched by some character class. This is only used to match the "word" part of a HTML tag (e.g. "iframe" in "<iframe").
+  exists(NfaUtils::CharacterClass cc |
+    cc = NfaUtils::getCanonicalCharClass(t) and
+    cc.matches(result) and
+    result.regexpMatch("\\w") and
+    // excluding character classes that match ">" (e.g. /<[^<]*>/), as these might consume nested HTML tags, and thus prevent the dangerous pattern this query is looking for.
+    not cc.matches(">")
+  )
+  or
+  t instanceof RegExpDot and
+  result.length() = 1
+  or
+  (
+    t instanceof RegExpOpt or
+    t instanceof RegExpStar or
+    t instanceof RegExpPlus or
+    t instanceof RegExpGroup or
+    t instanceof RegExpAlt
+  ) and
+  result = getADangerousMatchedChar(t.getAChild())
+}
+
+/**
+ * Gets a dangerous prefix that is in the prefix language of `t`.
+ */
+private DangerousPrefix getADangerousMatchedPrefix(EmptyReplaceRegExpTerm t) {
+  result = getADangerousMatchedPrefixSubstring(t) and
+  not exists(EmptyReplaceRegExpTerm pred | pred = t.getPredecessor+() and not pred.isNullable())
+}
+
+/**
+ * Gets a substring of a dangerous prefix that is in the language starting at `t` (ignoring lookarounds).
+ *
+ * Note that the language of `t` is slightly restricted as not all RegExpTerm types are supported.
+ */
+private DangerousPrefixSubstring getADangerousMatchedPrefixSubstring(EmptyReplaceRegExpTerm t) {
+  result = getADangerousMatchedChar(t) + getADangerousMatchedPrefixSubstring(t.getSuccessor())
+  or
+  result = getADangerousMatchedChar(t)
+  or
+  // loop around for repetitions (only considering alphanumeric characters in the repetition)
+  exists(RepetitionMatcher repetition | t = repetition |
+    result = getADangerousMatchedPrefixSubstring(repetition) + repetition.getAChar()
+  )
+}
+
+private class RepetitionMatcher extends EmptyReplaceRegExpTerm {
+  string char;
+
+  pragma[noinline]
+  RepetitionMatcher() {
+    (this instanceof RegExpPlus or this instanceof RegExpStar) and
+    char = getADangerousMatchedChar(this.getAChild()) and
+    char.regexpMatch("\\w")
+  }
+
+  pragma[noinline]
+  string getAChar() { result = char }
+}
+
+/**
+ * Holds if `t` may match the dangerous `prefix` and some suffix, indicating intent to prevent a vulnerability of kind `kind`.
+ */
+predicate matchesDangerousPrefix(EmptyReplaceRegExpTerm t, string prefix, string kind) {
+  prefix = getADangerousMatchedPrefix(t) and
+  (
+    kind = "path injection" and
+    prefix = ["/..", "../"] and
+    // If the regex is matching explicit path components, it is unlikely that it's being used as a sanitizer.
+    not t.getSuccessor*().getAMatchedString().regexpMatch("(?is).*[a-z0-9_-].*")
+    or
+    kind = "HTML element injection" and
+    (
+      // comments
+      prefix = "<!--" and
+      // If the regex is matching explicit textual content of an HTML comment, it is unlikely that it's being used as a sanitizer.
+      not t.getSuccessor*().getAMatchedString().regexpMatch("(?is).*[a-z0-9_].*")
+      or
+      // specific tags
+      // the `cript|scrip` case has been observed in the wild several times
+      prefix = "<" + ["iframe", "script", "cript", "scrip", "style"]
+    )
+  )
+  or
+  kind = "HTML attribute injection" and
+  prefix =
+    [
+      // ordinary event handler prefix
+      "on",
+      // angular prefixes
+      "ng-", "ng:", "data-ng-", "x-ng-"
+    ] and
+  (
+    // explicit matching: `onclick` and `ng-bind`
+    t.getAMatchedString().regexpMatch("(?i)" + prefix + "[a-z]+")
+    or
+    // regexp-based matching: `on[a-z]+`
+    exists(EmptyReplaceRegExpTerm start | start = t.getAChild() |
+      start.getAMatchedString().regexpMatch("(?i)[^a-z]*" + prefix) and
+      isCommonWordMatcher(start.getSuccessor())
+    )
+  )
+}
+
+/**
+ * Holds if `t` is a common pattern for matching words
+ */
+private predicate isCommonWordMatcher(RegExpTerm t) {
+  exists(RegExpTerm quantified | quantified = t.(RegExpQuantifier).getChild(0) |
+    // [a-z]+ and similar
+    quantified
+        .(RegExpCharacterClass)
+        .getAChild()
+        .(RegExpCharacterRange)
+        .isRange(["a", "A"], ["z", "Z"])
+    or
+    // \w+ or [\w]+
+    [quantified, quantified.(RegExpCharacterClass).getAChild()]
+        .(RegExpCharacterClassEscape)
+        .getValue() = "w"
+  )
+}
+
+/**
+ * Holds if `replace` has a pattern argument containing a regular expression
+ * `dangerous` which matches a dangerous string beginning with `prefix`, in an
+ * attempt to avoid a vulnerability of kind `kind`.
+ */
+predicate isResult(
+  StringSubstitutionCall replace, EmptyReplaceRegExpTerm dangerous, string prefix, string kind
+) {
+  exists(EmptyReplaceRegExpTerm regexp |
+    replace = regexp.getCall() and
+    dangerous.getRootTerm() = regexp and
+    // skip leading optional elements
+    not dangerous.isNullable() and
+    // only warn about the longest match
+    prefix = max(string m | matchesDangerousPrefix(dangerous, m, kind) | m order by m.length(), m) and
+    // only warn once per kind
+    not exists(EmptyReplaceRegExpTerm other |
+      other = dangerous.getAChild+() or other = dangerous.getPredecessor+()
+    |
+      matchesDangerousPrefix(other, _, kind) and
+      not other.isNullable()
+    ) and
+    // avoid anchored terms
+    not exists(RegExpAnchor a | regexp = a.getRootTerm()) and
+    // Don't flag replace operations that are called repeatedly in a loop, as they can actually work correctly.
+    not replace.flowsTo(replace.getReceiver+())
+  )
+}
+
+/**
+ * Holds if `replace` has a pattern argument containing a regular expression
+ * `dangerous` which matches a dangerous string beginning with `prefix`. `msg`
+ * is the alert we report.
+ */
+query predicate problems(
+  StringSubstitutionCall replace, string msg, EmptyReplaceRegExpTerm dangerous, string prefix
+) {
+  exists(string kind |
+    isResult(replace, dangerous, prefix, kind) and
+    msg = "This string may still contain $@, which may cause a " + kind + " vulnerability."
+  )
+}
--- a/ruby/ql/lib/codeql/ruby/security/IncompleteMultiCharacterSanitizationSpecific.qll
+++ b/ruby/ql/lib/codeql/ruby/security/IncompleteMultiCharacterSanitizationSpecific.qll
@@ -0,0 +1,24 @@
+/**
+ * Provides language-specific predicates for reasoning about improper multi-character sanitization.
+ */
+
+import codeql.ruby.frameworks.core.String
+import codeql.ruby.regexp.RegExpTreeView
+import codeql.ruby.security.regexp.NfaUtils as NfaUtils
+
+/**
+ * A term (the root or any nested term) of a pattern regexp of a substitution call whose replacement string is empty.
+ */
+class EmptyReplaceRegExpTerm extends RegExpTerm {
+  private StringSubstitutionCall call;
+
+  EmptyReplaceRegExpTerm() {
+    call.getReplacementString() = "" and
+    this = call.getPatternRegExp().getRegExpTerm().getAChild*()
+  }
+
+  /**
+   * Gets a substitution call with an empty replacement string whose pattern regexp contains this term.
+   */
+  StringSubstitutionCall getCall() { result = call }
+}
--- a/ruby/ql/src/change-notes/2022-07-21-incomplete-multi-character-sanitization.md
+++ b/ruby/ql/src/change-notes/2022-07-21-incomplete-multi-character-sanitization.md
@@ -0,0 +1,6 @@
+---
+category: newQuery
+---
+* Added a new query, `rb/incomplete-multi-character-sanitization`. The query
+  finds string transformations that do not replace all occurrences of a
+  multi-character substring.
--- a/ruby/ql/src/queries/security/cwe-116/IncompleteMultiCharacterSanitization.qhelp
+++ b/ruby/ql/src/queries/security/cwe-116/IncompleteMultiCharacterSanitization.qhelp
@@ -0,0 +1,8 @@
+<!DOCTYPE qhelp PUBLIC
+"-//Semmle//qhelp//EN"
+"qhelp.dtd">
+<qhelp>
+
+	<include src="IncompleteSanitization.qhelp" />
+
+</qhelp>
--- a/ruby/ql/src/queries/security/cwe-116/IncompleteMultiCharacterSanitization.ql
+++ b/ruby/ql/src/queries/security/cwe-116/IncompleteMultiCharacterSanitization.ql
@@ -0,0 +1,16 @@
+/**
+ * @name Incomplete multi-character sanitization
+ * @description A sanitizer that removes a sequence of characters may reintroduce the dangerous sequence.
+ * @kind problem
+ * @problem.severity warning
+ * @security-severity 7.8
+ * @precision high
+ * @id rb/incomplete-multi-character-sanitization
+ * @tags correctness
+ *       security
+ *       external/cwe/cwe-020
+ *       external/cwe/cwe-080
+ *       external/cwe/cwe-116
+ */
+
+import codeql.ruby.security.IncompleteMultiCharacterSanitizationQuery
--- a/ruby/ql/test/query-tests/security/cwe-116/IncompleteMultiCharacterSanitization/IncompleteMultiCharacterSanitization.expected
+++ b/ruby/ql/test/query-tests/security/cwe-116/IncompleteMultiCharacterSanitization/IncompleteMultiCharacterSanitization.expected
--- a/ruby/ql/test/query-tests/security/cwe-116/IncompleteMultiCharacterSanitization/IncompleteMultiCharacterSanitization.ql
+++ b/ruby/ql/test/query-tests/security/cwe-116/IncompleteMultiCharacterSanitization/IncompleteMultiCharacterSanitization.ql
@@ -0,0 +1,39 @@
+/**
+ * @kind problem
+ */
+
+import ruby
+import codeql.ruby.regexp.RegExpTreeView as RETV
+import codeql.ruby.DataFlow
+import codeql.ruby.security.IncompleteMultiCharacterSanitizationQuery as Query
+import TestUtilities.InlineExpectationsTest
+
+class Test extends InlineExpectationsTest {
+  Test() { this = "IncompleteMultiCharacterSanitizationTest" }
+
+  override string getARelevantTag() { result = "hasResult" } // the only tag this test reports
+
+  override predicate hasActualResult(Location location, string element, string tag, string value) {
+    hasResult(location, element, value) and // every result found by `hasResult` ...
+    tag = "hasResult" // ... is reported under the single relevant tag
+  }
+}
+
+predicate hasResult(Location location, string element, string value) { // one row per query result
+  exists(DataFlow::Node replace, RETV::RegExpTerm dangerous, string prefix, string kind |
+    Query::isResult(replace, dangerous, prefix, kind)
+  |
+    location = replace.getLocation() and
+    element = replace.toString() and
+    value = shortKind(kind)
+  )
+}
+
+bindingset[kind]
+string shortKind(string kind) { // maps an alert kind to the value reported by `hasResult`
+  result = "html" and kind = "HTML element injection"
+  or
+  result = "attr" and kind = "HTML attribute injection"
+  or
+  result = "path" and kind = "path injection"
+}
--- a/ruby/ql/test/query-tests/security/cwe-116/IncompleteMultiCharacterSanitization/incomplete_multi_character_sanitization.rb
+++ b/ruby/ql/test/query-tests/security/cwe-116/IncompleteMultiCharacterSanitization/incomplete_multi_character_sanitization.rb
@@ -0,0 +1,137 @@
+# CVE-2019-10756
+def m1(content)
+  content = content.gsub(/<.*cript.*\/scrip.*>/i, "") # $ hasResult=html
+  content = content.gsub(/ on\w+=".*"/, "") # $ hasResult=attr
+  content = content.gsub(/ on\w+=\'.*\'/, "") # $ hasResult=attr
+  content
+end
+
+def m2(content)
+  content = content.gsub(/<.*cript.*/i, "") # $ hasResult=html
+  content = content.gsub(/.on\w+=.*".*"/, "") # $ hasResult=attr
+  content = content.gsub(/.on\w+=.*\'.*\'/, "") # $ hasResult=attr
+
+  content
+end
+
+# CVE-2020-7656
+def m3(text)
+  rscript = /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/i
+  text.gsub(rscript, "") # $ hasResult=html
+  text
+end
+
+# CVE-2019-1010091
+def m4(text)
+  text.gsub(/<!--|--!?>/, "") # $ hasResult=html
+end
+
+def m5(text)
+  while /<!--|--!?>/.match?(text)
+    text = text.gsub(/<!--|--!?>/, "") # OK
+  end
+
+  text
+end
+
+# CVE-2019-10767
+def m6(id)
+  id.gsub(/\.\./, "") # OK (cannot contain '..' afterwards)
+end
+
+def m7(id)
+  id.gsub(/[\]\[*,'"`<>\\?\/]/, "") # OK (or is it?)
+end
+
+# CVE-2019-8903
+REG_TRAVEL = /(\/)?\.\.\//
+def m8(req)
+  req.url = req.url.gsub(REG_TRAVEL, "") # $ hasResult=path
+end
+
+# New cases
+
+def m9(x)
+  x = x.gsub(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/, "") # $ hasResult=html
+
+  x = x.gsub(/(\/|\s)on\w+=(\'|")?[^"]*(\'|")?/, "") # $ hasResult=attr
+
+  x = x.gsub(/<\/script>/, "") # OK
+
+  x = x.gsub(/<(.)?br(.)?>/, "") # OK
+  x = x.gsub(/<\/?b>/, "") # OK
+  x = x.gsub(/<(ul|ol)><\/(ul|ol)>/i, "") # OK
+  x = x.gsub(/<li><\/li>/i, "") # OK
+
+  x = x.gsub(/<!--(.*?)-->/m, "") # $ hasResult=html
+  x = x.gsub(/\sng-[a-z-]+/, "") # $ hasResult=attr
+  x = x.gsub(/\sng-[a-z-]+/, "") # $ hasResult=attr
+
+  x = x.gsub(/(<!--\[CDATA\[|\]\]-->)/, "\n") # OK: not a sanitizer
+
+  x = x.gsub(/<script.+desktop\-only.+<\/script>/, "") # $ SPURIOUS: hasResult=html SPURIOUS: hasResult=attr
+  x = x.gsub(/<script async.+?<\/script>/, "") # OK
+  x = x.gsub(/<!--[\s\S]*?-->|<\?(?:php)?[\s\S]*?\?>/i, "") # $ hasResult=html
+
+  x = x.gsub(/\x2E\x2E\x2F\x2E\x2E\x2F/, "") # NOT OK (matches "../../") $ hasResult=path
+
+  x = x.gsub(/<script.*>.*<\/script>/i, "") # $ hasResult=html
+
+  x = x.gsub(/^(\.\.\/?)+/, "") # OK
+
+  # NOT OK
+  x = x.gsub(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/) do |match| # $ hasResult=html
+      if unknown then match else "" end
+  end
+
+  x = x.gsub(/<\/?([a-z][a-z0-9]*)\b[^>]*>/i, "") # NOT OK [INCONSISTENCY] $ hasResult=html
+
+  x = x.gsub(/\.\./, "") # OK
+  x = x.gsub(/\.\.\//, "") # $ hasResult=path
+  x = x.gsub(/\/\.\./, "") # $ hasResult=path
+
+  x = x.gsub(/<script(.*?)>([\s\S]*?)<\/script>/i, "") # $ hasResult=html
+
+  x = x.gsub(/<(script|del)(?=[\s>])[\w\W]*?<\/\1\s*>/i, "") # $ hasResult=html
+  x = x.gsub(/\<script[\s\S]*?\>[\s\S]*?\<\/script\>/, "") # $ hasResult=html
+  x = x.gsub(/<(script|style|title)[^<]+<\/(script|style|title)>/m, "") # $ hasResult=html
+  x = x.gsub(/<script[^>]*>([\s\S]*?)<\/script>/i, "") # $ hasResult=html
+  x = x.gsub(/<script[\s\S]*?<\/script>/i, "") # $ hasResult=html
+  x = x.gsub(/ ?<!-- ?/, "") # $ hasResult=html
+  x = x.gsub(/require\('\.\.\/common'\)/, "") # OK
+  x = x.gsub(/\.\.\/\.\.\/lib\//, "") # OK
+
+  # TODO: make Rubyish
+  while x.include? "."
+    x = x
+      .gsub(/^\.\//, "")
+      .gsub(/\/\.\//, "/")
+      .gsub(/[^\/]*\/\.\.\//, "") # OK
+  end
+
+  x = x.gsub(/([^.\s]+\.)+/, "") # OK
+
+  x = x.gsub(/<!\-\-DEVEL[\d\D]*?DEVEL\-\->/, "") # OK
+
+  x = x # $ hasResult=path
+    .gsub(/^\.\//, "")
+    .gsub(/\/\.\//, "/")
+    .gsub(/[^\/]*\/\.\.\//, "")
+
+  x
+end
+
+def m10(content) # one <script> removal pattern: plain, grouped, `.+`-prefixed and `.*`-prefixed
+  content.gsub(/<script.*\/script>/i, "") # $ hasResult=html
+  content.gsub(/<(script).*\/script>/i, "") # $ hasResult=html
+  content.gsub(/.+<(script).*\/script>/i, "") # $ hasResult=html
+  content.gsub(/.*<(script).*\/script>/i, "") # $ hasResult=html
+end
+
+def m11(content)
+  content = content.gsub(/<script[\s\S]*?<\/script>/i, "") # $ hasResult=html
+  content = content.gsub(/<[a-zA-Z\/](.|\n)*?>/, '') || ' ' # $ hasResult=html
+  content = content.gsub(/<(script|iframe|video)[\s\S]*?<\/(script|iframe|video)>/, '') # $ hasResult=html
+  content = content.gsub(/<(script|iframe|video)(.|\s)*?\/(script|iframe|video)>/, '') # $ hasResult=html
+  content = content.gsub(/<[^<]*>/, "") # OK
+end