Merge pull request #11699 from erik-krogh/shareHost

Dynamic: Share more regexp code
2026-07-20 18:58:36 +02:00 · 2022-12-19 13:29:53 +01:00
parent 31f7702a04 6c8b1cf4be
commit d4eb2b964c
28 changed files with 753 additions and 1044 deletions
--- a/javascript/ql/lib/semmle/javascript/Regexp.qll
+++ b/javascript/ql/lib/semmle/javascript/Regexp.qll
@@ -366,6 +366,9 @@ class RegExpAnchor extends RegExpTerm, @regexp_anchor {
  override predicate isNullable() { any() }

  override string getAPrimaryQlClass() { result = "RegExpAnchor" }
+
+  /** Gets the character that denotes this anchor, such as `^` or `$`. */
+  abstract string getChar();
 }

 /**
@@ -379,6 +382,8 @@ class RegExpAnchor extends RegExpTerm, @regexp_anchor {
 */
 class RegExpCaret extends RegExpAnchor, @regexp_caret {
  override string getAPrimaryQlClass() { result = "RegExpCaret" }
+
+  override string getChar() { result = "^" }
 }

 /**
@@ -392,6 +397,8 @@ class RegExpCaret extends RegExpAnchor, @regexp_caret {
 */
 class RegExpDollar extends RegExpAnchor, @regexp_dollar {
  override string getAPrimaryQlClass() { result = "RegExpDollar" }
+
+  override string getChar() { result = "$" }
 }

 /**
@@ -999,11 +1006,12 @@ predicate isInterpretedAsRegExp(DataFlow::Node source) {
 /**
 * Provides utility predicates related to regular expressions.
 */
-module RegExpPatterns {
+deprecated module RegExpPatterns {
  /**
   * Gets a pattern that matches common top-level domain names in lower case.
+   * DEPRECATED: use the similarly named predicate from `HostnameRegexp` from the `regex` pack instead.
   */
-  string getACommonTld() {
+  deprecated string getACommonTld() {
    // according to ranking by http://google.com/search?q=site:.<<TLD>>
    result = "(?:com|org|edu|gov|uk|net|io)(?![a-z0-9])"
  }
--- a/javascript/ql/lib/semmle/javascript/security/regexp/HostnameRegexp.qll
+++ b/javascript/ql/lib/semmle/javascript/security/regexp/HostnameRegexp.qll
@@ -0,0 +1,18 @@
+/**
+ * Provides predicates for reasoning about regular expressions
+ * that match URLs and hostname patterns.
+ */
+
+private import javascript as JS
+private import semmle.javascript.security.regexp.RegExpTreeView::RegExpTreeView as TreeImpl
+private import semmle.javascript.Regexp as RegExp
+private import codeql.regex.HostnameRegexp as Shared
+
+/** An implementation of the signature that allows the Hostname analysis to run. */
+module Impl implements Shared::HostnameRegexpSig<TreeImpl> {
+  class DataFlowNode = JS::DataFlow::Node;
+
+  class RegExpPatternSource = RegExp::RegExpPatternSource;
+}
+
+import Shared::Make<TreeImpl, Impl>
--- a/javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll
+++ b/javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll
@@ -3,200 +3,5 @@
 * that match URLs and hostname patterns.
 */

-private import HostnameRegexpSpecific
-
-/**
- * Holds if the given constant is unlikely to occur in the origin part of a URL.
- */
-predicate isConstantInvalidInsideOrigin(RegExpConstant term) {
-  // Look for any of these cases:
-  // - A character that can't occur in the origin
-  // - Two dashes in a row
-  // - A colon that is not part of port or scheme separator
-  // - A slash that is not part of scheme separator
-  term.getValue().regexpMatch(".*(?:[^a-zA-Z0-9.:/-]|--|:[^0-9/]|(?<![/:]|^)/).*")
-}
-
-/** Holds if `term` is a dot constant of form `\.` or `[.]`. */
-predicate isDotConstant(RegExpTerm term) {
-  term.(RegExpCharEscape).getValue() = "."
-  or
-  exists(RegExpCharacterClass cls |
-    term = cls and
-    not cls.isInverted() and
-    cls.getNumChild() = 1 and
-    cls.getAChild().(RegExpConstant).getValue() = "."
-  )
-}
-
-/** Holds if `term` is a wildcard `.` or an actual `.` character. */
-predicate isDotLike(RegExpTerm term) {
-  term instanceof RegExpDot
-  or
-  isDotConstant(term)
-}
-
-/** Holds if `term` will only ever be matched against the beginning of the input. */
-predicate matchesBeginningOfString(RegExpTerm term) {
-  term.isRootTerm()
-  or
-  exists(RegExpTerm parent | matchesBeginningOfString(parent) |
-    term = parent.(RegExpSequence).getChild(0)
-    or
-    parent.(RegExpSequence).getChild(0) instanceof RegExpCaret and
-    term = parent.(RegExpSequence).getChild(1)
-    or
-    term = parent.(RegExpAlt).getAChild()
-    or
-    term = parent.(RegExpGroup).getAChild()
-  )
-}
-
-/**
- * Holds if the given sequence `seq` contains top-level domain preceded by a dot, such as `.com`,
- * excluding cases where this is at the very beginning of the regexp.
- *
- * `i` is bound to the index of the last child in the top-level domain part.
- */
-predicate hasTopLevelDomainEnding(RegExpSequence seq, int i) {
-  seq.getChild(i)
-      .(RegExpConstant)
-      .getValue()
-      .regexpMatch("(?i)" + RegExpPatterns::getACommonTld() + "(:\\d+)?([/?#].*)?") and
-  isDotLike(seq.getChild(i - 1)) and
-  not (i = 1 and matchesBeginningOfString(seq))
-}
-
-/**
- * Holds if the given regular expression term contains top-level domain preceded by a dot,
- * such as `.com`.
- */
-predicate hasTopLevelDomainEnding(RegExpSequence seq) { hasTopLevelDomainEnding(seq, _) }
-
-/**
- * Holds if `term` will always match a hostname, that is, all disjunctions contain
- * a hostname pattern that isn't inside a quantifier.
- */
-predicate alwaysMatchesHostname(RegExpTerm term) {
-  hasTopLevelDomainEnding(term, _)
-  or
-  // `localhost` is considered a hostname pattern, but has no TLD
-  term.(RegExpConstant).getValue().regexpMatch("\\blocalhost\\b")
-  or
-  not term instanceof RegExpAlt and
-  not term instanceof RegExpQuantifier and
-  alwaysMatchesHostname(term.getAChild())
-  or
-  alwaysMatchesHostnameAlt(term)
-}
-
-/** Holds if every child of `alt` contains a hostname pattern. */
-predicate alwaysMatchesHostnameAlt(RegExpAlt alt) {
-  alwaysMatchesHostnameAlt(alt, alt.getNumChild() - 1)
-}
-
-/**
- * Holds if the first `i` children of `alt` contains a hostname pattern.
- *
- * This is used instead of `forall` to avoid materializing the set of alternatives
- * that don't contains hostnames, which is much larger.
- */
-predicate alwaysMatchesHostnameAlt(RegExpAlt alt, int i) {
-  alwaysMatchesHostname(alt.getChild(0)) and i = 0
-  or
-  alwaysMatchesHostnameAlt(alt, i - 1) and
-  alwaysMatchesHostname(alt.getChild(i))
-}
-
-/**
- * Holds if `term` occurs inside a quantifier or alternative (and thus
- * can not be expected to correspond to a unique match), or as part of
- * a lookaround assertion (which are rarely used for capture groups).
- */
-predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
-  exists(RegExpParent parent | parent = term.getParent() |
-    parent instanceof RegExpAlt
-    or
-    parent instanceof RegExpQuantifier
-    or
-    parent instanceof RegExpSubPattern
-    or
-    isInsideChoiceOrSubPattern(parent)
-  )
-}
-
-/**
- * Holds if `group` is likely to be used as a capture group.
- */
-predicate isLikelyCaptureGroup(RegExpGroup group) {
-  group.isCapture() and
-  not isInsideChoiceOrSubPattern(group)
-}
-
-/**
- * Holds if `seq` contains two consecutive dots `..` or escaped dots.
- *
- * At least one of these dots is not intended to be a subdomain separator,
- * so we avoid flagging the pattern in this case.
- */
-predicate hasConsecutiveDots(RegExpSequence seq) {
-  exists(int i |
-    isDotLike(seq.getChild(i)) and
-    isDotLike(seq.getChild(i + 1))
-  )
-}
-
-predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) {
-  seq = regexp.getAChild*() and
-  exists(RegExpDot unescapedDot, int i, string hostname |
-    hasTopLevelDomainEnding(seq, i) and
-    not isConstantInvalidInsideOrigin(seq.getChild([0 .. i - 1]).getAChild*()) and
-    not isLikelyCaptureGroup(seq.getChild([i .. seq.getNumChild() - 1]).getAChild*()) and
-    unescapedDot = seq.getChild([0 .. i - 1]).getAChild*() and
-    unescapedDot != seq.getChild(i - 1) and // Should not be the '.' immediately before the TLD
-    not hasConsecutiveDots(unescapedDot.getParent()) and
-    hostname =
-      seq.getChild(i - 2).getRawValue() + seq.getChild(i - 1).getRawValue() +
-        seq.getChild(i).getRawValue()
-  |
-    if unescapedDot.getParent() instanceof RegExpQuantifier
-    then
-      // `.*\.example.com` can match `evil.com/?x=.example.com`
-      //
-      // This problem only occurs when the pattern is applied against a full URL, not just a hostname/origin.
-      // We therefore check if the pattern includes a suffix after the TLD, such as `.*\.example.com/`.
-      // Note that a post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL,
-      // and patterns with neither a suffix nor an anchor fall under the purview of MissingRegExpAnchor.
-      seq.getChild(0) instanceof RegExpCaret and
-      not seq.getAChild() instanceof RegExpDollar and
-      seq.getChild([i .. i + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and
-      msg =
-        "has an unrestricted wildcard '" + unescapedDot.getParent().(RegExpQuantifier).getRawValue()
-          + "' which may cause '" + hostname +
-          "' to be matched anywhere in the URL, outside the hostname."
-    else
-      msg =
-        "has an unescaped '.' before '" + hostname +
-          "', so it might match more hosts than expected."
-  )
-}
-
-predicate incompleteHostnameRegExp(
-  RegExpSequence hostSequence, string message, DataFlow::Node aux, string label
-) {
-  exists(RegExpPatternSource re, RegExpTerm regexp, string msg, string kind |
-    regexp = re.getRegExpTerm() and
-    isIncompleteHostNameRegExpPattern(regexp, hostSequence, msg) and
-    (
-      if re.getAParse() != re
-      then (
-        kind = "string, which is used as a regular expression $@," and
-        aux = re.getAParse()
-      ) else (
-        kind = "regular expression" and aux = re
-      )
-    )
-  |
-    message = "This " + kind + " " + msg and label = "here"
-  )
-}
+deprecated import semmle.javascript.security.regexp.HostnameRegexp as Dep
+import Dep
--- a/javascript/ql/src/Security/CWE-020/HostnameRegexpSpecific.qll
+++ b/javascript/ql/src/Security/CWE-020/HostnameRegexpSpecific.qll
@@ -1 +0,0 @@
-import javascript
--- a/javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql
+++ b/javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql
@@ -11,6 +11,6 @@
 *       external/cwe/cwe-020
 */

-import HostnameRegexpShared
+private import semmle.javascript.security.regexp.HostnameRegexp as HostnameRegexp

-query predicate problems = incompleteHostnameRegExp/4;
+query predicate problems = HostnameRegexp::incompleteHostnameRegExp/4;
--- a/javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.qll
+++ b/javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.qll
@@ -3,6 +3,7 @@
 */

 private import IncompleteUrlSubstringSanitizationSpecific
+private import codeql.regex.HostnameRegexp::Utils

 /**
 * A check on a string for whether it contains a given substring, possibly with restrictions on the location of the substring.
@@ -30,9 +31,7 @@ query predicate problems(
  mayHaveStringValue(substring, target) and
  (
    // target contains a domain on a common TLD, and perhaps some other URL components
-    target
-        .regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+" + RegExpPatterns::getACommonTld() +
-            "(:[0-9]+)?/?")
+    target.regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+" + getACommonTld() + "(:[0-9]+)?/?")
    or
    // target is a HTTP URL to a domain on any TLD
    target.regexpMatch("(?i)https?://([a-z0-9-]+\\.)+([a-z]+)(:[0-9]+)?/?")
--- a/javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitizationSpecific.qll
+++ b/javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitizationSpecific.qll
@@ -3,3 +3,5 @@ import semmle.javascript.dataflow.InferredTypes

 /** Holds if `node` may evaluate to `value` */
 predicate mayHaveStringValue(DataFlow::Node node, string value) { node.mayHaveStringValue(value) }
+
+import codeql.regex.HostnameRegexp::Utils
--- a/javascript/ql/src/Security/CWE-020/MissingRegExpAnchor.ql
+++ b/javascript/ql/src/Security/CWE-020/MissingRegExpAnchor.ql
@@ -11,214 +11,39 @@
 *       external/cwe/cwe-020
 */

-import javascript
-import HostnameRegexpShared
+private import javascript
+private import semmle.javascript.security.regexp.HostnameRegexp as HostnameRegexp
+private import codeql.regex.MissingRegExpAnchor as MissingRegExpAnchor
+private import semmle.javascript.security.regexp.RegExpTreeView::RegExpTreeView as TreeImpl

-/** Holds if `term` is one of the transitive left children of a regexp. */
-predicate isLeftArmTerm(RegExpTerm term) {
-  term.isRootTerm()
-  or
-  exists(RegExpTerm parent |
-    term = parent.getChild(0) and
-    isLeftArmTerm(parent)
-  )
-}
-
-/** Holds if `term` is one of the transitive right children of a regexp. */
-predicate isRightArmTerm(RegExpTerm term) {
-  term.isRootTerm()
-  or
-  exists(RegExpTerm parent |
-    term = parent.getLastChild() and
-    isRightArmTerm(parent)
-  )
-}
-
-/**
- * Holds if `term` is an anchor that is not the first or last node
- * in its tree.
- */
-predicate isInteriorAnchor(RegExpAnchor term) {
-  not isLeftArmTerm(term) and
-  not isRightArmTerm(term)
-}
-
-/**
- * Holds if `term` contains an anchor that is not the first or last node
- * in its tree, such as `(foo|bar$|baz)`.
- */
-predicate containsInteriorAnchor(RegExpTerm term) { isInteriorAnchor(term.getAChild*()) }
-
-/**
- * Holds if `term` starts with a word boundary or lookbehind assertion,
- * indicating that it's not intended to be anchored on that side.
- */
-predicate containsLeadingPseudoAnchor(RegExpSequence term) {
-  exists(RegExpTerm child | child = term.getChild(0) |
-    child instanceof RegExpWordBoundary or
-    child instanceof RegExpNonWordBoundary or
-    child instanceof RegExpLookbehind
-  )
-}
-
-/**
- * Holds if `term` ends with a word boundary or lookahead assertion,
- * indicating that it's not intended to be anchored on that side.
- */
-predicate containsTrailingPseudoAnchor(RegExpSequence term) {
-  exists(RegExpTerm child | child = term.getLastChild() |
-    child instanceof RegExpWordBoundary or
-    child instanceof RegExpNonWordBoundary or
-    child instanceof RegExpLookahead
-  )
-}
-
-/**
- * Holds if `term` is an empty sequence, usually arising from
- * literals with a trailing alternative such as `foo|`.
- */
-predicate isEmpty(RegExpSequence term) { term.getNumChild() = 0 }
-
-/**
- * Holds if `term` contains a letter constant.
- *
- * We use this as a heuristic to filter out uninteresting results.
- */
-predicate containsLetters(RegExpTerm term) {
-  term.getAChild*().(RegExpConstant).getValue().regexpMatch(".*[a-zA-Z].*")
-}
-
-/**
- * Holds if `term` consists only of an anchor and a parenthesized term,
- * such as the left side of `^(foo|bar)|baz`.
- *
- * The precedence of the anchor is likely to be intentional in this case,
- * as the group wouldn't be needed otherwise.
- */
-predicate isAnchoredGroup(RegExpSequence term) {
-  term.getNumChild() = 2 and
-  term.getAChild() instanceof RegExpAnchor and
-  term.getAChild() instanceof RegExpGroup
-}
-
-/**
- * Holds if `alt` has an explicitly anchored group, such as `^(foo|bar)|baz`
- * and doesn't have any unnecessary groups, such as in `^(foo)|(bar)`.
- */
-predicate hasExplicitAnchorPrecedence(RegExpAlt alt) {
-  isAnchoredGroup(alt.getAChild()) and
-  not alt.getAChild() instanceof RegExpGroup
-}
-
-/**
- * Holds if `src` is a pattern for a collection of alternatives where
- * only the first or last alternative is anchored, indicating a
- * precedence mistake explained by `msg`.
- *
- * The canonical example of such a mistake is: `^a|b|c`, which is
- * parsed as `(^a)|(b)|(c)`.
- */
-predicate hasMisleadingAnchorPrecedence(RegExpPatternSource src, string msg) {
-  exists(RegExpAlt root, RegExpSequence anchoredTerm, string direction |
-    root = src.getRegExpTerm() and
-    not containsInteriorAnchor(root) and
-    not isEmpty(root.getAChild()) and
-    not hasExplicitAnchorPrecedence(root) and
-    containsLetters(anchoredTerm) and
-    (
-      anchoredTerm = root.getChild(0) and
-      anchoredTerm.getChild(0) instanceof RegExpCaret and
-      not containsLeadingPseudoAnchor(root.getChild([1 .. root.getNumChild() - 1])) and
-      containsLetters(root.getChild([1 .. root.getNumChild() - 1])) and
-      direction = "beginning"
-      or
-      anchoredTerm = root.getLastChild() and
-      anchoredTerm.getLastChild() instanceof RegExpDollar and
-      not containsTrailingPseudoAnchor(root.getChild([0 .. root.getNumChild() - 2])) and
-      containsLetters(root.getChild([0 .. root.getNumChild() - 2])) and
-      direction = "end"
-    ) and
-    // is not used for replace
-    not exists(DataFlow::MethodCallNode replace |
-      replace.getMethodName() = "replace" and
-      src.getARegExpObject().flowsTo(replace.getArgument(0))
-    ) and
-    msg =
-      "Misleading operator precedence. The subexpression '" + anchoredTerm.getRawValue() +
-        "' is anchored at the " + direction +
-        ", but the other parts of this regular expression are not"
-  )
-}
-
-/**
- * Holds if `term` is a final term, that is, no term will match anything after this one.
- */
-predicate isFinalRegExpTerm(RegExpTerm term) {
-  term.isRootTerm()
-  or
-  exists(RegExpSequence seq |
-    isFinalRegExpTerm(seq) and
-    term = seq.getLastChild()
-  )
-  or
-  exists(RegExpTerm parent |
-    isFinalRegExpTerm(parent) and
-    term = parent.getAChild() and
-    not parent instanceof RegExpSequence and
-    not parent instanceof RegExpQuantifier
-  )
-}
-
-/**
- * Holds if `src` contains a hostname pattern that is missing a `$` anchor.
- */
-predicate isSemiAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
-  not hasMisleadingAnchorPrecedence(src, _) and // avoid double reporting
-  exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() |
-    not isConstantInvalidInsideOrigin(term.getAChild*()) and
-    tld = term.getAChild*() and
-    hasTopLevelDomainEnding(tld, i) and
-    isFinalRegExpTerm(tld.getChild(i)) and // nothing is matched after the TLD
-    tld.getChild(0) instanceof RegExpCaret and
-    msg =
-      "This hostname pattern may match any domain name, as it is missing a '$' or '/' at the end."
-  )
-}
-
-/**
- * Holds if `src` is an unanchored pattern for a URL, indicating a
- * mistake explained by `msg`.
- */
-predicate isUnanchoredHostnameRegExp(RegExpPatternSource src, string msg) {
-  exists(RegExpTerm term, RegExpSequence tld | term = src.getRegExpTerm() |
-    alwaysMatchesHostname(term) and
-    tld = term.getAChild*() and
-    hasTopLevelDomainEnding(tld) and
-    not isConstantInvalidInsideOrigin(term.getAChild*()) and
-    not term.getAChild*() instanceof RegExpAnchor and
-    // that is not used for capture or replace
-    not exists(DataFlow::MethodCallNode mcn, string name | name = mcn.getMethodName() |
+private module Impl implements
+MissingRegExpAnchor::MissingRegExpAnchorSig<TreeImpl, HostnameRegexp::Impl> {
+  predicate isUsedAsReplace(RegExpPatternSource pattern) {
+    // holds if `pattern` is used for capture (`exec`/`match` with a property read on the call) or as the first argument of `replace`
+    exists(DataFlow::MethodCallNode mcn, string name | name = mcn.getMethodName() |
      name = "exec" and
-      mcn = src.getARegExpObject().getAMethodCall() and
+      mcn = pattern.getARegExpObject().getAMethodCall() and
      exists(mcn.getAPropertyRead())
      or
      exists(DataFlow::Node arg |
        arg = mcn.getArgument(0) and
        (
-          src.getARegExpObject().flowsTo(arg) or
-          src.getAParse() = arg
+          pattern.getARegExpObject().flowsTo(arg) or
+          pattern.getAParse() = arg
        )
      |
        name = "replace"
        or
        name = "match" and exists(mcn.getAPropertyRead())
      )
-    ) and
-    msg =
-      "When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it."
-  )
+    )
+  }
+
+  string getEndAnchorText() { result = "$" }
 }

+import MissingRegExpAnchor::Make<TreeImpl, HostnameRegexp::Impl, Impl>
+
 from DataFlow::Node nd, string msg
 where
  isUnanchoredHostnameRegExp(nd, msg)
@@ -226,4 +51,5 @@ where
  isSemiAnchoredHostnameRegExp(nd, msg)
  or
  hasMisleadingAnchorPrecedence(nd, msg)
+// isLineAnchoredHostnameRegExp is not used here, as it is not relevant to JS.
 select nd, msg