Merge pull request #11699 from erik-krogh/shareHost

Dynamic: Share more regexp code
2026-07-21 03:08:25 +02:00 · 2022-12-19 13:29:53 +01:00
parent 31f7702a04 6c8b1cf4be
commit d4eb2b964c
28 changed files with 753 additions and 1044 deletions
--- a/config/identical-files.json
+++ b/config/identical-files.json
@@ -531,11 +531,6 @@
    "ruby/ql/lib/codeql/ruby/internal/ConceptsShared.qll",
    "javascript/ql/lib/semmle/javascript/internal/ConceptsShared.qll"
  ],
-  "Hostname Regexp queries": [
-    "javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll",
-    "python/ql/src/Security/CWE-020/HostnameRegexpShared.qll",
-    "ruby/ql/src/queries/security/cwe-020/HostnameRegexpShared.qll"
-  ],
  "ApiGraphModels": [
    "javascript/ql/lib/semmle/javascript/frameworks/data/internal/ApiGraphModels.qll",
    "ruby/ql/lib/codeql/ruby/frameworks/data/internal/ApiGraphModels.qll",
--- a/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll
+++ b/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll
@@ -85,6 +85,9 @@ module Impl implements RegexTreeViewSig {

    /** Gets the associated regex. */
    abstract Regex getRegex();
+
+    /** Gets the last child term of this element. */
+    RegExpTerm getLastChild() { result = this.getChild(this.getNumChild() - 1) }
  }

  /**
@@ -558,6 +561,17 @@ module Impl implements RegexTreeViewSig {
    }
  }

+  /**
+   * A character escape in a regular expression.
+   *
+   * Example:
+   *
+   * ```
+   * \.
+   * ```
+   */
+  class RegExpCharEscape = RegExpEscape;
+
  /**
   * A word boundary, that is, a regular expression term of the form `\b`.
   */
@@ -565,6 +579,13 @@ module Impl implements RegexTreeViewSig {
    RegExpWordBoundary() { this.getChar() = "\\b" }
  }

+  /**
+   * A non-word boundary, that is, a regular expression term of the form `\B`.
+   */
+  class RegExpNonWordBoundary extends RegExpSpecialChar {
+    RegExpNonWordBoundary() { this.getChar() = "\\B" }
+  }
+
  /**
   * Gets the hex number for the `hex` char.
   */
@@ -868,6 +889,9 @@ module Impl implements RegexTreeViewSig {
    predicate isNamedGroupOfLiteral(RegExpLiteral lit, string name) {
      lit = this.getLiteral() and name = this.getName()
    }
+
+    /** Holds if this is a capture group, that is, if `getNumber()` has a result. */
+    predicate isCapture() { exists(this.getNumber()) }
  }

  /**
@@ -917,6 +941,21 @@ module Impl implements RegexTreeViewSig {
    override string getPrimaryQLClass() { result = "RegExpDot" }
  }

+  /**
+   * A term that matches a specific position between characters in the string.
+   *
+   * Example:
+   *
+   * ```
+   * ^
+   * ```
+   */
+  class RegExpAnchor extends RegExpSpecialChar {
+    RegExpAnchor() { this.getChar() = ["$", "^"] }
+
+    override string getPrimaryQLClass() { result = "RegExpAnchor" }
+  }
+
  /**
   * A dollar assertion `$` matching the end of a line.
   *
@@ -926,7 +965,7 @@ module Impl implements RegexTreeViewSig {
   * $
   * ```
   */
-  class RegExpDollar extends RegExpSpecialChar {
+  class RegExpDollar extends RegExpAnchor {
    RegExpDollar() { this.getChar() = "$" }

    override string getPrimaryQLClass() { result = "RegExpDollar" }
@@ -941,7 +980,7 @@ module Impl implements RegexTreeViewSig {
   * ^
   * ```
   */
-  class RegExpCaret extends RegExpSpecialChar {
+  class RegExpCaret extends RegExpAnchor {
    RegExpCaret() { this.getChar() = "^" }

    override string getPrimaryQLClass() { result = "RegExpCaret" }
--- a/javascript/ql/lib/semmle/javascript/Regexp.qll
+++ b/javascript/ql/lib/semmle/javascript/Regexp.qll
@@ -366,6 +366,9 @@ class RegExpAnchor extends RegExpTerm, @regexp_anchor {
  override predicate isNullable() { any() }

  override string getAPrimaryQlClass() { result = "RegExpAnchor" }
+
+  /** Gets the char for this term. */
+  abstract string getChar();
 }

 /**
@@ -379,6 +382,8 @@ class RegExpAnchor extends RegExpTerm, @regexp_anchor {
 */
 class RegExpCaret extends RegExpAnchor, @regexp_caret {
  override string getAPrimaryQlClass() { result = "RegExpCaret" }
+
+  override string getChar() { result = "^" }
 }

 /**
@@ -392,6 +397,8 @@ class RegExpCaret extends RegExpAnchor, @regexp_caret {
 */
 class RegExpDollar extends RegExpAnchor, @regexp_dollar {
  override string getAPrimaryQlClass() { result = "RegExpDollar" }
+
+  override string getChar() { result = "$" }
 }

 /**
@@ -999,11 +1006,12 @@ predicate isInterpretedAsRegExp(DataFlow::Node source) {
 /**
 * Provides utility predicates related to regular expressions.
 */
-module RegExpPatterns {
+deprecated module RegExpPatterns {
  /**
   * Gets a pattern that matches common top-level domain names in lower case.
+   * DEPRECATED: use `getACommonTld` from `codeql.regex.HostnameRegexp::Utils` in the shared `regex` pack instead.
   */
-  string getACommonTld() {
+  deprecated string getACommonTld() {
    // according to ranking by http://google.com/search?q=site:.<<TLD>>
    result = "(?:com|org|edu|gov|uk|net|io)(?![a-z0-9])"
  }
--- a/javascript/ql/lib/semmle/javascript/security/regexp/HostnameRegexp.qll
+++ b/javascript/ql/lib/semmle/javascript/security/regexp/HostnameRegexp.qll
@@ -0,0 +1,18 @@
+/**
+ * Provides predicates for reasoning about regular expressions
+ * that match URLs and hostname patterns.
+ */
+
+private import javascript as JS
+private import semmle.javascript.security.regexp.RegExpTreeView::RegExpTreeView as TreeImpl
+private import semmle.javascript.Regexp as RegExp
+private import codeql.regex.HostnameRegexp as Shared
+
+/** An implementation of the signature that allows the Hostname analysis to run. */
+module Impl implements Shared::HostnameRegexpSig<TreeImpl> {
+  class DataFlowNode = JS::DataFlow::Node;
+
+  class RegExpPatternSource = RegExp::RegExpPatternSource;
+}
+
+import Shared::Make<TreeImpl, Impl>
--- a/javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll
+++ b/javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll
@@ -3,200 +3,5 @@
 * that match URLs and hostname patterns.
 */

-private import HostnameRegexpSpecific
-
-/**
- * Holds if the given constant is unlikely to occur in the origin part of a URL.
- */
-predicate isConstantInvalidInsideOrigin(RegExpConstant term) {
-  // Look for any of these cases:
-  // - A character that can't occur in the origin
-  // - Two dashes in a row
-  // - A colon that is not part of port or scheme separator
-  // - A slash that is not part of scheme separator
-  term.getValue().regexpMatch(".*(?:[^a-zA-Z0-9.:/-]|--|:[^0-9/]|(?<![/:]|^)/).*")
-}
-
-/** Holds if `term` is a dot constant of form `\.` or `[.]`. */
-predicate isDotConstant(RegExpTerm term) {
-  term.(RegExpCharEscape).getValue() = "."
-  or
-  exists(RegExpCharacterClass cls |
-    term = cls and
-    not cls.isInverted() and
-    cls.getNumChild() = 1 and
-    cls.getAChild().(RegExpConstant).getValue() = "."
-  )
-}
-
-/** Holds if `term` is a wildcard `.` or an actual `.` character. */
-predicate isDotLike(RegExpTerm term) {
-  term instanceof RegExpDot
-  or
-  isDotConstant(term)
-}
-
-/** Holds if `term` will only ever be matched against the beginning of the input. */
-predicate matchesBeginningOfString(RegExpTerm term) {
-  term.isRootTerm()
-  or
-  exists(RegExpTerm parent | matchesBeginningOfString(parent) |
-    term = parent.(RegExpSequence).getChild(0)
-    or
-    parent.(RegExpSequence).getChild(0) instanceof RegExpCaret and
-    term = parent.(RegExpSequence).getChild(1)
-    or
-    term = parent.(RegExpAlt).getAChild()
-    or
-    term = parent.(RegExpGroup).getAChild()
-  )
-}
-
-/**
- * Holds if the given sequence `seq` contains top-level domain preceded by a dot, such as `.com`,
- * excluding cases where this is at the very beginning of the regexp.
- *
- * `i` is bound to the index of the last child in the top-level domain part.
- */
-predicate hasTopLevelDomainEnding(RegExpSequence seq, int i) {
-  seq.getChild(i)
-      .(RegExpConstant)
-      .getValue()
-      .regexpMatch("(?i)" + RegExpPatterns::getACommonTld() + "(:\\d+)?([/?#].*)?") and
-  isDotLike(seq.getChild(i - 1)) and
-  not (i = 1 and matchesBeginningOfString(seq))
-}
-
-/**
- * Holds if the given regular expression term contains top-level domain preceded by a dot,
- * such as `.com`.
- */
-predicate hasTopLevelDomainEnding(RegExpSequence seq) { hasTopLevelDomainEnding(seq, _) }
-
-/**
- * Holds if `term` will always match a hostname, that is, all disjunctions contain
- * a hostname pattern that isn't inside a quantifier.
- */
-predicate alwaysMatchesHostname(RegExpTerm term) {
-  hasTopLevelDomainEnding(term, _)
-  or
-  // `localhost` is considered a hostname pattern, but has no TLD
-  term.(RegExpConstant).getValue().regexpMatch("\\blocalhost\\b")
-  or
-  not term instanceof RegExpAlt and
-  not term instanceof RegExpQuantifier and
-  alwaysMatchesHostname(term.getAChild())
-  or
-  alwaysMatchesHostnameAlt(term)
-}
-
-/** Holds if every child of `alt` contains a hostname pattern. */
-predicate alwaysMatchesHostnameAlt(RegExpAlt alt) {
-  alwaysMatchesHostnameAlt(alt, alt.getNumChild() - 1)
-}
-
-/**
- * Holds if the first `i` children of `alt` contains a hostname pattern.
- *
- * This is used instead of `forall` to avoid materializing the set of alternatives
- * that don't contains hostnames, which is much larger.
- */
-predicate alwaysMatchesHostnameAlt(RegExpAlt alt, int i) {
-  alwaysMatchesHostname(alt.getChild(0)) and i = 0
-  or
-  alwaysMatchesHostnameAlt(alt, i - 1) and
-  alwaysMatchesHostname(alt.getChild(i))
-}
-
-/**
- * Holds if `term` occurs inside a quantifier or alternative (and thus
- * can not be expected to correspond to a unique match), or as part of
- * a lookaround assertion (which are rarely used for capture groups).
- */
-predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
-  exists(RegExpParent parent | parent = term.getParent() |
-    parent instanceof RegExpAlt
-    or
-    parent instanceof RegExpQuantifier
-    or
-    parent instanceof RegExpSubPattern
-    or
-    isInsideChoiceOrSubPattern(parent)
-  )
-}
-
-/**
- * Holds if `group` is likely to be used as a capture group.
- */
-predicate isLikelyCaptureGroup(RegExpGroup group) {
-  group.isCapture() and
-  not isInsideChoiceOrSubPattern(group)
-}
-
-/**
- * Holds if `seq` contains two consecutive dots `..` or escaped dots.
- *
- * At least one of these dots is not intended to be a subdomain separator,
- * so we avoid flagging the pattern in this case.
- */
-predicate hasConsecutiveDots(RegExpSequence seq) {
-  exists(int i |
-    isDotLike(seq.getChild(i)) and
-    isDotLike(seq.getChild(i + 1))
-  )
-}
-
-predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) {
-  seq = regexp.getAChild*() and
-  exists(RegExpDot unescapedDot, int i, string hostname |
-    hasTopLevelDomainEnding(seq, i) and
-    not isConstantInvalidInsideOrigin(seq.getChild([0 .. i - 1]).getAChild*()) and
-    not isLikelyCaptureGroup(seq.getChild([i .. seq.getNumChild() - 1]).getAChild*()) and
-    unescapedDot = seq.getChild([0 .. i - 1]).getAChild*() and
-    unescapedDot != seq.getChild(i - 1) and // Should not be the '.' immediately before the TLD
-    not hasConsecutiveDots(unescapedDot.getParent()) and
-    hostname =
-      seq.getChild(i - 2).getRawValue() + seq.getChild(i - 1).getRawValue() +
-        seq.getChild(i).getRawValue()
-  |
-    if unescapedDot.getParent() instanceof RegExpQuantifier
-    then
-      // `.*\.example.com` can match `evil.com/?x=.example.com`
-      //
-      // This problem only occurs when the pattern is applied against a full URL, not just a hostname/origin.
-      // We therefore check if the pattern includes a suffix after the TLD, such as `.*\.example.com/`.
-      // Note that a post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL,
-      // and patterns with neither a suffix nor an anchor fall under the purview of MissingRegExpAnchor.
-      seq.getChild(0) instanceof RegExpCaret and
-      not seq.getAChild() instanceof RegExpDollar and
-      seq.getChild([i .. i + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and
-      msg =
-        "has an unrestricted wildcard '" + unescapedDot.getParent().(RegExpQuantifier).getRawValue()
-          + "' which may cause '" + hostname +
-          "' to be matched anywhere in the URL, outside the hostname."
-    else
-      msg =
-        "has an unescaped '.' before '" + hostname +
-          "', so it might match more hosts than expected."
-  )
-}
-
-predicate incompleteHostnameRegExp(
-  RegExpSequence hostSequence, string message, DataFlow::Node aux, string label
-) {
-  exists(RegExpPatternSource re, RegExpTerm regexp, string msg, string kind |
-    regexp = re.getRegExpTerm() and
-    isIncompleteHostNameRegExpPattern(regexp, hostSequence, msg) and
-    (
-      if re.getAParse() != re
-      then (
-        kind = "string, which is used as a regular expression $@," and
-        aux = re.getAParse()
-      ) else (
-        kind = "regular expression" and aux = re
-      )
-    )
-  |
-    message = "This " + kind + " " + msg and label = "here"
-  )
-}
+deprecated import semmle.javascript.security.regexp.HostnameRegexp as Dep
+import Dep
--- a/javascript/ql/src/Security/CWE-020/HostnameRegexpSpecific.qll
+++ b/javascript/ql/src/Security/CWE-020/HostnameRegexpSpecific.qll
@@ -1 +0,0 @@
-import javascript
--- a/javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql
+++ b/javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql
@@ -11,6 +11,6 @@
 *       external/cwe/cwe-020
 */

-import HostnameRegexpShared
+private import semmle.javascript.security.regexp.HostnameRegexp as HostnameRegexp

-query predicate problems = incompleteHostnameRegExp/4;
+query predicate problems = HostnameRegexp::incompleteHostnameRegExp/4;
--- a/javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.qll
+++ b/javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.qll
@@ -3,6 +3,7 @@
 */

 private import IncompleteUrlSubstringSanitizationSpecific
+private import codeql.regex.HostnameRegexp::Utils

 /**
 * A check on a string for whether it contains a given substring, possibly with restrictions on the location of the substring.
@@ -30,9 +31,7 @@ query predicate problems(
  mayHaveStringValue(substring, target) and
  (
    // target contains a domain on a common TLD, and perhaps some other URL components
-    target
-        .regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+" + RegExpPatterns::getACommonTld() +
-            "(:[0-9]+)?/?")
+    target.regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+" + getACommonTld() + "(:[0-9]+)?/?")
    or
    // target is a HTTP URL to a domain on any TLD
    target.regexpMatch("(?i)https?://([a-z0-9-]+\\.)+([a-z]+)(:[0-9]+)?/?")
--- a/javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitizationSpecific.qll
+++ b/javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitizationSpecific.qll
@@ -3,3 +3,5 @@ import semmle.javascript.dataflow.InferredTypes

 /** Holds if `node` may evaluate to `value` */
 predicate mayHaveStringValue(DataFlow::Node node, string value) { node.mayHaveStringValue(value) }
+
+import codeql.regex.HostnameRegexp::Utils
--- a/javascript/ql/src/Security/CWE-020/MissingRegExpAnchor.ql
+++ b/javascript/ql/src/Security/CWE-020/MissingRegExpAnchor.ql
@@ -11,214 +11,39 @@
 *       external/cwe/cwe-020
 */

-import javascript
-import HostnameRegexpShared
+private import javascript
+private import semmle.javascript.security.regexp.HostnameRegexp as HostnameRegexp
+private import codeql.regex.MissingRegExpAnchor as MissingRegExpAnchor
+private import semmle.javascript.security.regexp.RegExpTreeView::RegExpTreeView as TreeImpl

-/** Holds if `term` is one of the transitive left children of a regexp. */
-predicate isLeftArmTerm(RegExpTerm term) {
-  term.isRootTerm()
-  or
-  exists(RegExpTerm parent |
-    term = parent.getChild(0) and
-    isLeftArmTerm(parent)
-  )
-}
-
-/** Holds if `term` is one of the transitive right children of a regexp. */
-predicate isRightArmTerm(RegExpTerm term) {
-  term.isRootTerm()
-  or
-  exists(RegExpTerm parent |
-    term = parent.getLastChild() and
-    isRightArmTerm(parent)
-  )
-}
-
-/**
- * Holds if `term` is an anchor that is not the first or last node
- * in its tree.
- */
-predicate isInteriorAnchor(RegExpAnchor term) {
-  not isLeftArmTerm(term) and
-  not isRightArmTerm(term)
-}
-
-/**
- * Holds if `term` contains an anchor that is not the first or last node
- * in its tree, such as `(foo|bar$|baz)`.
- */
-predicate containsInteriorAnchor(RegExpTerm term) { isInteriorAnchor(term.getAChild*()) }
-
-/**
- * Holds if `term` starts with a word boundary or lookbehind assertion,
- * indicating that it's not intended to be anchored on that side.
- */
-predicate containsLeadingPseudoAnchor(RegExpSequence term) {
-  exists(RegExpTerm child | child = term.getChild(0) |
-    child instanceof RegExpWordBoundary or
-    child instanceof RegExpNonWordBoundary or
-    child instanceof RegExpLookbehind
-  )
-}
-
-/**
- * Holds if `term` ends with a word boundary or lookahead assertion,
- * indicating that it's not intended to be anchored on that side.
- */
-predicate containsTrailingPseudoAnchor(RegExpSequence term) {
-  exists(RegExpTerm child | child = term.getLastChild() |
-    child instanceof RegExpWordBoundary or
-    child instanceof RegExpNonWordBoundary or
-    child instanceof RegExpLookahead
-  )
-}
-
-/**
- * Holds if `term` is an empty sequence, usually arising from
- * literals with a trailing alternative such as `foo|`.
- */
-predicate isEmpty(RegExpSequence term) { term.getNumChild() = 0 }
-
-/**
- * Holds if `term` contains a letter constant.
- *
- * We use this as a heuristic to filter out uninteresting results.
- */
-predicate containsLetters(RegExpTerm term) {
-  term.getAChild*().(RegExpConstant).getValue().regexpMatch(".*[a-zA-Z].*")
-}
-
-/**
- * Holds if `term` consists only of an anchor and a parenthesized term,
- * such as the left side of `^(foo|bar)|baz`.
- *
- * The precedence of the anchor is likely to be intentional in this case,
- * as the group wouldn't be needed otherwise.
- */
-predicate isAnchoredGroup(RegExpSequence term) {
-  term.getNumChild() = 2 and
-  term.getAChild() instanceof RegExpAnchor and
-  term.getAChild() instanceof RegExpGroup
-}
-
-/**
- * Holds if `alt` has an explicitly anchored group, such as `^(foo|bar)|baz`
- * and doesn't have any unnecessary groups, such as in `^(foo)|(bar)`.
- */
-predicate hasExplicitAnchorPrecedence(RegExpAlt alt) {
-  isAnchoredGroup(alt.getAChild()) and
-  not alt.getAChild() instanceof RegExpGroup
-}
-
-/**
- * Holds if `src` is a pattern for a collection of alternatives where
- * only the first or last alternative is anchored, indicating a
- * precedence mistake explained by `msg`.
- *
- * The canonical example of such a mistake is: `^a|b|c`, which is
- * parsed as `(^a)|(b)|(c)`.
- */
-predicate hasMisleadingAnchorPrecedence(RegExpPatternSource src, string msg) {
-  exists(RegExpAlt root, RegExpSequence anchoredTerm, string direction |
-    root = src.getRegExpTerm() and
-    not containsInteriorAnchor(root) and
-    not isEmpty(root.getAChild()) and
-    not hasExplicitAnchorPrecedence(root) and
-    containsLetters(anchoredTerm) and
-    (
-      anchoredTerm = root.getChild(0) and
-      anchoredTerm.getChild(0) instanceof RegExpCaret and
-      not containsLeadingPseudoAnchor(root.getChild([1 .. root.getNumChild() - 1])) and
-      containsLetters(root.getChild([1 .. root.getNumChild() - 1])) and
-      direction = "beginning"
-      or
-      anchoredTerm = root.getLastChild() and
-      anchoredTerm.getLastChild() instanceof RegExpDollar and
-      not containsTrailingPseudoAnchor(root.getChild([0 .. root.getNumChild() - 2])) and
-      containsLetters(root.getChild([0 .. root.getNumChild() - 2])) and
-      direction = "end"
-    ) and
-    // is not used for replace
-    not exists(DataFlow::MethodCallNode replace |
-      replace.getMethodName() = "replace" and
-      src.getARegExpObject().flowsTo(replace.getArgument(0))
-    ) and
-    msg =
-      "Misleading operator precedence. The subexpression '" + anchoredTerm.getRawValue() +
-        "' is anchored at the " + direction +
-        ", but the other parts of this regular expression are not"
-  )
-}
-
-/**
- * Holds if `term` is a final term, that is, no term will match anything after this one.
- */
-predicate isFinalRegExpTerm(RegExpTerm term) {
-  term.isRootTerm()
-  or
-  exists(RegExpSequence seq |
-    isFinalRegExpTerm(seq) and
-    term = seq.getLastChild()
-  )
-  or
-  exists(RegExpTerm parent |
-    isFinalRegExpTerm(parent) and
-    term = parent.getAChild() and
-    not parent instanceof RegExpSequence and
-    not parent instanceof RegExpQuantifier
-  )
-}
-
-/**
- * Holds if `src` contains a hostname pattern that is missing a `$` anchor.
- */
-predicate isSemiAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
-  not hasMisleadingAnchorPrecedence(src, _) and // avoid double reporting
-  exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() |
-    not isConstantInvalidInsideOrigin(term.getAChild*()) and
-    tld = term.getAChild*() and
-    hasTopLevelDomainEnding(tld, i) and
-    isFinalRegExpTerm(tld.getChild(i)) and // nothing is matched after the TLD
-    tld.getChild(0) instanceof RegExpCaret and
-    msg =
-      "This hostname pattern may match any domain name, as it is missing a '$' or '/' at the end."
-  )
-}
-
-/**
- * Holds if `src` is an unanchored pattern for a URL, indicating a
- * mistake explained by `msg`.
- */
-predicate isUnanchoredHostnameRegExp(RegExpPatternSource src, string msg) {
-  exists(RegExpTerm term, RegExpSequence tld | term = src.getRegExpTerm() |
-    alwaysMatchesHostname(term) and
-    tld = term.getAChild*() and
-    hasTopLevelDomainEnding(tld) and
-    not isConstantInvalidInsideOrigin(term.getAChild*()) and
-    not term.getAChild*() instanceof RegExpAnchor and
-    // that is not used for capture or replace
-    not exists(DataFlow::MethodCallNode mcn, string name | name = mcn.getMethodName() |
+private module Impl implements
+MissingRegExpAnchor::MissingRegExpAnchorSig<TreeImpl, HostnameRegexp::Impl> {
+  predicate isUsedAsReplace(RegExpPatternSource pattern) {
+    // holds if the pattern is used for capture (an `exec` or `match` call that has a property read) or is passed to `replace`
+    exists(DataFlow::MethodCallNode mcn, string name | name = mcn.getMethodName() |
      name = "exec" and
-      mcn = src.getARegExpObject().getAMethodCall() and
+      mcn = pattern.getARegExpObject().getAMethodCall() and
      exists(mcn.getAPropertyRead())
      or
      exists(DataFlow::Node arg |
        arg = mcn.getArgument(0) and
        (
-          src.getARegExpObject().flowsTo(arg) or
-          src.getAParse() = arg
+          pattern.getARegExpObject().flowsTo(arg) or
+          pattern.getAParse() = arg
        )
      |
        name = "replace"
        or
        name = "match" and exists(mcn.getAPropertyRead())
      )
-    ) and
-    msg =
-      "When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it."
-  )
+    )
+  }
+
+  string getEndAnchorText() { result = "$" }
 }

+import MissingRegExpAnchor::Make<TreeImpl, HostnameRegexp::Impl, Impl>
+
 from DataFlow::Node nd, string msg
 where
  isUnanchoredHostnameRegExp(nd, msg)
@@ -226,4 +51,5 @@ where
  isSemiAnchoredHostnameRegExp(nd, msg)
  or
  hasMisleadingAnchorPrecedence(nd, msg)
+// isLineAnchoredHostnameRegExp is not used here, as it is not relevant to JS.
 select nd, msg
--- a/python/ql/lib/semmle/python/RegexTreeView.qll
+++ b/python/ql/lib/semmle/python/RegexTreeView.qll
@@ -102,6 +102,9 @@ module Impl implements RegexTreeViewSig {
    /** Gets the number of child terms. */
    int getNumChild() { result = count(this.getAChild()) }

+    /** Gets the last child term of this element. */
+    RegExpTerm getLastChild() { result = this.getChild(this.getNumChild() - 1) }
+
    /** Gets the associated regex. */
    abstract Regex getRegex();
  }
@@ -454,7 +457,16 @@ module Impl implements RegexTreeViewSig {
    override string getPrimaryQLClass() { result = "RegExpAlt" }
  }

-  additional class RegExpCharEscape = RegExpEscape;
+  /**
+   * A character escape in a regular expression.
+   *
+   * Example:
+   *
+   * ```
+   * \.
+   * ```
+   */
+  class RegExpCharEscape = RegExpEscape;

  /**
   * An escaped regular expression term, that is, a regular expression
@@ -561,6 +573,13 @@ module Impl implements RegexTreeViewSig {
    RegExpWordBoundary() { this.getChar() = "\\b" }
  }

+  /**
+   * A non-word boundary, that is, a regular expression term of the form `\B`.
+   */
+  class RegExpNonWordBoundary extends RegExpSpecialChar {
+    RegExpNonWordBoundary() { this.getChar() = "\\B" }
+  }
+
  /**
   * A character class escape in a regular expression.
   * That is, an escaped character that denotes multiple characters.
@@ -829,6 +848,19 @@ module Impl implements RegexTreeViewSig {
    override string getPrimaryQLClass() { result = "RegExpDot" }
  }

+  /**
+   * A term that matches a specific position between characters in the string.
+   *
+   * Example:
+   *
+   * ```
+   * \A
+   * ```
+   */
+  class RegExpAnchor extends RegExpSpecialChar {
+    RegExpAnchor() { this.getChar() = ["\\A", "^", "$", "\\Z"] }
+  }
+
  /**
   * A dollar assertion `$` or `\Z` matching the end of a line.
   *
@@ -838,7 +870,7 @@ module Impl implements RegexTreeViewSig {
   * $
   * ```
   */
-  class RegExpDollar extends RegExpSpecialChar {
+  class RegExpDollar extends RegExpAnchor {
    RegExpDollar() { this.getChar() = ["$", "\\Z"] }

    override string getPrimaryQLClass() { result = "RegExpDollar" }
@@ -853,7 +885,7 @@ module Impl implements RegexTreeViewSig {
   * ^
   * ```
   */
-  class RegExpCaret extends RegExpSpecialChar {
+  class RegExpCaret extends RegExpAnchor {
    RegExpCaret() { this.getChar() = ["^", "\\A"] }

    override string getPrimaryQLClass() { result = "RegExpCaret" }
--- a/python/ql/lib/semmle/python/dataflow/new/Regexp.qll
+++ b/python/ql/lib/semmle/python/dataflow/new/Regexp.qll
@@ -9,11 +9,12 @@ private import semmle.python.dataflow.new.DataFlow
 /**
 * Provides utility predicates related to regular expressions.
 */
-module RegExpPatterns {
+deprecated module RegExpPatterns {
  /**
   * Gets a pattern that matches common top-level domain names in lower case.
+   * DEPRECATED: use `getACommonTld` from `codeql.regex.HostnameRegexp::Utils` in the shared `regex` pack instead.
   */
-  string getACommonTld() {
+  deprecated string getACommonTld() {
    // according to ranking by http://google.com/search?q=site:.<<TLD>>
    result = "(?:com|org|edu|gov|uk|net|io)(?![a-z0-9])"
  }
--- a/python/ql/lib/semmle/python/security/regexp/HostnameRegex.qll
+++ b/python/ql/lib/semmle/python/security/regexp/HostnameRegex.qll
@@ -0,0 +1,18 @@
+/**
+ * Provides predicates for reasoning about regular expressions
+ * that match URLs and hostname patterns.
+ */
+
+private import python
+private import semmle.python.dataflow.new.DataFlow
+private import semmle.python.RegexTreeView::RegexTreeView as TreeImpl
+private import semmle.python.dataflow.new.Regexp as Regexp
+private import codeql.regex.HostnameRegexp as Shared
+
+private module Impl implements Shared::HostnameRegexpSig<TreeImpl> {
+  class DataFlowNode = DataFlow::Node;
+
+  class RegExpPatternSource = Regexp::RegExpPatternSource;
+}
+
+import Shared::Make<TreeImpl, Impl>
--- a/python/ql/src/Security/CWE-020/HostnameRegexpShared.qll
+++ b/python/ql/src/Security/CWE-020/HostnameRegexpShared.qll
@@ -3,200 +3,6 @@
 * that match URLs and hostname patterns.
 */

-private import HostnameRegexpSpecific
-
-/**
- * Holds if the given constant is unlikely to occur in the origin part of a URL.
- */
-predicate isConstantInvalidInsideOrigin(RegExpConstant term) {
-  // Look for any of these cases:
-  // - A character that can't occur in the origin
-  // - Two dashes in a row
-  // - A colon that is not part of port or scheme separator
-  // - A slash that is not part of scheme separator
-  term.getValue().regexpMatch(".*(?:[^a-zA-Z0-9.:/-]|--|:[^0-9/]|(?<![/:]|^)/).*")
-}
-
-/** Holds if `term` is a dot constant of form `\.` or `[.]`. */
-predicate isDotConstant(RegExpTerm term) {
-  term.(RegExpCharEscape).getValue() = "."
-  or
-  exists(RegExpCharacterClass cls |
-    term = cls and
-    not cls.isInverted() and
-    cls.getNumChild() = 1 and
-    cls.getAChild().(RegExpConstant).getValue() = "."
-  )
-}
-
-/** Holds if `term` is a wildcard `.` or an actual `.` character. */
-predicate isDotLike(RegExpTerm term) {
-  term instanceof RegExpDot
-  or
-  isDotConstant(term)
-}
-
-/** Holds if `term` will only ever be matched against the beginning of the input. */
-predicate matchesBeginningOfString(RegExpTerm term) {
-  term.isRootTerm()
-  or
-  exists(RegExpTerm parent | matchesBeginningOfString(parent) |
-    term = parent.(RegExpSequence).getChild(0)
-    or
-    parent.(RegExpSequence).getChild(0) instanceof RegExpCaret and
-    term = parent.(RegExpSequence).getChild(1)
-    or
-    term = parent.(RegExpAlt).getAChild()
-    or
-    term = parent.(RegExpGroup).getAChild()
-  )
-}
-
-/**
- * Holds if the given sequence `seq` contains top-level domain preceded by a dot, such as `.com`,
- * excluding cases where this is at the very beginning of the regexp.
- *
- * `i` is bound to the index of the last child in the top-level domain part.
- */
-predicate hasTopLevelDomainEnding(RegExpSequence seq, int i) {
-  seq.getChild(i)
-      .(RegExpConstant)
-      .getValue()
-      .regexpMatch("(?i)" + RegExpPatterns::getACommonTld() + "(:\\d+)?([/?#].*)?") and
-  isDotLike(seq.getChild(i - 1)) and
-  not (i = 1 and matchesBeginningOfString(seq))
-}
-
-/**
- * Holds if the given regular expression term contains top-level domain preceded by a dot,
- * such as `.com`.
- */
-predicate hasTopLevelDomainEnding(RegExpSequence seq) { hasTopLevelDomainEnding(seq, _) }
-
-/**
- * Holds if `term` will always match a hostname, that is, all disjunctions contain
- * a hostname pattern that isn't inside a quantifier.
- */
-predicate alwaysMatchesHostname(RegExpTerm term) {
-  hasTopLevelDomainEnding(term, _)
-  or
-  // `localhost` is considered a hostname pattern, but has no TLD
-  term.(RegExpConstant).getValue().regexpMatch("\\blocalhost\\b")
-  or
-  not term instanceof RegExpAlt and
-  not term instanceof RegExpQuantifier and
-  alwaysMatchesHostname(term.getAChild())
-  or
-  alwaysMatchesHostnameAlt(term)
-}
-
-/** Holds if every child of `alt` contains a hostname pattern. */
-predicate alwaysMatchesHostnameAlt(RegExpAlt alt) {
-  alwaysMatchesHostnameAlt(alt, alt.getNumChild() - 1)
-}
-
-/**
- * Holds if the first `i` children of `alt` contains a hostname pattern.
- *
- * This is used instead of `forall` to avoid materializing the set of alternatives
- * that don't contains hostnames, which is much larger.
- */
-predicate alwaysMatchesHostnameAlt(RegExpAlt alt, int i) {
-  alwaysMatchesHostname(alt.getChild(0)) and i = 0
-  or
-  alwaysMatchesHostnameAlt(alt, i - 1) and
-  alwaysMatchesHostname(alt.getChild(i))
-}
-
-/**
- * Holds if `term` occurs inside a quantifier or alternative (and thus
- * can not be expected to correspond to a unique match), or as part of
- * a lookaround assertion (which are rarely used for capture groups).
- */
-predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
-  exists(RegExpParent parent | parent = term.getParent() |
-    parent instanceof RegExpAlt
-    or
-    parent instanceof RegExpQuantifier
-    or
-    parent instanceof RegExpSubPattern
-    or
-    isInsideChoiceOrSubPattern(parent)
-  )
-}
-
-/**
- * Holds if `group` is likely to be used as a capture group.
- */
-predicate isLikelyCaptureGroup(RegExpGroup group) {
-  group.isCapture() and
-  not isInsideChoiceOrSubPattern(group)
-}
-
-/**
- * Holds if `seq` contains two consecutive dots `..` or escaped dots.
- *
- * At least one of these dots is not intended to be a subdomain separator,
- * so we avoid flagging the pattern in this case.
- */
-predicate hasConsecutiveDots(RegExpSequence seq) {
-  exists(int i |
-    isDotLike(seq.getChild(i)) and
-    isDotLike(seq.getChild(i + 1))
-  )
-}
-
-predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) {
-  seq = regexp.getAChild*() and
-  exists(RegExpDot unescapedDot, int i, string hostname |
-    hasTopLevelDomainEnding(seq, i) and
-    not isConstantInvalidInsideOrigin(seq.getChild([0 .. i - 1]).getAChild*()) and
-    not isLikelyCaptureGroup(seq.getChild([i .. seq.getNumChild() - 1]).getAChild*()) and
-    unescapedDot = seq.getChild([0 .. i - 1]).getAChild*() and
-    unescapedDot != seq.getChild(i - 1) and // Should not be the '.' immediately before the TLD
-    not hasConsecutiveDots(unescapedDot.getParent()) and
-    hostname =
-      seq.getChild(i - 2).getRawValue() + seq.getChild(i - 1).getRawValue() +
-        seq.getChild(i).getRawValue()
-  |
-    if unescapedDot.getParent() instanceof RegExpQuantifier
-    then
-      // `.*\.example.com` can match `evil.com/?x=.example.com`
-      //
-      // This problem only occurs when the pattern is applied against a full URL, not just a hostname/origin.
-      // We therefore check if the pattern includes a suffix after the TLD, such as `.*\.example.com/`.
-      // Note that a post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL,
-      // and patterns with neither a suffix nor an anchor fall under the purview of MissingRegExpAnchor.
-      seq.getChild(0) instanceof RegExpCaret and
-      not seq.getAChild() instanceof RegExpDollar and
-      seq.getChild([i .. i + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and
-      msg =
-        "has an unrestricted wildcard '" + unescapedDot.getParent().(RegExpQuantifier).getRawValue()
-          + "' which may cause '" + hostname +
-          "' to be matched anywhere in the URL, outside the hostname."
-    else
-      msg =
-        "has an unescaped '.' before '" + hostname +
-          "', so it might match more hosts than expected."
-  )
-}
-
-predicate incompleteHostnameRegExp(
-  RegExpSequence hostSequence, string message, DataFlow::Node aux, string label
-) {
-  exists(RegExpPatternSource re, RegExpTerm regexp, string msg, string kind |
-    regexp = re.getRegExpTerm() and
-    isIncompleteHostNameRegExpPattern(regexp, hostSequence, msg) and
-    (
-      if re.getAParse() != re
-      then (
-        kind = "string, which is used as a regular expression $@," and
-        aux = re.getAParse()
-      ) else (
-        kind = "regular expression" and aux = re
-      )
-    )
-  |
-    message = "This " + kind + " " + msg and label = "here"
-  )
-}
+// Deprecated shim: import `semmle.python.security.regexp.HostnameRegex` (which instantiates the shared regex pack) instead of this file.
+deprecated private import semmle.python.security.regexp.HostnameRegex as Dep
+import Dep
--- a/python/ql/src/Security/CWE-020/HostnameRegexpSpecific.qll
+++ b/python/ql/src/Security/CWE-020/HostnameRegexpSpecific.qll
@@ -1,3 +0,0 @@
-import semmle.python.RegexTreeView
-import semmle.python.dataflow.new.DataFlow
-import semmle.python.dataflow.new.Regexp
--- a/python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql
+++ b/python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql
@@ -11,6 +11,6 @@
 *       external/cwe/cwe-020
 */

-import HostnameRegexpShared
+private import semmle.python.security.regexp.HostnameRegex as HostnameRegex

-query predicate problems = incompleteHostnameRegExp/4;
+query predicate problems = HostnameRegex::incompleteHostnameRegExp/4;
--- a/ruby/ql/lib/codeql/ruby/Regexp.qll
+++ b/ruby/ql/lib/codeql/ruby/Regexp.qll
@@ -15,11 +15,12 @@ private import codeql.ruby.ApiGraphs
 /**
 * Provides utility predicates related to regular expressions.
 */
-module RegExpPatterns {
+deprecated module RegExpPatterns {
  /**
   * Gets a pattern that matches common top-level domain names in lower case.
+   * DEPRECATED: use the similarly named predicate from `HostnameRegexp` in the `regex` pack instead.
   */
-  string getACommonTld() {
+  deprecated string getACommonTld() {
    // according to ranking by http://google.com/search?q=site:.<<TLD>>
    result = "(?:com|org|edu|gov|uk|net|io)(?![a-z0-9])"
  }
--- a/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll
+++ b/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll
@@ -539,7 +539,16 @@ private module Impl implements RegexTreeViewSig {
    override predicate isNullable() { this.getAChild().isNullable() }
  }

-  additional class RegExpCharEscape = RegExpEscape;
+  /**
+   * A character escape in a regular expression.
+   *
+   * Example:
+   *
+   * ```
+   * \.
+   * ```
+   */
+  class RegExpCharEscape = RegExpEscape;

  /**
   * An escaped regular expression term, that is, a regular expression
@@ -620,7 +629,7 @@ private module Impl implements RegexTreeViewSig {
  /**
   * A non-word boundary, that is, a regular expression term of the form `\B`.
   */
-  additional class RegExpNonWordBoundary extends RegExpSpecialChar {
+  class RegExpNonWordBoundary extends RegExpSpecialChar {
    RegExpNonWordBoundary() { this.getChar() = "\\B" }

    override string getAPrimaryQlClass() { result = "RegExpNonWordBoundary" }
@@ -926,7 +935,7 @@ private module Impl implements RegexTreeViewSig {
   * \A
   * ```
   */
-  additional class RegExpAnchor extends RegExpSpecialChar {
+  class RegExpAnchor extends RegExpSpecialChar {
    RegExpAnchor() { this.getChar() = ["^", "$", "\\A", "\\Z", "\\z"] }

    override string getAPrimaryQlClass() { result = "RegExpAnchor" }
--- a/ruby/ql/lib/codeql/ruby/security/regexp/HostnameRegexp.qll
+++ b/ruby/ql/lib/codeql/ruby/security/regexp/HostnameRegexp.qll
@@ -0,0 +1,18 @@
+/**
+ * Provides predicates for reasoning about regular expressions
+ * that match URLs and hostname patterns.
+ */
+
+private import ruby
+private import codeql.ruby.regexp.RegExpTreeView::RegexTreeView as TreeImpl
+private import codeql.ruby.Regexp as Regexp
+private import codeql.regex.HostnameRegexp as Shared
+
+/** An implementation of the signature that allows the Hostname analysis to run. */
+module Impl implements Shared::HostnameRegexpSig<TreeImpl> {
+  class DataFlowNode = DataFlow::Node;
+
+  class RegExpPatternSource = Regexp::RegExpPatternSource;
+}
+
+import Shared::Make<TreeImpl, Impl>
--- a/ruby/ql/src/queries/security/cwe-020/HostnameRegexpShared.qll
+++ b/ruby/ql/src/queries/security/cwe-020/HostnameRegexpShared.qll
@@ -3,200 +3,6 @@
 * that match URLs and hostname patterns.
 */

-private import HostnameRegexpSpecific
-
-/**
- * Holds if the given constant is unlikely to occur in the origin part of a URL.
- */
-predicate isConstantInvalidInsideOrigin(RegExpConstant term) {
-  // Look for any of these cases:
-  // - A character that can't occur in the origin
-  // - Two dashes in a row
-  // - A colon that is not part of port or scheme separator
-  // - A slash that is not part of scheme separator
-  term.getValue().regexpMatch(".*(?:[^a-zA-Z0-9.:/-]|--|:[^0-9/]|(?<![/:]|^)/).*")
-}
-
-/** Holds if `term` is a dot constant of form `\.` or `[.]`. */
-predicate isDotConstant(RegExpTerm term) {
-  term.(RegExpCharEscape).getValue() = "."
-  or
-  exists(RegExpCharacterClass cls |
-    term = cls and
-    not cls.isInverted() and
-    cls.getNumChild() = 1 and
-    cls.getAChild().(RegExpConstant).getValue() = "."
-  )
-}
-
-/** Holds if `term` is a wildcard `.` or an actual `.` character. */
-predicate isDotLike(RegExpTerm term) {
-  term instanceof RegExpDot
-  or
-  isDotConstant(term)
-}
-
-/** Holds if `term` will only ever be matched against the beginning of the input. */
-predicate matchesBeginningOfString(RegExpTerm term) {
-  term.isRootTerm()
-  or
-  exists(RegExpTerm parent | matchesBeginningOfString(parent) |
-    term = parent.(RegExpSequence).getChild(0)
-    or
-    parent.(RegExpSequence).getChild(0) instanceof RegExpCaret and
-    term = parent.(RegExpSequence).getChild(1)
-    or
-    term = parent.(RegExpAlt).getAChild()
-    or
-    term = parent.(RegExpGroup).getAChild()
-  )
-}
-
-/**
- * Holds if the given sequence `seq` contains top-level domain preceded by a dot, such as `.com`,
- * excluding cases where this is at the very beginning of the regexp.
- *
- * `i` is bound to the index of the last child in the top-level domain part.
- */
-predicate hasTopLevelDomainEnding(RegExpSequence seq, int i) {
-  seq.getChild(i)
-      .(RegExpConstant)
-      .getValue()
-      .regexpMatch("(?i)" + RegExpPatterns::getACommonTld() + "(:\\d+)?([/?#].*)?") and
-  isDotLike(seq.getChild(i - 1)) and
-  not (i = 1 and matchesBeginningOfString(seq))
-}
-
-/**
- * Holds if the given regular expression term contains top-level domain preceded by a dot,
- * such as `.com`.
- */
-predicate hasTopLevelDomainEnding(RegExpSequence seq) { hasTopLevelDomainEnding(seq, _) }
-
-/**
- * Holds if `term` will always match a hostname, that is, all disjunctions contain
- * a hostname pattern that isn't inside a quantifier.
- */
-predicate alwaysMatchesHostname(RegExpTerm term) {
-  hasTopLevelDomainEnding(term, _)
-  or
-  // `localhost` is considered a hostname pattern, but has no TLD
-  term.(RegExpConstant).getValue().regexpMatch("\\blocalhost\\b")
-  or
-  not term instanceof RegExpAlt and
-  not term instanceof RegExpQuantifier and
-  alwaysMatchesHostname(term.getAChild())
-  or
-  alwaysMatchesHostnameAlt(term)
-}
-
-/** Holds if every child of `alt` contains a hostname pattern. */
-predicate alwaysMatchesHostnameAlt(RegExpAlt alt) {
-  alwaysMatchesHostnameAlt(alt, alt.getNumChild() - 1)
-}
-
-/**
- * Holds if the first `i` children of `alt` contains a hostname pattern.
- *
- * This is used instead of `forall` to avoid materializing the set of alternatives
- * that don't contains hostnames, which is much larger.
- */
-predicate alwaysMatchesHostnameAlt(RegExpAlt alt, int i) {
-  alwaysMatchesHostname(alt.getChild(0)) and i = 0
-  or
-  alwaysMatchesHostnameAlt(alt, i - 1) and
-  alwaysMatchesHostname(alt.getChild(i))
-}
-
-/**
- * Holds if `term` occurs inside a quantifier or alternative (and thus
- * can not be expected to correspond to a unique match), or as part of
- * a lookaround assertion (which are rarely used for capture groups).
- */
-predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
-  exists(RegExpParent parent | parent = term.getParent() |
-    parent instanceof RegExpAlt
-    or
-    parent instanceof RegExpQuantifier
-    or
-    parent instanceof RegExpSubPattern
-    or
-    isInsideChoiceOrSubPattern(parent)
-  )
-}
-
-/**
- * Holds if `group` is likely to be used as a capture group.
- */
-predicate isLikelyCaptureGroup(RegExpGroup group) {
-  group.isCapture() and
-  not isInsideChoiceOrSubPattern(group)
-}
-
-/**
- * Holds if `seq` contains two consecutive dots `..` or escaped dots.
- *
- * At least one of these dots is not intended to be a subdomain separator,
- * so we avoid flagging the pattern in this case.
- */
-predicate hasConsecutiveDots(RegExpSequence seq) {
-  exists(int i |
-    isDotLike(seq.getChild(i)) and
-    isDotLike(seq.getChild(i + 1))
-  )
-}
-
-predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) {
-  seq = regexp.getAChild*() and
-  exists(RegExpDot unescapedDot, int i, string hostname |
-    hasTopLevelDomainEnding(seq, i) and
-    not isConstantInvalidInsideOrigin(seq.getChild([0 .. i - 1]).getAChild*()) and
-    not isLikelyCaptureGroup(seq.getChild([i .. seq.getNumChild() - 1]).getAChild*()) and
-    unescapedDot = seq.getChild([0 .. i - 1]).getAChild*() and
-    unescapedDot != seq.getChild(i - 1) and // Should not be the '.' immediately before the TLD
-    not hasConsecutiveDots(unescapedDot.getParent()) and
-    hostname =
-      seq.getChild(i - 2).getRawValue() + seq.getChild(i - 1).getRawValue() +
-        seq.getChild(i).getRawValue()
-  |
-    if unescapedDot.getParent() instanceof RegExpQuantifier
-    then
-      // `.*\.example.com` can match `evil.com/?x=.example.com`
-      //
-      // This problem only occurs when the pattern is applied against a full URL, not just a hostname/origin.
-      // We therefore check if the pattern includes a suffix after the TLD, such as `.*\.example.com/`.
-      // Note that a post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL,
-      // and patterns with neither a suffix nor an anchor fall under the purview of MissingRegExpAnchor.
-      seq.getChild(0) instanceof RegExpCaret and
-      not seq.getAChild() instanceof RegExpDollar and
-      seq.getChild([i .. i + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and
-      msg =
-        "has an unrestricted wildcard '" + unescapedDot.getParent().(RegExpQuantifier).getRawValue()
-          + "' which may cause '" + hostname +
-          "' to be matched anywhere in the URL, outside the hostname."
-    else
-      msg =
-        "has an unescaped '.' before '" + hostname +
-          "', so it might match more hosts than expected."
-  )
-}
-
-predicate incompleteHostnameRegExp(
-  RegExpSequence hostSequence, string message, DataFlow::Node aux, string label
-) {
-  exists(RegExpPatternSource re, RegExpTerm regexp, string msg, string kind |
-    regexp = re.getRegExpTerm() and
-    isIncompleteHostNameRegExpPattern(regexp, hostSequence, msg) and
-    (
-      if re.getAParse() != re
-      then (
-        kind = "string, which is used as a regular expression $@," and
-        aux = re.getAParse()
-      ) else (
-        kind = "regular expression" and aux = re
-      )
-    )
-  |
-    message = "This " + kind + " " + msg and label = "here"
-  )
-}
+// HostnameRegexp should be used directly from the shared regex pack, and not from this file.
+deprecated import codeql.ruby.security.regexp.HostnameRegexp as Dep
+import Dep
--- a/ruby/ql/src/queries/security/cwe-020/HostnameRegexpSpecific.qll
+++ b/ruby/ql/src/queries/security/cwe-020/HostnameRegexpSpecific.qll
@@ -1,2 +0,0 @@
-import codeql.ruby.Regexp
-import codeql.ruby.DataFlow
--- a/ruby/ql/src/queries/security/cwe-020/IncompleteHostnameRegExp.ql
+++ b/ruby/ql/src/queries/security/cwe-020/IncompleteHostnameRegExp.ql
@@ -11,6 +11,6 @@
 *       external/cwe/cwe-020
 */

-import HostnameRegexpShared
+private import codeql.ruby.security.regexp.HostnameRegexp as HostnameRegexp

-query predicate problems = incompleteHostnameRegExp/4;
+query predicate problems = HostnameRegexp::incompleteHostnameRegExp/4;
--- a/ruby/ql/src/queries/security/cwe-020/IncompleteUrlSubstringSanitization.qll
+++ b/ruby/ql/src/queries/security/cwe-020/IncompleteUrlSubstringSanitization.qll
@@ -3,6 +3,7 @@
 */

 private import IncompleteUrlSubstringSanitizationSpecific
+private import codeql.regex.HostnameRegexp::Utils

 /**
 * A check on a string for whether it contains a given substring, possibly with restrictions on the location of the substring.
@@ -30,9 +31,7 @@ query predicate problems(
  mayHaveStringValue(substring, target) and
  (
    // target contains a domain on a common TLD, and perhaps some other URL components
-    target
-        .regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+" + RegExpPatterns::getACommonTld() +
-            "(:[0-9]+)?/?")
+    target.regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+" + getACommonTld() + "(:[0-9]+)?/?")
    or
    // target is a HTTP URL to a domain on any TLD
    target.regexpMatch("(?i)https?://([a-z0-9-]+\\.)+([a-z]+)(:[0-9]+)?/?")
--- a/ruby/ql/src/queries/security/cwe-020/IncompleteUrlSubstringSanitizationSpecific.qll
+++ b/ruby/ql/src/queries/security/cwe-020/IncompleteUrlSubstringSanitizationSpecific.qll
@@ -1,6 +1,5 @@
 import codeql.ruby.DataFlow
 import codeql.ruby.StringOps
-import codeql.ruby.Regexp::RegExpPatterns as RegExpPatterns

 /** Holds if `node` may evaluate to `value` */
 predicate mayHaveStringValue(DataFlow::Node node, string value) {
--- a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql
+++ b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql
@@ -11,238 +11,32 @@
 *       external/cwe/cwe-020
 */

-import HostnameRegexpShared
 import codeql.ruby.DataFlow
 import codeql.ruby.regexp.RegExpTreeView
 import codeql.ruby.Regexp
+private import codeql.ruby.security.regexp.HostnameRegexp as HostnameRegexp
+private import codeql.regex.MissingRegExpAnchor as MissingRegExpAnchor
+private import codeql.ruby.regexp.RegExpTreeView::RegexTreeView as TreeImpl

-/**
- * Holds if `term` is a final term, that is, no term will match anything after this one.
- */
-predicate isFinalRegExpTerm(RegExpTerm term) {
-  term.isRootTerm()
-  or
-  exists(RegExpSequence seq |
-    isFinalRegExpTerm(seq) and
-    term = seq.getLastChild()
-  )
-  or
-  exists(RegExpTerm parent |
-    isFinalRegExpTerm(parent) and
-    term = parent.getAChild() and
-    not parent instanceof RegExpSequence and
-    not parent instanceof RegExpQuantifier
-  )
-}
-
-/** Holds if `term` is one of the transitive left children of a regexp. */
-predicate isLeftArmTerm(RegExpTerm term) {
-  term.isRootTerm()
-  or
-  exists(RegExpTerm parent |
-    term = parent.getChild(0) and
-    isLeftArmTerm(parent)
-  )
-}
-
-/** Holds if `term` is one of the transitive right children of a regexp. */
-predicate isRightArmTerm(RegExpTerm term) {
-  term.isRootTerm()
-  or
-  exists(RegExpTerm parent |
-    term = parent.getLastChild() and
-    isRightArmTerm(parent)
-  )
-}
-
-/**
- * Holds if `term` is an anchor that is not the first or last node
- * in its tree.
- */
-predicate isInteriorAnchor(RegExpAnchor term) {
-  not isLeftArmTerm(term) and
-  not isRightArmTerm(term)
-}
-
-/**
- * Holds if `term` contains an anchor that is not the first or last node
- * in its tree, such as `(foo|bar$|baz)`.
- */
-predicate containsInteriorAnchor(RegExpTerm term) { isInteriorAnchor(term.getAChild*()) }
-
-/**
- * Holds if `term` starts with a word boundary or lookbehind assertion,
- * indicating that it's not intended to be anchored on that side.
- */
-predicate containsLeadingPseudoAnchor(RegExpSequence term) {
-  exists(RegExpTerm child | child = term.getChild(0) |
-    child instanceof RegExpWordBoundary or
-    child instanceof RegExpNonWordBoundary or
-    child instanceof RegExpLookbehind
-  )
-}
-
-/**
- * Holds if `term` ends with a word boundary or lookahead assertion,
- * indicating that it's not intended to be anchored on that side.
- */
-predicate containsTrailingPseudoAnchor(RegExpSequence term) {
-  exists(RegExpTerm child | child = term.getLastChild() |
-    child instanceof RegExpWordBoundary or
-    child instanceof RegExpNonWordBoundary or
-    child instanceof RegExpLookahead
-  )
-}
-
-/**
- * Holds if `term` is an empty sequence, usually arising from
- * literals with a trailing alternative such as `foo|`.
- */
-predicate isEmpty(RegExpSequence term) { term.getNumChild() = 0 }
-
-/**
- * Holds if `term` contains a letter constant.
- *
- * We use this as a heuristic to filter out uninteresting results.
- */
-predicate containsLetters(RegExpTerm term) {
-  term.getAChild*().(RegExpConstant).getValue().regexpMatch(".*[a-zA-Z].*")
-}
-
-/**
- * Holds if `alt` has an explicitly anchored group, such as `^(foo|bar)|baz`
- * and doesn't have any unnecessary groups, such as in `^(foo)|(bar)`.
- */
-predicate hasExplicitAnchorPrecedence(RegExpAlt alt) {
-  isAnchoredGroup(alt.getAChild()) and
-  not alt.getAChild() instanceof RegExpGroup
-}
-
-/**
- * Holds if `term` consists only of an anchor and a parenthesized term,
- * such as the left side of `^(foo|bar)|baz`.
- *
- * The precedence of the anchor is likely to be intentional in this case,
- * as the group wouldn't be needed otherwise.
- */
-predicate isAnchoredGroup(RegExpSequence term) {
-  term.getNumChild() = 2 and
-  term.getAChild() instanceof RegExpAnchor and
-  term.getAChild() instanceof RegExpGroup
-}
-
-/**
- * Holds if `src` is a pattern for a collection of alternatives where
- * only the first or last alternative is anchored, indicating a
- * precedence mistake explained by `msg`.
- *
- * The canonical example of such a mistake is: `^a|b|c`, which is
- * parsed as `(^a)|(b)|(c)`.
- */
-predicate hasMisleadingAnchorPrecedence(RegExpPatternSource src, string msg) {
-  exists(RegExpAlt root, RegExpSequence anchoredTerm, string direction |
-    root = src.getRegExpTerm() and
-    not containsInteriorAnchor(root) and
-    not isEmpty(root.getAChild()) and
-    not hasExplicitAnchorPrecedence(root) and
-    containsLetters(anchoredTerm) and
-    (
-      anchoredTerm = root.getChild(0) and
-      anchoredTerm.getChild(0) instanceof RegExpCaret and
-      not containsLeadingPseudoAnchor(root.getChild([1 .. root.getNumChild() - 1])) and
-      containsLetters(root.getChild([1 .. root.getNumChild() - 1])) and
-      direction = "beginning"
-      or
-      anchoredTerm = root.getLastChild() and
-      anchoredTerm.getLastChild() instanceof RegExpDollar and
-      not containsTrailingPseudoAnchor(root.getChild([0 .. root.getNumChild() - 2])) and
-      containsLetters(root.getChild([0 .. root.getNumChild() - 2])) and
-      direction = "end"
-    ) and
-    // that is not used for string replacement
-    not exists(DataFlow::CallNode mcn, DataFlow::Node arg, string name |
+private module Impl implements
+MissingRegExpAnchor::MissingRegExpAnchorSig<TreeImpl, HostnameRegexp::Impl> {
+  predicate isUsedAsReplace(RegExpPatternSource pattern) {
+    exists(DataFlow::CallNode mcn, DataFlow::Node arg, string name |
      name = mcn.getMethodName() and
      arg = mcn.getArgument(0)
    |
      (
-        src.getAParse().(DataFlow::LocalSourceNode).flowsTo(arg) or
-        src.getAParse() = arg
+        pattern.getAParse().(DataFlow::LocalSourceNode).flowsTo(arg) or
+        pattern.getAParse() = arg
      ) and
      name = ["sub", "sub!", "gsub", "gsub!"]
-    ) and
-    msg =
-      "Misleading operator precedence. The subexpression '" + anchoredTerm.getRawValue() +
-        "' is anchored at the " + direction +
-        ", but the other parts of this regular expression are not"
-  )
+    )
+  }
+
+  string getEndAnchorText() { result = "\\z" }
 }

-/**
- * Holds if `src` contains a hostname pattern that uses the `^/$` line anchors
- * rather than `\A/\z` which match the start/end of the whole string.
- */
-predicate isLineAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
-  // avoid double reporting
-  not (
-    isSemiAnchoredHostnameRegExp(src, _) or
-    hasMisleadingAnchorPrecedence(src, _)
-  ) and
-  exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() |
-    not isConstantInvalidInsideOrigin(term.getAChild*()) and
-    tld = term.getAChild*() and
-    hasTopLevelDomainEnding(tld, i) and
-    isFinalRegExpTerm(tld.getChild(i)) and // nothing is matched after the TLD
-    (
-      tld.getChild(0).(RegExpCaret).getChar() = "^" or
-      tld.getLastChild().(RegExpDollar).getChar() = "$"
-    ) and
-    msg =
-      "This hostname pattern uses anchors such as '^' and '$', which match the start and end of a line, not the whole string. Use '\\A' and '\\z' instead."
-  )
-}
-
-/**
- * Holds if `src` contains a hostname pattern that is missing a `$` anchor.
- */
-predicate isSemiAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
-  not hasMisleadingAnchorPrecedence(src, _) and // avoid double reporting
-  exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() |
-    not isConstantInvalidInsideOrigin(term.getAChild*()) and
-    tld = term.getAChild*() and
-    hasTopLevelDomainEnding(tld, i) and
-    isFinalRegExpTerm(tld.getChild(i)) and // nothing is matched after the TLD
-    tld.getChild(0) instanceof RegExpCaret and
-    msg =
-      "This hostname pattern may match any domain name, as it is missing a '\\z' or '/' at the end."
-  )
-}
-
-/**
- * Holds if `src` is an unanchored pattern for a URL, indicating a
- * mistake explained by `msg`.
- */
-predicate isUnanchoredHostnameRegExp(RegExpPatternSource src, string msg) {
-  exists(RegExpTerm term, RegExpSequence tld | term = src.getRegExpTerm() |
-    alwaysMatchesHostname(term) and
-    tld = term.getAChild*() and
-    hasTopLevelDomainEnding(tld) and
-    not isConstantInvalidInsideOrigin(term.getAChild*()) and
-    not term.getAChild*() instanceof RegExpAnchor and
-    // that is not used for string replacement
-    not exists(DataFlow::CallNode mcn, DataFlow::Node arg, string name |
-      name = mcn.getMethodName() and
-      arg = mcn.getArgument(0)
-    |
-      (
-        src.getAParse().(DataFlow::LocalSourceNode).flowsTo(arg) or
-        src.getAParse() = arg
-      ) and
-      name = ["sub", "sub!", "gsub", "gsub!"]
-    ) and
-    msg =
-      "When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it."
-  )
-}
+import MissingRegExpAnchor::Make<TreeImpl, HostnameRegexp::Impl, Impl>

 from DataFlow::Node nd, string msg
 where
--- a/shared/regex/codeql/regex/HostnameRegexp.qll
+++ b/shared/regex/codeql/regex/HostnameRegexp.qll
@@ -0,0 +1,276 @@
+/**
+ * Provides predicates for reasoning about regular expressions
+ * that match URLs and hostname patterns.
+ */
+
+private import RegexTreeView
+
+/**
+ * A signature specifying the required parts to perform an
+ * analysis on regular expressions matching hostnames.
+ */
+signature module HostnameRegexpSig<RegexTreeViewSig TreeImpl> {
+  /** A node in the data-flow graph. */
+  class DataFlowNode {
+    /** Gets a string representation of this node. */
+    string toString();
+  }
+
+  /** A node in the data-flow graph that represents a regular expression pattern. */
+  class RegExpPatternSource extends DataFlowNode {
+    /**
+     * Gets the root term of the regular expression parsed from this pattern.
+     */
+    TreeImpl::RegExpTerm getRegExpTerm();
+
+    /**
+     * Gets a node where the pattern of this node is parsed as a part of
+     * a regular expression.
+     */
+    DataFlowNode getAParse();
+  }
+}
+
+/**
+ * Utility predicates and classes that don't depend on any signature.
+ */
+module Utils {
+  /**
+   * Gets a pattern that matches common top-level domain names in lower case.
+   */
+  string getACommonTld() {
+    // according to ranking by http://google.com/search?q=site:.<<TLD>>
+    result = "(?:com|org|edu|gov|uk|net|io)(?![a-z0-9])"
+  }
+}
+
+/**
+ * Classes and predicates implementing an analysis on regular expressions
+ * that match URLs and hostname patterns.
+ */
+module Make<RegexTreeViewSig TreeImpl, HostnameRegexpSig<TreeImpl> Specific> {
+  private import TreeImpl
+  import Utils
+
+  /**
+   * Holds if the given constant is unlikely to occur in the origin part of a URL.
+   */
+  predicate isConstantInvalidInsideOrigin(RegExpConstant term) {
+    // Look for any of these cases:
+    // - A character that can't occur in the origin
+    // - Two dashes in a row
+    // - A colon that is not part of port or scheme separator
+    // - A slash that is not part of scheme separator
+    term.getValue().regexpMatch(".*(?:[^a-zA-Z0-9.:/-]|--|:[^0-9/]|(?<![/:]|^)/).*")
+  }
+
+  /** Holds if `term` is a dot constant of form `\.` or `[.]`. */
+  predicate isDotConstant(RegExpTerm term) {
+    term.(RegExpCharEscape).getValue() = "."
+    or
+    exists(RegExpCharacterClass cls |
+      term = cls and
+      not cls.isInverted() and
+      cls.getNumChild() = 1 and
+      cls.getAChild().(RegExpConstant).getValue() = "."
+    )
+  }
+
+  /** Holds if `term` is a wildcard `.` or an actual `.` character. */
+  predicate isDotLike(RegExpTerm term) {
+    term instanceof RegExpDot
+    or
+    isDotConstant(term)
+  }
+
+  /** Holds if `term` will only ever be matched against the beginning of the input. */
+  predicate matchesBeginningOfString(RegExpTerm term) {
+    term.isRootTerm()
+    or
+    exists(RegExpTerm parent | matchesBeginningOfString(parent) |
+      term = parent.(RegExpSequence).getChild(0)
+      or
+      parent.(RegExpSequence).getChild(0) instanceof RegExpCaret and
+      term = parent.(RegExpSequence).getChild(1)
+      or
+      term = parent.(RegExpAlt).getAChild()
+      or
+      term = parent.(RegExpGroup).getAChild()
+    )
+  }
+
+  /**
+   * Holds if the given sequence `seq` contains a top-level domain preceded by a dot, such as `.com`,
+   * excluding cases where this is at the very beginning of the regexp.
+   *
+   * `i` is bound to the index of the last child in the top-level domain part.
+   */
+  predicate hasTopLevelDomainEnding(RegExpSequence seq, int i) {
+    seq.getChild(i)
+        .(RegExpConstant)
+        .getValue()
+        .regexpMatch("(?i)" + getACommonTld() + "(:\\d+)?([/?#].*)?") and
+    isDotLike(seq.getChild(i - 1)) and
+    not (i = 1 and matchesBeginningOfString(seq))
+  }
+
+  /**
+   * Holds if the given regular expression term contains a top-level domain preceded by a dot,
+   * such as `.com`.
+   */
+  predicate hasTopLevelDomainEnding(RegExpSequence seq) { hasTopLevelDomainEnding(seq, _) }
+
+  /**
+   * Holds if `term` will always match a hostname, that is, all disjunctions contain
+   * a hostname pattern that isn't inside a quantifier.
+   */
+  predicate alwaysMatchesHostname(RegExpTerm term) {
+    hasTopLevelDomainEnding(term, _)
+    or
+    // `localhost` is considered a hostname pattern, but has no TLD
+    term.(RegExpConstant).getValue().regexpMatch("\\blocalhost\\b")
+    or
+    not term instanceof RegExpAlt and
+    not term instanceof RegExpQuantifier and
+    alwaysMatchesHostname(term.getAChild())
+    or
+    alwaysMatchesHostnameAlt(term)
+  }
+
+  /** Holds if every child of `alt` contains a hostname pattern. */
+  predicate alwaysMatchesHostnameAlt(RegExpAlt alt) {
+    alwaysMatchesHostnameAlt(alt, alt.getNumChild() - 1)
+  }
+
+  /**
+   * Holds if each of the children of `alt` at indices `0` through `i` contains a hostname pattern.
+   *
+   * This is used instead of `forall` to avoid materializing the set of alternatives
+   * that don't contain hostnames, which is much larger.
+   */
+  predicate alwaysMatchesHostnameAlt(RegExpAlt alt, int i) {
+    alwaysMatchesHostname(alt.getChild(0)) and i = 0
+    or
+    alwaysMatchesHostnameAlt(alt, i - 1) and
+    alwaysMatchesHostname(alt.getChild(i))
+  }
+
+  /**
+   * Holds if `term` occurs inside a quantifier or alternative (and thus
+   * can not be expected to correspond to a unique match), or as part of
+   * a lookaround assertion (which are rarely used for capture groups).
+   */
+  predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
+    exists(RegExpParent parent | parent = term.getParent() |
+      parent instanceof RegExpAlt
+      or
+      parent instanceof RegExpQuantifier
+      or
+      parent instanceof RegExpSubPattern
+      or
+      isInsideChoiceOrSubPattern(parent)
+    )
+  }
+
+  /**
+   * Holds if `group` is likely to be used as a capture group.
+   */
+  predicate isLikelyCaptureGroup(RegExpGroup group) {
+    group.isCapture() and
+    not isInsideChoiceOrSubPattern(group)
+  }
+
+  /**
+   * Holds if `seq` contains two consecutive dots `..` or escaped dots.
+   *
+   * At least one of these dots is not intended to be a subdomain separator,
+   * so we avoid flagging the pattern in this case.
+   */
+  predicate hasConsecutiveDots(RegExpSequence seq) {
+    exists(int i |
+      isDotLike(seq.getChild(i)) and
+      isDotLike(seq.getChild(i + 1))
+    )
+  }
+
+  private predicate isIncompleteHostNameRegExpPattern(
+    RegExpTerm regexp, RegExpSequence seq, string msg
+  ) {
+    seq = regexp.getAChild*() and
+    exists(RegExpDot unescapedDot, int i, string hostname |
+      hasTopLevelDomainEnding(seq, i) and
+      not isConstantInvalidInsideOrigin(seq.getChild([0 .. i - 1]).getAChild*()) and
+      not isLikelyCaptureGroup(seq.getChild([i .. seq.getNumChild() - 1]).getAChild*()) and
+      unescapedDot = seq.getChild([0 .. i - 1]).getAChild*() and
+      unescapedDot != seq.getChild(i - 1) and // Should not be the '.' immediately before the TLD
+      not hasConsecutiveDots(unescapedDot.getParent()) and
+      hostname =
+        seq.getChild(i - 2).getRawValue() + seq.getChild(i - 1).getRawValue() +
+          seq.getChild(i).getRawValue()
+    |
+      if unescapedDot.getParent() instanceof RegExpQuantifier
+      then
+        // `.*\.example.com` can match `evil.com/?x=.example.com`
+        //
+        // This problem only occurs when the pattern is applied against a full URL, not just a hostname/origin.
+        // We therefore check if the pattern includes a suffix after the TLD, such as `.*\.example.com/`.
+        // Note that a post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL,
+        // and patterns with neither a suffix nor an anchor fall under the purview of MissingRegExpAnchor.
+        seq.getChild(0) instanceof RegExpCaret and
+        not seq.getAChild() instanceof RegExpDollar and
+        seq.getChild([i .. i + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and
+        msg =
+          "has an unrestricted wildcard '" +
+            unescapedDot.getParent().(RegExpQuantifier).getRawValue() + "' which may cause '" +
+            hostname + "' to be matched anywhere in the URL, outside the hostname."
+      else
+        msg =
+          "has an unescaped '.' before '" + hostname +
+            "', so it might match more hosts than expected."
+    )
+  }
+
+  /** Holds if `term` is one of the transitive left children of a regexp. */
+  predicate isLeftArmTerm(RegExpTerm term) {
+    term.isRootTerm()
+    or
+    exists(RegExpTerm parent |
+      term = parent.getChild(0) and
+      isLeftArmTerm(parent)
+    )
+  }
+
+  /** Holds if `term` is one of the transitive right children of a regexp. */
+  predicate isRightArmTerm(RegExpTerm term) {
+    term.isRootTerm()
+    or
+    exists(RegExpTerm parent |
+      term = parent.getLastChild() and
+      isRightArmTerm(parent)
+    )
+  }
+
+  /**
+   * Holds if `hostSequence` is part of a regular expression that is likely to match a hostname,
+   * but the pattern is incomplete and may match more hosts than intended, as explained by `message`.
+   */
+  predicate incompleteHostnameRegExp(
+    RegExpSequence hostSequence, string message, Specific::DataFlowNode aux, string label
+  ) {
+    exists(Specific::RegExpPatternSource re, RegExpTerm regexp, string msg, string kind |
+      regexp = re.getRegExpTerm() and
+      isIncompleteHostNameRegExpPattern(regexp, hostSequence, msg) and
+      (
+        if re.getAParse() != re
+        then (
+          kind = "string, which is used as a regular expression $@," and
+          aux = re.getAParse()
+        ) else (
+          kind = "regular expression" and aux = re
+        )
+      )
+    |
+      message = "This " + kind + " " + msg and label = "here"
+    )
+  }
+}
--- a/shared/regex/codeql/regex/MissingRegExpAnchor.qll
+++ b/shared/regex/codeql/regex/MissingRegExpAnchor.qll
@@ -0,0 +1,225 @@
+/**
+ * Provides predicates for reasoning about regular expressions
+ * without anchors.
+ */
+
+private import RegexTreeView
+import HostnameRegexp as HostnameShared
+
+/**
+ * A signature specifying the required parts to perform an
+ * analysis on regular expressions without anchors.
+ *
+ * This analysis requires the hostname analysis to be available.
+ */
+signature module MissingRegExpAnchorSig<
+RegexTreeViewSig TreeImpl, HostnameShared::HostnameRegexpSig<TreeImpl> Specific> {
+  /**
+   * Holds if `pattern` is used for string replacement.
+   *
+   * The misleading-anchor-precedence and unanchored-hostname checks do not
+   * report patterns for which this predicate holds.
+   */
+  predicate isUsedAsReplace(Specific::RegExpPatternSource pattern);
+
+  /**
+   * Gets a string representation of an end anchor from a regular expression.
+   *
+   * The text is inserted into the alert message for hostname patterns that
+   * are missing an end anchor.
+   */
+  string getEndAnchorText();
+}
+
+/**
+ * Classes and predicates implementing an analysis on regular expressions
+ * without anchors.
+ */
+module Make<
+RegexTreeViewSig TreeImpl, HostnameShared::HostnameRegexpSig<TreeImpl> HostnameImpl,
+MissingRegExpAnchorSig<TreeImpl, HostnameImpl> Impl> {
+  private import TreeImpl
+  private import HostnameShared::Make<TreeImpl, HostnameImpl> as HostnameRegexp
+  private import HostnameImpl
+  private import Impl
+
+  /**
+   * Holds if `term` is in final position: once `term` has matched, no other
+   * term of the regular expression matches anything after it.
+   */
+  predicate isFinalRegExpTerm(RegExpTerm term) {
+    term.isRootTerm()
+    or
+    exists(RegExpTerm parent | isFinalRegExpTerm(parent) |
+      // inside a sequence, only the last element is final
+      term = parent.(RegExpSequence).getLastChild()
+      or
+      // any child of a final term that is neither a sequence nor a quantifier
+      not parent instanceof RegExpSequence and
+      not parent instanceof RegExpQuantifier and
+      term = parent.getAChild()
+    )
+  }
+
+  /**
+   * Holds if the anchor `term` is neither on the left arm nor on the
+   * right arm of its regexp tree.
+   */
+  predicate isInteriorAnchor(RegExpAnchor term) {
+    not (
+      HostnameRegexp::isLeftArmTerm(term)
+      or
+      HostnameRegexp::isRightArmTerm(term)
+    )
+  }
+
+  /**
+   * Holds if `term`, or one of its transitive children, is an interior anchor,
+   * as is the case for `(foo|bar$|baz)`.
+   */
+  predicate containsInteriorAnchor(RegExpTerm term) {
+    exists(RegExpAnchor anchor | anchor = term.getAChild*() | isInteriorAnchor(anchor))
+  }
+
+  /**
+   * Holds if the first child of `term` is a word boundary, a non-word boundary,
+   * or a lookbehind assertion, which suggests that `term` is deliberately left
+   * unanchored on that side.
+   */
+  predicate containsLeadingPseudoAnchor(RegExpSequence term) {
+    term.getChild(0) instanceof RegExpWordBoundary
+    or
+    term.getChild(0) instanceof RegExpNonWordBoundary
+    or
+    term.getChild(0) instanceof RegExpLookbehind
+  }
+
+  /**
+   * Holds if the last child of `term` is a word boundary, a non-word boundary,
+   * or a lookahead assertion, which suggests that `term` is deliberately left
+   * unanchored on that side.
+   */
+  predicate containsTrailingPseudoAnchor(RegExpSequence term) {
+    term.getLastChild() instanceof RegExpWordBoundary
+    or
+    term.getLastChild() instanceof RegExpNonWordBoundary
+    or
+    term.getLastChild() instanceof RegExpLookahead
+  }
+
+  /**
+   * Holds if `term` is a sequence without any children. Such sequences
+   * usually come from a trailing alternative, as in `foo|`.
+   */
+  predicate isEmpty(RegExpSequence term) {
+    0 = term.getNumChild()
+  }
+
+  /**
+   * Holds if `term`, or one of its transitive children, is a constant whose
+   * value contains an ASCII letter.
+   *
+   * This is a heuristic for filtering out uninteresting results.
+   */
+  predicate containsLetters(RegExpTerm term) {
+    exists(RegExpConstant constant | constant = term.getAChild*() |
+      constant.getValue().regexpMatch(".*[a-zA-Z].*")
+    )
+  }
+
+  /**
+   * Holds if `term` has exactly two children, with an anchor and a group
+   * among them, like the left side of `^(foo|bar)|baz`.
+   *
+   * In that shape the anchor's precedence is most likely deliberate,
+   * because the group would otherwise serve no purpose.
+   */
+  predicate isAnchoredGroup(RegExpSequence term) {
+    exists(RegExpAnchor anchor, RegExpGroup grp |
+      anchor = term.getAChild() and
+      grp = term.getAChild()
+    ) and
+    term.getNumChild() = 2
+  }
+
+  /**
+   * Holds if one alternative of `alt` is an anchored group, as in `^(foo|bar)|baz`,
+   * while no alternative of `alt` is itself a group, which rules out patterns
+   * with unnecessary groups such as `^(foo)|(bar)`.
+   */
+  predicate hasExplicitAnchorPrecedence(RegExpAlt alt) {
+    exists(RegExpSequence branch | branch = alt.getAChild() | isAnchoredGroup(branch)) and
+    not exists(RegExpGroup grp | grp = alt.getAChild())
+  }
+
+  /**
+   * Holds if `src` is a pattern for a collection of alternatives where
+   * only the first or last alternative is anchored, indicating a
+   * precedence mistake explained by `msg`.
+   *
+   * The canonical example of such a mistake is: `^a|b|c`, which is
+   * parsed as `(^a)|(b)|(c)`.
+   *
+   * Patterns for which `isUsedAsReplace` holds are not reported.
+   */
+  predicate hasMisleadingAnchorPrecedence(RegExpPatternSource src, string msg) {
+    exists(RegExpAlt root, RegExpSequence anchoredTerm, string direction |
+      // the regexp term of `src` must itself be an alternation
+      root = src.getRegExpTerm() and
+      not containsInteriorAnchor(root) and
+      // no alternative may be an empty sequence
+      not isEmpty(root.getAChild()) and
+      not hasExplicitAnchorPrecedence(root) and
+      containsLetters(anchoredTerm) and
+      (
+        // the first alternative starts with a caret; none of the other alternatives
+        // starts with a pseudo-anchor, and at least one of them contains letters
+        anchoredTerm = root.getChild(0) and
+        anchoredTerm.getChild(0) instanceof RegExpCaret and
+        not containsLeadingPseudoAnchor(root.getChild([1 .. root.getNumChild() - 1])) and
+        containsLetters(root.getChild([1 .. root.getNumChild() - 1])) and
+        direction = "beginning"
+        or
+        // the last alternative ends with a dollar; none of the other alternatives
+        // ends with a pseudo-anchor, and at least one of them contains letters
+        anchoredTerm = root.getLastChild() and
+        anchoredTerm.getLastChild() instanceof RegExpDollar and
+        not containsTrailingPseudoAnchor(root.getChild([0 .. root.getNumChild() - 2])) and
+        containsLetters(root.getChild([0 .. root.getNumChild() - 2])) and
+        direction = "end"
+      ) and
+      // is not used for replace
+      not isUsedAsReplace(src) and
+      msg =
+        "Misleading operator precedence. The subexpression '" + anchoredTerm.getRawValue() +
+          "' is anchored at the " + direction +
+          ", but the other parts of this regular expression are not"
+    )
+  }
+
+  /**
+   * Holds if `src` contains a hostname sequence that starts with a caret and whose
+   * top-level-domain part is a final term, so that an end anchor is missing.
+   * The problem is explained by `msg`.
+   */
+  predicate isSemiAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
+    exists(RegExpTerm root, RegExpSequence hostSeq, int tldIndex |
+      root = src.getRegExpTerm() and
+      hostSeq = root.getAChild*() and
+      hostSeq.getChild(0) instanceof RegExpCaret and
+      HostnameRegexp::hasTopLevelDomainEnding(hostSeq, tldIndex) and
+      // nothing is matched after the TLD
+      isFinalRegExpTerm(hostSeq.getChild(tldIndex)) and
+      not HostnameRegexp::isConstantInvalidInsideOrigin(root.getAChild*())
+    ) and
+    // avoid double reporting
+    not hasMisleadingAnchorPrecedence(src, _) and
+    msg =
+      "This hostname pattern may match any domain name, as it is missing a '" + getEndAnchorText()
+        + "' or '/' at the end."
+  }
+
+  /**
+   * Holds if `src` is a URL pattern that always matches a hostname, contains a
+   * sequence with a top-level-domain ending, and has no anchor anywhere in it.
+   * The mistake is explained by `msg`.
+   */
+  predicate isUnanchoredHostnameRegExp(RegExpPatternSource src, string msg) {
+    exists(RegExpTerm root | root = src.getRegExpTerm() |
+      HostnameRegexp::alwaysMatchesHostname(root) and
+      exists(RegExpSequence hostSeq | hostSeq = root.getAChild*() |
+        HostnameRegexp::hasTopLevelDomainEnding(hostSeq)
+      ) and
+      not root.getAChild*() instanceof RegExpAnchor and
+      not HostnameRegexp::isConstantInvalidInsideOrigin(root.getAChild*())
+    ) and
+    // patterns used for string replacement are not reported
+    not isUsedAsReplace(src) and
+    msg =
+      "When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it."
+  }
+
+  /**
+   * Holds if `src` contains a hostname sequence that begins with the line anchor `^`
+   * or ends with the line anchor `$`, instead of `\A`/`\z`, which match the
+   * start/end of the whole string. The problem is explained by `msg`.
+   */
+  predicate isLineAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
+    // avoid double reporting
+    not isSemiAnchoredHostnameRegExp(src, _) and
+    not hasMisleadingAnchorPrecedence(src, _) and
+    exists(RegExpTerm root, RegExpSequence hostSeq, int tldIndex |
+      root = src.getRegExpTerm() and
+      hostSeq = root.getAChild*() and
+      HostnameRegexp::hasTopLevelDomainEnding(hostSeq, tldIndex) and
+      isFinalRegExpTerm(hostSeq.getChild(tldIndex)) and // nothing is matched after the TLD
+      not HostnameRegexp::isConstantInvalidInsideOrigin(root.getAChild*())
+    |
+      hostSeq.getChild(0).(RegExpCaret).getChar() = "^"
+      or
+      hostSeq.getLastChild().(RegExpDollar).getChar() = "$"
+    ) and
+    msg =
+      "This hostname pattern uses anchors such as '^' and '$', which match the start and end of a line, not the whole string. Use '\\A' and '\\z' instead."
+  }
+}
--- a/shared/regex/codeql/regex/RegexTreeView.qll
+++ b/shared/regex/codeql/regex/RegexTreeView.qll
@@ -73,6 +73,9 @@ signature module RegexTreeViewSig {
    /** Gets the regular expression term that is matched (textually) after this one, if any. */
    RegExpTerm getSuccessor();

+    /** Gets the last child term of this element. */
+    RegExpTerm getLastChild();
+
    string toString();

    predicate hasLocationInfo(
@@ -149,6 +152,11 @@ signature module RegexTreeViewSig {
    int getUpperBound();
  }

+  /**
+   * A non-word boundary, that is, a regular expression term of the form `\B`.
+   *
+   * Example:
+   *
+   * ```
+   * \B
+   * ```
+   */
+  class RegExpNonWordBoundary extends RegExpTerm;
+
  /**
   * An escaped regular expression term, that is, a regular expression
   * term starting with a backslash.
@@ -210,6 +218,9 @@ signature module RegexTreeViewSig {
     * not a capture group.
     */
    int getNumber();
+
+    /** Holds if this is a capture group. */
+    predicate isCapture();
  }

  /**
@@ -325,6 +336,20 @@ signature module RegexTreeViewSig {
    predicate isCharacter();
  }

+  /**
+   * A character escape in a regular expression.
+   *
+   * Example:
+   *
+   * ```
+   * \.
+   * ```
+   */
+  class RegExpCharEscape extends RegExpEscape {
+    /** Gets the string matched by this term, for example `.` for the escape `\.`. */
+    string getValue();
+  }
+
  /**
   * A character class in a regular expression.
   *
@@ -370,6 +395,20 @@ signature module RegexTreeViewSig {
   */
  class RegExpDot extends RegExpTerm;

+  /**
+   * A term that matches a specific position between characters in the string.
+   *
+   * Example:
+   *
+   * ```
+   * \A
+   * ```
+   */
+  class RegExpAnchor extends RegExpTerm {
+    /** Gets the char for this term, for example `^` for a caret or `$` for a dollar anchor. */
+    string getChar();
+  }
+
  /**
   * A dollar assertion `$` matching the end of a line.
   *
@@ -379,7 +418,7 @@ signature module RegexTreeViewSig {
   * $
   * ```
   */
-  class RegExpDollar extends RegExpTerm;
+  class RegExpDollar extends RegExpAnchor;

  /**
   * A caret assertion `^` matching the beginning of a line.
@@ -390,7 +429,7 @@ signature module RegexTreeViewSig {
   * ^
   * ```
   */
-  class RegExpCaret extends RegExpTerm;
+  class RegExpCaret extends RegExpAnchor;

  /**
   * A word boundary assertion.