Merge pull request #12552 from erik-krogh/py-type-trackers

Py: refactor regex tracking to type-trackers
2025-12-17 01:03:14 +01:00 · 2023-05-11 16:18:34 +02:00
parent 61b0514b53 18f8c69261
commit 62f0c64a03
34 changed files with 2335 additions and 2632 deletions
--- a/config/identical-files.json
+++ b/config/identical-files.json
@@ -47,7 +47,6 @@
    "python/ql/lib/semmle/python/dataflow/new/internal/DataFlowImpl2.qll",
    "python/ql/lib/semmle/python/dataflow/new/internal/DataFlowImpl3.qll",
    "python/ql/lib/semmle/python/dataflow/new/internal/DataFlowImpl4.qll",
-    "python/ql/lib/semmle/python/dataflow/new/internal/DataFlowImplForRegExp.qll",
    "ruby/ql/lib/codeql/ruby/dataflow/internal/DataFlowImpl1.qll",
    "ruby/ql/lib/codeql/ruby/dataflow/internal/DataFlowImpl2.qll",
    "ruby/ql/lib/codeql/ruby/dataflow/internal/DataFlowImplForHttpClientLibraries.qll",
--- a/python/ql/lib/semmle/python/Concepts.qll
+++ b/python/ql/lib/semmle/python/Concepts.qll
@@ -421,6 +421,24 @@ module RegexExecution {
  }
 }

+/**
+ * A node where a string is interpreted as a regular expression,
+ * for instance an argument to `re.compile`.
+ *
+ * Extend this class to refine existing API models. If you want to model new APIs,
+ * extend `RegExpInterpretation::Range` instead.
+ */
+class RegExpInterpretation extends DataFlow::Node instanceof RegExpInterpretation::Range { }
+
+/** Provides a class for modeling regular expression interpretations. */
+module RegExpInterpretation {
+  /**
+   * A node where a string is interpreted as a regular expression,
+   * for instance an argument to `re.compile`.
+   */
+  abstract class Range extends DataFlow::Node { }
+}
+
 /** Provides classes for modeling XML-related APIs. */
 module XML {
  /**
--- a/python/ql/lib/semmle/python/PrintAst.qll
+++ b/python/ql/lib/semmle/python/PrintAst.qll
@@ -7,7 +7,7 @@
 */

 import python
-import semmle.python.RegexTreeView
+import semmle.python.regexp.RegexTreeView
 import semmle.python.Yaml

 private newtype TPrintAstConfiguration = MkPrintAstConfiguration()
--- a/python/ql/lib/semmle/python/RegexTreeView.qll
+++ b/python/ql/lib/semmle/python/RegexTreeView.qll
--- a/python/ql/lib/semmle/python/dataflow/new/Regexp.qll
+++ b/python/ql/lib/semmle/python/dataflow/new/Regexp.qll
@@ -2,9 +2,10 @@
 * Provides classes for working with regular expressions.
 */

-private import semmle.python.RegexTreeView
+private import semmle.python.regexp.RegexTreeView
 private import semmle.python.regex
 private import semmle.python.dataflow.new.DataFlow
+private import semmle.python.regexp.internal.RegExpTracking

 /**
 * Provides utility predicates related to regular expressions.
@@ -25,18 +26,18 @@ deprecated module RegExpPatterns {
 * as a part of a regular expression.
 */
 class RegExpPatternSource extends DataFlow::CfgNode {
-  private Regex astNode;
+  private RegExpSink sink;

-  RegExpPatternSource() { astNode = this.asExpr() }
+  RegExpPatternSource() { this = regExpSource(sink) }

  /**
   * Gets a node where the pattern of this node is parsed as a part of
   * a regular expression.
   */
-  DataFlow::Node getAParse() { result = this }
+  RegExpSink getAParse() { result = sink }

  /**
   * Gets the root term of the regular expression parsed from this pattern.
   */
-  RegExpTerm getRegExpTerm() { result.getRegex() = astNode }
+  RegExpTerm getRegExpTerm() { result.getRegex() = this.asExpr() }
 }
--- a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowImplForRegExp.qll
+++ b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowImplForRegExp.qll
@@ -1,398 +0,0 @@
-/**
- * DEPRECATED: Use `Global` and `GlobalWithState` instead.
- *
- * Provides a `Configuration` class backwards-compatible interface to the data
- * flow library.
- */
-
-private import DataFlowImplCommon
-private import DataFlowImplSpecific::Private
-import DataFlowImplSpecific::Public
-private import DataFlowImpl
-import DataFlowImplCommonPublic
-import FlowStateString
-private import codeql.util.Unit
-
-/**
- * A configuration of interprocedural data flow analysis. This defines
- * sources, sinks, and any other configurable aspect of the analysis. Each
- * use of the global data flow library must define its own unique extension
- * of this abstract class. To create a configuration, extend this class with
- * a subclass whose characteristic predicate is a unique singleton string.
- * For example, write
- *
- * ```ql
- * class MyAnalysisConfiguration extends DataFlow::Configuration {
- *   MyAnalysisConfiguration() { this = "MyAnalysisConfiguration" }
- *   // Override `isSource` and `isSink`.
- *   // Optionally override `isBarrier`.
- *   // Optionally override `isAdditionalFlowStep`.
- * }
- * ```
- * Conceptually, this defines a graph where the nodes are `DataFlow::Node`s and
- * the edges are those data-flow steps that preserve the value of the node
- * along with any additional edges defined by `isAdditionalFlowStep`.
- * Specifying nodes in `isBarrier` will remove those nodes from the graph, and
- * specifying nodes in `isBarrierIn` and/or `isBarrierOut` will remove in-going
- * and/or out-going edges from those nodes, respectively.
- *
- * Then, to query whether there is flow between some `source` and `sink`,
- * write
- *
- * ```ql
- * exists(MyAnalysisConfiguration cfg | cfg.hasFlow(source, sink))
- * ```
- *
- * Multiple configurations can coexist, but two classes extending
- * `DataFlow::Configuration` should never depend on each other. One of them
- * should instead depend on a `DataFlow2::Configuration`, a
- * `DataFlow3::Configuration`, or a `DataFlow4::Configuration`.
- */
-abstract class Configuration extends string {
-  bindingset[this]
-  Configuration() { any() }
-
-  /**
-   * Holds if `source` is a relevant data flow source.
-   */
-  predicate isSource(Node source) { none() }
-
-  /**
-   * Holds if `source` is a relevant data flow source with the given initial
-   * `state`.
-   */
-  predicate isSource(Node source, FlowState state) { none() }
-
-  /**
-   * Holds if `sink` is a relevant data flow sink.
-   */
-  predicate isSink(Node sink) { none() }
-
-  /**
-   * Holds if `sink` is a relevant data flow sink accepting `state`.
-   */
-  predicate isSink(Node sink, FlowState state) { none() }
-
-  /**
-   * Holds if data flow through `node` is prohibited. This completely removes
-   * `node` from the data flow graph.
-   */
-  predicate isBarrier(Node node) { none() }
-
-  /**
-   * Holds if data flow through `node` is prohibited when the flow state is
-   * `state`.
-   */
-  predicate isBarrier(Node node, FlowState state) { none() }
-
-  /** Holds if data flow into `node` is prohibited. */
-  predicate isBarrierIn(Node node) { none() }
-
-  /** Holds if data flow out of `node` is prohibited. */
-  predicate isBarrierOut(Node node) { none() }
-
-  /**
-   * DEPRECATED: Use `isBarrier` and `BarrierGuard` module instead.
-   *
-   * Holds if data flow through nodes guarded by `guard` is prohibited.
-   */
-  deprecated predicate isBarrierGuard(BarrierGuard guard) { none() }
-
-  /**
-   * DEPRECATED: Use `isBarrier` and `BarrierGuard` module instead.
-   *
-   * Holds if data flow through nodes guarded by `guard` is prohibited when
-   * the flow state is `state`
-   */
-  deprecated predicate isBarrierGuard(BarrierGuard guard, FlowState state) { none() }
-
-  /**
-   * Holds if data may flow from `node1` to `node2` in addition to the normal data-flow steps.
-   */
-  predicate isAdditionalFlowStep(Node node1, Node node2) { none() }
-
-  /**
-   * Holds if data may flow from `node1` to `node2` in addition to the normal data-flow steps.
-   * This step is only applicable in `state1` and updates the flow state to `state2`.
-   */
-  predicate isAdditionalFlowStep(Node node1, FlowState state1, Node node2, FlowState state2) {
-    none()
-  }
-
-  /**
-   * Holds if an arbitrary number of implicit read steps of content `c` may be
-   * taken at `node`.
-   */
-  predicate allowImplicitRead(Node node, ContentSet c) { none() }
-
-  /**
-   * Gets the virtual dispatch branching limit when calculating field flow.
-   * This can be overridden to a smaller value to improve performance (a
-   * value of 0 disables field flow), or a larger value to get more results.
-   */
-  int fieldFlowBranchLimit() { result = 2 }
-
-  /**
-   * Gets a data flow configuration feature to add restrictions to the set of
-   * valid flow paths.
-   *
-   * - `FeatureHasSourceCallContext`:
-   *    Assume that sources have some existing call context to disallow
-   *    conflicting return-flow directly following the source.
-   * - `FeatureHasSinkCallContext`:
-   *    Assume that sinks have some existing call context to disallow
-   *    conflicting argument-to-parameter flow directly preceding the sink.
-   * - `FeatureEqualSourceSinkCallContext`:
-   *    Implies both of the above and additionally ensures that the entire flow
-   *    path preserves the call context.
-   *
-   * These features are generally not relevant for typical end-to-end data flow
-   * queries, but should only be used for constructing paths that need to
-   * somehow be pluggable in another path context.
-   */
-  FlowFeature getAFeature() { none() }
-
-  /** Holds if sources should be grouped in the result of `hasFlowPath`. */
-  predicate sourceGrouping(Node source, string sourceGroup) { none() }
-
-  /** Holds if sinks should be grouped in the result of `hasFlowPath`. */
-  predicate sinkGrouping(Node sink, string sinkGroup) { none() }
-
-  /**
-   * Holds if data may flow from `source` to `sink` for this configuration.
-   */
-  predicate hasFlow(Node source, Node sink) { hasFlow(source, sink, this) }
-
-  /**
-   * Holds if data may flow from `source` to `sink` for this configuration.
-   *
-   * The corresponding paths are generated from the end-points and the graph
-   * included in the module `PathGraph`.
-   */
-  predicate hasFlowPath(PathNode source, PathNode sink) { hasFlowPath(source, sink, this) }
-
-  /**
-   * Holds if data may flow from some source to `sink` for this configuration.
-   */
-  predicate hasFlowTo(Node sink) { hasFlowTo(sink, this) }
-
-  /**
-   * Holds if data may flow from some source to `sink` for this configuration.
-   */
-  predicate hasFlowToExpr(DataFlowExpr sink) { this.hasFlowTo(exprNode(sink)) }
-
-  /**
-   * DEPRECATED: Use `FlowExploration<explorationLimit>` instead.
-   *
-   * Gets the exploration limit for `hasPartialFlow` and `hasPartialFlowRev`
-   * measured in approximate number of interprocedural steps.
-   */
-  deprecated int explorationLimit() { none() }
-
-  /**
-   * Holds if hidden nodes should be included in the data flow graph.
-   *
-   * This feature should only be used for debugging or when the data flow graph
-   * is not visualized (for example in a `path-problem` query).
-   */
-  predicate includeHiddenNodes() { none() }
-}
-
-/**
- * This class exists to prevent mutual recursion between the user-overridden
- * member predicates of `Configuration` and the rest of the data-flow library.
- * Good performance cannot be guaranteed in the presence of such recursion, so
- * it should be replaced by using more than one copy of the data flow library.
- */
-abstract private class ConfigurationRecursionPrevention extends Configuration {
-  bindingset[this]
-  ConfigurationRecursionPrevention() { any() }
-
-  override predicate hasFlow(Node source, Node sink) {
-    strictcount(Node n | this.isSource(n)) < 0
-    or
-    strictcount(Node n | this.isSource(n, _)) < 0
-    or
-    strictcount(Node n | this.isSink(n)) < 0
-    or
-    strictcount(Node n | this.isSink(n, _)) < 0
-    or
-    strictcount(Node n1, Node n2 | this.isAdditionalFlowStep(n1, n2)) < 0
-    or
-    strictcount(Node n1, Node n2 | this.isAdditionalFlowStep(n1, _, n2, _)) < 0
-    or
-    super.hasFlow(source, sink)
-  }
-}
-
-/** A bridge class to access the deprecated `isBarrierGuard`. */
-private class BarrierGuardGuardedNodeBridge extends Unit {
-  abstract predicate guardedNode(Node n, Configuration config);
-
-  abstract predicate guardedNode(Node n, FlowState state, Configuration config);
-}
-
-private class BarrierGuardGuardedNode extends BarrierGuardGuardedNodeBridge {
-  deprecated override predicate guardedNode(Node n, Configuration config) {
-    exists(BarrierGuard g |
-      config.isBarrierGuard(g) and
-      n = g.getAGuardedNode()
-    )
-  }
-
-  deprecated override predicate guardedNode(Node n, FlowState state, Configuration config) {
-    exists(BarrierGuard g |
-      config.isBarrierGuard(g, state) and
-      n = g.getAGuardedNode()
-    )
-  }
-}
-
-private FlowState relevantState(Configuration config) {
-  config.isSource(_, result) or
-  config.isSink(_, result) or
-  config.isBarrier(_, result) or
-  config.isAdditionalFlowStep(_, result, _, _) or
-  config.isAdditionalFlowStep(_, _, _, result)
-}
-
-private newtype TConfigState =
-  TMkConfigState(Configuration config, FlowState state) {
-    state = relevantState(config) or state instanceof FlowStateEmpty
-  }
-
-private Configuration getConfig(TConfigState state) { state = TMkConfigState(result, _) }
-
-private FlowState getState(TConfigState state) { state = TMkConfigState(_, result) }
-
-private predicate singleConfiguration() { 1 = strictcount(Configuration c) }
-
-private module Config implements FullStateConfigSig {
-  class FlowState = TConfigState;
-
-  predicate isSource(Node source, FlowState state) {
-    getConfig(state).isSource(source, getState(state))
-    or
-    getConfig(state).isSource(source) and getState(state) instanceof FlowStateEmpty
-  }
-
-  predicate isSink(Node sink, FlowState state) {
-    getConfig(state).isSink(sink, getState(state))
-    or
-    getConfig(state).isSink(sink) and getState(state) instanceof FlowStateEmpty
-  }
-
-  predicate isBarrier(Node node) { none() }
-
-  predicate isBarrier(Node node, FlowState state) {
-    getConfig(state).isBarrier(node, getState(state)) or
-    getConfig(state).isBarrier(node) or
-    any(BarrierGuardGuardedNodeBridge b).guardedNode(node, getState(state), getConfig(state)) or
-    any(BarrierGuardGuardedNodeBridge b).guardedNode(node, getConfig(state))
-  }
-
-  predicate isBarrierIn(Node node) { any(Configuration config).isBarrierIn(node) }
-
-  predicate isBarrierOut(Node node) { any(Configuration config).isBarrierOut(node) }
-
-  predicate isAdditionalFlowStep(Node node1, Node node2) {
-    singleConfiguration() and
-    any(Configuration config).isAdditionalFlowStep(node1, node2)
-  }
-
-  predicate isAdditionalFlowStep(Node node1, FlowState state1, Node node2, FlowState state2) {
-    getConfig(state1).isAdditionalFlowStep(node1, getState(state1), node2, getState(state2)) and
-    getConfig(state2) = getConfig(state1)
-    or
-    not singleConfiguration() and
-    getConfig(state1).isAdditionalFlowStep(node1, node2) and
-    state2 = state1
-  }
-
-  predicate allowImplicitRead(Node node, ContentSet c) {
-    any(Configuration config).allowImplicitRead(node, c)
-  }
-
-  int fieldFlowBranchLimit() { result = min(any(Configuration config).fieldFlowBranchLimit()) }
-
-  FlowFeature getAFeature() { result = any(Configuration config).getAFeature() }
-
-  predicate sourceGrouping(Node source, string sourceGroup) {
-    any(Configuration config).sourceGrouping(source, sourceGroup)
-  }
-
-  predicate sinkGrouping(Node sink, string sinkGroup) {
-    any(Configuration config).sinkGrouping(sink, sinkGroup)
-  }
-
-  predicate includeHiddenNodes() { any(Configuration config).includeHiddenNodes() }
-}
-
-private import Impl<Config> as I
-
-/**
- * A `Node` augmented with a call context (except for sinks), an access path, and a configuration.
- * Only those `PathNode`s that are reachable from a source, and which can reach a sink, are generated.
- */
-class PathNode instanceof I::PathNode {
-  /** Gets a textual representation of this element. */
-  final string toString() { result = super.toString() }
-
-  /**
-   * Gets a textual representation of this element, including a textual
-   * representation of the call context.
-   */
-  final string toStringWithContext() { result = super.toStringWithContext() }
-
-  /**
-   * Holds if this element is at the specified location.
-   * The location spans column `startcolumn` of line `startline` to
-   * column `endcolumn` of line `endline` in file `filepath`.
-   * For more information, see
-   * [Locations](https://codeql.github.com/docs/writing-codeql-queries/providing-locations-in-codeql-queries/).
-   */
-  final predicate hasLocationInfo(
-    string filepath, int startline, int startcolumn, int endline, int endcolumn
-  ) {
-    super.hasLocationInfo(filepath, startline, startcolumn, endline, endcolumn)
-  }
-
-  /** Gets the underlying `Node`. */
-  final Node getNode() { result = super.getNode() }
-
-  /** Gets the `FlowState` of this node. */
-  final FlowState getState() { result = getState(super.getState()) }
-
-  /** Gets the associated configuration. */
-  final Configuration getConfiguration() { result = getConfig(super.getState()) }
-
-  /** Gets a successor of this node, if any. */
-  final PathNode getASuccessor() { result = super.getASuccessor() }
-
-  /** Holds if this node is a source. */
-  final predicate isSource() { super.isSource() }
-
-  /** Holds if this node is a grouping of source nodes. */
-  final predicate isSourceGroup(string group) { super.isSourceGroup(group) }
-
-  /** Holds if this node is a grouping of sink nodes. */
-  final predicate isSinkGroup(string group) { super.isSinkGroup(group) }
-}
-
-module PathGraph = I::PathGraph;
-
-private predicate hasFlow(Node source, Node sink, Configuration config) {
-  exists(PathNode source0, PathNode sink0 |
-    hasFlowPath(source0, sink0, config) and
-    source0.getNode() = source and
-    sink0.getNode() = sink
-  )
-}
-
-private predicate hasFlowPath(PathNode source, PathNode sink, Configuration config) {
-  I::flowPath(source, sink) and source.getConfiguration() = config
-}
-
-private predicate hasFlowTo(Node sink, Configuration config) { hasFlow(_, sink, config) }
-
-predicate flowsTo = hasFlow/3;
--- a/python/ql/lib/semmle/python/frameworks/Django.qll
+++ b/python/ql/lib/semmle/python/frameworks/Django.qll
@@ -2512,9 +2512,10 @@ module PrivateDjango {
          any(int i | i < routeHandler.getFirstPossibleRoutedParamIndex() | routeHandler.getArg(i))
      )
      or
-      exists(DjangoRouteHandler routeHandler, DjangoRouteRegex regex |
+      exists(DjangoRouteHandler routeHandler, DjangoRouteRegex regexUse, RegExp regex |
+        regex.getAUse() = regexUse and
        routeHandler = this.getARequestHandler() and
-        regex.getRouteSetup() = this
+        regexUse.getRouteSetup() = this
      |
        // either using named capture groups (passed as keyword arguments) or using
        // unnamed capture groups (passed as positional arguments)
@@ -2533,14 +2534,12 @@ module PrivateDjango {
  /**
   * A regex that is used to set up a route.
   *
-   * Needs this subclass to be considered a RegexString.
+   * Needs this subclass to be considered a RegExpInterpretation.
   */
-  private class DjangoRouteRegex extends RegexString instanceof StrConst {
+  private class DjangoRouteRegex extends RegExpInterpretation::Range {
    DjangoRegexRouteSetup rePathCall;

-    DjangoRouteRegex() {
-      rePathCall.getUrlPatternArg().getALocalSource() = DataFlow::exprNode(this)
-    }
+    DjangoRouteRegex() { this = rePathCall.getUrlPatternArg() }

    DjangoRegexRouteSetup getRouteSetup() { result = rePathCall }
  }
--- a/python/ql/lib/semmle/python/frameworks/Stdlib.qll
+++ b/python/ql/lib/semmle/python/frameworks/Stdlib.qll
@@ -3015,6 +3015,17 @@ private module StdlibPrivate {
    override string getKind() { result = Escaping::getRegexKind() }
  }

+  /**
+   * A node interpreted as a regular expression.
+   * Specifically, nodes where string values are interpreted as regular expressions.
+   */
+  private class StdLibRegExpInterpretation extends RegExpInterpretation::Range {
+    StdLibRegExpInterpretation() {
+      this =
+        API::moduleImport("re").getMember("compile").getACall().getParameter(0, "pattern").asSink()
+    }
+  }
+
  // ---------------------------------------------------------------------------
  // urllib
  // ---------------------------------------------------------------------------
--- a/python/ql/lib/semmle/python/frameworks/Tornado.qll
+++ b/python/ql/lib/semmle/python/frameworks/Tornado.qll
@@ -384,12 +384,12 @@ module Tornado {
  /**
   * A regex that is used to set up a route.
   *
-   * Needs this subclass to be considered a RegexString.
+   * Needs this subclass to be considered a RegExpInterpretation.
   */
-  private class TornadoRouteRegex extends RegexString instanceof StrConst {
+  private class TornadoRouteRegex extends RegExpInterpretation::Range {
    TornadoRouteSetup setup;

-    TornadoRouteRegex() { setup.getUrlPatternArg().getALocalSource() = DataFlow::exprNode(this) }
+    TornadoRouteRegex() { this = setup.getUrlPatternArg() }

    TornadoRouteSetup getRouteSetup() { result = setup }
  }
@@ -423,9 +423,10 @@ module Tornado {
        not result = requestHandler.getArg(0)
      )
      or
-      exists(Function requestHandler, TornadoRouteRegex regex |
+      exists(Function requestHandler, TornadoRouteRegex regexUse, RegExp regex |
+        regex.getAUse() = regexUse and
        requestHandler = this.getARequestHandler() and
-        regex.getRouteSetup() = this
+        regexUse.getRouteSetup() = this
      |
        // first group will have group number 1
        result = requestHandler.getArg(regex.getGroupNumber(_, _))
--- a/python/ql/lib/semmle/python/regex.qll
+++ b/python/ql/lib/semmle/python/regex.qll
--- a/python/ql/lib/semmle/python/regexp/RegexTreeView.qll
+++ b/python/ql/lib/semmle/python/regexp/RegexTreeView.qll
--- a/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll
+++ b/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll
--- a/python/ql/lib/semmle/python/regexp/internal/RegExpTracking.qll
+++ b/python/ql/lib/semmle/python/regexp/internal/RegExpTracking.qll
@@ -0,0 +1,76 @@
+/**
+ * Provides predicates that track strings to where they are used as regular expressions.
+ * This is implemented using TypeTracking in two phases:
+ *
+ * 1: An exploratory backwards analysis that imprecisely tracks all nodes that may be used as regular expressions.
+ * The exploratory phase ends with a forwards analysis from string constants that were reached by the backwards analysis.
+ * This is similar to the exploratory phase of the JavaScript global DataFlow library.
+ *
+ * 2: A precise type tracking analysis that tracks constant strings to where they are used as regular expressions.
+ * This phase keeps track of which strings and regular expressions end up in which places.
+ */
+
+import python
+private import semmle.python.dataflow.new.DataFlow
+private import semmle.python.Concepts as Concepts
+
+/** Gets a constant string value that may be used as a regular expression. */
+DataFlow::LocalSourceNode strStart() { result.asExpr() instanceof StrConst }
+
+private import semmle.python.regex as Regex
+
+/** A node where regular expressions that flow to the node are used. */
+class RegExpSink extends DataFlow::Node {
+  RegExpSink() {
+    this = any(Concepts::RegexExecution exec).getRegex()
+    or
+    this instanceof Concepts::RegExpInterpretation
+  }
+}
+
+/**
+ * Gets a dataflow node that may flow to any `RegExpSink` (a regular expression execution or interpretation).
+ * This is the backwards exploratory phase of the analysis.
+ */
+private DataFlow::TypeTrackingNode backwards(DataFlow::TypeBackTracker t) {
+  t.start() and
+  result = any(RegExpSink sink).getALocalSource()
+  or
+  exists(DataFlow::TypeBackTracker t2 | result = backwards(t2).backtrack(t2, t))
+}
+
+/**
+ * Gets a reference to a constant string that may reach any `RegExpSink`.
+ * This is the forwards exploratory phase of the analysis.
+ */
+private DataFlow::TypeTrackingNode forwards(DataFlow::TypeTracker t) {
+  t.start() and
+  result = backwards(DataFlow::TypeBackTracker::end()) and
+  result = strStart()
+  or
+  exists(DataFlow::TypeTracker t2 | result = forwards(t2).track(t2, t)) and
+  result = backwards(_)
+}
+
+/**
+ * Gets a node that the string constant `start` has been tracked to.
+ * This is used to figure out where `start` is evaluated as a regular expression.
+ *
+ * The result of the exploratory phase is used to limit the size of the search space in this precise analysis.
+ */
+private DataFlow::TypeTrackingNode regexTracking(DataFlow::Node start, DataFlow::TypeTracker t) {
+  result = forwards(t) and
+  (
+    t.start() and
+    start = strStart() and
+    result = start
+    or
+    exists(DataFlow::TypeTracker t2 | result = regexTracking(start, t2).track(t2, t))
+  )
+}
+
+/** Gets a constant string node whose value may be evaluated as a regular expression at `re`. */
+cached
+DataFlow::Node regExpSource(RegExpSink re) {
+  regexTracking(result, DataFlow::TypeTracker::end()).flowsTo(re)
+}
--- a/python/ql/lib/semmle/python/security/dataflow/PolynomialReDoSCustomizations.qll
+++ b/python/ql/lib/semmle/python/security/dataflow/PolynomialReDoSCustomizations.qll
@@ -11,7 +11,7 @@ private import semmle.python.dataflow.new.TaintTracking
 private import semmle.python.Concepts
 private import semmle.python.dataflow.new.RemoteFlowSources
 private import semmle.python.dataflow.new.BarrierGuards
-private import semmle.python.RegexTreeView::RegexTreeView as TreeView
+private import semmle.python.regexp.RegexTreeView::RegexTreeView as TreeView
 private import semmle.python.ApiGraphs
 private import semmle.python.regex

--- a/python/ql/lib/semmle/python/security/regexp/HostnameRegex.qll
+++ b/python/ql/lib/semmle/python/security/regexp/HostnameRegex.qll
@@ -5,14 +5,25 @@

 private import python
 private import semmle.python.dataflow.new.DataFlow
-private import semmle.python.RegexTreeView::RegexTreeView as TreeImpl
+private import semmle.python.regexp.RegexTreeView::RegexTreeView as TreeImpl
 private import semmle.python.dataflow.new.Regexp as Regexp
 private import codeql.regex.HostnameRegexp as Shared

 private module Impl implements Shared::HostnameRegexpSig<TreeImpl> {
  class DataFlowNode = DataFlow::Node;

-  class RegExpPatternSource = Regexp::RegExpPatternSource;
+  class RegExpPatternSource extends DataFlow::Node instanceof Regexp::RegExpPatternSource {
+    /**
+     * Gets a node where the pattern of this node is parsed as a part of
+     * a regular expression.
+     */
+    DataFlow::Node getAParse() { result = super.getAParse() }
+
+    /**
+     * Gets the root term of the regular expression parsed from this pattern.
+     */
+    TreeImpl::RegExpTerm getRegExpTerm() { result = super.getRegExpTerm() }
+  }
 }

 import Shared::Make<TreeImpl, Impl>
--- a/python/ql/src/Expressions/Regex/BackspaceEscape.ql
+++ b/python/ql/src/Expressions/Regex/BackspaceEscape.ql
@@ -13,7 +13,7 @@
 import python
 import semmle.python.regex

-from Regex r, int offset
+from RegExp r, int offset
 where
  r.escapingChar(offset) and
  r.getChar(offset + 1) = "b" and
--- a/python/ql/src/Expressions/Regex/DuplicateCharacterInSet.ql
+++ b/python/ql/src/Expressions/Regex/DuplicateCharacterInSet.ql
@@ -13,7 +13,7 @@
 import python
 import semmle.python.regex

-predicate duplicate_char_in_class(Regex r, string char) {
+predicate duplicate_char_in_class(RegExp r, string char) {
  exists(int i, int j, int x, int y, int start, int end |
    i != x and
    j != y and
@@ -36,7 +36,7 @@ predicate duplicate_char_in_class(Regex r, string char) {
  )
 }

-from Regex r, string char
+from RegExp r, string char
 where duplicate_char_in_class(r, char)
 select r,
  "This regular expression includes duplicate character '" + char + "' in a set of characters."
--- a/python/ql/src/Expressions/Regex/MissingPartSpecialGroup.ql
+++ b/python/ql/src/Expressions/Regex/MissingPartSpecialGroup.ql
@@ -13,6 +13,6 @@
 import python
 import semmle.python.regex

-from Regex r, string missing, string part
+from RegExp r, string missing, string part
 where r.getText().regexpMatch(".*\\(P<\\w+>.*") and missing = "?" and part = "named group"
 select r, "Regular expression is missing '" + missing + "' in " + part + "."
--- a/python/ql/src/Expressions/Regex/UnmatchableCaret.ql
+++ b/python/ql/src/Expressions/Regex/UnmatchableCaret.ql
@@ -13,14 +13,14 @@
 import python
 import semmle.python.regex

-predicate unmatchable_caret(Regex r, int start) {
+predicate unmatchable_caret(RegExp r, int start) {
  not r.getAMode() = "MULTILINE" and
  not r.getAMode() = "VERBOSE" and
  r.specialCharacter(start, start + 1, "^") and
  not r.firstItem(start, start + 1)
 }

-from Regex r, int offset
+from RegExp r, int offset
 where unmatchable_caret(r, offset)
 select r,
  "This regular expression includes an unmatchable caret at offset " + offset.toString() + "."
--- a/python/ql/src/Expressions/Regex/UnmatchableDollar.ql
+++ b/python/ql/src/Expressions/Regex/UnmatchableDollar.ql
@@ -13,14 +13,14 @@
 import python
 import semmle.python.regex

-predicate unmatchable_dollar(Regex r, int start) {
+predicate unmatchable_dollar(RegExp r, int start) {
  not r.getAMode() = "MULTILINE" and
  not r.getAMode() = "VERBOSE" and
  r.specialCharacter(start, start + 1, "$") and
  not r.lastItem(start, start + 1)
 }

-from Regex r, int offset
+from RegExp r, int offset
 where unmatchable_dollar(r, offset)
 select r,
  "This regular expression includes an unmatchable dollar at offset " + offset.toString() + "."
--- a/python/ql/src/Security/CWE-020/OverlyLargeRange.ql
+++ b/python/ql/src/Security/CWE-020/OverlyLargeRange.ql
@@ -12,7 +12,7 @@
 *       external/cwe/cwe-020
 */

-private import semmle.python.RegexTreeView::RegexTreeView as TreeView
+private import semmle.python.regexp.RegexTreeView::RegexTreeView as TreeView
 import codeql.regex.OverlyLargeRangeQuery::Make<TreeView>

 from TreeView::RegExpCharacterRange range, string reason
--- a/python/ql/src/Security/CWE-116/BadTagFilter.ql
+++ b/python/ql/src/Security/CWE-116/BadTagFilter.ql
@@ -14,7 +14,7 @@
 *       external/cwe/cwe-186
 */

-private import semmle.python.RegexTreeView::RegexTreeView as TreeView
+private import semmle.python.regexp.RegexTreeView::RegexTreeView as TreeView
 import codeql.regex.nfa.BadTagFilterQuery::Make<TreeView>

 from HtmlMatchingRegExp regexp, string msg
--- a/python/ql/src/Security/CWE-730/ReDoS.ql
+++ b/python/ql/src/Security/CWE-730/ReDoS.ql
@@ -14,7 +14,7 @@
 *       external/cwe/cwe-400
 */

-private import semmle.python.RegexTreeView::RegexTreeView as TreeView
+private import semmle.python.regexp.RegexTreeView::RegexTreeView as TreeView
 import codeql.regex.nfa.ExponentialBackTracking::Make<TreeView>

 from TreeView::RegExpTerm t, string pump, State s, string prefixMsg
--- a/python/ql/test/library-tests/regex/Alternation.ql
+++ b/python/ql/test/library-tests/regex/Alternation.ql
@@ -1,7 +1,7 @@
 import python
 import semmle.python.regex

-from Regex r, int start, int end, int part_start, int part_end
+from RegExp r, int start, int end, int part_start, int part_end
 where
  r.getLocation().getFile().getBaseName() = "test.py" and
  r.alternationOption(start, end, part_start, part_end)
--- a/python/ql/test/library-tests/regex/Characters.ql
+++ b/python/ql/test/library-tests/regex/Characters.ql
@@ -6,6 +6,6 @@
 import python
 import semmle.python.regex

-from Regex r, int start, int end
+from RegExp r, int start, int end
 where r.character(start, end) and r.getLocation().getFile().getBaseName() = "test.py"
 select r.getText(), start, end
--- a/python/ql/test/library-tests/regex/Consistency.ql
+++ b/python/ql/test/library-tests/regex/Consistency.ql
@@ -7,6 +7,6 @@ import semmle.python.regex

 from string str, Location loc, int counter
 where
-  counter = strictcount(Regex term | term.getLocation() = loc and term.getText() = str) and
+  counter = strictcount(RegExp term | term.getLocation() = loc and term.getText() = str) and
  counter > 1
 select str, counter, loc
--- a/python/ql/test/library-tests/regex/FirstLast.ql
+++ b/python/ql/test/library-tests/regex/FirstLast.ql
@@ -1,12 +1,12 @@
 import python
 import semmle.python.regex

-predicate part(Regex r, int start, int end, string kind) {
+predicate part(RegExp r, int start, int end, string kind) {
  r.lastItem(start, end) and kind = "last"
  or
  r.firstItem(start, end) and kind = "first"
 }

-from Regex r, int start, int end, string kind
+from RegExp r, int start, int end, string kind
 where part(r, start, end, kind) and r.getLocation().getFile().getBaseName() = "test.py"
 select r.getText(), kind, start, end
--- a/python/ql/test/library-tests/regex/GroupContents.ql
+++ b/python/ql/test/library-tests/regex/GroupContents.ql
@@ -1,7 +1,7 @@
 import python
 import semmle.python.regex

-from Regex r, int start, int end, int part_start, int part_end
+from RegExp r, int start, int end, int part_start, int part_end
 where
  r.getLocation().getFile().getBaseName() = "test.py" and
  r.groupContents(start, end, part_start, part_end)
--- a/python/ql/test/library-tests/regex/Mode.ql
+++ b/python/ql/test/library-tests/regex/Mode.ql
@@ -1,6 +1,6 @@
 import python
 import semmle.python.regex

-from Regex r
+from RegExp r
 where r.getLocation().getFile().getBaseName() = "test.py"
 select r.getLocation().getStartLine(), r.getAMode()
--- a/python/ql/test/library-tests/regex/Qualified.ql
+++ b/python/ql/test/library-tests/regex/Qualified.ql
@@ -1,7 +1,7 @@
 import python
 import semmle.python.regex

-from Regex r, int start, int end, boolean maybe_empty, boolean may_repeat_forever
+from RegExp r, int start, int end, boolean maybe_empty, boolean may_repeat_forever
 where
  r.getLocation().getFile().getBaseName() = "test.py" and
  r.qualifiedItem(start, end, maybe_empty, may_repeat_forever)
--- a/python/ql/test/library-tests/regex/Regex.ql
+++ b/python/ql/test/library-tests/regex/Regex.ql
@@ -1,7 +1,7 @@
 import python
 import semmle.python.regex

-predicate part(Regex r, int start, int end, string kind) {
+predicate part(RegExp r, int start, int end, string kind) {
  r.alternation(start, end) and kind = "choice"
  or
  r.normalCharacter(start, end) and kind = "char"
@@ -23,6 +23,6 @@ predicate part(Regex r, int start, int end, string kind) {
  r.qualifiedItem(start, end, _, _) and kind = "qualified"
 }

-from Regex r, int start, int end, string kind
+from RegExp r, int start, int end, string kind
 where part(r, start, end, kind) and r.getLocation().getFile().getBaseName() = "test.py"
 select r.getText(), kind, start, end
--- a/python/ql/test/library-tests/regex/SubstructureTests.ql
+++ b/python/ql/test/library-tests/regex/SubstructureTests.ql
@@ -10,7 +10,7 @@ class CharacterSetTest extends InlineExpectationsTest {
  override predicate hasActualResult(Location location, string element, string tag, string value) {
    exists(location.getFile().getRelativePath()) and
    location.getFile().getBaseName() = "charSetTest.py" and
-    exists(Regex re, int start, int end |
+    exists(RegExp re, int start, int end |
      re.charSet(start, end) and
      location = re.getLocation() and
      element = re.getText().substring(start, end) and
@@ -28,7 +28,7 @@ class CharacterRangeTest extends InlineExpectationsTest {
  override predicate hasActualResult(Location location, string element, string tag, string value) {
    exists(location.getFile().getRelativePath()) and
    location.getFile().getBaseName() = "charRangeTest.py" and
-    exists(Regex re, int start, int lower_end, int upper_start, int end |
+    exists(RegExp re, int start, int lower_end, int upper_start, int end |
      re.charRange(_, start, lower_end, upper_start, end) and
      location = re.getLocation() and
      element = re.getText().substring(start, end) and
@@ -46,7 +46,7 @@ class EscapeTest extends InlineExpectationsTest {
  override predicate hasActualResult(Location location, string element, string tag, string value) {
    exists(location.getFile().getRelativePath()) and
    location.getFile().getBaseName() = "escapedCharacterTest.py" and
-    exists(Regex re, int start, int end |
+    exists(RegExp re, int start, int end |
      re.escapedCharacter(start, end) and
      location = re.getLocation() and
      element = re.getText().substring(start, end) and
@@ -64,7 +64,7 @@ class GroupTest extends InlineExpectationsTest {
  override predicate hasActualResult(Location location, string element, string tag, string value) {
    exists(location.getFile().getRelativePath()) and
    location.getFile().getBaseName() = "groupTest.py" and
-    exists(Regex re, int start, int end |
+    exists(RegExp re, int start, int end |
      re.group(start, end) and
      location = re.getLocation() and
      element = re.getText().substring(start, end) and
--- a/python/ql/test/library-tests/regexparser/Consistency.ql
+++ b/python/ql/test/library-tests/regexparser/Consistency.ql
@@ -3,7 +3,7 @@
 */

 import python
-import semmle.python.RegexTreeView
+import semmle.python.regexp.RegexTreeView

 from string str, int counter, Location loc
 where
--- a/python/ql/test/query-tests/Security/CWE-730-PolynomialReDoS/PolynomialBackTracking.ql
+++ b/python/ql/test/query-tests/Security/CWE-730-PolynomialReDoS/PolynomialBackTracking.ql
@@ -1,5 +1,5 @@
 import python
-private import semmle.python.RegexTreeView::RegexTreeView as TreeView
+private import semmle.python.regexp.RegexTreeView::RegexTreeView as TreeView
 import codeql.regex.nfa.SuperlinearBackTracking::Make<TreeView>

 from PolynomialBackTrackingTerm t