refactor to share predicates between regular expression queries

2025-12-23 04:06:37 +01:00 · 2020-12-15 00:18:13 +01:00
parent b2116dc5b4
commit 34dda6d38b
3 changed files with 941 additions and 891 deletions
--- a/javascript/ql/src/Performance/ReDoS.ql
+++ b/javascript/ql/src/Performance/ReDoS.ql
@@ -13,7 +13,7 @@
 */
 import javascript
-import semmle.javascript.security.performance.SuperlinearBackTracking
+import semmle.javascript.security.performance.ReDoSUtil
 /*
 * This query implements the analysis described in the following two papers:
@@ -80,35 +80,11 @@ import semmle.javascript.security.performance.SuperlinearBackTracking
 */
 /**
- * A branch in a disjunction that is the root node in a literal, or a literal
+ * Holds if state `s` might be inside a backtracking repetition.
 * whose root node is not a disjunction.
 */
-class RegExpRoot extends RegExpTerm {
+pragma[noinline]
-  RegExpParent parent;
+predicate stateInsideBacktracking(State s) {
-
+  s.getRepr().getParent*() instanceof MaybeBacktrackingRepetition
  RegExpRoot() {
    exists(RegExpAlt alt |
      alt.isRootTerm() and
      this = alt.getAChild() and
      parent = alt.getParent()
    )
    or
    this.isRootTerm() and
    not this instanceof RegExpAlt and
    parent = this.getParent()
  }
  /**
   * Holds if this root term is relevant to the ReDoS analysis.
   */
  predicate isRelevant() {
    // there is at least one repetition
    exists(MaybeBacktrackingRepetition rep | getRoot(rep) = this) and
    // there are no lookbehinds
    not exists(RegExpLookbehind lbh | getRoot(lbh) = this) and
    // is actually used as a RegExp
    isUsedAsRegExp()
  }
 }
 /**
@@ -125,503 +101,6 @@ class MaybeBacktrackingRepetition extends InfiniteRepetitionQuantifier {
  }
 }
 /**
 * A constant in a regular expression that represents valid Unicode character(s).
 */
 class RegexpCharacterConstant extends RegExpConstant {
  RegexpCharacterConstant() { this.isCharacter() }
 }
 /**
 * Gets the root containing the given term, that is, the root of the literal,
 * or a branch of the root disjunction.
 */
 RegExpRoot getRoot(RegExpTerm term) {
  result = term or
  result = getRoot(term.getParent())
 }
 /**
 * An abstract input symbol, representing a set of concrete characters.
 */
 newtype TInputSymbol =
  /** An input symbol corresponding to character `c`. */
  Char(string c) {
    c = any(RegexpCharacterConstant cc | getRoot(cc).isRelevant()).getValue().charAt(_)
  } or
  /**
   * An input symbol representing all characters matched by
   * (non-universal) character class `recc`.
   */
  CharClass(RegExpTerm recc) {
    getRoot(recc).isRelevant() and
    (
      recc instanceof RegExpCharacterClass and
      not recc.(RegExpCharacterClass).isUniversalClass()
      or
      recc instanceof RegExpCharacterClassEscape
    )
  } or
  /** An input symbol representing all characters matched by `.`. */
  Dot() or
  /** An input symbol representing all characters. */
  Any() or
  /** An epsilon transition in the automaton. */
  Epsilon()
 /**
 * Holds if `a` and `b` are input symbols from the same regexp.
 * (And not a `Dot()`, `Any()` or `Epsilon()`)
 */
 private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
  exists(RegExpRoot root |
    belongsTo(a, root) and
    belongsTo(b, root)
  )
 }
 /**
 * Holds if the `a` is an input symbol from a regexp that has root `root`.
 */
 private predicate belongsTo(TInputSymbol a, RegExpRoot root) {
  exists(RegExpTerm term | getRoot(term) = root |
    a = Char(term.(RegexpCharacterConstant).getValue().charAt(_))
    or
    a = CharClass(term)
  )
 }
 /**
 * An abstract input symbol, representing a set of concrete characters.
 */
 class InputSymbol extends TInputSymbol {
  InputSymbol() { not this instanceof Epsilon }
  string toString() {
    this = Char(result)
    or
    result = any(RegExpTerm recc | this = CharClass(recc)).toString()
    or
    this = Dot() and result = "."
    or
    this = Any() and result = "[^]"
  }
 }
 /**
 * An abstract input symbol that represents a character class.
 */
 abstract class CharacterClass extends InputSymbol {
  /**
   * Gets a character that is relevant for intersection-tests involving this
   * character class.
   *
   * Specifically, this is any of the characters mentioned explicitly in the
   * character class, offset by one if it is inverted. For character class escapes,
   * the result is as if the class had been written out as a series of intervals.
   *
   * This set is large enough to ensure that for any two intersecting character
   * classes, one contains a relevant character from the other.
   */
  abstract string getARelevantChar();
  /**
   * Holds if this character class matches `char`.
   */
  bindingset[char]
  abstract predicate matches(string char);
  /**
   * Gets a character matched by this character class.
   */
  string choose() { result = getARelevantChar() and matches(result) }
 }
 /**
 * Provides implementations for `CharacterClass`.
 */
 private module CharacterClasses {
  /**
   * Holds if the character class `cc` has a child (constant or range) that matches `char`.
   */
  pragma[noinline]
  predicate hasChildThatMatches(RegExpCharacterClass cc, string char) {
    exists(CharClass(cc)) and
    exists(RegExpTerm child | child = cc.getAChild() |
      char = child.(RegexpCharacterConstant).getValue()
      or
      rangeMatchesOnLetterOrDigits(child, char)
      or
      not rangeMatchesOnLetterOrDigits(child, _) and
      char = getARelevantChar() and
      exists(string lo, string hi | child.(RegExpCharacterRange).isRange(lo, hi) |
        lo <= char and
        char <= hi
      )
      or
      exists(RegExpCharacterClassEscape escape | escape = child |
        escape.getValue() = escape.getValue().toLowerCase() and
        classEscapeMatches(escape.getValue(), char)
        or
        char = getARelevantChar() and
        escape.getValue() = escape.getValue().toUpperCase() and
        not classEscapeMatches(escape.getValue().toLowerCase(), char)
      )
    )
  }
  /**
   * Holds if `range` is a range on lower-case, upper-case, or digits, and matches `char`.
   * This predicate is used to restrict the searchspace for ranges by only joining `getAnyPossiblyMatchedChar`
   * on a few ranges.
   */
  private predicate rangeMatchesOnLetterOrDigits(RegExpCharacterRange range, string char) {
    exists(string lo, string hi |
      range.isRange(lo, hi) and lo = lowercaseLetter() and hi = lowercaseLetter()
    |
      lo <= char and
      char <= hi and
      char = lowercaseLetter()
    )
    or
    exists(string lo, string hi |
      range.isRange(lo, hi) and lo = upperCaseLetter() and hi = upperCaseLetter()
    |
      lo <= char and
      char <= hi and
      char = upperCaseLetter()
    )
    or
    exists(string lo, string hi | range.isRange(lo, hi) and lo = digit() and hi = digit() |
      lo <= char and
      char <= hi and
      char = digit()
    )
  }
  private string lowercaseLetter() { result = "abdcefghijklmnopqrstuvwxyz".charAt(_) }
  private string upperCaseLetter() { result = "ABCDEFGHIJKLMNOPQRSTUVWXYZ".charAt(_) }
  private string digit() { result = [0 .. 9].toString() }
  /**
   * Gets a char that could be matched by a regular expression.
   * Includes all printable ascii chars, all constants mentioned in a regexp, and all chars matches by the regexp `/\s|\d|\w/`.
   */
  string getARelevantChar() {
    exists(ascii(result))
    or
    exists(RegexpCharacterConstant c | result = c.getValue().charAt(_))
    or
    classEscapeMatches(_, result)
  }
  /**
   * Gets a char that is mentioned in the character class `c`.
   */
  private string getAMentionedChar(RegExpCharacterClass c) {
    exists(RegExpTerm child | child = c.getAChild() |
      result = child.(RegexpCharacterConstant).getValue()
      or
      child.(RegExpCharacterRange).isRange(result, _)
      or
      child.(RegExpCharacterRange).isRange(_, result)
      or
      exists(RegExpCharacterClassEscape escape | child = escape |
        result = min(string s | classEscapeMatches(escape.getValue().toLowerCase(), s))
        or
        result = max(string s | classEscapeMatches(escape.getValue().toLowerCase(), s))
      )
    )
  }
  /**
   * An implementation of `CharacterClass` for positive (non inverted) character classes.
   */
  private class PositiveCharacterClass extends CharacterClass {
    RegExpCharacterClass cc;
    PositiveCharacterClass() { this = CharClass(cc) and not cc.isInverted() }
    override string getARelevantChar() { result = getAMentionedChar(cc) }
    override predicate matches(string char) { hasChildThatMatches(cc, char) }
  }
  /**
   * An implementation of `CharacterClass` for inverted character classes.
   */
  private class InvertedCharacterClass extends CharacterClass {
    RegExpCharacterClass cc;
    InvertedCharacterClass() { this = CharClass(cc) and cc.isInverted() }
    override string getARelevantChar() {
      result = nextChar(getAMentionedChar(cc)) or
      nextChar(result) = getAMentionedChar(cc)
    }
    bindingset[char]
    override predicate matches(string char) { not hasChildThatMatches(cc, char) }
  }
  /**
   * Holds if the character class escape `clazz` (\d, \s, or \w) matches `char`.
   */
  pragma[noinline]
  private predicate classEscapeMatches(string clazz, string char) {
    clazz = "d" and
    char = "0123456789".charAt(_)
    or
    clazz = "s" and
    (
      char = [" ", "\t", "\r", "\n"]
      or
      char = getARelevantChar() and
      char.regexpMatch("\\u000b|\\u000c") // \v|\f (vertical tab | form feed)
    )
    or
    clazz = "w" and
    char = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_".charAt(_)
  }
  /**
   * An implementation of `CharacterClass` for \d, \s, and \w.
   */
  private class PositiveCharacterClassEscape extends CharacterClass {
    RegExpCharacterClassEscape cc;
    PositiveCharacterClassEscape() { this = CharClass(cc) and cc.getValue() = ["d", "s", "w"] }
    override string getARelevantChar() {
      cc.getValue() = "d" and
      result = ["0", "9"]
      or
      cc.getValue() = "s" and
      result = [" "]
      or
      cc.getValue() = "w" and
      result = ["a", "Z", "_", "0", "9"]
    }
    override predicate matches(string char) { classEscapeMatches(cc.getValue(), char) }
    override string choose() {
      cc.getValue() = "d" and
      result = "9"
      or
      cc.getValue() = "s" and
      result = [" "]
      or
      cc.getValue() = "w" and
      result = "a"
    }
  }
  /**
   * An implementation of `CharacterClass` for \D, \S, and \W.
   */
  private class NegativeCharacterClassEscape extends CharacterClass {
    RegExpCharacterClassEscape cc;
    NegativeCharacterClassEscape() { this = CharClass(cc) and cc.getValue() = ["D", "S", "W"] }
    override string getARelevantChar() {
      cc.getValue() = "D" and
      result = ["a", "Z", "!"]
      or
      cc.getValue() = "S" and
      result = ["a", "9", "!"]
      or
      cc.getValue() = "W" and
      result = [" ", "!"]
    }
    bindingset[char]
    override predicate matches(string char) {
      not classEscapeMatches(cc.getValue().toLowerCase(), char)
    }
  }
 }
 newtype TState =
  Match(RegExpTerm t, int i) {
    getRoot(t).isRelevant() and
    (
      i = 0
      or
      exists(t.(RegexpCharacterConstant).getValue().charAt(i))
    )
  } or
  Accept(RegExpRoot l) { l.isRelevant() } or
  AcceptAnySuffix(RegExpRoot l) { l.isRelevant() }
 /**
 * A state in the NFA corresponding to a regular expression.
 *
 * Each regular expression literal `l` has one accepting state
 * `Accept(l)`, one state that accepts all suffixes `AcceptAnySuffix(l)`,
 * and a state `Match(t, i)` for every subterm `t`,
 * which represents the state of the NFA before starting to
 * match `t`, or the `i`th character in `t` if `t` is a constant.
 */
 class State extends TState {
  RegExpTerm repr;
  State() {
    this = Match(repr, _) or
    this = Accept(repr) or
    this = AcceptAnySuffix(repr)
  }
  string toString() {
    exists(int i | this = Match(repr, i) | result = "Match(" + repr + "," + i + ")")
    or
    this instanceof Accept and
    result = "Accept(" + repr + ")"
    or
    this instanceof AcceptAnySuffix and
    result = "AcceptAny(" + repr + ")"
  }
  Location getLocation() { result = repr.getLocation() }
  /**
   * Gets the term represented by this state.
   */
  RegExpTerm getRepr() { result = repr }
 }
 class EdgeLabel extends TInputSymbol {
  string toString() {
    this = Epsilon() and result = ""
    or
    exists(InputSymbol s | this = s and result = s.toString())
  }
 }
 /**
 * Gets the state before matching `t`.
 */
 pragma[inline]
 State before(RegExpTerm t) { result = Match(t, 0) }
 /**
 * Gets a state the NFA may be in after matching `t`.
 */
 State after(RegExpTerm t) {
  exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt))
  or
  exists(RegExpSequence seq, int i | t = seq.getChild(i) |
    result = before(seq.getChild(i + 1))
    or
    i + 1 = seq.getNumChild() and result = after(seq)
  )
  or
  exists(RegExpGroup grp | t = grp.getAChild() | result = after(grp))
  or
  exists(RegExpStar star | t = star.getAChild() | result = before(star))
  or
  exists(RegExpPlus plus | t = plus.getAChild() |
    result = before(plus) or
    result = after(plus)
  )
  or
  exists(RegExpOpt opt | t = opt.getAChild() | result = after(opt))
  or
  exists(RegExpRoot root | t = root | result = AcceptAnySuffix(root))
 }
 /**
 * Holds if the NFA has a transition from `q1` to `q2` labelled with `lbl`.
 */
 predicate delta(State q1, EdgeLabel lbl, State q2) {
  exists(RegexpCharacterConstant s, int i |
    q1 = Match(s, i) and
    lbl = Char(s.getValue().charAt(i)) and
    (
      q2 = Match(s, i + 1)
      or
      s.getValue().length() = i + 1 and
      q2 = after(s)
    )
  )
  or
  exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) |
    if dot.getLiteral().isDotAll() then lbl = Any() else lbl = Dot()
  )
  or
  exists(RegExpCharacterClass cc |
    cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
    or
    q1 = before(cc) and
    lbl = CharClass(cc) and
    q2 = after(cc)
  )
  or
  exists(RegExpCharacterClassEscape cc |
    q1 = before(cc) and
    lbl = CharClass(cc) and
    q2 = after(cc)
  )
  or
  exists(RegExpAlt alt | lbl = Epsilon() | q1 = before(alt) and q2 = before(alt.getAChild()))
  or
  exists(RegExpSequence seq | lbl = Epsilon() | q1 = before(seq) and q2 = before(seq.getChild(0)))
  or
  exists(RegExpGroup grp | lbl = Epsilon() | q1 = before(grp) and q2 = before(grp.getChild(0)))
  or
  exists(RegExpStar star | lbl = Epsilon() |
    q1 = before(star) and q2 = before(star.getChild(0))
    or
    q1 = before(star) and q2 = after(star)
  )
  or
  exists(RegExpPlus plus | lbl = Epsilon() | q1 = before(plus) and q2 = before(plus.getChild(0)))
  or
  exists(RegExpOpt opt | lbl = Epsilon() |
    q1 = before(opt) and q2 = before(opt.getChild(0))
    or
    q1 = before(opt) and q2 = after(opt)
  )
  or
  exists(RegExpRoot root | q1 = AcceptAnySuffix(root) |
    lbl = Any() and q2 = q1
    or
    lbl = Epsilon() and q2 = Accept(root)
  )
  or
  exists(RegExpDollar dollar | q1 = before(dollar) |
    lbl = Epsilon() and q2 = Accept(getRoot(dollar))
  )
 }
 /**
 * Gets a state that `q` has an epsilon transition to.
 */
 State epsilonSucc(State q) { delta(q, Epsilon(), result) }
 /**
 * Gets a state that has an epsilon transition to `q`.
 */
 State epsilonPred(State q) { q = epsilonSucc(result) }
 /**
 * Holds if there is a state `q` that can be reached from `q1`
 * along epsilon edges, such that there is a transition from
 * `q` to `q2` that consumes symbol `s`.
 */
 predicate deltaClosed(State q1, InputSymbol s, State q2) { delta(epsilonSucc*(q1), s, q2) }
 /**
 * Holds if state `s` might be inside a backtracking repetition.
 */
 pragma[noinline]
 predicate stateInsideBacktracking(State s) {
  s.getRepr().getParent*() instanceof MaybeBacktrackingRepetition
 }
 /**
 * A state in the product automaton.
 *
@@ -759,96 +238,6 @@ class Trace extends TTrace {
  }
 }
 /**
 * Gets the minimum char that is matched by both the character classes `c` and `d`.
 */
 private string getMinOverlapBetweenCharacterClasses(CharacterClass c, CharacterClass d) {
  result = min(getAOverlapBetweenCharacterClasses(c, d))
 }
 /**
 * Gets a char that is matched by both the character classes `c` and `d`.
 * And `c` and `d` is not the same character class.
 */
 private string getAOverlapBetweenCharacterClasses(CharacterClass c, CharacterClass d) {
  sharesRoot(c, d) and
  result = [c.getARelevantChar(), d.getARelevantChar()] and
  c.matches(result) and
  d.matches(result) and
  not c = d
 }
 /**
 * Gets a character that is represented by both `c` and `d`.
 */
 string intersect(InputSymbol c, InputSymbol d) {
  c = Char(result) and
  d = getAnInputSymbolMatching(result) and
  (
    sharesRoot(c, d)
    or
    d = Dot()
    or
    d = Any()
  )
  or
  result = getMinOverlapBetweenCharacterClasses(c, d)
  or
  result = c.(CharacterClass).choose() and
  (
    d = c
    or
    d = Dot() and
    not (result = "\n" or result = "\r")
    or
    d = Any()
  )
  or
  c = Dot() and
  (
    d = Dot() and result = "a"
    or
    d = Any() and result = "a"
  )
  or
  c = Any() and d = Any() and result = "a"
  or
  result = intersect(d, c)
 }
 /**
 * Gets a symbol that matches `char`.
 */
 bindingset[char]
 InputSymbol getAnInputSymbolMatching(string char) {
  result = Char(char)
  or
  result.(CharacterClass).matches(char)
  or
  result = Dot() and
  not (char = "\n" or char = "\r")
  or
  result = Any()
 }
 /**
 * Gets the char after `c` (from a simplified ASCII table).
 */
 string nextChar(string c) { exists(int code | code = ascii(c) | code + 1 = ascii(result)) }
 /**
 * Gets an approximation for the ASCII code for `char`.
 * Only the easily printable chars are included (so no newline, tab, null, etc).
 */
 int ascii(string char) {
  char =
    rank[result](string c |
      c =
        "! \"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"
            .charAt(_)
    )
 }
 /**
 * Gets a string corresponding to the trace `t`.
 */
@@ -903,273 +292,16 @@ predicate isPumpable(State fork, string w) {
 }
 /**
- * Predicates for constructing a prefix string that leads to a given state.
+ * An instantiation of `ReDoSConfiguration` for exponential backtracking.
 */
-module PrefixConstruction {
+class ExponentialReDoSConfiguration extends ReDoSConfiguration {
-  /**
+  ExponentialReDoSConfiguration() { this = "ExponentialReDoSConfiguration" }
   * Holds if `state` starts the string matched by the regular expression.
   */
  private predicate isStartState(State state) {
    state instanceof StateInPumpableRegexp and
    (
      state = Match(any(RegExpRoot r), _)
      or
      exists(RegExpCaret car | state = after(car))
    )
  }
-  /**
+  override predicate isReDoSCandidate(State state, string pump) { isPumpable(state, pump) }
   * Holds if `state` is the textually last start state for the regular expression.
   */
  private predicate lastStartState(State state) {
    exists(RegExpRoot root |
      state =
        max(State s, Location l |
          isStartState(s) and getRoot(s.getRepr()) = root and l = s.getRepr().getLocation()
        |
          s order by l.getStartLine(), l.getStartColumn()
        )
    )
  }
  /**
   * Holds if there exists any transition (Epsilon() or other) from `a` to `b`.
   */
  private predicate existsTransition(State a, State b) { delta(a, _, b) }
  /**
   * Gets the minimum number of transitions it takes to reach `state` from the `start` state.
   */
  int prefixLength(State start, State state) =
    shortestDistances(lastStartState/1, existsTransition/2)(start, state, result)
  /**
   * Gets the minimum number of transitions it takes to reach `state` from the start state.
   */
  private int lengthFromStart(State state) { result = prefixLength(_, state) }
  /**
   * Gets a string for which the regular expression will reach `state`.
   *
   * Has at most one result for any given `state`.
   * This predicate will not always have a result even if there is a ReDoS issue in
   * the regular expression.
   */
  string prefix(State state) {
    lastStartState(state) and
    result = ""
    or
    // the search stops past the last redos candidate state.
    lengthFromStart(state) <= max(lengthFromStart(any(State s | isReDoSCandidate(s, _)))) and
    exists(State prev |
      // select a unique predecessor (by an arbitrary measure)
      prev =
        min(State s, Location loc |
          lengthFromStart(s) = lengthFromStart(state) - 1 and
          loc = s.getRepr().getLocation() and
          delta(s, _, state)
        |
          s order by loc.getStartLine(), loc.getStartColumn(), loc.getEndLine(), loc.getEndColumn()
        )
    |
      // greedy search for the shortest prefix
      result = prefix(prev) and delta(prev, Epsilon(), state)
      or
      not delta(prev, Epsilon(), state) and
      result =
        prefix(prev) +
          min(string c | delta(prev, any(InputSymbol symbol | c = intersect(Any(), symbol)), state))
    )
  }
  /**
   * A state within a regular expression that has a pumpable state.
   */
  class StateInPumpableRegexp extends State {
    pragma[noinline]
    StateInPumpableRegexp() {
      exists(State s | isReDoSCandidate(s, _) | getRoot(s.getRepr()) = getRoot(this.getRepr()))
    }
  }
 }
 /**
 * Predicates for testing the presence of a rejecting suffix.
 *
 * These predicates are used to ensure that the all states reached from the fork
 * by repeating `w` have a rejecting suffix.
 *
 * For example, a regexp like `/^(a+)+/` will accept any string as long the prefix is
 * some number of `"a"`s, and it is therefore not possible to construct a rejecting suffix.
 *
 * A regexp like `/(a+)+$/` or `/(a+)+b/` trivially has a rejecting suffix,
 * as the suffix "X" will cause both the regular expressions to be rejected.
 *
 * The string `w` is repeated any number of times because it needs to be
 * infinitely repeatedable for the attack to work.
 * For the regular expression `/((ab)+)*abab/` the accepting state is not reachable from the fork
 * using epsilon transitions. But any attempt at repeating `w` will end in a state that accepts all suffixes.
 */
 module SuffixConstruction {
  import PrefixConstruction
  /**
   * Holds if all states reachable from `fork` by repeating `w`
   * are likely rejectable by appending some suffix.
   */
  predicate reachesOnlyRejectableSuffixes(State fork, string w) {
    isReDoSCandidate(fork, w) and
    forex(State next | next = process(fork, w, w.length() - 1) | isLikelyRejectable(next))
  }
  /**
   * Holds if there likely exists a suffix starting from `s` that leads to the regular expression being rejected.
   * This predicate might find impossible suffixes when searching for suffixes of length > 1, which can cause FPs.
   */
  pragma[nomagic]
  private predicate isLikelyRejectable(StateInPumpableRegexp s) {
    // exists a reject edge with some char.
    hasRejectEdge(s)
    or
    hasEdgeToLikelyRejectable(s)
    or
    // stopping here is rejection
    isRejectState(s)
  }
  /**
   * Holds if `s` is not an accept state, and there is no epsilon transition to an accept state.
   */
  predicate isRejectState(StateInPumpableRegexp s) { not epsilonSucc*(s) = Accept(_) }
  /**
   * Holds if there is likely a non-empty suffix leading to rejection starting in `s`.
   */
  predicate hasEdgeToLikelyRejectable(StateInPumpableRegexp s) {
    // all edges (at least one) with some char leads to another state that is rejectable.
    // the `next` states might not share a common suffix, which can cause FPs.
    exists(string char | char = relevant() |
      forex(State next | deltaClosedChar(s, char, next) | isLikelyRejectable(next))
    )
  }
  /**
   * Holds if there is a state `next` that can be reached from `prev`
   * along epsilon edges, such that there is a transition from
   * `prev` to `next` that the character symbol `char`.
   */
  predicate deltaClosedChar(StateInPumpableRegexp prev, string char, StateInPumpableRegexp next) {
    char = relevant() and
    deltaClosed(prev, getAnInputSymbolMatching(char), next)
  }
  /**
   * Gets a char used for finding possible suffixes.
   */
  private string relevant() { result = CharacterClasses::getARelevantChar() }
  /**
   * Holds if there is no edge from `s` labeled `char` in our NFA.
   * The NFA does not model reject states, so the above is the same as saying there is a reject edge.
   */
  private predicate hasRejectEdge(State s) {
    exists(string char | char = relevant() | not deltaClosedChar(s, char, _))
  }
  /**
   * Gets a state that can be reached from pumpable `fork` consuming all
   * chars in `w` any number of times followed by the first `i+1` characters of `w`.
   */
  private State process(State fork, string w, int i) {
    isReDoSCandidate(fork, w) and
    exists(State prev |
      i = 0 and prev = fork
      or
      prev = process(fork, w, i - 1)
      or
      // repeat until fixpoint
      i = 0 and
      prev = process(fork, w, w.length() - 1)
    |
      deltaClosed(prev, getAnInputSymbolMatching(w.charAt(i)), result)
    )
  }
 }
 /**
 * Holds if `term` may cause exponential backtracking on strings containing many repetitions of `pump`.
 * Gets the minimum possible string that causes exponential backtracking.
 */
 predicate isReDoSAttackable(RegExpTerm term, string pump, State s) {
  exists(int i, string c | s = Match(term, i) |
    c =
      min(string w |
        isReDoSCandidate(s, w) and
        SuffixConstruction::reachesOnlyRejectableSuffixes(s, w)
      |
        w order by w.length(), w
      ) and
    pump = escape(rotate(c, i))
  )
 }
 /**
 * Holds if repeating `pump' starting at `state` is a candidate for causing exponential backtracking.
 * No check whether a rejected suffix exists has been made.
 */
 predicate isReDoSCandidate(State state, string pump) {
  isPumpable(state, pump) and
  (
    not isPumpable(epsilonSucc+(state), _)
    or
    epsilonSucc+(state) = state and
    state =
      max(State s, Location l |
        s = epsilonSucc+(state) and
        l = s.getRepr().getLocation() and
        isPumpable(s, _) and
        s.getRepr() instanceof InfiniteRepetitionQuantifier
      |
        s order by l.getStartLine(), l.getStartColumn(), l.getEndColumn(), l.getEndLine()
      )
  )
 }
 /**
 * Gets the result of backslash-escaping newlines, carriage-returns and
 * backslashes in `s`.
 */
 bindingset[s]
 string escape(string s) {
  result =
    s.replaceAll("\\", "\\\\")
        .replaceAll("\n", "\\n")
        .replaceAll("\r", "\\r")
        .replaceAll("\t", "\\t")
 }
 /**
 * Gets `str` with the last `i` characters moved to the front.
 *
 * We use this to adjust the pump string to match with the beginning of
 * a RegExpTerm, so it doesn't start in the middle of a constant.
 */
 bindingset[str, i]
 string rotate(string str, int i) {
  result = str.suffix(str.length() - i) + str.prefix(str.length() - i)
 }
 from RegExpTerm t, string pump, State s, string prefixMsg
-where
+where hasReDoSResult(t, pump, s, prefixMsg)
  isReDoSAttackable(t, pump, s) and
  (
    prefixMsg = "starting with '" + escape(PrefixConstruction::prefix(s)) + "' and " and
    not PrefixConstruction::prefix(s) = ""
    or
    PrefixConstruction::prefix(s) = "" and prefixMsg = ""
    or
    not exists(PrefixConstruction::prefix(s)) and prefixMsg = ""
  )
 select t,
  "This part of the regular expression may cause exponential backtracking on strings " + prefixMsg +
    "containing many repetitions of '" + pump + "'."
--- a/javascript/ql/src/semmle/javascript/security/performance/ReDoSUtil.qll
+++ b/javascript/ql/src/semmle/javascript/security/performance/ReDoSUtil.qll
@@ -0,0 +1,930 @@
 /**
 * Provides classes for working with regular expressions that can
 * perform backtracking in superlinear/exponential time.
 *
 * This module contains a number of utility predicates for compiling a regular expression into a NFA and reasoning about this NFA.
 *
 * The `ReDoSConfiguration` contains a `isReDoSCandidate` predicate that is used to
 * to determine which states the prefix/suffix search should happen on.
 * There is only meant to exist one `ReDoSConfiguration` at a time.
 *
 * The predicate `hasReDoSResult` outputs a de-duplicated set of
 * states that will cause backtracking (a rejecting suffix exists).
 */
 import javascript
 /**
 * A configuration for which parts of a regular expression should be considered relevant for
 * the different predicates in `ReDoS.qll`.
 * Used to adjust the computations for either superliniear or exponential backtracking.
 */
 abstract class ReDoSConfiguration extends string {
  bindingset[this]
  ReDoSConfiguration() { any() }
  /**
   * Holds if `state` with the pump string `pump` is a candidate for a
   * ReDoS vulnerable state.
   * This is used to determine which states are considered for the prefix/suffix construction.
   */
  abstract predicate isReDoSCandidate(State state, string pump);
 }
 /**
 * Holds if repeating `pump' starting at `state` is a candidate for causing backtracking.
 * No check whether a rejected suffix exists has been made.
 */
 private predicate isReDoSCandidate(State state, string pump) {
  any(ReDoSConfiguration conf).isReDoSCandidate(state, pump) and
  (
    not any(ReDoSConfiguration conf).isReDoSCandidate(epsilonSucc+(state), _)
    or
    epsilonSucc+(state) = state and
    state =
      max(State s, Location l |
        s = epsilonSucc+(state) and
        l = s.getRepr().getLocation() and
        any(ReDoSConfiguration conf).isReDoSCandidate(s, _) and
        s.getRepr() instanceof InfiniteRepetitionQuantifier
      |
        s order by l.getStartLine(), l.getStartColumn(), l.getEndColumn(), l.getEndLine()
      )
  )
 }
 /**
 * A regular expression term that permits unlimited repetitions.
 */
 class InfiniteRepetitionQuantifier extends RegExpQuantifier {
  InfiniteRepetitionQuantifier() {
    this instanceof RegExpPlus
    or
    this instanceof RegExpStar
    or
    this instanceof RegExpRange and not exists(this.(RegExpRange).getUpperBound())
  }
 }
 /**
 * Gets the char after `c` (from a simplified ASCII table).
 */
 private string nextChar(string c) { exists(int code | code = ascii(c) | code + 1 = ascii(result)) }
 /**
 * Gets an approximation for the ASCII code for `char`.
 * Only the easily printable chars are included (so no newline, tab, null, etc).
 */
 private int ascii(string char) {
  char =
    rank[result](string c |
      c =
        "! \"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"
            .charAt(_)
    )
 }
 /**
 * A branch in a disjunction that is the root node in a literal, or a literal
 * whose root node is not a disjunction.
 */
 class RegExpRoot extends RegExpTerm {
  RegExpParent parent;
  RegExpRoot() {
    exists(RegExpAlt alt |
      alt.isRootTerm() and
      this = alt.getAChild() and
      parent = alt.getParent()
    )
    or
    this.isRootTerm() and
    not this instanceof RegExpAlt and
    parent = this.getParent()
  }
  /**
   * Holds if this root term is relevant to the ReDoS analysis.
   */
  predicate isRelevant() {
    // there is at least one repetition
    getRoot(any(InfiniteRepetitionQuantifier q)) = this and
    // there are no lookbehinds
    not exists(RegExpLookbehind lbh | getRoot(lbh) = this) and
    // is actually used as a RegExp
    isUsedAsRegExp()
  }
 }
 /**
 * A constant in a regular expression that represents valid Unicode character(s).
 */
 private class RegexpCharacterConstant extends RegExpConstant {
  RegexpCharacterConstant() { this.isCharacter() }
 }
 /**
 * An abstract input symbol, representing a set of concrete characters.
 */
 private newtype TInputSymbol =
  /** An input symbol corresponding to character `c`. */
  Char(string c) {
    c = any(RegexpCharacterConstant cc | getRoot(cc).isRelevant()).getValue().charAt(_)
  } or
  /**
   * An input symbol representing all characters matched by
   * (non-universal) character class `recc`.
   */
  CharClass(RegExpTerm recc) {
    getRoot(recc).isRelevant() and
    (
      recc instanceof RegExpCharacterClass and
      not recc.(RegExpCharacterClass).isUniversalClass()
      or
      recc instanceof RegExpCharacterClassEscape
    )
  } or
  /** An input symbol representing all characters matched by `.`. */
  Dot() or
  /** An input symbol representing all characters. */
  Any() or
  /** An epsilon transition in the automaton. */
  Epsilon()
 /**
 * Holds if `a` and `b` are input symbols from the same regexp.
 * (And not a `Dot()`, `Any()` or `Epsilon()`)
 */
 private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
  exists(RegExpRoot root |
    belongsTo(a, root) and
    belongsTo(b, root)
  )
 }
 /**
 * Holds if the `a` is an input symbol from a regexp that has root `root`.
 */
 private predicate belongsTo(TInputSymbol a, RegExpRoot root) {
  exists(RegExpTerm term | getRoot(term) = root |
    a = Char(term.(RegexpCharacterConstant).getValue().charAt(_))
    or
    a = CharClass(term)
  )
 }
 /**
 * An abstract input symbol, representing a set of concrete characters.
 */
 class InputSymbol extends TInputSymbol {
  InputSymbol() { not this instanceof Epsilon }
  string toString() {
    this = Char(result)
    or
    result = any(RegExpTerm recc | this = CharClass(recc)).toString()
    or
    this = Dot() and result = "."
    or
    this = Any() and result = "[^]"
  }
 }
 /**
 * An abstract input symbol that represents a character class.
 */
 abstract private class CharacterClass extends InputSymbol {
  /**
   * Gets a character that is relevant for intersection-tests involving this
   * character class.
   *
   * Specifically, this is any of the characters mentioned explicitly in the
   * character class, offset by one if it is inverted. For character class escapes,
   * the result is as if the class had been written out as a series of intervals.
   *
   * This set is large enough to ensure that for any two intersecting character
   * classes, one contains a relevant character from the other.
   */
  abstract string getARelevantChar();
  /**
   * Holds if this character class matches `char`.
   */
  bindingset[char]
  abstract predicate matches(string char);
  /**
   * Gets a character matched by this character class.
   */
  string choose() { result = getARelevantChar() and matches(result) }
 }
 /**
 * Provides implementations for `CharacterClass`.
 */
 private module CharacterClasses {
  /**
   * Holds if the character class `cc` has a child (constant or range) that matches `char`.
   */
  pragma[noinline]
  predicate hasChildThatMatches(RegExpCharacterClass cc, string char) {
    exists(CharClass(cc)) and
    exists(RegExpTerm child | child = cc.getAChild() |
      char = child.(RegexpCharacterConstant).getValue()
      or
      rangeMatchesOnLetterOrDigits(child, char)
      or
      not rangeMatchesOnLetterOrDigits(child, _) and
      char = getARelevantChar() and
      exists(string lo, string hi | child.(RegExpCharacterRange).isRange(lo, hi) |
        lo <= char and
        char <= hi
      )
      or
      exists(RegExpCharacterClassEscape escape | escape = child |
        escape.getValue() = escape.getValue().toLowerCase() and
        classEscapeMatches(escape.getValue(), char)
        or
        char = getARelevantChar() and
        escape.getValue() = escape.getValue().toUpperCase() and
        not classEscapeMatches(escape.getValue().toLowerCase(), char)
      )
    )
  }
  /**
   * Holds if `range` is a range on lower-case, upper-case, or digits, and matches `char`.
   * This predicate is used to restrict the searchspace for ranges by only joining `getAnyPossiblyMatchedChar`
   * on a few ranges.
   */
  private predicate rangeMatchesOnLetterOrDigits(RegExpCharacterRange range, string char) {
    exists(string lo, string hi |
      range.isRange(lo, hi) and lo = lowercaseLetter() and hi = lowercaseLetter()
    |
      lo <= char and
      char <= hi and
      char = lowercaseLetter()
    )
    or
    exists(string lo, string hi |
      range.isRange(lo, hi) and lo = upperCaseLetter() and hi = upperCaseLetter()
    |
      lo <= char and
      char <= hi and
      char = upperCaseLetter()
    )
    or
    exists(string lo, string hi | range.isRange(lo, hi) and lo = digit() and hi = digit() |
      lo <= char and
      char <= hi and
      char = digit()
    )
  }
  private string lowercaseLetter() { result = "abdcefghijklmnopqrstuvwxyz".charAt(_) }
  private string upperCaseLetter() { result = "ABCDEFGHIJKLMNOPQRSTUVWXYZ".charAt(_) }
  private string digit() { result = [0 .. 9].toString() }
  /**
   * Gets a char that could be matched by a regular expression.
   * Includes all printable ascii chars, all constants mentioned in a regexp, and all chars matches by the regexp `/\s|\d|\w/`.
   */
  string getARelevantChar() {
    exists(ascii(result))
    or
    exists(RegexpCharacterConstant c | result = c.getValue().charAt(_))
    or
    classEscapeMatches(_, result)
  }
  /**
   * Gets a char that is mentioned in the character class `c`.
   */
  private string getAMentionedChar(RegExpCharacterClass c) {
    exists(RegExpTerm child | child = c.getAChild() |
      result = child.(RegexpCharacterConstant).getValue()
      or
      child.(RegExpCharacterRange).isRange(result, _)
      or
      child.(RegExpCharacterRange).isRange(_, result)
      or
      exists(RegExpCharacterClassEscape escape | child = escape |
        result = min(string s | classEscapeMatches(escape.getValue().toLowerCase(), s))
        or
        result = max(string s | classEscapeMatches(escape.getValue().toLowerCase(), s))
      )
    )
  }
  /**
   * An implementation of `CharacterClass` for positive (non inverted) character classes.
   */
  private class PositiveCharacterClass extends CharacterClass {
    RegExpCharacterClass cc;
    PositiveCharacterClass() { this = CharClass(cc) and not cc.isInverted() }
    override string getARelevantChar() { result = getAMentionedChar(cc) }
    override predicate matches(string char) { hasChildThatMatches(cc, char) }
  }
  /**
   * An implementation of `CharacterClass` for inverted character classes.
   */
  private class InvertedCharacterClass extends CharacterClass {
    RegExpCharacterClass cc;
    InvertedCharacterClass() { this = CharClass(cc) and cc.isInverted() }
    override string getARelevantChar() {
      result = nextChar(getAMentionedChar(cc)) or
      nextChar(result) = getAMentionedChar(cc)
    }
    bindingset[char]
    override predicate matches(string char) { not hasChildThatMatches(cc, char) }
  }
  /**
   * Holds if the character class escape `clazz` (\d, \s, or \w) matches `char`.
   */
  pragma[noinline]
  private predicate classEscapeMatches(string clazz, string char) {
    clazz = "d" and
    char = "0123456789".charAt(_)
    or
    clazz = "s" and
    (
      char = [" ", "\t", "\r", "\n"]
      or
      char = getARelevantChar() and
      char.regexpMatch("\\u000b|\\u000c") // \v|\f (vertical tab | form feed)
    )
    or
    clazz = "w" and
    char = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_".charAt(_)
  }
  /**
   * An implementation of `CharacterClass` for \d, \s, and \w.
   */
  private class PositiveCharacterClassEscape extends CharacterClass {
    RegExpCharacterClassEscape cc;
    PositiveCharacterClassEscape() { this = CharClass(cc) and cc.getValue() = ["d", "s", "w"] }
    override string getARelevantChar() {
      cc.getValue() = "d" and
      result = ["0", "9"]
      or
      cc.getValue() = "s" and
      result = [" "]
      or
      cc.getValue() = "w" and
      result = ["a", "Z", "_", "0", "9"]
    }
    override predicate matches(string char) { classEscapeMatches(cc.getValue(), char) }
    override string choose() {
      cc.getValue() = "d" and
      result = "9"
      or
      cc.getValue() = "s" and
      result = [" "]
      or
      cc.getValue() = "w" and
      result = "a"
    }
  }
  /**
   * An implementation of `CharacterClass` for \D, \S, and \W.
   */
  private class NegativeCharacterClassEscape extends CharacterClass {
    RegExpCharacterClassEscape cc;
    NegativeCharacterClassEscape() { this = CharClass(cc) and cc.getValue() = ["D", "S", "W"] }
    override string getARelevantChar() {
      cc.getValue() = "D" and
      result = ["a", "Z", "!"]
      or
      cc.getValue() = "S" and
      result = ["a", "9", "!"]
      or
      cc.getValue() = "W" and
      result = [" ", "!"]
    }
    bindingset[char]
    override predicate matches(string char) {
      not classEscapeMatches(cc.getValue().toLowerCase(), char)
    }
  }
 }
 private class EdgeLabel extends TInputSymbol {
  string toString() {
    this = Epsilon() and result = ""
    or
    exists(InputSymbol s | this = s and result = s.toString())
  }
 }
 /**
 * Gets the state before matching `t`.
 */
 pragma[inline]
 private State before(RegExpTerm t) { result = Match(t, 0) }
 /**
 * Gets a state the NFA may be in after matching `t`.
 */
 private State after(RegExpTerm t) {
  exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt))
  or
  exists(RegExpSequence seq, int i | t = seq.getChild(i) |
    result = before(seq.getChild(i + 1))
    or
    i + 1 = seq.getNumChild() and result = after(seq)
  )
  or
  exists(RegExpGroup grp | t = grp.getAChild() | result = after(grp))
  or
  exists(RegExpStar star | t = star.getAChild() | result = before(star))
  or
  exists(RegExpPlus plus | t = plus.getAChild() |
    result = before(plus) or
    result = after(plus)
  )
  or
  exists(RegExpOpt opt | t = opt.getAChild() | result = after(opt))
  or
  exists(RegExpRoot root | t = root | result = AcceptAnySuffix(root))
 }
 /**
 * Holds if the NFA has a transition from `q1` to `q2` labelled with `lbl`.
 */
 predicate delta(State q1, EdgeLabel lbl, State q2) {
  exists(RegexpCharacterConstant s, int i |
    q1 = Match(s, i) and
    lbl = Char(s.getValue().charAt(i)) and
    (
      q2 = Match(s, i + 1)
      or
      s.getValue().length() = i + 1 and
      q2 = after(s)
    )
  )
  or
  exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) |
    if dot.getLiteral().isDotAll() then lbl = Any() else lbl = Dot()
  )
  or
  exists(RegExpCharacterClass cc |
    cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
    or
    q1 = before(cc) and
    lbl = CharClass(cc) and
    q2 = after(cc)
  )
  or
  exists(RegExpCharacterClassEscape cc |
    q1 = before(cc) and
    lbl = CharClass(cc) and
    q2 = after(cc)
  )
  or
  exists(RegExpAlt alt | lbl = Epsilon() | q1 = before(alt) and q2 = before(alt.getAChild()))
  or
  exists(RegExpSequence seq | lbl = Epsilon() | q1 = before(seq) and q2 = before(seq.getChild(0)))
  or
  exists(RegExpGroup grp | lbl = Epsilon() | q1 = before(grp) and q2 = before(grp.getChild(0)))
  or
  exists(RegExpStar star | lbl = Epsilon() |
    q1 = before(star) and q2 = before(star.getChild(0))
    or
    q1 = before(star) and q2 = after(star)
  )
  or
  exists(RegExpPlus plus | lbl = Epsilon() | q1 = before(plus) and q2 = before(plus.getChild(0)))
  or
  exists(RegExpOpt opt | lbl = Epsilon() |
    q1 = before(opt) and q2 = before(opt.getChild(0))
    or
    q1 = before(opt) and q2 = after(opt)
  )
  or
  exists(RegExpRoot root | q1 = AcceptAnySuffix(root) |
    lbl = Any() and q2 = q1
    or
    lbl = Epsilon() and q2 = Accept(root)
  )
  or
  exists(RegExpDollar dollar | q1 = before(dollar) |
    lbl = Epsilon() and q2 = Accept(getRoot(dollar))
  )
 }
 /**
 * Gets a state that `q` has an epsilon transition to.
 */
 State epsilonSucc(State q) { delta(q, Epsilon(), result) }
 /**
 * Gets a state that has an epsilon transition to `q`.
 */
 State epsilonPred(State q) { q = epsilonSucc(result) }
 /**
 * Holds if there is a state `q` that can be reached from `q1`
 * along epsilon edges, such that there is a transition from
 * `q` to `q2` that consumes symbol `s`.
 */
 predicate deltaClosed(State q1, InputSymbol s, State q2) { delta(epsilonSucc*(q1), s, q2) }
 /**
 * Gets the root containing the given term, that is, the root of the literal,
 * or a branch of the root disjunction.
 */
 RegExpRoot getRoot(RegExpTerm term) {
  result = term or
  result = getRoot(term.getParent())
 }
 newtype TState =
  Match(RegExpTerm t, int i) {
    getRoot(t).isRelevant() and
    (
      i = 0
      or
      exists(t.(RegexpCharacterConstant).getValue().charAt(i))
    )
  } or
  Accept(RegExpRoot l) { l.isRelevant() } or
  AcceptAnySuffix(RegExpRoot l) { l.isRelevant() }
 /**
 * A state in the NFA corresponding to a regular expression.
 *
 * Each regular expression literal `l` has one accepting state
 * `Accept(l)`, one state that accepts all suffixes `AcceptAnySuffix(l)`,
 * and a state `Match(t, i)` for every subterm `t`,
 * which represents the state of the NFA before starting to
 * match `t`, or the `i`th character in `t` if `t` is a constant.
 */
 class State extends TState {
  RegExpTerm repr;
  State() {
    this = Match(repr, _) or
    this = Accept(repr) or
    this = AcceptAnySuffix(repr)
  }
  string toString() {
    exists(int i | this = Match(repr, i) | result = "Match(" + repr + "," + i + ")")
    or
    this instanceof Accept and
    result = "Accept(" + repr + ")"
    or
    this instanceof AcceptAnySuffix and
    result = "AcceptAny(" + repr + ")"
  }
  Location getLocation() { result = repr.getLocation() }
  /**
   * Gets the term represented by this state.
   */
  RegExpTerm getRepr() { result = repr }
 }
 /**
 * Gets the minimum char that is matched by both the character classes `c` and `d`.
 */
 private string getMinOverlapBetweenCharacterClasses(CharacterClass c, CharacterClass d) {
  result = min(getAOverlapBetweenCharacterClasses(c, d))
 }
 /**
 * Gets a char that is matched by both the character classes `c` and `d`.
 * And `c` and `d` is not the same character class.
 */
 private string getAOverlapBetweenCharacterClasses(CharacterClass c, CharacterClass d) {
  sharesRoot(c, d) and
  result = [c.getARelevantChar(), d.getARelevantChar()] and
  c.matches(result) and
  d.matches(result) and
  not c = d
 }
 /**
 * Gets a character that is represented by both `c` and `d`.
 */
 string intersect(InputSymbol c, InputSymbol d) {
  c = Char(result) and
  d = getAnInputSymbolMatching(result) and
  (
    sharesRoot(c, d)
    or
    d = Dot()
    or
    d = Any()
  )
  or
  result = getMinOverlapBetweenCharacterClasses(c, d)
  or
  result = c.(CharacterClass).choose() and
  (
    d = c
    or
    d = Dot() and
    not (result = "\n" or result = "\r")
    or
    d = Any()
  )
  or
  c = Dot() and
  (
    d = Dot() and result = "a"
    or
    d = Any() and result = "a"
  )
  or
  c = Any() and d = Any() and result = "a"
  or
  result = intersect(d, c)
 }
 /**
 * Gets a symbol that matches `char`.
 */
 bindingset[char]
 InputSymbol getAnInputSymbolMatching(string char) {
  result = Char(char)
  or
  result.(CharacterClass).matches(char)
  or
  result = Dot() and
  not (char = "\n" or char = "\r")
  or
  result = Any()
 }
 /**
 * Predicates for constructing a prefix string that leads to a given state.
 */
 private module PrefixConstruction {
  /**
   * Holds if `state` starts the string matched by the regular expression.
   */
  private predicate isStartState(State state) {
    state instanceof StateInPumpableRegexp and
    (
      state = Match(any(RegExpRoot r), _)
      or
      exists(RegExpCaret car | state = after(car))
    )
  }
  /**
   * Holds if `state` is the textually last start state for the regular expression.
   */
  private predicate lastStartState(State state) {
    exists(RegExpRoot root |
      state =
        max(State s, Location l |
          isStartState(s) and getRoot(s.getRepr()) = root and l = s.getRepr().getLocation()
        |
          s order by l.getStartLine(), l.getStartColumn()
        )
    )
  }
  /**
   * Holds if there exists any transition (Epsilon() or other) from `a` to `b`.
   */
  private predicate existsTransition(State a, State b) { delta(a, _, b) }
  /**
   * Gets the minimum number of transitions it takes to reach `state` from the `start` state.
   */
  int prefixLength(State start, State state) =
    shortestDistances(lastStartState/1, existsTransition/2)(start, state, result)
  /**
   * Gets the minimum number of transitions it takes to reach `state` from the start state.
   */
  private int lengthFromStart(State state) { result = prefixLength(_, state) }
  /**
   * Gets a string for which the regular expression will reach `state`.
   *
   * Has at most one result for any given `state`.
   * This predicate will not always have a result even if there is a ReDoS issue in
   * the regular expression.
   */
  string prefix(State state) {
    lastStartState(state) and
    result = ""
    or
    // the search stops past the last redos candidate state.
    lengthFromStart(state) <= max(lengthFromStart(any(State s | isReDoSCandidate(s, _)))) and
    exists(State prev |
      // select a unique predecessor (by an arbitrary measure)
      prev =
        min(State s, Location loc |
          lengthFromStart(s) = lengthFromStart(state) - 1 and
          loc = s.getRepr().getLocation() and
          delta(s, _, state)
        |
          s order by loc.getStartLine(), loc.getStartColumn(), loc.getEndLine(), loc.getEndColumn()
        )
    |
      // greedy search for the shortest prefix
      result = prefix(prev) and delta(prev, Epsilon(), state)
      or
      not delta(prev, Epsilon(), state) and
      result =
        prefix(prev) +
          min(string c | delta(prev, any(InputSymbol symbol | c = intersect(Any(), symbol)), state))
    )
  }
  /**
   * A state within a regular expression that has a pumpable state.
   */
  class StateInPumpableRegexp extends State {
    pragma[noinline]
    StateInPumpableRegexp() {
      exists(State s | isReDoSCandidate(s, _) | getRoot(s.getRepr()) = getRoot(this.getRepr()))
    }
  }
 }
 /**
 * Predicates for testing the presence of a rejecting suffix.
 *
 * These predicates are used to ensure that the all states reached from the fork
 * by repeating `w` have a rejecting suffix.
 *
 * For example, a regexp like `/^(a+)+/` will accept any string as long the prefix is
 * some number of `"a"`s, and it is therefore not possible to construct a rejecting suffix.
 *
 * A regexp like `/(a+)+$/` or `/(a+)+b/` trivially has a rejecting suffix,
 * as the suffix "X" will cause both the regular expressions to be rejected.
 *
 * The string `w` is repeated any number of times because it needs to be
 * infinitely repeatedable for the attack to work.
 * For the regular expression `/((ab)+)*abab/` the accepting state is not reachable from the fork
 * using epsilon transitions. But any attempt at repeating `w` will end in a state that accepts all suffixes.
 */
 private module SuffixConstruction {
  import PrefixConstruction
  /**
   * Holds if all states reachable from `fork` by repeating `w`
   * are likely rejectable by appending some suffix.
   */
  predicate reachesOnlyRejectableSuffixes(State fork, string w) {
    isReDoSCandidate(fork, w) and
    forex(State next | next = process(fork, w, w.length() - 1) | isLikelyRejectable(next))
  }
  /**
   * Holds if there likely exists a suffix starting from `s` that leads to the regular expression being rejected.
   * This predicate might find impossible suffixes when searching for suffixes of length > 1, which can cause FPs.
   */
  pragma[nomagic]
  private predicate isLikelyRejectable(StateInPumpableRegexp s) {
    // exists a reject edge with some char.
    hasRejectEdge(s)
    or
    hasEdgeToLikelyRejectable(s)
    or
    // stopping here is rejection
    isRejectState(s)
  }
  /**
   * Holds if `s` is not an accept state, and there is no epsilon transition to an accept state.
   */
  predicate isRejectState(StateInPumpableRegexp s) { not epsilonSucc*(s) = Accept(_) }
  /**
   * Holds if there is likely a non-empty suffix leading to rejection starting in `s`.
   */
  predicate hasEdgeToLikelyRejectable(StateInPumpableRegexp s) {
    // all edges (at least one) with some char leads to another state that is rejectable.
    // the `next` states might not share a common suffix, which can cause FPs.
    exists(string char | char = relevant() |
      forex(State next | deltaClosedChar(s, char, next) | isLikelyRejectable(next))
    )
  }
  /**
   * Holds if there is a state `next` that can be reached from `prev`
   * along epsilon edges, such that there is a transition from
   * `prev` to `next` that the character symbol `char`.
   */
  predicate deltaClosedChar(StateInPumpableRegexp prev, string char, StateInPumpableRegexp next) {
    char = relevant() and
    deltaClosed(prev, getAnInputSymbolMatching(char), next)
  }
  /**
   * Gets a char used for finding possible suffixes.
   */
  private string relevant() { result = CharacterClasses::getARelevantChar() }
  /**
   * Holds if there is no edge from `s` labeled `char` in our NFA.
   * The NFA does not model reject states, so the above is the same as saying there is a reject edge.
   */
  private predicate hasRejectEdge(State s) {
    exists(string char | char = relevant() | not deltaClosedChar(s, char, _))
  }
  /**
   * Gets a state that can be reached from pumpable `fork` consuming all
   * chars in `w` any number of times followed by the first `i+1` characters of `w`.
   */
  private State process(State fork, string w, int i) {
    isReDoSCandidate(fork, w) and
    exists(State prev |
      i = 0 and prev = fork
      or
      prev = process(fork, w, i - 1)
      or
      // repeat until fixpoint
      i = 0 and
      prev = process(fork, w, w.length() - 1)
    |
      deltaClosed(prev, getAnInputSymbolMatching(w.charAt(i)), result)
    )
  }
 }
 /**
 * Gets the result of backslash-escaping newlines, carriage-returns and
 * backslashes in `s`.
 */
 bindingset[s]
 private string escape(string s) {
  result =
    s.replaceAll("\\", "\\\\")
        .replaceAll("\n", "\\n")
        .replaceAll("\r", "\\r")
        .replaceAll("\t", "\\t")
 }
 /**
 * Gets `str` with the last `i` characters moved to the front.
 *
 * We use this to adjust the pump string to match with the beginning of
 * a RegExpTerm, so it doesn't start in the middle of a constant.
 */
 bindingset[str, i]
 private string rotate(string str, int i) {
  result = str.suffix(str.length() - i) + str.prefix(str.length() - i)
 }
 /**
 * Holds if `term` may cause superliniear backtracking on strings containing many repetitions of `pump`.
 * Gets the minimum possible string that causes superliniear backtracking.
 */
 private predicate isReDoSAttackable(RegExpTerm term, string pump, State s) {
  exists(int i, string c | s = Match(term, i) |
    c =
      min(string w |
        any(ReDoSConfiguration conf).isReDoSCandidate(s, w) and
        SuffixConstruction::reachesOnlyRejectableSuffixes(s, w)
      |
        w order by w.length(), w
      ) and
    pump = escape(rotate(c, i))
  )
 }
 /**
 * Holds if the state `s` (represented by the term `t`) can have backtracking with repetitions of `pump`.
 *
 * `prefixMsg` contains a friendly message for a prefix that reaches `s` (or `prefixMsg` is the empty string if the prefix is empty or if no prefix could be found).
 */
 predicate hasReDoSResult(RegExpTerm t, string pump, State s, string prefixMsg) {
  isReDoSAttackable(t, pump, s) and
  (
    prefixMsg = "starting with '" + escape(PrefixConstruction::prefix(s)) + "' and " and
    not PrefixConstruction::prefix(s) = ""
    or
    PrefixConstruction::prefix(s) = "" and prefixMsg = ""
    or
    not exists(PrefixConstruction::prefix(s)) and prefixMsg = ""
  )
 }
--- a/javascript/ql/src/semmle/javascript/security/performance/SuperlinearBackTracking.qll
+++ b/javascript/ql/src/semmle/javascript/security/performance/SuperlinearBackTracking.qll
@@ -4,19 +4,7 @@
 */
 import javascript
-
+import ReDoSUtil
 /**
 * A regular expression term that permits unlimited repetitions.
 */
 class InfiniteRepetitionQuantifier extends RegExpQuantifier {
  InfiniteRepetitionQuantifier() {
    this instanceof RegExpPlus
    or
    this instanceof RegExpStar
    or
    this instanceof RegExpRange and not exists(this.(RegExpRange).getUpperBound())
  }
 }
 /**
 * Holds if `t` matches at least an epsilon symbol.