/**
 * @name Inefficient regular expression
 * @description A regular expression that requires exponential time to match certain inputs
 *              can be a performance bottleneck, and may be vulnerable to denial-of-service
 *              attacks.
 * @kind problem
 * @problem.severity error
 * @precision high
 * @id js/redos
 * @tags security
 *       external/cwe/cwe-730
 *       external/cwe/cwe-400
 */

import javascript

/*
 * This query implements the analysis described in the following two papers:
 *
 *   James Kirrage, Asiri Rathnayake, Hayo Thielecke: Static Analysis for
 *     Regular Expression Denial-of-Service Attacks. NSS 2013.
 *     (http://www.cs.bham.ac.uk/~hxt/research/reg-exp-sec.pdf)
 *   Asiri Rathnayake, Hayo Thielecke: Static Analysis for Regular Expression
 *     Exponential Runtime via Substructural Logics. 2014.
 *     (https://www.cs.bham.ac.uk/~hxt/research/redos_full.pdf)
 *
 * The basic idea is to search for overlapping cycles in the NFA, that is,
 * states `q` such that there are two distinct paths from `q` to itself
 * that consume the same word `w`.
 *
 * For any such state `q`, an attack string can be constructed as follows:
 * concatenate a prefix `v` that takes the NFA to `q` with `n` copies of
 * the word `w` that leads back to `q` along two different paths, followed
 * by a suffix `x` that is _not_ accepted in state `q`. A backtracking
 * implementation will need to explore at least 2^n different ways of going
 * from `q` back to itself while trying to match the `n` copies of `w`
 * before finally giving up.
 *
 * Now in order to identify overlapping cycles, all we have to do is find
 * pumpable forks, that is, states `q` that can transition to two different
 * states `r1` and `r2` on the same input symbol `c`, such that there are
 * paths from both `r1` and `r2` to `q` that consume the same word. The latter
 * condition is equivalent to saying that `(q, q)` is reachable from `(r1, r2)`
 * in the product NFA.
 *
 * This is what the query does. It makes no attempt to construct a prefix
 * leading into `q`, and only a weak one to construct a suffix that ensures
 * rejection; this causes some false positives. Also, the query does not fully
 * handle character classes and does not handle various other features at all;
 * this causes false negatives.
 *
 * Finally, sometimes it depends on the translation whether the NFA generated
 * for a regular expression has a pumpable fork or not. We implement one
 * particular translation, which may result in false positives or negatives
 * relative to some particular JavaScript engine.
 *
 * More precisely, the query constructs an NFA from a regular expression `r`
 * as follows:
 *
 *   * Every sub-term `t` gives rise to an NFA state `Match(t,i)`, representing
 *     the state of the automaton before attempting to match the `i`th character in `t`.
 *   * There is one additional accepting state `Accept(r)`.
 *   * Transitions between states may be labelled with epsilon, or an abstract
 *     input symbol.
 *   * Each abstract input symbol represents a set of concrete input characters:
 *     either a single character, a set of characters represented by a (positive)
 *     character class, or the set of all characters.
 *   * The product automaton is constructed lazily, starting with pair states
 *     `(q, q)` where `q` is a fork, and proceding along an over-approximate
 *     step relation.
 *   * The over-approximate step relation allows transitions along pairs of
 *     abstract input symbols as long as the symbols are not trivially incompatible.
 *   * Once a trace of pairs of abstract input symbols that leads from a fork
 *     back to itself has been identified, we attempt to construct a concrete
 *     string corresponding to it, which may fail.
 *   * Instead of trying to construct a suffix that makes the automaton fail,
 *     we ensure that it isn't possible to reach the accepting state from the
 *     fork along epsilon transitions. In this case, it is very likely (though
 *     not guaranteed) that a rejecting suffix exists.
 */

/**
 * A branch in a disjunction that is the root node in a literal, or a literal
 * whose root node is not a disjunction.
 */
class RegExpRoot extends RegExpTerm {
  RegExpParent parent;

  RegExpRoot() {
    exists(RegExpAlt alt |
      alt.isRootTerm() and
      this = alt.getAChild() and
      parent = alt.getParent()
    )
    or
    this.isRootTerm() and
    not this instanceof RegExpAlt and
    parent = this.getParent()
  }

  /**
   * Holds if this root term is relevant to the ReDoS analysis.
   */
  predicate isRelevant() {
    // there is at least one repetition
    exists(RegExpRepetition rep | getRoot(rep) = this) and
    // there are no lookbehinds
    not exists(RegExpLookbehind lbh | getRoot(lbh) = this) and
    // is actually used as a RegExp
    isUsedAsRegExp()
  }
}

/**
 * A term that matches repetitions of a given pattern, that is, `E*`, `E+`, or `E{n,m}`.
 */
class RegExpRepetition extends RegExpParent {
  RegExpRepetition() {
    this instanceof RegExpStar or
    this instanceof RegExpPlus or
    this instanceof RegExpRange
  }
}

/**
 * Gets the root containing the given term, that is, the root of the literal,
 * or a branch of the root disjunction.
 */
RegExpRoot getRoot(RegExpTerm term) {
  result = term or
  result = getRoot(term.getParent())
}

/**
 * An abstract input symbol, representing a set of concrete characters.
 */
newtype TInputSymbol =
  /** An input symbol corresponding to character `c`. */
  Char(string c) { c = any(RegExpConstant cc).getValue().charAt(_) } or
  /**
   * An input symbol representing all characters matched by
   * (positive, non-universal) character class `recc`.
   */
  CharClass(RegExpCharacterClass recc) {
    getRoot(recc).isRelevant() and
    not recc.isInverted() and
    not recc.isUniversalClass()
  } or
  /** An input symbol representing all characters matched by `.`. */
  Dot() or
  /** An input symbol representing all characters. */
  Any() or
  /** An epsilon transition in the automaton. */
  Epsilon()

/**
 * An abstract input symbol, representing a set of concrete characters.
 */
class InputSymbol extends TInputSymbol {
  InputSymbol() { not this instanceof Epsilon }

  string toString() {
    this = Char(result)
    or
    result = any(RegExpCharacterClass recc | this = CharClass(recc)).toString()
    or
    this = Dot() and result = "."
    or
    this = Any() and result = "[^]"
  }
}

/**
 * Gets a lower bound on the characters matched by the given character class term.
 */
string getCCLowerBound(RegExpTerm t) {
  t.getParent() instanceof RegExpCharacterClass and
  (
    result = t.(RegExpConstant).getValue()
    or
    t.(RegExpCharacterRange).isRange(result, _)
    or
    exists(string name | name = t.(RegExpCharacterClassEscape).getValue() |
      name = "w" and result = "0"
      or
      name = "W" and result = ""
      or
      name = "s" and result = ""
      or
      name = "S" and result = ""
    )
  )
}

/**
 * The highest character used in a regular expression. Used to represent intervals without an upper bound.
 */
string highestCharacter() { result = max(RegExpConstant c | | c.getValue()) }

/**
 * Gets an upper bound on the characters matched by the given character class term.
 */
string getCCUpperBound(RegExpTerm t) {
  t.getParent() instanceof RegExpCharacterClass and
  (
    result = t.(RegExpConstant).getValue()
    or
    t.(RegExpCharacterRange).isRange(_, result)
    or
    exists(string name | name = t.(RegExpCharacterClassEscape).getValue() |
      name = "w" and result = "z"
      or
      name = "W" and result = highestCharacter()
      or
      name = "s" and result = highestCharacter()
      or
      name = "S" and result = highestCharacter()
    )
  )
}

/**
 * Holds if `s` belongs to `l` and is a character class whose set of matched characters is contained
 * in the interval `lo-hi`.
 */
predicate hasBounds(RegExpRoot l, InputSymbol s, string lo, string hi) {
  exists(RegExpCharacterClass cc | s = CharClass(cc) |
    l = getRoot(cc) and
    lo = min(getCCLowerBound(cc.getAChild())) and
    hi = max(getCCUpperBound(cc.getAChild()))
  )
}

/**
 * Holds if `s1` and `s2` possibly have a non-empty intersection.
 *
 * This predicate is over-approximate; it is only used for pruning the search space.
 */
predicate compatible(InputSymbol s1, InputSymbol s2) {
  exists(RegExpRoot l, string lo1, string lo2, string hi1, string hi2 |
    hasBounds(l, s1, lo1, hi1) and
    hasBounds(l, s2, lo2, hi2) and
    max(string s | s = lo1 or s = lo2) <= min(string s | s = hi1 or s = hi2)
  )
  or
  exists(intersect(s1, s2))
}

newtype TState =
  Match(RegExpTerm t, int i) {
    getRoot(t).isRelevant() and
    (
      i = 0
      or
      exists(t.(RegExpConstant).getValue().charAt(i))
    )
  } or
  Accept(RegExpRoot l) { l.isRelevant() }

/**
 * A state in the NFA corresponding to a regular expression.
 *
 * Each regular expression literal `l` has one accepting state
 * `Accept(l)` and a state `Match(t, i)` for every subterm `t`,
 * which represents the state of the NFA before starting to
 * match `t`, or the `i`th character in `t` if `t` is a constant.
 */
class State extends TState {
  RegExpParent repr;

  State() { this = Match(repr, _) or this = Accept(repr) }

  string toString() {
    exists(int i | this = Match(repr, i) | result = "Match(" + repr + "," + i + ")")
    or
    this instanceof Accept and
    result = "Accept(" + repr + ")"
  }

  Location getLocation() { result = repr.getLocation() }
}

class EdgeLabel extends TInputSymbol {
  string toString() {
    this = Epsilon() and result = ""
    or
    exists(InputSymbol s | this = s and result = s.toString())
  }
}

/**
 * Gets the state before matching `t`.
 */
pragma[inline]
State before(RegExpTerm t) {
  result = Match(t, 0)
}

/**
 * Gets a state the NFA may be in after matching `t`.
 */
State after(RegExpTerm t) {
  exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt))
  or
  exists(RegExpSequence seq, int i | t = seq.getChild(i) |
    result = before(seq.getChild(i + 1))
    or
    i + 1 = seq.getNumChild() and result = after(seq)
  )
  or
  exists(RegExpGroup grp | t = grp.getAChild() | result = after(grp))
  or
  exists(RegExpStar star | t = star.getAChild() | result = before(star))
  or
  exists(RegExpPlus plus | t = plus.getAChild() |
    result = before(plus) or
    result = after(plus)
  )
  or
  exists(RegExpOpt opt | t = opt.getAChild() | result = after(opt))
  or
  exists(RegExpRoot root | t = root | result = Accept(root))
}

/**
 * Holds if the NFA has a transition from `q1` to `q2` labelled with `lbl`.
 */
predicate delta(State q1, EdgeLabel lbl, State q2) {
  exists(RegExpConstant s, int i |
    q1 = Match(s, i) and
    lbl = Char(s.getValue().charAt(i)) and
    (
      q2 = Match(s, i + 1)
      or
      s.getValue().length() = i + 1 and
      q2 = after(s)
    )
  )
  or
  exists(RegExpDot dot |
    q1 = before(dot) and q2 = after(dot)
  |
    if dot.getLiteral().isDotAll() then lbl = Any() else lbl = Dot()
  )
  or
  exists(RegExpCharacterClass cc |
    cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
    or
    q1 = before(cc) and lbl = CharClass(cc) and q2 = after(cc)
  )
  or
  exists(RegExpAlt alt | lbl = Epsilon() | q1 = before(alt) and q2 = before(alt.getAChild()))
  or
  exists(RegExpSequence seq | lbl = Epsilon() | q1 = before(seq) and q2 = before(seq.getChild(0)))
  or
  exists(RegExpGroup grp | lbl = Epsilon() | q1 = before(grp) and q2 = before(grp.getChild(0)))
  or
  exists(RegExpStar star | lbl = Epsilon() |
    q1 = before(star) and q2 = before(star.getChild(0))
    or
    q1 = before(star) and q2 = after(star)
  )
  or
  exists(RegExpPlus plus | lbl = Epsilon() | q1 = before(plus) and q2 = before(plus.getChild(0)))
  or
  exists(RegExpOpt opt | lbl = Epsilon() |
    q1 = before(opt) and q2 = before(opt.getChild(0))
    or
    q1 = before(opt) and q2 = after(opt)
  )
}

/**
 * Gets a state that `q` has an epsilon transition to.
 */
State epsilonSucc(State q) { delta(q, Epsilon(), result) }

/**
 * Gets a state that has an epsilon transition to `q`.
 */
State epsilonPred(State q) { q = epsilonSucc(result) }

/**
 * Holds if there is a state `q` that can be reached from `q1`
 * along epsilon edges, such that there is a transition from
 * `q` to `q2` that consumes symbol `s`.
 */
predicate deltaClosed(State q1, InputSymbol s, State q2) { delta(epsilonSucc*(q1), s, q2) }

/**
 * A state in the product automaton.
 *
 * We lazily only construct those states that we are actually
 * going to need: `(q, q)` for every fork state `q`, and any
 * pair of states that can be reached from a pair that we have
 * already constructed. To cut down on the number of states,
 * we only represent states `(q1, q2)` where `q1` is lexicographically
 * no bigger than `q2`.
 */
newtype TStatePair =
  MkStatePair(State q1, State q2) {
    isFork(q1, _, _, _, _) and q2 = q1
    or
    step(_, _, _, q1, q2) and q1.toString() <= q2.toString()
  }

class StatePair extends TStatePair {
  State q1;
  State q2;

  StatePair() { this = MkStatePair(q1, q2) }

  string toString() { result = "(" + q1 + ", " + q2 + ")" }
}

/**
 * Gets the state pair `(q1, q2)` or `(q2, q1)`; note that only
 * one or the other is defined.
 */
StatePair mkStatePair(State q1, State q2) {
  result = MkStatePair(q1, q2) or result = MkStatePair(q2, q1)
}

predicate isStatePair(StatePair p) { any() }

predicate delta2(StatePair q, StatePair r) { step(q, _, _, r) }

/**
 * Gets the minimum length of a path from `q` to `r` in the
 * product automaton.
 */
int statePairDist(StatePair q, StatePair r) =
  shortestDistances(isStatePair/1, delta2/2)(q, r, result)

/**
 * Holds if there are transitions from `q` to `r1` and from `q` to `r2`
 * labelled with `s1` and `s2`, respectively, where `s1` and `s2` do not
 * trivially have an empty intersection.
 *
 * This predicate only holds for states associated with regular expressions
 * that have at least one repetition quantifier in them (otherwise the
 * expression cannot be vulnerable to ReDoS attacks anyway).
 */
pragma[noopt]
predicate isFork(State q, InputSymbol s1, InputSymbol s2, State r1, State r2) {
  exists(State q1, State q2 |
    q1 = epsilonSucc*(q) and
    delta(q1, s1, r1) and
    q2 = epsilonSucc*(q) and
    delta(q2, s2, r2) and
    // Use pragma[noopt] to prevent compatible(s1,s2) from being the starting point of the join.
    // From (s1,s2) it would find a huge number of intermediate state pairs (q1,q2) originating from different literals,
    // and discover at the end that no `q` can reach both `q1` and `q2` by epsilon transitions.
    compatible(s1, s2)
  |
    s1 != s2
    or
    r1 != r2
    or
    r1 = r2 and q1 != q2
  )
}

/**
 * Holds if there are transitions from the components of `q` to the corresponding
 * components of `r` labelled with `s1` and `s2`, respectively.
 */
predicate step(StatePair q, InputSymbol s1, InputSymbol s2, StatePair r) {
  exists(State r1, State r2 | step(q, s1, s2, r1, r2) and r = mkStatePair(r1, r2))
}

/**
 * Holds if there are transitions from the components of `q` to `r1` and `r2`
 * labelled with `s1` and `s2`, respectively.
 */
predicate step(StatePair q, InputSymbol s1, InputSymbol s2, State r1, State r2) {
  exists(State q1, State q2 | q = MkStatePair(q1, q2) |
    deltaClosed(q1, s1, r1) and
    deltaClosed(q2, s2, r2) and
    compatible(s1, s2)
  )
}

/**
 * A list of pairs of input symbols that describe a path in the product automaton
 * starting from some fork state.
 */
newtype Trace =
  Nil() or
  Step(InputSymbol s1, InputSymbol s2, Trace t) {
    exists(StatePair p |
      isReachableFromFork(_, p, t, _) and
      step(p, s1, s2, _)
    )
    or
    t = Nil() and isFork(_, s1, s2, _, _)
  }

/**
 * Gets a character that is represented by both `c` and `d`.
 */
string intersect(InputSymbol c, InputSymbol d) {
  c = Char(result) and
  (
    d = Char(result)
    or
    exists(RegExpCharacterClass cc | d = CharClass(cc) |
      exists(RegExpTerm child | child = cc.getAChild() |
        result = child.(RegExpConstant).getValue()
        or
        exists(string lo, string hi | child.(RegExpCharacterRange).isRange(lo, hi) |
          lo <= result and result <= hi
        )
      )
    )
    or
    d = Dot() and
    not (result = "\n" or result = "\r")
    or
    d = Any()
  )
  or
  exists(RegExpCharacterClass cc | c = CharClass(cc) and result = choose(cc) |
    d = CharClass(cc)
    or
    d = Dot() and
    not (result = "\n" or result = "\r")
    or
    d = Any()
  )
  or
  c = Dot() and
  (
    d = Dot() and result = "a"
    or
    d = Any() and result = "a"
  )
  or
  c = Any() and d = Any() and result = "a"
  or
  result = intersect(d, c)
}

/**
 * Gets a character matched by character class `cc`.
 */
string choose(RegExpCharacterClass cc) {
  result = min(string c |
      exists(RegExpTerm child | child = cc.getAChild() |
        c = child.(RegExpConstant).getValue() or
        child.(RegExpCharacterRange).isRange(c, _)
      )
    )
}

/**
 * Gets a string corresponding to the trace `t`.
 */
string concretise(Trace t) {
  t = Nil() and result = ""
  or
  exists(InputSymbol s1, InputSymbol s2, Trace rest | t = Step(s1, s2, rest) |
    result = concretise(rest) + intersect(s1, s2)
  )
}

/**
 * Holds if `r` is reachable from `(fork, fork)` under input `w`, and there is
 * a path from `r` back to `(fork, fork)` with `rem` steps.
 */
predicate isReachableFromFork(State fork, StatePair r, Trace w, int rem) {
  exists(InputSymbol s1, InputSymbol s2, State q1, State q2 |
    isFork(fork, s1, s2, q1, q2) and
    r = MkStatePair(q1, q2) and
    w = Step(s1, s2, Nil()) and
    rem = statePairDist(r, MkStatePair(fork, fork))
  )
  or
  exists(StatePair p, Trace v, InputSymbol s1, InputSymbol s2 |
    isReachableFromFork(fork, p, v, rem + 1) and
    step(p, s1, s2, r) and
    w = Step(s1, s2, v) and
    rem > 0
  )
}

/**
 * Gets a state in the product automaton from which `(fork, fork)` is
 * reachable in zero or more epsilon transitions.
 */
StatePair getAForkPair(State fork) {
  isFork(fork, _, _, _, _) and
  result = MkStatePair(epsilonPred*(fork), epsilonPred*(fork))
}

/**
 * Holds if `fork` is a pumpable fork with word `w`.
 */
predicate isPumpable(State fork, string w) {
  exists(StatePair q, Trace t |
    isReachableFromFork(fork, q, t, _) and
    (
      q = getAForkPair(fork) and w = concretise(t)
      or
      exists(InputSymbol s1, InputSymbol s2 |
        step(q, s1, s2, getAForkPair(fork)) and
        w = concretise(Step(s1, s2, t))
      )
    )
  )
}

/**
 * Gets a state that can be reached from pumpable `fork` consuming
 * the first `i+1` characters of `w`.
 *
 * Character classes are overapproximated as intervals; for example,
 * `[a-ln-z]` is treated the same as `[a-z]`, and hence considered
 * to match `m`, even though in fact it does not. This is fine for
 * our purposes, since we only use this predicate to avoid false
 * positives.
 */
State process(State fork, string w, int i) {
  isPumpable(fork, w) and
  exists(State prev |
    i = 0 and prev = fork
    or
    prev = process(fork, w, i - 1)
  |
    exists(InputSymbol s |
      deltaClosed(prev, s, result) and
      exists(intersect(Char(w.charAt(i)), s))
    )
  )
}

/**
 * Gets the result of backslash-escaping newlines, carriage-returns and
 * backslashes in `s`.
 */
bindingset[s]
string escape(string s) {
  result = s.replaceAll("\\", "\\\\").replaceAll("\n", "\\n").replaceAll("\r", "\\r")
}

/**
 * Gets `str` with the last `i` characters moved to the front.
 *
 * We use this to adjust the witness string to match with the beginning of
 * a RegExpTerm, so it doesn't start in the middle of a constant.
 */
bindingset[str, i]
string rotate(string str, int i) {
  result = str.suffix(str.length() - i) + str.prefix(str.length() - i)
}

from RegExpTerm t, string c, int i
where
  c = min(string w | isPumpable(Match(t, i), w)) and
  not isPumpable(epsilonSucc+(Match(t, i)), _) and
  not epsilonSucc*(process(Match(t, i), c, c.length() - 1)) = Accept(_)
select t,
  "This part of the regular expression may cause exponential backtracking on strings " +
    "containing many repetitions of '" + escape(rotate(c, i)) + "'."