codeql/javascript/ql/lib/semmle/javascript/CFG.qll

/**
 * Provides classes for working with a CFG-based program representation.
 *
 * ## Overview
 *
 * Each `StmtContainer` (that is, function or toplevel) has an intra-procedural
 * CFG associated with it, which is composed of `ControlFlowNode`s under a successor
 * relation exposed by predicates `ControlFlowNode.getASuccessor()` and
 * `ControlFlowNode.getAPredecessor()`.
 *
 * Each CFG has designated entry and exit nodes with types
 * `ControlFlowEntryNode` and `ControlFlowExitNode`, respectively, both of which are
 * subtypes of `SyntheticControlFlowNode` (as is `GuardControlFlowNode`, which
 * represents guard nodes). All `ControlFlowNode`s that are _not_
 * `SyntheticControlFlowNode`s belong to class `ConcreteControlFlowNode`.
 *
 * The predicate `AstNode.getFirstControlFlowNode()` relates AST nodes
 * to the first (concrete) CFG node in the sub-graph of the CFG
 * corresponding to the node.
 *
 * Most statement containers also have a _start node_, obtained by
 * `StmtContainer.getStart()`, which is the unique CFG node at which execution
 * of the toplevel or function begins. Unlike the entry node, which is a synthetic
 * construct, the start node corresponds to an AST node: for instance, for
 * toplevels, it is the first CFG node of the first statement, and for functions
 * with parameters it is the CFG node corresponding to the first parameter.
 *
 * Empty toplevels do not have a start node, since all their CFG nodes are
 * synthetic.
 *
 * ## CFG Nodes
 *
 * Non-synthetic CFG nodes exist for six kinds of AST nodes, representing various
 * aspects of the program's runtime semantics:
 *
 *   - `Expr`: the CFG node represents the evaluation of the expression,
 *      including any side effects this may have;
 *   - `Stmt`: the CFG node represents the execution of the statement;
 *   - `Property`: the CFG node represents the assignment of the property;
 *   - `PropertyPattern`: the CFG node represents the matching of the property;
 *   - `MemberDefinition`: the CFG node represents the definition of the member
 *     method or field;
 *   - `MemberSignature`: the CFG node represents the point where the signature
 *     is declared, although this has no effect at runtime.
 *
 * ## CFG Structure
 *
 * ### Expressions
 *
 * For most expressions, the successor relation visits sub-expressions first,
 * and then the expression itself, representing the order of evaluation at
 * runtime. For example, the CFG for the expression `23 + 19` is
 *
 * <pre>
 * &hellip; &rarr; [23] &rarr; [19] &rarr; [23 + 19] &rarr; &hellip;
 * </pre>
 *
 * In particular, this means that `23` is the first CFG node of the expression
 * `23 + 19`.
 *
 * Similarly, for assignments the left hand side is visited first, then
 * the right hand side, then the assignment itself:
 *
 * <pre>
 * &hellip; &rarr; [x] &rarr; [y] &rarr; [x = y] &rarr; &hellip;
 * </pre>
 *
 * For properties, the name expression is visited first, then the value,
 * then the default value, if any. The same principle applies for getter
 * and setter properties: in this case, the "value" is simply the accessor
 * function, and there is no default value.
 *
 * There are only a few exceptions, generally for cases where the value of
 * the whole expression is the value of one of its sub-expressions. That
 * sub-expression then comes last in the CFG:
 *
 *   - Parenthesized expression:
 * <pre>
 * &hellip; &rarr; [(x)] &rarr; [x] &rarr; &hellip;
 * </pre>
 *   - Conditional expressions:
 * <pre>
 * &hellip; &rarr;  [x ? y : z]  &rarr; [x] &#x252c;&rarr; [y] &rarr; &hellip; <br>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; &#x2514;&rarr; [z] &rarr; &hellip;
 * </pre>
 *   - Short-circuiting operator `&&` (same for `||`):
 * <pre>
 * &hellip; &rarr; [x && y] &rarr; [x] &rarr; &hellip; <br>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; &nbsp; &darr; <br>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; [y] &rarr; &hellip;
 * </pre>
 *   - Sequence/comma expressions:
 * <pre>
 * &hellip; &rarr; [x, y] &rarr; [x] &rarr; [y] &rarr; &hellip;
 * </pre>
 *
 * Finally, array expressions and object expressions also precede their
 * sub-expressions in the CFG to model the fact that the new array/object
 * is created before its elements/properties are evaluated:
 *
 * <pre>
 * &hellip; &rarr; [{ x: 42 }] &rarr; [x] &rarr; [42] &rarr; [x : 42] &rarr; &hellip;
 * </pre>
 *
 * ### Statements
 *
 * For most statements, the successor relation visits the statement first and then
 * its sub-expressions and sub-statements.
 *
 * For example, the CFG of a block statement first visits the block statement itself,
 * then the individual statements.
 *
 * Similarly, the CFG for an `if` statement first visits the statement itself, then
 * the condition. The condition, in turn, has the "then" branch as one of its successors
 * and the "else" branch (if it exists) or the next statement after the "if" (if it does not)
 * as the other:
 *
 * <pre>
 * &hellip; &rarr; [if (x) s1 else s2] &rarr; [x] &#x252c;&rarr; [s1] &rarr; &hellip;
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; &#x2514;&rarr; [s2] &rarr; &hellip;
 * </pre>
 *
 * For loops, the CFG reflects the order in which the loop test and the body are
 * executed.
 *
 * For instance, the CFG of a `while` loop starts with the statement itself, followed by
 * the condition. The condition has two successors: the body, and the statement following
 * the loop. The body, in turn, has the condition as its successor. This reflects the fact
 * that `while` loops first test their condition before executing their body:
 *
 * <pre>
 * &hellip; &rarr; [while (x) s] &rarr; [x] &rarr; &hellip;
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; &#x21c5;
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; [s]
 * </pre>
 *
 * On the other hand, `do`-`while` loops first execute their body before testing their condition:
 *
 * <pre>
 * &hellip; &rarr; [do s while (x)] &rarr; [s] &#x21c4; [x] &rarr; &hellip;
 * </pre>
 *
 * The CFG of a for loop starts with the loop itself, followed by the initializer expression
 * (if any), then the test expression (if any). The test expression has two successors: the
 * body, and the statement following the loop. The body, in turn, has the update expression
 * (if any) as its successor, and the update expression has the test expression as its only
 * successor:
 *
 * <pre>
 * &hellip; &rarr; [for(i;t;u) s] &rarr; [i] &rarr; [t] &rarr; &hellip;
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&#x2199;&nbsp;&#x2196
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[s] &rarr; [u]
 * </pre>
 *
 * The CFG of a for-in loop `for(x in y) s` starts with the loop itself, followed by the
 * iteration domain `y`. That node has two successors: the iterator `x`, and the statement
 * following the loop (modeling early exit in case `y` is empty). After the iterator `x`
 * comes the loop body `s`, which again has two successors: the iterator `x` (modeling the
 * case where there are more elements to iterate over), and the statement following the loop
 * (modeling the case where there are no more elements to iterate):
 *
 * <pre>
 * &hellip; &rarr; [for(x in y) s] &rarr; [y] &rarr; &nbsp;&hellip;
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&darr;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&uarr;
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[x] &#x21c4; [s]
 * </pre>
 *
 * For-of loops are the same.
 *
 * Finally, `return` and `throw` statements are different from all other statement types in
 * that for them the statement itself comes _after_ the operand, reflecting the fact that
 * the operand is evaluated before the return or throw is initiated:
 *
 * <pre>
 * &hellip; &rarr; [x] &rarr; [return x;] &rarr; &hellip;
 * </pre>
 *
 * ### Unstructured control flow
 *
 * Unstructured control flow is modeled in the obvious way: `break` and `continue` statements
 * have as their successor the next statement that is executed after the jump; `throw`
 * statements have the nearest enclosing `catch` clause as their successor, or the exit node
 * of the enclosing container if there is no enclosing `catch`; `return` statements have the
 * exit node of the enclosing container as their successor.
 *
 * In all cases, the control flow may be intercepted by an intervening `finally` block. For
 * instance, consider the following code snippet:
 *
 * <pre>
 * try {
 * &nbsp;&nbsp;if (x)
 * &nbsp;&nbsp;&nbsp;&nbsp;return;
 * &nbsp;&nbsp;s
 * } finally {
 * &nbsp;&nbsp;t
 * }
 * u
 * </pre>
 *
 * Here, the successor of `return` is not the exit node of the enclosing container, but instead
 * the `finally` block. The last statement of the `finally` block (here, `t`) has two successors:
 * `u` to model the case where `finally` was entered from `s`, and the exit node of the enclosing
 * container to model the case where the `return` is resumed after the `finally` block.
 *
 * Note that `finally` blocks can lead to imprecise control flow modeling since the `finally`
 * block resumes the action of _all_ statements it intercepts: in the above example, the CFG
 * not only models the executions `return` &rarr; `finally` &rarr; `t` &rarr; `exit` and
 * `s` &rarr; `finally` &rarr; `t` &rarr; `u`, but also allows the path  `return` &rarr;
 * `finally` &rarr; `t` &rarr; `u`, which does not correspond to any actual execution.
 *
 * The CFG also models the fact that certain kinds of expressions (calls, `new` expressions,
 * property accesses and `await` expressions) can throw exceptions, but _only_ if there is
 * an enclosing `try`-`catch` statement.
 *
 * ### Function preambles
 *
 * The CFG of a function starts with its entry node, followed by a _preamble_, which is a part of
 * the CFG that models parameter passing and function hoisting. The preamble is followed by the
 * function body, which in turn is followed by the exit node.
 *
 * For function expressions, the preamble starts with the function name, if any, to reflect the
 * fact that the function object is bound to that name inside the scope of the function. Next,
 * for both function expressions and function declarations, the parameters are executed in sequence
 * to represent parameter passing. If a parameter has a default value, that value is visited before
 * the parameter itself. Finally, the CFG nodes corresponding to the names of all hoisted functions
 * inside the outer function body are visited in lexical order. This reflects the fact that hoisted
 * functions are initialized before the body starts executing, but _after_ parameters have been
 * initialized.
 *
 * For instance, consider the following function declaration:
 *
 * <pre>
 * function outer(x, y = 42) {
 * &nbsp;&nbsp;s
 * &nbsp;&nbsp;function inner() {}
 * &nbsp;&nbsp;t
 * }
 * </pre>
 *
 * Its CFG is
 *
 * <pre>
 * [entry] &rarr; [x] &rarr; [42] &rarr; [y] &rarr; [inner] &rarr; [s] &rarr; [function inner() {}] &rarr; [t] &rarr; [exit]
 * </pre>
 *
 * Note that the function declaration `[function inner() {}]` as a whole is part of the CFG of the
 * body of `outer`, while its function identifier `inner` is part of the preamble.
 *
 * ### Toplevel preambles
 *
 * Similar to functions, toplevels (that is, modules, scripts or event handlers) also have a
 * preamble. For ECMAScript 2015 modules, all import specifiers are traversed first, in lexical
 * order, reflecting the fact that imports are resolved before execution of the module itself
 * begins; next, for all toplevels, the names of hoisted functions are traversed in lexical order
 * (as for functions). Afterwards, the CFG continues with the body of the toplevel, and ends
 * with the exit node.
 *
 * As an example, consider the following module:
 *
 * ```
 * s
 * import x as y from 'foo';
 * function f() {}
 * t
 * ```
 *
 * Its CFG is
 *
 * <pre>
 * [entry] &rarr; [x as y] &rarr; [f] &rarr; [s] &rarr; [import x as y from 'foo';] &rarr; [function f() {}] &rarr; [t] &rarr; [exit]
 * </pre>
 *
 * Note that the `import` statement as a whole is part of the CFG of the body, while its single
 * import specifier `x as y` forms part of the preamble.
 */
overlay[local]
module;

import javascript
private import internal.StmtContainers

/**
 * A control flow graph node. Such a node is either an expression, a statement,
 * or a synthetic node.
 */
class ControlFlowNode extends @cfg_node, Locatable, NodeInStmtContainer {
  /** Gets a CFG node that immediately follows this node. */
  ControlFlowNode getASuccessor() { successor(this, result) }

  /** Gets a CFG node that this node immediately follows. */
  ControlFlowNode getAPredecessor() { result.getASuccessor() = this }

  /** Holds if control flow branches at this node, that is, it has at least two successors. */
  predicate isBranch() { strictcount(this.getASuccessor()) >= 2 }

  /** Holds if control flow merges at this node, that is, it has at least two predecessors. */
  predicate isJoin() { strictcount(this.getAPredecessor()) >= 2 }

  /**
   * Holds if some toplevel or function begins executing at this node, that is,
   * this node is the start node of a statement container.
   */
  predicate isStart() { exists(StmtContainer container | container.getStart() = this) }

  /**
   * Holds if execution of `container` may terminate at this node, that is,
   * this node has a synthetic successor that is itself a final node of
   * `container`.
   */
  predicate isAFinalNodeOfContainer(StmtContainer container) {
    exists(SyntheticControlFlowNode next | next = this.getASuccessor() |
      next.isAFinalNodeOfContainer(container)
    )
  }

  /**
   * Holds if execution of some toplevel or function may terminate at this
   * node, that is, this node is a final node of some statement container.
   */
  final predicate isAFinalNode() {
    exists(StmtContainer container | this.isAFinalNodeOfContainer(container))
  }

  /**
   * Holds if this node is unreachable: every predecessor it has (if any) is
   * a synthetic node that is itself unreachable. In particular, a node
   * without any predecessors is unreachable, with the exception of entry
   * nodes, which always count as reachable.
   *
   * Only the first node of a stretch of unreachable code is unreachable in
   * this sense. For example, given
   *
   * ```
   * function foo() { return; s1; s2; }
   * ```
   *
   * the node for `s1` is unreachable, whereas the node for `s2` is not.
   */
  predicate isUnreachable() {
    // note the override in ControlFlowEntryNode below
    forall(ControlFlowNode before | this.getAPredecessor() = before |
      before.(SyntheticControlFlowNode).isUnreachable()
    )
  }

  /** Gets the basic block that contains this node. */
  BasicBlock getBasicBlock() { result.getANode() = this }

  /**
   * For internal use.
   *
   * Gets a description of this control-flow node that is more likely to be
   * unique than its `toString` value, which may be shared with other nodes.
   */
  string describeControlFlowNode() {
    // Bodies of method declarations are described in terms of the declaration.
    exists(MethodDeclaration decl | decl.getBody() = this | result = "function in " + decl)
    or
    not this = any(MethodDeclaration decl).getBody() and
    (
      if this instanceof @decorator_list
      then result = "parameter decorators of " + this.(AstNode).getParent().(Function).describe()
      else result = this.toString()
    )
  }
}

/**
 * A synthetic CFG node that does not correspond to a statement or expression;
 * examples include guard nodes (`GuardControlFlowNode`) and entry/exit nodes
 * (`ControlFlowEntryNode` and `ControlFlowExitNode`).
 */
class SyntheticControlFlowNode extends @synthetic_cfg_node, ControlFlowNode { }

/**
 * A synthetic CFG node that marks the point where a function or toplevel
 * script is entered.
 */
class ControlFlowEntryNode extends SyntheticControlFlowNode, @entry_node {
  /** Never holds: entry nodes always count as reachable. */
  override predicate isUnreachable() { none() }

  /** Gets a description of the form `entry node of <container>`. */
  override string toString() {
    exists(string containerText |
      containerText = pragma[only_bind_out](this.getContainer()).toString()
    |
      result = "entry node of " + containerText
    )
  }
}

/**
 * A synthetic CFG node that marks the point where a function or toplevel
 * script is left.
 */
class ControlFlowExitNode extends SyntheticControlFlowNode, @exit_node {
  /**
   * Holds if the `exit_cfg_node` relation associates this exit node with
   * `container`.
   */
  override predicate isAFinalNodeOfContainer(StmtContainer container) {
    exit_cfg_node(this, container)
  }

  /** Gets a description of the form `exit node of <container>`. */
  override string toString() {
    exists(string containerText |
      containerText = pragma[only_bind_out](this.getContainer()).toString()
    |
      result = "exit node of " + containerText
    )
  }
}

/**
 * A synthetic CFG node which records that a certain condition is known to
 * hold at the program point it occupies.
 */
class GuardControlFlowNode extends SyntheticControlFlowNode, @guard_node {
  /** Gets the expression whose value this guard records information about. */
  Expr getTest() { guard_node(this, _, result) }

  /**
   * Holds if this guard is known to hold at basic block `bb`: the guard
   * either is one of the nodes of `bb`, or is one of the nodes of a
   * reachable basic block that strictly dominates `bb`.
   */
  predicate dominates(ReachableBasicBlock bb) {
    exists(ReachableBasicBlock guardBlock | guardBlock.getANode() = this |
      guardBlock = bb
      or
      guardBlock.strictlyDominates(bb)
    )
  }
}

/**
 * A guard node which records that a condition is known to evaluate to a
 * truthy or a falsy value at the program point it occupies.
 */
class ConditionGuardNode extends GuardControlFlowNode, @condition_guard {
  /**
   * Gets the recorded outcome of the condition: `false` if the guard kind
   * stored in `guard_node` is 0, and `true` if it is 1.
   */
  boolean getOutcome() {
    exists(int kind | guard_node(this, kind, _) |
      kind = 0 and result = false
      or
      kind = 1 and result = true
    )
  }

  /** Gets a description of the form `guard: <test> is <outcome>`. */
  override string toString() {
    exists(Expr test, boolean outcome |
      test = this.getTest() and
      outcome = this.getOutcome()
    |
      result = "guard: " + test + " is " + outcome
    )
  }
}

/**
 * A CFG node that stands for an actual program element; in other words,
 * any `ControlFlowNode` that is not a `SyntheticControlFlowNode`.
 */
class ConcreteControlFlowNode extends ControlFlowNode {
  ConcreteControlFlowNode() { not exists(SyntheticControlFlowNode synthetic | synthetic = this) }
}