Files
codeql/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll
yoff 408ba6218f Python: switch dataflow library to new (shared) CFG + SSA
Flips the Python dataflow trunk from the legacy CFG (semmle/python/Flow.qll)
and legacy ESSA SSA (semmle/python/essa/*) to the new shared CFG facade
(semmle.python.controlflow.internal.Cfg) and the new SSA adapter
(semmle.python.dataflow.new.internal.SsaImpl), both introduced
additively in the preceding PRs in this stack.

This is the trunk-flip equivalent of the original draft PR #21894 (kept
around as documentation), rebased on top of the four preparatory PRs:

  P1: Remove AstNode.getAFlowNode() and rewrite callers (#21919).
  P2: Qualify Flow.qll's AST references with Py:: prefix (#21920).
  P3: Add new shared-CFG-backed control flow graph (#21921).
  P4: Add new shared-SSA-backed SSA adapter (#21923).

The Python dataflow library (semmle/python/dataflow/new/) now imports
the new CFG facade and SSA adapter. All CFG-typed predicates
(ControlFlowNode, CallNode, BasicBlock, NameNode, AttrNode, ...) are
qualified with the Cfg:: prefix; SSA references switch from
EssaVariable/EssaDefinition to SsaImpl::Definition/SourceVariable.

GuardNode is redesigned to use the new CFG's outcome-node model
(isAfterTrue / isAfterFalse) instead of the legacy ConditionBlock +
flipped indirection. Only BarrierGuard<...> is preserved as public
API.

Framework files (Bottle, FastApi, Django, Tornado, Pyramid, Stdlib,
...) are updated to take CFG nodes from the new facade.

A handful of dataflow consistency tweaks for the new CFG:
- Augmented-assignment targets are treated as both load and store.
- 'from X import *' produces uncertain SSA writes for unknown names.
- CFG nodes are canonicalised so dataflow does not see equivalent
  pre/post-order pairs as distinct nodes.

Two AST tweaks for the new CFG:
- AstNodeImpl: omit PEP 695 type-parameter names from
  FunctionDefExpr / ClassDefExpr children.
- ImportResolution: drop the legacy essa import.

Test churn (~175 files): reblessed library- and query-test .expected
files reflect slightly different CFG granularity, different toString
output, and a handful of true alert deltas in security queries.

Verification: all 367 lib + src + consistency-queries compile clean.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-22 13:46:43 +00:00

1001 lines
36 KiB
Plaintext

/**
* Provides Python-specific definitions for use in the data flow library.
*/
overlay[local]
module;
private import python
private import semmle.python.controlflow.internal.Cfg as Cfg
private import semmle.python.controlflow.internal.AstNodeImpl as CfgImpl
private import codeql.controlflow.SuccessorType
private import DataFlowPrivate
import semmle.python.dataflow.new.TypeTracking
import Attributes
import LocalSources
private import semmle.python.dataflow.new.internal.SsaImpl as SsaImpl
private import semmle.python.dataflow.new.internal.ImportStar
private import semmle.python.frameworks.data.ModelsAsData
private import FlowSummaryImpl as FlowSummaryImpl
private import semmle.python.frameworks.data.ModelsAsData
/**
* IPA type for data flow nodes.
*
* Nodes broadly fall into three categories.
*
* - Control flow nodes: Flow between these is based on use-use flow computed via an SSA analysis.
* - Module variable nodes: These represent global variables and act as canonical targets for reads and writes of these.
* - Synthetic nodes: These handle flow in various special cases.
*
* Many branches below additionally require `injects(_)` on the underlying CFG node, so
* that only the canonical CFG node for an AST node gives rise to a data flow node.
*/
overlay[local]
newtype TNode =
/** A node corresponding to a control flow node. */
TCfgNode(Cfg::ControlFlowNode node) {
isExpressionNode(node)
or
// canonical CFG nodes whose AST node is a `Pattern`
node.injects(_) and node.getNode() instanceof Pattern
} or
/**
* A node corresponding to a scope entry definition. That is, the value of a variable
* as it enters a scope.
*
* Entry definitions whose scope is a `Module` are excluded.
*/
TScopeEntryDefinitionNode(SsaImpl::ScopeEntryDefinition def) {
not def.getScope() instanceof Module
} or
/**
* A synthetic node representing the value of an object before a state change.
*
* For class calls we pass a synthetic self argument, so attribute writes in
* `__init__` are reflected on the resulting object (we need special logic for this
* since there is no `return` in `__init__`).
*/
// NOTE: since we can't rely on the call graph, but we want to have synthetic
// pre-update nodes for class calls, we end up getting synthetic pre-update nodes for
// ALL calls :|
TSyntheticPreUpdateNode(Cfg::CallNode call) { call.injects(_) } or
/**
* A synthetic node representing the value of an object after a state change.
* See QLDoc for `PostUpdateNode`.
*/
TSyntheticPostUpdateNode(Cfg::ControlFlowNode node) {
node.injects(_) and
(
// positional arguments, keyword arguments, and the called expression of any call
exists(Cfg::CallNode call |
node = call.getArg(_)
or
node = call.getArgByName(_)
or
// `self` argument when handling class instance calls (`__call__` special method)
node = call.getFunction()
)
or
// the object of an attribute access
node = any(Cfg::AttrNode a).getObject()
or
// the object of a subscript
node = any(Cfg::SubscriptNode s).getObject()
or
// self parameter when used implicitly in `super()`
exists(Class cls, Function func, SsaImpl::ParameterDefinition def |
func = cls.getAMethod() and
not isStaticmethod(func) and
// this matches what we do in ExtractedParameterNode
def.getDefiningNode() = node and
def.getParameter() = func.getArg(0)
)
or
// the iterable argument to the implicit comprehension function
node.getNode() = any(Comp c).getIterable()
)
} or
/** A node representing a global (module-level) variable in a specific module. */
TModuleVariableNode(Module m, GlobalVariable v) { v.getScope() = m } or
/**
* A synthetic node representing that an iterable sequence flows to a consumer.
*/
TIterableSequenceNode(UnpackingAssignmentSequenceTarget consumer) or
/**
* A synthetic node representing that there may be an iterable element
* for `consumer` to consume.
*/
TIterableElementNode(UnpackingAssignmentTarget consumer) or
/**
* A synthetic node representing element content in a star pattern.
*/
TStarPatternElementNode(MatchStarPattern target) or
/**
* INTERNAL: Do not use.
*
* A synthetic node representing the data for an ORM model saved in a DB.
*/
// TODO: Limiting the classes here to the ones that are actually ORM models was
// non-trivial, since that logic is based on API::Node results, and trying to do this
// causes non-monotonic recursion, and makes the API graph evaluation recursive with
// data-flow, which might do bad things for performance.
//
// So for now we live with having these synthetic ORM nodes for _all_ classes, which
// is a bit wasteful, but we don't think it will hurt too much.
TSyntheticOrmModelNode(Class cls) or
/** A node wrapping a `SummaryNode` from the flow summary implementation. */
TFlowSummaryNode(FlowSummaryImpl::Private::SummaryNode sn) or
/** A synthetic node to capture positional arguments that are passed to a `*args` parameter. */
TSynthStarArgsElementParameterNode(DataFlowCallable callable) {
exists(ParameterPosition ppos | ppos.isStarArgs(_) | exists(callable.getParameter(ppos)))
} or
/** A synthetic node to capture keyword arguments that are passed to a `**kwargs` parameter. */
TSynthDictSplatArgumentNode(Cfg::CallNode call) {
// only calls that have at least one keyword argument
call.injects(_) and exists(call.getArgByName(_))
} or
/** A synthetic node to allow flow to keyword parameters from a `**kwargs` argument. */
TSynthDictSplatParameterNode(DataFlowCallable callable) {
exists(ParameterPosition ppos | ppos.isKeyword(_) | exists(callable.getParameter(ppos)))
} or
/** A synthetic node representing a captured variable. */
TSynthCaptureNode(VariableCapture::Flow::SynthesizedCaptureNode cn) or
/** A synthetic node representing the heap of a function. Used for variable capture. */
TSynthCapturedVariablesParameterNode(Function f) {
f = any(VariableCapture::CapturedVariable v).getACapturingScope() and
exists(TFunction(f))
} or
/**
* A synthetic node representing the values of the variables captured
* by the callable being called.
*/
TSynthCapturedVariablesArgumentNode(Cfg::ControlFlowNode callable) {
callable.injects(_) and callable = any(Cfg::CallNode c).getFunction()
} or
/**
* A synthetic node representing the values of the variables captured
* by the callable being called, after the output has been computed.
*/
TSynthCapturedVariablesArgumentPostUpdateNode(Cfg::ControlFlowNode callable) {
callable.injects(_) and callable = any(Cfg::CallNode c).getFunction()
} or
/** A synthetic node representing the values of variables captured by a comprehension. */
TSynthCompCapturedVariablesArgumentNode(Comp comp) {
comp.getFunction() = any(VariableCapture::CapturedVariable v).getACapturingScope()
} or
/** A synthetic node representing the values of variables captured by a comprehension after the output has been computed. */
TSynthCompCapturedVariablesArgumentPostUpdateNode(Comp comp) {
comp.getFunction() = any(VariableCapture::CapturedVariable v).getACapturingScope()
} or
/** An empty, unused node type that exists to prevent unwanted dependencies on data flow nodes. */
TForbiddenRecursionGuard() {
// `none()` keeps this branch empty; the second conjunct only exists to create a dependency.
none() and
// We want to prune irrelevant models before materialising data flow nodes, so types contributed
// directly from CodeQL must expose their pruning info without depending on data flow nodes.
(any(ModelInput::TypeModel tm).isTypeUsed("") implies any())
}
private import semmle.python.internal.CachedStages
/**
 * An element, viewed as a node in a data flow graph.
 *
 * This is the common base class for every branch of `TNode`, for example
 * control flow nodes (`CfgNode`), scope entry definitions
 * (`ScopeEntryDefinitionNode`) and module variables (`ModuleVariableNode`).
 * The member predicates below provide defaults that subclasses override.
 */
overlay[local]
class Node extends TNode {
  /** Gets a textual representation of this element. */
  cached
  string toString() {
    result = "Data flow node" and
    Stages::DataFlow::ref()
  }

  /** Gets the location of this node. Has no result unless overridden. */
  cached
  Location getLocation() { none() }

  /**
   * Holds if this element is at the specified location.
   * The location spans column `startcolumn` of line `startline` to
   * column `endcolumn` of line `endline` in file `filepath`.
   * For more information, see
   * [Locations](https://codeql.github.com/docs/writing-codeql-queries/providing-locations-in-codeql-queries/).
   */
  deprecated predicate hasLocationInfo(
    string filepath, int startline, int startcolumn, int endline, int endcolumn
  ) {
    exists(Location loc | loc = this.getLocation() |
      loc.hasLocationInfo(filepath, startline, startcolumn, endline, endcolumn)
    ) and
    Stages::DataFlow::ref()
  }

  /** Gets the scope of this node. Has no result unless overridden. */
  Scope getScope() { none() }

  /** Gets the enclosing callable of this node, derived from its scope. */
  DataFlowCallable getEnclosingCallable() {
    exists(Scope scope | scope = this.getScope() | result = getCallableScope(scope))
  }

  /** Gets the expression corresponding to this node, if any. */
  Expr asExpr() { none() }

  /** Gets the control-flow node corresponding to this node, if any. */
  Cfg::ControlFlowNode asCfgNode() { none() }

  /**
   * Gets a local source node from which data may flow to this node in zero or more local data-flow steps.
   */
  LocalSourceNode getALocalSource() {
    exists(LocalSourceNode source | source.flowsTo(this) | result = source)
  }
}
/** A data-flow node that wraps a single control-flow node. */
class CfgNode extends Node, TCfgNode {
  /** The wrapped control-flow node; subclasses may narrow its type. */
  Cfg::ControlFlowNode node;

  CfgNode() { this = TCfgNode(node) }

  /** Gets a textual representation of this element (that of the wrapped CFG node). */
  override string toString() { result = node.toString() }

  override Location getLocation() { result = node.getLocation() }

  override Scope getScope() { result = node.getScope() }

  override Cfg::ControlFlowNode asCfgNode() { result = node }

  /** Gets the `Cfg::ControlFlowNode` represented by this data-flow node. */
  Cfg::ControlFlowNode getNode() { result = node }
}
/**
 * A data-flow node corresponding to a `Cfg::CallNode` in the control-flow graph.
 *
 * Each accessor maps the matching component of the underlying call back to the
 * data-flow node that has that component as its CFG node.
 */
class CallCfgNode extends CfgNode, LocalSourceNode {
  override Cfg::CallNode node;

  /**
   * Gets the data-flow node for the function component of the call corresponding to this data-flow
   * node.
   */
  Node getFunction() { node.getFunction() = result.asCfgNode() }

  /** Gets the data-flow node corresponding to the i'th positional argument of the call corresponding to this data-flow node */
  Node getArg(int i) { node.getArg(i) = result.asCfgNode() }

  /** Gets the data-flow node corresponding to the named argument of the call corresponding to this data-flow node */
  Node getArgByName(string name) { node.getArgByName(name) = result.asCfgNode() }

  /** Gets the data-flow node corresponding to a dictionary (**) argument of the call corresponding to this data-flow node, if any. */
  Node getKwargs() { node.getKwargs() = result.asCfgNode() }

  /** Gets the data-flow node corresponding to the first tuple (*) argument of the call corresponding to this data-flow node, if any. */
  Node getStarArg() { node.getStarArg() = result.asCfgNode() }
}
/**
 * A data-flow node corresponding to a method call, that is `foo.bar(...)`.
 *
 * Also covers the case where the method lookup is done separately from the call itself, as in
 * `temp = foo.bar; temp(...)`. Note that this is only tracked through local scope.
 */
class MethodCallNode extends CallCfgNode {
  /** An attribute read that is a local source of the called expression. */
  AttrRead method_lookup;

  MethodCallNode() { this.getFunction().getALocalSource() = method_lookup }

  /** Holds if this data-flow node calls method `methodName` on the object node `object`. */
  predicate calls(Node object, string methodName) {
    // `getObject` and `getMethodName` can each have several results, so both values are
    // taken from the same `method_lookup` to keep them paired correctly.
    methodName = method_lookup.getAttributeName() and
    object = method_lookup.getObject()
  }

  /**
   * Gets the data-flow node corresponding to the object receiving this call. That is, the `foo` in
   * `foo.bar(...)`.
   *
   * Note that this method may have multiple results if a single call node represents calls to
   * multiple different objects and methods. If you want to link up objects and method names
   * accurately, use the `calls` method instead.
   */
  Node getObject() { method_lookup.getObject() = result }

  /**
   * Gets the name of the method being invoked (the `bar` in `foo.bar(...)`) if it can be determined.
   *
   * Note that this method may have multiple results if a single call node represents calls to
   * multiple different objects and methods. If you want to link up objects and method names
   * accurately, use the `calls` method instead.
   */
  string getMethodName() { method_lookup.getAttributeName() = result }
}
/**
 * An expression, viewed as a node in a data flow graph.
 *
 * Note that because of control-flow splitting, one `Expr` may correspond
 * to multiple `ExprNode`s, just like it may correspond to multiple
 * `ControlFlow::Node`s.
 */
class ExprNode extends CfgNode {
  // Only CFG nodes accepted by `isExpressionNode` qualify.
  ExprNode() { isExpressionNode(node) }

  /** Gets the AST expression underlying the wrapped CFG node. */
  override Expr asExpr() { node.getNode() = result }
}
/** Gets a node whose underlying CFG node has `e` as its AST node. */
ExprNode exprNode(DataFlowExpr e) {
  exists(Cfg::ControlFlowNode cfg | cfg = result.getNode() | cfg.getNode() = e)
}
/**
 * A node corresponding to a scope entry definition. That is, the value of a variable
 * as it enters a scope.
 */
class ScopeEntryDefinitionNode extends Node, TScopeEntryDefinitionNode {
  /** The SSA scope entry definition this node stands for. */
  SsaImpl::ScopeEntryDefinition def;

  ScopeEntryDefinitionNode() { this = TScopeEntryDefinitionNode(def) }

  override string toString() { result = "Entry definition for " + this.getVariable().toString() }

  override Scope getScope() { result = def.getScope() }

  override Location getLocation() { result = def.getLocation() }

  /** Gets the source variable represented by this node. */
  SsaImpl::SsaSourceVariable getVariable() { def.getSourceVariable() = result }

  /** Gets the `SsaImpl::ScopeEntryDefinition` associated with this node. */
  SsaImpl::ScopeEntryDefinition getDefinition() { def = result }
}
/**
* The value of a parameter at function entry, viewed as a node in a data
* flow graph.
*
* This is the public view of `ParameterNodeImpl`: every `ParameterNodeImpl` is a
* `ParameterNode`, and `getParameter` simply delegates to the implementation class.
*/
overlay[local]
class ParameterNode extends Node instanceof ParameterNodeImpl {
/** Gets the parameter corresponding to this node, if any. */
final Parameter getParameter() { result = super.getParameter() }
}
/**
 * A parameter node found in the source code (not in a summary).
 *
 * Note: this class does not itself extend `LocalSourceNode`;
 * `LocalSourceParameterNode` adds that.
 */
class ExtractedParameterNode extends ParameterNodeImpl, CfgNode {
  /** The SSA parameter definition whose defining node is the wrapped CFG node. */
  SsaImpl::ParameterDefinition def;

  ExtractedParameterNode() { def.getDefiningNode() = node }

  override Parameter getParameter() { def.getParameter() = result }
}
/** An `ExtractedParameterNode` that is also a `LocalSourceNode`. */
class LocalSourceParameterNode extends ExtractedParameterNode, LocalSourceNode { }
/** Gets the extracted parameter node whose parameter is `p`. */
ExtractedParameterNode parameterNode(Parameter p) { p = result.getParameter() }
/**
 * A data flow node that represents a call argument.
 *
 * Concrete subclasses define `argumentOf`; `getCall` is derived from it.
 */
overlay[global]
abstract class ArgumentNode extends Node {
  /** Holds if this argument occurs at the given position in the given call. */
  abstract predicate argumentOf(DataFlowCall call, ArgumentPosition pos);

  /** Gets the call in which this node is an argument, if any. */
  final ExtractedDataFlowCall getCall() {
    exists(ArgumentPosition pos | this.argumentOf(result, pos))
  }
}
/**
 * Gets an overapproximation of the argument nodes that are included in `getCallArg`.
 *
 * The result is any node matching one of four syntactic shapes, listed below.
 */
Node getCallArgApproximation() {
  // the function part of any call
  exists(Cfg::CallNode call | call.getFunction() = result.asCfgNode())
  or
  // the object part of an attribute expression (which might be a bound method)
  exists(Cfg::AttrNode attr | attr.getObject() = result.asCfgNode())
  or
  // self parameters in methods
  exists(Class cls, Function method | method = cls.getAMethod() |
    result.asExpr() = method.getArg(0)
  )
  or
  // pre-update nodes for calls
  exists(CallCfgNode call | result = call.(PostUpdateNode).getPreUpdateNode())
}
/**
 * Gets the extracted argument nodes that do not rely on `getCallArg`.
 *
 * These are comprehension iterables, the object of an attribute that is
 * being called, and everything accepted by `normalCallArg`.
 */
private Node implicitArgumentNode() {
  // for comprehensions, we allow the synthetic `iterable` argument
  exists(Comp comp | result.asExpr() = comp.getIterable())
  or
  // self arguments: the object of the attribute being called
  exists(Cfg::CallNode call, Cfg::AttrNode callee | callee = call.getFunction() |
    result.asCfgNode() = callee.getObject()
  )
  or
  // for potential summaries we allow all normal call arguments
  normalCallArg(_, result, _)
}
/**
 * A data flow node that represents a call argument found in the source code.
 *
 * Membership is the union of `implicitArgumentNode` and the overapproximation
 * `getCallArgApproximation`; `argumentOf` then narrows the latter to nodes that
 * `getCallArg` actually reports.
 */
overlay[global]
class ExtractedArgumentNode extends ArgumentNode {
  ExtractedArgumentNode() {
    this = implicitArgumentNode()
    or
    this = getCallArgApproximation()
  }

  final override predicate argumentOf(DataFlowCall call, ArgumentPosition pos) {
    call instanceof ExtractedDataFlowCall and
    call.getArgument(pos) = this and
    (
      // approximated nodes only count when `getCallArg` confirms them
      getCallArg(_, _, _, this, _) and this = getCallArgApproximation()
      or
      this = implicitArgumentNode()
    )
  }
}
/**
* A node associated with an object after an operation that might have
* changed its state.
*
* This can be either the argument to a callable after the callable returns
* (which might have mutated the argument), the qualifier of a field after
* an update to the field, or a container such as a list/dictionary after an element
* update.
*
* Nodes corresponding to AST elements, for example `ExprNode`s, usually refer
* to the value before the update, with the exception of class calls,
* which represent the value _after_ the constructor has run.
*
* This is the public view of `PostUpdateNodeImpl`; `getPreUpdateNode` delegates to it.
*/
class PostUpdateNode extends Node instanceof PostUpdateNodeImpl {
/** Gets the node before the state update. */
Node getPreUpdateNode() { result = super.getPreUpdateNode() }
}
/**
* A data flow node corresponding to a module-level (global) variable that is accessed outside of the module scope.
*
* Global variables may appear twice in the data flow graph, as both `EssaNode`s and
* `ModuleVariableNode`s. The former is used to represent data flow between global variables as it
* occurs during module initialization, and the latter is used to represent data flow via global
* variable reads and writes during run-time.
* (NOTE(review): no `EssaNode` class is declared in the visible part of this file; confirm
* whether that name is still current.)
*
* It is possible for data to flow from assignments made at module initialization time to reads made
* at run-time, but not vice versa. For example, there will be flow from `SOURCE` to `SINK` in the
* following snippet:
*
* ```python
* g = SOURCE
*
* def foo():
*     SINK(g)
* ```
* but not the other way round:
*
* ```python
* SINK(g)
*
* def bar():
*     global g
*     g = SOURCE
* ```
*
* Data flow through `ModuleVariableNode`s is represented as `jumpStep`s, and so any write of a
* global variable can flow to any read of the same variable.
*/
class ModuleVariableNode extends Node, TModuleVariableNode {
// The module that `var` is scoped to (see the `TModuleVariableNode` branch).
Module mod;
// The global variable this node represents.
GlobalVariable var;
ModuleVariableNode() { this = TModuleVariableNode(mod, var) }
override Scope getScope() { result = mod }
override string toString() {
// `concat` joins all `toString` results for `mod` with "," so that this predicate has a single result.
result = "ModuleVariableNode in " + concat( | | mod.toString(), ",") + " for " + var.getId()
}
/** Gets the module in which this variable appears. */
Module getModule() { result = mod }
/** Gets the global variable corresponding to this node. */
GlobalVariable getVariable() { result = var }
/** Gets a node that reads this variable, either locally or through `from ... import *`. */
overlay[global]
Node getARead() {
result = this.getALocalRead()
or
this = import_star_read(result)
}
/** Gets a node that reads this variable, excluding reads that happen through `from ... import *`. */
Node getALocalRead() {
result.asCfgNode().getNode() = var.getALoad() and
// reads in the module scope itself are excluded
not result.getScope() = mod
}
/** Gets a data-flow node whose CFG node is a name node that defines this global variable. */
Node getAWrite() {
exists(Cfg::NameNode n |
n.defines(var) and
result.asCfgNode() = n
)
}
/** Gets the possible values of the variable at the end of import time */
CfgNode getADefiningWrite() {
// `def` is an ultimate definition of some SSA variable, is defined at the result's
// CFG node, and belongs to the source variable for `var`.
exists(SsaImpl::EssaVariable def |
def = any(SsaImpl::EssaVariable ssa_var).getAnUltimateDefinition() and
def.getDefinition().(SsaImpl::EssaNodeDefinition).getDefiningNode() = result.asCfgNode() and
def.getSourceVariable().getVariable() = var
)
}
override DataFlowCallable getEnclosingCallable() { result.(DataFlowModuleScope).getScope() = mod }
override Location getLocation() { result = mod.getLocation() }
}
/**
 * Gets the module variable node that `n` reads through `from ... import *`,
 * as determined by `resolved_import_star_module`.
 */
overlay[global]
private ModuleVariableNode import_star_read(Node n) {
  exists(Module m, string name |
    m = result.getModule() and
    name = result.getVariable().getId()
  |
    resolved_import_star_module(m, name, n)
  )
}
/**
* Holds if `n` corresponds to a name node with identifier `name`, for which
* `ImportStar::importStarResolvesTo` yields the module `m`.
*/
overlay[global]
pragma[nomagic]
private predicate resolved_import_star_module(Module m, string name, Node n) {
exists(Cfg::NameNode nn | nn = n.asCfgNode() |
ImportStar::importStarResolvesTo(pragma[only_bind_into](nn), m) and
nn.getId() = name
)
}
/**
 * A synthetic node representing an iterable sequence. Used for changing content type
 * for instance from a `ListElement` to a `TupleElement`, especially if the content is
 * transferred via a read step which cannot be broken up into a read and a store. The
 * read step then targets TIterableSequence, and the conversion can happen via a read
 * step to TIterableElement followed by a store step to the target.
 */
class IterableSequenceNode extends Node, TIterableSequenceNode {
  /** The data-flow node for the consumer; scope and location are taken from it. */
  CfgNode consumer;

  IterableSequenceNode() { this = TIterableSequenceNode(consumer.getNode()) }

  override Location getLocation() { result = consumer.getLocation() }

  override Scope getScope() { result = consumer.getScope() }

  override string toString() { result = "IterableSequence" }
}
/**
 * A synthetic node representing an iterable element. Used for changing content type
 * for instance from a `ListElement` to a `TupleElement`. This would happen via a
 * read step from the list to IterableElement followed by a store step to the tuple.
 */
class IterableElementNode extends Node, TIterableElementNode {
  /** The data-flow node for the consumer; scope and location are taken from it. */
  CfgNode consumer;

  IterableElementNode() { this = TIterableElementNode(consumer.getNode()) }

  override Location getLocation() { result = consumer.getLocation() }

  override Scope getScope() { result = consumer.getScope() }

  override string toString() { result = "IterableElement" }
}
/**
 * A synthetic node representing element content of a star pattern.
 */
class StarPatternElementNode extends Node, TStarPatternElementNode {
  /** The data-flow node whose AST node is the star pattern; scope and location are taken from it. */
  CfgNode consumer;

  StarPatternElementNode() { this = TStarPatternElementNode(consumer.getNode().getNode()) }

  override Location getLocation() { result = consumer.getLocation() }

  override Scope getScope() { result = consumer.getScope() }

  override string toString() { result = "StarPatternElement" }
}
/**
 * A node that participates in a conditional split: a CFG node whose
 * evaluation outcome (true/false) is used to choose between two
 * successor basic blocks. In the shared CFG, branching is detected
 * via typed successor edges (boolean successor types) on the unique
 * `injects` node for each AST expression.
 *
 * Users typically obtain a `GuardNode` by casting from a more specific
 * Cfg type: `g.(Cfg::CallNode)` for a call-based check, etc.
 */
class GuardNode extends Cfg::ControlFlowNode {
  GuardNode() {
    // Some outcome block exists for this node, i.e. it has boolean successor
    // edges either directly or through a wrapping expression.
    exists(boolean anyBranch | outcomeOfGuard(this, _, anyBranch))
  }

  /** Holds if this guard controls block `b` upon evaluating to `branch`. */
  predicate controlsBlock(Cfg::BasicBlock b, boolean branch) {
    exists(CfgImpl::BasicBlock entered |
      entered.dominates(b) and
      outcomeOfGuard(this, entered, branch)
    )
  }
}
/**
* Holds if `outcomeBB` is the basic block entered when `guard` evaluates
* to `branch`.
*
* For a direct guard `if g:`, the outcome BB starts at the after-value
* node for the matching branch. For wrapped guards like `not g` or
* `g == True`, we follow those wrappers up the AST to find the
* outermost expression that actually branches, with an appropriate
* polarity transform.
*
* Note that the recursive cases relate `guard` to the wrapper through AST-node
* equality (`getNode()`), and only the wrapper is required to be an `injects` node.
*/
private predicate outcomeOfGuard(
Cfg::ControlFlowNode guard, CfgImpl::BasicBlock outcomeBB, boolean branch
) {
// Base case: the guard has boolean successor edges.
// Only the canonical representative (injects) can act as a guard base.
guard.injects(_) and
exists(BooleanSuccessor t |
t.getValue() = branch and
outcomeBB = guard.(CfgImpl::ControlFlowNode).getASuccessor(t).getBasicBlock()
)
or
// Recursive: `not guard` — same outcome split as `guard`, flipped.
exists(Cfg::UnaryExprNode notNode, boolean notBranch |
notNode.injects(_) and
notNode.getOperand().getNode() = guard.getNode() and
notNode.getNode().getOp() instanceof Not and
outcomeOfGuard(notNode, outcomeBB, notBranch) and
// `guard` has the opposite truth value of `not guard`
branch = notBranch.booleanNot()
)
or
// Recursive: comparisons against a boolean literal.
exists(
Cfg::CompareNode cmpNode, Cmpop op, Cfg::ControlFlowNode otherOperand,
Cfg::ControlFlowNode guardOperand, boolean polarity, boolean cmpBranch
|
cmpNode.injects(_) and
guardOperand.getNode() = guard.getNode() and
// the guard may be on either side of the comparison
(
cmpNode.operands(guardOperand, op, otherOperand) or
cmpNode.operands(otherOperand, op, guardOperand)
) and
// the guard itself must not be a boolean literal; only the other operand is
not guard.getNode() instanceof BooleanLiteral and
// `polarity` is true when the comparison holds exactly if `guard` is true
// (`== True`, `is True`, `!= False`, `is not False`), and false otherwise.
(
(op instanceof Eq or op instanceof Is) and
polarity = otherOperand.getNode().(BooleanLiteral).booleanValue()
or
(op instanceof NotEq or op instanceof IsNot) and
polarity = otherOperand.getNode().(BooleanLiteral).booleanValue().booleanNot()
) and
outcomeOfGuard(cmpNode, outcomeBB, cmpBranch) and
// With positive polarity `branch = cmpBranch`; with negative polarity it is negated.
branch = cmpBranch.booleanXor(polarity.booleanNot())
)
}
/**
* Holds if the guard `g` validates `node` upon evaluating to `branch`.
*
* This is the signature of the predicate that `BarrierGuard` is instantiated with.
*/
signature predicate guardChecksSig(GuardNode g, Cfg::ControlFlowNode node, boolean branch);
/**
 * Provides a set of barrier nodes for a guard that validates a node.
 *
 * This is expected to be used in `isBarrier`/`isSanitizer` definitions
 * in data flow and taint tracking.
 *
 * Implemented by instantiating `ParameterizedBarrierGuard` with a `Unit` parameter.
 */
overlay[global]
module BarrierGuard<guardChecksSig/3 guardChecks> {
  /** Adapts `guardChecks` to the four-argument form by adding an unconstrained `Unit`. */
  private predicate extendedGuardChecks(
    GuardNode g, Cfg::ControlFlowNode node, boolean branch, Unit dummy
  ) {
    dummy = dummy and
    guardChecks(g, node, branch)
  }

  /** Gets a node that is safely guarded by the given guard check. */
  ExprNode getABarrierNode() {
    exists(Unit dummy |
      result = ParameterizedBarrierGuard<Unit, extendedGuardChecks/4>::getABarrierNode(dummy)
    )
  }
}
/** The signature of a type that can be used as the extra parameter of a parameterized guard check. */
bindingset[this]
private signature class ParamSig;
/** Provides the guard-check signature for a given parameter type `P`. */
private module WithParam<ParamSig P> {
/** Holds if the guard `g` validates `node` upon evaluating to `branch`, for parameter `param`. */
signature predicate guardChecksSig(GuardNode g, Cfg::ControlFlowNode node, boolean branch, P param);
}
/**
* Provides a set of barrier nodes for a guard that validates a node.
*
* This is expected to be used in `isBarrier`/`isSanitizer` definitions
* in data flow and taint tracking.
*
* Compared to `BarrierGuard`, the guard check takes an additional parameter of type `P`,
* which is passed through to `getABarrierNode`.
*/
module ParameterizedBarrierGuard<ParamSig P, WithParam<P>::guardChecksSig/4 guardChecks> {
/** Gets a node that is safely guarded by the given guard check with parameter `param`. */
overlay[global]
ExprNode getABarrierNode(P param) {
exists(GuardNode g, SsaImpl::EssaDefinition def, Cfg::ControlFlowNode node, boolean branch |
// `node` is the use that the guard checks ...
SsaImpl::AdjacentUses::useOfDef(def, node) and
guardChecks(g, node, branch, param) and
// ... and the result is a use of the same SSA definition `def`.
SsaImpl::AdjacentUses::useOfDef(def, result.asCfgNode()) and
// The protected use must be a different SSA position than the test
// position itself: `controlsBlock` is reflexive on dominance, and
// the test expression is an SSA-use position on the def-use chain.
// Without this guard, the test position would be returned as a
// barrier and block flow before it can reach genuine branch uses.
node != result.asCfgNode() and
// the result lies in a block controlled by the guard's `branch` outcome
g.controlsBlock(result.asCfgNode().getBasicBlock(), branch)
)
}
}
/**
 * Provides a set of barrier nodes for a guard that validates a node as described by an external predicate.
 *
 * This is expected to be used in `isBarrier`/`isSanitizer` definitions
 * in data flow and taint tracking.
 */
module ExternalBarrierGuard {
  private import semmle.python.ApiGraphs

  /**
   * Holds if `g` is an API call with a parameter that
   * `ModelOutput::getABarrierGuardNode` reports for `kind` and `branch`,
   * and `node` is the CFG node of that parameter's sink.
   */
  overlay[global]
  private predicate guardCheck(GuardNode g, Cfg::ControlFlowNode node, boolean branch, string kind) {
    exists(API::CallNode guardCall, API::Node checkedParam |
      g = guardCall.asCfgNode() and
      node = checkedParam.asSink().asCfgNode() and
      checkedParam = ModelOutput::getABarrierGuardNode(kind, branch) and
      checkedParam = guardCall.getAParameter()
    )
  }

  /**
   * Gets a node that is an external barrier of the given kind.
   *
   * This only provides external barrier nodes defined as guards. To get all externally defined barrier nodes,
   * use `ModelOutput::barrierNode(node, kind)`.
   *
   * INTERNAL: Do not use.
   */
  overlay[global]
  ExprNode getAnExternalBarrierNode(string kind) {
    ParameterizedBarrierGuard<string, guardCheck/4>::getABarrierNode(kind) = result
  }
}
/**
* Algebraic datatype for tracking data content associated with values.
* Content can be collection elements or object attributes.
*/
overlay[local]
newtype TContent =
/** An element of a list. */
TListElementContent() or
/** An element of a set. */
TSetElementContent() or
/** An element of a tuple at a specific index. */
TTupleElementContent(int index) {
// indices that occur in some tuple expression
exists(any(Cfg::TupleNode tn).getElement(index))
or
// Arguments can overflow and end up in the starred parameter tuple.
exists(any(Cfg::CallNode cn).getArg(index))
or
// since flow summaries might use tuples, we ensure that we at least have valid
// TTupleElementContent for the 0..7 (7 was picked to match `small_tuple` in
// data-flow-private)
index in [0 .. 7]
} or
/** An element of a dictionary under a specific key. */
TDictionaryElementContent(string key) {
// {"key": ...}
key = any(KeyValuePair kvp).getKey().(StringLiteral).getText()
or
// func(key=...)
key = any(Keyword kw).getArg()
or
// d["key"] = ...
key =
any(Cfg::SubscriptNode sub |
sub.isStore()
|
sub.getIndex().getNode().(StringLiteral).getText()
)
or
// d.setdefault("key", ...)
exists(Cfg::CallNode call | call.getFunction().(Cfg::AttrNode).getName() = "setdefault" |
key = call.getArg(0).getNode().(StringLiteral).getText()
)
} or
/** An element of a dictionary under any key. */
TDictionaryElementAnyContent() or
/** An object attribute. */
TAttributeContent(string attr) {
// attribute names that occur in the database
attr = any(Attribute a).getName()
or
// Flow summaries that target attributes rely on a TAttributeContent being
// available. However, since the code above only constructs a TAttributeContent
// based on the attribute names seen in the DB, we can end up in a scenario where
// flow summaries don't work due to missing TAttributeContent. To get around this,
// we need to add the attribute names used by flow summaries. This needs to be done
// both for the summaries written in QL and the ones written in data-extension
// files.
//
// 1) Summaries in QL. Sadly the following code leads to non-monotonic recursion
// name = any(AccessPathToken a).getAnArgument("Attribute")
// instead we use a qltest to alert if we write a new summary in QL that uses an
// attribute -- see
// python/ql/test/library-tests/dataflow/summaries-checks/missing-attribute-content.ql
attr in ["re", "string", "pattern"]
or
//
// 2) summaries in data-extension files
exists(string input, string output |
ModelOutput::relevantSummaryModel(_, _, input, output, _, _)
|
// The regex extracts the text between `Attribute[` and `]`, where `Attribute[` is at
// the start of the access path or directly after a `.`; the match is then trimmed.
attr = [input, output].regexpFind("(?<=(^|\\.)Attribute\\[)[^\\]]+(?=\\])", _, _).trim()
)
} or
/** A captured variable. */
TCapturedVariableContent(VariableCapture::CapturedVariable v)
/**
 * A data-flow value can have associated content.
 * If the value is a collection, it can have elements,
 * if it is an object, it can have attribute values.
 *
 * Subclasses override the defaults below for each kind of content.
 */
overlay[local]
class Content extends TContent {
  /** Gets the Models-as-Data representation of this content (if any). */
  string getMaDRepresentation() { none() }

  /** Gets a textual representation of this element. */
  string toString() { result = "Content" }
}
/** Content representing an element of a list. */
class ListElementContent extends Content, TListElementContent {
  /** Gets the Models-as-Data token for this content, `ListElement`. */
  override string getMaDRepresentation() { result = "ListElement" }

  /** Gets a textual representation of this content. */
  override string toString() { result = "List element" }
}
/** Content representing an element of a set. */
class SetElementContent extends Content, TSetElementContent {
  /** Gets the Models-as-Data token for this content, `SetElement`. */
  override string getMaDRepresentation() { result = "SetElement" }

  /** Gets a textual representation of this content. */
  override string toString() { result = "Set element" }
}
/** Content representing the element of a tuple at one specific index. */
class TupleElementContent extends Content, TTupleElementContent {
  /** The tuple index this content refers to. */
  int index;

  TupleElementContent() { this = TTupleElementContent(index) }

  /** Gets the index for this tuple element. */
  int getIndex() { result = index }

  /** Gets the Models-as-Data token for this content, of the form `TupleElement[<index>]`. */
  override string getMaDRepresentation() { result = "TupleElement[" + index.toString() + "]" }

  /** Gets a textual representation of this content, which includes the index. */
  override string toString() { result = "Tuple element at index " + index.toString() }
}
/** Content representing the element of a dictionary stored under one specific key. */
class DictionaryElementContent extends Content, TDictionaryElementContent {
  /** The dictionary key this content refers to. */
  string key;

  DictionaryElementContent() { this = TDictionaryElementContent(key) }

  /** Gets the key for this dictionary element. */
  string getKey() { result = key }

  /** Gets the Models-as-Data token for this content, of the form `DictionaryElement[<key>]`. */
  override string getMaDRepresentation() { result = "DictionaryElement[" + key + "]" }

  /** Gets a textual representation of this content, which includes the key. */
  override string toString() { result = "Dictionary element at key " + key }
}
/** Content representing an element of a dictionary, regardless of the key it is stored under. */
class DictionaryElementAnyContent extends Content, TDictionaryElementAnyContent {
  /** Gets the Models-as-Data token for this content, `DictionaryElementAny`. */
  override string getMaDRepresentation() { result = "DictionaryElementAny" }

  /** Gets a textual representation of this content. */
  override string toString() { result = "Any dictionary element" }
}
/** Content representing the value of a named attribute on an object. */
class AttributeContent extends Content, TAttributeContent {
  private string attr;

  AttributeContent() { this = TAttributeContent(attr) }

  /** Gets the name of the attribute under which this content is stored. */
  string getAttribute() { result = attr }

  /** Gets the Models-as-Data token for this content, of the form `Attribute[<name>]`. */
  override string getMaDRepresentation() { result = "Attribute[" + attr + "]" }

  /** Gets a textual representation of this content, which includes the attribute name. */
  override string toString() { result = "Attribute " + attr }
}
/** Content representing a captured variable. */
class CapturedVariableContent extends TCapturedVariableContent, Content {
  private VariableCapture::CapturedVariable v;

  CapturedVariableContent() { this = TCapturedVariableContent(v) }

  /** Gets the captured variable. */
  VariableCapture::CapturedVariable getVariable() { result = v }

  /** Has no result: this content kind has no Models-as-Data representation. */
  override string getMaDRepresentation() { none() }

  /** Gets a textual representation of this content, which includes the captured variable. */
  override string toString() { result = "captured " + v }
}
/**
 * The underlying algebraic datatype for `ContentSet`: an entity that
 * represents a set of `Content`s.
 *
 * Most values are singletons, wrapping exactly one `Content`. The remaining
 * branches are read-side wildcards: a read at such a set matches a store at
 * any specific dictionary key and/or tuple index, and (for dictionaries) also
 * the "unknown-bucket" content `DictionaryElementAnyContent`.
 *
 * Representing these as wildcard branches (rather than enumerating one
 * `ContentSet` per key/index) keeps the dataflow `readSetEx` relation small
 * when implicit reads are used (e.g. at sinks via `defaultImplicitTaintRead`).
 */
private newtype TContentSet =
  /** The singleton set containing only the content `c`. */
  TSingletonContent(Content c) or
  /** The wildcard covering every tuple element. */
  TAnyTupleElement() or
  /** The wildcard covering every dictionary element. */
  TAnyDictionaryElement() or
  /** The wildcard covering every tuple element and every dictionary element. */
  TAnyTupleOrDictionaryElement()
/**
 * An entity that represents a set of `Content`s.
 *
 * How the set is interpreted depends on the direction of the access:
 * `getAStoreContent` gives the contents targeted by a store, while
 * `getAReadContent` gives the contents matched by a read.
 */
class ContentSet extends TContentSet {
  /** Holds if this content set is the singleton `{c}`. */
  predicate isSingleton(Content c) { this = TSingletonContent(c) }

  /** Holds if this content set is the wildcard for all tuple elements. */
  predicate isAnyTupleElement() { this = TAnyTupleElement() }

  /** Holds if this content set is the wildcard for all dictionary elements. */
  predicate isAnyDictionaryElement() { this = TAnyDictionaryElement() }

  /** Holds if this content set is the wildcard for all tuple elements or dictionary elements. */
  predicate isAnyTupleOrDictionaryElement() { this = TAnyTupleOrDictionaryElement() }

  /**
   * Gets a content that may be stored into when storing into this set.
   *
   * Only singleton sets have a store content; the wildcard sets have none.
   */
  Content getAStoreContent() { this = TSingletonContent(result) }

  /** Gets a content that may be read from when reading from this set. */
  Content getAReadContent() {
    // A singleton reads exactly the content it wraps.
    this = TSingletonContent(result)
    or
    // Wildcard expansion for tuples: a read at a tuple-covering wildcard matches
    // a store at any specific tuple index. (Stores always target a specific
    // index, so no `TupleElementAnyContent` Content kind is needed here.)
    result instanceof TupleElementContent and
    (
      this = TAnyTupleElement()
      or
      this = TAnyTupleOrDictionaryElement()
    )
    or
    // Wildcard expansion for dictionaries: matches every specific key as well
    // as the "any key" content.
    (
      result instanceof DictionaryElementContent
      or
      result instanceof DictionaryElementAnyContent
    ) and
    (
      this = TAnyDictionaryElement()
      or
      this = TAnyTupleOrDictionaryElement()
    )
  }

  /** Gets a textual representation of this content set. */
  string toString() {
    // Singletons are rendered as the content they wrap.
    result = any(Content c | this = TSingletonContent(c)).toString()
    or
    result = "Any tuple element" and this = TAnyTupleElement()
    or
    result = "Any dictionary element" and this = TAnyDictionaryElement()
    or
    result = "Any tuple or dictionary element" and this = TAnyTupleOrDictionaryElement()
  }
}
/**
 * Gets the singleton `ContentSet` wrapping the `Content` `c`, that is, the
 * content set for which `isSingleton(c)` holds.
 */
ContentSet singleton(Content c) { result.isSingleton(c) }