Python: canonicalize CFG nodes for dataflow

The shared CFG creates multiple ControlFlowNodes per AST node in
conditional contexts (e.g. afterTrue/afterFalse for boolean conditions,
empty/non-empty for for-loops, matched/unmatched for match cases).
These splits matter for control-flow analysis, but for dataflow — where
we ask 'what is the value of this expression?' — we need exactly one
representative per AST or we double-count calls, arguments, and store
steps.

This adds Cfg::isCanonicalAstNodeRepresentative as a purely structural
pick: for split ASTs it selects the 'positive' outcome variant; for
non-split ASTs it selects the unique variant. The picker is implemented
via genuine-outcome helpers that work around the shared CFG's
cross-kind isAfterValue fallback (ControlFlowGraph.qll:870-892), see
the doc on isGenuineAfterTrue for details.

The TCfgNode-family newtypes in DataFlowPublic, TNormalCall and
TPotentialLibraryCall in DataFlowDispatch, and the SSA-projected
use-use/def-use steps in DataFlowPrivate are all routed through the
canonical filter. DataFlowConsistency and the test UnresolvedCalls
helper qualify their CallNode casts with Cfg:: to keep working.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
yoff
2026-05-28 07:39:32 +00:00
parent 0a9946121b
commit b55186338e
7 changed files with 165 additions and 40 deletions

View File

@@ -9,6 +9,7 @@ private import semmle.python.dataflow.new.internal.DataFlowImplSpecific
private import semmle.python.dataflow.new.internal.DataFlowDispatch
private import semmle.python.dataflow.new.internal.TaintTrackingImplSpecific
private import codeql.dataflow.internal.DataFlowImplConsistency
private import semmle.python.controlflow.internal.Cfg as Cfg
private module Input implements InputSig<Location, PythonDataFlow> {
private import Private
@@ -72,7 +73,7 @@ private module Input implements InputSig<Location, PythonDataFlow> {
// resolve to multiple functions), but we only make _one_ ArgumentNode for each
// argument in the CallNode, we end up violating this consistency check in those
// cases. (see `getCallArg` in DataFlowDispatch.qll)
exists(DataFlowCall other, CallNode cfgCall | other != call |
exists(DataFlowCall other, Cfg::CallNode cfgCall | other != call |
call.getNode() = cfgCall and
other.getNode() = cfgCall and
isArgumentNode(arg, call, _) and
@@ -88,16 +89,16 @@ private module Input implements InputSig<Location, PythonDataFlow> {
// allow it instead.
(
call.getScope() = attr.getScope() and
any(CfgNode n | n.asCfgNode() = call.getNode().(CallNode).getFunction()).getALocalSource() =
attr
any(CfgNode n | n.asCfgNode() = call.getNode().(Cfg::CallNode).getFunction())
.getALocalSource() = attr
or
not exists(call.getScope().(Function).getDefinition()) and
call.getScope().getScope+() = attr.getScope()
) and
(
other.getScope() = attr.getScope() and
any(CfgNode n | n.asCfgNode() = other.getNode().(CallNode).getFunction()).getALocalSource() =
attr
any(CfgNode n | n.asCfgNode() = other.getNode().(Cfg::CallNode).getFunction())
.getALocalSource() = attr
or
not exists(other.getScope().(Function).getDefinition()) and
other.getScope().getScope+() = attr.getScope()

View File

@@ -0,0 +1,4 @@
---
category: minorAnalysis
---
* The Python dataflow library is now built on the shared CFG and SSA libraries (`shared/controlflow` and `shared/ssa`), bringing Python in line with the other CodeQL languages. The legacy CFG in `semmle/python/Flow.qll` and the legacy ESSA SSA in `semmle/python/essa/*` remain available for downstream queries but are no longer used by the new dataflow library, type tracking, or API graphs. Most queries should be unaffected; a small number may produce slightly different results because of differences in CFG granularity (e.g. separate pre/post nodes per expression) and in how attribute and tuple-unpacking writes are modelled.

View File

@@ -76,6 +76,92 @@ private predicate isCanonical(CfgImpl::ControlFlowNode n) {
n instanceof CfgImpl::ControlFlow::AnnotatedExitNode
}
/**
* Holds if `n` is genuinely the `TAfterValueNode` variant for a boolean-true
* outcome of its AST node.
*
* The shared CFG's `isAfterValue` predicate has a kind-mismatch fallback
* (see `ControlFlowGraph.qll`'s `isAfterValue` lines 870-892): when asking
* `isAfterValue(_, BooleanSuccessor true)` on, say, an emptiness-empty
* variant, it falsely returns `true` because the kinds differ and Python
* does not provide `successorValueImplies`. The same fallback also makes
* `TAfterNode` (the unsplit case) satisfy `isAfterValue(_, t)` for *every*
* `t`.
*
* The combination `isAfterTrue ∧ ¬isAfterFalse` excludes both: a genuine
* boolean-true variant satisfies `isAfterTrue` directly (newtype branch 3)
* but not `isAfterFalse` (the dual variant, same kind, no fallback). A
* non-boolean variant satisfies both via the cross-kind fallback. A
* `TAfterNode` satisfies both via branch 1. So `isAfterTrue ∧ ¬isAfterFalse`
* picks exactly the genuine boolean-true variant.
*/
private predicate isGenuineAfterTrue(ControlFlowNode n) {
n.isAfterTrue(_) and not n.isAfterFalse(_)
}
/**
* Holds if `n` is genuinely the `TAfterValueNode` variant for the "empty"
* outcome of its AST node (e.g. `for x in xs: ...` when `xs` is empty).
*
* See `isGenuineAfterTrue` for why we cannot just use a single
* `isAfterValue` check.
*/
private predicate isGenuineAfterEmpty(ControlFlowNode n) {
exists(EmptinessSuccessor empty |
empty.getValue() = true and n.isAfterValue(n.getAstNode(), empty)
) and
not exists(EmptinessSuccessor nonEmpty |
nonEmpty.getValue() = false and n.isAfterValue(n.getAstNode(), nonEmpty)
)
}
/**
* Holds if `n` is genuinely the `TAfterValueNode` variant for the "matched"
* outcome of its AST node (e.g. a `match` case-pattern that matched).
*
* See `isGenuineAfterTrue` for why we cannot just use a single
* `isAfterValue` check.
*/
private predicate isGenuineAfterMatched(ControlFlowNode n) {
exists(MatchingSuccessor matched |
matched.getValue() = true and n.isAfterValue(n.getAstNode(), matched)
) and
not exists(MatchingSuccessor unmatched |
unmatched.getValue() = false and n.isAfterValue(n.getAstNode(), unmatched)
)
}
/**
* Holds if `n` is the canonical representative of its corresponding AST node
* for dataflow purposes.
*
* The shared CFG associates a single AST node with multiple `ControlFlowNode`s
* when the AST appears in a conditional context (boolean conditions split into
* `afterTrue`/`afterFalse`; for-loop iters split into `[empty]`/`[non-empty]`;
* `match`-case patterns split into `[matched]`/`[unmatched]`). These splits
* matter for control-flow analysis, but for dataflow purposes — where we
* ask "what is the value of this expression?" — a single representative
* suffices and is required to avoid double-counting calls, arguments, store
* steps, etc.
*
* The pick is structural: when an AST has a single `ControlFlowNode` (the
* normal `TAfterNode` or `TBeforeNode`-leaf case), that node is canonical.
* When an AST has a conditional split, the "positive" outcome variant
* (true / empty / matched) is canonical. The three split kinds are mutually
* exclusive per AST, so exactly one variant is selected.
*/
predicate isCanonicalAstNodeRepresentative(ControlFlowNode n) {
// Non-split AST: the unique variant is canonical.
not exists(ControlFlowNode other | other.getNode() = n.getNode() and other != n)
or
// Split AST: pick the "positive" outcome of the split.
isGenuineAfterTrue(n)
or
isGenuineAfterEmpty(n)
or
isGenuineAfterMatched(n)
}
/**
* A control flow node. Control flow nodes have a many-to-one relation
* with syntactic nodes, although most syntactic nodes have only one

View File

@@ -1444,11 +1444,12 @@ private predicate sameEnclosingCallable(Node node1, Node node2) {
// =============================================================================
newtype TDataFlowCall =
TNormalCall(Cfg::CallNode call, Function target, CallType type) {
resolveCall(call, target, type)
resolveCall(call, target, type) and
Cfg::isCanonicalAstNodeRepresentative(call)
} or
/** A call to the generated function inside a comprehension */
TComprehensionCall(Comp c) or
TPotentialLibraryCall(Cfg::CallNode call) or
TPotentialLibraryCall(Cfg::CallNode call) { Cfg::isCanonicalAstNodeRepresentative(call) } or
/** A synthesized call inside a summarized callable */
TSummaryCall(
FlowSummaryImpl::Public::SummarizedCallable c, FlowSummaryImpl::Private::SummaryNode receiver

View File

@@ -365,7 +365,11 @@ module LocalFlow {
exists(Cfg::DefinitionNode def |
nodeFrom.(CfgNode).getNode() = def.getValue() and
nodeTo.(CfgNode).getNode() = def and
def instanceof Cfg::NameNode
def instanceof Cfg::NameNode and
// Parameter defaults are evaluated in the enclosing scope, while the
// parameter itself lives in the function's scope. The cross-scope
// edge is provided by `runtimeJumpStep` instead.
not exists(Py::Parameter param | def.getNode() = param.asName())
)
or
// With definition
@@ -410,11 +414,27 @@ module LocalFlow {
}
predicate useToNextUse(Cfg::NameNode nodeFrom, Cfg::NameNode nodeTo) {
SsaImpl::AdjacentUses::adjacentUseUse(nodeFrom, nodeTo)
// The SSA-level adjacent-use predicate works on specific CFG variants
// (e.g. boolean-outcome `[true]`/`[false]` or emptiness `[empty]`/`[non-empty]`
// splits of the same AST node), but dataflow values are insensitive to
// those splits — there is at most one `CfgNode` per AST. Project both
// ends through `Cfg::isCanonicalAstNodeRepresentative` so all variants
// contribute their use-use edges to the canonical pair.
exists(Cfg::NameNode fromVariant, Cfg::NameNode toVariant |
SsaImpl::AdjacentUses::adjacentUseUse(fromVariant, toVariant) and
fromVariant.getNode() = nodeFrom.getNode() and
toVariant.getNode() = nodeTo.getNode() and
Cfg::isCanonicalAstNodeRepresentative(nodeFrom) and
Cfg::isCanonicalAstNodeRepresentative(nodeTo)
)
}
predicate defToFirstUse(SsaImpl::EssaVariable var, Cfg::NameNode nodeTo) {
SsaImpl::AdjacentUses::firstUse(var.getDefinition(), nodeTo)
exists(Cfg::NameNode toVariant |
SsaImpl::AdjacentUses::firstUse(var.getDefinition(), toVariant) and
toVariant.getNode() = nodeTo.getNode() and
Cfg::isCanonicalAstNodeRepresentative(nodeTo)
)
}
predicate useUseFlowStep(Node nodeFrom, Node nodeTo) {
@@ -423,12 +443,14 @@ module LocalFlow {
// `x = f(y)`
// nodeFrom is `y` on first line
// nodeTo is `y` on second line
exists(SsaImpl::EssaDefinition def |
exists(SsaImpl::EssaDefinition def, Cfg::NameNode toVariant |
nodeFrom.(CfgNode).getNode() = def.(SsaImpl::EssaNodeDefinition).getDefiningNode()
or
nodeFrom.(ScopeEntryDefinitionNode).getDefinition() = def
|
SsaImpl::AdjacentUses::firstUse(def, nodeTo.(CfgNode).getNode())
SsaImpl::AdjacentUses::firstUse(def, toVariant) and
toVariant.getNode() = nodeTo.(CfgNode).getNode().getNode() and
Cfg::isCanonicalAstNodeRepresentative(nodeTo.(CfgNode).getNode())
)
or
// Next use after use

View File

@@ -29,9 +29,12 @@ overlay[local]
newtype TNode =
/** A node corresponding to a control flow node. */
TCfgNode(Cfg::ControlFlowNode node) {
isExpressionNode(node)
or
node.getNode() instanceof Pattern
(
isExpressionNode(node)
or
node.getNode() instanceof Pattern
) and
Cfg::isCanonicalAstNodeRepresentative(node)
} or
/**
* A node corresponding to a scope entry definition. That is, the value of a variable
@@ -50,36 +53,39 @@ newtype TNode =
// NOTE: since we can't rely on the call graph, but we want to have synthetic
// pre-update nodes for class calls, we end up getting synthetic pre-update nodes for
// ALL calls :|
TSyntheticPreUpdateNode(Cfg::CallNode call) or
TSyntheticPreUpdateNode(Cfg::CallNode call) { Cfg::isCanonicalAstNodeRepresentative(call) } or
/**
* A synthetic node representing the value of an object after a state change.
* See QLDoc for `PostUpdateNode`.
*/
TSyntheticPostUpdateNode(Cfg::ControlFlowNode node) {
exists(Cfg::CallNode call |
node = call.getArg(_)
Cfg::isCanonicalAstNodeRepresentative(node) and
(
exists(Cfg::CallNode call |
node = call.getArg(_)
or
node = call.getArgByName(_)
or
// `self` argument when handling class instance calls (`__call__` special method))
node = call.getFunction()
)
or
node = call.getArgByName(_)
node = any(Cfg::AttrNode a).getObject()
or
// `self` argument when handling class instance calls (`__call__` special method))
node = call.getFunction()
node = any(Cfg::SubscriptNode s).getObject()
or
// self parameter when used implicitly in `super()`
exists(Class cls, Function func, SsaImpl::ParameterDefinition def |
func = cls.getAMethod() and
not isStaticmethod(func) and
// this matches what we do in ExtractedParameterNode
def.getDefiningNode() = node and
def.getParameter() = func.getArg(0)
)
or
// the iterable argument to the implicit comprehension function
node.getNode() = any(Comp c).getIterable()
)
or
node = any(Cfg::AttrNode a).getObject()
or
node = any(Cfg::SubscriptNode s).getObject()
or
// self parameter when used implicitly in `super()`
exists(Class cls, Function func, SsaImpl::ParameterDefinition def |
func = cls.getAMethod() and
not isStaticmethod(func) and
// this matches what we do in ExtractedParameterNode
def.getDefiningNode() = node and
def.getParameter() = func.getArg(0)
)
or
// the iterable argument to the implicit comprehension function
node.getNode() = any(Comp c).getIterable()
} or
/** A node representing a global (module-level) variable in a specific module. */
TModuleVariableNode(Module m, GlobalVariable v) { v.getScope() = m } or
@@ -115,7 +121,9 @@ newtype TNode =
exists(ParameterPosition ppos | ppos.isStarArgs(_) | exists(callable.getParameter(ppos)))
} or
/** A synthetic node to capture keyword arguments that are passed to a `**kwargs` parameter. */
TSynthDictSplatArgumentNode(Cfg::CallNode call) { exists(call.getArgByName(_)) } or
TSynthDictSplatArgumentNode(Cfg::CallNode call) {
exists(call.getArgByName(_)) and Cfg::isCanonicalAstNodeRepresentative(call)
} or
/** A synthetic node to allow flow to keyword parameters from a `**kwargs` argument. */
TSynthDictSplatParameterNode(DataFlowCallable callable) {
exists(ParameterPosition ppos | ppos.isKeyword(_) | exists(callable.getParameter(ppos)))
@@ -132,14 +140,16 @@ newtype TNode =
* by the callable being called.
*/
TSynthCapturedVariablesArgumentNode(Cfg::ControlFlowNode callable) {
callable = any(Cfg::CallNode c).getFunction()
callable = any(Cfg::CallNode c).getFunction() and
Cfg::isCanonicalAstNodeRepresentative(callable)
} or
/**
* A synthetic node representing the values of the variables captured
* by the callable being called, after the output has been computed.
*/
TSynthCapturedVariablesArgumentPostUpdateNode(Cfg::ControlFlowNode callable) {
callable = any(Cfg::CallNode c).getFunction()
callable = any(Cfg::CallNode c).getFunction() and
Cfg::isCanonicalAstNodeRepresentative(callable)
} or
/** A synthetic node representing the values of variables captured by a comprehension. */
TSynthCompCapturedVariablesArgumentNode(Comp comp) {

View File

@@ -11,6 +11,7 @@ signature module UnresolvedCallExpectationsSig {
module DefaultUnresolvedCallExpectations implements UnresolvedCallExpectationsSig {
predicate unresolvedCall(Cfg::CallNode call) {
Cfg::isCanonicalAstNodeRepresentative(call) and
not exists(DataFlowPrivate::DataFlowCall dfc |
exists(dfc.getCallable()) and dfc.getNode() = call
) and