From b55186338e0ff1434559bdcd27056df3870eb675 Mon Sep 17 00:00:00 2001 From: yoff Date: Thu, 28 May 2026 07:39:32 +0000 Subject: [PATCH] Python: canonicalize CFG nodes for dataflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The shared CFG creates multiple ControlFlowNodes per AST node in conditional contexts (e.g. afterTrue/afterFalse for boolean conditions, empty/non-empty for for-loops, matched/unmatched for match cases). These splits matter for control-flow analysis, but for dataflow — where we ask 'what is the value of this expression?' — we need exactly one representative per AST or we double-count calls, arguments, and store steps. This adds Cfg::isCanonicalAstNodeRepresentative as a purely structural pick: for split ASTs it selects the 'positive' outcome variant; for non-split ASTs it selects the unique variant. The picker is implemented via genuine-outcome helpers that work around the shared CFG's cross-kind isAfterValue fallback (ControlFlowGraph.qll:870-892), see the doc on isGenuineAfterTrue for details. The TCfgNode-family newtypes in DataFlowPublic, TNormalCall and TPotentialLibraryCall in DataFlowDispatch, and the SSA-projected use-use/def-use steps in DataFlowPrivate are all routed through the canonical filter. DataFlowConsistency and the test UnresolvedCalls helper qualify their CallNode casts with Cfg:: to keep working. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../DataFlowConsistency.ql | 11 +-- .../2026-05-26-shared-cfg-and-ssa.md | 4 + .../python/controlflow/internal/Cfg.qll | 86 +++++++++++++++++++ .../new/internal/DataFlowDispatch.qll | 5 +- .../dataflow/new/internal/DataFlowPrivate.qll | 32 +++++-- .../dataflow/new/internal/DataFlowPublic.qll | 66 ++++++++------ .../utils/test/dataflow/UnresolvedCalls.qll | 1 + 7 files changed, 165 insertions(+), 40 deletions(-) create mode 100644 python/ql/lib/change-notes/2026-05-26-shared-cfg-and-ssa.md diff --git a/python/ql/consistency-queries/DataFlowConsistency.ql b/python/ql/consistency-queries/DataFlowConsistency.ql index 829aa6debef..2eec17c78fe 100644 --- a/python/ql/consistency-queries/DataFlowConsistency.ql +++ b/python/ql/consistency-queries/DataFlowConsistency.ql @@ -9,6 +9,7 @@ private import semmle.python.dataflow.new.internal.DataFlowImplSpecific private import semmle.python.dataflow.new.internal.DataFlowDispatch private import semmle.python.dataflow.new.internal.TaintTrackingImplSpecific private import codeql.dataflow.internal.DataFlowImplConsistency +private import semmle.python.controlflow.internal.Cfg as Cfg private module Input implements InputSig { private import Private @@ -72,7 +73,7 @@ private module Input implements InputSig { // resolve to multiple functions), but we only make _one_ ArgumentNode for each // argument in the CallNode, we end up violating this consistency check in those // cases. (see `getCallArg` in DataFlowDispatch.qll) - exists(DataFlowCall other, CallNode cfgCall | other != call | + exists(DataFlowCall other, Cfg::CallNode cfgCall | other != call | call.getNode() = cfgCall and other.getNode() = cfgCall and isArgumentNode(arg, call, _) and @@ -88,16 +89,16 @@ private module Input implements InputSig { // allow it instead. ( call.getScope() = attr.getScope() and - any(CfgNode n | n.asCfgNode() = call.getNode().(CallNode).getFunction()).getALocalSource() = - attr + any(CfgNode n | n.asCfgNode() = call.getNode().(Cfg::CallNode).getFunction()) + .getALocalSource() = attr or not exists(call.getScope().(Function).getDefinition()) and call.getScope().getScope+() = attr.getScope() ) and ( other.getScope() = attr.getScope() and - any(CfgNode n | n.asCfgNode() = other.getNode().(CallNode).getFunction()).getALocalSource() = - attr + any(CfgNode n | n.asCfgNode() = other.getNode().(Cfg::CallNode).getFunction()) + .getALocalSource() = attr or not exists(other.getScope().(Function).getDefinition()) and other.getScope().getScope+() = attr.getScope() diff --git a/python/ql/lib/change-notes/2026-05-26-shared-cfg-and-ssa.md b/python/ql/lib/change-notes/2026-05-26-shared-cfg-and-ssa.md new file mode 100644 index 00000000000..39bdadbbaa4 --- /dev/null +++ b/python/ql/lib/change-notes/2026-05-26-shared-cfg-and-ssa.md @@ -0,0 +1,4 @@ +--- +category: minorAnalysis +--- +* The Python dataflow library is now built on the shared CFG and SSA libraries (`shared/controlflow` and `shared/ssa`), bringing Python in line with the other CodeQL languages. The legacy CFG in `semmle/python/Flow.qll` and the legacy ESSA SSA in `semmle/python/essa/*` remain available for downstream queries but are no longer used by the new dataflow library, type tracking, or API graphs. Most queries should be unaffected; a small number may produce slightly different results because of differences in CFG granularity (e.g. separate pre/post nodes per expression) and in how attribute and tuple-unpacking writes are modelled. diff --git a/python/ql/lib/semmle/python/controlflow/internal/Cfg.qll b/python/ql/lib/semmle/python/controlflow/internal/Cfg.qll index ae5062092ba..4d5b7d4d805 100644 --- a/python/ql/lib/semmle/python/controlflow/internal/Cfg.qll +++ b/python/ql/lib/semmle/python/controlflow/internal/Cfg.qll @@ -76,6 +76,92 @@ private predicate isCanonical(CfgImpl::ControlFlowNode n) { n instanceof CfgImpl::ControlFlow::AnnotatedExitNode } +/** + * Holds if `n` is genuinely the `TAfterValueNode` variant for a boolean-true + * outcome of its AST node. + * + * The shared CFG's `isAfterValue` predicate has a kind-mismatch fallback + * (see `ControlFlowGraph.qll`'s `isAfterValue` lines 870-892): when asking + * `isAfterValue(_, BooleanSuccessor true)` on, say, an emptiness-empty + * variant, it falsely returns `true` because the kinds differ and Python + * does not provide `successorValueImplies`. The same fallback also makes + * `TAfterNode` (the unsplit case) satisfy `isAfterValue(_, t)` for *every* + * `t`. + * + * The combination `isAfterTrue ∧ ¬isAfterFalse` excludes both: a genuine + * boolean-true variant satisfies `isAfterTrue` directly (newtype branch 3) + * but not `isAfterFalse` (the dual variant, same kind, no fallback). A + * non-boolean variant satisfies both via the cross-kind fallback. A + * `TAfterNode` satisfies both via branch 1. So `isAfterTrue ∧ ¬isAfterFalse` + * picks exactly the genuine boolean-true variant. + */ +private predicate isGenuineAfterTrue(ControlFlowNode n) { + n.isAfterTrue(_) and not n.isAfterFalse(_) +} + +/** + * Holds if `n` is genuinely the `TAfterValueNode` variant for the "empty" + * outcome of its AST node (e.g. `for x in xs: ...` when `xs` is empty). + * + * See `isGenuineAfterTrue` for why we cannot just use a single + * `isAfterValue` check. + */ +private predicate isGenuineAfterEmpty(ControlFlowNode n) { + exists(EmptinessSuccessor empty | + empty.getValue() = true and n.isAfterValue(n.getAstNode(), empty) + ) and + not exists(EmptinessSuccessor nonEmpty | + nonEmpty.getValue() = false and n.isAfterValue(n.getAstNode(), nonEmpty) + ) +} + +/** + * Holds if `n` is genuinely the `TAfterValueNode` variant for the "matched" + * outcome of its AST node (e.g. a `match` case-pattern that matched). + * + * See `isGenuineAfterTrue` for why we cannot just use a single + * `isAfterValue` check. + */ +private predicate isGenuineAfterMatched(ControlFlowNode n) { + exists(MatchingSuccessor matched | + matched.getValue() = true and n.isAfterValue(n.getAstNode(), matched) + ) and + not exists(MatchingSuccessor unmatched | + unmatched.getValue() = false and n.isAfterValue(n.getAstNode(), unmatched) + ) +} + +/** + * Holds if `n` is the canonical representative of its corresponding AST node + * for dataflow purposes. + * + * The shared CFG associates a single AST node with multiple `ControlFlowNode`s + * when the AST appears in a conditional context (boolean conditions split into + * `afterTrue`/`afterFalse`; for-loop iters split into `[empty]`/`[non-empty]`; + * `match`-case patterns split into `[matched]`/`[unmatched]`). These splits + * matter for control-flow analysis, but for dataflow purposes — where we + * ask "what is the value of this expression?" — a single representative + * suffices and is required to avoid double-counting calls, arguments, store + * steps, etc. + * + * The pick is structural: when an AST has a single `ControlFlowNode` (the + * normal `TAfterNode` or `TBeforeNode`-leaf case), that node is canonical. + * When an AST has a conditional split, the "positive" outcome variant + * (true / empty / matched) is canonical. The three split kinds are mutually + * exclusive per AST, so exactly one variant is selected. + */ +predicate isCanonicalAstNodeRepresentative(ControlFlowNode n) { + // Non-split AST: the unique variant is canonical. + not exists(ControlFlowNode other | other.getNode() = n.getNode() and other != n) + or + // Split AST: pick the "positive" outcome of the split. + isGenuineAfterTrue(n) + or + isGenuineAfterEmpty(n) + or + isGenuineAfterMatched(n) +} + /** * A control flow node. Control flow nodes have a many-to-one relation * with syntactic nodes, although most syntactic nodes have only one diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll index 06cada5a669..872e030e0ec 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll @@ -1444,11 +1444,12 @@ private predicate sameEnclosingCallable(Node node1, Node node2) { // ============================================================================= newtype TDataFlowCall = TNormalCall(Cfg::CallNode call, Function target, CallType type) { - resolveCall(call, target, type) + resolveCall(call, target, type) and + Cfg::isCanonicalAstNodeRepresentative(call) } or /** A call to the generated function inside a comprehension */ TComprehensionCall(Comp c) or - TPotentialLibraryCall(Cfg::CallNode call) or + TPotentialLibraryCall(Cfg::CallNode call) { Cfg::isCanonicalAstNodeRepresentative(call) } or /** A synthesized call inside a summarized callable */ TSummaryCall( FlowSummaryImpl::Public::SummarizedCallable c, FlowSummaryImpl::Private::SummaryNode receiver diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll index 480f29eed39..2ef018a912e 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll @@ -365,7 +365,11 @@ module LocalFlow { exists(Cfg::DefinitionNode def | nodeFrom.(CfgNode).getNode() = def.getValue() and nodeTo.(CfgNode).getNode() = def and - def instanceof Cfg::NameNode + def instanceof Cfg::NameNode and + // Parameter defaults are evaluated in the enclosing scope, while the + // parameter itself lives in the function's scope. The cross-scope + // edge is provided by `runtimeJumpStep` instead. + not exists(Py::Parameter param | def.getNode() = param.asName()) ) or // With definition @@ -410,11 +414,27 @@ module LocalFlow { } predicate useToNextUse(Cfg::NameNode nodeFrom, Cfg::NameNode nodeTo) { - SsaImpl::AdjacentUses::adjacentUseUse(nodeFrom, nodeTo) + // The SSA-level adjacent-use predicate works on specific CFG variants + // (e.g. boolean-outcome `[true]`/`[false]` or emptiness `[empty]`/`[non-empty]` + // splits of the same AST node), but dataflow values are insensitive to + // those splits — there is at most one `CfgNode` per AST. Project both + // ends through `Cfg::isCanonicalAstNodeRepresentative` so all variants + // contribute their use-use edges to the canonical pair. + exists(Cfg::NameNode fromVariant, Cfg::NameNode toVariant | + SsaImpl::AdjacentUses::adjacentUseUse(fromVariant, toVariant) and + fromVariant.getNode() = nodeFrom.getNode() and + toVariant.getNode() = nodeTo.getNode() and + Cfg::isCanonicalAstNodeRepresentative(nodeFrom) and + Cfg::isCanonicalAstNodeRepresentative(nodeTo) + ) } predicate defToFirstUse(SsaImpl::EssaVariable var, Cfg::NameNode nodeTo) { - SsaImpl::AdjacentUses::firstUse(var.getDefinition(), nodeTo) + exists(Cfg::NameNode toVariant | + SsaImpl::AdjacentUses::firstUse(var.getDefinition(), toVariant) and + toVariant.getNode() = nodeTo.getNode() and + Cfg::isCanonicalAstNodeRepresentative(nodeTo) + ) } predicate useUseFlowStep(Node nodeFrom, Node nodeTo) { @@ -423,12 +443,14 @@ module LocalFlow { // `x = f(y)` // nodeFrom is `y` on first line // nodeTo is `y` on second line - exists(SsaImpl::EssaDefinition def | + exists(SsaImpl::EssaDefinition def, Cfg::NameNode toVariant | nodeFrom.(CfgNode).getNode() = def.(SsaImpl::EssaNodeDefinition).getDefiningNode() or nodeFrom.(ScopeEntryDefinitionNode).getDefinition() = def | - SsaImpl::AdjacentUses::firstUse(def, nodeTo.(CfgNode).getNode()) + SsaImpl::AdjacentUses::firstUse(def, toVariant) and + toVariant.getNode() = nodeTo.(CfgNode).getNode().getNode() and + Cfg::isCanonicalAstNodeRepresentative(nodeTo.(CfgNode).getNode()) ) or // Next use after use diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll index 0a59b66f07f..4724a8e5912 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll @@ -29,9 +29,12 @@ overlay[local] newtype TNode = /** A node corresponding to a control flow node. */ TCfgNode(Cfg::ControlFlowNode node) { - isExpressionNode(node) - or - node.getNode() instanceof Pattern + ( + isExpressionNode(node) + or + node.getNode() instanceof Pattern + ) and + Cfg::isCanonicalAstNodeRepresentative(node) } or /** * A node corresponding to a scope entry definition. That is, the value of a variable @@ -50,36 +53,39 @@ newtype TNode = // NOTE: since we can't rely on the call graph, but we want to have synthetic // pre-update nodes for class calls, we end up getting synthetic pre-update nodes for // ALL calls :| - TSyntheticPreUpdateNode(Cfg::CallNode call) or + TSyntheticPreUpdateNode(Cfg::CallNode call) { Cfg::isCanonicalAstNodeRepresentative(call) } or /** * A synthetic node representing the value of an object after a state change. * See QLDoc for `PostUpdateNode`. */ TSyntheticPostUpdateNode(Cfg::ControlFlowNode node) { - exists(Cfg::CallNode call | - node = call.getArg(_) + Cfg::isCanonicalAstNodeRepresentative(node) and + ( + exists(Cfg::CallNode call | + node = call.getArg(_) + or + node = call.getArgByName(_) + or + // `self` argument when handling class instance calls (`__call__` special method)) + node = call.getFunction() + ) or - node = call.getArgByName(_) + node = any(Cfg::AttrNode a).getObject() or - // `self` argument when handling class instance calls (`__call__` special method)) - node = call.getFunction() + node = any(Cfg::SubscriptNode s).getObject() + or + // self parameter when used implicitly in `super()` + exists(Class cls, Function func, SsaImpl::ParameterDefinition def | + func = cls.getAMethod() and + not isStaticmethod(func) and + // this matches what we do in ExtractedParameterNode + def.getDefiningNode() = node and + def.getParameter() = func.getArg(0) + ) + or + // the iterable argument to the implicit comprehension function + node.getNode() = any(Comp c).getIterable() ) - or - node = any(Cfg::AttrNode a).getObject() - or - node = any(Cfg::SubscriptNode s).getObject() - or - // self parameter when used implicitly in `super()` - exists(Class cls, Function func, SsaImpl::ParameterDefinition def | - func = cls.getAMethod() and - not isStaticmethod(func) and - // this matches what we do in ExtractedParameterNode - def.getDefiningNode() = node and - def.getParameter() = func.getArg(0) - ) - or - // the iterable argument to the implicit comprehension function - node.getNode() = any(Comp c).getIterable() } or /** A node representing a global (module-level) variable in a specific module. */ TModuleVariableNode(Module m, GlobalVariable v) { v.getScope() = m } or @@ -115,7 +121,9 @@ newtype TNode = exists(ParameterPosition ppos | ppos.isStarArgs(_) | exists(callable.getParameter(ppos))) } or /** A synthetic node to capture keyword arguments that are passed to a `**kwargs` parameter. */ - TSynthDictSplatArgumentNode(Cfg::CallNode call) { exists(call.getArgByName(_)) } or + TSynthDictSplatArgumentNode(Cfg::CallNode call) { + exists(call.getArgByName(_)) and Cfg::isCanonicalAstNodeRepresentative(call) + } or /** A synthetic node to allow flow to keyword parameters from a `**kwargs` argument. */ TSynthDictSplatParameterNode(DataFlowCallable callable) { exists(ParameterPosition ppos | ppos.isKeyword(_) | exists(callable.getParameter(ppos))) @@ -132,14 +140,16 @@ newtype TNode = * by the callable being called. */ TSynthCapturedVariablesArgumentNode(Cfg::ControlFlowNode callable) { - callable = any(Cfg::CallNode c).getFunction() + callable = any(Cfg::CallNode c).getFunction() and + Cfg::isCanonicalAstNodeRepresentative(callable) } or /** * A synthetic node representing the values of the variables captured * by the callable being called, after the output has been computed. */ TSynthCapturedVariablesArgumentPostUpdateNode(Cfg::ControlFlowNode callable) { - callable = any(Cfg::CallNode c).getFunction() + callable = any(Cfg::CallNode c).getFunction() and + Cfg::isCanonicalAstNodeRepresentative(callable) } or /** A synthetic node representing the values of variables captured by a comprehension. */ TSynthCompCapturedVariablesArgumentNode(Comp comp) { diff --git a/python/ql/lib/utils/test/dataflow/UnresolvedCalls.qll b/python/ql/lib/utils/test/dataflow/UnresolvedCalls.qll index d48f9844938..b6067257f97 100644 --- a/python/ql/lib/utils/test/dataflow/UnresolvedCalls.qll +++ b/python/ql/lib/utils/test/dataflow/UnresolvedCalls.qll @@ -11,6 +11,7 @@ signature module UnresolvedCallExpectationsSig { module DefaultUnresolvedCallExpectations implements UnresolvedCallExpectationsSig { predicate unresolvedCall(Cfg::CallNode call) { + Cfg::isCanonicalAstNodeRepresentative(call) and not exists(DataFlowPrivate::DataFlowCall dfc | exists(dfc.getCallable()) and dfc.getNode() = call ) and