From 79db96c717d4f98aab293310e45743df237be460 Mon Sep 17 00:00:00 2001 From: yoff Date: Mon, 18 May 2026 11:03:45 +0000 Subject: [PATCH] Python: introduce shared-SSA adapter on the new CFG Adds 'python/ql/lib/semmle/python/dataflow/new/internal/SsaImpl.qll', a minimal Python SSA implementation built on the shared SSA library ('codeql.ssa.Ssa::Make'). The structure mirrors Java's adapter at 'java/ql/lib/semmle/code/java/dataflow/internal/SsaImpl.qll'. Key design choices: * 'SourceVariable' wraps 'Py::Variable'. Only variables that are read or deleted somewhere are tracked - write-only variables don't benefit from SSA construction. * Variable references are positional ('BasicBlock', 'int') pairs looked up via 'Cfg::NameNode.defines'/'.uses'/'.deletes' (which themselves are one-line bridges to AST-level 'Name.defines' etc.). * Parameter writes are not synthesised: parameter Name nodes are already wired into the CFG (per the earlier C#-style parameter extension in 'AstNodeImpl.qll'), so the regular 'variableWrite' path handles them at their natural CFG index. * Non-local / captured / global / builtin variables read in a scope but not written in it receive a synthetic entry definition at index '-1' of the scope's entry basic block. This matches Java's 'hasEntryDef'. * 'del x' is modelled as a certain write at the deletion site. Includes an inline-expectations test under 'python/ql/test/library-tests/dataflow-new-ssa/' covering: plain parameter pass-through, simple assignment + read, reassignment with dead-write pruning, if/else with phi insertion at the join, and an undefined-name read (currently a known limitation - no SSA flow without an enclosing definition). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../python/dataflow/new/internal/SsaImpl.qll | 179 ++++++++++++++++++ .../dataflow-new-ssa/SsaTest.expected | 0 .../library-tests/dataflow-new-ssa/SsaTest.ql | 59 ++++++ .../library-tests/dataflow-new-ssa/test.py | 40 ++++ 4 files changed, 278 insertions(+) create mode 100644 python/ql/lib/semmle/python/dataflow/new/internal/SsaImpl.qll create mode 100644 python/ql/test/library-tests/dataflow-new-ssa/SsaTest.expected create mode 100644 python/ql/test/library-tests/dataflow-new-ssa/SsaTest.ql create mode 100644 python/ql/test/library-tests/dataflow-new-ssa/test.py diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/SsaImpl.qll b/python/ql/lib/semmle/python/dataflow/new/internal/SsaImpl.qll new file mode 100644 index 00000000000..11e7b9f4d3d --- /dev/null +++ b/python/ql/lib/semmle/python/dataflow/new/internal/SsaImpl.qll @@ -0,0 +1,179 @@ +/** + * Provides the Python SSA implementation built on the new (shared) CFG. + * + * Mirrors the Java SSA adapter at + * `java/ql/lib/semmle/code/java/dataflow/internal/SsaImpl.qll`: + * an `InputSig` is defined in terms of positional `(BasicBlock, int)` + * variable references, and the shared + * `codeql.ssa.Ssa::Make` module is then + * instantiated. + * + * `SourceVariable` is the AST-level `Py::Variable`. Variable references + * are looked up via the CFG facade's `NameNode.defines`/`uses`/`deletes` + * predicates, which themselves are one-line bridges to AST-level + * `Name.defines`/`uses`/`deletes`. + * + * Implicit-entry definitions are inserted for: + * - non-local / global / builtin variables that are read in the scope + * but never assigned (no enclosing CFG node defines them), + * - captured variables (variables defined in an enclosing scope that + * are read inside the scope), and + * - parameters, but only if the corresponding parameter name is *not* + * itself a CFG node. With the C#-style parameter wiring already + * installed in `AstNodeImpl.qll`, parameter names *are* CFG nodes, + * so the regular `variableWrite` path handles them — no `i = -1` + * entry is needed for ordinary parameters. + */ +overlay[local?] +module; + +private import python as Py +private import semmle.python.controlflow.internal.AstNodeImpl as CfgImpl +private import semmle.python.controlflow.internal.Cfg as Cfg +private import codeql.ssa.Ssa as SsaImplCommon +private import codeql.controlflow.BasicBlock as BB + +/** + * Adapts the Python `Cfg` facade to the shared SSA library's `CfgSig`. + * All members are inherited from `Cfg::ControlFlowNode` and + * `Cfg::BasicBlock`. + */ +private module CfgForSsa implements BB::CfgSig { + class ControlFlowNode = CfgImpl::ControlFlowNode; + + class BasicBlock = CfgImpl::BasicBlock; + + class EntryBasicBlock = CfgImpl::Cfg::EntryBasicBlock; + + predicate dominatingEdge = CfgImpl::Cfg::dominatingEdge/2; +} + +/** + * A source variable for SSA. Wraps a Python `Variable` (the AST-level + * notion of a named binding within a scope) so that the shared SSA + * implementation can use it as a `SourceVariable`. + * + * We only track variables that are read at least once in their scope — + * tracking write-only variables is unnecessary work. + */ +private newtype TSsaSourceVariable = + TPyVar(Py::Variable v) { + // Has a use somewhere — read-relevant for SSA. + exists(Cfg::NameNode n | n.uses(v)) + or + // Or has a deletion (treated as a write that destroys the value). + exists(Cfg::NameNode n | n.deletes(v)) + } + +/** + * A source variable for SSA, wrapping a Python AST `Variable`. + */ +class SsaSourceVariable extends TSsaSourceVariable { + /** Gets the underlying Python AST variable. */ + Py::Variable getVariable() { this = TPyVar(result) } + + /** Gets a textual representation of this source variable. */ + string toString() { result = this.getVariable().toString() } + + /** Gets the location of this source variable. */ + Py::Location getLocation() { result = this.getVariable().getScope().getLocation() } +} + +/** + * Holds if `v` is a non-local read in scope `s`, in the sense that `s` + * uses `v` but does not write it within `s`. This includes globals, + * builtins, and variables captured from an enclosing function scope. + */ +private predicate nonLocalReadIn(Py::Variable v, Py::Scope s) { + exists(Cfg::NameNode n | + n.uses(v) and + n.getScope() = s and + not exists(Cfg::NameNode def | def.defines(v) and def.getScope() = s) + ) +} + +/** + * Holds if `v` should have an implicit entry definition at the start of + * scope `s`. This covers: + * - non-local / global / builtin variables (defined outside `s`), and + * - captured variables (defined in an enclosing scope but read here). + * + * Parameters are *not* included: their bound `Name` is itself a CFG + * node (per the C#-style parameter wiring), so `variableWrite` fires at + * the parameter's natural CFG index. + */ +private predicate hasEntryDef(SsaSourceVariable v, Py::Scope s) { + nonLocalReadIn(v.getVariable(), s) +} + +/** + * Gets the entry basic block of scope `s`, where implicit entry + * definitions are placed (at synthetic index `-1`). + */ +private CfgImpl::BasicBlock entryBlock(Py::Scope s) { + exists(CfgImpl::ControlFlowNode entry | + entry instanceof CfgImpl::ControlFlow::EntryNode and + entry.getEnclosingCallable().asScope() = s and + result = entry.getBasicBlock() + ) +} + +/** + * The SSA `InputSig` for Python. References are positional + * `(BasicBlock, int)` pairs into the new CFG. + */ +private module SsaImplInput implements SsaImplCommon::InputSig { + class SourceVariable = SsaSourceVariable; + + predicate variableWrite(CfgImpl::BasicBlock bb, int i, SourceVariable v, boolean certain) { + // Explicit binding at a CFG node — includes assignments, + // parameter Names (wired in via the C# pattern), exception-handler + // `as`-bindings, import aliases, and match-pattern captures. + exists(Cfg::NameNode n | + bb.getNode(i) = n and + n.defines(v.getVariable()) and + certain = true + ) + or + // `del x` — removes the binding. Modelled as a certain write that + // makes any subsequent read invalid. + exists(Cfg::NameNode n | + bb.getNode(i) = n and + n.deletes(v.getVariable()) and + certain = true + ) + or + // Implicit entry definition for non-local / captured / global / + // builtin variables read in the scope. + bb = entryBlock(v.getVariable().getScope()) and + hasEntryDef(v, v.getVariable().getScope()) and + i = -1 and + certain = true + } + + predicate variableRead(CfgImpl::BasicBlock bb, int i, SourceVariable v, boolean certain) { + exists(Cfg::NameNode n | + bb.getNode(i) = n and + n.uses(v.getVariable()) and + certain = true + ) + } +} + +/** + * The shared SSA instantiation for Python. + * + * Members: + * - `Definition` — the union of explicit, uncertain, and phi definitions + * - `WriteDefinition`, `UncertainWriteDefinition`, `PhiNode` + * - the standard SSA predicates (`getAUse`, `getAnUltimateDefinition`, ...). + */ +module Ssa = SsaImplCommon::Make; + +final class Definition = Ssa::Definition; + +final class WriteDefinition = Ssa::WriteDefinition; + +final class UncertainWriteDefinition = Ssa::UncertainWriteDefinition; + +final class PhiNode = Ssa::PhiNode; diff --git a/python/ql/test/library-tests/dataflow-new-ssa/SsaTest.expected b/python/ql/test/library-tests/dataflow-new-ssa/SsaTest.expected new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/ql/test/library-tests/dataflow-new-ssa/SsaTest.ql b/python/ql/test/library-tests/dataflow-new-ssa/SsaTest.ql new file mode 100644 index 00000000000..0bebf4a637d --- /dev/null +++ b/python/ql/test/library-tests/dataflow-new-ssa/SsaTest.ql @@ -0,0 +1,59 @@ +/** + * Inline-expectations test for the new-CFG SSA adapter + * (`semmle.python.dataflow.new.internal.SsaImpl`). + * + * Tags: + * - `def=`: there is an SSA write definition of `` at this + * line (parameter init, plain assignment, augmented assignment, + * exception-handler binding, deletion, etc.). + * - `use=`: `` is used at this line, and some SSA definition + * of `` reaches the read. + * - `phi=`: there is an SSA phi definition of `` whose BB + * starts on this line. + */ + +import python +import semmle.python.dataflow.new.internal.SsaImpl as SsaImpl +import semmle.python.controlflow.internal.AstNodeImpl as CfgImpl +import semmle.python.controlflow.internal.Cfg as Cfg +import utils.test.InlineExpectationsTest + +module SsaTest implements TestSig { + string getARelevantTag() { result = ["def", "use", "phi"] } + + predicate hasActualResult(Location location, string element, string tag, string value) { + // A `def=` fires when an SSA WriteDefinition is at a CFG node + // on the given line. + exists(SsaImpl::Ssa::WriteDefinition def, CfgImpl::BasicBlock bb, int i, Cfg::NameNode n | + def.definesAt(_, bb, i) and + bb.getNode(i) = n and + tag = "def" and + location = n.getLocation() and + element = n.toString() and + value = n.getId() + ) + or + // A `use=` fires when an SSA Definition reaches a read at this + // CFG node. + exists(SsaImpl::Ssa::Definition def, CfgImpl::BasicBlock bb, int i, Cfg::NameNode n | + SsaImpl::Ssa::ssaDefReachesRead(_, def, bb, i) and + bb.getNode(i) = n and + tag = "use" and + location = n.getLocation() and + element = n.toString() and + value = n.getId() + ) + or + // A `phi=` fires when there is a phi node whose BB's first + // CFG node is on the given line. + exists(SsaImpl::Ssa::PhiNode phi, CfgImpl::BasicBlock bb | + phi.definesAt(_, bb, _) and + tag = "phi" and + location = bb.getNode(0).getLocation() and + element = bb.toString() and + value = phi.getSourceVariable().(SsaImpl::SsaSourceVariable).getVariable().getId() + ) + } +} + +import MakeTest diff --git a/python/ql/test/library-tests/dataflow-new-ssa/test.py b/python/ql/test/library-tests/dataflow-new-ssa/test.py new file mode 100644 index 00000000000..fb1f658c4fb --- /dev/null +++ b/python/ql/test/library-tests/dataflow-new-ssa/test.py @@ -0,0 +1,40 @@ +# Basic SSA tests for the new-CFG SSA adapter. +# +# The shared SSA implementation prunes its construction by liveness: +# definitions of variables that are not read are never materialised. +# This is by design — write-only variables would only bloat the SSA +# graph. Tests therefore must always include a read of each variable +# being verified. +# +# Annotations: +# def=: there is an SSA write definition of at this line +# use=: is used here and the read resolves to some def + + +def basic_param(x): # $ def=x + return x # $ use=x + + +def basic_assign(): + y = 1 # $ def=y + return y # $ use=y + + +def reassignment(): + x = 1 + x = 2 # $ def=x + return x # $ use=x + + +def if_else_phi(cond): # $ def=cond + if cond: # $ use=cond phi=x + x = 1 # $ def=x + else: + x = 2 # $ def=x + return x # $ use=x + + +def use_global(): + return some_undefined # known limitation: undefined globals not resolved here + +