Python: add new shared-SSA-backed SSA adapter

Preparatory refactor for the shared-CFG dataflow migration. Adds the
new Python SSA adapter additively, without changing any production
behaviour.

Library additions:

- semmle.python.dataflow.new.internal.SsaImpl — Python SSA
  implementation built on the new (shared) CFG. Mirrors the Java SSA
  adapter (java/ql/lib/semmle/code/java/dataflow/internal/SsaImpl.qll):
  an InputSig is defined in terms of positional (BasicBlock, int)
  variable references, and the shared
  codeql.ssa.Ssa::Make<Location, Cfg, Input> module is then
  instantiated.

  SourceVariable is the AST-level Py::Variable. Variable references
  are looked up via the new CFG facade's NameNode.defines/uses/deletes
  predicates (added in the preceding PR), which themselves are
  one-line bridges to AST-level Name.defines/uses/deletes.

  Implicit-entry definitions are inserted for non-local/global/builtin
  reads, captured variables, and (when needed) parameters.

Test additions:

- library-tests/dataflow-new-ssa/ — exercises the new SSA over a
  representative test corpus and checks expected def/use chains.

- library-tests/dataflow-new-ssa-vs-legacy/ — runs both new SSA and
  legacy ESSA over the same corpus and diffs the results, so any
  semantic divergence shows up as a test failure.

Production impact:

None. The new SSA adapter has zero callers in lib/ and src/ — the
legacy ESSA SSA (semmle/python/essa/*) remains the default. The
dataflow library is not migrated yet; that lands in a follow-up PR.

Verified by:
- All 367 lib + src + consistency-queries compile clean.
- All 641 ControlFlow + PointsTo + dataflow + essa + consistency
  library-tests pass.
- Both new dataflow-new-ssa[/vs-legacy] test packs pass.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Copilot
2026-06-01 12:04:10 +00:00
committed by yoff
parent 4aee0b3c87
commit b2ff09f70a
8 changed files with 774 additions and 0 deletions

View File

@@ -0,0 +1,6 @@
| def-only-old | $:0:0 |
| def-only-old | __name__:0:0 |
| def-only-old | __package__:0:0 |
| def-only-old | e:37:1 |
| def-only-old | e:40:25 |
| def-only-old | x:20:1 |

View File

@@ -0,0 +1,59 @@
/**
* Compares the new-CFG SSA against the legacy ESSA on the same Python
* sources. Reports definitions present in one implementation but not
* the other, identified by variable name + source position.
*
* The `.expected` file records the current diff as a snapshot: as the
* new SSA matures (closing captured-variable gap, exception bindings,
* etc.) and tracks more variables, the snapshot should monotonically
* shrink.
*
* Known categories of `def-only-old` mismatches:
* - Function / class / global definitions with no in-scope read
* (intentional: SSA is liveness-pruned, write-only variables are
* not tracked).
* - Captured / closure variables (gap: new SSA does not yet model
* closure captures).
* - Module variables `__name__`, `__package__`, `$` (legacy ESSA
* adds implicit bindings the new SSA does not).
* - Exception-handler `as` bindings (depend on raise modelling).
*
* `def-only-new` mismatches would indicate the new SSA produces spurious
* definitions; currently none are expected.
*/
import python
import semmle.python.dataflow.new.internal.SsaImpl as NewSsa
import semmle.python.controlflow.internal.Cfg as Cfg
import semmle.python.essa.Essa
string newDefSig(NewSsa::EssaNodeDefinition def) {
exists(Cfg::ControlFlowNode n | n = def.getDefiningNode() |
result =
def.getVariable().getVariable().getId() + ":" + n.getLocation().getStartLine() + ":" +
n.getLocation().getStartColumn()
)
}
string legacyDefSig(EssaNodeDefinition def) {
exists(ControlFlowNode n | n = def.getDefiningNode() |
result =
def.getSourceVariable().getName() + ":" + n.getLocation().getStartLine() + ":" +
n.getLocation().getStartColumn()
)
}
from string kind, string sig
where
kind = "def-only-new" and
exists(NewSsa::EssaNodeDefinition def |
sig = newDefSig(def) and
not exists(EssaNodeDefinition legacyDef | sig = legacyDefSig(legacyDef))
)
or
kind = "def-only-old" and
exists(EssaNodeDefinition legacyDef |
sig = legacyDefSig(legacyDef) and
not exists(NewSsa::EssaNodeDefinition def | sig = newDefSig(def))
)
select kind, sig

View File

@@ -0,0 +1,53 @@
def simple_assign():
x = 1
return x
def reassignment():
x = 1
x = 2
return x
def if_else_branch(cond):
if cond:
x = 1
else:
x = 2
return x
def loop(xs):
total = 0
for x in xs:
total = total + x
return total
def parameter(a, b=2, *args, **kwargs):
return a + b + sum(args)
def closure(x):
def inner():
return x
return inner
def exception_binding():
try:
compute()
except Exception as e:
return e
def with_binding():
with open("file") as f:
return f.read()
GLOBAL = 1
def read_global():
return GLOBAL

View File

@@ -0,0 +1,6 @@
| test.py:14:5:14:15 | basic_param | Unexpected result: def=basic_param |
| test.py:18:5:18:16 | basic_assign | Unexpected result: def=basic_assign |
| test.py:23:5:23:16 | reassignment | Unexpected result: def=reassignment |
| test.py:29:5:29:15 | if_else_phi | Unexpected result: def=if_else_phi |
| test.py:37:5:37:14 | use_global | Unexpected result: def=use_global |
| test.py:38:28:38:49 | Comment # $ use=some_undefined | Missing result: use=some_undefined |

View File

@@ -0,0 +1,59 @@
/**
* Inline-expectations test for the new-CFG SSA adapter
* (`semmle.python.dataflow.new.internal.SsaImpl`).
*
* Tags:
* - `def=<var>`: there is an SSA write definition of `<var>` at this
* line (parameter init, plain assignment, augmented assignment,
* exception-handler binding, deletion, etc.).
* - `use=<var>`: `<var>` is used at this line, and some SSA definition
* of `<var>` reaches the read.
* - `phi=<var>`: there is an SSA phi definition of `<var>` whose BB
* starts on this line.
*/
import python
import semmle.python.dataflow.new.internal.SsaImpl as SsaImpl
import semmle.python.controlflow.internal.AstNodeImpl as CfgImpl
import semmle.python.controlflow.internal.Cfg as Cfg
import utils.test.InlineExpectationsTest
module SsaTest implements TestSig {
string getARelevantTag() { result = ["def", "use", "phi"] }
predicate hasActualResult(Location location, string element, string tag, string value) {
// A `def=<id>` fires when an SSA WriteDefinition is at a CFG node
// on the given line.
exists(SsaImpl::Ssa::WriteDefinition def, CfgImpl::BasicBlock bb, int i, Cfg::NameNode n |
def.definesAt(_, bb, i) and
bb.getNode(i) = n and
tag = "def" and
location = n.getLocation() and
element = n.toString() and
value = n.getId()
)
or
// A `use=<id>` fires when an SSA Definition reaches a read at this
// CFG node.
exists(SsaImpl::Ssa::Definition def, CfgImpl::BasicBlock bb, int i, Cfg::NameNode n |
SsaImpl::Ssa::ssaDefReachesRead(_, def, bb, i) and
bb.getNode(i) = n and
tag = "use" and
location = n.getLocation() and
element = n.toString() and
value = n.getId()
)
or
// A `phi=<id>` fires when there is a phi node whose BB's first
// CFG node is on the given line.
exists(SsaImpl::Ssa::PhiNode phi, CfgImpl::BasicBlock bb |
phi.definesAt(_, bb, _) and
tag = "phi" and
location = bb.getNode(0).getLocation() and
element = bb.toString() and
value = phi.getSourceVariable().(SsaImpl::SsaSourceVariable).getVariable().getId()
)
}
}
import MakeTest<SsaTest>

View File

@@ -0,0 +1,40 @@
# Basic SSA tests for the new-CFG SSA adapter.
#
# The shared SSA implementation prunes its construction by liveness:
# definitions of variables that are not read are never materialised.
# This is by design — write-only variables would only bloat the SSA
# graph. Tests therefore must always include a read of each variable
# being verified.
#
# Annotations:
# def=<var>: there is an SSA write definition of <var> at this line
# use=<var>: <var> is used here and the read resolves to some def
def basic_param(x): # $ def=x
return x # $ use=x
def basic_assign():
y = 1 # $ def=y
return y # $ use=y
def reassignment():
x = 1
x = 2 # $ def=x
return x # $ use=x
def if_else_phi(cond): # $ def=cond
if cond: # $ use=cond phi=x
x = 1 # $ def=x
else:
x = 2 # $ def=x
return x # $ use=x
def use_global():
return some_undefined # $ use=some_undefined