Compare commits

..

3 Commits

Author SHA1 Message Date
Taus
cdd557f877 Python: hotfix - disable instanceFieldStep to avoid type-tracker blowup
The `instanceFieldStep` disjunct of `TypeTrackingInput::levelStepCall`
that was added in 7.2.0 uses `classInstanceTracker(cls)` -- which is
itself a type-tracker -- inside `levelStepCall`. That creates a
structural mutual recursion between the main type-tracker fixpoint and
`classInstanceTracker`, causing the type-tracker delta to blow up to
~100M tuples per iteration on some OOP-heavy Python codebases.
Verified on the python/mypy database: SSRF query wall time goes from
~12s before the offending commit to >40 minutes after it.

This hotfix temporarily drops the `instanceFieldStep` disjunct and
keeps only `inheritedFieldStep`, which does not pull on the call
graph and is well-behaved (verified at ~12s on mypy). The
`instanceFieldStep` helper predicate itself is kept in place, and
the `levelStepCall` body still contains the call to it, disabled with
a trailing `and none()`, so the change is trivial to re-enable once
the recursion issue is properly addressed.
2026-07-01 14:31:00 +01:00
Tom Hvitved
2bf6031c0f Python: Update inline test expectations 2026-07-01 13:10:41 +02:00
Tom Hvitved
a5444b573a Python: Improve some flow summaries 2026-07-01 12:05:53 +02:00
73 changed files with 312 additions and 4096 deletions

View File

@@ -1,2 +0,0 @@
import semmle.python.controlflow.internal.AstNodeImpl
import ControlFlow::Consistency

View File

@@ -1,4 +0,0 @@
---
category: minorAnalysis
---
* A new Python control flow graph implementation has been added under `semmle.python.controlflow.internal.Cfg` (backed by `AstNodeImpl.qll`), built on the shared `codeql.controlflow.ControlFlowGraph` library. It is not yet used by the dataflow library or any production query; the legacy CFG in `semmle/python/Flow.qll` remains the default. The new library is exposed for tests and for upcoming migrations.

View File

@@ -1,4 +0,0 @@
---
category: minorAnalysis
---
* The new (shared-CFG-based) Python control flow graph now visits parameter and return type annotations as CFG nodes for function definitions, matching the legacy CFG. This restores annotation-based type tracking through framework models such as FastAPI's `Depends()`, Pydantic request models, Starlette `WebSocket` handlers, and any other models that flow a class reference through `Parameter.getAnnotation()` to identify instances of the annotated class.

View File

@@ -0,0 +1,5 @@
---
category: minorAnalysis
---
* Temporarily disabled the `instanceFieldStep` disjunct of the internal `TypeTrackingInput::levelStepCall` predicate, which was introduced in 7.2.0 and caused catastrophic query slowdowns on some OOP-heavy Python codebases (e.g. `mypy` and `dask`).

View File

@@ -1,42 +0,0 @@
/**
* @name Print CFG
* @description Produces a representation of a file's Control Flow Graph.
* This query is used by the VS Code extension.
* @id py/print-cfg
* @kind graph
* @tags ide-contextual-queries/print-cfg
*/
import semmle.python.Files as Files
// import semmle.python.Scope
import semmle.python.controlflow.internal.AstNodeImpl
/** Gets the selected source file; declared `external`, so the value is supplied from outside the query. */
external string selectedSourceFile();
// Alias for the external predicate above. `ViewCfgQueryInput` declares a member
// that is also named `selectedSourceFile`, so the module binds that member to
// this alias (presumably to avoid the name clash -- confirm).
private predicate selectedSourceFileAlias = selectedSourceFile/0;
/** Gets the selected source line; declared `external`, so the value is supplied from outside the query. */
external int selectedSourceLine();
// Alias used by the `ViewCfgQueryInput` member of the same name.
private predicate selectedSourceLineAlias = selectedSourceLine/0;
/** Gets the selected source column; declared `external`, so the value is supplied from outside the query. */
external int selectedSourceColumn();
// Alias used by the `ViewCfgQueryInput` member of the same name.
private predicate selectedSourceColumnAlias = selectedSourceColumn/0;
/**
 * Input for `ControlFlow::ViewCfgQuery`, instantiated with `Files::File` as
 * the file type: the externally supplied selection plus the source span of
 * each callable.
 */
module ViewCfgQueryInput implements ControlFlow::ViewCfgQueryInputSig<Files::File> {
// The three selection predicates are bound to the file-level aliases of the
// `external` predicates with the same names.
predicate selectedSourceFile = selectedSourceFileAlias/0;
predicate selectedSourceLine = selectedSourceLineAlias/0;
predicate selectedSourceColumn = selectedSourceColumnAlias/0;
/**
 * Holds if `scope` is located in `file` and its location spans from
 * `startLine`:`startColumn` to `endLine`:`endColumn`.
 */
predicate cfgScopeSpan(
Ast::Callable scope, Files::File file, int startLine, int startColumn, int endLine,
int endColumn
) {
file = scope.getLocation().getFile() and
// The first argument of `hasLocationInfo` is ignored here; the file is
// taken from `getFile()` on the line above instead.
scope.getLocation().hasLocationInfo(_, startLine, startColumn, endLine, endColumn)
}
}
import ControlFlow::ViewCfgQuery<Files::File, ViewCfgQueryInput>

File diff suppressed because it is too large Load Diff

View File

@@ -1138,7 +1138,9 @@ predicate clearsContent(Node n, ContentSet cs) {
* Holds if the value that is being tracked is expected to be stored inside content `c`
* at node `n`.
*/
predicate expectsContent(Node n, ContentSet c) { none() }
predicate expectsContent(Node n, ContentSet c) {
FlowSummaryImpl::Private::Steps::summaryExpectsContent(n.(FlowSummaryNode).getSummaryNode(), c)
}
/**
* Holds if values stored inside attribute `c` are cleared at node `n`.

View File

@@ -91,6 +91,8 @@ module Input implements InputSig<Location, DataFlowImplSpecific::PythonDataFlow>
cs.isAnyTupleOrDictionaryElement() and result = "AnyTupleOrDictionaryElement" and arg = ""
}
string encodeWithContent(ContentSet c, string arg) { result = "With" + encodeContent(c, arg) }
bindingset[token]
ParameterPosition decodeUnknownParameterPosition(AccessPath::AccessPathTokenBase token) {
// needed to support `Argument[x..y]` ranges

View File

@@ -170,7 +170,13 @@ module TypeTrackingInput implements Shared::TypeTrackingInput<Location> {
/** Holds if there is a level step from `nodeFrom` to `nodeTo`, which may depend on the call graph. */
predicate levelStepCall(Node nodeFrom, LocalSourceNode nodeTo) {
instanceFieldStep(nodeFrom, nodeTo)
// HOTFIX: `instanceFieldStep` is temporarily disabled (via `and none()`).
// It uses `classInstanceTracker(cls)` -- itself a type-tracker run --
// from inside `levelStepCall`, creating a structural mutual recursion
// that causes catastrophic query slowdowns on some OOP-heavy Python
// codebases (e.g. mypy and dask). The `and none()` should be removed
// once that recursion is redesigned.
instanceFieldStep(nodeFrom, nodeTo) and none()
or
inheritedFieldStep(nodeFrom, nodeTo)
}

View File

@@ -4199,11 +4199,9 @@ module StdlibPrivate {
// The positional argument contains a mapping.
// TODO: these values can be overwritten by keyword arguments
// - dict mapping
exists(DataFlow::DictionaryElementContent dc, string key | key = dc.getKey() |
input = "Argument[0].DictionaryElement[" + key + "]" and
output = "ReturnValue.DictionaryElement[" + key + "]" and
preservesValue = true
)
input = "Argument[0].WithAnyDictionaryElement" and
output = "ReturnValue" and
preservesValue = true
or
// - list-of-pairs mapping
input = "Argument[0].ListElement.TupleElement[1]" and
@@ -4240,9 +4238,7 @@ module StdlibPrivate {
or
input = "Argument[0].SetElement"
or
exists(DataFlow::TupleElementContent tc, int i | i = tc.getIndex() |
input = "Argument[0].TupleElement[" + i.toString() + "]"
)
input = "Argument[0].AnyTupleElement"
// TODO: Once we have DictKeyContent, we need to transform that into ListElementContent
) and
// Element content is mutated into list element content
@@ -4266,11 +4262,9 @@ module StdlibPrivate {
}
override predicate propagatesFlow(string input, string output, boolean preservesValue) {
exists(DataFlow::TupleElementContent tc, int i | i = tc.getIndex() |
input = "Argument[0].TupleElement[" + i.toString() + "]" and
output = "ReturnValue.TupleElement[" + i.toString() + "]" and
preservesValue = true
)
input = "Argument[0].WithAnyTupleElement" and
output = "ReturnValue" and
preservesValue = true
or
input = "Argument[0].ListElement" and
output = "ReturnValue" and
@@ -4294,9 +4288,7 @@ module StdlibPrivate {
or
input = "Argument[0].SetElement"
or
exists(DataFlow::TupleElementContent tc, int i | i = tc.getIndex() |
input = "Argument[0].TupleElement[" + i.toString() + "]"
)
input = "Argument[0].AnyTupleElement"
// TODO: Once we have DictKeyContent, we need to transform that into ListElementContent
) and
output = "ReturnValue.SetElement" and
@@ -4342,9 +4334,7 @@ module StdlibPrivate {
or
input = "Argument[0].SetElement"
or
exists(DataFlow::TupleElementContent tc, int i | i = tc.getIndex() |
input = "Argument[0].TupleElement[" + i.toString() + "]"
)
input = "Argument[0].AnyTupleElement"
// TODO: Once we have DictKeyContent, we need to transform that into ListElementContent
) and
output = "ReturnValue.ListElement" and
@@ -4372,9 +4362,7 @@ module StdlibPrivate {
or
content = "SetElement"
or
exists(DataFlow::TupleElementContent tc, int i | i = tc.getIndex() |
content = "TupleElement[" + i.toString() + "]"
)
content = "AnyTupleElement"
|
// TODO: Once we have DictKeyContent, we need to transform that into ListElementContent
input = "Argument[0]." + content and
@@ -4404,9 +4392,7 @@ module StdlibPrivate {
or
input = "Argument[0].SetElement"
or
exists(DataFlow::TupleElementContent tc, int i | i = tc.getIndex() |
input = "Argument[0].TupleElement[" + i.toString() + "]"
)
input = "Argument[0].AnyTupleElement"
// TODO: Once we have DictKeyContent, we need to transform that into ListElementContent
) and
output = "ReturnValue.ListElement" and
@@ -4434,9 +4420,7 @@ module StdlibPrivate {
or
input = "Argument[0].SetElement"
or
exists(DataFlow::TupleElementContent tc, int i | i = tc.getIndex() |
input = "Argument[0].TupleElement[" + i.toString() + "]"
)
input = "Argument[0].AnyTupleElement"
// TODO: Once we have DictKeyContent, we need to transform that into ListElementContent
) and
output = "ReturnValue" and
@@ -4468,9 +4452,7 @@ module StdlibPrivate {
// We reduce generality slightly by not tracking tuple contents on list arguments beyond the first, for performance.
// TODO: `AnyTupleElement` is now available (and used below); check whether the `i = 0` restriction can be lifted.
i = 0 and
exists(DataFlow::TupleElementContent tc, int j | j = tc.getIndex() |
input = "Argument[1].TupleElement[" + j.toString() + "]"
)
input = "Argument[1].AnyTupleElement"
// TODO: Once we have DictKeyContent, we need to transform that into ListElementContent
) and
output = "Argument[0].Parameter[" + i.toString() + "]" and
@@ -4499,9 +4481,7 @@ module StdlibPrivate {
or
input = "Argument[1].SetElement"
or
exists(DataFlow::TupleElementContent tc, int i | i = tc.getIndex() |
input = "Argument[1].TupleElement[" + i.toString() + "]"
)
input = "Argument[1].AnyTupleElement"
// TODO: Once we have DictKeyContent, we need to transform that into ListElementContent
) and
(output = "Argument[0].Parameter[0]" or output = "ReturnValue.ListElement") and
@@ -4525,9 +4505,7 @@ module StdlibPrivate {
or
input = "Argument[0].SetElement"
or
exists(DataFlow::TupleElementContent tc, int i | i = tc.getIndex() |
input = "Argument[0].TupleElement[" + i.toString() + "]"
)
input = "Argument[0].AnyTupleElement"
// TODO: Once we have DictKeyContent, we need to transform that into ListElementContent
) and
output = "ReturnValue.ListElement.TupleElement[1]" and
@@ -4552,12 +4530,7 @@ module StdlibPrivate {
or
input = "Argument[" + i.toString() + "].SetElement"
or
// We reduce generality slightly by not tracking tuple contents on arguments beyond the first two, for performance.
// TODO: Once we have TupleElementAny, this generality can be increased.
i in [0 .. 1] and
exists(DataFlow::TupleElementContent tc, int j | j = tc.getIndex() |
input = "Argument[" + i.toString() + "].TupleElement[" + j.toString() + "]"
)
input = "Argument[" + i.toString() + "].AnyTupleElement"
// TODO: Once we have DictKeyContent, we need to transform that into ListElementContent
) and
output = "ReturnValue.ListElement.TupleElement[" + i.toString() + "]" and
@@ -4580,12 +4553,6 @@ module StdlibPrivate {
override DataFlow::ArgumentNode getACallback() { none() }
override predicate propagatesFlow(string input, string output, boolean preservesValue) {
exists(DataFlow::Content c |
input = "Argument[self]." + c.getMaDRepresentation() and
output = "ReturnValue." + c.getMaDRepresentation() and
preservesValue = true
)
or
input = "Argument[self]" and
output = "ReturnValue" and
preservesValue = true
@@ -4741,12 +4708,10 @@ module StdlibPrivate {
override DataFlow::ArgumentNode getACallback() { none() }
override predicate propagatesFlow(string input, string output, boolean preservesValue) {
exists(DataFlow::DictionaryElementContent dc, string key | key = dc.getKey() |
input = "Argument[self].DictionaryElement[" + key + "]" and
output = "ReturnValue.TupleElement[1]" and
preservesValue = true
// TODO: put `key` into "ReturnValue.TupleElement[0]"
)
input = "Argument[self].AnyDictionaryElement" and
output = "ReturnValue.TupleElement[1]" and
preservesValue = true
// TODO: put `key` into "ReturnValue.TupleElement[0]"
}
}
@@ -4825,11 +4790,9 @@ module StdlibPrivate {
}
override predicate propagatesFlow(string input, string output, boolean preservesValue) {
exists(DataFlow::DictionaryElementContent dc, string key | key = dc.getKey() |
input = "Argument[self].DictionaryElement[" + key + "]" and
output = "ReturnValue.ListElement" and
preservesValue = true
)
input = "Argument[self].AnyDictionaryElement" and
output = "ReturnValue.ListElement" and
preservesValue = true
or
input = "Argument[self]" and
output = "ReturnValue" and
@@ -4876,11 +4839,9 @@ module StdlibPrivate {
}
override predicate propagatesFlow(string input, string output, boolean preservesValue) {
exists(DataFlow::DictionaryElementContent dc, string key | key = dc.getKey() |
input = "Argument[self].DictionaryElement[" + key + "]" and
output = "ReturnValue.ListElement.TupleElement[1]" and
preservesValue = true
)
input = "Argument[self].AnyDictionaryElement" and
output = "ReturnValue.ListElement.TupleElement[1]" and
preservesValue = true
or
// TODO: Add the keys to output list
input = "Argument[self]" and

View File

@@ -1,4 +0,0 @@
consistencyOverview
| deadEnd | 1 |
deadEnd
| without_loop.py:7:5:7:9 | Break |

View File

@@ -1,32 +0,0 @@
/**
* Phase -1 of the dataflow CFG migration: verifies that every variable
* binding visible to the AST (`Name.defines(v)`) corresponds to a CFG node
* in the new CFG (`semmle.python.controlflow.internal.AstNodeImpl`).
*
* The expected tag is `cfgdefines=<name>`. Each binding annotation in the
* test sources looks like `# $ cfgdefines=x` for a binding currently
* covered by the new CFG, or `# $ MISSING: cfgdefines=x` for a binding
* that is known to be uncovered (a "red" test case that should be
* green-flipped once the corresponding `cfg-ext-*` extension lands).
*/
import python
import semmle.python.controlflow.internal.AstNodeImpl as CfgImpl
import utils.test.InlineExpectationsTest
/**
 * Inline-expectations test that reports a `cfgdefines=<name>` result for
 * every `Name` that defines a variable and has a CFG node in the new CFG.
 */
module CfgBindingsTest implements TestSig {
/** Gets the only tag this test produces, `cfgdefines`. */
string getARelevantTag() { result = "cfgdefines" }
/**
 * Holds if the name at `location` (rendered as `element`) defines a variable
 * whose identifier is `value`, and some new-CFG node has that name as its
 * AST node. `tag` is always `cfgdefines`.
 */
predicate hasActualResult(Location location, string element, string tag, string value) {
exists(Name n, Variable v, CfgImpl::ControlFlowNode cfg |
n.defines(v) and
// `cfg` is only constrained, never reported: its existence is what makes
// the binding count as covered by the new CFG.
cfg.getAstNode().asExpr() = n and
location = n.getLocation() and
element = n.toString() and
tag = "cfgdefines" and
value = v.getId()
)
}
}
import MakeTest<CfgBindingsTest>

View File

@@ -1,13 +0,0 @@
# Annotated assignment (PEP 526). Both with and without an initializer.
a: int = 1 # $ cfgdefines=a
b: str = "hi" # $ cfgdefines=b
# Annotation without value: the AST records `c` as defined,
# and the new CFG now visits it via the AnnAssignStmt wrapper.
c: int # $ cfgdefines=c
class K: # $ cfgdefines=K
field: int = 0 # $ cfgdefines=field

View File

@@ -1,14 +0,0 @@
# Compound (tuple/list) assignment targets — actually wired in the new CFG.
a, b = (1, 2) # $ cfgdefines=a cfgdefines=b
[c, d] = [3, 4] # $ cfgdefines=c cfgdefines=d
# Nested unpacking.
(e, (f, g)) = (1, (2, 3)) # $ cfgdefines=e cfgdefines=f cfgdefines=g
# Star unpacking.
h, *i = [1, 2, 3] # $ cfgdefines=h cfgdefines=i
# Chained assignment with compound target.
j = k, l = (5, 6) # $ cfgdefines=j cfgdefines=k cfgdefines=l

View File

@@ -1,21 +0,0 @@
# Comprehension and `for` loop targets — wired in the new CFG.
# Comprehensions are nested function scopes with a synthetic `.0` parameter
# bound to the iterable.
# Bare-name `for` target.
for i in range(3): # $ cfgdefines=i
pass
# Compound `for` target.
for k, v in [(1, 2)]: # $ cfgdefines=k cfgdefines=v
pass
# Comprehension targets.
_ = [x for x in range(3)] # $ cfgdefines=_ cfgdefines=x cfgdefines=.0
_ = {y: z for y, z in []} # $ cfgdefines=_ cfgdefines=y cfgdefines=z cfgdefines=.0
_ = (a for a in []) # $ cfgdefines=_ cfgdefines=a cfgdefines=.0
# Nested comprehensions.
_ = [b for c in [] for b in c] # $ cfgdefines=_ cfgdefines=c cfgdefines=b cfgdefines=.0

View File

@@ -1,53 +0,0 @@
# Reachability of code following a try whose body always returns.
#
# The new CFG models exception edges for raise-prone expressions when
# they appear inside a `try` (or `with`) statement, mirroring Java's
# `mayThrow`. This means the body of a `try` has both a normal
# completion edge and an exception edge to its handlers, so code
# following the try-statement is reachable via the except-handler path
# even when the try-body would otherwise always return.
#
# Code that is not reachable under either normal or exception flow
# (for example, the `else` clause of a try whose body unconditionally
# raises) remains correctly classified as dead.
def f(obj): # $ cfgdefines=f cfgdefines=obj
try:
return len(obj)
except TypeError:
pass
# The try-body always returns, but `len(obj)` can raise (it is
# inside the try, so we model its exception edge). The
# `except TypeError: pass` handler falls through to here, making
# the code below reachable.
try:
hint = type(obj).__length_hint__ # $ cfgdefines=hint
except AttributeError:
return None
return hint
def g(): # $ cfgdefines=g
try:
raise Exception("inner")
except:
raise Exception("outer")
else:
# Unreachable: the inner try body always raises (via an explicit
# `raise`, which is modelled unconditionally), so the `else:`
# clause never runs.
hit_inner_else = True
def h(cache, key): # $ cfgdefines=h cfgdefines=cache cfgdefines=key
try:
return cache[key]
except KeyError:
pass
# Same pattern as `f`: reachable via the except-handler fall-through.
value = compute(key) # $ cfgdefines=value
cache[key] = value
return value

View File

@@ -1,30 +0,0 @@
# Decorated `def`/`class` — wired in the new CFG.
def deco(f): # $ cfgdefines=deco cfgdefines=f
return f
@deco
def decorated_func(): # $ cfgdefines=decorated_func
pass
@deco
class DecoratedClass: # $ cfgdefines=DecoratedClass
pass
# Stacked decorators.
@deco
@deco
def doubly(): # $ cfgdefines=doubly
pass
# Inside a class body.
class Outer: # $ cfgdefines=Outer
@staticmethod
def inner(): # $ cfgdefines=inner
pass

View File

@@ -1,19 +0,0 @@
# Exception-handler name bindings. These are already wired in the new
# CFG provided the try body can raise; `raise` statements are reliably
# treated as exception sources.
try:
raise ValueError("oops")
except ValueError as e: # $ cfgdefines=e
pass
try:
raise TypeError("oops")
except (TypeError, KeyError) as err: # $ cfgdefines=err
pass
# Exception groups (Python 3.11+).
try:
raise ValueError("oops")
except* ValueError as eg: # $ cfgdefines=eg
pass

View File

@@ -1,14 +0,0 @@
# Import aliases — all bound names below are now reachable via the new
# CFG's `ImportStmt` wrapper.
import os # $ cfgdefines=os
import os.path # $ cfgdefines=os
import os as o # $ cfgdefines=o
from os import path # $ cfgdefines=path
from os import path as p # $ cfgdefines=p
from os import sep, linesep # $ cfgdefines=sep cfgdefines=linesep
from os import (
getcwd, # $ cfgdefines=getcwd
getcwdb, # $ cfgdefines=getcwdb
)

View File

@@ -1,24 +0,0 @@
# Match-statement pattern bindings — wired in the new CFG.
def f(subject): # $ cfgdefines=f cfgdefines=subject
match subject:
case x: # $ cfgdefines=x
pass
case [a, b]: # $ cfgdefines=a cfgdefines=b
pass
case {"k": v}: # $ cfgdefines=v
pass
case Point(p, q): # $ cfgdefines=p cfgdefines=q
pass
case [_, *rest]: # $ cfgdefines=rest
pass
case (1 | 2) as n: # $ cfgdefines=n
pass
class Point: # $ cfgdefines=Point
__match_args__ = ("x", "y") # $ cfgdefines=__match_args__
x: int # $ cfgdefines=x
y: int # $ cfgdefines=y

View File

@@ -1,42 +0,0 @@
# Function parameters.
def positional(a, b): # $ cfgdefines=positional cfgdefines=a cfgdefines=b
pass
def with_default(x=1, y=2): # $ cfgdefines=with_default cfgdefines=x cfgdefines=y
pass
def with_vararg(*args): # $ cfgdefines=with_vararg cfgdefines=args
pass
def with_kwarg(**kwargs): # $ cfgdefines=with_kwarg cfgdefines=kwargs
pass
def with_kwonly(*, k1, k2=5): # $ cfgdefines=with_kwonly cfgdefines=k1 cfgdefines=k2
pass
def kitchen_sink(a, b=2, *args, k1, k2=5, **kw): # $ cfgdefines=kitchen_sink cfgdefines=a cfgdefines=b cfgdefines=args cfgdefines=k1 cfgdefines=k2 cfgdefines=kw
pass
# Methods get `self` / `cls`.
class C: # $ cfgdefines=C
def method(self, x): # $ cfgdefines=method cfgdefines=self cfgdefines=x
pass
@classmethod
def cmethod(cls, x): # $ cfgdefines=cmethod cfgdefines=cls cfgdefines=x
pass
# Lambda parameter.
_ = lambda p: p + 1 # $ cfgdefines=_ cfgdefines=p
# PEP 570 positional-only.
def pos_only(a, b, /, c): # $ cfgdefines=pos_only cfgdefines=a cfgdefines=b cfgdefines=c
pass

View File

@@ -1,14 +0,0 @@
# Simple bindings that should already work in the new CFG.
# No MISSING annotations expected.
x = 1 # $ cfgdefines=x
y = x + 1 # $ cfgdefines=y
def f(): # $ cfgdefines=f
pass
class C: # $ cfgdefines=C
pass
# Re-assignment.
x = 2 # $ cfgdefines=x

View File

@@ -1,21 +0,0 @@
# PEP 695 type parameters (Python 3.12+).
# PEP 695 type-param names on `def`/`class` bind in an annotation scope
# that nests the function/class body — they have no CFG node in the
# enclosing scope (matching the legacy CFG).
def func[T](x: T) -> T: # $ cfgdefines=func cfgdefines=x
return x
class Box[T]: # $ cfgdefines=Box
item: T # $ cfgdefines=item
# Multi-parameter, with bound and variadics.
def multi[T: int, *Ts, **P](x: T, *args: *Ts, **kwargs: P.kwargs) -> T: # $ cfgdefines=multi cfgdefines=x cfgdefines=args cfgdefines=kwargs
return x
# `type` statement (PEP 695).
type Alias[T] = list[T] # $ cfgdefines=Alias cfgdefines=T

View File

@@ -1,14 +0,0 @@
# Walrus and starred-target edge cases — wired in the new CFG.
# Walrus in expression context.
if (y := 5) > 0: # $ cfgdefines=y
pass
# Walrus in a comprehension. The comprehension introduces a synthetic
# `.0` parameter bound to the iterable.
_ = [w for _ in range(3) if (w := 1)] # $ cfgdefines=_ cfgdefines=w cfgdefines=.0
# Starred target in a Tuple LHS.
*head, tail = [1, 2, 3] # $ cfgdefines=head cfgdefines=tail

View File

@@ -1,21 +0,0 @@
# `with cm() as x:` bindings — wired in the new CFG.
class CM: # $ cfgdefines=CM
def __enter__(self): return self # $ cfgdefines=__enter__ cfgdefines=self
def __exit__(self, *a): pass # $ cfgdefines=__exit__ cfgdefines=self cfgdefines=a
with CM() as x: # $ cfgdefines=x
pass
# Multiple items.
with CM() as a, CM() as b: # $ cfgdefines=a cfgdefines=b
pass
# Parenthesised form (Python 3.10+).
with (CM() as p, CM() as q): # $ cfgdefines=p cfgdefines=q
pass
# Compound target in `with`.
with CM() as (m, n): # $ cfgdefines=m cfgdefines=n
pass

View File

@@ -1,14 +0,0 @@
/** New-CFG version of AllLiveReachable. */
import python
import TimerUtils
import NewCfgImpl
private module Utils = EvalOrderCfgUtils<NewCfg>;
private import Utils
private import Utils::CfgTests
// One result per (node, test function) pair for which `allLiveReachable`
// holds. The `$@` placeholder in the message is rendered as a link to `f`,
// labelled with the function's name.
from TimerCfgNode a, TestFunction f
where allLiveReachable(a, f)
select a, "Unreachable live annotation; entry of $@ does not reach this node", f, f.getName()

View File

@@ -1,18 +0,0 @@
/**
* New-CFG version of AnnotationHasCfgNode.
*
* Checks that every timer annotation has a corresponding CFG node.
*/
import python
import TimerUtils
import NewCfgImpl
private module Utils = EvalOrderCfgUtils<NewCfg>;
private import Utils::CfgTests
// One result per annotation for which `annotationWithoutCfgNode` holds. The
// `$@` placeholder links to the annotation's test function, labelled with the
// function's name.
from TimerAnnotation ann
where annotationWithoutCfgNode(ann)
select ann, "Annotation in $@ has no CFG node", ann.getTestFunction(),
ann.getTestFunction().getName()

View File

@@ -1,26 +0,0 @@
/**
* New-CFG version of BasicBlockAnnotationGap.
*
* Original:
* Checks that within a basic block, if a node is annotated then its
* successor is also annotated (or excluded). A gap in annotations
* within a basic block indicates a missing annotation, since there
* are no branches to justify the gap.
*
* Nodes with exceptional successors are excluded, as the exception
* edge leaves the basic block and the normal successor may be dead.
*/
import python
import TimerUtils
import NewCfgImpl
private module Utils = EvalOrderCfgUtils<NewCfg>;
private import Utils
private import Utils::CfgTests
// One result per (annotated node, successor) pair for which
// `basicBlockAnnotationGap` holds. The `$@` placeholder links to `succ`,
// labelled with the string form of its AST node.
from TimerCfgNode a, CfgNode succ
where basicBlockAnnotationGap(a, succ)
select a, "Annotated node followed by unannotated $@ in the same basic block", succ,
succ.getNode().toString()

View File

@@ -1,21 +0,0 @@
/**
* New-CFG version of BasicBlockOrdering.
*
* Original:
* Checks that within a single basic block, annotations appear in
* increasing minimum-timestamp order.
*/
import python
import TimerUtils
import NewCfgImpl
private module Utils = EvalOrderCfgUtils<NewCfg>;
private import Utils
private import Utils::CfgTests
// One result per tuple for which `basicBlockOrdering` holds. The two `$@`
// placeholders link, in order, to the timestamp expression of `a` for `minA`
// and that of `b` for `minB`, each labelled "timestamp <n>".
from TimerCfgNode a, TimerCfgNode b, int minA, int minB
where basicBlockOrdering(a, b, minA, minB)
select a, "Basic block ordering: $@ appears before $@", a.getTimestampExpr(minA),
"timestamp " + minA, b.getTimestampExpr(minB), "timestamp " + minB

View File

@@ -1,80 +0,0 @@
/**
* New-CFG version of BranchTimestamps.
*
* Checks that when a node has both a true and false successor, the
* live timestamps on one branch are marked as dead on the other.
* This ensures that boolean branches are fully annotated with dead()
* markers for the paths not taken.
*
* Limitation: the `@ t[ts, ...]` / `dead(ts)` annotation scheme can only
* model branch-dead-ness for plain boolean control flow that reconverges
* linearly after the split — i.e. `if`-with-else and `if`-expression.
* It cannot model:
*
* * loops (`while` / `for`): body timestamps repeat across iterations,
* so the loop-exit annotation can't list them as dead;
* * `match` statements: each `case` body is a syntactically distinct
* sub-tree, and the branches don't reconverge through a common
* annotation point in the timeline;
* * `try` / `with` and `raise` / `assert`: exception edges are modelled
* as true/false but flow to syntactically distinct handlers, with no
* reconvergence in the linear annotation order;
* * short-circuit `and` / `or` (`BoolExpr`): the branches reconverge at
* the BoolExpr's after-node, so timestamps on one branch are live
* downstream of the other rather than dead;
* * `if` without an `else` clause, and `if`/`elif` chains: the false
* branch reconverges with the true branch at the post-if statement
* (no-else) or fans out across multiple elif-test annotations,
* neither of which fit the binary annotation scheme.
*
* Branch nodes inside those constructs are therefore whitelisted out
* below. The check still fires (and is useful) for plain `if`/`else`
* and conditional-expression branching.
*/
import python
import TimerUtils
import NewCfgImpl
private module Utils = EvalOrderCfgUtils<NewCfg>;
private import Utils
private import Utils::CfgTests
/**
* Holds if `f` contains a construct whose branches the linear-timestamp
* annotation scheme cannot describe (see file-level comment).
*/
private predicate hasUnmodellableBranching(Function f) {
exists(AstNode bad |
// Only nodes whose scope is `f` itself are considered.
bad.getScope() = f and
(
bad instanceof While
or
bad instanceof For
or
bad instanceof MatchStmt
or
bad instanceof Try
or
bad instanceof With
or
bad instanceof Raise
or
bad instanceof Assert
or
bad instanceof BoolExpr
or
// An `if` counts only when it has no `else` branch or is itself an
// `elif`; a plain `if`/`else` is not matched by this disjunct.
bad instanceof If and
(not exists(bad.(If).getAnOrelse()) or bad.(If).isElif())
)
)
}
// One result per (node, timestamp, branch) for which `missingBranchTimestamp`
// holds, skipping every node whose test function satisfies
// `hasUnmodellableBranching`. The `$@` placeholder links to the test function,
// labelled with its name.
from TimerCfgNode node, int ts, string branch
where
missingBranchTimestamp(node, ts, branch) and
not hasUnmodellableBranching(node.getTestFunction())
select node,
"Timestamp " + ts + " on true/false branch is missing a dead() annotation on the " + branch +
" successor in $@", node.getTestFunction(), node.getTestFunction().getName()

View File

@@ -1,22 +0,0 @@
/**
* New-CFG version of ConsecutivePredecessorTimestamps.
*
* Checks that each annotated node (except the minimum timestamp) has
* a predecessor annotation with timestamp `a - 1`. This is the reverse
* of ConsecutiveTimestamps: it catches nodes that are reachable but
* arrived at from the wrong place (skipping an intermediate node).
*/
import python
import TimerUtils
import NewCfgImpl
private module Utils = EvalOrderCfgUtils<NewCfg>;
private import Utils
private import Utils::CfgTests
// One result per (annotation, timestamp) for which
// `consecutivePredecessorTimestamps` holds. The first `$@` links to the
// timestamp expression for `a`, the second to the test function; the message
// names `a - 1` as the expected predecessor timestamp.
from TimerAnnotation ann, int a
where consecutivePredecessorTimestamps(ann, a)
select ann, "$@ in $@ has no consecutive predecessor (expected " + (a - 1) + ")",
ann.getTimestampExpr(a), "Timestamp " + a, ann.getTestFunction(), ann.getTestFunction().getName()

View File

@@ -1,29 +0,0 @@
/**
* New-CFG version of ConsecutiveTimestamps.
*
* Original:
* Checks that consecutive annotated nodes have consecutive timestamps:
* for each annotation with timestamp `a`, some CFG node for that annotation
* must have a next annotation containing `a + 1`.
*
* Handles CFG splitting (e.g., finally blocks duplicated for normal/exceptional
* flow) by checking that at least one split has the required successor.
*
* Only applies to functions where all annotations are in the function's
* own scope (excludes tests with generators, async, comprehensions, or
* lambdas that have annotations in nested scopes).
*/
import python
import TimerUtils
import NewCfgImpl
private module Utils = EvalOrderCfgUtils<NewCfg>;
private import Utils
private import Utils::CfgTests
// One result per (annotation, timestamp) for which `consecutiveTimestamps`
// holds. The first `$@` links to the timestamp expression for `a`, the second
// to the test function; the message names `a + 1` as the expected successor
// timestamp.
from TimerAnnotation ann, int a
where consecutiveTimestamps(ann, a)
select ann, "$@ in $@ has no consecutive successor (expected " + (a + 1) + ")",
ann.getTimestampExpr(a), "Timestamp " + a, ann.getTestFunction(), ann.getTestFunction().getName()

View File

@@ -1,120 +0,0 @@
/**
* Implementation of the evaluation-order CFG signature using the new
* shared control flow graph from AstNodeImpl.
*/
private import python as Py
import TimerUtils
private import semmle.python.controlflow.internal.AstNodeImpl as CfgImpl
private import codeql.controlflow.SuccessorType
private class NewControlFlowNode = CfgImpl::ControlFlowNode;
private class NewBasicBlock = CfgImpl::BasicBlock;
/** New (shared) CFG implementation of the evaluation-order signature. */
module NewCfg implements EvalOrderCfgSig {
class CfgNode instanceof NewControlFlowNode {
// We must pick a *unique* representative CFG node for each AST node. The
// shared CFG has several nodes per AST node (before / in-post-order / after
// / after-value splits), but the timer test framework keys annotations on
// `getNode()` and assumes one CFG node per annotated AST node. Without a
// filter, an annotated `f()` would map to both `f()` and `After f()`, which
// breaks two framework invariants: (1) the "no shared reachable" check
// requires that two distinct nodes sharing a timestamp be mutually
// unreachable (true/false branches of a condition), but `Before f()`,
// `f()` and `After f()` share the annotation's timestamp *and* lie on one
// linear path; and (2) the annotation walk (`nextTimerAnnotation`) halts at
// the first reachable representative, so a second node for the same AST
// node would stall the walk on the same timestamp instead of advancing to
// the next evaluation event.
//
// We use the "after" node (`isAfter`) rather than the canonical `injects`
// node, because `injects` represents short-circuit / conditional
// expressions (`and`/`or`/`not`/ternary) by their *before* node, placing
// them ahead of their operands — wrong for evaluation order. `isAfter`
// instead picks the post-evaluation node: the merged before/after node for
// simple leaves, the `TAfterNode` for post-order expressions, and the
// `AfterValueNode`(s) for pre-order conditionals, all positioned after the
// operands. The two value-split nodes of a conditional are genuinely
// distinct evaluation outcomes (handled by `getATrueSuccessor` /
// `getAFalseSuccessor`), so they do not violate the uniqueness assumption.
CfgNode() { NewControlFlowNode.super.isAfter(_) }
string toString() { result = NewControlFlowNode.super.toString() }
Py::Location getLocation() { result = NewControlFlowNode.super.getLocation() }
Py::AstNode getNode() {
result = CfgImpl::astNodeToPyNode(NewControlFlowNode.super.getAstNode())
}
CfgNode getASuccessor() { nextCfgNode(this, result) }
CfgNode getATrueSuccessor() {
NewControlFlowNode.super.isAfterTrue(_) and
// Only where there's also a false branch (true boolean split)
exists(NewControlFlowNode other | other.isAfterFalse(NewControlFlowNode.super.getAstNode())) and
nextCfgNodeFrom(this, result)
}
CfgNode getAFalseSuccessor() {
NewControlFlowNode.super.isAfterFalse(_) and
// Only where there's also a true branch (true boolean split)
exists(NewControlFlowNode other | other.isAfterTrue(NewControlFlowNode.super.getAstNode())) and
nextCfgNodeFrom(this, result)
}
CfgNode getAnExceptionalSuccessor() {
exists(NewControlFlowNode mid |
mid = NewControlFlowNode.super.getAnExceptionSuccessor() and
nextCfgNodeFrom(mid, result)
)
}
Py::Scope getScope() { result = NewControlFlowNode.super.getEnclosingCallable().asScope() }
BasicBlock getBasicBlock() {
exists(NewBasicBlock bb, int i | bb.getNode(i) = this and result = bb)
}
}
/**
 * Holds if `next` is a `CfgNode` that can be reached from `n` by following
 * at least one raw CFG successor edge, where every node strictly between
 * `n` and `next` on that path is not itself a `CfgNode`.
 */
private predicate nextCfgNodeFrom(NewControlFlowNode n, CfgNode next) {
  exists(NewControlFlowNode succ | succ = n.getASuccessor() |
    // The immediate successor is already a `CfgNode`: stop here.
    next = succ
    or
    // Otherwise step over the non-`CfgNode` successor and keep searching.
    not succ instanceof CfgNode and
    nextCfgNodeFrom(succ, next)
  )
}
/**
 * Holds if `next` is the nearest `CfgNode` successor of the `CfgNode` `n`.
 *
 * This simply delegates to `nextCfgNodeFrom`, so any raw CFG nodes between
 * `n` and `next` that are not `CfgNode`s are skipped.
 */
private predicate nextCfgNode(CfgNode n, CfgNode next) { nextCfgNodeFrom(n, next) }
/**
 * A basic block, wrapping a `NewBasicBlock` and exposing its contents in
 * terms of `CfgNode`s.
 */
class BasicBlock instanceof NewBasicBlock {
/** Gets a textual representation of this basic block (that of the wrapped block). */
string toString() { result = NewBasicBlock.super.toString() }
/**
 * Gets the node at index `n` of the wrapped block, provided that node is a
 * `CfgNode`. Indices are those of the wrapped block, so positions holding a
 * non-`CfgNode` have no result.
 */
CfgNode getNode(int n) { result = NewBasicBlock.super.getNode(n) }
/** Holds if `bb` is this block or is reachable from it via one or more successor edges. */
predicate reaches(BasicBlock bb) { this = bb or this.strictlyReaches(bb) }
/** Holds if `bb` is reachable from this block via one or more successor edges. */
predicate strictlyReaches(BasicBlock bb) { NewBasicBlock.super.getASuccessor+() = bb }
/** Holds if the wrapped block strictly dominates `bb`. */
predicate strictlyDominates(BasicBlock bb) { NewBasicBlock.super.strictlyDominates(bb) }
}
/**
 * Gets a first `CfgNode` of scope `s`: a nearest `CfgNode` reachable from
 * the CFG entry node whose enclosing callable corresponds to `s`.
 */
CfgNode scopeGetEntryNode(Py::Scope s) {
  exists(CfgImpl::ControlFlow::EntryNode entry | entry.getEnclosingCallable().asScope() = s |
    nextCfgNodeFrom(entry, result)
  )
}
}

View File

@@ -1,21 +0,0 @@
/**
* New-CFG version of NeverReachable.
*
* Original:
* Checks that expressions annotated with `t.never` either have no CFG
* node, or if they do, that the node is not reachable from its scope's
* entry (including within the same basic block).
*/
import python
import TimerUtils
import NewCfgImpl
private module Utils = EvalOrderCfgUtils<NewCfg>;
private import Utils::CfgTests
from TimerAnnotation ann
where neverReachable(ann)
select ann, "Node annotated with t.never is reachable in $@", ann.getTestFunction(),
ann.getTestFunction().getName()

View File

@@ -1,22 +0,0 @@
/**
* New-CFG version of NoBackwardFlow.
*
* Original:
* Checks that time never flows backward between consecutive timer annotations
* in the CFG. For each pair of consecutive annotated nodes (A -> B), there must
* exist timestamps a in A and b in B with a < b.
*/
import python
import TimerUtils
import NewCfgImpl
private module Utils = EvalOrderCfgUtils<NewCfg>;
private import Utils
private import Utils::CfgTests
from TimerCfgNode a, TimerCfgNode b, int minA, int maxB
where noBackwardFlow(a, b, minA, maxB)
select a, "Backward flow: $@ flows to $@ (max timestamp $@)", a.getTimestampExpr(minA),
minA.toString(), b, b.getNode().toString(), b.getTimestampExpr(maxB), maxB.toString()

View File

@@ -1,18 +0,0 @@
/**
* New-CFG version of NoBasicBlock.
*
* Checks that every annotated CFG node belongs to a basic block.
*/
import python
import TimerUtils
import NewCfgImpl
private module Utils = EvalOrderCfgUtils<NewCfg>;
private import Utils
private import Utils::CfgTests
from CfgNode n, TestFunction f
where noBasicBlock(n, f)
select n, "CFG node in $@ does not belong to any basic block", f, f.getName()

View File

@@ -1,21 +0,0 @@
/**
* New-CFG version of NoSharedReachable.
*
* Original:
* Checks that two annotations sharing a timestamp value are on
* mutually exclusive CFG paths (neither can reach the other).
*/
import python
import TimerUtils
import NewCfgImpl
private module Utils = EvalOrderCfgUtils<NewCfg>;
private import Utils
private import Utils::CfgTests
from TimerCfgNode a, TimerCfgNode b, int ts
where noSharedReachable(a, b, ts)
select a, "Shared timestamp $@ but this node reaches $@", a.getTimestampExpr(ts), ts.toString(), b,
b.getNode().toString()

View File

@@ -1,22 +0,0 @@
/**
* New-CFG version of StrictForward.
*
* Original:
* Stronger version of NoBackwardFlow: for consecutive annotated nodes
* A -> B that both have a single timestamp (non-loop code) and B does
* NOT dominate A (forward edge), requires max(A) < min(B).
*/
import python
import TimerUtils
import NewCfgImpl
private module Utils = EvalOrderCfgUtils<NewCfg>;
private import Utils
private import Utils::CfgTests
from TimerCfgNode a, TimerCfgNode b, int maxA, int minB
where strictForward(a, b, maxA, minB)
select a, "Strict forward violation: $@ flows to $@", a.getTimestampExpr(maxA), "timestamp " + maxA,
b.getTimestampExpr(minB), "timestamp " + minB

View File

@@ -3,14 +3,14 @@
* Python control flow graph.
*/
private import python as Py
private import python as PY
import TimerUtils
/** Existing Python CFG implementation of the evaluation-order signature. */
module OldCfg implements EvalOrderCfgSig {
class CfgNode = Py::ControlFlowNode;
class CfgNode = PY::ControlFlowNode;
class BasicBlock = Py::BasicBlock;
class BasicBlock = PY::BasicBlock;
CfgNode scopeGetEntryNode(Py::Scope s) { result = s.getEntryNode() }
CfgNode scopeGetEntryNode(PY::Scope s) { result = s.getEntryNode() }
}

View File

@@ -85,7 +85,7 @@ def test_nested_if_else(t):
else:
z = 2 @ t[dead(4)]
else:
z = 3 @ t[dead(3), dead(4)]
z = 3 @ t[dead(4)]
w = 0 @ t[5]

View File

@@ -1,41 +0,0 @@
/**
* Inline-expectations test for the store/load/delete/parameter
* classification predicates on the new-CFG facade.
*
* Each tag fires when the corresponding predicate (`isLoad`,
* `isStore`, `isDelete`, `isParameter`, `isAugLoad`, `isAugStore`)
* holds on the canonical CFG node wrapping a `Py::Name` with the
* given identifier. Subscript and attribute stores are not covered
* by these tags — only the `Name`-typed targets/loads they involve.
*/
import python
import semmle.python.controlflow.internal.Cfg as Cfg
import utils.test.InlineExpectationsTest
/**
 * Inline-expectations test configuration that reports, for each
 * `Cfg::NameNode`, which classification predicates hold on it.
 */
module StoreLoadTest implements TestSig {
/** Gets one of the six tags this test can report. */
string getARelevantTag() { result = ["load", "store", "delete", "param", "augload", "augstore"] }
/**
 * Holds if some `Cfg::NameNode` at `location`, rendered as `element` and
 * with identifier `value`, satisfies the classification named by `tag`.
 */
predicate hasActualResult(Location location, string element, string tag, string value) {
exists(Cfg::NameNode n |
location = n.getLocation() and
element = n.toString() and
value = n.getId() and
(
// Plain loads/stores exclude the augmented-assignment variants so that
// a node reported as `augload`/`augstore` is not also tagged `load`/`store`.
n.isLoad() and not n.isAugLoad() and tag = "load"
or
n.isStore() and not n.isAugStore() and tag = "store"
or
n.isDelete() and tag = "delete"
or
n.isParameter() and tag = "param"
or
n.isAugLoad() and tag = "augload"
or
n.isAugStore() and tag = "augstore"
)
)
}
}
import MakeTest<StoreLoadTest>

View File

@@ -1,56 +0,0 @@
# Store/load/delete/parameter classification on the new-CFG facade.
#
# Each annotated location carries the (sorted, deduplicated) set of
# kinds the CFG facade reports there. Comparing against the legacy
# 'semmle.python.Flow' classification is done by the comparison query
# 'StoreLoadParity.ql' — annotations here are only the positive
# assertions for the new facade.
#
# Tags:
# load=<id> -- isLoad() fires on the Name
# store=<id> -- isStore() fires
# delete=<id> -- isDelete() fires
# param=<id> -- isParameter() fires
# augload=<id> -- isAugLoad() fires (the LHS of x += ... when read)
# augstore=<id> -- isAugStore() fires (the LHS of x += ... when written)
# --- plain load / store / delete ---
x = 1 # $ store=x
y = x + 1 # $ store=y load=x
print(y) # $ load=print load=y
del x # $ delete=x
# --- function definitions (parameters) ---
def f(a, b=2, *args, c, **kwargs): # $ store=f param=a param=b param=args param=c param=kwargs
return a + b + c # $ load=a load=b load=c
# --- augmented assignment splits one Name into load + store halves ---
def aug(): # $ store=aug
n = 0 # $ store=n
n += 1 # $ augload=n augstore=n
return n # $ load=n
# --- subscript / attribute stores ---
class C: # $ store=C
pass
def stores(obj, container, idx): # $ store=stores param=obj param=container param=idx
obj.attr = 1 # $ load=obj
container[idx] = 2 # $ load=container load=idx
return obj # $ load=obj
# --- tuple unpacking ---
def unpack(pair): # $ store=unpack param=pair
a, b = pair # $ store=a store=b load=pair
return a + b # $ load=a load=b

View File

@@ -589,11 +589,11 @@ def test_zip_tuple():
SINK(z[0][0]) # $ flow="SOURCE, l:-7 -> z[0][0]"
SINK(z[0][1]) # $ flow="SOURCE, l:-7 -> z[0][1]"
SINK_F(z[0][2])
SINK_F(z[0][2]) # $ SPURIOUS: flow="SOURCE, l:-7 -> z[0][2]"
SINK_F(z[0][3])
SINK(z[1][0]) # $ flow="SOURCE, l:-11 -> z[1][0]"
SINK_F(z[1][1]) # $ SPURIOUS: flow="SOURCE, l:-11 -> z[1][1]"
SINK(z[1][2]) # $ MISSING: flow="SOURCE, l:-11 -> z[1][2]" # Tuple contents are not tracked beyond the first two arguments for performance.
SINK(z[1][2]) # $ flow="SOURCE, l:-11 -> z[1][2]"
SINK_F(z[1][3])
@expects(4)

View File

@@ -157,7 +157,7 @@ class MyClass2(object):
print(self.foo) # $ tracked MISSING: tracked=foo
instance = MyClass2()
print(instance.foo) # $ tracked MISSING: tracked=foo
print(instance.foo) # $ MISSING: tracked=foo tracked
instance.print_foo() # $ MISSING: tracked=foo
@@ -195,7 +195,7 @@ class Sub1(Base1):
sub1 = Sub1()
sub1.read_foo()
print(sub1.foo) # $ tracked MISSING: tracked=foo
print(sub1.foo) # $ MISSING: tracked=foo tracked
# attribute written in a subclass method, read in an inherited base class method
@@ -210,7 +210,7 @@ class Sub2(Base2):
sub2 = Sub2()
sub2.read_bar()
print(sub2.bar) # $ tracked MISSING: tracked=bar
print(sub2.bar) # $ MISSING: tracked=bar tracked
# attribute written in a base class method, read on an instance of the subclass
@@ -223,4 +223,4 @@ class Sub3(Base3):
pass
sub3 = Sub3()
print(sub3.baz) # $ tracked MISSING: tracked=baz
print(sub3.baz) # $ MISSING: tracked=baz tracked

View File

@@ -362,7 +362,7 @@ def test_load_in_bulk():
# see https://docs.djangoproject.com/en/4.0/ref/models/querysets/#in-bulk
d = TestLoad.objects.in_bulk([1])
for val in d.values():
SINK(val.text) # $ MISSING: flow
SINK(val.text) # $ flow="SOURCE, l:-65 -> val.text"
SINK(d[1].text) # $ flow="SOURCE, l:-66 -> d[1].text"

View File

@@ -1,7 +1,6 @@
#select
| app.py:23:20:23:24 | ControlFlowNode for query | app.py:20:18:20:21 | ControlFlowNode for name | app.py:23:20:23:24 | ControlFlowNode for query | This SQL query depends on a $@. | app.py:20:18:20:21 | ControlFlowNode for name | user-provided value |
| app.py:30:20:30:24 | ControlFlowNode for query | app.py:27:19:27:22 | ControlFlowNode for name | app.py:30:20:30:24 | ControlFlowNode for query | This SQL query depends on a $@. | app.py:27:19:27:22 | ControlFlowNode for name | user-provided value |
| app.py:37:20:37:24 | ControlFlowNode for query | app.py:34:19:34:22 | ControlFlowNode for name | app.py:37:20:37:24 | ControlFlowNode for query | This SQL query depends on a $@. | app.py:34:19:34:22 | ControlFlowNode for name | user-provided value |
| app.py:44:20:44:24 | ControlFlowNode for query | app.py:41:19:41:22 | ControlFlowNode for name | app.py:44:20:44:24 | ControlFlowNode for query | This SQL query depends on a $@. | app.py:41:19:41:22 | ControlFlowNode for name | user-provided value |
| app.py:51:20:51:24 | ControlFlowNode for query | app.py:48:19:48:22 | ControlFlowNode for name | app.py:51:20:51:24 | ControlFlowNode for query | This SQL query depends on a $@. | app.py:48:19:48:22 | ControlFlowNode for name | user-provided value |
| sql_injection.py:21:24:21:77 | ControlFlowNode for BinaryExpr | sql_injection.py:14:15:14:22 | ControlFlowNode for username | sql_injection.py:21:24:21:77 | ControlFlowNode for BinaryExpr | This SQL query depends on a $@. | sql_injection.py:14:15:14:22 | ControlFlowNode for username | user-provided value |
@@ -25,8 +24,6 @@ edges
| app.py:21:5:21:9 | ControlFlowNode for query | app.py:23:20:23:24 | ControlFlowNode for query | provenance | |
| app.py:27:19:27:22 | ControlFlowNode for name | app.py:28:5:28:9 | ControlFlowNode for query | provenance | |
| app.py:28:5:28:9 | ControlFlowNode for query | app.py:30:20:30:24 | ControlFlowNode for query | provenance | |
| app.py:34:19:34:22 | ControlFlowNode for name | app.py:35:5:35:9 | ControlFlowNode for query | provenance | |
| app.py:35:5:35:9 | ControlFlowNode for query | app.py:37:20:37:24 | ControlFlowNode for query | provenance | |
| app.py:41:19:41:22 | ControlFlowNode for name | app.py:42:5:42:9 | ControlFlowNode for query | provenance | |
| app.py:42:5:42:9 | ControlFlowNode for query | app.py:44:20:44:24 | ControlFlowNode for query | provenance | |
| app.py:48:19:48:22 | ControlFlowNode for name | app.py:49:5:49:9 | ControlFlowNode for query | provenance | |
@@ -54,9 +51,6 @@ nodes
| app.py:27:19:27:22 | ControlFlowNode for name | semmle.label | ControlFlowNode for name |
| app.py:28:5:28:9 | ControlFlowNode for query | semmle.label | ControlFlowNode for query |
| app.py:30:20:30:24 | ControlFlowNode for query | semmle.label | ControlFlowNode for query |
| app.py:34:19:34:22 | ControlFlowNode for name | semmle.label | ControlFlowNode for name |
| app.py:35:5:35:9 | ControlFlowNode for query | semmle.label | ControlFlowNode for query |
| app.py:37:20:37:24 | ControlFlowNode for query | semmle.label | ControlFlowNode for query |
| app.py:41:19:41:22 | ControlFlowNode for name | semmle.label | ControlFlowNode for name |
| app.py:42:5:42:9 | ControlFlowNode for query | semmle.label | ControlFlowNode for query |
| app.py:44:20:44:24 | ControlFlowNode for query | semmle.label | ControlFlowNode for query |

View File

@@ -31,10 +31,10 @@ async def unsafe2(name: str): # $ Source
cursor.close()
@app.get("/unsafe3/")
async def unsafe3(name: str): # $ Source
async def unsafe3(name: str): # $ MISSING: Source
query = "select * from users where name=" + name
cursor = hdb_con3.cursor()
cursor.execute(query) # $ Alert
cursor.execute(query) # $ MISSING: Alert
cursor.close()
@app.get("/unsafe4/")

View File

@@ -28,6 +28,8 @@ nodes
| string_flow.rb:227:10:227:10 | a | semmle.label | a |
subpaths
testFailures
| string_flow.rb:85:10:85:10 | a | Unexpected result: hasValueFlow=a |
| string_flow.rb:227:10:227:10 | a | Unexpected result: hasValueFlow=a |
#select
| string_flow.rb:3:10:3:22 | call to new | string_flow.rb:2:9:2:18 | call to source | string_flow.rb:3:10:3:22 | call to new | $@ | string_flow.rb:2:9:2:18 | call to source | call to source |
| string_flow.rb:85:10:85:10 | a | string_flow.rb:83:9:83:18 | call to source | string_flow.rb:85:10:85:10 | a | $@ | string_flow.rb:83:9:83:18 | call to source | call to source |

View File

@@ -82,7 +82,7 @@ end
def m_clear
a = source "a"
a.clear
sink a # $ SPURIOUS: hasValueFlow=a
sink a
end
# concat and prepend omitted because they clash with the summaries for
@@ -224,7 +224,7 @@ def m_replace
b = source "b"
sink a.replace(b) # $ hasTaintFlow=b
# TODO: currently we get value flow for a, because we don't clear content
sink a # $ hasTaintFlow=b SPURIOUS: hasValueFlow=a
sink a # $ hasTaintFlow=b
end
def m_reverse
@@ -316,4 +316,4 @@ def m_upto(i)
a.upto("b", true) { |x| sink x } # $ hasTaintFlow=a
"b".upto(a) { |x| sink x } # $ hasTaintFlow=a
"b".upto(a, true) { |x| sink x }
end
end

View File

@@ -9,7 +9,7 @@ end
class OneController < ActionController::Base
before_action :a
after_action :c
def a
@foo = params[:foo]
end
@@ -18,14 +18,14 @@ class OneController < ActionController::Base
end
def c
sink @foo # $ hasTaintFlow
sink @foo
end
end
class TwoController < ActionController::Base
before_action :a
after_action :c
def a
@foo = params[:foo]
end
@@ -35,14 +35,14 @@ class TwoController < ActionController::Base
end
def c
sink @foo # $ SPURIOUS: hasTaintFlow
sink @foo
end
end
class ThreeController < ActionController::Base
before_action :a
after_action :c
def a
@foo = params[:foo]
@foo = "safe"
@@ -52,14 +52,14 @@ class ThreeController < ActionController::Base
end
def c
sink @foo # $ SPURIOUS: hasTaintFlow
sink @foo
end
end
class FourController < ActionController::Base
before_action :a
after_action :c
def a
@foo.bar = params[:foo]
end
@@ -68,14 +68,14 @@ class FourController < ActionController::Base
end
def c
sink(@foo.bar) # $ hasTaintFlow
sink(@foo.bar)
end
end
class FiveController < ActionController::Base
before_action :a
after_action :c
def a
self.taint_foo
end
@@ -84,10 +84,10 @@ class FiveController < ActionController::Base
end
def c
sink @foo # $ hasTaintFlow
sink @foo
end
def taint_foo
@foo = params[:foo]
end
end
end

View File

@@ -270,6 +270,11 @@ nodes
| params_flow.rb:205:10:205:10 | a | semmle.label | a |
subpaths
testFailures
| filter_flow.rb:21:10:21:13 | @foo | Unexpected result: hasTaintFlow |
| filter_flow.rb:38:10:38:13 | @foo | Unexpected result: hasTaintFlow |
| filter_flow.rb:55:10:55:13 | @foo | Unexpected result: hasTaintFlow |
| filter_flow.rb:71:10:71:17 | call to bar | Unexpected result: hasTaintFlow |
| filter_flow.rb:87:11:87:14 | @foo | Unexpected result: hasTaintFlow |
#select
| filter_flow.rb:21:10:21:13 | @foo | filter_flow.rb:14:12:14:17 | call to params | filter_flow.rb:21:10:21:13 | @foo | $@ | filter_flow.rb:14:12:14:17 | call to params | call to params |
| filter_flow.rb:38:10:38:13 | @foo | filter_flow.rb:30:12:30:17 | call to params | filter_flow.rb:38:10:38:13 | @foo | $@ | filter_flow.rb:30:12:30:17 | call to params | call to params |

View File

@@ -121,3 +121,37 @@ pub fn rule(input: TokenStream) -> TokenStream {
Err(err) => err.to_compile_error().into(),
}
}
/// Define a desugaring rule whose transform is a hand-written Rust block.
///
/// Reach for `manual_rule!` when the transform must decide *when* each
/// capture is translated — for example, when an outer rule has to set
/// state in `ctx` (the `BuildCtx`'s user context) before recursive
/// translation reaches inner rules that read that state.
///
/// ```text
/// manual_rule!(
///     (query_pattern field: (_) @name)
///     {
///         // `ctx` is a `&mut BuildCtx<'_, C>`; capture variables
///         // (`name: NodeRef`, etc.) are bound from the query.
///         let translated = ctx.translate(name)?;
///         Ok(translated)
///     }
/// )
/// ```
///
/// How this differs from [`rule!`]:
/// - Captures are **not** auto-translated before the body runs; they
///   refer to raw input-schema nodes. Call [`BuildCtx::translate`] (or
///   [`BuildCtx::translate_opt`]) at the point where translation should
///   happen.
/// - The body is plain Rust rather than a tree template, and it is
///   emitted verbatim: the macro does not wrap its value in `Ok(...)`,
///   so the body itself must evaluate to `Result<Vec<Id>, String>`
///   (as the example does with its trailing `Ok(translated)`).
///
/// A malformed invocation expands to a `compile_error!` carrying the
/// parser's diagnostic.
#[proc_macro]
pub fn manual_rule(input: TokenStream) -> TokenStream {
    let expanded: TokenStream2 = parse::parse_manual_rule_top(input.into())
        .unwrap_or_else(|err| err.to_compile_error());
    expanded.into()
}

View File

@@ -22,9 +22,10 @@ pub fn parse_query_top(input: TokenStream) -> Result<TokenStream> {
/// Parse a single query node (possibly with a trailing `@capture`).
fn parse_query_node(tokens: &mut Tokens) -> Result<TokenStream> {
let base = parse_query_atom(tokens)?;
// Check for trailing @capture or @@capture
// Check for trailing @capture
if peek_is_at(tokens) {
let capture_name = consume_capture_marker(tokens)?;
tokens.next(); // consume @
let capture_name = expect_ident(tokens, "expected capture name after @")?;
let name_str = capture_name.to_string();
Ok(quote! {
yeast::query::QueryNode::Capture {
@@ -158,7 +159,8 @@ fn parse_query_fields(tokens: &mut Tokens) -> Result<Vec<TokenStream>> {
push_field_elem(&mut field_order, &mut field_elems, field_str, elem);
} else {
let child = if peek_is_at(tokens) {
let capture_name = consume_capture_marker(tokens)?;
tokens.next();
let capture_name = expect_ident(tokens, "expected capture name after @")?;
let name_str = capture_name.to_string();
quote! {
yeast::query::QueryNode::Capture {
@@ -648,9 +650,6 @@ fn parse_direct_list(tokens: &mut Tokens, ctx: &Ident) -> Result<Vec<TokenStream
struct CaptureInfo {
name: String,
multiplicity: CaptureMultiplicity,
/// `true` for `@@name` captures: the auto-translate prefix skips them,
/// so the bound `NodeRef` refers to the raw (input-schema) node.
raw: bool,
}
#[derive(Clone, Copy, PartialEq)]
@@ -709,14 +708,6 @@ fn extract_captures_inner(
extract_captures_inner(&mut inner, captures, child_mult);
}
TokenTree::Punct(p) if p.as_char() == '@' => {
// `@@name` marks the capture as raw (skip auto-translate).
let raw = matches!(
tokens.peek(),
Some(TokenTree::Punct(p)) if p.as_char() == '@'
);
if raw {
tokens.next(); // consume the second `@`
}
if let Some(TokenTree::Ident(name)) = tokens.next() {
let mult = if parent_mult == CaptureMultiplicity::Repeated
|| last_mult == CaptureMultiplicity::Repeated
@@ -732,7 +723,6 @@ fn extract_captures_inner(
captures.push(CaptureInfo {
name: name.to_string(),
multiplicity: mult,
raw,
});
}
last_mult = CaptureMultiplicity::Single;
@@ -786,14 +776,6 @@ pub fn parse_rule_top(input: TokenStream) -> Result<TokenStream> {
// Parse query
let query_code = parse_query_top(query_stream.clone())?;
// Capture names marked `@@name` (raw) — passed to the auto-translate
// prefix as a skip list so those captures keep their input-schema ids.
let raw_capture_names: Vec<&str> = captures
.iter()
.filter(|c| c.raw)
.map(|c| c.name.as_str())
.collect();
// Generate capture bindings
let ctx_ident = Ident::new(IMPLICIT_CTX, Span::call_site());
let bindings: Vec<TokenStream> = captures
@@ -909,14 +891,11 @@ pub fn parse_rule_top(input: TokenStream) -> Result<TokenStream> {
let __query = #query_code;
yeast::Rule::new(__query, Box::new(|__ast: &mut yeast::Ast, mut __captures: yeast::captures::Captures, __fresh: &yeast::tree_builder::FreshScope, __source_range: Option<tree_sitter::Range>, __user_ctx: &mut _, __translator: yeast::TranslatorHandle<'_, _>| {
// Auto-translation prefix: recursively translate every
// captured node before invoking the user's transform body,
// except for `@@name` captures listed in `__skip` which the
// body consumes raw.
// captured node before invoking the user's transform body.
// For OneShot rules this preserves the legacy behaviour
// (input-schema captures translated to output-schema
// nodes); for Repeating rules it is a no-op.
let __skip: &[&str] = &[#(#raw_capture_names),*];
__translator.auto_translate_captures(&mut __captures, __ast, __user_ctx, __skip)?;
__translator.auto_translate_captures(&mut __captures, __ast, __user_ctx)?;
#(#bindings)*
let mut #ctx_ident = yeast::build::BuildCtx::with_translator(__ast, &__captures, __fresh, __source_range, __user_ctx, __translator);
let __result: Vec<usize> = { #transform_body };
@@ -926,6 +905,106 @@ pub fn parse_rule_top(input: TokenStream) -> Result<TokenStream> {
})
}
/// Parse `manual_rule!( query { body } )`.
///
/// Like [`parse_rule_top`] but:
/// - Expects a Rust block `{ ... }` after the query (no `=>` arrow). The
///   first top-level brace-delimited group terminates the query and is
///   taken as the body.
/// - Generates code that does NOT auto-translate captures before
///   running the body. Capture variables refer to raw (input-schema)
///   nodes; the body is responsible for explicit translation via
///   `ctx.translate(...)`.
/// - The body is included verbatim and must evaluate to
///   `Result<Vec<usize>, String>`.
///
/// # Errors
///
/// Returns an error if no body block follows the query, if the query
/// itself fails to parse, or if any token follows the body block.
pub fn parse_manual_rule_top(input: TokenStream) -> Result<TokenStream> {
    let mut tokens = input.into_iter();

    // Split the input at the first brace group: everything before it is the
    // query, the group itself is the body. Taking tokens by value lets one
    // loop both collect the query and yield the body, so no `unwrap()` or
    // second "is this really a brace group?" check is needed afterwards.
    let mut query_tokens = Vec::new();
    let body_group = loop {
        match tokens.next() {
            Some(TokenTree::Group(g)) if g.delimiter() == Delimiter::Brace => break g,
            Some(tok) => query_tokens.push(tok),
            None => {
                return Err(syn::Error::new(
                    Span::call_site(),
                    "expected a Rust block `{ ... }` after the query in manual_rule!",
                ))
            }
        }
    };
    let query_stream: TokenStream = query_tokens.into_iter().collect();

    // Extract captures from the query (same as in `rule!`).
    let captures = extract_captures(&query_stream);

    // Parse the query into the QueryNode-building expression.
    let query_code = parse_query_top(query_stream)?;

    // Generate capture bindings (same as in `rule!`): one local per capture,
    // typed according to how many nodes the capture can match.
    let ctx_ident = Ident::new(IMPLICIT_CTX, Span::call_site());
    let bindings: Vec<TokenStream> = captures
        .iter()
        .map(|cap| {
            let name = Ident::new(&cap.name, Span::call_site());
            let name_str = &cap.name;
            match cap.multiplicity {
                CaptureMultiplicity::Repeated => quote! {
                    let #name: Vec<yeast::NodeRef> = __captures.get_all(#name_str)
                        .into_iter()
                        .map(yeast::NodeRef)
                        .collect();
                },
                CaptureMultiplicity::Optional => quote! {
                    let #name: Option<yeast::NodeRef> =
                        __captures.get_opt(#name_str).map(yeast::NodeRef);
                },
                CaptureMultiplicity::Single => quote! {
                    let #name: yeast::NodeRef =
                        yeast::NodeRef(__captures.get_var(#name_str).unwrap());
                },
            }
        })
        .collect();

    // The body's statements are spliced directly into the generated closure,
    // so only the contents of the braces are needed.
    let body_stream = body_group.stream();

    // No tokens should follow the body.
    if let Some(tok) = tokens.next() {
        return Err(syn::Error::new_spanned(
            tok,
            "unexpected token after manual_rule! body",
        ));
    }

    Ok(quote! {
        {
            let __query = #query_code;
            yeast::Rule::new(__query, Box::new(|__ast: &mut yeast::Ast, __captures: yeast::captures::Captures, __fresh: &yeast::tree_builder::FreshScope, __source_range: Option<tree_sitter::Range>, __user_ctx: &mut _, __translator: yeast::TranslatorHandle<'_, _>| {
                // No auto-translate prefix for manual rules — the body
                // is responsible for translating captures explicitly.
                #(#bindings)*
                let mut #ctx_ident = yeast::build::BuildCtx::with_translator(__ast, &__captures, __fresh, __source_range, __user_ctx, __translator);
                #body_stream
            }))
        }
    })
}
// ---------------------------------------------------------------------------
// Token utilities
// ---------------------------------------------------------------------------
@@ -934,16 +1013,6 @@ fn peek_is_at(tokens: &mut Tokens) -> bool {
matches!(tokens.peek(), Some(TokenTree::Punct(p)) if p.as_char() == '@')
}
/// Consume an `@` or `@@` capture marker and the following name ident.
/// Caller has already verified `peek_is_at(tokens)`.
fn consume_capture_marker(tokens: &mut Tokens) -> Result<Ident> {
tokens.next(); // consume the first `@`
if peek_is_at(tokens) {
tokens.next(); // consume the second `@` of `@@`
}
expect_ident(tokens, "expected capture name after `@` or `@@`")
}
fn peek_is_literal(tokens: &mut Tokens) -> bool {
matches!(tokens.peek(), Some(TokenTree::Literal(_)))
}
@@ -1044,7 +1113,8 @@ fn expect_repetition(tokens: &mut Tokens) -> Result<TokenStream> {
fn maybe_wrap_capture(tokens: &mut Tokens, base: TokenStream) -> Result<TokenStream> {
if peek_is_at(tokens) {
let name = consume_capture_marker(tokens)?;
tokens.next(); // consume @
let name = expect_ident(tokens, "expected capture name after @")?;
let name_str = name.to_string();
Ok(quote! {
yeast::query::QueryNode::Capture {
@@ -1071,12 +1141,13 @@ fn maybe_wrap_repetition(tokens: &mut Tokens, single: TokenStream) -> Result<Tok
}
}
/// If `@name` (or `@@name`) follows a Repeated list element, wrap each
/// child SingleNode inside the repetition with a Capture. This matches
/// tree-sitter semantics where `(_)* @name` captures each matched node.
/// If `@name` follows a Repeated list element, wrap each child SingleNode
/// inside the repetition with a Capture. This matches tree-sitter semantics
/// where `(_)* @name` captures each matched node.
fn maybe_wrap_list_capture(tokens: &mut Tokens, elem: TokenStream) -> Result<TokenStream> {
if peek_is_at(tokens) {
let name = consume_capture_marker(tokens)?;
tokens.next();
let name = expect_ident(tokens, "expected capture name after @")?;
let name_str = name.to_string();
// Re-parse the element isn't practical, so we generate a wrapper
// that creates a new Repeated with each child wrapped in a capture.

View File

@@ -292,37 +292,6 @@ Inside `rule!`, captures are Rust variables, so `{name}` inserts a
single capture (`Id`) and `{..name}` splices a repeated capture
(`Vec<Id>`).
### Raw captures (`@@name`)
The default `@name` capture marker is *auto-translated*: in OneShot
phases the macro recursively translates the captured node before
binding it, so `{name}` in the output template splices a node that
already conforms to the output schema.
For rules that need the raw (input-schema) capture — typically to read
its source text or to translate it explicitly with mutable context
state between calls — use `@@name` instead. The body sees the original
input-schema `NodeRef`:
```rust
yeast::rule!(
(assignment left: (_) @@raw_lhs right: (_) @rhs)
=>
{
// raw_lhs is untranslated: read its original source text.
let text = ctx.ast.source_text(raw_lhs.into());
// rhs is already translated by the auto-translate prefix.
tree!((call
method: (identifier #{text.as_str()})
receiver: {rhs}))
}
);
```
Mix `@` and `@@` freely in the same rule. In a Repeating phase both
markers are equivalent (auto-translation is a no-op for repeating
rules).
## Complete example: for-loop desugaring
This rule rewrites Ruby's `for pat in val do body end` into

View File

@@ -80,28 +80,6 @@ impl Captures {
}
Ok(())
}
/// Like [`try_map_all_captures`] but leaves captures whose name appears
/// in `skip` untouched. Used by the `rule!` macro to support `@@name`
/// (raw) captures alongside the default auto-translated `@name`
/// captures.
pub fn try_map_captures_except<E>(
&mut self,
skip: &[&str],
mut f: impl FnMut(Id) -> Result<Vec<Id>, E>,
) -> Result<(), E> {
for (name, ids) in self.captures.iter_mut() {
if skip.contains(name) {
continue;
}
let mut new_ids = Vec::with_capacity(ids.len());
for &id in ids.iter() {
new_ids.extend(f(id)?);
}
*ids = new_ids;
}
Ok(())
}
pub fn map_captures_to(&mut self, from: &str, to: &'static str, f: &mut impl FnMut(Id) -> Id) {
if let Some(from_ids) = self.captures.get(from) {
let new_values = from_ids.iter().copied().map(f).collect();

View File

@@ -16,7 +16,7 @@ pub mod schema;
pub mod tree_builder;
mod visitor;
pub use yeast_macros::{query, rule, tree, trees};
pub use yeast_macros::{manual_rule, query, rule, tree, trees};
use captures::Captures;
pub use cursor::Cursor;
@@ -48,12 +48,6 @@ impl From<NodeRef> for Id {
}
}
impl From<Id> for NodeRef {
fn from(value: Id) -> Self {
NodeRef(value)
}
}
/// Like [`std::fmt::Display`], but the formatting routine is given access to
/// the [`Ast`] so that node references can resolve to their source text.
///
@@ -763,14 +757,13 @@ impl<'a, C: Clone> TranslatorHandle<'a, C> {
}
/// Translate every captured node in `captures` in place (OneShot phase
/// only), except for captures whose name appears in `skip` — those are
/// left as raw (input-schema) ids for the rule body to consume
/// directly. In a Repeating phase this is a no-op — Repeating rules
/// receive raw captures regardless of `skip`.
/// only). In a Repeating phase this is a no-op — Repeating rules
/// receive raw captures.
///
/// Used by the `rule!` macro's generated prefix. `skip` is populated
/// from the macro's `@@name` capture markers; for plain `@name`
/// captures (and rules with no `@@` markers) it is empty.
/// Used by the `rule!` macro's generated prefix to preserve the
/// pre-existing "auto-translate captures before running the transform
/// body" behavior. Manually-written transforms typically translate
/// captures selectively via [`translate`] instead.
///
/// To avoid infinite recursion, a capture whose id matches the rule's
/// matched root (e.g. from a `(_) @_` pattern) is left unchanged.
@@ -779,12 +772,11 @@ impl<'a, C: Clone> TranslatorHandle<'a, C> {
captures: &mut Captures,
ast: &mut Ast,
user_ctx: &mut C,
skip: &[&str],
) -> Result<(), String> {
match &self.inner {
TranslatorImpl::OneShot { matched_root, .. } => {
let root = *matched_root;
captures.try_map_captures_except(skip, |cid| {
captures.try_map_all_captures(|cid| {
if cid == root {
Ok(vec![cid])
} else {

View File

@@ -1058,111 +1058,6 @@ fn test_one_shot_does_not_recurse_into_wrapper_output() {
);
}
/// Verify that `@@name` capture markers skip the auto-translate prefix:
/// the body sees the *raw* (input-schema) NodeRef and can read its
/// source text or call `ctx.translate(...)` explicitly. Compare with
/// the bare `@name` form, where the auto-translate prefix runs the
/// same translation up front and the body sees the post-translate id.
#[test]
fn test_raw_capture_marker() {
// Ruby tree-sitter grammar plus the output schema built from
// OUTPUT_SCHEMA_YAML.
let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into();
let schema =
yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap();
let rules: Vec<Rule> = vec![
// Root rule: splice every captured top-level statement into the
// output program's `stmt:` field.
yeast::rule!(
(program (_)* @stmts)
=>
(program stmt: {..stmts})
),
// `@@raw_lhs` is untranslated: the body reads its source text
// ("x") and embeds it directly as the identifier content. `@rhs`
// is auto-translated (rhs already points to (integer "INT")).
yeast::rule!(
(assignment left: (_) @@raw_lhs right: (_) @rhs)
=>
{
let text = ctx.ast.source_text(raw_lhs.into());
tree!((call
method: (identifier #{text.as_str()})
receiver: {rhs}))
}
),
// Leaf rules: replace identifier/integer content with fixed marker
// text so translated nodes are recognisable in the dump.
yeast::rule!((identifier) => (identifier "ID")),
yeast::rule!((integer) => (integer "INT")),
];
// All rules run in a single OneShot phase named "translate".
let phases = vec![Phase::new("translate", PhaseKind::OneShot, rules)];
let runner: Runner = Runner::with_schema(lang, &schema, &phases);
// Presumably parsed by the Ruby grammar as one `assignment` node, which
// is what the second rule matches.
let input = "x = 1";
let ast = runner.run(input).unwrap();
let dump = dump_ast(&ast, ast.get_root(), input);
// `method:` uses the raw source text ("x"); if `@@` were broken and
// auto-translation ran on `raw_lhs`, it would still produce the
// string "x" (source_text inherits the input range), so the dump
// wouldn't change here. The companion test
// `test_raw_capture_marker_explicit_translate` exercises the
// stronger property that `ctx.translate(raw_lhs)?` succeeds and
// produces the translated `(identifier "ID")`.
assert_dump_eq(
&dump,
r#"
program
stmt:
call
method: identifier "x"
receiver: integer "INT"
"#,
);
}
/// Companion to `test_raw_capture_marker`: confirms that calling
/// `ctx.translate(raw)` on a `@@`-captured NodeRef from the rule body
/// produces the correctly-translated output-schema node. With `@`, the
/// translation has already happened, so `ctx.translate(...)` inside the
/// body would attempt to re-translate an output node (which has no
/// matching rule and would error).
#[test]
fn test_raw_capture_marker_explicit_translate() {
// Ruby tree-sitter grammar plus the output schema built from
// OUTPUT_SCHEMA_YAML.
let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into();
let schema =
yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap();
let rules: Vec<Rule> = vec![
// Root rule: splice every captured top-level statement into the
// output program's `stmt:` field.
yeast::rule!(
(program (_)* @stmts)
=>
(program stmt: {..stmts})
),
// `@@raw_lhs` reaches the body untranslated; the body translates it
// explicitly and splices the result into `method:`. `@rhs` uses the
// plain (auto-translated) capture form.
yeast::rule!(
(assignment left: (_) @@raw_lhs right: (_) @rhs)
=>
{
let translated_lhs = ctx.translate(raw_lhs)?;
tree!((call
method: {..translated_lhs}
receiver: {rhs}))
}
),
// Leaf rules: replace identifier/integer content with fixed marker
// text so translated nodes are recognisable in the dump.
yeast::rule!((identifier) => (identifier "ID")),
yeast::rule!((integer) => (integer "INT")),
];
// All rules run in a single OneShot phase named "translate".
let phases = vec![Phase::new("translate", PhaseKind::OneShot, rules)];
let runner: Runner = Runner::with_schema(lang, &schema, &phases);
let input = "x = 1";
let ast = runner.run(input).unwrap();
let dump = dump_ast(&ast, ast.get_root(), input);
// Unlike the companion test, `method:` must show the marker text "ID":
// that is only possible if the explicit `ctx.translate(raw_lhs)` call
// applied the `(identifier)` rule to the raw capture.
assert_dump_eq(
&dump,
r#"
program
stmt:
call
method: identifier "ID"
receiver: integer "INT"
"#,
);
}
// ---- Cursor tests ----
#[test]

View File

@@ -1,5 +1,5 @@
use codeql_extractor::extractor::simple;
use yeast::{ConcreteDesugarer, DesugaringConfig, PhaseKind, Rule, rule, tree};
use yeast::{ConcreteDesugarer, DesugaringConfig, PhaseKind, Rule, manual_rule, rule, tree};
/// User context propagated from outer rules down to the inner rules that
/// emit the corresponding output declarations, so that each emitted node
@@ -15,26 +15,26 @@ struct SwiftContext {
/// (`computed_getter`/`computed_setter`/`computed_modify`/
/// `willset_clause`/`didset_clause`/`getter_specifier`/
/// `setter_specifier`).
property_name: Option<yeast::NodeRef>,
property_name: Option<yeast::Id>,
/// Translated type node for the property type. Set by the outer
/// `property_binding` rule (computed accessors variant) and
/// `protocol_property_declaration` when present; read by the
/// accessor inner rules.
property_type: Option<yeast::NodeRef>,
property_type: Option<yeast::Id>,
/// Default-value expression for the next translated `parameter`. Set
/// by the outer `function_parameter` rule; read by the `parameter`
/// rules.
default_value: Option<yeast::NodeRef>,
default_value: Option<yeast::Id>,
/// Translated outer modifiers (e.g. visibility, attributes) to
/// attach to each child of a flattening outer rule. Set by
/// `property_declaration`, `enum_entry`, and
/// `protocol_property_declaration`.
outer_modifiers: Vec<yeast::NodeRef>,
outer_modifiers: Vec<yeast::Id>,
/// The `let`/`var` binding modifier for a `property_declaration`.
/// Set by `property_declaration`; read by the inner declaration
/// rules (`property_binding` variants, accessor rules) so they
/// emit it as part of the output node's `modifier:` field.
binding_modifier: Option<yeast::NodeRef>,
binding_modifier: Option<yeast::Id>,
/// True when the current child of a flattening outer rule is not
/// the first one — its inner rule should emit a
/// `chained_declaration` modifier so the original grouping can be
@@ -45,10 +45,10 @@ struct SwiftContext {
/// Build a freshly-created `chained_declaration` modifier node if
/// `ctx.is_chained`, else `None`. Used by inner declaration rules to
/// emit the chained tag for non-first children of a flattening outer
/// rule. Returns `Option<NodeRef>` so it splices via `{..…}` to 0 or 1 ids.
fn chained_modifier(ctx: &mut yeast::build::BuildCtx<'_, SwiftContext>) -> Option<yeast::NodeRef> {
/// rule. Returns `Option<Id>` so it splices via `{..…}` to 0 or 1 ids.
fn chained_modifier(ctx: &mut yeast::build::BuildCtx<'_, SwiftContext>) -> Option<yeast::Id> {
if ctx.is_chained {
Some(ctx.literal("modifier", "chained_declaration").into())
Some(ctx.literal("modifier", "chained_declaration"))
} else {
None
}
@@ -192,15 +192,21 @@ fn translation_rules() -> Vec<Rule<SwiftContext>> {
// this whole property_binding is itself a non-first declarator
// of a containing property_declaration); subsequent accessors
// always emit `chained_declaration`.
rule!(
manual_rule!(
(property_binding
name: @pattern
type: _? @ty
computed_value: (computed_property accessor: _+ @@accessors))
=>
{..{
ctx.property_name = Some(tree!((identifier #{pattern})).into());
ctx.property_type = ty;
computed_value: (computed_property accessor: _+ @accessors))
{
// Translate `ty` first so the context holds an
// output-schema node id.
let translated_ty = ctx.translate_opt(ty)?;
// Build the property-name identifier from the
// (untranslated) pattern leaf.
let name_id = tree!((identifier #{pattern}));
ctx.property_name = Some(name_id);
ctx.property_type = translated_ty;
let mut result = Vec::new();
for (i, acc) in accessors.into_iter().enumerate() {
@@ -209,8 +215,8 @@ fn translation_rules() -> Vec<Rule<SwiftContext>> {
}
result.extend(ctx.translate(acc)?);
}
result
}}
Ok(result)
}
),
// Computed property: shorthand getter (no explicit get/set, just
// statements) → a single accessor_declaration with kind "get".
@@ -242,26 +248,30 @@ fn translation_rules() -> Vec<Rule<SwiftContext>> {
// The `variable_declaration` itself inherits the outer rule's
// chained state; observers always get `chained_declaration`
// because they're subsequent outputs of this flattening rule.
rule!(
manual_rule!(
(property_binding
name: (pattern bound_identifier: @name)
type: _? @ty
value: _? @val
observers: (willset_didset_block willset: _? @@ws didset: _? @@ds))
=>
{..{
observers: (willset_didset_block willset: _? @ws didset: _? @ds))
{
// Translate ty and val so the variable_declaration
// below contains output-schema nodes.
let translated_ty = ctx.translate_opt(ty)?;
let translated_val = ctx.translate_opt(val)?;
let var_decl = tree!(
(variable_declaration
modifier: {..ctx.binding_modifier}
modifier: {..ctx.outer_modifiers.clone()}
modifier: {..chained_modifier(&mut ctx)}
pattern: (name_pattern identifier: (identifier #{name}))
type: {..ty}
value: {..val})
type: {..translated_ty}
value: {..translated_val})
);
// Publish the property name for the observer rules.
ctx.property_name = Some(tree!((identifier #{name})).into());
ctx.property_name = Some(tree!((identifier #{name})));
// Observers are subsequent outputs of this flattening
// rule, so they always get `chained_declaration`.
ctx.is_chained = true;
@@ -270,8 +280,8 @@ fn translation_rules() -> Vec<Rule<SwiftContext>> {
for obs in ws.into_iter().chain(ds) {
result.extend(ctx.translate(obs)?);
}
result
}}
Ok(result)
}
),
// property_binding with any pattern name (identifier or
// destructuring). Reads outer modifiers / chained tag from `ctx`.
@@ -299,24 +309,27 @@ fn translation_rules() -> Vec<Rule<SwiftContext>> {
// inner declaration rules (`property_binding` variants,
// accessor inner rules) read these fields and emit complete
// `modifier:` lists from the start.
rule!(
manual_rule!(
(property_declaration
binding: (value_binding_pattern mutability: @@binding_kind)
declarator: _* @@decls
binding: (value_binding_pattern mutability: @binding_kind)
declarator: _* @decls
(modifiers)* @mods)
=>
{..{
let binding_text = ctx.ast.source_text(binding_kind.into());
ctx.binding_modifier = Some(ctx.literal("modifier", &binding_text).into());
ctx.outer_modifiers = mods;
{
let binding_text = ctx.ast.source_text(binding_kind.0);
ctx.binding_modifier = Some(ctx.literal("modifier", &binding_text));
let mut modifiers = Vec::new();
for m in mods {
modifiers.extend(ctx.translate(m)?);
}
ctx.outer_modifiers = modifiers;
let mut result = Vec::new();
for (i, decl) in decls.into_iter().enumerate() {
ctx.is_chained = i > 0;
result.extend(ctx.translate(decl)?);
}
result
}}
Ok(result)
}
),
// ---- Enums ----
// enum_type_parameter → parameter (with optional name as pattern).
@@ -373,19 +386,22 @@ fn translation_rules() -> Vec<Rule<SwiftContext>> {
// into `ctx` and translate each case with `ctx.is_chained`
// toggled per iteration so the inner `enum_case_entry` rules
// emit complete `modifier:` lists from the start.
rule!(
(enum_entry case: _+ @@cases (modifiers)* @mods)
=>
{..{
ctx.outer_modifiers = mods;
manual_rule!(
(enum_entry case: _+ @cases (modifiers)* @mods)
{
let mut modifiers = Vec::new();
for m in mods {
modifiers.extend(ctx.translate(m)?);
}
ctx.outer_modifiers = modifiers;
let mut result = Vec::new();
for (i, case) in cases.into_iter().enumerate() {
ctx.is_chained = i > 0;
result.extend(ctx.translate(case)?);
}
result
}}
Ok(result)
}
),
// Plain assignment: `x = expr`
rule!(
@@ -460,13 +476,12 @@ fn translation_rules() -> Vec<Rule<SwiftContext>> {
// optional default values. Publishes the default value into `ctx`
// before translating the inner `parameter` so the `parameter`
// rules can include it as a `default:` field directly.
rule!(
(function_parameter parameter: @@p default_value: _? @def)
=>
{..{
ctx.default_value = def;
ctx.translate(p)?
}}
manual_rule!(
(function_parameter parameter: @p default_value: _? @def)
{
ctx.default_value = ctx.translate_opt(def)?;
ctx.translate(p)
}
),
// Parameter with external name and type
rule!(
@@ -1011,25 +1026,28 @@ fn translation_rules() -> Vec<Rule<SwiftContext>> {
// inner `getter_specifier`/`setter_specifier` rules emit
// complete nodes from the start (including the
// `chained_declaration` tag for non-first accessors).
rule!(
manual_rule!(
(protocol_property_declaration
name: (pattern bound_identifier: @name)
requirements: (protocol_property_requirements accessor: _+ @@accessors)
requirements: (protocol_property_requirements accessor: _+ @accessors)
type: _? @ty
(modifiers)* @mods)
=>
{..{
ctx.property_name = Some(tree!((identifier #{name})).into());
ctx.property_type = ty;
ctx.outer_modifiers = mods;
{
ctx.property_name = Some(tree!((identifier #{name})));
ctx.property_type = ctx.translate_opt(ty)?;
let mut modifiers = Vec::new();
for m in mods {
modifiers.extend(ctx.translate(m)?);
}
ctx.outer_modifiers = modifiers;
let mut result = Vec::new();
for (i, acc) in accessors.into_iter().enumerate() {
ctx.is_chained = i > 0;
result.extend(ctx.translate(acc)?);
}
result
}}
Ok(result)
}
),
// getter_specifier / setter_specifier → bodyless accessor_declaration
// getter_specifier / setter_specifier → bodyless