python: instantiate module for variable capture

This provides variable capture in standard situations:
- nested functions
- lambdas
There are some deficiencies:
- we do not yet handle objects capturing variables.
- we do not handle variables captured via the `nonlocal` keyword.
  This should be solved at the AST level, though, and then it
  should "just work".

There are still inconsistencies in the case where
a `SynthesizedCaptureNode` has a comprehensions
as its enclosing callable. In this case,
`TFunction(cn.getEnclosingCallable())` is not
defined and so getEnclosingCallable does not exist
for the `CaptureNode`.
This commit is contained in:
Rasmus Lerchedahl Petersen
2023-10-11 16:28:57 +02:00
parent 6db55cd12f
commit c054ba6a97
9 changed files with 352 additions and 16 deletions

View File

@@ -43,6 +43,7 @@ private import semmle.python.dataflow.new.internal.TypeTracker::CallGraphConstru
newtype TParameterPosition =
/** Used for `self` in methods, and `cls` in classmethods. */
TSelfParameterPosition() or
TLambdaSelfParameterPosition() or
TPositionalParameterPosition(int index) {
index = any(Parameter p).getPosition()
or
@@ -79,6 +80,9 @@ class ParameterPosition extends TParameterPosition {
/** Holds if this position represents a `self`/`cls` parameter. */
predicate isSelf() { this = TSelfParameterPosition() }
/** Holds if this position represents a reference to a lambda itself. Only used for tracking flow through captured variables. */
predicate isLambdaSelf() { this = TLambdaSelfParameterPosition() }
/** Holds if this position represents a positional parameter at (0-based) `index`. */
predicate isPositional(int index) { this = TPositionalParameterPosition(index) }
@@ -110,6 +114,8 @@ class ParameterPosition extends TParameterPosition {
string toString() {
this.isSelf() and result = "self"
or
this.isLambdaSelf() and result = "lambda self"
or
exists(int index | this.isPositional(index) and result = "position " + index)
or
exists(string name | this.isKeyword(name) and result = "keyword " + name)
@@ -130,6 +136,7 @@ class ParameterPosition extends TParameterPosition {
newtype TArgumentPosition =
/** Used for `self` in methods, and `cls` in classmethods. */
TSelfArgumentPosition() or
TLambdaSelfArgumentPosition() or
TPositionalArgumentPosition(int index) {
exists(any(CallNode c).getArg(index))
or
@@ -154,6 +161,9 @@ class ArgumentPosition extends TArgumentPosition {
/** Holds if this position represents a `self`/`cls` argument. */
predicate isSelf() { this = TSelfArgumentPosition() }
/** Holds if this position represents a lambda `self` argument. Only used for tracking flow through captured variables. */
predicate isLambdaSelf() { this = TLambdaSelfArgumentPosition() }
/** Holds if this position represents a positional argument at (0-based) `index`. */
predicate isPositional(int index) { this = TPositionalArgumentPosition(index) }
@@ -184,6 +194,8 @@ class ArgumentPosition extends TArgumentPosition {
predicate parameterMatch(ParameterPosition ppos, ArgumentPosition apos) {
ppos.isSelf() and apos.isSelf()
or
ppos.isLambdaSelf() and apos.isLambdaSelf()
or
exists(int index | ppos.isPositional(index) and apos.isPositional(index))
or
exists(string name | ppos.isKeyword(name) and apos.isKeyword(name))
@@ -1507,6 +1519,36 @@ abstract class ParameterNodeImpl extends Node {
}
}
/**
* The value of a lambda itself at function entry, viewed as a node in a data
* flow graph.
*
* This is used for tracking flow through captured variables, and we use a
* separate node and parameter/argument positions in order to distinguish
* "lambda self" from "normal self", as lambdas may also access outer `self`
* variables (through variable capture).
*/
class LambdaSelfReferenceNode extends ParameterNodeImpl, TLambdaSelfReferenceNode {
private Function callable;
LambdaSelfReferenceNode() { this = TLambdaSelfReferenceNode(callable) }
final Function getCallable() { result = callable }
override Parameter getParameter() { none() }
override predicate isParameterOf(DataFlowCallable c, ParameterPosition pos) {
c = TFunction(callable) and
pos.isLambdaSelf()
}
override Scope getScope() { result = callable }
override Location getLocation() { result = callable.getLocation() }
override string toString() { result = "lambda self in " + callable }
}
/** A parameter for a library callable with a flow summary. */
class SummaryParameterNode extends ParameterNodeImpl, FlowSummaryNode {
SummaryParameterNode() {
@@ -1578,6 +1620,33 @@ private class SummaryPostUpdateNode extends FlowSummaryNode, PostUpdateNodeImpl
override Node getPreUpdateNode() { result = pre }
}
private class CapturePostUpdateNode extends PostUpdateNodeImpl, CaptureNode {
private CaptureNode pre;
CapturePostUpdateNode() {
VariableCapture::Flow::capturePostUpdateNode(this.getSynthesizedCaptureNode(),
pre.getSynthesizedCaptureNode())
}
override Node getPreUpdateNode() { result = pre }
}
class CaptureArgumentNode extends CfgNode, ArgumentNode {
CallNode callNode;
CaptureArgumentNode() {
this.getNode() = callNode.getFunction() and
exists(Function target | resolveCall(callNode, target, _) |
target = any(VariableCapture::CapturedVariable v).getACapturingScope()
)
}
override predicate argumentOf(DataFlowCall call, ArgumentPosition pos) {
callNode = call.getNode() and
pos.isLambdaSelf()
}
}
/** Gets a viable run-time target for the call `call`. */
DataFlowCallable viableCallable(DataFlowCall call) {
call instanceof ExtractedDataFlowCall and

View File

@@ -378,6 +378,193 @@ module LocalFlow {
}
}
/** Provides logic related to captured variables. */
module VariableCapture {
private import codeql.dataflow.VariableCapture as Shared
class ExprCfgNode extends ControlFlowNode {
ExprCfgNode() { isExpressionNode(this) }
}
private predicate closureFlowStep(ExprCfgNode e1, ExprCfgNode e2) {
// simpleAstFlowStep(e1, e2)
// or
exists(SsaVariable def |
def.getAUse() = e2 and
def.getAnUltimateDefinition().getDefinition().(DefinitionNode).getValue() = e1
)
}
private module CaptureInput implements Shared::InputSig<Location> {
private import python as P
class BasicBlock extends P::BasicBlock {
Callable getEnclosingCallable() { result = this.getScope() }
// TODO: check that this gives useful results
Location getLocation() { result = super.getNode(0).getLocation() }
}
BasicBlock getImmediateBasicBlockDominator(BasicBlock bb) {
result = bb.getImmediateDominator()
}
BasicBlock getABasicBlockSuccessor(BasicBlock bb) { result = bb.getASuccessor() }
class CapturedVariable extends LocalVariable {
Function f;
CapturedVariable() {
this.getScope() = f and
this.getAnAccess().getScope() != f
}
Callable getCallable() { result = f }
Location getLocation() { result = f.getLocation() }
/** Gets a scope that captures this variable. */
Scope getACapturingScope() {
result = this.getAnAccess().getScope().getScope*() and
result.getScope+() = f
}
}
class CapturedParameter extends CapturedVariable {
CapturedParameter() { this.isParameter() }
ControlFlowNode getCfgNode() { result.getNode().(Parameter) = this.getAnAccess() }
}
class Expr extends ExprCfgNode {
predicate hasCfgNode(BasicBlock bb, int i) { this = bb.getNode(i) }
}
class VariableWrite extends ControlFlowNode {
CapturedVariable v;
VariableWrite() { this = v.getAStore().getAFlowNode() }
CapturedVariable getVariable() { result = v }
predicate hasCfgNode(BasicBlock bb, int i) { this = bb.getNode(i) }
}
class VariableRead extends Expr {
CapturedVariable v;
VariableRead() { this = v.getALoad().getAFlowNode() }
CapturedVariable getVariable() { result = v }
}
class ClosureExpr extends Expr {
ClosureExpr() {
this.getNode() instanceof CallableExpr
or
this.getNode() instanceof Comp
}
predicate hasBody(Callable body) {
body = this.getNode().(CallableExpr).getInnerScope()
or
body = this.getNode().(Comp).getFunction()
}
predicate hasAliasedAccess(Expr f) { closureFlowStep+(this, f) and not closureFlowStep(f, _) }
}
class Callable extends Scope {
predicate isConstructor() { none() }
}
}
class CapturedVariable = CaptureInput::CapturedVariable;
class ClosureExpr = CaptureInput::ClosureExpr;
module Flow = Shared::Flow<Location, CaptureInput>;
private Flow::ClosureNode asClosureNode(Node n) {
result = n.(CaptureNode).getSynthesizedCaptureNode()
or
result.(Flow::ExprNode).getExpr() = n.(CfgNode).getNode()
or
result.(Flow::VariableWriteSourceNode).getVariableWrite() = n.(CfgNode).getNode()
or
result.(Flow::ExprPostUpdateNode).getExpr() =
n.(PostUpdateNode).getPreUpdateNode().(CfgNode).getNode()
or
result.(Flow::ParameterNode).getParameter().getCfgNode() = n.(CfgNode).getNode()
or
result.(Flow::ThisParameterNode).getCallable() = n.(LambdaSelfReferenceNode).getCallable()
}
predicate storeStep(Node nodeFrom, CapturedVariableContent c, Node nodeTo) {
Flow::storeStep(asClosureNode(nodeFrom), c.getVariable(), asClosureNode(nodeTo))
}
predicate readStep(Node nodeFrom, CapturedVariableContent c, Node nodeTo) {
Flow::readStep(asClosureNode(nodeFrom), c.getVariable(), asClosureNode(nodeTo))
}
predicate valueStep(Node nodeFrom, Node nodeTo) {
Flow::localFlowStep(asClosureNode(nodeFrom), asClosureNode(nodeTo))
}
// Note: Learn from JS, https://github.com/github/codeql/pull/14412
// - JS: Capture flow
// - JS: Disallow consecutive captured contents
private module Debug {
predicate flowStoreStep(
Node nodeFrom, Flow::ClosureNode closureNodeFrom, CapturedVariable v,
Flow::ClosureNode closureNodeTo, Node nodeTo
) {
closureNodeFrom = asClosureNode(nodeFrom) and
closureNodeTo = asClosureNode(nodeTo) and
Flow::storeStep(closureNodeFrom, v, closureNodeTo)
}
predicate unmappedFlowStoreStep(
Flow::ClosureNode closureNodeFrom, CapturedVariable v, Flow::ClosureNode closureNodeTo
) {
Flow::storeStep(closureNodeFrom, v, closureNodeTo) and
not flowStoreStep(_, closureNodeFrom, v, closureNodeTo, _)
}
predicate flowReadStep(
Node nodeFrom, Flow::ClosureNode closureNodeFrom, CapturedVariable v,
Flow::ClosureNode closureNodeTo, Node nodeTo
) {
closureNodeFrom = asClosureNode(nodeFrom) and
closureNodeTo = asClosureNode(nodeTo) and
Flow::readStep(closureNodeFrom, v, closureNodeTo)
}
predicate unmappedFlowReadStep(
Flow::ClosureNode closureNodeFrom, CapturedVariable v, Flow::ClosureNode closureNodeTo
) {
Flow::readStep(closureNodeFrom, v, closureNodeTo) and
not flowReadStep(_, closureNodeFrom, v, closureNodeTo, _)
}
predicate flowValueStep(
Node nodeFrom, Flow::ClosureNode closureNodeFrom, Flow::ClosureNode closureNodeTo, Node nodeTo
) {
closureNodeFrom = asClosureNode(nodeFrom) and
closureNodeTo = asClosureNode(nodeTo) and
Flow::localFlowStep(closureNodeFrom, closureNodeTo)
}
predicate unmappedFlowValueStep(
Flow::ClosureNode closureNodeFrom, Flow::ClosureNode closureNodeTo
) {
Flow::localFlowStep(closureNodeFrom, closureNodeTo) and
not flowValueStep(_, closureNodeFrom, closureNodeTo, _)
}
}
}
//--------
// Local flow
//--------
@@ -471,6 +658,8 @@ predicate simpleLocalFlowStep(Node nodeFrom, Node nodeTo) {
simpleLocalFlowStepForTypetracking(nodeFrom, nodeTo)
or
summaryFlowSteps(nodeFrom, nodeTo)
or
variableCaptureFlowStep(nodeFrom, nodeTo)
}
/**
@@ -494,6 +683,11 @@ predicate summaryFlowSteps(Node nodeFrom, Node nodeTo) {
IncludePostUpdateFlow<PhaseDependentFlow<summaryLocalStep/2>::step/2>::step(nodeFrom, nodeTo)
}
predicate variableCaptureFlowStep(Node nodeFrom, Node nodeTo) {
IncludePostUpdateFlow<PhaseDependentFlow<VariableCapture::valueStep/2>::step/2>::step(nodeFrom,
nodeTo)
}
/** `ModuleVariable`s are accessed via jump steps at runtime. */
predicate runtimeJumpStep(Node nodeFrom, Node nodeTo) {
// Module variable read
@@ -557,7 +751,7 @@ predicate compatibleTypes(DataFlowType t1, DataFlowType t2) { any() }
predicate typeStrongerThan(DataFlowType t1, DataFlowType t2) { none() }
predicate localMustFlowStep(Node node1, Node node2) { none() }
predicate localMustFlowStep(Node nodeFrom, Node nodeTo) { none() }
/**
* Gets the type of `node`.
@@ -661,6 +855,8 @@ predicate storeStep(Node nodeFrom, ContentSet c, Node nodeTo) {
synthStarArgsElementParameterNodeStoreStep(nodeFrom, c, nodeTo)
or
synthDictSplatArgumentNodeStoreStep(nodeFrom, c, nodeTo)
or
VariableCapture::storeStep(nodeFrom, c, nodeTo)
}
/**
@@ -864,6 +1060,8 @@ predicate readStep(Node nodeFrom, ContentSet c, Node nodeTo) {
nodeTo.(FlowSummaryNode).getSummaryNode())
or
synthDictSplatParameterNodeReadStep(nodeFrom, c, nodeTo)
or
VariableCapture::readStep(nodeFrom, c, nodeTo)
}
/** Data flows from a sequence to a subscript of the sequence. */
@@ -993,6 +1191,8 @@ predicate nodeIsHidden(Node n) {
n instanceof SynthDictSplatArgumentNode
or
n instanceof SynthDictSplatParameterNode
// or
// n instanceof CaptureNode
}
class LambdaCallKind = Unit;
@@ -1029,6 +1229,11 @@ predicate additionalLambdaFlowStep(Node nodeFrom, Node nodeTo, boolean preserves
*/
predicate allowParameterReturnInSelf(ParameterNode p) {
FlowSummaryImpl::Private::summaryAllowParameterReturnInSelf(p)
or
exists(Function f |
VariableCapture::Flow::heuristicAllowInstanceParameterReturnInSelf(f) and
p = TLambdaSelfReferenceNode(f)
)
}
/** An approximated `Content`. */

View File

@@ -114,6 +114,12 @@ newtype TNode =
/** A synthetic node to allow flow to keyword parameters from a `**kwargs` argument. */
TSynthDictSplatParameterNode(DataFlowCallable callable) {
exists(ParameterPosition ppos | ppos.isKeyword(_) | exists(callable.getParameter(ppos)))
} or
/** A synthetic node representing a captured variable. */
TCaptureNode(VariableCapture::Flow::SynthesizedCaptureNode cn) or
/** A synthetic node representing the heap of a function. Used for variable capture. */
TLambdaSelfReferenceNode(Function f) {
f = any(VariableCapture::CapturedVariable v).getACapturingScope()
}
private import semmle.python.internal.CachedStages
@@ -471,6 +477,29 @@ class StarPatternElementNode extends Node, TStarPatternElementNode {
override Location getLocation() { result = consumer.getLocation() }
}
/**
* A synthesized data flow node representing a closure object that tracks
* captured variables.
*/
class CaptureNode extends Node, TCaptureNode {
private VariableCapture::Flow::SynthesizedCaptureNode cn;
CaptureNode() { this = TCaptureNode(cn) }
/** Gets the `SynthesizedCaptureNode` that this node represents. */
VariableCapture::Flow::SynthesizedCaptureNode getSynthesizedCaptureNode() { result = cn }
override DataFlowCallable getEnclosingCallable() {
result = TFunction(cn.getEnclosingCallable())
or
result = TModule(cn.getEnclosingCallable())
}
override Location getLocation() { result = cn.getLocation() }
override string toString() { result = cn.toString() }
}
/**
* Gets a node that controls whether other nodes are evaluated.
*
@@ -602,7 +631,9 @@ newtype TContent =
exists(string input, string output | ModelOutput::relevantSummaryModel(_, _, input, output, _) |
attr = [input, output].regexpFind("(?<=(^|\\.)Attribute\\[)[^\\]]+(?=\\])", _, _).trim()
)
}
} or
/** A captured variable. */
TCapturedVariableContent(VariableCapture::CapturedVariable v)
/**
* A data-flow value can have associated content.
@@ -665,6 +696,18 @@ class AttributeContent extends TAttributeContent, Content {
override string toString() { result = "Attribute " + attr }
}
/** A captured variable. */
class CapturedVariableContent extends Content, TCapturedVariableContent {
private VariableCapture::CapturedVariable v;
CapturedVariableContent() { this = TCapturedVariableContent(v) }
/** Gets the captured variable. */
VariableCapture::CapturedVariable getVariable() { result = v }
override string toString() { result = "captured " + v }
}
/**
* An entity that represents a set of `Content`s.
*

View File

@@ -34,14 +34,14 @@ def by_value1():
a = SOURCE
def inner(a_val=a):
SINK(a_val) #$ captured
SINK_F(a)
SINK_F(a) #$ SPURIOUS: captured
a = NONSOURCE
inner()
def by_value2():
a = NONSOURCE
def inner(a_val=a):
SINK(a) #$ MISSING:captured
SINK(a) #$ captured
SINK_F(a_val)
a = SOURCE
inner()

View File

@@ -0,0 +1,17 @@
uniqueToString
uniqueEnclosingCallable
uniqueDominator
localDominator
localSuccessor
uniqueDefiningScope
variableIsCaptured
uniqueLocation
uniqueCfgNode
uniqueWriteTarget
uniqueWriteCfgNode
uniqueReadVariable
closureMustHaveBody
closureAliasMustBeInSameScope
variableAccessAstNesting
uniqueCallableLocation
consistencyOverview

View File

@@ -0,0 +1,2 @@
import python
import semmle.python.dataflow.new.internal.DataFlowPrivate::VariableCapture::Flow::ConsistencyChecks

View File

@@ -37,7 +37,7 @@ def out():
def captureOut1():
sinkO1["x"] = SOURCE
captureOut1()
SINK(sinkO1["x"]) #$ MISSING:captured
SINK(sinkO1["x"]) #$ captured
sinkO2 = { "x": "" }
def captureOut2():
@@ -45,7 +45,7 @@ def out():
sinkO2["x"] = SOURCE
m()
captureOut2()
SINK(sinkO2["x"]) #$ MISSING:captured
SINK(sinkO2["x"]) #$ captured
nonSink0 = { "x": "" }
def captureOut1NotCalled():
@@ -67,7 +67,7 @@ def through(tainted):
def captureOut1():
sinkO1["x"] = tainted
captureOut1()
SINK(sinkO1["x"]) #$ MISSING:captured
SINK(sinkO1["x"]) #$ captured
sinkO2 = { "x": "" }
def captureOut2():
@@ -75,7 +75,7 @@ def through(tainted):
sinkO2["x"] = tainted
m()
captureOut2()
SINK(sinkO2["x"]) #$ MISSING:captured
SINK(sinkO2["x"]) #$ captured
nonSink1 = { "x": "" }
def captureOut1NotCalled():

View File

@@ -78,7 +78,7 @@ def through(tainted):
global sinkT1
sinkT1 = tainted
captureOut1()
SINK(sinkT1) #$ MISSING:captured
SINK(sinkT1) #$ captured
def captureOut2():
def m():
@@ -86,7 +86,7 @@ def through(tainted):
sinkT2 = tainted
m()
captureOut2()
SINK(sinkT2) #$ MISSING:captured
SINK(sinkT2) #$ captured
def captureOut1NotCalled():
global nonSinkT1

View File

@@ -34,17 +34,17 @@ def SINK_F(x):
def inParam(tainted):
def captureIn1():
sinkI1 = tainted
SINK(sinkI1) #$ MISSING:captured
SINK(sinkI1) #$ captured
captureIn1()
def captureIn2():
def m():
sinkI2 = tainted
SINK(sinkI2) #$ MISSING:captured
SINK(sinkI2) #$ captured
m()
captureIn2()
captureIn3 = lambda arg: SINK(tainted)
captureIn3 = lambda arg: SINK(tainted) #$ captured
captureIn3("")
def captureIn1NotCalled():
@@ -68,17 +68,17 @@ def inLocal():
def captureIn1():
sinkI1 = tainted
SINK(sinkI1) #$ MISSING:captured
SINK(sinkI1) #$ captured
captureIn1()
def captureIn2():
def m():
sinkI2 = tainted
SINK(sinkI2) #$ MISSING:captured
SINK(sinkI2) #$ captured
m()
captureIn2()
captureIn3 = lambda arg: SINK(tainted) #$ MISSING:captured
captureIn3 = lambda arg: SINK(tainted) #$ captured
captureIn3("")
def captureIn1NotCalled():