Merge pull request #12537 from yoff/python/captured-variables-for-typetracking

Python: Captured variables for type tracking and the API graph
This commit is contained in:
yoff
2023-05-09 12:34:22 +02:00
committed by GitHub
22 changed files with 299 additions and 48 deletions

View File

@@ -0,0 +1,4 @@
---
category: minorAnalysis
---
* Type tracking is now aware of reads of captured variables (variables defined in an outer scope). This leads to a richer API graph, and may lead to more results in some queries.

View File

@@ -987,7 +987,7 @@ module API {
DataFlow::LocalSourceNode trackUseNode(DataFlow::LocalSourceNode src) { DataFlow::LocalSourceNode trackUseNode(DataFlow::LocalSourceNode src) {
Stages::TypeTracking::ref() and Stages::TypeTracking::ref() and
result = trackUseNode(src, DataFlow::TypeTracker::end()) and result = trackUseNode(src, DataFlow::TypeTracker::end()) and
result instanceof DataFlow::ExprNode result instanceof DataFlow::LocalSourceNodeNotModuleVariableNode
} }
/** /**

View File

@@ -51,6 +51,10 @@ class LocalSourceNode extends Node {
// We explicitly include any read of a global variable, as some of these may have local flow going // We explicitly include any read of a global variable, as some of these may have local flow going
// into them. // into them.
this = any(ModuleVariableNode mvn).getARead() this = any(ModuleVariableNode mvn).getARead()
or
// We include all scope entry definitions, as these act as the local source within the scope they
// enter.
this.asVar() instanceof ScopeEntryDefinition
} }
/** Holds if this `LocalSourceNode` can flow to `nodeTo` in one or more local flow steps. */ /** Holds if this `LocalSourceNode` can flow to `nodeTo` in one or more local flow steps. */
@@ -133,6 +137,21 @@ class LocalSourceNode extends Node {
LocalSourceNode backtrack(TypeBackTracker t2, TypeBackTracker t) { t2 = t.step(result, this) } LocalSourceNode backtrack(TypeBackTracker t2, TypeBackTracker t) { t2 = t.step(result, this) }
} }
/**
* A LocalSourceNode that is not a ModuleVariableNode
* This class provides a positive formulation of that in its charpred.
*
* Aka FutureLocalSourceNode (see FutureWork below), but until the future is here...
*/
class LocalSourceNodeNotModuleVariableNode extends LocalSourceNode {
cached
LocalSourceNodeNotModuleVariableNode() {
this instanceof ExprNode
or
this.asVar() instanceof ScopeEntryDefinition
}
}
/** /**
* A node that can be used for type tracking or type back-tracking. * A node that can be used for type tracking or type back-tracking.
* *

View File

@@ -43,7 +43,19 @@ predicate compatibleContents(TypeTrackerContent storeContent, TypeTrackerContent
predicate simpleLocalFlowStep = DataFlowPrivate::simpleLocalFlowStepForTypetracking/2; predicate simpleLocalFlowStep = DataFlowPrivate::simpleLocalFlowStepForTypetracking/2;
predicate jumpStep = DataFlowPrivate::jumpStepSharedWithTypeTracker/2; predicate jumpStep(Node nodeFrom, Node nodeTo) {
DataFlowPrivate::jumpStepSharedWithTypeTracker(nodeFrom, nodeTo)
or
capturedJumpStep(nodeFrom, nodeTo)
}
predicate capturedJumpStep(Node nodeFrom, Node nodeTo) {
exists(SsaSourceVariable var, DefinitionNode def | var.hasDefiningNode(def) |
nodeTo.asVar().(ScopeEntryDefinition).getSourceVariable() = var and
nodeFrom.asCfgNode() = def.getValue() and
var.getScope().getScope*() = nodeFrom.getScope()
)
}
/** Holds if there is a level step from `nodeFrom` to `nodeTo`, which may depend on the call graph. */ /** Holds if there is a level step from `nodeFrom` to `nodeTo`, which may depend on the call graph. */
predicate levelStepCall(Node nodeFrom, Node nodeTo) { none() } predicate levelStepCall(Node nodeFrom, Node nodeTo) { none() }

View File

@@ -726,15 +726,15 @@ def test_deep_callgraph():
return f5(arg) return f5(arg)
x = f6(SOURCE) x = f6(SOURCE)
SINK(x) #$ MISSING:flow="SOURCE, l:-1 -> x" SINK(x) #$ flow="SOURCE, l:-1 -> x"
x = f5(SOURCE) x = f5(SOURCE)
SINK(x) #$ MISSING:flow="SOURCE, l:-1 -> x" SINK(x) #$ flow="SOURCE, l:-1 -> x"
x = f4(SOURCE) x = f4(SOURCE)
SINK(x) #$ MISSING:flow="SOURCE, l:-1 -> x" SINK(x) #$ flow="SOURCE, l:-1 -> x"
x = f3(SOURCE) x = f3(SOURCE)
SINK(x) #$ MISSING:flow="SOURCE, l:-1 -> x" SINK(x) #$ flow="SOURCE, l:-1 -> x"
x = f2(SOURCE) x = f2(SOURCE)
SINK(x) #$ MISSING:flow="SOURCE, l:-1 -> x" SINK(x) #$ flow="SOURCE, l:-1 -> x"
x = f1(SOURCE) x = f1(SOURCE)
SINK(x) #$ flow="SOURCE, l:-1 -> x" SINK(x) #$ flow="SOURCE, l:-1 -> x"

View File

@@ -6,5 +6,6 @@ module_attr_tracker
| import_as_attr.py:1:28:1:35 | GSSA Variable attr_ref | | import_as_attr.py:1:28:1:35 | GSSA Variable attr_ref |
| import_as_attr.py:3:1:3:1 | GSSA Variable x | | import_as_attr.py:3:1:3:1 | GSSA Variable x |
| import_as_attr.py:3:5:3:12 | ControlFlowNode for attr_ref | | import_as_attr.py:3:5:3:12 | ControlFlowNode for attr_ref |
| import_as_attr.py:5:1:5:10 | GSSA Variable attr_ref |
| import_as_attr.py:6:5:6:5 | SSA variable y | | import_as_attr.py:6:5:6:5 | SSA variable y |
| import_as_attr.py:6:9:6:16 | ControlFlowNode for attr_ref | | import_as_attr.py:6:9:6:16 | ControlFlowNode for attr_ref |

View File

@@ -60,10 +60,10 @@ def test_import():
def to_inner_scope(): def to_inner_scope():
x = tracked # $tracked x = tracked # $tracked
def foo(): def foo():
y = x # $ MISSING: tracked y = x # $ tracked
return y # $ MISSING: tracked return y # $ tracked
also_x = foo() # $ MISSING: tracked also_x = foo() # $ tracked
print(also_x) # $ MISSING: tracked print(also_x) # $ tracked
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# Function decorator # Function decorator

View File

@@ -24,6 +24,11 @@ class TrackedTest extends InlineExpectationsTest {
tracked(t).flowsTo(e) and tracked(t).flowsTo(e) and
// Module variables have no sensible location, and hence can't be annotated. // Module variables have no sensible location, and hence can't be annotated.
not e instanceof DataFlow::ModuleVariableNode and not e instanceof DataFlow::ModuleVariableNode and
// Global variables on line 0 also cannot be annotated
not e.getLocation().getStartLine() = 0 and
// We do not wish to annotate scope entry definitions,
// as they do not appear in the source code.
not e.asVar() instanceof ScopeEntryDefinition and
tag = "tracked" and tag = "tracked" and
location = e.getLocation() and location = e.getLocation() and
value = t.getAttr() and value = t.getAttr() and

View File

@@ -68,8 +68,10 @@ if __name__ == "__main__":
check_tests_valid("coverage-py3.classes") check_tests_valid("coverage-py3.classes")
check_tests_valid("variable-capture.in") check_tests_valid("variable-capture.in")
check_tests_valid("variable-capture.nonlocal") check_tests_valid("variable-capture.nonlocal")
check_tests_valid("variable-capture.global")
check_tests_valid("variable-capture.dict") check_tests_valid("variable-capture.dict")
check_tests_valid("variable-capture.collections") check_tests_valid("variable-capture.test_collections")
check_tests_valid("variable-capture.by_value")
check_tests_valid("module-initialization.multiphase") check_tests_valid("module-initialization.multiphase")
check_tests_valid("fieldflow.test") check_tests_valid("fieldflow.test")
check_tests_valid("fieldflow.test_dict") check_tests_valid("fieldflow.test_dict")

View File

@@ -0,0 +1,52 @@
# Here we test capturing the _value_ of a variable (by using it as the default value for a parameter)
# All functions starting with "test_" should run and execute `print("OK")` exactly once.
# This can be checked by running validTest.py.
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname((__file__))))
from testlib import expects
# These are defined so that we can evaluate the test code.
NONSOURCE = "not a source"
SOURCE = "source"
def is_source(x):
return x == "source" or x == b"source" or x == 42 or x == 42.0 or x == 42j
def SINK(x):
if is_source(x):
print("OK")
else:
print("Unexpected flow", x)
def SINK_F(x):
if is_source(x):
print("Unexpected flow", x)
else:
print("OK")
def by_value1():
a = SOURCE
def inner(a_val=a):
SINK(a_val) #$ captured
SINK_F(a)
a = NONSOURCE
inner()
def by_value2():
a = NONSOURCE
def inner(a_val=a):
SINK(a) #$ MISSING:captured
SINK_F(a_val)
a = SOURCE
inner()
@expects(4)
def test_by_value():
by_value1()
by_value2()

View File

@@ -1,10 +1,11 @@
uniqueEnclosingCallable uniqueEnclosingCallable
uniqueCallEnclosingCallable uniqueCallEnclosingCallable
| collections.py:39:17:39:38 | Lambda() | Call should have one enclosing callable but has 0. | | test_collections.py:39:17:39:38 | Lambda() | Call should have one enclosing callable but has 0. |
| collections.py:39:17:39:38 | Lambda() | Call should have one enclosing callable but has 0. | | test_collections.py:39:17:39:38 | Lambda() | Call should have one enclosing callable but has 0. |
| collections.py:45:19:45:24 | mod() | Call should have one enclosing callable but has 0. | | test_collections.py:45:19:45:24 | mod() | Call should have one enclosing callable but has 0. |
| collections.py:45:19:45:24 | mod() | Call should have one enclosing callable but has 0. | | test_collections.py:45:19:45:24 | mod() | Call should have one enclosing callable but has 0. |
| collections.py:52:13:52:24 | mod_local() | Call should have one enclosing callable but has 0. | | test_collections.py:52:13:52:24 | mod_local() | Call should have one enclosing callable but has 0. |
| test_collections.py:52:13:52:24 | mod_local() | Call should have one enclosing callable but has 0. |
uniqueType uniqueType
uniqueNodeLocation uniqueNodeLocation
missingLocation missingLocation

View File

@@ -77,16 +77,18 @@ def through(tainted):
captureOut2() captureOut2()
SINK(sinkO2["x"]) #$ MISSING:captured SINK(sinkO2["x"]) #$ MISSING:captured
nonSink0 = { "x": "" } nonSink1 = { "x": "" }
def captureOut1NotCalled(): def captureOut1NotCalled():
nonSink0["x"] = tainted nonSink1["x"] = tainted
SINK_F(nonSink0["x"]) SINK_F(nonSink1["x"])
nonSink2 = { "x": "" }
def captureOut2NotCalled(): def captureOut2NotCalled():
# notice that `m` is not called
def m(): def m():
nonSink0["x"] = tainted nonSink2["x"] = tainted
captureOut2NotCalled() captureOut2NotCalled()
SINK_F(nonSink0["x"]) SINK_F(nonSink2["x"])
@expects(4) @expects(4)
def test_through(): def test_through():

View File

@@ -0,0 +1,106 @@
# Here we test writing to a captured global variable via the `global` keyword (see `out`).
# We also test reading one captured variable and writing the value to another (see `through`).
# All functions starting with "test_" should run and execute `print("OK")` exactly once.
# This can be checked by running validTest.py.
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname((__file__))))
from testlib import expects
# These are defined so that we can evaluate the test code.
NONSOURCE = "not a source"
SOURCE = "source"
def is_source(x):
return x == "source" or x == b"source" or x == 42 or x == 42.0 or x == 42j
def SINK(x):
if is_source(x):
print("OK")
else:
print("Unexpected flow", x)
def SINK_F(x):
if is_source(x):
print("Unexpected flow", x)
else:
print("OK")
sinkO1 = ""
sinkO2 = ""
nonSink1 = ""
nonSink2 = ""
def out():
def captureOut1():
global sinkO1
sinkO1 = SOURCE
captureOut1()
SINK(sinkO1) #$ captured
def captureOut2():
def m():
global sinkO2
sinkO2 = SOURCE
m()
captureOut2()
SINK(sinkO2) #$ captured
def captureOut1NotCalled():
global nonSink1
nonSink1 = SOURCE
SINK_F(nonSink1) #$ SPURIOUS: captured
def captureOut2NotCalled():
# notice that `m` is not called
def m():
global nonSink2
nonSink2 = SOURCE
captureOut2NotCalled()
SINK_F(nonSink2) #$ SPURIOUS: captured
@expects(4)
def test_out():
out()
sinkT1 = ""
sinkT2 = ""
nonSinkT1 = ""
nonSinkT2 = ""
def through(tainted):
def captureOut1():
global sinkT1
sinkT1 = tainted
captureOut1()
SINK(sinkT1) #$ MISSING:captured
def captureOut2():
def m():
global sinkT2
sinkT2 = tainted
m()
captureOut2()
SINK(sinkT2) #$ MISSING:captured
def captureOut1NotCalled():
global nonSinkT1
nonSinkT1 = tainted
SINK_F(nonSinkT1)
def captureOut2NotCalled():
# notice that `m` is not called
def m():
global nonSinkT2
nonSinkT2 = tainted
captureOut2NotCalled()
SINK_F(nonSinkT2)
@expects(4)
def test_through():
through(SOURCE)

View File

@@ -48,13 +48,14 @@ def inParam(tainted):
captureIn3("") captureIn3("")
def captureIn1NotCalled(): def captureIn1NotCalled():
nonSink0 = tainted nonSink1 = tainted
SINK_F(nonSink0) SINK_F(nonSink1)
def captureIn2NotCalled(): def captureIn2NotCalled():
# notice that `m` is not called
def m(): def m():
nonSink0 = tainted nonSink1 = tainted
SINK_F(nonSink0) SINK_F(nonSink1)
captureIn2NotCalled() captureIn2NotCalled()
@expects(3) @expects(3)
@@ -81,13 +82,14 @@ def inLocal():
captureIn3("") captureIn3("")
def captureIn1NotCalled(): def captureIn1NotCalled():
nonSink0 = tainted nonSink1 = tainted
SINK_F(nonSink0) SINK_F(nonSink1)
def captureIn2NotCalled(): def captureIn2NotCalled():
# notice that `m` is not called
def m(): def m():
nonSink0 = tainted nonSink2 = tainted
SINK_F(nonSink0) SINK_F(nonSink2)
captureIn2NotCalled() captureIn2NotCalled()
@expects(3) @expects(3)

View File

@@ -49,18 +49,20 @@ def out():
captureOut2() captureOut2()
SINK(sinkO2) #$ MISSING:captured SINK(sinkO2) #$ MISSING:captured
nonSink0 = "" nonSink1 = ""
def captureOut1NotCalled(): def captureOut1NotCalled():
nonlocal nonSink0 nonlocal nonSink1
nonSink0 = SOURCE nonSink1 = SOURCE
SINK_F(nonSink0) SINK_F(nonSink1)
nonSink2 = ""
def captureOut2NotCalled(): def captureOut2NotCalled():
# notice that `m` is not called
def m(): def m():
nonlocal nonSink0 nonlocal nonSink2
nonSink0 = SOURCE nonSink2 = SOURCE
captureOut2NotCalled() captureOut2NotCalled()
SINK_F(nonSink0) SINK_F(nonSink2)
@expects(4) @expects(4)
def test_out(): def test_out():
@@ -83,18 +85,20 @@ def through(tainted):
captureOut2() captureOut2()
SINK(sinkO2) #$ MISSING:captured SINK(sinkO2) #$ MISSING:captured
nonSink0 = "" nonSink1 = ""
def captureOut1NotCalled(): def captureOut1NotCalled():
nonlocal nonSink0 nonlocal nonSink1
nonSink0 = tainted nonSink1 = tainted
SINK_F(nonSink0) SINK_F(nonSink1)
nonSink2 = ""
def captureOut2NotCalled(): def captureOut2NotCalled():
# notice that `m` is not called
def m(): def m():
nonlocal nonSink0 nonlocal nonSink2
nonSink0 = tainted nonSink2 = tainted
captureOut2NotCalled() captureOut2NotCalled()
SINK_F(nonSink0) SINK_F(nonSink2)
@expects(4) @expects(4)
def test_through(): def test_through():

View File

@@ -52,7 +52,7 @@ def mod_list(l):
return [mod_local(x) for x in l] return [mod_local(x) for x in l]
l_modded = mod_list(l) l_modded = mod_list(l)
SINK(l_modded[0]) #$ MISSING: captured SINK(l_modded[0]) #$ captured
def mod_list_first(l): def mod_list_first(l):
def mod_local(x): def mod_local(x):

View File

@@ -3,6 +3,7 @@ uniqueCallEnclosingCallable
| test_captured.py:7:22:7:25 | p() | Call should have one enclosing callable but has 0. | | test_captured.py:7:22:7:25 | p() | Call should have one enclosing callable but has 0. |
| test_captured.py:7:22:7:25 | p() | Call should have one enclosing callable but has 0. | | test_captured.py:7:22:7:25 | p() | Call should have one enclosing callable but has 0. |
| test_captured.py:14:26:14:30 | pp() | Call should have one enclosing callable but has 0. | | test_captured.py:14:26:14:30 | pp() | Call should have one enclosing callable but has 0. |
| test_captured.py:14:26:14:30 | pp() | Call should have one enclosing callable but has 0. |
uniqueType uniqueType
uniqueNodeLocation uniqueNodeLocation
missingLocation missingLocation

View File

@@ -89,7 +89,7 @@ def use_of_builtins():
def imported_builtins(): def imported_builtins():
import builtins #$ use=moduleImport("builtins") import builtins #$ use=moduleImport("builtins")
def open(f): def open(f):
return builtins.open(f) #$ MISSING: use=moduleImport("builtins").getMember("open").getReturn() return builtins.open(f) #$ use=moduleImport("builtins").getMember("open").getReturn()
def redefine_print(): def redefine_print():
def my_print(x): def my_print(x):

View File

@@ -11,4 +11,4 @@ def pp_list(l):
return escape(x) #$ use=moduleImport("html").getMember("escape").getReturn() return escape(x) #$ use=moduleImport("html").getMember("escape").getReturn()
def pp_list_inner(l): def pp_list_inner(l):
return ", ".join(pp(x) for x in l) #$ MISSING: use=moduleImport("html").getMember("escape").getReturn() return ", ".join(pp(x) for x in l) #$ use=moduleImport("html").getMember("escape").getReturn()

View File

@@ -0,0 +1,14 @@
from flask import Flask
from flask_sqlalchemy import SQLAlchemy
from flask_user import UserMixin
def create_app():
app = Flask(__name__)
db = SQLAlchemy(app) #$ use=moduleImport("flask_sqlalchemy").getMember("SQLAlchemy").getReturn()
class Users(db.Model, UserMixin): #$ use=moduleImport("flask_sqlalchemy").getMember("SQLAlchemy").getReturn().getMember("Model").getASubclass()
__tablename__ = 'users'
@app.route('/v2/user/<int:id>', methods=['GET','PUT'])
def users(id):
Users.query.filter_by(id=id).first() #$ use=moduleImport("flask_sqlalchemy").getMember("SQLAlchemy").getReturn().getMember("Model").getASubclass().getMember("query").getMember("filter_by")

View File

@@ -0,0 +1,26 @@
from foo import A, B
def func():
if cond():
class Foo(A): pass
else:
class Foo(B): pass
class Bar(A): pass
class Bar(B): pass
class Baz(A): pass
def other_func():
print(Foo) #$ use=moduleImport("foo").getMember("A").getASubclass() use=moduleImport("foo").getMember("B").getASubclass()
# On the next line, we wish to express that it is not possible for `Bar` to be a subclass of `A`.
# However, we have no "true negative" annotation, so we use the MISSING annotation instead.
# (Normally, "true negative" is not needed as all applicable annotations must be present,
# but these API graph tests work differently, since having all results recorded in annotations
# would be excessive)
print(Bar) #$ use=moduleImport("foo").getMember("B").getASubclass() MISSING: use=moduleImport("foo").getMember("A").getASubclass()
print(Baz) #$ use=moduleImport("foo").getMember("B").getASubclass() SPURIOUS: use=moduleImport("foo").getMember("A").getASubclass()
class Baz(B): pass
other_func()

View File

@@ -32,5 +32,5 @@ def func1():
def func3(): def func3():
var2 = print #$ use=moduleImport("builtins").getMember("print") var2 = print #$ use=moduleImport("builtins").getMember("print")
def func4(): def func4():
var2() #$ MISSING: use=moduleImport("builtins").getMember("print").getReturn() var2() #$ use=moduleImport("builtins").getMember("print").getReturn()
func4() func4()