Merge pull request #10783 from yoff/python/subscript-nodes

Python: API graph improvements for subscripts
This commit is contained in:
Taus
2022-10-17 15:21:56 +02:00
committed by GitHub
11 changed files with 187 additions and 103 deletions

View File

@@ -0,0 +1,7 @@
---
category: minorAnalysis
---
* Fixed labels in the API graph pertaining to definitions of subscripts. Previously, these were found by `getMember` rather than `getASubscript`.
* Added edges for indices of subscripts to the API graph. Now a subscripted API node will have an edge to the API node for the index expression. So if `foo` is matched by API node `A`, then `"key"` in `foo["key"]` will be matched by the API node `A.getIndex()`. This can be used to track the origin of the index.
* Added member predicate `getSubscriptAt(API::Node index)` to `API::Node`. Like `getASubscript()`, this will return an API node that matches a subscript of the node, but here it will be restricted to subscripts where the index matches the `index` parameter.
* Added convenience predicate `getSubscript("key")` to obtain a subscript at a specific index, when the index happens to be a statically known string.

View File

@@ -249,6 +249,60 @@ module API {
*/
Node getASubscript() { result = this.getASuccessor(Label::subscript()) }
/**
* Gets a node representing an index of a subscript of this node.
* For example, in `obj[x]`, `x` is an index of `obj`.
*
* The result is any successor of this node along an edge labelled
* `Label::index()`.
*/
Node getIndex() { result = this.getASuccessor(Label::index()) }
/**
* Gets a node representing a subscript of this node at (string) index `key`.
* This requires that the index can be statically determined.
*
* `key` is the text of a string constant (`PY::StrConst`) found among the
* values returned by `getAValueReachingSink()` on the index node, so the
* string does not have to be written literally inside the brackets.
*
* For example, the subscripts of `a` and `b` below would be found using
* the index `foo`:
* ```py
* a["foo"]
* x = "foo" if cond else "bar"
* b[x]
* ```
*/
Node getSubscript(string key) {
// Pair the subscript with the API node of its index via `getSubscriptAt` ...
exists(API::Node index | result = this.getSubscriptAt(index) |
// ... and keep only indices for which a string constant with text `key` is found.
key = index.getAValueReachingSink().asExpr().(PY::StrConst).getText()
)
}
/**
* Gets a node representing a subscript of this node at index `index`.
*
* `result` is always one of `this.getASubscript()` and `index` is always one of
* `this.getIndex()`; the remaining conditions tie the two together so that they
* belong to the same subscript expression or the same dictionary item:
*
* - reading: `result` is the source node of a subscript expression whose index
*   is the sink of `index`.
* - writing: `result` is the sink node of the value assigned by a subscript
*   definition whose index is the sink of `index`.
* - dictionary literals: `index` is the sink node of an item's key and `result`
*   is the sink node of that same item's value.
*/
Node getSubscriptAt(API::Node index) {
result = this.getASubscript() and
index = this.getIndex() and
(
// subscripting: a subscript CFG node whose object is a value of this node
// and whose index expression is the sink of `index`
exists(PY::SubscriptNode subscript |
subscript.getObject() = this.getAValueReachableFromSource().asCfgNode() and
subscript.getIndex() = index.asSink().asCfgNode()
|
// reading: the subscript expression itself is the source for `result`
subscript = result.asSource().asCfgNode()
or
// writing: the value stored by the subscript definition is the sink for `result`
subscript.(PY::DefinitionNode).getValue() = result.asSink().asCfgNode()
)
or
// dictionary literals: match a key/value pair of a dict that is a value of this node
exists(PY::Dict dict, PY::KeyValuePair item |
dict = this.getAValueReachingSink().asExpr() and
dict.getItem(_) = item and
item.getKey() = index.asSink().asExpr()
|
item.getValue() = result.asSink().asExpr()
)
)
}
/**
* Gets a string representation of the lexicographically least among all shortest access paths
* from the root to this node.
@@ -405,7 +459,7 @@ module API {
Node builtin(string n) { result = moduleImport("builtins").getMember(n) }
/**
* An `CallCfgNode` that is connected to the API graph.
* A `CallCfgNode` that is connected to the API graph.
*
* Can be used to reason about calls to an external API in which the correlation between
* parameters and/or return values must be retained.
@@ -694,12 +748,31 @@ module API {
rhs = aw.getValue()
)
or
// TODO: I had expected `DataFlow::AttrWrite` to contain the attribute writes from a dict, that's how JS works.
// dictionary literals
exists(PY::Dict dict, PY::KeyValuePair item |
dict = pred.(DataFlow::ExprNode).getNode().getNode() and
dict.getItem(_) = item and
lbl = Label::member(item.getKey().(PY::StrConst).getS()) and
rhs.(DataFlow::ExprNode).getNode().getNode() = item.getValue()
dict.getItem(_) = item
|
// from `x` to `{ "key": x }`
// TODO: once convenient, this should be done at a higher level than the AST,
// at least at the CFG layer, to take splitting into account.
rhs.(DataFlow::ExprNode).getNode().getNode() = item.getValue() and
lbl = Label::subscript()
or
// from `"key"` to `{ "key": x }`
// TODO: once convenient, this should be done at a higher level than the AST,
// at least at the CFG layer, to take splitting into account.
rhs.(DataFlow::ExprNode).getNode().getNode() = item.getKey() and
lbl = Label::index()
)
or
// list literals, from `x` to `[x]`
// TODO: once convenient, this should be done at a higher level than the AST,
// at least at the CFG layer, to take splitting into account.
// Also consider `SequenceNode` for generality.
exists(PY::List list | list = pred.(DataFlow::ExprNode).getNode().getNode() |
rhs.(DataFlow::ExprNode).getNode().getNode() = list.getAnElt() and
lbl = Label::subscript()
)
or
exists(PY::CallableExpr fn | fn = pred.(DataFlow::ExprNode).getNode().getNode() |
@@ -720,6 +793,20 @@ module API {
lbl = Label::memberFromRef(aw)
)
or
// subscripting
exists(DataFlow::LocalSourceNode src, DataFlow::Node subscript, DataFlow::Node index |
use(base, src) and
subscript = trackUseNode(src).getSubscript(index)
|
// from `x` to a definition of `x[...]`
rhs.asCfgNode() = subscript.asCfgNode().(PY::DefinitionNode).getValue() and
lbl = Label::subscript()
or
// from `x` to `"key"` in `x["key"]`
rhs = index and
lbl = Label::index()
)
or
exists(EntryPoint entry |
base = root() and
lbl = Label::entryPoint(entry) and
@@ -757,7 +844,8 @@ module API {
or
// Subscripting a node that is a use of `base`
lbl = Label::subscript() and
ref = pred.getASubscript()
ref = pred.getSubscript(_) and
ref.asCfgNode().isLoad()
or
// Subclassing a node
lbl = Label::subclass() and
@@ -973,8 +1061,7 @@ module API {
member = any(DataFlow::AttrRef pr).getAttributeName() or
exists(Builtins::likelyBuiltin(member)) or
ImportStar::namePossiblyDefinedInImportStar(_, member, _) or
Impl::prefix_member(_, member, _) or
member = any(PY::Dict d).getAnItem().(PY::KeyValuePair).getKey().(PY::StrConst).getS()
Impl::prefix_member(_, member, _)
} or
MkLabelUnknownMember() or
MkLabelParameter(int i) {
@@ -992,6 +1079,7 @@ module API {
MkLabelSubclass() or
MkLabelAwait() or
MkLabelSubscript() or
MkLabelIndex() or
MkLabelEntryPoint(EntryPoint ep)
/** A label for a module. */
@@ -1072,6 +1160,11 @@ module API {
override string toString() { result = "getASubscript()" }
}
/**
* A label that gets the index of a subscript.
*
* Its string representation is `getIndex()`.
*/
class LabelIndex extends ApiLabel, MkLabelIndex {
override string toString() { result = "getIndex()" }
}
/** A label for entry points. */
class LabelEntryPoint extends ApiLabel, MkLabelEntryPoint {
private EntryPoint entry;
@@ -1120,6 +1213,9 @@ module API {
/** Gets the `subscript` edge label. */
LabelSubscript subscript() { any() }
/** Gets the `index` edge label. */
LabelIndex index() { any() }
/** Gets the label going from the root node to the nodes associated with the given entry point. */
LabelEntryPoint entryPoint(EntryPoint ep) { result = MkLabelEntryPoint(ep) }
}

View File

@@ -104,7 +104,7 @@ class LocalSourceNode extends Node {
/**
* Gets a subscript of this node.
*/
Node getASubscript() { Cached::subscript(this, result) }
Node getSubscript(Node index) { Cached::subscript(this, result, index) }
/**
* Gets a call to the method `methodName` on this node.
@@ -249,13 +249,14 @@ private module Cached {
}
/**
* Holds if `node` flows to a sequence/mapping of which `subscript` is a subscript.
* Holds if `node` flows to a sequence/mapping of which `subscript` is a subscript with index/key `index`.
*/
cached
predicate subscript(LocalSourceNode node, CfgNode subscript) {
predicate subscript(LocalSourceNode node, CfgNode subscript, CfgNode index) {
exists(CfgNode seq, SubscriptNode subscriptNode | subscriptNode = subscript.getNode() |
node.flowsTo(seq) and
seq.getNode() = subscriptNode.getObject()
seq.getNode() = subscriptNode.getObject() and
index.getNode() = subscriptNode.getIndex()
)
}
}

View File

@@ -621,15 +621,12 @@ module AiohttpWebModel {
DataFlow::Node value;
AiohttpResponseCookieSubscriptWrite() {
exists(SubscriptNode subscript |
exists(API::Node i |
value = aiohttpResponseInstance().getMember("cookies").getSubscriptAt(i).asSink() and
index = i.asSink() and
// To give `this` a value, we need to choose between either LHS or RHS,
// and just go with the LHS
this.asCfgNode() = subscript
|
subscript.getObject() =
aiohttpResponseInstance().getMember("cookies").getAValueReachableFromSource().asCfgNode() and
value.asCfgNode() = subscript.(DefinitionNode).getValue() and
index.asCfgNode() = subscript.getIndex()
// and just go with the RHS as it is readily available
this = value
)
}

View File

@@ -91,14 +91,10 @@ private module ExperimentalPrivateDjango {
result = baseClassRef().getReturn().getAMember()
}
/** Gets a reference to a header instance call with `__setitem__`. */
API::Node headerSetItem() {
result = headerInstance() and
result.asSource().(DataFlow::AttrRead).getAttributeName() = "__setitem__"
}
class DjangoResponseSetItemCall extends DataFlow::CallCfgNode, HeaderDeclaration::Range {
DjangoResponseSetItemCall() { this = headerSetItem().getACall() }
DjangoResponseSetItemCall() {
this = baseClassRef().getReturn().getMember("__setitem__").getACall()
}
override DataFlow::Node getNameArg() { result = this.getArg(0) }
@@ -109,8 +105,7 @@ private module ExperimentalPrivateDjango {
DataFlow::Node headerInput;
DjangoResponseDefinition() {
this.asCfgNode().(DefinitionNode) =
headerInstance().getAValueReachableFromSource().asCfgNode() and
headerInput = headerInstance().asSink() and
headerInput.asCfgNode() = this.asCfgNode().(DefinitionNode).getValue()
}

View File

@@ -26,7 +26,7 @@ private module Sendgrid {
}
/** Gets a reference to a `SendGridAPIClient` instance call with `send` or `post`. */
private DataFlow::CallCfgNode sendgridApiSendCall() {
private API::CallNode sendgridApiSendCall() {
result = sendgridApiClient().getMember("send").getACall()
or
result =
@@ -62,7 +62,7 @@ private module Sendgrid {
* * `getFrom()`'s result would be `"from@example.com"`.
* * `getSubject()`'s result would be `"Sending with SendGrid is Fun"`.
*/
private class SendGridMail extends DataFlow::CallCfgNode, EmailSender::Range {
private class SendGridMail extends API::CallNode, EmailSender::Range {
SendGridMail() { this = sendgridApiSendCall() }
private DataFlow::CallCfgNode getMailCall() {
@@ -118,40 +118,28 @@ private module Sendgrid {
or
result = this.sendgridWrite("html_content")
or
exists(KeyValuePair content, Dict generalDict, KeyValuePair typePair, KeyValuePair valuePair |
content.getKey().(StrConst).getText() = "content" and
content.getValue().(List).getAnElt() = generalDict and
// declare KeyValuePairs keys and values
typePair.getKey().(StrConst).getText() = "type" and
typePair.getValue().(StrConst).getText() = ["text/html", "text/x-amp-html"] and
valuePair.getKey().(StrConst).getText() = "value" and
result.asExpr() = valuePair.getValue() and
// correlate generalDict with previously set KeyValuePairs
generalDict.getAnItem() in [typePair, valuePair] and
[this.getArg(0), this.getArgByName("request_body")].getALocalSource().asExpr() =
any(Dict d | d.getAnItem() = content)
exists(API::Node contentElement |
contentElement =
this.getKeywordParameter("request_body").getSubscript("content").getASubscript()
|
contentElement.getSubscript("type").getAValueReachingSink().asExpr().(StrConst).getText() =
["text/html", "text/x-amp-html"] and
result = contentElement.getSubscript("value").getAValueReachingSink()
)
or
exists(KeyValuePair footer, Dict generalDict, KeyValuePair enablePair, KeyValuePair htmlPair |
footer.getKey().(StrConst).getText() = ["footer", "subscription_tracking"] and
footer.getValue() = generalDict and
// check footer is enabled
enablePair.getKey().(StrConst).getText() = "enable" and
exists(enablePair.getValue().(True)) and
// get html content
htmlPair.getKey().(StrConst).getText() = "html" and
result.asExpr() = htmlPair.getValue() and
// correlate generalDict with previously set KeyValuePairs
generalDict.getAnItem() in [enablePair, htmlPair] and
exists(KeyValuePair k |
k.getKey() =
[this.getArg(0), this.getArgByName("request_body")]
.getALocalSource()
.asExpr()
.(Dict)
.getAKey() and
k.getValue() = any(Dict d | d.getAKey() = footer.getKey())
)
exists(API::Node html |
html =
this.getKeywordParameter("request_body")
.getSubscript("tracking_settings")
.getSubscript("subscription_tracking")
or
html =
this.getKeywordParameter("request_body")
.getSubscript("mail_settings")
.getSubscript("footer")
|
html.getSubscript("enable").getAValueReachingSink().asExpr() instanceof True and
result = html.getSubscript("html").getAValueReachingSink()
)
}

View File

@@ -101,33 +101,6 @@ module SmtpLib {
)
}
/**
* Gets a message subscript write by correlating subscript's object local source with
* `smtp`'s `sendmail` call 3rd argument's local source.
*
* Given the following example with `getSMTPSubscriptByIndex(any(SmtpLibSendMail s), "Subject")`:
*
* ```py
* message = MIMEMultipart("alternative")
* message["Subject"] = "multipart test"
* server.sendmail(sender_email, receiver_email, message.as_string())
* ```
*
* * `def` would be `message["Subject"]` (`DefinitionNode`)
* * `sub` would be `message["Subject"]` (`Subscript`)
* * `result` would be `"multipart test"`
*/
private DataFlow::Node getSmtpSubscriptByIndex(DataFlow::CallCfgNode sendCall, string index) {
exists(DefinitionNode def, Subscript sub |
sub = def.getNode() and
DataFlow::exprNode(sub.getObject()).getALocalSource() =
[sendCall.getArg(2), sendCall.getArg(2).(DataFlow::MethodCallNode).getObject()]
.getALocalSource() and
sub.getIndex().(StrConst).getText() = index and
result.asCfgNode() = def.getValue()
)
}
/**
* Gets a reference to `smtplib.SMTP_SSL().sendmail()`.
*
@@ -153,7 +126,7 @@ module SmtpLib {
* * `getFrom()`'s result would be `sender_email`.
* * `getSubject()`'s result would be `"multipart test"`.
*/
private class SmtpLibSendMail extends DataFlow::CallCfgNode, EmailSender::Range {
private class SmtpLibSendMail extends API::CallNode, EmailSender::Range {
SmtpLibSendMail() {
this = smtpConnectionInstance().getReturn().getMember("sendmail").getACall()
}
@@ -163,15 +136,24 @@ module SmtpLib {
override DataFlow::Node getHtmlBody() { result = getSmtpMessage(this, "html") }
override DataFlow::Node getTo() {
result in [this.getArg(1), getSmtpSubscriptByIndex(this, "To")]
result = this.getParameter(1, "to_addrs").asSink()
or
result = this.getMsg().getSubscript("To").asSink()
}
override DataFlow::Node getFrom() {
result in [this.getArg(0), getSmtpSubscriptByIndex(this, "From")]
result = this.getParameter(0, "from_addr").asSink()
or
result = this.getMsg().getSubscript("From").asSink()
}
override DataFlow::Node getSubject() {
result in [this.getArg(2), getSmtpSubscriptByIndex(this, "Subject")]
override DataFlow::Node getSubject() { result = this.getMsg().getSubscript("Subject").asSink() }
private API::Node getMsg() {
result.getAValueReachableFromSource() = this.getParameter(2, "msg").asSink()
or
result.getMember("as_string").getReturn().getAValueReachableFromSource() =
this.getParameter(2, "msg").asSink()
}
}
}

View File

@@ -5,12 +5,12 @@ def callback(x): #$ use=moduleImport("mypkg").getMember("foo").getMember("bar").
foo.bar(callback) #$ def=moduleImport("mypkg").getMember("foo").getMember("bar").getParameter(0) use=moduleImport("mypkg").getMember("foo").getMember("bar").getReturn()
def callback2(x): #$ use=moduleImport("mypkg").getMember("foo").getMember("baz").getParameter(0).getMember("c").getParameter(0)
x.baz2() #$ use=moduleImport("mypkg").getMember("foo").getMember("baz").getParameter(0).getMember("c").getParameter(0).getMember("baz2").getReturn()
def callback2(x): #$ use=moduleImport("mypkg").getMember("foo").getMember("baz").getParameter(0).getASubscript().getParameter(0)
x.baz2() #$ use=moduleImport("mypkg").getMember("foo").getMember("baz").getParameter(0).getASubscript().getParameter(0).getMember("baz2").getReturn()
mydict = {
"c": callback2, #$ def=moduleImport("mypkg").getMember("foo").getMember("baz").getParameter(0).getMember("c")
"other": "whatever" #$ def=moduleImport("mypkg").getMember("foo").getMember("baz").getParameter(0).getMember("other")
"c": callback2, #$ def=moduleImport("mypkg").getMember("foo").getMember("baz").getParameter(0).getASubscript()
"other": "whatever" #$ def=moduleImport("mypkg").getMember("foo").getMember("baz").getParameter(0).getASubscript()
}
foo.baz(mydict) #$ def=moduleImport("mypkg").getMember("foo").getMember("baz").getParameter(0) use=moduleImport("mypkg").getMember("foo").getMember("baz").getReturn()
@@ -34,11 +34,11 @@ otherDict.fourth = callback4
foo.quack(otherDict.fourth) #$ def=moduleImport("mypkg").getMember("foo").getMember("quack").getParameter(0) use=moduleImport("mypkg").getMember("foo").getMember("quack").getReturn()
def namedCallback(myName, otherName):
# Using named parameters:
def namedCallback(myName, otherName):
# Using named parameters:
myName() #$ use=moduleImport("mypkg").getMember("foo").getMember("blob").getParameter(0).getKeywordParameter("myName").getReturn()
otherName() #$ use=moduleImport("mypkg").getMember("foo").getMember("blob").getParameter(0).getKeywordParameter("otherName").getReturn()
# Using numbered parameters:
# Using numbered parameters:
myName() #$ use=moduleImport("mypkg").getMember("foo").getMember("blob").getParameter(0).getParameter(0).getReturn()
otherName() #$ use=moduleImport("mypkg").getMember("foo").getMember("blob").getParameter(0).getParameter(1).getReturn()
@@ -58,4 +58,4 @@ recursiveDict.callback = recusisionCallback;
recursiveDict.rec1 = recursiveDict;
recursiveDict.rec2 = recursiveDict;
foo.rec(recursiveDict); #$ def=moduleImport("mypkg").getMember("foo").getMember("rec").getParameter(0)
foo.rec(recursiveDict); #$ def=moduleImport("mypkg").getMember("foo").getMember("rec").getParameter(0)

View File

@@ -0,0 +1,6 @@
| test_subscript.py:4:11:4:28 | Use moduleImport("mypkg").getMember("foo").getReturn().getASubscript() |
| test_subscript.py:5:26:5:27 | Def moduleImport("mypkg").getMember("foo").getReturn().getASubscript() |
| test_subscript.py:6:5:6:22 | Use moduleImport("mypkg").getMember("foo").getReturn().getASubscript() |
| test_subscript.py:6:5:6:28 | Def moduleImport("mypkg").getMember("foo").getReturn().getASubscript() |
| test_subscript.py:7:5:7:22 | Use moduleImport("mypkg").getMember("foo").getReturn().getASubscript() |
| test_subscript.py:7:5:7:28 | Def moduleImport("mypkg").getMember("foo").getReturn().getASubscript() |

View File

@@ -0,0 +1,4 @@
import python
import semmle.python.ApiGraphs
select API::moduleImport("mypkg").getMember("foo").getReturn().getSubscript(["bar", "baz", "qux"])

View File

@@ -0,0 +1,8 @@
import mypkg
def test_subscript():
bar = mypkg.foo()["bar"] #$ use=moduleImport("mypkg").getMember("foo").getReturn().getASubscript()
mypkg.foo()["baz"] = 42 #$ def=moduleImport("mypkg").getMember("foo").getReturn().getASubscript()
mypkg.foo()["qux"] += 42 #$ use=moduleImport("mypkg").getMember("foo").getReturn().getASubscript()
mypkg.foo()["qux"] += 42 #$ def=moduleImport("mypkg").getMember("foo").getReturn().getASubscript()
mypkg.foo()[mypkg.index] = mypkg.value #$ def=moduleImport("mypkg").getMember("foo").getReturn().getASubscript()