mirror of
https://github.com/github/codeql.git
synced 2025-12-20 10:46:30 +01:00
Python: Add support for API graphs
Currently only supports the "use" side of things. For the most part, this follows the corresponding implementation for JavaScript. Major differences include: - No `MkImportUse` nodes -- we just move directly from `MkModuleImport` to its uses. - Paths are no longer labelled by s-expressions, but rather by a string that mirrors how you would access it in QL. This makes it very easy to see how to access an API component -- simply look at its `toString`! This PR also extends `LocalSourceNode` to support looking up attribute references and invocations of such nodes. This was again based on the JavaScript equivalent (though without specific classes for `InvokeNode` and the like, it's a bit more awkward to use).
This commit is contained in:
343
python/ql/src/semmle/python/ApiGraphs.qll
Normal file
343
python/ql/src/semmle/python/ApiGraphs.qll
Normal file
@@ -0,0 +1,343 @@
|
||||
import python
|
||||
import semmle.python.dataflow.new.DataFlow
|
||||
|
||||
module API {
|
||||
class Node extends Impl::TApiNode {
|
||||
/**
|
||||
* Gets a data-flow node corresponding to a use of the API component represented by this node.
|
||||
*
|
||||
* For example, `import re; re.escape` is a use of the `escape` function from the
|
||||
* `re` module, and `import re; re.escape("hello")` is a use of the return of that function.
|
||||
*
|
||||
* This includes indirect uses found via data flow, meaning that in
|
||||
* ```python
|
||||
* def f(x):
|
||||
* pass
|
||||
*
|
||||
* f(obj.foo)
|
||||
* ```
|
||||
* both `obj.foo` and `x` are uses of the `foo` member from `obj`.
|
||||
*/
|
||||
DataFlow::Node getAUse() {
|
||||
exists(DataFlow::LocalSourceNode src | Impl::use(this, src) |
|
||||
Impl::trackUseNode(src).flowsTo(result)
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets an immediate use of the API component represented by this node.
|
||||
*
|
||||
* For example, `import re; re.escape` is a an immediate use of the `escape` member
|
||||
* from the `re` module.
|
||||
*
|
||||
* Unlike `getAUse()`, this predicate only gets the immediate references, not the indirect uses
|
||||
* found via data flow. This means that in `x = re.escape` only `re.escape` is a reference
|
||||
* to the `escape` member of `re`, neither `x` nor any node that `x` flows to is a reference to
|
||||
* this API component.
|
||||
*/
|
||||
DataFlow::LocalSourceNode getAnImmediateUse() { Impl::use(this, result) }
|
||||
|
||||
/**
|
||||
* Gets a call to the function represented by this API component.
|
||||
*/
|
||||
DataFlow::Node getACall() { result = getReturn().getAnImmediateUse() }
|
||||
|
||||
/**
|
||||
* Gets a node representing member `m` of this API component.
|
||||
*
|
||||
* For example, modules have an `exports` member representing their exports, and objects have
|
||||
* their properties as members.
|
||||
*/
|
||||
bindingset[m]
|
||||
bindingset[result]
|
||||
Node getMember(string m) { result = getASuccessor(Label::member(m)) }
|
||||
|
||||
/**
|
||||
* Gets a node representing a member of this API component where the name of the member is
|
||||
* not known statically.
|
||||
*/
|
||||
Node getUnknownMember() { result = getASuccessor(Label::unknownMember()) }
|
||||
|
||||
/**
|
||||
* Gets a node representing a member of this API component where the name of the member may
|
||||
* or may not be known statically.
|
||||
*/
|
||||
Node getAMember() {
|
||||
result = getASuccessor(Label::member(_)) or
|
||||
result = getUnknownMember()
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a node representing the result of the function represented by this node.
|
||||
*
|
||||
* This predicate may have multiple results when there are multiple invocations of this API component.
|
||||
* Consider using `getACall()` if there is a need to distingiush between individual calls.
|
||||
*/
|
||||
Node getReturn() { result = getASuccessor(Label::return()) }
|
||||
|
||||
/**
|
||||
* Gets a string representation of the lexicographically least among all shortest access paths
|
||||
* from the root to this node.
|
||||
*/
|
||||
string getPath() { result = min(string p | p = getAPath(Impl::distanceFromRoot(this)) | p) }
|
||||
|
||||
/**
|
||||
* Gets a node such that there is an edge in the API graph between this node and the other
|
||||
* one, and that edge is labeled with `lbl`.
|
||||
*/
|
||||
Node getASuccessor(string lbl) { Impl::edge(this, lbl, result) }
|
||||
|
||||
/**
|
||||
* Gets a node such that there is an edge in the API graph between that other node and
|
||||
* this one, and that edge is labeled with `lbl`
|
||||
*/
|
||||
Node getAPredecessor(string lbl) { this = result.getASuccessor(lbl) }
|
||||
|
||||
/**
|
||||
* Gets a node such that there is an edge in the API graph between this node and the other
|
||||
* one.
|
||||
*/
|
||||
Node getAPredecessor() { result = getAPredecessor(_) }
|
||||
|
||||
/**
|
||||
* Gets a node such that there is an edge in the API graph between that other node and
|
||||
* this one.
|
||||
*/
|
||||
Node getASuccessor() { result = getASuccessor(_) }
|
||||
|
||||
/**
|
||||
* Gets the data-flow node that gives rise to this node, if any.
|
||||
*/
|
||||
DataFlow::Node getInducingNode() { this = Impl::MkUse(result) }
|
||||
|
||||
/**
|
||||
* Holds if this node is located in file `path` between line `startline`, column `startcol`,
|
||||
* and line `endline`, column `endcol`.
|
||||
*
|
||||
* For nodes that do not have a meaningful location, `path` is the empty string and all other
|
||||
* parameters are zero.
|
||||
*/
|
||||
predicate hasLocationInfo(string path, int startline, int startcol, int endline, int endcol) {
|
||||
getInducingNode().hasLocationInfo(path, startline, startcol, endline, endcol)
|
||||
or
|
||||
not exists(getInducingNode()) and
|
||||
path = "" and
|
||||
startline = 0 and
|
||||
startcol = 0 and
|
||||
endline = 0 and
|
||||
endcol = 0
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a textual representation of this node.
|
||||
*/
|
||||
string toString() {
|
||||
none() // defined in subclasses
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a path of the given `length` from the root to this node.
|
||||
*/
|
||||
private string getAPath(int length) {
|
||||
this instanceof Impl::MkRoot and
|
||||
length = 0 and
|
||||
result = ""
|
||||
or
|
||||
exists(Node pred, string lbl, string predpath |
|
||||
Impl::edge(pred, lbl, this) and
|
||||
lbl != "" and
|
||||
predpath = pred.getAPath(length - 1) and
|
||||
exists(string dot | if length = 1 then dot = "" else dot = "." |
|
||||
result = predpath + dot + lbl and
|
||||
// avoid producing strings longer than 1MB
|
||||
result.length() < 1000 * 1000
|
||||
)
|
||||
) and
|
||||
length in [1 .. Impl::distanceFromRoot(this)]
|
||||
}
|
||||
|
||||
/** Gets the shortest distance from the root to this node in the API graph. */
|
||||
int getDepth() { result = Impl::distanceFromRoot(this) }
|
||||
}
|
||||
|
||||
/** The root node of an API graph. */
|
||||
class Root extends Node, Impl::MkRoot {
|
||||
override string toString() { result = "root" }
|
||||
}
|
||||
|
||||
/** A node corresponding to the use of an API component. */
|
||||
class Use extends Node, Impl::TUse {
|
||||
override string toString() {
|
||||
exists(string type |
|
||||
this = Impl::MkUse(_) and type = "Use "
|
||||
or
|
||||
this = Impl::MkModuleImport(_) and type = "ModuleImport "
|
||||
|
|
||||
result = type + getPath()
|
||||
or
|
||||
not exists(this.getPath()) and result = type + "with no path"
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/** Gets the root node. */
|
||||
Root root() { any() }
|
||||
|
||||
/** Gets a node corresponding to an import of module `m`. */
|
||||
Node moduleImport(string m) { result = Impl::MkModuleImport(m) }
|
||||
|
||||
/**
|
||||
* Provides the actual implementation of API graphs, cached for performance.
|
||||
*
|
||||
* Ideally, we'd like nodes to correspond to (global) access paths, with edge labels
|
||||
* corresponding to extending the access path by one element. We also want to be able to map
|
||||
* nodes to their definitions and uses in the data-flow graph, and this should happen modulo
|
||||
* (inter-procedural) data flow.
|
||||
*
|
||||
* This, however, is not easy to implement, since access paths can have unbounded length
|
||||
* and we need some way of recognizing cycles to avoid non-termination. Unfortunately, expressing
|
||||
* a condition like "this node hasn't been involved in constructing any predecessor of
|
||||
* this node in the API graph" without negative recursion is tricky.
|
||||
*
|
||||
* So instead most nodes are directly associated with a data-flow node, representing
|
||||
* either a use or a definition of an API component. This ensures that we only have a finite
|
||||
* number of nodes. However, we can now have multiple nodes with the same access
|
||||
* path, which are essentially indistinguishable for a client of the API.
|
||||
*
|
||||
* On the other hand, a single node can have multiple access paths (which is, of
|
||||
* course, unavoidable). We pick as canonical the alphabetically least access path with
|
||||
* shortest length.
|
||||
*/
|
||||
cached
|
||||
private module Impl {
|
||||
cached
|
||||
newtype TApiNode =
|
||||
/** The root of the API graph. */
|
||||
MkRoot() or
|
||||
/** An abstract representative for imports of the module called `name`. */
|
||||
MkModuleImport(string name) { imports(_, name) } or
|
||||
/** A use of an API member at the node `nd`. */
|
||||
MkUse(DataFlow::Node nd) { use(_, _, nd) }
|
||||
|
||||
class TUse = MkModuleImport or MkUse;
|
||||
|
||||
/** Holds if `imp` is an import of a module named `name` */
|
||||
private predicate imports(DataFlow::Node imp, string name) { imp = DataFlow::importNode(name) }
|
||||
|
||||
/**
|
||||
* Holds if `ref` is a use of a node that should have an incoming edge from `base` labeled
|
||||
* `lbl` in the API graph.
|
||||
*/
|
||||
cached
|
||||
predicate use(TApiNode base, string lbl, DataFlow::Node ref) {
|
||||
exists(DataFlow::LocalSourceNode src, DataFlow::LocalSourceNode pred |
|
||||
use(base, src) and pred = trackUseNode(src)
|
||||
|
|
||||
lbl = Label::memberFromRef(ref) and
|
||||
ref = pred.getAnAttributeRead()
|
||||
or
|
||||
lbl = Label::return() and
|
||||
ref = pred.getAnInvocation()
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if `ref` is a use of node `nd`.
|
||||
*/
|
||||
cached
|
||||
predicate use(TApiNode nd, DataFlow::Node ref) {
|
||||
exists(string name |
|
||||
nd = MkModuleImport(name) and
|
||||
ref = DataFlow::importNode(name)
|
||||
)
|
||||
or
|
||||
nd = MkUse(ref)
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a data-flow node to which `nd`, which is a use of an API-graph node, flows.
|
||||
*
|
||||
* The flow from `nd` to that node may be inter-procedural.
|
||||
*/
|
||||
private DataFlow::LocalSourceNode trackUseNode(
|
||||
DataFlow::LocalSourceNode src, DataFlow::TypeTracker t
|
||||
) {
|
||||
t.start() and
|
||||
use(_, src) and
|
||||
result = src
|
||||
or
|
||||
// Due to bad performance when using `trackUseNode(t2, attr_name).track(t2, t)`
|
||||
// we have inlined that code and forced a join
|
||||
exists(DataFlow::StepSummary summary |
|
||||
t = trackUseNode_first_join(src, result, summary).append(summary)
|
||||
)
|
||||
}
|
||||
|
||||
pragma[nomagic]
|
||||
private DataFlow::TypeTracker trackUseNode_first_join(
|
||||
DataFlow::LocalSourceNode src, DataFlow::LocalSourceNode res, DataFlow::StepSummary summary
|
||||
) {
|
||||
DataFlow::StepSummary::step(trackUseNode(src, result), res, summary)
|
||||
}
|
||||
|
||||
cached
|
||||
DataFlow::LocalSourceNode trackUseNode(DataFlow::LocalSourceNode src) {
|
||||
result = trackUseNode(src, DataFlow::TypeTracker::end())
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if there is an edge from `pred` to `succ` in the API graph that is labeled with `lbl`.
|
||||
*/
|
||||
cached
|
||||
predicate edge(Node pred, string lbl, Node succ) {
|
||||
/* There's an edge from the root node for each imported module. */
|
||||
exists(string m |
|
||||
pred = MkRoot() and
|
||||
lbl = Label::mod(m)
|
||||
|
|
||||
succ = MkModuleImport(m)
|
||||
)
|
||||
or
|
||||
/* Every node that is a use of an API component is itself added to the API graph. */
|
||||
exists(DataFlow::LocalSourceNode ref |
|
||||
use(pred, lbl, ref) and
|
||||
succ = MkUse(ref)
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if there is an edge from `pred` to `succ` in the API graph.
|
||||
*/
|
||||
private predicate edge(TApiNode pred, TApiNode succ) { edge(pred, _, succ) }
|
||||
|
||||
/** Gets the shortest distance from the root to `nd` in the API graph. */
|
||||
cached
|
||||
int distanceFromRoot(TApiNode nd) = shortestDistances(MkRoot/0, edge/2)(_, nd, result)
|
||||
}
|
||||
}
|
||||
|
||||
private module Label {
|
||||
/** Gets the edge label for the module `m`. */
|
||||
bindingset[m]
|
||||
bindingset[result]
|
||||
string mod(string m) { result = "moduleImport(\"" + m + "\")" }
|
||||
|
||||
/** Gets the `member` edge label for member `m`. */
|
||||
bindingset[m]
|
||||
bindingset[result]
|
||||
string member(string m) { result = "getMember(\"" + m + "\")" }
|
||||
|
||||
/** Gets the `member` edge label for the unknown member. */
|
||||
string unknownMember() { result = "getUnknownMember()" }
|
||||
|
||||
/** Gets the `member` edge label for the given attribute reference. */
|
||||
string memberFromRef(DataFlow::AttrRef pr) {
|
||||
result = member(pr.getAttributeName())
|
||||
or
|
||||
not exists(pr.getAttributeName()) and
|
||||
result = unknownMember()
|
||||
}
|
||||
|
||||
/** Gets the `return` edge label. */
|
||||
string return() { result = "getReturn()" }
|
||||
}
|
||||
@@ -447,7 +447,87 @@ class LocalSourceNode extends Node {
|
||||
|
||||
/** Holds if this `LocalSourceNode` can flow to `nodeTo` in one or more local flow steps. */
|
||||
cached
|
||||
predicate flowsTo(Node nodeTo) { simpleLocalFlowStep*(this, nodeTo) }
|
||||
predicate flowsTo(Node nodeTo) { Cached::hasLocalSource(nodeTo, this) }
|
||||
|
||||
/**
|
||||
* Gets a reference (read or write) of attribute `attrName` on this node.
|
||||
*/
|
||||
AttrRef getAnAttributeReference(string attrName) { Cached::namedAttrRef(this, attrName, result) }
|
||||
|
||||
/**
|
||||
* Gets a read of attribute `attrName` on this node.
|
||||
*/
|
||||
AttrRead getAnAttributeRead(string attrName) { result = getAnAttributeReference(attrName) }
|
||||
|
||||
/**
|
||||
* Gets a reference (read or write) of any attribute on this node.
|
||||
*/
|
||||
AttrRef getAnAttributeReference() {
|
||||
Cached::namedAttrRef(this, _, result)
|
||||
or
|
||||
Cached::dynamicAttrRef(this, result)
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a read of any attribute on this node.
|
||||
*/
|
||||
AttrRead getAnAttributeRead() { result = getAnAttributeReference() }
|
||||
|
||||
/**
|
||||
* Gets an invocation (with our without `new`) of this node.
|
||||
*/
|
||||
Node getAnInvocation() { Cached::invocation(this, result) }
|
||||
}
|
||||
|
||||
cached
|
||||
private module Cached {
|
||||
/**
|
||||
* Holds if `source` is a `LocalSourceNode` that can reach `sink` via local flow steps.
|
||||
*
|
||||
* The slightly backwards parametering ordering is to force correct indexing.
|
||||
*/
|
||||
cached
|
||||
predicate hasLocalSource(Node sink, Node source) {
|
||||
// Declaring `source` to be a `SourceNode` currently causes a redundant check in the
|
||||
// recursive case, so instead we check it explicitly here.
|
||||
source = sink and
|
||||
source instanceof LocalSourceNode
|
||||
or
|
||||
exists(Node mid |
|
||||
hasLocalSource(mid, source) and
|
||||
simpleLocalFlowStep(mid, sink)
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if `base` flows to the base of `ref` and `ref` has attribute name `attr`.
|
||||
*/
|
||||
cached
|
||||
predicate namedAttrRef(LocalSourceNode base, string attr, AttrRef ref) {
|
||||
hasLocalSource(ref.getObject(), base) and
|
||||
ref.getAttributeName() = attr
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if `base` flows to the base of `ref` and `ref` has no known attribute name.
|
||||
*/
|
||||
cached
|
||||
predicate dynamicAttrRef(LocalSourceNode base, AttrRef ref) {
|
||||
hasLocalSource(ref.getObject(), base) and
|
||||
not exists(ref.getAttributeName())
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if `func` flows to the callee of `invoke`.
|
||||
*/
|
||||
cached
|
||||
predicate invocation(LocalSourceNode func, Node invoke) {
|
||||
exists(CfgNode n, CallNode call |
|
||||
invoke.asCfgNode() = call and n.asCfgNode() = call.getFunction()
|
||||
|
|
||||
hasLocalSource(n, func)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user