Files
codeql/python/ql/src/Security/CWE-020-ExternalAPIs/ExternalAPIs.qll
Copilot 4ed5722e3e Python: switch dataflow library to new (shared) CFG + SSA
Flips the Python dataflow trunk from the legacy CFG (semmle/python/Flow.qll)
and legacy ESSA SSA (semmle/python/essa/*) to the new shared CFG facade
(semmle.python.controlflow.internal.Cfg) and the new SSA adapter
(semmle.python.dataflow.new.internal.SsaImpl), both introduced
additively in the preceding PRs in this stack.

This is the trunk-flip equivalent of the original draft PR #21894 (kept
around as documentation), rebased on top of the four preparatory PRs:

  P1: Remove AstNode.getAFlowNode() and rewrite callers (#21919).
  P2: Qualify Flow.qll's AST references with Py:: prefix (#21920).
  P3: Add new shared-CFG-backed control flow graph (#21921).
  P4: Add new shared-SSA-backed SSA adapter (#21923).

The Python dataflow library (semmle/python/dataflow/new/) now imports
the new CFG facade and SSA adapter. All CFG-typed predicates
(ControlFlowNode, CallNode, BasicBlock, NameNode, AttrNode, ...) are
qualified with the Cfg:: prefix; SSA references switch from
EssaVariable/EssaDefinition to SsaImpl::Definition/SourceVariable.

GuardNode is redesigned to use the new CFG's outcome-node model
(isAfterTrue / isAfterFalse) instead of the legacy ConditionBlock +
flipped indirection. Only BarrierGuard<...> is preserved as public
API.

Framework files (Bottle, FastApi, Django, Tornado, Pyramid, Stdlib,
...) are updated to take CFG nodes from the new facade.

A handful of dataflow consistency tweaks for the new CFG:
- Augmented-assignment targets are treated as both load and store.
- 'from X import *' produces uncertain SSA writes for unknown names.
- CFG nodes are canonicalised so dataflow does not see equivalent
  pre/post-order pairs as distinct nodes.

Two AST tweaks for the new CFG:
- AstNodeImpl: omit PEP 695 type-parameter names from
  FunctionDefExpr / ClassDefExpr children.
- ImportResolution: drop the legacy essa import.

Test churn (~175 files): reblessed library- and query-test .expected
files reflect slightly different CFG granularity, different toString
output, and a handful of true alert deltas in security queries.

Verification: all 367 lib + src + consistency-queries compile clean.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-02 14:09:45 +00:00

224 lines
7.8 KiB
Plaintext

/**
* Definitions for reasoning about untrusted data used in APIs defined outside the
* user-written code.
*/
private import python
import semmle.python.dataflow.new.DataFlow
private import semmle.python.dataflow.new.TaintTracking
private import semmle.python.dataflow.new.RemoteFlowSources
private import semmle.python.ApiGraphs
private import semmle.python.dataflow.new.internal.DataFlowPrivate as DataFlowPrivate
private import semmle.python.dataflow.new.internal.TaintTrackingPrivate as TaintTrackingPrivate
private import semmle.python.controlflow.internal.Cfg as Cfg
/**
* An external API that is considered "safe" from a security perspective.
*/
class SafeExternalApi extends Unit {
/**
* Gets a call that is considered "safe" from a security perspective. You can use API
* graphs to find calls to functions you know are safe.
*
* Which works even when the external library isn't extracted.
*/
abstract DataFlow::CallCfgNode getSafeCall();
/**
* Gets a callable that is considered a "safe" external API from a security
* perspective.
*
* You probably want to define this as `none()` and use `getSafeCall` instead, since
* that can handle the external library not being extracted.
*/
DataFlowPrivate::DataFlowCallable getSafeCallable() { none() }
}
/** The default set of "safe" external APIs. */
private class DefaultSafeExternalApi extends SafeExternalApi {
override DataFlow::CallCfgNode getSafeCall() {
result =
API::builtin([
"len", "enumerate", "isinstance", "getattr", "hasattr", "bool", "float", "int", "repr",
"str", "type"
]).getACall()
}
}
/**
* Gets a human readable representation of `node`.
*
* Note that this is only defined for API nodes that are allowed as external APIs,
* so `None.json.dumps` will for example not be allowed.
*/
string apiNodeToStringRepr(API::Node node) {
node = API::builtin(result)
or
node = API::moduleImport(result)
or
exists(API::Node base, string basename |
base.getDepth() < node.getDepth() and
basename = apiNodeToStringRepr(base) and
not base = API::builtin(["None", "True", "False"])
|
exists(string m | node = base.getMember(m) | result = basename + "." + m)
or
node = base.getReturn() and
result = basename + "()" and
not base.getACall() = any(SafeExternalApi safe).getSafeCall()
or
node = base.getAwaited() and
result = basename
)
}
predicate resolvedCall(Cfg::CallNode call) {
DataFlowPrivate::resolveCall(call, _, _) or
DataFlowPrivate::resolveClassCall(call, _)
}
newtype TInterestingExternalApiCall =
TUnresolvedCall(DataFlow::CallCfgNode call) {
exists(call.getLocation().getFile().getRelativePath()) and
not resolvedCall(call.getNode()) and
not call = any(SafeExternalApi safe).getSafeCall()
} or
TResolvedCall(DataFlowPrivate::DataFlowCall call) {
exists(call.getLocation().getFile().getRelativePath()) and
exists(call.getCallable()) and
not call.getCallable() = any(SafeExternalApi safe).getSafeCallable() and
// ignore calls inside codebase, and ignore calls that are marked as safe. This is
// only needed as long as we extract dependencies. When we stop doing that, all
// targets of resolved calls will be from user-written code.
not exists(call.getCallable().getLocation().getFile().getRelativePath()) and
not exists(DataFlow::CallCfgNode callCfgNode | callCfgNode.getNode() = call.getNode() |
any(SafeExternalApi safe).getSafeCall() = callCfgNode
)
}
abstract class InterestingExternalApiCall extends TInterestingExternalApiCall {
/** Gets the argument at position `apos`, if any */
abstract DataFlow::Node getArgument(DataFlowPrivate::ArgumentPosition apos);
/** Gets a textual representation of this element. */
abstract string toString();
/**
* Gets a human-readable name for the external API.
*/
abstract string getApiName();
}
class UnresolvedCall extends InterestingExternalApiCall, TUnresolvedCall {
DataFlow::CallCfgNode call;
UnresolvedCall() { this = TUnresolvedCall(call) }
override DataFlow::Node getArgument(DataFlowPrivate::ArgumentPosition apos) {
exists(int i | apos.isPositional(i) | result = call.getArg(i))
or
exists(string name | apos.isKeyword(name) | result = call.getArgByName(name))
}
override string toString() {
result = "ExternalAPI:UnresolvedCall: " + call.getNode().getNode().toString()
}
override string getApiName() {
exists(API::Node apiNode |
result = apiNodeToStringRepr(apiNode) and
apiNode.getACall() = call
)
}
}
class ResolvedCall extends InterestingExternalApiCall, TResolvedCall {
DataFlowPrivate::DataFlowCall dfCall;
ResolvedCall() { this = TResolvedCall(dfCall) }
override DataFlow::Node getArgument(DataFlowPrivate::ArgumentPosition apos) {
result = dfCall.getArgument(apos)
}
override string toString() {
result = "ExternalAPI:ResolvedCall: " + dfCall.getNode().getNode().toString()
}
override string getApiName() {
exists(DataFlow::CallCfgNode call, API::Node apiNode | dfCall.getNode() = call.getNode() |
result = apiNodeToStringRepr(apiNode) and
apiNode.getACall() = call
)
}
}
/** A node representing data being passed to an external API through a call. */
class ExternalApiDataNode extends DataFlow::Node {
ExternalApiDataNode() {
exists(InterestingExternalApiCall call | this = call.getArgument(_)) and
// Not already modeled as a taint step
not TaintTrackingPrivate::defaultAdditionalTaintStep(this, _, _) and
// for `list.append(x)`, we have a additional taint step from x -> [post] list.
// Since we have modeled this explicitly, I don't see any cases where we would want to report this.
not exists(DataFlow::PostUpdateNode post |
post.getPreUpdateNode() = this and
TaintTrackingPrivate::defaultAdditionalTaintStep(_, post, _)
)
}
}
private module UntrustedDataToExternalApiConfig implements DataFlow::ConfigSig {
predicate isSource(DataFlow::Node source) { source instanceof RemoteFlowSource }
predicate isSink(DataFlow::Node sink) { sink instanceof ExternalApiDataNode }
predicate observeDiffInformedIncrementalMode() {
none() // Not used for PR analysis
}
}
/** Global taint-tracking from `RemoteFlowSource`s to `ExternalApiDataNode`s. */
module UntrustedDataToExternalApiFlow = TaintTracking::Global<UntrustedDataToExternalApiConfig>;
/** A node representing untrusted data being passed to an external API. */
class UntrustedExternalApiDataNode extends ExternalApiDataNode {
UntrustedExternalApiDataNode() { UntrustedDataToExternalApiFlow::flowTo(this) }
/** Gets a source of untrusted data which is passed to this external API data node. */
DataFlow::Node getAnUntrustedSource() { UntrustedDataToExternalApiFlow::flow(result, this) }
}
/** An external API which is used with untrusted data. */
private newtype TExternalApi =
MkExternalApi(string repr, DataFlowPrivate::ArgumentPosition apos) {
exists(UntrustedExternalApiDataNode ex, InterestingExternalApiCall call |
ex = call.getArgument(apos) and
repr = call.getApiName()
)
}
/** A argument of an external API which is used with untrusted data. */
class ExternalApiUsedWithUntrustedData extends MkExternalApi {
string repr;
DataFlowPrivate::ArgumentPosition apos;
ExternalApiUsedWithUntrustedData() { this = MkExternalApi(repr, apos) }
/** Gets a possibly untrusted use of this external API. */
UntrustedExternalApiDataNode getUntrustedDataNode() {
exists(InterestingExternalApiCall call |
result = call.getArgument(apos) and
call.getApiName() = repr
)
}
/** Gets the number of untrusted sources used with this external API. */
int getNumberOfUntrustedSources() {
result = count(this.getUntrustedDataNode().getAnUntrustedSource())
}
/** Gets a textual representation of this element. */
string toString() { result = repr + " [" + apos + "]" }
}