Python: Untrusted data used in external APIs

A port of the one for Java that was added in https://github.com/github/codeql/pull/3938
2025-12-20 10:46:30 +01:00 · 2020-11-26 11:34:39 +01:00
parent 7e3dbb0149
commit 9e4910f863
12 changed files with 435 additions and 0 deletions
--- a/python/ql/src/Security/CWE-020-ExternalAPIs/ExternalAPISinkExample.py
+++ b/python/ql/src/Security/CWE-020-ExternalAPIs/ExternalAPISinkExample.py
@@ -0,0 +1,7 @@
+from flask import Flask, request, make_response
+app = Flask(__name__)
+
+@app.route("/xss")
+def xss():
+    username = request.args.get("username")
+    return make_response("Hello {}".format(username))
--- a/python/ql/src/Security/CWE-020-ExternalAPIs/ExternalAPITaintStepExample.py
+++ b/python/ql/src/Security/CWE-020-ExternalAPIs/ExternalAPITaintStepExample.py
@@ -0,0 +1,12 @@
+import base64
+import pickle
+
+from flask import Flask, request, make_response
+app = Flask(__name__)
+
+@app.route("/example")
+def profile():
+    raw_data = request.args.get("data").encode('utf-8')
+    data = base64.decodebytes(raw_data)
+    obj = pickle.loads(data)
+    ...
--- a/python/ql/src/Security/CWE-020-ExternalAPIs/ExternalAPIs.qll
+++ b/python/ql/src/Security/CWE-020-ExternalAPIs/ExternalAPIs.qll
@@ -0,0 +1,196 @@
+/**
+ * Definitions for reasoning about untrusted data used in APIs defined outside the
+ * database.
+ */
+
+import python
+import semmle.python.dataflow.new.DataFlow
+import semmle.python.dataflow.new.TaintTracking
+import semmle.python.Concepts
+import semmle.python.dataflow.new.RemoteFlowSources
+private import semmle.python.dataflow.new.internal.DataFlowPrivate as DataFlowPrivate
+private import semmle.python.dataflow.new.internal.TaintTrackingPrivate as TaintTrackingPrivate
+private import semmle.python.types.Builtins
+private import semmle.python.objects.ObjectInternal
+
+// IMPLEMENTATION NOTES:
+//
+// This query uses *both* the new data-flow library, and points-to. Why? To get this
+// finished quickly, so it can provide value for our field team and ourselves.
+//
+// In the long run, it should not need to use points-to for anything. Possibly this can
+// even be helpful in figuring out what we need from TypeTrackers and the new data-flow
+// library to be fully operational.
+//
+// At least it will allow us to provide a baseline comparison against a solution that
+// doesn't use points-to at all
+//
+// There is a few dirty things we do here:
+// 1. DataFlowPrivate: since `DataFlowCall` and `DataFlowCallable` are not exposed
+//    publicly, but we really want access to them.
+// 2. points-to: we kinda need to do this since this is what powers `DataFlowCall` and
+//    `DataFlowCallable`
+// 3. ObjectInternal: to provide better names for built-in functions and methods. If we
+//    really wanted to polish our points-to implementation, we could move this
+//    functionality into `BuiltinFunctionValue` and `BuiltinMethodValue`, but will
+//    probably require some more work: for this query, it's totally ok to use
+//    `builtins.open` for the code `open(f)`, but well, it requires a bit of thinking to
+//    figure out if that is desireable in general. I simply skipped a corner here!
+// 4. TaintTrackingPrivate: Nothing else gives us access to `defaultAdditionalTaintStep` :(
+/**
+ * A callable that is considered a "safe" external API from a security perspective.
+ */
+class SafeExternalAPI extends Unit {
+  abstract DataFlowPrivate::DataFlowCallable getSafeCallable();
+}
+
+/** The default set of "safe" external APIs. */
+private class DefaultSafeExternalAPI extends SafeExternalAPI {
+  override DataFlowPrivate::DataFlowCallable getSafeCallable() {
+    exists(CallableValue cv | cv = result.getCallableValue() |
+      cv = Value::named(["len", "isinstance", "getattr", "hasattr"])
+      or
+      exists(ClassValue cls, string attr |
+        cls = Value::named("dict") and attr in ["__getitem__", "__setitem__"]
+      |
+        cls.lookup(attr) = cv
+      )
+    )
+  }
+}
+
+/** A node representing data being passed to an external API through a call. */
+class ExternalAPIDataNode extends DataFlow::Node {
+  DataFlowPrivate::DataFlowCall call;
+  DataFlowPrivate::DataFlowCallable callable;
+  int i;
+
+  ExternalAPIDataNode() {
+    exists(call.getLocation().getFile().getRelativePath()) and
+    callable = call.getCallable() and
+    not any(SafeExternalAPI safe).getSafeCallable() = callable and
+    exists(Value cv | cv = callable.getCallableValue() |
+      cv.isAbsent()
+      or
+      cv.isBuiltin()
+      or
+      cv.(CallableValue).getScope().getLocation().getFile().inStdlib()
+      or
+      not exists(cv.(CallableValue).getScope().getLocation().getFile().getRelativePath())
+    ) and
+    // TODO: this ignores some complexity of keyword arguments (especially keyword-only args)
+    this = call.getArg(i) and
+    // Not already modeled as a taint step
+    not exists(DataFlow::Node next | TaintTrackingPrivate::defaultAdditionalTaintStep(this, next)) and
+    // for `list.append(x)`, we have a additional taint step from x -> [post] list.
+    // Since we have modeled this explicitly, I don't see any cases where we would want to report this.
+    not exists(DataFlow::Node prev, DataFlow::PostUpdateNode post |
+      post.getPreUpdateNode() = this and
+      TaintTrackingPrivate::defaultAdditionalTaintStep(prev, post)
+    )
+  }
+
+  /** Gets the index for the parameter that will receive this untrusted data */
+  int getIndex() { result = i }
+
+  /** Gets the callable to which this argument is passed. */
+  DataFlowPrivate::DataFlowCallable getCallable() { result = callable }
+}
+
+/** A configuration for tracking flow from `RemoteFlowSource`s to `ExternalAPIDataNode`s. */
+class UntrustedDataToExternalAPIConfig extends TaintTracking::Configuration {
+  UntrustedDataToExternalAPIConfig() { this = "UntrustedDataToExternalAPIConfig" }
+
+  override predicate isSource(DataFlow::Node source) { source instanceof RemoteFlowSource }
+
+  override predicate isSink(DataFlow::Node sink) { sink instanceof ExternalAPIDataNode }
+}
+
+/** A node representing untrusted data being passed to an external API. */
+class UntrustedExternalAPIDataNode extends ExternalAPIDataNode {
+  UntrustedExternalAPIDataNode() { any(UntrustedDataToExternalAPIConfig c).hasFlow(_, this) }
+
+  /** Gets a source of untrusted data which is passed to this external API data node. */
+  DataFlow::Node getAnUntrustedSource() {
+    any(UntrustedDataToExternalAPIConfig c).hasFlow(result, this)
+  }
+}
+
+private newtype TExternalAPI =
+  TExternalAPIParameter(DataFlowPrivate::DataFlowCallable callable, int index) {
+    exists(UntrustedExternalAPIDataNode n |
+      callable = n.getCallable() and
+      index = n.getIndex()
+    )
+  }
+
+/** An external API which is used with untrusted data. */
+class ExternalAPIUsedWithUntrustedData extends TExternalAPI {
+  /** Gets a possibly untrusted use of this external API. */
+  UntrustedExternalAPIDataNode getUntrustedDataNode() {
+    this = TExternalAPIParameter(result.getCallable(), result.getIndex())
+  }
+
+  /** Gets the number of untrusted sources used with this external API. */
+  int getNumberOfUntrustedSources() {
+    result = count(getUntrustedDataNode().getAnUntrustedSource())
+  }
+
+  /** Gets a textual representation of this element. */
+  string toString() {
+    exists(
+      DataFlowPrivate::DataFlowCallable callable, int index, string callableString,
+      string indexString
+    |
+      this = TExternalAPIParameter(callable, index) and
+      indexString = "param " + index and
+      exists(CallableValue cv | cv = callable.getCallableValue() |
+        callableString =
+          cv.getScope().getEnclosingModule().getName() + "." + cv.getScope().getQualifiedName()
+        or
+        not exists(cv.getScope()) and
+        (
+          cv instanceof BuiltinFunctionValue and
+          callableString = pretty_builtin_function_value(cv)
+          or
+          cv instanceof BuiltinMethodValue and
+          callableString = pretty_builtin_method_value(cv)
+          or
+          not cv instanceof BuiltinFunctionValue and
+          not cv instanceof BuiltinMethodValue and
+          callableString = cv.toString()
+        )
+      ) and
+      result = callableString + " [" + indexString + "]"
+    )
+  }
+}
+
+/** Gets the fully qualified name for the `BuiltinFunctionValue` bfv. */
+private string pretty_builtin_function_value(BuiltinFunctionValue bfv) {
+  exists(Builtin b | b = bfv.(BuiltinFunctionObjectInternal).getBuiltin() |
+    result = prefix_with_module_if_found(b)
+  )
+}
+
+/** Gets the fully qualified name for the `BuiltinMethodValue` bmv. */
+private string pretty_builtin_method_value(BuiltinMethodValue bmv) {
+  exists(Builtin b | b = bmv.(BuiltinMethodObjectInternal).getBuiltin() |
+    exists(Builtin cls | cls.isClass() and cls.getMember(b.getName()) = b |
+      result = prefix_with_module_if_found(cls) + "." + b.getName()
+    )
+    or
+    not exists(Builtin cls | cls.isClass() and cls.getMember(b.getName()) = b) and
+    result = b.getName()
+  )
+}
+
+/** Helper predicate that tries to adds module qualifier to `b`. Will succeed even if module not found. */
+private string prefix_with_module_if_found(Builtin b) {
+  exists(Builtin mod | mod.isModule() and mod.getMember(b.getName()) = b |
+    result = mod.getName() + "." + b.getName()
+  )
+  or
+  not exists(Builtin mod | mod.isModule() and mod.getMember(b.getName()) = b) and
+  result = b.getName()
+}
--- a/python/ql/src/Security/CWE-020-ExternalAPIs/ExternalAPIsUsedWithUntrustedData.qhelp
+++ b/python/ql/src/Security/CWE-020-ExternalAPIs/ExternalAPIsUsedWithUntrustedData.qhelp
@@ -0,0 +1,61 @@
+<!DOCTYPE qhelp PUBLIC
+  "-//Semmle//qhelp//EN"
+  "qhelp.dtd">
+<qhelp>
+<overview>
+<p>Using unsanitized untrusted data in an external API can cause a variety of security
+issues. This query reports external APIs that are used with untrusted data, along
+with how frequently the API is used, and how many unique sources of untrusted data flow
+to this API. This query is designed primarily to help identify which APIs may be
+relevant for security analysis of this application.</p>
+
+<p>An external API is defined as a call to a method that is not defined in the source
+code, and is not modeled as a taint step in the default taint library. External APIs may
+be from the Python standard library or dependencies. The query will report the fully qualified name,
+along with <code>[param x]</code>, where <code>x</code> indicates the position of
+the parameter receiving the untrusted data. Note that for methods and
+<code>classmethod</code>s, parameter 0 represents the class instance or class itself
+respectively.</p>
+
+<p>Note that an excepted sink might not be included in the results, if it also defines a
+taint step. This is the case for <code>pickle.loads</code> which is a sink for the
+Unsafe Deserialization query, but is also a taint step for other queries.</p>
+
+<p>Note: Compared to the Java version of this query, we currently do not give special
+care to methods that are overridden in the source code.</p>
+
+<p>Note: Currently this query will only report results for external packages that are extracted.</p>
+
+</overview>
+<recommendation>
+
+<p>For each result:</p>
+
+<ul>
+  <li>If the result highlights a known sink, no action is required.</li>
+  <li>If the result highlights an unknown sink for a problem, then add modeling for the sink to the relevant query.</li>
+  <li>If the result represents a call to an external API which transfers taint, add the appropriate modeling, and
+  re-run the query to determine what new results have appeared due to this additional modeling.</li>
+</ul>
+
+<p>Otherwise, the result is likely uninteresting. Custom versions of this query can extend the <code>SafeExternalAPI</code>
+class and specify <code>getSafeCallable</code> to exclude known safe external APIs from future analysis.</p>
+
+</recommendation>
+<example>
+
+<p>If the query were to return the API <code>flask.make_response [param 0]</code>
+then we should first consider whether this a security relevant sink. In this case, this is making a HTTP response, so we should
+consider whether this is an XSS sink. If it is, we should confirm that it is handled by the XSS query.</p>
+
+<p>If the query were to return the API <code>base64.decodebytes [param 0]</code>, then this should be
+reviewed as a possible taint step, because tainted data would flow from the 0th argument to the result of the call.</p>
+
+<p>Note that both examples are correctly handled by the standard taint tracking library and XSS query.</p>
+</example>
+<references>
+
+</references>
+</qhelp>
+
+Mention something about `pickle.loads(user_input)`
--- a/python/ql/src/Security/CWE-020-ExternalAPIs/ExternalAPIsUsedWithUntrustedData.ql
+++ b/python/ql/src/Security/CWE-020-ExternalAPIs/ExternalAPIsUsedWithUntrustedData.ql
@@ -0,0 +1,17 @@
+/**
+ * @name Frequency counts for external APIs that are used with untrusted data
+ * @description This reports the external APIs that are used with untrusted data, along with how
+ *              frequently the API is called, and how many unique sources of untrusted data flow
+ *              to it.
+ * @id python/count-untrusted-data-external-api
+ * @kind table
+ * @tags security external/cwe/cwe-20
+ */
+
+import python
+import ExternalAPIs
+
+from ExternalAPIUsedWithUntrustedData externalAPI
+select externalAPI, count(externalAPI.getUntrustedDataNode()) as numberOfUses,
+  externalAPI.getNumberOfUntrustedSources() as numberOfUntrustedSources order by
+    numberOfUntrustedSources desc
--- a/python/ql/src/Security/CWE-020-ExternalAPIs/UntrustedDataToExternalAPI.qhelp
+++ b/python/ql/src/Security/CWE-020-ExternalAPIs/UntrustedDataToExternalAPI.qhelp
@@ -0,0 +1,74 @@
+<!DOCTYPE qhelp PUBLIC
+  "-//Semmle//qhelp//EN"
+  "qhelp.dtd">
+<qhelp>
+<overview>
+<p>Using unsanitized untrusted data in an external API can cause a variety of security
+issues. This query reports external APIs that use untrusted data. The results are not
+filtered so that you can audit all examples. The query provides data for security
+reviews of the application and you can also use it to identify external APIs that should
+be modeled as either taint steps, or sinks for specific problems.</p>
+
+<p>An external API is defined as a call to a method that is not defined in the source
+code, and is not modeled as a taint step in the default taint library. External APIs may
+be from the Python standard library or dependencies. The query will report the fully qualified name,
+along with <code>[param x]</code>, where <code>x</code> indicates the position of
+the parameter receiving the untrusted data. Note that for methods and
+<code>classmethod</code>s, parameter 0 represents the class instance or class itself
+respectively.</p>
+
+<p>Note that an excepted sink might not be included in the results, if it also defines a
+taint step. This is the case for <code>pickle.loads</code> which is a sink for the
+Unsafe Deserialization query, but is also a taint step for other queries.</p>
+
+<p>Note: Compared to the Java version of this query, we currently do not give special
+care to methods that are overridden in the source code.</p>
+
+<p>Note: Currently this query will only report results for external packages that are extracted.</p>
+
+</overview>
+<recommendation>
+
+<p>For each result:</p>
+
+<ul>
+  <li>If the result highlights a known sink, confirm that the result is reported by the relevant query, or
+  that the result is a false positive because this data is sanitized.</li>
+  <li>If the result highlights an unknown sink for a problem, then add modeling for the sink to the relevant query,
+  and confirm that the result is either found, or is safe due to appropriate sanitization.</li>
+  <li>If the result represents a call to an external API that transfers taint, add the appropriate modeling, and
+  re-run the query to determine what new results have appeared due to this additional modeling.</li>
+</ul>
+
+<p>Otherwise, the result is likely uninteresting. Custom versions of this query can extend the <code>SafeExternalAPI</code>
+class and specify <code>getSafeCallable</code> to exclude known safe external APIs from future analysis.</p>
+
+</recommendation>
+<example>
+
+<p>In this first example, a request parameter is read from the Flask <code>request</code> and then ultimately used in a call to the
+<code>flask.make_response</code> external API:</p>
+
+<sample src="ExternalAPISinkExample.py" />
+
+<p>This is an XSS sink. The XSS query should therefore be reviewed to confirm that this sink is appropriately modeled,
+and if it is, to confirm that the query reports this particular result, or that the result is a false positive due to
+some existing sanitization.</p>
+
+<p>In this second example, again a request parameter is read from the Flask <code>request</code>.</p>
+
+<sample src="ExternalAPITaintStepExample.py" />
+
+<p>If the query reported the call to <code>base64.decodebytes</code> on line 10, this
+would suggest that this external API is not currently modeled as a taint step in the
+taint tracking library. The next step would be to model this as a taint step, then
+re-run the query to determine what additional results might be found. In this example,
+the result of the Base64 decoding is pickled, which can result in remote code execution due
+to unsafe deserialization.
+
+<p>Note that both examples are correctly handled by the standard taint tracking library and Unsafe Deserialization query.</p>
+</example>
+<references>
+
+</references>
+</qhelp>
--- a/python/ql/src/Security/CWE-020-ExternalAPIs/UntrustedDataToExternalAPI.ql
+++ b/python/ql/src/Security/CWE-020-ExternalAPIs/UntrustedDataToExternalAPI.ql
@@ -0,0 +1,23 @@
+/**
+ * @name Untrusted data passed to external API
+ * @description Data provided remotely is used in this external API without sanitization, which could be a security risk.
+ * @id python/untrusted-data-to-external-api
+ * @kind path-problem
+ * @precision low
+ * @problem.severity error
+ * @tags security external/cwe/cwe-20
+ */
+
+import python
+import ExternalAPIs
+import DataFlow::PathGraph
+
+from
+  UntrustedDataToExternalAPIConfig config, DataFlow::PathNode source, DataFlow::PathNode sink,
+  ExternalAPIUsedWithUntrustedData externalAPI
+where
+  sink.getNode() = externalAPI.getUntrustedDataNode() and
+  config.hasFlowPath(source, sink)
+select sink.getNode(), source, sink,
+  "Call to " + externalAPI.toString() + " with untrusted data from $@.", source.getNode(),
+  source.toString()