Merge pull request #4435 from RasmusWL/python-port-code-injection

Python: port code injection query
2025-12-16 16:53:25 +01:00 · 2020-10-14 16:41:42 +02:00
parent 5f6f85c998 5db4f906d0
commit 466c22f4a8
21 changed files with 319 additions and 3 deletions
--- a/python/ql/src/experimental/Security-new-dataflow/CWE-094/CodeInjection.ql
+++ b/python/ql/src/experimental/Security-new-dataflow/CWE-094/CodeInjection.ql
@@ -0,0 +1,35 @@
+/**
+ * @name Code injection
+ * @description Interpreting unsanitized user input as code allows a malicious user to perform arbitrary
+ *              code execution.
+ * @kind path-problem
+ * @problem.severity error
+ * @sub-severity high
+ * @precision high
+ * @id py/code-injection
+ * @tags security
+ *       external/owasp/owasp-a1
+ *       external/cwe/cwe-094
+ *       external/cwe/cwe-095
+ *       external/cwe/cwe-116
+ */
+
+import python
+import experimental.dataflow.DataFlow
+import experimental.dataflow.TaintTracking
+import experimental.semmle.python.Concepts
+import experimental.dataflow.RemoteFlowSources
+import DataFlow::PathGraph
+
+class CodeInjectionConfiguration extends TaintTracking::Configuration {
+  CodeInjectionConfiguration() { this = "CodeInjectionConfiguration" }
+
+  override predicate isSource(DataFlow::Node source) { source instanceof RemoteFlowSource }
+
+  override predicate isSink(DataFlow::Node sink) { sink = any(CodeExecution e).getCode() }
+}
+
+from CodeInjectionConfiguration config, DataFlow::PathNode source, DataFlow::PathNode sink
+where config.hasFlowPath(source, sink)
+select sink.getNode(), source, sink, "$@ flows to here and is interpreted as code.",
+  source.getNode(), "A user-provided value"
--- a/python/ql/src/experimental/semmle/python/Concepts.qll
+++ b/python/ql/src/experimental/semmle/python/Concepts.qll
@@ -17,12 +17,12 @@ private import experimental.dataflow.RemoteFlowSources
 * extend `SystemCommandExecution::Range` instead.
 */
 class SystemCommandExecution extends DataFlow::Node {
-  SystemCommandExecution::Range self;
+  SystemCommandExecution::Range range;

-  SystemCommandExecution() { this = self }
+  SystemCommandExecution() { this = range }

  /** Gets the argument that specifies the command to be executed. */
-  DataFlow::Node getCommand() { result = self.getCommand() }
+  DataFlow::Node getCommand() { result = range.getCommand() }
 }

 /** Provides a class for modeling new system-command execution APIs. */
@@ -40,6 +40,35 @@ module SystemCommandExecution {
  }
 }

+/**
+ * A data-flow node that dynamically executes Python code.
+ *
+ * Extend this class to refine existing API models. If you want to model new APIs,
+ * extend `CodeExecution::Range` instead.
+ */
+class CodeExecution extends DataFlow::Node {
+  CodeExecution::Range range;
+
+  CodeExecution() { this = range }
+
+  /** Gets the argument that specifies the code to be executed. */
+  DataFlow::Node getCode() { result = range.getCode() }
+}
+
+/** Provides a class for modeling new dynamic code execution APIs. */
+module CodeExecution {
+  /**
+   * A data-flow node that dynamically executes Python code.
+   *
+   * Extend this class to model new APIs. If you want to refine existing API models,
+   * extend `CodeExecution` instead.
+   */
+  abstract class Range extends DataFlow::Node {
+    /** Gets the argument that specifies the code to be executed. */
+    abstract DataFlow::Node getCode();
+  }
+}
+
 /** Provides classes for modeling HTTP-related APIs. */
 module HTTP {
  /** Provides classes for modeling HTTP servers. */
--- a/python/ql/src/experimental/semmle/python/frameworks/Stdlib.qll
+++ b/python/ql/src/experimental/semmle/python/frameworks/Stdlib.qll
@@ -327,4 +327,115 @@ private module Stdlib {
      )
    }
  }
+
+  // ---------------------------------------------------------------------------
+  // builtins
+  // ---------------------------------------------------------------------------
+  /** Gets a reference to the `builtins` module (called `__builtin__` in Python 2). */
+  private DataFlow::Node builtins(DataFlow::TypeTracker t) {
+    t.start() and
+    result = DataFlow::importNode(["builtins", "__builtin__"])
+    or
+    exists(DataFlow::TypeTracker t2 | result = builtins(t2).track(t2, t))
+  }
+
+  /** Gets a reference to the `builtins` module. */
+  DataFlow::Node builtins() { result = builtins(DataFlow::TypeTracker::end()) }
+
+  /**
+   * Gets a reference to the attribute `attr_name` of the `builtins` module.
+   * WARNING: Only holds for a few predefined attributes.
+   */
+  private DataFlow::Node builtins_attr(DataFlow::TypeTracker t, string attr_name) {
+    attr_name in ["exec", "eval", "compile"] and
+    (
+      t.start() and
+      result = DataFlow::importNode(["builtins", "__builtin__"] + "." + attr_name)
+      or
+      t.startInAttr(attr_name) and
+      result = DataFlow::importNode(["builtins", "__builtin__"])
+      or
+      // special handling for builtins, which are in scope without any imports
+      // TODO: Take care of overrides, either `def eval(...): ...`, `eval = ...`, or `builtins.eval = ...`
+      t.start() and
+      exists(NameNode ref | result.asCfgNode() = ref |
+        ref.isGlobal() and
+        ref.getId() = attr_name and
+        ref.isLoad()
+      )
+    )
+    or
+    // The usual formulation, `builtins_attr(t2, attr_name).track(t2, t)`, performed badly here,
+    // so that step is inlined below and the join is forced through `builtins_attr_first_join`.
+    exists(DataFlow::TypeTracker t2 |
+      exists(DataFlow::StepSummary summary |
+        builtins_attr_first_join(t2, attr_name, result, summary) and
+        t = t2.append(summary)
+      )
+    )
+  }
+
+  pragma[nomagic]
+  private predicate builtins_attr_first_join(
+    DataFlow::TypeTracker t2, string attr_name, DataFlow::Node res, DataFlow::StepSummary summary
+  ) {
+    DataFlow::StepSummary::step(builtins_attr(t2, attr_name), res, summary)
+  }
+
+  /**
+   * Gets a reference to the attribute `attr_name` of the `builtins` module.
+   * WARNING: Only holds for a few predefined attributes.
+   */
+  private DataFlow::Node builtins_attr(string attr_name) {
+    result = builtins_attr(DataFlow::TypeTracker::end(), attr_name)
+  }
+
+  /**
+   * A call to the builtin `exec` function.
+   * See https://docs.python.org/3/library/functions.html#exec
+   */
+  private class BuiltinsExecCall extends CodeExecution::Range, DataFlow::CfgNode {
+    override CallNode node;
+
+    BuiltinsExecCall() { node.getFunction() = builtins_attr("exec").asCfgNode() }
+
+    override DataFlow::Node getCode() { result.asCfgNode() = node.getArg(0) }
+  }
+
+  /**
+   * A call to the builtin `eval` function.
+   * See https://docs.python.org/3/library/functions.html#eval
+   */
+  private class BuiltinsEvalCall extends CodeExecution::Range, DataFlow::CfgNode {
+    override CallNode node;
+
+    BuiltinsEvalCall() { node.getFunction() = builtins_attr("eval").asCfgNode() }
+
+    override DataFlow::Node getCode() { result.asCfgNode() = node.getArg(0) }
+  }
+
+  /** An additional taint step for calls to the builtin function `compile`. */
+  private class BuiltinsCompileCallAdditionalTaintStep extends TaintTracking::AdditionalTaintStep {
+    override predicate step(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
+      exists(CallNode call |
+        nodeTo.asCfgNode() = call and
+        call.getFunction() = builtins_attr("compile").asCfgNode() and
+        nodeFrom.asCfgNode() in [call.getArg(0), call.getArgByName("source")]
+      )
+    }
+  }
+}
+
+/**
+ * An exec statement (only Python 2).
+ * See https://docs.python.org/2/reference/simple_stmts.html#the-exec-statement.
+ */
+private class ExecStatement extends CodeExecution::Range {
+  ExecStatement() {
+    // since there are no DataFlow::Nodes for a Statement, we can't do anything like
+    // `this = any(Exec exec)`
+    this.asExpr() = any(Exec exec).getBody()
+  }
+
+  override DataFlow::Node getCode() { result = this }
 }
--- a/python/ql/test/experimental/library-tests/frameworks/stdlib-py2/CodeExecution.py
+++ b/python/ql/test/experimental/library-tests/frameworks/stdlib-py2/CodeExecution.py
@@ -0,0 +1,2 @@
+# exec statement is Python 2 specific
+exec "print(42)"  # $getCode="print(42)"
--- a/python/ql/test/experimental/library-tests/frameworks/stdlib-py2/ConceptsTest.expected
+++ b/python/ql/test/experimental/library-tests/frameworks/stdlib-py2/ConceptsTest.expected
--- a/python/ql/test/experimental/library-tests/frameworks/stdlib-py2/ConceptsTest.ql
+++ b/python/ql/test/experimental/library-tests/frameworks/stdlib-py2/ConceptsTest.ql
@@ -0,0 +1,2 @@
+import python
+import experimental.meta.ConceptsTest
--- a/python/ql/test/experimental/library-tests/frameworks/stdlib-py2/options
+++ b/python/ql/test/experimental/library-tests/frameworks/stdlib-py2/options
@@ -0,0 +1 @@
+semmle-extractor-options: --max-import-depth=1 --lang=2
--- a/python/ql/test/experimental/library-tests/frameworks/stdlib-py3/CodeExecution.py
+++ b/python/ql/test/experimental/library-tests/frameworks/stdlib-py3/CodeExecution.py
@@ -0,0 +1,4 @@
+import builtins
+
+# exec being part of builtins is Python 3 only
+builtins.exec("print(42)")  # $getCode="print(42)"
--- a/python/ql/test/experimental/library-tests/frameworks/stdlib-py3/ConceptsTest.expected
+++ b/python/ql/test/experimental/library-tests/frameworks/stdlib-py3/ConceptsTest.expected
--- a/python/ql/test/experimental/library-tests/frameworks/stdlib-py3/ConceptsTest.ql
+++ b/python/ql/test/experimental/library-tests/frameworks/stdlib-py3/ConceptsTest.ql
@@ -0,0 +1,2 @@
+import python
+import experimental.meta.ConceptsTest
--- a/python/ql/test/experimental/library-tests/frameworks/stdlib-py3/options
+++ b/python/ql/test/experimental/library-tests/frameworks/stdlib-py3/options
@@ -0,0 +1 @@
+semmle-extractor-options: --max-import-depth=1 --lang=3
--- a/python/ql/test/experimental/library-tests/frameworks/stdlib/CodeExecution.py
+++ b/python/ql/test/experimental/library-tests/frameworks/stdlib/CodeExecution.py
@@ -0,0 +1,39 @@
+# without this, `eval("print(42)")` becomes invalid syntax in Python 2, since print is a
+# statement
+from __future__ import print_function
+
+import sys
+
+if sys.version_info[0] == 3:
+    import builtins
+if sys.version_info[0] == 2:
+    import __builtin__ as builtins
+
+exec("print(42)")  # $getCode="print(42)"
+eval("print(42)")  # $getCode="print(42)"
+
+builtins.eval("print(42)")  # $getCode="print(42)"
+
+cmd = compile("print(42)", "<filename>", "exec")
+exec(cmd)  # $getCode=cmd
+
+cmd = builtins.compile("print(42)", "<filename>", "exec")
+exec(cmd)  # $getCode=cmd
+
+# ------------------------------------------------------------------------------
+# taint related
+
+
+def test_additional_taint():
+    src = TAINTED_STRING
+
+    cmd1 = compile(src, "<filename>", "exec")
+    cmd2 = compile(source=src, filename="<filename>", mode="exec")
+    cmd3 = builtins.compile(src, "<filename>", "exec")
+
+    ensure_tainted(
+        src,
+        cmd1,
+        cmd2,
+        cmd3,
+    )
--- a/python/ql/test/experimental/library-tests/frameworks/stdlib/CodeExecutionPossibleFP1.py
+++ b/python/ql/test/experimental/library-tests/frameworks/stdlib/CodeExecutionPossibleFP1.py
@@ -0,0 +1,11 @@
+# without this, `eval("print(42)")` becomes invalid syntax in Python 2, since print is a
+# statement
+from __future__ import print_function
+
+
+def eval(*args, **kwargs):
+    raise Exception("no eval")
+
+
+# This function call might be marked as a code execution, but it actually isn't.
+eval("print(42)")  # $f+:getCode="print(42)"
--- a/python/ql/test/experimental/library-tests/frameworks/stdlib/CodeExecutionPossibleFP2.py
+++ b/python/ql/test/experimental/library-tests/frameworks/stdlib/CodeExecutionPossibleFP2.py
@@ -0,0 +1,13 @@
+# without this, `eval("print(42)")` becomes invalid syntax in Python 2, since print is a
+# statement
+from __future__ import print_function
+
+
+def foo(*args, **kwargs):
+    raise Exception("no eval")
+
+
+eval = foo
+
+# This function call might be marked as a code execution, but it actually isn't.
+eval("print(42)")  # $f+:getCode="print(42)"
--- a/python/ql/test/experimental/library-tests/frameworks/stdlib/CodeExecutionPossibleFP3.py
+++ b/python/ql/test/experimental/library-tests/frameworks/stdlib/CodeExecutionPossibleFP3.py
@@ -0,0 +1,19 @@
+# without this, `eval("print(42)")` becomes invalid syntax in Python 2, since print is a
+# statement
+from __future__ import print_function
+import sys
+
+if sys.version_info[0] == 3:
+    import builtins
+if sys.version_info[0] == 2:
+    import __builtin__ as builtins
+
+
+def foo(*args, **kwargs):
+    raise Exception("no eval")
+
+
+builtins.eval = foo
+
+# This function call might be marked as a code execution, but it actually isn't.
+eval("print(42)")  # $f+:getCode="print(42)"
--- a/python/ql/test/experimental/library-tests/frameworks/stdlib/TestTaint.expected
+++ b/python/ql/test/experimental/library-tests/frameworks/stdlib/TestTaint.expected
@@ -0,0 +1,4 @@
+| CodeExecution.py:35 | ok   | test_additional_taint | src |
+| CodeExecution.py:36 | ok   | test_additional_taint | cmd1 |
+| CodeExecution.py:37 | ok   | test_additional_taint | cmd2 |
+| CodeExecution.py:38 | ok   | test_additional_taint | cmd3 |
--- a/python/ql/test/experimental/library-tests/frameworks/stdlib/TestTaint.ql
+++ b/python/ql/test/experimental/library-tests/frameworks/stdlib/TestTaint.ql
@@ -0,0 +1,2 @@
+import experimental.dataflow.tainttracking.TestTaintLib
+import experimental.dataflow.RemoteFlowSources
--- a/python/ql/test/experimental/meta/ConceptsTest.qll
+++ b/python/ql/test/experimental/meta/ConceptsTest.qll
@@ -33,6 +33,23 @@ class SystemCommandExecutionTest extends InlineExpectationsTest {
  }
 }

+class CodeExecutionTest extends InlineExpectationsTest {
+  CodeExecutionTest() { this = "CodeExecutionTest" }
+
+  override string getARelevantTag() { result = "getCode" }
+
+  override predicate hasActualResult(Location location, string element, string tag, string value) {
+    exists(CodeExecution ce, DataFlow::Node code |
+      exists(location.getFile().getRelativePath()) and
+      code = ce.getCode() and
+      location = code.getLocation() and
+      element = code.toString() and
+      value = value_from_expr(code.asExpr()) and
+      tag = "getCode"
+    )
+  }
+}
+
 class HttpServerRouteSetupTest extends InlineExpectationsTest {
  HttpServerRouteSetupTest() { this = "HttpServerRouteSetupTest" }

--- a/python/ql/test/experimental/query-tests/Security-new-dataflow/CWE-094/CodeInjection.expected
+++ b/python/ql/test/experimental/query-tests/Security-new-dataflow/CWE-094/CodeInjection.expected
@@ -0,0 +1,13 @@
+edges
+| code_injection.py:6:12:6:23 | ControlFlowNode for Attribute | code_injection.py:7:10:7:13 | ControlFlowNode for code |
+| code_injection.py:6:12:6:23 | ControlFlowNode for Attribute | code_injection.py:8:10:8:13 | ControlFlowNode for code |
+| code_injection.py:6:12:6:23 | ControlFlowNode for Attribute | code_injection.py:10:10:10:12 | ControlFlowNode for cmd |
+nodes
+| code_injection.py:6:12:6:23 | ControlFlowNode for Attribute | semmle.label | ControlFlowNode for Attribute |
+| code_injection.py:7:10:7:13 | ControlFlowNode for code | semmle.label | ControlFlowNode for code |
+| code_injection.py:8:10:8:13 | ControlFlowNode for code | semmle.label | ControlFlowNode for code |
+| code_injection.py:10:10:10:12 | ControlFlowNode for cmd | semmle.label | ControlFlowNode for cmd |
+#select
+| code_injection.py:7:10:7:13 | ControlFlowNode for code | code_injection.py:6:12:6:23 | ControlFlowNode for Attribute | code_injection.py:7:10:7:13 | ControlFlowNode for code | $@ flows to here and is interpreted as code. | code_injection.py:6:12:6:23 | ControlFlowNode for Attribute | A user-provided value |
+| code_injection.py:8:10:8:13 | ControlFlowNode for code | code_injection.py:6:12:6:23 | ControlFlowNode for Attribute | code_injection.py:8:10:8:13 | ControlFlowNode for code | $@ flows to here and is interpreted as code. | code_injection.py:6:12:6:23 | ControlFlowNode for Attribute | A user-provided value |
+| code_injection.py:10:10:10:12 | ControlFlowNode for cmd | code_injection.py:6:12:6:23 | ControlFlowNode for Attribute | code_injection.py:10:10:10:12 | ControlFlowNode for cmd | $@ flows to here and is interpreted as code. | code_injection.py:6:12:6:23 | ControlFlowNode for Attribute | A user-provided value |
--- a/python/ql/test/experimental/query-tests/Security-new-dataflow/CWE-094/CodeInjection.qlref
+++ b/python/ql/test/experimental/query-tests/Security-new-dataflow/CWE-094/CodeInjection.qlref
@@ -0,0 +1 @@
+experimental/Security-new-dataflow/CWE-094/CodeInjection.ql
--- a/python/ql/test/experimental/query-tests/Security-new-dataflow/CWE-094/code_injection.py
+++ b/python/ql/test/experimental/query-tests/Security-new-dataflow/CWE-094/code_injection.py
@@ -0,0 +1,10 @@
+from flask import Flask, request
+app = Flask(__name__)
+
+@app.route("/code-execution")
+def code_execution():
+    code = request.args.get("code")
+    exec(code)
+    eval(code)
+    cmd = compile(code, "<filename>", "exec")
+    exec(cmd)
				`@@ -0,0 +1 @@`
				`semmle-extractor-options: --max-import-depth=1 --lang=2`
				`@@ -0,0 +1 @@`
				`experimental/Security-new-dataflow/CWE-094/CodeInjection.ql`