Merge pull request #4124 from RasmusWL/python-taint-tracking-string-methods

Approved by yoff
2026-04-25 16:55:19 +02:00 · 2020-08-25 14:14:47 +01:00
parent 74db25d80c 2dbf83b579
commit 92c97b1778
10 changed files with 490 additions and 7 deletions
--- a/python/ql/src/experimental/dataflow/internal/TaintTrackingPrivate.qll
+++ b/python/ql/src/experimental/dataflow/internal/TaintTrackingPrivate.qll
@@ -3,13 +3,6 @@ private import experimental.dataflow.DataFlow
 private import experimental.dataflow.internal.DataFlowPrivate
 private import experimental.dataflow.internal.TaintTrackingPublic

-/**
- * Holds if taint can flow in one local step from `nodeFrom` to `nodeTo` excluding
- * local data flow steps. That is, `nodeFrom` and `nodeTo` are likely to represent
- * different objects.
- */
-predicate localAdditionalTaintStep(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) { none() }
-
 /**
 * Holds if `node` should be a barrier in all global taint flow configurations
 * but not in local taint.
@@ -25,3 +18,108 @@ predicate defaultAdditionalTaintStep(DataFlow::Node nodeFrom, DataFlow::Node nod
  or
  any(AdditionalTaintStep a).step(nodeFrom, nodeTo)
 }
+
+/**
+ * Holds if taint can flow in one local step from `nodeFrom` to `nodeTo` excluding
+ * local data flow steps. That is, `nodeFrom` and `nodeTo` are likely to represent
+ * different objects.
+ */
+predicate localAdditionalTaintStep(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
+  concatStep(nodeFrom, nodeTo)
+  or
+  subscriptStep(nodeFrom, nodeTo)
+  or
+  stringManipulation(nodeFrom, nodeTo)
+}
+
+/**
+ * Holds if taint can flow from `nodeFrom` to `nodeTo` with a step related to concatenation.
+ *
+ * Note that since we cannot easily distinguish interesting types (like string, list, tuple),
+ * we consider any `+` operation to propagate taint. After consulting with the JS team, this
+ * doesn't sound like it is a big problem in practice.
+ */
+predicate concatStep(DataFlow::CfgNode nodeFrom, DataFlow::CfgNode nodeTo) {
+  exists(BinaryExprNode add | add = nodeTo.getNode() |
+    add.getOp() instanceof Add and add.getAnOperand() = nodeFrom.getNode()
+  )
+}
+
+/**
+ * Holds if taint can flow from `nodeFrom` to `nodeTo` with a step related to subscripting.
+ */
+predicate subscriptStep(DataFlow::CfgNode nodeFrom, DataFlow::CfgNode nodeTo) {
+  nodeTo.getNode().(SubscriptNode).getObject() = nodeFrom.getNode()
+}
+
+/**
+ * Holds if taint can flow from `nodeFrom` to `nodeTo` with a step related to string
+ * manipulation.
+ *
+ * Note that since we cannot easily distinguish when something is a string, this can
+ * also make taint flow on `<non string>.replace(foo, bar)`.
+ */
+predicate stringManipulation(DataFlow::CfgNode nodeFrom, DataFlow::CfgNode nodeTo) {
+  // transforming something tainted into a string will make the string tainted
+  exists(CallNode call | call = nodeTo.getNode() |
+    call.getFunction().(NameNode).getId() in ["str", "bytes", "unicode"] and
+    (
+      nodeFrom.getNode() = call.getArg(0)
+      or
+      nodeFrom.getNode() = call.getArgByName("object")
+    )
+  )
+  or
+  // String methods. Note that this doesn't recognize `meth = "foo".upper; meth()`
+  exists(CallNode call, string method_name, ControlFlowNode object |
+    call = nodeTo.getNode() and
+    object = call.getFunction().(AttrNode).getObject(method_name)
+  |
+    nodeFrom.getNode() = object and
+    method_name in ["capitalize", "casefold", "center", "expandtabs", "format", "format_map",
+          "join", "ljust", "lstrip", "lower", "replace", "rjust", "rstrip", "strip", "swapcase",
+          "title", "upper", "zfill", "encode", "decode"]
+    or
+    method_name = "replace" and
+    nodeFrom.getNode() = call.getArg(1)
+    or
+    method_name = "format" and
+    nodeFrom.getNode() = call.getAnArg()
+    or
+    // str -> List[str] (or a 3-tuple of str for `partition`/`rpartition`)
+    // TODO: check if these should be handled differently in regards to content
+    nodeFrom.getNode() = object and
+    method_name in ["partition", "rpartition", "rsplit", "split", "splitlines"]
+    or
+    // List[str] -> str
+    // TODO: check if these should be handled differently in regards to content
+    method_name = "join" and
+    nodeFrom.getNode() = call.getArg(0)
+    or
+    // Mapping[str, Any] -> str
+    method_name = "format_map" and
+    nodeFrom.getNode() = call.getArg(0)
+  )
+  or
+  // % formatting
+  exists(BinaryExprNode fmt | fmt = nodeTo.getNode() |
+    fmt.getOp() instanceof Mod and
+    (
+      fmt.getLeft() = nodeFrom.getNode()
+      or
+      fmt.getRight() = nodeFrom.getNode()
+    )
+  )
+  or
+  // string multiplication -- `"foo" * 10`
+  exists(BinaryExprNode mult | mult = nodeTo.getNode() |
+    mult.getOp() instanceof Mult and
+    mult.getLeft() = nodeFrom.getNode()
+  )
+  or
+  // f-strings
+  nodeTo.getNode().getNode().(Fstring).getAValue() = nodeFrom.getNode().getNode()
+  // TODO: Handle encode/decode from base64/quopri
+  // TODO: Handle os.path.join
+  // TODO: Handle functions in https://docs.python.org/3/library/binascii.html
+}
--- a/python/ql/test/experimental/dataflow/options
+++ b/python/ql/test/experimental/dataflow/options
@@ -0,0 +1 @@
+semmle-extractor-options: --max-import-depth=1
--- a/python/ql/test/experimental/dataflow/tainttracking/TestTaintLib.qll
+++ b/python/ql/test/experimental/dataflow/tainttracking/TestTaintLib.qll
@@ -0,0 +1,72 @@
+import python
+import experimental.dataflow.TaintTracking
+import experimental.dataflow.DataFlow
+
+class TestTaintTrackingConfiguration extends TaintTracking::Configuration {
+  TestTaintTrackingConfiguration() { this = "TestTaintTrackingConfiguration" }
+
+  override predicate isSource(DataFlow::Node source) {
+    source.(DataFlow::CfgNode).getNode().(NameNode).getId() in ["TAINTED_STRING", "TAINTED_BYTES"]
+  }
+
+  override predicate isSink(DataFlow::Node sink) {
+    exists(CallNode call |
+      call.getFunction().(NameNode).getId() in ["ensure_tainted", "ensure_not_tainted"] and
+      sink.(DataFlow::CfgNode).getNode() = call.getAnArg()
+    )
+  }
+}
+
+private string repr(Expr e) {
+  not e instanceof Num and
+  not e instanceof StrConst and
+  not e instanceof Subscript and
+  not e instanceof Call and
+  not e instanceof Attribute and
+  result = e.toString()
+  or
+  result = e.(Num).getN()
+  or
+  result =
+    e.(StrConst).getPrefix() + e.(StrConst).getText() +
+      e.(StrConst).getPrefix().regexpReplaceAll("[a-zA-Z]+", "")
+  or
+  result = repr(e.(Subscript).getObject()) + "[" + repr(e.(Subscript).getIndex()) + "]"
+  or
+  (
+    if exists(e.(Call).getAnArg()) or exists(e.(Call).getANamedArg())
+    then result = repr(e.(Call).getFunc()) + "(..)"
+    else result = repr(e.(Call).getFunc()) + "()"
+  )
+  or
+  result = repr(e.(Attribute).getObject()) + "." + e.(Attribute).getName()
+}
+
+/** Holds for each `ensure_tainted`/`ensure_not_tainted` argument in `test.py`; "ok  " iff as expected. */
+query predicate test_taint(string arg_location, string test_res, string function_name, string repr) {
+  exists(Call call, Expr arg, boolean expected_taint, boolean has_taint |
+    call.getLocation().getFile().getShortName() = "test.py" and
+    (
+      call.getFunc().(Name).getId() = "ensure_tainted" and
+      expected_taint = true
+      or
+      call.getFunc().(Name).getId() = "ensure_not_tainted" and
+      expected_taint = false
+    ) and
+    arg = call.getAnArg() and
+    (
+      // TODO: Replace with `hasFlowToExpr` once that is working
+      if
+        exists(TaintTracking::Configuration c |
+          c.hasFlowTo(any(DataFlow::Node n | n.(DataFlow::CfgNode).getNode() = arg.getAFlowNode()))
+        )
+      then has_taint = true
+      else has_taint = false
+    ) and
+    (if expected_taint = has_taint then test_res = "ok  " else test_res = "fail") and
+    // select
+    arg_location = arg.getLocation().toString() and
+    function_name = call.getScope().(Function).getName() and
+    repr = repr(arg)
+  )
+}
--- a/python/ql/test/experimental/dataflow/tainttracking/string-py3/TestTaint.expected
+++ b/python/ql/test/experimental/dataflow/tainttracking/string-py3/TestTaint.expected
@@ -0,0 +1,10 @@
+| test.py:26 | ok   | str_methods | ts.casefold() |
+| test.py:28 | ok   | str_methods | ts.format_map(..) |
+| test.py:29 | fail | str_methods | "{unsafe}".format_map(..) |
+| test.py:40 | fail | binary_decode_encode | base64.a85encode(..) |
+| test.py:41 | fail | binary_decode_encode | base64.a85decode(..) |
+| test.py:44 | fail | binary_decode_encode | base64.b85encode(..) |
+| test.py:45 | fail | binary_decode_encode | base64.b85decode(..) |
+| test.py:48 | fail | binary_decode_encode | base64.encodebytes(..) |
+| test.py:49 | fail | binary_decode_encode | base64.decodebytes(..) |
+| test.py:57 | ok   | f_strings | Fstring |
--- a/python/ql/test/experimental/dataflow/tainttracking/string-py3/TestTaint.ql
+++ b/python/ql/test/experimental/dataflow/tainttracking/string-py3/TestTaint.ql
@@ -0,0 +1 @@
+import experimental.dataflow.tainttracking.TestTaintLib
--- a/python/ql/test/experimental/dataflow/tainttracking/string-py3/options
+++ b/python/ql/test/experimental/dataflow/tainttracking/string-py3/options
@@ -0,0 +1 @@
+semmle-extractor-options: --max-import-depth=1 --lang=3
--- a/python/ql/test/experimental/dataflow/tainttracking/string-py3/test.py
+++ b/python/ql/test/experimental/dataflow/tainttracking/string-py3/test.py
@@ -0,0 +1,64 @@
+# Python 3 specific taint tracking for string
+
+TAINTED_STRING = "TAINTED_STRING"
+TAINTED_BYTES = b"TAINTED_BYTES"
+
+
+def ensure_tainted(*args):
+    print("- ensure_tainted")
+    for i, arg in enumerate(args):
+        print("arg {}: {!r}".format(i, arg))
+
+
+def ensure_not_tainted(*args):
+    print("- ensure_not_tainted")
+    for i, arg in enumerate(args):
+        print("arg {}: {!r}".format(i, arg))
+
+
+# Actual tests
+
+def str_methods():
+    print("\n# str_methods")
+    ts = TAINTED_STRING
+    tb = TAINTED_BYTES
+    ensure_tainted(
+        ts.casefold(),
+
+        ts.format_map({}),
+        "{unsafe}".format_map({"unsafe": ts}),
+    )
+
+
+def binary_decode_encode():
+    print("\n#binary_decode_encode")
+    tb = TAINTED_BYTES
+    import base64
+    # NOTE: TestTaint.expected identifies the calls below by line number; keep line positions stable.
+    ensure_tainted(
+        # New in Python 3.4
+        base64.a85encode(tb),
+        base64.a85decode(base64.a85encode(tb)),
+
+        # New in Python 3.4
+        base64.b85encode(tb),
+        base64.b85decode(base64.b85encode(tb)),
+
+        # New in Python 3.1
+        base64.encodebytes(tb),
+        base64.decodebytes(base64.encodebytes(tb)),
+    )
+
+
+def f_strings():
+    print("\n#f_strings")
+    ts = TAINTED_STRING
+
+    ensure_tainted(f"foo {ts} bar")
+
+
+# Make tests runnable
+
+str_methods()
+binary_decode_encode()
+f_strings()
--- a/python/ql/test/experimental/dataflow/tainttracking/string/TestTaint.expected
+++ b/python/ql/test/experimental/dataflow/tainttracking/string/TestTaint.expected
@@ -0,0 +1,62 @@
+| test.py:32 | ok   | str_operations | ts |
+| test.py:33 | ok   | str_operations | BinaryExpr |
+| test.py:34 | ok   | str_operations | BinaryExpr |
+| test.py:35 | ok   | str_operations | BinaryExpr |
+| test.py:36 | ok   | str_operations | ts[Slice] |
+| test.py:37 | ok   | str_operations | ts[Slice] |
+| test.py:38 | ok   | str_operations | ts[Slice] |
+| test.py:39 | ok   | str_operations | ts[0] |
+| test.py:40 | ok   | str_operations | str(..) |
+| test.py:41 | ok   | str_operations | bytes(..) |
+| test.py:42 | ok   | str_operations | unicode(..) |
+| test.py:51 | ok   | str_methods | ts.capitalize() |
+| test.py:52 | ok   | str_methods | ts.center(..) |
+| test.py:53 | ok   | str_methods | ts.expandtabs() |
+| test.py:55 | ok   | str_methods | ts.format() |
+| test.py:56 | ok   | str_methods | "{}".format(..) |
+| test.py:57 | ok   | str_methods | "{unsafe}".format(..) |
+| test.py:59 | ok   | str_methods | ts.join(..) |
+| test.py:60 | fail | str_methods | "".join(..) |
+| test.py:62 | ok   | str_methods | ts.ljust(..) |
+| test.py:63 | ok   | str_methods | ts.lstrip() |
+| test.py:64 | ok   | str_methods | ts.lower() |
+| test.py:66 | ok   | str_methods | ts.replace(..) |
+| test.py:67 | ok   | str_methods | "safe".replace(..) |
+| test.py:69 | ok   | str_methods | ts.rjust(..) |
+| test.py:70 | ok   | str_methods | ts.rstrip() |
+| test.py:71 | ok   | str_methods | ts.strip() |
+| test.py:72 | ok   | str_methods | ts.swapcase() |
+| test.py:73 | ok   | str_methods | ts.title() |
+| test.py:74 | ok   | str_methods | ts.upper() |
+| test.py:75 | ok   | str_methods | ts.zfill(..) |
+| test.py:77 | ok   | str_methods | ts.encode(..) |
+| test.py:78 | ok   | str_methods | ts.encode(..).decode(..) |
+| test.py:80 | ok   | str_methods | tb.decode(..) |
+| test.py:81 | ok   | str_methods | tb.decode(..).encode(..) |
+| test.py:84 | ok   | str_methods | ts.partition(..) |
+| test.py:85 | ok   | str_methods | ts.rpartition(..) |
+| test.py:86 | ok   | str_methods | ts.rsplit(..) |
+| test.py:87 | ok   | str_methods | ts.split(..) |
+| test.py:88 | ok   | str_methods | ts.splitlines() |
+| test.py:93 | ok   | str_methods | "safe".replace(..) |
+| test.py:95 | fail | str_methods | ts.join(..) |
+| test.py:96 | fail | str_methods | ts.join(..) |
+| test.py:106 | fail | non_syntactic | meth() |
+| test.py:107 | fail | non_syntactic | _str(..) |
+| test.py:116 | ok   | percent_fmt | BinaryExpr |
+| test.py:117 | ok   | percent_fmt | BinaryExpr |
+| test.py:118 | fail | percent_fmt | BinaryExpr |
+| test.py:128 | fail | binary_decode_encode | base64.b64encode(..) |
+| test.py:129 | fail | binary_decode_encode | base64.b64decode(..) |
+| test.py:131 | fail | binary_decode_encode | base64.standard_b64encode(..) |
+| test.py:132 | fail | binary_decode_encode | base64.standard_b64decode(..) |
+| test.py:134 | fail | binary_decode_encode | base64.urlsafe_b64encode(..) |
+| test.py:135 | fail | binary_decode_encode | base64.urlsafe_b64decode(..) |
+| test.py:137 | fail | binary_decode_encode | base64.b32encode(..) |
+| test.py:138 | fail | binary_decode_encode | base64.b32decode(..) |
+| test.py:140 | fail | binary_decode_encode | base64.b16encode(..) |
+| test.py:141 | fail | binary_decode_encode | base64.b16decode(..) |
+| test.py:156 | fail | binary_decode_encode | base64.encodestring(..) |
+| test.py:157 | fail | binary_decode_encode | base64.decodestring(..) |
+| test.py:162 | fail | binary_decode_encode | quopri.encodestring(..) |
+| test.py:163 | fail | binary_decode_encode | quopri.decodestring(..) |
--- a/python/ql/test/experimental/dataflow/tainttracking/string/TestTaint.ql
+++ b/python/ql/test/experimental/dataflow/tainttracking/string/TestTaint.ql
@@ -0,0 +1 @@
+import experimental.dataflow.tainttracking.TestTaintLib
--- a/python/ql/test/experimental/dataflow/tainttracking/string/test.py
+++ b/python/ql/test/experimental/dataflow/tainttracking/string/test.py
@@ -0,0 +1,173 @@
+import sys
+
+if sys.version_info[0] == 3:
+    unicode = str
+
+
+TAINTED_STRING = "TAINTED_STRING"
+TAINTED_BYTES = b"TAINTED_BYTES"
+
+
+def ensure_tainted(*args):
+    print("- ensure_tainted")
+    for i, arg in enumerate(args):
+        print("arg {}: {!r}".format(i, arg))
+
+
+def ensure_not_tainted(*args):
+    print("- ensure_not_tainted")
+    for i, arg in enumerate(args):
+        print("arg {}: {!r}".format(i, arg))
+
+
+# Actual tests
+
+
+def str_operations():
+    print("\n# str_operations")
+    ts = TAINTED_STRING
+    tb = TAINTED_BYTES
+
+    ensure_tainted(
+        ts,
+        ts + "foo",
+        "foo" + ts,
+        ts * 5,
+        ts[0 : len(ts)],
+        ts[:],
+        ts[0:1000],
+        ts[0],
+        str(ts),
+        bytes(tb),
+        unicode(ts),
+    )
+
+
+def str_methods():
+    print("\n# str_methods")
+    ts = TAINTED_STRING
+    tb = TAINTED_BYTES
+    ensure_tainted(
+        ts.capitalize(),
+        ts.center(100),
+        ts.expandtabs(),
+
+        ts.format(),
+        "{}".format(ts),
+        "{unsafe}".format(unsafe=ts),
+
+        ts.join(["", ""]),
+        "".join([ts]),
+
+        ts.ljust(100),
+        ts.lstrip(),
+        ts.lower(),
+
+        ts.replace("old", "new"),
+        "safe".replace("safe", ts),
+
+        ts.rjust(100),
+        ts.rstrip(),
+        ts.strip(),
+        ts.swapcase(),
+        ts.title(),
+        ts.upper(),
+        ts.zfill(100),
+
+        ts.encode("utf-8"),
+        ts.encode("utf-8").decode("utf-8"),
+
+        tb.decode("utf-8"),
+        tb.decode("utf-8").encode("utf-8"),
+
+        # string methods that return a list
+        ts.partition("_"),
+        ts.rpartition("_"),
+        ts.rsplit("_"),
+        ts.split("_"),
+        ts.splitlines(),
+    )
+
+    ensure_not_tainted(
+        # Intuitively I think this should be safe, but better discuss it
+        "safe".replace(ts, "also-safe"),
+
+        ts.join([]),  # FP due to separator not being used with zero/one elements
+        ts.join(["safe"]),  # FP due to separator not being used with zero/one elements
+    )
+
+
+def non_syntactic():
+    print("\n# non_syntactic")
+    ts = TAINTED_STRING
+    meth = ts.upper
+    _str = str
+    ensure_tainted(
+        meth(),
+        _str(ts),
+    )
+
+
+def percent_fmt():
+    print("\n#percent_fmt")
+    ts = TAINTED_STRING
+    tainted_fmt = ts + " %s %s"
+    ensure_tainted(
+        tainted_fmt % (1, 2),
+        "%s foo bar" % ts,
+        "%s %s %s" % (1, 2, ts),
+    )
+
+
+def binary_decode_encode():
+    print("\n#binary_decode_encode")
+    tb = TAINTED_BYTES
+    import base64
+    # NOTE: TestTaint.expected identifies the calls below by line number; keep line positions stable.
+    ensure_tainted(
+        base64.b64encode(tb),
+        base64.b64decode(base64.b64encode(tb)),
+
+        base64.standard_b64encode(tb),
+        base64.standard_b64decode(base64.standard_b64encode(tb)),
+
+        base64.urlsafe_b64encode(tb),
+        base64.urlsafe_b64decode(base64.urlsafe_b64encode(tb)),
+
+        base64.b32encode(tb),
+        base64.b32decode(base64.b32encode(tb)),
+
+        base64.b16encode(tb),
+        base64.b16decode(base64.b16encode(tb)),
+
+        # # New in Python 3.4
+        # base64.a85encode(tb),
+        # base64.a85decode(base64.a85encode(tb)),
+
+        # # New in Python 3.4
+        # base64.b85encode(tb),
+        # base64.b85decode(base64.b85encode(tb)),
+
+        # # New in Python 3.1
+        # base64.encodebytes(tb),
+        # base64.decodebytes(base64.encodebytes(tb)),
+
+        # deprecated since Python 3.1 and removed in Python 3.9 (fails at runtime there)
+        base64.encodestring(tb),
+        base64.decodestring(base64.encodestring(tb)),
+    )
+
+    import quopri
+    ensure_tainted(
+        quopri.encodestring(tb),
+        quopri.decodestring(quopri.encodestring(tb)),
+    )
+
+
+# Make tests runnable
+
+str_operations()
+str_methods()
+non_syntactic()
+percent_fmt()
+binary_decode_encode()
				`@@ -0,0 +1 @@`
				`semmle-extractor-options: --max-import-depth=1`
				`@@ -0,0 +1 @@`
				`import experimental.dataflow.tainttracking.TestTaintLib`