Merge pull request #4124 from RasmusWL/python-taint-tracking-string-methods

Approved by yoff
This commit is contained in:
CodeQL CI
2020-08-25 14:14:47 +01:00
committed by GitHub
10 changed files with 490 additions and 7 deletions

View File

@@ -3,13 +3,6 @@ private import experimental.dataflow.DataFlow
private import experimental.dataflow.internal.DataFlowPrivate
private import experimental.dataflow.internal.TaintTrackingPublic
/**
* Holds if taint can flow in one local step from `nodeFrom` to `nodeTo` excluding
* local data flow steps. That is, `nodeFrom` and `nodeTo` are likely to represent
* different objects.
*/
predicate localAdditionalTaintStep(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) { none() }
/**
* Holds if `node` should be a barrier in all global taint flow configurations
* but not in local taint.
@@ -25,3 +18,108 @@ predicate defaultAdditionalTaintStep(DataFlow::Node nodeFrom, DataFlow::Node nod
or
any(AdditionalTaintStep a).step(nodeFrom, nodeTo)
}
/**
 * Holds if `nodeTo` is tainted by `nodeFrom` through a single local step that is not
 * a local data flow step. That is, `nodeFrom` and `nodeTo` are likely to represent
 * different objects.
 *
 * A step is either a string manipulation (`stringManipulation`), a subscript
 * (`subscriptStep`), or a concatenation (`concatStep`).
 */
predicate localAdditionalTaintStep(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
  stringManipulation(nodeFrom, nodeTo)
  or
  subscriptStep(nodeFrom, nodeTo)
  or
  concatStep(nodeFrom, nodeTo)
}
/**
 * Holds if `nodeTo` is a `+` expression and `nodeFrom` is one of its operands, so that
 * taint can flow from `nodeFrom` to `nodeTo` through concatenation.
 *
 * Note that since we cannot easily distinguish interesting types (like string, list, tuple),
 * we consider any `+` operation to propagate taint. After consulting with the JS team, this
 * doesn't sound like it is a big problem in practice.
 */
predicate concatStep(DataFlow::CfgNode nodeFrom, DataFlow::CfgNode nodeTo) {
  exists(BinaryExprNode plus |
    plus = nodeTo.getNode() and
    plus.getOp() instanceof Add and
    plus.getAnOperand() = nodeFrom.getNode()
  )
}
/**
 * Holds if `nodeTo` is a subscript expression (such as `x[0]` or `x[1:2]`) and `nodeFrom`
 * is the object being subscripted, so that taint can flow from `nodeFrom` to `nodeTo`.
 */
predicate subscriptStep(DataFlow::CfgNode nodeFrom, DataFlow::CfgNode nodeTo) {
  exists(SubscriptNode subscript |
    subscript = nodeTo.getNode() and
    subscript.getObject() = nodeFrom.getNode()
  )
}
/**
 * Holds if taint can flow from `nodeFrom` to `nodeTo` with a step related to string
 * manipulation.
 *
 * The steps covered are: conversion calls (`str`, `bytes`, `unicode`), a fixed list of
 * string methods, `%` formatting, string multiplication, and f-strings.
 *
 * Note that since we cannot easily distinguish when something is a string, this can
 * also make taint flow on `<non string>.replace(foo, bar)`.
 */
predicate stringManipulation(DataFlow::CfgNode nodeFrom, DataFlow::CfgNode nodeTo) {
  // transforming something tainted into a string will make the string tainted.
  // The callee is matched by its syntactic name, so `_str = str; _str(x)` is not recognized.
  exists(CallNode call | call = nodeTo.getNode() |
    call.getFunction().(NameNode).getId() in ["str", "bytes", "unicode"] and
    (
      nodeFrom.getNode() = call.getArg(0)
      or
      nodeFrom.getNode() = call.getArgByName("object")
    )
  )
  or
  // String methods. Note that this doesn't recognize `meth = "foo".upper; meth()`
  exists(CallNode call, string method_name, ControlFlowNode object |
    call = nodeTo.getNode() and
    object = call.getFunction().(AttrNode).getObject(method_name)
  |
    // receiver -> result: `tainted.upper()`
    nodeFrom.getNode() = object and
    method_name in ["capitalize", "casefold", "center", "expandtabs", "format", "format_map",
          "join", "ljust", "lstrip", "lower", "replace", "rjust", "rstrip", "strip", "swapcase",
          "title", "upper", "zfill", "encode", "decode"]
    or
    // replacement text -> result: `"safe".replace("safe", tainted)`
    method_name = "replace" and
    nodeFrom.getNode() = call.getArg(1)
    or
    // any argument -> result: `"{}".format(tainted)`
    method_name = "format" and
    nodeFrom.getNode() = call.getAnArg()
    or
    // str -> List[str] (`split`, `rsplit`, `splitlines`)
    // str -> Tuple[str, str, str] (`partition`, `rpartition`)
    // TODO: check if these should be handled differently in regards to content
    nodeFrom.getNode() = object and
    method_name in ["partition", "rpartition", "rsplit", "split", "splitlines"]
    or
    // List[str] -> str
    // The step starts at the argument node itself, not at the elements it contains.
    // TODO: check if these should be handled differently in regards to content
    method_name = "join" and
    nodeFrom.getNode() = call.getArg(0)
    or
    // Mapping[str, Any] -> str
    // The step starts at the mapping node itself, not at the values it contains.
    method_name = "format_map" and
    nodeFrom.getNode() = call.getArg(0)
  )
  or
  // % formatting: both the format string (left) and the values (right) propagate taint
  exists(BinaryExprNode fmt | fmt = nodeTo.getNode() |
    fmt.getOp() instanceof Mod and
    (
      fmt.getLeft() = nodeFrom.getNode()
      or
      fmt.getRight() = nodeFrom.getNode()
    )
  )
  or
  // string multiplication -- `"foo" * 10`
  // Only the left operand is considered, so `10 * "foo"` is not covered.
  exists(BinaryExprNode mult | mult = nodeTo.getNode() |
    mult.getOp() instanceof Mult and
    mult.getLeft() = nodeFrom.getNode()
  )
  or
  // f-strings: any interpolated value taints the whole f-string
  nodeTo.getNode().getNode().(Fstring).getAValue() = nodeFrom.getNode().getNode()
  // TODO: Handle encode/decode from base64/quopri
  // TODO: Handle os.path.join
  // TODO: Handle functions in https://docs.python.org/3/library/binascii.html
}

View File

@@ -0,0 +1 @@
semmle-extractor-options: --max-import-depth=1

View File

@@ -0,0 +1,72 @@
import python
import experimental.dataflow.TaintTracking
import experimental.dataflow.DataFlow
/**
 * Taint tracking configuration used by the tests: the names `TAINTED_STRING` and
 * `TAINTED_BYTES` are sources, and arguments of calls to `ensure_tainted` or
 * `ensure_not_tainted` are sinks.
 */
class TestTaintTrackingConfiguration extends TaintTracking::Configuration {
  TestTaintTrackingConfiguration() { this = "TestTaintTrackingConfiguration" }

  /** Holds if `source` is a name node whose identifier is one of the tainted test names. */
  override predicate isSource(DataFlow::Node source) {
    exists(NameNode name |
      name = source.(DataFlow::CfgNode).getNode() and
      name.getId() in ["TAINTED_STRING", "TAINTED_BYTES"]
    )
  }

  /** Holds if `sink` is an argument of a call whose callee is named like a marker function. */
  override predicate isSink(DataFlow::Node sink) {
    exists(CallNode call, NameNode callee |
      callee = call.getFunction() and
      callee.getId() in ["ensure_tainted", "ensure_not_tainted"] and
      call.getAnArg() = sink.(DataFlow::CfgNode).getNode()
    )
  }
}
/**
 * Gets a compact rendering of `e` for the test output.
 *
 * Numbers, string constants, subscripts, calls and attribute accesses are rendered
 * structurally (calls as `f()` or `f(..)` depending on whether they have arguments);
 * every other expression falls back to its `toString()`.
 */
private string repr(Expr e) {
  exists(Num num | num = e | result = num.getN())
  or
  exists(StrConst lit | lit = e |
    // the closing quote is the prefix with its letters (such as `b` or `f`) removed
    result = lit.getPrefix() + lit.getText() + lit.getPrefix().regexpReplaceAll("[a-zA-Z]+", "")
  )
  or
  exists(Subscript sub | sub = e |
    result = repr(sub.getObject()) + "[" + repr(sub.getIndex()) + "]"
  )
  or
  exists(Call call, string args | call = e |
    (
      if exists(call.getAnArg()) or exists(call.getANamedArg())
      then args = "(..)"
      else args = "()"
    ) and
    result = repr(call.getFunc()) + args
  )
  or
  exists(Attribute attr | attr = e | result = repr(attr.getObject()) + "." + attr.getName())
  or
  // fallback for every expression kind that is not handled above
  not e instanceof Num and
  not e instanceof StrConst and
  not e instanceof Subscript and
  not e instanceof Call and
  not e instanceof Attribute and
  result = e.toString()
}
/**
 * Reports, for each argument of an `ensure_tainted(..)` or `ensure_not_tainted(..)` call
 * in a file named `test.py`, whether the taint found by the analysis matches the
 * expectation expressed by the name of the called function.
 *
 * - `arg_location`: location of the argument expression.
 * - `test_res`: `"fail"` if expectation and analysis disagree, otherwise the "ok" marker.
 * - `function_name`: name of the function containing the call; calls whose scope is not
 *   a function produce no row.
 * - `repr`: compact rendering of the argument expression.
 */
query predicate test_taint(string arg_location, string test_res, string function_name, string repr) {
  exists(Call call, Expr arg, boolean expected_taint, boolean has_taint |
    call.getLocation().getFile().getShortName() = "test.py" and
    (
      call.getFunc().(Name).getId() = "ensure_tainted" and
      expected_taint = true
      or
      call.getFunc().(Name).getId() = "ensure_not_tainted" and
      expected_taint = false
    ) and
    arg = call.getAnArg() and
    (
      // Flow found by any taint tracking configuration counts.
      // TODO: Replace with `hasFlowToExpr` once that is working
      if
        exists(TaintTracking::Configuration c |
          c.hasFlowTo(any(DataFlow::Node n | n.(DataFlow::CfgNode).getNode() = arg.getAFlowNode()))
        )
      then has_taint = true
      else has_taint = false
    ) and
    (if expected_taint = has_taint then test_res = "ok " else test_res = "fail") and
    // select
    arg_location = arg.getLocation().toString() and
    function_name = call.getScope().(Function).getName() and
    repr = repr(arg)
  )
}

View File

@@ -0,0 +1,10 @@
| test.py:26 | ok | str_methods | ts.casefold() |
| test.py:28 | ok | str_methods | ts.format_map(..) |
| test.py:29 | fail | str_methods | "{unsafe}".format_map(..) |
| test.py:40 | fail | binary_decode_encode | base64.a85encode(..) |
| test.py:41 | fail | binary_decode_encode | base64.a85decode(..) |
| test.py:44 | fail | binary_decode_encode | base64.b85encode(..) |
| test.py:45 | fail | binary_decode_encode | base64.b85decode(..) |
| test.py:48 | fail | binary_decode_encode | base64.encodebytes(..) |
| test.py:49 | fail | binary_decode_encode | base64.decodebytes(..) |
| test.py:57 | ok | f_strings | Fstring |

View File

@@ -0,0 +1 @@
import experimental.dataflow.tainttracking.TestTaintLib

View File

@@ -0,0 +1 @@
semmle-extractor-options: --max-import-depth=1 --lang=3

View File

@@ -0,0 +1,64 @@
# Python 3 specific taint tracking tests for strings
TAINTED_STRING = "TAINTED_STRING"  # the test query treats the name `TAINTED_STRING` as a source
TAINTED_BYTES = b"TAINTED_BYTES"  # the test query treats the name `TAINTED_BYTES` as a source
def ensure_tainted(*args):  # marker: the test query expects every argument to be tainted
    report = ["- ensure_tainted"]
    report.extend("arg {}: {!r}".format(pos, value) for pos, value in enumerate(args))
    print("\n".join(report))
def ensure_not_tainted(*args):  # marker: the test query expects no argument to be tainted
    report = ["- ensure_not_tainted"]
    report.extend("arg {}: {!r}".format(pos, value) for pos, value in enumerate(args))
    print("\n".join(report))
# Actual tests
def str_methods():  # str methods kept in the Python 3 specific test file
    print("\n# str_methods")
    ts = TAINTED_STRING
    tb = TAINTED_BYTES
    ensure_tainted(
        ts.casefold(),
        ts.format_map({}),
        "{unsafe}".format_map({"unsafe": ts}),  # taint is a value inside the mapping argument
    )
def binary_decode_encode():  # base64 helpers that were added in Python 3
    print("\n#binary_decode_encode")
    tb = TAINTED_BYTES
    import base64
    ensure_tainted(
        # New in Python 3.4
        base64.a85encode(tb),
        base64.a85decode(base64.a85encode(tb)),
        # New in Python 3.4
        base64.b85encode(tb),
        base64.b85decode(base64.b85encode(tb)),
        # New in Python 3.1
        base64.encodebytes(tb),
        base64.decodebytes(base64.encodebytes(tb)),
    )
def f_strings():  # an f-string that interpolates a tainted value is expected to be tainted
    print("\n#f_strings")
    ts = TAINTED_STRING
    ensure_tainted(f"foo {ts} bar")
# Make tests runnable
str_methods()
binary_decode_encode()
f_strings()

View File

@@ -0,0 +1,62 @@
| test.py:32 | ok | str_operations | ts |
| test.py:33 | ok | str_operations | BinaryExpr |
| test.py:34 | ok | str_operations | BinaryExpr |
| test.py:35 | ok | str_operations | BinaryExpr |
| test.py:36 | ok | str_operations | ts[Slice] |
| test.py:37 | ok | str_operations | ts[Slice] |
| test.py:38 | ok | str_operations | ts[Slice] |
| test.py:39 | ok | str_operations | ts[0] |
| test.py:40 | ok | str_operations | str(..) |
| test.py:41 | ok | str_operations | bytes(..) |
| test.py:42 | ok | str_operations | unicode(..) |
| test.py:51 | ok | str_methods | ts.capitalize() |
| test.py:52 | ok | str_methods | ts.center(..) |
| test.py:53 | ok | str_methods | ts.expandtabs() |
| test.py:55 | ok | str_methods | ts.format() |
| test.py:56 | ok | str_methods | "{}".format(..) |
| test.py:57 | ok | str_methods | "{unsafe}".format(..) |
| test.py:59 | ok | str_methods | ts.join(..) |
| test.py:60 | fail | str_methods | "".join(..) |
| test.py:62 | ok | str_methods | ts.ljust(..) |
| test.py:63 | ok | str_methods | ts.lstrip() |
| test.py:64 | ok | str_methods | ts.lower() |
| test.py:66 | ok | str_methods | ts.replace(..) |
| test.py:67 | ok | str_methods | "safe".replace(..) |
| test.py:69 | ok | str_methods | ts.rjust(..) |
| test.py:70 | ok | str_methods | ts.rstrip() |
| test.py:71 | ok | str_methods | ts.strip() |
| test.py:72 | ok | str_methods | ts.swapcase() |
| test.py:73 | ok | str_methods | ts.title() |
| test.py:74 | ok | str_methods | ts.upper() |
| test.py:75 | ok | str_methods | ts.zfill(..) |
| test.py:77 | ok | str_methods | ts.encode(..) |
| test.py:78 | ok | str_methods | ts.encode(..).decode(..) |
| test.py:80 | ok | str_methods | tb.decode(..) |
| test.py:81 | ok | str_methods | tb.decode(..).encode(..) |
| test.py:84 | ok | str_methods | ts.partition(..) |
| test.py:85 | ok | str_methods | ts.rpartition(..) |
| test.py:86 | ok | str_methods | ts.rsplit(..) |
| test.py:87 | ok | str_methods | ts.split(..) |
| test.py:88 | ok | str_methods | ts.splitlines() |
| test.py:93 | ok | str_methods | "safe".replace(..) |
| test.py:95 | fail | str_methods | ts.join(..) |
| test.py:96 | fail | str_methods | ts.join(..) |
| test.py:106 | fail | non_syntactic | meth() |
| test.py:107 | fail | non_syntactic | _str(..) |
| test.py:116 | ok | percent_fmt | BinaryExpr |
| test.py:117 | ok | percent_fmt | BinaryExpr |
| test.py:118 | fail | percent_fmt | BinaryExpr |
| test.py:128 | fail | binary_decode_encode | base64.b64encode(..) |
| test.py:129 | fail | binary_decode_encode | base64.b64decode(..) |
| test.py:131 | fail | binary_decode_encode | base64.standard_b64encode(..) |
| test.py:132 | fail | binary_decode_encode | base64.standard_b64decode(..) |
| test.py:134 | fail | binary_decode_encode | base64.urlsafe_b64encode(..) |
| test.py:135 | fail | binary_decode_encode | base64.urlsafe_b64decode(..) |
| test.py:137 | fail | binary_decode_encode | base64.b32encode(..) |
| test.py:138 | fail | binary_decode_encode | base64.b32decode(..) |
| test.py:140 | fail | binary_decode_encode | base64.b16encode(..) |
| test.py:141 | fail | binary_decode_encode | base64.b16decode(..) |
| test.py:156 | fail | binary_decode_encode | base64.encodestring(..) |
| test.py:157 | fail | binary_decode_encode | base64.decodestring(..) |
| test.py:162 | fail | binary_decode_encode | quopri.encodestring(..) |
| test.py:163 | fail | binary_decode_encode | quopri.decodestring(..) |

View File

@@ -0,0 +1 @@
import experimental.dataflow.tainttracking.TestTaintLib

View File

@@ -0,0 +1,173 @@
import sys
if sys.version_info[0] == 3:  # Python 3 has no `unicode` builtin; alias it so `unicode(ts)` runs
    unicode = str
TAINTED_STRING = "TAINTED_STRING"  # the test query treats the name `TAINTED_STRING` as a source
TAINTED_BYTES = b"TAINTED_BYTES"  # the test query treats the name `TAINTED_BYTES` as a source
def ensure_tainted(*args):  # marker: the test query expects every argument to be tainted
    report = ["- ensure_tainted"]
    report.extend("arg {}: {!r}".format(pos, value) for pos, value in enumerate(args))
    print("\n".join(report))
def ensure_not_tainted(*args):  # marker: the test query expects no argument to be tainted
    report = ["- ensure_not_tainted"]
    report.extend("arg {}: {!r}".format(pos, value) for pos, value in enumerate(args))
    print("\n".join(report))
# Actual tests
def str_operations():  # operators, subscripts/slices and conversion calls on tainted values
    print("\n# str_operations")
    ts = TAINTED_STRING
    tb = TAINTED_BYTES
    ensure_tainted(
        ts,
        ts + "foo",
        "foo" + ts,
        ts * 5,
        ts[0 : len(ts)],
        ts[:],
        ts[0:1000],
        ts[0],
        str(ts),
        bytes(tb),
        unicode(ts),
    )
def str_methods():  # str/bytes methods with a tainted receiver or a tainted argument
    print("\n# str_methods")
    ts = TAINTED_STRING
    tb = TAINTED_BYTES
    ensure_tainted(
        ts.capitalize(),
        ts.center(100),
        ts.expandtabs(),
        ts.format(),
        "{}".format(ts),
        "{unsafe}".format(unsafe=ts),
        ts.join(["", ""]),
        "".join([ts]),
        ts.ljust(100),
        ts.lstrip(),
        ts.lower(),
        ts.replace("old", "new"),
        "safe".replace("safe", ts),
        ts.rjust(100),
        ts.rstrip(),
        ts.strip(),
        ts.swapcase(),
        ts.title(),
        ts.upper(),
        ts.zfill(100),
        ts.encode("utf-8"),
        ts.encode("utf-8").decode("utf-8"),
        tb.decode("utf-8"),
        tb.decode("utf-8").encode("utf-8"),
        # string methods that return a list (or a 3-tuple for partition/rpartition)
        ts.partition("_"),
        ts.rpartition("_"),
        ts.rsplit("_"),
        ts.split("_"),
        ts.splitlines(),
    )
    ensure_not_tainted(
        # Intuitively I think this should be safe, but better discuss it
        "safe".replace(ts, "also-safe"),
        ts.join([]),  # FP due to separator not being used with zero/one elements
        ts.join(["safe"]),  # FP due to separator not being used with zero/one elements
    )
def non_syntactic():  # callee reached through an alias, so its name is not visible at the call
    print("\n# non_syntactic")
    ts = TAINTED_STRING
    meth = ts.upper
    _str = str
    ensure_tainted(
        meth(),
        _str(ts),
    )
def percent_fmt():  # `%` formatting with taint in the format string or in the values
    print("\n#percent_fmt")
    ts = TAINTED_STRING
    tainted_fmt = ts + " %s %s"
    ensure_tainted(
        tainted_fmt % (1, 2),
        "%s foo bar" % ts,
        "%s %s %s" % (1, 2, ts),  # taint is an element inside the tuple operand
    )
def binary_decode_encode():  # base64/quopri round trips available on both Python 2 and 3
    print("\n#binary_decode_encode")
    tb = TAINTED_BYTES
    import base64
    ensure_tainted(
        base64.b64encode(tb),
        base64.b64decode(base64.b64encode(tb)),
        base64.standard_b64encode(tb),
        base64.standard_b64decode(base64.standard_b64encode(tb)),
        base64.urlsafe_b64encode(tb),
        base64.urlsafe_b64decode(base64.urlsafe_b64encode(tb)),
        base64.b32encode(tb),
        base64.b32decode(base64.b32encode(tb)),
        base64.b16encode(tb),
        base64.b16decode(base64.b16encode(tb)),
        # # New in Python 3.4
        # base64.a85encode(tb),
        # base64.a85decode(base64.a85encode(tb)),
        # # New in Python 3.4
        # base64.b85encode(tb),
        # base64.b85decode(base64.b85encode(tb)),
        # # New in Python 3.1
        # base64.encodebytes(tb),
        # base64.decodebytes(base64.encodebytes(tb)),
        # deprecated since Python 3.1 and removed in Python 3.9 (AttributeError when run there)
        base64.encodestring(tb),
        base64.decodestring(base64.encodestring(tb)),
    )
    import quopri
    ensure_tainted(
        quopri.encodestring(tb),
        quopri.decodestring(quopri.encodestring(tb)),
    )
# Make tests runnable
str_operations()
str_methods()
non_syntactic()
percent_fmt()
binary_decode_encode()