mirror of
https://github.com/github/codeql.git
synced 2026-04-25 16:55:19 +02:00
Merge pull request #4124 from RasmusWL/python-taint-tracking-string-methods
Approved by yoff
This commit is contained in:
@@ -3,13 +3,6 @@ private import experimental.dataflow.DataFlow
|
||||
private import experimental.dataflow.internal.DataFlowPrivate
|
||||
private import experimental.dataflow.internal.TaintTrackingPublic
|
||||
|
||||
/**
|
||||
* Holds if taint can flow in one local step from `nodeFrom` to `nodeTo` excluding
|
||||
* local data flow steps. That is, `nodeFrom` and `nodeTo` are likely to represent
|
||||
* different objects.
|
||||
*/
|
||||
predicate localAdditionalTaintStep(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) { none() }
|
||||
|
||||
/**
|
||||
* Holds if `node` should be a barrier in all global taint flow configurations
|
||||
* but not in local taint.
|
||||
@@ -25,3 +18,108 @@ predicate defaultAdditionalTaintStep(DataFlow::Node nodeFrom, DataFlow::Node nod
|
||||
or
|
||||
any(AdditionalTaintStep a).step(nodeFrom, nodeTo)
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if taint can flow in one local step from `nodeFrom` to `nodeTo` excluding
|
||||
* local data flow steps. That is, `nodeFrom` and `nodeTo` are likely to represent
|
||||
* different objects.
|
||||
*/
|
||||
predicate localAdditionalTaintStep(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
|
||||
concatStep(nodeFrom, nodeTo)
|
||||
or
|
||||
subscriptStep(nodeFrom, nodeTo)
|
||||
or
|
||||
stringManipulation(nodeFrom, nodeTo)
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if taint can flow from `nodeFrom` to `nodeTo` with a step related to concatenation.
|
||||
*
|
||||
* Note that since we cannot easily distinguish interesting types (like string, list, tuple),
|
||||
* we consider any `+` operation to propagate taint. After consulting with the JS team, this
|
||||
* doesn't sound like it is a big problem in practice.
|
||||
*/
|
||||
predicate concatStep(DataFlow::CfgNode nodeFrom, DataFlow::CfgNode nodeTo) {
|
||||
exists(BinaryExprNode add | add = nodeTo.getNode() |
|
||||
add.getOp() instanceof Add and add.getAnOperand() = nodeFrom.getNode()
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if taint can flow from `nodeFrom` to `nodeTo` with a step related to subscripting.
|
||||
*/
|
||||
predicate subscriptStep(DataFlow::CfgNode nodeFrom, DataFlow::CfgNode nodeTo) {
|
||||
nodeTo.getNode().(SubscriptNode).getObject() = nodeFrom.getNode()
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if taint can flow from `nodeFrom` to `nodeTo` with a step related to string
|
||||
* manipulation.
|
||||
*
|
||||
* Note that since we cannot easily distinguish when something is a string, this can
|
||||
* also make taint flow on `<non string>.replace(foo, bar)`.
|
||||
*/
|
||||
predicate stringManipulation(DataFlow::CfgNode nodeFrom, DataFlow::CfgNode nodeTo) {
|
||||
// transforming something tainted into a string will make the string tainted
|
||||
exists(CallNode call | call = nodeTo.getNode() |
|
||||
call.getFunction().(NameNode).getId() in ["str", "bytes", "unicode"] and
|
||||
(
|
||||
nodeFrom.getNode() = call.getArg(0)
|
||||
or
|
||||
nodeFrom.getNode() = call.getArgByName("object")
|
||||
)
|
||||
)
|
||||
or
|
||||
// String methods. Note that this doesn't recognize `meth = "foo".upper; meth()`
|
||||
exists(CallNode call, string method_name, ControlFlowNode object |
|
||||
call = nodeTo.getNode() and
|
||||
object = call.getFunction().(AttrNode).getObject(method_name)
|
||||
|
|
||||
nodeFrom.getNode() = object and
|
||||
method_name in ["capitalize", "casefold", "center", "expandtabs", "format", "format_map",
|
||||
"join", "ljust", "lstrip", "lower", "replace", "rjust", "rstrip", "strip", "swapcase",
|
||||
"title", "upper", "zfill", "encode", "decode"]
|
||||
or
|
||||
method_name = "replace" and
|
||||
nodeFrom.getNode() = call.getArg(1)
|
||||
or
|
||||
method_name = "format" and
|
||||
nodeFrom.getNode() = call.getAnArg()
|
||||
or
|
||||
// str -> List[str]
|
||||
// TODO: check if these should be handled differently in regards to content
|
||||
nodeFrom.getNode() = object and
|
||||
method_name in ["partition", "rpartition", "rsplit", "split", "splitlines"]
|
||||
or
|
||||
// List[str] -> str
|
||||
// TODO: check if these should be handled differently in regards to content
|
||||
method_name = "join" and
|
||||
nodeFrom.getNode() = call.getArg(0)
|
||||
or
|
||||
// Mapping[str, Any] -> str
|
||||
method_name = "format_map" and
|
||||
nodeFrom.getNode() = call.getArg(0)
|
||||
)
|
||||
or
|
||||
// % formatting
|
||||
exists(BinaryExprNode fmt | fmt = nodeTo.getNode() |
|
||||
fmt.getOp() instanceof Mod and
|
||||
(
|
||||
fmt.getLeft() = nodeFrom.getNode()
|
||||
or
|
||||
fmt.getRight() = nodeFrom.getNode()
|
||||
)
|
||||
)
|
||||
or
|
||||
// string multiplication -- `"foo" * 10`
|
||||
exists(BinaryExprNode mult | mult = nodeTo.getNode() |
|
||||
mult.getOp() instanceof Mult and
|
||||
mult.getLeft() = nodeFrom.getNode()
|
||||
)
|
||||
or
|
||||
// f-strings
|
||||
nodeTo.getNode().getNode().(Fstring).getAValue() = nodeFrom.getNode().getNode()
|
||||
// TODO: Handle encode/decode from base64/quopri
|
||||
// TODO: Handle os.path.join
|
||||
// TODO: Handle functions in https://docs.python.org/3/library/binascii.html
|
||||
}
|
||||
|
||||
1
python/ql/test/experimental/dataflow/options
Normal file
1
python/ql/test/experimental/dataflow/options
Normal file
@@ -0,0 +1 @@
|
||||
semmle-extractor-options: --max-import-depth=1
|
||||
@@ -0,0 +1,72 @@
|
||||
import python
|
||||
import experimental.dataflow.TaintTracking
|
||||
import experimental.dataflow.DataFlow
|
||||
|
||||
class TestTaintTrackingConfiguration extends TaintTracking::Configuration {
|
||||
TestTaintTrackingConfiguration() { this = "TestTaintTrackingConfiguration" }
|
||||
|
||||
override predicate isSource(DataFlow::Node source) {
|
||||
source.(DataFlow::CfgNode).getNode().(NameNode).getId() in ["TAINTED_STRING", "TAINTED_BYTES"]
|
||||
}
|
||||
|
||||
override predicate isSink(DataFlow::Node sink) {
|
||||
exists(CallNode call |
|
||||
call.getFunction().(NameNode).getId() in ["ensure_tainted", "ensure_not_tainted"] and
|
||||
sink.(DataFlow::CfgNode).getNode() = call.getAnArg()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
private string repr(Expr e) {
|
||||
not e instanceof Num and
|
||||
not e instanceof StrConst and
|
||||
not e instanceof Subscript and
|
||||
not e instanceof Call and
|
||||
not e instanceof Attribute and
|
||||
result = e.toString()
|
||||
or
|
||||
result = e.(Num).getN()
|
||||
or
|
||||
result =
|
||||
e.(StrConst).getPrefix() + e.(StrConst).getText() +
|
||||
e.(StrConst).getPrefix().regexpReplaceAll("[a-zA-Z]+", "")
|
||||
or
|
||||
result = repr(e.(Subscript).getObject()) + "[" + repr(e.(Subscript).getIndex()) + "]"
|
||||
or
|
||||
(
|
||||
if exists(e.(Call).getAnArg()) or exists(e.(Call).getANamedArg())
|
||||
then result = repr(e.(Call).getFunc()) + "(..)"
|
||||
else result = repr(e.(Call).getFunc()) + "()"
|
||||
)
|
||||
or
|
||||
result = repr(e.(Attribute).getObject()) + "." + e.(Attribute).getName()
|
||||
}
|
||||
|
||||
query predicate test_taint(string arg_location, string test_res, string function_name, string repr) {
|
||||
exists(Call call, Expr arg, boolean expected_taint, boolean has_taint |
|
||||
call.getLocation().getFile().getShortName() = "test.py" and
|
||||
(
|
||||
call.getFunc().(Name).getId() = "ensure_tainted" and
|
||||
expected_taint = true
|
||||
or
|
||||
call.getFunc().(Name).getId() = "ensure_not_tainted" and
|
||||
expected_taint = false
|
||||
) and
|
||||
arg = call.getAnArg() and
|
||||
(
|
||||
// TODO: Replace with `hasFlowToExpr` once that is working
|
||||
if
|
||||
exists(TaintTracking::Configuration c |
|
||||
c.hasFlowTo(any(DataFlow::Node n | n.(DataFlow::CfgNode).getNode() = arg.getAFlowNode()))
|
||||
)
|
||||
then has_taint = true
|
||||
else has_taint = false
|
||||
) and
|
||||
(if expected_taint = has_taint then test_res = "ok " else test_res = "fail") and
|
||||
// select
|
||||
arg_location = arg.getLocation().toString() and
|
||||
test_res = test_res and
|
||||
function_name = call.getScope().(Function).getName() and
|
||||
repr = repr(arg)
|
||||
)
|
||||
}
|
||||
@@ -0,0 +1,10 @@
|
||||
| test.py:26 | ok | str_methods | ts.casefold() |
|
||||
| test.py:28 | ok | str_methods | ts.format_map(..) |
|
||||
| test.py:29 | fail | str_methods | "{unsafe}".format_map(..) |
|
||||
| test.py:40 | fail | binary_decode_encode | base64.a85encode(..) |
|
||||
| test.py:41 | fail | binary_decode_encode | base64.a85decode(..) |
|
||||
| test.py:44 | fail | binary_decode_encode | base64.b85encode(..) |
|
||||
| test.py:45 | fail | binary_decode_encode | base64.b85decode(..) |
|
||||
| test.py:48 | fail | binary_decode_encode | base64.encodebytes(..) |
|
||||
| test.py:49 | fail | binary_decode_encode | base64.decodebytes(..) |
|
||||
| test.py:57 | ok | f_strings | Fstring |
|
||||
@@ -0,0 +1 @@
|
||||
import experimental.dataflow.tainttracking.TestTaintLib
|
||||
@@ -0,0 +1 @@
|
||||
semmle-extractor-options: --max-import-depth=1 --lang=3
|
||||
@@ -0,0 +1,64 @@
|
||||
# Python 3 specific taint tracking for string
|
||||
|
||||
TAINTED_STRING = "TAINTED_STRING"
|
||||
TAINTED_BYTES = b"TAINTED_BYTES"
|
||||
|
||||
|
||||
def ensure_tainted(*args):
|
||||
print("- ensure_tainted")
|
||||
for i, arg in enumerate(args):
|
||||
print("arg {}: {!r}".format(i, arg))
|
||||
|
||||
|
||||
def ensure_not_tainted(*args):
|
||||
print("- ensure_not_tainted")
|
||||
for i, arg in enumerate(args):
|
||||
print("arg {}: {!r}".format(i, arg))
|
||||
|
||||
|
||||
# Actual tests
|
||||
|
||||
def str_methods():
|
||||
print("\n# str_methods")
|
||||
ts = TAINTED_STRING
|
||||
tb = TAINTED_BYTES
|
||||
ensure_tainted(
|
||||
ts.casefold(),
|
||||
|
||||
ts.format_map({}),
|
||||
"{unsafe}".format_map({"unsafe": ts}),
|
||||
)
|
||||
|
||||
|
||||
def binary_decode_encode():
|
||||
print("\n#percent_fmt")
|
||||
tb = TAINTED_BYTES
|
||||
import base64
|
||||
|
||||
ensure_tainted(
|
||||
# New in Python 3.4
|
||||
base64.a85encode(tb),
|
||||
base64.a85decode(base64.a85encode(tb)),
|
||||
|
||||
# New in Python 3.4
|
||||
base64.b85encode(tb),
|
||||
base64.b85decode(base64.b85encode(tb)),
|
||||
|
||||
# New in Python 3.1
|
||||
base64.encodebytes(tb),
|
||||
base64.decodebytes(base64.encodebytes(tb)),
|
||||
)
|
||||
|
||||
|
||||
def f_strings():
|
||||
print("\n#f_strings")
|
||||
ts = TAINTED_STRING
|
||||
|
||||
ensure_tainted(f"foo {ts} bar")
|
||||
|
||||
|
||||
# Make tests runable
|
||||
|
||||
str_methods()
|
||||
binary_decode_encode()
|
||||
f_strings()
|
||||
@@ -0,0 +1,62 @@
|
||||
| test.py:32 | ok | str_operations | ts |
|
||||
| test.py:33 | ok | str_operations | BinaryExpr |
|
||||
| test.py:34 | ok | str_operations | BinaryExpr |
|
||||
| test.py:35 | ok | str_operations | BinaryExpr |
|
||||
| test.py:36 | ok | str_operations | ts[Slice] |
|
||||
| test.py:37 | ok | str_operations | ts[Slice] |
|
||||
| test.py:38 | ok | str_operations | ts[Slice] |
|
||||
| test.py:39 | ok | str_operations | ts[0] |
|
||||
| test.py:40 | ok | str_operations | str(..) |
|
||||
| test.py:41 | ok | str_operations | bytes(..) |
|
||||
| test.py:42 | ok | str_operations | unicode(..) |
|
||||
| test.py:51 | ok | str_methods | ts.capitalize() |
|
||||
| test.py:52 | ok | str_methods | ts.center(..) |
|
||||
| test.py:53 | ok | str_methods | ts.expandtabs() |
|
||||
| test.py:55 | ok | str_methods | ts.format() |
|
||||
| test.py:56 | ok | str_methods | "{}".format(..) |
|
||||
| test.py:57 | ok | str_methods | "{unsafe}".format(..) |
|
||||
| test.py:59 | ok | str_methods | ts.join(..) |
|
||||
| test.py:60 | fail | str_methods | "".join(..) |
|
||||
| test.py:62 | ok | str_methods | ts.ljust(..) |
|
||||
| test.py:63 | ok | str_methods | ts.lstrip() |
|
||||
| test.py:64 | ok | str_methods | ts.lower() |
|
||||
| test.py:66 | ok | str_methods | ts.replace(..) |
|
||||
| test.py:67 | ok | str_methods | "safe".replace(..) |
|
||||
| test.py:69 | ok | str_methods | ts.rjust(..) |
|
||||
| test.py:70 | ok | str_methods | ts.rstrip() |
|
||||
| test.py:71 | ok | str_methods | ts.strip() |
|
||||
| test.py:72 | ok | str_methods | ts.swapcase() |
|
||||
| test.py:73 | ok | str_methods | ts.title() |
|
||||
| test.py:74 | ok | str_methods | ts.upper() |
|
||||
| test.py:75 | ok | str_methods | ts.zfill(..) |
|
||||
| test.py:77 | ok | str_methods | ts.encode(..) |
|
||||
| test.py:78 | ok | str_methods | ts.encode(..).decode(..) |
|
||||
| test.py:80 | ok | str_methods | tb.decode(..) |
|
||||
| test.py:81 | ok | str_methods | tb.decode(..).encode(..) |
|
||||
| test.py:84 | ok | str_methods | ts.partition(..) |
|
||||
| test.py:85 | ok | str_methods | ts.rpartition(..) |
|
||||
| test.py:86 | ok | str_methods | ts.rsplit(..) |
|
||||
| test.py:87 | ok | str_methods | ts.split(..) |
|
||||
| test.py:88 | ok | str_methods | ts.splitlines() |
|
||||
| test.py:93 | ok | str_methods | "safe".replace(..) |
|
||||
| test.py:95 | fail | str_methods | ts.join(..) |
|
||||
| test.py:96 | fail | str_methods | ts.join(..) |
|
||||
| test.py:106 | fail | non_syntactic | meth() |
|
||||
| test.py:107 | fail | non_syntactic | _str(..) |
|
||||
| test.py:116 | ok | percent_fmt | BinaryExpr |
|
||||
| test.py:117 | ok | percent_fmt | BinaryExpr |
|
||||
| test.py:118 | fail | percent_fmt | BinaryExpr |
|
||||
| test.py:128 | fail | binary_decode_encode | base64.b64encode(..) |
|
||||
| test.py:129 | fail | binary_decode_encode | base64.b64decode(..) |
|
||||
| test.py:131 | fail | binary_decode_encode | base64.standard_b64encode(..) |
|
||||
| test.py:132 | fail | binary_decode_encode | base64.standard_b64decode(..) |
|
||||
| test.py:134 | fail | binary_decode_encode | base64.urlsafe_b64encode(..) |
|
||||
| test.py:135 | fail | binary_decode_encode | base64.urlsafe_b64decode(..) |
|
||||
| test.py:137 | fail | binary_decode_encode | base64.b32encode(..) |
|
||||
| test.py:138 | fail | binary_decode_encode | base64.b32decode(..) |
|
||||
| test.py:140 | fail | binary_decode_encode | base64.b16encode(..) |
|
||||
| test.py:141 | fail | binary_decode_encode | base64.b16decode(..) |
|
||||
| test.py:156 | fail | binary_decode_encode | base64.encodestring(..) |
|
||||
| test.py:157 | fail | binary_decode_encode | base64.decodestring(..) |
|
||||
| test.py:162 | fail | binary_decode_encode | quopri.encodestring(..) |
|
||||
| test.py:163 | fail | binary_decode_encode | quopri.decodestring(..) |
|
||||
@@ -0,0 +1 @@
|
||||
import experimental.dataflow.tainttracking.TestTaintLib
|
||||
@@ -0,0 +1,173 @@
|
||||
import sys
|
||||
|
||||
if sys.version_info[0] == 3:
|
||||
unicode = str
|
||||
|
||||
|
||||
TAINTED_STRING = "TAINTED_STRING"
|
||||
TAINTED_BYTES = b"TAINTED_BYTES"
|
||||
|
||||
|
||||
def ensure_tainted(*args):
|
||||
print("- ensure_tainted")
|
||||
for i, arg in enumerate(args):
|
||||
print("arg {}: {!r}".format(i, arg))
|
||||
|
||||
|
||||
def ensure_not_tainted(*args):
|
||||
print("- ensure_not_tainted")
|
||||
for i, arg in enumerate(args):
|
||||
print("arg {}: {!r}".format(i, arg))
|
||||
|
||||
|
||||
# Actual tests
|
||||
|
||||
|
||||
def str_operations():
|
||||
print("\n# str_operations")
|
||||
ts = TAINTED_STRING
|
||||
tb = TAINTED_BYTES
|
||||
|
||||
ensure_tainted(
|
||||
ts,
|
||||
ts + "foo",
|
||||
"foo" + ts,
|
||||
ts * 5,
|
||||
ts[0 : len(ts)],
|
||||
ts[:],
|
||||
ts[0:1000],
|
||||
ts[0],
|
||||
str(ts),
|
||||
bytes(tb),
|
||||
unicode(ts),
|
||||
)
|
||||
|
||||
|
||||
def str_methods():
|
||||
print("\n# str_methods")
|
||||
ts = TAINTED_STRING
|
||||
tb = TAINTED_BYTES
|
||||
ensure_tainted(
|
||||
ts.capitalize(),
|
||||
ts.center(100),
|
||||
ts.expandtabs(),
|
||||
|
||||
ts.format(),
|
||||
"{}".format(ts),
|
||||
"{unsafe}".format(unsafe=ts),
|
||||
|
||||
ts.join(["", ""]),
|
||||
"".join([ts]),
|
||||
|
||||
ts.ljust(100),
|
||||
ts.lstrip(),
|
||||
ts.lower(),
|
||||
|
||||
ts.replace("old", "new"),
|
||||
"safe".replace("safe", ts),
|
||||
|
||||
ts.rjust(100),
|
||||
ts.rstrip(),
|
||||
ts.strip(),
|
||||
ts.swapcase(),
|
||||
ts.title(),
|
||||
ts.upper(),
|
||||
ts.zfill(100),
|
||||
|
||||
ts.encode("utf-8"),
|
||||
ts.encode("utf-8").decode("utf-8"),
|
||||
|
||||
tb.decode("utf-8"),
|
||||
tb.decode("utf-8").encode("utf-8"),
|
||||
|
||||
# string methods that return a list
|
||||
ts.partition("_"),
|
||||
ts.rpartition("_"),
|
||||
ts.rsplit("_"),
|
||||
ts.split("_"),
|
||||
ts.splitlines(),
|
||||
)
|
||||
|
||||
ensure_not_tainted(
|
||||
# Intuitively I think this should be safe, but better discuss it
|
||||
"safe".replace(ts, "also-safe"),
|
||||
|
||||
ts.join([]), # FP due to separator not being used with zero/one elements
|
||||
ts.join(["safe"]), # FP due to separator not being used with zero/one elements
|
||||
)
|
||||
|
||||
|
||||
def non_syntactic():
|
||||
print("\n# non_syntactic")
|
||||
ts = TAINTED_STRING
|
||||
meth = ts.upper
|
||||
_str = str
|
||||
ensure_tainted(
|
||||
meth(),
|
||||
_str(ts),
|
||||
)
|
||||
|
||||
|
||||
def percent_fmt():
|
||||
print("\n#percent_fmt")
|
||||
ts = TAINTED_STRING
|
||||
tainted_fmt = ts + " %s %s"
|
||||
ensure_tainted(
|
||||
tainted_fmt % (1, 2),
|
||||
"%s foo bar" % ts,
|
||||
"%s %s %s" % (1, 2, ts),
|
||||
)
|
||||
|
||||
|
||||
def binary_decode_encode():
|
||||
print("\n#percent_fmt")
|
||||
tb = TAINTED_BYTES
|
||||
import base64
|
||||
|
||||
ensure_tainted(
|
||||
base64.b64encode(tb),
|
||||
base64.b64decode(base64.b64encode(tb)),
|
||||
|
||||
base64.standard_b64encode(tb),
|
||||
base64.standard_b64decode(base64.standard_b64encode(tb)),
|
||||
|
||||
base64.urlsafe_b64encode(tb),
|
||||
base64.urlsafe_b64decode(base64.urlsafe_b64encode(tb)),
|
||||
|
||||
base64.b32encode(tb),
|
||||
base64.b32decode(base64.b32encode(tb)),
|
||||
|
||||
base64.b16encode(tb),
|
||||
base64.b16decode(base64.b16encode(tb)),
|
||||
|
||||
# # New in Python 3.4
|
||||
# base64.a85encode(tb),
|
||||
# base64.a85decode(base64.a85encode(tb)),
|
||||
|
||||
# # New in Python 3.4
|
||||
# base64.b85encode(tb),
|
||||
# base64.b85decode(base64.b85encode(tb)),
|
||||
|
||||
# # New in Python 3.1
|
||||
# base64.encodebytes(tb),
|
||||
# base64.decodebytes(base64.encodebytes(tb)),
|
||||
|
||||
# deprecated since Python 3.1, but still works
|
||||
base64.encodestring(tb),
|
||||
base64.decodestring(base64.encodestring(tb)),
|
||||
)
|
||||
|
||||
import quopri
|
||||
ensure_tainted(
|
||||
quopri.encodestring(tb),
|
||||
quopri.decodestring(quopri.encodestring(tb)),
|
||||
)
|
||||
|
||||
|
||||
# Make tests runable
|
||||
|
||||
str_operations()
|
||||
str_methods()
|
||||
non_syntactic()
|
||||
percent_fmt()
|
||||
binary_decode_encode()
|
||||
Reference in New Issue
Block a user