Python: Port URL sanitisation queries to API graphs

Really, this boils down to "Port `re` library model to use API graphs
instead of points-to", which is what this PR actually does.

Instead of using points-to to track flags, we use a type tracker. To
handle multiple flags at the same time, we add additional flow from

`x` to `x | y` and `y | x`

and, as an added bonus, the above with `+` instead of `|`, neatly
fixing https://github.com/github/codeql/issues/4707

I had to modify the `Qualified.ql` test slightly, as it now had a
result stemming from the standard library (in `warnings.py`) that
points-to previously ignored.

It might be possible to implement this as a type tracker on
`LocalSourceNode`s, but with the added steps for the above operations,
this was not obvious to me, and so I opted for the simpler
"`smallstep`" variant.
This commit is contained in:
Taus Brock-Nannestad
2021-02-23 22:02:35 +01:00
parent f65843a273
commit e812eb777d
4 changed files with 73 additions and 29 deletions

View File

@@ -1,34 +1,39 @@
import python
import semmle.python.objects.ObjectInternal
deprecated import semmle.python.objects.ObjectInternal as OI
import semmle.python.ApiGraphs
private predicate re_module_function(string name, int flags) {
name = "compile" and flags = 1
/**
* Gets the positional argument index containing the regular expression flags for the member of the
* `re` module with the name `name`.
*/
private int re_member_flags_arg(string name) {
name = "compile" and result = 1
or
name = "search" and flags = 2
name = "search" and result = 2
or
name = "match" and flags = 2
name = "match" and result = 2
or
name = "split" and flags = 3
name = "split" and result = 3
or
name = "findall" and flags = 2
name = "findall" and result = 2
or
name = "finditer" and flags = 2
name = "finditer" and result = 2
or
name = "sub" and flags = 4
name = "sub" and result = 4
or
name = "subn" and flags = 4
name = "subn" and result = 4
}
/**
* Gets the names and corresponding values of attributes of the `re` module that are likely to be
* Gets the names and corresponding API nodes of members of the `re` module that are likely to be
* methods taking regular expressions as arguments.
*
* This is a helper predicate that fixes a bad join order, and should not be inlined without checking
* that this is safe.
*/
pragma[nomagic]
private Value relevant_re_attr(string name) {
result = Module::named("re").attr(name) and
private API::Node relevant_re_member(string name) {
result = API::moduleImport("re").getMember(name) and
name != "escape"
}
@@ -39,24 +44,60 @@ private Value relevant_re_attr(string name) {
predicate used_as_regex(Expr s, string mode) {
(s instanceof Bytes or s instanceof Unicode) and
/* Call to re.xxx(regex, ... [mode]) */
exists(CallNode call, string name |
call.getArg(0).pointsTo(_, _, s.getAFlowNode()) and
call.getFunction().pointsTo(relevant_re_attr(name))
exists(DataFlow::CallCfgNode call, string name |
call.getArg(0).asExpr() = s and
call = relevant_re_member(name).getACall()
|
mode = "None"
or
exists(Value obj | mode = mode_from_mode_object(obj) |
exists(int flags_arg |
re_module_function(name, flags_arg) and
call.getArg(flags_arg).pointsTo(obj)
)
or
call.getArgByName("flags").pointsTo(obj)
)
mode = mode_from_node([call.getArg(re_member_flags_arg(name)), call.getArgByName("flags")])
)
}
string mode_from_mode_object(Value obj) {
/**
* Gets the canonical name for the API graph node corresponding to the `re` flag `flag`. For flags
* that have multiple names, we pick the long-form name as a canonical representative.
*/
private string canonical_name(API::Node flag) {
result in ["ASCII", "IGNORECASE", "LOCALE", "UNICODE", "MULTILINE", "TEMPLATE"] and
flag = API::moduleImport("re").getMember([result, result.prefix(1)])
or
flag = API::moduleImport("re").getMember(["DOTALL", "S"]) and result = "DOTALL"
or
flag = API::moduleImport("re").getMember(["VERBOSE", "X"]) and result = "VERBOSE"
}
/**
* A type tracker for regular expression flag names. Holds if the result is a node that may refer
* to the `re` flag with the canonical name `flag_name`
*/
private DataFlow::Node re_flag_tracker(string flag_name, DataFlow::TypeTracker t) {
t.start() and
exists(API::Node flag | flag_name = canonical_name(flag) and result = flag.getAUse())
or
exists(BinaryExprNode binop |
(binop.getOp() instanceof BitOr or binop.getOp() instanceof Add) and
binop.getAnOperand() = re_flag_tracker(flag_name, t.continue()).asCfgNode() and
result.asCfgNode() = binop
)
or
exists(DataFlow::TypeTracker t2, DataFlow::Node prev | prev = re_flag_tracker(flag_name, t2) |
t2 = t.smallstep(prev, result)
)
}
/**
* A type tracker for regular expression flag names. Holds if the result is a node that may refer
* to the `re` flag with the canonical name `flag_name`
*/
private DataFlow::Node re_flag_tracker(string flag_name) {
result = re_flag_tracker(flag_name, DataFlow::TypeTracker::end())
}
/** Gets a regular expression mode flag associated with the given data flow node. */
string mode_from_node(DataFlow::Node node) { node = re_flag_tracker(result) }
deprecated string mode_from_mode_object(Value obj) {
(
result = "DEBUG" or
result = "IGNORECASE" or
@@ -67,8 +108,8 @@ string mode_from_mode_object(Value obj) {
result = "VERBOSE"
) and
exists(int flag |
flag = Value::named("sre_constants.SRE_FLAG_" + result).(ObjectInternal).intValue() and
obj.(ObjectInternal).intValue().bitAnd(flag) = flag
flag = Value::named("sre_constants.SRE_FLAG_" + result).(OI::ObjectInternal).intValue() and
obj.(OI::ObjectInternal).intValue().bitAnd(flag) = flag
)
}