Merge pull request #14725 from RasmusWL/re-modeling

Python: Add taint-flow modeling for `re` module
This commit is contained in:
Rasmus Wriedt Larsen
2023-11-23 11:35:36 +01:00
committed by GitHub
9 changed files with 342 additions and 1 deletions

View File

@@ -0,0 +1,4 @@
---
category: minorAnalysis
---
* Added taint-flow modeling for regular expressions using the `re` module from the standard library.

View File

@@ -10,6 +10,7 @@ import LocalSources
private import semmle.python.essa.SsaCompute
private import semmle.python.dataflow.new.internal.ImportStar
private import FlowSummaryImpl as FlowSummaryImpl
private import semmle.python.frameworks.data.ModelsAsData
/**
* IPA type for data flow nodes.
@@ -587,6 +588,11 @@ newtype TContent =
or
// Arguments can overflow and end up in the starred parameter tuple.
exists(any(CallNode cn).getArg(index))
or
// since flow summaries might use tuples, we ensure that we at least have valid
// TTupleElementContent for the 0..7 (7 was picked to match `small_tuple` in
// data-flow-private)
index in [0 .. 7]
} or
/** An element of a dictionary under a specific key. */
TDictionaryElementContent(string key) {
@@ -597,7 +603,30 @@ newtype TContent =
/** An element of a dictionary under any key. */
TDictionaryElementAnyContent() or
/** An object attribute. */
TAttributeContent(string attr) { attr = any(Attribute a).getName() }
TAttributeContent(string attr) {
// every attribute name that syntactically occurs in the database
attr = any(Attribute a).getName()
or
// Flow summaries that target attributes rely on a TAttributeContent being
// available. However, since the code above only constructs a TAttributeContent
// based on the attribute names seen in the DB, we can end up in a scenario where
// flow summaries don't work due to missing TAttributeContent. To get around this,
// we need to add the attribute names used by flow summaries. This needs to be done
// both for the summaries written in QL and the ones written in data-extension
// files.
//
// 1) Summaries in QL. Sadly the following code leads to non-monotonic recursion
// attr = any(AccessPathToken a).getAnArgument("Attribute")
// instead we use a qltest to alert if we write a new summary in QL that uses an
// attribute -- see
// python/ql/test/experimental/dataflow/summaries-checks/missing-attribute-content.ql
//
// The hardcoded names below are the attributes referenced by the QL-written flow
// summaries for the `re` module (`Attribute[re]`, `Attribute[string]`,
// `Attribute[pattern]`); extend this list when a new QL summary uses another one.
attr in ["re", "string", "pattern"]
or
//
// 2) summaries in data-extension files
//
// The regex extracts `name` from every `Attribute[name]` access-path component
// (at the start of the path or directly after a `.`) of the input and output
// specs of each relevant summary model; `trim()` strips surrounding whitespace.
exists(string input, string output | ModelOutput::relevantSummaryModel(_, _, input, output, _) |
attr = [input, output].regexpFind("(?<=(^|\\.)Attribute\\[)[^\\]]+(?=\\])", _, _).trim()
)
}
/**
* A data-flow value can have associated content.

View File

@@ -3069,6 +3069,212 @@ private module StdlibPrivate {
override string getName() { result = "re." + method }
}
/**
 * Flow summary for `re.compile`, i.e. for the creation of compiled regex objects.
 *
 * The pattern passed to `re.compile` (positionally or as `pattern=`) is stored,
 * value-preserving, in the `pattern` attribute of the returned object.
 *
 * See https://docs.python.org/3.11/library/re.html#re-objects
 */
class RePatternSummary extends SummarizedCallable {
  RePatternSummary() { this = "re.Pattern" }

  override DataFlow::CallCfgNode getACall() {
    exists(API::Node compile | compile = API::moduleImport("re").getMember("compile") |
      result = compile.getACall()
    )
  }

  override DataFlow::ArgumentNode getACallback() {
    exists(API::Node compile | compile = API::moduleImport("re").getMember("compile") |
      result = compile.getAValueReachableFromSource()
    )
  }

  override predicate propagatesFlowExt(string input, string output, boolean preservesValue) {
    // pattern argument --> `<compiled>.pattern`
    preservesValue = true and
    output = "ReturnValue.Attribute[pattern]" and
    input = ["Argument[0]", "Argument[pattern:]"]
  }
}
/**
 * Flow summary for the calls that produce a `re.Match` object: `match`, `search`
 * and `fullmatch`, either as functions of the `re` module (`"re.Match"`) or as
 * methods on the result of `re.compile` (`"compiled re.Match"`).
 *
 * See https://docs.python.org/3/library/re.html#re.Match
 */
class ReMatchSummary extends SummarizedCallable {
  ReMatchSummary() { this = ["re.Match", "compiled re.Match"] }

  override DataFlow::CallCfgNode getACall() {
    exists(string name | name = ["match", "search", "fullmatch"] |
      // method call on the value returned by `re.compile(...)`
      this = "compiled re.Match" and
      exists(API::CallNode compileCall | compileCall = any(RePatternSummary p).getACall() |
        result = compileCall.getReturn().getMember(name).getACall()
      )
      or
      // direct call to the module-level function
      this = "re.Match" and
      result = API::moduleImport("re").getMember(name).getACall()
    )
  }

  override DataFlow::ArgumentNode getACallback() { none() }

  override predicate propagatesFlowExt(string input, string output, boolean preservesValue) {
    // The regex pattern ends up in `<match>.re.pattern`.
    preservesValue = true and
    output = "ReturnValue.Attribute[re].Attribute[pattern]" and
    (
      // for compiled regexes, the pattern is already stored in the `pattern` attribute
      this = "compiled re.Match" and input = "Argument[self].Attribute[pattern]"
      or
      this = "re.Match" and input = ["Argument[0]", "Argument[pattern:]"]
    )
    or
    // The string being searched.
    exists(string positional |
      // module-level functions take the pattern first, so the string comes second
      this = "compiled re.Match" and positional = "Argument[0]"
      or
      this = "re.Match" and positional = "Argument[1]"
    |
      input = [positional, "Argument[string:]"] and
      (
        // Indexing such as `match[g]` is the same as `match.group(g)`. Since you can
        // index with both integers and strings, this is modeled as both list element
        // and dictionary element -- a bit of a hack, but there is no way to model
        // subscript operators directly with flow summaries.
        preservesValue = false and
        output = ["ReturnValue.ListElement", "ReturnValue.DictionaryElementAny"]
        or
        // the searched string itself is available as `<match>.string`
        preservesValue = true and
        output = "ReturnValue.Attribute[string]"
      )
    )
  }
}
/**
 * Flow summary for the methods `expand`, `group`, `groups` and `groupdict` called
 * on a `re.Match` object obtained from a call modeled by `ReMatchSummary`.
 *
 * All flow described here is taint flow (not value-preserving).
 *
 * See https://docs.python.org/3/library/re.html#re.Match
 */
class ReMatchMethodsSummary extends SummarizedCallable {
  string methodName;

  ReMatchMethodsSummary() {
    methodName = ["expand", "group", "groups", "groupdict"] and
    this = "re.Match." + methodName
  }

  override DataFlow::CallCfgNode getACall() {
    exists(API::CallNode matchCall | matchCall = any(ReMatchSummary m).getACall() |
      result = matchCall.getReturn().getMember(methodName).getACall()
    )
  }

  override DataFlow::ArgumentNode getACallback() { none() }

  override predicate propagatesFlowExt(string input, string output, boolean preservesValue) {
    preservesValue = false and
    (
      // `expand`: both the template argument and the matched string reach the result
      methodName = "expand" and
      input = ["Argument[0]", "Argument[self].Attribute[string]"] and
      output = "ReturnValue"
      or
      // the remaining methods only expose (parts of) the matched string
      input = "Argument[self].Attribute[string]" and
      (
        methodName = "group" and
        output = ["ReturnValue", "ReturnValue.ListElement"]
        or
        methodName = "groups" and
        output = "ReturnValue.ListElement"
        or
        methodName = "groupdict" and
        output = "ReturnValue.DictionaryElementAny"
      )
    )
  }
}
/**
 * Flow summary for the `re` operations that do not return a `re.Match` object:
 * `split`, `findall`, `finditer`, `sub` and `subn`, either as functions of the `re`
 * module (`"re.<name>"`) or as methods on the result of `re.compile`
 * (`"compiled re.<name>"`).
 *
 * All flow described here is taint flow (not value-preserving).
 *
 * See https://docs.python.org/3/library/re.html#functions
 */
class ReFunctionsSummary extends SummarizedCallable {
  string methodName;

  ReFunctionsSummary() {
    this = ["re.", "compiled re."] + methodName and
    methodName = ["split", "findall", "finditer", "sub", "subn"]
  }

  override DataFlow::CallCfgNode getACall() {
    // method call on the value returned by `re.compile(...)`
    this = "compiled re." + methodName and
    exists(API::CallNode compileCall | compileCall = any(RePatternSummary p).getACall() |
      result = compileCall.getReturn().getMember(methodName).getACall()
    )
    or
    // direct call to the module-level function
    this = "re." + methodName and
    result = API::moduleImport("re").getMember(methodName).getACall()
  }

  override DataFlow::ArgumentNode getACallback() { none() }

  override predicate propagatesFlowExt(string input, string output, boolean preservesValue) {
    preservesValue = false and
    exists(int shift |
      // The module-level functions take the pattern as their first argument, whereas
      // the methods on a compiled regex do not; positional indices of the latter are
      // therefore one lower.
      this = "compiled re." + methodName and shift = 1
      or
      this = "re." + methodName and shift = 0
    |
      // flow from input string to results
      exists(int stringIdx |
        stringIdx = methodName.(RegexExecutionMethod).getStringArgIndex() - shift
      |
        input = ["Argument[" + stringIdx + "]", "Argument[string:]"]
      ) and
      (
        // TODO: Since we currently model iterables as tainted when their elements
        // are, the result of findall, finditer, split needs to be tainted as well
        // (hence `ReturnValue` in addition to `ReturnValue.ListElement`).
        methodName = ["split", "findall", "finditer"] and
        output = ["ReturnValue.ListElement", "ReturnValue"]
        or
        methodName = "sub" and
        output = "ReturnValue"
        or
        methodName = "subn" and
        output = "ReturnValue.TupleElement[0]"
      )
      or
      // flow from replacement value for substitution
      exists(string replSpec |
        replSpec = ["Argument[" + (1 - shift) + "]", "Argument[repl:]"]
      |
        // `repl` can also be a function, in which case its return value is used
        input = [replSpec, replSpec + ".ReturnValue"]
      ) and
      (
        methodName = "sub" and
        output = "ReturnValue"
        or
        methodName = "subn" and
        output = "ReturnValue.TupleElement[0]"
      )
    )
  }
}
/**
* A call to 're.escape'.
* See https://docs.python.org/3/library/re.html#re.escape

View File

@@ -0,0 +1 @@
# an empty file, since we want the test to run on an empty db

View File

@@ -0,0 +1,8 @@
import python
import semmle.python.dataflow.new.FlowSummary
import semmle.python.dataflow.new.internal.FlowSummaryImpl
/**
 * Holds if `s` is an input or output specification of the summarized callable `sc`
 * and `c` is a component of `s` that is reported as invalid by
 * `Private::External::invalidSpecComponent`.
 */
query predicate invalidSpecComponent(SummarizedCallable sc, string s, string c) {
  Private::External::invalidSpecComponent(s, c) and
  exists(string input, string output | sc.propagatesFlowExt(input, output, _) |
    s = [input, output]
  )
}

View File

@@ -0,0 +1,11 @@
import python
import semmle.python.dataflow.new.FlowSummary
import semmle.python.dataflow.new.internal.FlowSummaryImpl
// Reports every `Attribute[<attr>]` component, used in the input or output
// specification of a summarized callable, that is flagged as an invalid spec
// component.
from SummarizedCallable sc, string s, string c, string attr
where
  c = "Attribute[" + attr + "]" and
  Private::External::invalidSpecComponent(s, c) and
  exists(string input, string output | sc.propagatesFlowExt(input, output, _) |
    s = [input, output]
  )
select "The attribute \"" + attr +
    "\" is not a valid TAttributeContent, please add it to the hardcoded list of TAttributeContent in the dataflow library."

View File

@@ -0,0 +1,82 @@
import re

# Taint-flow test cases for the standard-library `re` module.
#
# NOTE(review): `TAINTED_STRING`, `ensure_tainted` and `ensure_not_tainted` are not
# defined in this file -- presumably they are supplied by the taint-test harness.
# Trailing `# $ tainted` comments are the inline expectations; keep them verbatim.
ts = TAINTED_STRING

pat = ... # some pattern
compiled_pat = re.compile(pat)

# see https://docs.python.org/3/library/re.html#functions
ensure_not_tainted(
    # returns Match object, which is tested properly below. (note: with the flow summary
    # modeling, objects containing tainted values are not themselves tainted).
    re.search(pat, ts),
    re.match(pat, ts),
    re.fullmatch(pat, ts),

    compiled_pat.search(ts),
    compiled_pat.match(ts),
    compiled_pat.fullmatch(ts),
)

# Match object
tainted_match = re.match(pat, ts)
safe_match = re.match(pat, "safe")
ensure_tainted(
    # The template is a raw string so that `\1` is the group-1 backreference that
    # `Match.expand` substitutes; in a non-raw literal `"\1"` is the octal escape
    # for chr(1) and nothing from the match would be expanded.
    tainted_match.expand(r"Hello \1"), # $ tainted
    safe_match.expand(ts), # $ tainted
    tainted_match.group(), # $ tainted
    tainted_match.group(1, 2), # $ tainted
    tainted_match.group(1, 2)[0], # $ tainted
    tainted_match[0], # $ tainted
    tainted_match["key"], # $ tainted

    tainted_match.groups()[0], # $ tainted
    tainted_match.groupdict()["key"], # $ tainted

    re.match(pat, ts).string, # $ tainted
    re.match(ts, "safe").re.pattern, # $ tainted

    compiled_pat.match(ts).string, # $ tainted
    re.compile(ts).match("safe").re.pattern, # $ tainted
)
ensure_not_tainted(
    # raw string for the same reason as above: `\1` must reach `expand` unchanged
    safe_match.expand(r"Hello \1"),
    safe_match.group(),

    re.match(pat, "safe").re,
    re.match(pat, "safe").string,
)

ensure_tainted(
    # other functions not returning Match objects
    re.split(pat, ts), # $ tainted
    re.split(pat, ts)[0], # $ tainted

    re.findall(pat, ts), # $ tainted
    re.findall(pat, ts)[0], # $ tainted

    re.finditer(pat, ts), # $ tainted
    [x for x in re.finditer(pat, ts)], # $ tainted

    re.sub(pat, repl="safe", string=ts), # $ tainted
    re.sub(pat, repl=lambda m: ..., string=ts), # $ tainted
    re.sub(pat, repl=ts, string="safe"), # $ tainted
    re.sub(pat, repl=lambda m: ts, string="safe"), # $ tainted

    # same for compiled patterns
    compiled_pat.split(ts), # $ tainted
    compiled_pat.split(ts)[0], # $ tainted
    # ...

    # user-controlled compiled pattern
    re.compile(ts), # $ tainted
    re.compile(ts).pattern, # $ tainted
)

ensure_not_tainted(
    re.subn(pat, repl="safe", string=ts),
    re.subn(pat, repl="safe", string=ts)[1], # // the number of substitutions made
)
ensure_tainted(
    re.subn(pat, repl="safe", string=ts)[0], # $ tainted // the string
)