mirror of
https://github.com/github/codeql.git
synced 2025-12-21 11:16:30 +01:00
Merge pull request #14725 from RasmusWL/re-modeling
Python: Add taint-flow modeling for `re` module
This commit is contained in:
4
python/ql/lib/change-notes/2023-11-08-re-modeling.md
Normal file
4
python/ql/lib/change-notes/2023-11-08-re-modeling.md
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
---
|
||||||
|
category: minorAnalysis
|
||||||
|
---
|
||||||
|
* Added taint-flow modeling for regular expressions using the `re` module from the standard library.
|
||||||
@@ -10,6 +10,7 @@ import LocalSources
|
|||||||
private import semmle.python.essa.SsaCompute
|
private import semmle.python.essa.SsaCompute
|
||||||
private import semmle.python.dataflow.new.internal.ImportStar
|
private import semmle.python.dataflow.new.internal.ImportStar
|
||||||
private import FlowSummaryImpl as FlowSummaryImpl
|
private import FlowSummaryImpl as FlowSummaryImpl
|
||||||
|
private import semmle.python.frameworks.data.ModelsAsData
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* IPA type for data flow nodes.
|
* IPA type for data flow nodes.
|
||||||
@@ -587,6 +588,11 @@ newtype TContent =
|
|||||||
or
|
or
|
||||||
// Arguments can overflow and end up in the starred parameter tuple.
|
// Arguments can overflow and end up in the starred parameter tuple.
|
||||||
exists(any(CallNode cn).getArg(index))
|
exists(any(CallNode cn).getArg(index))
|
||||||
|
or
|
||||||
|
// since flow summaries might use tuples, we ensure that we at least have valid
|
||||||
|
// TTupleElementContent for the indices 0..7 (7 was picked to match `small_tuple` in
|
||||||
|
// data-flow-private)
|
||||||
|
index in [0 .. 7]
|
||||||
} or
|
} or
|
||||||
/** An element of a dictionary under a specific key. */
|
/** An element of a dictionary under a specific key. */
|
||||||
TDictionaryElementContent(string key) {
|
TDictionaryElementContent(string key) {
|
||||||
@@ -597,7 +603,30 @@ newtype TContent =
|
|||||||
/** An element of a dictionary under any key. */
|
/** An element of a dictionary under any key. */
|
||||||
TDictionaryElementAnyContent() or
|
TDictionaryElementAnyContent() or
|
||||||
/** An object attribute. */
|
/** An object attribute. */
|
||||||
TAttributeContent(string attr) { attr = any(Attribute a).getName() }
|
TAttributeContent(string attr) {
|
||||||
|
attr = any(Attribute a).getName()
|
||||||
|
or
|
||||||
|
// Flow summaries that target attributes rely on a TAttributeContent being
|
||||||
|
// available. However, since the code above only constructs a TAttributeContent
|
||||||
|
// based on the attribute names seen in the DB, we can end up in a scenario where
|
||||||
|
// flow summaries don't work due to missing TAttributeContent. To get around this,
|
||||||
|
// we need to add the attribute names used by flow summaries. This needs to be done
|
||||||
|
// both for the summaries written in QL and the ones written in data-extension
|
||||||
|
// files.
|
||||||
|
//
|
||||||
|
// 1) Summaries in QL. Sadly the following code leads to non-monotonic recursion
|
||||||
|
// attr = any(AccessPathToken a).getAnArgument("Attribute")
|
||||||
|
// instead we use a qltest to alert if we write a new summary in QL that uses an
|
||||||
|
// attribute -- see
|
||||||
|
// python/ql/test/experimental/dataflow/summaries-checks/missing-attribute-content.ql
|
||||||
|
attr in ["re", "string", "pattern"]
|
||||||
|
or
|
||||||
|
//
|
||||||
|
// 2) Summaries in data-extension files
|
||||||
|
exists(string input, string output | ModelOutput::relevantSummaryModel(_, _, input, output, _) |
|
||||||
|
attr = [input, output].regexpFind("(?<=(^|\\.)Attribute\\[)[^\\]]+(?=\\])", _, _).trim()
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A data-flow value can have associated content.
|
* A data-flow value can have associated content.
|
||||||
|
|||||||
@@ -3069,6 +3069,212 @@ private module StdlibPrivate {
|
|||||||
override string getName() { result = "re." + method }
|
override string getName() { result = "re." + method }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A flow summary for compiled regex objects
|
||||||
|
*
|
||||||
|
* See https://docs.python.org/3.11/library/re.html#re-objects
|
||||||
|
*/
|
||||||
|
class RePatternSummary extends SummarizedCallable {
|
||||||
|
RePatternSummary() { this = "re.Pattern" }
|
||||||
|
|
||||||
|
override DataFlow::CallCfgNode getACall() {
|
||||||
|
result = API::moduleImport("re").getMember("compile").getACall()
|
||||||
|
}
|
||||||
|
|
||||||
|
override DataFlow::ArgumentNode getACallback() {
|
||||||
|
result = API::moduleImport("re").getMember("compile").getAValueReachableFromSource()
|
||||||
|
}
|
||||||
|
|
||||||
|
override predicate propagatesFlowExt(string input, string output, boolean preservesValue) {
|
||||||
|
input in ["Argument[0]", "Argument[pattern:]"] and
|
||||||
|
output = "ReturnValue.Attribute[pattern]" and
|
||||||
|
preservesValue = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A flow summary for methods returning a `re.Match` object
|
||||||
|
*
|
||||||
|
* See https://docs.python.org/3/library/re.html#re.Match
|
||||||
|
*/
|
||||||
|
class ReMatchSummary extends SummarizedCallable {
|
||||||
|
ReMatchSummary() { this = ["re.Match", "compiled re.Match"] }
|
||||||
|
|
||||||
|
override DataFlow::CallCfgNode getACall() {
|
||||||
|
this = "re.Match" and
|
||||||
|
result = API::moduleImport("re").getMember(["match", "search", "fullmatch"]).getACall()
|
||||||
|
or
|
||||||
|
this = "compiled re.Match" and
|
||||||
|
result =
|
||||||
|
any(RePatternSummary c)
|
||||||
|
.getACall()
|
||||||
|
.(API::CallNode)
|
||||||
|
.getReturn()
|
||||||
|
.getMember(["match", "search", "fullmatch"])
|
||||||
|
.getACall()
|
||||||
|
}
|
||||||
|
|
||||||
|
override DataFlow::ArgumentNode getACallback() { none() }
|
||||||
|
|
||||||
|
override predicate propagatesFlowExt(string input, string output, boolean preservesValue) {
|
||||||
|
exists(string arg |
|
||||||
|
this = "re.Match" and arg = "Argument[1]"
|
||||||
|
or
|
||||||
|
this = "compiled re.Match" and arg = "Argument[0]"
|
||||||
|
|
|
||||||
|
input in [arg, "Argument[string:]"] and
|
||||||
|
(
|
||||||
|
output = "ReturnValue.Attribute[string]" and
|
||||||
|
preservesValue = true
|
||||||
|
or
|
||||||
|
// indexing such as `match[g]` is the same as `match.group(g)`
|
||||||
|
// since you can index with both integers and strings, we model it as
|
||||||
|
// both list element and dictionary... a bit of a hack, but no way to model
|
||||||
|
// subscript operators directly with flow-summaries :|
|
||||||
|
output in ["ReturnValue.ListElement", "ReturnValue.DictionaryElementAny"] and
|
||||||
|
preservesValue = false
|
||||||
|
)
|
||||||
|
)
|
||||||
|
or
|
||||||
|
// regex pattern
|
||||||
|
(
|
||||||
|
this = "re.Match" and input in ["Argument[0]", "Argument[pattern:]"]
|
||||||
|
or
|
||||||
|
// for compiled regexes, the pattern is already stored in the `pattern` attribute
|
||||||
|
this = "compiled re.Match" and input = "Argument[self].Attribute[pattern]"
|
||||||
|
) and
|
||||||
|
output = "ReturnValue.Attribute[re].Attribute[pattern]" and
|
||||||
|
preservesValue = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A flow summary for methods on a `re.Match` object
|
||||||
|
*
|
||||||
|
* See https://docs.python.org/3/library/re.html#re.Match
|
||||||
|
*/
|
||||||
|
class ReMatchMethodsSummary extends SummarizedCallable {
|
||||||
|
string methodName;
|
||||||
|
|
||||||
|
ReMatchMethodsSummary() {
|
||||||
|
this = "re.Match." + methodName and
|
||||||
|
methodName in ["expand", "group", "groups", "groupdict"]
|
||||||
|
}
|
||||||
|
|
||||||
|
override DataFlow::CallCfgNode getACall() {
|
||||||
|
result =
|
||||||
|
any(ReMatchSummary c)
|
||||||
|
.getACall()
|
||||||
|
.(API::CallNode)
|
||||||
|
.getReturn()
|
||||||
|
.getMember(methodName)
|
||||||
|
.getACall()
|
||||||
|
}
|
||||||
|
|
||||||
|
override DataFlow::ArgumentNode getACallback() { none() }
|
||||||
|
|
||||||
|
override predicate propagatesFlowExt(string input, string output, boolean preservesValue) {
|
||||||
|
methodName = "expand" and
|
||||||
|
preservesValue = false and
|
||||||
|
(
|
||||||
|
input = "Argument[0]" and output = "ReturnValue"
|
||||||
|
or
|
||||||
|
input = "Argument[self].Attribute[string]" and
|
||||||
|
output = "ReturnValue"
|
||||||
|
)
|
||||||
|
or
|
||||||
|
methodName = "group" and
|
||||||
|
input = "Argument[self].Attribute[string]" and
|
||||||
|
output in ["ReturnValue", "ReturnValue.ListElement"] and
|
||||||
|
preservesValue = false
|
||||||
|
or
|
||||||
|
methodName = "groups" and
|
||||||
|
input = "Argument[self].Attribute[string]" and
|
||||||
|
output = "ReturnValue.ListElement" and
|
||||||
|
preservesValue = false
|
||||||
|
or
|
||||||
|
methodName = "groupdict" and
|
||||||
|
input = "Argument[self].Attribute[string]" and
|
||||||
|
output = "ReturnValue.DictionaryElementAny" and
|
||||||
|
preservesValue = false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A flow summary for `re` methods not returning a `re.Match` object
|
||||||
|
*
|
||||||
|
* See https://docs.python.org/3/library/re.html#functions
|
||||||
|
*/
|
||||||
|
class ReFunctionsSummary extends SummarizedCallable {
|
||||||
|
string methodName;
|
||||||
|
|
||||||
|
ReFunctionsSummary() {
|
||||||
|
methodName in ["split", "findall", "finditer", "sub", "subn"] and
|
||||||
|
this = ["re.", "compiled re."] + methodName
|
||||||
|
}
|
||||||
|
|
||||||
|
override DataFlow::CallCfgNode getACall() {
|
||||||
|
this = "re." + methodName and
|
||||||
|
result = API::moduleImport("re").getMember(methodName).getACall()
|
||||||
|
or
|
||||||
|
this = "compiled re." + methodName and
|
||||||
|
result =
|
||||||
|
any(RePatternSummary c)
|
||||||
|
.getACall()
|
||||||
|
.(API::CallNode)
|
||||||
|
.getReturn()
|
||||||
|
.getMember(methodName)
|
||||||
|
.getACall()
|
||||||
|
}
|
||||||
|
|
||||||
|
override DataFlow::ArgumentNode getACallback() { none() }
|
||||||
|
|
||||||
|
override predicate propagatesFlowExt(string input, string output, boolean preservesValue) {
|
||||||
|
exists(int offset |
|
||||||
|
// for non-compiled regex the first argument is the pattern, so we need to
|
||||||
|
// account for this difference
|
||||||
|
this = "re." + methodName and offset = 0
|
||||||
|
or
|
||||||
|
this = "compiled re." + methodName and offset = 1
|
||||||
|
|
|
||||||
|
// flow from input string to results
|
||||||
|
exists(int arg | arg = methodName.(RegexExecutionMethod).getStringArgIndex() - offset |
|
||||||
|
preservesValue = false and
|
||||||
|
input in ["Argument[" + arg + "]", "Argument[string:]"] and
|
||||||
|
(
|
||||||
|
methodName in ["split", "findall", "finditer"] and
|
||||||
|
output = "ReturnValue.ListElement"
|
||||||
|
or
|
||||||
|
// TODO: Since we currently model iterables as tainted when their elements
|
||||||
|
// are, the result of findall, finditer, split needs to be tainted
|
||||||
|
methodName in ["split", "findall", "finditer"] and
|
||||||
|
output = "ReturnValue"
|
||||||
|
or
|
||||||
|
methodName = "sub" and
|
||||||
|
output = "ReturnValue"
|
||||||
|
or
|
||||||
|
methodName = "subn" and
|
||||||
|
output = "ReturnValue.TupleElement[0]"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
or
|
||||||
|
// flow from replacement value for substitution
|
||||||
|
exists(string argumentSpec |
|
||||||
|
argumentSpec in ["Argument[" + (1 - offset) + "]", "Argument[repl:]"] and
|
||||||
|
// `repl` can also be a function
|
||||||
|
input = [argumentSpec, argumentSpec + ".ReturnValue"]
|
||||||
|
|
|
||||||
|
(
|
||||||
|
methodName = "sub" and output = "ReturnValue"
|
||||||
|
or
|
||||||
|
methodName = "subn" and output = "ReturnValue.TupleElement[0]"
|
||||||
|
) and
|
||||||
|
preservesValue = false
|
||||||
|
)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A call to 're.escape'.
|
* A call to 're.escape'.
|
||||||
* See https://docs.python.org/3/library/re.html#re.escape
|
* See https://docs.python.org/3/library/re.html#re.escape
|
||||||
|
|||||||
@@ -0,0 +1 @@
|
|||||||
|
# an empty file, since we want the test to run on an empty db
|
||||||
@@ -0,0 +1,8 @@
|
|||||||
|
import python
|
||||||
|
import semmle.python.dataflow.new.FlowSummary
|
||||||
|
import semmle.python.dataflow.new.internal.FlowSummaryImpl
|
||||||
|
|
||||||
|
query predicate invalidSpecComponent(SummarizedCallable sc, string s, string c) {
|
||||||
|
(sc.propagatesFlowExt(s, _, _) or sc.propagatesFlowExt(_, s, _)) and
|
||||||
|
Private::External::invalidSpecComponent(s, c)
|
||||||
|
}
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
import python
|
||||||
|
import semmle.python.dataflow.new.FlowSummary
|
||||||
|
import semmle.python.dataflow.new.internal.FlowSummaryImpl
|
||||||
|
|
||||||
|
from SummarizedCallable sc, string s, string c, string attr
|
||||||
|
where
|
||||||
|
(sc.propagatesFlowExt(s, _, _) or sc.propagatesFlowExt(_, s, _)) and
|
||||||
|
Private::External::invalidSpecComponent(s, c) and
|
||||||
|
c = "Attribute[" + attr + "]"
|
||||||
|
select "The attribute \"" + attr +
|
||||||
|
"\" is not a valid TAttributeContent, please add it to the hardcoded list of TAttributeContent in the dataflow library."
|
||||||
82
python/ql/test/library-tests/frameworks/stdlib/test_re.py
Normal file
82
python/ql/test/library-tests/frameworks/stdlib/test_re.py
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
ts = TAINTED_STRING
|
||||||
|
|
||||||
|
pat = ... # some pattern
|
||||||
|
compiled_pat = re.compile(pat)
|
||||||
|
|
||||||
|
# see https://docs.python.org/3/library/re.html#functions
|
||||||
|
ensure_not_tainted(
|
||||||
|
# returns Match object, which is tested properly below. (note: with the flow summary
|
||||||
|
# modeling, objects containing tainted values are not themselves tainted).
|
||||||
|
re.search(pat, ts),
|
||||||
|
re.match(pat, ts),
|
||||||
|
re.fullmatch(pat, ts),
|
||||||
|
|
||||||
|
compiled_pat.search(ts),
|
||||||
|
compiled_pat.match(ts),
|
||||||
|
compiled_pat.fullmatch(ts),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Match object
|
||||||
|
tainted_match = re.match(pat, ts)
|
||||||
|
safe_match = re.match(pat, "safe")
|
||||||
|
ensure_tainted(
|
||||||
|
tainted_match.expand("Hello \1"), # $ tainted
|
||||||
|
safe_match.expand(ts), # $ tainted
|
||||||
|
tainted_match.group(), # $ tainted
|
||||||
|
tainted_match.group(1, 2), # $ tainted
|
||||||
|
tainted_match.group(1, 2)[0], # $ tainted
|
||||||
|
tainted_match[0], # $ tainted
|
||||||
|
tainted_match["key"], # $ tainted
|
||||||
|
|
||||||
|
tainted_match.groups()[0], # $ tainted
|
||||||
|
tainted_match.groupdict()["key"], # $ tainted
|
||||||
|
|
||||||
|
re.match(pat, ts).string, # $ tainted
|
||||||
|
re.match(ts, "safe").re.pattern, # $ tainted
|
||||||
|
|
||||||
|
compiled_pat.match(ts).string, # $ tainted
|
||||||
|
re.compile(ts).match("safe").re.pattern, # $ tainted
|
||||||
|
)
|
||||||
|
ensure_not_tainted(
|
||||||
|
safe_match.expand("Hello \1"),
|
||||||
|
safe_match.group(),
|
||||||
|
|
||||||
|
re.match(pat, "safe").re,
|
||||||
|
re.match(pat, "safe").string,
|
||||||
|
)
|
||||||
|
|
||||||
|
ensure_tainted(
|
||||||
|
# other functions not returning Match objects
|
||||||
|
re.split(pat, ts), # $ tainted
|
||||||
|
re.split(pat, ts)[0], # $ tainted
|
||||||
|
|
||||||
|
re.findall(pat, ts), # $ tainted
|
||||||
|
re.findall(pat, ts)[0], # $ tainted
|
||||||
|
|
||||||
|
re.finditer(pat, ts), # $ tainted
|
||||||
|
[x for x in re.finditer(pat, ts)], # $ tainted
|
||||||
|
|
||||||
|
re.sub(pat, repl="safe", string=ts), # $ tainted
|
||||||
|
re.sub(pat, repl=lambda m: ..., string=ts), # $ tainted
|
||||||
|
re.sub(pat, repl=ts, string="safe"), # $ tainted
|
||||||
|
re.sub(pat, repl=lambda m: ts, string="safe"), # $ tainted
|
||||||
|
|
||||||
|
# same for compiled patterns
|
||||||
|
compiled_pat.split(ts), # $ tainted
|
||||||
|
compiled_pat.split(ts)[0], # $ tainted
|
||||||
|
# ...
|
||||||
|
|
||||||
|
# user-controlled compiled pattern
|
||||||
|
re.compile(ts), # $ tainted
|
||||||
|
re.compile(ts).pattern, # $ tainted
|
||||||
|
)
|
||||||
|
|
||||||
|
ensure_not_tainted(
|
||||||
|
re.subn(pat, repl="safe", string=ts),
|
||||||
|
re.subn(pat, repl="safe", string=ts)[1], # // the number of substitutions made
|
||||||
|
)
|
||||||
|
ensure_tainted(
|
||||||
|
re.subn(pat, repl="safe", string=ts)[0], # $ tainted // the string
|
||||||
|
)
|
||||||
Reference in New Issue
Block a user