Merge pull request #14725 from RasmusWL/re-modeling

Python: Add taint-flow modeling for `re` module
2025-12-21 11:16:30 +01:00 · 2023-11-23 11:35:36 +01:00
parent ef8d38e9e0 3d46129bbf
commit d056706af5
9 changed files with 342 additions and 1 deletions
--- a/python/ql/lib/change-notes/2023-11-08-re-modeling.md
+++ b/python/ql/lib/change-notes/2023-11-08-re-modeling.md
@@ -0,0 +1,4 @@
+---
+category: minorAnalysis
+---
+* Added taint-flow modeling for regular expressions with the `re` module from the standard library.
--- a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll
+++ b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll
@@ -10,6 +10,7 @@ import LocalSources
 private import semmle.python.essa.SsaCompute
 private import semmle.python.dataflow.new.internal.ImportStar
 private import FlowSummaryImpl as FlowSummaryImpl
+private import semmle.python.frameworks.data.ModelsAsData

 /**
 * IPA type for data flow nodes.
@@ -587,6 +588,11 @@ newtype TContent =
    or
    // Arguments can overflow and end up in the starred parameter tuple.
    exists(any(CallNode cn).getArg(index))
+    or
+    // since flow summaries might use tuples, we ensure that we at least have valid
+    // TTupleElementContent for the 0..7 (7 was picked to match `small_tuple` in
+    // data-flow-private)
+    index in [0 .. 7]
  } or
  /** An element of a dictionary under a specific key. */
  TDictionaryElementContent(string key) {
@@ -597,7 +603,30 @@ newtype TContent =
  /** An element of a dictionary under any key. */
  TDictionaryElementAnyContent() or
  /** An object attribute. */
-  TAttributeContent(string attr) { attr = any(Attribute a).getName() }
+  TAttributeContent(string attr) {
+    attr = any(Attribute a).getName()
+    or
+    // Flow summaries that target attributes rely on a TAttributeContent being
+    // available. However, since the code above only constructs a TAttributeContent
+    // based on the attribute names seen in the DB, we can end up in a scenario where
+    // flow summaries don't work due to missing TAttributeContent. To get around this,
+    // we need to add the attribute names used by flow summaries. This needs to be done
+    // both for the summaries written in QL and the ones written in data-extension
+    // files.
+    //
+    // 1) Summaries in QL. Sadly the following code leads to non-monotonic recursion
+    //   attr = any(AccessPathToken a).getAnArgument("Attribute")
+    // instead we use a qltest to alert if we write a new summary in QL that uses an
+    // attribute -- see
+    // python/ql/test/experimental/dataflow/summaries-checks/missing-attribute-content.ql
+    attr in ["re", "string", "pattern"]
+    or
+    //
+    // 2) summaries in data-extension files
+    exists(string input, string output | ModelOutput::relevantSummaryModel(_, _, input, output, _) |
+      attr = [input, output].regexpFind("(?<=(^|\\.)Attribute\\[)[^\\]]+(?=\\])", _, _).trim()
+    )
+  }

 /**
 * A data-flow value can have associated content.
--- a/python/ql/lib/semmle/python/frameworks/Stdlib.qll
+++ b/python/ql/lib/semmle/python/frameworks/Stdlib.qll
@@ -3069,6 +3069,212 @@ private module StdlibPrivate {
    override string getName() { result = "re." + method }
  }

+  /**
+   * A flow summary for compiled regex objects
+   *
+   * See https://docs.python.org/3.11/library/re.html#re-objects
+   */
+  class RePatternSummary extends SummarizedCallable {
+    RePatternSummary() { this = "re.Pattern" }
+
+    override DataFlow::CallCfgNode getACall() {
+      result = API::moduleImport("re").getMember("compile").getACall()
+    }
+
+    override DataFlow::ArgumentNode getACallback() {
+      result = API::moduleImport("re").getMember("compile").getAValueReachableFromSource()
+    }
+
+    override predicate propagatesFlowExt(string input, string output, boolean preservesValue) {
+      input in ["Argument[0]", "Argument[pattern:]"] and
+      output = "ReturnValue.Attribute[pattern]" and
+      preservesValue = true
+    }
+  }
+
+  /**
+   * A flow summary for methods returning a `re.Match` object
+   *
+   * See https://docs.python.org/3/library/re.html#re.Match
+   */
+  class ReMatchSummary extends SummarizedCallable {
+    ReMatchSummary() { this = ["re.Match", "compiled re.Match"] }
+
+    override DataFlow::CallCfgNode getACall() {
+      this = "re.Match" and
+      result = API::moduleImport("re").getMember(["match", "search", "fullmatch"]).getACall()
+      or
+      this = "compiled re.Match" and
+      result =
+        any(RePatternSummary c)
+            .getACall()
+            .(API::CallNode)
+            .getReturn()
+            .getMember(["match", "search", "fullmatch"])
+            .getACall()
+    }
+
+    override DataFlow::ArgumentNode getACallback() { none() }
+
+    override predicate propagatesFlowExt(string input, string output, boolean preservesValue) {
+      exists(string arg |
+        this = "re.Match" and arg = "Argument[1]"
+        or
+        this = "compiled re.Match" and arg = "Argument[0]"
+      |
+        input in [arg, "Argument[string:]"] and
+        (
+          output = "ReturnValue.Attribute[string]" and
+          preservesValue = true
+          or
+          // indexing such as `match[g]` is the same as `match.group(g)`
+          // since you can index with both integers and strings, we model it as
+          // both list element and dictionary... a bit of a hack, but no way to model
+          // subscript operators directly with flow-summaries :|
+          output in ["ReturnValue.ListElement", "ReturnValue.DictionaryElementAny"] and
+          preservesValue = false
+        )
+      )
+      or
+      // regex pattern
+      (
+        this = "re.Match" and input in ["Argument[0]", "Argument[pattern:]"]
+        or
+        // for compiled regexes, the pattern is already stored in the `pattern` attribute
+        this = "compiled re.Match" and input = "Argument[self].Attribute[pattern]"
+      ) and
+      output = "ReturnValue.Attribute[re].Attribute[pattern]" and
+      preservesValue = true
+    }
+  }
+
+  /**
+   * A flow summary for methods on a `re.Match` object
+   *
+   * See https://docs.python.org/3/library/re.html#re.Match
+   */
+  class ReMatchMethodsSummary extends SummarizedCallable {
+    string methodName;
+
+    ReMatchMethodsSummary() {
+      this = "re.Match." + methodName and
+      methodName in ["expand", "group", "groups", "groupdict"]
+    }
+
+    override DataFlow::CallCfgNode getACall() {
+      result =
+        any(ReMatchSummary c)
+            .getACall()
+            .(API::CallNode)
+            .getReturn()
+            .getMember(methodName)
+            .getACall()
+    }
+
+    override DataFlow::ArgumentNode getACallback() { none() }
+
+    override predicate propagatesFlowExt(string input, string output, boolean preservesValue) {
+      methodName = "expand" and
+      preservesValue = false and
+      (
+        input = "Argument[0]" and output = "ReturnValue"
+        or
+        input = "Argument[self].Attribute[string]" and
+        output = "ReturnValue"
+      )
+      or
+      methodName = "group" and
+      input = "Argument[self].Attribute[string]" and
+      output in ["ReturnValue", "ReturnValue.ListElement"] and
+      preservesValue = false
+      or
+      methodName = "groups" and
+      input = "Argument[self].Attribute[string]" and
+      output = "ReturnValue.ListElement" and
+      preservesValue = false
+      or
+      methodName = "groupdict" and
+      input = "Argument[self].Attribute[string]" and
+      output = "ReturnValue.DictionaryElementAny" and
+      preservesValue = false
+    }
+  }
+
+  /**
+   * A flow summary for `re` methods not returning a `re.Match` object
+   *
+   * See https://docs.python.org/3/library/re.html#functions
+   */
+  class ReFunctionsSummary extends SummarizedCallable {
+    string methodName;
+
+    ReFunctionsSummary() {
+      methodName in ["split", "findall", "finditer", "sub", "subn"] and
+      this = ["re.", "compiled re."] + methodName
+    }
+
+    override DataFlow::CallCfgNode getACall() {
+      this = "re." + methodName and
+      result = API::moduleImport("re").getMember(methodName).getACall()
+      or
+      this = "compiled re." + methodName and
+      result =
+        any(RePatternSummary c)
+            .getACall()
+            .(API::CallNode)
+            .getReturn()
+            .getMember(methodName)
+            .getACall()
+    }
+
+    override DataFlow::ArgumentNode getACallback() { none() }
+
+    override predicate propagatesFlowExt(string input, string output, boolean preservesValue) {
+      exists(int offset |
+        // for non-compiled regex the first argument is the pattern, so we need to
+        // account for this difference
+        this = "re." + methodName and offset = 0
+        or
+        this = "compiled re." + methodName and offset = 1
+      |
+        // flow from input string to results
+        exists(int arg | arg = methodName.(RegexExecutionMethod).getStringArgIndex() - offset |
+          preservesValue = false and
+          input in ["Argument[" + arg + "]", "Argument[string:]"] and
+          (
+            methodName in ["split", "findall", "finditer"] and
+            output = "ReturnValue.ListElement"
+            or
+            // TODO: Since we currently model iterables as tainted when their elements
+            // are, the result of findall, finditer, split needs to be tainted
+            methodName in ["split", "findall", "finditer"] and
+            output = "ReturnValue"
+            or
+            methodName = "sub" and
+            output = "ReturnValue"
+            or
+            methodName = "subn" and
+            output = "ReturnValue.TupleElement[0]"
+          )
+        )
+        or
+        // flow from replacement value for substitution
+        exists(string argumentSpec |
+          argumentSpec in ["Argument[" + (1 - offset) + "]", "Argument[repl:]"] and
+          // `repl` can also be a function
+          input = [argumentSpec, argumentSpec + ".ReturnValue"]
+        |
+          (
+            methodName = "sub" and output = "ReturnValue"
+            or
+            methodName = "subn" and output = "ReturnValue.TupleElement[0]"
+          ) and
+          preservesValue = false
+        )
+      )
+    }
+  }
+
  /**
   * A call to 're.escape'.
   * See https://docs.python.org/3/library/re.html#re.escape
--- a/python/ql/test/experimental/dataflow/summaries-checks/dummy.py
+++ b/python/ql/test/experimental/dataflow/summaries-checks/dummy.py
@@ -0,0 +1 @@
+# an empty file, since we want the test to run on an empty db
--- a/python/ql/test/experimental/dataflow/summaries-checks/invalid-spec.expected
+++ b/python/ql/test/experimental/dataflow/summaries-checks/invalid-spec.expected
--- a/python/ql/test/experimental/dataflow/summaries-checks/invalid-spec.ql
+++ b/python/ql/test/experimental/dataflow/summaries-checks/invalid-spec.ql
@@ -0,0 +1,8 @@
+import python
+import semmle.python.dataflow.new.FlowSummary
+import semmle.python.dataflow.new.internal.FlowSummaryImpl
+
+query predicate invalidSpecComponent(SummarizedCallable sc, string s, string c) {
+  (sc.propagatesFlowExt(s, _, _) or sc.propagatesFlowExt(_, s, _)) and
+  Private::External::invalidSpecComponent(s, c)
+}
--- a/python/ql/test/experimental/dataflow/summaries-checks/missing-attribute-content.expected
+++ b/python/ql/test/experimental/dataflow/summaries-checks/missing-attribute-content.expected
--- a/python/ql/test/experimental/dataflow/summaries-checks/missing-attribute-content.ql
+++ b/python/ql/test/experimental/dataflow/summaries-checks/missing-attribute-content.ql
@@ -0,0 +1,11 @@
+import python
+import semmle.python.dataflow.new.FlowSummary
+import semmle.python.dataflow.new.internal.FlowSummaryImpl
+
+from SummarizedCallable sc, string s, string c, string attr
+where
+  (sc.propagatesFlowExt(s, _, _) or sc.propagatesFlowExt(_, s, _)) and
+  Private::External::invalidSpecComponent(s, c) and
+  c = "Attribute[" + attr + "]"
+select "The attribute \"" + attr +
+    "\" is not a valid TAttributeContent, please add it to the hardcoded list of TAttributeContent in the dataflow library."
--- a/python/ql/test/library-tests/frameworks/stdlib/test_re.py
+++ b/python/ql/test/library-tests/frameworks/stdlib/test_re.py
@@ -0,0 +1,82 @@
+import re
+
+ts = TAINTED_STRING
+
+pat = ... # some pattern
+compiled_pat = re.compile(pat)
+
+# see https://docs.python.org/3/library/re.html#functions
+ensure_not_tainted(
+    # returns Match object, which is tested properly below. (note: with the flow summary
+    # modeling, objects containing tainted values are not themselves tainted).
+    re.search(pat, ts),
+    re.match(pat, ts),
+    re.fullmatch(pat, ts),
+
+    compiled_pat.search(ts),
+    compiled_pat.match(ts),
+    compiled_pat.fullmatch(ts),
+)
+
+# Match object
+tainted_match = re.match(pat, ts)
+safe_match = re.match(pat, "safe")
+ensure_tainted(
+    tainted_match.expand("Hello \1"), # $ tainted
+    safe_match.expand(ts), # $ tainted
+    tainted_match.group(), # $ tainted
+    tainted_match.group(1, 2), # $ tainted
+    tainted_match.group(1, 2)[0], # $ tainted
+    tainted_match[0], # $ tainted
+    tainted_match["key"], # $ tainted
+
+    tainted_match.groups()[0], # $ tainted
+    tainted_match.groupdict()["key"], # $ tainted
+
+    re.match(pat, ts).string, # $ tainted
+    re.match(ts, "safe").re.pattern, # $ tainted
+
+    compiled_pat.match(ts).string, # $ tainted
+    re.compile(ts).match("safe").re.pattern, # $ tainted
+)
+ensure_not_tainted(
+    safe_match.expand("Hello \1"),
+    safe_match.group(),
+
+    re.match(pat, "safe").re,
+    re.match(pat, "safe").string,
+)
+
+ensure_tainted(
+    # other functions not returning Match objects
+    re.split(pat, ts), # $ tainted
+    re.split(pat, ts)[0], # $ tainted
+
+    re.findall(pat, ts), # $ tainted
+    re.findall(pat, ts)[0], # $ tainted
+
+    re.finditer(pat, ts), # $ tainted
+    [x for x in re.finditer(pat, ts)], # $ tainted
+
+    re.sub(pat, repl="safe", string=ts), # $ tainted
+    re.sub(pat, repl=lambda m: ..., string=ts), # $ tainted
+    re.sub(pat, repl=ts, string="safe"), # $ tainted
+    re.sub(pat, repl=lambda m: ts, string="safe"), # $ tainted
+
+    # same for compiled patterns
+    compiled_pat.split(ts), # $ tainted
+    compiled_pat.split(ts)[0], # $ tainted
+    # ...
+
+    # user-controlled compiled pattern
+    re.compile(ts), # $ tainted
+    re.compile(ts).pattern, # $ tainted
+)
+
+ensure_not_tainted(
+    re.subn(pat, repl="safe", string=ts),
+    re.subn(pat, repl="safe", string=ts)[1], # // the number of substitutions made
+)
+ensure_tainted(
+    re.subn(pat, repl="safe", string=ts)[0], # $ tainted // the string
+)
				`@@ -0,0 +1 @@`
				`# an empty file, since we want the test to run on an empty db`