Python: make mode characters not be characters

They are simply considered part of the group start.
2025-12-24 04:36:35 +01:00 · 2023-08-15 21:23:50 +02:00
parent a834703195
commit 7ad1a21c2d
2 changed files with 29 additions and 8 deletions
--- a/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll
+++ b/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll
@@ -683,12 +683,34 @@ class RegExp extends Expr instanceof StrConst {
   * Holds if a parse mode starts between `start` and `end`.
   */
  private predicate flag_group_start(int start, int end) {
+    exists(int no_modes_end |
+      this.flag_group_start_no_modes(start, no_modes_end) and
+      end = max(int i | this.mode_character(start, i) | i + 1) // one past the last mode character of this group
+    )
+  }
+
+  /**
+   * Holds if the initial part of a parse mode group, not containing any
+   * mode characters, is between `start` and `end`.
+   */
+  private predicate flag_group_start_no_modes(int start, int end) {
    this.isGroupStart(start) and
    this.getChar(start + 1) = "?" and
    this.getChar(start + 2) in ["i", "L", "m", "s", "u", "x"] and
    end = start + 2 // `end` is the position of the first mode character
  }

+  /**
+   * Holds if `pos` contains a mode character from the
+   * flag group starting at `start`.
+   */
+  private predicate mode_character(int start, int pos) {
+    this.flag_group_start_no_modes(start, pos) // base case: the first mode character, at `start + 2`
+    or
+    this.mode_character(start, pos - 1) and // recursive case: the preceding position is a mode character
+    this.getChar(pos) in ["i", "L", "m", "s", "u", "x"]
+  }
+
  /**
   * Holds if a parse mode group includes the mode flag `c`.
   * For example the following parse mode group, with mode flag `i`:
@@ -696,11 +718,10 @@ class RegExp extends Expr instanceof StrConst {
   * (?i)
   * ```
   */
-  private predicate flag_group(int start, int end, string c) {
-    exists(int inStart, int inEnd |
-      this.flag_group_start(start, inStart) and
-      this.groupContents(start, end, inStart, inEnd) and
-      this.getChar([inStart .. inEnd - 1]) = c
+  private predicate flag(string c) {
+    exists(int pos |
+      this.mode_character(_, pos) and // `_`: the flag group may start at any position
+      this.getChar(pos) = c
    )
  }

@@ -709,7 +730,7 @@ class RegExp extends Expr instanceof StrConst {
   * it is defined by a prefix.
   */
  string getModeFromPrefix() {
-    exists(string c | this.flag_group(_, _, c) |
+    exists(string c | this.flag(c) |
      c = "i" and result = "IGNORECASE"
      or
      c = "L" and result = "LOCALE"
--- a/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected
+++ b/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected
@@ -105,5 +105,5 @@
 | redos.py:391:15:391:25 | (\\u0061\|a)* | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of 'a'. |
 | unittests.py:5:17:5:23 | (\u00c6\|\\\u00c6)+ | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of '\\u00c6'. |
 | unittests.py:9:16:9:24 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |
-| unittests.py:11:20:11:28 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings starting with 's' and containing many repetitions of '\\n'. |
-| unittests.py:12:21:12:29 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings starting with 'is' and containing many repetitions of '\\n'. |
+| unittests.py:11:20:11:28 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |
+| unittests.py:12:21:12:29 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |