Ruby/Python: regex parser: group sequences of 'normal' characters

2026-07-21 11:18:20 +02:00 · 2022-02-22 10:51:47 +01:00
parent 36e02ae9ac
commit 69ed121ecb
9 changed files with 166 additions and 231 deletions
--- a/python/ql/lib/semmle/python/RegexTreeView.qll
+++ b/python/ql/lib/semmle/python/RegexTreeView.qll
@@ -39,7 +39,12 @@ newtype TRegExpParent =
  /** A special character */
  TRegExpSpecialChar(Regex re, int start, int end) { re.specialCharacter(start, end, _) } or
  /** A normal character */
-  TRegExpNormalChar(Regex re, int start, int end) { re.normalCharacter(start, end) } or
+  TRegExpNormalChar(Regex re, int start, int end) {
+    re.normalCharacterSequence(start, end)
+    or
+    re.escapedCharacter(start, end) and
+    not re.specialCharacter(start, end, _)
+  } or
  /** A back reference */
  TRegExpBackRef(Regex re, int start, int end) { re.backreference(start, end) }

--- a/python/ql/lib/semmle/python/regex.qll
+++ b/python/ql/lib/semmle/python/regex.qll
@@ -446,6 +446,45 @@ abstract class RegexString extends Expr {
    )
  }

+  /** Holds if `start..end` is a sequence of 'normal' characters: one normal character inside
+   * a character set, or else a maximal run of normal characters, which is split into its last
+   * character and the remainder when `qualifier(e, _, _, _)` holds at the run's end `e`. */
+  predicate normalCharacterSequence(int start, int end) {
+    this.normalCharacter(start, end) and // enclosed by a character set: one character each
+    end = start + 1 and
+    exists(int x, int y | this.charSet(x, y) and x <= start and y >= end)
+    or
+    exists(int s, int e |
+      e = max(int i | this.normalCharacterSub(s, i)) and // longest run that begins at `s`
+      not exists(int x, int y | this.charSet(x, y) and x <= s and y >= e)
+    |
+      if this.qualifier(e, _, _, _) // presumably a qualifier begins at `e` -- TODO confirm
+      then
+        end = e and start = e - 1 // the run's last character as its own sequence
+        or
+        end = e - 1 and start = s and start < end // the remainder, only if non-empty
+      else (
+        end = e and
+        start = s
+      )
+    )
+  }
+
+  private predicate normalCharacterSub(int start, int end) { // run of normal chars from `start`
+    (
+      this.normalCharacterSub(start, end - 1) // step: extend a shorter run by one character
+      or
+      start = end - 1 and not this.normalCharacter(start - 1, start) // base: no normal char before
+    ) and
+    this.normalCharacter(end - 1, end) // the character ending at `end` must itself be normal
+  }
+
+  private predicate characterItem(int start, int end) { // any character-level item at start..end
+    this.specialCharacter(start, end, _) or
+    this.escapedCharacter(start, end) or
+    this.normalCharacterSequence(start, end)
+  }
+
  /** Whether the text in the range start,end is a group */
  predicate group(int start, int end) {
    this.groupContents(start, end, _, _)
@@ -717,7 +756,7 @@ abstract class RegexString extends Expr {
  string getBackrefName(int start, int end) { this.named_backreference(start, end, result) }

  private predicate baseItem(int start, int end) {
-    this.character(start, end) and
+    this.characterItem(start, end) and
    not exists(int x, int y | this.charSet(x, y) and x <= start and y >= end)
    or
    this.group(start, end)
@@ -837,14 +876,14 @@ abstract class RegexString extends Expr {
  }

  private predicate item_start(int start) {
-    this.character(start, _) or
+    this.characterItem(start, _) or
    this.isGroupStart(start) or
    this.charSet(start, _) or
    this.backreference(start, _)
  }

  private predicate item_end(int end) {
-    this.character(_, end)
+    this.characterItem(_, end)
    or
    exists(int endm1 | this.isGroupEnd(endm1) and end = endm1 + 1)
    or
@@ -953,7 +992,7 @@ abstract class RegexString extends Expr {
   */
  predicate firstItem(int start, int end) {
    (
-      this.character(start, end)
+      this.characterItem(start, end)
      or
      this.qualifiedItem(start, end, _, _)
      or
@@ -968,7 +1007,7 @@ abstract class RegexString extends Expr {
   */
  predicate lastItem(int start, int end) {
    (
-      this.character(start, end)
+      this.characterItem(start, end)
      or
      this.qualifiedItem(start, end, _, _)
      or
--- a/python/ql/test/library-tests/regex/FirstLast.expected
+++ b/python/ql/test/library-tests/regex/FirstLast.expected
@@ -1,6 +1,6 @@
-| 012345678 | first | 0 | 1 |
-| 012345678 | last | 8 | 9 |
-| (?!not-this)^[A-Z_]+$ | first | 3 | 4 |
+| 012345678 | first | 0 | 9 |
+| 012345678 | last | 0 | 9 |
+| (?!not-this)^[A-Z_]+$ | first | 3 | 11 |
 | (?!not-this)^[A-Z_]+$ | first | 12 | 13 |
 | (?!not-this)^[A-Z_]+$ | first | 13 | 19 |
 | (?!not-this)^[A-Z_]+$ | first | 13 | 20 |
@@ -27,9 +27,9 @@
 | (?m)^(?!$) | last | 4 | 5 |
 | (?m)^(?!$) | last | 8 | 9 |
 | (\\033\|~{) | first | 1 | 5 |
-| (\\033\|~{) | first | 6 | 7 |
+| (\\033\|~{) | first | 6 | 8 |
 | (\\033\|~{) | last | 1 | 5 |
-| (\\033\|~{) | last | 7 | 8 |
+| (\\033\|~{) | last | 6 | 8 |
 | [\ufffd-\ufffd] | first | 0 | 5 |
 | [\ufffd-\ufffd] | last | 0 | 5 |
 | [\ufffd-\ufffd][\ufffd-\ufffd] | first | 0 | 5 |
@@ -52,8 +52,8 @@
 | \\A[+-]?\\d+ | last | 7 | 9 |
 | \\A[+-]?\\d+ | last | 7 | 10 |
 | \\Afoo\\Z | first | 0 | 2 |
-| \\Afoo\\Z | first | 2 | 3 |
-| \\Afoo\\Z | last | 4 | 5 |
+| \\Afoo\\Z | first | 2 | 5 |
+| \\Afoo\\Z | last | 2 | 5 |
 | \\Afoo\\Z | last | 5 | 7 |
 | \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | first | 0 | 2 |
 | \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | last | 28 | 32 |
@@ -86,24 +86,24 @@
 | ^[A-Z_]+$(?<!not-this) | last | 1 | 7 |
 | ^[A-Z_]+$(?<!not-this) | last | 1 | 8 |
 | ^[A-Z_]+$(?<!not-this) | last | 8 | 9 |
-| ^[A-Z_]+$(?<!not-this) | last | 20 | 21 |
+| ^[A-Z_]+$(?<!not-this) | last | 13 | 21 |
 | ax{01,3} | first | 0 | 1 |
 | ax{01,3} | last | 1 | 2 |
 | ax{01,3} | last | 1 | 8 |
-| ax{01,3} | last | 7 | 8 |
+| ax{01,3} | last | 3 | 8 |
 | ax{3,} | first | 0 | 1 |
 | ax{3,} | last | 1 | 2 |
 | ax{3,} | last | 1 | 6 |
-| ax{3,} | last | 5 | 6 |
+| ax{3,} | last | 3 | 6 |
 | ax{3} | first | 0 | 1 |
 | ax{3} | last | 1 | 2 |
 | ax{3} | last | 1 | 5 |
-| ax{3} | last | 4 | 5 |
+| ax{3} | last | 3 | 5 |
 | ax{,3} | first | 0 | 1 |
 | ax{,3} | last | 0 | 1 |
 | ax{,3} | last | 1 | 2 |
 | ax{,3} | last | 1 | 6 |
-| ax{,3} | last | 5 | 6 |
+| ax{,3} | last | 3 | 6 |
 | x\| | first | 0 | 1 |
 | x\| | last | 0 | 1 |
 | x\|(?<!\\w)l | first | 0 | 1 |
@@ -111,5 +111,5 @@
 | x\|(?<!\\w)l | first | 9 | 10 |
 | x\|(?<!\\w)l | last | 0 | 1 |
 | x\|(?<!\\w)l | last | 9 | 10 |
-| x{Not qual} | first | 0 | 1 |
-| x{Not qual} | last | 10 | 11 |
+| x{Not qual} | first | 0 | 11 |
+| x{Not qual} | last | 0 | 11 |
--- a/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected
+++ b/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected
@@ -59,7 +59,7 @@
 | redos.py:220:25:220:29 | [^X]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'W'. |
 | redos.py:223:30:223:30 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'W' and containing many repetitions of 'bW'. |
 | redos.py:229:30:229:30 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'W' and containing many repetitions of 'bW'. |
-| redos.py:241:27:241:27 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'a' and containing many repetitions of 'ba'. |
+| redos.py:241:26:241:27 | ab | This part of the regular expression may cause exponential backtracking on strings starting with 'a' and containing many repetitions of 'ab'. |
 | redos.py:247:25:247:31 | [\\n\\s]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |
 | redos.py:256:25:256:27 | \\w* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz'. |
 | redos.py:256:37:256:39 | \\w* | This part of the regular expression may cause exponential backtracking on strings starting with 'foobarbaz' and containing many repetitions of 'foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz'. |