Python: Fix parsing of octal escapes

2026-07-21 03:08:25 +02:00 · 2021-08-04 20:41:56 +02:00
parent 34b054ff53
commit c08f94ec04
9 changed files with 42 additions and 17 deletions
--- a/python/ql/src/semmle/python/RegexTreeView.qll
+++ b/python/ql/src/semmle/python/RegexTreeView.qll
@@ -75,7 +75,11 @@ class RegExpTerm extends RegExpParent {
  int end;

  RegExpTerm() {
-    this = TRegExpAlt(re, start, end)
+    this = TRegExpAlt(re, start, end) and
+    exists(int part_end |
+      re.alternationOption(start, end, start, part_end) and
+      part_end < end
+    ) // if an alternation does not have more than one element, it should be treated as that element instead.
    or
    this = TRegExpBackRef(re, start, end)
    or
--- a/python/ql/src/semmle/python/regex.qll
+++ b/python/ql/src/semmle/python/regex.qll
@@ -371,10 +371,13 @@ abstract class RegexString extends Expr {
      or
      // octal value \ooo
      end in [start + 2 .. start + 4] and
-      this.getText().substring(start + 1, end).toInt() >= 0 and
+      // this.isOctal([start + 1 .. end]) and
+      forall(int i | i in [start + 1 .. end - 1] | this.isOctal(i)) and
+      // this.getText().substring(start + 1, end).toInt() >= 0 and
      not (
        end < start + 4 and
-        exists(this.getText().substring(start + 1, end + 1).toInt())
+        this.isOctal(end) //and
+        // exists(this.getText().substring(start + 1, end + 1).toInt())
      )
      or
      // 16-bit hex value \uhhhh
@@ -392,6 +395,11 @@ abstract class RegexString extends Expr {
    )
  }

+  pragma[inline]
+  private predicate isOctal(int index) {
+    this.getChar(index) in ["0", "1", "2", "3", "4", "5", "6", "7"]
+  }
+
  /** Holds if `index` is inside a character set. */
  predicate inCharSet(int index) {
    exists(int x, int y | this.charSet(x, y) and index in [x + 1 .. y - 2])
@@ -690,6 +698,7 @@ abstract class RegexString extends Expr {

  private predicate numbered_backreference(int start, int end, int value) {
    this.escapingChar(start) and
+    // starting with 0 makes it an octal escape
    not this.getChar(start + 1) = "0" and
    exists(string text, string svalue, int len |
      end = start + len and
@@ -698,8 +707,18 @@ abstract class RegexString extends Expr {
    |
      svalue = text.substring(start + 1, start + len) and
      value = svalue.toInt() and
-      not exists(text.substring(start + 1, start + len + 1).toInt()) and
-      value > 0
+      // value is composed of digits
+      forall(int i | i in [start + 1 .. start + len - 1] |
+        this.getChar(i) in ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
+      ) and
+      // a longer reference is not possible
+      not (
+        len = 2 and
+        exists(text.substring(start + 1, start + len + 1).toInt())
+      ) and
+      // 3 octal digits makes it an octal escape
+      not forall(int i | i in [start + 1 .. start + 4] | this.isOctal(i))
+      // TODO: Inside a character set, all numeric escapes are treated as characters.
    )
  }

--- a/python/ql/test/library-tests/regex/Alternation.expected
+++ b/python/ql/test/library-tests/regex/Alternation.expected
@@ -8,8 +8,6 @@
 | (?P<name>[\\w]+)\| | 0 | 16 | (?P<name>[\\w]+)\| | 16 | 16 |  |
 | (\\033\|~{) | 1 | 8 | \\033\|~{ | 1 | 5 | \\033 |
 | (\\033\|~{) | 1 | 8 | \\033\|~{ | 6 | 8 | ~{ |
-| \\+0 | 0 | 3 | \\+0 | 0 | 2 | \\+ |
-| \\+0 | 0 | 3 | \\+0 | 0 | 3 | \\+0 |
 | \\\|\\[\\][123]\|\\{\\} | 0 | 16 | \\\|\\[\\][123]\|\\{\\} | 0 | 11 | \\\|\\[\\][123] |
 | \\\|\\[\\][123]\|\\{\\} | 0 | 16 | \\\|\\[\\][123]\|\\{\\} | 12 | 16 | \\{\\} |
 | \|x | 0 | 2 | \|x | 0 | 0 |  |
--- a/python/ql/test/library-tests/regex/Characters.expected
+++ b/python/ql/test/library-tests/regex/Characters.expected
@@ -53,7 +53,7 @@
 | [^A-Z] | 4 | 5 |
 | [^]] | 2 | 3 |
 | \\+0 | 0 | 2 |
-| \\+0 | 0 | 3 |
+| \\+0 | 2 | 3 |
 | \\A[+-]?\\d+ | 0 | 2 |
 | \\A[+-]?\\d+ | 3 | 4 |
 | \\A[+-]?\\d+ | 4 | 5 |
--- a/python/ql/test/library-tests/regex/FirstLast.expected
+++ b/python/ql/test/library-tests/regex/FirstLast.expected
@@ -43,9 +43,7 @@
 | [^]] | first | 0 | 4 |
 | [^]] | last | 0 | 4 |
 | \\+0 | first | 0 | 2 |
-| \\+0 | first | 0 | 3 |
-| \\+0 | last | 0 | 2 |
-| \\+0 | last | 0 | 3 |
+| \\+0 | last | 2 | 3 |
 | \\A[+-]?\\d+ | first | 0 | 2 |
 | \\A[+-]?\\d+ | last | 7 | 9 |
 | \\A[+-]?\\d+ | last | 7 | 10 |
--- a/python/ql/test/library-tests/regex/Regex.expected
+++ b/python/ql/test/library-tests/regex/Regex.expected
@@ -114,9 +114,7 @@
 | [^]] | char-set | 0 | 4 |
 | [^]] | sequence | 0 | 4 |
 | \\+0 | char | 0 | 2 |
-| \\+0 | char | 0 | 3 |
-| \\+0 | choice | 0 | 3 |
-| \\+0 | sequence | 0 | 2 |
+| \\+0 | char | 2 | 3 |
 | \\+0 | sequence | 0 | 3 |
 | \\A[+-]?\\d+ | char | 0 | 2 |
 | \\A[+-]?\\d+ | char | 3 | 4 |
--- a/python/ql/test/library-tests/regex/charRangeTest.py
+++ b/python/ql/test/library-tests/regex/charRangeTest.py
@@ -24,7 +24,8 @@ except re.error:

 re.compile(r'[^A-Z]') #$ charRange=2:3-4:5

-re.compile(r'[\0-\09]') #$ charRange=1:3-4:7
+re.compile(r'[\0-\09]') #$ charRange=1:3-4:6
+re.compile(r'[\0-\07]') #$ charRange=1:3-4:7

 re.compile(r'[\0123-5]') #$ charRange=5:6-7:8

--- a/python/ql/test/library-tests/regex/escapedCharacterTest.py
+++ b/python/ql/test/library-tests/regex/escapedCharacterTest.py
@@ -10,8 +10,10 @@ re.compile(r'[\---]') #$ escapedCharacter=1:3
 re.compile(r'[--\-]') #$ escapedCharacter=3:5
 re.compile(r'[\--\-]') #$ escapedCharacter=1:3 escapedCharacter=4:6
 re.compile(r'[0\-9-A-Z]') #$ escapedCharacter=2:4
-re.compile(r'[\0-\09]') #$ escapedCharacter=1:3 escapedCharacter=4:7
+re.compile(r'[\0-\09]') #$ escapedCharacter=1:3 escapedCharacter=4:6
+re.compile(r'[\0-\07]') #$ escapedCharacter=1:3 escapedCharacter=4:7
 re.compile(r'[\0123-5]') #$ escapedCharacter=1:5
+re.compile(r'\1754\1854\17\18\07\08') #$ escapedCharacter=0:4 escapedCharacter=16:19 escapedCharacter=19:21

 #ODASA-3985
 #Half Surrogate pairs
@@ -21,3 +23,9 @@ re.compile(u'[\U00010000-\U0010ffff]') # not escapes

 #Misparsed on LGTM
 re.compile(r"\[(?P<txt>[^[]*)\]\((?P<uri>[^)]*)") #$ escapedCharacter=0:2 escapedCharacter=16:18 escapedCharacter=18:20
+
+#Non-raw string
+re_blank = re.compile('(\n|\r|\\s)*\n', re.M) #$ escapedCharacter=5:7
+
+#Backreference confusion
+re.compile(r'\+0') #$ escapedCharacter=0:2
--- a/python/ql/test/library-tests/regexparser/Consistency.expected
+++ b/python/ql/test/library-tests/regexparser/Consistency.expected
@@ -1 +0,0 @@
-| \\+0 | 2 | test.py:2:18:2:23 | test.py:2 |
				`@@ -1 +0,0 @@`
				`\| \\+0 \| 2 \| test.py:2:18:2:23 \| test.py:2 \|`