From c08f94ec040163d0762d73cab2fedd3abef01002 Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Wed, 4 Aug 2021 20:41:56 +0200 Subject: [PATCH] Python: Fix parsing of octal escapes --- python/ql/src/semmle/python/RegexTreeView.qll | 6 ++++- python/ql/src/semmle/python/regex.qll | 27 ++++++++++++++++--- .../library-tests/regex/Alternation.expected | 2 -- .../library-tests/regex/Characters.expected | 2 +- .../library-tests/regex/FirstLast.expected | 4 +-- .../test/library-tests/regex/Regex.expected | 4 +-- .../test/library-tests/regex/charRangeTest.py | 3 ++- .../regex/escapedCharacterTest.py | 10 ++++++- .../regexparser/Consistency.expected | 1 - 9 files changed, 42 insertions(+), 17 deletions(-) diff --git a/python/ql/src/semmle/python/RegexTreeView.qll b/python/ql/src/semmle/python/RegexTreeView.qll index 5aae3021899..d458f75bc2d 100644 --- a/python/ql/src/semmle/python/RegexTreeView.qll +++ b/python/ql/src/semmle/python/RegexTreeView.qll @@ -75,7 +75,11 @@ class RegExpTerm extends RegExpParent { int end; RegExpTerm() { - this = TRegExpAlt(re, start, end) + this = TRegExpAlt(re, start, end) and + exists(int part_end | + re.alternationOption(start, end, start, part_end) and + part_end < end + ) // if an alternation does not have more than one element, it should be treated as that element instead. or this = TRegExpBackRef(re, start, end) or diff --git a/python/ql/src/semmle/python/regex.qll b/python/ql/src/semmle/python/regex.qll index cb38f7f2b3c..8a38d271f32 100644 --- a/python/ql/src/semmle/python/regex.qll +++ b/python/ql/src/semmle/python/regex.qll @@ -371,10 +371,13 @@ abstract class RegexString extends Expr { or // octal value \ooo end in [start + 2 .. start + 4] and - this.getText().substring(start + 1, end).toInt() >= 0 and + // this.isOctal([start + 1 .. end]) and + forall(int i | i in [start + 1 .. end - 1] | this.isOctal(i)) and + // this.getText().substring(start + 1, end).toInt() >= 0 and not ( end < start + 4 and - exists(this.getText().substring(start + 1, end + 1).toInt()) + this.isOctal(end) //and + // exists(this.getText().substring(start + 1, end + 1).toInt()) ) or // 16-bit hex value \uhhhh @@ -392,6 +395,11 @@ abstract class RegexString extends Expr { ) } + pragma[inline] + private predicate isOctal(int index) { + this.getChar(index) in ["0", "1", "2", "3", "4", "5", "6", "7"] + } + /** Holds if `index` is inside a character set. */ predicate inCharSet(int index) { exists(int x, int y | this.charSet(x, y) and index in [x + 1 .. y - 2]) @@ -690,6 +698,7 @@ abstract class RegexString extends Expr { private predicate numbered_backreference(int start, int end, int value) { this.escapingChar(start) and + // starting with 0 makes it an octal escape not this.getChar(start + 1) = "0" and exists(string text, string svalue, int len | end = start + len and @@ -698,8 +707,18 @@ abstract class RegexString extends Expr { | svalue = text.substring(start + 1, start + len) and value = svalue.toInt() and - not exists(text.substring(start + 1, start + len + 1).toInt()) and - value > 0 + // value is composed of digits + forall(int i | i in [start + 1 .. start + len - 1] | + this.getChar(i) in ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"] + ) and + // a longer reference is not possible + not ( + len = 2 and + exists(text.substring(start + 1, start + len + 1).toInt()) + ) and + // 3 octal digits makes it an octal escape + not forall(int i | i in [start + 1 .. start + 4] | this.isOctal(i)) + // TODO: Inside a character set, all numeric escapes are treated as characters. ) } diff --git a/python/ql/test/library-tests/regex/Alternation.expected b/python/ql/test/library-tests/regex/Alternation.expected index c9c5c4b931e..e50655fdc24 100644 --- a/python/ql/test/library-tests/regex/Alternation.expected +++ b/python/ql/test/library-tests/regex/Alternation.expected @@ -8,8 +8,6 @@ | (?P[\\w]+)\| | 0 | 16 | (?P[\\w]+)\| | 16 | 16 | | | (\\033\|~{) | 1 | 8 | \\033\|~{ | 1 | 5 | \\033 | | (\\033\|~{) | 1 | 8 | \\033\|~{ | 6 | 8 | ~{ | -| \\+0 | 0 | 3 | \\+0 | 0 | 2 | \\+ | -| \\+0 | 0 | 3 | \\+0 | 0 | 3 | \\+0 | | \\\|\\[\\][123]\|\\{\\} | 0 | 16 | \\\|\\[\\][123]\|\\{\\} | 0 | 11 | \\\|\\[\\][123] | | \\\|\\[\\][123]\|\\{\\} | 0 | 16 | \\\|\\[\\][123]\|\\{\\} | 12 | 16 | \\{\\} | | \|x | 0 | 2 | \|x | 0 | 0 | | diff --git a/python/ql/test/library-tests/regex/Characters.expected b/python/ql/test/library-tests/regex/Characters.expected index 98b24733a4f..6983b8f47e5 100644 --- a/python/ql/test/library-tests/regex/Characters.expected +++ b/python/ql/test/library-tests/regex/Characters.expected @@ -53,7 +53,7 @@ | [^A-Z] | 4 | 5 | | [^]] | 2 | 3 | | \\+0 | 0 | 2 | -| \\+0 | 0 | 3 | +| \\+0 | 2 | 3 | | \\A[+-]?\\d+ | 0 | 2 | | \\A[+-]?\\d+ | 3 | 4 | | \\A[+-]?\\d+ | 4 | 5 | diff --git a/python/ql/test/library-tests/regex/FirstLast.expected b/python/ql/test/library-tests/regex/FirstLast.expected index 8af390ed900..2a29eb83ce9 100644 --- a/python/ql/test/library-tests/regex/FirstLast.expected +++ b/python/ql/test/library-tests/regex/FirstLast.expected @@ -43,9 +43,7 @@ | [^]] | first | 0 | 4 | | [^]] | last | 0 | 4 | | \\+0 | first | 0 | 2 | -| \\+0 | first | 0 | 3 | -| \\+0 | last | 0 | 2 | -| \\+0 | last | 0 | 3 | +| \\+0 | last | 2 | 3 | | \\A[+-]?\\d+ | first | 0 | 2 | | \\A[+-]?\\d+ | last | 7 | 9 | | \\A[+-]?\\d+ | last | 7 | 10 | diff --git a/python/ql/test/library-tests/regex/Regex.expected b/python/ql/test/library-tests/regex/Regex.expected index 67a7483ed97..05bc6e81ce6 100644 --- a/python/ql/test/library-tests/regex/Regex.expected +++ b/python/ql/test/library-tests/regex/Regex.expected @@ -114,9 +114,7 @@ | [^]] | char-set | 0 | 4 | | [^]] | sequence | 0 | 4 | | \\+0 | char | 0 | 2 | -| \\+0 | char | 0 | 3 | -| \\+0 | choice | 0 | 3 | -| \\+0 | sequence | 0 | 2 | +| \\+0 | char | 2 | 3 | | \\+0 | sequence | 0 | 3 | | \\A[+-]?\\d+ | char | 0 | 2 | | \\A[+-]?\\d+ | char | 3 | 4 | diff --git a/python/ql/test/library-tests/regex/charRangeTest.py b/python/ql/test/library-tests/regex/charRangeTest.py index de9de7ccaf3..706037df7d0 100644 --- a/python/ql/test/library-tests/regex/charRangeTest.py +++ b/python/ql/test/library-tests/regex/charRangeTest.py @@ -24,7 +24,8 @@ except re.error: re.compile(r'[^A-Z]') #$ charRange=2:3-4:5 -re.compile(r'[\0-\09]') #$ charRange=1:3-4:7 +re.compile(r'[\0-\09]') #$ charRange=1:3-4:6 +re.compile(r'[\0-\07]') #$ charRange=1:3-4:7 re.compile(r'[\0123-5]') #$ charRange=5:6-7:8 diff --git a/python/ql/test/library-tests/regex/escapedCharacterTest.py b/python/ql/test/library-tests/regex/escapedCharacterTest.py index e6add851a8c..5e2e3b7638a 100644 --- a/python/ql/test/library-tests/regex/escapedCharacterTest.py +++ b/python/ql/test/library-tests/regex/escapedCharacterTest.py @@ -10,8 +10,10 @@ re.compile(r'[\---]') #$ escapedCharacter=1:3 re.compile(r'[--\-]') #$ escapedCharacter=3:5 re.compile(r'[\--\-]') #$ escapedCharacter=1:3 escapedCharacter=4:6 re.compile(r'[0\-9-A-Z]') #$ escapedCharacter=2:4 -re.compile(r'[\0-\09]') #$ escapedCharacter=1:3 escapedCharacter=4:7 +re.compile(r'[\0-\09]') #$ escapedCharacter=1:3 escapedCharacter=4:6 +re.compile(r'[\0-\07]') #$ escapedCharacter=1:3 escapedCharacter=4:7 re.compile(r'[\0123-5]') #$ escapedCharacter=1:5 +re.compile(r'\1754\1854\17\18\07\08') #$ escapedCharacter=0:4 escapedCharacter=16:19 escapedCharacter=19:21 #ODASA-3985 #Half Surrogate pairs @@ -21,3 +23,9 @@ re.compile(u'[\U00010000-\U0010ffff]') # not escapes #Misparsed on LGTM re.compile(r"\[(?P[^[]*)\]\((?P[^)]*)") #$ escapedCharacter=0:2 escapedCharacter=16:18 escapedCharacter=18:20 + +#Non-raw string +re_blank = re.compile('(\n|\r|\\s)*\n', re.M) #$ escapedCharacter=5:7 + +#Backreference confusion +re.compile(r'\+0') #$ escapedCharacter=0:2 diff --git a/python/ql/test/library-tests/regexparser/Consistency.expected b/python/ql/test/library-tests/regexparser/Consistency.expected index 7c646458fbf..e69de29bb2d 100644 --- a/python/ql/test/library-tests/regexparser/Consistency.expected +++ b/python/ql/test/library-tests/regexparser/Consistency.expected @@ -1 +0,0 @@ -| \\+0 | 2 | test.py:2:18:2:23 | test.py:2 |