From 7ad1a21c2dafafc54ba82de5b37dc12b84dc2320 Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Tue, 15 Aug 2023 21:23:50 +0200 Subject: [PATCH 1/6] Python: make mode characters not be characters They are simply considered part of the group start. --- .../python/regexp/internal/ParseRegExp.qll | 33 +++++++++++++++---- .../Security/CWE-730-ReDoS/ReDoS.expected | 4 +-- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll b/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll index 52fa85ded56..b4383ff58ff 100644 --- a/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll +++ b/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll @@ -683,12 +683,34 @@ class RegExp extends Expr instanceof StrConst { * Holds if a parse mode starts between `start` and `end`. */ private predicate flag_group_start(int start, int end) { + exists(int no_modes_end | + this.flag_group_start_no_modes(start, no_modes_end) and + end = max(int i | this.mode_character(start, i) | i + 1) + ) + } + + /** + * Holds if the initial part of a parse mode, not containing any + * mode characters is between `start` and `end`. + */ + private predicate flag_group_start_no_modes(int start, int end) { this.isGroupStart(start) and this.getChar(start + 1) = "?" and this.getChar(start + 2) in ["i", "L", "m", "s", "u", "x"] and end = start + 2 } + /** + * Holds if `pos` contains a mo character from the + * flag group starting at `start`. + */ + private predicate mode_character(int start, int pos) { + this.flag_group_start_no_modes(start, pos) + or + this.mode_character(start, pos - 1) and + this.getChar(pos) in ["i", "L", "m", "s", "u", "x"] + } + /** * Holds if a parse mode group is between `start` and `end`, and includes the * mode flag `c`. For example the following span, with mode flag `i`: @@ -696,11 +718,10 @@ class RegExp extends Expr instanceof StrConst { * (?i) * ``` */ - private predicate flag_group(int start, int end, string c) { - exists(int inStart, int inEnd | - this.flag_group_start(start, inStart) and - this.groupContents(start, end, inStart, inEnd) and - this.getChar([inStart .. inEnd - 1]) = c + private predicate flag(string c) { + exists(int pos | + this.mode_character(_, pos) and + this.getChar(pos) = c ) } @@ -709,7 +730,7 @@ class RegExp extends Expr instanceof StrConst { * it is defined by a prefix. */ string getModeFromPrefix() { - exists(string c | this.flag_group(_, _, c) | + exists(string c | this.flag(c) | c = "i" and result = "IGNORECASE" or c = "L" and result = "LOCALE" diff --git a/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected b/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected index 0c2dccbd2d4..051fd9ccc5f 100644 --- a/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected +++ b/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected @@ -105,5 +105,5 @@ | redos.py:391:15:391:25 | (\\u0061\|a)* | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of 'a'. | | unittests.py:5:17:5:23 | (\u00c6\|\\\u00c6)+ | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of '\\u00c6'. | | unittests.py:9:16:9:24 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. | -| unittests.py:11:20:11:28 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings starting with 's' and containing many repetitions of '\\n'. | -| unittests.py:12:21:12:29 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings starting with 'is' and containing many repetitions of '\\n'. | +| unittests.py:11:20:11:28 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. | +| unittests.py:12:21:12:29 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. | From 88fc96e8d74593c42934a12c284f27d2b65809c3 Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Wed, 16 Aug 2023 10:56:32 +0200 Subject: [PATCH 2/6] Python: Add test with prefix --- python/ql/test/query-tests/Security/CWE-730-ReDoS/unittests.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/ql/test/query-tests/Security/CWE-730-ReDoS/unittests.py b/python/ql/test/query-tests/Security/CWE-730-ReDoS/unittests.py index 0a49b8a52a9..4106a691558 100644 --- a/python/ql/test/query-tests/Security/CWE-730-ReDoS/unittests.py +++ b/python/ql/test/query-tests/Security/CWE-730-ReDoS/unittests.py @@ -10,3 +10,4 @@ re.compile(r'(?:.|\n)*b', re.DOTALL) # Has ReDoS. re.compile(r'(?i)(?:.|\n)*b') # No ReDoS. re.compile(r'(?s)(?:.|\n)*b') # Has ReDoS. re.compile(r'(?is)(?:.|\n)*b') # Has ReDoS. +re.compile(r'(?is)X(?:.|\n)*Y') # Has ReDoS. From e9e6bce80a7a33924742cc3f9373accb26ea0c4b Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Wed, 16 Aug 2023 11:49:23 +0200 Subject: [PATCH 3/6] shared: handle empty groups in delta --- shared/regex/codeql/regex/nfa/NfaUtils.qll | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/shared/regex/codeql/regex/nfa/NfaUtils.qll b/shared/regex/codeql/regex/nfa/NfaUtils.qll index a951820bda1..2b14fe28aca 100644 --- a/shared/regex/codeql/regex/nfa/NfaUtils.qll +++ b/shared/regex/codeql/regex/nfa/NfaUtils.qll @@ -760,6 +760,12 @@ module Make { or exists(RegExpGroup grp | lbl = Epsilon() | q1 = before(grp) and q2 = before(grp.getChild(0))) or + exists(RegExpGroup grp | lbl = Epsilon() | + not exists(grp.getAChild()) and + q1 = before(grp) and + q2 = before(grp.getSuccessor()) + ) + or exists(EffectivelyStar star | lbl = Epsilon() | q1 = before(star) and q2 = before(star.getChild(0)) or From d3c24ba11093d89fa5bee293b63c9008cd939340 Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Wed, 16 Aug 2023 13:38:10 +0200 Subject: [PATCH 4/6] =?UTF-8?q?Python=C3=86=20fix=20test=20expectations?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Security/CWE-116-BadTagFilter/BadTagFilter.expected | 1 + .../ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/python/ql/test/query-tests/Security/CWE-116-BadTagFilter/BadTagFilter.expected b/python/ql/test/query-tests/Security/CWE-116-BadTagFilter/BadTagFilter.expected index cc9da9cfdc8..407a5490e8c 100644 --- a/python/ql/test/query-tests/Security/CWE-116-BadTagFilter/BadTagFilter.expected +++ b/python/ql/test/query-tests/Security/CWE-116-BadTagFilter/BadTagFilter.expected @@ -1,6 +1,7 @@ | tst.py:4:20:4:43 | .*?<\\/script> | This regular expression does not match script end tags like . | | tst.py:5:20:5:43 | .*?<\\/script> | This regular expression does not match script end tags like . | | tst.py:9:20:9:30 |