Merge pull request #6460 from yoff/python-regex-parsing-consistency-checks

Python: Add regex parsing consistency checks
2026-04-30 11:15:13 +02:00 · 2021-09-07 13:33:59 +02:00
parent 1dc712f54d a01fca5d48
commit b99c075282
17 changed files with 583 additions and 17 deletions
--- a/python/ql/test/library-tests/regex/Alternation.expected
+++ b/python/ql/test/library-tests/regex/Alternation.expected
@@ -19,4 +19,4 @@
 | x\| | 0 | 2 | x\| | 0 | 1 | x |
 | x\| | 0 | 2 | x\| | 2 | 2 |  |
 | x\|(?<!\\w)l | 0 | 10 | x\|(?<!\\w)l | 0 | 1 | x |
-| x\|(?<!\\w)l | 0 | 10 | x\|(?<!\\w)l | 2 | 10 | (?<!\\w)l |
+| x\|(?<!\\w)l | 0 | 10 | x\|(?<!\\w)l | 2 | 10 | (?<!\\w)l |
--- a/python/ql/test/library-tests/regex/Characters.expected
+++ b/python/ql/test/library-tests/regex/Characters.expected
@@ -52,6 +52,8 @@
 | [^A-Z] | 2 | 3 |
 | [^A-Z] | 4 | 5 |
 | [^]] | 2 | 3 |
+| \\+0 | 0 | 2 |
+| \\+0 | 2 | 3 |
 | \\A[+-]?\\d+ | 0 | 2 |
 | \\A[+-]?\\d+ | 3 | 4 |
 | \\A[+-]?\\d+ | 4 | 5 |
--- a/python/ql/test/library-tests/regex/Consistency.expected
+++ b/python/ql/test/library-tests/regex/Consistency.expected
--- a/python/ql/test/library-tests/regex/Consistency.ql
+++ b/python/ql/test/library-tests/regex/Consistency.ql
@@ -0,0 +1,12 @@
+/**
+ * Flags regular expressions that are parsed ambiguously
+ */
+
+import python
+import semmle.python.regex
+
+from string str, Location loc, int counter
+where
+  counter = strictcount(Regex term | term.getLocation() = loc and term.getText() = str) and
+  counter > 1
+select str, counter, loc
--- a/python/ql/test/library-tests/regex/FirstLast.expected
+++ b/python/ql/test/library-tests/regex/FirstLast.expected
@@ -42,6 +42,8 @@
 | [^A-Z] | last | 0 | 6 |
 | [^]] | first | 0 | 4 |
 | [^]] | last | 0 | 4 |
+| \\+0 | first | 0 | 2 |
+| \\+0 | last | 2 | 3 |
 | \\A[+-]?\\d+ | first | 0 | 2 |
 | \\A[+-]?\\d+ | last | 7 | 9 |
 | \\A[+-]?\\d+ | last | 7 | 10 |
--- a/python/ql/test/library-tests/regex/Regex.expected
+++ b/python/ql/test/library-tests/regex/Regex.expected
@@ -113,6 +113,9 @@
 | [^]] | char | 2 | 3 |
 | [^]] | char-set | 0 | 4 |
 | [^]] | sequence | 0 | 4 |
+| \\+0 | char | 0 | 2 |
+| \\+0 | char | 2 | 3 |
+| \\+0 | sequence | 0 | 3 |
 | \\A[+-]?\\d+ | char | 0 | 2 |
 | \\A[+-]?\\d+ | char | 3 | 4 |
 | \\A[+-]?\\d+ | char | 4 | 5 |
--- a/python/ql/test/library-tests/regex/charRangeTest.py
+++ b/python/ql/test/library-tests/regex/charRangeTest.py
@@ -24,7 +24,8 @@ except re.error:

 re.compile(r'[^A-Z]') #$ charRange=2:3-4:5

-re.compile(r'[\0-\09]') #$ charRange=1:3-4:7
+re.compile(r'[\0-\09]') #$ charRange=1:3-4:6
+re.compile(r'[\0-\07]') #$ charRange=1:3-4:7

 re.compile(r'[\0123-5]') #$ charRange=5:6-7:8

--- a/python/ql/test/library-tests/regex/escapedCharacterTest.py
+++ b/python/ql/test/library-tests/regex/escapedCharacterTest.py
@@ -10,8 +10,10 @@ re.compile(r'[\---]') #$ escapedCharacter=1:3
 re.compile(r'[--\-]') #$ escapedCharacter=3:5
 re.compile(r'[\--\-]') #$ escapedCharacter=1:3 escapedCharacter=4:6
 re.compile(r'[0\-9-A-Z]') #$ escapedCharacter=2:4
-re.compile(r'[\0-\09]') #$ escapedCharacter=1:3 escapedCharacter=4:7
+re.compile(r'[\0-\09]') #$ escapedCharacter=1:3 escapedCharacter=4:6
+re.compile(r'[\0-\07]') #$ escapedCharacter=1:3 escapedCharacter=4:7
 re.compile(r'[\0123-5]') #$ escapedCharacter=1:5
+re.compile(r'\1754\1854\17\18\07\08') #$ escapedCharacter=0:4 escapedCharacter=16:19 escapedCharacter=19:21

 #ODASA-3985
 #Half Surrogate pairs
@@ -21,3 +23,9 @@ re.compile(u'[\U00010000-\U0010ffff]') # not escapes

 #Misparsed on LGTM
 re.compile(r"\[(?P<txt>[^[]*)\]\((?P<uri>[^)]*)") #$ escapedCharacter=0:2 escapedCharacter=16:18 escapedCharacter=18:20
+
+#Non-raw string
+re_blank = re.compile('(\n|\r|\\s)*\n', re.M) #$ escapedCharacter=5:7
+
+#Backreference confusion
+re.compile(r'\+0') #$ escapedCharacter=0:2
--- a/python/ql/test/library-tests/regex/test.py
+++ b/python/ql/test/library-tests/regex/test.py
@@ -70,3 +70,6 @@ re.compile("", re.M) # ODASA-8056
 # FP reported in https://github.com/github/codeql/issues/3712
 # This does not define a regex (but could be used by other code to do so)
 escaped = re.escape("https://www.humblebundle.com/home/library")
+
+# Consistency check
+baz = re.compile(r'\+0')
--- a/python/ql/test/library-tests/regexparser/Consistency.expected
+++ b/python/ql/test/library-tests/regexparser/Consistency.expected
--- a/python/ql/test/library-tests/regexparser/Consistency.ql
+++ b/python/ql/test/library-tests/regexparser/Consistency.ql
@@ -0,0 +1,15 @@
+/**
+ * Flags regular expressions that are parsed ambiguously
+ */
+
+import python
+import semmle.python.RegexTreeView
+
+from string str, int counter, Location loc
+where
+  counter =
+    strictcount(RegExpTerm term |
+      term.getLocation() = loc and term.isRootTerm() and term.toString() = str
+    ) and
+  counter > 1
+select str, counter, loc
--- a/python/ql/test/library-tests/regexparser/KnownCVEs.py
+++ b/python/ql/test/library-tests/regexparser/KnownCVEs.py
@@ -0,0 +1,94 @@
+import re
+
+# linear
+# https://github.com/github/codeql-python-CVE-coverage/issues/439
+rex_blame = re.compile(r'\s*(\d+)\s*(\S+) (.*)')
+
+# https://github.com/github/codeql-python-CVE-coverage/issues/402
+whitespace = br"[\000\011\012\014\015\040]"
+whitespace_optional = whitespace + b"*"
+newline_only = br"[\r\n]+"
+newline = whitespace_optional + newline_only + whitespace_optional
+toFlag = re.compile(newline)
+
+# https://github.com/github/codeql-python-CVE-coverage/issues/400
+re.compile(r'[+-]?(\d+)*\.\d+%?')
+re.compile(r'"""\s+(?:.|\n)*?\s+"""')
+re.compile(r'(\{\s+)(\S+)(\s+[^}]+\s+\}\s)')
+re.compile(r'".*``.*``.*"')
+re.compile(r'(\s*)(?:(.+)(\s*)(=)(\s*))?(.+)(\()(.*)(\))(\s*)')
+re.compile(r'(%config)(\s*\(\s*)(\w+)(\s*=\s*)(.*?)(\s*\)\s*)')
+re.compile(r'(%new)(\s*)(\()(\s*.*?\s*)(\))')
+re.compile(r'(\$)(evoque|overlay)(\{(%)?)(\s*[#\w\-"\'.]+[^=,%}]+?)?')
+re.compile(r'(\.\w+\b)(\s*=\s*)([^;]*)(\s*;)')
+
+# linear
+# https://github.com/github/codeql-python-CVE-coverage/issues/392
+simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$")
+
+# https://github.com/github/codeql-python-CVE-coverage/issues/249
+rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
+                     'realm=(["\']?)([^"\']*)\\2', re.I)
+
+# https://github.com/github/codeql-python-CVE-coverage/issues/248
+gauntlet = re.compile(
+            r"""^([-/:,#%.'"\s!\w]|\w-\w|'[\s\w]+'\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""",
+            flags=re.U
+        )
+
+# https://github.com/github/codeql-python-CVE-coverage/issues/227
+# from .compat import tobytes
+
+WS = "[ \t]"
+OWS = WS + "{0,}?"
+
+# RFC 7230 Section 3.2.6 "Field Value Components":
+# tchar          = "!" / "#" / "$" / "%" / "&" / "'" / "*"
+#                / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"
+#                / DIGIT / ALPHA
+# obs-text      = %x80-FF
+TCHAR = r"[!#$%&'*+\-.^_`|~0-9A-Za-z]"
+OBS_TEXT = r"\x80-\xff"
+TOKEN = TCHAR + "{1,}"
+# RFC 5234 Appendix B.1 "Core Rules":
+# VCHAR         =  %x21-7E
+#                  ; visible (printing) characters
+VCHAR = r"\x21-\x7e"
+# header-field   = field-name ":" OWS field-value OWS
+# field-name     = token
+# field-value    = *( field-content / obs-fold )
+# field-content  = field-vchar [ 1*( SP / HTAB ) field-vchar ]
+# field-vchar    = VCHAR / obs-text
+# Errata from: https://www.rfc-editor.org/errata_search.php?rfc=7230&eid=4189
+# changes field-content to:
+#
+# field-content  = field-vchar [ 1*( SP / HTAB / field-vchar )
+#                  field-vchar ]
+
+FIELD_VCHAR = "[" + VCHAR + OBS_TEXT + "]"
+FIELD_CONTENT = FIELD_VCHAR + "([ \t" + VCHAR + OBS_TEXT + "]+" + FIELD_VCHAR + "){,1}"
+FIELD_VALUE = "(" + FIELD_CONTENT + "){0,}"
+
+HEADER_FIELD = re.compile(
+    #  tobytes(
+         "^(?P<name>" + TOKEN + "):" + OWS + "(?P<value>" + FIELD_VALUE + ")" + OWS + "$"
+    #  )
+ )
+
+# https://github.com/github/codeql-python-CVE-coverage/issues/224
+pattern = re.compile(
+    r'^(:?(([a-zA-Z]{1})|([a-zA-Z]{1}[a-zA-Z]{1})|'  # domain pt.1
+    r'([a-zA-Z]{1}[0-9]{1})|([0-9]{1}[a-zA-Z]{1})|'  # domain pt.2
+    r'([a-zA-Z0-9][-_a-zA-Z0-9]{0,61}[a-zA-Z0-9]))\.)+'  # domain pt.3
+    r'([a-zA-Z]{2,13}|(xn--[a-zA-Z0-9]{2,30}))$'  # TLD
+)
+
+# https://github.com/github/codeql-python-CVE-coverage/issues/189
+URL_REGEX = (
+     r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|'
+     r'[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|'
+     r'(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|'
+     r'[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'  # "emacs!
+)
+
+url = re.compile(URL_REGEX)
--- a/python/ql/test/library-tests/regexparser/polredos.py
+++ b/python/ql/test/library-tests/regexparser/polredos.py
@@ -0,0 +1,9 @@
+import re
+from flask import Flask, request
+app = Flask(__name__)
+
+@app.route("/poly-redos")
+def code_execution():
+    text = request.args.get("text")
+    re.sub(r"^\s+|\s+$", "", text) # NOT OK
+    re.match(r"^0\.\d+E?\d+$", text) # NOT OK
--- a/python/ql/test/library-tests/regexparser/redos.py
+++ b/python/ql/test/library-tests/regexparser/redos.py
@@ -0,0 +1,376 @@
+# This is currently a copy of the redos test-file, since that one contains many regexes.
+
+import re
+
+# NOT GOOD; attack: "_" + "__".repeat(100)
+# Adapted from marked (https://github.com/markedjs/marked), which is licensed
+# under the MIT license; see file marked-LICENSE.
+bad1 = re.compile(r'''^\b_((?:__|[\s\S])+?)_\b|^\*((?:\*\*|[\s\S])+?)\*(?!\*)''')
+
+# GOOD
+# Adapted from marked (https://github.com/markedjs/marked), which is licensed
+# under the MIT license; see file marked-LICENSE.
+good1 = re.compile(r'^\b_((?:__|[^_])+?)_\b|^\*((?:\*\*|[^*])+?)\*(?!\*)')
+
+# GOOD - there is no witness in the end that could cause the regexp to not match
+# Adapted from brace-expansion (https://github.com/juliangruber/brace-expansion),
+# which is licensed under the MIT license; see file brace-expansion-LICENSE.
+good2 = re.compile(r'(.*,)+.+')
+
+# NOT GOOD; attack: " '" + "\\\\".repeat(100)
+# Adapted from CodeMirror (https://github.com/codemirror/codemirror),
+# which is licensed under the MIT license; see file CodeMirror-LICENSE.
+bad2 = re.compile(r'''^(?:\s+(?:"(?:[^"\\]|\\\\|\\.)+"|'(?:[^'\\]|\\\\|\\.)+'|\((?:[^)\\]|\\\\|\\.)+\)))?''')
+
+# GOOD
+# Adapted from lulucms2 (https://github.com/yiifans/lulucms2).
+good2 = re.compile(r'''\(\*(?:[\s\S]*?\(\*[\s\S]*?\*\))*[\s\S]*?\*\)''')
+
+# GOOD
+# Adapted from jest (https://github.com/facebook/jest), which is licensed
+# under the MIT license; see file jest-LICENSE.
+good3 = re.compile(r'''^ *(\S.*\|.*)\n *([-:]+ *\|[-| :]*)\n((?:.*\|.*(?:\n|$))*)\n*''')
+
+# NOT GOOD, variant of good3; attack: "a|\n:|\n" + "||\n".repeat(100)
+bad4 = re.compile(r'''^ *(\S.*\|.*)\n *([-:]+ *\|[-| :]*)\n((?:.*\|.*(?:\n|$))*)a''')
+
+# NOT GOOD; attack: "/" + "\\/a".repeat(100)
+# Adapted from ANodeBlog (https://github.com/gefangshuai/ANodeBlog),
+# which is licensed under the Apache License 2.0; see file ANodeBlog-LICENSE.
+bad5 = re.compile(r'''\/(?![ *])(\\\/|.)*?\/[gim]*(?=\W|$)''')
+
+# NOT GOOD; attack: "##".repeat(100) + "\na"
+# Adapted from CodeMirror (https://github.com/codemirror/codemirror),
+# which is licensed under the MIT license; see file CodeMirror-LICENSE.
+bad6 = re.compile(r'''^([\s\[\{\(]|#.*)*$''')
+
+# GOOD
+good4 = re.compile(r'''(\r\n|\r|\n)+''')
+
+# BAD - PoC: `node -e "/((?:[^\"\']|\".*?\"|\'.*?\')*?)([(,)]|$)/.test(\"'''''''''''''''''''''''''''''''''''''''''''''\\\"\");"`. It's complicated though, because the regexp still matches something, it just matches the empty-string after the attack string.
+actuallyBad = re.compile(r'''((?:[^"']|".*?"|'.*?')*?)([(,)]|$)''')
+
+# NOT GOOD; attack: "a" + "[]".repeat(100) + ".b\n"
+# Adapted from Knockout (https://github.com/knockout/knockout), which is
+# licensed under the MIT license; see file knockout-LICENSE
+bad6 = re.compile(r'''^[\_$a-z][\_$a-z0-9]*(\[.*?\])*(\.[\_$a-z][\_$a-z0-9]*(\[.*?\])*)*$''')
+
+# GOOD
+good6 = re.compile(r'''(a|.)*''')
+
+# Testing the NFA - only some of the below are detected.
+bad7 = re.compile(r'''^([a-z]+)+$''')
+bad8 = re.compile(r'''^([a-z]*)*$''')
+bad9 = re.compile(r'''^([a-zA-Z0-9])(([\\-.]|[_]+)?([a-zA-Z0-9]+))*(@){1}[a-z0-9]+[.]{1}(([a-z]{2,3})|([a-z]{2,3}[.]{1}[a-z]{2,3}))$''')
+bad10 = re.compile(r'''^(([a-z])+.)+[A-Z]([a-z])+$''')
+
+# NOT GOOD; attack: "[" + "][".repeat(100) + "]!"
+# Adapted from Prototype.js (https://github.com/prototypejs/prototype), which
+# is licensed under the MIT license; see file Prototype.js-LICENSE.
+bad11 = re.compile(r'''(([\w#:.~>+()\s-]+|\*|\[.*?\])+)\s*(,|$)''')
+
+# NOT GOOD; attack: "'" + "\\a".repeat(100) + '"'
+# Adapted from Prism (https://github.com/PrismJS/prism), which is licensed
+# under the MIT license; see file Prism-LICENSE.
+bad12 = re.compile(r'''("|')(\\?.)*?\1''')
+
+# NOT GOOD
+bad13 = re.compile(r'''(b|a?b)*c''')
+
+# NOT GOOD
+bad15 = re.compile(r'''(a|aa?)*b''')
+
+# GOOD
+good7 = re.compile(r'''(.|\n)*!''')
+
+# NOT GOOD; attack: "\n".repeat(100) + "."
+bad16 = re.compile(r'''(.|\n)*!''')
+
+# GOOD
+good8 = re.compile(r'''([\w.]+)*''')
+
+# NOT GOOD
+bad17 = re.compile(r'''(a|aa?)*b''')
+
+# GOOD - not used as regexp
+good9 = '(a|aa?)*b'
+
+# NOT GOOD
+bad18 = re.compile(r'''(([\s\S]|[^a])*)"''')
+
+# GOOD - there is no witness in the end that could cause the regexp to not match
+good10 = re.compile(r'''([^"']+)*''')
+
+# NOT GOOD
+bad20 = re.compile(r'''((.|[^a])*)"''')
+
+# GOOD
+good10 = re.compile(r'''((a|[^a])*)"''')
+
+# NOT GOOD
+bad21 = re.compile(r'''((b|[^a])*)"''')
+
+# NOT GOOD
+bad22 = re.compile(r'''((G|[^a])*)"''')
+
+# NOT GOOD
+bad23 = re.compile(r'''(([0-9]|[^a])*)"''')
+
+# NOT GOOD
+bad24 = re.compile(r'''(?:=(?:([!#\$%&'\*\+\-\.\^_`\|~0-9A-Za-z]+)|"((?:\\[\x00-\x7f]|[^\x00-\x08\x0a-\x1f\x7f"])*)"))?''')
+
+# NOT GOOD
+bad25 = re.compile(r'''"((?:\\[\x00-\x7f]|[^\x00-\x08\x0a-\x1f\x7f"])*)"''')
+
+# GOOD
+bad26 = re.compile(r'''"((?:\\[\x00-\x7f]|[^\x00-\x08\x0a-\x1f\x7f"\\])*)"''')
+
+# NOT GOOD
+bad27 = re.compile(r'''(([a-z]|[d-h])*)"''')
+
+# NOT GOOD
+bad27 = re.compile(r'''(([^a-z]|[^0-9])*)"''')
+
+# NOT GOOD
+bad28 = re.compile(r'''((\d|[0-9])*)"''')
+
+# NOT GOOD
+bad29 = re.compile(r'''((\s|\s)*)"''')
+
+# NOT GOOD
+bad30 = re.compile(r'''((\w|G)*)"''')
+
+# GOOD
+good11 = re.compile(r'''((\s|\d)*)"''')
+
+# NOT GOOD
+bad31 = re.compile(r'''((\d|\w)*)"''')
+
+# NOT GOOD
+bad32 = re.compile(r'''((\d|5)*)"''')
+
+# NOT GOOD
+bad33 = re.compile(r'''((\s|[\f])*)"''')
+
+# NOT GOOD
+bad34 = re.compile(r'''((\s|[\v]|\\v)*)"''')
+
+# NOT GOOD
+bad35 = re.compile(r'''((\f|[\f])*)"''')
+
+# NOT GOOD
+bad36 = re.compile(r'''((\W|\D)*)"''')
+
+# NOT GOOD
+bad37 = re.compile(r'''((\S|\w)*)"''')
+
+# NOT GOOD
+bad38 = re.compile(r'''((\S|[\w])*)"''')
+
+# NOT GOOD
+bad39 = re.compile(r'''((1s|[\da-z])*)"''')
+
+# NOT GOOD
+bad40 = re.compile(r'''((0|[\d])*)"''')
+
+# NOT GOOD
+bad41 = re.compile(r'''(([\d]+)*)"''')
+
+# GOOD - there is no witness in the end that could cause the regexp to not match
+good12 = re.compile(r'''(\d+(X\d+)?)+''')
+
+# GOOD - there is no witness in the end that could cause the regexp to not match
+good13 = re.compile(r'''([0-9]+(X[0-9]*)?)*''')
+
+# GOOD
+good15 = re.compile(r'''^([^>]+)*(>|$)''')
+
+# NOT GOOD
+bad43 = re.compile(r'''^([^>a]+)*(>|$)''')
+
+# NOT GOOD
+bad44 = re.compile(r'''(\n\s*)+$''')
+
+# NOT GOOD
+bad45 = re.compile(r'''^(?:\s+|#.*|\(\?#[^)]*\))*(?:[?*+]|{\d+(?:,\d*)?})''')
+
+# NOT GOOD
+bad46 = re.compile(r'''\{\[\s*([a-zA-Z]+)\(([a-zA-Z]+)\)((\s*([a-zA-Z]+)\: ?([ a-zA-Z{}]+),?)+)*\s*\]\}''')
+
+# NOT GOOD
+bad47 = re.compile(r'''(a+|b+|c+)*c''')
+
+# NOT GOOD
+bad48 = re.compile(r'''(((a+a?)*)+b+)''')
+
+# NOT GOOD
+bad49 = re.compile(r'''(a+)+bbbb''')
+
+# GOOD
+good16 = re.compile(r'''(a+)+aaaaa*a+''')
+
+# NOT GOOD
+bad50 = re.compile(r'''(a+)+aaaaa$''')
+
+# GOOD
+good17 = re.compile(r'''(\n+)+\n\n''')
+
+# NOT GOOD
+bad51 = re.compile(r'''(\n+)+\n\n$''')
+
+# NOT GOOD
+bad52 = re.compile(r'''([^X]+)*$''')
+
+# NOT GOOD
+bad53 = re.compile(r'''(([^X]b)+)*$''')
+
+# GOOD
+good18 = re.compile(r'''(([^X]b)+)*($|[^X]b)''')
+
+# NOT GOOD
+bad54 = re.compile(r'''(([^X]b)+)*($|[^X]c)''')
+
+# GOOD
+good20 = re.compile(r'''((ab)+)*ababab''')
+
+# GOOD
+good21 = re.compile(r'''((ab)+)*abab(ab)*(ab)+''')
+
+# GOOD
+good22 = re.compile(r'''((ab)+)*''')
+
+# NOT GOOD
+bad55 = re.compile(r'''((ab)+)*$''')
+
+# GOOD
+good23 = re.compile(r'''((ab)+)*[a1][b1][a2][b2][a3][b3]''')
+
+# NOT GOOD
+bad56 = re.compile(r'''([\n\s]+)*(.)''')
+
+# GOOD - any witness passes through the accept state.
+good24 = re.compile(r'''(A*A*X)*''')
+
+# GOOD
+good26 = re.compile(r'''([^\\\]]+)*''')
+
+# NOT GOOD
+bad59 = re.compile(r'''(\w*foobarbaz\w*foobarbaz\w*foobarbaz\w*foobarbaz\s*foobarbaz\d*foobarbaz\w*)+-''')
+
+# NOT GOOD
+bad60 = re.compile(r'''(.thisisagoddamnlongstringforstresstestingthequery|\sthisisagoddamnlongstringforstresstestingthequery)*-''')
+
+# NOT GOOD
+bad61 = re.compile(r'''(thisisagoddamnlongstringforstresstestingthequery|this\w+query)*-''')
+
+# GOOD
+good27 = re.compile(r'''(thisisagoddamnlongstringforstresstestingthequery|imanotherbutunrelatedstringcomparedtotheotherstring)*-''')
+
+# GOOD
+good28 = re.compile(r'''foo([\uDC66\uDC67]|[\uDC68\uDC69])*foo''')
+
+# GOOD
+good29 = re.compile(r'''foo((\uDC66|\uDC67)|(\uDC68|\uDC69))*foo''')
+
+# NOT GOOD (but cannot currently construct a prefix)
+bad62 = re.compile(r'''a{2,3}(b+)+X''')
+
+# NOT GOOD (and a good prefix test)
+bad63 = re.compile(r'''^<(\w+)((?:\s+\w+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>''')
+
+# GOOD
+good30 = re.compile(r'''(a+)*[\s\S][\s\S][\s\S]?''')
+
+# GOOD - but we fail to see that repeating the attack string ends in the "accept any" state (due to not parsing the range `[\s\S]{2,3}`).
+good31 = re.compile(r'''(a+)*[\s\S]{2,3}''')
+
+# GOOD - but we spuriously conclude that a rejecting suffix exists (due to not parsing the range `[\s\S]{2,}` when constructing the NFA).
+good32 = re.compile(r'''(a+)*([\s\S]{2,}|X)$''')
+
+# GOOD
+good33 = re.compile(r'''(a+)*([\s\S]*|X)$''')
+
+# NOT GOOD
+bad64 = re.compile(r'''((a+)*$|[\s\S]+)''')
+
+# GOOD - but still flagged. The only change compared to the above is the order of alternatives, which we don't model.
+good34 = re.compile(r'''([\s\S]+|(a+)*$)''')
+
+# GOOD
+good35 = re.compile(r'''((;|^)a+)+$''')
+
+# NOT GOOD (a good prefix test)
+bad65 = re.compile(r'''(^|;)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(e+)+f''')
+
+# NOT GOOD
+bad66 = re.compile(r'''^ab(c+)+$''')
+
+# NOT GOOD
+bad67 = re.compile(r'''(\d(\s+)*){20}''')
+
+# GOOD - but we spuriously conclude that a rejecting suffix exists.
+good36 = re.compile(r'''(([^/]|X)+)(\/[\s\S]*)*$''')
+
+# GOOD - but we spuriously conclude that a rejecting suffix exists.
+good37 = re.compile(r'''^((x([^Y]+)?)*(Y|$))''')
+
+# NOT GOOD
+bad68 = re.compile(r'''(a*)+b''')
+
+# NOT GOOD
+bad69 = re.compile(r'''foo([\w-]*)+bar''')
+
+# NOT GOOD
+bad70 = re.compile(r'''((ab)*)+c''')
+
+# NOT GOOD
+bad71 = re.compile(r'''(a?a?)*b''')
+
+# GOOD
+good38 = re.compile(r'''(a?)*b''')
+
+# NOT GOOD - but not detected
+bad72 = re.compile(r'''(c?a?)*b''')
+
+# NOT GOOD
+bad73 = re.compile(r'''(?:a|a?)+b''')
+
+# NOT GOOD - but not detected.
+bad74 = re.compile(r'''(a?b?)*$''')
+
+# NOT GOOD
+bad76 = re.compile(r'''PRE(([a-c]|[c-d])T(e?e?e?e?|X))+(cTcT|cTXcTX$)''')
+
+# NOT GOOD - but not detected
+bad77 = re.compile(r'''^((a)+\w)+$''')
+
+# NOT GOOD
+bad78 = re.compile(r'''^(b+.)+$''')
+
+# GOOD
+good39 = re.compile(r'''a*b''')
+
+# All 4 bad combinations of nested * and +
+bad79 = re.compile(r'''(a*)*b''')
+bad80 = re.compile(r'''(a+)*b''')
+bad81 = re.compile(r'''(a*)+b''')
+bad82 = re.compile(r'''(a+)+b''')
+
+# GOOD
+good40 = re.compile(r'''(a|b)+''')
+good41 = re.compile(r'''(?:[\s;,"'<>(){}|[\]@=+*]|:(?![/\\]))+''') # parses wrongly, sees column 42 as a char set start
+
+# NOT GOOD
+bad83 = re.compile(r'''^((?:a{|-)|\w\{)+X$''')
+bad84 = re.compile(r'''^((?:a{0|-)|\w\{\d)+X$''')
+bad85 = re.compile(r'''^((?:a{0,|-)|\w\{\d,)+X$''')
+bad86 = re.compile(r'''^((?:a{0,2|-)|\w\{\d,\d)+X$''')
+
+# GOOD:
+good42 = re.compile(r'''^((?:a{0,2}|-)|\w\{\d,\d\})+X$''')
+
+# NOT GOOD
+bad87 = re.compile(r'X(\u0061|a)*Y')
+
+# GOOD
+good43 = re.compile(r'X(\u0061|b)+Y')
--- a/python/ql/test/library-tests/regexparser/unittests.py
+++ b/python/ql/test/library-tests/regexparser/unittests.py
@@ -0,0 +1,20 @@
+import re
+
+# Treatment of escapes
+re.compile(r"X([^\.]|\.)*$") # No ReDoS.
+re.compile(r"X(Æ|\Æ)+$") # Has ReDoS.
+
+# Treatment of line breaks
+re.compile(r'(?:.|\n)*b') # No ReDoS.
+re.compile(r'(?:.|\n)*b', re.DOTALL) # Has ReDoS.
+
+# minimal example constructed by @erik-krogh
+baz = re.compile(r'\+0')
+
+# excerpts from LGTM.com
+re.compile(r'\+0x')
+re.compile(r'\+0x.*')
+re.compile(r'+\-0+\.')
+re.compile('\s+\+0x[0-9]+')
+re.compile(r'\+0000 .*')
+re.compile('\#[0-9]+ 0x[0-9]')