Merge branch 'main' into promote-sqlalchemy

This commit is contained in:
Rasmus Wriedt Larsen
2021-09-21 09:36:07 +02:00
1290 changed files with 83375 additions and 11187 deletions

View File

@@ -1,7 +1,5 @@
| code/h_classes.py:3:1:3:16 | ControlFlowNode for ClassExpr | code/h_classes.py:10:1:10:9 | ControlFlowNode for type() |
| code/h_classes.py:3:1:3:16 | ControlFlowNode for ClassExpr | code/h_classes.py:15:5:15:13 | ControlFlowNode for type() |
| code/l_calls.py:3:13:3:14 | ControlFlowNode for List | code/l_calls.py:4:12:4:12 | ControlFlowNode for x |
| code/l_calls.py:6:13:6:14 | ControlFlowNode for List | code/l_calls.py:7:16:7:16 | ControlFlowNode for x |
| code/l_calls.py:12:1:12:20 | ControlFlowNode for ClassExpr | code/l_calls.py:16:16:16:18 | ControlFlowNode for cls |
| code/l_calls.py:12:1:12:20 | ControlFlowNode for ClassExpr | code/l_calls.py:24:13:24:22 | ControlFlowNode for Attribute() |
| code/l_calls.py:12:1:12:20 | ControlFlowNode for ClassExpr | code/l_calls.py:25:16:25:16 | ControlFlowNode for a |

View File

@@ -55,10 +55,10 @@ async def test_taint(request: web.Request): # $ requestHandler
await request.content.readline(), # $ tainted
await request.content.readchunk(), # $ tainted
(await request.content.readchunk())[0], # $ tainted
[line async for line in request.content], # $ MISSING: tainted
[data async for data in request.content.iter_chunked(1024)], # $ MISSING: tainted
[data async for data in request.content.iter_any()], # $ MISSING: tainted
[data async for data, _ in request.content.iter_chunks()], # $ MISSING: tainted
[line async for line in request.content], # $ tainted
[data async for data in request.content.iter_chunked(1024)], # $ tainted
[data async for data in request.content.iter_any()], # $ tainted
[data async for data, _ in request.content.iter_chunks()], # $ tainted
request.content.read_nowait(), # $ tainted
# aiohttp.StreamReader

View File

@@ -11,6 +11,9 @@ def test_taint(request: HttpRequest, foo, bar, baz=None): # $requestHandler rou
# Manually inspected all fields of the HttpRequest object
# https://docs.djangoproject.com/en/3.0/ref/request-response/#httprequest-objects
import django.urls
django.urls.ResolverMatch
ensure_tainted(
request, # $ tainted
@@ -35,8 +38,8 @@ def test_taint(request: HttpRequest, foo, bar, baz=None): # $requestHandler rou
request.GET, # $ tainted
request.GET["key"], # $ tainted
request.GET.get("key"), # $ tainted
request.GET.getlist("key"), # $ MISSING: tainted
request.GET.getlist("key")[0], # $ MISSING: tainted
request.GET.getlist("key"), # $ tainted
request.GET.getlist("key")[0], # $ tainted
request.GET.pop("key"), # $ tainted
request.GET.pop("key")[0], # $ tainted
# key
@@ -45,9 +48,10 @@ def test_taint(request: HttpRequest, foo, bar, baz=None): # $requestHandler rou
request.GET.popitem()[1], # $ tainted
# values[0]
request.GET.popitem()[1][0], # $ tainted
request.GET.dict(), # $ MISSING: tainted
request.GET.dict()["key"], # $ MISSING: tainted
request.GET.urlencode(), # $ MISSING: tainted
request.GET.lists(), # $ tainted
request.GET.dict(), # $ tainted
request.GET.dict()["key"], # $ tainted
request.GET.urlencode(), # $ tainted
# django.http.QueryDict (same as above, did not duplicate tests)
request.POST, # $ tainted
@@ -60,22 +64,23 @@ def test_taint(request: HttpRequest, foo, bar, baz=None): # $requestHandler rou
# MultiValueDict[str, UploadedFile]
request.FILES, # $ tainted
request.FILES["key"], # $ tainted
request.FILES["key"].content_type, # $ MISSING: tainted
request.FILES["key"].content_type_extra, # $ MISSING: tainted
request.FILES["key"].content_type_extra["key"], # $ MISSING: tainted
request.FILES["key"].charset, # $ MISSING: tainted
request.FILES["key"].name, # $ MISSING: tainted
request.FILES["key"].file, # $ MISSING: tainted
request.FILES["key"].file.read(), # $ MISSING: tainted
request.FILES["key"].content_type, # $ tainted
request.FILES["key"].content_type_extra, # $ tainted
request.FILES["key"].content_type_extra["key"], # $ tainted
request.FILES["key"].charset, # $ tainted
request.FILES["key"].name, # $ tainted
request.FILES["key"].file, # $ tainted
request.FILES["key"].file.read(), # $ tainted
request.FILES.get("key"), # $ tainted
request.FILES.get("key").name, # $ MISSING: tainted
request.FILES.getlist("key"), # $ MISSING: tainted
request.FILES.getlist("key")[0], # $ MISSING: tainted
request.FILES.getlist("key")[0].name, # $ MISSING: tainted
request.FILES.dict(), # $ MISSING: tainted
request.FILES.dict()["key"], # $ MISSING: tainted
request.FILES.dict()["key"].name, # $ MISSING: tainted
request.FILES.get("key").name, # $ tainted
request.FILES.getlist("key"), # $ tainted
request.FILES.getlist("key")[0], # $ tainted
request.FILES.getlist("key")[0].name, # $ tainted
request.FILES.dict(), # $ tainted
request.FILES.dict()["key"], # $ tainted
request.FILES.dict()["key"].name, # $ tainted
request.FILES.dict().get("key").name, # $ tainted
# Dict[str, Any]
request.META, # $ tainted
@@ -89,21 +94,21 @@ def test_taint(request: HttpRequest, foo, bar, baz=None): # $requestHandler rou
# django.urls.ResolverMatch
request.resolver_match, # $ tainted
request.resolver_match.args, # $ MISSING: tainted
request.resolver_match.args[0], # $ MISSING: tainted
request.resolver_match.kwargs, # $ MISSING: tainted
request.resolver_match.kwargs["key"], # $ MISSING: tainted
request.resolver_match.args, # $ tainted
request.resolver_match.args[0], # $ tainted
request.resolver_match.kwargs, # $ tainted
request.resolver_match.kwargs["key"], # $ tainted
request.get_full_path(), # $ MISSING: tainted
request.get_full_path_info(), # $ MISSING: tainted
request.get_full_path(), # $ tainted
request.get_full_path_info(), # $ tainted
# build_absolute_uri handled below
# get_signed_cookie handled below
request.read(), # $ MISSING: tainted
request.readline(), # $ MISSING: tainted
request.readlines(), # $ MISSING: tainted
request.readlines()[0], # $ MISSING: tainted
[line for line in request], # $ MISSING: tainted
request.read(), # $ tainted
request.readline(), # $ tainted
request.readlines(), # $ tainted
request.readlines()[0], # $ tainted
[line for line in request], # $ tainted
)
# django.urls.ResolverMatch also supports iterable unpacking
@@ -129,9 +134,9 @@ def test_taint(request: HttpRequest, foo, bar, baz=None): # $requestHandler rou
# build_absolute_uri
####################################
ensure_tainted(
request.build_absolute_uri(), # $ MISSING: tainted
request.build_absolute_uri(request.GET["key"]), # $ MISSING: tainted
request.build_absolute_uri(location=request.GET["key"]), # $ MISSING: tainted
request.build_absolute_uri(), # $ tainted
request.build_absolute_uri(request.GET["key"]), # $ tainted
request.build_absolute_uri(location=request.GET["key"]), # $ tainted
)
ensure_not_tainted(
request.build_absolute_uri("/hardcoded/"),

View File

@@ -0,0 +1,6 @@
from flask import Flask, request
app = Flask(__name__)
@app.route("/save-uploaded-file") # $routeSetup="/save-uploaded-file"
def test_taint(): # $requestHandler
request.files['key'].save("path") # $ getAPathArgument="path"

View File

@@ -44,7 +44,16 @@ def test_taint(name = "World!", number="0", foo="foo"): # $requestHandler route
# werkzeug.datastructures.Authorization (a dict, with some properties)
request.authorization, # $ tainted
request.authorization['username'], # $ tainted
request.authorization.username, # $ MISSING: tainted
request.authorization.username, # $ tainted
request.authorization.password, # $ tainted
request.authorization.realm, # $ tainted
request.authorization.nonce, # $ tainted
request.authorization.uri, # $ tainted
request.authorization.nc, # $ tainted
request.authorization.cnonce, # $ tainted
request.authorization.response, # $ tainted
request.authorization.opaque, # $ tainted
request.authorization.qop, # $ tainted
# werkzeug.datastructures.RequestCacheControl
request.cache_control, # $ tainted
@@ -68,14 +77,16 @@ def test_taint(name = "World!", number="0", foo="foo"): # $requestHandler route
# a werkzeug.datastructures.MultiDict, mapping [str, werkzeug.datastructures.FileStorage]
request.files, # $ tainted
request.files['key'], # $ tainted
request.files['key'].filename, # $ MISSING: tainted
request.files['key'].stream, # $ MISSING: tainted
request.files['key'].filename, # $ tainted
request.files['key'].stream, # $ tainted
request.files['key'].read(), # $ tainted
request.files['key'].stream.read(), # $ tainted
request.files.get('key'), # $ tainted
request.files.get('key').filename, # $ MISSING: tainted
request.files.get('key').stream, # $ MISSING: tainted
request.files.get('key').filename, # $ tainted
request.files.get('key').stream, # $ tainted
request.files.getlist('key'), # $ tainted
request.files.getlist('key')[0].filename, # $ MISSING: tainted
request.files.getlist('key')[0].stream, # $ MISSING: tainted
request.files.getlist('key')[0].filename, # $ tainted
request.files.getlist('key')[0].stream, # $ tainted
# By default werkzeug.datastructures.ImmutableMultiDict -- although can be changed :\
request.form, # $ tainted
@@ -94,11 +105,15 @@ def test_taint(name = "World!", number="0", foo="foo"): # $requestHandler route
request.headers, # $ tainted
request.headers['key'], # $ tainted
request.headers.get('key'), # $ tainted
request.headers.get_all('key'), # $ MISSING: tainted
request.headers.getlist('key'), # $ MISSING: tainted
request.headers.get_all('key'), # $ tainted
request.headers.getlist('key'), # $ tainted
# popitem returns `(key, value)`
request.headers.popitem(), # $ tainted
request.headers.popitem()[0], # $ tainted
request.headers.popitem()[1], # $ tainted
# two ways to get (k, v) lists
list(request.headers), # $ tainted
request.headers.to_wsgi_list(), # $ MISSING: tainted
request.headers.to_wsgi_list(), # $ tainted
request.json, # $ tainted
request.json['foo'], # $ tainted

View File

@@ -26,6 +26,7 @@ nodes
| test.py:90:11:90:14 | ControlFlowNode for bm() | semmle.label | ControlFlowNode for bm() |
| test.py:91:10:91:12 | ControlFlowNode for val | semmle.label | ControlFlowNode for val |
| test.py:107:11:107:18 | ControlFlowNode for source() | semmle.label | ControlFlowNode for source() |
subpaths
#select
| test.py:22:10:22:24 | ControlFlowNode for Attribute() | test.py:21:11:21:18 | ControlFlowNode for source() | test.py:22:10:22:24 | ControlFlowNode for Attribute() | test flow (naive): test_simple |
| test.py:33:10:33:12 | ControlFlowNode for val | test.py:29:11:29:18 | ControlFlowNode for source() | test.py:33:10:33:12 | ControlFlowNode for val | test flow (naive): test_alias |

View File

@@ -66,6 +66,7 @@ nodes
| test.py:103:46:103:47 | ControlFlowNode for bm | semmle.label | ControlFlowNode for bm |
| test.py:107:11:107:18 | ControlFlowNode for source() | semmle.label | ControlFlowNode for source() |
| test.py:108:46:108:58 | ControlFlowNode for Attribute | semmle.label | ControlFlowNode for Attribute |
subpaths
#select
| test.py:22:10:22:24 | ControlFlowNode for Attribute() | test.py:21:11:21:18 | ControlFlowNode for source() | test.py:22:10:22:24 | ControlFlowNode for Attribute() | test flow (proper): test_simple |
| test.py:33:10:33:12 | ControlFlowNode for val | test.py:29:11:29:18 | ControlFlowNode for source() | test.py:33:10:33:12 | ControlFlowNode for val | test flow (proper): test_alias |

View File

@@ -58,17 +58,17 @@ class MyHandler(BaseHTTPRequestHandler):
self.headers, # $ tainted
self.headers['Foo'], # $ tainted
self.headers.get('Foo'), # $ tainted
self.headers.get_all('Foo'), # $ MISSING: tainted
self.headers.keys(), # $ MISSING: tainted
self.headers.get_all('Foo'), # $ tainted
self.headers.keys(), # $ tainted
self.headers.values(), # $ tainted
self.headers.items(), # $ tainted
self.headers.as_bytes(), # $ MISSING: tainted
self.headers.as_string(), # $ MISSING: tainted
self.headers.as_bytes(), # $ tainted
self.headers.as_string(), # $ tainted
str(self.headers), # $ tainted
bytes(self.headers), # $ tainted
self.rfile, # $ tainted
self.rfile.read(), # $ MISSING: tainted
self.rfile.read(), # $ tainted
)
form = cgi.FieldStorage(

View File

@@ -61,15 +61,16 @@ class TaintTest(tornado.web.RequestHandler):
# dict-like, see https://www.tornadoweb.org/en/stable/httputil.html#tornado.httputil.HTTPHeaders
request.headers, # $ tainted
request.headers["header-name"], # $ tainted
request.headers.get_list("header-name"), # $ MISSING: tainted
request.headers.get_all(), # $ MISSING: tainted
[(k, v) for (k, v) in request.headers.get_all()], # $ MISSING: tainted
request.headers.get_list("header-name"), # $ tainted
request.headers.get_all(), # $ tainted
[(k, v) for (k, v) in request.headers.get_all()], # $ tainted
# Dict[str, http.cookies.Morsel]
request.cookies, # $ tainted
request.cookies["cookie-name"], # $ tainted
request.cookies["cookie-name"].key, # $ MISSING: tainted
request.cookies["cookie-name"].value, # $ MISSING: tainted
request.cookies["cookie-name"].key, # $ tainted
request.cookies["cookie-name"].value, # $ tainted
request.cookies["cookie-name"].coded_value, # $ tainted
)

View File

@@ -19,4 +19,4 @@
| x\| | 0 | 2 | x\| | 0 | 1 | x |
| x\| | 0 | 2 | x\| | 2 | 2 | |
| x\|(?<!\\w)l | 0 | 10 | x\|(?<!\\w)l | 0 | 1 | x |
| x\|(?<!\\w)l | 0 | 10 | x\|(?<!\\w)l | 2 | 10 | (?<!\\w)l |
| x\|(?<!\\w)l | 0 | 10 | x\|(?<!\\w)l | 2 | 10 | (?<!\\w)l |

View File

@@ -52,6 +52,8 @@
| [^A-Z] | 2 | 3 |
| [^A-Z] | 4 | 5 |
| [^]] | 2 | 3 |
| \\+0 | 0 | 2 |
| \\+0 | 2 | 3 |
| \\A[+-]?\\d+ | 0 | 2 |
| \\A[+-]?\\d+ | 3 | 4 |
| \\A[+-]?\\d+ | 4 | 5 |

View File

@@ -0,0 +1,12 @@
/**
 * Flags regular expressions that are parsed ambiguously
*/
import python
import semmle.python.regex
from string str, Location loc, int counter
where
counter = strictcount(Regex term | term.getLocation() = loc and term.getText() = str) and
counter > 1
select str, counter, loc

View File

@@ -42,6 +42,8 @@
| [^A-Z] | last | 0 | 6 |
| [^]] | first | 0 | 4 |
| [^]] | last | 0 | 4 |
| \\+0 | first | 0 | 2 |
| \\+0 | last | 2 | 3 |
| \\A[+-]?\\d+ | first | 0 | 2 |
| \\A[+-]?\\d+ | last | 7 | 9 |
| \\A[+-]?\\d+ | last | 7 | 10 |

View File

@@ -113,6 +113,9 @@
| [^]] | char | 2 | 3 |
| [^]] | char-set | 0 | 4 |
| [^]] | sequence | 0 | 4 |
| \\+0 | char | 0 | 2 |
| \\+0 | char | 2 | 3 |
| \\+0 | sequence | 0 | 3 |
| \\A[+-]?\\d+ | char | 0 | 2 |
| \\A[+-]?\\d+ | char | 3 | 4 |
| \\A[+-]?\\d+ | char | 4 | 5 |

View File

@@ -24,7 +24,8 @@ except re.error:
re.compile(r'[^A-Z]') #$ charRange=2:3-4:5
re.compile(r'[\0-\09]') #$ charRange=1:3-4:7
re.compile(r'[\0-\09]') #$ charRange=1:3-4:6
re.compile(r'[\0-\07]') #$ charRange=1:3-4:7
re.compile(r'[\0123-5]') #$ charRange=5:6-7:8

View File

@@ -10,8 +10,10 @@ re.compile(r'[\---]') #$ escapedCharacter=1:3
re.compile(r'[--\-]') #$ escapedCharacter=3:5
re.compile(r'[\--\-]') #$ escapedCharacter=1:3 escapedCharacter=4:6
re.compile(r'[0\-9-A-Z]') #$ escapedCharacter=2:4
re.compile(r'[\0-\09]') #$ escapedCharacter=1:3 escapedCharacter=4:7
re.compile(r'[\0-\09]') #$ escapedCharacter=1:3 escapedCharacter=4:6
re.compile(r'[\0-\07]') #$ escapedCharacter=1:3 escapedCharacter=4:7
re.compile(r'[\0123-5]') #$ escapedCharacter=1:5
re.compile(r'\1754\1854\17\18\07\08') #$ escapedCharacter=0:4 escapedCharacter=16:19 escapedCharacter=19:21
#ODASA-3985
#Half Surrogate pairs
@@ -21,3 +23,9 @@ re.compile(u'[\U00010000-\U0010ffff]') # not escapes
#Misparsed on LGTM
re.compile(r"\[(?P<txt>[^[]*)\]\((?P<uri>[^)]*)") #$ escapedCharacter=0:2 escapedCharacter=16:18 escapedCharacter=18:20
#Non-raw string
re_blank = re.compile('(\n|\r|\\s)*\n', re.M) #$ escapedCharacter=5:7
#Backreference confusion
re.compile(r'\+0') #$ escapedCharacter=0:2

View File

@@ -70,3 +70,6 @@ re.compile("", re.M) # ODASA-8056
# FP reported in https://github.com/github/codeql/issues/3712
# This does not define a regex (but could be used by other code to do so)
escaped = re.escape("https://www.humblebundle.com/home/library")
# Consistency check
baz = re.compile(r'\+0')

View File

@@ -0,0 +1,15 @@
/**
 * Flags regular expressions that are parsed ambiguously
*/
import python
import semmle.python.RegexTreeView
from string str, int counter, Location loc
where
counter =
strictcount(RegExpTerm term |
term.getLocation() = loc and term.isRootTerm() and term.toString() = str
) and
counter > 1
select str, counter, loc

View File

@@ -0,0 +1,94 @@
import re
# linear
# https://github.com/github/codeql-python-CVE-coverage/issues/439
rex_blame = re.compile(r'\s*(\d+)\s*(\S+) (.*)')
# https://github.com/github/codeql-python-CVE-coverage/issues/402
whitespace = br"[\000\011\012\014\015\040]"
whitespace_optional = whitespace + b"*"
newline_only = br"[\r\n]+"
newline = whitespace_optional + newline_only + whitespace_optional
toFlag = re.compile(newline)
# https://github.com/github/codeql-python-CVE-coverage/issues/400
re.compile(r'[+-]?(\d+)*\.\d+%?')
re.compile(r'"""\s+(?:.|\n)*?\s+"""')
re.compile(r'(\{\s+)(\S+)(\s+[^}]+\s+\}\s)')
re.compile(r'".*``.*``.*"')
re.compile(r'(\s*)(?:(.+)(\s*)(=)(\s*))?(.+)(\()(.*)(\))(\s*)')
re.compile(r'(%config)(\s*\(\s*)(\w+)(\s*=\s*)(.*?)(\s*\)\s*)')
re.compile(r'(%new)(\s*)(\()(\s*.*?\s*)(\))')
re.compile(r'(\$)(evoque|overlay)(\{(%)?)(\s*[#\w\-"\'.]+[^=,%}]+?)?')
re.compile(r'(\.\w+\b)(\s*=\s*)([^;]*)(\s*;)')
# linear
# https://github.com/github/codeql-python-CVE-coverage/issues/392
simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$")
# https://github.com/github/codeql-python-CVE-coverage/issues/249
rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
'realm=(["\']?)([^"\']*)\\2', re.I)
# https://github.com/github/codeql-python-CVE-coverage/issues/248
gauntlet = re.compile(
r"""^([-/:,#%.'"\s!\w]|\w-\w|'[\s\w]+'\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""",
flags=re.U
)
# https://github.com/github/codeql-python-CVE-coverage/issues/227
# from .compat import tobytes
WS = "[ \t]"
OWS = WS + "{0,}?"
# RFC 7230 Section 3.2.6 "Field Value Components":
# tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*"
# / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"
# / DIGIT / ALPHA
# obs-text = %x80-FF
TCHAR = r"[!#$%&'*+\-.^_`|~0-9A-Za-z]"
OBS_TEXT = r"\x80-\xff"
TOKEN = TCHAR + "{1,}"
# RFC 5234 Appendix B.1 "Core Rules":
# VCHAR = %x21-7E
# ; visible (printing) characters
VCHAR = r"\x21-\x7e"
# header-field = field-name ":" OWS field-value OWS
# field-name = token
# field-value = *( field-content / obs-fold )
# field-content = field-vchar [ 1*( SP / HTAB ) field-vchar ]
# field-vchar = VCHAR / obs-text
# Errata from: https://www.rfc-editor.org/errata_search.php?rfc=7230&eid=4189
# changes field-content to:
#
# field-content = field-vchar [ 1*( SP / HTAB / field-vchar )
# field-vchar ]
FIELD_VCHAR = "[" + VCHAR + OBS_TEXT + "]"
FIELD_CONTENT = FIELD_VCHAR + "([ \t" + VCHAR + OBS_TEXT + "]+" + FIELD_VCHAR + "){,1}"
FIELD_VALUE = "(" + FIELD_CONTENT + "){0,}"
HEADER_FIELD = re.compile(
# tobytes(
"^(?P<name>" + TOKEN + "):" + OWS + "(?P<value>" + FIELD_VALUE + ")" + OWS + "$"
# )
)
# https://github.com/github/codeql-python-CVE-coverage/issues/224
pattern = re.compile(
r'^(:?(([a-zA-Z]{1})|([a-zA-Z]{1}[a-zA-Z]{1})|' # domain pt.1
r'([a-zA-Z]{1}[0-9]{1})|([0-9]{1}[a-zA-Z]{1})|' # domain pt.2
r'([a-zA-Z0-9][-_a-zA-Z0-9]{0,61}[a-zA-Z0-9]))\.)+' # domain pt.3
r'([a-zA-Z]{2,13}|(xn--[a-zA-Z0-9]{2,30}))$' # TLD
)
# https://github.com/github/codeql-python-CVE-coverage/issues/189
URL_REGEX = (
r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|'
r'[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|'
r'(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|'
r'[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))' # "emacs!
)
url = re.compile(URL_REGEX)

View File

@@ -0,0 +1,9 @@
import re
from flask import Flask, request
app = Flask(__name__)
@app.route("/poly-redos")
def code_execution():
text = request.args.get("text")
re.sub(r"^\s+|\s+$", "", text) # NOT OK
re.match(r"^0\.\d+E?\d+$", text) # NOT OK

View File

@@ -0,0 +1,376 @@
# This is currently a copy of the redos test-file, since that one contains many regexes.
import re
# NOT GOOD; attack: "_" + "__".repeat(100)
# Adapted from marked (https://github.com/markedjs/marked), which is licensed
# under the MIT license; see file marked-LICENSE.
bad1 = re.compile(r'''^\b_((?:__|[\s\S])+?)_\b|^\*((?:\*\*|[\s\S])+?)\*(?!\*)''')
# GOOD
# Adapted from marked (https://github.com/markedjs/marked), which is licensed
# under the MIT license; see file marked-LICENSE.
good1 = re.compile(r'^\b_((?:__|[^_])+?)_\b|^\*((?:\*\*|[^*])+?)\*(?!\*)')
# GOOD - there is no witness in the end that could cause the regexp to not match
# Adapted from brace-expansion (https://github.com/juliangruber/brace-expansion),
# which is licensed under the MIT license; see file brace-expansion-LICENSE.
good2 = re.compile(r'(.*,)+.+')
# NOT GOOD; attack: " '" + "\\\\".repeat(100)
# Adapted from CodeMirror (https://github.com/codemirror/codemirror),
# which is licensed under the MIT license; see file CodeMirror-LICENSE.
bad2 = re.compile(r'''^(?:\s+(?:"(?:[^"\\]|\\\\|\\.)+"|'(?:[^'\\]|\\\\|\\.)+'|\((?:[^)\\]|\\\\|\\.)+\)))?''')
# GOOD
# Adapted from lulucms2 (https://github.com/yiifans/lulucms2).
good2 = re.compile(r'''\(\*(?:[\s\S]*?\(\*[\s\S]*?\*\))*[\s\S]*?\*\)''')
# GOOD
# Adapted from jest (https://github.com/facebook/jest), which is licensed
# under the MIT license; see file jest-LICENSE.
good3 = re.compile(r'''^ *(\S.*\|.*)\n *([-:]+ *\|[-| :]*)\n((?:.*\|.*(?:\n|$))*)\n*''')
# NOT GOOD, variant of good3; attack: "a|\n:|\n" + "||\n".repeat(100)
bad4 = re.compile(r'''^ *(\S.*\|.*)\n *([-:]+ *\|[-| :]*)\n((?:.*\|.*(?:\n|$))*)a''')
# NOT GOOD; attack: "/" + "\\/a".repeat(100)
# Adapted from ANodeBlog (https://github.com/gefangshuai/ANodeBlog),
# which is licensed under the Apache License 2.0; see file ANodeBlog-LICENSE.
bad5 = re.compile(r'''\/(?![ *])(\\\/|.)*?\/[gim]*(?=\W|$)''')
# NOT GOOD; attack: "##".repeat(100) + "\na"
# Adapted from CodeMirror (https://github.com/codemirror/codemirror),
# which is licensed under the MIT license; see file CodeMirror-LICENSE.
bad6 = re.compile(r'''^([\s\[\{\(]|#.*)*$''')
# GOOD
good4 = re.compile(r'''(\r\n|\r|\n)+''')
# BAD - PoC: `node -e "/((?:[^\"\']|\".*?\"|\'.*?\')*?)([(,)]|$)/.test(\"'''''''''''''''''''''''''''''''''''''''''''''\\\"\");"`. It's complicated though, because the regexp still matches something, it just matches the empty-string after the attack string.
actuallyBad = re.compile(r'''((?:[^"']|".*?"|'.*?')*?)([(,)]|$)''')
# NOT GOOD; attack: "a" + "[]".repeat(100) + ".b\n"
# Adapted from Knockout (https://github.com/knockout/knockout), which is
# licensed under the MIT license; see file knockout-LICENSE
bad6 = re.compile(r'''^[\_$a-z][\_$a-z0-9]*(\[.*?\])*(\.[\_$a-z][\_$a-z0-9]*(\[.*?\])*)*$''')
# GOOD
good6 = re.compile(r'''(a|.)*''')
# Testing the NFA - only some of the below are detected.
bad7 = re.compile(r'''^([a-z]+)+$''')
bad8 = re.compile(r'''^([a-z]*)*$''')
bad9 = re.compile(r'''^([a-zA-Z0-9])(([\\-.]|[_]+)?([a-zA-Z0-9]+))*(@){1}[a-z0-9]+[.]{1}(([a-z]{2,3})|([a-z]{2,3}[.]{1}[a-z]{2,3}))$''')
bad10 = re.compile(r'''^(([a-z])+.)+[A-Z]([a-z])+$''')
# NOT GOOD; attack: "[" + "][".repeat(100) + "]!"
# Adapted from Prototype.js (https://github.com/prototypejs/prototype), which
# is licensed under the MIT license; see file Prototype.js-LICENSE.
bad11 = re.compile(r'''(([\w#:.~>+()\s-]+|\*|\[.*?\])+)\s*(,|$)''')
# NOT GOOD; attack: "'" + "\\a".repeat(100) + '"'
# Adapted from Prism (https://github.com/PrismJS/prism), which is licensed
# under the MIT license; see file Prism-LICENSE.
bad12 = re.compile(r'''("|')(\\?.)*?\1''')
# NOT GOOD
bad13 = re.compile(r'''(b|a?b)*c''')
# NOT GOOD
bad15 = re.compile(r'''(a|aa?)*b''')
# GOOD
good7 = re.compile(r'''(.|\n)*!''')
# NOT GOOD; attack: "\n".repeat(100) + "."
bad16 = re.compile(r'''(.|\n)*!''')
# GOOD
good8 = re.compile(r'''([\w.]+)*''')
# NOT GOOD
bad17 = re.compile(r'''(a|aa?)*b''')
# GOOD - not used as regexp
good9 = '(a|aa?)*b'
# NOT GOOD
bad18 = re.compile(r'''(([\s\S]|[^a])*)"''')
# GOOD - there is no witness in the end that could cause the regexp to not match
good10 = re.compile(r'''([^"']+)*''')
# NOT GOOD
bad20 = re.compile(r'''((.|[^a])*)"''')
# GOOD
good10 = re.compile(r'''((a|[^a])*)"''')
# NOT GOOD
bad21 = re.compile(r'''((b|[^a])*)"''')
# NOT GOOD
bad22 = re.compile(r'''((G|[^a])*)"''')
# NOT GOOD
bad23 = re.compile(r'''(([0-9]|[^a])*)"''')
# NOT GOOD
bad24 = re.compile(r'''(?:=(?:([!#\$%&'\*\+\-\.\^_`\|~0-9A-Za-z]+)|"((?:\\[\x00-\x7f]|[^\x00-\x08\x0a-\x1f\x7f"])*)"))?''')
# NOT GOOD
bad25 = re.compile(r'''"((?:\\[\x00-\x7f]|[^\x00-\x08\x0a-\x1f\x7f"])*)"''')
# GOOD
bad26 = re.compile(r'''"((?:\\[\x00-\x7f]|[^\x00-\x08\x0a-\x1f\x7f"\\])*)"''')
# NOT GOOD
bad27 = re.compile(r'''(([a-z]|[d-h])*)"''')
# NOT GOOD
bad27 = re.compile(r'''(([^a-z]|[^0-9])*)"''')
# NOT GOOD
bad28 = re.compile(r'''((\d|[0-9])*)"''')
# NOT GOOD
bad29 = re.compile(r'''((\s|\s)*)"''')
# NOT GOOD
bad30 = re.compile(r'''((\w|G)*)"''')
# GOOD
good11 = re.compile(r'''((\s|\d)*)"''')
# NOT GOOD
bad31 = re.compile(r'''((\d|\w)*)"''')
# NOT GOOD
bad32 = re.compile(r'''((\d|5)*)"''')
# NOT GOOD
bad33 = re.compile(r'''((\s|[\f])*)"''')
# NOT GOOD
bad34 = re.compile(r'''((\s|[\v]|\\v)*)"''')
# NOT GOOD
bad35 = re.compile(r'''((\f|[\f])*)"''')
# NOT GOOD
bad36 = re.compile(r'''((\W|\D)*)"''')
# NOT GOOD
bad37 = re.compile(r'''((\S|\w)*)"''')
# NOT GOOD
bad38 = re.compile(r'''((\S|[\w])*)"''')
# NOT GOOD
bad39 = re.compile(r'''((1s|[\da-z])*)"''')
# NOT GOOD
bad40 = re.compile(r'''((0|[\d])*)"''')
# NOT GOOD
bad41 = re.compile(r'''(([\d]+)*)"''')
# GOOD - there is no witness in the end that could cause the regexp to not match
good12 = re.compile(r'''(\d+(X\d+)?)+''')
# GOOD - there is no witness in the end that could cause the regexp to not match
good13 = re.compile(r'''([0-9]+(X[0-9]*)?)*''')
# GOOD
good15 = re.compile(r'''^([^>]+)*(>|$)''')
# NOT GOOD
bad43 = re.compile(r'''^([^>a]+)*(>|$)''')
# NOT GOOD
bad44 = re.compile(r'''(\n\s*)+$''')
# NOT GOOD
bad45 = re.compile(r'''^(?:\s+|#.*|\(\?#[^)]*\))*(?:[?*+]|{\d+(?:,\d*)?})''')
# NOT GOOD
bad46 = re.compile(r'''\{\[\s*([a-zA-Z]+)\(([a-zA-Z]+)\)((\s*([a-zA-Z]+)\: ?([ a-zA-Z{}]+),?)+)*\s*\]\}''')
# NOT GOOD
bad47 = re.compile(r'''(a+|b+|c+)*c''')
# NOT GOOD
bad48 = re.compile(r'''(((a+a?)*)+b+)''')
# NOT GOOD
bad49 = re.compile(r'''(a+)+bbbb''')
# GOOD
good16 = re.compile(r'''(a+)+aaaaa*a+''')
# NOT GOOD
bad50 = re.compile(r'''(a+)+aaaaa$''')
# GOOD
good17 = re.compile(r'''(\n+)+\n\n''')
# NOT GOOD
bad51 = re.compile(r'''(\n+)+\n\n$''')
# NOT GOOD
bad52 = re.compile(r'''([^X]+)*$''')
# NOT GOOD
bad53 = re.compile(r'''(([^X]b)+)*$''')
# GOOD
good18 = re.compile(r'''(([^X]b)+)*($|[^X]b)''')
# NOT GOOD
bad54 = re.compile(r'''(([^X]b)+)*($|[^X]c)''')
# GOOD
good20 = re.compile(r'''((ab)+)*ababab''')
# GOOD
good21 = re.compile(r'''((ab)+)*abab(ab)*(ab)+''')
# GOOD
good22 = re.compile(r'''((ab)+)*''')
# NOT GOOD
bad55 = re.compile(r'''((ab)+)*$''')
# GOOD
good23 = re.compile(r'''((ab)+)*[a1][b1][a2][b2][a3][b3]''')
# NOT GOOD
bad56 = re.compile(r'''([\n\s]+)*(.)''')
# GOOD - any witness passes through the accept state.
good24 = re.compile(r'''(A*A*X)*''')
# GOOD
good26 = re.compile(r'''([^\\\]]+)*''')
# NOT GOOD
bad59 = re.compile(r'''(\w*foobarbaz\w*foobarbaz\w*foobarbaz\w*foobarbaz\s*foobarbaz\d*foobarbaz\w*)+-''')
# NOT GOOD
bad60 = re.compile(r'''(.thisisagoddamnlongstringforstresstestingthequery|\sthisisagoddamnlongstringforstresstestingthequery)*-''')
# NOT GOOD
bad61 = re.compile(r'''(thisisagoddamnlongstringforstresstestingthequery|this\w+query)*-''')
# GOOD
good27 = re.compile(r'''(thisisagoddamnlongstringforstresstestingthequery|imanotherbutunrelatedstringcomparedtotheotherstring)*-''')
# GOOD (but false positive caused by the extractor converting all four unpaired surrogates to \uFFFD)
good28 = re.compile('''foo([\uDC66\uDC67]|[\uDC68\uDC69])*foo''')
# GOOD (but false positive caused by the extractor converting all four unpaired surrogates to \uFFFD)
good29 = re.compile('''foo((\uDC66|\uDC67)|(\uDC68|\uDC69))*foo''')
# NOT GOOD (but cannot currently construct a prefix)
bad62 = re.compile(r'''a{2,3}(b+)+X''')
# NOT GOOD (and a good prefix test)
bad63 = re.compile(r'''^<(\w+)((?:\s+\w+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>''')
# GOOD
good30 = re.compile(r'''(a+)*[\s\S][\s\S][\s\S]?''')
# GOOD - but we fail to see that repeating the attack string ends in the "accept any" state (due to not parsing the range `[\s\S]{2,3}`).
good31 = re.compile(r'''(a+)*[\s\S]{2,3}''')
# GOOD - but we spuriously conclude that a rejecting suffix exists (due to not parsing the range `[\s\S]{2,}` when constructing the NFA).
good32 = re.compile(r'''(a+)*([\s\S]{2,}|X)$''')
# GOOD
good33 = re.compile(r'''(a+)*([\s\S]*|X)$''')
# NOT GOOD
bad64 = re.compile(r'''((a+)*$|[\s\S]+)''')
# GOOD - but still flagged. The only change compared to the above is the order of alternatives, which we don't model.
good34 = re.compile(r'''([\s\S]+|(a+)*$)''')
# GOOD
good35 = re.compile(r'''((;|^)a+)+$''')
# NOT GOOD (a good prefix test)
bad65 = re.compile(r'''(^|;)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(e+)+f''')
# NOT GOOD
bad66 = re.compile(r'''^ab(c+)+$''')
# NOT GOOD
bad67 = re.compile(r'''(\d(\s+)*){20}''')
# GOOD - but we spuriously conclude that a rejecting suffix exists.
good36 = re.compile(r'''(([^/]|X)+)(\/[\s\S]*)*$''')
# GOOD - but we spuriously conclude that a rejecting suffix exists.
good37 = re.compile(r'''^((x([^Y]+)?)*(Y|$))''')
# NOT GOOD
bad68 = re.compile(r'''(a*)+b''')
# NOT GOOD
bad69 = re.compile(r'''foo([\w-]*)+bar''')
# NOT GOOD
bad70 = re.compile(r'''((ab)*)+c''')
# NOT GOOD
bad71 = re.compile(r'''(a?a?)*b''')
# GOOD
good38 = re.compile(r'''(a?)*b''')
# NOT GOOD - but not detected
bad72 = re.compile(r'''(c?a?)*b''')
# NOT GOOD
bad73 = re.compile(r'''(?:a|a?)+b''')
# NOT GOOD - but not detected.
bad74 = re.compile(r'''(a?b?)*$''')
# NOT GOOD
bad76 = re.compile(r'''PRE(([a-c]|[c-d])T(e?e?e?e?|X))+(cTcT|cTXcTX$)''')
# NOT GOOD - but not detected
bad77 = re.compile(r'''^((a)+\w)+$''')
# NOT GOOD
bad78 = re.compile(r'''^(b+.)+$''')
# GOOD
good39 = re.compile(r'''a*b''')
# All 4 bad combinations of nested * and +
bad79 = re.compile(r'''(a*)*b''')
bad80 = re.compile(r'''(a+)*b''')
bad81 = re.compile(r'''(a*)+b''')
bad82 = re.compile(r'''(a+)+b''')
# GOOD
good40 = re.compile(r'''(a|b)+''')
good41 = re.compile(r'''(?:[\s;,"'<>(){}|[\]@=+*]|:(?![/\\]))+''') # parses wrongly, sees column 42 as a char set start
# NOT GOOD
bad83 = re.compile(r'''^((?:a{|-)|\w\{)+X$''')
bad84 = re.compile(r'''^((?:a{0|-)|\w\{\d)+X$''')
bad85 = re.compile(r'''^((?:a{0,|-)|\w\{\d,)+X$''')
bad86 = re.compile(r'''^((?:a{0,2|-)|\w\{\d,\d)+X$''')
# GOOD:
good42 = re.compile(r'''^((?:a{0,2}|-)|\w\{\d,\d\})+X$''')
# NOT GOOD
bad87 = re.compile(r'X(\u0061|a)*Y')
# GOOD
good43 = re.compile(r'X(\u0061|b)+Y')

View File

@@ -0,0 +1,20 @@
import re
# Treatment of escapes
re.compile(r"X([^\.]|\.)*$") # No ReDoS.
re.compile(r"X(Æ|\Æ)+$") # Has ReDoS.
# Treatment of line breaks
re.compile(r'(?:.|\n)*b') # No ReDoS.
re.compile(r'(?:.|\n)*b', re.DOTALL) # Has ReDoS.
# minimal example constructed by @erik-krogh
baz = re.compile(r'\+0')
# excerpts from LGTM.com
re.compile(r'\+0x')
re.compile(r'\+0x.*')
re.compile(r'+\-0+\.')
re.compile('\s+\+0x[0-9]+')
re.compile(r'\+0000 .*')
re.compile('\#[0-9]+ 0x[0-9]')