Merge branch 'main' into promote-sqlalchemy

This commit is contained in:
Rasmus Wriedt Larsen
2021-09-21 09:36:07 +02:00
1290 changed files with 83375 additions and 11187 deletions

View File

@@ -1,7 +1,5 @@
| code/h_classes.py:3:1:3:16 | ControlFlowNode for ClassExpr | code/h_classes.py:10:1:10:9 | ControlFlowNode for type() |
| code/h_classes.py:3:1:3:16 | ControlFlowNode for ClassExpr | code/h_classes.py:15:5:15:13 | ControlFlowNode for type() |
| code/l_calls.py:3:13:3:14 | ControlFlowNode for List | code/l_calls.py:4:12:4:12 | ControlFlowNode for x |
| code/l_calls.py:6:13:6:14 | ControlFlowNode for List | code/l_calls.py:7:16:7:16 | ControlFlowNode for x |
| code/l_calls.py:12:1:12:20 | ControlFlowNode for ClassExpr | code/l_calls.py:16:16:16:18 | ControlFlowNode for cls |
| code/l_calls.py:12:1:12:20 | ControlFlowNode for ClassExpr | code/l_calls.py:24:13:24:22 | ControlFlowNode for Attribute() |
| code/l_calls.py:12:1:12:20 | ControlFlowNode for ClassExpr | code/l_calls.py:25:16:25:16 | ControlFlowNode for a |

View File

@@ -55,10 +55,10 @@ async def test_taint(request: web.Request): # $ requestHandler
await request.content.readline(), # $ tainted
await request.content.readchunk(), # $ tainted
(await request.content.readchunk())[0], # $ tainted
[line async for line in request.content], # $ MISSING: tainted
[data async for data in request.content.iter_chunked(1024)], # $ MISSING: tainted
[data async for data in request.content.iter_any()], # $ MISSING: tainted
[data async for data, _ in request.content.iter_chunks()], # $ MISSING: tainted
[line async for line in request.content], # $ tainted
[data async for data in request.content.iter_chunked(1024)], # $ tainted
[data async for data in request.content.iter_any()], # $ tainted
[data async for data, _ in request.content.iter_chunks()], # $ tainted
request.content.read_nowait(), # $ tainted
# aiohttp.StreamReader

View File

@@ -11,6 +11,9 @@ def test_taint(request: HttpRequest, foo, bar, baz=None): # $requestHandler rou
# Manually inspected all fields of the HttpRequest object
# https://docs.djangoproject.com/en/3.0/ref/request-response/#httprequest-objects
import django.urls
django.urls.ResolverMatch
ensure_tainted(
request, # $ tainted
@@ -35,8 +38,8 @@ def test_taint(request: HttpRequest, foo, bar, baz=None): # $requestHandler rou
request.GET, # $ tainted
request.GET["key"], # $ tainted
request.GET.get("key"), # $ tainted
request.GET.getlist("key"), # $ MISSING: tainted
request.GET.getlist("key")[0], # $ MISSING: tainted
request.GET.getlist("key"), # $ tainted
request.GET.getlist("key")[0], # $ tainted
request.GET.pop("key"), # $ tainted
request.GET.pop("key")[0], # $ tainted
# key
@@ -45,9 +48,10 @@ def test_taint(request: HttpRequest, foo, bar, baz=None): # $requestHandler rou
request.GET.popitem()[1], # $ tainted
# values[0]
request.GET.popitem()[1][0], # $ tainted
request.GET.dict(), # $ MISSING: tainted
request.GET.dict()["key"], # $ MISSING: tainted
request.GET.urlencode(), # $ MISSING: tainted
request.GET.lists(), # $ tainted
request.GET.dict(), # $ tainted
request.GET.dict()["key"], # $ tainted
request.GET.urlencode(), # $ tainted
# django.http.QueryDict (same as above, did not duplicate tests)
request.POST, # $ tainted
@@ -60,22 +64,23 @@ def test_taint(request: HttpRequest, foo, bar, baz=None): # $requestHandler rou
# MultiValueDict[str, UploadedFile]
request.FILES, # $ tainted
request.FILES["key"], # $ tainted
request.FILES["key"].content_type, # $ MISSING: tainted
request.FILES["key"].content_type_extra, # $ MISSING: tainted
request.FILES["key"].content_type_extra["key"], # $ MISSING: tainted
request.FILES["key"].charset, # $ MISSING: tainted
request.FILES["key"].name, # $ MISSING: tainted
request.FILES["key"].file, # $ MISSING: tainted
request.FILES["key"].file.read(), # $ MISSING: tainted
request.FILES["key"].content_type, # $ tainted
request.FILES["key"].content_type_extra, # $ tainted
request.FILES["key"].content_type_extra["key"], # $ tainted
request.FILES["key"].charset, # $ tainted
request.FILES["key"].name, # $ tainted
request.FILES["key"].file, # $ tainted
request.FILES["key"].file.read(), # $ tainted
request.FILES.get("key"), # $ tainted
request.FILES.get("key").name, # $ MISSING: tainted
request.FILES.getlist("key"), # $ MISSING: tainted
request.FILES.getlist("key")[0], # $ MISSING: tainted
request.FILES.getlist("key")[0].name, # $ MISSING: tainted
request.FILES.dict(), # $ MISSING: tainted
request.FILES.dict()["key"], # $ MISSING: tainted
request.FILES.dict()["key"].name, # $ MISSING: tainted
request.FILES.get("key").name, # $ tainted
request.FILES.getlist("key"), # $ tainted
request.FILES.getlist("key")[0], # $ tainted
request.FILES.getlist("key")[0].name, # $ tainted
request.FILES.dict(), # $ tainted
request.FILES.dict()["key"], # $ tainted
request.FILES.dict()["key"].name, # $ tainted
request.FILES.dict().get("key").name, # $ tainted
# Dict[str, Any]
request.META, # $ tainted
@@ -89,21 +94,21 @@ def test_taint(request: HttpRequest, foo, bar, baz=None): # $requestHandler rou
# django.urls.ResolverMatch
request.resolver_match, # $ tainted
request.resolver_match.args, # $ MISSING: tainted
request.resolver_match.args[0], # $ MISSING: tainted
request.resolver_match.kwargs, # $ MISSING: tainted
request.resolver_match.kwargs["key"], # $ MISSING: tainted
request.resolver_match.args, # $ tainted
request.resolver_match.args[0], # $ tainted
request.resolver_match.kwargs, # $ tainted
request.resolver_match.kwargs["key"], # $ tainted
request.get_full_path(), # $ MISSING: tainted
request.get_full_path_info(), # $ MISSING: tainted
request.get_full_path(), # $ tainted
request.get_full_path_info(), # $ tainted
# build_absolute_uri handled below
# get_signed_cookie handled below
request.read(), # $ MISSING: tainted
request.readline(), # $ MISSING: tainted
request.readlines(), # $ MISSING: tainted
request.readlines()[0], # $ MISSING: tainted
[line for line in request], # $ MISSING: tainted
request.read(), # $ tainted
request.readline(), # $ tainted
request.readlines(), # $ tainted
request.readlines()[0], # $ tainted
[line for line in request], # $ tainted
)
# django.urls.ResolverMatch also supports iterable unpacking
@@ -129,9 +134,9 @@ def test_taint(request: HttpRequest, foo, bar, baz=None): # $requestHandler rou
# build_absolute_uri
####################################
ensure_tainted(
request.build_absolute_uri(), # $ MISSING: tainted
request.build_absolute_uri(request.GET["key"]), # $ MISSING: tainted
request.build_absolute_uri(location=request.GET["key"]), # $ MISSING: tainted
request.build_absolute_uri(), # $ tainted
request.build_absolute_uri(request.GET["key"]), # $ tainted
request.build_absolute_uri(location=request.GET["key"]), # $ tainted
)
ensure_not_tainted(
request.build_absolute_uri("/hardcoded/"),

View File

@@ -0,0 +1,6 @@
from flask import Flask, request
app = Flask(__name__)
@app.route("/save-uploaded-file") # $routeSetup="/save-uploaded-file"
def test_taint(): # $requestHandler
request.files['key'].save("path") # $ getAPathArgument="path"

View File

@@ -44,7 +44,16 @@ def test_taint(name = "World!", number="0", foo="foo"): # $requestHandler route
# werkzeug.datastructures.Authorization (a dict, with some properties)
request.authorization, # $ tainted
request.authorization['username'], # $ tainted
request.authorization.username, # $ MISSING: tainted
request.authorization.username, # $ tainted
request.authorization.password, # $ tainted
request.authorization.realm, # $ tainted
request.authorization.nonce, # $ tainted
request.authorization.uri, # $ tainted
request.authorization.nc, # $ tainted
request.authorization.cnonce, # $ tainted
request.authorization.response, # $ tainted
request.authorization.opaque, # $ tainted
request.authorization.qop, # $ tainted
# werkzeug.datastructures.RequestCacheControl
request.cache_control, # $ tainted
@@ -68,14 +77,16 @@ def test_taint(name = "World!", number="0", foo="foo"): # $requestHandler route
# a werkzeug.datastructures.MultiDict, mapping [str, werkzeug.datastructures.FileStorage]
request.files, # $ tainted
request.files['key'], # $ tainted
request.files['key'].filename, # $ MISSING: tainted
request.files['key'].stream, # $ MISSING: tainted
request.files['key'].filename, # $ tainted
request.files['key'].stream, # $ tainted
request.files['key'].read(), # $ tainted
request.files['key'].stream.read(), # $ tainted
request.files.get('key'), # $ tainted
request.files.get('key').filename, # $ MISSING: tainted
request.files.get('key').stream, # $ MISSING: tainted
request.files.get('key').filename, # $ tainted
request.files.get('key').stream, # $ tainted
request.files.getlist('key'), # $ tainted
request.files.getlist('key')[0].filename, # $ MISSING: tainted
request.files.getlist('key')[0].stream, # $ MISSING: tainted
request.files.getlist('key')[0].filename, # $ tainted
request.files.getlist('key')[0].stream, # $ tainted
# By default werkzeug.datastructures.ImmutableMultiDict -- although can be changed :\
request.form, # $ tainted
@@ -94,11 +105,15 @@ def test_taint(name = "World!", number="0", foo="foo"): # $requestHandler route
request.headers, # $ tainted
request.headers['key'], # $ tainted
request.headers.get('key'), # $ tainted
request.headers.get_all('key'), # $ MISSING: tainted
request.headers.getlist('key'), # $ MISSING: tainted
request.headers.get_all('key'), # $ tainted
request.headers.getlist('key'), # $ tainted
# popitem returns `(key, value)`
request.headers.popitem(), # $ tainted
request.headers.popitem()[0], # $ tainted
request.headers.popitem()[1], # $ tainted
# two ways to get (k, v) lists
list(request.headers), # $ tainted
request.headers.to_wsgi_list(), # $ MISSING: tainted
request.headers.to_wsgi_list(), # $ tainted
request.json, # $ tainted
request.json['foo'], # $ tainted

View File

@@ -26,6 +26,7 @@ nodes
| test.py:90:11:90:14 | ControlFlowNode for bm() | semmle.label | ControlFlowNode for bm() |
| test.py:91:10:91:12 | ControlFlowNode for val | semmle.label | ControlFlowNode for val |
| test.py:107:11:107:18 | ControlFlowNode for source() | semmle.label | ControlFlowNode for source() |
subpaths
#select
| test.py:22:10:22:24 | ControlFlowNode for Attribute() | test.py:21:11:21:18 | ControlFlowNode for source() | test.py:22:10:22:24 | ControlFlowNode for Attribute() | test flow (naive): test_simple |
| test.py:33:10:33:12 | ControlFlowNode for val | test.py:29:11:29:18 | ControlFlowNode for source() | test.py:33:10:33:12 | ControlFlowNode for val | test flow (naive): test_alias |

View File

@@ -66,6 +66,7 @@ nodes
| test.py:103:46:103:47 | ControlFlowNode for bm | semmle.label | ControlFlowNode for bm |
| test.py:107:11:107:18 | ControlFlowNode for source() | semmle.label | ControlFlowNode for source() |
| test.py:108:46:108:58 | ControlFlowNode for Attribute | semmle.label | ControlFlowNode for Attribute |
subpaths
#select
| test.py:22:10:22:24 | ControlFlowNode for Attribute() | test.py:21:11:21:18 | ControlFlowNode for source() | test.py:22:10:22:24 | ControlFlowNode for Attribute() | test flow (proper): test_simple |
| test.py:33:10:33:12 | ControlFlowNode for val | test.py:29:11:29:18 | ControlFlowNode for source() | test.py:33:10:33:12 | ControlFlowNode for val | test flow (proper): test_alias |

View File

@@ -58,17 +58,17 @@ class MyHandler(BaseHTTPRequestHandler):
self.headers, # $ tainted
self.headers['Foo'], # $ tainted
self.headers.get('Foo'), # $ tainted
self.headers.get_all('Foo'), # $ MISSING: tainted
self.headers.keys(), # $ MISSING: tainted
self.headers.get_all('Foo'), # $ tainted
self.headers.keys(), # $ tainted
self.headers.values(), # $ tainted
self.headers.items(), # $ tainted
self.headers.as_bytes(), # $ MISSING: tainted
self.headers.as_string(), # $ MISSING: tainted
self.headers.as_bytes(), # $ tainted
self.headers.as_string(), # $ tainted
str(self.headers), # $ tainted
bytes(self.headers), # $ tainted
self.rfile, # $ tainted
self.rfile.read(), # $ MISSING: tainted
self.rfile.read(), # $ tainted
)
form = cgi.FieldStorage(

View File

@@ -61,15 +61,16 @@ class TaintTest(tornado.web.RequestHandler):
# dict-like, see https://www.tornadoweb.org/en/stable/httputil.html#tornado.httputil.HTTPHeaders
request.headers, # $ tainted
request.headers["header-name"], # $ tainted
request.headers.get_list("header-name"), # $ MISSING: tainted
request.headers.get_all(), # $ MISSING: tainted
[(k, v) for (k, v) in request.headers.get_all()], # $ MISSING: tainted
request.headers.get_list("header-name"), # $ tainted
request.headers.get_all(), # $ tainted
[(k, v) for (k, v) in request.headers.get_all()], # $ tainted
# Dict[str, http.cookies.Morsel]
request.cookies, # $ tainted
request.cookies["cookie-name"], # $ tainted
request.cookies["cookie-name"].key, # $ MISSING: tainted
request.cookies["cookie-name"].value, # $ MISSING: tainted
request.cookies["cookie-name"].key, # $ tainted
request.cookies["cookie-name"].value, # $ tainted
request.cookies["cookie-name"].coded_value, # $ tainted
)

View File

@@ -19,4 +19,4 @@
| x\| | 0 | 2 | x\| | 0 | 1 | x |
| x\| | 0 | 2 | x\| | 2 | 2 | |
| x\|(?<!\\w)l | 0 | 10 | x\|(?<!\\w)l | 0 | 1 | x |
| x\|(?<!\\w)l | 0 | 10 | x\|(?<!\\w)l | 2 | 10 | (?<!\\w)l |
| x\|(?<!\\w)l | 0 | 10 | x\|(?<!\\w)l | 2 | 10 | (?<!\\w)l |

View File

@@ -52,6 +52,8 @@
| [^A-Z] | 2 | 3 |
| [^A-Z] | 4 | 5 |
| [^]] | 2 | 3 |
| \\+0 | 0 | 2 |
| \\+0 | 2 | 3 |
| \\A[+-]?\\d+ | 0 | 2 |
| \\A[+-]?\\d+ | 3 | 4 |
| \\A[+-]?\\d+ | 4 | 5 |

View File

@@ -0,0 +1,12 @@
/**
 * Flags regular expressions that are parsed ambiguously
*/
import python
import semmle.python.regex
from string str, Location loc, int counter
where
counter = strictcount(Regex term | term.getLocation() = loc and term.getText() = str) and
counter > 1
select str, counter, loc

View File

@@ -42,6 +42,8 @@
| [^A-Z] | last | 0 | 6 |
| [^]] | first | 0 | 4 |
| [^]] | last | 0 | 4 |
| \\+0 | first | 0 | 2 |
| \\+0 | last | 2 | 3 |
| \\A[+-]?\\d+ | first | 0 | 2 |
| \\A[+-]?\\d+ | last | 7 | 9 |
| \\A[+-]?\\d+ | last | 7 | 10 |

View File

@@ -113,6 +113,9 @@
| [^]] | char | 2 | 3 |
| [^]] | char-set | 0 | 4 |
| [^]] | sequence | 0 | 4 |
| \\+0 | char | 0 | 2 |
| \\+0 | char | 2 | 3 |
| \\+0 | sequence | 0 | 3 |
| \\A[+-]?\\d+ | char | 0 | 2 |
| \\A[+-]?\\d+ | char | 3 | 4 |
| \\A[+-]?\\d+ | char | 4 | 5 |

View File

@@ -24,7 +24,8 @@ except re.error:
re.compile(r'[^A-Z]') #$ charRange=2:3-4:5
re.compile(r'[\0-\09]') #$ charRange=1:3-4:7
re.compile(r'[\0-\09]') #$ charRange=1:3-4:6
re.compile(r'[\0-\07]') #$ charRange=1:3-4:7
re.compile(r'[\0123-5]') #$ charRange=5:6-7:8

View File

@@ -10,8 +10,10 @@ re.compile(r'[\---]') #$ escapedCharacter=1:3
re.compile(r'[--\-]') #$ escapedCharacter=3:5
re.compile(r'[\--\-]') #$ escapedCharacter=1:3 escapedCharacter=4:6
re.compile(r'[0\-9-A-Z]') #$ escapedCharacter=2:4
re.compile(r'[\0-\09]') #$ escapedCharacter=1:3 escapedCharacter=4:7
re.compile(r'[\0-\09]') #$ escapedCharacter=1:3 escapedCharacter=4:6
re.compile(r'[\0-\07]') #$ escapedCharacter=1:3 escapedCharacter=4:7
re.compile(r'[\0123-5]') #$ escapedCharacter=1:5
re.compile(r'\1754\1854\17\18\07\08') #$ escapedCharacter=0:4 escapedCharacter=16:19 escapedCharacter=19:21
#ODASA-3985
#Half Surrogate pairs
@@ -21,3 +23,9 @@ re.compile(u'[\U00010000-\U0010ffff]') # not escapes
#Misparsed on LGTM
re.compile(r"\[(?P<txt>[^[]*)\]\((?P<uri>[^)]*)") #$ escapedCharacter=0:2 escapedCharacter=16:18 escapedCharacter=18:20
#Non-raw string
re_blank = re.compile('(\n|\r|\\s)*\n', re.M) #$ escapedCharacter=5:7
#Backreference confusion
re.compile(r'\+0') #$ escapedCharacter=0:2

View File

@@ -70,3 +70,6 @@ re.compile("", re.M) # ODASA-8056
# FP reported in https://github.com/github/codeql/issues/3712
# This does not define a regex (but could be used by other code to do so)
escaped = re.escape("https://www.humblebundle.com/home/library")
# Consistency check
baz = re.compile(r'\+0')

View File

@@ -0,0 +1,15 @@
/**
 * Flags regular expressions that are parsed ambiguously
*/
import python
import semmle.python.RegexTreeView
from string str, int counter, Location loc
where
counter =
strictcount(RegExpTerm term |
term.getLocation() = loc and term.isRootTerm() and term.toString() = str
) and
counter > 1
select str, counter, loc

View File

@@ -0,0 +1,94 @@
import re
# linear
# https://github.com/github/codeql-python-CVE-coverage/issues/439
rex_blame = re.compile(r'\s*(\d+)\s*(\S+) (.*)')
# https://github.com/github/codeql-python-CVE-coverage/issues/402
whitespace = br"[\000\011\012\014\015\040]"
whitespace_optional = whitespace + b"*"
newline_only = br"[\r\n]+"
newline = whitespace_optional + newline_only + whitespace_optional
toFlag = re.compile(newline)
# https://github.com/github/codeql-python-CVE-coverage/issues/400
re.compile(r'[+-]?(\d+)*\.\d+%?')
re.compile(r'"""\s+(?:.|\n)*?\s+"""')
re.compile(r'(\{\s+)(\S+)(\s+[^}]+\s+\}\s)')
re.compile(r'".*``.*``.*"')
re.compile(r'(\s*)(?:(.+)(\s*)(=)(\s*))?(.+)(\()(.*)(\))(\s*)')
re.compile(r'(%config)(\s*\(\s*)(\w+)(\s*=\s*)(.*?)(\s*\)\s*)')
re.compile(r'(%new)(\s*)(\()(\s*.*?\s*)(\))')
re.compile(r'(\$)(evoque|overlay)(\{(%)?)(\s*[#\w\-"\'.]+[^=,%}]+?)?')
re.compile(r'(\.\w+\b)(\s*=\s*)([^;]*)(\s*;)')
# linear
# https://github.com/github/codeql-python-CVE-coverage/issues/392
simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$")
# https://github.com/github/codeql-python-CVE-coverage/issues/249
rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
'realm=(["\']?)([^"\']*)\\2', re.I)
# https://github.com/github/codeql-python-CVE-coverage/issues/248
gauntlet = re.compile(
r"""^([-/:,#%.'"\s!\w]|\w-\w|'[\s\w]+'\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""",
flags=re.U
)
# https://github.com/github/codeql-python-CVE-coverage/issues/227
# from .compat import tobytes
WS = "[ \t]"
OWS = WS + "{0,}?"
# RFC 7230 Section 3.2.6 "Field Value Components":
# tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*"
# / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"
# / DIGIT / ALPHA
# obs-text = %x80-FF
TCHAR = r"[!#$%&'*+\-.^_`|~0-9A-Za-z]"
OBS_TEXT = r"\x80-\xff"
TOKEN = TCHAR + "{1,}"
# RFC 5234 Appendix B.1 "Core Rules":
# VCHAR = %x21-7E
# ; visible (printing) characters
VCHAR = r"\x21-\x7e"
# header-field = field-name ":" OWS field-value OWS
# field-name = token
# field-value = *( field-content / obs-fold )
# field-content = field-vchar [ 1*( SP / HTAB ) field-vchar ]
# field-vchar = VCHAR / obs-text
# Errata from: https://www.rfc-editor.org/errata_search.php?rfc=7230&eid=4189
# changes field-content to:
#
# field-content = field-vchar [ 1*( SP / HTAB / field-vchar )
# field-vchar ]
FIELD_VCHAR = "[" + VCHAR + OBS_TEXT + "]"
FIELD_CONTENT = FIELD_VCHAR + "([ \t" + VCHAR + OBS_TEXT + "]+" + FIELD_VCHAR + "){,1}"
FIELD_VALUE = "(" + FIELD_CONTENT + "){0,}"
HEADER_FIELD = re.compile(
# tobytes(
"^(?P<name>" + TOKEN + "):" + OWS + "(?P<value>" + FIELD_VALUE + ")" + OWS + "$"
# )
)
# https://github.com/github/codeql-python-CVE-coverage/issues/224
pattern = re.compile(
r'^(:?(([a-zA-Z]{1})|([a-zA-Z]{1}[a-zA-Z]{1})|' # domain pt.1
r'([a-zA-Z]{1}[0-9]{1})|([0-9]{1}[a-zA-Z]{1})|' # domain pt.2
r'([a-zA-Z0-9][-_a-zA-Z0-9]{0,61}[a-zA-Z0-9]))\.)+' # domain pt.3
r'([a-zA-Z]{2,13}|(xn--[a-zA-Z0-9]{2,30}))$' # TLD
)
# https://github.com/github/codeql-python-CVE-coverage/issues/189
URL_REGEX = (
r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|'
r'[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|'
r'(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|'
r'[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))' # "emacs!
)
url = re.compile(URL_REGEX)

View File

@@ -0,0 +1,9 @@
import re
from flask import Flask, request
app = Flask(__name__)
@app.route("/poly-redos")
def code_execution():
text = request.args.get("text")
re.sub(r"^\s+|\s+$", "", text) # NOT OK
re.match(r"^0\.\d+E?\d+$", text) # NOT OK

View File

@@ -0,0 +1,376 @@
# This is currently a copy of the redos test-file, since that one contains many regexes.
import re
# NOT GOOD; attack: "_" + "__".repeat(100)
# Adapted from marked (https://github.com/markedjs/marked), which is licensed
# under the MIT license; see file marked-LICENSE.
bad1 = re.compile(r'''^\b_((?:__|[\s\S])+?)_\b|^\*((?:\*\*|[\s\S])+?)\*(?!\*)''')
# GOOD
# Adapted from marked (https://github.com/markedjs/marked), which is licensed
# under the MIT license; see file marked-LICENSE.
good1 = re.compile(r'^\b_((?:__|[^_])+?)_\b|^\*((?:\*\*|[^*])+?)\*(?!\*)')
# GOOD - there is no witness in the end that could cause the regexp to not match
# Adapted from brace-expansion (https://github.com/juliangruber/brace-expansion),
# which is licensed under the MIT license; see file brace-expansion-LICENSE.
good2 = re.compile(r'(.*,)+.+')
# NOT GOOD; attack: " '" + "\\\\".repeat(100)
# Adapted from CodeMirror (https://github.com/codemirror/codemirror),
# which is licensed under the MIT license; see file CodeMirror-LICENSE.
bad2 = re.compile(r'''^(?:\s+(?:"(?:[^"\\]|\\\\|\\.)+"|'(?:[^'\\]|\\\\|\\.)+'|\((?:[^)\\]|\\\\|\\.)+\)))?''')
# GOOD
# Adapted from lulucms2 (https://github.com/yiifans/lulucms2).
good2 = re.compile(r'''\(\*(?:[\s\S]*?\(\*[\s\S]*?\*\))*[\s\S]*?\*\)''')
# GOOD
# Adapted from jest (https://github.com/facebook/jest), which is licensed
# under the MIT license; see file jest-LICENSE.
good3 = re.compile(r'''^ *(\S.*\|.*)\n *([-:]+ *\|[-| :]*)\n((?:.*\|.*(?:\n|$))*)\n*''')
# NOT GOOD, variant of good3; attack: "a|\n:|\n" + "||\n".repeat(100)
bad4 = re.compile(r'''^ *(\S.*\|.*)\n *([-:]+ *\|[-| :]*)\n((?:.*\|.*(?:\n|$))*)a''')
# NOT GOOD; attack: "/" + "\\/a".repeat(100)
# Adapted from ANodeBlog (https://github.com/gefangshuai/ANodeBlog),
# which is licensed under the Apache License 2.0; see file ANodeBlog-LICENSE.
bad5 = re.compile(r'''\/(?![ *])(\\\/|.)*?\/[gim]*(?=\W|$)''')
# NOT GOOD; attack: "##".repeat(100) + "\na"
# Adapted from CodeMirror (https://github.com/codemirror/codemirror),
# which is licensed under the MIT license; see file CodeMirror-LICENSE.
bad6 = re.compile(r'''^([\s\[\{\(]|#.*)*$''')
# GOOD
good4 = re.compile(r'''(\r\n|\r|\n)+''')
# BAD - PoC: `node -e "/((?:[^\"\']|\".*?\"|\'.*?\')*?)([(,)]|$)/.test(\"'''''''''''''''''''''''''''''''''''''''''''''\\\"\");"`. It's complicated though, because the regexp still matches something, it just matches the empty-string after the attack string.
actuallyBad = re.compile(r'''((?:[^"']|".*?"|'.*?')*?)([(,)]|$)''')
# NOT GOOD; attack: "a" + "[]".repeat(100) + ".b\n"
# Adapted from Knockout (https://github.com/knockout/knockout), which is
# licensed under the MIT license; see file knockout-LICENSE
bad6 = re.compile(r'''^[\_$a-z][\_$a-z0-9]*(\[.*?\])*(\.[\_$a-z][\_$a-z0-9]*(\[.*?\])*)*$''')
# GOOD
good6 = re.compile(r'''(a|.)*''')
# Testing the NFA - only some of the below are detected.
bad7 = re.compile(r'''^([a-z]+)+$''')
bad8 = re.compile(r'''^([a-z]*)*$''')
bad9 = re.compile(r'''^([a-zA-Z0-9])(([\\-.]|[_]+)?([a-zA-Z0-9]+))*(@){1}[a-z0-9]+[.]{1}(([a-z]{2,3})|([a-z]{2,3}[.]{1}[a-z]{2,3}))$''')
bad10 = re.compile(r'''^(([a-z])+.)+[A-Z]([a-z])+$''')
# NOT GOOD; attack: "[" + "][".repeat(100) + "]!"
# Adapted from Prototype.js (https://github.com/prototypejs/prototype), which
# is licensed under the MIT license; see file Prototype.js-LICENSE.
bad11 = re.compile(r'''(([\w#:.~>+()\s-]+|\*|\[.*?\])+)\s*(,|$)''')
# NOT GOOD; attack: "'" + "\\a".repeat(100) + '"'
# Adapted from Prism (https://github.com/PrismJS/prism), which is licensed
# under the MIT license; see file Prism-LICENSE.
bad12 = re.compile(r'''("|')(\\?.)*?\1''')
# NOT GOOD
bad13 = re.compile(r'''(b|a?b)*c''')
# NOT GOOD
bad15 = re.compile(r'''(a|aa?)*b''')
# GOOD
good7 = re.compile(r'''(.|\n)*!''')
# NOT GOOD; attack: "\n".repeat(100) + "."
bad16 = re.compile(r'''(.|\n)*!''')
# GOOD
good8 = re.compile(r'''([\w.]+)*''')
# NOT GOOD
bad17 = re.compile(r'''(a|aa?)*b''')
# GOOD - not used as regexp
good9 = '(a|aa?)*b'
# NOT GOOD
bad18 = re.compile(r'''(([\s\S]|[^a])*)"''')
# GOOD - there is no witness in the end that could cause the regexp to not match
good10 = re.compile(r'''([^"']+)*''')
# NOT GOOD
bad20 = re.compile(r'''((.|[^a])*)"''')
# GOOD
good10 = re.compile(r'''((a|[^a])*)"''')
# NOT GOOD
bad21 = re.compile(r'''((b|[^a])*)"''')
# NOT GOOD
bad22 = re.compile(r'''((G|[^a])*)"''')
# NOT GOOD
bad23 = re.compile(r'''(([0-9]|[^a])*)"''')
# NOT GOOD
bad24 = re.compile(r'''(?:=(?:([!#\$%&'\*\+\-\.\^_`\|~0-9A-Za-z]+)|"((?:\\[\x00-\x7f]|[^\x00-\x08\x0a-\x1f\x7f"])*)"))?''')
# NOT GOOD
bad25 = re.compile(r'''"((?:\\[\x00-\x7f]|[^\x00-\x08\x0a-\x1f\x7f"])*)"''')
# GOOD
bad26 = re.compile(r'''"((?:\\[\x00-\x7f]|[^\x00-\x08\x0a-\x1f\x7f"\\])*)"''')
# NOT GOOD
bad27 = re.compile(r'''(([a-z]|[d-h])*)"''')
# NOT GOOD
bad27 = re.compile(r'''(([^a-z]|[^0-9])*)"''')
# NOT GOOD
bad28 = re.compile(r'''((\d|[0-9])*)"''')
# NOT GOOD
bad29 = re.compile(r'''((\s|\s)*)"''')
# NOT GOOD
bad30 = re.compile(r'''((\w|G)*)"''')
# GOOD
good11 = re.compile(r'''((\s|\d)*)"''')
# NOT GOOD
bad31 = re.compile(r'''((\d|\w)*)"''')
# NOT GOOD
bad32 = re.compile(r'''((\d|5)*)"''')
# NOT GOOD
bad33 = re.compile(r'''((\s|[\f])*)"''')
# NOT GOOD
bad34 = re.compile(r'''((\s|[\v]|\\v)*)"''')
# NOT GOOD
bad35 = re.compile(r'''((\f|[\f])*)"''')
# NOT GOOD
bad36 = re.compile(r'''((\W|\D)*)"''')
# NOT GOOD
bad37 = re.compile(r'''((\S|\w)*)"''')
# NOT GOOD
bad38 = re.compile(r'''((\S|[\w])*)"''')
# NOT GOOD
bad39 = re.compile(r'''((1s|[\da-z])*)"''')
# NOT GOOD
bad40 = re.compile(r'''((0|[\d])*)"''')
# NOT GOOD
bad41 = re.compile(r'''(([\d]+)*)"''')
# GOOD - there is no witness in the end that could cause the regexp to not match
good12 = re.compile(r'''(\d+(X\d+)?)+''')
# GOOD - there is no witness in the end that could cause the regexp to not match
good13 = re.compile(r'''([0-9]+(X[0-9]*)?)*''')
# GOOD
good15 = re.compile(r'''^([^>]+)*(>|$)''')
# NOT GOOD
bad43 = re.compile(r'''^([^>a]+)*(>|$)''')
# NOT GOOD
bad44 = re.compile(r'''(\n\s*)+$''')
# NOT GOOD
bad45 = re.compile(r'''^(?:\s+|#.*|\(\?#[^)]*\))*(?:[?*+]|{\d+(?:,\d*)?})''')
# NOT GOOD
bad46 = re.compile(r'''\{\[\s*([a-zA-Z]+)\(([a-zA-Z]+)\)((\s*([a-zA-Z]+)\: ?([ a-zA-Z{}]+),?)+)*\s*\]\}''')
# NOT GOOD
bad47 = re.compile(r'''(a+|b+|c+)*c''')
# NOT GOOD
bad48 = re.compile(r'''(((a+a?)*)+b+)''')
# NOT GOOD
bad49 = re.compile(r'''(a+)+bbbb''')
# GOOD
good16 = re.compile(r'''(a+)+aaaaa*a+''')
# NOT GOOD
bad50 = re.compile(r'''(a+)+aaaaa$''')
# GOOD
good17 = re.compile(r'''(\n+)+\n\n''')
# NOT GOOD
bad51 = re.compile(r'''(\n+)+\n\n$''')
# NOT GOOD
bad52 = re.compile(r'''([^X]+)*$''')
# NOT GOOD
bad53 = re.compile(r'''(([^X]b)+)*$''')
# GOOD
good18 = re.compile(r'''(([^X]b)+)*($|[^X]b)''')
# NOT GOOD
bad54 = re.compile(r'''(([^X]b)+)*($|[^X]c)''')
# GOOD
good20 = re.compile(r'''((ab)+)*ababab''')
# GOOD
good21 = re.compile(r'''((ab)+)*abab(ab)*(ab)+''')
# GOOD
good22 = re.compile(r'''((ab)+)*''')
# NOT GOOD
bad55 = re.compile(r'''((ab)+)*$''')
# GOOD
good23 = re.compile(r'''((ab)+)*[a1][b1][a2][b2][a3][b3]''')
# NOT GOOD
bad56 = re.compile(r'''([\n\s]+)*(.)''')
# GOOD - any witness passes through the accept state.
good24 = re.compile(r'''(A*A*X)*''')
# GOOD
good26 = re.compile(r'''([^\\\]]+)*''')
# NOT GOOD
bad59 = re.compile(r'''(\w*foobarbaz\w*foobarbaz\w*foobarbaz\w*foobarbaz\s*foobarbaz\d*foobarbaz\w*)+-''')
# NOT GOOD
bad60 = re.compile(r'''(.thisisagoddamnlongstringforstresstestingthequery|\sthisisagoddamnlongstringforstresstestingthequery)*-''')
# NOT GOOD
bad61 = re.compile(r'''(thisisagoddamnlongstringforstresstestingthequery|this\w+query)*-''')
# GOOD
good27 = re.compile(r'''(thisisagoddamnlongstringforstresstestingthequery|imanotherbutunrelatedstringcomparedtotheotherstring)*-''')
# GOOD (but false positive caused by the extractor converting all four unpaired surrogates to \uFFFD)
good28 = re.compile('''foo([\uDC66\uDC67]|[\uDC68\uDC69])*foo''')
# GOOD (but false positive caused by the extractor converting all four unpaired surrogates to \uFFFD)
good29 = re.compile('''foo((\uDC66|\uDC67)|(\uDC68|\uDC69))*foo''')
# NOT GOOD (but cannot currently construct a prefix)
bad62 = re.compile(r'''a{2,3}(b+)+X''')
# NOT GOOD (and a good prefix test)
bad63 = re.compile(r'''^<(\w+)((?:\s+\w+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>''')
# GOOD
good30 = re.compile(r'''(a+)*[\s\S][\s\S][\s\S]?''')
# GOOD - but we fail to see that repeating the attack string ends in the "accept any" state (due to not parsing the range `[\s\S]{2,3}`).
good31 = re.compile(r'''(a+)*[\s\S]{2,3}''')
# GOOD - but we spuriously conclude that a rejecting suffix exists (due to not parsing the range `[\s\S]{2,}` when constructing the NFA).
good32 = re.compile(r'''(a+)*([\s\S]{2,}|X)$''')
# GOOD
good33 = re.compile(r'''(a+)*([\s\S]*|X)$''')
# NOT GOOD
bad64 = re.compile(r'''((a+)*$|[\s\S]+)''')
# GOOD - but still flagged. The only change compared to the above is the order of alternatives, which we don't model.
good34 = re.compile(r'''([\s\S]+|(a+)*$)''')
# GOOD
good35 = re.compile(r'''((;|^)a+)+$''')
# NOT GOOD (a good prefix test)
bad65 = re.compile(r'''(^|;)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(e+)+f''')
# NOT GOOD
bad66 = re.compile(r'''^ab(c+)+$''')
# NOT GOOD
bad67 = re.compile(r'''(\d(\s+)*){20}''')
# GOOD - but we spuriously conclude that a rejecting suffix exists.
good36 = re.compile(r'''(([^/]|X)+)(\/[\s\S]*)*$''')
# GOOD - but we spuriously conclude that a rejecting suffix exists.
good37 = re.compile(r'''^((x([^Y]+)?)*(Y|$))''')
# NOT GOOD
bad68 = re.compile(r'''(a*)+b''')
# NOT GOOD
bad69 = re.compile(r'''foo([\w-]*)+bar''')
# NOT GOOD
bad70 = re.compile(r'''((ab)*)+c''')
# NOT GOOD
bad71 = re.compile(r'''(a?a?)*b''')
# GOOD
good38 = re.compile(r'''(a?)*b''')
# NOT GOOD - but not detected
bad72 = re.compile(r'''(c?a?)*b''')
# NOT GOOD
bad73 = re.compile(r'''(?:a|a?)+b''')
# NOT GOOD - but not detected.
bad74 = re.compile(r'''(a?b?)*$''')
# NOT GOOD
bad76 = re.compile(r'''PRE(([a-c]|[c-d])T(e?e?e?e?|X))+(cTcT|cTXcTX$)''')
# NOT GOOD - but not detected
bad77 = re.compile(r'''^((a)+\w)+$''')
# NOT GOOD
bad78 = re.compile(r'''^(b+.)+$''')
# GOOD
good39 = re.compile(r'''a*b''')
# All 4 bad combinations of nested * and +
bad79 = re.compile(r'''(a*)*b''')
bad80 = re.compile(r'''(a+)*b''')
bad81 = re.compile(r'''(a*)+b''')
bad82 = re.compile(r'''(a+)+b''')
# GOOD
good40 = re.compile(r'''(a|b)+''')
good41 = re.compile(r'''(?:[\s;,"'<>(){}|[\]@=+*]|:(?![/\\]))+''') # parses wrongly, sees column 42 as a char set start
# NOT GOOD
bad83 = re.compile(r'''^((?:a{|-)|\w\{)+X$''')
bad84 = re.compile(r'''^((?:a{0|-)|\w\{\d)+X$''')
bad85 = re.compile(r'''^((?:a{0,|-)|\w\{\d,)+X$''')
bad86 = re.compile(r'''^((?:a{0,2|-)|\w\{\d,\d)+X$''')
# GOOD:
good42 = re.compile(r'''^((?:a{0,2}|-)|\w\{\d,\d\})+X$''')
# NOT GOOD
bad87 = re.compile(r'X(\u0061|a)*Y')
# GOOD
good43 = re.compile(r'X(\u0061|b)+Y')

View File

@@ -0,0 +1,20 @@
import re
# Treatment of escapes
re.compile(r"X([^\.]|\.)*$") # No ReDoS.
re.compile(r"X(Æ|\Æ)+$") # Has ReDoS.
# Treatment of line breaks
re.compile(r'(?:.|\n)*b') # No ReDoS.
re.compile(r'(?:.|\n)*b', re.DOTALL) # Has ReDoS.
# minimal example constructed by @erik-krogh
baz = re.compile(r'\+0')
# excerpts from LGTM.com
re.compile(r'\+0x')
re.compile(r'\+0x.*')
re.compile(r'+\-0+\.')
re.compile('\s+\+0x[0-9]+')
re.compile(r'\+0000 .*')
re.compile('\#[0-9]+ 0x[0-9]')