Merge pull request #6460 from yoff/python-regex-parsing-consistency-checks

Python: Add regex parsing consistency checks
This commit is contained in:
Taus
2021-09-07 13:33:59 +02:00
committed by GitHub
17 changed files with 583 additions and 17 deletions

View File

@@ -19,4 +19,4 @@
| x\| | 0 | 2 | x\| | 0 | 1 | x |
| x\| | 0 | 2 | x\| | 2 | 2 | |
| x\|(?<!\\w)l | 0 | 10 | x\|(?<!\\w)l | 0 | 1 | x |
| x\|(?<!\\w)l | 0 | 10 | x\|(?<!\\w)l | 2 | 10 | (?<!\\w)l |
| x\|(?<!\\w)l | 0 | 10 | x\|(?<!\\w)l | 2 | 10 | (?<!\\w)l |

View File

@@ -52,6 +52,8 @@
| [^A-Z] | 2 | 3 |
| [^A-Z] | 4 | 5 |
| [^]] | 2 | 3 |
| \\+0 | 0 | 2 |
| \\+0 | 2 | 3 |
| \\A[+-]?\\d+ | 0 | 2 |
| \\A[+-]?\\d+ | 3 | 4 |
| \\A[+-]?\\d+ | 4 | 5 |

View File

@@ -0,0 +1,12 @@
/**
 * Flags regular expressions that are parsed ambiguously.
 *
 * For every combination of regex text and location, counts the `Regex`
 * values that have exactly that text at exactly that location, and reports
 * the combinations for which the count is greater than one.
 */
import python
import semmle.python.regex
// `str` is the regex text, `loc` its location, and `counter` the number of
// `Regex` values that share both.
from string str, Location loc, int counter
where
counter = strictcount(Regex term | term.getLocation() = loc and term.getText() = str) and
// Keep only the combinations that occur more than once.
counter > 1
select str, counter, loc

View File

@@ -42,6 +42,8 @@
| [^A-Z] | last | 0 | 6 |
| [^]] | first | 0 | 4 |
| [^]] | last | 0 | 4 |
| \\+0 | first | 0 | 2 |
| \\+0 | last | 2 | 3 |
| \\A[+-]?\\d+ | first | 0 | 2 |
| \\A[+-]?\\d+ | last | 7 | 9 |
| \\A[+-]?\\d+ | last | 7 | 10 |

View File

@@ -113,6 +113,9 @@
| [^]] | char | 2 | 3 |
| [^]] | char-set | 0 | 4 |
| [^]] | sequence | 0 | 4 |
| \\+0 | char | 0 | 2 |
| \\+0 | char | 2 | 3 |
| \\+0 | sequence | 0 | 3 |
| \\A[+-]?\\d+ | char | 0 | 2 |
| \\A[+-]?\\d+ | char | 3 | 4 |
| \\A[+-]?\\d+ | char | 4 | 5 |

View File

@@ -24,7 +24,8 @@ except re.error:
re.compile(r'[^A-Z]') #$ charRange=2:3-4:5
re.compile(r'[\0-\09]') #$ charRange=1:3-4:7
re.compile(r'[\0-\09]') #$ charRange=1:3-4:6
re.compile(r'[\0-\07]') #$ charRange=1:3-4:7
re.compile(r'[\0123-5]') #$ charRange=5:6-7:8

View File

@@ -10,8 +10,10 @@ re.compile(r'[\---]') #$ escapedCharacter=1:3
re.compile(r'[--\-]') #$ escapedCharacter=3:5
re.compile(r'[\--\-]') #$ escapedCharacter=1:3 escapedCharacter=4:6
re.compile(r'[0\-9-A-Z]') #$ escapedCharacter=2:4
re.compile(r'[\0-\09]') #$ escapedCharacter=1:3 escapedCharacter=4:7
re.compile(r'[\0-\09]') #$ escapedCharacter=1:3 escapedCharacter=4:6
re.compile(r'[\0-\07]') #$ escapedCharacter=1:3 escapedCharacter=4:7
re.compile(r'[\0123-5]') #$ escapedCharacter=1:5
re.compile(r'\1754\1854\17\18\07\08') #$ escapedCharacter=0:4 escapedCharacter=16:19 escapedCharacter=19:21
#ODASA-3985
#Half Surrogate pairs
@@ -21,3 +23,9 @@ re.compile(u'[\U00010000-\U0010ffff]') # not escapes
#Misparsed on LGTM
re.compile(r"\[(?P<txt>[^[]*)\]\((?P<uri>[^)]*)") #$ escapedCharacter=0:2 escapedCharacter=16:18 escapedCharacter=18:20
#Non-raw string
re_blank = re.compile('(\n|\r|\\s)*\n', re.M) #$ escapedCharacter=5:7
#Backreference confusion
re.compile(r'\+0') #$ escapedCharacter=0:2

View File

@@ -70,3 +70,6 @@ re.compile("", re.M) # ODASA-8056
# FP reported in https://github.com/github/codeql/issues/3712
# This does not define a regex (but could be used by other code to do so)
escaped = re.escape("https://www.humblebundle.com/home/library")
# Consistency check
baz = re.compile(r'\+0')

View File

@@ -0,0 +1,15 @@
/**
 * Flags regular expressions that are parsed ambiguously.
 *
 * For every combination of string representation and location, counts the
 * root `RegExpTerm`s that have exactly that `toString()` value at exactly
 * that location, and reports the combinations for which the count is
 * greater than one.
 */
import python
import semmle.python.RegexTreeView
// `str` is the term's string representation, `loc` its location, and
// `counter` the number of root terms that share both.
from string str, int counter, Location loc
where
counter =
strictcount(RegExpTerm term |
term.getLocation() = loc and term.isRootTerm() and term.toString() = str
) and
// Keep only the combinations that occur more than once.
counter > 1
select str, counter, loc

View File

@@ -0,0 +1,94 @@
import re
# linear
# https://github.com/github/codeql-python-CVE-coverage/issues/439
rex_blame = re.compile(r'\s*(\d+)\s*(\S+) (.*)')
# https://github.com/github/codeql-python-CVE-coverage/issues/402
whitespace = br"[\000\011\012\014\015\040]"
whitespace_optional = whitespace + b"*"
newline_only = br"[\r\n]+"
newline = whitespace_optional + newline_only + whitespace_optional
toFlag = re.compile(newline)
# https://github.com/github/codeql-python-CVE-coverage/issues/400
re.compile(r'[+-]?(\d+)*\.\d+%?')
re.compile(r'"""\s+(?:.|\n)*?\s+"""')
re.compile(r'(\{\s+)(\S+)(\s+[^}]+\s+\}\s)')
re.compile(r'".*``.*``.*"')
re.compile(r'(\s*)(?:(.+)(\s*)(=)(\s*))?(.+)(\()(.*)(\))(\s*)')
re.compile(r'(%config)(\s*\(\s*)(\w+)(\s*=\s*)(.*?)(\s*\)\s*)')
re.compile(r'(%new)(\s*)(\()(\s*.*?\s*)(\))')
re.compile(r'(\$)(evoque|overlay)(\{(%)?)(\s*[#\w\-"\'.]+[^=,%}]+?)?')
re.compile(r'(\.\w+\b)(\s*=\s*)([^;]*)(\s*;)')
# linear
# https://github.com/github/codeql-python-CVE-coverage/issues/392
simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$")
# https://github.com/github/codeql-python-CVE-coverage/issues/249
rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
'realm=(["\']?)([^"\']*)\\2', re.I)
# https://github.com/github/codeql-python-CVE-coverage/issues/248
gauntlet = re.compile(
r"""^([-/:,#%.'"\s!\w]|\w-\w|'[\s\w]+'\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""",
flags=re.U
)
# https://github.com/github/codeql-python-CVE-coverage/issues/227
# from .compat import tobytes
WS = "[ \t]"
OWS = WS + "{0,}?"
# RFC 7230 Section 3.2.6 "Field Value Components":
# tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*"
# / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"
# / DIGIT / ALPHA
# obs-text = %x80-FF
TCHAR = r"[!#$%&'*+\-.^_`|~0-9A-Za-z]"
OBS_TEXT = r"\x80-\xff"
TOKEN = TCHAR + "{1,}"
# RFC 5234 Appendix B.1 "Core Rules":
# VCHAR = %x21-7E
# ; visible (printing) characters
VCHAR = r"\x21-\x7e"
# header-field = field-name ":" OWS field-value OWS
# field-name = token
# field-value = *( field-content / obs-fold )
# field-content = field-vchar [ 1*( SP / HTAB ) field-vchar ]
# field-vchar = VCHAR / obs-text
# Errata from: https://www.rfc-editor.org/errata_search.php?rfc=7230&eid=4189
# changes field-content to:
#
# field-content = field-vchar [ 1*( SP / HTAB / field-vchar )
# field-vchar ]
FIELD_VCHAR = "[" + VCHAR + OBS_TEXT + "]"
FIELD_CONTENT = FIELD_VCHAR + "([ \t" + VCHAR + OBS_TEXT + "]+" + FIELD_VCHAR + "){,1}"
FIELD_VALUE = "(" + FIELD_CONTENT + "){0,}"
HEADER_FIELD = re.compile(
# tobytes(
"^(?P<name>" + TOKEN + "):" + OWS + "(?P<value>" + FIELD_VALUE + ")" + OWS + "$"
# )
)
# https://github.com/github/codeql-python-CVE-coverage/issues/224
pattern = re.compile(
r'^(:?(([a-zA-Z]{1})|([a-zA-Z]{1}[a-zA-Z]{1})|' # domain pt.1
r'([a-zA-Z]{1}[0-9]{1})|([0-9]{1}[a-zA-Z]{1})|' # domain pt.2
r'([a-zA-Z0-9][-_a-zA-Z0-9]{0,61}[a-zA-Z0-9]))\.)+' # domain pt.3
r'([a-zA-Z]{2,13}|(xn--[a-zA-Z0-9]{2,30}))$' # TLD
)
# https://github.com/github/codeql-python-CVE-coverage/issues/189
URL_REGEX = (
r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|'
r'[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|'
r'(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|'
r'[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))' # "emacs!
)
url = re.compile(URL_REGEX)

View File

@@ -0,0 +1,9 @@
import re
from flask import Flask, request
app = Flask(__name__)
# Test-fixture handler for the "/poly-redos" route: reads the "text" query
# parameter from the request and passes it to two regex calls, discarding
# their results. The trailing "NOT OK" markers are presumably the expectations
# checked by the test harness -- keep them on the same lines as the calls.
@app.route("/poly-redos")
def code_execution():
    text = request.args.get("text")
    re.sub(r"^\s+|\s+$", "", text) # NOT OK
    re.match(r"^0\.\d+E?\d+$", text) # NOT OK

View File

@@ -0,0 +1,376 @@
# This is currently a copy of the redos test-file, since that one contains many regexes.
import re
# NOT GOOD; attack: "_" + "__".repeat(100)
# Adapted from marked (https://github.com/markedjs/marked), which is licensed
# under the MIT license; see file marked-LICENSE.
bad1 = re.compile(r'''^\b_((?:__|[\s\S])+?)_\b|^\*((?:\*\*|[\s\S])+?)\*(?!\*)''')
# GOOD
# Adapted from marked (https://github.com/markedjs/marked), which is licensed
# under the MIT license; see file marked-LICENSE.
good1 = re.compile(r'^\b_((?:__|[^_])+?)_\b|^\*((?:\*\*|[^*])+?)\*(?!\*)')
# GOOD - there is no witness in the end that could cause the regexp to not match
# Adapted from brace-expansion (https://github.com/juliangruber/brace-expansion),
# which is licensed under the MIT license; see file brace-expansion-LICENSE.
good2 = re.compile(r'(.*,)+.+')
# NOT GOOD; attack: " '" + "\\\\".repeat(100)
# Adapted from CodeMirror (https://github.com/codemirror/codemirror),
# which is licensed under the MIT license; see file CodeMirror-LICENSE.
bad2 = re.compile(r'''^(?:\s+(?:"(?:[^"\\]|\\\\|\\.)+"|'(?:[^'\\]|\\\\|\\.)+'|\((?:[^)\\]|\\\\|\\.)+\)))?''')
# GOOD
# Adapted from lulucms2 (https://github.com/yiifans/lulucms2).
good2 = re.compile(r'''\(\*(?:[\s\S]*?\(\*[\s\S]*?\*\))*[\s\S]*?\*\)''')
# GOOD
# Adapted from jest (https://github.com/facebook/jest), which is licensed
# under the MIT license; see file jest-LICENSE.
good3 = re.compile(r'''^ *(\S.*\|.*)\n *([-:]+ *\|[-| :]*)\n((?:.*\|.*(?:\n|$))*)\n*''')
# NOT GOOD, variant of good3; attack: "a|\n:|\n" + "||\n".repeat(100)
bad4 = re.compile(r'''^ *(\S.*\|.*)\n *([-:]+ *\|[-| :]*)\n((?:.*\|.*(?:\n|$))*)a''')
# NOT GOOD; attack: "/" + "\\/a".repeat(100)
# Adapted from ANodeBlog (https://github.com/gefangshuai/ANodeBlog),
# which is licensed under the Apache License 2.0; see file ANodeBlog-LICENSE.
bad5 = re.compile(r'''\/(?![ *])(\\\/|.)*?\/[gim]*(?=\W|$)''')
# NOT GOOD; attack: "##".repeat(100) + "\na"
# Adapted from CodeMirror (https://github.com/codemirror/codemirror),
# which is licensed under the MIT license; see file CodeMirror-LICENSE.
bad6 = re.compile(r'''^([\s\[\{\(]|#.*)*$''')
# GOOD
good4 = re.compile(r'''(\r\n|\r|\n)+''')
# BAD - PoC: `node -e "/((?:[^\"\']|\".*?\"|\'.*?\')*?)([(,)]|$)/.test(\"'''''''''''''''''''''''''''''''''''''''''''''\\\"\");"`. It's complicated though, because the regexp still matches something, it just matches the empty-string after the attack string.
actuallyBad = re.compile(r'''((?:[^"']|".*?"|'.*?')*?)([(,)]|$)''')
# NOT GOOD; attack: "a" + "[]".repeat(100) + ".b\n"
# Adapted from Knockout (https://github.com/knockout/knockout), which is
# licensed under the MIT license; see file knockout-LICENSE
bad6 = re.compile(r'''^[\_$a-z][\_$a-z0-9]*(\[.*?\])*(\.[\_$a-z][\_$a-z0-9]*(\[.*?\])*)*$''')
# GOOD
good6 = re.compile(r'''(a|.)*''')
# Testing the NFA - only some of the below are detected.
bad7 = re.compile(r'''^([a-z]+)+$''')
bad8 = re.compile(r'''^([a-z]*)*$''')
bad9 = re.compile(r'''^([a-zA-Z0-9])(([\\-.]|[_]+)?([a-zA-Z0-9]+))*(@){1}[a-z0-9]+[.]{1}(([a-z]{2,3})|([a-z]{2,3}[.]{1}[a-z]{2,3}))$''')
bad10 = re.compile(r'''^(([a-z])+.)+[A-Z]([a-z])+$''')
# NOT GOOD; attack: "[" + "][".repeat(100) + "]!"
# Adapted from Prototype.js (https://github.com/prototypejs/prototype), which
# is licensed under the MIT license; see file Prototype.js-LICENSE.
bad11 = re.compile(r'''(([\w#:.~>+()\s-]+|\*|\[.*?\])+)\s*(,|$)''')
# NOT GOOD; attack: "'" + "\\a".repeat(100) + '"'
# Adapted from Prism (https://github.com/PrismJS/prism), which is licensed
# under the MIT license; see file Prism-LICENSE.
bad12 = re.compile(r'''("|')(\\?.)*?\1''')
# NOT GOOD
bad13 = re.compile(r'''(b|a?b)*c''')
# NOT GOOD
bad15 = re.compile(r'''(a|aa?)*b''')
# GOOD
good7 = re.compile(r'''(.|\n)*!''')
# NOT GOOD; attack: "\n".repeat(100) + "."
# The re.DOTALL flag is what makes this pattern bad: with it, `.` also matches
# "\n", so both alternatives of `(.|\n)` can consume a newline and the
# repetition becomes ambiguous. Without the flag the pattern is unambiguous
# on newlines and would not deserve the NOT GOOD verdict.
bad16 = re.compile(r'''(.|\n)*!''', re.DOTALL)
# GOOD
good8 = re.compile(r'''([\w.]+)*''')
# NOT GOOD
bad17 = re.compile(r'''(a|aa?)*b''')
# GOOD - not used as regexp
good9 = '(a|aa?)*b'
# NOT GOOD
bad18 = re.compile(r'''(([\s\S]|[^a])*)"''')
# GOOD - there is no witness in the end that could cause the regexp to not match
good10 = re.compile(r'''([^"']+)*''')
# NOT GOOD
bad20 = re.compile(r'''((.|[^a])*)"''')
# GOOD
good10 = re.compile(r'''((a|[^a])*)"''')
# NOT GOOD
bad21 = re.compile(r'''((b|[^a])*)"''')
# NOT GOOD
bad22 = re.compile(r'''((G|[^a])*)"''')
# NOT GOOD
bad23 = re.compile(r'''(([0-9]|[^a])*)"''')
# NOT GOOD
bad24 = re.compile(r'''(?:=(?:([!#\$%&'\*\+\-\.\^_`\|~0-9A-Za-z]+)|"((?:\\[\x00-\x7f]|[^\x00-\x08\x0a-\x1f\x7f"])*)"))?''')
# NOT GOOD
bad25 = re.compile(r'''"((?:\\[\x00-\x7f]|[^\x00-\x08\x0a-\x1f\x7f"])*)"''')
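# GOOD - despite the `bad26` name: the second alternative excludes the backslash, so the two alternatives never match the same character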
bad26 = re.compile(r'''"((?:\\[\x00-\x7f]|[^\x00-\x08\x0a-\x1f\x7f"\\])*)"''')
# NOT GOOD
bad27 = re.compile(r'''(([a-z]|[d-h])*)"''')
# NOT GOOD
bad27 = re.compile(r'''(([^a-z]|[^0-9])*)"''')
# NOT GOOD
bad28 = re.compile(r'''((\d|[0-9])*)"''')
# NOT GOOD
bad29 = re.compile(r'''((\s|\s)*)"''')
# NOT GOOD
bad30 = re.compile(r'''((\w|G)*)"''')
# GOOD
good11 = re.compile(r'''((\s|\d)*)"''')
# NOT GOOD
bad31 = re.compile(r'''((\d|\w)*)"''')
# NOT GOOD
bad32 = re.compile(r'''((\d|5)*)"''')
# NOT GOOD
bad33 = re.compile(r'''((\s|[\f])*)"''')
# NOT GOOD
bad34 = re.compile(r'''((\s|[\v]|\\v)*)"''')
# NOT GOOD
bad35 = re.compile(r'''((\f|[\f])*)"''')
# NOT GOOD
bad36 = re.compile(r'''((\W|\D)*)"''')
# NOT GOOD
bad37 = re.compile(r'''((\S|\w)*)"''')
# NOT GOOD
bad38 = re.compile(r'''((\S|[\w])*)"''')
# NOT GOOD
bad39 = re.compile(r'''((1s|[\da-z])*)"''')
# NOT GOOD
bad40 = re.compile(r'''((0|[\d])*)"''')
# NOT GOOD
bad41 = re.compile(r'''(([\d]+)*)"''')
# GOOD - there is no witness in the end that could cause the regexp to not match
good12 = re.compile(r'''(\d+(X\d+)?)+''')
# GOOD - there is no witness in the end that could cause the regexp to not match
good13 = re.compile(r'''([0-9]+(X[0-9]*)?)*''')
# GOOD
good15 = re.compile(r'''^([^>]+)*(>|$)''')
# NOT GOOD
bad43 = re.compile(r'''^([^>a]+)*(>|$)''')
# NOT GOOD
bad44 = re.compile(r'''(\n\s*)+$''')
# NOT GOOD
bad45 = re.compile(r'''^(?:\s+|#.*|\(\?#[^)]*\))*(?:[?*+]|{\d+(?:,\d*)?})''')
# NOT GOOD
bad46 = re.compile(r'''\{\[\s*([a-zA-Z]+)\(([a-zA-Z]+)\)((\s*([a-zA-Z]+)\: ?([ a-zA-Z{}]+),?)+)*\s*\]\}''')
# NOT GOOD
bad47 = re.compile(r'''(a+|b+|c+)*c''')
# NOT GOOD
bad48 = re.compile(r'''(((a+a?)*)+b+)''')
# NOT GOOD
bad49 = re.compile(r'''(a+)+bbbb''')
# GOOD
good16 = re.compile(r'''(a+)+aaaaa*a+''')
# NOT GOOD
bad50 = re.compile(r'''(a+)+aaaaa$''')
# GOOD
good17 = re.compile(r'''(\n+)+\n\n''')
# NOT GOOD
bad51 = re.compile(r'''(\n+)+\n\n$''')
# NOT GOOD
bad52 = re.compile(r'''([^X]+)*$''')
# NOT GOOD
bad53 = re.compile(r'''(([^X]b)+)*$''')
# GOOD
good18 = re.compile(r'''(([^X]b)+)*($|[^X]b)''')
# NOT GOOD
bad54 = re.compile(r'''(([^X]b)+)*($|[^X]c)''')
# GOOD
good20 = re.compile(r'''((ab)+)*ababab''')
# GOOD
good21 = re.compile(r'''((ab)+)*abab(ab)*(ab)+''')
# GOOD
good22 = re.compile(r'''((ab)+)*''')
# NOT GOOD
bad55 = re.compile(r'''((ab)+)*$''')
# GOOD
good23 = re.compile(r'''((ab)+)*[a1][b1][a2][b2][a3][b3]''')
# NOT GOOD
bad56 = re.compile(r'''([\n\s]+)*(.)''')
# GOOD - any witness passes through the accept state.
good24 = re.compile(r'''(A*A*X)*''')
# GOOD
good26 = re.compile(r'''([^\\\]]+)*''')
# NOT GOOD
bad59 = re.compile(r'''(\w*foobarbaz\w*foobarbaz\w*foobarbaz\w*foobarbaz\s*foobarbaz\d*foobarbaz\w*)+-''')
# NOT GOOD
bad60 = re.compile(r'''(.thisisagoddamnlongstringforstresstestingthequery|\sthisisagoddamnlongstringforstresstestingthequery)*-''')
# NOT GOOD
bad61 = re.compile(r'''(thisisagoddamnlongstringforstresstestingthequery|this\w+query)*-''')
# GOOD
good27 = re.compile(r'''(thisisagoddamnlongstringforstresstestingthequery|imanotherbutunrelatedstringcomparedtotheotherstring)*-''')
# GOOD
good28 = re.compile(r'''foo([\uDC66\uDC67]|[\uDC68\uDC69])*foo''')
# GOOD
good29 = re.compile(r'''foo((\uDC66|\uDC67)|(\uDC68|\uDC69))*foo''')
# NOT GOOD (but cannot currently construct a prefix)
bad62 = re.compile(r'''a{2,3}(b+)+X''')
# NOT GOOD (and a good prefix test)
bad63 = re.compile(r'''^<(\w+)((?:\s+\w+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>''')
# GOOD
good30 = re.compile(r'''(a+)*[\s\S][\s\S][\s\S]?''')
# GOOD - but we fail to see that repeating the attack string ends in the "accept any" state (due to not parsing the range `[\s\S]{2,3}`).
good31 = re.compile(r'''(a+)*[\s\S]{2,3}''')
# GOOD - but we spuriously conclude that a rejecting suffix exists (due to not parsing the range `[\s\S]{2,}` when constructing the NFA).
good32 = re.compile(r'''(a+)*([\s\S]{2,}|X)$''')
# GOOD
good33 = re.compile(r'''(a+)*([\s\S]*|X)$''')
# NOT GOOD
bad64 = re.compile(r'''((a+)*$|[\s\S]+)''')
# GOOD - but still flagged. The only change compared to the above is the order of alternatives, which we don't model.
good34 = re.compile(r'''([\s\S]+|(a+)*$)''')
# GOOD
good35 = re.compile(r'''((;|^)a+)+$''')
# NOT GOOD (a good prefix test)
bad65 = re.compile(r'''(^|;)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(e+)+f''')
# NOT GOOD
bad66 = re.compile(r'''^ab(c+)+$''')
# NOT GOOD
bad67 = re.compile(r'''(\d(\s+)*){20}''')
# GOOD - but we spuriously conclude that a rejecting suffix exists.
good36 = re.compile(r'''(([^/]|X)+)(\/[\s\S]*)*$''')
# GOOD - but we spuriously conclude that a rejecting suffix exists.
good37 = re.compile(r'''^((x([^Y]+)?)*(Y|$))''')
# NOT GOOD
bad68 = re.compile(r'''(a*)+b''')
# NOT GOOD
bad69 = re.compile(r'''foo([\w-]*)+bar''')
# NOT GOOD
bad70 = re.compile(r'''((ab)*)+c''')
# NOT GOOD
bad71 = re.compile(r'''(a?a?)*b''')
# GOOD
good38 = re.compile(r'''(a?)*b''')
# NOT GOOD - but not detected
bad72 = re.compile(r'''(c?a?)*b''')
# NOT GOOD
bad73 = re.compile(r'''(?:a|a?)+b''')
# NOT GOOD - but not detected.
bad74 = re.compile(r'''(a?b?)*$''')
# NOT GOOD
bad76 = re.compile(r'''PRE(([a-c]|[c-d])T(e?e?e?e?|X))+(cTcT|cTXcTX$)''')
# NOT GOOD - but not detected
bad77 = re.compile(r'''^((a)+\w)+$''')
# NOT GOOD
bad78 = re.compile(r'''^(b+.)+$''')
# GOOD
good39 = re.compile(r'''a*b''')
# All 4 bad combinations of nested * and +
bad79 = re.compile(r'''(a*)*b''')
bad80 = re.compile(r'''(a+)*b''')
bad81 = re.compile(r'''(a*)+b''')
bad82 = re.compile(r'''(a+)+b''')
# GOOD
good40 = re.compile(r'''(a|b)+''')
good41 = re.compile(r'''(?:[\s;,"'<>(){}|[\]@=+*]|:(?![/\\]))+''') # parses wrongly, sees column 42 as a char set start
# NOT GOOD
bad83 = re.compile(r'''^((?:a{|-)|\w\{)+X$''')
bad84 = re.compile(r'''^((?:a{0|-)|\w\{\d)+X$''')
bad85 = re.compile(r'''^((?:a{0,|-)|\w\{\d,)+X$''')
bad86 = re.compile(r'''^((?:a{0,2|-)|\w\{\d,\d)+X$''')
# GOOD:
good42 = re.compile(r'''^((?:a{0,2}|-)|\w\{\d,\d\})+X$''')
# NOT GOOD
bad87 = re.compile(r'X(\u0061|a)*Y')
# GOOD
good43 = re.compile(r'X(\u0061|b)+Y')

View File

@@ -0,0 +1,20 @@
import re
# Treatment of escapes
re.compile(r"X([^\.]|\.)*$") # No ReDoS.
re.compile(r"X(Æ|\Æ)+$") # Has ReDoS.
# Treatment of line breaks
re.compile(r'(?:.|\n)*b') # No ReDoS.
re.compile(r'(?:.|\n)*b', re.DOTALL) # Has ReDoS.
# minimal example constructed by @erik-krogh
baz = re.compile(r'\+0')
# excerpts from LGTM.com
re.compile(r'\+0x')
re.compile(r'\+0x.*')
re.compile(r'+\-0+\.')
re.compile('\s+\+0x[0-9]+')
re.compile(r'\+0000 .*')
re.compile('\#[0-9]+ 0x[0-9]')