mirror of
https://github.com/github/codeql.git
synced 2026-04-30 11:15:13 +02:00
Merge pull request #6460 from yoff/python-regex-parsing-consistency-checks
Python: Add regex parsing consistency checks
This commit is contained in:
@@ -19,4 +19,4 @@
|
||||
| x\| | 0 | 2 | x\| | 0 | 1 | x |
|
||||
| x\| | 0 | 2 | x\| | 2 | 2 | |
|
||||
| x\|(?<!\\w)l | 0 | 10 | x\|(?<!\\w)l | 0 | 1 | x |
|
||||
| x\|(?<!\\w)l | 0 | 10 | x\|(?<!\\w)l | 2 | 10 | (?<!\\w)l |
|
||||
| x\|(?<!\\w)l | 0 | 10 | x\|(?<!\\w)l | 2 | 10 | (?<!\\w)l |
|
||||
|
||||
@@ -52,6 +52,8 @@
|
||||
| [^A-Z] | 2 | 3 |
|
||||
| [^A-Z] | 4 | 5 |
|
||||
| [^]] | 2 | 3 |
|
||||
| \\+0 | 0 | 2 |
|
||||
| \\+0 | 2 | 3 |
|
||||
| \\A[+-]?\\d+ | 0 | 2 |
|
||||
| \\A[+-]?\\d+ | 3 | 4 |
|
||||
| \\A[+-]?\\d+ | 4 | 5 |
|
||||
|
||||
12
python/ql/test/library-tests/regex/Consistency.ql
Normal file
12
python/ql/test/library-tests/regex/Consistency.ql
Normal file
@@ -0,0 +1,12 @@
|
||||
/**
|
||||
* Flags regular expressions that are parsed ambiguously
|
||||
*/
|
||||
|
||||
import python
|
||||
import semmle.python.regex
|
||||
|
||||
from string str, Location loc, int counter
|
||||
where
|
||||
counter = strictcount(Regex term | term.getLocation() = loc and term.getText() = str) and
|
||||
counter > 1
|
||||
select str, counter, loc
|
||||
@@ -42,6 +42,8 @@
|
||||
| [^A-Z] | last | 0 | 6 |
|
||||
| [^]] | first | 0 | 4 |
|
||||
| [^]] | last | 0 | 4 |
|
||||
| \\+0 | first | 0 | 2 |
|
||||
| \\+0 | last | 2 | 3 |
|
||||
| \\A[+-]?\\d+ | first | 0 | 2 |
|
||||
| \\A[+-]?\\d+ | last | 7 | 9 |
|
||||
| \\A[+-]?\\d+ | last | 7 | 10 |
|
||||
|
||||
@@ -113,6 +113,9 @@
|
||||
| [^]] | char | 2 | 3 |
|
||||
| [^]] | char-set | 0 | 4 |
|
||||
| [^]] | sequence | 0 | 4 |
|
||||
| \\+0 | char | 0 | 2 |
|
||||
| \\+0 | char | 2 | 3 |
|
||||
| \\+0 | sequence | 0 | 3 |
|
||||
| \\A[+-]?\\d+ | char | 0 | 2 |
|
||||
| \\A[+-]?\\d+ | char | 3 | 4 |
|
||||
| \\A[+-]?\\d+ | char | 4 | 5 |
|
||||
|
||||
@@ -24,7 +24,8 @@ except re.error:
|
||||
|
||||
re.compile(r'[^A-Z]') #$ charRange=2:3-4:5
|
||||
|
||||
re.compile(r'[\0-\09]') #$ charRange=1:3-4:7
|
||||
re.compile(r'[\0-\09]') #$ charRange=1:3-4:6
|
||||
re.compile(r'[\0-\07]') #$ charRange=1:3-4:7
|
||||
|
||||
re.compile(r'[\0123-5]') #$ charRange=5:6-7:8
|
||||
|
||||
|
||||
@@ -10,8 +10,10 @@ re.compile(r'[\---]') #$ escapedCharacter=1:3
|
||||
re.compile(r'[--\-]') #$ escapedCharacter=3:5
|
||||
re.compile(r'[\--\-]') #$ escapedCharacter=1:3 escapedCharacter=4:6
|
||||
re.compile(r'[0\-9-A-Z]') #$ escapedCharacter=2:4
|
||||
re.compile(r'[\0-\09]') #$ escapedCharacter=1:3 escapedCharacter=4:7
|
||||
re.compile(r'[\0-\09]') #$ escapedCharacter=1:3 escapedCharacter=4:6
|
||||
re.compile(r'[\0-\07]') #$ escapedCharacter=1:3 escapedCharacter=4:7
|
||||
re.compile(r'[\0123-5]') #$ escapedCharacter=1:5
|
||||
re.compile(r'\1754\1854\17\18\07\08') #$ escapedCharacter=0:4 escapedCharacter=16:19 escapedCharacter=19:21
|
||||
|
||||
#ODASA-3985
|
||||
#Half Surrogate pairs
|
||||
@@ -21,3 +23,9 @@ re.compile(u'[\U00010000-\U0010ffff]') # not escapes
|
||||
|
||||
#Misparsed on LGTM
|
||||
re.compile(r"\[(?P<txt>[^[]*)\]\((?P<uri>[^)]*)") #$ escapedCharacter=0:2 escapedCharacter=16:18 escapedCharacter=18:20
|
||||
|
||||
#Non-raw string
|
||||
re_blank = re.compile('(\n|\r|\\s)*\n', re.M) #$ escapedCharacter=5:7
|
||||
|
||||
#Backreference confusion
|
||||
re.compile(r'\+0') #$ escapedCharacter=0:2
|
||||
|
||||
@@ -70,3 +70,6 @@ re.compile("", re.M) # ODASA-8056
|
||||
# FP reported in https://github.com/github/codeql/issues/3712
|
||||
# This does not define a regex (but could be used by other code to do so)
|
||||
escaped = re.escape("https://www.humblebundle.com/home/library")
|
||||
|
||||
# Consistency check
|
||||
baz = re.compile(r'\+0')
|
||||
|
||||
15
python/ql/test/library-tests/regexparser/Consistency.ql
Normal file
15
python/ql/test/library-tests/regexparser/Consistency.ql
Normal file
@@ -0,0 +1,15 @@
|
||||
/**
|
||||
* Flags regular expressions that are parsed ambiguously
|
||||
*/
|
||||
|
||||
import python
|
||||
import semmle.python.RegexTreeView
|
||||
|
||||
from string str, int counter, Location loc
|
||||
where
|
||||
counter =
|
||||
strictcount(RegExpTerm term |
|
||||
term.getLocation() = loc and term.isRootTerm() and term.toString() = str
|
||||
) and
|
||||
counter > 1
|
||||
select str, counter, loc
|
||||
94
python/ql/test/library-tests/regexparser/KnownCVEs.py
Normal file
94
python/ql/test/library-tests/regexparser/KnownCVEs.py
Normal file
@@ -0,0 +1,94 @@
|
||||
import re
|
||||
|
||||
# linear
|
||||
# https://github.com/github/codeql-python-CVE-coverage/issues/439
|
||||
rex_blame = re.compile(r'\s*(\d+)\s*(\S+) (.*)')
|
||||
|
||||
# https://github.com/github/codeql-python-CVE-coverage/issues/402
|
||||
whitespace = br"[\000\011\012\014\015\040]"
|
||||
whitespace_optional = whitespace + b"*"
|
||||
newline_only = br"[\r\n]+"
|
||||
newline = whitespace_optional + newline_only + whitespace_optional
|
||||
toFlag = re.compile(newline)
|
||||
|
||||
# https://github.com/github/codeql-python-CVE-coverage/issues/400
|
||||
re.compile(r'[+-]?(\d+)*\.\d+%?')
|
||||
re.compile(r'"""\s+(?:.|\n)*?\s+"""')
|
||||
re.compile(r'(\{\s+)(\S+)(\s+[^}]+\s+\}\s)')
|
||||
re.compile(r'".*``.*``.*"')
|
||||
re.compile(r'(\s*)(?:(.+)(\s*)(=)(\s*))?(.+)(\()(.*)(\))(\s*)')
|
||||
re.compile(r'(%config)(\s*\(\s*)(\w+)(\s*=\s*)(.*?)(\s*\)\s*)')
|
||||
re.compile(r'(%new)(\s*)(\()(\s*.*?\s*)(\))')
|
||||
re.compile(r'(\$)(evoque|overlay)(\{(%)?)(\s*[#\w\-"\'.]+[^=,%}]+?)?')
|
||||
re.compile(r'(\.\w+\b)(\s*=\s*)([^;]*)(\s*;)')
|
||||
|
||||
# linear
|
||||
# https://github.com/github/codeql-python-CVE-coverage/issues/392
|
||||
simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$")
|
||||
|
||||
# https://github.com/github/codeql-python-CVE-coverage/issues/249
|
||||
rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
|
||||
'realm=(["\']?)([^"\']*)\\2', re.I)
|
||||
|
||||
# https://github.com/github/codeql-python-CVE-coverage/issues/248
|
||||
gauntlet = re.compile(
|
||||
r"""^([-/:,#%.'"\s!\w]|\w-\w|'[\s\w]+'\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""",
|
||||
flags=re.U
|
||||
)
|
||||
|
||||
# https://github.com/github/codeql-python-CVE-coverage/issues/227
|
||||
# from .compat import tobytes
|
||||
|
||||
WS = "[ \t]"
|
||||
OWS = WS + "{0,}?"
|
||||
|
||||
# RFC 7230 Section 3.2.6 "Field Value Components":
|
||||
# tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*"
|
||||
# / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"
|
||||
# / DIGIT / ALPHA
|
||||
# obs-text = %x80-FF
|
||||
TCHAR = r"[!#$%&'*+\-.^_`|~0-9A-Za-z]"
|
||||
OBS_TEXT = r"\x80-\xff"
|
||||
TOKEN = TCHAR + "{1,}"
|
||||
# RFC 5234 Appendix B.1 "Core Rules":
|
||||
# VCHAR = %x21-7E
|
||||
# ; visible (printing) characters
|
||||
VCHAR = r"\x21-\x7e"
|
||||
# header-field = field-name ":" OWS field-value OWS
|
||||
# field-name = token
|
||||
# field-value = *( field-content / obs-fold )
|
||||
# field-content = field-vchar [ 1*( SP / HTAB ) field-vchar ]
|
||||
# field-vchar = VCHAR / obs-text
|
||||
# Errata from: https://www.rfc-editor.org/errata_search.php?rfc=7230&eid=4189
|
||||
# changes field-content to:
|
||||
#
|
||||
# field-content = field-vchar [ 1*( SP / HTAB / field-vchar )
|
||||
# field-vchar ]
|
||||
|
||||
FIELD_VCHAR = "[" + VCHAR + OBS_TEXT + "]"
|
||||
FIELD_CONTENT = FIELD_VCHAR + "([ \t" + VCHAR + OBS_TEXT + "]+" + FIELD_VCHAR + "){,1}"
|
||||
FIELD_VALUE = "(" + FIELD_CONTENT + "){0,}"
|
||||
|
||||
HEADER_FIELD = re.compile(
|
||||
# tobytes(
|
||||
"^(?P<name>" + TOKEN + "):" + OWS + "(?P<value>" + FIELD_VALUE + ")" + OWS + "$"
|
||||
# )
|
||||
)
|
||||
|
||||
# https://github.com/github/codeql-python-CVE-coverage/issues/224
|
||||
pattern = re.compile(
|
||||
r'^(:?(([a-zA-Z]{1})|([a-zA-Z]{1}[a-zA-Z]{1})|' # domain pt.1
|
||||
r'([a-zA-Z]{1}[0-9]{1})|([0-9]{1}[a-zA-Z]{1})|' # domain pt.2
|
||||
r'([a-zA-Z0-9][-_a-zA-Z0-9]{0,61}[a-zA-Z0-9]))\.)+' # domain pt.3
|
||||
r'([a-zA-Z]{2,13}|(xn--[a-zA-Z0-9]{2,30}))$' # TLD
|
||||
)
|
||||
|
||||
# https://github.com/github/codeql-python-CVE-coverage/issues/189
|
||||
URL_REGEX = (
|
||||
r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|'
|
||||
r'[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|'
|
||||
r'(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|'
|
||||
r'[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))' # "emacs!
|
||||
)
|
||||
|
||||
url = re.compile(URL_REGEX)
|
||||
9
python/ql/test/library-tests/regexparser/polredos.py
Normal file
9
python/ql/test/library-tests/regexparser/polredos.py
Normal file
@@ -0,0 +1,9 @@
|
||||
import re
|
||||
from flask import Flask, request
|
||||
app = Flask(__name__)
|
||||
|
||||
@app.route("/poly-redos")
|
||||
def code_execution():
|
||||
text = request.args.get("text")
|
||||
re.sub(r"^\s+|\s+$", "", text) # NOT OK
|
||||
re.match(r"^0\.\d+E?\d+$", text) # NOT OK
|
||||
376
python/ql/test/library-tests/regexparser/redos.py
Normal file
376
python/ql/test/library-tests/regexparser/redos.py
Normal file
@@ -0,0 +1,376 @@
|
||||
# This is currently a copy of the redos test-file, since that one contains many regexes.
|
||||
|
||||
import re
|
||||
|
||||
# NOT GOOD; attack: "_" + "__".repeat(100)
|
||||
# Adapted from marked (https://github.com/markedjs/marked), which is licensed
|
||||
# under the MIT license; see file marked-LICENSE.
|
||||
bad1 = re.compile(r'''^\b_((?:__|[\s\S])+?)_\b|^\*((?:\*\*|[\s\S])+?)\*(?!\*)''')
|
||||
|
||||
# GOOD
|
||||
# Adapted from marked (https://github.com/markedjs/marked), which is licensed
|
||||
# under the MIT license; see file marked-LICENSE.
|
||||
good1 = re.compile(r'^\b_((?:__|[^_])+?)_\b|^\*((?:\*\*|[^*])+?)\*(?!\*)')
|
||||
|
||||
# GOOD - there is no witness in the end that could cause the regexp to not match
|
||||
# Adapted from brace-expansion (https://github.com/juliangruber/brace-expansion),
|
||||
# which is licensed under the MIT license; see file brace-expansion-LICENSE.
|
||||
good2 = re.compile(r'(.*,)+.+')
|
||||
|
||||
# NOT GOOD; attack: " '" + "\\\\".repeat(100)
|
||||
# Adapted from CodeMirror (https://github.com/codemirror/codemirror),
|
||||
# which is licensed under the MIT license; see file CodeMirror-LICENSE.
|
||||
bad2 = re.compile(r'''^(?:\s+(?:"(?:[^"\\]|\\\\|\\.)+"|'(?:[^'\\]|\\\\|\\.)+'|\((?:[^)\\]|\\\\|\\.)+\)))?''')
|
||||
|
||||
# GOOD
|
||||
# Adapted from lulucms2 (https://github.com/yiifans/lulucms2).
|
||||
good2 = re.compile(r'''\(\*(?:[\s\S]*?\(\*[\s\S]*?\*\))*[\s\S]*?\*\)''')
|
||||
|
||||
# GOOD
|
||||
# Adapted from jest (https://github.com/facebook/jest), which is licensed
|
||||
# under the MIT license; see file jest-LICENSE.
|
||||
good3 = re.compile(r'''^ *(\S.*\|.*)\n *([-:]+ *\|[-| :]*)\n((?:.*\|.*(?:\n|$))*)\n*''')
|
||||
|
||||
# NOT GOOD, variant of good3; attack: "a|\n:|\n" + "||\n".repeat(100)
|
||||
bad4 = re.compile(r'''^ *(\S.*\|.*)\n *([-:]+ *\|[-| :]*)\n((?:.*\|.*(?:\n|$))*)a''')
|
||||
|
||||
# NOT GOOD; attack: "/" + "\\/a".repeat(100)
|
||||
# Adapted from ANodeBlog (https://github.com/gefangshuai/ANodeBlog),
|
||||
# which is licensed under the Apache License 2.0; see file ANodeBlog-LICENSE.
|
||||
bad5 = re.compile(r'''\/(?![ *])(\\\/|.)*?\/[gim]*(?=\W|$)''')
|
||||
|
||||
# NOT GOOD; attack: "##".repeat(100) + "\na"
|
||||
# Adapted from CodeMirror (https://github.com/codemirror/codemirror),
|
||||
# which is licensed under the MIT license; see file CodeMirror-LICENSE.
|
||||
bad6 = re.compile(r'''^([\s\[\{\(]|#.*)*$''')
|
||||
|
||||
# GOOD
|
||||
good4 = re.compile(r'''(\r\n|\r|\n)+''')
|
||||
|
||||
# BAD - PoC: `node -e "/((?:[^\"\']|\".*?\"|\'.*?\')*?)([(,)]|$)/.test(\"'''''''''''''''''''''''''''''''''''''''''''''\\\"\");"`. It's complicated though, because the regexp still matches something, it just matches the empty-string after the attack string.
|
||||
actuallyBad = re.compile(r'''((?:[^"']|".*?"|'.*?')*?)([(,)]|$)''')
|
||||
|
||||
# NOT GOOD; attack: "a" + "[]".repeat(100) + ".b\n"
|
||||
# Adapted from Knockout (https://github.com/knockout/knockout), which is
|
||||
# licensed under the MIT license; see file knockout-LICENSE
|
||||
bad6 = re.compile(r'''^[\_$a-z][\_$a-z0-9]*(\[.*?\])*(\.[\_$a-z][\_$a-z0-9]*(\[.*?\])*)*$''')
|
||||
|
||||
# GOOD
|
||||
good6 = re.compile(r'''(a|.)*''')
|
||||
|
||||
# Testing the NFA - only some of the below are detected.
|
||||
bad7 = re.compile(r'''^([a-z]+)+$''')
|
||||
bad8 = re.compile(r'''^([a-z]*)*$''')
|
||||
bad9 = re.compile(r'''^([a-zA-Z0-9])(([\\-.]|[_]+)?([a-zA-Z0-9]+))*(@){1}[a-z0-9]+[.]{1}(([a-z]{2,3})|([a-z]{2,3}[.]{1}[a-z]{2,3}))$''')
|
||||
bad10 = re.compile(r'''^(([a-z])+.)+[A-Z]([a-z])+$''')
|
||||
|
||||
# NOT GOOD; attack: "[" + "][".repeat(100) + "]!"
|
||||
# Adapted from Prototype.js (https://github.com/prototypejs/prototype), which
|
||||
# is licensed under the MIT license; see file Prototype.js-LICENSE.
|
||||
bad11 = re.compile(r'''(([\w#:.~>+()\s-]+|\*|\[.*?\])+)\s*(,|$)''')
|
||||
|
||||
# NOT GOOD; attack: "'" + "\\a".repeat(100) + '"'
|
||||
# Adapted from Prism (https://github.com/PrismJS/prism), which is licensed
|
||||
# under the MIT license; see file Prism-LICENSE.
|
||||
bad12 = re.compile(r'''("|')(\\?.)*?\1''')
|
||||
|
||||
# NOT GOOD
|
||||
bad13 = re.compile(r'''(b|a?b)*c''')
|
||||
|
||||
# NOT GOOD
|
||||
bad15 = re.compile(r'''(a|aa?)*b''')
|
||||
|
||||
# GOOD
|
||||
good7 = re.compile(r'''(.|\n)*!''')
|
||||
|
||||
# NOT GOOD; attack: "\n".repeat(100) + "."
|
||||
bad16 = re.compile(r'''(.|\n)*!''')
|
||||
|
||||
# GOOD
|
||||
good8 = re.compile(r'''([\w.]+)*''')
|
||||
|
||||
# NOT GOOD
|
||||
bad17 = re.compile(r'''(a|aa?)*b''')
|
||||
|
||||
# GOOD - not used as regexp
|
||||
good9 = '(a|aa?)*b'
|
||||
|
||||
# NOT GOOD
|
||||
bad18 = re.compile(r'''(([\s\S]|[^a])*)"''')
|
||||
|
||||
# GOOD - there is no witness in the end that could cause the regexp to not match
|
||||
good10 = re.compile(r'''([^"']+)*''')
|
||||
|
||||
# NOT GOOD
|
||||
bad20 = re.compile(r'''((.|[^a])*)"''')
|
||||
|
||||
# GOOD
|
||||
good10 = re.compile(r'''((a|[^a])*)"''')
|
||||
|
||||
# NOT GOOD
|
||||
bad21 = re.compile(r'''((b|[^a])*)"''')
|
||||
|
||||
# NOT GOOD
|
||||
bad22 = re.compile(r'''((G|[^a])*)"''')
|
||||
|
||||
# NOT GOOD
|
||||
bad23 = re.compile(r'''(([0-9]|[^a])*)"''')
|
||||
|
||||
# NOT GOOD
|
||||
bad24 = re.compile(r'''(?:=(?:([!#\$%&'\*\+\-\.\^_`\|~0-9A-Za-z]+)|"((?:\\[\x00-\x7f]|[^\x00-\x08\x0a-\x1f\x7f"])*)"))?''')
|
||||
|
||||
# NOT GOOD
|
||||
bad25 = re.compile(r'''"((?:\\[\x00-\x7f]|[^\x00-\x08\x0a-\x1f\x7f"])*)"''')
|
||||
|
||||
# GOOD
|
||||
bad26 = re.compile(r'''"((?:\\[\x00-\x7f]|[^\x00-\x08\x0a-\x1f\x7f"\\])*)"''')
|
||||
|
||||
# NOT GOOD
|
||||
bad27 = re.compile(r'''(([a-z]|[d-h])*)"''')
|
||||
|
||||
# NOT GOOD
|
||||
bad27 = re.compile(r'''(([^a-z]|[^0-9])*)"''')
|
||||
|
||||
# NOT GOOD
|
||||
bad28 = re.compile(r'''((\d|[0-9])*)"''')
|
||||
|
||||
# NOT GOOD
|
||||
bad29 = re.compile(r'''((\s|\s)*)"''')
|
||||
|
||||
# NOT GOOD
|
||||
bad30 = re.compile(r'''((\w|G)*)"''')
|
||||
|
||||
# GOOD
|
||||
good11 = re.compile(r'''((\s|\d)*)"''')
|
||||
|
||||
# NOT GOOD
|
||||
bad31 = re.compile(r'''((\d|\w)*)"''')
|
||||
|
||||
# NOT GOOD
|
||||
bad32 = re.compile(r'''((\d|5)*)"''')
|
||||
|
||||
# NOT GOOD
|
||||
bad33 = re.compile(r'''((\s|[\f])*)"''')
|
||||
|
||||
# NOT GOOD
|
||||
bad34 = re.compile(r'''((\s|[\v]|\\v)*)"''')
|
||||
|
||||
# NOT GOOD
|
||||
bad35 = re.compile(r'''((\f|[\f])*)"''')
|
||||
|
||||
# NOT GOOD
|
||||
bad36 = re.compile(r'''((\W|\D)*)"''')
|
||||
|
||||
# NOT GOOD
|
||||
bad37 = re.compile(r'''((\S|\w)*)"''')
|
||||
|
||||
# NOT GOOD
|
||||
bad38 = re.compile(r'''((\S|[\w])*)"''')
|
||||
|
||||
# NOT GOOD
|
||||
bad39 = re.compile(r'''((1s|[\da-z])*)"''')
|
||||
|
||||
# NOT GOOD
|
||||
bad40 = re.compile(r'''((0|[\d])*)"''')
|
||||
|
||||
# NOT GOOD
|
||||
bad41 = re.compile(r'''(([\d]+)*)"''')
|
||||
|
||||
# GOOD - there is no witness in the end that could cause the regexp to not match
|
||||
good12 = re.compile(r'''(\d+(X\d+)?)+''')
|
||||
|
||||
# GOOD - there is no witness in the end that could cause the regexp to not match
|
||||
good13 = re.compile(r'''([0-9]+(X[0-9]*)?)*''')
|
||||
|
||||
# GOOD
|
||||
good15 = re.compile(r'''^([^>]+)*(>|$)''')
|
||||
|
||||
# NOT GOOD
|
||||
bad43 = re.compile(r'''^([^>a]+)*(>|$)''')
|
||||
|
||||
# NOT GOOD
|
||||
bad44 = re.compile(r'''(\n\s*)+$''')
|
||||
|
||||
# NOT GOOD
|
||||
bad45 = re.compile(r'''^(?:\s+|#.*|\(\?#[^)]*\))*(?:[?*+]|{\d+(?:,\d*)?})''')
|
||||
|
||||
# NOT GOOD
|
||||
bad46 = re.compile(r'''\{\[\s*([a-zA-Z]+)\(([a-zA-Z]+)\)((\s*([a-zA-Z]+)\: ?([ a-zA-Z{}]+),?)+)*\s*\]\}''')
|
||||
|
||||
# NOT GOOD
|
||||
bad47 = re.compile(r'''(a+|b+|c+)*c''')
|
||||
|
||||
# NOT GOOD
|
||||
bad48 = re.compile(r'''(((a+a?)*)+b+)''')
|
||||
|
||||
# NOT GOOD
|
||||
bad49 = re.compile(r'''(a+)+bbbb''')
|
||||
|
||||
# GOOD
|
||||
good16 = re.compile(r'''(a+)+aaaaa*a+''')
|
||||
|
||||
# NOT GOOD
|
||||
bad50 = re.compile(r'''(a+)+aaaaa$''')
|
||||
|
||||
# GOOD
|
||||
good17 = re.compile(r'''(\n+)+\n\n''')
|
||||
|
||||
# NOT GOOD
|
||||
bad51 = re.compile(r'''(\n+)+\n\n$''')
|
||||
|
||||
# NOT GOOD
|
||||
bad52 = re.compile(r'''([^X]+)*$''')
|
||||
|
||||
# NOT GOOD
|
||||
bad53 = re.compile(r'''(([^X]b)+)*$''')
|
||||
|
||||
# GOOD
|
||||
good18 = re.compile(r'''(([^X]b)+)*($|[^X]b)''')
|
||||
|
||||
# NOT GOOD
|
||||
bad54 = re.compile(r'''(([^X]b)+)*($|[^X]c)''')
|
||||
|
||||
# GOOD
|
||||
good20 = re.compile(r'''((ab)+)*ababab''')
|
||||
|
||||
# GOOD
|
||||
good21 = re.compile(r'''((ab)+)*abab(ab)*(ab)+''')
|
||||
|
||||
# GOOD
|
||||
good22 = re.compile(r'''((ab)+)*''')
|
||||
|
||||
# NOT GOOD
|
||||
bad55 = re.compile(r'''((ab)+)*$''')
|
||||
|
||||
# GOOD
|
||||
good23 = re.compile(r'''((ab)+)*[a1][b1][a2][b2][a3][b3]''')
|
||||
|
||||
# NOT GOOD
|
||||
bad56 = re.compile(r'''([\n\s]+)*(.)''')
|
||||
|
||||
# GOOD - any witness passes through the accept state.
|
||||
good24 = re.compile(r'''(A*A*X)*''')
|
||||
|
||||
# GOOD
|
||||
good26 = re.compile(r'''([^\\\]]+)*''')
|
||||
|
||||
# NOT GOOD
|
||||
bad59 = re.compile(r'''(\w*foobarbaz\w*foobarbaz\w*foobarbaz\w*foobarbaz\s*foobarbaz\d*foobarbaz\w*)+-''')
|
||||
|
||||
# NOT GOOD
|
||||
bad60 = re.compile(r'''(.thisisagoddamnlongstringforstresstestingthequery|\sthisisagoddamnlongstringforstresstestingthequery)*-''')
|
||||
|
||||
# NOT GOOD
|
||||
bad61 = re.compile(r'''(thisisagoddamnlongstringforstresstestingthequery|this\w+query)*-''')
|
||||
|
||||
# GOOD
|
||||
good27 = re.compile(r'''(thisisagoddamnlongstringforstresstestingthequery|imanotherbutunrelatedstringcomparedtotheotherstring)*-''')
|
||||
|
||||
# GOOD
|
||||
good28 = re.compile(r'''foo([\uDC66\uDC67]|[\uDC68\uDC69])*foo''')
|
||||
|
||||
# GOOD
|
||||
good29 = re.compile(r'''foo((\uDC66|\uDC67)|(\uDC68|\uDC69))*foo''')
|
||||
|
||||
# NOT GOOD (but cannot currently construct a prefix)
|
||||
bad62 = re.compile(r'''a{2,3}(b+)+X''')
|
||||
|
||||
# NOT GOOD (and a good prefix test)
|
||||
bad63 = re.compile(r'''^<(\w+)((?:\s+\w+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>''')
|
||||
|
||||
# GOOD
|
||||
good30 = re.compile(r'''(a+)*[\s\S][\s\S][\s\S]?''')
|
||||
|
||||
# GOOD - but we fail to see that repeating the attack string ends in the "accept any" state (due to not parsing the range `[\s\S]{2,3}`).
|
||||
good31 = re.compile(r'''(a+)*[\s\S]{2,3}''')
|
||||
|
||||
# GOOD - but we spuriously conclude that a rejecting suffix exists (due to not parsing the range `[\s\S]{2,}` when constructing the NFA).
|
||||
good32 = re.compile(r'''(a+)*([\s\S]{2,}|X)$''')
|
||||
|
||||
# GOOD
|
||||
good33 = re.compile(r'''(a+)*([\s\S]*|X)$''')
|
||||
|
||||
# NOT GOOD
|
||||
bad64 = re.compile(r'''((a+)*$|[\s\S]+)''')
|
||||
|
||||
# GOOD - but still flagged. The only change compared to the above is the order of alternatives, which we don't model.
|
||||
good34 = re.compile(r'''([\s\S]+|(a+)*$)''')
|
||||
|
||||
# GOOD
|
||||
good35 = re.compile(r'''((;|^)a+)+$''')
|
||||
|
||||
# NOT GOOD (a good prefix test)
|
||||
bad65 = re.compile(r'''(^|;)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(e+)+f''')
|
||||
|
||||
# NOT GOOD
|
||||
bad66 = re.compile(r'''^ab(c+)+$''')
|
||||
|
||||
# NOT GOOD
|
||||
bad67 = re.compile(r'''(\d(\s+)*){20}''')
|
||||
|
||||
# GOOD - but we spuriously conclude that a rejecting suffix exists.
|
||||
good36 = re.compile(r'''(([^/]|X)+)(\/[\s\S]*)*$''')
|
||||
|
||||
# GOOD - but we spuriously conclude that a rejecting suffix exists.
|
||||
good37 = re.compile(r'''^((x([^Y]+)?)*(Y|$))''')
|
||||
|
||||
# NOT GOOD
|
||||
bad68 = re.compile(r'''(a*)+b''')
|
||||
|
||||
# NOT GOOD
|
||||
bad69 = re.compile(r'''foo([\w-]*)+bar''')
|
||||
|
||||
# NOT GOOD
|
||||
bad70 = re.compile(r'''((ab)*)+c''')
|
||||
|
||||
# NOT GOOD
|
||||
bad71 = re.compile(r'''(a?a?)*b''')
|
||||
|
||||
# GOOD
|
||||
good38 = re.compile(r'''(a?)*b''')
|
||||
|
||||
# NOT GOOD - but not detected
|
||||
bad72 = re.compile(r'''(c?a?)*b''')
|
||||
|
||||
# NOT GOOD
|
||||
bad73 = re.compile(r'''(?:a|a?)+b''')
|
||||
|
||||
# NOT GOOD - but not detected.
|
||||
bad74 = re.compile(r'''(a?b?)*$''')
|
||||
|
||||
# NOT GOOD
|
||||
bad76 = re.compile(r'''PRE(([a-c]|[c-d])T(e?e?e?e?|X))+(cTcT|cTXcTX$)''')
|
||||
|
||||
# NOT GOOD - but not detected
|
||||
bad77 = re.compile(r'''^((a)+\w)+$''')
|
||||
|
||||
# NOT GOOD
|
||||
bad78 = re.compile(r'''^(b+.)+$''')
|
||||
|
||||
# GOOD
|
||||
good39 = re.compile(r'''a*b''')
|
||||
|
||||
# All 4 bad combinations of nested * and +
|
||||
bad79 = re.compile(r'''(a*)*b''')
|
||||
bad80 = re.compile(r'''(a+)*b''')
|
||||
bad81 = re.compile(r'''(a*)+b''')
|
||||
bad82 = re.compile(r'''(a+)+b''')
|
||||
|
||||
# GOOD
|
||||
good40 = re.compile(r'''(a|b)+''')
|
||||
good41 = re.compile(r'''(?:[\s;,"'<>(){}|[\]@=+*]|:(?![/\\]))+''') # parses wrongly, sees column 42 as a char set start
|
||||
|
||||
# NOT GOOD
|
||||
bad83 = re.compile(r'''^((?:a{|-)|\w\{)+X$''')
|
||||
bad84 = re.compile(r'''^((?:a{0|-)|\w\{\d)+X$''')
|
||||
bad85 = re.compile(r'''^((?:a{0,|-)|\w\{\d,)+X$''')
|
||||
bad86 = re.compile(r'''^((?:a{0,2|-)|\w\{\d,\d)+X$''')
|
||||
|
||||
# GOOD:
|
||||
good42 = re.compile(r'''^((?:a{0,2}|-)|\w\{\d,\d\})+X$''')
|
||||
|
||||
# NOT GOOD
|
||||
bad87 = re.compile(r'X(\u0061|a)*Y')
|
||||
|
||||
# GOOD
|
||||
good43 = re.compile(r'X(\u0061|b)+Y')
|
||||
20
python/ql/test/library-tests/regexparser/unittests.py
Normal file
20
python/ql/test/library-tests/regexparser/unittests.py
Normal file
@@ -0,0 +1,20 @@
|
||||
import re
|
||||
|
||||
# Treatment of escapes
|
||||
re.compile(r"X([^\.]|\.)*$") # No ReDoS.
|
||||
re.compile(r"X(Æ|\Æ)+$") # Has ReDoS.
|
||||
|
||||
# Treatment of line breaks
|
||||
re.compile(r'(?:.|\n)*b') # No ReDoS.
|
||||
re.compile(r'(?:.|\n)*b', re.DOTALL) # Has ReDoS.
|
||||
|
||||
# minimal example constructed by @erik-krogh
|
||||
baz = re.compile(r'\+0')
|
||||
|
||||
# excerpts from LGTM.com
|
||||
re.compile(r'\+0x')
|
||||
re.compile(r'\+0x.*')
|
||||
re.compile(r'+\-0+\.')
|
||||
re.compile('\s+\+0x[0-9]+')
|
||||
re.compile(r'\+0000 .*')
|
||||
re.compile('\#[0-9]+ 0x[0-9]')
|
||||
Reference in New Issue
Block a user