# codeql/python/ql/test/library-tests/regexparser/redos.py

# This is currently a copy of the redos test-file, since that one contains many regexes.

import re

# NOT GOOD ("__" is matched by `__` once or by `[\s\S]` twice); attack: "_" + "__".repeat(100)
# Adapted from marked (https://github.com/markedjs/marked), which is licensed
# under the MIT license; see file marked-LICENSE.
bad1 = re.compile(r'''^\b_((?:__|[\s\S])+?)_\b|^\*((?:\*\*|[\s\S])+?)\*(?!\*)''')

# GOOD - `[^_]` cannot overlap with `__`, and `[^*]` cannot overlap with `\*\*`
# Adapted from marked (https://github.com/markedjs/marked), which is licensed
# under the MIT license; see file marked-LICENSE.
good1 = re.compile(r'^\b_((?:__|[^_])+?)_\b|^\*((?:\*\*|[^*])+?)\*(?!\*)')

# GOOD - there is no witness in the end that could cause the regexp to not match
# Adapted from brace-expansion (https://github.com/juliangruber/brace-expansion),
# which is licensed under the MIT license; see file brace-expansion-LICENSE.
good2 = re.compile(r'(.*,)+.+')

# NOT GOOD; attack: " '" + "\\\\".repeat(100)
# Adapted from CodeMirror (https://github.com/codemirror/codemirror),
# which is licensed under the MIT license; see file CodeMirror-LICENSE.
bad2 = re.compile(r'''^(?:\s+(?:"(?:[^"\\]|\\\\|\\.)+"|'(?:[^'\\]|\\\\|\\.)+'|\((?:[^)\\]|\\\\|\\.)+\)))?''')

# GOOD (NOTE: this rebinds the name `good2`, which is already assigned above)
# Adapted from lulucms2 (https://github.com/yiifans/lulucms2).
good2 = re.compile(r'''\(\*(?:[\s\S]*?\(\*[\s\S]*?\*\))*[\s\S]*?\*\)''')

# GOOD
# Adapted from jest (https://github.com/facebook/jest), which is licensed
# under the MIT license; see file jest-LICENSE.
good3 = re.compile(r'''^ *(\S.*\|.*)\n *([-:]+ *\|[-| :]*)\n((?:.*\|.*(?:\n|$))*)\n*''')

# NOT GOOD, variant of good3 (the trailing `\n*` is replaced by `a`); attack: "a|\n:|\n" + "||\n".repeat(100)
bad4 = re.compile(r'''^ *(\S.*\|.*)\n *([-:]+ *\|[-| :]*)\n((?:.*\|.*(?:\n|$))*)a''')

# NOT GOOD; attack: "/" + "\\/a".repeat(100)
# Adapted from ANodeBlog (https://github.com/gefangshuai/ANodeBlog),
# which is licensed under the Apache License 2.0; see file ANodeBlog-LICENSE.
bad5 = re.compile(r'''\/(?![ *])(\\\/|.)*?\/[gim]*(?=\W|$)''')

# NOT GOOD; attack: "##".repeat(100) + "\na"
# Adapted from CodeMirror (https://github.com/codemirror/codemirror),
# which is licensed under the MIT license; see file CodeMirror-LICENSE.
bad6 = re.compile(r'''^([\s\[\{\(]|#.*)*$''')

# GOOD
good4 = re.compile(r'''(\r\n|\r|\n)+''')

# BAD - PoC: `node -e "/((?:[^\"\']|\".*?\"|\'.*?\')*?)([(,)]|$)/.test(\"'''''''''''''''''''''''''''''''''''''''''''''\\\"\");"`. It is complicated, though: the regexp still matches something, namely the empty string after the attack string.
actuallyBad = re.compile(r'''((?:[^"']|".*?"|'.*?')*?)([(,)]|$)''')

# NOT GOOD; attack: "a" + "[]".repeat(100) + ".b\n" (NOTE: this rebinds the name `bad6`, which is already assigned above)
# Adapted from Knockout (https://github.com/knockout/knockout), which is
# licensed under the MIT license; see file knockout-LICENSE.
bad6 = re.compile(r'''^[\_$a-z][\_$a-z0-9]*(\[.*?\])*(\.[\_$a-z][\_$a-z0-9]*(\[.*?\])*)*$''')

# GOOD
good6 = re.compile(r'''(a|.)*''')

# Testing the NFA - only some of the below are detected.
bad7 = re.compile(r'''^([a-z]+)+$''')
bad8 = re.compile(r'''^([a-z]*)*$''')
bad9 = re.compile(r'''^([a-zA-Z0-9])(([\\-.]|[_]+)?([a-zA-Z0-9]+))*(@){1}[a-z0-9]+[.]{1}(([a-z]{2,3})|([a-z]{2,3}[.]{1}[a-z]{2,3}))$''')
bad10 = re.compile(r'''^(([a-z])+.)+[A-Z]([a-z])+$''')

# NOT GOOD; attack: "[" + "][".repeat(100) + "]!"
# Adapted from Prototype.js (https://github.com/prototypejs/prototype), which
# is licensed under the MIT license; see file Prototype.js-LICENSE.
bad11 = re.compile(r'''(([\w#:.~>+()\s-]+|\*|\[.*?\])+)\s*(,|$)''')

# NOT GOOD; attack: "'" + "\\a".repeat(100) + '"'
# Adapted from Prism (https://github.com/PrismJS/prism), which is licensed
# under the MIT license; see file Prism-LICENSE.
bad12 = re.compile(r'''("|')(\\?.)*?\1''')

# NOT GOOD - "b" is matched by both `b` and `a?b`
bad13 = re.compile(r'''(b|a?b)*c''')

# NOT GOOD - "a" is matched by both `a` and `aa?`
bad15 = re.compile(r'''(a|aa?)*b''')

# GOOD - no DOTALL flag is passed, so `.` does not match "\n" and the two alternatives are disjoint
good7 = re.compile(r'''(.|\n)*!''')

# NOT GOOD; attack: "\n".repeat(100) + "." (re.DOTALL makes `.` match "\n" too, so both alternatives accept a newline; without the flag this is just good7)
bad16 = re.compile(r'''(.|\n)*!''', re.DOTALL)

# GOOD
good8 = re.compile(r'''([\w.]+)*''')

# NOT GOOD - same pattern as bad15: "a" is matched by both `a` and `aa?`
bad17 = re.compile(r'''(a|aa?)*b''')

# GOOD - not used as regexp
good9 = '(a|aa?)*b'

# NOT GOOD - every character other than "a" is matched by both `[\s\S]` and `[^a]`
bad18 = re.compile(r'''(([\s\S]|[^a])*)"''')

# GOOD - there is no witness in the end that could cause the regexp to not match
good10 = re.compile(r'''([^"']+)*''')

# NOT GOOD - e.g. "b" is matched by both `.` and `[^a]`
bad20 = re.compile(r'''((.|[^a])*)"''')

# GOOD - `a` and `[^a]` are disjoint (NOTE: this rebinds the name `good10`, which is already assigned above)
good10 = re.compile(r'''((a|[^a])*)"''')

# NOT GOOD - "b" is matched by both `b` and `[^a]`
bad21 = re.compile(r'''((b|[^a])*)"''')

# NOT GOOD - "G" is matched by both `G` and `[^a]`
bad22 = re.compile(r'''((G|[^a])*)"''')

# NOT GOOD - every digit is matched by both `[0-9]` and `[^a]`
bad23 = re.compile(r'''(([0-9]|[^a])*)"''')

# NOT GOOD - the negated class accepts the backslash, so it overlaps with the `\\[\x00-\x7f]` escape alternative
bad24 = re.compile(r'''(?:=(?:([!#\$%&'\*\+\-\.\^_`\|~0-9A-Za-z]+)|"((?:\\[\x00-\x7f]|[^\x00-\x08\x0a-\x1f\x7f"])*)"))?''')

# NOT GOOD - the negated class accepts the backslash, so it overlaps with the `\\[\x00-\x7f]` escape alternative
bad25 = re.compile(r'''"((?:\\[\x00-\x7f]|[^\x00-\x08\x0a-\x1f\x7f"])*)"''')

# GOOD (despite the `bad26` name) - the negated class also excludes the backslash, so the alternatives are disjoint
bad26 = re.compile(r'''"((?:\\[\x00-\x7f]|[^\x00-\x08\x0a-\x1f\x7f"\\])*)"''')

# NOT GOOD - "d" through "h" are matched by both `[a-z]` and `[d-h]`
bad27 = re.compile(r'''(([a-z]|[d-h])*)"''')

# NOT GOOD - e.g. "A" is matched by both `[^a-z]` and `[^0-9]` (NOTE: this rebinds the name `bad27`, which is already assigned above)
bad27 = re.compile(r'''(([^a-z]|[^0-9])*)"''')

# NOT GOOD - every ASCII digit is matched by both `\d` and `[0-9]`
bad28 = re.compile(r'''((\d|[0-9])*)"''')

# NOT GOOD - the two alternatives are identical
bad29 = re.compile(r'''((\s|\s)*)"''')

# NOT GOOD - "G" is matched by both `\w` and `G`
bad30 = re.compile(r'''((\w|G)*)"''')

# GOOD - `\s` and `\d` are disjoint
good11 = re.compile(r'''((\s|\d)*)"''')

# NOT GOOD - every digit is matched by both `\d` and `\w`
bad31 = re.compile(r'''((\d|\w)*)"''')

# NOT GOOD - "5" is matched by both `\d` and `5`
bad32 = re.compile(r'''((\d|5)*)"''')

# NOT GOOD - the form feed is matched by both `\s` and `[\f]`
bad33 = re.compile(r'''((\s|[\f])*)"''')

# NOT GOOD - the vertical tab is matched by both `\s` and `[\v]`
bad34 = re.compile(r'''((\s|[\v]|\\v)*)"''')

# NOT GOOD - the form feed is matched by both `\f` and `[\f]`
bad35 = re.compile(r'''((\f|[\f])*)"''')

# NOT GOOD - e.g. " " is matched by both `\W` and `\D`
bad36 = re.compile(r'''((\W|\D)*)"''')

# NOT GOOD - every word character is matched by both `\S` and `\w`
bad37 = re.compile(r'''((\S|\w)*)"''')

# NOT GOOD - every word character is matched by both `\S` and `[\w]`
bad38 = re.compile(r'''((\S|[\w])*)"''')

# NOT GOOD - "1s" is matched by `1s` once or by `[\da-z]` twice
bad39 = re.compile(r'''((1s|[\da-z])*)"''')

# NOT GOOD - "0" is matched by both `0` and `[\d]`
bad40 = re.compile(r'''((0|[\d])*)"''')

# NOT GOOD - nested quantifiers: `+` inside `*` over the same character class
bad41 = re.compile(r'''(([\d]+)*)"''')

# GOOD - there is no witness in the end that could cause the regexp to not match
good12 = re.compile(r'''(\d+(X\d+)?)+''')

# GOOD - there is no witness in the end that could cause the regexp to not match
good13 = re.compile(r'''([0-9]+(X[0-9]*)?)*''')

# GOOD
good15 = re.compile(r'''^([^>]+)*(>|$)''')

# NOT GOOD - unlike good15, "a" is excluded from the class and can therefore act as a rejecting suffix
bad43 = re.compile(r'''^([^>a]+)*(>|$)''')

# NOT GOOD - "\n" is matched by both `\n` and `\s*`
bad44 = re.compile(r'''(\n\s*)+$''')

# NOT GOOD
bad45 = re.compile(r'''^(?:\s+|#.*|\(\?#[^)]*\))*(?:[?*+]|{\d+(?:,\d*)?})''')

# NOT GOOD
bad46 = re.compile(r'''\{\[\s*([a-zA-Z]+)\(([a-zA-Z]+)\)((\s*([a-zA-Z]+)\: ?([ a-zA-Z{}]+),?)+)*\s*\]\}''')

# NOT GOOD - nested quantifiers: each `x+` alternative sits inside `*`
bad47 = re.compile(r'''(a+|b+|c+)*c''')

# NOT GOOD
bad48 = re.compile(r'''(((a+a?)*)+b+)''')

# NOT GOOD
bad49 = re.compile(r'''(a+)+bbbb''')

# GOOD
good16 = re.compile(r'''(a+)+aaaaa*a+''')

# NOT GOOD - the `$` anchor makes a rejecting suffix possible
bad50 = re.compile(r'''(a+)+aaaaa$''')

# GOOD
good17 = re.compile(r'''(\n+)+\n\n''')

# NOT GOOD - same as good17 plus a `$` anchor, which makes a rejecting suffix possible
bad51 = re.compile(r'''(\n+)+\n\n$''')

# NOT GOOD
bad52 = re.compile(r'''([^X]+)*$''')

# NOT GOOD
bad53 = re.compile(r'''(([^X]b)+)*$''')

# GOOD
good18 = re.compile(r'''(([^X]b)+)*($|[^X]b)''')

# NOT GOOD
bad54 = re.compile(r'''(([^X]b)+)*($|[^X]c)''')

# GOOD
good20 = re.compile(r'''((ab)+)*ababab''')

# GOOD
good21 = re.compile(r'''((ab)+)*abab(ab)*(ab)+''')

# GOOD
good22 = re.compile(r'''((ab)+)*''')

# NOT GOOD - same as good22 plus a `$` anchor
bad55 = re.compile(r'''((ab)+)*$''')

# GOOD
good23 = re.compile(r'''((ab)+)*[a1][b1][a2][b2][a3][b3]''')

# NOT GOOD
bad56 = re.compile(r'''([\n\s]+)*(.)''')

# GOOD - any witness passes through the accept state.
good24 = re.compile(r'''(A*A*X)*''')

# GOOD
good26 = re.compile(r'''([^\\\]]+)*''')

# NOT GOOD
bad59 = re.compile(r'''(\w*foobarbaz\w*foobarbaz\w*foobarbaz\w*foobarbaz\s*foobarbaz\d*foobarbaz\w*)+-''')

# NOT GOOD - the alternatives differ only in their first element (`.` vs `\s`), and a whitespace character other than "\n" is matched by both
bad60 = re.compile(r'''(.thisisagoddamnlongstringforstresstestingthequery|\sthisisagoddamnlongstringforstresstestingthequery)*-''')

# NOT GOOD - the first alternative is also matched by `this\w+query`
bad61 = re.compile(r'''(thisisagoddamnlongstringforstresstestingthequery|this\w+query)*-''')

# GOOD
good27 = re.compile(r'''(thisisagoddamnlongstringforstresstestingthequery|imanotherbutunrelatedstringcomparedtotheotherstring)*-''')

# GOOD (but false positive caused by the extractor converting all four unpaired surrogates to \uFFFD)
good28 = re.compile('''foo([\uDC66\uDC67]|[\uDC68\uDC69])*foo''')

# GOOD (but false positive caused by the extractor converting all four unpaired surrogates to \uFFFD)
good29 = re.compile('''foo((\uDC66|\uDC67)|(\uDC68|\uDC69))*foo''')

# NOT GOOD (but cannot currently construct a prefix)
bad62 = re.compile(r'''a{2,3}(b+)+X''')

# NOT GOOD (and a good prefix test)
bad63 = re.compile(r'''^<(\w+)((?:\s+\w+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>''')

# GOOD
good30 = re.compile(r'''(a+)*[\s\S][\s\S][\s\S]?''')

# GOOD - but we fail to see that repeating the attack string ends in the "accept any" state (due to not parsing the range `[\s\S]{2,3}`).
good31 = re.compile(r'''(a+)*[\s\S]{2,3}''')

# GOOD - but we spuriously conclude that a rejecting suffix exists (due to not parsing the range `[\s\S]{2,}` when constructing the NFA).
good32 = re.compile(r'''(a+)*([\s\S]{2,}|X)$''')

# GOOD
good33 = re.compile(r'''(a+)*([\s\S]*|X)$''')

# NOT GOOD
bad64 = re.compile(r'''((a+)*$|[\s\S]+)''')

# GOOD - but still flagged. The only change compared to the above is the order of alternatives, which we don't model.
good34 = re.compile(r'''([\s\S]+|(a+)*$)''')

# GOOD
good35 = re.compile(r'''((;|^)a+)+$''')

# NOT GOOD (a good prefix test)
bad65 = re.compile(r'''(^|;)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(e+)+f''')

# NOT GOOD - nested quantifiers `(c+)+` followed by the `$` anchor
bad66 = re.compile(r'''^ab(c+)+$''')

# NOT GOOD - nested quantifiers `(\s+)*` inside the bounded repetition
bad67 = re.compile(r'''(\d(\s+)*){20}''')

# GOOD - but we spuriously conclude that a rejecting suffix exists.
good36 = re.compile(r'''(([^/]|X)+)(\/[\s\S]*)*$''')

# GOOD - but we spuriously conclude that a rejecting suffix exists.
good37 = re.compile(r'''^((x([^Y]+)?)*(Y|$))''')
# NOT GOOD
bad68 = re.compile(r'''(a*)+b''')

# NOT GOOD
bad69 = re.compile(r'''foo([\w-]*)+bar''')

# NOT GOOD
bad70 = re.compile(r'''((ab)*)+c''')

# NOT GOOD
bad71 = re.compile(r'''(a?a?)*b''')

# GOOD
good38 = re.compile(r'''(a?)*b''')

# NOT GOOD - but not detected
bad72 = re.compile(r'''(c?a?)*b''')

# NOT GOOD
bad73 = re.compile(r'''(?:a|a?)+b''')

# NOT GOOD - but not detected.
bad74 = re.compile(r'''(a?b?)*$''')

# NOT GOOD
bad76 = re.compile(r'''PRE(([a-c]|[c-d])T(e?e?e?e?|X))+(cTcT|cTXcTX$)''')

# NOT GOOD - but not detected
bad77 = re.compile(r'''^((a)+\w)+$''')

# NOT GOOD
bad78 = re.compile(r'''^(b+.)+$''')

# GOOD
good39 = re.compile(r'''a*b''')

# All 4 bad combinations of nested * and +
bad79 = re.compile(r'''(a*)*b''')
bad80 = re.compile(r'''(a+)*b''')
bad81 = re.compile(r'''(a*)+b''')
bad82 = re.compile(r'''(a+)+b''')

# GOOD
good40 = re.compile(r'''(a|b)+''')
good41 = re.compile(r'''(?:[\s;,"'<>(){}|[\]@=+*]|:(?![/\\]))+''') # parses wrongly, sees column 42 as a char set start

# NOT GOOD
bad83 = re.compile(r'''^((?:a{|-)|\w\{)+X$''')
bad84 = re.compile(r'''^((?:a{0|-)|\w\{\d)+X$''')
bad85 = re.compile(r'''^((?:a{0,|-)|\w\{\d,)+X$''')
bad86 = re.compile(r'''^((?:a{0,2|-)|\w\{\d,\d)+X$''')

# GOOD:
good42 = re.compile(r'''^((?:a{0,2}|-)|\w\{\d,\d\})+X$''')

# NOT GOOD
bad87 = re.compile(r'X(\u0061|a)*Y')

# GOOD
good43 = re.compile(r'X(\u0061|b)+Y')