enable unicode support in the Python ReDoS query

This commit is contained in:
Erik Krogh Kristensen
2021-07-11 21:28:40 +02:00
parent 1d56748eed
commit 440e4b9a92
3 changed files with 42 additions and 37 deletions

View File

@@ -473,46 +473,44 @@ class RegExpEscape extends RegExpNormalChar {
* E.g. for `\u0061` this returns "a".
*/
private string getUnicode() {
// TODO: Enable this once a supporting CLI is released.
// exists(int codepoint | codepoint = sum(getHexValueFromUnicode(_)) |
// result = codepoint.toUnicode()
// )
none()
exists(int codepoint | codepoint = sum(getHexValueFromUnicode(_)) |
result = codepoint.toUnicode()
)
}
/**
 * Gets the positional value contributed by the hex digit at position `index`
 * of this unicode escape, i.e. the digit's value multiplied by the power of 16
 * given by its distance from the end of the hex number.
 * E.g. for `\u0061` and `index = 2` this returns 96 (the digit `6` sits one
 * place from the end, so it contributes `6 * 16`).
 */
private int getHexValueFromUnicode(int index) {
  this.isUnicode() and
  exists(string digits |
    // Drop the two leading characters of the escape text, keeping only the hex digits.
    digits = this.getText().suffix(2) and
    result = 16.pow(digits.length() - index - 1) * toHex(digits.charAt(index))
  )
}
// TODO: Enable this once a supporting CLI is released.
// /**
// * Gets int value for the `index`th char in the hex number of the unicode escape.
// * E.g. for `\u0061` and `index = 2` this returns 96 (the number `6` interpreted as hex).
// */
// private int getHexValueFromUnicode(int index) {
// isUnicode() and
// exists(string hex, string char | hex = getText().suffix(2) |
// char = hex.charAt(index) and
// result = 16.pow(hex.length() - index - 1) * toHex(char)
// )
// }
}
// TODO: Enable this once a supporting CLI is released.
// /**
// * Gets the hex number for the `hex` char.
// */
// private int toHex(string hex) {
// hex = [0 .. 9].toString() and
// result = hex.toInt()
// or
// result = 10 and hex = ["a", "A"]
// or
// result = 11 and hex = ["b", "B"]
// or
// result = 12 and hex = ["c", "C"]
// or
// result = 13 and hex = ["d", "D"]
// or
// result = 14 and hex = ["e", "E"]
// or
// result = 15 and hex = ["f", "F"]
// }
/**
 * Gets the numeric value (0 to 15) of the single hexadecimal digit `hex`.
 * The letters `a` through `f` are accepted in both lower and upper case;
 * there is no result for any other string.
 */
private int toHex(string hex) {
  result = [0 .. 15] and
  // The position of a digit within either alphabet is its numeric value.
  hex = ["0123456789abcdef", "0123456789ABCDEF"].charAt(result)
}
/**
* A character class escape in a regular expression.
* That is, an escaped character that denotes multiple characters.

View File

@@ -92,5 +92,6 @@
| redos.py:363:25:363:43 | ((?:a{0\|-)\|\\w\\{\\d)+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a{0'. |
| redos.py:364:25:364:45 | ((?:a{0,\|-)\|\\w\\{\\d,)+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a{0,'. |
| redos.py:365:25:365:48 | ((?:a{0,2\|-)\|\\w\\{\\d,\\d)+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a{0,2'. |
| redos.py:371:25:371:35 | (\\u0061\|a)* | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of 'a'. |
| unittests.py:5:17:5:23 | (\u00c6\|\\\u00c6)+ | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of '\u00c6'. |
| unittests.py:9:16:9:24 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |

View File

@@ -366,3 +366,9 @@ bad86 = re.compile(r'''^((?:a{0,2|-)|\w\{\d,\d)+X$''')
# GOOD:
good42 = re.compile(r'''^((?:a{0,2}|-)|\w\{\d,\d\})+X$''')
# NOT GOOD
bad87 = re.compile(r'X(\u0061|a)*Y')
# GOOD
good43 = re.compile(r'X(\u0061|b)+Y')