Merge pull request #14481 from erik-krogh/proper-codepoints

ReDoS: use the new codePointAt and codePointCount methods instead of regex hacks
This commit is contained in:
Erik Krogh Kristensen
2023-10-13 09:35:55 +02:00
committed by GitHub
3 changed files with 34 additions and 9 deletions

View File

@@ -1993,6 +1993,10 @@ The following built-in predicates are members of type ``int``:
+-------------------------+-------------+----------------+----------------------------------------------------------------------------------------------------------------+
| ``toUnicode`` | string | | The result is the Unicode character for the receiver seen as a Unicode code point. |
+-------------------------+-------------+----------------+----------------------------------------------------------------------------------------------------------------+
| ``codePointAt`` | int | int | The result is the Unicode code point of the receiver at the index given by the argument. |
+-------------------------+-------------+----------------+----------------------------------------------------------------------------------------------------------------+
| ``codePointCount`` | int | int, int | The result is the number of Unicode code points in the receiver between the given indices. |
+-------------------------+-------------+----------------+----------------------------------------------------------------------------------------------------------------+
The leftmost bit after ``bitShiftRightSigned`` depends on sign extension, whereas after ``bitShiftRight`` it is zero.

View File

@@ -164,17 +164,17 @@ module Make<RegexTreeViewSig TreeImpl> {
/** An input symbol corresponding to character `c`. */
Char(string c) {
c =
getCodepointAt(any(RegexpCharacterConstant cc |
getACodepoint(any(RegexpCharacterConstant cc |
cc instanceof RelevantRegExpTerm and
not isIgnoreCase(cc.getRootTerm())
).getValue(), _)
).getValue())
or
// normalize everything to lower case if the regexp is case insensitive
c =
any(RegexpCharacterConstant cc, string char |
cc instanceof RelevantRegExpTerm and
isIgnoreCase(cc.getRootTerm()) and
char = getCodepointAt(cc.getValue(), _)
char = getACodepoint(cc.getValue())
|
char.toLowerCase()
)
@@ -370,7 +370,7 @@ module Make<RegexTreeViewSig TreeImpl> {
string getARelevantChar() {
// any `result` for which `asciiPrintable` yields a value
exists(asciiPrintable(result))
or
// any codepoint taken from the value of some `RegexpCharacterConstant`
exists(RegexpCharacterConstant c | result = getCodepointAt(c.getValue(), _))
exists(RegexpCharacterConstant c | result = getACodepoint(c.getValue()))
or
// any `result` that `classEscapeMatches` relates to some first argument
classEscapeMatches(_, result)
}
@@ -1258,7 +1258,7 @@ module Make<RegexTreeViewSig TreeImpl> {
* Gets a `char` that occurs in a `pump` string.
*/
private string getAProcessChar() {
// `s` is any string bound by the second argument of `isReDoSCandidate`;
// the result is any codepoint of such a string, at an arbitrary position.
result = getCodepointAt(any(string s | isReDoSCandidate(_, s)), _)
result = getACodepoint(any(string s | isReDoSCandidate(_, s)))
}
}

View File

@@ -18,7 +18,11 @@ string escape(string s) {
bindingset[s]
private string escapeUnicodeString(string s) {
// Pass every character of `s` through `escapeUnicodeChar` and concatenate
// the escaped pieces in order of their index `i`.
result =
concat(int i, string char | char = escapeUnicodeChar(getCodepointAt(s, i)) | char order by i)
concat(int i, string char |
char = escapeUnicodeChar(s.codePointAt(i).toUnicode())
|
char order by i
)
}
/**
@@ -44,15 +48,32 @@ private predicate isPrintable(string char) {
/**
* Gets the `i`th codepoint in `s` (0-indexed).
* Unpaired surrogates are skipped.
*/
bindingset[s]
string getCodepointAt(string s, int i) { result = s.regexpFind("(.|\\s)", i, _) }
string getCodepointAt(string s, int i) {
// `codePointAt` yields the codepoint as an integer, so `toUnicode` is used to turn it into a string.
// `codePointAt` yields an integer for both the high and the low end. `toUnicode` filters out the
// invalid ones, which leaves gaps in `charIndex`; `rank` renumbers the remaining characters.
// `rank` is 1-indexed, so `i + 1` is used to keep this predicate 0-indexed.
result =
rank[i + 1](string char, int charIndex |
char = s.codePointAt(charIndex).toUnicode()
|
char order by charIndex
)
}
/**
* Gets the length of `s` in codepoints.
* Gets any Unicode character that appears in `s`, that is,
* `s.codePointAt(i).toUnicode()` for any index `i`.
*/
bindingset[s]
string getACodepoint(string s) { result = s.codePointAt(_).toUnicode() }
/**
* Gets the number of Unicode codepoints in `str` not counting unpaired surrogates.
* NOTE(review): confirm that `codePointCount` also leaves out unpaired surrogates —
* TODO verify against the documentation of the built-in.
*/
bindingset[str]
int getCodepointLength(string str) { result = str.regexpReplaceAll("(.|\\s)", "x").length() }
int getCodepointLength(string str) { result = str.codePointCount(0, str.length()) }
/**
* Gets the ASCII code for `char`.