mirror of
https://github.com/github/codeql.git
synced 2026-04-28 10:15:14 +02:00
Merge pull request #13914 from erik-krogh/escape-unicode
ReDoS: escape unicode chars in the output for the ReDoS queries
This commit is contained in:
@@ -3,6 +3,7 @@
|
||||
*/
|
||||
|
||||
private import codeql.regex.RegexTreeView
|
||||
private import codeql.util.Numbers
|
||||
|
||||
/**
|
||||
* Classes and predicates that create an NFA and various algorithms for working with it.
|
||||
@@ -17,6 +18,20 @@ module Make<RegexTreeViewSig TreeImpl> {
|
||||
exists(int code | code = ascii(c) | code + 1 = ascii(result))
|
||||
}
|
||||
|
||||
/**
 * Gets the `i`th codepoint in `s`.
 *
 * Implemented as the `i`th match of a single-character regular expression,
 * so `i` counts regex matches, not `charAt` positions.
 */
bindingset[s]
private string getCodepointAt(string s, int i) {
  // `(?s)` (DOTALL) makes `.` match line terminators too. The previous pattern `(.|\\s)`
  // skipped U+0085, U+2028 and U+2029, which are matched by neither `.` nor `\s`.
  result = s.regexpFind("(?s).", i, _)
}

/**
 * Gets the length of `s` in codepoints.
 *
 * Uses the same pattern as `getCodepointAt`, so both predicates agree on what
 * counts as one codepoint.
 */
bindingset[str]
private int getCodepointLength(string str) {
  // replace every match with a single `x` and measure the resulting string
  result = str.regexpReplaceAll("(?s).", "x").length()
}
|
||||
|
||||
/**
|
||||
* Gets an approximation for the ASCII code for `char`.
|
||||
* Only the easily printable chars are included (so no newline, tab, null, etc).
|
||||
@@ -189,17 +204,17 @@ module Make<RegexTreeViewSig TreeImpl> {
|
||||
/** An input symbol corresponding to character `c`. */
|
||||
Char(string c) {
|
||||
c =
|
||||
any(RegexpCharacterConstant cc |
|
||||
cc instanceof RelevantRegExpTerm and
|
||||
not isIgnoreCase(cc.getRootTerm())
|
||||
).getValue().charAt(_)
|
||||
getCodepointAt(any(RegexpCharacterConstant cc |
|
||||
cc instanceof RelevantRegExpTerm and
|
||||
not isIgnoreCase(cc.getRootTerm())
|
||||
).getValue(), _)
|
||||
or
|
||||
// normalize everything to lower case if the regexp is case insensitive
|
||||
c =
|
||||
any(RegexpCharacterConstant cc, string char |
|
||||
cc instanceof RelevantRegExpTerm and
|
||||
isIgnoreCase(cc.getRootTerm()) and
|
||||
char = cc.getValue().charAt(_)
|
||||
char = getCodepointAt(cc.getValue(), _)
|
||||
|
|
||||
char.toLowerCase()
|
||||
)
|
||||
@@ -395,7 +410,7 @@ module Make<RegexTreeViewSig TreeImpl> {
|
||||
string getARelevantChar() {
|
||||
exists(ascii(result))
|
||||
or
|
||||
exists(RegexpCharacterConstant c | result = c.getValue().charAt(_))
|
||||
exists(RegexpCharacterConstant c | result = getCodepointAt(c.getValue(), _))
|
||||
or
|
||||
classEscapeMatches(_, result)
|
||||
}
|
||||
@@ -693,6 +708,12 @@ module Make<RegexTreeViewSig TreeImpl> {
|
||||
)
|
||||
}
|
||||
|
||||
/**
 * Gets the length in codepoints of `s`, where `s` is the value of some `RegexpCharacterConstant`.
 */
pragma[noinline]
private int getCodepointLengthForState(string s) {
  exists(RegexpCharacterConstant constant | s = constant.getValue()) and
  result = getCodepointLength(s)
}
|
||||
|
||||
/**
|
||||
* Holds if the NFA has a transition from `q1` to `q2` labelled with `lbl`.
|
||||
*/
|
||||
@@ -701,16 +722,16 @@ module Make<RegexTreeViewSig TreeImpl> {
|
||||
q1 = Match(s, i) and
|
||||
(
|
||||
not isIgnoreCase(s.getRootTerm()) and
|
||||
lbl = Char(s.getValue().charAt(i))
|
||||
lbl = Char(getCodepointAt(s.getValue(), i))
|
||||
or
|
||||
// normalize everything to lower case if the regexp is case insensitive
|
||||
isIgnoreCase(s.getRootTerm()) and
|
||||
exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase()))
|
||||
exists(string c | c = getCodepointAt(s.getValue(), i) | lbl = Char(c.toLowerCase()))
|
||||
) and
|
||||
(
|
||||
q2 = Match(s, i + 1)
|
||||
or
|
||||
s.getValue().length() = i + 1 and
|
||||
getCodepointLengthForState(s.getValue()) = i + 1 and
|
||||
q2 = after(s)
|
||||
)
|
||||
)
|
||||
@@ -811,7 +832,7 @@ module Make<RegexTreeViewSig TreeImpl> {
|
||||
Match(RelevantRegExpTerm t, int i) {
|
||||
i = 0
|
||||
or
|
||||
exists(t.(RegexpCharacterConstant).getValue().charAt(i))
|
||||
exists(getCodepointAt(t.(RegexpCharacterConstant).getValue(), i))
|
||||
} or
|
||||
/**
|
||||
* An accept state, where exactly the given input string is accepted.
|
||||
@@ -1104,7 +1125,9 @@ module Make<RegexTreeViewSig TreeImpl> {
|
||||
*/
|
||||
predicate reachesOnlyRejectableSuffixes(State fork, string w) {
|
||||
isReDoSCandidate(fork, w) and
|
||||
forex(State next | next = process(fork, w, w.length() - 1) | isLikelyRejectable(next)) and
|
||||
forex(State next | next = process(fork, w, getCodepointLengthForCandidate(w) - 1) |
|
||||
isLikelyRejectable(next)
|
||||
) and
|
||||
not getProcessPrevious(fork, _, w) = acceptsAnySuffix() // we stop `process(..)` early if we can, check here if it happened.
|
||||
}
|
||||
|
||||
@@ -1214,6 +1237,13 @@ module Make<RegexTreeViewSig TreeImpl> {
|
||||
exists(string char | char = ["|", "\n", "Z"] | not deltaClosedChar(s, char, _))
|
||||
}
|
||||
|
||||
// `process` can't use pragma[inline] predicates, so it needs a materialized version of `getCodepointAt`.
// Only strings `str` for which `isReDoSCandidate(_, str)` holds are covered.
pragma[noinline]
private string getCodePointAtForProcess(string str, int i) {
  isReDoSCandidate(_, str) and
  result = getCodepointAt(str, i)
}
|
||||
|
||||
/**
|
||||
* Gets a state that can be reached from pumpable `fork` consuming all
|
||||
* chars in `w` any number of times followed by the first `i+1` characters of `w`.
|
||||
@@ -1223,7 +1253,7 @@ module Make<RegexTreeViewSig TreeImpl> {
|
||||
exists(State prev | prev = getProcessPrevious(fork, i, w) |
|
||||
not prev = acceptsAnySuffix() and // we stop `process(..)` early if we can. If the successor accepts any suffix, then we know it can never be rejected.
|
||||
exists(string char, InputSymbol sym |
|
||||
char = w.charAt(i) and
|
||||
char = getCodePointAtForProcess(w, i) and
|
||||
deltaClosed(prev, sym, result) and
|
||||
// noopt to prevent joining `prev` with all possible `chars` that could transition away from `prev`.
|
||||
// Instead only join with the set of `chars` where a relevant `InputSymbol` has already been found.
|
||||
@@ -1232,6 +1262,12 @@ module Make<RegexTreeViewSig TreeImpl> {
|
||||
)
|
||||
}
|
||||
|
||||
/**
 * Gets the length in codepoints of `s`, where `s` is a string for which
 * `isReDoSCandidate(_, s)` holds.
 */
pragma[noinline]
private int getCodepointLengthForCandidate(string s) {
  isReDoSCandidate(_, s) and
  result = getCodepointLength(s)
}
|
||||
|
||||
/**
|
||||
* Gets a state that can be reached from pumpable `fork` consuming all
|
||||
* chars in `w` any number of times followed by the first `i` characters of `w`.
|
||||
@@ -1245,7 +1281,7 @@ module Make<RegexTreeViewSig TreeImpl> {
|
||||
or
|
||||
// repeat until fixpoint
|
||||
i = 0 and
|
||||
result = process(fork, w, w.length() - 1)
|
||||
result = process(fork, w, getCodepointLengthForCandidate(w) - 1)
|
||||
)
|
||||
}
|
||||
|
||||
@@ -1261,7 +1297,9 @@ module Make<RegexTreeViewSig TreeImpl> {
|
||||
/**
|
||||
* Gets a `char` that occurs in a `pump` string.
|
||||
*/
|
||||
private string getAProcessChar() { result = any(string s | isReDoSCandidate(_, s)).charAt(_) }
|
||||
private string getAProcessChar() {
  // any codepoint of any string `pump` for which `isReDoSCandidate(_, pump)` holds
  exists(string pump | isReDoSCandidate(_, pump) | result = getCodepointAt(pump, _))
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1305,10 +1343,40 @@ module Make<RegexTreeViewSig TreeImpl> {
|
||||
bindingset[s]
|
||||
private string escape(string s) {
|
||||
result =
|
||||
s.replaceAll("\\", "\\\\")
|
||||
.replaceAll("\n", "\\n")
|
||||
.replaceAll("\r", "\\r")
|
||||
.replaceAll("\t", "\\t")
|
||||
escapeUnicodeString(s.replaceAll("\\", "\\\\")
|
||||
.replaceAll("\n", "\\n")
|
||||
.replaceAll("\r", "\\r")
|
||||
.replaceAll("\t", "\\t"))
|
||||
}
|
||||
|
||||
/**
 * Gets the string obtained by passing every codepoint of `s` through `escapeUnicodeChar`
 * and concatenating the results in their original order.
 */
bindingset[s]
private string escapeUnicodeString(string s) {
  result =
    concat(int index, string escaped |
      escaped = escapeUnicodeChar(getCodepointAt(s, index))
    |
      escaped order by index
    )
}
|
||||
|
||||
/**
 * Gets a unicode escaped string for `char`.
 *
 * - If `char` is printable (see `isPrintable`), the result is `char` itself.
 * - Otherwise, if the codepoint has a 4-digit hex form, the result is `\uXXXX`.
 * - Otherwise the result is `\u{X...}` using the unpadded hex form.
 */
bindingset[char]
private string escapeUnicodeChar(string char) {
  if isPrintable(char)
  then result = char
  else
    // bind the codepoint of `char` once and reuse it in both escape forms
    exists(int codepoint | codepoint.toUnicode() = char |
      if exists(to4digitHex(codepoint))
      then result = "\\u" + to4digitHex(codepoint)
      else result = "\\u{" + toHex(codepoint) + "}"
    )
}
|
||||
|
||||
/**
 * Holds if `char` is an easily printable char (one that `ascii` has a code for),
 * or one of the whitespace chars newline, carriage return and tab.
 */
private predicate isPrintable(string char) {
  char = ["\n", "\r", "\t"]
  or
  exists(int code | code = ascii(char))
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -3,4 +3,5 @@ version: 0.1.3-dev
|
||||
groups: shared
|
||||
library: true
|
||||
dependencies:
|
||||
codeql/util: ${workspace}
|
||||
warnOnImplicitThis: true
|
||||
|
||||
@@ -50,7 +50,7 @@ int parseHexInt(string hex) {
|
||||
sum(int index, string c |
|
||||
c = stripped.charAt(index)
|
||||
|
|
||||
sixteenToThe(stripped.length() - 1 - index) * toHex(c)
|
||||
sixteenToThe(stripped.length() - 1 - index) * charToHex(c)
|
||||
)
|
||||
)
|
||||
}
|
||||
@@ -83,7 +83,7 @@ int parseOctalInt(string octal) {
|
||||
}
|
||||
|
||||
/** Gets the integer value of the `hex` char. */
|
||||
private int toHex(string hex) {
|
||||
private int charToHex(string hex) {
|
||||
hex = [0 .. 9].toString() and
|
||||
result = hex.toInt()
|
||||
or
|
||||
@@ -100,6 +100,32 @@ private int toHex(string hex) {
|
||||
result = 15 and hex = ["f", "F"]
|
||||
}
|
||||
|
||||
/**
 * Gets the hex representation of `i`, left-padded with zeroes to four digits.
 * Only has a result when `i` is in the range 0..65535 (inclusive).
 */
bindingset[i]
string to4digitHex(int i) {
  i in [0 .. 65535] and
  exists(string digits, string padding |
    digits = toHex(i) and
    // one "0" for every digit missing from the four-digit width
    padding = concat(int pos | pos = [1 .. 4 - digits.length()] | "0") and
    result = padding + digits
  )
}
|
||||
|
||||
/**
 * Gets a hex representation of `i`, using lower-case digits and no leading zeroes.
 * Zero is rendered as `"0"`.
 */
bindingset[i]
string toHex(int i) {
  result =
    // render all eight 4-bit groups (most significant first), then strip the leading zeroes
    // in a post-processing step. The lookahead keeps the final digit, so that `toHex(0)`
    // is "0" rather than the empty string.
    concat(int shift |
      shift in [28, 24, 20, 16, 12, 8, 4, 0]
    |
      "0123456789abcdef".charAt(i.bitShiftRight(shift).bitAnd(15)) order by shift desc
    ).regexpReplaceAll("^0+(?!$)", "")
}
|
||||
|
||||
/**
|
||||
* Gets the value of 16 to the power of `n`. Holds only for `n` in the range
|
||||
* 0..7 (inclusive).
|
||||
|
||||
Reference in New Issue
Block a user