mirror of
https://github.com/github/codeql.git
synced 2025-12-16 16:53:25 +01:00
support case insensitive regexps in the ReDoS queries
This commit is contained in:
@@ -462,4 +462,4 @@
|
||||
"javascript/ql/lib/semmle/javascript/security/performance/SuperlinearBackTracking.qll",
|
||||
"python/ql/lib/semmle/python/security/performance/SuperlinearBackTracking.qll"
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -164,6 +164,7 @@ class RelevantRegExpTerm extends RegExpTerm {
|
||||
|
||||
/**
|
||||
* Holds if `term` is the chosen canonical representative for all terms with string representation `str`.
|
||||
* The string representation includes which flags are used with the regular expression.
|
||||
*
|
||||
* Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
|
||||
* The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks.
|
||||
@@ -173,26 +174,54 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
|
||||
min(RelevantRegExpTerm t, Location loc, File file |
|
||||
loc = t.getLocation() and
|
||||
file = t.getFile() and
|
||||
str = t.getRawValue()
|
||||
str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm())
|
||||
|
|
||||
t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn()
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a string reperesentation of the flags used with the regular expression.
|
||||
* Only the flags that are relevant for the canonicalization are included.
|
||||
*/
|
||||
string getCanonicalizationFlags(RegExpTerm root) {
|
||||
root.isRootTerm() and
|
||||
(
|
||||
RegExpFlags::isIgnoreCase(root) and
|
||||
result = "i"
|
||||
or
|
||||
not RegExpFlags::isIgnoreCase(root) and
|
||||
result = ""
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* An abstract input symbol, representing a set of concrete characters.
|
||||
*/
|
||||
private newtype TInputSymbol =
|
||||
/** An input symbol corresponding to character `c`. */
|
||||
Char(string c) {
|
||||
c = any(RegexpCharacterConstant cc | cc instanceof RelevantRegExpTerm).getValue().charAt(_)
|
||||
c =
|
||||
any(RegexpCharacterConstant cc |
|
||||
cc instanceof RelevantRegExpTerm and
|
||||
not RegExpFlags::isIgnoreCase(cc.getRootTerm())
|
||||
).getValue().charAt(_)
|
||||
or
|
||||
// normalize to lower case if the regexp is case insensitive
|
||||
c =
|
||||
any(RegexpCharacterConstant cc, string char |
|
||||
cc instanceof RelevantRegExpTerm and
|
||||
RegExpFlags::isIgnoreCase(cc.getRootTerm()) and
|
||||
char = cc.getValue().charAt(_)
|
||||
|
|
||||
char.toLowerCase()
|
||||
)
|
||||
} or
|
||||
/**
|
||||
* An input symbol representing all characters matched by
|
||||
* a (non-universal) character class that has string representation `charClassString`.
|
||||
*/
|
||||
CharClass(string charClassString) {
|
||||
exists(RelevantRegExpTerm term | term.getRawValue() = charClassString) and
|
||||
exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) |
|
||||
recc instanceof RegExpCharacterClass and
|
||||
not recc.(RegExpCharacterClass).isUniversalClass()
|
||||
@@ -293,6 +322,19 @@ private module CharacterClasses {
|
||||
*/
|
||||
pragma[noinline]
|
||||
predicate hasChildThatMatches(RegExpCharacterClass cc, string char) {
|
||||
if RegExpFlags::isIgnoreCase(cc.getRootTerm())
|
||||
then
|
||||
// normalize everything to lower case if the regexp is case insensitive
|
||||
exists(string c | hasChildThatMatchesIgnoringCasing(cc, c) | char = c.toLowerCase())
|
||||
else hasChildThatMatchesIgnoringCasing(cc, char)
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if the character class `cc` has a child (constant or range) that matches `char`.
|
||||
* Ignores whether the character class is inside a regular expression that ignores casing.
|
||||
*/
|
||||
pragma[noinline]
|
||||
predicate hasChildThatMatchesIgnoringCasing(RegExpCharacterClass cc, string char) {
|
||||
exists(getCanonicalCharClass(cc)) and
|
||||
exists(RegExpTerm child | child = cc.getAChild() |
|
||||
char = child.(RegexpCharacterConstant).getValue()
|
||||
@@ -508,7 +550,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) }
|
||||
/**
|
||||
* Gets a state the NFA may be in after matching `t`.
|
||||
*/
|
||||
private State after(RegExpTerm t) {
|
||||
State after(RegExpTerm t) {
|
||||
exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt))
|
||||
or
|
||||
exists(RegExpSequence seq, int i | t = seq.getChild(i) |
|
||||
@@ -537,7 +579,14 @@ private State after(RegExpTerm t) {
|
||||
predicate delta(State q1, EdgeLabel lbl, State q2) {
|
||||
exists(RegexpCharacterConstant s, int i |
|
||||
q1 = Match(s, i) and
|
||||
lbl = Char(s.getValue().charAt(i)) and
|
||||
(
|
||||
not RegExpFlags::isIgnoreCase(s.getRootTerm()) and
|
||||
lbl = Char(s.getValue().charAt(i))
|
||||
or
|
||||
// normalizing to lower case if ignorecase flag is set
|
||||
RegExpFlags::isIgnoreCase(s.getRootTerm()) and
|
||||
exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase()))
|
||||
) and
|
||||
(
|
||||
q2 = Match(s, i + 1)
|
||||
or
|
||||
@@ -547,20 +596,20 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
|
||||
)
|
||||
or
|
||||
exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) |
|
||||
if dot.getLiteral().isDotAll() then lbl = Any() else lbl = Dot()
|
||||
if RegExpFlags::isDotAll(dot.getRootTerm()) then lbl = Any() else lbl = Dot()
|
||||
)
|
||||
or
|
||||
exists(RegExpCharacterClass cc |
|
||||
cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
|
||||
or
|
||||
q1 = before(cc) and
|
||||
lbl = CharClass(cc.getRawValue()) and
|
||||
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
|
||||
q2 = after(cc)
|
||||
)
|
||||
or
|
||||
exists(RegExpCharacterClassEscape cc |
|
||||
q1 = before(cc) and
|
||||
lbl = CharClass(cc.getRawValue()) and
|
||||
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
|
||||
q2 = after(cc)
|
||||
)
|
||||
or
|
||||
@@ -627,13 +676,27 @@ RegExpRoot getRoot(RegExpTerm term) {
|
||||
result = getRoot(term.getParent())
|
||||
}
|
||||
|
||||
private newtype TState =
|
||||
/**
|
||||
* A state in the NFA.
|
||||
*/
|
||||
newtype TState =
|
||||
/**
|
||||
* A state representing that the NFA is about to match a term.
|
||||
* `i` is used to index into multi-char literals.
|
||||
*/
|
||||
Match(RelevantRegExpTerm t, int i) {
|
||||
i = 0
|
||||
or
|
||||
exists(t.(RegexpCharacterConstant).getValue().charAt(i))
|
||||
} or
|
||||
/**
|
||||
* An accept state, where exactly the given input string is accepted.
|
||||
*/
|
||||
Accept(RegExpRoot l) { l.isRelevant() } or
|
||||
/**
|
||||
* An accept state, where the given input string, or any string that has this
|
||||
* string as a prefix, is accepted.
|
||||
*/
|
||||
AcceptAnySuffix(RegExpRoot l) { l.isRelevant() }
|
||||
|
||||
/**
|
||||
|
||||
@@ -12,3 +12,41 @@ import javascript
|
||||
* For javascript we make the pragmatic performance optimization to ignore minified files.
|
||||
*/
|
||||
predicate isExcluded(RegExpParent parent) { parent.(Expr).getTopLevel().isMinified() }
|
||||
|
||||
/**
|
||||
* A module containing predicates for determining which flags a regular expression have.
|
||||
*/
|
||||
module RegExpFlags {
|
||||
/**
|
||||
* Holds if `root` has the `i` flag for case-insensitive matching.
|
||||
*/
|
||||
predicate isIgnoreCase(RegExpTerm root) {
|
||||
root.isRootTerm() and
|
||||
exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
|
||||
RegExp::isIgnoreCase(node.getFlags())
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the flags for `root`, or the empty string if `root` has no flags.
|
||||
*/
|
||||
string getFlags(RegExpTerm root) {
|
||||
root.isRootTerm() and
|
||||
exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
|
||||
result = node.getFlags()
|
||||
or
|
||||
not exists(node.getFlags()) and
|
||||
result = ""
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if `root` has the `s` flag for multi-line matching.
|
||||
*/
|
||||
predicate isDotAll(RegExpTerm root) {
|
||||
root.isRootTerm() and
|
||||
exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
|
||||
RegExp::isDotAll(node.getFlags())
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -502,3 +502,7 @@
|
||||
| tst.js:375:15:375:16 | x* | Strings with many repetitions of 'x' can start matching anywhere after the start of the preceeding (x*)+(?=$\|y) |
|
||||
| tst.js:378:16:378:22 | [\\s\\S]* | Strings with many repetitions of 'a' can start matching anywhere after the start of the preceeding ([\\s\\S]*)+(?=$) |
|
||||
| tst.js:379:16:379:22 | [\\s\\S]* | Strings with many repetitions of 'a' can start matching anywhere after the start of the preceeding ([\\s\\S]*)+(?=$\|y) |
|
||||
| tst.js:381:15:381:24 | (foo\|FOO)* | Strings with many repetitions of 'FOO' can start matching anywhere after the start of the preceeding (foo\|FOO)*bar |
|
||||
| tst.js:382:14:382:23 | (foo\|FOO)* | Strings with many repetitions of 'foo' can start matching anywhere after the start of the preceeding (foo\|FOO)*bar |
|
||||
| tst.js:384:15:384:26 | ([AB]\|[ab])* | Strings with many repetitions of 'A' can start matching anywhere after the start of the preceeding ([AB]\|[ab])*C |
|
||||
| tst.js:385:14:385:25 | ([DE]\|[de])* | Strings with many repetitions of 'd' can start matching anywhere after the start of the preceeding ([DE]\|[de])*F |
|
||||
|
||||
@@ -178,3 +178,5 @@
|
||||
| tst.js:375:15:375:16 | x* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'x'. |
|
||||
| tst.js:378:16:378:22 | [\\s\\S]* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
|
||||
| tst.js:379:16:379:22 | [\\s\\S]* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
|
||||
| tst.js:382:14:382:23 | (foo\|FOO)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'foo'. |
|
||||
| tst.js:385:14:385:25 | ([DE]\|[de])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'd'. |
|
||||
|
||||
@@ -376,4 +376,10 @@ var bad90 = /(x*)+(?=$|y)/
|
||||
|
||||
// GOOD - but we spuriously conclude that a rejecting suffix exists.
|
||||
var good44 = /([\s\S]*)+(?=$)/;
|
||||
var good45 = /([\s\S]*)+(?=$|y)/;
|
||||
var good45 = /([\s\S]*)+(?=$|y)/;
|
||||
|
||||
var good46 = /(foo|FOO)*bar/;
|
||||
var bad91 = /(foo|FOO)*bar/i;
|
||||
|
||||
var good47 = /([AB]|[ab])*C/;
|
||||
var bad92 = /([DE]|[de])*F/i;
|
||||
|
||||
@@ -61,6 +61,14 @@ class RegExpLiteral extends TRegExpLiteral, RegExpParent {
|
||||
|
||||
predicate isDotAll() { re.getAMode() = "DOTALL" }
|
||||
|
||||
predicate isIgnoreCase() { re.getAMode() = "IGNORECASE" }
|
||||
|
||||
string getFlags() {
|
||||
not exists(re.getAMode()) and result = ""
|
||||
or
|
||||
result = strictconcat(string mode | mode = re.getAMode() | mode, " | ")
|
||||
}
|
||||
|
||||
override Regex getRegex() { result = re }
|
||||
|
||||
string getPrimaryQLClass() { result = "RegExpLiteral" }
|
||||
|
||||
@@ -164,6 +164,7 @@ class RelevantRegExpTerm extends RegExpTerm {
|
||||
|
||||
/**
|
||||
* Holds if `term` is the chosen canonical representative for all terms with string representation `str`.
|
||||
* The string representation includes which flags are used with the regular expression.
|
||||
*
|
||||
* Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
|
||||
* The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks.
|
||||
@@ -173,26 +174,54 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
|
||||
min(RelevantRegExpTerm t, Location loc, File file |
|
||||
loc = t.getLocation() and
|
||||
file = t.getFile() and
|
||||
str = t.getRawValue()
|
||||
str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm())
|
||||
|
|
||||
t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn()
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a string reperesentation of the flags used with the regular expression.
|
||||
* Only the flags that are relevant for the canonicalization are included.
|
||||
*/
|
||||
string getCanonicalizationFlags(RegExpTerm root) {
|
||||
root.isRootTerm() and
|
||||
(
|
||||
RegExpFlags::isIgnoreCase(root) and
|
||||
result = "i"
|
||||
or
|
||||
not RegExpFlags::isIgnoreCase(root) and
|
||||
result = ""
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* An abstract input symbol, representing a set of concrete characters.
|
||||
*/
|
||||
private newtype TInputSymbol =
|
||||
/** An input symbol corresponding to character `c`. */
|
||||
Char(string c) {
|
||||
c = any(RegexpCharacterConstant cc | cc instanceof RelevantRegExpTerm).getValue().charAt(_)
|
||||
c =
|
||||
any(RegexpCharacterConstant cc |
|
||||
cc instanceof RelevantRegExpTerm and
|
||||
not RegExpFlags::isIgnoreCase(cc.getRootTerm())
|
||||
).getValue().charAt(_)
|
||||
or
|
||||
// normalize to lower case if the regexp is case insensitive
|
||||
c =
|
||||
any(RegexpCharacterConstant cc, string char |
|
||||
cc instanceof RelevantRegExpTerm and
|
||||
RegExpFlags::isIgnoreCase(cc.getRootTerm()) and
|
||||
char = cc.getValue().charAt(_)
|
||||
|
|
||||
char.toLowerCase()
|
||||
)
|
||||
} or
|
||||
/**
|
||||
* An input symbol representing all characters matched by
|
||||
* a (non-universal) character class that has string representation `charClassString`.
|
||||
*/
|
||||
CharClass(string charClassString) {
|
||||
exists(RelevantRegExpTerm term | term.getRawValue() = charClassString) and
|
||||
exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) |
|
||||
recc instanceof RegExpCharacterClass and
|
||||
not recc.(RegExpCharacterClass).isUniversalClass()
|
||||
@@ -293,6 +322,19 @@ private module CharacterClasses {
|
||||
*/
|
||||
pragma[noinline]
|
||||
predicate hasChildThatMatches(RegExpCharacterClass cc, string char) {
|
||||
if RegExpFlags::isIgnoreCase(cc.getRootTerm())
|
||||
then
|
||||
// normalize everything to lower case if the regexp is case insensitive
|
||||
exists(string c | hasChildThatMatchesIgnoringCasing(cc, c) | char = c.toLowerCase())
|
||||
else hasChildThatMatchesIgnoringCasing(cc, char)
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if the character class `cc` has a child (constant or range) that matches `char`.
|
||||
* Ignores whether the character class is inside a regular expression that ignores casing.
|
||||
*/
|
||||
pragma[noinline]
|
||||
predicate hasChildThatMatchesIgnoringCasing(RegExpCharacterClass cc, string char) {
|
||||
exists(getCanonicalCharClass(cc)) and
|
||||
exists(RegExpTerm child | child = cc.getAChild() |
|
||||
char = child.(RegexpCharacterConstant).getValue()
|
||||
@@ -508,7 +550,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) }
|
||||
/**
|
||||
* Gets a state the NFA may be in after matching `t`.
|
||||
*/
|
||||
private State after(RegExpTerm t) {
|
||||
State after(RegExpTerm t) {
|
||||
exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt))
|
||||
or
|
||||
exists(RegExpSequence seq, int i | t = seq.getChild(i) |
|
||||
@@ -537,7 +579,14 @@ private State after(RegExpTerm t) {
|
||||
predicate delta(State q1, EdgeLabel lbl, State q2) {
|
||||
exists(RegexpCharacterConstant s, int i |
|
||||
q1 = Match(s, i) and
|
||||
lbl = Char(s.getValue().charAt(i)) and
|
||||
(
|
||||
not RegExpFlags::isIgnoreCase(s.getRootTerm()) and
|
||||
lbl = Char(s.getValue().charAt(i))
|
||||
or
|
||||
// normalizing to lower case if ignorecase flag is set
|
||||
RegExpFlags::isIgnoreCase(s.getRootTerm()) and
|
||||
exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase()))
|
||||
) and
|
||||
(
|
||||
q2 = Match(s, i + 1)
|
||||
or
|
||||
@@ -547,20 +596,20 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
|
||||
)
|
||||
or
|
||||
exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) |
|
||||
if dot.getLiteral().isDotAll() then lbl = Any() else lbl = Dot()
|
||||
if RegExpFlags::isDotAll(dot.getRootTerm()) then lbl = Any() else lbl = Dot()
|
||||
)
|
||||
or
|
||||
exists(RegExpCharacterClass cc |
|
||||
cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
|
||||
or
|
||||
q1 = before(cc) and
|
||||
lbl = CharClass(cc.getRawValue()) and
|
||||
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
|
||||
q2 = after(cc)
|
||||
)
|
||||
or
|
||||
exists(RegExpCharacterClassEscape cc |
|
||||
q1 = before(cc) and
|
||||
lbl = CharClass(cc.getRawValue()) and
|
||||
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
|
||||
q2 = after(cc)
|
||||
)
|
||||
or
|
||||
@@ -627,13 +676,27 @@ RegExpRoot getRoot(RegExpTerm term) {
|
||||
result = getRoot(term.getParent())
|
||||
}
|
||||
|
||||
private newtype TState =
|
||||
/**
|
||||
* A state in the NFA.
|
||||
*/
|
||||
newtype TState =
|
||||
/**
|
||||
* A state representing that the NFA is about to match a term.
|
||||
* `i` is used to index into multi-char literals.
|
||||
*/
|
||||
Match(RelevantRegExpTerm t, int i) {
|
||||
i = 0
|
||||
or
|
||||
exists(t.(RegexpCharacterConstant).getValue().charAt(i))
|
||||
} or
|
||||
/**
|
||||
* An accept state, where exactly the given input string is accepted.
|
||||
*/
|
||||
Accept(RegExpRoot l) { l.isRelevant() } or
|
||||
/**
|
||||
* An accept state, where the given input string, or any string that has this
|
||||
* string as a prefix, is accepted.
|
||||
*/
|
||||
AcceptAnySuffix(RegExpRoot l) { l.isRelevant() }
|
||||
|
||||
/**
|
||||
|
||||
@@ -18,3 +18,32 @@ predicate isExcluded(RegExpParent parent) {
|
||||
// we explicitly exclude these.
|
||||
count(int i | exists(parent.getRegex().getText().regexpFind("\\.\\*", i, _)) | i) > 10
|
||||
}
|
||||
|
||||
/**
|
||||
* A module containing predicates for determining which flags a regular expression have.
|
||||
*/
|
||||
module RegExpFlags {
|
||||
/**
|
||||
* Holds if `root` has the `i` flag for case-insensitive matching.
|
||||
*/
|
||||
predicate isIgnoreCase(RegExpTerm root) {
|
||||
root.isRootTerm() and
|
||||
root.getLiteral().isIgnoreCase()
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the flags for `root`, or the empty string if `root` has no flags.
|
||||
*/
|
||||
string getFlags(RegExpTerm root) {
|
||||
root.isRootTerm() and
|
||||
result = root.getLiteral().getFlags()
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if `root` has the `s` flag for multi-line matching.
|
||||
*/
|
||||
predicate isDotAll(RegExpTerm root) {
|
||||
root.isRootTerm() and
|
||||
root.getLiteral().isDotAll()
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user