support case insensitive regexps in the ReDoS queries

This commit is contained in:
Erik Krogh Kristensen
2021-08-29 12:03:27 +02:00
parent bf15b18f22
commit f5a1a12435
9 changed files with 233 additions and 20 deletions

View File

@@ -462,4 +462,4 @@
"javascript/ql/lib/semmle/javascript/security/performance/SuperlinearBackTracking.qll",
"python/ql/lib/semmle/python/security/performance/SuperlinearBackTracking.qll"
]
}
}

View File

@@ -164,6 +164,7 @@ class RelevantRegExpTerm extends RegExpTerm {
/**
* Holds if `term` is the chosen canonical representative for all terms with string representation `str`.
* The string representation includes which flags are used with the regular expression.
*
* Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
* The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks.
@@ -173,26 +174,54 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
min(RelevantRegExpTerm t, Location loc, File file |
loc = t.getLocation() and
file = t.getFile() and
str = t.getRawValue()
str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm())
|
t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn()
)
}
/**
* Gets a string reperesentation of the flags used with the regular expression.
* Only the flags that are relevant for the canonicalization are included.
*/
string getCanonicalizationFlags(RegExpTerm root) {
root.isRootTerm() and
(
RegExpFlags::isIgnoreCase(root) and
result = "i"
or
not RegExpFlags::isIgnoreCase(root) and
result = ""
)
}
/**
* An abstract input symbol, representing a set of concrete characters.
*/
private newtype TInputSymbol =
/** An input symbol corresponding to character `c`. */
Char(string c) {
c = any(RegexpCharacterConstant cc | cc instanceof RelevantRegExpTerm).getValue().charAt(_)
c =
any(RegexpCharacterConstant cc |
cc instanceof RelevantRegExpTerm and
not RegExpFlags::isIgnoreCase(cc.getRootTerm())
).getValue().charAt(_)
or
// normalize to lower case if the regexp is case insensitive
c =
any(RegexpCharacterConstant cc, string char |
cc instanceof RelevantRegExpTerm and
RegExpFlags::isIgnoreCase(cc.getRootTerm()) and
char = cc.getValue().charAt(_)
|
char.toLowerCase()
)
} or
/**
* An input symbol representing all characters matched by
* a (non-universal) character class that has string representation `charClassString`.
*/
CharClass(string charClassString) {
exists(RelevantRegExpTerm term | term.getRawValue() = charClassString) and
exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) |
recc instanceof RegExpCharacterClass and
not recc.(RegExpCharacterClass).isUniversalClass()
@@ -293,6 +322,19 @@ private module CharacterClasses {
*/
pragma[noinline]
predicate hasChildThatMatches(RegExpCharacterClass cc, string char) {
if RegExpFlags::isIgnoreCase(cc.getRootTerm())
then
// normalize everything to lower case if the regexp is case insensitive
exists(string c | hasChildThatMatchesIgnoringCasing(cc, c) | char = c.toLowerCase())
else hasChildThatMatchesIgnoringCasing(cc, char)
}
/**
* Holds if the character class `cc` has a child (constant or range) that matches `char`.
* Ignores whether the character class is inside a regular expression that ignores casing.
*/
pragma[noinline]
predicate hasChildThatMatchesIgnoringCasing(RegExpCharacterClass cc, string char) {
exists(getCanonicalCharClass(cc)) and
exists(RegExpTerm child | child = cc.getAChild() |
char = child.(RegexpCharacterConstant).getValue()
@@ -508,7 +550,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) }
/**
* Gets a state the NFA may be in after matching `t`.
*/
private State after(RegExpTerm t) {
State after(RegExpTerm t) {
exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt))
or
exists(RegExpSequence seq, int i | t = seq.getChild(i) |
@@ -537,7 +579,14 @@ private State after(RegExpTerm t) {
predicate delta(State q1, EdgeLabel lbl, State q2) {
exists(RegexpCharacterConstant s, int i |
q1 = Match(s, i) and
lbl = Char(s.getValue().charAt(i)) and
(
not RegExpFlags::isIgnoreCase(s.getRootTerm()) and
lbl = Char(s.getValue().charAt(i))
or
// normalizing to lower case if ignorecase flag is set
RegExpFlags::isIgnoreCase(s.getRootTerm()) and
exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase()))
) and
(
q2 = Match(s, i + 1)
or
@@ -547,20 +596,20 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
)
or
exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) |
if dot.getLiteral().isDotAll() then lbl = Any() else lbl = Dot()
if RegExpFlags::isDotAll(dot.getRootTerm()) then lbl = Any() else lbl = Dot()
)
or
exists(RegExpCharacterClass cc |
cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
or
q1 = before(cc) and
lbl = CharClass(cc.getRawValue()) and
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
q2 = after(cc)
)
or
exists(RegExpCharacterClassEscape cc |
q1 = before(cc) and
lbl = CharClass(cc.getRawValue()) and
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
q2 = after(cc)
)
or
@@ -627,13 +676,27 @@ RegExpRoot getRoot(RegExpTerm term) {
result = getRoot(term.getParent())
}
private newtype TState =
/**
* A state in the NFA.
*/
newtype TState =
/**
* A state representing that the NFA is about to match a term.
* `i` is used to index into multi-char literals.
*/
Match(RelevantRegExpTerm t, int i) {
i = 0
or
exists(t.(RegexpCharacterConstant).getValue().charAt(i))
} or
/**
* An accept state, where exactly the given input string is accepted.
*/
Accept(RegExpRoot l) { l.isRelevant() } or
/**
* An accept state, where the given input string, or any string that has this
* string as a prefix, is accepted.
*/
AcceptAnySuffix(RegExpRoot l) { l.isRelevant() }
/**

View File

@@ -12,3 +12,41 @@ import javascript
* For javascript we make the pragmatic performance optimization to ignore minified files.
*/
predicate isExcluded(RegExpParent parent) { parent.(Expr).getTopLevel().isMinified() }
/**
* A module containing predicates for determining which flags a regular expression have.
*/
module RegExpFlags {
/**
* Holds if `root` has the `i` flag for case-insensitive matching.
*/
predicate isIgnoreCase(RegExpTerm root) {
root.isRootTerm() and
exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
RegExp::isIgnoreCase(node.getFlags())
)
}
/**
* Gets the flags for `root`, or the empty string if `root` has no flags.
*/
string getFlags(RegExpTerm root) {
root.isRootTerm() and
exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
result = node.getFlags()
or
not exists(node.getFlags()) and
result = ""
)
}
/**
* Holds if `root` has the `s` flag for multi-line matching.
*/
predicate isDotAll(RegExpTerm root) {
root.isRootTerm() and
exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
RegExp::isDotAll(node.getFlags())
)
}
}

View File

@@ -502,3 +502,7 @@
| tst.js:375:15:375:16 | x* | Strings with many repetitions of 'x' can start matching anywhere after the start of the preceeding (x*)+(?=$\|y) |
| tst.js:378:16:378:22 | [\\s\\S]* | Strings with many repetitions of 'a' can start matching anywhere after the start of the preceeding ([\\s\\S]*)+(?=$) |
| tst.js:379:16:379:22 | [\\s\\S]* | Strings with many repetitions of 'a' can start matching anywhere after the start of the preceeding ([\\s\\S]*)+(?=$\|y) |
| tst.js:381:15:381:24 | (foo\|FOO)* | Strings with many repetitions of 'FOO' can start matching anywhere after the start of the preceeding (foo\|FOO)*bar |
| tst.js:382:14:382:23 | (foo\|FOO)* | Strings with many repetitions of 'foo' can start matching anywhere after the start of the preceeding (foo\|FOO)*bar |
| tst.js:384:15:384:26 | ([AB]\|[ab])* | Strings with many repetitions of 'A' can start matching anywhere after the start of the preceeding ([AB]\|[ab])*C |
| tst.js:385:14:385:25 | ([DE]\|[de])* | Strings with many repetitions of 'd' can start matching anywhere after the start of the preceeding ([DE]\|[de])*F |

View File

@@ -178,3 +178,5 @@
| tst.js:375:15:375:16 | x* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'x'. |
| tst.js:378:16:378:22 | [\\s\\S]* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
| tst.js:379:16:379:22 | [\\s\\S]* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
| tst.js:382:14:382:23 | (foo\|FOO)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'foo'. |
| tst.js:385:14:385:25 | ([DE]\|[de])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'd'. |

View File

@@ -376,4 +376,10 @@ var bad90 = /(x*)+(?=$|y)/
// GOOD - but we spuriously conclude that a rejecting suffix exists.
var good44 = /([\s\S]*)+(?=$)/;
var good45 = /([\s\S]*)+(?=$|y)/;
var good45 = /([\s\S]*)+(?=$|y)/;
var good46 = /(foo|FOO)*bar/;
var bad91 = /(foo|FOO)*bar/i;
var good47 = /([AB]|[ab])*C/;
var bad92 = /([DE]|[de])*F/i;

View File

@@ -61,6 +61,14 @@ class RegExpLiteral extends TRegExpLiteral, RegExpParent {
predicate isDotAll() { re.getAMode() = "DOTALL" }
predicate isIgnoreCase() { re.getAMode() = "IGNORECASE" }
string getFlags() {
not exists(re.getAMode()) and result = ""
or
result = strictconcat(string mode | mode = re.getAMode() | mode, " | ")
}
override Regex getRegex() { result = re }
string getPrimaryQLClass() { result = "RegExpLiteral" }

View File

@@ -164,6 +164,7 @@ class RelevantRegExpTerm extends RegExpTerm {
/**
* Holds if `term` is the chosen canonical representative for all terms with string representation `str`.
* The string representation includes which flags are used with the regular expression.
*
* Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
* The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks.
@@ -173,26 +174,54 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
min(RelevantRegExpTerm t, Location loc, File file |
loc = t.getLocation() and
file = t.getFile() and
str = t.getRawValue()
str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm())
|
t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn()
)
}
/**
* Gets a string reperesentation of the flags used with the regular expression.
* Only the flags that are relevant for the canonicalization are included.
*/
string getCanonicalizationFlags(RegExpTerm root) {
root.isRootTerm() and
(
RegExpFlags::isIgnoreCase(root) and
result = "i"
or
not RegExpFlags::isIgnoreCase(root) and
result = ""
)
}
/**
* An abstract input symbol, representing a set of concrete characters.
*/
private newtype TInputSymbol =
/** An input symbol corresponding to character `c`. */
Char(string c) {
c = any(RegexpCharacterConstant cc | cc instanceof RelevantRegExpTerm).getValue().charAt(_)
c =
any(RegexpCharacterConstant cc |
cc instanceof RelevantRegExpTerm and
not RegExpFlags::isIgnoreCase(cc.getRootTerm())
).getValue().charAt(_)
or
// normalize to lower case if the regexp is case insensitive
c =
any(RegexpCharacterConstant cc, string char |
cc instanceof RelevantRegExpTerm and
RegExpFlags::isIgnoreCase(cc.getRootTerm()) and
char = cc.getValue().charAt(_)
|
char.toLowerCase()
)
} or
/**
* An input symbol representing all characters matched by
* a (non-universal) character class that has string representation `charClassString`.
*/
CharClass(string charClassString) {
exists(RelevantRegExpTerm term | term.getRawValue() = charClassString) and
exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) |
recc instanceof RegExpCharacterClass and
not recc.(RegExpCharacterClass).isUniversalClass()
@@ -293,6 +322,19 @@ private module CharacterClasses {
*/
pragma[noinline]
predicate hasChildThatMatches(RegExpCharacterClass cc, string char) {
if RegExpFlags::isIgnoreCase(cc.getRootTerm())
then
// normalize everything to lower case if the regexp is case insensitive
exists(string c | hasChildThatMatchesIgnoringCasing(cc, c) | char = c.toLowerCase())
else hasChildThatMatchesIgnoringCasing(cc, char)
}
/**
* Holds if the character class `cc` has a child (constant or range) that matches `char`.
* Ignores whether the character class is inside a regular expression that ignores casing.
*/
pragma[noinline]
predicate hasChildThatMatchesIgnoringCasing(RegExpCharacterClass cc, string char) {
exists(getCanonicalCharClass(cc)) and
exists(RegExpTerm child | child = cc.getAChild() |
char = child.(RegexpCharacterConstant).getValue()
@@ -508,7 +550,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) }
/**
* Gets a state the NFA may be in after matching `t`.
*/
private State after(RegExpTerm t) {
State after(RegExpTerm t) {
exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt))
or
exists(RegExpSequence seq, int i | t = seq.getChild(i) |
@@ -537,7 +579,14 @@ private State after(RegExpTerm t) {
predicate delta(State q1, EdgeLabel lbl, State q2) {
exists(RegexpCharacterConstant s, int i |
q1 = Match(s, i) and
lbl = Char(s.getValue().charAt(i)) and
(
not RegExpFlags::isIgnoreCase(s.getRootTerm()) and
lbl = Char(s.getValue().charAt(i))
or
// normalizing to lower case if ignorecase flag is set
RegExpFlags::isIgnoreCase(s.getRootTerm()) and
exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase()))
) and
(
q2 = Match(s, i + 1)
or
@@ -547,20 +596,20 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
)
or
exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) |
if dot.getLiteral().isDotAll() then lbl = Any() else lbl = Dot()
if RegExpFlags::isDotAll(dot.getRootTerm()) then lbl = Any() else lbl = Dot()
)
or
exists(RegExpCharacterClass cc |
cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
or
q1 = before(cc) and
lbl = CharClass(cc.getRawValue()) and
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
q2 = after(cc)
)
or
exists(RegExpCharacterClassEscape cc |
q1 = before(cc) and
lbl = CharClass(cc.getRawValue()) and
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
q2 = after(cc)
)
or
@@ -627,13 +676,27 @@ RegExpRoot getRoot(RegExpTerm term) {
result = getRoot(term.getParent())
}
private newtype TState =
/**
* A state in the NFA.
*/
newtype TState =
/**
* A state representing that the NFA is about to match a term.
* `i` is used to index into multi-char literals.
*/
Match(RelevantRegExpTerm t, int i) {
i = 0
or
exists(t.(RegexpCharacterConstant).getValue().charAt(i))
} or
/**
* An accept state, where exactly the given input string is accepted.
*/
Accept(RegExpRoot l) { l.isRelevant() } or
/**
* An accept state, where the given input string, or any string that has this
* string as a prefix, is accepted.
*/
AcceptAnySuffix(RegExpRoot l) { l.isRelevant() }
/**

View File

@@ -18,3 +18,32 @@ predicate isExcluded(RegExpParent parent) {
// we explicitly exclude these.
count(int i | exists(parent.getRegex().getText().regexpFind("\\.\\*", i, _)) | i) > 10
}
/**
* A module containing predicates for determining which flags a regular expression have.
*/
module RegExpFlags {
/**
* Holds if `root` has the `i` flag for case-insensitive matching.
*/
predicate isIgnoreCase(RegExpTerm root) {
root.isRootTerm() and
root.getLiteral().isIgnoreCase()
}
/**
* Gets the flags for `root`, or the empty string if `root` has no flags.
*/
string getFlags(RegExpTerm root) {
root.isRootTerm() and
result = root.getLiteral().getFlags()
}
/**
* Holds if `root` has the `s` flag for multi-line matching.
*/
predicate isDotAll(RegExpTerm root) {
root.isRootTerm() and
root.getLiteral().isDotAll()
}
}