From f5a1a124350e5363c4267cdd82f298b93a0d108f Mon Sep 17 00:00:00 2001 From: Erik Krogh Kristensen Date: Sun, 29 Aug 2021 12:03:27 +0200 Subject: [PATCH] support case insensitive regexps in the ReDoS queries --- config/identical-files.json | 2 +- .../security/performance/ReDoSUtil.qll | 81 ++++++++++++++++--- .../security/performance/RegExpTreeView.qll | 38 +++++++++ .../ReDoS/PolynomialBackTracking.expected | 4 + .../Performance/ReDoS/ReDoS.expected | 2 + .../test/query-tests/Performance/ReDoS/tst.js | 8 +- python/ql/lib/semmle/python/RegexTreeView.qll | 8 ++ .../python/security/performance/ReDoSUtil.qll | 81 ++++++++++++++++--- .../security/performance/RegExpTreeView.qll | 29 +++++++ 9 files changed, 233 insertions(+), 20 deletions(-) diff --git a/config/identical-files.json b/config/identical-files.json index 037b0126cb3..74ef7b82323 100644 --- a/config/identical-files.json +++ b/config/identical-files.json @@ -462,4 +462,4 @@ "javascript/ql/lib/semmle/javascript/security/performance/SuperlinearBackTracking.qll", "python/ql/lib/semmle/python/security/performance/SuperlinearBackTracking.qll" ] -} +} \ No newline at end of file diff --git a/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll b/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll index 43601b1d407..cdd42ce1540 100644 --- a/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll +++ b/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll @@ -164,6 +164,7 @@ class RelevantRegExpTerm extends RegExpTerm { /** * Holds if `term` is the chosen canonical representative for all terms with string representation `str`. + * The string representation includes which flags are used with the regular expression. * * Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s. * The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks. @@ -173,26 +174,54 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) { min(RelevantRegExpTerm t, Location loc, File file | loc = t.getLocation() and file = t.getFile() and - str = t.getRawValue() + str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm()) | t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn() ) } +/** + * Gets a string reperesentation of the flags used with the regular expression. + * Only the flags that are relevant for the canonicalization are included. + */ +string getCanonicalizationFlags(RegExpTerm root) { + root.isRootTerm() and + ( + RegExpFlags::isIgnoreCase(root) and + result = "i" + or + not RegExpFlags::isIgnoreCase(root) and + result = "" + ) +} + /** * An abstract input symbol, representing a set of concrete characters. */ private newtype TInputSymbol = /** An input symbol corresponding to character `c`. */ Char(string c) { - c = any(RegexpCharacterConstant cc | cc instanceof RelevantRegExpTerm).getValue().charAt(_) + c = + any(RegexpCharacterConstant cc | + cc instanceof RelevantRegExpTerm and + not RegExpFlags::isIgnoreCase(cc.getRootTerm()) + ).getValue().charAt(_) + or + // normalize to lower case if the regexp is case insensitive + c = + any(RegexpCharacterConstant cc, string char | + cc instanceof RelevantRegExpTerm and + RegExpFlags::isIgnoreCase(cc.getRootTerm()) and + char = cc.getValue().charAt(_) + | + char.toLowerCase() + ) } or /** * An input symbol representing all characters matched by * a (non-universal) character class that has string representation `charClassString`. */ CharClass(string charClassString) { - exists(RelevantRegExpTerm term | term.getRawValue() = charClassString) and exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) | recc instanceof RegExpCharacterClass and not recc.(RegExpCharacterClass).isUniversalClass() @@ -293,6 +322,19 @@ private module CharacterClasses { */ pragma[noinline] predicate hasChildThatMatches(RegExpCharacterClass cc, string char) { + if RegExpFlags::isIgnoreCase(cc.getRootTerm()) + then + // normalize everything to lower case if the regexp is case insensitive + exists(string c | hasChildThatMatchesIgnoringCasing(cc, c) | char = c.toLowerCase()) + else hasChildThatMatchesIgnoringCasing(cc, char) + } + + /** + * Holds if the character class `cc` has a child (constant or range) that matches `char`. + * Ignores whether the character class is inside a regular expression that ignores casing. + */ + pragma[noinline] + predicate hasChildThatMatchesIgnoringCasing(RegExpCharacterClass cc, string char) { exists(getCanonicalCharClass(cc)) and exists(RegExpTerm child | child = cc.getAChild() | char = child.(RegexpCharacterConstant).getValue() @@ -508,7 +550,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) } /** * Gets a state the NFA may be in after matching `t`. */ -private State after(RegExpTerm t) { +State after(RegExpTerm t) { exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt)) or exists(RegExpSequence seq, int i | t = seq.getChild(i) | @@ -537,7 +579,14 @@ private State after(RegExpTerm t) { predicate delta(State q1, EdgeLabel lbl, State q2) { exists(RegexpCharacterConstant s, int i | q1 = Match(s, i) and - lbl = Char(s.getValue().charAt(i)) and + ( + not RegExpFlags::isIgnoreCase(s.getRootTerm()) and + lbl = Char(s.getValue().charAt(i)) + or + // normalizing to lower case if ignorecase flag is set + RegExpFlags::isIgnoreCase(s.getRootTerm()) and + exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase())) + ) and ( q2 = Match(s, i + 1) or @@ -547,20 +596,20 @@ predicate delta(State q1, EdgeLabel lbl, State q2) { ) or exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) | - if dot.getLiteral().isDotAll() then lbl = Any() else lbl = Dot() + if RegExpFlags::isDotAll(dot.getRootTerm()) then lbl = Any() else lbl = Dot() ) or exists(RegExpCharacterClass cc | cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc) or q1 = before(cc) and - lbl = CharClass(cc.getRawValue()) and + lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and q2 = after(cc) ) or exists(RegExpCharacterClassEscape cc | q1 = before(cc) and - lbl = CharClass(cc.getRawValue()) and + lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and q2 = after(cc) ) or @@ -627,13 +676,27 @@ RegExpRoot getRoot(RegExpTerm term) { result = getRoot(term.getParent()) } -private newtype TState = +/** + * A state in the NFA. + */ +newtype TState = + /** + * A state representing that the NFA is about to match a term. + * `i` is used to index into multi-char literals. + */ Match(RelevantRegExpTerm t, int i) { i = 0 or exists(t.(RegexpCharacterConstant).getValue().charAt(i)) } or + /** + * An accept state, where exactly the given input string is accepted. + */ Accept(RegExpRoot l) { l.isRelevant() } or + /** + * An accept state, where the given input string, or any string that has this + * string as a prefix, is accepted. + */ AcceptAnySuffix(RegExpRoot l) { l.isRelevant() } /** diff --git a/javascript/ql/lib/semmle/javascript/security/performance/RegExpTreeView.qll b/javascript/ql/lib/semmle/javascript/security/performance/RegExpTreeView.qll index f730f62f5b8..f896e44f5e9 100644 --- a/javascript/ql/lib/semmle/javascript/security/performance/RegExpTreeView.qll +++ b/javascript/ql/lib/semmle/javascript/security/performance/RegExpTreeView.qll @@ -12,3 +12,41 @@ import javascript * For javascript we make the pragmatic performance optimization to ignore minified files. */ predicate isExcluded(RegExpParent parent) { parent.(Expr).getTopLevel().isMinified() } + +/** + * A module containing predicates for determining which flags a regular expression have. + */ +module RegExpFlags { + /** + * Holds if `root` has the `i` flag for case-insensitive matching. + */ + predicate isIgnoreCase(RegExpTerm root) { + root.isRootTerm() and + exists(DataFlow::RegExpCreationNode node | node.getRoot() = root | + RegExp::isIgnoreCase(node.getFlags()) + ) + } + + /** + * Gets the flags for `root`, or the empty string if `root` has no flags. + */ + string getFlags(RegExpTerm root) { + root.isRootTerm() and + exists(DataFlow::RegExpCreationNode node | node.getRoot() = root | + result = node.getFlags() + or + not exists(node.getFlags()) and + result = "" + ) + } + + /** + * Holds if `root` has the `s` flag for multi-line matching. + */ + predicate isDotAll(RegExpTerm root) { + root.isRootTerm() and + exists(DataFlow::RegExpCreationNode node | node.getRoot() = root | + RegExp::isDotAll(node.getFlags()) + ) + } +} diff --git a/javascript/ql/test/query-tests/Performance/ReDoS/PolynomialBackTracking.expected b/javascript/ql/test/query-tests/Performance/ReDoS/PolynomialBackTracking.expected index 9dce8af29ad..e38b5f9326d 100644 --- a/javascript/ql/test/query-tests/Performance/ReDoS/PolynomialBackTracking.expected +++ b/javascript/ql/test/query-tests/Performance/ReDoS/PolynomialBackTracking.expected @@ -502,3 +502,7 @@ | tst.js:375:15:375:16 | x* | Strings with many repetitions of 'x' can start matching anywhere after the start of the preceeding (x*)+(?=$\|y) | | tst.js:378:16:378:22 | [\\s\\S]* | Strings with many repetitions of 'a' can start matching anywhere after the start of the preceeding ([\\s\\S]*)+(?=$) | | tst.js:379:16:379:22 | [\\s\\S]* | Strings with many repetitions of 'a' can start matching anywhere after the start of the preceeding ([\\s\\S]*)+(?=$\|y) | +| tst.js:381:15:381:24 | (foo\|FOO)* | Strings with many repetitions of 'FOO' can start matching anywhere after the start of the preceeding (foo\|FOO)*bar | +| tst.js:382:14:382:23 | (foo\|FOO)* | Strings with many repetitions of 'foo' can start matching anywhere after the start of the preceeding (foo\|FOO)*bar | +| tst.js:384:15:384:26 | ([AB]\|[ab])* | Strings with many repetitions of 'A' can start matching anywhere after the start of the preceeding ([AB]\|[ab])*C | +| tst.js:385:14:385:25 | ([DE]\|[de])* | Strings with many repetitions of 'd' can start matching anywhere after the start of the preceeding ([DE]\|[de])*F | diff --git a/javascript/ql/test/query-tests/Performance/ReDoS/ReDoS.expected b/javascript/ql/test/query-tests/Performance/ReDoS/ReDoS.expected index 05352f7a20d..fe23342eb02 100644 --- a/javascript/ql/test/query-tests/Performance/ReDoS/ReDoS.expected +++ b/javascript/ql/test/query-tests/Performance/ReDoS/ReDoS.expected @@ -178,3 +178,5 @@ | tst.js:375:15:375:16 | x* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'x'. | | tst.js:378:16:378:22 | [\\s\\S]* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. | | tst.js:379:16:379:22 | [\\s\\S]* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. | +| tst.js:382:14:382:23 | (foo\|FOO)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'foo'. | +| tst.js:385:14:385:25 | ([DE]\|[de])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'd'. | diff --git a/javascript/ql/test/query-tests/Performance/ReDoS/tst.js b/javascript/ql/test/query-tests/Performance/ReDoS/tst.js index 762929080ae..c3521343111 100644 --- a/javascript/ql/test/query-tests/Performance/ReDoS/tst.js +++ b/javascript/ql/test/query-tests/Performance/ReDoS/tst.js @@ -376,4 +376,10 @@ var bad90 = /(x*)+(?=$|y)/ // GOOD - but we spuriously conclude that a rejecting suffix exists. var good44 = /([\s\S]*)+(?=$)/; -var good45 = /([\s\S]*)+(?=$|y)/; \ No newline at end of file +var good45 = /([\s\S]*)+(?=$|y)/; + +var good46 = /(foo|FOO)*bar/; +var bad91 = /(foo|FOO)*bar/i; + +var good47 = /([AB]|[ab])*C/; +var bad92 = /([DE]|[de])*F/i; diff --git a/python/ql/lib/semmle/python/RegexTreeView.qll b/python/ql/lib/semmle/python/RegexTreeView.qll index cc4c08c6364..784ea06ca29 100644 --- a/python/ql/lib/semmle/python/RegexTreeView.qll +++ b/python/ql/lib/semmle/python/RegexTreeView.qll @@ -61,6 +61,14 @@ class RegExpLiteral extends TRegExpLiteral, RegExpParent { predicate isDotAll() { re.getAMode() = "DOTALL" } + predicate isIgnoreCase() { re.getAMode() = "IGNORECASE" } + + string getFlags() { + not exists(re.getAMode()) and result = "" + or + result = strictconcat(string mode | mode = re.getAMode() | mode, " | ") + } + override Regex getRegex() { result = re } string getPrimaryQLClass() { result = "RegExpLiteral" } diff --git a/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll b/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll index 43601b1d407..cdd42ce1540 100644 --- a/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll +++ b/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll @@ -164,6 +164,7 @@ class RelevantRegExpTerm extends RegExpTerm { /** * Holds if `term` is the chosen canonical representative for all terms with string representation `str`. + * The string representation includes which flags are used with the regular expression. * * Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s. * The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks. @@ -173,26 +174,54 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) { min(RelevantRegExpTerm t, Location loc, File file | loc = t.getLocation() and file = t.getFile() and - str = t.getRawValue() + str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm()) | t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn() ) } +/** + * Gets a string reperesentation of the flags used with the regular expression. + * Only the flags that are relevant for the canonicalization are included. + */ +string getCanonicalizationFlags(RegExpTerm root) { + root.isRootTerm() and + ( + RegExpFlags::isIgnoreCase(root) and + result = "i" + or + not RegExpFlags::isIgnoreCase(root) and + result = "" + ) +} + /** * An abstract input symbol, representing a set of concrete characters. */ private newtype TInputSymbol = /** An input symbol corresponding to character `c`. */ Char(string c) { - c = any(RegexpCharacterConstant cc | cc instanceof RelevantRegExpTerm).getValue().charAt(_) + c = + any(RegexpCharacterConstant cc | + cc instanceof RelevantRegExpTerm and + not RegExpFlags::isIgnoreCase(cc.getRootTerm()) + ).getValue().charAt(_) + or + // normalize to lower case if the regexp is case insensitive + c = + any(RegexpCharacterConstant cc, string char | + cc instanceof RelevantRegExpTerm and + RegExpFlags::isIgnoreCase(cc.getRootTerm()) and + char = cc.getValue().charAt(_) + | + char.toLowerCase() + ) } or /** * An input symbol representing all characters matched by * a (non-universal) character class that has string representation `charClassString`. */ CharClass(string charClassString) { - exists(RelevantRegExpTerm term | term.getRawValue() = charClassString) and exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) | recc instanceof RegExpCharacterClass and not recc.(RegExpCharacterClass).isUniversalClass() @@ -293,6 +322,19 @@ private module CharacterClasses { */ pragma[noinline] predicate hasChildThatMatches(RegExpCharacterClass cc, string char) { + if RegExpFlags::isIgnoreCase(cc.getRootTerm()) + then + // normalize everything to lower case if the regexp is case insensitive + exists(string c | hasChildThatMatchesIgnoringCasing(cc, c) | char = c.toLowerCase()) + else hasChildThatMatchesIgnoringCasing(cc, char) + } + + /** + * Holds if the character class `cc` has a child (constant or range) that matches `char`. + * Ignores whether the character class is inside a regular expression that ignores casing. + */ + pragma[noinline] + predicate hasChildThatMatchesIgnoringCasing(RegExpCharacterClass cc, string char) { exists(getCanonicalCharClass(cc)) and exists(RegExpTerm child | child = cc.getAChild() | char = child.(RegexpCharacterConstant).getValue() @@ -508,7 +550,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) } /** * Gets a state the NFA may be in after matching `t`. */ -private State after(RegExpTerm t) { +State after(RegExpTerm t) { exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt)) or exists(RegExpSequence seq, int i | t = seq.getChild(i) | @@ -537,7 +579,14 @@ private State after(RegExpTerm t) { predicate delta(State q1, EdgeLabel lbl, State q2) { exists(RegexpCharacterConstant s, int i | q1 = Match(s, i) and - lbl = Char(s.getValue().charAt(i)) and + ( + not RegExpFlags::isIgnoreCase(s.getRootTerm()) and + lbl = Char(s.getValue().charAt(i)) + or + // normalizing to lower case if ignorecase flag is set + RegExpFlags::isIgnoreCase(s.getRootTerm()) and + exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase())) + ) and ( q2 = Match(s, i + 1) or @@ -547,20 +596,20 @@ predicate delta(State q1, EdgeLabel lbl, State q2) { ) or exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) | - if dot.getLiteral().isDotAll() then lbl = Any() else lbl = Dot() + if RegExpFlags::isDotAll(dot.getRootTerm()) then lbl = Any() else lbl = Dot() ) or exists(RegExpCharacterClass cc | cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc) or q1 = before(cc) and - lbl = CharClass(cc.getRawValue()) and + lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and q2 = after(cc) ) or exists(RegExpCharacterClassEscape cc | q1 = before(cc) and - lbl = CharClass(cc.getRawValue()) and + lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and q2 = after(cc) ) or @@ -627,13 +676,27 @@ RegExpRoot getRoot(RegExpTerm term) { result = getRoot(term.getParent()) } -private newtype TState = +/** + * A state in the NFA. + */ +newtype TState = + /** + * A state representing that the NFA is about to match a term. + * `i` is used to index into multi-char literals. + */ Match(RelevantRegExpTerm t, int i) { i = 0 or exists(t.(RegexpCharacterConstant).getValue().charAt(i)) } or + /** + * An accept state, where exactly the given input string is accepted. + */ Accept(RegExpRoot l) { l.isRelevant() } or + /** + * An accept state, where the given input string, or any string that has this + * string as a prefix, is accepted. + */ AcceptAnySuffix(RegExpRoot l) { l.isRelevant() } /** diff --git a/python/ql/lib/semmle/python/security/performance/RegExpTreeView.qll b/python/ql/lib/semmle/python/security/performance/RegExpTreeView.qll index 56a6c90ffde..4fbe32f7e7e 100644 --- a/python/ql/lib/semmle/python/security/performance/RegExpTreeView.qll +++ b/python/ql/lib/semmle/python/security/performance/RegExpTreeView.qll @@ -18,3 +18,32 @@ predicate isExcluded(RegExpParent parent) { // we explicitly exclude these. count(int i | exists(parent.getRegex().getText().regexpFind("\\.\\*", i, _)) | i) > 10 } + +/** + * A module containing predicates for determining which flags a regular expression have. + */ +module RegExpFlags { + /** + * Holds if `root` has the `i` flag for case-insensitive matching. + */ + predicate isIgnoreCase(RegExpTerm root) { + root.isRootTerm() and + root.getLiteral().isIgnoreCase() + } + + /** + * Gets the flags for `root`, or the empty string if `root` has no flags. + */ + string getFlags(RegExpTerm root) { + root.isRootTerm() and + result = root.getLiteral().getFlags() + } + + /** + * Holds if `root` has the `s` flag for multi-line matching. + */ + predicate isDotAll(RegExpTerm root) { + root.isRootTerm() and + root.getLiteral().isDotAll() + } +}