From f5a1a124350e5363c4267cdd82f298b93a0d108f Mon Sep 17 00:00:00 2001 From: Erik Krogh Kristensen Date: Sun, 29 Aug 2021 12:03:27 +0200 Subject: [PATCH 1/7] support case insensitive regexps in the ReDoS queries --- config/identical-files.json | 2 +- .../security/performance/ReDoSUtil.qll | 81 ++++++++++++++++--- .../security/performance/RegExpTreeView.qll | 38 +++++++++ .../ReDoS/PolynomialBackTracking.expected | 4 + .../Performance/ReDoS/ReDoS.expected | 2 + .../test/query-tests/Performance/ReDoS/tst.js | 8 +- python/ql/lib/semmle/python/RegexTreeView.qll | 8 ++ .../python/security/performance/ReDoSUtil.qll | 81 ++++++++++++++++--- .../security/performance/RegExpTreeView.qll | 29 +++++++ 9 files changed, 233 insertions(+), 20 deletions(-) diff --git a/config/identical-files.json b/config/identical-files.json index 037b0126cb3..74ef7b82323 100644 --- a/config/identical-files.json +++ b/config/identical-files.json @@ -462,4 +462,4 @@ "javascript/ql/lib/semmle/javascript/security/performance/SuperlinearBackTracking.qll", "python/ql/lib/semmle/python/security/performance/SuperlinearBackTracking.qll" ] -} +} \ No newline at end of file diff --git a/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll b/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll index 43601b1d407..cdd42ce1540 100644 --- a/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll +++ b/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll @@ -164,6 +164,7 @@ class RelevantRegExpTerm extends RegExpTerm { /** * Holds if `term` is the chosen canonical representative for all terms with string representation `str`. + * The string representation includes which flags are used with the regular expression. * * Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s. * The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks. @@ -173,26 +174,54 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) { min(RelevantRegExpTerm t, Location loc, File file | loc = t.getLocation() and file = t.getFile() and - str = t.getRawValue() + str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm()) | t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn() ) } +/** + * Gets a string reperesentation of the flags used with the regular expression. + * Only the flags that are relevant for the canonicalization are included. + */ +string getCanonicalizationFlags(RegExpTerm root) { + root.isRootTerm() and + ( + RegExpFlags::isIgnoreCase(root) and + result = "i" + or + not RegExpFlags::isIgnoreCase(root) and + result = "" + ) +} + /** * An abstract input symbol, representing a set of concrete characters. */ private newtype TInputSymbol = /** An input symbol corresponding to character `c`. */ Char(string c) { - c = any(RegexpCharacterConstant cc | cc instanceof RelevantRegExpTerm).getValue().charAt(_) + c = + any(RegexpCharacterConstant cc | + cc instanceof RelevantRegExpTerm and + not RegExpFlags::isIgnoreCase(cc.getRootTerm()) + ).getValue().charAt(_) + or + // normalize to lower case if the regexp is case insensitive + c = + any(RegexpCharacterConstant cc, string char | + cc instanceof RelevantRegExpTerm and + RegExpFlags::isIgnoreCase(cc.getRootTerm()) and + char = cc.getValue().charAt(_) + | + char.toLowerCase() + ) } or /** * An input symbol representing all characters matched by * a (non-universal) character class that has string representation `charClassString`. */ CharClass(string charClassString) { - exists(RelevantRegExpTerm term | term.getRawValue() = charClassString) and exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) | recc instanceof RegExpCharacterClass and not recc.(RegExpCharacterClass).isUniversalClass() @@ -293,6 +322,19 @@ private module CharacterClasses { */ pragma[noinline] predicate hasChildThatMatches(RegExpCharacterClass cc, string char) { + if RegExpFlags::isIgnoreCase(cc.getRootTerm()) + then + // normalize everything to lower case if the regexp is case insensitive + exists(string c | hasChildThatMatchesIgnoringCasing(cc, c) | char = c.toLowerCase()) + else hasChildThatMatchesIgnoringCasing(cc, char) + } + + /** + * Holds if the character class `cc` has a child (constant or range) that matches `char`. + * Ignores whether the character class is inside a regular expression that ignores casing. + */ + pragma[noinline] + predicate hasChildThatMatchesIgnoringCasing(RegExpCharacterClass cc, string char) { exists(getCanonicalCharClass(cc)) and exists(RegExpTerm child | child = cc.getAChild() | char = child.(RegexpCharacterConstant).getValue() @@ -508,7 +550,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) } /** * Gets a state the NFA may be in after matching `t`. */ -private State after(RegExpTerm t) { +State after(RegExpTerm t) { exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt)) or exists(RegExpSequence seq, int i | t = seq.getChild(i) | @@ -537,7 +579,14 @@ private State after(RegExpTerm t) { predicate delta(State q1, EdgeLabel lbl, State q2) { exists(RegexpCharacterConstant s, int i | q1 = Match(s, i) and - lbl = Char(s.getValue().charAt(i)) and + ( + not RegExpFlags::isIgnoreCase(s.getRootTerm()) and + lbl = Char(s.getValue().charAt(i)) + or + // normalizing to lower case if ignorecase flag is set + RegExpFlags::isIgnoreCase(s.getRootTerm()) and + exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase())) + ) and ( q2 = Match(s, i + 1) or @@ -547,20 +596,20 @@ predicate delta(State q1, EdgeLabel lbl, State q2) { ) or exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) | - if dot.getLiteral().isDotAll() then lbl = Any() else lbl = Dot() + if RegExpFlags::isDotAll(dot.getRootTerm()) then lbl = Any() else lbl = Dot() ) or exists(RegExpCharacterClass cc | cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc) or q1 = before(cc) and - lbl = CharClass(cc.getRawValue()) and + lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and q2 = after(cc) ) or exists(RegExpCharacterClassEscape cc | q1 = before(cc) and - lbl = CharClass(cc.getRawValue()) and + lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and q2 = after(cc) ) or @@ -627,13 +676,27 @@ RegExpRoot getRoot(RegExpTerm term) { result = getRoot(term.getParent()) } -private newtype TState = +/** + * A state in the NFA. + */ +newtype TState = + /** + * A state representing that the NFA is about to match a term. + * `i` is used to index into multi-char literals. + */ Match(RelevantRegExpTerm t, int i) { i = 0 or exists(t.(RegexpCharacterConstant).getValue().charAt(i)) } or + /** + * An accept state, where exactly the given input string is accepted. + */ Accept(RegExpRoot l) { l.isRelevant() } or + /** + * An accept state, where the given input string, or any string that has this + * string as a prefix, is accepted. + */ AcceptAnySuffix(RegExpRoot l) { l.isRelevant() } /** diff --git a/javascript/ql/lib/semmle/javascript/security/performance/RegExpTreeView.qll b/javascript/ql/lib/semmle/javascript/security/performance/RegExpTreeView.qll index f730f62f5b8..f896e44f5e9 100644 --- a/javascript/ql/lib/semmle/javascript/security/performance/RegExpTreeView.qll +++ b/javascript/ql/lib/semmle/javascript/security/performance/RegExpTreeView.qll @@ -12,3 +12,41 @@ import javascript * For javascript we make the pragmatic performance optimization to ignore minified files. */ predicate isExcluded(RegExpParent parent) { parent.(Expr).getTopLevel().isMinified() } + +/** + * A module containing predicates for determining which flags a regular expression have. + */ +module RegExpFlags { + /** + * Holds if `root` has the `i` flag for case-insensitive matching. + */ + predicate isIgnoreCase(RegExpTerm root) { + root.isRootTerm() and + exists(DataFlow::RegExpCreationNode node | node.getRoot() = root | + RegExp::isIgnoreCase(node.getFlags()) + ) + } + + /** + * Gets the flags for `root`, or the empty string if `root` has no flags. + */ + string getFlags(RegExpTerm root) { + root.isRootTerm() and + exists(DataFlow::RegExpCreationNode node | node.getRoot() = root | + result = node.getFlags() + or + not exists(node.getFlags()) and + result = "" + ) + } + + /** + * Holds if `root` has the `s` flag for multi-line matching. + */ + predicate isDotAll(RegExpTerm root) { + root.isRootTerm() and + exists(DataFlow::RegExpCreationNode node | node.getRoot() = root | + RegExp::isDotAll(node.getFlags()) + ) + } +} diff --git a/javascript/ql/test/query-tests/Performance/ReDoS/PolynomialBackTracking.expected b/javascript/ql/test/query-tests/Performance/ReDoS/PolynomialBackTracking.expected index 9dce8af29ad..e38b5f9326d 100644 --- a/javascript/ql/test/query-tests/Performance/ReDoS/PolynomialBackTracking.expected +++ b/javascript/ql/test/query-tests/Performance/ReDoS/PolynomialBackTracking.expected @@ -502,3 +502,7 @@ | tst.js:375:15:375:16 | x* | Strings with many repetitions of 'x' can start matching anywhere after the start of the preceeding (x*)+(?=$\|y) | | tst.js:378:16:378:22 | [\\s\\S]* | Strings with many repetitions of 'a' can start matching anywhere after the start of the preceeding ([\\s\\S]*)+(?=$) | | tst.js:379:16:379:22 | [\\s\\S]* | Strings with many repetitions of 'a' can start matching anywhere after the start of the preceeding ([\\s\\S]*)+(?=$\|y) | +| tst.js:381:15:381:24 | (foo\|FOO)* | Strings with many repetitions of 'FOO' can start matching anywhere after the start of the preceeding (foo\|FOO)*bar | +| tst.js:382:14:382:23 | (foo\|FOO)* | Strings with many repetitions of 'foo' can start matching anywhere after the start of the preceeding (foo\|FOO)*bar | +| tst.js:384:15:384:26 | ([AB]\|[ab])* | Strings with many repetitions of 'A' can start matching anywhere after the start of the preceeding ([AB]\|[ab])*C | +| tst.js:385:14:385:25 | ([DE]\|[de])* | Strings with many repetitions of 'd' can start matching anywhere after the start of the preceeding ([DE]\|[de])*F | diff --git a/javascript/ql/test/query-tests/Performance/ReDoS/ReDoS.expected b/javascript/ql/test/query-tests/Performance/ReDoS/ReDoS.expected index 05352f7a20d..fe23342eb02 100644 --- a/javascript/ql/test/query-tests/Performance/ReDoS/ReDoS.expected +++ b/javascript/ql/test/query-tests/Performance/ReDoS/ReDoS.expected @@ -178,3 +178,5 @@ | tst.js:375:15:375:16 | x* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'x'. | | tst.js:378:16:378:22 | [\\s\\S]* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. | | tst.js:379:16:379:22 | [\\s\\S]* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. | +| tst.js:382:14:382:23 | (foo\|FOO)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'foo'. | +| tst.js:385:14:385:25 | ([DE]\|[de])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'd'. | diff --git a/javascript/ql/test/query-tests/Performance/ReDoS/tst.js b/javascript/ql/test/query-tests/Performance/ReDoS/tst.js index 762929080ae..c3521343111 100644 --- a/javascript/ql/test/query-tests/Performance/ReDoS/tst.js +++ b/javascript/ql/test/query-tests/Performance/ReDoS/tst.js @@ -376,4 +376,10 @@ var bad90 = /(x*)+(?=$|y)/ // GOOD - but we spuriously conclude that a rejecting suffix exists. var good44 = /([\s\S]*)+(?=$)/; -var good45 = /([\s\S]*)+(?=$|y)/; \ No newline at end of file +var good45 = /([\s\S]*)+(?=$|y)/; + +var good46 = /(foo|FOO)*bar/; +var bad91 = /(foo|FOO)*bar/i; + +var good47 = /([AB]|[ab])*C/; +var bad92 = /([DE]|[de])*F/i; diff --git a/python/ql/lib/semmle/python/RegexTreeView.qll b/python/ql/lib/semmle/python/RegexTreeView.qll index cc4c08c6364..784ea06ca29 100644 --- a/python/ql/lib/semmle/python/RegexTreeView.qll +++ b/python/ql/lib/semmle/python/RegexTreeView.qll @@ -61,6 +61,14 @@ class RegExpLiteral extends TRegExpLiteral, RegExpParent { predicate isDotAll() { re.getAMode() = "DOTALL" } + predicate isIgnoreCase() { re.getAMode() = "IGNORECASE" } + + string getFlags() { + not exists(re.getAMode()) and result = "" + or + result = strictconcat(string mode | mode = re.getAMode() | mode, " | ") + } + override Regex getRegex() { result = re } string getPrimaryQLClass() { result = "RegExpLiteral" } diff --git a/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll b/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll index 43601b1d407..cdd42ce1540 100644 --- a/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll +++ b/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll @@ -164,6 +164,7 @@ class RelevantRegExpTerm extends RegExpTerm { /** * Holds if `term` is the chosen canonical representative for all terms with string representation `str`. + * The string representation includes which flags are used with the regular expression. * * Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s. * The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks. @@ -173,26 +174,54 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) { min(RelevantRegExpTerm t, Location loc, File file | loc = t.getLocation() and file = t.getFile() and - str = t.getRawValue() + str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm()) | t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn() ) } +/** + * Gets a string reperesentation of the flags used with the regular expression. + * Only the flags that are relevant for the canonicalization are included. + */ +string getCanonicalizationFlags(RegExpTerm root) { + root.isRootTerm() and + ( + RegExpFlags::isIgnoreCase(root) and + result = "i" + or + not RegExpFlags::isIgnoreCase(root) and + result = "" + ) +} + /** * An abstract input symbol, representing a set of concrete characters. */ private newtype TInputSymbol = /** An input symbol corresponding to character `c`. */ Char(string c) { - c = any(RegexpCharacterConstant cc | cc instanceof RelevantRegExpTerm).getValue().charAt(_) + c = + any(RegexpCharacterConstant cc | + cc instanceof RelevantRegExpTerm and + not RegExpFlags::isIgnoreCase(cc.getRootTerm()) + ).getValue().charAt(_) + or + // normalize to lower case if the regexp is case insensitive + c = + any(RegexpCharacterConstant cc, string char | + cc instanceof RelevantRegExpTerm and + RegExpFlags::isIgnoreCase(cc.getRootTerm()) and + char = cc.getValue().charAt(_) + | + char.toLowerCase() + ) } or /** * An input symbol representing all characters matched by * a (non-universal) character class that has string representation `charClassString`. */ CharClass(string charClassString) { - exists(RelevantRegExpTerm term | term.getRawValue() = charClassString) and exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) | recc instanceof RegExpCharacterClass and not recc.(RegExpCharacterClass).isUniversalClass() @@ -293,6 +322,19 @@ private module CharacterClasses { */ pragma[noinline] predicate hasChildThatMatches(RegExpCharacterClass cc, string char) { + if RegExpFlags::isIgnoreCase(cc.getRootTerm()) + then + // normalize everything to lower case if the regexp is case insensitive + exists(string c | hasChildThatMatchesIgnoringCasing(cc, c) | char = c.toLowerCase()) + else hasChildThatMatchesIgnoringCasing(cc, char) + } + + /** + * Holds if the character class `cc` has a child (constant or range) that matches `char`. + * Ignores whether the character class is inside a regular expression that ignores casing. + */ + pragma[noinline] + predicate hasChildThatMatchesIgnoringCasing(RegExpCharacterClass cc, string char) { exists(getCanonicalCharClass(cc)) and exists(RegExpTerm child | child = cc.getAChild() | char = child.(RegexpCharacterConstant).getValue() @@ -508,7 +550,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) } /** * Gets a state the NFA may be in after matching `t`. */ -private State after(RegExpTerm t) { +State after(RegExpTerm t) { exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt)) or exists(RegExpSequence seq, int i | t = seq.getChild(i) | @@ -537,7 +579,14 @@ private State after(RegExpTerm t) { predicate delta(State q1, EdgeLabel lbl, State q2) { exists(RegexpCharacterConstant s, int i | q1 = Match(s, i) and - lbl = Char(s.getValue().charAt(i)) and + ( + not RegExpFlags::isIgnoreCase(s.getRootTerm()) and + lbl = Char(s.getValue().charAt(i)) + or + // normalizing to lower case if ignorecase flag is set + RegExpFlags::isIgnoreCase(s.getRootTerm()) and + exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase())) + ) and ( q2 = Match(s, i + 1) or @@ -547,20 +596,20 @@ predicate delta(State q1, EdgeLabel lbl, State q2) { ) or exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) | - if dot.getLiteral().isDotAll() then lbl = Any() else lbl = Dot() + if RegExpFlags::isDotAll(dot.getRootTerm()) then lbl = Any() else lbl = Dot() ) or exists(RegExpCharacterClass cc | cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc) or q1 = before(cc) and - lbl = CharClass(cc.getRawValue()) and + lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and q2 = after(cc) ) or exists(RegExpCharacterClassEscape cc | q1 = before(cc) and - lbl = CharClass(cc.getRawValue()) and + lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and q2 = after(cc) ) or @@ -627,13 +676,27 @@ RegExpRoot getRoot(RegExpTerm term) { result = getRoot(term.getParent()) } -private newtype TState = +/** + * A state in the NFA. + */ +newtype TState = + /** + * A state representing that the NFA is about to match a term. + * `i` is used to index into multi-char literals. + */ Match(RelevantRegExpTerm t, int i) { i = 0 or exists(t.(RegexpCharacterConstant).getValue().charAt(i)) } or + /** + * An accept state, where exactly the given input string is accepted. + */ Accept(RegExpRoot l) { l.isRelevant() } or + /** + * An accept state, where the given input string, or any string that has this + * string as a prefix, is accepted. + */ AcceptAnySuffix(RegExpRoot l) { l.isRelevant() } /** diff --git a/python/ql/lib/semmle/python/security/performance/RegExpTreeView.qll b/python/ql/lib/semmle/python/security/performance/RegExpTreeView.qll index 56a6c90ffde..4fbe32f7e7e 100644 --- a/python/ql/lib/semmle/python/security/performance/RegExpTreeView.qll +++ b/python/ql/lib/semmle/python/security/performance/RegExpTreeView.qll @@ -18,3 +18,32 @@ predicate isExcluded(RegExpParent parent) { // we explicitly exclude these. count(int i | exists(parent.getRegex().getText().regexpFind("\\.\\*", i, _)) | i) > 10 } + +/** + * A module containing predicates for determining which flags a regular expression have. + */ +module RegExpFlags { + /** + * Holds if `root` has the `i` flag for case-insensitive matching. + */ + predicate isIgnoreCase(RegExpTerm root) { + root.isRootTerm() and + root.getLiteral().isIgnoreCase() + } + + /** + * Gets the flags for `root`, or the empty string if `root` has no flags. + */ + string getFlags(RegExpTerm root) { + root.isRootTerm() and + result = root.getLiteral().getFlags() + } + + /** + * Holds if `root` has the `s` flag for multi-line matching. + */ + predicate isDotAll(RegExpTerm root) { + root.isRootTerm() and + root.getLiteral().isDotAll() + } +} From 75a3f34e86e08ec44aa3554582a4d849f9957e6a Mon Sep 17 00:00:00 2001 From: Erik Krogh Kristensen Date: Wed, 1 Sep 2021 12:44:02 +0200 Subject: [PATCH 2/7] use if-else in `ReDoSUtil::getCanonicalizationFlags` Co-authored-by: Esben Sparre Andreasen --- .../semmle/javascript/security/performance/ReDoSUtil.qll | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll b/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll index cdd42ce1540..845276204a7 100644 --- a/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll +++ b/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll @@ -187,11 +187,10 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) { string getCanonicalizationFlags(RegExpTerm root) { root.isRootTerm() and ( - RegExpFlags::isIgnoreCase(root) and - result = "i" - or - not RegExpFlags::isIgnoreCase(root) and - result = "" + if RegExpFlags::isIgnoreCase(root) then + result = "i" + else + result = "" ) } From ff74fe1e03e70e092122e679d45bdceeff39fb8f Mon Sep 17 00:00:00 2001 From: Erik Krogh Kristensen Date: Wed, 1 Sep 2021 12:45:20 +0200 Subject: [PATCH 3/7] rename hasChildThatMatchesIgnoringCasing to hasChildThatMatchesIgnoringCasingFlags --- .../javascript/security/performance/ReDoSUtil.qll | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll b/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll index 845276204a7..0fdeccc05b2 100644 --- a/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll +++ b/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll @@ -186,12 +186,7 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) { */ string getCanonicalizationFlags(RegExpTerm root) { root.isRootTerm() and - ( - if RegExpFlags::isIgnoreCase(root) then - result = "i" - else - result = "" - ) + (if RegExpFlags::isIgnoreCase(root) then result = "i" else result = "") } /** @@ -324,16 +319,16 @@ private module CharacterClasses { if RegExpFlags::isIgnoreCase(cc.getRootTerm()) then // normalize everything to lower case if the regexp is case insensitive - exists(string c | hasChildThatMatchesIgnoringCasing(cc, c) | char = c.toLowerCase()) - else hasChildThatMatchesIgnoringCasing(cc, char) + exists(string c | hasChildThatMatchesIgnoringCasingFlags(cc, c) | char = c.toLowerCase()) + else hasChildThatMatchesIgnoringCasingFlags(cc, char) } /** * Holds if the character class `cc` has a child (constant or range) that matches `char`. - * Ignores whether the character class is inside a regular expression that ignores casing. + * Ignores whether the character class is inside a regular expression that has the ignore case flag. */ pragma[noinline] - predicate hasChildThatMatchesIgnoringCasing(RegExpCharacterClass cc, string char) { + predicate hasChildThatMatchesIgnoringCasingFlags(RegExpCharacterClass cc, string char) { exists(getCanonicalCharClass(cc)) and exists(RegExpTerm child | child = cc.getAChild() | char = child.(RegexpCharacterConstant).getValue() From 537450606e929a7d2d643d0e0defb1a384c87c8b Mon Sep 17 00:00:00 2001 From: Erik Krogh Kristensen Date: Wed, 1 Sep 2021 12:46:50 +0200 Subject: [PATCH 4/7] use a consistent comment about the ignore case flag --- .../lib/semmle/javascript/security/performance/ReDoSUtil.qll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll b/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll index 0fdeccc05b2..302510a1c55 100644 --- a/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll +++ b/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll @@ -201,7 +201,7 @@ private newtype TInputSymbol = not RegExpFlags::isIgnoreCase(cc.getRootTerm()) ).getValue().charAt(_) or - // normalize to lower case if the regexp is case insensitive + // normalize everything to lower case if the regexp is case insensitive c = any(RegexpCharacterConstant cc, string char | cc instanceof RelevantRegExpTerm and @@ -577,7 +577,7 @@ predicate delta(State q1, EdgeLabel lbl, State q2) { not RegExpFlags::isIgnoreCase(s.getRootTerm()) and lbl = Char(s.getValue().charAt(i)) or - // normalizing to lower case if ignorecase flag is set + // normalize everything to lower case if the regexp is case insensitive RegExpFlags::isIgnoreCase(s.getRootTerm()) and exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase())) ) and From a3289fabe196e17900339045d97fb933c1169ff9 Mon Sep 17 00:00:00 2001 From: Erik Krogh Kristensen Date: Wed, 1 Sep 2021 12:47:06 +0200 Subject: [PATCH 5/7] sync ReDoSUtil with python --- .../python/security/performance/ReDoSUtil.qll | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll b/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll index cdd42ce1540..302510a1c55 100644 --- a/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll +++ b/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll @@ -186,13 +186,7 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) { */ string getCanonicalizationFlags(RegExpTerm root) { root.isRootTerm() and - ( - RegExpFlags::isIgnoreCase(root) and - result = "i" - or - not RegExpFlags::isIgnoreCase(root) and - result = "" - ) + (if RegExpFlags::isIgnoreCase(root) then result = "i" else result = "") } /** @@ -207,7 +201,7 @@ private newtype TInputSymbol = not RegExpFlags::isIgnoreCase(cc.getRootTerm()) ).getValue().charAt(_) or - // normalize to lower case if the regexp is case insensitive + // normalize everything to lower case if the regexp is case insensitive c = any(RegexpCharacterConstant cc, string char | cc instanceof RelevantRegExpTerm and @@ -325,16 +319,16 @@ private module CharacterClasses { if RegExpFlags::isIgnoreCase(cc.getRootTerm()) then // normalize everything to lower case if the regexp is case insensitive - exists(string c | hasChildThatMatchesIgnoringCasing(cc, c) | char = c.toLowerCase()) - else hasChildThatMatchesIgnoringCasing(cc, char) + exists(string c | hasChildThatMatchesIgnoringCasingFlags(cc, c) | char = c.toLowerCase()) + else hasChildThatMatchesIgnoringCasingFlags(cc, char) } /** * Holds if the character class `cc` has a child (constant or range) that matches `char`. - * Ignores whether the character class is inside a regular expression that ignores casing. + * Ignores whether the character class is inside a regular expression that has the ignore case flag. */ pragma[noinline] - predicate hasChildThatMatchesIgnoringCasing(RegExpCharacterClass cc, string char) { + predicate hasChildThatMatchesIgnoringCasingFlags(RegExpCharacterClass cc, string char) { exists(getCanonicalCharClass(cc)) and exists(RegExpTerm child | child = cc.getAChild() | char = child.(RegexpCharacterConstant).getValue() @@ -583,7 +577,7 @@ predicate delta(State q1, EdgeLabel lbl, State q2) { not RegExpFlags::isIgnoreCase(s.getRootTerm()) and lbl = Char(s.getValue().charAt(i)) or - // normalizing to lower case if ignorecase flag is set + // normalize everything to lower case if the regexp is case insensitive RegExpFlags::isIgnoreCase(s.getRootTerm()) and exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase())) ) and From df04c5044c085463065f1335fa266289bd2bb44f Mon Sep 17 00:00:00 2001 From: Erik Krogh Kristensen Date: Thu, 2 Sep 2021 08:54:39 +0200 Subject: [PATCH 6/7] use `concat` instead of `strictconcat` in RegexTreeView.qll --- python/ql/lib/semmle/python/RegexTreeView.qll | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/python/ql/lib/semmle/python/RegexTreeView.qll b/python/ql/lib/semmle/python/RegexTreeView.qll index 784ea06ca29..9784e057402 100644 --- a/python/ql/lib/semmle/python/RegexTreeView.qll +++ b/python/ql/lib/semmle/python/RegexTreeView.qll @@ -63,11 +63,7 @@ class RegExpLiteral extends TRegExpLiteral, RegExpParent { predicate isIgnoreCase() { re.getAMode() = "IGNORECASE" } - string getFlags() { - not exists(re.getAMode()) and result = "" - or - result = strictconcat(string mode | mode = re.getAMode() | mode, " | ") - } + string getFlags() { result = concat(string mode | mode = re.getAMode() | mode, " | ") } override Regex getRegex() { result = re } From 1ad204d89e0786d252402bc863c2412ca21c9d17 Mon Sep 17 00:00:00 2001 From: Erik Krogh Kristensen Date: Thu, 2 Sep 2021 09:15:43 +0200 Subject: [PATCH 7/7] make after and TState private in ReDoSUtil --- .../lib/semmle/javascript/security/performance/ReDoSUtil.qll | 4 ++-- .../ql/lib/semmle/python/security/performance/ReDoSUtil.qll | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll b/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll index 302510a1c55..65431d7179e 100644 --- a/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll +++ b/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll @@ -544,7 +544,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) } /** * Gets a state the NFA may be in after matching `t`. */ -State after(RegExpTerm t) { +private State after(RegExpTerm t) { exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt)) or exists(RegExpSequence seq, int i | t = seq.getChild(i) | @@ -673,7 +673,7 @@ RegExpRoot getRoot(RegExpTerm term) { /** * A state in the NFA. */ -newtype TState = +private newtype TState = /** * A state representing that the NFA is about to match a term. * `i` is used to index into multi-char literals. diff --git a/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll b/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll index 302510a1c55..65431d7179e 100644 --- a/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll +++ b/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll @@ -544,7 +544,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) } /** * Gets a state the NFA may be in after matching `t`. */ -State after(RegExpTerm t) { +private State after(RegExpTerm t) { exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt)) or exists(RegExpSequence seq, int i | t = seq.getChild(i) | @@ -673,7 +673,7 @@ RegExpRoot getRoot(RegExpTerm term) { /** * A state in the NFA. */ -newtype TState = +private newtype TState = /** * A state representing that the NFA is about to match a term. * `i` is used to index into multi-char literals.