Introduce canonical representatives of RegExpTerms to decrease the number of InputSymbols in the NFA

This commit is contained in:
Erik Krogh Kristensen
2020-12-17 17:36:44 +01:00
parent 34dda6d38b
commit 7ce91e9146
2 changed files with 52 additions and 17 deletions

View File

@@ -123,6 +123,23 @@ private class RegexpCharacterConstant extends RegExpConstant {
RegexpCharacterConstant() { this.isCharacter() }
}
/**
 * Holds if `term` is the canonical representative chosen for all terms whose string
 * representation (raw value) is `str`.
 *
 * Among the terms sharing the raw value `str`, the representative is the one ranked first
 * when ordering by the relative path of the containing file, then by start line, then by
 * start column.
 *
 * Using canonical representatives gives a huge performance boost when working with tuples
 * containing multiple `InputSymbol`s: in some larger benchmarks the number of `InputSymbol`s
 * is decreased by 3 orders of magnitude or more.
 */
private predicate isCannonicalTerm(RegExpTerm term, string str) {
  term =
    rank[1](RegExpTerm candidate, File file, Location loc |
      str = candidate.getRawValue() and
      file = candidate.getFile() and
      loc = candidate.getLocation()
    |
      candidate order by file.getRelativePath(), loc.getStartLine(), loc.getStartColumn()
    )
}
/**
* An abstract input symbol, representing a set of concrete characters.
*/
@@ -133,11 +150,11 @@ private newtype TInputSymbol =
} or
/**
* An input symbol representing all characters matched by
* (non-universal) character class `recc`.
* a (non-universal) character class that has string representation `charClassString`.
*/
CharClass(RegExpTerm recc) {
getRoot(recc).isRelevant() and
(
CharClass(string charClassString) {
exists(RegExpTerm term | term.getRawValue() = charClassString | getRoot(term).isRelevant()) and
exists(RegExpTerm recc | isCannonicalTerm(recc, charClassString) |
recc instanceof RegExpCharacterClass and
not recc.(RegExpCharacterClass).isUniversalClass()
or
@@ -168,8 +185,11 @@ private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
private predicate belongsTo(TInputSymbol a, RegExpRoot root) {
exists(RegExpTerm term | getRoot(term) = root |
a = Char(term.(RegexpCharacterConstant).getValue().charAt(_))
or
a = CharClass(term)
)
or
exists(string str, RegExpTerm term | a = CharClass(str) |
term.getRawValue() = str and
getRoot(term) = root
)
}
@@ -182,7 +202,7 @@ class InputSymbol extends TInputSymbol {
string toString() {
this = Char(result)
or
result = any(RegExpTerm recc | this = CharClass(recc)).toString()
this = CharClass(result)
or
this = Dot() and result = "."
or
@@ -228,7 +248,10 @@ private module CharacterClasses {
*/
pragma[noinline]
predicate hasChildThatMatches(RegExpCharacterClass cc, string char) {
exists(CharClass(cc)) and
exists(string str |
isCannonicalTerm(cc, str) and
exists(CharClass(str))
) and
exists(RegExpTerm child | child = cc.getAChild() |
char = child.(RegexpCharacterConstant).getValue()
or
@@ -324,7 +347,9 @@ private module CharacterClasses {
private class PositiveCharacterClass extends CharacterClass {
RegExpCharacterClass cc;
PositiveCharacterClass() { this = CharClass(cc) and not cc.isInverted() }
PositiveCharacterClass() {
exists(string str | isCannonicalTerm(cc, str) | this = CharClass(str) and not cc.isInverted())
}
override string getARelevantChar() { result = getAMentionedChar(cc) }
@@ -337,7 +362,9 @@ private module CharacterClasses {
private class InvertedCharacterClass extends CharacterClass {
RegExpCharacterClass cc;
InvertedCharacterClass() { this = CharClass(cc) and cc.isInverted() }
InvertedCharacterClass() {
exists(string str | isCannonicalTerm(cc, str) | this = CharClass(str) and cc.isInverted())
}
override string getARelevantChar() {
result = nextChar(getAMentionedChar(cc)) or
@@ -374,7 +401,11 @@ private module CharacterClasses {
private class PositiveCharacterClassEscape extends CharacterClass {
RegExpCharacterClassEscape cc;
PositiveCharacterClassEscape() { this = CharClass(cc) and cc.getValue() = ["d", "s", "w"] }
PositiveCharacterClassEscape() {
exists(string str | isCannonicalTerm(cc, str) |
this = CharClass(str) and cc.getValue() = ["d", "s", "w"]
)
}
override string getARelevantChar() {
cc.getValue() = "d" and
@@ -407,7 +438,11 @@ private module CharacterClasses {
private class NegativeCharacterClassEscape extends CharacterClass {
RegExpCharacterClassEscape cc;
NegativeCharacterClassEscape() { this = CharClass(cc) and cc.getValue() = ["D", "S", "W"] }
NegativeCharacterClassEscape() {
exists(string str | isCannonicalTerm(cc, str) |
this = CharClass(str) and cc.getValue() = ["D", "S", "W"]
)
}
override string getARelevantChar() {
cc.getValue() = "D" and
@@ -490,13 +525,13 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
or
q1 = before(cc) and
lbl = CharClass(cc) and
lbl = CharClass(cc.getRawValue()) and
q2 = after(cc)
)
or
exists(RegExpCharacterClassEscape cc |
q1 = before(cc) and
lbl = CharClass(cc) and
lbl = CharClass(cc.getRawValue()) and
q2 = after(cc)
)
or

View File

@@ -15,7 +15,7 @@
| regexplib/email.js:25:106:25:117 | [a-zA-Z0-9]+ | This part of the regular expression may cause exponential backtracking on strings starting with '0@0' and containing many repetitions of '0'. |
| regexplib/email.js:25:212:25:223 | [a-zA-Z0-9]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
| regexplib/email.js:25:251:25:262 | [a-zA-Z0-9]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
| regexplib/email.js:32:10:32:25 | (?:\\w[\\.\\-\\+]?)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
| regexplib/email.js:32:10:32:25 | (?:\\w[\\.\\-\\+]?)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
| regexplib/email.js:33:16:33:22 | [-.\\w]* | This part of the regular expression may cause exponential backtracking on strings starting with '0' and containing many repetitions of '0'. |
| regexplib/email.js:33:38:33:51 | ([0-9a-zA-Z])+ | This part of the regular expression may cause exponential backtracking on strings starting with '0@' and containing many repetitions of '00.'. |
| regexplib/email.js:33:53:33:58 | [-\\w]* | This part of the regular expression may cause exponential backtracking on strings starting with '0@0' and containing many repetitions of '0'. |
@@ -41,7 +41,7 @@
| regexplib/strings.js:57:17:57:19 | \\d+ | This part of the regular expression may cause exponential backtracking on strings starting with '?se[' and containing many repetitions of '9'. |
| regexplib/strings.js:81:17:81:19 | \\d+ | This part of the regular expression may cause exponential backtracking on strings starting with '?se[' and containing many repetitions of '9'. |
| regexplib/uri.js:3:128:3:129 | .* | This part of the regular expression may cause exponential backtracking on strings starting with 'ftp:// /' and containing many repetitions of '/'. |
| regexplib/uri.js:3:200:3:215 | (?:\\&?\\w+\\=\\w+)* | This part of the regular expression may cause exponential backtracking on strings starting with 'ftp:// a="' and containing many repetitions of '0=0'. |
| regexplib/uri.js:3:200:3:215 | (?:\\&?\\w+\\=\\w+)* | This part of the regular expression may cause exponential backtracking on strings starting with 'ftp:// a="' and containing many repetitions of 'a=0'. |
| regexplib/uri.js:5:42:5:43 | .* | This part of the regular expression may cause exponential backtracking on strings starting with 'A:\\\\a' and containing many repetitions of '\\\\a'. |
| regexplib/uri.js:17:42:17:43 | .* | This part of the regular expression may cause exponential backtracking on strings starting with 'A:\\\\a' and containing many repetitions of '\\\\a'. |
| regexplib/uri.js:38:35:38:40 | [a-z]+ | This part of the regular expression may cause exponential backtracking on strings starting with 'a.' and containing many repetitions of 'a'. |
@@ -109,7 +109,7 @@
| tst.js:227:20:227:20 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'W' and containing many repetitions of 'bW'. |
| tst.js:239:16:239:17 | ab | This part of the regular expression may cause exponential backtracking on strings starting with 'a' and containing many repetitions of 'ab'. |
| tst.js:245:15:245:21 | [\\n\\s]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |
| tst.js:254:87:254:89 | \\w* | This part of the regular expression may cause exponential backtracking on strings starting with 'foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz' and containing many repetitions of '0foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz'. |
| tst.js:254:87:254:89 | \\w* | This part of the regular expression may cause exponential backtracking on strings starting with 'foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz' and containing many repetitions of 'afoobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz'. |
| tst.js:257:14:257:116 | (.thisisagoddamnlongstringforstresstestingthequery\|\\sthisisagoddamnlongstringforstresstestingthequery)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of ' thisisagoddamnlongstringforstresstestingthequery'. |
| tst.js:260:14:260:77 | (thisisagoddamnlongstringforstresstestingthequery\|this\\w+query)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'thisisagoddamnlongstringforstresstestingthequery'. |
| tst.js:272:21:272:22 | b+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'b'. |