add redos support for the simplest possible inverted char class

This commit is contained in:
Erik Krogh Kristensen
2020-10-31 12:37:58 +01:00
parent d04f3df1cd
commit 321cf09bd8
3 changed files with 63 additions and 1 deletions

View File

@@ -146,6 +146,15 @@ newtype TInputSymbol =
not recc.isInverted() and
not recc.isUniversalClass()
} or
/**
* An input symbol representing all characters matched by
* the inverted (non-universal) character class `recc`.
*/
InvertedCharClass(RegExpCharacterClass recc) {
getRoot(recc).isRelevant() and
recc.isInverted() and
not recc.isUniversalClass()
} or
/** An input symbol representing all characters matched by `.`. */
Dot() or
/** An input symbol representing all characters. */
@@ -164,6 +173,8 @@ class InputSymbol extends TInputSymbol {
or
result = any(RegExpCharacterClass recc | this = CharClass(recc)).toString()
or
result = any(RegExpCharacterClass recc | this = InvertedCharClass(recc)).toString()
or
this = Dot() and result = "."
or
this = Any() and result = "[^]"
@@ -266,7 +277,9 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
exists(RegExpCharacterClass cc |
cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
or
q1 = before(cc) and lbl = CharClass(cc) and q2 = after(cc)
q1 = before(cc) and
(lbl = CharClass(cc) or lbl = InvertedCharClass(cc)) and
q2 = after(cc)
)
or
exists(RegExpAlt alt | lbl = Epsilon() | q1 = before(alt) and q2 = before(alt.getAChild()))
@@ -440,6 +453,16 @@ string intersect(InputSymbol c, InputSymbol d) {
d = Any()
)
or
exists(RegExpCharacterClass cc | c = InvertedCharClass(cc) and result = chooseFromInverted(cc) |
// TODO: Not done here - later commits will add more
//d = InvertedCharClass(cc)
//or
//d = Dot() and
//not (result = "\n" or result = "\r")
//or
d = Any()
)
or
exists(RegExpCharacterClass cc | c = CharClass(cc) and result = choose(cc) |
d = CharClass(cc)
or
@@ -465,6 +488,7 @@ string intersect(InputSymbol c, InputSymbol d) {
* Gets a character matched by character class `cc`.
*/
string choose(RegExpCharacterClass cc) {
exists(CharClass(cc)) and
result =
min(string c |
exists(RegExpTerm child | child = cc.getAChild() |
@@ -474,6 +498,39 @@ string choose(RegExpCharacterClass cc) {
)
}
/**
 * Gets the successor of `c` in the simplified table defined by `ascii`.
 *
 * Has no result when `c` is not in that table, or when `c` is the last
 * char of the table.
 */
string nextChar(string c) { ascii(result) = ascii(c) + 1 }
/**
 * Gets an approximate ASCII code for `char`.
 *
 * The result is the position of `char` among the easily printable chars
 * listed below, once those chars have been sorted; it is therefore
 * order-preserving but not the real ASCII code.
 * Chars that are not easily printable (newline, tab, null, etc.) have no result.
 */
int ascii(string char) {
  exists(string printable |
    printable =
      "! \"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"
  |
    // `rank` sorts the distinct chars, so their order in the literal is irrelevant.
    char = rank[result](string candidate | exists(int idx | candidate = printable.charAt(idx)))
  )
}
/**
 * Picks a char that the inverted char class `cc` matches.
 *
 * The pick is the char that follows (per `nextChar`) the largest char
 * mentioned in `cc`, either as a constant or as the upper end of a range.
 */
string chooseFromInverted(RegExpCharacterClass cc) {
  exists(InvertedCharClass(cc)) and
  exists(string largest |
    // The largest char excluded by the class.
    largest =
      max(string member |
        member = cc.getAChild().(RegExpConstant).getValue()
        or
        cc.getAChild().(RegExpCharacterRange).isRange(_, member)
      )
  |
    result = nextChar(largest)
  )
}
/**
* Gets a string corresponding to the trace `t`.
*/

View File

@@ -52,3 +52,4 @@
| tst.js:77:14:77:21 | (a\|aa?)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
| tst.js:83:14:83:20 | (.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |
| tst.js:89:25:89:32 | (a\|aa?)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
| tst.js:95:15:95:25 | ([^]\|[^a])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'b'. |

View File

@@ -90,3 +90,7 @@ var bad17 = new RegExp('(a|aa?)*b');
// GOOD - not used as regexp
var good9 = '(a|aa?)*b';
// NOT GOOD
var bad18 = /(([^]|[^a])*)"/;