improve performance of suffix checking

This commit is contained in:
Erik Krogh Kristensen
2020-12-17 20:51:43 +01:00
parent 6369374224
commit 05569187b4
2 changed files with 18 additions and 8 deletions

View File

@@ -838,7 +838,7 @@ private module SuffixConstruction {
* Holds if there likely exists a suffix starting from `s` that leads to the regular expression being rejected.
* This predicate might find impossible suffixes when searching for suffixes of length > 1, which can cause FPs.
*/
pragma[nomagic]
pragma[noinline]
private predicate isLikelyRejectable(StateInPumpableRegexp s) {
// exists a reject edge with some char.
hasRejectEdge(s)
@@ -857,10 +857,11 @@ private module SuffixConstruction {
/**
* Holds if there is likely a non-empty suffix leading to rejection starting in `s`.
*/
pragma[noinline]
predicate hasEdgeToLikelyRejectable(StateInPumpableRegexp s) {
// all edges (at least one) with some char leads to another state that is rejectable.
// the `next` states might not share a common suffix, which can cause FPs.
exists(string char | char = relevant() |
exists(string char | char = relevant(getRoot(s.getRepr())) |
forex(State next | deltaClosedChar(s, char, next) | isLikelyRejectable(next))
)
}
@@ -871,21 +872,31 @@ private module SuffixConstruction {
* `prev` to `next` that the character symbol `char`.
*/
predicate deltaClosedChar(StateInPumpableRegexp prev, string char, StateInPumpableRegexp next) {
char = relevant() and
deltaClosed(prev, getAnInputSymbolMatching(char), next)
deltaClosed(prev, getAnInputSymbolMatchingRelevant(char), next)
}
pragma[noinline]
InputSymbol getAnInputSymbolMatchingRelevant(string char) {
char = relevant(_) and
result = getAnInputSymbolMatching(char)
}
/**
* Gets a char used for finding possible suffixes.
* Gets a char used for finding possible suffixes inside `root`.
*/
private string relevant() { result = CharacterClasses::getARelevantChar() }
pragma[noinline]
private string relevant(RegExpRoot root) {
result = ["a", "9", "|", "\n", " "]
or
exists(InputSymbol s | belongsTo(s, root) | result = intersect(s, _))
}
/**
* Holds if there is no edge from `s` labeled `char` in our NFA.
* The NFA does not model reject states, so the above is the same as saying there is a reject edge.
*/
private predicate hasRejectEdge(State s) {
exists(string char | char = relevant() | not deltaClosedChar(s, char, _))
exists(string char | char = relevant(getRoot(s.getRepr())) | not deltaClosedChar(s, char, _))
}
/**

View File

@@ -348,7 +348,6 @@
| tst.js:23:33:23:40 | [\\s\\S]*? | Strings starting with '(*(*' and with many repetitions of '(*' can start matching anywhere after the start of the preceeding (?:[\\s\\S]*?\\(\\*[\\s\\S]*?\\*\\))* |
| tst.js:23:47:23:54 | [\\s\\S]*? | Strings starting with '(*' and with many repetitions of '(*a' can start matching anywhere after the start of the preceeding (?:[\\s\\S]*?\\(\\*[\\s\\S]*?\\*\\))* |
| tst.js:28:24:28:25 | .* | Strings starting with '!\|' and with many repetitions of '\|' can start matching anywhere after the start of the preceeding .* |
| tst.js:28:59:28:60 | .* | Strings starting with '!\|\\n-\|\\n\|' and with many repetitions of '\|' can start matching anywhere after the start of the preceeding .* |
| tst.js:31:23:31:24 | .* | Strings starting with '!\|' and with many repetitions of '\|' can start matching anywhere after the start of the preceeding .* |
| tst.js:31:54:31:55 | .* | Strings starting with '!\|\\n-\|\\n' and with many repetitions of '\|\\n\|' can start matching anywhere after the start of the preceeding .* |
| tst.js:31:58:31:59 | .* | Strings starting with '!\|\\n-\|\\n\|' and with many repetitions of '\|' can start matching anywhere after the start of the preceeding .* |