add further normalization of char classes

This commit is contained in:
Erik Krogh Kristensen
2022-02-14 18:49:57 +01:00
parent 3be4a86acd
commit 7fb3d81d2f
9 changed files with 415 additions and 252 deletions

View File

@@ -199,7 +199,7 @@ CharClass getCanonicalCharClass(RegExpTerm term) {
/**
* Holds if `a` and `b` are input symbols from the same regexp.
*/
private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
private predicate sharesRoot(InputSymbol a, InputSymbol b) {
exists(RegExpRoot root |
belongsTo(a, root) and
belongsTo(b, root)
@@ -209,7 +209,7 @@ private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
/**
* Holds if `a` is an input symbol from a regexp that has root `root`.
*/
private predicate belongsTo(TInputSymbol a, RegExpRoot root) {
private predicate belongsTo(InputSymbol a, RegExpRoot root) {
exists(State s | getRoot(s.getRepr()) = root |
delta(s, a, _)
or
@@ -378,6 +378,13 @@ private module CharacterClasses {
)
}
/**
 * Gets the case-normalized form of `char` for the regexp that contains `cc`:
 * the lower-cased `char` when the root term of `cc` is flagged as ignore-case,
 * and `char` unchanged otherwise.
 */
bindingset[char, cc]
private string caseNormalize(string char, RegExpTerm cc) {
  // Case-insensitive regexp: fold to lower case.
  RegExpFlags::isIgnoreCase(cc.getRootTerm()) and
  result = char.toLowerCase()
  or
  // Case-sensitive regexp: keep the char exactly as mentioned.
  not RegExpFlags::isIgnoreCase(cc.getRootTerm()) and
  result = char
}
/**
* An implementation of `CharacterClass` for positive (non inverted) character classes.
*/
@@ -386,7 +393,7 @@ private module CharacterClasses {
PositiveCharacterClass() { this = getCanonicalCharClass(cc) and not cc.isInverted() }
override string getARelevantChar() { result = getAMentionedChar(cc) }
override string getARelevantChar() { result = caseNormalize(getAMentionedChar(cc), cc) }
override predicate matches(string char) { hasChildThatMatches(cc, char) }
}
@@ -400,8 +407,8 @@ private module CharacterClasses {
InvertedCharacterClass() { this = getCanonicalCharClass(cc) and cc.isInverted() }
override string getARelevantChar() {
result = nextChar(getAMentionedChar(cc)) or
nextChar(result) = getAMentionedChar(cc)
result = nextChar(caseNormalize(getAMentionedChar(cc), cc)) or
nextChar(result) = caseNormalize(getAMentionedChar(cc), cc)
}
bindingset[char]
@@ -428,13 +435,12 @@ private module CharacterClasses {
*/
private class PositiveCharacterClassEscape extends CharacterClass {
string charClass;
RegExpTerm cc;
PositiveCharacterClassEscape() {
exists(RegExpTerm cc |
isEscapeClass(cc, charClass) and
this = getCanonicalCharClass(cc) and
charClass = ["d", "s", "w"]
)
isEscapeClass(cc, charClass) and
this = getCanonicalCharClass(cc) and
charClass = ["d", "s", "w"]
}
override string getARelevantChar() {
@@ -445,7 +451,9 @@ private module CharacterClasses {
result = " "
or
charClass = "w" and
result = ["a", "Z", "_", "0", "9"]
if RegExpFlags::isIgnoreCase(cc.getRootTerm())
then result = ["a", "z", "_", "0", "9"]
else result = ["a", "Z", "_", "0", "9"]
}
override predicate matches(string char) { classEscapeMatches(charClass, char) }
@@ -492,6 +500,34 @@ private module CharacterClasses {
not classEscapeMatches(charClass.toLowerCase(), char)
}
}
/**
 * Gets a representative for all char classes that match the same chars as `c`.
 *
 * All char classes that share the normalization string of `c` are considered,
 * and the one whose raw `CharClass` string orders first is chosen.
 */
CharacterClass normalize(CharacterClass c) {
  exists(string normalization |
    normalization = getNormalizationString(c) and
    result =
      min(CharacterClass cc, string raw |
        getNormalizationString(cc) = normalization and cc = CharClass(raw)
      |
        cc order by raw
      )
  )
}

/**
 * Gets a string representing all the chars matched by `c`.
 *
 * For positive char classes and positive char class escapes, this is the
 * concatenation of every relevant char that `c` matches.
 * For inverted char classes and negative char class escapes, this is `"nn:"`
 * followed by the concatenation of every relevant char that `c` does not match.
 */
private string getNormalizationString(CharacterClass c) {
  (c instanceof PositiveCharacterClass or c instanceof PositiveCharacterClassEscape) and
  result = concat(string char | c.matches(char) and char = CharacterClasses::getARelevantChar())
  or
  (c instanceof InvertedCharacterClass or c instanceof NegativeCharacterClassEscape) and
  // The string produced by the `concat` cannot contain repeated chars,
  // so by starting the string below with "nn" we guarantee that
  // it will not overlap with the positive case above.
  // A negative char class can never match the same chars as a positive one,
  // so keeping the two cases apart does not lose any results.
  result =
    "nn:" +
      concat(string char | not c.matches(char) and char = CharacterClasses::getARelevantChar())
}
}
private class EdgeLabel extends TInputSymbol {
@@ -620,13 +656,17 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
or
q1 = before(cc) and
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
lbl =
CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
getCanonicalizationFlags(cc.getRootTerm()))) and
q2 = after(cc)
)
or
exists(RegExpTerm cc | isEscapeClass(cc, _) |
q1 = before(cc) and
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
lbl =
CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
getCanonicalizationFlags(cc.getRootTerm()))) and
q2 = after(cc)
)
or