mirror of
https://github.com/github/codeql.git
synced 2026-02-11 04:31:05 +01:00
add further normalization of char classes
This commit is contained in:
@@ -199,7 +199,7 @@ CharClass getCanonicalCharClass(RegExpTerm term) {
|
||||
/**
|
||||
* Holds if `a` and `b` are input symbols from the same regexp.
|
||||
*/
|
||||
private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
|
||||
private predicate sharesRoot(InputSymbol a, InputSymbol b) {
|
||||
exists(RegExpRoot root |
|
||||
belongsTo(a, root) and
|
||||
belongsTo(b, root)
|
||||
@@ -209,7 +209,7 @@ private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
|
||||
/**
|
||||
* Holds if `a` is an input symbol from a regexp that has root `root`.
|
||||
*/
|
||||
private predicate belongsTo(TInputSymbol a, RegExpRoot root) {
|
||||
private predicate belongsTo(InputSymbol a, RegExpRoot root) {
|
||||
exists(State s | getRoot(s.getRepr()) = root |
|
||||
delta(s, a, _)
|
||||
or
|
||||
@@ -378,6 +378,13 @@ private module CharacterClasses {
|
||||
)
|
||||
}
|
||||
|
||||
/**
 * Gets the case-normalized form of `char` for the term `cc`.
 *
 * If `RegExpFlags::isIgnoreCase` holds for the root term of `cc`, the result is
 * `char` converted to lower case; otherwise the result is `char` unchanged.
 */
bindingset[char, cc]
|
||||
private string caseNormalize(string char, RegExpTerm cc) {
|
||||
if RegExpFlags::isIgnoreCase(cc.getRootTerm())
|
||||
then result = char.toLowerCase()
|
||||
else result = char
|
||||
}
|
||||
|
||||
/**
|
||||
* An implementation of `CharacterClass` for positive (non inverted) character classes.
|
||||
*/
|
||||
@@ -386,7 +393,7 @@ private module CharacterClasses {
|
||||
|
||||
PositiveCharacterClass() { this = getCanonicalCharClass(cc) and not cc.isInverted() }
|
||||
|
||||
override string getARelevantChar() { result = getAMentionedChar(cc) }
|
||||
override string getARelevantChar() { result = caseNormalize(getAMentionedChar(cc), cc) }
|
||||
|
||||
override predicate matches(string char) { hasChildThatMatches(cc, char) }
|
||||
}
|
||||
@@ -400,8 +407,8 @@ private module CharacterClasses {
|
||||
InvertedCharacterClass() { this = getCanonicalCharClass(cc) and cc.isInverted() }
|
||||
|
||||
override string getARelevantChar() {
|
||||
result = nextChar(getAMentionedChar(cc)) or
|
||||
nextChar(result) = getAMentionedChar(cc)
|
||||
result = nextChar(caseNormalize(getAMentionedChar(cc), cc)) or
|
||||
nextChar(result) = caseNormalize(getAMentionedChar(cc), cc)
|
||||
}
|
||||
|
||||
bindingset[char]
|
||||
@@ -428,13 +435,12 @@ private module CharacterClasses {
|
||||
*/
|
||||
private class PositiveCharacterClassEscape extends CharacterClass {
|
||||
string charClass;
|
||||
RegExpTerm cc;
|
||||
|
||||
PositiveCharacterClassEscape() {
|
||||
exists(RegExpTerm cc |
|
||||
isEscapeClass(cc, charClass) and
|
||||
this = getCanonicalCharClass(cc) and
|
||||
charClass = ["d", "s", "w"]
|
||||
)
|
||||
isEscapeClass(cc, charClass) and
|
||||
this = getCanonicalCharClass(cc) and
|
||||
charClass = ["d", "s", "w"]
|
||||
}
|
||||
|
||||
override string getARelevantChar() {
|
||||
@@ -445,7 +451,9 @@ private module CharacterClasses {
|
||||
result = " "
|
||||
or
|
||||
charClass = "w" and
|
||||
result = ["a", "Z", "_", "0", "9"]
|
||||
if RegExpFlags::isIgnoreCase(cc.getRootTerm())
|
||||
then result = ["a", "z", "_", "0", "9"]
|
||||
else result = ["a", "Z", "_", "0", "9"]
|
||||
}
|
||||
|
||||
override predicate matches(string char) { classEscapeMatches(charClass, char) }
|
||||
@@ -492,6 +500,34 @@ private module CharacterClasses {
|
||||
not classEscapeMatches(charClass.toLowerCase(), char)
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Gets a representative for all char classes that match the same chars as `c`.
 *
 * Two char classes are treated as equivalent when `getMormalizationString` yields
 * the same string for both. Among the equivalent classes `cc = CharClass(raw)`,
 * the representative is the one the `min` aggregate selects when ordering by `raw`.
 */
|
||||
CharacterClass normalize(CharacterClass c) {
|
||||
exists(string normalization |
|
||||
normalization = getMormalizationString(c) and
|
||||
result =
|
||||
min(CharacterClass cc, string raw |
|
||||
getMormalizationString(cc) = normalization and cc = CharClass(raw)
|
||||
|
|
||||
cc order by raw
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
/**
 * Gets a string representing all the chars matched by `c`, restricted to the
 * chars produced by `CharacterClasses::getARelevantChar()`.
 *
 * For a `PositiveCharacterClass` or `PositiveCharacterClassEscape`, the result is
 * the concatenation of the relevant chars that `c` matches.
 * For an `InvertedCharacterClass` or `NegativeCharacterClassEscape`, the result is
 * `"nn:"` followed by the concatenation of the relevant chars that `c` does not match.
 */
|
||||
private string getMormalizationString(CharacterClass c) {
|
||||
(c instanceof PositiveCharacterClass or c instanceof PositiveCharacterClassEscape) and
|
||||
result = concat(string char | c.matches(char) and char = CharacterClasses::getARelevantChar())
|
||||
or
|
||||
(c instanceof InvertedCharacterClass or c instanceof NegativeCharacterClassEscape) and
|
||||
// The string produced by the concat cannot contain repeated chars,
|
||||
// so by starting the below with "nn:" we can guarantee that
|
||||
// it will not overlap with the above case.
|
||||
// A negative char class can never match the same chars as a positive one, so we don't miss any results from this.
|
||||
result =
|
||||
"nn:" +
|
||||
concat(string char | not c.matches(char) and char = CharacterClasses::getARelevantChar())
|
||||
}
|
||||
}
|
||||
|
||||
private class EdgeLabel extends TInputSymbol {
|
||||
@@ -620,13 +656,17 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
|
||||
cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
|
||||
or
|
||||
q1 = before(cc) and
|
||||
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
|
||||
lbl =
|
||||
CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
|
||||
getCanonicalizationFlags(cc.getRootTerm()))) and
|
||||
q2 = after(cc)
|
||||
)
|
||||
or
|
||||
exists(RegExpTerm cc | isEscapeClass(cc, _) |
|
||||
q1 = before(cc) and
|
||||
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
|
||||
lbl =
|
||||
CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
|
||||
getCanonicalizationFlags(cc.getRootTerm()))) and
|
||||
q2 = after(cc)
|
||||
)
|
||||
or
|
||||
|
||||
Reference in New Issue
Block a user