Improve handling of hex escapes; and support some named character classes

This commit is contained in:
Joe Farebrother
2022-02-21 16:39:59 +00:00
parent 5143585080
commit 57ba8a4d1b
3 changed files with 96 additions and 23 deletions

View File

@@ -55,7 +55,7 @@ class RegExpParent extends TRegExpParent {
string toString() { result = "RegExpParent" }
/** Gets the `i`th child term. */
abstract RegExpTerm getChild(int i);
RegExpTerm getChild(int i) { none() }
/** Gets a child term . */
RegExpTerm getAChild() { result = this.getChild(_) }
@@ -143,26 +143,6 @@ class RegExpTerm extends RegExpParent {
*/
predicate isRootTerm() { start = 0 and end = re.getText().length() }
override RegExpTerm getChild(int i) {
result = this.(RegExpAlt).getChild(i)
or
result = this.(RegExpBackRef).getChild(i)
or
result = this.(RegExpCharacterClass).getChild(i)
or
result = this.(RegExpCharacterRange).getChild(i)
or
result = this.(RegExpNormalChar).getChild(i)
or
result = this.(RegExpGroup).getChild(i)
or
result = this.(RegExpQuantifier).getChild(i)
or
result = this.(RegExpSequence).getChild(i)
or
result = this.(RegExpSpecialChar).getChild(i)
}
/**
* Gets the parent term of this regular expression term, or the
* regular expression literal if this is the root term.
@@ -508,7 +488,7 @@ class RegExpEscape extends RegExpNormalChar {
/**
* Holds if this is a unicode escape.
*/
private predicate isUnicode() { this.getText().matches("\\u%") }
private predicate isUnicode() { this.getText().matches(["\\u%", "\\x%"]) }
/**
* Gets the unicode char for this escape.
@@ -520,13 +500,24 @@ class RegExpEscape extends RegExpNormalChar {
)
}
/** Gets the part of this escape that is a hexidecimal string */
private string getHexString() {
this.isUnicode() and
if this.getText().matches("\\u%") // \uhhhh
then result = this.getText().suffix(2)
else
if this.getText().matches("\\x{%") // \x{h..h}
then result = this.getText().substring(3, this.getText().length() - 1)
else result = this.getText().suffix(2) // \xhh
}
/**
* Gets int value for the `index`th char in the hex number of the unicode escape.
* E.g. for `\u0061` and `index = 2` this returns 96 (the number `6` interpreted as hex).
*/
private int getHexValueFromUnicode(int index) {
this.isUnicode() and
exists(string hex, string char | hex = this.getText().suffix(2) |
exists(string hex, string char | hex = this.getHexString() |
char = hex.charAt(index) and
result = 16.pow(hex.length() - index - 1) * toHex(char)
)
@@ -574,6 +565,50 @@ class RegExpCharacterClassEscape extends RegExpEscape {
override string getPrimaryQLClass() { result = "RegExpCharacterClassEscape" }
}
/**
* A named character class in a regular expression.
*
* Examples:
*
* ```
* \p{Digit}
* \p{IsLowerCase}
*/
class RegExpNamedProperty extends RegExpCharacterClassEscape {
boolean inverted;
string name;
RegExpNamedProperty() {
name = this.getValue().substring(2, this.getValue().length() - 1) and
(
inverted = false and
this.getValue().charAt(0) = "p"
or
inverted = true and
this.getValue().charAt(0) = "P"
)
}
/** Holds if this class is inverted. */
predicate isInverted() { inverted = true }
/** Gets the name of this class. */
string getClassName() { result = name }
/**
* Gets an equivalent single-chcracter escape sequence for this class (e.g. \d) if possible, excluding the escape character.
*/
string getBackslashEquivalent() {
exists(string eq | if inverted = true then result = eq.toUpperCase() else result = eq |
name = ["Digit", "IsDigit"] and
eq = "d"
or
name = ["Space", "IsWhite_Space"] and
eq = "s"
)
}
}
/**
* A character class in a regular expression.
*