Support quote sequences

This commit is contained in:
Joe Farebrother
2022-01-13 14:15:06 +00:00
parent 59945cd8b3
commit d04c99b0be
2 changed files with 94 additions and 6 deletions

View File

@@ -40,6 +40,8 @@ newtype TRegExpParent =
TRegExpSpecialChar(Regex re, int start, int end) { re.specialCharacter(start, end, _) } or
/** A normal character */
TRegExpNormalChar(Regex re, int start, int end) { re.normalCharacter(start, end) } or
/** A quoted sequence */
TRegExpQuote(Regex re, int start, int end) { re.quote(start, end) } or
/** A back reference */
TRegExpBackRef(Regex re, int start, int end) { re.backreference(start, end) }
@@ -107,6 +109,8 @@ class RegExpTerm extends RegExpParent {
or
this = TRegExpNormalChar(re, start, end)
or
this = TRegExpQuote(re, start, end)
or
this = TRegExpGroup(re, start, end)
or
this = TRegExpQuantifier(re, start, end)
@@ -675,9 +679,34 @@ class RegExpNormalChar extends RegExpTerm, TRegExpNormalChar {
override string getPrimaryQLClass() { result = "RegExpNormalChar" }
}
/**
 * A quoted sequence term, i.e. a `\Q` ... `\E` span of the regular expression.
 *
 * Example:
 * ```
 * \Qabc\E
 * ```
 */
class RegExpQuote extends RegExpTerm, TRegExpQuote {
  /** The text found between the `\Q` and `\E` delimiters. */
  string value;

  RegExpQuote() {
    this = TRegExpQuote(re, start, end) and
    // The quoted content is the substring of the regex text delimited by the
    // inner offsets that `quote` reports for this term.
    exists(int contentStart, int contentEnd |
      re.quote(start, end, contentStart, contentEnd)
    |
      value = re.getText().substring(contentStart, contentEnd)
    )
  }

  /** Gets the string matched by this quote term. */
  string getValue() { result = value }

  override string getPrimaryQLClass() { result = "RegExpQuote" }
}
/**
* A constant regular expression term, that is, a regular expression
* term matching a single string. Currently, this will always be a single character.
* term matching a single string. This can be a single character or a quoted sequence.
*
* Example:
*
@@ -689,14 +718,14 @@ class RegExpConstant extends RegExpTerm {
string value;
RegExpConstant() {
this = TRegExpNormalChar(re, start, end) and
(this = TRegExpNormalChar(re, start, end) or this = TRegExpQuote(re, start, end)) and
not this instanceof RegExpCharacterClassEscape and
// exclude chars in qualifiers
// TODO: push this into regex library
not exists(int qstart, int qend | re.qualifiedPart(_, qstart, qend, _, _) |
qstart <= start and end <= qend
) and
value = this.(RegExpNormalChar).getValue()
(value = this.(RegExpNormalChar).getValue() or value = this.(RegExpQuote).getValue())
}
/**

View File

@@ -189,13 +189,17 @@ abstract class RegexString extends Expr {
}
/** Holds if the character at `pos` is a "\" that is actually escaping what comes after. */
predicate escapingChar(int pos) { this.escaping(pos) = true }
predicate escapingChar(int pos) {
  // Positions covered by a quote sequence (from its start up to, but not
  // including, its end offset) are never treated as escaping characters.
  not exists(int quoteStart, int quoteEnd |
    this.quote(quoteStart, quoteEnd) and
    quoteStart <= pos and
    pos < quoteEnd
  ) and
  this.escaping(pos) = true
}
/**
* Helper predicate for `escapingChar`.
* In order to avoid negative recursion, we return a boolean.
* This way, we can refer to `escaping(pos - 1).booleanNot()`
* rather than to a negated version of `escaping(pos)`.
* Does not take into account escape characters inside quote sequences.
*/
private boolean escaping(int pos) {
pos = -1 and result = false
@@ -205,6 +209,53 @@ abstract class RegexString extends Expr {
this.getChar(pos) != "\\" and result = false
}
/**
 * Helper predicate for `quote`.
 * Holds if the char at `pos` could be the beginning of a quote delimiter:
 * either `\Q`, whose backslash must be an escaping one according to `escaping`,
 * or `\E`, which is matched on the raw characters only (escaping is not checked,
 * as quote sequences turn off escapes).
 * The result is `true` for `\Q` and `false` for `\E`.
 */
private boolean quote_delimiter(int pos) {
  // Opening delimiter: an escaping backslash followed by `Q`.
  this.getChar(pos + 1) = "Q" and
  this.escaping(pos) = true and
  result = true
  or
  // Closing delimiter: any backslash followed by `E`.
  this.getChar(pos + 1) = "E" and
  this.getChar(pos) = "\\" and
  result = false
}
/**
 * Helper predicate for `quote`.
 * Holds if the char at `pos` starts the one-based `index`th occurrence of a
 * quote delimiter (`\Q` or `\E`), counting both kinds together in order of position.
 * The result is `true` for `\Q` and `false` for `\E`.
 */
private boolean quote_delimiter(int index, int pos) {
  // Rank every position that is a delimiter of either kind.
  pos = rank[index](int candidate | exists(this.quote_delimiter(candidate))) and
  result = this.quote_delimiter(pos)
}
/** Holds if a quoted sequence is found between `start` and `end`. */
predicate quote(int start, int end) {
  exists(int innerStart, int innerEnd | this.quote(start, end, innerStart, innerEnd))
}
/**
 * Holds if a quoted sequence is found between `start` and `end`,
 * with content found between `inner_start` and `inner_end`.
 *
 * `start` is the offset of the opening `\Q`, and `end` is the offset just past
 * the closing `\E`; the content must be non-empty.
 */
predicate quote(int start, int end, int inner_start, int inner_end) {
  // `start` is a `\Q` that is either the very first delimiter of the regex,
  // or is directly preceded (in delimiter order) by a `\E`.
  exists(int ordinal | this.quote_delimiter(ordinal, start) = true |
    ordinal = 1
    or
    this.quote_delimiter(ordinal - 1, _) = false
  ) and
  // Skip over the two characters of the opening delimiter.
  inner_start = start + 2 and
  // The content ends at a `\E` ...
  this.quote_delimiter(inner_end) = false and
  inner_start < inner_end and
  // ... and no earlier `\E` occurs within the content.
  not exists(int earlier | earlier in [inner_start .. inner_end - 1] |
    this.quote_delimiter(earlier) = false
  ) and
  // The whole term also covers the two characters of the closing delimiter.
  end = inner_end + 2
}
/** Gets the text of this regex, taken from the value of the string literal it is. */
string getText() {
  exists(StringLiteral literal | literal = this | result = literal.getValue())
}
@@ -212,7 +263,8 @@ abstract class RegexString extends Expr {
string nonEscapedCharAt(int i) {
result = this.getText().charAt(i) and
not exists(int x, int y | this.escapedCharacter(x, y) and i in [x .. y - 1])
not exists(int x, int y | this.escapedCharacter(x, y) and i in [x .. y - 1]) and
not exists(int x, int y | this.quote(x, y) and i in [x .. y - 1])
}
/** Holds if the non-escaped character at offset `i` is the alternation divider `|`. */
private predicate isOptionDivider(int i) {
  "|" = this.nonEscapedCharAt(i)
}
@@ -728,7 +780,8 @@ abstract class RegexString extends Expr {
this.character(start, _) or
this.isGroupStart(start) or
this.charSet(start, _) or
this.backreference(start, _)
this.backreference(start, _) or
this.quote(start, _)
}
private predicate item_end(int end) {
@@ -739,6 +792,8 @@ abstract class RegexString extends Expr {
this.charSet(_, end)
or
this.qualifier(_, end, _, _)
or
this.quote(_, end)
}
private predicate top_level(int start, int end) {
@@ -846,6 +901,8 @@ abstract class RegexString extends Expr {
this.qualifiedItem(start, end, _, _)
or
this.charSet(start, end)
or
this.quote(start, end)
) and
this.firstPart(start, end)
}
@@ -861,6 +918,8 @@ abstract class RegexString extends Expr {
this.qualifiedItem(start, end, _, _)
or
this.charSet(start, end)
or
this.quote(start, end)
) and
this.lastPart(start, end)
}