Merge pull request #6460 from yoff/python-regex-parsing-consistency-checks

Python: Add regex parsing consistency checks
This commit is contained in:
Taus
2021-09-07 13:33:59 +02:00
committed by GitHub
17 changed files with 583 additions and 17 deletions

View File

@@ -7,6 +7,10 @@ private import semmle.python.regex
* An element containing a regular expression term, that is, either
* a string literal (parsed as a regular expression)
* or another regular expression term.
*
* For sequences and alternations, we require at least one child.
* Otherwise, we wish to represent the term differently.
* This avoids multiple representations of the same term.
*/
newtype TRegExpParent =
/** A string literal used as a regular expression */
@@ -14,9 +18,18 @@ newtype TRegExpParent =
/** A quantified term */
TRegExpQuantifier(Regex re, int start, int end) { re.qualifiedItem(start, end, _, _) } or
/** A sequence term */
TRegExpSequence(Regex re, int start, int end) { re.sequence(start, end) } or
/** An alternatio term */
TRegExpAlt(Regex re, int start, int end) { re.alternation(start, end) } or
TRegExpSequence(Regex re, int start, int end) {
re.sequence(start, end) and
exists(seqChild(re, start, end, 1)) // if a sequence does not have more than one element, it should be treated as that element instead.
} or
/** An alternation term */
TRegExpAlt(Regex re, int start, int end) {
re.alternation(start, end) and
exists(int part_end |
re.alternationOption(start, end, start, part_end) and
part_end < end
) // if an alternation does not have more than one element, it should be treated as that element instead.
} or
/** A character class term */
TRegExpCharacterClass(Regex re, int start, int end) { re.charSet(start, end) } or
/** A character range term */
@@ -93,8 +106,7 @@ class RegExpTerm extends RegExpParent {
or
this = TRegExpQuantifier(re, start, end)
or
this = TRegExpSequence(re, start, end) and
exists(seqChild(re, start, end, 1)) // if a sequence does not have more than one element, it should be treated as that element instead.
this = TRegExpSequence(re, start, end)
or
this = TRegExpSpecialChar(re, start, end)
}
@@ -341,10 +353,7 @@ class RegExpRange extends RegExpQuantifier {
* This is a sequence with the elements `(ECMA|Java)` and `Script`.
*/
class RegExpSequence extends RegExpTerm, TRegExpSequence {
RegExpSequence() {
this = TRegExpSequence(re, start, end) and
exists(seqChild(re, start, end, 1)) // if a sequence does not have more than one element, it should be treated as that element instead.
}
RegExpSequence() { this = TRegExpSequence(re, start, end) }
override RegExpTerm getChild(int i) { result = seqChild(re, start, end, i) }

View File

@@ -369,12 +369,12 @@ abstract class RegexString extends Expr {
// hex value \xhh
this.getChar(start + 1) = "x" and end = start + 4
or
// octal value \ooo
// octal value \o, \oo, or \ooo
end in [start + 2 .. start + 4] and
this.getText().substring(start + 1, end).toInt() >= 0 and
forall(int i | i in [start + 1 .. end - 1] | this.isOctal(i)) and
not (
end < start + 4 and
exists(this.getText().substring(start + 1, end + 1).toInt())
this.isOctal(end)
)
or
// 16-bit hex value \uhhhh
@@ -392,6 +392,9 @@ abstract class RegexString extends Expr {
)
}
pragma[inline]
private predicate isOctal(int index) { this.getChar(index) = [0 .. 7].toString() }
/** Holds if `index` is inside a character set. */
predicate inCharSet(int index) {
exists(int x, int y | this.charSet(x, y) and index in [x + 1 .. y - 2])
@@ -690,6 +693,7 @@ abstract class RegexString extends Expr {
private predicate numbered_backreference(int start, int end, int value) {
this.escapingChar(start) and
// starting with 0 makes it an octal escape
not this.getChar(start + 1) = "0" and
exists(string text, string svalue, int len |
end = start + len and
@@ -698,8 +702,16 @@ abstract class RegexString extends Expr {
|
svalue = text.substring(start + 1, start + len) and
value = svalue.toInt() and
not exists(text.substring(start + 1, start + len + 1).toInt()) and
value > 0
// value is composed of digits
forall(int i | i in [start + 1 .. start + len - 1] | this.getChar(i) = [0 .. 9].toString()) and
// a longer reference is not possible
not (
len = 2 and
exists(text.substring(start + 1, start + len + 1).toInt())
) and
// 3 octal digits makes it an octal escape
not forall(int i | i in [start + 1 .. start + 4] | this.isOctal(i))
// TODO: Inside a character set, all numeric escapes are treated as characters.
)
}