Ruby/Python: regex parser: group sequences of 'normal' characters

This commit is contained in:
Arthur Baars
2022-02-22 10:51:47 +01:00
parent 36e02ae9ac
commit 69ed121ecb
9 changed files with 166 additions and 231 deletions

View File

@@ -39,7 +39,12 @@ newtype TRegExpParent =
/** A special character */
TRegExpSpecialChar(Regex re, int start, int end) { re.specialCharacter(start, end, _) } or
/** A normal character */
TRegExpNormalChar(Regex re, int start, int end) { re.normalCharacter(start, end) } or
TRegExpNormalChar(Regex re, int start, int end) {
re.normalCharacterSequence(start, end)
or
re.escapedCharacter(start, end) and
not re.specialCharacter(start, end, _)
} or
/** A back reference */
TRegExpBackRef(Regex re, int start, int end) { re.backreference(start, end) }

View File

@@ -446,6 +446,45 @@ abstract class RegexString extends Expr {
)
}
/**
 * Holds if the range `start`..`end` is a sequence of 'normal' characters
 * that is reported as a single unit.
 *
 * A normal character enclosed by a character set is always reported on its own.
 * Outside character sets, the longest run of consecutive normal characters is
 * reported as a whole, unless `qualifier` holds at the end of that run: then the
 * last character is reported separately from the (non-empty) remaining prefix.
 */
predicate normalCharacterSequence(int start, int end) {
  // Case 1: a single normal character enclosed by a character set.
  end = start + 1 and
  this.normalCharacter(start, end) and
  exists(int setStart, int setEnd |
    this.charSet(setStart, setEnd) and setStart <= start and setEnd >= end
  )
  or
  // Case 2: the longest run `runStart`..`runEnd` that is not enclosed by a character set.
  exists(int runStart, int runEnd |
    runEnd = max(int candidate | this.normalCharacterSub(runStart, candidate)) and
    not exists(int setStart, int setEnd |
      this.charSet(setStart, setEnd) and setStart <= runStart and setEnd >= runEnd
    )
  |
    // `qualifier` holds at the end of the run (presumably a qualifier such as `+`
    // follows it and applies to the last character only): report the last character
    // and, if anything remains, the prefix before it.
    this.qualifier(runEnd, _, _, _) and
    (
      start = runEnd - 1 and end = runEnd
      or
      start = runStart and end = runEnd - 1 and start < end
    )
    or
    // Otherwise the whole run is one sequence.
    not this.qualifier(runEnd, _, _, _) and
    start = runStart and
    end = runEnd
  )
}
/**
 * Holds if every one-character step in `start`..`end` satisfies `normalCharacter`
 * and no normal character ends at `start`, i.e. the range is a prefix of a run of
 * normal characters that begins at `start`.
 */
private predicate normalCharacterSub(int start, int end) {
  // The final step of the range must itself be a normal character.
  this.normalCharacter(end - 1, end) and
  (
    // Base case: a one-character range not preceded by another normal character.
    start = end - 1 and not this.normalCharacter(start - 1, start)
    or
    // Recursive case: extend a shorter prefix by one character.
    this.normalCharacterSub(start, end - 1)
  )
}
/**
 * Holds if the range `start`..`end` is a character-level item: a special
 * character, an escaped character, or a sequence of normal characters.
 */
private predicate characterItem(int start, int end) {
  this.specialCharacter(start, end, _)
  or
  this.escapedCharacter(start, end)
  or
  this.normalCharacterSequence(start, end)
}
/** Whether the text in the range start,end is a group */
predicate group(int start, int end) {
this.groupContents(start, end, _, _)
@@ -717,7 +756,7 @@ abstract class RegexString extends Expr {
string getBackrefName(int start, int end) { this.named_backreference(start, end, result) }
private predicate baseItem(int start, int end) {
this.character(start, end) and
this.characterItem(start, end) and
not exists(int x, int y | this.charSet(x, y) and x <= start and y >= end)
or
this.group(start, end)
@@ -837,14 +876,14 @@ abstract class RegexString extends Expr {
}
/**
 * Holds if `start` is the first offset of some range for which `character`,
 * `characterItem`, `charSet` or `backreference` holds, or if
 * `isGroupStart(start)` holds.
 */
private predicate item_start(int start) {
this.character(start, _) or
this.characterItem(start, _) or
this.isGroupStart(start) or
this.charSet(start, _) or
this.backreference(start, _)
}
private predicate item_end(int end) {
this.character(_, end)
this.characterItem(_, end)
or
exists(int endm1 | this.isGroupEnd(endm1) and end = endm1 + 1)
or
@@ -953,7 +992,7 @@ abstract class RegexString extends Expr {
*/
predicate firstItem(int start, int end) {
(
this.character(start, end)
this.characterItem(start, end)
or
this.qualifiedItem(start, end, _, _)
or
@@ -968,7 +1007,7 @@ abstract class RegexString extends Expr {
*/
predicate lastItem(int start, int end) {
(
this.character(start, end)
this.characterItem(start, end)
or
this.qualifiedItem(start, end, _, _)
or

View File

@@ -1,6 +1,6 @@
| 012345678 | first | 0 | 1 |
| 012345678 | last | 8 | 9 |
| (?!not-this)^[A-Z_]+$ | first | 3 | 4 |
| 012345678 | first | 0 | 9 |
| 012345678 | last | 0 | 9 |
| (?!not-this)^[A-Z_]+$ | first | 3 | 11 |
| (?!not-this)^[A-Z_]+$ | first | 12 | 13 |
| (?!not-this)^[A-Z_]+$ | first | 13 | 19 |
| (?!not-this)^[A-Z_]+$ | first | 13 | 20 |
@@ -27,9 +27,9 @@
| (?m)^(?!$) | last | 4 | 5 |
| (?m)^(?!$) | last | 8 | 9 |
| (\\033\|~{) | first | 1 | 5 |
| (\\033\|~{) | first | 6 | 7 |
| (\\033\|~{) | first | 6 | 8 |
| (\\033\|~{) | last | 1 | 5 |
| (\\033\|~{) | last | 7 | 8 |
| (\\033\|~{) | last | 6 | 8 |
| [\ufffd-\ufffd] | first | 0 | 5 |
| [\ufffd-\ufffd] | last | 0 | 5 |
| [\ufffd-\ufffd][\ufffd-\ufffd] | first | 0 | 5 |
@@ -52,8 +52,8 @@
| \\A[+-]?\\d+ | last | 7 | 9 |
| \\A[+-]?\\d+ | last | 7 | 10 |
| \\Afoo\\Z | first | 0 | 2 |
| \\Afoo\\Z | first | 2 | 3 |
| \\Afoo\\Z | last | 4 | 5 |
| \\Afoo\\Z | first | 2 | 5 |
| \\Afoo\\Z | last | 2 | 5 |
| \\Afoo\\Z | last | 5 | 7 |
| \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | first | 0 | 2 |
| \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | last | 28 | 32 |
@@ -86,24 +86,24 @@
| ^[A-Z_]+$(?<!not-this) | last | 1 | 7 |
| ^[A-Z_]+$(?<!not-this) | last | 1 | 8 |
| ^[A-Z_]+$(?<!not-this) | last | 8 | 9 |
| ^[A-Z_]+$(?<!not-this) | last | 20 | 21 |
| ^[A-Z_]+$(?<!not-this) | last | 13 | 21 |
| ax{01,3} | first | 0 | 1 |
| ax{01,3} | last | 1 | 2 |
| ax{01,3} | last | 1 | 8 |
| ax{01,3} | last | 7 | 8 |
| ax{01,3} | last | 3 | 8 |
| ax{3,} | first | 0 | 1 |
| ax{3,} | last | 1 | 2 |
| ax{3,} | last | 1 | 6 |
| ax{3,} | last | 5 | 6 |
| ax{3,} | last | 3 | 6 |
| ax{3} | first | 0 | 1 |
| ax{3} | last | 1 | 2 |
| ax{3} | last | 1 | 5 |
| ax{3} | last | 4 | 5 |
| ax{3} | last | 3 | 5 |
| ax{,3} | first | 0 | 1 |
| ax{,3} | last | 0 | 1 |
| ax{,3} | last | 1 | 2 |
| ax{,3} | last | 1 | 6 |
| ax{,3} | last | 5 | 6 |
| ax{,3} | last | 3 | 6 |
| x\| | first | 0 | 1 |
| x\| | last | 0 | 1 |
| x\|(?<!\\w)l | first | 0 | 1 |
@@ -111,5 +111,5 @@
| x\|(?<!\\w)l | first | 9 | 10 |
| x\|(?<!\\w)l | last | 0 | 1 |
| x\|(?<!\\w)l | last | 9 | 10 |
| x{Not qual} | first | 0 | 1 |
| x{Not qual} | last | 10 | 11 |
| x{Not qual} | first | 0 | 11 |
| x{Not qual} | last | 0 | 11 |

View File

@@ -59,7 +59,7 @@
| redos.py:220:25:220:29 | [^X]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'W'. |
| redos.py:223:30:223:30 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'W' and containing many repetitions of 'bW'. |
| redos.py:229:30:229:30 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'W' and containing many repetitions of 'bW'. |
| redos.py:241:27:241:27 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'a' and containing many repetitions of 'ba'. |
| redos.py:241:26:241:27 | ab | This part of the regular expression may cause exponential backtracking on strings starting with 'a' and containing many repetitions of 'ab'. |
| redos.py:247:25:247:31 | [\\n\\s]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |
| redos.py:256:25:256:27 | \\w* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz'. |
| redos.py:256:37:256:39 | \\w* | This part of the regular expression may cause exponential backtracking on strings starting with 'foobarbaz' and containing many repetitions of 'foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz'. |