Merge pull request #8489 from aibaars/regex-refactor

Ruby: refactor regex libraries
This commit is contained in:
Arthur Baars
2022-03-28 12:17:00 +02:00
committed by GitHub
16 changed files with 1316 additions and 879 deletions

View File

@@ -552,7 +552,7 @@ class RegExpWordBoundary extends RegExpSpecialChar {
/**
* A character class escape in a regular expression.
* That is, an escaped charachter that denotes multiple characters.
* That is, an escaped character that denotes multiple characters.
*
* Examples:
*

View File

@@ -188,7 +188,7 @@ abstract class RegexString extends Expr {
)
}
/** Hold is a character set starts between `start` and `end`. */
/** Holds if a character set starts between `start` and `end`. */
predicate char_set_start(int start, int end) {
this.char_set_start(start) = true and
(
@@ -316,8 +316,10 @@ abstract class RegexString extends Expr {
result = this.(Bytes).getS()
}
/** Gets the character found at index `i` in the text of this regex. */
string getChar(int i) { this.getText().charAt(i) = result }
/** Gets the `i`th character of this regex, unless it is part of a character escape sequence. */
string nonEscapedCharAt(int i) {
result = this.getText().charAt(i) and
not exists(int x, int y | this.escapedCharacter(x, y) and i in [x .. y - 1])
@@ -329,6 +331,9 @@ abstract class RegexString extends Expr {
/**
 * Holds if the non-escaped character at index `i` is `(` and
 * `inCharSet(i)` does not hold.
 */
private predicate isGroupStart(int i) {
  not this.inCharSet(i) and
  this.nonEscapedCharAt(i) = "("
}
/**
* Holds if the `i`th character could not be parsed.
*/
predicate failedToParse(int i) {
exists(this.getChar(i)) and
not exists(int start, int end |
@@ -417,6 +422,9 @@ abstract class RegexString extends Expr {
)
}
/**
* Holds if a simple or escaped character is found between `start` and `end`.
*/
predicate character(int start, int end) {
(
this.simpleCharacter(start, end) and
@@ -428,12 +436,18 @@ abstract class RegexString extends Expr {
not exists(int x, int y | this.backreference(x, y) and x <= start and y >= end)
}
/**
 * Holds if the range from `start` to `end` is exactly one position wide
 * (`end = start + 1`), satisfies `character`, and does not satisfy
 * `specialCharacter` for any special-character string.
 */
predicate normalCharacter(int start, int end) {
  this.character(start, end) and
  not this.specialCharacter(start, end, _) and
  end = start + 1
}
/**
* Holds if a special character is found between `start` and `end`.
*/
predicate specialCharacter(int start, int end, string char) {
not this.inCharSet(start) and
this.character(start, end) and
@@ -492,7 +506,7 @@ abstract class RegexString extends Expr {
this.specialCharacter(start, end, _)
}
/** Whether the text in the range start,end is a group */
/** Whether the text in the range `start,end` is a group */
predicate group(int start, int end) {
this.groupContents(start, end, _, _)
or
@@ -611,6 +625,7 @@ abstract class RegexString extends Expr {
this.simple_group_start(start, end)
}
/** Matches the start of a non-capturing group, e.g. `(?:` */
private predicate non_capturing_group_start(int start, int end) {
this.isGroupStart(start) and
this.getChar(start + 1) = "?" and
@@ -618,12 +633,18 @@ abstract class RegexString extends Expr {
end = start + 3
}
/**
 * Matches the start of a simple group, e.g. the opening `(` of `(a+)`.
 *
 * The group start at `start` must be followed by an existing character
 * other than `?`; `end` is the position right after the `(`.
 */
private predicate simple_group_start(int start, int end) {
  end = start + 1 and
  this.isGroupStart(start) and
  // `!=` (rather than `not ... =`) requires a character to exist at `start + 1`.
  "?" != this.getChar(start + 1)
}
/**
* Matches the start of a named group, such as:
* - `(?<name>\w+)`
* - `(?'name'\w+)`
*/
private predicate named_group_start(int start, int end) {
this.isGroupStart(start) and
this.getChar(start + 1) = "?" and
@@ -675,6 +696,7 @@ abstract class RegexString extends Expr {
)
}
/** Matches the start of a positive lookahead assertion, i.e. `(?=`. */
private predicate lookahead_assertion_start(int start, int end) {
this.isGroupStart(start) and
this.getChar(start + 1) = "?" and
@@ -682,6 +704,7 @@ abstract class RegexString extends Expr {
end = start + 3
}
/** Matches the start of a negative lookahead assertion, i.e. `(?!`. */
private predicate negative_lookahead_assertion_start(int start, int end) {
this.isGroupStart(start) and
this.getChar(start + 1) = "?" and
@@ -689,6 +712,7 @@ abstract class RegexString extends Expr {
end = start + 3
}
/** Matches the start of a positive lookbehind assertion, i.e. `(?<=`. */
private predicate lookbehind_assertion_start(int start, int end) {
this.isGroupStart(start) and
this.getChar(start + 1) = "?" and
@@ -697,6 +721,7 @@ abstract class RegexString extends Expr {
end = start + 4
}
/** Matches the start of a negative lookbehind assertion, i.e. `(?<!`. */
private predicate negative_lookbehind_assertion_start(int start, int end) {
this.isGroupStart(start) and
this.getChar(start + 1) = "?" and
@@ -705,6 +730,7 @@ abstract class RegexString extends Expr {
end = start + 4
}
/** Matches the start of a comment group, i.e. `(?#`. */
private predicate comment_group_start(int start, int end) {
this.isGroupStart(start) and
this.getChar(start + 1) = "?" and
@@ -712,6 +738,7 @@ abstract class RegexString extends Expr {
end = start + 3
}
/** Matches the contents of a group. */
predicate groupContents(int start, int end, int in_start, int in_end) {
this.group_start(start, in_start) and
end = in_end + 1 and
@@ -719,12 +746,14 @@ abstract class RegexString extends Expr {
this.isGroupEnd(in_end)
}
/**
* Matches a named backreference, e.g. `\k<foo>`, between `start` and `end`,
* binding `name` to text taken from the regex between the opening prefix
* and the closing delimiter.
*
* NOTE(review): the body below skips 4 characters after `start` and scans for
* a closing `)`, which does not obviously line up with the `\k<foo>` example
* (3-character prefix, closing `>`) — confirm against `named_backreference_start`.
*/
private predicate named_backreference(int start, int end, string name) {
// Presumably requires the opening prefix to occupy `start` up to `start + 4` — verify in `named_backreference_start`.
this.named_backreference_start(start, start + 4) and
// `end` is one past the smallest index greater than `start + 4` whose character is `)`.
end = min(int i | i > start + 4 and this.getChar(i) = ")") + 1 and
// NOTE(review): if `substring` is end-exclusive, `end - 2` also drops the character just before `)` — TODO confirm.
name = this.getText().substring(start + 4, end - 2)
}
/** Matches a numbered backreference, e.g. `\1`. */
private predicate numbered_backreference(int start, int end, int value) {
this.escapingChar(start) and
// starting with 0 makes it an octal escape
@@ -749,7 +778,7 @@ abstract class RegexString extends Expr {
)
}
/** Whether the text in the range start,end is a back reference */
/** Whether the text in the range `start,end` is a back reference */
predicate backreference(int start, int end) {
this.numbered_backreference(start, end, _)
or