Merge pull request #15244 from Marcono1234/marcono1234/regex-flags

Java: Improve Regex flag parsing
This commit is contained in:
Tony Torralba
2024-01-16 08:25:49 +01:00
committed by GitHub
5 changed files with 60 additions and 15 deletions

View File

@@ -0,0 +1,4 @@
---
category: fix
---
* Fixed regular expressions containing flags not being parsed correctly in some cases.

View File

@@ -479,7 +479,7 @@ abstract class RegexString extends StringLiteral {
private predicate flagGroupStartNoModes(int start, int end) {
this.isGroupStart(start) and
this.getChar(start + 1) = "?" and
this.getChar(start + 2) in ["i", "m", "s", "u", "x", "U"] and
this.getChar(start + 2) in ["-", "i", "d", "m", "s", "u", "x", "U"] and
end = start + 2
}
@@ -491,7 +491,7 @@ abstract class RegexString extends StringLiteral {
this.flagGroupStartNoModes(start, pos)
or
this.modeCharacter(start, pos - 1) and
this.getChar(pos) in ["i", "m", "s", "u", "x", "U"]
this.getChar(pos) in ["-", "i", "d", "m", "s", "u", "x", "U"]
}
/**
@@ -499,7 +499,10 @@ abstract class RegexString extends StringLiteral {
*/
private predicate flagGroupStart(int start, int end) {
this.flagGroupStartNoModes(start, _) and
end = max(int i | this.modeCharacter(start, i) | i + 1)
// Check if this is a non-capturing group with flags (e.g. `(?i:a)`), in which case the `:` belongs to the group start and should be excluded from the group's contents
exists(int maybeEnd | maybeEnd = max(int i | this.modeCharacter(start, i) | i + 1) |
if this.getChar(maybeEnd) = ":" then end = maybeEnd + 1 else end = maybeEnd
)
}
/**
@@ -510,9 +513,15 @@ abstract class RegexString extends StringLiteral {
* ```
*/
private predicate flag(string c) {
exists(int pos |
this.modeCharacter(_, pos) and
this.getChar(pos) = c
exists(int start, int pos |
this.modeCharacter(start, pos) and
this.getChar(pos) = c and
// Ignore if flag is disabled; use `<=` to also exclude `-` itself
// This does not properly handle the (contrived) case where a flag is both enabled and
// disabled, e.g. `(?i-i)a+`, in which case the flag seems to act as if it was disabled
not exists(int minusPos |
this.modeCharacter(start, minusPos) and this.getChar(minusPos) = "-" and minusPos <= pos
)
)
}
@@ -524,6 +533,8 @@ abstract class RegexString extends StringLiteral {
exists(string c | this.flag(c) |
c = "i" and result = "IGNORECASE"
or
c = "d" and result = "UNIXLINES"
or
c = "m" and result = "MULTILINE"
or
c = "s" and result = "DOTALL"
@@ -930,13 +941,13 @@ class Regex extends RegexString {
/**
* Gets a mode (if any) of this regular expression. Can be any of:
* DEBUG
* IGNORECASE
* MULTILINE
* DOTALL
* UNICODE
* VERBOSE
* UNICODECLASS
* - IGNORECASE
* - UNIXLINES
* - MULTILINE
* - DOTALL
* - UNICODE
* - VERBOSE
* - UNICODECLASS
*/
string getAMode() {
result != "None" and
@@ -946,7 +957,7 @@ class Regex extends RegexString {
}
/**
* Holds if this regex is used to match against a full string,
* Holds if this regex is used to match against a full string,
* as though it was implicitly surrounded by ^ and $.
*/
predicate matchesFullString() { matches_full_string = true }

View File

@@ -1,4 +1,8 @@
parseFailures
modes
| Test.java:17:9:17:37 | "(?i)(?=a)(?!b)(?<=c)(?<!d)+" | IGNORECASE |
| Test.java:22:9:22:85 | "(?idmsuxU-idmsuxU)a+(?-idmsuxU)b+(?idmsuxU:c)d+(?-idmsuxU:e)f+(?idmsuxU:)g+" | DOTALL,IGNORECASE,MULTILINE,UNICODE,UNICODECLASS,UNIXLINES,VERBOSE |
| Test.java:23:9:23:24 | "(?idms-iuxU)a+" | DOTALL,IGNORECASE,MULTILINE,UNIXLINES |
#select
| Test.java:5:10:5:17 | [A-Z\\d] | [RegExpCharacterClass] |
| Test.java:5:10:5:19 | [A-Z\\d]++ | [RegExpPlus] |
@@ -205,3 +209,25 @@ parseFailures
| Test.java:21:62:21:62 | b | [RegExpConstant,RegExpNormalChar] |
| Test.java:21:64:21:64 | b | [RegExpConstant,RegExpNormalChar] |
| Test.java:21:66:21:66 | b | [RegExpConstant,RegExpNormalChar] |
| Test.java:22:10:22:27 | (?idmsuxU-idmsuxU) | [RegExpZeroWidthMatch] |
| Test.java:22:10:22:84 | (?idmsuxU-idmsuxU)a+(?-idmsuxU)b+(?idmsuxU:c)d+(?-idmsuxU:e)f+(?idmsuxU:)g+ | [RegExpSequence] |
| Test.java:22:28:22:28 | a | [RegExpConstant,RegExpNormalChar] |
| Test.java:22:28:22:29 | a+ | [RegExpPlus] |
| Test.java:22:30:22:40 | (?-idmsuxU) | [RegExpZeroWidthMatch] |
| Test.java:22:41:22:41 | b | [RegExpConstant,RegExpNormalChar] |
| Test.java:22:41:22:42 | b+ | [RegExpPlus] |
| Test.java:22:43:22:54 | (?idmsuxU:c) | [RegExpGroup] |
| Test.java:22:53:22:53 | c | [RegExpConstant,RegExpNormalChar] |
| Test.java:22:55:22:55 | d | [RegExpConstant,RegExpNormalChar] |
| Test.java:22:55:22:56 | d+ | [RegExpPlus] |
| Test.java:22:57:22:69 | (?-idmsuxU:e) | [RegExpGroup] |
| Test.java:22:68:22:68 | e | [RegExpConstant,RegExpNormalChar] |
| Test.java:22:70:22:70 | f | [RegExpConstant,RegExpNormalChar] |
| Test.java:22:70:22:71 | f+ | [RegExpPlus] |
| Test.java:22:72:22:82 | (?idmsuxU:) | [RegExpZeroWidthMatch] |
| Test.java:22:83:22:83 | g | [RegExpConstant,RegExpNormalChar] |
| Test.java:22:83:22:84 | g+ | [RegExpPlus] |
| Test.java:23:10:23:21 | (?idms-iuxU) | [RegExpZeroWidthMatch] |
| Test.java:23:10:23:23 | (?idms-iuxU)a+ | [RegExpSequence] |
| Test.java:23:22:23:22 | a | [RegExpConstant,RegExpNormalChar] |
| Test.java:23:22:23:23 | a+ | [RegExpPlus] |

View File

@@ -8,5 +8,7 @@ string getQLClases(RegexTreeView::RegExpTerm t) {
query predicate parseFailures(Regex::Regex r, int i) { r.failedToParse(i) }
query predicate modes(Regex::Regex r, string modes) { modes = strictconcat(r.getAMode(), ",") }
from RegexTreeView::RegExpTerm t
select t, getQLClases(t)

View File

@@ -18,7 +18,9 @@ class Test {
"a||b|c(d|e|)f|g+",
"\\018\\033\\0377\\0777\u1337+",
"[|]+",
"(a(a(a(a(a(a((((c))))a))))))((((((b(((((d)))))b)b)b)b)b)b)+"
"(a(a(a(a(a(a((((c))))a))))))((((((b(((((d)))))b)b)b)b)b)b)+",
"(?idmsuxU-idmsuxU)a+(?-idmsuxU)b+(?idmsuxU:c)d+(?-idmsuxU:e)f+(?idmsuxU:)g+",
"(?idms-iuxU)a+",
};
void test() {