Java: Improve Regex flag parsing

Fixes:
- Flag `d` not being recognized
- Syntax for disabling flags (`-`) not being recognized
- Non-capturing group with flags erroneously containing `:` as literal
This commit is contained in:
Marcono1234
2024-01-06 03:13:18 +01:00
parent 93d9332ab4
commit 3edfdc5ceb
5 changed files with 60 additions and 15 deletions

View File

@@ -479,7 +479,7 @@ abstract class RegexString extends StringLiteral {
private predicate flagGroupStartNoModes(int start, int end) {
this.isGroupStart(start) and
this.getChar(start + 1) = "?" and
this.getChar(start + 2) in ["i", "m", "s", "u", "x", "U"] and
this.getChar(start + 2) in ["-", "i", "d", "m", "s", "u", "x", "U"] and
end = start + 2
}
@@ -491,7 +491,7 @@ abstract class RegexString extends StringLiteral {
this.flagGroupStartNoModes(start, pos)
or
this.modeCharacter(start, pos - 1) and
this.getChar(pos) in ["i", "m", "s", "u", "x", "U"]
this.getChar(pos) in ["-", "i", "d", "m", "s", "u", "x", "U"]
}
/**
@@ -499,7 +499,10 @@ abstract class RegexString extends StringLiteral {
*/
private predicate flagGroupStart(int start, int end) {
this.flagGroupStartNoModes(start, _) and
end = max(int i | this.modeCharacter(start, i) | i + 1)
// Check if this is a non-capturing group with flags, and therefore the `:` should be excluded
exists(int maybeEnd | maybeEnd = max(int i | this.modeCharacter(start, i) | i + 1) |
if this.getChar(maybeEnd) = ":" then end = maybeEnd + 1 else end = maybeEnd
)
}
/**
@@ -510,9 +513,15 @@ abstract class RegexString extends StringLiteral {
* ```
*/
private predicate flag(string c) {
exists(int pos |
this.modeCharacter(_, pos) and
this.getChar(pos) = c
exists(int start, int pos |
this.modeCharacter(start, pos) and
this.getChar(pos) = c and
// Ignore if flag is disabled; use `<=` to also exclude `-` itself
// This does not properly handle the (contrived) case where a flag is both enabled and
// disabled, e.g. `(?i-i)a+`, in which case the flag seems to act as if it were disabled
not exists(int minusPos |
this.modeCharacter(start, minusPos) and this.getChar(minusPos) = "-" and minusPos <= pos
)
)
}
@@ -524,6 +533,8 @@ abstract class RegexString extends StringLiteral {
exists(string c | this.flag(c) |
c = "i" and result = "IGNORECASE"
or
c = "d" and result = "UNIXLINES"
or
c = "m" and result = "MULTILINE"
or
c = "s" and result = "DOTALL"
@@ -930,13 +941,13 @@ class Regex extends RegexString {
/**
* Gets a mode (if any) of this regular expression. Can be any of:
* DEBUG
* IGNORECASE
* MULTILINE
* DOTALL
* UNICODE
* VERBOSE
* UNICODECLASS
* - IGNORECASE
* - UNIXLINES
* - MULTILINE
* - DOTALL
* - UNICODE
* - VERBOSE
* - UNICODECLASS
*/
string getAMode() {
result != "None" and
@@ -946,7 +957,7 @@ class Regex extends RegexString {
}
/**
* Holds if this regex is used to match against a full string,
* Holds if this regex is used to match against a full string,
* as though it was implicitly surrounded by ^ and $.
*/
predicate matchesFullString() { matches_full_string = true }