From 3edfdc5cebdc4aa0c9ca8540ccb165c56ea2147a Mon Sep 17 00:00:00 2001 From: Marcono1234 Date: Sat, 6 Jan 2024 03:13:18 +0100 Subject: [PATCH] Java: Improve Regex flag parsing Fixes: - Flag `d` not being recognized - Syntax for disabling flags (`-`) not being recognized - Non-capturing group with flags erroneously containing `:` as literal --- .../2024-01-06-regex-flag-parsing.md | 4 ++ java/ql/lib/semmle/code/java/regex/regex.qll | 39 ++++++++++++------- .../regex/parser/RegexParseTests.expected | 26 +++++++++++++ .../regex/parser/RegexParseTests.ql | 2 + .../test/library-tests/regex/parser/Test.java | 4 +- 5 files changed, 60 insertions(+), 15 deletions(-) create mode 100644 java/ql/lib/change-notes/2024-01-06-regex-flag-parsing.md diff --git a/java/ql/lib/change-notes/2024-01-06-regex-flag-parsing.md b/java/ql/lib/change-notes/2024-01-06-regex-flag-parsing.md new file mode 100644 index 00000000000..532ab1a88dc --- /dev/null +++ b/java/ql/lib/change-notes/2024-01-06-regex-flag-parsing.md @@ -0,0 +1,4 @@ +--- +category: fix +--- +* Fixed regular expressions containing flags not being parsed correctly in some cases. diff --git a/java/ql/lib/semmle/code/java/regex/regex.qll b/java/ql/lib/semmle/code/java/regex/regex.qll index a131ac0deb5..f0336c2d023 100644 --- a/java/ql/lib/semmle/code/java/regex/regex.qll +++ b/java/ql/lib/semmle/code/java/regex/regex.qll @@ -479,7 +479,7 @@ abstract class RegexString extends StringLiteral { private predicate flagGroupStartNoModes(int start, int end) { this.isGroupStart(start) and this.getChar(start + 1) = "?" 
and - this.getChar(start + 2) in ["i", "m", "s", "u", "x", "U"] and + this.getChar(start + 2) in ["-", "i", "d", "m", "s", "u", "x", "U"] and end = start + 2 } @@ -491,7 +491,7 @@ abstract class RegexString extends StringLiteral { this.flagGroupStartNoModes(start, pos) or this.modeCharacter(start, pos - 1) and - this.getChar(pos) in ["i", "m", "s", "u", "x", "U"] + this.getChar(pos) in ["-", "i", "d", "m", "s", "u", "x", "U"] } /** @@ -499,7 +499,10 @@ abstract class RegexString extends StringLiteral { */ private predicate flagGroupStart(int start, int end) { this.flagGroupStartNoModes(start, _) and - end = max(int i | this.modeCharacter(start, i) | i + 1) + // Check if this is a non-capturing group with flags, and therefore the `:` should be excluded + exists(int maybeEnd | maybeEnd = max(int i | this.modeCharacter(start, i) | i + 1) | + if this.getChar(maybeEnd) = ":" then end = maybeEnd + 1 else end = maybeEnd + ) } /** @@ -510,9 +513,15 @@ abstract class RegexString extends StringLiteral { * ``` */ private predicate flag(string c) { - exists(int pos | - this.modeCharacter(_, pos) and - this.getChar(pos) = c + exists(int start, int pos | + this.modeCharacter(start, pos) and + this.getChar(pos) = c and + // Ignore if flag is disabled; use `<=` to also exclude `-` itself + // This does not properly handle the (contrived) case where a flag is both enabled and + // disabled, e.g. `(?i-i)a+`, in which case the flag seems to act as if it was disabled + not exists(int minusPos | + this.modeCharacter(start, minusPos) and this.getChar(minusPos) = "-" and minusPos <= pos + ) ) } @@ -524,6 +533,8 @@ abstract class RegexString extends StringLiteral { exists(string c | this.flag(c) | c = "i" and result = "IGNORECASE" or + c = "d" and result = "UNIXLINES" + or c = "m" and result = "MULTILINE" or c = "s" and result = "DOTALL" @@ -930,13 +941,13 @@ class Regex extends RegexString { /** * Gets a mode (if any) of this regular expression. 
Can be any of: - * DEBUG - * IGNORECASE - * MULTILINE - * DOTALL - * UNICODE - * VERBOSE - * UNICODECLASS + * - IGNORECASE + * - UNIXLINES + * - MULTILINE + * - DOTALL + * - UNICODE + * - VERBOSE + * - UNICODECLASS */ string getAMode() { result != "None" and @@ -946,7 +957,7 @@ class Regex extends RegexString { } /** - * Holds if this regex is used to match against a full string, + * Holds if this regex is used to match against a full string, * as though it was implicitly surrounded by ^ and $. */ predicate matchesFullString() { matches_full_string = true } diff --git a/java/ql/test/library-tests/regex/parser/RegexParseTests.expected b/java/ql/test/library-tests/regex/parser/RegexParseTests.expected index ad94d005289..03dc2261610 100644 --- a/java/ql/test/library-tests/regex/parser/RegexParseTests.expected +++ b/java/ql/test/library-tests/regex/parser/RegexParseTests.expected @@ -1,4 +1,8 @@ parseFailures +modes +| Test.java:17:9:17:37 | "(?i)(?=a)(?!b)(?<=c)(?