diff --git a/python/ql/lib/semmle/python/regexp/RegexTreeView.qll b/python/ql/lib/semmle/python/regexp/RegexTreeView.qll index a2952a5680b..897c97bb783 100644 --- a/python/ql/lib/semmle/python/regexp/RegexTreeView.qll +++ b/python/ql/lib/semmle/python/regexp/RegexTreeView.qll @@ -964,7 +964,7 @@ module Impl implements RegexTreeViewSig { * ``` */ class RegExpPositiveLookahead extends RegExpLookahead { - RegExpPositiveLookahead() { re.positiveLookaheadAssertionGroup(start, end) } + RegExpPositiveLookahead() { re.positiveLookaheadAssertionGroup(start, end, _, _) } override string getPrimaryQLClass() { result = "RegExpPositiveLookahead" } } @@ -979,7 +979,7 @@ module Impl implements RegexTreeViewSig { * ``` */ additional class RegExpNegativeLookahead extends RegExpLookahead { - RegExpNegativeLookahead() { re.negativeLookaheadAssertionGroup(start, end) } + RegExpNegativeLookahead() { re.negativeLookaheadAssertionGroup(start, end, _, _) } override string getPrimaryQLClass() { result = "RegExpNegativeLookahead" } } @@ -1006,7 +1006,7 @@ module Impl implements RegexTreeViewSig { * ``` */ class RegExpPositiveLookbehind extends RegExpLookbehind { - RegExpPositiveLookbehind() { re.positiveLookbehindAssertionGroup(start, end) } + RegExpPositiveLookbehind() { re.positiveLookbehindAssertionGroup(start, end, _, _) } override string getPrimaryQLClass() { result = "RegExpPositiveLookbehind" } } @@ -1021,7 +1021,7 @@ module Impl implements RegexTreeViewSig { * ``` */ additional class RegExpNegativeLookbehind extends RegExpLookbehind { - RegExpNegativeLookbehind() { re.negativeLookbehindAssertionGroup(start, end) } + RegExpNegativeLookbehind() { re.negativeLookbehindAssertionGroup(start, end, _, _) } override string getPrimaryQLClass() { result = "RegExpNegativeLookbehind" } } diff --git a/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll b/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll index 7e23554e058..d91c4bbd78c 100644 --- 
a/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll +++ b/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll @@ -554,9 +554,9 @@ class RegExp extends Expr instanceof StringLiteral { or this.negativeAssertionGroup(start, end) or - this.positiveLookaheadAssertionGroup(start, end) + this.positiveLookaheadAssertionGroup(start, end, _, _) or - this.positiveLookbehindAssertionGroup(start, end) + this.positiveLookbehindAssertionGroup(start, end, _, _) } /** Holds if an empty group is found between `start` and `end`. */ @@ -572,7 +572,7 @@ class RegExp extends Expr instanceof StringLiteral { or this.negativeAssertionGroup(start, end) or - this.positiveLookaheadAssertionGroup(start, end) + this.positiveLookaheadAssertionGroup(start, end, _, _) } private predicate emptyMatchAtEndGroup(int start, int end) { @@ -580,7 +580,7 @@ class RegExp extends Expr instanceof StringLiteral { or this.negativeAssertionGroup(start, end) or - this.positiveLookbehindAssertionGroup(start, end) + this.positiveLookbehindAssertionGroup(start, end, _, _) } private predicate negativeAssertionGroup(int start, int end) { @@ -593,32 +593,40 @@ class RegExp extends Expr instanceof StringLiteral { ) } - /** Holds if a negative lookahead is found between `start` and `end` */ - predicate negativeLookaheadAssertionGroup(int start, int end) { - exists(int in_start | this.negative_lookahead_assertion_start(start, in_start) | - this.groupContents(start, end, in_start, _) - ) + /** + * Holds if a negative lookahead is found between `start` and `end`, with contents + * between `in_start` and `in_end`. 
+ */ + predicate negativeLookaheadAssertionGroup(int start, int end, int in_start, int in_end) { + this.negative_lookahead_assertion_start(start, in_start) and + this.groupContents(start, end, in_start, in_end) } - /** Holds if a negative lookbehind is found between `start` and `end` */ - predicate negativeLookbehindAssertionGroup(int start, int end) { - exists(int in_start | this.negative_lookbehind_assertion_start(start, in_start) | - this.groupContents(start, end, in_start, _) - ) + /** + * Holds if a negative lookbehind is found between `start` and `end`, with contents + * between `in_start` and `in_end`. + */ + predicate negativeLookbehindAssertionGroup(int start, int end, int in_start, int in_end) { + this.negative_lookbehind_assertion_start(start, in_start) and + this.groupContents(start, end, in_start, in_end) } - /** Holds if a positive lookahead is found between `start` and `end` */ - predicate positiveLookaheadAssertionGroup(int start, int end) { - exists(int in_start | this.lookahead_assertion_start(start, in_start) | - this.groupContents(start, end, in_start, _) - ) + /** + * Holds if a positive lookahead is found between `start` and `end`, with contents + * between `in_start` and `in_end`. + */ + predicate positiveLookaheadAssertionGroup(int start, int end, int in_start, int in_end) { + this.lookahead_assertion_start(start, in_start) and + this.groupContents(start, end, in_start, in_end) } - /** Holds if a positive lookbehind is found between `start` and `end` */ - predicate positiveLookbehindAssertionGroup(int start, int end) { - exists(int in_start | this.lookbehind_assertion_start(start, in_start) | - this.groupContents(start, end, in_start, _) - ) + /** + * Holds if a positive lookbehind is found between `start` and `end`, with contents + * between `in_start` and `in_end`. 
+ */ + predicate positiveLookbehindAssertionGroup(int start, int end, int in_start, int in_end) { + this.lookbehind_assertion_start(start, in_start) and + this.groupContents(start, end, in_start, in_end) } private predicate group_start(int start, int end) { @@ -1049,6 +1057,13 @@ class RegExp extends Expr instanceof StringLiteral { or this.alternationOption(x, y, start, end) ) + or + // The item at the start of a lookbehind assertion's contents can potentially match the start of the string + ( + this.positiveLookbehindAssertionGroup(_, _, start, _) or + this.negativeLookbehindAssertionGroup(_, _, start, _) + ) and + this.item(start, end) } /** A part of the regex that may match the end of the string. */ @@ -1074,6 +1089,13 @@ class RegExp extends Expr instanceof StringLiteral { or this.alternationOption(x, y, start, end) ) + or + // The item at the end of a lookahead assertion's contents can potentially match the end of the string + ( + this.positiveLookaheadAssertionGroup(_, _, _, end) or + this.negativeLookaheadAssertionGroup(_, _, _, end) + ) and + this.item(start, end) } /** diff --git a/python/ql/src/change-notes/2025-09-19-fix-unmatchable-dollar-and-caret-in-assertions.md b/python/ql/src/change-notes/2025-09-19-fix-unmatchable-dollar-and-caret-in-assertions.md new file mode 100644 index 00000000000..cf63dd9ed4d --- /dev/null +++ b/python/ql/src/change-notes/2025-09-19-fix-unmatchable-dollar-and-caret-in-assertions.md @@ -0,0 +1,5 @@ +--- +category: minorAnalysis +--- + +- The queries that check for unmatchable `$` and `^` in regular expressions did not account correctly for occurrences inside lookahead and lookbehind assertions. These occurrences are now handled correctly, eliminating this source of false positives. 
diff --git a/python/ql/test/library-tests/regex/FirstLast.expected b/python/ql/test/library-tests/regex/FirstLast.expected index b187033ee22..0abf9c790c2 100644 --- a/python/ql/test/library-tests/regex/FirstLast.expected +++ b/python/ql/test/library-tests/regex/FirstLast.expected @@ -4,6 +4,7 @@ | (?!not-this)^[A-Z_]+$ | first | 12 | 13 | | (?!not-this)^[A-Z_]+$ | first | 13 | 19 | | (?!not-this)^[A-Z_]+$ | first | 13 | 20 | +| (?!not-this)^[A-Z_]+$ | last | 3 | 11 | | (?!not-this)^[A-Z_]+$ | last | 13 | 19 | | (?!not-this)^[A-Z_]+$ | last | 13 | 20 | | (?!not-this)^[A-Z_]+$ | last | 20 | 21 | @@ -101,6 +102,7 @@ | ^[A-Z_]+$(?