Python: Fix false positive for unmatchable dollar/caret

Our previous modelling did not account for the fact that a lookahead can
potentially extend all the way to the end of the input (and similarly,
that a lookbehind can extend all the way to the beginning).

To fix this, I extended `firstPart` and `lastPart` to handle lookbehinds
and lookaheads correctly, and added some test cases (all of which yield
no new results).

Fixes #20429.
This commit is contained in:
Taus
2025-09-19 15:06:46 +00:00
parent c1c0828082
commit 95a84ad655
4 changed files with 64 additions and 29 deletions

View File

@@ -964,7 +964,7 @@ module Impl implements RegexTreeViewSig {
* ```
*/
class RegExpPositiveLookahead extends RegExpLookahead {
RegExpPositiveLookahead() { re.positiveLookaheadAssertionGroup(start, end) }
RegExpPositiveLookahead() { re.positiveLookaheadAssertionGroup(start, end, _, _) }
override string getPrimaryQLClass() { result = "RegExpPositiveLookahead" }
}
@@ -979,7 +979,7 @@ module Impl implements RegexTreeViewSig {
* ```
*/
additional class RegExpNegativeLookahead extends RegExpLookahead {
RegExpNegativeLookahead() { re.negativeLookaheadAssertionGroup(start, end) }
RegExpNegativeLookahead() { re.negativeLookaheadAssertionGroup(start, end, _, _) }
override string getPrimaryQLClass() { result = "RegExpNegativeLookahead" }
}
@@ -1006,7 +1006,7 @@ module Impl implements RegexTreeViewSig {
* ```
*/
class RegExpPositiveLookbehind extends RegExpLookbehind {
RegExpPositiveLookbehind() { re.positiveLookbehindAssertionGroup(start, end) }
RegExpPositiveLookbehind() { re.positiveLookbehindAssertionGroup(start, end, _, _) }
override string getPrimaryQLClass() { result = "RegExpPositiveLookbehind" }
}
@@ -1021,7 +1021,7 @@ module Impl implements RegexTreeViewSig {
* ```
*/
additional class RegExpNegativeLookbehind extends RegExpLookbehind {
RegExpNegativeLookbehind() { re.negativeLookbehindAssertionGroup(start, end) }
RegExpNegativeLookbehind() { re.negativeLookbehindAssertionGroup(start, end, _, _) }
override string getPrimaryQLClass() { result = "RegExpNegativeLookbehind" }
}

View File

@@ -554,9 +554,9 @@ class RegExp extends Expr instanceof StringLiteral {
or
this.negativeAssertionGroup(start, end)
or
this.positiveLookaheadAssertionGroup(start, end)
this.positiveLookaheadAssertionGroup(start, end, _, _)
or
this.positiveLookbehindAssertionGroup(start, end)
this.positiveLookbehindAssertionGroup(start, end, _, _)
}
/** Holds if an empty group is found between `start` and `end`. */
@@ -572,7 +572,7 @@ class RegExp extends Expr instanceof StringLiteral {
or
this.negativeAssertionGroup(start, end)
or
this.positiveLookaheadAssertionGroup(start, end)
this.positiveLookaheadAssertionGroup(start, end, _, _)
}
private predicate emptyMatchAtEndGroup(int start, int end) {
@@ -580,7 +580,7 @@ class RegExp extends Expr instanceof StringLiteral {
or
this.negativeAssertionGroup(start, end)
or
this.positiveLookbehindAssertionGroup(start, end)
this.positiveLookbehindAssertionGroup(start, end, _, _)
}
private predicate negativeAssertionGroup(int start, int end) {
@@ -593,32 +593,40 @@ class RegExp extends Expr instanceof StringLiteral {
)
}
/** Holds if a negative lookahead is found between `start` and `end` */
predicate negativeLookaheadAssertionGroup(int start, int end) {
exists(int in_start | this.negative_lookahead_assertion_start(start, in_start) |
this.groupContents(start, end, in_start, _)
)
/**
* Holds if a negative lookahead is found between `start` and `end`, with contents
* between `in_start` and `in_end`.
*/
predicate negativeLookaheadAssertionGroup(int start, int end, int in_start, int in_end) {
this.negative_lookahead_assertion_start(start, in_start) and
this.groupContents(start, end, in_start, in_end)
}
/** Holds if a negative lookbehind is found between `start` and `end` */
predicate negativeLookbehindAssertionGroup(int start, int end) {
exists(int in_start | this.negative_lookbehind_assertion_start(start, in_start) |
this.groupContents(start, end, in_start, _)
)
/**
* Holds if a negative lookbehind is found between `start` and `end`, with contents
* between `in_start` and `in_end`.
*/
predicate negativeLookbehindAssertionGroup(int start, int end, int in_start, int in_end) {
this.negative_lookbehind_assertion_start(start, in_start) and
this.groupContents(start, end, in_start, in_end)
}
/** Holds if a positive lookahead is found between `start` and `end` */
predicate positiveLookaheadAssertionGroup(int start, int end) {
exists(int in_start | this.lookahead_assertion_start(start, in_start) |
this.groupContents(start, end, in_start, _)
)
/**
* Holds if a positive lookahead is found between `start` and `end`, with contents
* between `in_start` and `in_end`.
*/
predicate positiveLookaheadAssertionGroup(int start, int end, int in_start, int in_end) {
this.lookahead_assertion_start(start, in_start) and
this.groupContents(start, end, in_start, in_end)
}
/** Holds if a positive lookbehind is found between `start` and `end` */
predicate positiveLookbehindAssertionGroup(int start, int end) {
exists(int in_start | this.lookbehind_assertion_start(start, in_start) |
this.groupContents(start, end, in_start, _)
)
/**
* Holds if a positive lookbehind is found between `start` and `end`, with contents
* between `in_start` and `in_end`.
*/
predicate positiveLookbehindAssertionGroup(int start, int end, int in_start, int in_end) {
this.lookbehind_assertion_start(start, in_start) and
this.groupContents(start, end, in_start, in_end)
}
private predicate group_start(int start, int end) {
@@ -1049,6 +1057,13 @@ class RegExp extends Expr instanceof StringLiteral {
or
this.alternationOption(x, y, start, end)
)
or
// Lookbehind assertions can potentially match the start of the string
(
this.positiveLookbehindAssertionGroup(_, _, start, _) or
this.negativeLookbehindAssertionGroup(_, _, start, _)
) and
this.item(start, end)
}
/** A part of the regex that may match the end of the string. */
@@ -1074,6 +1089,13 @@ class RegExp extends Expr instanceof StringLiteral {
or
this.alternationOption(x, y, start, end)
)
or
// Lookahead assertions can potentially match the end of the string
(
this.positiveLookaheadAssertionGroup(_, _, _, end) or
this.negativeLookaheadAssertionGroup(_, _, _, end)
) and
this.item(start, end)
}
/**

View File

@@ -0,0 +1,5 @@
---
category: minorAnalysis
---
- The queries that check for unmatchable `$` and `^` in regular expressions did not account correctly for occurrences inside lookahead and lookbehind assertions. These occurrences are now handled correctly, eliminating this source of false positives.

View File

@@ -150,4 +150,12 @@ re.compile(r"[\U00010000-\U0010FFFF]")
re.compile(r"[\u0000-\uFFFF]")
#Allow unicode names
re.compile(r"[\N{degree sign}\N{EM DASH}]")
re.compile(r"[\N{degree sign}\N{EM DASH}]")
#Lookahead assertions. None of these are unmatchable dollars:
re.compile(r"^(?=a$)[ab]")
re.compile(r"^(?!a$)[ab]")
#Lookbehind assertions. None of these are unmatchable carets:
re.compile(r"(?<=^a)a")
re.compile(r"(?<!^a)a")