Correctly account for length of string literal prefix when computing locations for RegExpTerms.

2025-12-24 04:36:35 +01:00 · 2023-09-22 10:22:12 +01:00
parent d4ff9c8ed1
commit 6f67055852
5 changed files with 44 additions and 28 deletions
--- a/python/ql/lib/change-notes/2023-09-22-regex-prefix.md
+++ b/python/ql/lib/change-notes/2023-09-22-regex-prefix.md
@@ -0,0 +1,4 @@
+---
+category: fix
+---
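+* Subterms of regular expressions encoded as single-line string literals now have more accurate source locations: the length of the string literal's prefix (for example `r'` or `r"""`) is now taken into account when computing columns.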
--- a/python/ql/lib/semmle/python/regexp/RegexTreeView.qll
+++ b/python/ql/lib/semmle/python/regexp/RegexTreeView.qll
@@ -227,10 +227,11 @@ module Impl implements RegexTreeViewSig {
    predicate hasLocationInfo(
      string filepath, int startline, int startcolumn, int endline, int endcolumn
    ) {
-      exists(int re_start |
+      exists(int re_start, int prefix_len | prefix_len = re.getPrefix().length() |
        re.getLocation().hasLocationInfo(filepath, startline, re_start, endline, _) and
-        startcolumn = re_start + start + 4 and
-        endcolumn = re_start + end + 3
+        startcolumn = re_start + start + prefix_len and
+        endcolumn = re_start + end + prefix_len - 1
+        /* `- 1` because `end` is an exclusive offset, whereas `endcolumn` is inclusive */
      )
    }

--- a/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll
+++ b/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll
@@ -101,7 +101,7 @@ private module FindRegexMode {
 }

 /**
- * DEPRECATED: Use `Regex` instead.
+ * DEPRECATED: Use `RegExp` instead.
 */
 deprecated class Regex = RegExp;

@@ -327,6 +327,17 @@ class RegExp extends Expr instanceof StrConst {
  /** Gets the text of this regex */
  string getText() { result = super.getText() }

+  /**
+   * Gets the prefix of this regex, that is, any string-literal prefix characters together with the opening quote(s).
+   *
+   * Examples:
+   *
+   *   - The prefix of `'x*y'` is `'`.
+   *   - The prefix of `r''` is `r'`.
+   *   - The prefix of `r"""x*y"""` is `r"""`.
+   */
+  string getPrefix() { result = super.getPrefix() }
+
  /** Gets the `i`th character of this regex */
  string getChar(int i) { result = this.getText().charAt(i) }

--- a/python/ql/test/library-tests/regexparser/Locations.expected
+++ b/python/ql/test/library-tests/regexparser/Locations.expected
@@ -1,20 +1,20 @@
-| locations.py | 14 | 5 |
-| locations.py | 19 | 5 |
-| locations.py | 24 | 5 |
-| locations.py | 29 | 5 |
-| locations.py | 34 | 5 |
+| locations.py | 14 | 2 |
+| locations.py | 19 | 3 |
+| locations.py | 24 | 3 |
+| locations.py | 29 | 4 |
+| locations.py | 34 | 4 |
 | locations.py | 39 | 5 |
 | locations.py | 44 | 5 |
-| locations.py | 49 | 5 |
-| locations.py | 54 | 5 |
-| locations.py | 54 | 26 |
-| locations.py | 59 | 5 |
-| locations.py | 59 | 26 |
-| locations.py | 65 | 5 |
-| locations.py | 65 | 26 |
+| locations.py | 49 | 6 |
+| locations.py | 54 | 2 |
+| locations.py | 54 | 23 |
+| locations.py | 59 | 2 |
+| locations.py | 59 | 23 |
+| locations.py | 65 | 2 |
+| locations.py | 65 | 23 |
 | locations.py | 72 | 6 |
 | locations.py | 72 | 27 |
-| locations.py | 80 | 6 |
-| locations.py | 85 | 7 |
-| locations.py | 90 | 5 |
-| locations.py | 90 | 26 |
+| locations.py | 80 | 3 |
+| locations.py | 85 | 5 |
+| locations.py | 90 | 2 |
+| locations.py | 90 | 23 |
--- a/python/ql/test/library-tests/regexparser/locations.py
+++ b/python/ql/test/library-tests/regexparser/locations.py
@@ -6,8 +6,8 @@ import re
 # regexp term `[this]`, appearing in various kinds of regexps.
 #
 # To make the location information easier to understand, we generally put each
-# regexp on its own line, even though this is not the way one would normally
-# write regexps in Python.
+# regexp on its own line, even though this is not idiomatic Python.
+# Comments indicate cases we currently do not handle correctly.

 # plain string
 re.compile(
@@ -49,25 +49,25 @@ re.compile(
 br'''[this] is a test'''
 )

-# plain string with multiple parts
+# plain string with multiple parts (second [this] gets wrong column: 23 instead of 26)
 re.compile(
 '[this] is a test' ' and [this] is another test'
 )

-# plain string with multiple parts across lines
+# plain string with multiple parts across lines (second [this] gets wrong location: 59:23 instead of 60:7)
 re.compile(
 '[this] is a test'
 ' and [this] is another test'
 )

-# plain string with multiple parts across lines and comments
+# plain string with multiple parts across lines and comments (second [this] gets wrong location: 65:23 instead of 67:7)
 re.compile(
 '[this] is a test'
 # comment
 ' and [this] is another test'
 )

-# actual multiline string
+# actual multiline string (both [this]s get wrong location: 72:6 and 72:27 instead of 73:1 and 74:5)
 re.compile(
 r'''
 [this] is a test
@@ -75,7 +75,7 @@ and [this] is another test
 '''
 )

-# plain string with escape sequences
+# plain string with escape sequences ([this] gets wrong location: 80:3 instead of 80:4)
 re.compile(
 '\t[this] is a test'
 )
@@ -85,7 +85,7 @@ re.compile(
 r'\A[this] is a test'
 )

-# plain string with escaped newline
+# plain string with escaped newline (second [this] gets wrong location: 90:23 instead of 91:6)
 re.compile(
 '[this] is a test\
 and [this] is another test'