Merge pull request #14317 from yoff/python/fix-regex-string-part-locations

Python: Improve computation of regex fragments inside string parts
2025-12-21 11:16:30 +01:00 · 2023-09-28 14:35:27 +02:00
parent eb2db59127 8ade9ed164
commit bc17bf69f4
5 changed files with 78 additions and 5 deletions
--- a/python/ql/lib/change-notes/2023-09-26-regex-locations-in-string-parts.md
+++ b/python/ql/lib/change-notes/2023-09-26-regex-locations-in-string-parts.md
@@ -0,0 +1,4 @@
+---
+category: minorAnalysis
+---
+* Regular expression fragments residing inside implicitly concatenated strings now have better location information.
--- a/python/ql/lib/semmle/python/AstExtended.qll
+++ b/python/ql/lib/semmle/python/AstExtended.qll
@@ -154,6 +154,28 @@ class StringPart extends StringPart_, AstNode {
  override string toString() { result = StringPart_.super.toString() }

  override Location getLocation() { result = StringPart_.super.getLocation() }
+
+  /**
+   * Holds if the content of this `StringPart` is preceded by
+   * a prefix (including the opening quote) of length `prefixLength` and
+   * followed by a closing quote of length `quoteLength`.
+   */
+  predicate contextSize(int prefixLength, int quoteLength) {
+    exists(int occurrenceOffset |
+      quoteLength = this.getText().regexpFind("\"{3}|\"{1}|'{3}|'{1}", 0, occurrenceOffset).length() and
+      prefixLength = occurrenceOffset + quoteLength
+    )
+  }
+
+  /**
+   * Gets the length of the content, that is the text between the prefix and the quote.
+   * See `contextSize` for the lengths of the prefix and the quote.
+   */
+  int getContentLength() {
+    exists(int prefixLength, int quoteLength | this.contextSize(prefixLength, quoteLength) |
+      result = this.getText().length() - prefixLength - quoteLength
+    )
+  }
 }

 class StringPartList extends StringPartList_ { }
--- a/python/ql/lib/semmle/python/regexp/RegexTreeView.qll
+++ b/python/ql/lib/semmle/python/regexp/RegexTreeView.qll
@@ -223,16 +223,55 @@ module Impl implements RegexTreeViewSig {
     */
    Location getLocation() { result = re.getLocation() }

+    /** Gets the total content length of the string parts with index below `index`, if any. */
+    private int getPartOffset(int index) {
+      index = 0 and result = 0
+      or
+      index > 0 and
+      exists(int previousOffset | previousOffset = this.getPartOffset(index - 1) |
+        result =
+          previousOffset + re.(StrConst).getImplicitlyConcatenatedPart(index - 1).getContentLength()
+      )
+    }
+
+    /**
+     * Gets the `StringPart` in which this `RegExpTerm` resides, if any.
+     * `localOffset` is the offset of this `RegExpTerm` from the start of `result`, prefix included.
+     */
+    StringPart getPart(int localOffset) {
+      exists(int index, int prefixLength | index = max(int i | this.getPartOffset(i) <= start) |
+        result = re.(StrConst).getImplicitlyConcatenatedPart(index) and
+        result.contextSize(prefixLength, _) and
+        // Example:
+        // re.compile('...' r"""...this..""")
+        // - `start` is the offset from `(` to `this` as counted after concatenating all parts.
+        // - we subtract the length of the previous `StringPart`s, `'...'`, to know how far into this `StringPart` we go.
+        // - as the prefix 'r"""' is part of the `StringPart`, `this` is found that much further in.
+        localOffset = start - this.getPartOffset(index) + prefixLength
+      )
+    }
+
    /** Holds if this term is found at the specified location offsets. */
    predicate hasLocationInfo(
      string filepath, int startline, int startcolumn, int endline, int endcolumn
    ) {
+      not exists(this.getPart(_)) and
      exists(int re_start, int prefix_len | prefix_len = re.getPrefix().length() |
-        re.getLocation().hasLocationInfo(filepath, startline, re_start, endline, _) and
+        re.getLocation().hasLocationInfo(filepath, startline, re_start, _, _) and
        startcolumn = re_start + start + prefix_len and
+        endline = startline and
        endcolumn = re_start + end + prefix_len - 1
        /* inclusive vs exclusive */
      )
+      or
+      exists(StringPart part, int localOffset, int partStartColumn |
+        part = this.getPart(localOffset)
+      |
+        part.getLocation().hasLocationInfo(filepath, startline, partStartColumn, _, _) and
+        startcolumn = partStartColumn + localOffset and
+        endline = startline and
+        endcolumn = (end - start) + startcolumn
+      )
    }

    /** Gets the file in which this term is found. */
--- a/python/ql/test/library-tests/regexparser/locations.py
+++ b/python/ql/test/library-tests/regexparser/locations.py
@@ -50,23 +50,31 @@ br'''[this] is a test'''
 )

 # plain string with multiple parts
-re.compile( # $ location=1:2 SPURIOUS:location=1:23 MISSING:location=1:26
+re.compile( # $ location=1:2 location=1:26
 '[this] is a test' ' and [this] is another test'
 )

 # plain string with multiple parts across lines
-re.compile( # $ location=1:2 SPURIOUS:location=1:23 MISSING:location=2:7
+re.compile( # $ location=1:2 location=2:7 location=3:2
 '[this] is a test'
 ' and [this] is another test'
+'[this] comes right at the start of a part'
 )

 # plain string with multiple parts across lines and comments
-re.compile( # $ location=1:2 SPURIOUS:location=1:23 MISSING:location=3:7
+re.compile( # $ location=1:2 location=3:7
 '[this] is a test'
 # comment
 ' and [this] is another test'
 )

+# multiple parts of different kinds
+re.compile( # $ location=1:2 location=1:28 location=2:11 location=3:8
+'[this] is a test' ''' and [this] is another test'''
+br""" and [this] is yet another test"""
+r' and [this] is one more'
+)
+
 # actual multiline string
 re.compile( # $ SPURIOUS:location=1:6 location=1:27 MISSING:location=2:1 location=3:5
 r'''
--- a/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected
+++ b/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected
@@ -1,5 +1,5 @@
 | KnownCVEs.py:15:20:15:22 | \\d+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
-| KnownCVEs.py:30:21:31:22 | .* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of ','. |
+| KnownCVEs.py:30:21:30:23 | .* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of ','. |
 | KnownCVEs.py:35:18:35:81 | ([-/:,#%.'"\\s!\\w]\|\\w-\\w\|'[\\s\\w]+'\\s*\|"[\\s\\w]+"\|\\([\\d,%\\.\\s]+\\))* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '"\\t"'. |
 | redos.py:6:28:6:42 | (?:__\|[\\s\\S])+? | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '__'. |
 | redos.py:6:52:6:68 | (?:\\*\\*\|[\\s\\S])+? | This part of the regular expression may cause exponential backtracking on strings starting with '*' and containing many repetitions of '**'. |