Python: inline test of regex components

- Added naive implementation of `charRange` so the test can run. - Made predicates public as needed.
2026-07-20 18:58:36 +02:00 · 2021-06-28 13:21:02 +02:00
parent a1c38b78a9
commit e5f07cc4d3
7 changed files with 190 additions and 3 deletions
--- a/python/ql/src/semmle/python/regex.qll
+++ b/python/ql/src/semmle/python/regex.qll
@@ -143,6 +143,26 @@ abstract class RegexString extends Expr {
    )
  }

+  /**
+   * Holds if the character set opening at `charset_start` contains a range whose
+   * lower bound spans `start` to `lower_end` and whose upper bound spans
+   * `upper_start` to `end`. Only bounds accepted by `simpleCharacter` are found.
+   */
+  predicate charRange(int charset_start, int start, int lower_end, int upper_start, int end) {
+    exists(int set_inner, int set_end |
+      this.char_set_start(charset_start, set_inner) and
+      this.charSet(charset_start, set_end)
+    |
+      // each bound is one `simpleCharacter`; they are joined by a single unescaped `-`
+      this.simpleCharacter(start, lower_end) and
+      this.nonEscapedCharAt(lower_end) = "-" and
+      upper_start = lower_end + 1 and
+      this.simpleCharacter(upper_start, end) and
+      start >= set_inner and
+      set_end > end
+    )
+  }
+
  predicate escapingChar(int pos) { this.escaping(pos) = true }

  private boolean escaping(int pos) {
@@ -192,7 +212,12 @@ abstract class RegexString extends Expr {
    not exists(int i | start + 2 < i and i < end - 1 | this.getChar(i) = "}")
  }

-  private predicate escapedCharacter(int start, int end) {
+  /**
+   * Holds if an escaped character is found between `start` and `end`.
+   * Escaped characters include hex values, octal values and named escapes,
+   * but exclude backreferences.
+   */
+  predicate escapedCharacter(int start, int end) {
    this.escapingChar(start) and
    not exists(this.getText().substring(start + 1, end + 1).toInt()) and
    (
@@ -221,10 +246,9 @@ abstract class RegexString extends Expr {
    exists(int x, int y | this.charSet(x, y) and index in [x + 1 .. y - 2])
  }

-  /*
+  /**
   * 'simple' characters are any that don't alter the parsing of the regex.
   */
-
  private predicate simpleCharacter(int start, int end) {
    end = start + 1 and
    not this.charSet(start, _) and
--- a/python/ql/test/library-tests/regex/SubstructureTests.expected
+++ b/python/ql/test/library-tests/regex/SubstructureTests.expected
--- a/python/ql/test/library-tests/regex/SubstructureTests.ql
+++ b/python/ql/test/library-tests/regex/SubstructureTests.ql
@@ -0,0 +1,75 @@
+import python
+import TestUtilities.InlineExpectationsTest
+private import semmle.python.regex
+
+/**
+ * Inline-expectation test for `Regex.charSet`: for every regex in `charSetTest.py`
+ * it reports each character set under the tag `charSet` with value `start:end`.
+ */
+class CharacterSetTest extends InlineExpectationsTest {
+  CharacterSetTest() { this = "CharacterSetTest" }
+
+  override string getARelevantTag() { result = "charSet" }
+
+  override predicate hasActualResult(Location location, string element, string tag, string value) {
+    exists(location.getFile().getRelativePath()) and
+    location.getFile().getBaseName() = "charSetTest.py" and
+    exists(Regex re, int start, int end |
+      re.charSet(start, end) and
+      location = re.getLocation() and
+      // `start`/`end` are offsets into the regex text, so slice `getText()`.
+      // Slicing `toString()` can have no result when that rendering is shorter than
+      // `end`, which silently drops the whole row from the test output.
+      // NOTE(review): rows that were dropped before are now reported; revisit the
+      // `MISSING:` annotations in the fixture accordingly.
+      element = re.getText().substring(start, end) and
+      value = start + ":" + end and
+      tag = "charSet"
+    )
+  }
+}
+
+/**
+ * Inline-expectation test for `Regex.charRange`: for every regex in `charRangeTest.py`
+ * it reports each character range under the tag `charRange` with value
+ * `start:lower_end-upper_start:end`.
+ */
+class CharacterRangeTest extends InlineExpectationsTest {
+  CharacterRangeTest() { this = "CharacterRangeTest" }
+
+  override string getARelevantTag() { result = "charRange" }
+
+  override predicate hasActualResult(Location location, string element, string tag, string value) {
+    exists(location.getFile().getRelativePath()) and
+    location.getFile().getBaseName() = "charRangeTest.py" and
+    exists(Regex re, int start, int lower_end, int upper_start, int end |
+      re.charRange(_, start, lower_end, upper_start, end) and
+      location = re.getLocation() and
+      // `start`/`end` are offsets into the regex text, so slice `getText()`.
+      // Slicing `toString()` can have no result when that rendering is shorter than
+      // `end`, which silently drops the whole row from the test output.
+      // NOTE(review): rows that were dropped before are now reported; revisit the
+      // `MISSING:` annotations in the fixture accordingly.
+      element = re.getText().substring(start, end) and
+      value = start + ":" + lower_end + "-" + upper_start + ":" + end and
+      tag = "charRange"
+    )
+  }
+}
+
+/**
+ * Inline-expectation test for `Regex.escapedCharacter`: for every regex in
+ * `escapedCharacterTest.py` it reports each escaped character under the tag
+ * `escapedCharacter` with value `start:end`.
+ */
+class EscapeTest extends InlineExpectationsTest {
+  EscapeTest() { this = "EscapeTest" }
+
+  override string getARelevantTag() { result = "escapedCharacter" }
+
+  override predicate hasActualResult(Location location, string element, string tag, string value) {
+    exists(location.getFile().getRelativePath()) and
+    location.getFile().getBaseName() = "escapedCharacterTest.py" and
+    exists(Regex re, int start, int end |
+      re.escapedCharacter(start, end) and
+      location = re.getLocation() and
+      // `start`/`end` are offsets into the regex text, so slice `getText()`.
+      // Slicing `toString()` can have no result when that rendering is shorter than
+      // `end`, which silently drops the whole row from the test output.
+      // NOTE(review): rows that were dropped before are now reported; revisit the
+      // `MISSING:` annotations in the fixture accordingly.
+      element = re.getText().substring(start, end) and
+      value = start + ":" + end and
+      tag = "escapedCharacter"
+    )
+  }
+}
+
+/**
+ * Inline-expectation test for `Regex.group`: for every regex in `groupTest.py`
+ * it reports each group under the tag `group` with value `start:end`.
+ */
+class GroupTest extends InlineExpectationsTest {
+  GroupTest() { this = "GroupTest" }
+
+  override string getARelevantTag() { result = "group" }
+
+  override predicate hasActualResult(Location location, string element, string tag, string value) {
+    exists(location.getFile().getRelativePath()) and
+    location.getFile().getBaseName() = "groupTest.py" and
+    exists(Regex re, int start, int end |
+      re.group(start, end) and
+      location = re.getLocation() and
+      // `start`/`end` are offsets into the regex text, so slice `getText()`.
+      // Slicing `toString()` can have no result when that rendering is shorter than
+      // `end`, which silently drops the whole row from the test output.
+      // NOTE(review): rows that were dropped before are now reported; revisit the
+      // `MISSING:` annotations in the fixture accordingly.
+      element = re.getText().substring(start, end) and
+      value = start + ":" + end and
+      tag = "group"
+    )
+  }
+}
--- a/python/ql/test/library-tests/regex/charRangeTest.py
+++ b/python/ql/test/library-tests/regex/charRangeTest.py
@@ -0,0 +1,32 @@
+import re
+
+re.compile(r'[]-[]') #$ MISSING: charRange=1:2-3:4
+re.compile(r'[---]') #$ MISSING: charRange=1:2-3:4
+re.compile(r'[\---]') #$ MISSING: charRange=1:3-4:5
+re.compile(r'[--\-]') #$ MISSING: charRange=1:2-3:5
+re.compile(r'[\--\-]') #$ cMISSING: harRange=1:3-4:6
+re.compile(r'[0-9-A-Z]') #$ MISSING: charRange=1:2-3:4 charRange=5:6-7:8
+re.compile(r'[0\-9-A-Z]') #$ MISSING: charRange=4:5-6:7
+re.compile(r'[0--9-A-Z]') #$ MISSING: charRange=1:2-3:4 charRange=4:5-6:7
+
+re.compile(r'[^A-Z]') #$ MISSING: charRange=2:3-4:5
+
+re.compile(r'[\0-\09]') #$ MISSING: charRange=1:3-4:7
+
+re.compile(r'[\0123-5]') #$ MISSING: charRange=5:6-7:8
+
+
+#Negative lookahead
+re.compile(r'(?!not-this)^[A-Z_]+$') #$ MISSING: charRange=14:15-16:17
+#Negative lookbehind
+re.compile(r'^[A-Z_]+$(?<!not-this)') #$ MISSING: charRange=2:3-4:5
+
+
+#OK -- ODASA-ODASA-3968
+re.compile('(?:[^%]|^)?%\((\w*)\)[a-z]') #$ MISSING: charRange=22:23-24:25
+
+#ODASA-3985
+#Half Surrogate pairs
+re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]') #$ MISSING: charRange=1:2-3:4 charRange=6:7-8:9
+#Outside BMP
+re.compile(u'[\U00010000-\U0010ffff]') #$ MISSING: charRange=1:2-3:4
--- a/python/ql/test/library-tests/regex/charSetTest.py
+++ b/python/ql/test/library-tests/regex/charSetTest.py
@@ -0,0 +1,29 @@
+# Fixtures for the character-set inline-expectation test (tag `charSet`).
+# Each annotation value is `start:end`: offsets into the regex text of one
+# character set, from its opening `[` up to just past its closing `]`.
+# `MISSING:` presumably marks results the test does not currently report -- confirm
+# against the InlineExpectationsTest conventions.
+import re
+re.compile(r'\A[+-]?\d+') #$ MISSING: charSet=2:6
+re.compile(r'(?P<name>[\w]+)|') #$ MISSING: charSet=9:13
+re.compile(r'\|\[\][123]|\{\}') #$ MISSING: charSet=6:11
+re.compile(r'[^A-Z]') #$ MISSING: charSet=0:6
+re.compile("[]]") #$ charSet=0:3
+re.compile("[][]") #$ MISSING: charSet=0:4
+re.compile("[^][^]") #$ MISSING: charSet=0:6
+re.compile("[.][.]") #$ charSet=0:3 MISSING: charSet=3:6
+re.compile("[[]]") #$ charSet=0:3
+re.compile("[^]]") #$ MISSING: charSet=0:4
+re.compile("[^-]") #$ MISSING: charSet=0:4
+re.compile("[]-[]") #$ MISSING: charSet=0:5
+re.compile("[^]-[]") #$ MISSING: charSet=0:6
+
+re.compile("]]][[[[]") #$ MISSING: charSet=3:8
+
+
+#ODASA-3985
+#Half Surrogate pairs
+re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]') #$ MISSING: charSet=0:5 charSet=5:10
+#Outside BMP
+re.compile(u'[\U00010000-\U0010ffff]') #$ MISSING: charSet=0:5
+
+#Misparsed on LGTM
+re.compile(r"\[(?P<txt>[^[]*)\]\((?P<uri>[^)]*)") #$ MISSING: charSet=10:14 charSet=28:32
+
+ # parses wrongly, sees this   \|/ as a char set start
+re.compile(r'''(?:[\s;,"'<>(){}|[\]@=+*]|:(?![/\\]))+''') #$ MISSING: charSet=3:25 charSet=30:35
--- a/python/ql/test/library-tests/regex/escapedCharacterTest.py
+++ b/python/ql/test/library-tests/regex/escapedCharacterTest.py
@@ -0,0 +1,23 @@
+# Fixtures for the escaped-character inline-expectation test (tag `escapedCharacter`).
+# Each annotation value is `start:end`: offsets into the regex text of one escape
+# sequence. Lines marked "not escape(s)" use non-raw literals, so Python has already
+# resolved the backslash sequence before the regex text is formed.
+# `MISSING:` presumably marks results the test does not currently report -- confirm
+# against the InlineExpectationsTest conventions.
+import re
+
+re.compile(r'\b') #$ escapedCharacter=0:2
+re.compile(r'''\b''') #$ escapedCharacter=0:2
+re.compile(r"\b") #$ escapedCharacter=0:2
+re.compile(u"\b") # not escape
+re.compile("\b") # not escape
+re.compile(r'\\\b') #$ escapedCharacter=0:2 MISSING: escapedCharacter=2:4
+re.compile(r'[\---]') #$ escapedCharacter=1:3
+re.compile(r'[--\-]') #$ MISSING: escapedCharacter=3:5
+re.compile(r'[\--\-]') #$ escapedCharacter=1:3 MISSING: escapedCharacter=4:6
+re.compile(r'[0\-9-A-Z]') #$ MISSING: escapedCharacter=2:4
+re.compile(r'[\0-\09]') #$ escapedCharacter=1:3 MISSING: escapedCharacter=4:7
+re.compile(r'[\0123-5]') #$ MISSING: escapedCharacter=1:5
+
+#ODASA-3985
+#Half Surrogate pairs
+re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]') # not escapes
+#Outside BMP
+re.compile(u'[\U00010000-\U0010ffff]') # not escapes
+
+#Misparsed on LGTM
+re.compile(r"\[(?P<txt>[^[]*)\]\((?P<uri>[^)]*)") #$ escapedCharacter=0:2 MISSING: escapedCharacter=16:18 escapedCharacter=18:20
--- a/python/ql/test/library-tests/regex/groupTest.py
+++ b/python/ql/test/library-tests/regex/groupTest.py
@@ -0,0 +1,4 @@
+# Fixtures for the group inline-expectation test (tag `group`).
+# Each annotation value is `start:end`: offsets into the regex text of one group,
+# from its opening `(` up to just past its closing `)`.
+# `MISSING:` presumably marks results the test does not currently report -- confirm
+# against the InlineExpectationsTest conventions.
+import re
+
+re.compile(r'(?P<first>\w+) (?P<second>\w+)') #$ MISSING: group=0:14 group=15:30
+re.compile(r'([)(])') #$ MISSING: group=0:6