Improve handling of hex escapes; and support some named character classes

2026-02-28 21:03:50 +01:00 · 2022-02-21 16:39:59 +00:00
parent 5143585080
commit 57ba8a4d1b
3 changed files with 96 additions and 23 deletions
--- a/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll
+++ b/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll
@@ -55,7 +55,7 @@ class RegExpParent extends TRegExpParent {
  string toString() { result = "RegExpParent" }

  /** Gets the `i`th child term. */
-  abstract RegExpTerm getChild(int i);
+  RegExpTerm getChild(int i) { none() }

  /** Gets a child term . */
  RegExpTerm getAChild() { result = this.getChild(_) }
@@ -143,26 +143,6 @@ class RegExpTerm extends RegExpParent {
   */
  predicate isRootTerm() { start = 0 and end = re.getText().length() }

-  override RegExpTerm getChild(int i) {
-    result = this.(RegExpAlt).getChild(i)
-    or
-    result = this.(RegExpBackRef).getChild(i)
-    or
-    result = this.(RegExpCharacterClass).getChild(i)
-    or
-    result = this.(RegExpCharacterRange).getChild(i)
-    or
-    result = this.(RegExpNormalChar).getChild(i)
-    or
-    result = this.(RegExpGroup).getChild(i)
-    or
-    result = this.(RegExpQuantifier).getChild(i)
-    or
-    result = this.(RegExpSequence).getChild(i)
-    or
-    result = this.(RegExpSpecialChar).getChild(i)
-  }
-
  /**
   * Gets the parent term of this regular expression term, or the
   * regular expression literal if this is the root term.
@@ -508,7 +488,7 @@ class RegExpEscape extends RegExpNormalChar {
  /**
   * Holds if this is a unicode escape.
   */
-  private predicate isUnicode() { this.getText().matches("\\u%") }
+  private predicate isUnicode() { this.getText().matches(["\\u%", "\\x%"]) }

  /**
   * Gets the unicode char for this escape.
@@ -520,13 +500,24 @@ class RegExpEscape extends RegExpNormalChar {
    )
  }

+  /** Gets the part of this escape that is a hexidecimal string */
+  private string getHexString() {
+    this.isUnicode() and
+    if this.getText().matches("\\u%") // \uhhhh
+    then result = this.getText().suffix(2)
+    else
+      if this.getText().matches("\\x{%") // \x{h..h}
+      then result = this.getText().substring(3, this.getText().length() - 1)
+      else result = this.getText().suffix(2) // \xhh
+  }
+
  /**
   * Gets int value for the `index`th char in the hex number of the unicode escape.
   * E.g. for `\u0061` and `index = 2` this returns 96 (the number `6` interpreted as hex).
   */
  private int getHexValueFromUnicode(int index) {
    this.isUnicode() and
-    exists(string hex, string char | hex = this.getText().suffix(2) |
+    exists(string hex, string char | hex = this.getHexString() |
      char = hex.charAt(index) and
      result = 16.pow(hex.length() - index - 1) * toHex(char)
    )
@@ -574,6 +565,50 @@ class RegExpCharacterClassEscape extends RegExpEscape {
  override string getPrimaryQLClass() { result = "RegExpCharacterClassEscape" }
 }

+/**
+ * A named character class in a regular expression.
+ *
+ * Examples:
+ *
+ * ```
+ * \p{Digit}
+ * \p{IsLowerCase}
+ */
+class RegExpNamedProperty extends RegExpCharacterClassEscape {
+  boolean inverted;
+  string name;
+
+  RegExpNamedProperty() {
+    name = this.getValue().substring(2, this.getValue().length() - 1) and
+    (
+      inverted = false and
+      this.getValue().charAt(0) = "p"
+      or
+      inverted = true and
+      this.getValue().charAt(0) = "P"
+    )
+  }
+
+  /** Holds if this class is inverted. */
+  predicate isInverted() { inverted = true }
+
+  /** Gets the name of this class. */
+  string getClassName() { result = name }
+
+  /**
+   * Gets an equivalent single-chcracter escape sequence for this class (e.g. \d) if possible, excluding the escape character.
+   */
+  string getBackslashEquivalent() {
+    exists(string eq | if inverted = true then result = eq.toUpperCase() else result = eq |
+      name = ["Digit", "IsDigit"] and
+      eq = "d"
+      or
+      name = ["Space", "IsWhite_Space"] and
+      eq = "s"
+    )
+  }
+}
+
 /**
 * A character class in a regular expression.
 *