From 57ba8a4d1b9d4f8a1c112834191fba5dc593ea5e Mon Sep 17 00:00:00 2001
From: Joe Farebrother <joefarebrother@github.com>
Date: Mon, 21 Feb 2022 16:39:59 +0000
Subject: [PATCH] Improve handling of hex escapes; and support some named
 character classes

---
 .../semmle/code/java/regex/RegexTreeView.qll  | 81 +++++++++++++------
 .../security/performance/RegExpTreeView.qll   |  2 +
 .../security/CWE-730/ExpRedosTest.java        | 36 +++++++++
 3 files changed, 96 insertions(+), 23 deletions(-)

diff --git a/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll b/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll
index f7e85fe3edf..04bda79e9ad 100644
--- a/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll
+++ b/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll
@@ -55,7 +55,7 @@ class RegExpParent extends TRegExpParent {
   string toString() { result = "RegExpParent" }
 
   /** Gets the `i`th child term. */
-  abstract RegExpTerm getChild(int i);
+  RegExpTerm getChild(int i) { none() }
 
   /** Gets a child term . */
   RegExpTerm getAChild() { result = this.getChild(_) }
@@ -143,26 +143,6 @@ class RegExpTerm extends RegExpParent {
    */
   predicate isRootTerm() { start = 0 and end = re.getText().length() }
 
-  override RegExpTerm getChild(int i) {
-    result = this.(RegExpAlt).getChild(i)
-    or
-    result = this.(RegExpBackRef).getChild(i)
-    or
-    result = this.(RegExpCharacterClass).getChild(i)
-    or
-    result = this.(RegExpCharacterRange).getChild(i)
-    or
-    result = this.(RegExpNormalChar).getChild(i)
-    or
-    result = this.(RegExpGroup).getChild(i)
-    or
-    result = this.(RegExpQuantifier).getChild(i)
-    or
-    result = this.(RegExpSequence).getChild(i)
-    or
-    result = this.(RegExpSpecialChar).getChild(i)
-  }
-
   /**
    * Gets the parent term of this regular expression term, or the
    * regular expression literal if this is the root term.
@@ -508,7 +488,7 @@ class RegExpEscape extends RegExpNormalChar {
   /**
    * Holds if this is a unicode escape.
    */
-  private predicate isUnicode() { this.getText().matches("\\u%") }
+  private predicate isUnicode() { this.getText().matches(["\\u%", "\\x%"]) }
 
   /**
    * Gets the unicode char for this escape.
@@ -520,13 +500,24 @@ class RegExpEscape extends RegExpNormalChar {
     )
   }
 
+  /** Gets the part of this escape that is a hexidecimal string */
+  private string getHexString() {
+    this.isUnicode() and
+    if this.getText().matches("\\u%") // \uhhhh
+    then result = this.getText().suffix(2)
+    else
+      if this.getText().matches("\\x{%") // \x{h..h}
+      then result = this.getText().substring(3, this.getText().length() - 1)
+      else result = this.getText().suffix(2) // \xhh
+  }
+
   /**
    * Gets int value for the `index`th char in the hex number of the unicode escape.
    * E.g. for `\u0061` and `index = 2` this returns 96 (the number `6` interpreted as hex).
    */
   private int getHexValueFromUnicode(int index) {
     this.isUnicode() and
-    exists(string hex, string char | hex = this.getText().suffix(2) |
+    exists(string hex, string char | hex = this.getHexString() |
       char = hex.charAt(index) and
       result = 16.pow(hex.length() - index - 1) * toHex(char)
     )
@@ -574,6 +565,50 @@ class RegExpCharacterClassEscape extends RegExpEscape {
   override string getPrimaryQLClass() { result = "RegExpCharacterClassEscape" }
 }
 
+/**
+ * A named character class in a regular expression.
+ *
+ * Examples:
+ *
+ * ```
+ * \p{Digit}
+ * \p{IsLowerCase}
+ */
+class RegExpNamedProperty extends RegExpCharacterClassEscape {
+  boolean inverted;
+  string name;
+
+  RegExpNamedProperty() {
+    name = this.getValue().substring(2, this.getValue().length() - 1) and
+    (
+      inverted = false and
+      this.getValue().charAt(0) = "p"
+      or
+      inverted = true and
+      this.getValue().charAt(0) = "P"
+    )
+  }
+
+  /** Holds if this class is inverted. */
+  predicate isInverted() { inverted = true }
+
+  /** Gets the name of this class. */
+  string getClassName() { result = name }
+
+  /**
+   * Gets an equivalent single-chcracter escape sequence for this class (e.g. \d) if possible, excluding the escape character.
+   */
+  string getBackslashEquivalent() {
+    exists(string eq | if inverted = true then result = eq.toUpperCase() else result = eq |
+      name = ["Digit", "IsDigit"] and
+      eq = "d"
+      or
+      name = ["Space", "IsWhite_Space"] and
+      eq = "s"
+    )
+  }
+}
+
 /**
  * A character class in a regular expression.
  *
diff --git a/java/ql/lib/semmle/code/java/security/performance/RegExpTreeView.qll b/java/ql/lib/semmle/code/java/security/performance/RegExpTreeView.qll
index ff3443acbca..608c03d006a 100644
--- a/java/ql/lib/semmle/code/java/security/performance/RegExpTreeView.qll
+++ b/java/ql/lib/semmle/code/java/security/performance/RegExpTreeView.qll
@@ -12,6 +12,8 @@ import semmle.code.java.regex.RegexTreeView
  */
 predicate isEscapeClass(RegExpTerm term, string clazz) {
   term.(RegExpCharacterClassEscape).getValue() = clazz
+  or
+  term.(RegExpNamedProperty).getBackslashEquivalent() = clazz
 }
 
 /**
diff --git a/java/ql/test/query-tests/security/CWE-730/ExpRedosTest.java b/java/ql/test/query-tests/security/CWE-730/ExpRedosTest.java
index 846f6139d08..a9fc19e5d50 100644
--- a/java/ql/test/query-tests/security/CWE-730/ExpRedosTest.java
+++ b/java/ql/test/query-tests/security/CWE-730/ExpRedosTest.java
@@ -371,6 +371,42 @@ class ExpRedosTest {
         // GOOD
         "X(\\u0061|b)+Y",
 
+        // NOT GOOD
+        "X(\\x61|a)*Y", // $ hasExpRedos
+
+        // GOOD
+        "X(\\x61|b)+Y",
+
+        // NOT GOOD
+        "X(\\x{061}|a)*Y", // $ hasExpRedos
+
+        // GOOD
+        "X(\\x{061}|b)+Y",
+
+        // NOT GOOD
+        "X(\\p{Digit}|7)*Y", // $ hasExpRedos
+
+        // GOOD
+        "X(\\p{Digit}|b)+Y",
+
+         // NOT GOOD
+        "X(\\P{Digit}|b)*Y", // $ hasExpRedos
+
+        // GOOD
+        "X(\\P{Digit}|7)+Y",
+
+        // NOT GOOD
+        "X(\\p{IsDigit}|7)*Y", // $ hasExpRedos
+
+        // GOOD
+        "X(\\p{IsDigit}|b)+Y",
+
+         // NOT GOOD - but not detected
+        "X(\\p{Alpha}|a)*Y", // $ MISSING: hasExpRedos
+
+        // GOOD
+        "X(\\p{Alpha}|7)+Y",
+
         // GOOD
         "(\"[^\"]*?\"|[^\"\\s]+)+(?=\\s*|\\s*$)",