From 57ba8a4d1b9d4f8a1c112834191fba5dc593ea5e Mon Sep 17 00:00:00 2001 From: Joe Farebrother Date: Mon, 21 Feb 2022 16:39:59 +0000 Subject: [PATCH] Improve handling of hex escapes; and support some named character classes --- .../semmle/code/java/regex/RegexTreeView.qll | 81 +++++++++++++------ .../security/performance/RegExpTreeView.qll | 2 + .../security/CWE-730/ExpRedosTest.java | 36 +++++++++ 3 files changed, 96 insertions(+), 23 deletions(-) diff --git a/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll b/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll index f7e85fe3edf..04bda79e9ad 100644 --- a/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll +++ b/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll @@ -55,7 +55,7 @@ class RegExpParent extends TRegExpParent { string toString() { result = "RegExpParent" } /** Gets the `i`th child term. */ - abstract RegExpTerm getChild(int i); + RegExpTerm getChild(int i) { none() } /** Gets a child term . */ RegExpTerm getAChild() { result = this.getChild(_) } @@ -143,26 +143,6 @@ class RegExpTerm extends RegExpParent { */ predicate isRootTerm() { start = 0 and end = re.getText().length() } - override RegExpTerm getChild(int i) { - result = this.(RegExpAlt).getChild(i) - or - result = this.(RegExpBackRef).getChild(i) - or - result = this.(RegExpCharacterClass).getChild(i) - or - result = this.(RegExpCharacterRange).getChild(i) - or - result = this.(RegExpNormalChar).getChild(i) - or - result = this.(RegExpGroup).getChild(i) - or - result = this.(RegExpQuantifier).getChild(i) - or - result = this.(RegExpSequence).getChild(i) - or - result = this.(RegExpSpecialChar).getChild(i) - } - /** * Gets the parent term of this regular expression term, or the * regular expression literal if this is the root term. @@ -508,7 +488,7 @@ class RegExpEscape extends RegExpNormalChar { /** * Holds if this is a unicode escape. */ - private predicate isUnicode() { this.getText().matches("\\u%") } + private predicate isUnicode() { this.getText().matches(["\\u%", "\\x%"]) } /** * Gets the unicode char for this escape. @@ -520,13 +500,24 @@ class RegExpEscape extends RegExpNormalChar { ) } + /** Gets the part of this escape that is a hexidecimal string */ + private string getHexString() { + this.isUnicode() and + if this.getText().matches("\\u%") // \uhhhh + then result = this.getText().suffix(2) + else + if this.getText().matches("\\x{%") // \x{h..h} + then result = this.getText().substring(3, this.getText().length() - 1) + else result = this.getText().suffix(2) // \xhh + } + /** * Gets int value for the `index`th char in the hex number of the unicode escape. * E.g. for `\u0061` and `index = 2` this returns 96 (the number `6` interpreted as hex). */ private int getHexValueFromUnicode(int index) { this.isUnicode() and - exists(string hex, string char | hex = this.getText().suffix(2) | + exists(string hex, string char | hex = this.getHexString() | char = hex.charAt(index) and result = 16.pow(hex.length() - index - 1) * toHex(char) ) @@ -574,6 +565,50 @@ class RegExpCharacterClassEscape extends RegExpEscape { override string getPrimaryQLClass() { result = "RegExpCharacterClassEscape" } } +/** + * A named character class in a regular expression. + * + * Examples: + * + * ``` + * \p{Digit} + * \p{IsLowerCase} + */ +class RegExpNamedProperty extends RegExpCharacterClassEscape { + boolean inverted; + string name; + + RegExpNamedProperty() { + name = this.getValue().substring(2, this.getValue().length() - 1) and + ( + inverted = false and + this.getValue().charAt(0) = "p" + or + inverted = true and + this.getValue().charAt(0) = "P" + ) + } + + /** Holds if this class is inverted. */ + predicate isInverted() { inverted = true } + + /** Gets the name of this class. */ + string getClassName() { result = name } + + /** + * Gets an equivalent single-chcracter escape sequence for this class (e.g. \d) if possible, excluding the escape character. + */ + string getBackslashEquivalent() { + exists(string eq | if inverted = true then result = eq.toUpperCase() else result = eq | + name = ["Digit", "IsDigit"] and + eq = "d" + or + name = ["Space", "IsWhite_Space"] and + eq = "s" + ) + } +} + /** * A character class in a regular expression. * diff --git a/java/ql/lib/semmle/code/java/security/performance/RegExpTreeView.qll b/java/ql/lib/semmle/code/java/security/performance/RegExpTreeView.qll index ff3443acbca..608c03d006a 100644 --- a/java/ql/lib/semmle/code/java/security/performance/RegExpTreeView.qll +++ b/java/ql/lib/semmle/code/java/security/performance/RegExpTreeView.qll @@ -12,6 +12,8 @@ import semmle.code.java.regex.RegexTreeView */ predicate isEscapeClass(RegExpTerm term, string clazz) { term.(RegExpCharacterClassEscape).getValue() = clazz + or + term.(RegExpNamedProperty).getBackslashEquivalent() = clazz } /** diff --git a/java/ql/test/query-tests/security/CWE-730/ExpRedosTest.java b/java/ql/test/query-tests/security/CWE-730/ExpRedosTest.java index 846f6139d08..a9fc19e5d50 100644 --- a/java/ql/test/query-tests/security/CWE-730/ExpRedosTest.java +++ b/java/ql/test/query-tests/security/CWE-730/ExpRedosTest.java @@ -371,6 +371,42 @@ class ExpRedosTest { // GOOD "X(\\u0061|b)+Y", + // NOT GOOD + "X(\\x61|a)*Y", // $ hasExpRedos + + // GOOD + "X(\\x61|b)+Y", + + // NOT GOOD + "X(\\x{061}|a)*Y", // $ hasExpRedos + + // GOOD + "X(\\x{061}|b)+Y", + + // NOT GOOD + "X(\\p{Digit}|7)*Y", // $ hasExpRedos + + // GOOD + "X(\\p{Digit}|b)+Y", + + // NOT GOOD + "X(\\P{Digit}|b)*Y", // $ hasExpRedos + + // GOOD + "X(\\P{Digit}|7)+Y", + + // NOT GOOD + "X(\\p{IsDigit}|7)*Y", // $ hasExpRedos + + // GOOD + "X(\\p{IsDigit}|b)+Y", + + // NOT GOOD - but not detected + "X(\\p{Alpha}|a)*Y", // $ MISSING: hasExpRedos + + // GOOD + "X(\\p{Alpha}|7)+Y", + // GOOD "(\"[^\"]*?\"|[^\"\\s]+)+(?=\\s*|\\s*$)",