Merge remote-tracking branch 'origin/main' into nickrolfe/misspelling

This commit is contained in:
Nick Rolfe
2022-05-12 16:10:27 +01:00
124 changed files with 22038 additions and 5967 deletions

View File

@@ -7,6 +7,7 @@
*/
import java
import semmle.code.java.regex.RegexTreeView
private newtype TPrintAstConfiguration = MkPrintAstConfiguration()
@@ -132,6 +133,9 @@ private newtype TPrintAstNode =
} or
TImportsNode(CompilationUnit cu) {
shouldPrint(cu, _) and exists(Import i | i.getCompilationUnit() = cu)
} or
TRegExpTermNode(RegExpTerm term) {
exists(StringLiteral str | term.getRootTerm() = getParsedRegExp(str) and shouldPrint(str, _))
}
/**
@@ -164,6 +168,19 @@ class PrintAstNode extends TPrintAstNode {
*/
Location getLocation() { none() }
/**
* Holds if this node is at the specified location.
* The location spans column `startcolumn` of line `startline` to
* column `endcolumn` of line `endline` in file `filepath`.
* For more information, see
* [Locations](https://codeql.github.com/docs/writing-codeql-queries/providing-locations-in-codeql-queries/).
*/
predicate hasLocationInfo(
string filepath, int startline, int startcolumn, int endline, int endcolumn
) {
this.getLocation().hasLocationInfo(filepath, startline, startcolumn, endline, endcolumn)
}
/**
* Gets the value of the property of this node, where the name of the property
* is `key`.
@@ -290,6 +307,47 @@ final class AnnotationPartNode extends ExprStmtNode {
}
}
/**
* A node representing a `StringLiteral`.
* If it is used as a regular expression, then it has a single child, the root of the parsed regular expression.
*/
final class StringLiteralNode extends ExprStmtNode {
StringLiteralNode() { element instanceof StringLiteral }
override PrintAstNode getChild(int childIndex) {
childIndex = 0 and
result.(RegExpTermNode).getTerm() = getParsedRegExp(element)
}
}
/**
* A node representing a regular expression term.
*/
class RegExpTermNode extends TRegExpTermNode, PrintAstNode {
RegExpTerm term;
RegExpTermNode() { this = TRegExpTermNode(term) }
/** Gets the `RegExpTerm` for this node. */
RegExpTerm getTerm() { result = term }
override PrintAstNode getChild(int childIndex) {
result.(RegExpTermNode).getTerm() = term.getChild(childIndex)
}
override string toString() {
result = "[" + strictconcat(term.getPrimaryQLClass(), " | ") + "] " + term.toString()
}
override Location getLocation() { result = term.getLocation() }
override predicate hasLocationInfo(
string filepath, int startline, int startcolumn, int endline, int endcolumn
) {
term.hasLocationInfo(filepath, startline, startcolumn, endline, endcolumn)
}
}
/**
* A node representing a `LocalVariableDeclExpr`.
*/

View File

@@ -142,6 +142,7 @@ private module Frameworks {
private import semmle.code.java.frameworks.jOOQ
private import semmle.code.java.frameworks.JMS
private import semmle.code.java.frameworks.RabbitMQ
private import semmle.code.java.regex.RegexFlowModels
}
private predicate sourceModelCsv(string row) {

View File

@@ -170,6 +170,14 @@ abstract class Configuration extends string {
*/
int explorationLimit() { none() }
/**
* Holds if hidden nodes should be included in the data flow graph.
*
* This feature should only be used for debugging or when the data flow graph
* is not visualized (for example in a `path-problem` query).
*/
predicate includeHiddenNodes() { none() }
/**
* Holds if there is a partial data flow path from `source` to `node`. The
* approximate distance between `node` and the closest source is `dist` and
@@ -3868,11 +3876,14 @@ abstract private class PathNodeImpl extends PathNode {
abstract NodeEx getNodeEx();
predicate isHidden() {
hiddenNode(this.getNodeEx().asNode()) and
not this.isSource() and
not this instanceof PathNodeSink
or
this.getNodeEx() instanceof TNodeImplicitRead
not this.getConfiguration().includeHiddenNodes() and
(
hiddenNode(this.getNodeEx().asNode()) and
not this.isSource() and
not this instanceof PathNodeSink
or
this.getNodeEx() instanceof TNodeImplicitRead
)
}
private string ppAp() {

View File

@@ -170,6 +170,14 @@ abstract class Configuration extends string {
*/
int explorationLimit() { none() }
/**
* Holds if hidden nodes should be included in the data flow graph.
*
* This feature should only be used for debugging or when the data flow graph
* is not visualized (for example in a `path-problem` query).
*/
predicate includeHiddenNodes() { none() }
/**
* Holds if there is a partial data flow path from `source` to `node`. The
* approximate distance between `node` and the closest source is `dist` and
@@ -3868,11 +3876,14 @@ abstract private class PathNodeImpl extends PathNode {
abstract NodeEx getNodeEx();
predicate isHidden() {
hiddenNode(this.getNodeEx().asNode()) and
not this.isSource() and
not this instanceof PathNodeSink
or
this.getNodeEx() instanceof TNodeImplicitRead
not this.getConfiguration().includeHiddenNodes() and
(
hiddenNode(this.getNodeEx().asNode()) and
not this.isSource() and
not this instanceof PathNodeSink
or
this.getNodeEx() instanceof TNodeImplicitRead
)
}
private string ppAp() {

View File

@@ -170,6 +170,14 @@ abstract class Configuration extends string {
*/
int explorationLimit() { none() }
/**
* Holds if hidden nodes should be included in the data flow graph.
*
* This feature should only be used for debugging or when the data flow graph
* is not visualized (for example in a `path-problem` query).
*/
predicate includeHiddenNodes() { none() }
/**
* Holds if there is a partial data flow path from `source` to `node`. The
* approximate distance between `node` and the closest source is `dist` and
@@ -3868,11 +3876,14 @@ abstract private class PathNodeImpl extends PathNode {
abstract NodeEx getNodeEx();
predicate isHidden() {
hiddenNode(this.getNodeEx().asNode()) and
not this.isSource() and
not this instanceof PathNodeSink
or
this.getNodeEx() instanceof TNodeImplicitRead
not this.getConfiguration().includeHiddenNodes() and
(
hiddenNode(this.getNodeEx().asNode()) and
not this.isSource() and
not this instanceof PathNodeSink
or
this.getNodeEx() instanceof TNodeImplicitRead
)
}
private string ppAp() {

View File

@@ -170,6 +170,14 @@ abstract class Configuration extends string {
*/
int explorationLimit() { none() }
/**
* Holds if hidden nodes should be included in the data flow graph.
*
* This feature should only be used for debugging or when the data flow graph
* is not visualized (for example in a `path-problem` query).
*/
predicate includeHiddenNodes() { none() }
/**
* Holds if there is a partial data flow path from `source` to `node`. The
* approximate distance between `node` and the closest source is `dist` and
@@ -3868,11 +3876,14 @@ abstract private class PathNodeImpl extends PathNode {
abstract NodeEx getNodeEx();
predicate isHidden() {
hiddenNode(this.getNodeEx().asNode()) and
not this.isSource() and
not this instanceof PathNodeSink
or
this.getNodeEx() instanceof TNodeImplicitRead
not this.getConfiguration().includeHiddenNodes() and
(
hiddenNode(this.getNodeEx().asNode()) and
not this.isSource() and
not this instanceof PathNodeSink
or
this.getNodeEx() instanceof TNodeImplicitRead
)
}
private string ppAp() {

View File

@@ -170,6 +170,14 @@ abstract class Configuration extends string {
*/
int explorationLimit() { none() }
/**
* Holds if hidden nodes should be included in the data flow graph.
*
* This feature should only be used for debugging or when the data flow graph
* is not visualized (for example in a `path-problem` query).
*/
predicate includeHiddenNodes() { none() }
/**
* Holds if there is a partial data flow path from `source` to `node`. The
* approximate distance between `node` and the closest source is `dist` and
@@ -3868,11 +3876,14 @@ abstract private class PathNodeImpl extends PathNode {
abstract NodeEx getNodeEx();
predicate isHidden() {
hiddenNode(this.getNodeEx().asNode()) and
not this.isSource() and
not this instanceof PathNodeSink
or
this.getNodeEx() instanceof TNodeImplicitRead
not this.getConfiguration().includeHiddenNodes() and
(
hiddenNode(this.getNodeEx().asNode()) and
not this.isSource() and
not this instanceof PathNodeSink
or
this.getNodeEx() instanceof TNodeImplicitRead
)
}
private string ppAp() {

View File

@@ -170,6 +170,14 @@ abstract class Configuration extends string {
*/
int explorationLimit() { none() }
/**
* Holds if hidden nodes should be included in the data flow graph.
*
* This feature should only be used for debugging or when the data flow graph
* is not visualized (for example in a `path-problem` query).
*/
predicate includeHiddenNodes() { none() }
/**
* Holds if there is a partial data flow path from `source` to `node`. The
* approximate distance between `node` and the closest source is `dist` and
@@ -3868,11 +3876,14 @@ abstract private class PathNodeImpl extends PathNode {
abstract NodeEx getNodeEx();
predicate isHidden() {
hiddenNode(this.getNodeEx().asNode()) and
not this.isSource() and
not this instanceof PathNodeSink
or
this.getNodeEx() instanceof TNodeImplicitRead
not this.getConfiguration().includeHiddenNodes() and
(
hiddenNode(this.getNodeEx().asNode()) and
not this.isSource() and
not this instanceof PathNodeSink
or
this.getNodeEx() instanceof TNodeImplicitRead
)
}
private string ppAp() {

View File

@@ -170,6 +170,14 @@ abstract class Configuration extends string {
*/
int explorationLimit() { none() }
/**
* Holds if hidden nodes should be included in the data flow graph.
*
* This feature should only be used for debugging or when the data flow graph
* is not visualized (for example in a `path-problem` query).
*/
predicate includeHiddenNodes() { none() }
/**
* Holds if there is a partial data flow path from `source` to `node`. The
* approximate distance between `node` and the closest source is `dist` and
@@ -3868,11 +3876,14 @@ abstract private class PathNodeImpl extends PathNode {
abstract NodeEx getNodeEx();
predicate isHidden() {
hiddenNode(this.getNodeEx().asNode()) and
not this.isSource() and
not this instanceof PathNodeSink
or
this.getNodeEx() instanceof TNodeImplicitRead
not this.getConfiguration().includeHiddenNodes() and
(
hiddenNode(this.getNodeEx().asNode()) and
not this.isSource() and
not this instanceof PathNodeSink
or
this.getNodeEx() instanceof TNodeImplicitRead
)
}
private string ppAp() {

View File

@@ -170,6 +170,14 @@ abstract class Configuration extends string {
*/
int explorationLimit() { none() }
/**
* Holds if hidden nodes should be included in the data flow graph.
*
* This feature should only be used for debugging or when the data flow graph
* is not visualized (for example in a `path-problem` query).
*/
predicate includeHiddenNodes() { none() }
/**
* Holds if there is a partial data flow path from `source` to `node`. The
* approximate distance between `node` and the closest source is `dist` and
@@ -3868,11 +3876,14 @@ abstract private class PathNodeImpl extends PathNode {
abstract NodeEx getNodeEx();
predicate isHidden() {
hiddenNode(this.getNodeEx().asNode()) and
not this.isSource() and
not this instanceof PathNodeSink
or
this.getNodeEx() instanceof TNodeImplicitRead
not this.getConfiguration().includeHiddenNodes() and
(
hiddenNode(this.getNodeEx().asNode()) and
not this.isSource() and
not this instanceof PathNodeSink
or
this.getNodeEx() instanceof TNodeImplicitRead
)
}
private string ppAp() {

View File

@@ -0,0 +1,193 @@
/**
* Defines configurations and steps for handling regexes
*/
import java
import semmle.code.java.dataflow.ExternalFlow
private import semmle.code.java.dataflow.DataFlow
private import semmle.code.java.dataflow.DataFlow2
private import RegexFlowModels
private import semmle.code.java.security.SecurityTests
/**
 * A string literal whose value contains a `+`, a `*`, or a brace pair (`{...}`),
 * i.e. characters that can form a repetition quantifier in a regular expression.
 */
private class ExploitableStringLiteral extends StringLiteral {
  ExploitableStringLiteral() { this.getValue().matches(["%+%", "%*%", "%{%}%"]) }
}
/**
 * Holds if `kind` is an external sink kind that is relevant for regex flow.
 * `full` is true if sinks with this kind match against the full string of its input.
 * `strArg` is the index of the argument to methods with this sink kind that contains the string
 * to be matched against, where -1 is the qualifier; or -2 if no such argument exists.
 */
private predicate regexSinkKindInfo(string kind, boolean full, int strArg) {
  sinkModel(_, _, _, _, _, _, _, kind, _) and
  exists(string fullStr, string strArgStr |
    (
      // "f" inside the brackets marks a full-string match, e.g. `regex-use[f]`
      full = true and fullStr = "f"
      or
      full = false and fullStr = ""
    ) and
    (
      // a numeric suffix encodes the matched-string argument index, e.g. `regex-use[f1]`
      strArgStr.toInt() = strArg
      or
      // no index present, e.g. `regex-use[]` or `regex-use[f]`
      strArg = -2 and
      strArgStr = ""
    )
  |
    kind = "regex-use[" + fullStr + strArgStr + "]"
  )
}
/** A sink that is relevant for regex flow. */
private class RegexFlowSink extends DataFlow::Node {
  boolean full; // true if this sink matches the regex against a full string
  int strArg; // index of the matched-string argument (-1 = qualifier, -2 = none)

  RegexFlowSink() {
    exists(string kind |
      regexSinkKindInfo(kind, full, strArg) and
      sinkNode(this, kind)
    )
  }

  /** Holds if a regex that flows here is matched against a full string (rather than a substring). */
  predicate matchesFullString() { full = true }

  /** Gets the string expression that a regex that flows here is matched against, if any. */
  Expr getStringArgument() {
    exists(MethodAccess ma |
      // locate the call this sink is an argument (or qualifier) of
      this.asExpr() = argOf(ma, _) and
      result = argOf(ma, strArg)
    )
  }
}
/** Gets argument `arg` of `ma`, where `-1` denotes the qualifier of the call. */
private Expr argOf(MethodAccess ma, int arg) {
  arg = -1 and result = ma.getQualifier()
  or
  result = ma.getArgument(arg)
}
/**
 * A unit class for adding additional regex flow steps.
 *
 * Extend this class and override `step` to add additional flow steps that
 * should apply to regex flow configurations.
 */
class RegexAdditionalFlowStep extends Unit {
  /**
   * Holds if the step from `node1` to `node2` should be considered a flow
   * step for regex flow configurations.
   */
  abstract predicate step(DataFlow::Node node1, DataFlow::Node node2);
}
// TODO: This may be able to be done with models-as-data if query-specific flow steps become supported.
/**
 * Flow steps through JDK regex APIs: from the regex argument (or qualifier) of a call
 * to `java.util.regex.Pattern` or `java.util.function.Predicate` methods to the call result.
 */
private class JdkRegexFlowStep extends RegexAdditionalFlowStep {
  override predicate step(DataFlow::Node node1, DataFlow::Node node2) {
    exists(MethodAccess ma, Method m, string package, string type, string name, int arg |
      // include overriding definitions so calls resolved to subtypes are matched
      ma.getMethod().getSourceDeclaration().overrides*(m) and
      m.hasQualifiedName(package, type, name) and
      node1.asExpr() = argOf(ma, arg) and
      node2.asExpr() = ma
    |
      package = "java.util.regex" and
      type = "Pattern" and
      (
        name = ["asMatchPredicate", "asPredicate", "matcher"] and
        arg = -1
        or
        name = "compile" and
        arg = 0
      )
      or
      package = "java.util.function" and
      type = "Predicate" and
      name = ["and", "or", "not", "negate"] and
      arg = [-1, 0]
    )
  }
}
/**
 * Flow steps through Guava's `com.google.common.base.Splitter` API: from the regex
 * argument (or qualifier) of a `Splitter` factory/configuration call to the call result.
 */
private class GuavaRegexFlowStep extends RegexAdditionalFlowStep {
  override predicate step(DataFlow::Node node1, DataFlow::Node node2) {
    exists(MethodAccess ma, Method m, string package, string type, string name, int arg |
      // include overriding definitions so calls resolved to subtypes are matched
      ma.getMethod().getSourceDeclaration().overrides*(m) and
      m.hasQualifiedName(package, type, name) and
      node1.asExpr() = argOf(ma, arg) and
      node2.asExpr() = ma
    |
      package = "com.google.common.base" and
      type = "Splitter" and
      (
        // only the `Pattern` overload of `on` takes a regex
        name = "on" and
        m.getParameterType(0).(RefType).hasQualifiedName("java.util.regex", "Pattern") and
        arg = 0
        or
        // `withKeyValueSeparator(Splitter)` propagates the regex held by its argument
        name = "withKeyValueSeparator" and
        m.getParameterType(0).(RefType).hasQualifiedName("com.google.common.base", "Splitter") and
        arg = 0
        or
        name = "onPattern" and
        arg = 0
        or
        // fluent configuration calls propagate the regex from the qualifier
        name = ["limit", "omitEmptyStrings", "trimResults", "withKeyValueSeparator"] and
        arg = -1
      )
    )
  }
}
/**
 * A data flow configuration tracking `ExploitableStringLiteral`s to regex sinks,
 * using the additional steps contributed by `RegexAdditionalFlowStep` subclasses.
 */
private class RegexFlowConf extends DataFlow2::Configuration {
  RegexFlowConf() { this = "RegexFlowConfig" }

  override predicate isSource(DataFlow::Node node) {
    node.asExpr() instanceof ExploitableStringLiteral
  }

  override predicate isSink(DataFlow::Node node) { node instanceof RegexFlowSink }

  override predicate isAdditionalFlowStep(DataFlow::Node node1, DataFlow::Node node2) {
    any(RegexAdditionalFlowStep s).step(node1, node2)
  }

  override predicate isBarrier(DataFlow::Node node) {
    // ignore flow inside test code that is not itself security-relevant
    node.getEnclosingCallable().getDeclaringType() instanceof NonSecurityTestClass
  }
}
/**
 * Holds if `regex` is used as a regex, with the mode `mode` (if known).
 * If the regex mode is not known, `mode` will be `"None"`.
 *
 * As an optimisation, only regexes containing an infinite repetition quantifier
 * (`+`, `*`, or `{x,}`), which therefore may be relevant for ReDoS queries, are considered.
 */
predicate usedAsRegex(StringLiteral regex, string mode, boolean match_full_string) {
  any(RegexFlowConf c).hasFlow(DataFlow2::exprNode(regex), _) and
  mode = "None" and // TODO: proper mode detection
  (if matchesFullString(regex) then match_full_string = true else match_full_string = false)
}
/**
 * Holds if `regex` is used as a regular expression that is matched against a full string,
 * as though it was implicitly surrounded by `^` and `$`.
 */
private predicate matchesFullString(StringLiteral regex) {
  exists(RegexFlowConf c, RegexFlowSink sink |
    sink.matchesFullString() and
    c.hasFlow(DataFlow2::exprNode(regex), sink)
  )
}
/**
 * Holds if the string literal `regex` is a regular expression that is matched against the expression `str`.
 *
 * As an optimisation, only regexes containing an infinite repetition quantifier
 * (`+`, `*`, or `{x,}`), which therefore may be relevant for ReDoS queries, are considered.
 */
predicate regexMatchedAgainst(StringLiteral regex, Expr str) {
  exists(RegexFlowConf c, RegexFlowSink sink |
    str = sink.getStringArgument() and
    c.hasFlow(DataFlow2::exprNode(regex), sink)
  )
}

View File

@@ -0,0 +1,32 @@
/** Definitions of data flow steps for determining flow of regular expressions. */
import java
import semmle.code.java.dataflow.ExternalFlow
/**
 * External sink models (CSV rows) for methods that use an argument or qualifier
 * as a regular expression.
 *
 * Row format: `namespace;type;subtypes;name;signature;ext;input;kind`, where `kind`
 * is `regex-use[...]`; an `f` in the brackets marks a full-string match, and a
 * trailing number (or `-1`) is the index of the matched-string argument.
 */
private class RegexSinkCsv extends SinkModelCsv {
  override predicate row(string row) {
    row =
      [
        //"namespace;type;subtypes;name;signature;ext;input;kind"
        "java.util.regex;Matcher;false;matches;();;Argument[-1];regex-use[f]",
        "java.util.regex;Pattern;false;asMatchPredicate;();;Argument[-1];regex-use[f]",
        "java.util.regex;Pattern;false;compile;(String);;Argument[0];regex-use[]",
        "java.util.regex;Pattern;false;compile;(String,int);;Argument[0];regex-use[]",
        "java.util.regex;Pattern;false;matcher;(CharSequence);;Argument[-1];regex-use[0]",
        "java.util.regex;Pattern;false;matches;(String,CharSequence);;Argument[0];regex-use[f1]",
        "java.util.regex;Pattern;false;split;(CharSequence);;Argument[-1];regex-use[0]",
        "java.util.regex;Pattern;false;split;(CharSequence,int);;Argument[-1];regex-use[0]",
        "java.util.regex;Pattern;false;splitAsStream;(CharSequence);;Argument[-1];regex-use[0]",
        "java.util.function;Predicate;false;test;(Object);;Argument[-1];regex-use[0]",
        "java.lang;String;false;matches;(String);;Argument[0];regex-use[f-1]",
        "java.lang;String;false;split;(String);;Argument[0];regex-use[-1]",
        "java.lang;String;false;split;(String,int);;Argument[0];regex-use[-1]",
        "java.lang;String;false;replaceAll;(String,String);;Argument[0];regex-use[-1]",
        "java.lang;String;false;replaceFirst;(String,String);;Argument[0];regex-use[-1]",
        "com.google.common.base;Splitter;false;onPattern;(String);;Argument[0];regex-use[]",
        "com.google.common.base;Splitter;false;split;(CharSequence);;Argument[-1];regex-use[0]",
        "com.google.common.base;Splitter;false;splitToList;(CharSequence);;Argument[-1];regex-use[0]",
        // removed trailing comma after the final element: QL list literals do not
        // permit trailing commas, so the original failed to parse
        "com.google.common.base;Splitter$MapSplitter;false;split;(CharSequence);;Argument[-1];regex-use[0]"
      ]
  }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,917 @@
/**
* Definitions for parsing regular expressions.
*/
import java
private import RegexFlowConfigs
// In all ranges handled by this library, `start` is inclusive and `end` is exclusive.
/**
* A string literal that is used as a regular expression.
*/
abstract class RegexString extends StringLiteral {
/** Gets the text of this regex */
string getText() { result = this.(StringLiteral).getValue() }
/** Gets the `i`th character of this regex. */
string getChar(int i) { result = this.getText().charAt(i) }
/** Holds if the regex failed to parse. */
predicate failedToParse(int i) {
exists(this.getChar(i)) and
not exists(int start, int end |
this.topLevel(start, end) and
start <= i and
end > i
)
}
/**
* Helper predicate for `escapingChar`.
* In order to avoid negative recursion, we return a boolean.
* This way, we can refer to `escaping(pos - 1).booleanNot()`
* rather than to a negated version of `escaping(pos)`.
*/
private boolean escaping(int pos) {
pos = -1 and result = false
or
this.getChar(pos) = "\\" and
(
if this.getChar(pos - 1) = "c" // in `\c\`, the latter `\` isn't escaping
then result = this.escaping(pos - 2).booleanNot()
else result = this.escaping(pos - 1).booleanNot()
)
or
this.getChar(pos) != "\\" and result = false
}
/**
* Helper predicate for `quote`.
* Holds if the char at `pos` could be the beginning of a quote delimiter, i.e. `\Q` (non-escaped) or `\E` (escaping not checked, as quote sequences turn off escapes).
* Result is `true` for `\Q` and `false` for `\E`.
*/
private boolean quoteDelimiter(int pos) {
result = true and
this.escaping(pos) = true and
this.getChar(pos + 1) = "Q"
or
result = false and
this.getChar(pos) = "\\" and
this.getChar(pos + 1) = "E"
}
/**
 * Helper predicate for `quote`.
 * Holds if the char at `pos` is the one-based `index`th occurrence of a quote delimiter (`\Q` or `\E`).
 * Result is `true` for `\Q` and `false` for `\E`.
 */
private boolean quoteDelimiter(int index, int pos) {
  result = this.quoteDelimiter(pos) and
  pos = rank[index](int p | exists(this.quoteDelimiter(p)))
}
/** Holds if a quoted sequence is found between `start` and `end` */
predicate quote(int start, int end) { this.quote(start, end, _, _) }
/** Holds if a quoted sequence is found between `start` and `end`, with content found between `inner_start` and `inner_end`. */
predicate quote(int start, int end, int inner_start, int inner_end) {
exists(int index |
this.quoteDelimiter(index, start) = true and
(
index = 1
or
this.quoteDelimiter(index - 1, _) = false
) and
inner_start = start + 2 and
inner_end = end - 2 and
inner_end =
min(int end_delimiter |
this.quoteDelimiter(end_delimiter) = false and end_delimiter > inner_start
)
)
}
/** Holds if the character at `pos` is a "\" that is actually escaping what comes after. */
predicate escapingChar(int pos) {
this.escaping(pos) = true and
not exists(int x, int y | this.quote(x, y) and pos in [x .. y - 1])
}
/**
* Holds if there is a control sequence, `\cx`, between `start` and `end`.
* `x` may be any ascii character including special characters.
*/
predicate controlEscape(int start, int end) {
this.escapingChar(start) and
this.getChar(start + 1) = "c" and
end = start + 3
}
private predicate namedBackreference(int start, int end, string name) {
this.escapingChar(start) and
this.getChar(start + 1) = "k" and
this.getChar(start + 2) = "<" and
end = min(int i | i > start + 2 and this.getChar(i) = ">") + 1 and
name = this.getText().substring(start + 3, end - 1)
}
private predicate numberedBackreference(int start, int end, int value) {
this.escapingChar(start) and
// starting with 0 makes it an octal escape
not this.getChar(start + 1) = "0" and
exists(string text, string svalue, int len |
end = start + len and
text = this.getText() and
len in [2 .. 3]
|
svalue = text.substring(start + 1, start + len) and
value = svalue.toInt() and
// value is composed of digits
forall(int i | i in [start + 1 .. start + len - 1] | this.getChar(i) = [0 .. 9].toString()) and
// a longer reference is not possible
not (
len = 2 and
exists(text.substring(start + 1, start + len + 1).toInt())
) and
// 3 octal digits makes it an octal escape
not forall(int i | i in [start + 1 .. start + 4] | this.isOctal(i))
// TODO: Inside a character set, all numeric escapes are treated as characters.
)
}
/** Holds if the text in the range start,end is a back reference */
predicate backreference(int start, int end) {
this.numberedBackreference(start, end, _)
or
this.namedBackreference(start, end, _)
}
/** Gets the number of the back reference in start,end */
int getBackrefNumber(int start, int end) { this.numberedBackreference(start, end, result) }
/** Gets the name, if it has one, of the back reference in start,end */
string getBackrefName(int start, int end) { this.namedBackreference(start, end, result) }
pragma[inline]
private predicate isOctal(int index) { this.getChar(index) = [0 .. 7].toString() }
/** An escape sequence that includes braces, such as named characters (\N{degree sign}), named classes (\p{Lower}), or hex values (\x{h..h}) */
private predicate escapedBraces(int start, int end) {
this.escapingChar(start) and
this.getChar(start + 1) = ["N", "p", "P", "x"] and
this.getChar(start + 2) = "{" and
end = min(int i | start + 2 < i and this.getChar(i - 1) = "}")
}
/**
* Holds if an escaped character is found between `start` and `end`.
* Escaped characters include hex values, octal values and named escapes,
* but excludes backreferences.
*/
predicate escapedCharacter(int start, int end) {
this.escapingChar(start) and
not this.backreference(start, _) and
(
// hex value \xhh
this.getChar(start + 1) = "x" and
this.getChar(start + 2) != "{" and
end = start + 4
or
// octal value \0o, \0oo, or \0ooo. Max of 0377.
this.getChar(start + 1) = "0" and
this.isOctal(start + 2) and
(
if this.isOctal(start + 3)
then
if this.isOctal(start + 4) and this.getChar(start + 2) in ["0", "1", "2", "3"]
then end = start + 5
else end = start + 4
else end = start + 3
)
or
// 16-bit hex value \uhhhh
this.getChar(start + 1) = "u" and end = start + 6
or
this.escapedBraces(start, end)
or
// Boundary matchers \b, \b{g}
this.getChar(start + 1) = "b" and
(
if this.getText().substring(start + 2, start + 5) = "{g}"
then end = start + 5
else end = start + 2
)
or
this.controlEscape(start, end)
or
// escape not handled above, update when adding a new case
not this.getChar(start + 1) in ["x", "0", "u", "p", "P", "N", "b", "c"] and
not exists(this.getChar(start + 1).toInt()) and
end = start + 2
)
}
private string nonEscapedCharAt(int i) {
result = this.getChar(i) and
not exists(int x, int y | this.escapedCharacter(x, y) and i in [x .. y - 1]) and
not exists(int x, int y | this.quote(x, y) and i in [x .. y - 1])
}
/** Holds if a character set starts between `start` and `end`, including any negation character (`^`). */
private predicate charSetStart0(int start, int end) {
this.nonEscapedCharAt(start) = "[" and
(if this.getChar(start + 1) = "^" then end = start + 2 else end = start + 1)
}
/** Holds if the character at `pos` marks the end of a character class. */
private predicate charSetEnd0(int pos) {
this.nonEscapedCharAt(pos) = "]" and
/* special case: `[]]` and `[^]]` are valid char classes. */
not this.charSetStart0(_, pos)
}
/**
* Holds if the character at `pos` starts a character set delimiter.
* Result is 1 for `[` and -1 for `]`.
*/
private int charSetDelimiter(int pos) {
result = 1 and this.charSetStart0(pos, _)
or
result = -1 and this.charSetEnd0(pos)
}
/**
 * Holds if the char at `pos` is the one-based `index`th occurrence of a character set delimiter (`[` or `]`).
 * Result is 1 for `[` and -1 for `]`.
 */
private int charSetDelimiter(int index, int pos) {
  result = this.charSetDelimiter(pos) and
  pos = rank[index](int p | exists(this.charSetDelimiter(p)))
}
/**
* Gets the nesting depth of character classes after position `pos`,
* where `pos` is the position of a character set delimiter.
*/
private int charSetDepth(int index, int pos) {
index = 1 and result = 0.maximum(this.charSetDelimiter(index, pos))
or
result = 0.maximum(this.charSetDelimiter(index, pos) + this.charSetDepth(index - 1, _))
}
/** Hold if a top-level character set starts between `start` and `end`. */
predicate charSetStart(int start, int end) {
this.charSetStart0(start, end) and
this.charSetDepth(_, start) = 1
}
/** Holds if a top-level character set ends at `pos`. */
predicate charSetEnd(int pos) {
this.charSetEnd0(pos) and
this.charSetDepth(_, pos) = 0
}
/**
* Holds if there is a top-level character class beginning at `start` (inclusive) and ending at `end` (exclusive)
*
* For now, nested character classes are approximated by only considering the top-level class for parsing.
* This leads to very similar results for ReDoS queries.
*/
predicate charSet(int start, int end) {
exists(int inner_start, int inner_end | this.charSetStart(start, inner_start) |
end = inner_end + 1 and
inner_end =
min(int end_delimiter | this.charSetEnd(end_delimiter) and end_delimiter > inner_start)
)
}
/** Either a char or a - */
private predicate charSetToken(int charset_start, int start, int end) {
this.charSetStart(charset_start, start) and
(
this.escapedCharacter(start, end)
or
exists(this.nonEscapedCharAt(start)) and end = start + 1
or
this.quote(start, end)
)
or
this.charSetToken(charset_start, _, start) and
(
this.escapedCharacter(start, end)
or
exists(this.nonEscapedCharAt(start)) and
end = start + 1 and
not this.charSetEnd(start)
or
this.quote(start, end)
)
}
/** An indexed version of `charSetToken/3` */
private predicate charSetToken(int charset_start, int index, int token_start, int token_end) {
token_start = rank[index](int start | this.charSetToken(charset_start, start, _) | start) and
this.charSetToken(charset_start, token_start, token_end)
}
/**
* Helper predicate for `charRange`.
* We can determine where character ranges end by a left to right sweep.
*
* To avoid negative recursion we return a boolean. See `escaping`,
* the helper for `escapingChar`, for a clean use of this pattern.
*/
private boolean charRangeEnd(int charset_start, int index) {
this.charSetToken(charset_start, index, _, _) and
(
index in [1, 2] and result = false
or
index > 2 and
exists(int connector_start |
this.charSetToken(charset_start, index - 1, connector_start, _) and
this.nonEscapedCharAt(connector_start) = "-" and
result =
this.charRangeEnd(charset_start, index - 2)
.booleanNot()
.booleanAnd(this.charRangeEnd(charset_start, index - 1).booleanNot())
)
or
not exists(int connector_start |
this.charSetToken(charset_start, index - 1, connector_start, _) and
this.nonEscapedCharAt(connector_start) = "-"
) and
result = false
)
}
/**
* Holds if the character set starting at `charset_start` contains a character range
* with lower bound found between `start` and `lower_end`
* and upper bound found between `upper_start` and `end`.
*/
predicate charRange(int charset_start, int start, int lower_end, int upper_start, int end) {
exists(int index |
this.charRangeEnd(charset_start, index) = true and
this.charSetToken(charset_start, index - 2, start, lower_end) and
this.charSetToken(charset_start, index, upper_start, end)
)
}
/**
* Holds if the character set starting at `charset_start` contains either
* a character or a range found between `start` and `end`.
*/
predicate charSetChild(int charset_start, int start, int end) {
this.charSetToken(charset_start, start, end) and
not exists(int range_start, int range_end |
this.charRange(charset_start, range_start, _, _, range_end) and
range_start <= start and
range_end >= end
)
or
this.charRange(charset_start, start, _, _, end)
}
/** Holds if `index` is inside a character set. */
predicate inCharSet(int index) {
exists(int x, int y | this.charSet(x, y) and index in [x + 1 .. y - 2])
}
/**
* 'simple' characters are any that don't alter the parsing of the regex.
*/
private predicate simpleCharacter(int start, int end) {
end = start + 1 and
not this.charSet(start, _) and
not this.charSet(_, start + 1) and
exists(string c | c = this.getChar(start) |
exists(int x, int y, int z |
this.charSet(x, z) and
this.charSetStart(x, y)
|
start = y
or
start = z - 2
or
start > y and start < z - 2 and not this.charRange(_, _, start, end, _)
)
or
not this.inCharSet(start) and
not c = "(" and
not c = "[" and
not c = ")" and
not c = "|" and
not c = "{" and
not exists(int qstart, int qend | this.quantifier(qstart, qend, _, _) |
qstart <= start and start < qend
)
)
}
private predicate character(int start, int end) {
(
this.simpleCharacter(start, end) and
not exists(int x, int y | this.escapedCharacter(x, y) and x <= start and y >= end) and
not exists(int x, int y | this.quote(x, y) and x <= start and y >= end)
or
this.escapedCharacter(start, end)
) and
not exists(int x, int y | this.groupStart(x, y) and x <= start and y >= end) and
not exists(int x, int y | this.backreference(x, y) and x <= start and y >= end)
}
/** Holds if a normal character or escape sequence is between `start` and `end`. */
predicate normalCharacter(int start, int end) {
this.character(start, end) and
not this.specialCharacter(start, end, _)
}
/**
 * Holds if a special character `char` (one of `$`, `^`, `.`) is between
 * `start` and `end`. Characters inside a character set are never special.
 */
predicate specialCharacter(int start, int end, string char) {
  this.character(start, end) and
  end = start + 1 and
  char = this.getChar(start) and
  (char = "$" or char = "^" or char = ".") and
  not this.inCharSet(start)
}
/** Holds if position `i` holds an unescaped `)` outside any character set. */
private predicate isGroupEnd(int i) { this.nonEscapedCharAt(i) = ")" and not this.inCharSet(i) }
/** Holds if position `i` holds an unescaped `(` outside any character set. */
private predicate isGroupStart(int i) { this.nonEscapedCharAt(i) = "(" and not this.inCharSet(i) }
/** Holds if the range `start`..`end` is a group with nothing between its delimiters, such as `()`. */
predicate emptyGroup(int start, int end) {
  exists(int closePos |
    this.groupStart(start, closePos) and
    this.isGroupEnd(closePos) and
    end = closePos + 1
  )
}
/** Holds if the opening delimiter `(?:` of a non-capturing group occupies `start`..`end`. */
private predicate nonCapturingGroupStart(int start, int end) {
  this.isGroupStart(start) and
  this.getChar(start + 1) = "?" and
  this.getChar(start + 2) = ":" and
  end = start + 3
}
/** Holds if the opening `(` of a plain capturing group occupies `start`..`end`. */
private predicate simpleGroupStart(int start, int end) {
  this.isGroupStart(start) and
  this.getChar(start + 1) != "?" and
  end = start + 1
}
/**
 * Holds if the opening delimiter `(?<name>` of a named group occupies
 * `start`..`end` (excluding lookbehinds, which also begin with `(?<`).
 */
private predicate namedGroupStart(int start, int end) {
  this.isGroupStart(start) and
  this.getChar(start + 1) = "?" and
  this.getChar(start + 2) = "<" and
  // `(?<=` and `(?<!` are lookbehinds, not named groups
  not this.getChar(start + 3) = "=" and
  not this.getChar(start + 3) = "!" and
  exists(int name_end |
    name_end = min(int i | i > start + 3 and this.getChar(i) = ">") and
    end = name_end + 1
  )
}
/**
 * Holds if the opening delimiter of an inline flag group such as `(?i` occupies
 * `start`..`end`, where `c` is the flag character.
 * NOTE(review): only a single flag character is recognized; multi-flag groups
 * such as `(?im` are not fully handled here — confirm whether that is intended.
 */
private predicate flagGroupStart(int start, int end, string c) {
  this.isGroupStart(start) and
  this.getChar(start + 1) = "?" and
  end = start + 3 and
  c = this.getChar(start + 2) and
  c in ["i", "m", "s", "u", "x", "U"]
}
/**
 * Gets the mode of this regular expression string if
 * it is defined by an inline flag group such as `(?i)`.
 * NOTE(review): the flag group is not required to be at position 0 here,
 * so flags occurring anywhere in the string are picked up — confirm.
 */
string getModeFromPrefix() {
  exists(string c | this.flagGroupStart(_, _, c) |
    c = "i" and result = "IGNORECASE"
    or
    c = "m" and result = "MULTILINE"
    or
    c = "s" and result = "DOTALL"
    or
    c = "u" and result = "UNICODE"
    or
    c = "x" and result = "VERBOSE"
    or
    c = "U" and result = "UNICODECLASS"
  )
}
/** Holds if the opening delimiter `(?=` of a lookahead assertion occupies `start`..`end`. */
private predicate lookaheadAssertionStart(int start, int end) {
  this.isGroupStart(start) and
  this.getChar(start + 1) = "?" and
  this.getChar(start + 2) = "=" and
  end = start + 3
}
/** Holds if the opening delimiter `(?!` of a negative lookahead occupies `start`..`end`. */
private predicate negativeLookaheadAssertionStart(int start, int end) {
  this.isGroupStart(start) and
  this.getChar(start + 1) = "?" and
  this.getChar(start + 2) = "!" and
  end = start + 3
}
/** Holds if the opening delimiter `(?<=` of a lookbehind assertion occupies `start`..`end`. */
private predicate lookbehindAssertionStart(int start, int end) {
  this.isGroupStart(start) and
  this.getChar(start + 1) = "?" and
  this.getChar(start + 2) = "<" and
  this.getChar(start + 3) = "=" and
  end = start + 4
}
/** Holds if the opening delimiter `(?<!` of a negative lookbehind occupies `start`..`end`. */
private predicate negativeLookbehindAssertionStart(int start, int end) {
  this.isGroupStart(start) and
  this.getChar(start + 1) = "?" and
  this.getChar(start + 2) = "<" and
  this.getChar(start + 3) = "!" and
  end = start + 4
}
/** Holds if the opening delimiter `(?>` of an atomic (independent) group occupies `start`..`end`. */
private predicate atomicGroupStart(int start, int end) {
  this.isGroupStart(start) and
  this.getChar(start + 1) = "?" and
  this.getChar(start + 2) = ">" and
  end = start + 3
}
/** Holds if the opening delimiter of any kind of group occupies `start`..`end`. */
private predicate groupStart(int start, int end) {
  this.nonCapturingGroupStart(start, end)
  or
  this.flagGroupStart(start, end, _)
  or
  this.namedGroupStart(start, end)
  or
  this.lookaheadAssertionStart(start, end)
  or
  this.negativeLookaheadAssertionStart(start, end)
  or
  this.lookbehindAssertionStart(start, end)
  or
  this.negativeLookbehindAssertionStart(start, end)
  or
  this.atomicGroupStart(start, end)
  or
  this.simpleGroupStart(start, end)
}
/**
 * Holds if the text in the range `start`,`end` is a group whose contents
 * (everything between the opening delimiter and the closing `)`) occupy
 * the range `in_start`,`in_end`.
 */
predicate groupContents(int start, int end, int in_start, int in_end) {
  this.groupStart(start, in_start) and
  end = in_end + 1 and
  this.topLevel(in_start, in_end) and
  this.isGroupEnd(in_end)
}
/** Holds if the text in the range `start`,`end` is a group (possibly empty). */
predicate group(int start, int end) {
  this.groupContents(start, end, _, _)
  or
  this.emptyGroup(start, end)
}
/**
 * Gets the 1-based number of the group in `start`,`end`: one plus the number
 * of groups opening before it that are not `(?:` non-capturing groups.
 * NOTE(review): other non-capturing forms (flag groups, lookarounds, atomic
 * groups) are not excluded from the count — confirm whether that is intended.
 */
int getGroupNumber(int start, int end) {
  this.group(start, end) and
  result =
    count(int i | this.group(i, _) and i < start and not this.nonCapturingGroupStart(i, _)) + 1
}
/** Gets the name, if it has one, of the group in `start`,`end` (the text between `(?<` and `>`). */
string getGroupName(int start, int end) {
  this.group(start, end) and
  exists(int name_end |
    this.namedGroupStart(start, name_end) and
    result = this.getText().substring(start + 3, name_end - 1)
  )
}
/** Holds if a complete negative lookahead assertion `(?!...)` is found between `start` and `end`. */
predicate negativeLookaheadAssertionGroup(int start, int end) {
  exists(int in_start | this.negativeLookaheadAssertionStart(start, in_start) |
    this.groupContents(start, end, in_start, _)
  )
}
/** Holds if a complete negative lookbehind assertion `(?<!...)` is found between `start` and `end`. */
predicate negativeLookbehindAssertionGroup(int start, int end) {
  exists(int in_start | this.negativeLookbehindAssertionStart(start, in_start) |
    this.groupContents(start, end, in_start, _)
  )
}
/** Holds if a negative lookahead or negative lookbehind assertion is found between `start` and `end`. */
private predicate negativeAssertionGroup(int start, int end) {
  this.negativeLookaheadAssertionGroup(start, end)
  or
  this.negativeLookbehindAssertionGroup(start, end)
}
/** Holds if a complete positive lookahead assertion `(?=...)` is found between `start` and `end`. */
predicate positiveLookaheadAssertionGroup(int start, int end) {
  exists(int in_start | this.lookaheadAssertionStart(start, in_start) |
    this.groupContents(start, end, in_start, _)
  )
}
/** Holds if a complete positive lookbehind assertion `(?<=...)` is found between `start` and `end`. */
predicate positiveLookbehindAssertionGroup(int start, int end) {
  exists(int in_start | this.lookbehindAssertionStart(start, in_start) |
    this.groupContents(start, end, in_start, _)
  )
}
/**
 * Holds if the text in the range `start`,`end` is a group that can match
 * the empty string: an empty group or any (positive or negative) assertion.
 */
predicate zeroWidthMatch(int start, int end) {
  this.emptyGroup(start, end)
  or
  this.negativeAssertionGroup(start, end)
  or
  this.positiveLookaheadAssertionGroup(start, end)
  or
  this.positiveLookbehindAssertionGroup(start, end)
}
/**
 * Holds if `start`..`end` is a base item that a quantifier may apply to:
 * a character (outside any character set), a group, a character set,
 * a backreference, or a quoted section.
 */
private predicate baseItem(int start, int end) {
  this.character(start, end) and
  not exists(int x, int y | this.charSet(x, y) and x <= start and y >= end)
  or
  this.group(start, end)
  or
  this.charSet(start, end)
  or
  this.backreference(start, end)
  or
  this.quote(start, end)
}
/**
 * Holds if a quantifier without any trailing `?`/`+` modifier occupies
 * `start`..`end`. `maybe_empty` is true if the quantifier allows zero
 * repetitions; `may_repeat_forever` is true if it has no upper bound.
 */
private predicate shortQuantifier(
  int start, int end, boolean maybe_empty, boolean may_repeat_forever
) {
  (
    this.getChar(start) = "+" and maybe_empty = false and may_repeat_forever = true
    or
    this.getChar(start) = "*" and maybe_empty = true and may_repeat_forever = true
    or
    this.getChar(start) = "?" and maybe_empty = true and may_repeat_forever = false
  ) and
  end = start + 1
  or
  // counted repetition such as `{2}` or `{1,3}`
  exists(string lower, string upper |
    this.multiples(start, end, lower, upper) and
    (if lower = "" or lower.toInt() = 0 then maybe_empty = true else maybe_empty = false) and
    if upper = "" then may_repeat_forever = true else may_repeat_forever = false
  )
}
/**
 * Holds if a repetition quantifier such as `{2}` or `{1,3}` is found between
 * `start` and `end`, with the given lower and upper bounds. If a bound is
 * omitted, the corresponding string is empty.
 */
predicate multiples(int start, int end, string lower, string upper) {
  exists(string text, string match, string inner |
    text = this.getText() and
    end = start + match.length() and
    // strip the surrounding braces
    inner = match.substring(1, match.length() - 1)
  |
    // `{n}`: exact count, lower and upper coincide
    match = text.regexpFind("\\{[0-9]+\\}", _, start) and
    lower = inner and
    upper = lower
    or
    // `{n,m}`, `{n,}` or `{,m}`: split on the comma
    match = text.regexpFind("\\{[0-9]*,[0-9]*\\}", _, start) and
    exists(int commaIndex |
      commaIndex = inner.indexOf(",") and
      lower = inner.prefix(commaIndex) and
      upper = inner.suffix(commaIndex + 1)
    )
  )
}
/**
 * Holds if a quantifier, including any trailing `?` or `+` modifier
 * (reluctant/possessive in Java regexes), occupies `start`..`end`.
 */
private predicate quantifier(int start, int end, boolean maybe_empty, boolean may_repeat_forever) {
  this.shortQuantifier(start, end, maybe_empty, may_repeat_forever) and
  not this.getChar(end) = ["?", "+"]
  or
  exists(int short_end | this.shortQuantifier(start, short_end, maybe_empty, may_repeat_forever) |
    if this.getChar(short_end) = ["?", "+"] then end = short_end + 1 else end = short_end
  )
}
/**
 * Holds if a quantified part is found between `start` and `part_end` and the quantifier is
 * found between `part_end` and `end`.
 *
 * `maybe_empty` is true if the part is optional (the quantifier allows zero repetitions).
 * `may_repeat_forever` is true if the part may be repeated unboundedly.
 */
predicate quantifiedPart(
  int start, int part_end, int end, boolean maybe_empty, boolean may_repeat_forever
) {
  this.baseItem(start, part_end) and
  this.quantifier(part_end, end, maybe_empty, may_repeat_forever)
}
/**
 * Holds if the text in the range `start`,`end` is a quantified item (including
 * the quantifier itself), where the item is a character, a character set or a group.
 */
predicate quantifiedItem(int start, int end, boolean maybe_empty, boolean may_repeat_forever) {
  this.quantifiedPart(start, _, end, maybe_empty, may_repeat_forever)
}
/**
 * Holds if the range `start`,`end` contains a single (possibly quantified)
 * item: a character, a character set or a group, together with its quantifier if any.
 */
predicate item(int start, int end) {
  this.quantifiedItem(start, end, _, _)
  or
  this.baseItem(start, end) and not this.quantifier(end, _, _, _)
}
/** Holds if some item begins at offset `start`. */
private predicate itemStart(int start) {
  this.character(start, _) or
  this.isGroupStart(start) or
  this.charSet(start, _) or
  this.backreference(start, _) or
  this.quote(start, _)
}
/** Holds if some item (or its quantifier) ends at offset `end`. */
private predicate itemEnd(int end) {
  this.character(_, end)
  or
  exists(int endm1 | this.isGroupEnd(endm1) and end = endm1 + 1)
  or
  this.charSet(_, end)
  or
  this.quantifier(_, end, _, _)
  or
  this.quote(_, end)
}
/** Holds if position `i` holds an unescaped `|`. */
private predicate isOptionDivider(int i) { this.nonEscapedCharAt(i) = "|" }
/**
 * Holds if `start`..`end` is a (possibly partial) run of consecutive items,
 * starting at the beginning of the string, just after a group opening, or
 * just after an alternation divider.
 */
private predicate subsequence(int start, int end) {
  (
    start = 0 or
    this.groupStart(_, start) or
    this.isOptionDivider(start - 1)
  ) and
  this.item(start, end)
  or
  // extend an existing subsequence by one item
  exists(int mid |
    this.subsequence(start, mid) and
    this.item(mid, end)
  )
}
/** Holds if `start`..`end` is a maximal run of items (no further item begins at `end`). */
private predicate sequenceOrQuantified(int start, int end) {
  this.subsequence(start, end) and
  not this.itemStart(end)
}
/**
 * Holds if the text in the range `start`,`end` is a sequence of 1 or more items,
 * where an item is a character, a character set or a group — and the whole range
 * is not itself a single quantified item.
 */
predicate sequence(int start, int end) {
  this.sequenceOrQuantified(start, end) and
  not this.quantifiedItem(start, end, _, _)
}
/**
 * Holds if `start`..`end` is a (possibly partial) alternation whose last
 * option begins at `itemStart`. Empty options (e.g. in `a||b`) are
 * represented by `start = end`.
 */
private predicate subalternation(int start, int end, int itemStart) {
  // a single (first) option
  this.sequenceOrQuantified(start, end) and
  not this.isOptionDivider(start - 1) and
  itemStart = start
  or
  // an empty first option, as in `|a`
  start = end and
  not this.itemEnd(start) and
  this.isOptionDivider(end) and
  itemStart = start
  or
  // extend an alternation by one more option past a `|`
  exists(int mid |
    this.subalternation(start, mid, _) and
    this.isOptionDivider(mid) and
    itemStart = mid + 1
  |
    this.sequenceOrQuantified(itemStart, end)
    or
    // an empty trailing option
    not this.itemStart(end) and end = itemStart
  )
}
/** Holds if `start`..`end` is a maximal alternation (not itself an option of a larger one). */
private predicate topLevel(int start, int end) {
  not this.inCharSet(start) and
  this.subalternation(start, end, _) and
  not this.isOptionDivider(end)
}
/**
 * Holds if the text in the range `start`,`end` is an alternation
 * with at least two options.
 */
predicate alternation(int start, int end) {
  this.topLevel(start, end) and
  // a proper prefix that is itself a subalternation implies at least one `|`
  exists(int less | this.subalternation(start, less, _) and less < end)
}
/**
 * Holds if the text in the range `start`,`end` is an alternation and the text in
 * `part_start`,`part_end` is one of the options in that alternation.
 */
predicate alternationOption(int start, int end, int part_start, int part_end) {
  this.alternation(start, end) and
  this.subalternation(start, part_end, part_start)
}
/**
 * Gets the `i`th character of this literal as it was written in the source code,
 * i.e. before string-escape processing.
 */
string getSourceChar(int i) { result = this.(StringLiteral).getLiteral().charAt(i) }
/**
 * Helper predicate for `sourceEscapingChar` that
 * results in a boolean in order to avoid negative recursion.
 */
private boolean sourceEscaping(int pos) {
  // base case: just before the start of the literal, nothing is escaping
  pos = -1 and result = false
  or
  // a backslash is escaping iff it is not itself escaped by the previous character
  this.getSourceChar(pos) = "\\" and
  result = this.sourceEscaping(pos - 1).booleanNot()
  or
  this.getSourceChar(pos) != "\\" and result = false
}
/**
 * Equivalent of `escapingChar` for the literal source rather than the string value.
 * Holds if the character at position `pos` in the source literal is a '\' that is
 * actually escaping what comes after it.
 */
private predicate sourceEscapingChar(int pos) { this.sourceEscaping(pos) = true }

/**
 * Holds if an escaped character exists between `start` and `end` in the source literal.
 * A unicode escape (`\uXXXX`) spans six source characters; any other escape spans two.
 */
private predicate sourceEscapedCharacter(int start, int end) {
  this.sourceEscapingChar(start) and
  (if this.getSourceChar(start + 1) = "u" then end = start + 6 else end = start + 2)
}
/** Holds if position `i` in the source literal is not part of any escape sequence. */
private predicate sourceNonEscapedCharacter(int i) {
  exists(this.getSourceChar(i)) and
  not exists(int x, int y | this.sourceEscapedCharacter(x, y) and i in [x .. y - 1])
}
/**
 * Holds if a single character (an escape sequence or a plain character) is
 * represented between `start` and `end` in the source literal.
 */
private predicate sourceCharacter(int start, int end) {
  this.sourceEscapedCharacter(start, end)
  or
  this.sourceNonEscapedCharacter(start) and
  end = start + 1
}
/**
 * Holds if the `i`th character of the string is represented between offsets
 * `start` (inclusive) and `end` (exclusive) in the source code of this literal.
 * This only gives correct results if the literal is written as a normal single-line string literal;
 * without compile-time concatenation involved.
 */
predicate sourceCharacter(int pos, int start, int end) {
  exists(this.getChar(pos)) and
  this.sourceCharacter(start, end) and
  // NOTE(review): the offset of 2 presumably accounts for 1-based `rank`
  // plus the literal's opening quote character — confirm.
  start = rank[pos + 2](int s | this.sourceCharacter(s, _))
}
}
/** A string literal used as a regular expression */
class Regex extends RegexString {
  // true if the regex is used in a context that matches it against the full input string
  boolean matches_full_string;

  Regex() { usedAsRegex(this, _, matches_full_string) }

  /**
   * Gets a mode (if any) of this regular expression, either from the use site
   * or from an inline flag group. Can be any of:
   * DEBUG
   * IGNORECASE
   * MULTILINE
   * DOTALL
   * UNICODE
   * VERBOSE
   * UNICODECLASS
   */
  string getAMode() {
    result != "None" and
    usedAsRegex(this, result, _)
    or
    result = this.getModeFromPrefix()
  }

  /**
   * Holds if this regex is used to match against a full string,
   * as though it was implicitly surrounded by ^ and $.
   */
  predicate matchesFullString() { matches_full_string = true }
}

View File

@@ -0,0 +1,371 @@
/**
* This library implements the analysis described in the following two papers:
*
* James Kirrage, Asiri Rathnayake, Hayo Thielecke: Static Analysis for
* Regular Expression Denial-of-Service Attacks. NSS 2013.
* (http://www.cs.bham.ac.uk/~hxt/research/reg-exp-sec.pdf)
* Asiri Rathnayake, Hayo Thielecke: Static Analysis for Regular Expression
* Exponential Runtime via Substructural Logics. 2014.
* (https://www.cs.bham.ac.uk/~hxt/research/redos_full.pdf)
*
* The basic idea is to search for overlapping cycles in the NFA, that is,
* states `q` such that there are two distinct paths from `q` to itself
* that consume the same word `w`.
*
* For any such state `q`, an attack string can be constructed as follows:
* concatenate a prefix `v` that takes the NFA to `q` with `n` copies of
* the word `w` that leads back to `q` along two different paths, followed
* by a suffix `x` that is _not_ accepted in state `q`. A backtracking
* implementation will need to explore at least 2^n different ways of going
* from `q` back to itself while trying to match the `n` copies of `w`
* before finally giving up.
*
* Now in order to identify overlapping cycles, all we have to do is find
* pumpable forks, that is, states `q` that can transition to two different
* states `r1` and `r2` on the same input symbol `c`, such that there are
* paths from both `r1` and `r2` to `q` that consume the same word. The latter
* condition is equivalent to saying that `(q, q)` is reachable from `(r1, r2)`
* in the product NFA.
*
* This is what the library does. It makes a simple attempt to construct a
* prefix `v` leading into `q`, but only to improve the alert message.
* And the library tries to prove the existence of a suffix that ensures
* rejection. This check might fail, which can cause false positives.
*
* Finally, sometimes it depends on the translation whether the NFA generated
* for a regular expression has a pumpable fork or not. We implement one
* particular translation, which may result in false positives or negatives
* relative to some particular JavaScript engine.
*
* More precisely, the library constructs an NFA from a regular expression `r`
* as follows:
*
* * Every sub-term `t` gives rise to an NFA state `Match(t,i)`, representing
* the state of the automaton before attempting to match the `i`th character in `t`.
* * There is one accepting state `Accept(r)`.
* * There is a special `AcceptAnySuffix(r)` state, which accepts any suffix string
* by using an epsilon transition to `Accept(r)` and an any transition to itself.
* * Transitions between states may be labelled with epsilon, or an abstract
* input symbol.
* * Each abstract input symbol represents a set of concrete input characters:
* either a single character, a set of characters represented by a
* character class, or the set of all characters.
* * The product automaton is constructed lazily, starting with pair states
 * `(q, q)` where `q` is a fork, and proceeding along an over-approximate
* step relation.
* * The over-approximate step relation allows transitions along pairs of
* abstract input symbols where the symbols have overlap in the characters they accept.
* * Once a trace of pairs of abstract input symbols that leads from a fork
* back to itself has been identified, we attempt to construct a concrete
* string corresponding to it, which may fail.
* * Lastly we ensure that any state reached by repeating `n` copies of `w` has
 * a suffix `x` (possibly empty) that is most likely __not__ accepted.
*/
import ReDoSUtil
/**
 * Holds if state `s` might be inside a backtracking repetition,
 * i.e. its representing term has an enclosing `MaybeBacktrackingRepetition`.
 */
pragma[noinline]
private predicate stateInsideBacktracking(State s) {
  s.getRepr().getParent*() instanceof MaybeBacktrackingRepetition
}
/**
 * An infinitely repeating quantifier that might backtrack: it contains an
 * alternation or a nested quantifier, giving the matcher a choice to revisit.
 */
private class MaybeBacktrackingRepetition extends InfiniteRepetitionQuantifier {
  MaybeBacktrackingRepetition() {
    exists(RegExpTerm child |
      child instanceof RegExpAlt or
      child instanceof RegExpQuantifier
    |
      child.getParent+() = this
    )
  }
}
/**
 * A state in the product automaton.
 */
private newtype TStatePair =
  /**
   * We lazily only construct those states that we are actually
   * going to need: `(q, q)` for every fork state `q`, and any
   * pair of states that can be reached from a pair that we have
   * already constructed. To cut down on the number of states,
   * we only represent states `(q1, q2)` where `q1` is lexicographically
   * no bigger than `q2`.
   *
   * States are only constructed if both states in the pair are
   * inside a repetition that might backtrack.
   */
  MkStatePair(State q1, State q2) {
    // fork states start the search
    isFork(q1, _, _, _, _) and q2 = q1
    or
    // states reachable from an already-constructed pair, canonically ordered
    (step(_, _, _, q1, q2) or step(_, _, _, q2, q1)) and
    rankState(q1) <= rankState(q2)
  }
/**
 * Gets a unique number for `state`.
 * Used to create a total ordering of states, where states with the same
 * `toString()` are still ordered differently (ties broken by location).
 */
private int rankState(State state) {
  state =
    rank[result](State s, Location l |
      l = s.getRepr().getLocation()
    |
      s order by l.getStartLine(), l.getStartColumn(), s.toString()
    )
}
/**
 * A state in the product automaton: an ordered pair of NFA states.
 */
private class StatePair extends TStatePair {
  State q1;
  State q2;

  StatePair() { this = MkStatePair(q1, q2) }

  /** Gets a textual representation of this element. */
  string toString() { result = "(" + q1 + ", " + q2 + ")" }

  /** Gets the first component of the state pair. */
  State getLeft() { result = q1 }

  /** Gets the second component of the state pair. */
  State getRight() { result = q2 }
}
/**
 * Holds for all constructed state pairs.
 *
 * Used as the node relation in `statePairDist`.
 */
private predicate isStatePair(StatePair p) { any() }
/**
 * Holds if there are transitions from the components of `q` to the corresponding
 * components of `r`.
 *
 * Used as the edge relation in `statePairDist`.
 */
private predicate delta2(StatePair q, StatePair r) { step(q, _, _, r) }
/**
 * Gets the minimum number of transitions on a path from `q` to `r` in the
 * product automaton (using the built-in shortest-distances higher-order predicate).
 */
private int statePairDist(StatePair q, StatePair r) =
  shortestDistances(isStatePair/1, delta2/2)(q, r, result)
/**
 * Holds if there are transitions from `q` to `r1` and from `q` to `r2`
 * labelled with `s1` and `s2`, respectively, where `s1` and `s2` do not
 * trivially have an empty intersection.
 *
 * This predicate only holds for states associated with regular expressions
 * that have at least one repetition quantifier in them (otherwise the
 * expression cannot be vulnerable to ReDoS attacks anyway).
 */
pragma[noopt]
private predicate isFork(State q, InputSymbol s1, InputSymbol s2, State r1, State r2) {
  stateInsideBacktracking(q) and
  exists(State q1, State q2 |
    q1 = epsilonSucc*(q) and
    delta(q1, s1, r1) and
    q2 = epsilonSucc*(q) and
    delta(q2, s2, r2) and
    // Use pragma[noopt] to prevent intersect(s1,s2) from being the starting point of the join.
    // From (s1,s2) it would find a huge number of intermediate state pairs (q1,q2) originating from different literals,
    // and discover at the end that no `q` can reach both `q1` and `q2` by epsilon transitions.
    exists(intersect(s1, s2))
  |
    s1 != s2
    or
    r1 != r2
    or
    r1 = r2 and q1 != q2
    or
    // If q can reach itself by epsilon transitions, then there are two distinct paths to the q1/q2 state:
    // one that uses the loop and one that doesn't. The engine will separately attempt to match with each path,
    // despite ending in the same state. The "fork" thus arises from the choice of whether to use the loop or not.
    // To avoid every state in the loop becoming a fork state,
    // we arbitrarily pick the InfiniteRepetitionQuantifier state as the canonical fork state for the loop
    // (every epsilon-loop must contain such a state).
    //
    // We additionally require that there exists another InfiniteRepetitionQuantifier `mid` on the path from `q` to itself.
    // This is done to avoid flagging regular expressions such as `/(a?)*b/` - that only has polynomial runtime, and is detected by `js/polynomial-redos`.
    // The below code is therefore a heuristic, that only flags regular expressions such as `/(a*)*b/`,
    // and does not flag regular expressions such as `/(a?b?)c/`, but the latter pattern is not used frequently.
    r1 = r2 and
    q1 = q2 and
    epsilonSucc+(q) = q and
    exists(RegExpTerm term | term = q.getRepr() | term instanceof InfiniteRepetitionQuantifier) and
    // One of the mid states is an infinite quantifier itself
    exists(State mid, RegExpTerm term |
      mid = epsilonSucc+(q) and
      term = mid.getRepr() and
      term instanceof InfiniteRepetitionQuantifier and
      q = epsilonSucc+(mid) and
      not mid = q
    )
  ) and
  stateInsideBacktracking(r1) and
  stateInsideBacktracking(r2)
}
/**
 * Gets the canonical state pair for `q1` and `q2`: whichever of
 * `(q1, q2)` and `(q2, q1)` was actually constructed (only one is).
 */
private StatePair mkStatePair(State q1, State q2) {
  result = MkStatePair(q1, q2)
  or
  result = MkStatePair(q2, q1)
}
/**
 * Holds if there are transitions from the components of `q` to the corresponding
 * components of `r` labelled with `s1` and `s2`, respectively.
 */
private predicate step(StatePair q, InputSymbol s1, InputSymbol s2, StatePair r) {
  // canonicalize the resulting pair via mkStatePair
  exists(State r1, State r2 | step(q, s1, s2, r1, r2) and r = mkStatePair(r1, r2))
}
/**
 * Holds if there are transitions from the components of `q` to `r1` and `r2`
 * labelled with `s1` and `s2`, respectively.
 *
 * We only consider transitions where the resulting states `(r1, r2)` are both
 * inside a repetition that might backtrack.
 */
pragma[noopt]
private predicate step(StatePair q, InputSymbol s1, InputSymbol s2, State r1, State r2) {
  exists(State q1, State q2 | q.getLeft() = q1 and q.getRight() = q2 |
    deltaClosed(q1, s1, r1) and
    deltaClosed(q2, s2, r2) and
    // use noopt to force the join on `intersect` to happen last.
    exists(intersect(s1, s2))
  ) and
  stateInsideBacktracking(r1) and
  stateInsideBacktracking(r2)
}
/** A (backwards) list of pairs of input symbols, lazily constructed along reachable paths. */
private newtype TTrace =
  Nil() or
  Step(InputSymbol s1, InputSymbol s2, TTrace t) {
    // extend a trace already reachable from some fork
    exists(StatePair p |
      isReachableFromFork(_, p, t, _) and
      step(p, s1, s2, _)
    )
    or
    // the first step out of a fork
    t = Nil() and isFork(_, s1, s2, _, _)
  }
/**
 * A list of pairs of input symbols that describe a path in the product automaton
 * starting from some fork state.
 */
private class Trace extends TTrace {
  /** Gets a textual representation of this element. */
  string toString() {
    this = Nil() and result = "Nil()"
    or
    exists(InputSymbol s1, InputSymbol s2, Trace t | this = Step(s1, s2, t) |
      result = "Step(" + s1 + ", " + s2 + ", " + t + ")"
    )
  }
}
/**
 * Holds if `r` is reachable from `(fork, fork)` under input `w`, and there is
 * a path from `r` back to `(fork, fork)` with `rem` steps.
 * The `rem` bound prunes the search: paths that stray further from the fork
 * than their remaining budget allows are not explored.
 */
private predicate isReachableFromFork(State fork, StatePair r, Trace w, int rem) {
  // base case
  exists(InputSymbol s1, InputSymbol s2, State q1, State q2 |
    isFork(fork, s1, s2, q1, q2) and
    r = MkStatePair(q1, q2) and
    w = Step(s1, s2, Nil()) and
    rem = statePairDist(r, MkStatePair(fork, fork))
  )
  or
  // recursive case
  exists(StatePair p, Trace v, InputSymbol s1, InputSymbol s2 |
    isReachableFromFork(fork, p, v, rem + 1) and
    step(p, s1, s2, r) and
    w = Step(s1, s2, v) and
    rem >= statePairDist(r, MkStatePair(fork, fork))
  )
}
/**
 * Gets a state pair in the product automaton from which `(fork, fork)` is
 * reachable in zero or more epsilon transitions.
 */
private StatePair getAForkPair(State fork) {
  isFork(fork, _, _, _, _) and
  result = MkStatePair(epsilonPred*(fork), epsilonPred*(fork))
}
/** Holds if `suffix` is the trace obtained by dropping the first `i` steps of `t`. */
private predicate hasSuffix(Trace suffix, Trace t, int i) {
  // Declaring `t` to be a `RelevantTrace` currently causes a redundant check in the
  // recursive case, so instead we check it explicitly here.
  t instanceof RelevantTrace and
  i = 0 and
  suffix = t
  or
  hasSuffix(Step(_, _, suffix), t, i - 1)
}
/** Holds if `(s1, s2)` is the pair of input symbols at position `i` of trace `t`. */
pragma[noinline]
private predicate hasTuple(InputSymbol s1, InputSymbol s2, Trace t, int i) {
  hasSuffix(Step(s1, s2, _), t, i)
}
/**
 * A non-empty trace that leads from some fork pair `(q, q)` back to itself,
 * i.e. a witness of an overlapping cycle in the product automaton.
 */
private class RelevantTrace extends Trace, Step {
  RelevantTrace() {
    exists(State fork, StatePair q |
      isReachableFromFork(fork, q, this, _) and
      q = getAForkPair(fork)
    )
  }

  /** Gets a character matched by both input symbols at position `i` of this trace. */
  pragma[noinline]
  private string intersect(int i) {
    exists(InputSymbol s1, InputSymbol s2 |
      hasTuple(s1, s2, this, i) and
      result = intersect(s1, s2)
    )
  }

  /** Gets a string corresponding to this trace. */
  // the pragma is needed for the case where `intersect(s1, s2)` has multiple values,
  // not for recursion
  language[monotonicAggregates]
  string concretise() {
    // the trace is stored backwards, hence `order by i desc`
    result = strictconcat(int i | hasTuple(_, _, this, i) | this.intersect(i) order by i desc)
  }
}
/**
 * Holds if `fork` is a pumpable fork with word `w`: repeating `w` takes the
 * automaton from the fork back to itself along two distinct paths.
 */
private predicate isPumpable(State fork, string w) {
  exists(RelevantTrace trace |
    isReachableFromFork(fork, getAForkPair(fork), trace, _) and
    w = trace.concretise()
  )
}
/**
 * An instantiation of `ReDoSConfiguration` for exponential backtracking:
 * the ReDoS candidates are precisely the pumpable fork states.
 */
class ExponentialReDoSConfiguration extends ReDoSConfiguration {
  ExponentialReDoSConfiguration() { this = "ExponentialReDoSConfiguration" }

  override predicate isReDoSCandidate(State state, string pump) { isPumpable(state, pump) }
}

View File

@@ -0,0 +1,56 @@
/** Definitions and configurations for the Polynomial ReDoS query */
import semmle.code.java.security.performance.SuperlinearBackTracking
import semmle.code.java.dataflow.DataFlow
import semmle.code.java.regex.RegexTreeView
import semmle.code.java.regex.RegexFlowConfigs
import semmle.code.java.dataflow.FlowSources
/** A sink for polynomial ReDoS queries: a value that is matched against a regular expression. */
class PolynomialRedosSink extends DataFlow::Node {
  RegExpLiteral reg;

  PolynomialRedosSink() { regexMatchedAgainst(reg.getRegex(), this.asExpr()) }

  /** Gets the regex that is matched against this node. */
  RegExpTerm getRegExp() { result.getParent() = reg }
}
/**
 * A method whose result typically has a limited length,
 * such as HTTP headers, and values derived from them.
 * Matching is heuristic, based on method and declaring-type names.
 */
private class LengthRestrictedMethod extends Method {
  LengthRestrictedMethod() {
    this.getName().toLowerCase().matches(["%header%", "%requesturi%", "%requesturl%", "%cookie%"])
    or
    this.getDeclaringType().getName().toLowerCase().matches("%cookie%") and
    this.getName().matches("get%")
    or
    this.getDeclaringType().getName().toLowerCase().matches("%request%") and
    this.getName().toLowerCase().matches(["%get%path%", "get%user%", "%querystring%"])
  }
}
/** A taint-tracking configuration for polynomial ReDoS: remote input flowing to a regex match. */
class PolynomialRedosConfig extends TaintTracking::Configuration {
  PolynomialRedosConfig() { this = "PolynomialRedosConfig" }

  override predicate isSource(DataFlow::Node src) { src instanceof RemoteFlowSource }

  override predicate isSink(DataFlow::Node sink) { sink instanceof PolynomialRedosSink }

  override predicate isSanitizer(DataFlow::Node node) {
    // primitive and boxed values cannot carry attacker-controlled long strings
    node.getType() instanceof PrimitiveType or
    node.getType() instanceof BoxedType or
    // results of methods whose output is typically length-limited
    node.asExpr().(MethodAccess).getMethod() instanceof LengthRestrictedMethod
  }
}
/**
 * Holds if there is flow from `source` to `sink`, where `sink` is matched against
 * the regexp term `regexp` that is vulnerable to polynomial ReDoS.
 */
predicate hasPolynomialReDoSResult(
  DataFlow::PathNode source, DataFlow::PathNode sink, PolynomialBackTrackingTerm regexp
) {
  any(PolynomialRedosConfig config).hasFlowPath(source, sink) and
  regexp.getRootTerm() = sink.getNode().(PolynomialRedosSink).getRegExp()
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,76 @@
/**
* This module should provide a class hierarchy corresponding to a parse tree of regular expressions.
* This is the interface to the shared ReDoS library.
*/
private import java
import semmle.code.FileSystem
import semmle.code.java.regex.RegexTreeView
/**
 * Holds if `term` is an escape class representing e.g. `\d`.
 * `clazz` is which character class it represents, e.g. "d" for `\d`.
 */
predicate isEscapeClass(RegExpTerm term, string clazz) {
  term.(RegExpCharacterClassEscape).getValue() = clazz
  or
  // named properties such as `\p{Digit}` that have a single-letter equivalent
  term.(RegExpNamedProperty).getBackslashEquivalent() = clazz
}
/**
 * Holds if `term` is a possessive quantifier, e.g. `a*+` (one that never backtracks).
 */
predicate isPossessive(RegExpQuantifier term) { term.isPossessive() }
/**
 * Holds if the regex that `term` is part of is used in a way that ignores any
 * leading prefix of the input it's matched against (i.e. not anchored at the start).
 */
predicate matchesAnyPrefix(RegExpTerm term) { not term.getRegex().matchesFullString() }
/**
 * Holds if the regex that `term` is part of is used in a way that ignores any
 * trailing suffix of the input it's matched against (i.e. not anchored at the end).
 */
predicate matchesAnySuffix(RegExpTerm term) { not term.getRegex().matchesFullString() }
/**
 * Holds if the regular expression should not be considered.
 *
 * We make the pragmatic performance optimization to ignore regular expressions in files
 * that do not belong to the project code (such as installed dependencies).
 */
predicate isExcluded(RegExpParent parent) {
  // no relative path means the file is outside the project source root
  not exists(parent.getRegex().getLocation().getFile().getRelativePath())
  or
  // Regexes with many occurrences of ".*" may cause the polynomial ReDoS computation to explode, so
  // we explicitly exclude these.
  strictcount(int i | exists(parent.getRegex().getText().regexpFind("\\.\\*", i, _)) | i) > 10
}
/**
 * A module containing predicates for determining which flags a regular expression has.
 */
module RegExpFlags {
  /**
   * Holds if `root` has the `i` flag for case-insensitive matching.
   */
  predicate isIgnoreCase(RegExpTerm root) {
    root.isRootTerm() and
    root.getLiteral().isIgnoreCase()
  }

  /**
   * Gets the flags for `root`, or the empty string if `root` has no flags.
   */
  string getFlags(RegExpTerm root) {
    root.isRootTerm() and
    result = root.getLiteral().getFlags()
  }

  /**
   * Holds if `root` has the `s` ("dot-all") flag, which makes `.` also match
   * line terminators.
   */
  predicate isDotAll(RegExpTerm root) {
    root.isRootTerm() and
    root.getLiteral().isDotAll()
  }
}

View File

@@ -0,0 +1,454 @@
/**
* Provides classes for working with regular expressions that can
* perform backtracking in superlinear time.
*/
import ReDoSUtil
/*
* This module implements the analysis described in the paper:
* Valentin Wustholz, Oswaldo Olivo, Marijn J. H. Heule, and Isil Dillig:
* Static Detection of DoS Vulnerabilities in
* Programs that use Regular Expressions
* (Extended Version).
* (https://arxiv.org/pdf/1701.04045.pdf)
*
* Theorem 3 from the paper describes the basic idea.
*
* The following explains the idea using variables and predicate names that are used in the implementation:
* We consider a pair of repetitions, which we will call `pivot` and `succ`.
*
* We create a product automaton of 3-tuples of states (see `StateTuple`).
* There exists a transition `(a,b,c) -> (d,e,f)` in the product automaton
* iff there exists three transitions in the NFA `a->d, b->e, c->f` where those three
* transitions all match a shared character `char`. (see `getAThreewayIntersect`)
*
* We start a search in the product automaton at `(pivot, pivot, succ)`,
* and search for a series of transitions (a `Trace`), such that we end
* at `(pivot, succ, succ)` (see `isReachableFromStartTuple`).
*
* For example, consider the regular expression `/^\d*5\w*$/`.
* The search will start at the tuple `(\d*, \d*, \w*)` and search
* for a path to `(\d*, \w*, \w*)`.
* This path exists, and consists of a single transition in the product automaton,
* where the three corresponding NFA edges all match the character `"5"`.
*
* The start-state in the NFA has an any-transition to itself, this allows us to
* flag regular expressions such as `/a*$/` - which does not have a start anchor -
* and can thus start matching anywhere.
*
* The implementation is not perfect.
* It has the same suffix detection issue as the `js/redos` query, which can cause false positives.
* It also doesn't find all transitions in the product automaton, which can cause false negatives.
*/
/**
 * An instantiation of `ReDoSConfiguration` for superlinear ReDoS.
 */
class SuperLinearReDoSConfiguration extends ReDoSConfiguration {
  SuperLinearReDoSConfiguration() { this = "SuperLinearReDoSConfiguration" }

  // A state is a candidate if `pump` can traverse a `(pivot, succ)` loop pair ending in it.
  override predicate isReDoSCandidate(State state, string pump) { isPumpable(_, state, pump) }
}
/**
 * Gets any root (start) state of a regular expression.
 */
// `mkMatch` is defined in ReDoSUtil; applied to a `RegExpRoot` it yields the NFA start state.
private State getRootState() { result = mkMatch(any(RegExpRoot r)) }
// The lazily-constructed 3-tuples of NFA states forming the product automaton.
private newtype TStateTuple =
  MkStateTuple(State q1, State q2, State q3) {
    // starts at (pivot, pivot, succ)
    isStartLoops(q1, q3) and q1 = q2
    or
    // or is reachable from an existing tuple in one product-automaton step,
    // pruned by the feasibility conditions in `FeasibleTuple`
    step(_, _, _, _, q1, q2, q3) and FeasibleTuple::isFeasibleTuple(q1, q2, q3)
  }
/**
 * A state in the product automaton.
 * The product automaton contains 3-tuples of states.
 *
 * We lazily only construct those states that we are actually
 * going to need.
 * Either a start state `(pivot, pivot, succ)`, or a state
 * where there exists a transition from an already existing state.
 *
 * The exponential variant of this query (`js/redos`) uses an optimization
 * trick where `q1 <= q2`. This trick cannot be used here as the order
 * of the elements matters.
 */
class StateTuple extends TStateTuple {
  State q1;
  State q2;
  State q3;

  StateTuple() { this = MkStateTuple(q1, q2, q3) }

  /**
   * Gets a string representation of this tuple.
   */
  string toString() { result = "(" + q1 + ", " + q2 + ", " + q3 + ")" }

  /**
   * Holds if this tuple is `(r1, r2, r3)`.
   */
  pragma[noinline]
  predicate isTuple(State r1, State r2, State r3) { r1 = q1 and r2 = q2 and r3 = q3 }
}
/**
 * A module for determining feasible tuples for the product automaton.
 *
 * The implementation is split into many predicates for performance reasons.
 */
private module FeasibleTuple {
  /**
   * Holds if the tuple `(r1, r2, r3)` might be on a path from a start-state to an end-state in the product automaton.
   */
  // Inlined so the conjuncts below join directly with the caller's context
  // instead of materializing the full cross product of states.
  pragma[inline]
  predicate isFeasibleTuple(State r1, State r2, State r3) {
    // The first element is either inside a repetition (or the start state itself)
    isRepetitionOrStart(r1) and
    // The last element is inside a repetition
    stateInsideRepetition(r3) and
    // The states are reachable in the NFA in the order r1 -> r2 -> r3
    delta+(r1) = r2 and
    delta+(r2) = r3 and
    // The first element can reach a beginning (the "pivot" state in a `(pivot, succ)` pair).
    canReachABeginning(r1) and
    // The last element can reach a target (the "succ" state in a `(pivot, succ)` pair).
    canReachATarget(r3)
  }

  /**
   * Holds if `s` is either inside a repetition, or is the start state (which is a repetition).
   */
  pragma[noinline]
  private predicate isRepetitionOrStart(State s) { stateInsideRepetition(s) or s = getRootState() }

  /**
   * Holds if state `s` might be inside a backtracking repetition.
   */
  pragma[noinline]
  private predicate stateInsideRepetition(State s) {
    s.getRepr().getParent*() instanceof InfiniteRepetitionQuantifier
  }

  /**
   * Holds if there exists a path in the NFA from `s` to a "pivot" state
   * (from a `(pivot, succ)` pair that starts the search).
   */
  pragma[noinline]
  private predicate canReachABeginning(State s) {
    delta+(s) = any(State pivot | isStartLoops(pivot, _))
  }

  /**
   * Holds if there exists a path in the NFA from `s` to a "succ" state
   * (from a `(pivot, succ)` pair that starts the search).
   */
  pragma[noinline]
  private predicate canReachATarget(State s) { delta+(s) = any(State succ | isStartLoops(_, succ)) }
}
/**
 * Holds if `pivot` and `succ` are a pair of loops that could be the beginning of a quadratic blowup.
 *
 * There is a slight implementation difference compared to the paper: this predicate requires that `pivot != succ`.
 * The case where `pivot = succ` causes exponential backtracking and is handled by the `js/redos` query.
 */
predicate isStartLoops(State pivot, State succ) {
  pivot != succ and
  succ.getRepr() instanceof InfiniteRepetitionQuantifier and
  // `succ` must be reachable from `pivot` by one or more NFA transitions
  delta+(pivot) = succ and
  (
    pivot.getRepr() instanceof InfiniteRepetitionQuantifier
    or
    // the start state also counts as a loop: it has an any-transition to itself
    // (see the module-level comment), so unanchored regexes are covered too
    pivot = mkMatch(any(RegExpRoot root))
  )
}
/**
 * Gets a state for which there exists a transition in the NFA from `s`.
 */
State delta(State s) { delta(s, _, result) }
/**
 * Holds if there are transitions from the components of `q` to the corresponding
 * components of `r` labelled with `s1`, `s2`, and `s3`, respectively.
 */
// Tuple-to-tuple wrapper around the 7-ary `step` below.
pragma[noinline]
predicate step(StateTuple q, InputSymbol s1, InputSymbol s2, InputSymbol s3, StateTuple r) {
  exists(State r1, State r2, State r3 |
    step(q, s1, s2, s3, r1, r2, r3) and r = MkStateTuple(r1, r2, r3)
  )
}
/**
 * Holds if there are transitions from the components of `q` to `r1`, `r2`, and `r3`
 * labelled with `s1`, `s2`, and `s3`, respectively.
 */
// `noopt` disables the optimizer's join reordering; the intended join order is
// written out explicitly below.
pragma[noopt]
predicate step(
  StateTuple q, InputSymbol s1, InputSymbol s2, InputSymbol s3, State r1, State r2, State r3
) {
  exists(State q1, State q2, State q3 | q.isTuple(q1, q2, q3) |
    deltaClosed(q1, s1, r1) and
    deltaClosed(q2, s2, r2) and
    deltaClosed(q3, s3, r3) and
    // use noopt to force the join on `getAThreewayIntersect` to happen last.
    exists(getAThreewayIntersect(s1, s2, s3))
  )
}
/**
 * Gets a char that is matched by all the edges `s1`, `s2`, and `s3`.
 *
 * The result is not complete, and might miss some combination of edges that share some character.
 */
// Each disjunct seeds candidates from the min/max of one pairwise intersection
// (via `minAndMaxIntersect`) and checks membership in the other two pairwise
// intersections; this keeps the relation small at the cost of completeness.
pragma[noinline]
string getAThreewayIntersect(InputSymbol s1, InputSymbol s2, InputSymbol s3) {
  result = minAndMaxIntersect(s1, s2) and result = [intersect(s2, s3), intersect(s1, s3)]
  or
  result = minAndMaxIntersect(s1, s3) and result = [intersect(s2, s3), intersect(s1, s2)]
  or
  result = minAndMaxIntersect(s2, s3) and result = [intersect(s1, s2), intersect(s1, s3)]
}
/**
 * Gets the minimum and maximum characters that intersect between `a` and `b`.
 * This predicate is used to limit the size of `getAThreewayIntersect`.
 */
pragma[noinline]
string minAndMaxIntersect(InputSymbol a, InputSymbol b) {
  // at most two results per `(a, b)` pair: the least and greatest shared character
  result = [min(intersect(a, b)), max(intersect(a, b))]
}
// Linked list of transition labels; only traces actually reachable during the
// search are constructed.
private newtype TTrace =
  Nil() or
  Step(InputSymbol s1, InputSymbol s2, InputSymbol s3, TTrace t) {
    // a step extending a trace that already reaches some tuple `p`
    exists(StateTuple p |
      isReachableFromStartTuple(_, _, p, t, _) and
      step(p, s1, s2, s3, _)
    )
    or
    // the first step out of a start tuple `(pivot, pivot, succ)`
    exists(State pivot, State succ | isStartLoops(pivot, succ) |
      t = Nil() and step(MkStateTuple(pivot, pivot, succ), s1, s2, s3, _)
    )
  }
/**
 * A list of tuples of input symbols that describe a path in the product automaton
 * starting from some start state.
 *
 * The head of the list is the most recent transition taken.
 */
class Trace extends TTrace {
  /**
   * Gets a string representation of this Trace that can be used for debug purposes.
   */
  string toString() {
    this = Nil() and result = "Nil()"
    or
    exists(InputSymbol s1, InputSymbol s2, InputSymbol s3, Trace t | this = Step(s1, s2, s3, t) |
      result = "Step(" + s1 + ", " + s2 + ", " + s3 + ", " + t + ")"
    )
  }
}
/**
 * Holds if there exists a transition from `r` to `q` in the product automaton.
 * Notice that the arguments are flipped, and thus the direction is backwards.
 */
// Used as the edge relation for the backwards `shortestDistances` search in `distBackFromEnd`.
pragma[noinline]
predicate tupleDeltaBackwards(StateTuple q, StateTuple r) { step(r, _, _, _, q) }
/**
 * Holds if `tuple` is an end state in our search.
 * That means there exists a pair of loops `(pivot, succ)` such that `tuple = (pivot, succ, succ)`.
 */
predicate isEndTuple(StateTuple tuple) {
  exists(State pivot, State succ | tuple = getAnEndTuple(pivot, succ))
}
/**
 * Gets the minimum length of a path from `r` to some end state `end`.
 *
 * The implementation searches backwards from the end-tuple.
 * This approach was chosen because it is way more efficient if the first predicate given to `shortestDistances` is small.
 * The `end` argument must always be an end state.
 */
int distBackFromEnd(StateTuple r, StateTuple end) =
  shortestDistances(isEndTuple/1, tupleDeltaBackwards/2)(end, r, result)
/**
 * Holds if there exists a pair of repetitions `(pivot, succ)` in the regular expression such that:
 * `tuple` is reachable from `(pivot, pivot, succ)` in the product automaton,
 * and there is a distance of `dist` from `tuple` to the nearest end-tuple `(pivot, succ, succ)`,
 * and a path from a start-state to `tuple` follows the transitions in `trace`.
 */
predicate isReachableFromStartTuple(State pivot, State succ, StateTuple tuple, Trace trace, int dist) {
  // base case. The first step is inlined to start the search after all possible 1-steps, and not just the ones with the shortest path.
  exists(InputSymbol s1, InputSymbol s2, InputSymbol s3, State q1, State q2, State q3 |
    isStartLoops(pivot, succ) and
    step(MkStateTuple(pivot, pivot, succ), s1, s2, s3, tuple) and
    tuple = MkStateTuple(q1, q2, q3) and
    trace = Step(s1, s2, s3, Nil()) and
    dist = distBackFromEnd(tuple, MkStateTuple(pivot, succ, succ))
  )
  or
  // recursive case: extend a known trace by one step; the distance decreases by
  // exactly 1 with each step, which keeps the search on shortest paths only
  exists(StateTuple p, Trace v, InputSymbol s1, InputSymbol s2, InputSymbol s3 |
    isReachableFromStartTuple(pivot, succ, p, v, dist + 1) and
    dist = isReachableFromStartTupleHelper(pivot, succ, tuple, p, s1, s2, s3) and
    trace = Step(s1, s2, s3, v)
  )
}
/**
 * Helper predicate for the recursive case in `isReachableFromStartTuple`.
 *
 * Gets the distance from `r` to the end-tuple `(pivot, succ, succ)`, where
 * `r` is reachable from `p` via the transitions `s1`, `s2`, `s3`.
 */
pragma[noinline]
private int isReachableFromStartTupleHelper(
  State pivot, State succ, StateTuple r, StateTuple p, InputSymbol s1, InputSymbol s2,
  InputSymbol s3
) {
  result = distBackFromEnd(r, MkStateTuple(pivot, succ, succ)) and
  step(p, s1, s2, s3, r)
}
/**
 * Gets the tuple `(pivot, succ, succ)` from the product automaton.
 *
 * Only defined when `(pivot, succ)` is a valid start-loop pair.
 */
StateTuple getAnEndTuple(State pivot, State succ) {
  isStartLoops(pivot, succ) and
  result = MkStateTuple(pivot, succ, succ)
}
/**
 * Holds if `suffix` is obtained from the relevant trace `t` by removing
 * the first `i` steps.
 */
private predicate hasSuffix(Trace suffix, Trace t, int i) {
  // Declaring `t` to be a `RelevantTrace` currently causes a redundant check in the
  // recursive case, so instead we check it explicitly here.
  t instanceof RelevantTrace and
  i = 0 and
  suffix = t
  or
  hasSuffix(Step(_, _, _, suffix), t, i - 1)
}
/**
 * Holds if the step at depth `i` in `t` (0-based, counted from the head of the
 * trace, i.e. from the most recent step) is labelled `(s1, s2, s3)`.
 */
pragma[noinline]
private predicate hasTuple(InputSymbol s1, InputSymbol s2, InputSymbol s3, Trace t, int i) {
  hasSuffix(Step(s1, s2, s3, _), t, i)
}
/**
 * A trace that leads from some start tuple `(pivot, pivot, succ)` to the
 * corresponding end tuple `(pivot, succ, succ)` — i.e. a witness for `isPumpable`.
 */
private class RelevantTrace extends Trace, Step {
  RelevantTrace() {
    exists(State pivot, State succ, StateTuple q |
      isReachableFromStartTuple(pivot, succ, q, this, _) and
      q = getAnEndTuple(pivot, succ)
    )
  }

  // Gets a character matched by all three transitions at depth `i` of this trace.
  pragma[noinline]
  private string getAThreewayIntersect(int i) {
    exists(InputSymbol s1, InputSymbol s2, InputSymbol s3 |
      hasTuple(s1, s2, s3, this, i) and
      result = getAThreewayIntersect(s1, s2, s3)
    )
  }

  /** Gets a string corresponding to this trace. */
  // the pragma is needed for the case where `getAThreewayIntersect(s1, s2, s3)` has multiple values,
  // not for recursion
  language[monotonicAggregates]
  string concretise() {
    result =
      strictconcat(int i |
        hasTuple(_, _, _, this, i)
      |
        // `order by i desc` puts the deepest (earliest-taken) step first
        this.getAThreewayIntersect(i) order by i desc
      )
  }
}
/**
 * Holds if matching repetitions of `pump` can:
 * 1) Transition from `pivot` back to `pivot`.
 * 2) Transition from `pivot` to `succ`.
 * 3) Transition from `succ` to `succ`.
 *
 * From theorem 3 in the paper linked in the top of this file we can therefore conclude that
 * the regular expression has polynomial backtracking - if a rejecting suffix exists.
 *
 * This predicate is used by `SuperLinearReDoSConfiguration`, and the final results are
 * available in the `hasReDoSResult` predicate.
 */
predicate isPumpable(State pivot, State succ, string pump) {
  exists(StateTuple q, RelevantTrace t |
    // a trace from `(pivot, pivot, succ)` reaching the end tuple `(pivot, succ, succ)`
    isReachableFromStartTuple(pivot, succ, q, t, _) and
    q = getAnEndTuple(pivot, succ) and
    pump = t.concretise()
  )
}
/**
 * Holds if repetitions of `pump` at `t` will cause polynomial backtracking.
 */
// NOTE(review): the name is misspelled ("polynimal" for "polynomial"); it is kept
// as-is here because callers such as `getReasonString` refer to it by this name.
predicate polynimalReDoS(RegExpTerm t, string pump, string prefixMsg, RegExpTerm prev) {
  exists(State s, State pivot |
    hasReDoSResult(t, pump, s, prefixMsg) and
    isPumpable(pivot, s, _) and
    // `prev` is the earlier loop (the "pivot") whose matches overlap with `t`'s
    prev = pivot.getRepr()
  )
}
/**
 * Gets a message for why `term` can cause polynomial backtracking.
 *
 * The message names the pump string and the preceding loop (`prev`) after
 * which the ambiguous matching can start.
 */
string getReasonString(RegExpTerm term, string pump, string prefixMsg, RegExpTerm prev) {
  polynimalReDoS(term, pump, prefixMsg, prev) and
  // fixed misspelling: "preceeding" -> "preceding" in the user-facing alert message
  result =
    "Strings " + prefixMsg + "with many repetitions of '" + pump +
      "' can start matching anywhere after the start of the preceding " + prev
}
/**
 * A term that may cause a regular expression engine to perform a
 * polynomial number of match attempts, relative to the input length.
 */
class PolynomialBackTrackingTerm extends InfiniteRepetitionQuantifier {
  string reason;
  string pump;
  string prefixMsg;
  RegExpTerm prev;

  PolynomialBackTrackingTerm() {
    reason = getReasonString(this, pump, prefixMsg, prev) and
    // there might be many reasons for this term to have polynomial backtracking - we pick the shortest one.
    reason = min(string msg | msg = getReasonString(this, _, _, _) | msg order by msg.length(), msg)
  }

  /**
   * Holds if all non-empty successors to the polynomial backtracking term match the end of the line.
   */
  predicate isAtEndLine() {
    forall(RegExpTerm succ | this.getSuccessor+() = succ and not matchesEpsilon(succ) |
      succ instanceof RegExpDollar
    )
  }

  /**
   * Gets the string that should be repeated to cause this regular expression to perform polynomially.
   */
  string getPumpString() { result = pump }

  /**
   * Gets a message for which prefix a matching string must start with for this term to cause polynomial backtracking.
   */
  string getPrefixMessage() { result = prefixMsg }

  /**
   * Gets a predecessor to `this`, which also loops on the pump string, and thereby causes polynomial backtracking.
   */
  RegExpTerm getPreviousLoop() { result = prev }

  /**
   * Gets the reason for the number of match attempts.
   */
  string getReason() { result = reason }
}

View File

@@ -0,0 +1,108 @@
<!DOCTYPE qhelp PUBLIC
"-//Semmle//qhelp//EN"
"qhelp.dtd">
<qhelp>
<include src="ReDoSIntroduction.inc.qhelp" />
<example>
<p>
Consider this use of a regular expression, which removes
all leading and trailing whitespace in a string:
</p>
<sample language="java">
Pattern.compile("^\\s+|\\s+$").matcher(text).replaceAll("") // BAD
</sample>
<p>
The sub-expression <code>"\\s+$"</code> will match the
whitespace characters in <code>text</code> from left to right, but it
can start matching anywhere within a whitespace sequence. This is
problematic for strings that do <strong>not</strong> end with a whitespace
character. Such a string will force the regular expression engine to
process each whitespace sequence once per whitespace character in the
sequence.
</p>
<p>
This ultimately means that the time cost of trimming a
string is quadratic in the length of the string. So a string like
<code>"a b"</code> will take milliseconds to process, but a similar
string with a million spaces instead of just one will take several
minutes.
</p>
<p>
Avoid this problem by rewriting the regular expression to
not contain the ambiguity about when to start matching whitespace
sequences. For instance, by using a negative look-behind
(<code>"^\\s+|(?&lt;!\\s)\\s+$"</code>), or just by using the built-in trim
method (<code>text.trim()</code>).
</p>
<p>
Note that the sub-expression <code>"^\\s+"</code> is
<strong>not</strong> problematic as the <code>^</code> anchor restricts
when that sub-expression can start matching, and as the regular
expression engine matches from left to right.
</p>
</example>
<example>
<p>
As a similar, but slightly subtler problem, consider the
regular expression that matches lines with numbers, possibly written
using scientific notation:
</p>
<sample language="java">
"^0\\.\\d+E?\\d+$"
</sample>
<p>
The problem with this regular expression is in the
sub-expression <code>\d+E?\d+</code> because the second
<code>\d+</code> can start matching digits anywhere after the first
match of the first <code>\d+</code> if there is no <code>E</code> in
the input string.
</p>
<p>
This is problematic for strings that do <strong>not</strong>
end with a digit. Such a string will force the regular expression
engine to process each digit sequence once per digit in the sequence,
again leading to a quadratic time complexity.
</p>
<p>
To make the processing faster, the regular expression
should be rewritten such that the two <code>\d+</code> sub-expressions
do not have overlapping matches: <code>"^0\\.\\d+(E\\d+)?$"</code>.
</p>
</example>
<include src="ReDoSReferences.inc.qhelp"/>
</qhelp>

View File

@@ -0,0 +1,24 @@
/**
* @name Polynomial regular expression used on uncontrolled data
* @description A regular expression that can require polynomial time
* to match may be vulnerable to denial-of-service attacks.
* @kind path-problem
* @problem.severity warning
* @security-severity 7.5
* @precision high
* @id java/polynomial-redos
* @tags security
* external/cwe/cwe-730
* external/cwe/cwe-400
*/
import java
import semmle.code.java.security.performance.PolynomialReDoSQuery
import DataFlow::PathGraph

// Each result is a data-flow path from a user-provided value to a use that is
// matched against a regular expression containing `regexp`, a term with
// polynomial backtracking behavior.
from DataFlow::PathNode source, DataFlow::PathNode sink, PolynomialBackTrackingTerm regexp
where hasPolynomialReDoSResult(source, sink, regexp)
select sink, source, sink,
  "This $@ that depends on $@ may run slow on strings " + regexp.getPrefixMessage() +
    "with many repetitions of '" + regexp.getPumpString() + "'.", regexp, "regular expression",
  source.getNode(), "a user-provided value"

View File

@@ -0,0 +1,34 @@
<!DOCTYPE qhelp PUBLIC
"-//Semmle//qhelp//EN"
"qhelp.dtd">
<qhelp>
<include src="ReDoSIntroduction.inc.qhelp" />
<example>
<p>
Consider this regular expression:
</p>
<sample language="java">
^_(__|.)+_$
</sample>
<p>
Its sub-expression <code>"(__|.)+"</code> can match the string <code>"__"</code> either by the
first alternative <code>"__"</code> to the left of the <code>"|"</code> operator, or by two
repetitions of the second alternative <code>"."</code> to the right. Thus, a string consisting
of an odd number of underscores followed by some other character will cause the regular
expression engine to run for an exponential amount of time before rejecting the input.
</p>
<p>
This problem can be avoided by rewriting the regular expression to remove the ambiguity between
the two branches of the alternative inside the repetition:
</p>
<sample language="java">
^_(__|[^_])+_$
</sample>
</example>
<include src="ReDoSReferences.inc.qhelp"/>
</qhelp>

View File

@@ -0,0 +1,26 @@
/**
* @name Inefficient regular expression
* @description A regular expression that requires exponential time to match certain inputs
* can be a performance bottleneck, and may be vulnerable to denial-of-service
* attacks.
* @kind problem
* @problem.severity error
* @security-severity 7.5
* @precision high
* @id java/redos
* @tags security
* external/cwe/cwe-730
* external/cwe/cwe-400
*/
import java
import semmle.code.java.security.performance.ExponentialBackTracking

// `t` is the backtracking term, `pump` the string whose repetition triggers the
// blowup, `s` the associated NFA state, and `prefixMsg` describes the required
// prefix of an attack string (see `hasReDoSResult`).
from RegExpTerm t, string pump, State s, string prefixMsg
where
  hasReDoSResult(t, pump, s, prefixMsg) and
  // exclude verbose mode regexes for now
  not t.getRegex().getAMode() = "VERBOSE"
select t,
  "This part of the regular expression may cause exponential backtracking on strings " + prefixMsg +
    "containing many repetitions of '" + pump + "'."

View File

@@ -0,0 +1,61 @@
<!DOCTYPE qhelp PUBLIC
"-//Semmle//qhelp//EN"
"qhelp.dtd">
<qhelp>
<overview>
<p>
Some regular expressions take a long time to match certain
input strings to the point where the time it takes to match a string
of length <i>n</i> is proportional to <i>n<sup>k</sup></i> or even
<i>2<sup>n</sup></i>. Such regular expressions can negatively affect
performance, or even allow a malicious user to perform a Denial of
Service ("DoS") attack by crafting an expensive input string for the
regular expression to match.
</p>
<p>
The regular expression engine provided by Java uses a backtracking non-deterministic finite
automata to implement regular expression matching. While this approach
is space-efficient and allows supporting advanced features like
capture groups, it is not time-efficient in general. The worst-case
time complexity of such an automaton can be polynomial or even
exponential, meaning that for strings of a certain shape, increasing
the input length by ten characters may make the automaton about 1000
times slower.
</p>
<p>
Typically, a regular expression is affected by this
problem if it contains a repetition of the form <code>r*</code> or
<code>r+</code> where the sub-expression <code>r</code> is ambiguous
in the sense that it can match some string in multiple ways. More
information about the precise circumstances can be found in the
references.
</p>
<p>
Note that Java versions 9 and above have some mitigations against ReDoS; however they aren't perfect
and more complex regular expressions can still be affected by this problem.
</p>
</overview>
<recommendation>
<p>
Modify the regular expression to remove the ambiguity, or
ensure that the strings matched with the regular expression are short
enough that the time-complexity does not matter.
Alternatively, an alternate regex library that guarantees linear time execution, such as Google's RE2J, may be used.
</p>
</recommendation>
</qhelp>

View File

@@ -0,0 +1,16 @@
<!DOCTYPE qhelp PUBLIC
"-//Semmle//qhelp//EN"
"qhelp.dtd">
<qhelp>
<references>
<li>
OWASP:
<a href="https://www.owasp.org/index.php/Regular_expression_Denial_of_Service_-_ReDoS">Regular expression Denial of Service - ReDoS</a>.
</li>
<li>Wikipedia: <a href="https://en.wikipedia.org/wiki/ReDoS">ReDoS</a>.</li>
<li>Wikipedia: <a href="https://en.wikipedia.org/wiki/Time_complexity">Time complexity</a>.</li>
<li>James Kirrage, Asiri Rathnayake, Hayo Thielecke:
<a href="http://www.cs.bham.ac.uk/~hxt/research/reg-exp-sec.pdf">Static Analysis for Regular Expression Denial-of-Service Attack</a>.
</li>
</references>
</qhelp>

View File

@@ -0,0 +1,6 @@
---
category: newQuery
---
* Two new queries "Inefficient regular expression" (`java/redos`) and "Polynomial regular expression used on uncontrolled data" (`java/polynomial-redos`) have been added.
These queries help find instances of Regular Expression Denial of Service vulnerabilities.

View File

@@ -73,6 +73,11 @@ jdk/StringMatch.java:
# 5| 0: [MethodAccess] matches(...)
# 5| -1: [VarAccess] STR
# 5| 0: [StringLiteral] "[a-z]+"
# 5| 0: [RegExpPlus] [a-z]+
# 5| 0: [RegExpCharacterClass] [a-z]
# 5| 0: [RegExpCharacterRange] a-z
# 5| 0: [RegExpConstant | RegExpNormalChar] a
# 5| 1: [RegExpConstant | RegExpNormalChar] z
# 8| 5: [Method] b
# 8| 3: [TypeAccess] void
# 8| 5: [BlockStmt] { ... }

View File

@@ -101,4 +101,4 @@ public class Test {
}
}
}

View File

@@ -0,0 +1,207 @@
parseFailures
#select
| Test.java:5:10:5:17 | [A-Z\\d] | [RegExpCharacterClass] |
| Test.java:5:10:5:19 | [A-Z\\d]++ | [RegExpPlus] |
| Test.java:5:11:5:11 | A | [RegExpConstant,RegExpNormalChar] |
| Test.java:5:11:5:13 | A-Z | [RegExpCharacterRange] |
| Test.java:5:13:5:13 | Z | [RegExpConstant,RegExpNormalChar] |
| Test.java:5:14:5:16 | \\d | [RegExpCharacterClassEscape] |
| Test.java:6:10:6:42 | \\Q hello world [ *** \\Q ) ( \\E | [RegExpConstant,RegExpQuote] |
| Test.java:6:10:6:43 | \\Q hello world [ *** \\Q ) ( \\E+ | [RegExpPlus] |
| Test.java:7:10:7:23 | [\\Q hi ] \\E] | [RegExpCharacterClass] |
| Test.java:7:10:7:24 | [\\Q hi ] \\E]+ | [RegExpPlus] |
| Test.java:7:11:7:22 | \\Q hi ] \\E | [RegExpConstant,RegExpQuote] |
| Test.java:8:10:8:12 | []] | [RegExpCharacterClass] |
| Test.java:8:10:8:13 | []]+ | [RegExpPlus] |
| Test.java:8:11:8:11 | ] | [RegExpConstant,RegExpNormalChar] |
| Test.java:9:10:9:13 | [^]] | [RegExpCharacterClass] |
| Test.java:9:10:9:14 | [^]]+ | [RegExpPlus] |
| Test.java:9:12:9:12 | ] | [RegExpConstant,RegExpNormalChar] |
| Test.java:10:10:10:20 | [abc[defg]] | [RegExpCharacterClass] |
| Test.java:10:10:10:21 | [abc[defg]]+ | [RegExpPlus] |
| Test.java:10:11:10:11 | a | [RegExpConstant,RegExpNormalChar] |
| Test.java:10:12:10:12 | b | [RegExpConstant,RegExpNormalChar] |
| Test.java:10:13:10:13 | c | [RegExpConstant,RegExpNormalChar] |
| Test.java:10:14:10:14 | [ | [RegExpConstant,RegExpNormalChar] |
| Test.java:10:15:10:15 | d | [RegExpConstant,RegExpNormalChar] |
| Test.java:10:16:10:16 | e | [RegExpConstant,RegExpNormalChar] |
| Test.java:10:17:10:17 | f | [RegExpConstant,RegExpNormalChar] |
| Test.java:10:18:10:18 | g | [RegExpConstant,RegExpNormalChar] |
| Test.java:10:19:10:19 | ] | [RegExpConstant,RegExpNormalChar] |
| Test.java:11:10:11:57 | [abc&&[\\W\\p{Lower}\\P{Space}\\N{degree sign}]] | [RegExpCharacterClass] |
| Test.java:11:10:11:69 | [abc&&[\\W\\p{Lower}\\P{Space}\\N{degree sign}]]\\b7\\b{g}8+ | [RegExpSequence] |
| Test.java:11:11:11:11 | a | [RegExpConstant,RegExpNormalChar] |
| Test.java:11:12:11:12 | b | [RegExpConstant,RegExpNormalChar] |
| Test.java:11:13:11:13 | c | [RegExpConstant,RegExpNormalChar] |
| Test.java:11:14:11:14 | & | [RegExpConstant,RegExpNormalChar] |
| Test.java:11:15:11:15 | & | [RegExpConstant,RegExpNormalChar] |
| Test.java:11:16:11:16 | [ | [RegExpConstant,RegExpNormalChar] |
| Test.java:11:17:11:19 | \\W | [RegExpCharacterClassEscape] |
| Test.java:11:20:11:29 | \\p{Lower} | [RegExpCharacterClassEscape] |
| Test.java:11:30:11:39 | \\P{Space} | [RegExpCharacterClassEscape] |
| Test.java:11:40:11:55 | \\N{degree sign} | [RegExpConstant,RegExpEscape] |
| Test.java:11:56:11:56 | ] | [RegExpConstant,RegExpNormalChar] |
| Test.java:11:58:11:60 | \\b | [RegExpConstant,RegExpEscape] |
| Test.java:11:61:11:61 | 7 | [RegExpConstant,RegExpNormalChar] |
| Test.java:11:62:11:67 | \\b{g} | [RegExpConstant,RegExpEscape] |
| Test.java:11:68:11:68 | 8 | [RegExpConstant,RegExpNormalChar] |
| Test.java:11:68:11:69 | 8+ | [RegExpPlus] |
| Test.java:12:10:12:13 | \\cA | [RegExpConstant,RegExpEscape] |
| Test.java:12:10:12:14 | \\cA+ | [RegExpPlus] |
| Test.java:13:10:13:13 | \\c( | [RegExpConstant,RegExpEscape] |
| Test.java:13:10:13:14 | \\c(+ | [RegExpPlus] |
| Test.java:14:10:14:14 | \\c\\ | [RegExpConstant,RegExpEscape] |
| Test.java:14:10:14:19 | \\c\\(ab)+ | [RegExpSequence] |
| Test.java:14:15:14:18 | (ab) | [RegExpGroup] |
| Test.java:14:15:14:19 | (ab)+ | [RegExpPlus] |
| Test.java:14:16:14:16 | a | [RegExpConstant,RegExpNormalChar] |
| Test.java:14:16:14:17 | ab | [RegExpSequence] |
| Test.java:14:17:14:17 | b | [RegExpConstant,RegExpNormalChar] |
| Test.java:15:10:15:15 | (?>hi) | [RegExpGroup] |
| Test.java:15:10:15:45 | (?>hi)(?<name>hell*?o*+)123\\k<name> | [RegExpSequence] |
| Test.java:15:13:15:13 | h | [RegExpConstant,RegExpNormalChar] |
| Test.java:15:13:15:14 | hi | [RegExpSequence] |
| Test.java:15:14:15:14 | i | [RegExpConstant,RegExpNormalChar] |
| Test.java:15:16:15:33 | (?<name>hell*?o*+) | [RegExpGroup] |
| Test.java:15:24:15:24 | h | [RegExpConstant,RegExpNormalChar] |
| Test.java:15:24:15:32 | hell*?o*+ | [RegExpSequence] |
| Test.java:15:25:15:25 | e | [RegExpConstant,RegExpNormalChar] |
| Test.java:15:26:15:26 | l | [RegExpConstant,RegExpNormalChar] |
| Test.java:15:27:15:27 | l | [RegExpConstant,RegExpNormalChar] |
| Test.java:15:27:15:29 | l*? | [RegExpStar] |
| Test.java:15:30:15:30 | o | [RegExpConstant,RegExpNormalChar] |
| Test.java:15:30:15:32 | o*+ | [RegExpStar] |
| Test.java:15:34:15:34 | 1 | [RegExpConstant,RegExpNormalChar] |
| Test.java:15:35:15:35 | 2 | [RegExpConstant,RegExpNormalChar] |
| Test.java:15:36:15:36 | 3 | [RegExpConstant,RegExpNormalChar] |
| Test.java:15:37:15:45 | \\k<name> | [RegExpBackRef] |
| Test.java:16:10:16:10 | a | [RegExpConstant,RegExpNormalChar] |
| Test.java:16:10:16:11 | a+ | [RegExpPlus] |
| Test.java:16:10:16:108 | a+b*c?d{2}e{3,4}f{,5}g{6,}h+?i*?j??k{7}?l{8,9}?m{,10}?n{11,}?o++p*+q?+r{12}+s{13,14}+t{,15}+u{16,}+ | [RegExpSequence] |
| Test.java:16:12:16:12 | b | [RegExpConstant,RegExpNormalChar] |
| Test.java:16:12:16:13 | b* | [RegExpStar] |
| Test.java:16:14:16:14 | c | [RegExpConstant,RegExpNormalChar] |
| Test.java:16:14:16:15 | c? | [RegExpOpt] |
| Test.java:16:16:16:16 | d | [RegExpConstant,RegExpNormalChar] |
| Test.java:16:16:16:19 | d{2} | [RegExpRange] |
| Test.java:16:20:16:20 | e | [RegExpConstant,RegExpNormalChar] |
| Test.java:16:20:16:25 | e{3,4} | [RegExpRange] |
| Test.java:16:26:16:26 | f | [RegExpConstant,RegExpNormalChar] |
| Test.java:16:26:16:30 | f{,5} | [RegExpRange] |
| Test.java:16:31:16:31 | g | [RegExpConstant,RegExpNormalChar] |
| Test.java:16:31:16:35 | g{6,} | [RegExpRange] |
| Test.java:16:36:16:36 | h | [RegExpConstant,RegExpNormalChar] |
| Test.java:16:36:16:38 | h+? | [RegExpPlus] |
| Test.java:16:39:16:39 | i | [RegExpConstant,RegExpNormalChar] |
| Test.java:16:39:16:41 | i*? | [RegExpStar] |
| Test.java:16:42:16:42 | j | [RegExpConstant,RegExpNormalChar] |
| Test.java:16:42:16:44 | j?? | [RegExpOpt] |
| Test.java:16:45:16:45 | k | [RegExpConstant,RegExpNormalChar] |
| Test.java:16:45:16:49 | k{7}? | [RegExpQuantifier] |
| Test.java:16:50:16:50 | l | [RegExpConstant,RegExpNormalChar] |
| Test.java:16:50:16:56 | l{8,9}? | [RegExpQuantifier] |
| Test.java:16:57:16:57 | m | [RegExpConstant,RegExpNormalChar] |
| Test.java:16:57:16:63 | m{,10}? | [RegExpQuantifier] |
| Test.java:16:64:16:64 | n | [RegExpConstant,RegExpNormalChar] |
| Test.java:16:64:16:70 | n{11,}? | [RegExpQuantifier] |
| Test.java:16:71:16:71 | o | [RegExpConstant,RegExpNormalChar] |
| Test.java:16:71:16:73 | o++ | [RegExpPlus] |
| Test.java:16:74:16:74 | p | [RegExpConstant,RegExpNormalChar] |
| Test.java:16:74:16:76 | p*+ | [RegExpStar] |
| Test.java:16:77:16:77 | q | [RegExpConstant,RegExpNormalChar] |
| Test.java:16:77:16:79 | q?+ | [RegExpOpt] |
| Test.java:16:80:16:80 | r | [RegExpConstant,RegExpNormalChar] |
| Test.java:16:80:16:85 | r{12}+ | [RegExpQuantifier] |
| Test.java:16:86:16:86 | s | [RegExpConstant,RegExpNormalChar] |
| Test.java:16:86:16:94 | s{13,14}+ | [RegExpQuantifier] |
| Test.java:16:95:16:95 | t | [RegExpConstant,RegExpNormalChar] |
| Test.java:16:95:16:101 | t{,15}+ | [RegExpQuantifier] |
| Test.java:16:102:16:102 | u | [RegExpConstant,RegExpNormalChar] |
| Test.java:16:102:16:108 | u{16,}+ | [RegExpQuantifier] |
| Test.java:17:10:17:13 | (?i) | [RegExpZeroWidthMatch] |
| Test.java:17:10:17:36 | (?i)(?=a)(?!b)(?<=c)(?<!d)+ | [RegExpSequence] |
| Test.java:17:14:17:18 | (?=a) | [RegExpPositiveLookahead] |
| Test.java:17:17:17:17 | a | [RegExpConstant,RegExpNormalChar] |
| Test.java:17:19:17:23 | (?!b) | [RegExpNegativeLookahead] |
| Test.java:17:22:17:22 | b | [RegExpConstant,RegExpNormalChar] |
| Test.java:17:24:17:29 | (?<=c) | [RegExpPositiveLookbehind] |
| Test.java:17:28:17:28 | c | [RegExpConstant,RegExpNormalChar] |
| Test.java:17:30:17:35 | (?<!d) | [RegExpNegativeLookbehind] |
| Test.java:17:30:17:36 | (?<!d)+ | [RegExpPlus] |
| Test.java:17:34:17:34 | d | [RegExpConstant,RegExpNormalChar] |
| Test.java:18:10:18:10 | a | [RegExpConstant,RegExpNormalChar] |
| Test.java:18:10:18:25 | a\|\|b\|c(d\|e\|)f\|g+ | [RegExpAlt] |
| Test.java:18:12:18:25 | \|b\|c(d\|e\|)f\|g+ | [RegExpAlt] |
| Test.java:18:13:18:13 | b | [RegExpConstant,RegExpNormalChar] |
| Test.java:18:15:18:15 | c | [RegExpConstant,RegExpNormalChar] |
| Test.java:18:15:18:22 | c(d\|e\|)f | [RegExpSequence] |
| Test.java:18:16:18:21 | (d\|e\|) | [RegExpGroup] |
| Test.java:18:17:18:17 | d | [RegExpConstant,RegExpNormalChar] |
| Test.java:18:17:18:20 | d\|e\| | [RegExpAlt] |
| Test.java:18:19:18:19 | e | [RegExpConstant,RegExpNormalChar] |
| Test.java:18:22:18:22 | f | [RegExpConstant,RegExpNormalChar] |
| Test.java:18:24:18:24 | g | [RegExpConstant,RegExpNormalChar] |
| Test.java:18:24:18:25 | g+ | [RegExpPlus] |
| Test.java:19:10:19:13 | \\01 | [RegExpConstant,RegExpEscape] |
| Test.java:19:10:19:38 | \\018\\033\\0377\\0777\u1337+ | [RegExpSequence] |
| Test.java:19:14:19:14 | 8 | [RegExpConstant,RegExpNormalChar] |
| Test.java:19:15:19:19 | \\033 | [RegExpConstant,RegExpEscape] |
| Test.java:19:20:19:25 | \\0377 | [RegExpConstant,RegExpEscape] |
| Test.java:19:26:19:30 | \\077 | [RegExpConstant,RegExpEscape] |
| Test.java:19:31:19:31 | 7 | [RegExpConstant,RegExpNormalChar] |
| Test.java:19:32:19:37 | \u1337 | [RegExpConstant,RegExpNormalChar] |
| Test.java:19:32:19:38 | \u1337+ | [RegExpPlus] |
| Test.java:20:10:20:12 | [\|] | [RegExpCharacterClass] |
| Test.java:20:10:20:13 | [\|]+ | [RegExpPlus] |
| Test.java:20:11:20:11 | \| | [RegExpConstant,RegExpNormalChar] |
| Test.java:21:10:21:37 | (a(a(a(a(a(a((((c))))a)))))) | [RegExpGroup] |
| Test.java:21:10:21:68 | (a(a(a(a(a(a((((c))))a))))))((((((b(((((d)))))b)b)b)b)b)b)+ | [RegExpSequence] |
| Test.java:21:11:21:11 | a | [RegExpConstant,RegExpNormalChar] |
| Test.java:21:11:21:36 | a(a(a(a(a(a((((c))))a))))) | [RegExpSequence] |
| Test.java:21:12:21:36 | (a(a(a(a(a((((c))))a))))) | [RegExpGroup] |
| Test.java:21:13:21:13 | a | [RegExpConstant,RegExpNormalChar] |
| Test.java:21:13:21:35 | a(a(a(a(a((((c))))a)))) | [RegExpSequence] |
| Test.java:21:14:21:35 | (a(a(a(a((((c))))a)))) | [RegExpGroup] |
| Test.java:21:15:21:15 | a | [RegExpConstant,RegExpNormalChar] |
| Test.java:21:15:21:34 | a(a(a(a((((c))))a))) | [RegExpSequence] |
| Test.java:21:16:21:34 | (a(a(a((((c))))a))) | [RegExpGroup] |
| Test.java:21:17:21:17 | a | [RegExpConstant,RegExpNormalChar] |
| Test.java:21:17:21:33 | a(a(a((((c))))a)) | [RegExpSequence] |
| Test.java:21:18:21:33 | (a(a((((c))))a)) | [RegExpGroup] |
| Test.java:21:19:21:19 | a | [RegExpConstant,RegExpNormalChar] |
| Test.java:21:19:21:32 | a(a((((c))))a) | [RegExpSequence] |
| Test.java:21:20:21:32 | (a((((c))))a) | [RegExpGroup] |
| Test.java:21:21:21:21 | a | [RegExpConstant,RegExpNormalChar] |
| Test.java:21:21:21:31 | a((((c))))a | [RegExpSequence] |
| Test.java:21:22:21:30 | ((((c)))) | [RegExpGroup] |
| Test.java:21:23:21:29 | (((c))) | [RegExpGroup] |
| Test.java:21:24:21:28 | ((c)) | [RegExpGroup] |
| Test.java:21:25:21:27 | (c) | [RegExpGroup] |
| Test.java:21:26:21:26 | c | [RegExpConstant,RegExpNormalChar] |
| Test.java:21:31:21:31 | a | [RegExpConstant,RegExpNormalChar] |
| Test.java:21:38:21:67 | ((((((b(((((d)))))b)b)b)b)b)b) | [RegExpGroup] |
| Test.java:21:38:21:68 | ((((((b(((((d)))))b)b)b)b)b)b)+ | [RegExpPlus] |
| Test.java:21:39:21:65 | (((((b(((((d)))))b)b)b)b)b) | [RegExpGroup] |
| Test.java:21:39:21:66 | (((((b(((((d)))))b)b)b)b)b)b | [RegExpSequence] |
| Test.java:21:40:21:63 | ((((b(((((d)))))b)b)b)b) | [RegExpGroup] |
| Test.java:21:40:21:64 | ((((b(((((d)))))b)b)b)b)b | [RegExpSequence] |
| Test.java:21:41:21:61 | (((b(((((d)))))b)b)b) | [RegExpGroup] |
| Test.java:21:41:21:62 | (((b(((((d)))))b)b)b)b | [RegExpSequence] |
| Test.java:21:42:21:59 | ((b(((((d)))))b)b) | [RegExpGroup] |
| Test.java:21:42:21:60 | ((b(((((d)))))b)b)b | [RegExpSequence] |
| Test.java:21:43:21:57 | (b(((((d)))))b) | [RegExpGroup] |
| Test.java:21:43:21:58 | (b(((((d)))))b)b | [RegExpSequence] |
| Test.java:21:44:21:44 | b | [RegExpConstant,RegExpNormalChar] |
| Test.java:21:44:21:56 | b(((((d)))))b | [RegExpSequence] |
| Test.java:21:45:21:55 | (((((d))))) | [RegExpGroup] |
| Test.java:21:46:21:54 | ((((d)))) | [RegExpGroup] |
| Test.java:21:47:21:53 | (((d))) | [RegExpGroup] |
| Test.java:21:48:21:52 | ((d)) | [RegExpGroup] |
| Test.java:21:49:21:51 | (d) | [RegExpGroup] |
| Test.java:21:50:21:50 | d | [RegExpConstant,RegExpNormalChar] |
| Test.java:21:56:21:56 | b | [RegExpConstant,RegExpNormalChar] |
| Test.java:21:58:21:58 | b | [RegExpConstant,RegExpNormalChar] |
| Test.java:21:60:21:60 | b | [RegExpConstant,RegExpNormalChar] |
| Test.java:21:62:21:62 | b | [RegExpConstant,RegExpNormalChar] |
| Test.java:21:64:21:64 | b | [RegExpConstant,RegExpNormalChar] |
| Test.java:21:66:21:66 | b | [RegExpConstant,RegExpNormalChar] |

View File

@@ -0,0 +1,10 @@
import java
import semmle.code.java.regex.RegexTreeView
import semmle.code.java.regex.regex
/**
 * Gets a bracketed, comma-separated list of the primary CodeQL classes of the
 * regex term `t`, e.g. `[RegExpConstant,RegExpNormalChar]`.
 */
string getQLClasses(RegExpTerm t) { result = "[" + strictconcat(t.getPrimaryQLClass(), ",") + "]" }

/** Holds if the regex `r` failed to parse at index `i`; surfaces parse errors in the test output. */
query predicate parseFailures(Regex r, int i) { r.failedToParse(i) }

from RegExpTerm t
select t, getQLClasses(t)

View File

@@ -0,0 +1,29 @@
import java.util.regex.Pattern;
class Test {
static String[] regs = {
"[A-Z\\d]++",
"\\Q hello world [ *** \\Q ) ( \\E+",
"[\\Q hi ] \\E]+",
"[]]+",
"[^]]+",
"[abc[defg]]+",
"[abc&&[\\W\\p{Lower}\\P{Space}\\N{degree sign}]]\\b7\\b{g}8+",
"\\cA+",
"\\c(+",
"\\c\\(ab)+",
"(?>hi)(?<name>hell*?o*+)123\\k<name>",
"a+b*c?d{2}e{3,4}f{,5}g{6,}h+?i*?j??k{7}?l{8,9}?m{,10}?n{11,}?o++p*+q?+r{12}+s{13,14}+t{,15}+u{16,}+",
"(?i)(?=a)(?!b)(?<=c)(?<!d)+",
"a||b|c(d|e|)f|g+",
"\\018\\033\\0377\\0777\u1337+",
"[|]+",
"(a(a(a(a(a(a((((c))))a))))))((((((b(((((d)))))b)b)b)b)b)b)+"
};
void test() { // Compile every fixture string so the extractor parses it as a regex.
for (int i = 0; i < regs.length; i++) { // NOTE(review): add no lines here — the PrintAst .expected file pins Test.java line/column positions; verify before reformatting
Pattern.compile(regs[i]);
}
}
}

View File

@@ -0,0 +1,439 @@
import java.util.regex.Pattern;
class ExpRedosTest {
static String[] regs = {
// NOT GOOD; attack: "_" + "__".repeat(100)
// Adapted from marked (https://github.com/markedjs/marked), which is licensed
// under the MIT license; see file marked-LICENSE.
"^\\b_((?:__|[\\s\\S])+?)_\\b|^\\*((?:\\*\\*|[\\s\\S])+?)\\*(?!\\*)", // $ hasExpRedos
// GOOD
// Adapted from marked (https://github.com/markedjs/marked), which is licensed
// under the MIT license; see file marked-LICENSE.
"^\\b_((?:__|[^_])+?)_\\b|^\\*((?:\\*\\*|[^*])+?)\\*(?!\\*)",
// GOOD - there is no witness in the end that could cause the regexp to not match
// Adapted from brace-expansion (https://github.com/juliangruber/brace-expansion),
// which is licensed under the MIT license; see file brace-expansion-LICENSE.
"(.*,)+.+",
// NOT GOOD; attack: " '" + "\\\\".repeat(100)
// Adapted from CodeMirror (https://github.com/codemirror/codemirror),
// which is licensed under the MIT license; see file CodeMirror-LICENSE.
"^(?:\\s+(?:\"(?:[^\"\\\\]|\\\\\\\\|\\\\.)+\"|'(?:[^'\\\\]|\\\\\\\\|\\\\.)+'|\\((?:[^)\\\\]|\\\\\\\\|\\\\.)+\\)))?", // $ hasExpRedos
// GOOD
// Adapted from lulucms2 (https://github.com/yiifans/lulucms2).
"\\(\\*(?:[\\s\\S]*?\\(\\*[\\s\\S]*?\\*\\))*[\\s\\S]*?\\*\\)",
// GOOD
// Adapted from jest (https://github.com/facebook/jest), which is licensed
// under the MIT license; see file jest-LICENSE.
"^ *(\\S.*\\|.*)\\n *([-:]+ *\\|[-| :]*)\\n((?:.*\\|.*(?:\\n|$))*)\\n*",
// NOT GOOD, variant of good3; attack: "a|\n:|\n" + "||\n".repeat(100)
"^ *(\\S.*\\|.*)\\n *([-:]+ *\\|[-| :]*)\\n((?:.*\\|.*(?:\\n|$))*)a", // $ hasExpRedos
// NOT GOOD; attack: "/" + "\\/a".repeat(100)
// Adapted from ANodeBlog (https://github.com/gefangshuai/ANodeBlog),
// which is licensed under the Apache License 2.0; see file ANodeBlog-LICENSE.
"\\/(?![ *])(\\\\\\/|.)*?\\/[gim]*(?=\\W|$)", // $ hasExpRedos
// NOT GOOD; attack: "##".repeat(100) + "\na"
// Adapted from CodeMirror (https://github.com/codemirror/codemirror),
// which is licensed under the MIT license; see file CodeMirror-LICENSE.
"^([\\s\\[\\{\\(]|#.*)*$", // $ hasExpRedos
// GOOD
"(\\r\\n|\\r|\\n)+",
// BAD - PoC: `node -e "/((?:[^\"\']|\".*?\"|\'.*?\')*?)([(,)]|$)/.test(\"'''''''''''''''''''''''''''''''''''''''''''''\\\"\");"`. It's complicated though, because the regexp still matches something, it just matches the empty-string after the attack string.
// NOT GOOD; attack: "a" + "[]".repeat(100) + ".b\n"
// Adapted from Knockout (https://github.com/knockout/knockout), which is
// licensed under the MIT license; see file knockout-LICENSE
"^[\\_$a-z][\\_$a-z0-9]*(\\[.*?\\])*(\\.[\\_$a-z][\\_$a-z0-9]*(\\[.*?\\])*)*$", // $ hasExpRedos
// GOOD
"(a|.)*",
// Testing the NFA - only some of the below are detected.
"^([a-z]+)+$", // $ hasExpRedos
"^([a-z]*)*$", // $ hasExpRedos
"^([a-zA-Z0-9])(([\\\\-.]|[_]+)?([a-zA-Z0-9]+))*(@){1}[a-z0-9]+[.]{1}(([a-z]{2,3})|([a-z]{2,3}[.]{1}[a-z]{2,3}))$", // $ hasExpRedos
"^(([a-z])+.)+[A-Z]([a-z])+$", // $ hasExpRedos
// NOT GOOD; attack: "[" + "][".repeat(100) + "]!"
// Adapted from Prototype.js (https://github.com/prototypejs/prototype), which
// is licensed under the MIT license; see file Prototype.js-LICENSE.
"(([\\w#:.~>+()\\s-]+|\\*|\\[.*?\\])+)\\s*(,|$)", // $ hasExpRedos
// NOT GOOD; attack: "'" + "\\a".repeat(100) + '"'
// Adapted from Prism (https://github.com/PrismJS/prism), which is licensed
// under the MIT license; see file Prism-LICENSE.
"(\"|')(\\\\?.)*?\\1", // $ hasExpRedos
// NOT GOOD
"(b|a?b)*c", // $ hasExpRedos
// NOT GOOD
"(a|aa?)*b", // $ hasExpRedos
// GOOD
"(.|\\n)*!",
// NOT GOOD; attack: "\n".repeat(100) + "."
"(?s)(.|\\n)*!", // $ hasExpRedos
// GOOD
"([\\w.]+)*",
// NOT GOOD
"(a|aa?)*b", // $ hasExpRedos
// NOT GOOD
"(([\\s\\S]|[^a])*)\"", // $ hasExpRedos
// GOOD - there is no witness in the end that could cause the regexp to not match
"([^\"']+)*",
// NOT GOOD
"((.|[^a])*)\"", // $ hasExpRedos
// GOOD
"((a|[^a])*)\"",
// NOT GOOD
"((b|[^a])*)\"", // $ hasExpRedos
// NOT GOOD
"((G|[^a])*)\"", // $ hasExpRedos
// NOT GOOD
"(([0-9]|[^a])*)\"", // $ hasExpRedos
// NOT GOOD
"(?:=(?:([!#\\$%&'\\*\\+\\-\\.\\^_`\\|~0-9A-Za-z]+)|\"((?:\\\\[\\x00-\\x7f]|[^\\x00-\\x08\\x0a-\\x1f\\x7f\"])*)\"))?", // $ MISSING: hasExpRedos
// NOT GOOD
"\"((?:\\\\[\\x00-\\x7f]|[^\\x00-\\x08\\x0a-\\x1f\\x7f\"])*)\"", // $ MISSING: hasExpRedos
// GOOD
"\"((?:\\\\[\\x00-\\x7f]|[^\\x00-\\x08\\x0a-\\x1f\\x7f\"\\\\])*)\"",
// NOT GOOD
"(([a-z]|[d-h])*)\"", // $ hasExpRedos
// NOT GOOD
"(([^a-z]|[^0-9])*)\"", // $ hasExpRedos
// NOT GOOD
"((\\d|[0-9])*)\"", // $ hasExpRedos
// NOT GOOD
"((\\s|\\s)*)\"", // $ hasExpRedos
// NOT GOOD
"((\\w|G)*)\"", // $ hasExpRedos
// GOOD
"((\\s|\\d)*)\"",
// NOT GOOD
"((\\d|\\w)*)\"", // $ hasExpRedos
// NOT GOOD
"((\\d|5)*)\"", // $ hasExpRedos
// NOT GOOD
"((\\s|[\\f])*)\"", // $ hasExpRedos
// NOT GOOD - but not detected (likely because \v is a character class in Java rather than a specific character in other langs)
"((\\s|[\\v]|\\\\v)*)\"", // $ MISSING: hasExpRedos
// NOT GOOD
"((\\f|[\\f])*)\"", // $ hasExpRedos
// NOT GOOD
"((\\W|\\D)*)\"", // $ hasExpRedos
// NOT GOOD
"((\\S|\\w)*)\"", // $ hasExpRedos
// NOT GOOD
"((\\S|[\\w])*)\"", // $ hasExpRedos
// NOT GOOD
"((1s|[\\da-z])*)\"", // $ hasExpRedos
// NOT GOOD
"((0|[\\d])*)\"", // $ hasExpRedos
// NOT GOOD
"(([\\d]+)*)\"", // $ hasExpRedos
// GOOD - there is no witness in the end that could cause the regexp to not match
"(\\d+(X\\d+)?)+",
// GOOD - there is no witness in the end that could cause the regexp to not match
"([0-9]+(X[0-9]*)?)*",
// GOOD
"^([^>]+)*(>|$)",
// NOT GOOD
"^([^>a]+)*(>|$)", // $ hasExpRedos
// NOT GOOD
"(\\n\\s*)+$", // $ hasExpRedos
// NOT GOOD
"^(?:\\s+|#.*|\\(\\?#[^)]*\\))*(?:[?*+]|\\{\\d+(?:,\\d*)?})", // $ hasExpRedos
// NOT GOOD
"\\{\\[\\s*([a-zA-Z]+)\\(([a-zA-Z]+)\\)((\\s*([a-zA-Z]+)\\: ?([ a-zA-Z{}]+),?)+)*\\s*\\]\\}", // $ hasExpRedos
// NOT GOOD
"(a+|b+|c+)*c", // $ hasExpRedos
// NOT GOOD
"(((a+a?)*)+b+)", // $ hasExpRedos
// NOT GOOD
"(a+)+bbbb", // $ hasExpRedos
// GOOD
"(a+)+aaaaa*a+",
// NOT GOOD
"(a+)+aaaaa$", // $ hasExpRedos
// GOOD
"(\\n+)+\\n\\n",
// NOT GOOD
"(\\n+)+\\n\\n$", // $ hasExpRedos
// NOT GOOD
"([^X]+)*$", // $ hasExpRedos
// NOT GOOD
"(([^X]b)+)*$", // $ hasExpRedos
// GOOD
"(([^X]b)+)*($|[^X]b)",
// NOT GOOD
"(([^X]b)+)*($|[^X]c)", // $ hasExpRedos
// GOOD
"((ab)+)*ababab",
// GOOD
"((ab)+)*abab(ab)*(ab)+",
// GOOD
"((ab)+)*",
// NOT GOOD
"((ab)+)*$", // $ hasExpRedos
// GOOD
"((ab)+)*[a1][b1][a2][b2][a3][b3]",
// NOT GOOD
"([\\n\\s]+)*(.)", // $ hasExpRedos
// GOOD - any witness passes through the accept state.
"(A*A*X)*",
// GOOD
"([^\\\\\\]]+)*",
// NOT GOOD
"(\\w*foobarbaz\\w*foobarbaz\\w*foobarbaz\\w*foobarbaz\\s*foobarbaz\\d*foobarbaz\\w*)+-", // $ hasExpRedos
// NOT GOOD
"(.thisisagoddamnlongstringforstresstestingthequery|\\sthisisagoddamnlongstringforstresstestingthequery)*-", // $ hasExpRedos
// NOT GOOD
"(thisisagoddamnlongstringforstresstestingthequery|this\\w+query)*-", // $ hasExpRedos
// GOOD
"(thisisagoddamnlongstringforstresstestingthequery|imanotherbutunrelatedstringcomparedtotheotherstring)*-",
// GOOD (but false positive caused by the extractor converting all four unpaired surrogates to \uFFFD)
"foo([\uDC66\uDC67]|[\uDC68\uDC69])*foo", // $ SPURIOUS: hasExpRedos
// GOOD (but false positive caused by the extractor converting all four unpaired surrogates to \uFFFD)
"foo((\uDC66|\uDC67)|(\uDC68|\uDC69))*foo", // $ SPURIOUS: hasExpRedos
// NOT GOOD (but cannot currently construct a prefix)
"a{2,3}(b+)+X", // $ hasExpRedos
// NOT GOOD (and a good prefix test)
"^<(\\w+)((?:\\s+\\w+(?:\\s*=\\s*(?:(?:\"[^\"]*\")|(?:'[^']*')|[^>\\s]+))?)*)\\s*(\\/?)>", // $ hasExpRedos
// GOOD
"(a+)*[\\s\\S][\\s\\S][\\s\\S]?",
// GOOD - but we fail to see that repeating the attack string ends in the "accept any" state (due to not parsing the range `[\s\S]{2,3}`).
"(a+)*[\\s\\S]{2,3}", // $ SPURIOUS: hasExpRedos
// GOOD - but we spuriously conclude that a rejecting suffix exists (due to not parsing the range `[\s\S]{2,}` when constructing the NFA).
"(a+)*([\\s\\S]{2,}|X)$", // $ SPURIOUS: hasExpRedos
// GOOD
"(a+)*([\\s\\S]*|X)$",
// NOT GOOD
"((a+)*$|[\\s\\S]+)", // $ hasExpRedos
// GOOD - but still flagged. The only change compared to the above is the order of alternatives, which we don't model.
"([\\s\\S]+|(a+)*$)", // $ SPURIOUS: hasExpRedos
// GOOD
"((;|^)a+)+$",
// NOT GOOD (a good prefix test)
"(^|;)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(e+)+f", // $ hasExpRedos
// NOT GOOD
"^ab(c+)+$", // $ hasExpRedos
// NOT GOOD
"(\\d(\\s+)*){20}", // $ hasExpRedos
// GOOD - but we spuriously conclude that a rejecting suffix exists.
"(([^/]|X)+)(\\/[\\s\\S]*)*$", // $ SPURIOUS: hasExpRedos
// GOOD - but we spuriously conclude that a rejecting suffix exists.
"^((x([^Y]+)?)*(Y|$))", // $ SPURIOUS: hasExpRedos
// NOT GOOD
"(a*)+b", // $ hasExpRedos
// NOT GOOD
"foo([\\w-]*)+bar", // $ hasExpRedos
// NOT GOOD
"((ab)*)+c", // $ hasExpRedos
// NOT GOOD
"(a?a?)*b", // $ hasExpRedos
// GOOD
"(a?)*b",
// NOT GOOD - but not detected
"(c?a?)*b", // $ MISSING: hasExpRedos
// NOT GOOD
"(?:a|a?)+b", // $ hasExpRedos
// NOT GOOD - but not detected.
"(a?b?)*$", // $ MISSING: hasExpRedos
// NOT GOOD
"PRE(([a-c]|[c-d])T(e?e?e?e?|X))+(cTcT|cTXcTX$)", // $ hasExpRedos
// NOT GOOD
"^((a)+\\w)+$", // $ hasExpRedos
// NOT GOOD
"^(b+.)+$", // $ hasExpRedos
// GOOD
"a*b",
// All 4 bad combinations of nested * and +
"(a*)*b", // $ hasExpRedos
"(a+)*b", // $ hasExpRedos
"(a*)+b", // $ hasExpRedos
"(a+)+b", // $ hasExpRedos
// GOOD
"(a|b)+",
"(?:[\\s;,\"'<>(){}|\\[\\]@=+*]|:(?![/\\\\]))+",
"^((?:a{|-)|\\w\\{)+X$", // $ hasParseFailure
"^((?:a{0|-)|\\w\\{\\d)+X$", // $ hasParseFailure
"^((?:a{0,|-)|\\w\\{\\d,)+X$", // $ hasParseFailure
"^((?:a{0,2|-)|\\w\\{\\d,\\d)+X$", // $ hasParseFailure
// GOOD
"^((?:a{0,2}|-)|\\w\\{\\d,\\d\\})+X$",
// NOT GOOD
"X(\\u0061|a)*Y", // $ hasExpRedos
// GOOD
"X(\\u0061|b)+Y",
// NOT GOOD
"X(\\x61|a)*Y", // $ hasExpRedos
// GOOD
"X(\\x61|b)+Y",
// NOT GOOD
"X(\\x{061}|a)*Y", // $ hasExpRedos
// GOOD
"X(\\x{061}|b)+Y",
// NOT GOOD
"X(\\p{Digit}|7)*Y", // $ hasExpRedos
// GOOD
"X(\\p{Digit}|b)+Y",
// NOT GOOD
"X(\\P{Digit}|b)*Y", // $ hasExpRedos
// GOOD
"X(\\P{Digit}|7)+Y",
// NOT GOOD
"X(\\p{IsDigit}|7)*Y", // $ hasExpRedos
// GOOD
"X(\\p{IsDigit}|b)+Y",
// NOT GOOD - but not detected
"X(\\p{Alpha}|a)*Y", // $ MISSING: hasExpRedos
// GOOD
"X(\\p{Alpha}|7)+Y",
// GOOD
"(\"[^\"]*?\"|[^\"\\s]+)+(?=\\s*|\\s*$)",
// BAD
"/(\"[^\"]*?\"|[^\"\\s]+)+(?=\\s*|\\s*$)X", // $ hasExpRedos
"/(\"[^\"]*?\"|[^\"\\s]+)+(?=X)", // $ hasExpRedos
// BAD
"\\A(\\d|0)*x", // $ hasExpRedos
"(\\d|0)*\\Z", // $ hasExpRedos
"\\b(\\d|0)*x", // $ hasExpRedos
// GOOD - possessive quantifiers don't backtrack
"(a*+)*+b",
"(a*)*+b",
"(a*+)*b",
// BAD
"(a*)*b", // $ hasExpRedos
// BAD - but not detected due to the way possessive quantifiers are approximated
"((aa|a*+)b)*c" // $ MISSING: hasExpRedos
};
// Compiling each pattern above marks it as a regular expression, so the
// exponential-ReDoS analysis runs on it; results are checked against the
// inline `$ hasExpRedos` / `$ hasParseFailure` comments on the array entries.
void test() {
for (int i = 0; i < regs.length; i++) {
Pattern.compile(regs[i]);
}
}
}

View File

@@ -0,0 +1,84 @@
import java.util.regex.Pattern;
import java.util.function.Predicate;
import javax.servlet.http.HttpServletRequest;
import com.google.common.base.Splitter;
/**
 * Inline-expectation fixture for the polynomial ReDoS query.
 * Calls tagged {@code $ hasPolyRedos} are expected to be flagged (remote input
 * reaching a regex with superlinear backtracking); untagged calls must not be.
 */
class PolyRedosTest {
// Exercises the String, java.util.regex and Guava Splitter APIs that evaluate a regex.
void test(HttpServletRequest request) {
String tainted = request.getParameter("inp"); // remote flow source
String reg = "0\\.\\d+E?\\d+!"; // NOTE(review): presumably superlinear backtracking — confirmed only by the expectations below
Predicate<String> dummyPred = (s -> s.length() % 7 == 0);
tainted.matches(reg); // $ hasPolyRedos
tainted.split(reg); // $ hasPolyRedos
tainted.split(reg, 7); // $ hasPolyRedos
tainted.replaceAll(reg, "a"); // $ hasPolyRedos
tainted.replaceFirst(reg, "a"); // $ hasPolyRedos
Pattern.matches(reg, tainted); // $ hasPolyRedos
Pattern.compile(reg).matcher(tainted).matches(); // $ hasPolyRedos
Pattern.compile(reg).split(tainted); // $ hasPolyRedos
Pattern.compile(reg, Pattern.DOTALL).split(tainted); // $ hasPolyRedos
Pattern.compile(reg).split(tainted, 7); // $ hasPolyRedos
Pattern.compile(reg).splitAsStream(tainted); // $ hasPolyRedos
Pattern.compile(reg).asPredicate().test(tainted); // $ hasPolyRedos
Pattern.compile(reg).asMatchPredicate().negate().and(dummyPred).or(dummyPred).test(tainted); // $ hasPolyRedos
Predicate.not(dummyPred.and(dummyPred.or(Pattern.compile(reg).asPredicate()))).test(tainted); // $ hasPolyRedos
Splitter.on(Pattern.compile(reg)).split(tainted); // $ hasPolyRedos
Splitter.on(reg).split(tainted);
Splitter.onPattern(reg).split(tainted); // $ hasPolyRedos
Splitter.onPattern(reg).splitToList(tainted); // $ hasPolyRedos
Splitter.onPattern(reg).limit(7).omitEmptyStrings().trimResults().split(tainted); // $ hasPolyRedos
Splitter.onPattern(reg).withKeyValueSeparator(" => ").split(tainted); // $ hasPolyRedos
Splitter.on(";").withKeyValueSeparator(reg).split(tainted);
Splitter.on(";").withKeyValueSeparator(Splitter.onPattern(reg)).split(tainted); // $ hasPolyRedos
}
// Two patterns, two call sites: only the tagged combination is expected to be flagged.
void test2(HttpServletRequest request) {
String tainted = request.getParameter("inp");
Pattern p1 = Pattern.compile(".*a");
Pattern p2 = Pattern.compile(".*b");
p1.matcher(tainted).matches();
p2.matcher(tainted).find(); // $ hasPolyRedos
}
void test3(HttpServletRequest request) {
String tainted = request.getParameter("inp");
Pattern p1 = Pattern.compile("ab*b*");
Pattern p2 = Pattern.compile("cd*d*");
p1.matcher(tainted).matches(); // $ hasPolyRedos
p2.matcher(tainted).find();
}
void test4(HttpServletRequest request) {
String tainted = request.getParameter("inp");
tainted.matches(".*a");
tainted.replaceAll(".*b", "c"); // $ hasPolyRedos
}
// Patterns stored in static fields and used via predicate views.
static Pattern p3 = Pattern.compile(".*a");
static Pattern p4 = Pattern.compile(".*b");
void test5(HttpServletRequest request) {
String tainted = request.getParameter("inp");
p3.asMatchPredicate().test(tainted);
p4.asPredicate().test(tainted); // $ hasPolyRedos
}
// Of the request accessors below, only getParameter is expected to produce a flagged flow here.
void test6(HttpServletRequest request) {
Pattern p = Pattern.compile("^a*a*$");
p.matcher(request.getParameter("inp")).matches(); // $ hasPolyRedos
p.matcher(request.getHeader("If-None-Match")).matches();
p.matcher(request.getRequestURI()).matches();
p.matcher(request.getCookies()[0].getName()).matches();
}
}

View File

@@ -0,0 +1,19 @@
import java
import TestUtilities.InlineExpectationsTest
import semmle.code.java.security.performance.PolynomialReDoSQuery
/**
 * Inline-expectations test for the polynomial ReDoS query: each sink reported
 * by `hasPolynomialReDoSResult` must carry a `$ hasPolyRedos` comment in the
 * test sources, and vice versa.
 */
class HasPolyRedos extends InlineExpectationsTest {
HasPolyRedos() { this = "HasPolyRedos" }
override string getARelevantTag() { result = "hasPolyRedos" }
override predicate hasActualResult(Location location, string element, string tag, string value) {
tag = "hasPolyRedos" and
exists(DataFlow::PathNode source, DataFlow::PathNode sink, PolynomialBackTrackingTerm regexp |
hasPolynomialReDoSResult(source, sink, regexp) and
// Results are located at the sink, where the expectation comment sits.
location = sink.getNode().getLocation() and
element = sink.getNode().toString() and
value = ""
)
}
}

View File

@@ -0,0 +1,29 @@
import java
import TestUtilities.InlineExpectationsTest
import semmle.code.java.security.performance.ExponentialBackTracking
import semmle.code.java.regex.regex
/**
 * Inline-expectations test for the exponential ReDoS query.
 * `hasExpRedos` marks regex terms flagged by `hasReDoSResult`;
 * `hasParseFailure` marks regexes that failed to parse.
 */
class HasExpRedos extends InlineExpectationsTest {
HasExpRedos() { this = "HasExpRedos" }
override string getARelevantTag() { result = ["hasExpRedos", "hasParseFailure"] }
override predicate hasActualResult(Location location, string element, string tag, string value) {
tag = "hasExpRedos" and
exists(RegExpTerm t, string pump, State s, string prefixMsg |
hasReDoSResult(t, pump, s, prefixMsg) and
// Terms inside VERBOSE-mode regexes are excluded from the expectations.
not t.getRegex().getAMode() = "VERBOSE" and
value = "" and
location = t.getLocation() and
element = t.toString()
)
or
tag = "hasParseFailure" and
exists(Regex r |
r.failedToParse(_) and
value = "" and
location = r.getLocation() and
element = r.toString()
)
}
}

View File

@@ -0,0 +1 @@
// semmle-extractor-options: --javac-args -cp ${testdir}/../../../stubs/servlet-api-2.4:${testdir}/../../../stubs/guava-30.0

View File

@@ -0,0 +1,53 @@
// Generated automatically from com.google.common.base.CharMatcher for testing purposes
package com.google.common.base;
import com.google.common.base.Predicate;
abstract public class CharMatcher implements Predicate<Character>
{
protected CharMatcher(){}
public CharMatcher and(CharMatcher p0){ return null; }
public CharMatcher negate(){ return null; }
public CharMatcher or(CharMatcher p0){ return null; }
public CharMatcher precomputed(){ return null; }
public String collapseFrom(CharSequence p0, char p1){ return null; }
public String removeFrom(CharSequence p0){ return null; }
public String replaceFrom(CharSequence p0, CharSequence p1){ return null; }
public String replaceFrom(CharSequence p0, char p1){ return null; }
public String retainFrom(CharSequence p0){ return null; }
public String toString(){ return null; }
public String trimAndCollapseFrom(CharSequence p0, char p1){ return null; }
public String trimFrom(CharSequence p0){ return null; }
public String trimLeadingFrom(CharSequence p0){ return null; }
public String trimTrailingFrom(CharSequence p0){ return null; }
public abstract boolean matches(char p0);
public boolean apply(Character p0){ return false; }
public boolean matchesAllOf(CharSequence p0){ return false; }
public boolean matchesAnyOf(CharSequence p0){ return false; }
public boolean matchesNoneOf(CharSequence p0){ return false; }
public int countIn(CharSequence p0){ return 0; }
public int indexIn(CharSequence p0){ return 0; }
public int indexIn(CharSequence p0, int p1){ return 0; }
public int lastIndexIn(CharSequence p0){ return 0; }
public static CharMatcher any(){ return null; }
public static CharMatcher anyOf(CharSequence p0){ return null; }
public static CharMatcher ascii(){ return null; }
public static CharMatcher breakingWhitespace(){ return null; }
public static CharMatcher digit(){ return null; }
public static CharMatcher forPredicate(Predicate<? super Character> p0){ return null; }
public static CharMatcher inRange(char p0, char p1){ return null; }
public static CharMatcher invisible(){ return null; }
public static CharMatcher is(char p0){ return null; }
public static CharMatcher isNot(char p0){ return null; }
public static CharMatcher javaDigit(){ return null; }
public static CharMatcher javaIsoControl(){ return null; }
public static CharMatcher javaLetter(){ return null; }
public static CharMatcher javaLetterOrDigit(){ return null; }
public static CharMatcher javaLowerCase(){ return null; }
public static CharMatcher javaUpperCase(){ return null; }
public static CharMatcher none(){ return null; }
public static CharMatcher noneOf(CharSequence p0){ return null; }
public static CharMatcher singleWidth(){ return null; }
public static CharMatcher whitespace(){ return null; }
}

View File

@@ -1,48 +1,35 @@
/*
* Copyright (C) 2009 The Guava Authors
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
// Generated automatically from com.google.common.base.Splitter for testing purposes
package com.google.common.base;
import java.util.Iterator;
import com.google.common.base.CharMatcher;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.stream.Stream;
public final class Splitter {
public static Splitter on(final String separator) {
return null;
}
public Splitter omitEmptyStrings() {
return null;
}
public Iterable<String> split(final CharSequence sequence) {
return null;
}
public List<String> splitToList(CharSequence sequence) {
return null;
}
public MapSplitter withKeyValueSeparator(String separator) {
return null;
}
public static final class MapSplitter {
public Map<String, String> split(CharSequence sequence) {
return null;
public class Splitter
{
protected Splitter() {}
public Iterable<String> split(CharSequence p0){ return null; }
public List<String> splitToList(CharSequence p0){ return null; }
public Splitter limit(int p0){ return null; }
public Splitter omitEmptyStrings(){ return null; }
public Splitter trimResults(){ return null; }
public Splitter trimResults(CharMatcher p0){ return null; }
public Splitter.MapSplitter withKeyValueSeparator(Splitter p0){ return null; }
public Splitter.MapSplitter withKeyValueSeparator(String p0){ return null; }
public Splitter.MapSplitter withKeyValueSeparator(char p0){ return null; }
public Stream<String> splitToStream(CharSequence p0){ return null; }
public static Splitter fixedLength(int p0){ return null; }
public static Splitter on(CharMatcher p0){ return null; }
public static Splitter on(Pattern p0){ return null; }
public static Splitter on(String p0){ return null; }
public static Splitter on(char p0){ return null; }
public static Splitter onPattern(String p0){ return null; }
static public class MapSplitter
{
protected MapSplitter() {}
public Map<String, String> split(CharSequence p0){ return null; }
}
}
}