extract regexp literals from string concatenations

This commit is contained in:
Erik Krogh Kristensen
2021-09-24 18:41:03 +02:00
parent 9478faf040
commit 12305aae42
16 changed files with 3460 additions and 149 deletions

View File

@@ -1559,6 +1559,14 @@ class URShiftExpr extends @urshift_expr, BinaryExpr {
*/
class AddExpr extends @add_expr, BinaryExpr {
override string getOperator() { result = "+" }
/**
* Gets the value of this string concatenation parsed as a regular expression, if possible.
*
* All string literals have an associated regular expression tree, provided they can
* be parsed without syntax errors.
*/
RegExpTerm asRegExp() { this = result.getParent() }
}
/**

View File

@@ -155,7 +155,7 @@ class RegExpTerm extends Locatable, @regexpterm {
exists(RegExpParent parent | parent = getRootTerm().getParent() |
parent instanceof RegExpLiteral
or
parent.(StringLiteral).flow() instanceof RegExpPatternSource
parent.(Expr).flow() instanceof RegExpPatternSource
)
}
@@ -1104,6 +1104,30 @@ private class StringRegExpPatternSource extends RegExpPatternSource {
override RegExpTerm getRegExpTerm() { result = asExpr().(StringLiteral).asRegExp() }
}
/**
* A node whose string value may flow to a position where it is interpreted
* as a part of a regular expression.
*/
private class StringConcatRegExpPatternSource extends RegExpPatternSource {
DataFlow::Node parse;
StringConcatRegExpPatternSource() { this = regExpSource(parse) }
override DataFlow::Node getAParse() { result = parse }
override DataFlow::SourceNode getARegExpObject() {
exists(DataFlow::InvokeNode constructor |
constructor = DataFlow::globalVarRef("RegExp").getAnInvocation() and
parse = constructor.getArgument(0) and
result = constructor
)
}
override string getPattern() { result = getStringValue() }
override RegExpTerm getRegExpTerm() { result = asExpr().(AddExpr).asRegExp() }
}
module RegExp {
/** Gets the string `"?"` used to represent a regular expression whose flags are unknown. */
string unknownFlag() { result = "?" }

View File

@@ -855,7 +855,7 @@ regexpterm (unique int id: @regexpterm,
int idx: int ref,
varchar(900) tostring: string ref);
@regexpparent = @regexpterm | @regexp_literal | @string_literal;
@regexpparent = @regexpterm | @regexp_literal | @string_literal | @add_expr;
case @regexpterm.kind of
0 = @regexp_alt

View File

@@ -512,3 +512,8 @@
| tst.js:384:15:384:26 | ([AB]\|[ab])* | Strings with many repetitions of 'A' can start matching anywhere after the start of the preceeding ([AB]\|[ab])*C |
| tst.js:385:14:385:25 | ([DE]\|[de])* | Strings with many repetitions of 'd' can start matching anywhere after the start of the preceeding ([DE]\|[de])*F |
| tst.js:388:14:388:20 | (a\|aa)* | Strings with many repetitions of 'a' can start matching anywhere after the start of the preceeding (a\|aa)*$ |
| tst.js:391:6:394:6 | (a\|aa)* | Strings with many repetitions of 'a' can start matching anywhere after the start of the preceeding (a\|aa)*b$ |
| tst.js:398:7:399:5 | (c\|cc)* | Strings with many repetitions of 'c' can start matching anywhere after the start of the preceeding ((c\|cc)*\|(d\|dd)*\|(e\|ee)*)f$ |
| tst.js:399:7:400:5 | (d\|dd)* | Strings with many repetitions of 'd' can start matching anywhere after the start of the preceeding ((c\|cc)*\|(d\|dd)*\|(e\|ee)*)f$ |
| tst.js:400:7:401:2 | (e\|ee)* | Strings with many repetitions of 'e' can start matching anywhere after the start of the preceeding ((c\|cc)*\|(d\|dd)*\|(e\|ee)*)f$ |
| tst.js:404:6:405:8 | (g\|gg)* | Strings with many repetitions of 'g' can start matching anywhere after the start of the preceeding (g\|gg)*h$ |

View File

@@ -183,3 +183,8 @@
| tst.js:385:14:385:25 | ([DE]\|[de])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'd'. |
| tst.js:387:27:387:33 | (a\|aa)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'aa'. |
| tst.js:388:14:388:20 | (a\|aa)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'aa'. |
| tst.js:391:6:394:6 | (a\|aa)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'aa'. |
| tst.js:398:7:399:5 | (c\|cc)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'cc'. |
| tst.js:399:7:400:5 | (d\|dd)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'dd'. |
| tst.js:400:7:401:2 | (e\|ee)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'ee'. |
| tst.js:404:6:405:8 | (g\|gg)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'gg'. |

View File

@@ -385,4 +385,21 @@ var good47 = /([AB]|[ab])*C/;
var bad92 = /([DE]|[de])*F/i;
var bad93 = /(?<=^v?|\sv?)(a|aa)*$/;
var bad94 = /(a|aa)*$/;
var bad94 = /(a|aa)*$/;
var bad95 = new RegExp(
"(a" +
"|" +
"aa)*" +
"b$"
);
var bad96 = new RegExp("(" +
"(c|cc)*|" +
"(d|dd)*|" +
"(e|ee)*" +
")f$");
var bad97 = new RegExp(
"(g|gg" +
")*h$");

View File

@@ -15,11 +15,15 @@
| tst-IncompleteHostnameRegExp.js:38:3:38:43 | ^(http\|https):\\/\\/www.example.com\\/p\\/f\\/ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:38:2:38:44 | /^(http ... p\\/f\\// | here |
| tst-IncompleteHostnameRegExp.js:39:5:39:30 | http:\\/\\/sub.example.com\\/ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:39:2:39:33 | /^(http ... om\\/)/g | here |
| tst-IncompleteHostnameRegExp.js:40:3:40:29 | ^https?:\\/\\/api.example.com | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:40:2:40:30 | /^https ... le.com/ | here |
| tst-IncompleteHostnameRegExp.js:41:42:41:48 | ^https?://.+\\.example\\.com/ | This regular expression has an unrestricted wildcard '.+' which may cause 'example\\.com/' to be matched anywhere in the URL, outside the hostname. | tst-IncompleteHostnameRegExp.js:41:13:41:71 | '^http: ... \\.com/' | here |
| tst-IncompleteHostnameRegExp.js:41:42:41:70 | ^https?://.+\\.example\\.com/ | This string, which is used as a regular expression $@, has an unrestricted wildcard '.+' which may cause 'example\\.com/' to be matched anywhere in the URL, outside the hostname. | tst-IncompleteHostnameRegExp.js:41:13:41:71 | '^http: ... \\.com/' | here |
| tst-IncompleteHostnameRegExp.js:43:3:43:32 | ^https:\\/\\/[a-z]*.example.com$ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:43:2:43:33 | /^https ... e.com$/ | here |
| tst-IncompleteHostnameRegExp.js:44:32:44:45 | .+.example.net | This regular expression has an unescaped '.' before 'example.net', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:44:9:44:101 | '^proto ... ernal)' | here |
| tst-IncompleteHostnameRegExp.js:44:47:44:62 | .+.example-a.com | This regular expression has an unescaped '.' before 'example-a.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:44:9:44:101 | '^proto ... ernal)' | here |
| tst-IncompleteHostnameRegExp.js:44:64:44:79 | .+.example-b.com | This regular expression has an unescaped '.' before 'example-b.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:44:9:44:101 | '^proto ... ernal)' | here |
| tst-IncompleteHostnameRegExp.js:48:42:48:47 | ^https?://.+.example\\.com/ | This regular expression has an unescaped '.' before 'example\\.com/', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:48:13:48:69 | '^http: ... \\.com/' | here |
| tst-IncompleteHostnameRegExp.js:48:42:48:47 | ^https?://.+.example\\.com/ | This regular expression has an unrestricted wildcard '.+' which may cause 'example\\.com/' to be matched anywhere in the URL, outside the hostname. | tst-IncompleteHostnameRegExp.js:48:13:48:69 | '^http: ... \\.com/' | here |
| tst-IncompleteHostnameRegExp.js:48:42:48:68 | ^https?://.+.example\\.com/ | This string, which is used as a regular expression $@, has an unescaped '.' before 'example\\.com/', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:48:13:48:69 | '^http: ... \\.com/' | here |
| tst-IncompleteHostnameRegExp.js:48:42:48:68 | ^https?://.+.example\\.com/ | This string, which is used as a regular expression $@, has an unrestricted wildcard '.+' which may cause 'example\\.com/' to be matched anywhere in the URL, outside the hostname. | tst-IncompleteHostnameRegExp.js:48:13:48:69 | '^http: ... \\.com/' | here |
| tst-IncompleteHostnameRegExp.js:53:14:53:35 | test.example.com$ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:53:13:53:36 | 'test.' ... e.com$' | here |
| tst-IncompleteHostnameRegExp.js:59:5:59:20 | foo.example\\.com | This regular expression has an unescaped '.' before 'example\\.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:59:2:59:32 | /^(foo. ... ever)$/ | here |