fix parse error in regular expressions

This commit is contained in:
Erik Krogh Kristensen
2021-03-08 11:57:28 +01:00
parent 84554af7f5
commit bff59a1aaa
5 changed files with 33 additions and 4 deletions

View File

@@ -43,7 +43,7 @@ public class Main {
* A version identifier that should be updated every time the extractor changes in such a way that
* it may produce different tuples for the same file under the same {@link ExtractorConfig}.
*/
public static final String EXTRACTOR_VERSION = "2021-02-24";
public static final String EXTRACTOR_VERSION = "2021-03-08";
public static final Pattern NEWLINE = Pattern.compile("\n");

View File

@@ -282,11 +282,18 @@ public class RegExpParser {
if (this.match("+")) return this.finishTerm(new Plus(loc, atom, !this.match("?")));
if (this.match("?")) return this.finishTerm(new Opt(loc, atom, !this.match("?")));
if (this.match("{")) {
Double lo = toNumber(this.readDigits(false)), hi;
String matched = "{"; // keeping track of the string matched so far, in case this turns out not to be a quantifier.
String digits = this.readDigits(false);
matched += digits;
Double lo = toNumber(digits), hi;
int prevPos = this.pos;
if (this.match(",")) {
matched += ",";
if (!this.lookahead("}")) {
// atom{lo, hi}
hi = toNumber(this.readDigits(false));
digits = this.readDigits(false);
matched += digits;
hi = toNumber(digits);
} else {
// atom{lo,}
hi = null;
@@ -295,7 +302,11 @@ public class RegExpParser {
// atom{lo}
hi = lo;
}
this.expectRBrace();
if (!this.match("}")) {
// Not a quantifier, just parsing it as a constant.
// E.g. a Regexp such as `/a{|X/`, where there is no matching `}`.
return this.finishTerm(new Sequence(loc, Arrays.asList(atom, new Constant(loc, matched))));
}
return this.finishTerm(new Range(loc, atom, !this.match("?"), lo, hi));
}
return atom;

View File

@@ -168,3 +168,7 @@
| tst.js:351:15:351:16 | a+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
| tst.js:352:15:352:16 | a* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
| tst.js:353:15:353:16 | a+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
| tst.js:360:15:360:30 | ((?:a{\|-)\|\\w\\{)+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a{'. |
| tst.js:361:15:361:33 | ((?:a{0\|-)\|\\w\\{\\d)+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a{0'. |
| tst.js:362:15:362:35 | ((?:a{0,\|-)\|\\w\\{\\d,)+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a{0,'. |
| tst.js:363:15:363:38 | ((?:a{0,2\|-)\|\\w\\{\\d,\\d)+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a{0,2'. |

View File

@@ -355,3 +355,12 @@ var bad82 = /(a+)+b/;
// GOOD
var good40 = /(a|b)+/;
var good41 = /(?:[\s;,"'<>(){}|[\]@=+*]|:(?![/\\]))+/;
// NOT GOOD
var bad83 = /^((?:a{|-)|\w\{)+X$/;
var bad84 = /^((?:a{0|-)|\w\{\d)+X$/;
var bad85 = /^((?:a{0,|-)|\w\{\d,)+X$/;
var bad86 = /^((?:a{0,2|-)|\w\{\d,\d)+X$/;
// GOOD:
var good42 = /^((?:a{0,2}|-)|\w\{\d,\d\})+X$/;

View File

@@ -85,3 +85,8 @@ function nonWordBoundary(x) {
function emptyRegex(x) {
return new RegExp("").test(x); // NOT OK
}
function parserTest(x) { // Fixture: runs two brace-containing regex literals against x; both lines are annotated OK.
  /(\w\s*:\s*[^:}]+|#){|@import[^\n]+(?:url|,)/.test(x); // OK - the `{` after `)` has no closing `}`
  /^((?:a{0,2}|-)|\w\{\d,\d\})+X$/.test(x); // OK - `{0,2}` is a complete quantifier; call was `.text(x)`, which is not a RegExp method
}