Merge pull request #461 from xiemaisi/js/bye-bye-rhino

Approved by esben-semmle
This commit is contained in:
semmle-qlci
2018-11-14 14:00:07 +00:00
committed by GitHub
12 changed files with 4060 additions and 1648 deletions

View File

@@ -21,6 +21,10 @@ public class SourceLocation {
this(source, start, null);
}
/**
 * Copy constructor: delegates to the three-argument constructor, passing the
 * {@code source}, {@code start} and {@code end} of {@code that} through unchanged
 * (the start/end objects themselves are not copied here).
 */
public SourceLocation(SourceLocation that) {
this(that.source, that.start, that.end);
}
/**
* The source code contained in this location.
*/

View File

@@ -7,6 +7,20 @@ import com.semmle.js.ast.SourceLocation;
* An error encountered while parsing a regular expression.
*/
public class Error extends SourceElement {
public static final int UNEXPECTED_EOS = 0;
public static final int UNEXPECTED_CHARACTER = 1;
public static final int EXPECTED_DIGIT = 2;
public static final int EXPECTED_HEX_DIGIT = 3;
public static final int EXPECTED_CONTROL_LETTER = 4;
public static final int EXPECTED_CLOSING_PAREN = 5;
public static final int EXPECTED_CLOSING_BRACE = 6;
public static final int EXPECTED_EOS = 7;
public static final int OCTAL_ESCAPE = 8;
public static final int INVALID_BACKREF = 9;
public static final int EXPECTED_RBRACKET = 10;
public static final int EXPECTED_IDENTIFIER = 11;
public static final int EXPECTED_CLOSING_ANGLE = 12;
private final int code;
public Error(SourceLocation loc, Number code) {

View File

@@ -39,7 +39,7 @@ public class Main {
* such a way that it may produce different tuples for the same file under the same
* {@link ExtractorConfig}.
*/
public static final String EXTRACTOR_VERSION = "2018-10-16";
public static final String EXTRACTOR_VERSION = "2018-11-12";
public static final Pattern NEWLINE = Pattern.compile("\n");

File diff suppressed because it is too large Load Diff

View File

@@ -1,17 +1,9 @@
package com.semmle.js.parser;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.mozilla.javascript.Function;
import org.mozilla.javascript.NativeArray;
import org.mozilla.javascript.NativeObject;
import org.mozilla.javascript.ScriptableObject;
import com.semmle.js.ast.Position;
import com.semmle.js.ast.SourceLocation;
import com.semmle.js.ast.regexp.BackReference;
import com.semmle.js.ast.regexp.Caret;
@@ -47,54 +39,9 @@ import com.semmle.js.ast.regexp.ZeroWidthPositiveLookahead;
import com.semmle.js.ast.regexp.ZeroWidthPositiveLookbehind;
/**
* Wrapper for invoking esregex through Rhino.
* A parser for ECMAScript 2018 regular expressions.
*/
public class RegExpParser extends ScriptLoader {
/**
 * Specification for esregex AST types.
 *
 * <p>Associates each {@link RegExpTerm} subclass with a list of strings; the table is
 * handed to {@code JSObjectDecoder} in {@code parse}. Presumably each string names a
 * property of the JavaScript AST object that is decoded for that node type -- TODO
 * confirm against {@code JSObjectDecoder}. Entries are kept in insertion order because
 * the map is a {@link LinkedHashMap}.
 */
private static final Map<Class<? extends RegExpTerm>, List<String>> spec = new LinkedHashMap<Class<? extends RegExpTerm>, List<String>>();
static {
spec.put(BackReference.class, Arrays.asList("value", "raw"));
spec.put(Caret.class, Collections.<String>emptyList());
spec.put(CharacterClass.class, Arrays.asList("elements", "inverted"));
spec.put(CharacterClassEscape.class, Arrays.asList("class", "raw"));
spec.put(CharacterClassRange.class, Arrays.asList("left", "right"));
spec.put(Constant.class, Arrays.asList("value"));
spec.put(ControlEscape.class, Arrays.asList("value", "codepoint", "raw"));
spec.put(ControlLetter.class, Arrays.asList("value", "codepoint", "raw"));
spec.put(DecimalEscape.class, Arrays.asList("value", "codepoint", "raw"));
spec.put(Disjunction.class, Arrays.asList("disjuncts"));
spec.put(Dollar.class, Collections.<String>emptyList());
spec.put(Dot.class, Collections.<String>emptyList());
spec.put(Group.class, Arrays.asList("capture", "number", "name", "operand"));
spec.put(HexEscapeSequence.class, Arrays.asList("value", "codepoint", "raw"));
spec.put(IdentityEscape.class, Arrays.asList("value", "codepoint", "raw"));
spec.put(NamedBackReference.class, Arrays.asList("name", "raw"));
spec.put(NonWordBoundary.class, Collections.<String>emptyList());
spec.put(OctalEscape.class, Arrays.asList("value", "codepoint", "raw"));
spec.put(Opt.class, Arrays.asList("operand", "greedy"));
spec.put(Plus.class, Arrays.asList("operand", "greedy"));
spec.put(Range.class, Arrays.asList("operand", "greedy", "lo", "hi"));
spec.put(Sequence.class, Arrays.asList("elements"));
spec.put(Star.class, Arrays.asList("operand", "greedy"));
spec.put(UnicodeEscapeSequence.class, Arrays.asList("value", "codepoint", "raw"));
spec.put(WordBoundary.class, Collections.<String>emptyList());
spec.put(ZeroWidthNegativeLookahead.class, Arrays.asList("operand"));
spec.put(ZeroWidthPositiveLookahead.class, Arrays.asList("operand"));
spec.put(ZeroWidthNegativeLookbehind.class, Arrays.asList("operand"));
spec.put(ZeroWidthPositiveLookbehind.class, Arrays.asList("operand"));
spec.put(UnicodePropertyEscape.class, Arrays.asList("name", "value", "raw"));
}
/**
 * Specification for esregex parse errors.
 *
 * <p>Same shape as the AST table, but for {@link Error}: the single entry lists
 * {@code "code"}. It is passed to a second {@code JSObjectDecoder} in {@code parse}
 * to decode the parser's {@code errors} array.
 */
private static final Map<Class<? extends Error>, List<String>> errspec = new LinkedHashMap<Class<? extends Error>, List<String>>();
static {
errspec.put(Error.class, Arrays.asList("code"));
}
public class RegExpParser {
/**
* The result of a parse.
*/
@@ -102,12 +49,12 @@ public class RegExpParser extends ScriptLoader {
/**
* The root of the parsed AST.
*/
private final RegExpTerm ast;
public final RegExpTerm ast;
/**
* A list of errors encountered during parsing.
*/
private final List<Error> errors;
public final List<Error> errors;
public Result(RegExpTerm ast, List<Error> errors) {
this.ast = ast;
@@ -123,28 +70,450 @@ public class RegExpParser extends ScriptLoader {
}
}
/**
 * Creates a parser, passing the resource path {@code /regexparser.js} to the
 * superclass constructor.
 *
 * NOTE(review): this only compiles while the class extends a type with a
 * {@code String} constructor (the {@code ScriptLoader} variant of the class header);
 * confirm whether it should remain alongside the hand-written parser.
 */
public RegExpParser() {
super("/regexparser.js");
}
/** The regular expression text currently being parsed; assigned at the start of {@code parse}. */
private String src;
/** Offset into {@code src} of the next character to be consumed. */
private int pos;
/** Errors recorded so far for the current parse. */
private List<Error> errors;
/** Numeric back references seen so far; they are range-checked once parsing has finished. */
private List<BackReference> backrefs;
/** Number of capture groups seen so far; incremented for every capturing group. */
private int maxbackref;
/**
* Parse the given string as a regular expression.
*/
public Result parse(String src) {
Function ctor = (Function)readGlobal("RegExpParser");
ScriptableObject parser = construct(ctor, src);
NativeObject ast = (NativeObject)callMethod(parser, "Pattern");
NativeArray errors = (NativeArray)readProperty(parser, "errors");
JSObjectDecoder<RegExpTerm> decoder = new JSObjectDecoder<RegExpTerm>(src, this, "com.semmle.js.ast.regexp", spec);
List<Error> errs = null;
RegExpTerm term = null;
try {
term = decoder.decodeObject(ast);
errs = new JSObjectDecoder<Error>(src, this, "com.semmle.js.ast.regexp", errspec).decodeObjects(errors);
} catch (ParseError e) {
errs = new ArrayList<Error>();
errs.add(new Error(new SourceLocation("", e.getPosition(), e.getPosition()), 1));
this.src = src;
this.pos = 0;
this.errors = new ArrayList<>();
this.backrefs = new ArrayList<>();
this.maxbackref = 0;
RegExpTerm root = parsePattern();
for (BackReference backref : backrefs)
if (backref.getValue() > maxbackref)
errors.add(new Error(backref.getLoc(), Error.INVALID_BACKREF));
return new Result(root, errors);
}
/**
 * Converts a code point to its string form.
 *
 * @param codepoint the code point to convert
 * @return the UTF-16 encoding of {@code codepoint}, or U+FFFD (the replacement
 *         character) if {@code codepoint} is not a valid Unicode code point
 */
private static String fromCodePoint(int codepoint) {
  if (!Character.isValidCodePoint(codepoint)) {
    // replacement character
    return "\ufffd";
  }
  char[] units = Character.toChars(codepoint);
  return new String(units);
}
/**
 * Builds a {@link Position} for the current parse offset. The first argument is the
 * constant 1 (presumably the line number -- confirm against {@code Position}); the
 * current offset is used for both remaining arguments.
 */
private Position pos() {
  final int offset = this.pos;
  return new Position(1, offset, offset);
}
/**
 * Records an error with the given code for the input range from {@code start}
 * (inclusive) to {@code end} (exclusive).
 *
 * @param code one of the {@code Error} code constants
 * @param start offset at which the offending text starts
 * @param end offset just past the offending text
 */
private void error(int code, int start, int end) {
  Position from = new Position(1, start, start);
  Position to = new Position(1, end, end);
  String text = inputSubstring(start, end);
  SourceLocation where = new SourceLocation(text, from, to);
  this.errors.add(new Error(where, code));
}
/**
 * Records an error with the given code covering the single character at {@code start}.
 */
private void error(int code, int start) {
  final int end = start + 1;
  this.error(code, start, end);
}
/**
 * Records an error with the given code at the current parse position.
 */
private void error(int code) {
  final int here = this.pos;
  this.error(code, here);
}
/** Returns {@code true} once every character of the input has been consumed. */
private boolean atEOS() {
  final int length = src.length();
  return length <= pos;
}
/**
 * Returns the character at the current position without consuming it.
 *
 * @param opt if {@code false}, reaching the end of the input is recorded as an
 *            {@code UNEXPECTED_EOS} error; if {@code true}, it is tolerated silently
 * @return the current character, or {@code '\0'} when the input is exhausted
 */
private char peekChar(boolean opt) {
  if (this.atEOS()) {
    if (!opt)
      this.error(Error.UNEXPECTED_EOS);
    return '\0';
  }
  return this.src.charAt(this.pos);
}
/**
 * Consumes and returns the current character. At the end of the input an
 * {@code UNEXPECTED_EOS} error is recorded (via {@code peekChar(false)}), {@code '\0'}
 * is returned and the position is left unchanged.
 */
private char nextChar() {
  final char current = peekChar(false);
  boolean exhausted = this.pos >= src.length();
  if (!exhausted)
    this.pos++;
  return current;
}
/**
 * Reads a single ASCII hexadecimal digit.
 *
 * @return the digit as a one-character string, or the empty string if the current
 *         character is not a hex digit (in which case nothing is consumed)
 */
private String readHexDigit() {
  // At the end of the input this records UNEXPECTED_EOS and yields '\0'.
  char c = this.peekChar(false);
  boolean isHex = c >= '0' && c <= '9' || c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F';
  if (isHex) {
    ++this.pos;
    return String.valueOf(c);
  }
  // Test for end-of-input explicitly rather than comparing against the '\0' sentinel:
  // a literal NUL character inside the pattern is an ordinary non-hex character and
  // must be reported, whereas at end-of-input the error was already recorded above.
  if (!this.atEOS())
    this.error(Error.EXPECTED_HEX_DIGIT, this.pos);
  return "";
}
/**
 * Attempts to read {@code n} hexadecimal digits, one {@code readHexDigit} call per
 * requested digit.
 *
 * @param n the number of digits to attempt; nothing is read if it is not positive
 * @return the digits that were read, or {@code "0"} if none were
 */
private String readHexDigits(int n) {
  StringBuilder digits = new StringBuilder();
  for (int remaining = n; remaining > 0; --remaining) {
    digits.append(readHexDigit());
  }
  return digits.length() == 0 ? "0" : digits.toString();
}
/**
 * Reads a maximal run of ASCII decimal digits.
 *
 * @param opt if {@code false}, an empty run is recorded as an {@code EXPECTED_DIGIT} error
 * @return the digits read, possibly the empty string
 */
private String readDigits(boolean opt) {
  StringBuilder digits = new StringBuilder();
  char c = peekChar(true);
  while (c >= '0' && c <= '9') {
    digits.append(c);
    nextChar();
    c = peekChar(true);
  }
  boolean missing = digits.length() == 0;
  if (missing && !opt)
    this.error(Error.EXPECTED_DIGIT);
  return digits.toString();
}
/**
 * Converts a digit string to a number, treating the empty string as zero.
 *
 * @param s the text to convert
 * @return {@code 0.0} for the empty string, otherwise {@code Double.valueOf(s)}
 */
private Double toNumber(String s) {
  if (!s.isEmpty()) {
    return Double.valueOf(s);
  }
  return 0.0;
}
/**
 * Reads a maximal run of characters accepted by
 * {@link Character#isJavaIdentifierPart(char)}, stopping at {@code '\0'}.
 *
 * @return the identifier read; if it is empty, an {@code EXPECTED_IDENTIFIER} error
 *         is recorded
 */
private String readIdentifier() {
  StringBuilder name = new StringBuilder();
  char c = peekChar(true);
  // '\0' is checked explicitly: it is what peekChar yields at the end of the input
  while (c != '\0' && Character.isJavaIdentifierPart(c)) {
    name.append(c);
    nextChar();
    c = peekChar(true);
  }
  if (name.length() == 0)
    this.error(Error.EXPECTED_IDENTIFIER);
  return name.toString();
}
/**
 * Consumes a closing parenthesis; if there is none, records an
 * {@code EXPECTED_CLOSING_PAREN} error at the previous character.
 */
private void expectRParen() {
  if (this.match(")"))
    return;
  this.error(Error.EXPECTED_CLOSING_PAREN, this.pos-1);
}
/**
 * Consumes a closing brace; if there is none, records an
 * {@code EXPECTED_CLOSING_BRACE} error at the previous character.
 */
private void expectRBrace() {
  if (this.match("}"))
    return;
  this.error(Error.EXPECTED_CLOSING_BRACE, this.pos-1);
}
/**
 * Consumes a closing angle bracket; if there is none, records an
 * {@code EXPECTED_CLOSING_ANGLE} error at the previous character.
 */
private void expectRAngle() {
  if (this.match(">"))
    return;
  this.error(Error.EXPECTED_CLOSING_ANGLE, this.pos-1);
}
/**
 * Tests, without consuming anything, whether the remaining input starts with any of
 * the given strings.
 *
 * @param arguments the candidate prefixes; a {@code null} entry stands for "end of input"
 * @return {@code true} if at least one candidate matches at the current position
 */
private boolean lookahead(String... arguments) {
  for (int i = 0; i < arguments.length; ++i) {
    String candidate = arguments[i];
    boolean hit;
    if (candidate == null) {
      hit = atEOS();
    } else {
      String upcoming = inputSubstring(pos, pos+candidate.length());
      hit = upcoming.equals(candidate);
    }
    if (hit)
      return true;
  }
  return false;
}
/**
 * Like {@code lookahead}, but also consumes the first candidate that matches.
 *
 * @param arguments the candidate prefixes, tried in order; a {@code null} entry stands
 *                  for "end of input" and consumes nothing
 * @return {@code true} if a candidate matched and was consumed
 */
private boolean match(String... arguments) {
  for (String candidate : arguments) {
    if (!this.lookahead(candidate))
      continue;
    int consumed = candidate == null ? 0 : candidate.length();
    this.pos += consumed;
    return true;
  }
  return false;
}
/**
 * Parses a complete pattern: one disjunction that must extend to the end of the
 * input. Leftover input is recorded as an {@code EXPECTED_EOS} error.
 */
private RegExpTerm parsePattern() {
  final RegExpTerm pattern = parseDisjunction();
  boolean leftover = !this.atEOS();
  if (leftover)
    this.error(Error.EXPECTED_EOS);
  return pattern;
}
/**
 * Returns the input text between {@code start} (inclusive) and {@code end}
 * (exclusive), tolerating ranges that extend beyond the end of the input.
 *
 * @return the empty string if {@code start} is at or past the end of the input;
 *         otherwise the substring, with {@code end} capped at the input length
 */
protected String inputSubstring(int start, int end) {
  final int limit = src.length();
  if (start >= limit)
    return "";
  int stop = Math.min(end, limit);
  return src.substring(start, stop);
}
/**
 * Completes the location of a term: its end is set to the current position and its
 * source text to the input between the term's start and that position.
 *
 * @return {@code term} itself, so the call can be used inline
 */
private <T extends RegExpTerm> T finishTerm(T term) {
  final SourceLocation where = term.getLoc();
  final Position stop = pos();
  String covered = inputSubstring(where.getStart().getOffset(), stop.getOffset());
  where.setSource(covered);
  where.setEnd(stop);
  return term;
}
/**
 * Parses one or more alternatives separated by {@code |}.
 *
 * @return the alternative itself if there is only one, otherwise a {@code Disjunction}
 */
private RegExpTerm parseDisjunction() {
  final SourceLocation start = new SourceLocation(pos());
  final List<RegExpTerm> alternatives = new ArrayList<>();
  do {
    alternatives.add(this.parseAlternative());
  } while (this.match("|"));
  if (alternatives.size() != 1)
    return this.finishTerm(new Disjunction(start, alternatives));
  return alternatives.get(0);
}
/**
 * Parses a sequence of terms, stopping at the end of the input, at {@code |} or
 * at {@code )}.
 *
 * @return the term itself if exactly one was parsed, otherwise a {@code Sequence}
 *         (which is empty if no term was parsed)
 */
private RegExpTerm parseAlternative() {
  final SourceLocation start = new SourceLocation(pos());
  final List<RegExpTerm> parts = new ArrayList<>();
  for (;;) {
    if (this.lookahead(null, "|", ")"))
      break;
    parts.add(this.parseTerm());
  }
  if (parts.size() != 1)
    return this.finishTerm(new Sequence(start, parts));
  return parts.get(0);
}
/**
 * Parses a single term: an anchor ({@code ^}, {@code $}, {@code \b}, {@code \B}),
 * a lookahead/lookbehind assertion, or an atom with an optional quantifier.
 */
private RegExpTerm parseTerm() {
  final SourceLocation start = new SourceLocation(pos());

  // anchors and word boundaries
  if (this.match("^"))
    return this.finishTerm(new Caret(start));
  if (this.match("$"))
    return this.finishTerm(new Dollar(start));
  if (this.match("\\b"))
    return this.finishTerm(new WordBoundary(start));
  if (this.match("\\B"))
    return this.finishTerm(new NonWordBoundary(start));

  // lookaround assertions: opening delimiter, body, closing parenthesis
  if (this.match("(?=")) {
    RegExpTerm body = this.parseDisjunction();
    this.expectRParen();
    ZeroWidthPositiveLookahead assertion = new ZeroWidthPositiveLookahead(start, body);
    return this.finishTerm(assertion);
  }
  if (this.match("(?!")) {
    RegExpTerm body = this.parseDisjunction();
    this.expectRParen();
    ZeroWidthNegativeLookahead assertion = new ZeroWidthNegativeLookahead(start, body);
    return this.finishTerm(assertion);
  }
  if (this.match("(?<=")) {
    RegExpTerm body = this.parseDisjunction();
    this.expectRParen();
    ZeroWidthPositiveLookbehind assertion = new ZeroWidthPositiveLookbehind(start, body);
    return this.finishTerm(assertion);
  }
  if (this.match("(?<!")) {
    RegExpTerm body = this.parseDisjunction();
    this.expectRParen();
    ZeroWidthNegativeLookbehind assertion = new ZeroWidthNegativeLookbehind(start, body);
    return this.finishTerm(assertion);
  }

  // everything else is an atom, possibly followed by a quantifier
  RegExpTerm atom = this.parseAtom();
  RegExpTerm quantified = this.parseQuantifierOpt(start, atom);
  return this.finishTerm(quantified);
}
/**
 * Parses an optional quantifier following {@code atom}.
 *
 * <p>Recognised forms are {@code *}, {@code +}, {@code ?} and the braced forms
 * {@code {n}}, {@code {n,}} and {@code {n,m}}, each optionally followed by {@code ?}
 * to make the quantifier non-greedy.
 *
 * <p>For the braced forms the bounds are reported as follows:
 * <ul>
 * <li>{@code {n}} (exactly n): lower and upper bound are both n;
 * <li>{@code {n,}} (at least n): the upper bound is {@code null};
 * <li>{@code {n,m}}: lower bound n, upper bound m.
 * </ul>
 *
 * @param loc location of the start of the quantified term
 * @param atom the term the quantifier applies to
 * @return the quantified term, or {@code atom} itself if no quantifier follows
 */
private RegExpTerm parseQuantifierOpt(SourceLocation loc, RegExpTerm atom) {
  if (this.match("*"))
    return this.finishTerm(new Star(loc, atom, !this.match("?")));
  if (this.match("+"))
    return this.finishTerm(new Plus(loc, atom, !this.match("?")));
  if (this.match("?"))
    return this.finishTerm(new Opt(loc, atom, !this.match("?")));
  if (this.match("{")) {
    Double lo = toNumber(this.readDigits(false)), hi;
    if (this.match(",")) {
      // `{n,}` is unbounded above; `{n,m}` has an explicit upper bound
      if (this.lookahead("}"))
        hi = null;
      else
        hi = toNumber(this.readDigits(false));
    } else {
      // `{n}` means exactly n repetitions; leaving `hi` unset here would make it
      // indistinguishable from the unbounded form `{n,}`
      hi = lo;
    }
    this.expectRBrace();
    return this.finishTerm(new Range(loc, atom, !this.match("?"), lo, hi));
  }
  return atom;
}
/**
 * Parses an atom: {@code .}, an escape sequence, a character class, a group, or a
 * single literal character.
 *
 * <p>For groups, {@code (?:} marks a non-capturing group and {@code ?<name>} supplies
 * a group name. Every capturing group bumps the running group count, which is also
 * recorded as the group's number. A literal character that is a regular expression
 * metacharacter is still returned as a {@code Constant}, but an
 * {@code UNEXPECTED_CHARACTER} error is recorded for it.
 */
private RegExpTerm parseAtom() {
  final SourceLocation start = new SourceLocation(pos());
  if (this.match("."))
    return this.finishTerm(new Dot(start));
  if (this.match("\\"))
    return this.parseAtomEscape(start, false);
  if (this.lookahead("["))
    return this.parseCharacterClass();

  if (!this.match("(")) {
    // not a group: a single literal character
    char literal = this.nextChar();
    boolean special = "^$\\.*+?()[]{}|".indexOf(literal) >= 0;
    if (special)
      this.error(Error.UNEXPECTED_CHARACTER, this.pos-1);
    return this.finishTerm(new Constant(start, String.valueOf(literal)));
  }

  boolean capturing = !this.match("?:");
  String groupName = null;
  if (this.match("?<")) {
    groupName = this.readIdentifier();
    this.expectRAngle();
  }
  if (capturing)
    this.maxbackref += 1;
  int index = this.maxbackref;
  RegExpTerm body = this.parseDisjunction();
  this.expectRParen();
  return this.finishTerm(new Group(start, capturing, index, groupName, body));
}
private RegExpTerm parseAtomEscape(SourceLocation loc, boolean inCharClass) {
String raw, value;
double codepoint;
if (this.match("x")) {
raw = this.readHexDigits(2);
codepoint = Integer.parseInt(raw, 16);
value = fromCodePoint((int) codepoint);
return this.finishTerm(new HexEscapeSequence(loc, value, (double)codepoint, "\\x" + raw));
}
if (this.match("u")) {
if (this.match("{")) {
int closePos = this.src.indexOf("}", this.pos);
int n;
if (closePos == -1) {
// don't attempt to read any digits, but
// report missing `}`
n = 0;
} else if (closePos == this.pos) {
// empty escape sequence, trigger an error
n = 1;
} else {
n = closePos - this.pos;
}
raw = this.readHexDigits(n);
this.expectRBrace();
try {
codepoint = Long.parseLong(raw, 16);
} catch (NumberFormatException nfe) {
codepoint = 0;
}
raw = "{" + raw + "}";
} else {
raw = this.readHexDigits(4);
codepoint = Integer.parseInt(raw, 16);
}
value = fromCodePoint((int) codepoint);
return this.finishTerm(new UnicodeEscapeSequence(loc, value, (double)codepoint, "\\u" + raw));
}
if (this.match("k<")) {
String name = this.readIdentifier();
this.expectRAngle();
return this.finishTerm(new NamedBackReference(loc, name, "\\k<" + name + ">"));
}
if (this.match("p{", "P{")) {
String name = this.readIdentifier();
if (this.match("=")) {
value = this.readIdentifier();
raw = "\\p{" + name + "=" + value + "}";
} else {
value = null;
raw = "\\p{" + name + "}";
}
this.expectRBrace();
return this.finishTerm(new UnicodePropertyEscape(loc, name, value, raw));
}
int startpos = this.pos-1;
char c = this.nextChar();
if (c >= '0' && c <= '9') {
raw = c + this.readDigits(true);
if (c == '0' || inCharClass) {
int base = c == '0' && raw.length() > 1 ? 8 : 10;
try {
codepoint = Long.parseLong(raw, base);
value = fromCodePoint((int) codepoint);
} catch (NumberFormatException nfe) {
codepoint = 0;
value = "\0";
}
if (base == 8) {
this.error(Error.OCTAL_ESCAPE, startpos, this.pos);
return this.finishTerm(new OctalEscape(loc, value, (double)codepoint, "\\" + raw));
} else {
return this.finishTerm(new DecimalEscape(loc, value, (double)codepoint, "\\" + raw));
}
} else {
try {
codepoint = Long.parseLong(raw, 10);
} catch (NumberFormatException nfe) {
codepoint = 0;
}
BackReference br = this.finishTerm(new BackReference(loc, (double)codepoint, "\\" + raw));
this.backrefs.add(br);
return br;
}
}
String ctrltab = "f\fn\nr\rt\tv\u000b";
int idx;
if ((idx=ctrltab.indexOf(c)) % 2 == 0) {
codepoint = ctrltab.charAt(idx+1);
value = String.valueOf((char)codepoint);
return this.finishTerm(new ControlEscape(loc, value, codepoint, "\\" + c));
}
if (c == 'c') {
c = this.nextChar();
if (!(c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z'))
this.error(Error.EXPECTED_CONTROL_LETTER, this.pos-1);
codepoint = c % 32;
value = String.valueOf((char)codepoint);
return this.finishTerm(new ControlLetter(loc, value, codepoint, "\\c" + c));
}
if ("dDsSwW".indexOf(c) >= 0) {
return this.finishTerm(new CharacterClassEscape(loc, String.valueOf(c), "\\" + c));
}
codepoint = c;
value = String.valueOf((char)codepoint);
return this.finishTerm(new IdentityEscape(loc, value, codepoint, "\\" + c));
}
/**
 * Parses a character class such as {@code [a-z]} or {@code [^0-9]}; the current
 * position is expected to be at the opening bracket. If the input ends before the
 * closing bracket, an {@code EXPECTED_RBRACKET} error is recorded and the elements
 * parsed so far are returned.
 */
private RegExpTerm parseCharacterClass() {
  final SourceLocation start = new SourceLocation(pos());
  final List<RegExpTerm> members = new ArrayList<>();
  this.match("[");
  final boolean negated = this.match("^");
  for (;;) {
    if (this.match("]"))
      break;
    if (this.atEOS()) {
      this.error(Error.EXPECTED_RBRACKET);
      break;
    }
    members.add(this.parseCharacterClassElement());
  }
  return this.finishTerm(new CharacterClass(start, members, negated));
}
/**
 * Parses one element of a character class: a single atom, or a range
 * {@code atom-atom}. A dash immediately before the closing bracket is not treated
 * as a range operator.
 */
private RegExpTerm parseCharacterClassElement() {
  final SourceLocation start = new SourceLocation(pos());
  final RegExpTerm first = this.parseCharacterClassAtom();
  boolean trailingDash = this.lookahead("-]");
  if (trailingDash || !this.match("-"))
    return first;
  RegExpTerm last = this.parseCharacterClassAtom();
  return this.finishTerm(new CharacterClassRange(start, first, last));
}
/**
 * Parses a single character class atom: an escape sequence or a literal character.
 * Inside a character class, {@code \b} denotes the backspace character (code 8)
 * rather than a word boundary.
 */
private RegExpTerm parseCharacterClassAtom() {
  final SourceLocation start = new SourceLocation(pos());
  final char first = this.nextChar();
  if (first != '\\')
    return this.finishTerm(new Constant(start, String.valueOf(first)));
  if (this.match("b"))
    return this.finishTerm(new ControlEscape(start, "\b", 8, "\\b"));
  RegExpTerm escape = this.parseAtomEscape(start, true);
  return this.finishTerm(escape);
}
}