package com.semmle.js.parser;

import com.semmle.js.ast.Position;
import com.semmle.js.ast.SourceLocation;
import com.semmle.js.ast.regexp.BackReference;
import com.semmle.js.ast.regexp.Caret;
import com.semmle.js.ast.regexp.CharacterClass;
import com.semmle.js.ast.regexp.CharacterClassEscape;
import com.semmle.js.ast.regexp.CharacterClassRange;
import com.semmle.js.ast.regexp.Constant;
import com.semmle.js.ast.regexp.ControlEscape;
import com.semmle.js.ast.regexp.ControlLetter;
import com.semmle.js.ast.regexp.DecimalEscape;
import com.semmle.js.ast.regexp.Disjunction;
import com.semmle.js.ast.regexp.Dollar;
import com.semmle.js.ast.regexp.Dot;
import com.semmle.js.ast.regexp.Error;
import com.semmle.js.ast.regexp.Group;
import com.semmle.js.ast.regexp.HexEscapeSequence;
import com.semmle.js.ast.regexp.IdentityEscape;
import com.semmle.js.ast.regexp.NamedBackReference;
import com.semmle.js.ast.regexp.NonWordBoundary;
import com.semmle.js.ast.regexp.OctalEscape;
import com.semmle.js.ast.regexp.Opt;
import com.semmle.js.ast.regexp.Plus;
import com.semmle.js.ast.regexp.Range;
import com.semmle.js.ast.regexp.RegExpTerm;
import com.semmle.js.ast.regexp.Sequence;
import com.semmle.js.ast.regexp.Star;
import com.semmle.js.ast.regexp.UnicodeEscapeSequence;
import com.semmle.js.ast.regexp.UnicodePropertyEscape;
import com.semmle.js.ast.regexp.WordBoundary;
import com.semmle.js.ast.regexp.ZeroWidthNegativeLookahead;
import com.semmle.js.ast.regexp.ZeroWidthNegativeLookbehind;
import com.semmle.js.ast.regexp.ZeroWidthPositiveLookahead;
import com.semmle.js.ast.regexp.ZeroWidthPositiveLookbehind;
import java.util.ArrayList;
import java.util.List;

/**
 * A parser for ECMAScript 2018 regular expressions.
 *
 * Call {@link #parse(String)} with the pattern text; it returns a {@link Result} holding the root
 * term of the parsed AST and the errors recorded along the way. All parse state lives in instance
 * fields that {@code parse} resets on every call.
 *
 * NOTE(review): this copy looks damaged by text extraction. Generic type arguments are gone (raw
 * {@code List}, a type variable {@code T} that is never declared), one line comment swallows
 * several helper methods, and a span that starts inside the {@code "(?} literal near the end of
 * {@code parseTerm} is missing. The code is kept exactly as found; compare with the upstream file
 * before building on it.
 */
public class RegExpParser {
  /** The result of a parse: the AST root together with the errors that were recorded. */
  public static class Result {
    /** The root of the parsed AST. */
    public final RegExpTerm ast;

    /** A list of errors encountered during parsing. */
    public final List errors;

    /**
     * Stores both arguments as given; the list is not copied.
     *
     * @param ast root of the parsed AST
     * @param errors errors recorded while parsing
     */
    public Result(RegExpTerm ast, List errors) {
      this.ast = ast;
      this.errors = errors;
    }

    /** Returns the root of the parsed AST. */
    public RegExpTerm getAST() {
      return ast;
    }

    /** Returns the list of errors that was passed to the constructor. */
    public List getErrors() {
      return errors;
    }
  }

  /** The regular expression text currently being parsed. */
  private String src;

  /** Index into {@link #src} of the next character to read. */
  private int pos;

  /** Errors recorded so far during the current parse. */
  private List errors;

  /** Numeric back references seen so far; they are validated at the end of {@code parse}. */
  private List backrefs;

  /**
   * Upper bound used by {@code parse} to validate numeric back references; reset to 0 there.
   * NOTE(review): nothing visible in this copy increments it; presumably the capture-group
   * parsing that is missing here does. Confirm against the upstream file.
   */
  private int maxbackref;

  /**
   * Parse the given string as a regular expression.
   *
   * Resets all parser state, parses a whole pattern, and then adds an
   * {@code Error.INVALID_BACKREF} error for every recorded back reference whose value is greater
   * than {@code maxbackref}.
   *
   * @param src the regular expression text
   * @return the root term together with every error recorded during this call
   */
  public Result parse(String src) {
    this.src = src;
    this.pos = 0;
    this.errors = new ArrayList<>();
    this.backrefs = new ArrayList<>();
    this.maxbackref = 0;
    RegExpTerm root = parsePattern();
    // Back references can only be checked once the whole pattern has been read.
    for (BackReference backref : backrefs)
      if (backref.getValue() > maxbackref)
        errors.add(new Error(backref.getLoc(), Error.INVALID_BACKREF));
    return new Result(root, errors);
  }

  /**
   * Converts a code point to a string via {@link Character#toChars(int)} when
   * {@link Character#isValidCodePoint(int)} accepts it.
   *
   * @param codepoint the code point to convert
   */
  private static String fromCodePoint(int codepoint) {
    if (Character.isValidCodePoint(codepoint)) return new String(Character.toChars(codepoint));
    // NOTE(review): the line comment below runs to the end of its physical line, so in this copy
    // it comments out the fallback return of this method and, after it, pos(), the three
    // error(...) overloads, atEOS(), peekChar(boolean), nextChar(), readHexDigit() and the head
    // of readHexDigits(int). This looks like lost line breaks; the text is kept verbatim.
    // What that text shows:
    //   - fallback: returns the replacement character when the code point is not valid;
    //   - pos(): a Position built from (1, pos, pos);
    //   - error(code, start, end): appends an Error whose location carries
    //     inputSubstring(start, end); error(code, start) covers one character and error(code)
    //     uses the current position;
    //   - atEOS(): true when pos >= src.length();
    //   - peekChar(opt): the current char without advancing, or NUL at end of input (recording
    //     UNEXPECTED_EOS unless opt is true);
    //   - nextChar(): peekChar(false), then advances unless already at the end;
    //   - readHexDigit(): consumes and returns one hex digit; otherwise records
    //     EXPECTED_HEX_DIGIT (unless the char is NUL) and returns the empty string.
    // replacement character return "\ufffd"; } private Position pos() { return new Position(1, pos, pos); } private void error(int code, int start, int end) { Position startPos, endPos; startPos = new Position(1, start, start); endPos = new Position(1, end, end); this.errors.add( new Error(new SourceLocation(inputSubstring(start, end), startPos, endPos), code)); } private void error(int code, int start) { error(code, start, start + 1); } private void error(int code) { error(code, this.pos); } private boolean atEOS() { return pos >= src.length(); } private char peekChar(boolean opt) { if (this.atEOS()) { if (!opt) this.error(Error.UNEXPECTED_EOS); return '\0'; } else { return this.src.charAt(this.pos); } } private char nextChar() { char c = peekChar(false); if (this.pos < src.length()) ++this.pos; return c; } private String readHexDigit() { char c = this.peekChar(false); if (c >= '0' && c <= '9' || c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F') { ++this.pos; return String.valueOf(c); } if (c != '\0') this.error(Error.EXPECTED_HEX_DIGIT, this.pos); return ""; } private String readHexDigits(int n) { StringBuilder res = 
    // Tail of readHexDigits(int n): concatenates the results of n readHexDigit() calls and
    // yields "0" when nothing at all was read.
    new StringBuilder();
    while (n-- > 0) {
      res.append(readHexDigit());
    }
    if (res.length() == 0) return "0";
    return res.toString();
  }

  /**
   * Reads the run of ASCII decimal digits starting at the current position and advances past it.
   *
   * @param opt when false, an empty run records an {@code Error.EXPECTED_DIGIT} error
   * @return the digits read, possibly the empty string
   */
  private String readDigits(boolean opt) {
    StringBuilder res = new StringBuilder();
    // Peek with opt=true so that reaching the end of input is not itself reported as an error.
    for (char c = peekChar(true); c >= '0' && c <= '9'; nextChar(), c = peekChar(true))
      res.append(c);
    if (res.length() == 0 && !opt) this.error(Error.EXPECTED_DIGIT);
    return res.toString();
  }

  /**
   * Converts a digit string to a {@code Double}; the empty string maps to 0.0.
   *
   * @param s the text to convert, handed to {@link Double#valueOf(String)} when not empty
   */
  private Double toNumber(String s) {
    if (s.isEmpty()) return 0.0;
    return Double.valueOf(s);
  }

  /**
   * Reads characters for as long as {@link Character#isJavaIdentifierPart(char)} accepts them,
   * advancing past them. Records an {@code Error.EXPECTED_IDENTIFIER} error if nothing was read.
   *
   * @return the characters read, possibly the empty string
   */
  private String readIdentifier() {
    StringBuilder res = new StringBuilder();
    for (char c = peekChar(true);
        c != '\0' && Character.isJavaIdentifierPart(c);
        nextChar(), c = peekChar(true)) res.append(c);
    if (res.length() == 0) this.error(Error.EXPECTED_IDENTIFIER);
    return res.toString();
  }

  /** Consumes a closing parenthesis, or records {@code EXPECTED_CLOSING_PAREN} at pos - 1. */
  private void expectRParen() {
    if (!this.match(")")) this.error(Error.EXPECTED_CLOSING_PAREN, this.pos - 1);
  }

  /** Consumes a closing brace, or records {@code EXPECTED_CLOSING_BRACE} at pos - 1. */
  private void expectRBrace() {
    if (!this.match("}")) this.error(Error.EXPECTED_CLOSING_BRACE, this.pos - 1);
  }

  /** Consumes a closing angle bracket, or records {@code EXPECTED_CLOSING_ANGLE} at pos - 1. */
  private void expectRAngle() {
    if (!this.match(">")) this.error(Error.EXPECTED_CLOSING_ANGLE, this.pos - 1);
  }

  /**
   * Tests, without consuming anything, whether the input at the current position starts with any
   * of the given prefixes.
   *
   * @param arguments candidate prefixes; a {@code null} entry stands for "end of input"
   * @return true if at least one candidate applies
   */
  private boolean lookahead(String... arguments) {
    for (String prefix : arguments) {
      if (prefix == null) {
        if (atEOS()) return true;
      } else if (inputSubstring(pos, pos + prefix.length()).equals(prefix)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Like {@code lookahead}, but also advances past the first candidate that applies.
   *
   * @param arguments candidate prefixes, tried in order; a {@code null} entry stands for "end of
   *     input" and consumes nothing
   * @return true if a candidate applied (and was consumed)
   */
  private boolean match(String... arguments) {
    for (String prefix : arguments) {
      if (this.lookahead(prefix)) {
        // A null candidate matched the end of input; treat it as zero characters wide.
        if (prefix == null) prefix = "";
        this.pos += prefix.length();
        return true;
      }
    }
    return false;
  }

  /**
   * Parses a complete pattern: one disjunction that must extend to the end of the input,
   * otherwise an {@code Error.EXPECTED_EOS} error is recorded.
   */
  private RegExpTerm parsePattern() {
    RegExpTerm res = parseDisjunction();
    if (!this.atEOS()) this.error(Error.EXPECTED_EOS);
    return res;
  }

  /**
   * Returns the part of the input between two offsets, tolerating offsets past the end.
   *
   * @param start inclusive start offset; at or beyond the input length yields the empty string
   * @param end exclusive end offset; clamped to the input length
   * @return the selected text; a negative start or start greater than end is not guarded here
   *     and is passed on to {@link String#substring(int, int)}
   */
  protected String inputSubstring(int start, int end) {
    if (start >= src.length()) return "";
    if (end > src.length()) end = src.length();
    return src.substring(start, end);
  }

  /**
   * Completes the location of a term: sets its source text to the input between the location's
   * start offset and the current position, and sets its end to the current position.
   *
   * NOTE(review): {@code T} is not declared anywhere in this copy; presumably a method type
   * parameter was lost. Confirm against the upstream file.
   *
   * @param term the term whose location is updated in place
   * @return the same term
   */
  private T finishTerm(T term) {
    SourceLocation loc = term.getLoc();
    Position end = pos();
    loc.setSource(inputSubstring(loc.getStart().getOffset(), end.getOffset()));
    loc.setEnd(end);
    return term;
  }

  /**
   * Parses one or more alternatives separated by {@code |}.
   *
   * @return the alternative itself when there is exactly one, otherwise a {@code Disjunction}
   */
  private RegExpTerm parseDisjunction() {
    SourceLocation loc = new SourceLocation(pos());
    List disjuncts = new ArrayList<>();
    disjuncts.add(this.parseAlternative());
    while (this.match("|")) disjuncts.add(this.parseAlternative());
    if (disjuncts.size() == 1) return disjuncts.get(0);
    return this.finishTerm(new Disjunction(loc, disjuncts));
  }

  /**
   * Parses terms until the end of input, a {@code |} or a closing parenthesis is next.
   *
   * @return the term itself when there is exactly one, otherwise a {@code Sequence} (which is
   *     empty when no term was read)
   */
  private RegExpTerm parseAlternative() {
    SourceLocation loc = new SourceLocation(pos());
    List elements = new ArrayList<>();
    while (!this.lookahead(null, "|", ")")) elements.add(this.parseTerm());
    if (elements.size() == 1) return elements.get(0);
    return this.finishTerm(new Sequence(loc, elements));
  }

  /**
   * Parses a single term. The visible branches cover the anchors {@code ^} and {@code $}, the
   * word-boundary assertions, both lookaheads and the positive lookbehind.
   */
  private RegExpTerm parseTerm() {
    SourceLocation loc = new SourceLocation(pos());
    if (this.match("^")) return this.finishTerm(new Caret(loc));
    if (this.match("$")) return this.finishTerm(new Dollar(loc));
    if (this.match("\\b")) return this.finishTerm(new WordBoundary(loc));
    if (this.match("\\B")) return this.finishTerm(new NonWordBoundary(loc));
    if (this.match("(?=")) {
      RegExpTerm dis = this.parseDisjunction();
      this.expectRParen();
      return this.finishTerm(new ZeroWidthPositiveLookahead(loc, dis));
    }
    if (this.match("(?!")) {
      RegExpTerm dis = this.parseDisjunction();
      this.expectRParen();
      return this.finishTerm(new ZeroWidthNegativeLookahead(loc, dis));
    }
    if (this.match("(?<=")) {
      RegExpTerm dis = this.parseDisjunction();
      this.expectRParen();
      return this.finishTerm(new ZeroWidthPositiveLookbehind(loc, dis));
    }
    // NOTE(review): text appears to be missing between the string literal below and the tokens
    // that follow it. As written this is an if with an empty body, the method ends without a
    // return, and the statements after the closing brace sit outside any method.
    if (this.match("(?"));
  }

    // NOTE(review): the statements from here to the next closing brace use loc, inCharClass, raw,
    // value and codepoint, none of which are declared in the visible text. They look like the
    // remainder of the escape parser that parseCharacterClassAtom calls as
    // parseAtomEscape(loc, true) - presumably its header and first branches were lost.

    // Unicode property escape: a name, optionally followed by "=" and a value, then "}".
    // NOTE(review): the raw text is always built with a lowercase "p", even when "P{" was the
    // prefix that matched - confirm whether that is intended.
    if (this.match("p{", "P{")) {
      String name = this.readIdentifier();
      if (this.match("=")) {
        value = this.readIdentifier();
        raw = "\\p{" + name + "=" + value + "}";
      } else {
        value = null;
        raw = "\\p{" + name + "}";
      }
      this.expectRBrace();
      return this.finishTerm(new UnicodePropertyEscape(loc, name, value, raw));
    }
    // Offset one before the current position, used as the start of the octal-escape error below.
    int startpos = this.pos - 1;
    char c = this.nextChar();
    if (c >= '0' && c <= '9') {
      // Numeric escape: gather the first digit plus any digits that follow it.
      raw = c + this.readDigits(true);
      if (c == '0' || inCharClass) {
        // A leading zero followed by more digits is read as octal, anything else as decimal.
        int base = c == '0' && raw.length() > 1 ? 8 : 10;
        try {
          codepoint = Long.parseLong(raw, base);
          value = fromCodePoint((int) codepoint);
        } catch (NumberFormatException nfe) {
          // Digits that do not parse in the chosen base fall back to code point 0.
          codepoint = 0;
          value = "\0";
        }
        if (base == 8) {
          // The octal form is both reported as an error and returned as a term.
          this.error(Error.OCTAL_ESCAPE, startpos, this.pos);
          return this.finishTerm(new OctalEscape(loc, value, (double) codepoint, "\\" + raw));
        } else {
          return this.finishTerm(new DecimalEscape(loc, value, (double) codepoint, "\\" + raw));
        }
      } else {
        // Outside a character class, a number not starting with zero is a back reference.
        try {
          codepoint = Long.parseLong(raw, 10);
        } catch (NumberFormatException nfe) {
          codepoint = 0;
        }
        BackReference br = this.finishTerm(new BackReference(loc, (double) codepoint, "\\" + raw));
        // Remembered so that parse(...) can validate it once the whole pattern has been read.
        this.backrefs.add(br);
        return br;
      }
    }
    // Pairs of (escape letter, control character it denotes). A hit at an even index means c is
    // one of the letters; indexOf yields -1 on a miss and -1 % 2 is -1 in Java, so misses and
    // hits on the control characters themselves fall through.
    String ctrltab = "f\fn\nr\rt\tv\u000b";
    int idx;
    if ((idx = ctrltab.indexOf(c)) % 2 == 0) {
      codepoint = ctrltab.charAt(idx + 1);
      value = String.valueOf((char) codepoint);
      return this.finishTerm(new ControlEscape(loc, value, codepoint, "\\" + c));
    }
    if (c == 'c') {
      // Control letter: the following char should be an ASCII letter; its code modulo 32 is used
      // as the code point even when the error below has been recorded.
      c = this.nextChar();
      if (!(c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z'))
        this.error(Error.EXPECTED_CONTROL_LETTER, this.pos - 1);
      codepoint = c % 32;
      value = String.valueOf((char) codepoint);
      return this.finishTerm(new ControlLetter(loc, value, codepoint, "\\c" + c));
    }
    if ("dDsSwW".indexOf(c) >= 0) {
      return this.finishTerm(new CharacterClassEscape(loc, String.valueOf(c), "\\" + c));
    }
    // Anything else is an identity escape: the escaped character stands for itself.
    codepoint = c;
    value = String.valueOf((char) codepoint);
    return this.finishTerm(new IdentityEscape(loc, value, codepoint, "\\" + c));
  }

  /**
   * Parses a bracketed character class, with an optional leading {@code ^} marking inversion.
   * If the input ends before the closing bracket, an {@code Error.EXPECTED_RBRACKET} error is
   * recorded and the elements read so far are kept.
   */
  private RegExpTerm parseCharacterClass() {
    SourceLocation loc = new SourceLocation(pos());
    List elements = new ArrayList<>();
    // Skips the opening bracket if it is there; the result of the match is not checked.
    this.match("[");
    boolean inverted = this.match("^");
    while (!this.match("]")) {
      if (this.atEOS()) {
        this.error(Error.EXPECTED_RBRACKET);
        break;
      }
      elements.add(this.parseCharacterClassElement());
    }
    return this.finishTerm(new CharacterClass(loc, elements, inverted));
  }

  /**
   * Parses one element of a character class: a single atom, or a range of two atoms joined by
   * {@code -}.
   */
  private RegExpTerm parseCharacterClassElement() {
    SourceLocation loc = new SourceLocation(pos());
    RegExpTerm atom = this.parseCharacterClassAtom();
    // A dash directly before the closing bracket is left alone and becomes an atom of its own.
    if (!this.lookahead("-]") && this.match("-"))
      return this.finishTerm(new CharacterClassRange(loc, atom, this.parseCharacterClassAtom()));
    return atom;
  }

  /**
   * Parses one atom inside a character class: an escape introduced by a backslash, or otherwise
   * the next character as a {@code Constant}.
   */
  private RegExpTerm parseCharacterClassAtom() {
    SourceLocation loc = new SourceLocation(pos());
    char c = this.nextChar();
    if (c == '\\') {
      // Here a backslash followed by b yields a ControlEscape for the backspace character
      // (code 8) rather than a word boundary.
      if (this.match("b")) return this.finishTerm(new ControlEscape(loc, "\b", 8, "\\b"));
      return this.finishTerm(this.parseAtomEscape(loc, true));
    }
    return this.finishTerm(new Constant(loc, String.valueOf(c)));
  }
}