Add support for '\q{}' escape sequence in regular expressions.

This commit is contained in:
Napalys
2025-02-28 18:51:24 +01:00
parent d162acf02c
commit ed418be97a
5 changed files with 203 additions and 267 deletions

View File

@@ -0,0 +1,25 @@
package com.semmle.js.ast.regexp;
import com.semmle.js.ast.SourceLocation;
/**
 * A '\q{}' escape sequence in a regular expression, which is an extension to
 * standard regular expression syntax.
 *
 * <p>The node wraps exactly one child term, supplied at construction time.
 */
public class CharacterClassQuotedString extends RegExpTerm {
  /** The term wrapped by this '\q{}' escape sequence. */
  private final RegExpTerm term;

  /**
   * Creates a new quoted-string node.
   *
   * @param loc the source location of this term
   * @param term the term wrapped by this node
   */
  public CharacterClassQuotedString(SourceLocation loc, RegExpTerm term) {
    super(loc, "CharacterClassQuotedString");
    this.term = term;
  }

  /** Returns the term that was passed to the constructor. */
  public RegExpTerm getTerm() {
    return this.term;
  }

  /** Hands this node to the visitor's overload for {@code CharacterClassQuotedString}. */
  @Override
  public void accept(Visitor v) {
    v.visit(this);
  }
}

View File

@@ -61,4 +61,6 @@ public interface Visitor {
public void visit(ZeroWidthNegativeLookbehind nd);
public void visit(UnicodePropertyEscape nd);
/** Visits a {@link CharacterClassQuotedString} node, i.e. a '\q{}' escape sequence. */
public void visit(CharacterClassQuotedString nd);
}

View File

@@ -10,6 +10,7 @@ import com.semmle.js.ast.regexp.BackReference;
import com.semmle.js.ast.regexp.Caret;
import com.semmle.js.ast.regexp.CharacterClass;
import com.semmle.js.ast.regexp.CharacterClassEscape;
import com.semmle.js.ast.regexp.CharacterClassQuotedString;
import com.semmle.js.ast.regexp.CharacterClassRange;
import com.semmle.js.ast.regexp.Constant;
import com.semmle.js.ast.regexp.ControlEscape;
@@ -92,6 +93,7 @@ public class RegExpExtractor {
termkinds.put("ZeroWidthPositiveLookbehind", 25);
termkinds.put("ZeroWidthNegativeLookbehind", 26);
termkinds.put("UnicodePropertyEscape", 27);
termkinds.put("CharacterClassQuotedString", 28);
}
private static final String[] errmsgs =
@@ -344,6 +346,12 @@ public class RegExpExtractor {
visit(nd.getLeft(), lbl, 0);
visit(nd.getRight(), lbl, 1);
}
@Override
public void visit(CharacterClassQuotedString nd) {
  // Extract the '\q{}' term itself at the current parent/index position ...
  final Label quotedStringLabel = extractTerm(nd, parent, idx);
  // ... then visit the term it wraps as child 0 of the resulting label.
  visit(nd.getTerm(), quotedStringLabel, 0);
}
}
public void extract(String src, SourceMap sourceMap, Node parent, boolean isSpeculativeParsing, String flags) {

View File

@@ -6,6 +6,7 @@ import com.semmle.js.ast.regexp.BackReference;
import com.semmle.js.ast.regexp.Caret;
import com.semmle.js.ast.regexp.CharacterClass;
import com.semmle.js.ast.regexp.CharacterClassEscape;
import com.semmle.js.ast.regexp.CharacterClassQuotedString;
import com.semmle.js.ast.regexp.CharacterClassRange;
import com.semmle.js.ast.regexp.Constant;
import com.semmle.js.ast.regexp.ControlEscape;
@@ -283,6 +284,45 @@ public class RegExpParser {
return this.finishTerm(this.parseQuantifierOpt(loc, this.parseAtom()));
}
/**
 * Parses the content of a '\q{}' escape sequence as one or more alternatives
 * separated by {@code |}.
 *
 * @return the single alternative when no {@code |} was matched; otherwise a
 *     {@link Disjunction} over all parsed alternatives
 */
private RegExpTerm parseDisjunctionInsideQuotedString() {
  // Capture the start position before any input is consumed.
  SourceLocation start = new SourceLocation(pos());
  List<RegExpTerm> alternatives = new ArrayList<>();
  // There is always at least one alternative; each matched `|` introduces another.
  do {
    alternatives.add(this.parseAlternativeInsideQuotedString());
  } while (this.match("|"));
  if (alternatives.size() > 1) {
    return this.finishTerm(new Disjunction(start, alternatives));
  }
  // A lone alternative is returned as-is, without a Disjunction wrapper.
  return alternatives.get(0);
}
/**
 * Parses a single alternative inside a '\q{}' escape sequence.
 *
 * <p>Characters are consumed verbatim (backslashes included) until an unescaped
 * {@code |} or {@code }} is ahead. Reaching the end of the input first reports
 * {@code Error.UNEXPECTED_EOS} and stops.
 *
 * @return a {@link Constant} whose value and source text are the consumed characters
 */
private RegExpTerm parseAlternativeInsideQuotedString() {
  SourceLocation loc = new SourceLocation(pos());
  StringBuilder text = new StringBuilder();
  // True when the previously consumed character was a backslash that was not itself escaped.
  boolean afterBackslash = false;
  boolean done = false;
  while (!done) {
    if (this.atEOS()) {
      // Running out of input before the closing `}` is an error.
      this.error(Error.UNEXPECTED_EOS);
      done = true;
    } else if (!afterBackslash && this.lookahead(null, "|", "}")) {
      // An unescaped `|` starts the next alternative; an unescaped `}` closes the quoted string.
      done = true;
    } else {
      char ch = this.nextChar();
      text.append(ch);
      // A backslash escapes the next character only if it was not escaped itself.
      afterBackslash = (ch == '\\') && !afterBackslash;
    }
  }
  String literal = text.toString();
  loc.setEnd(pos());
  loc.setSource(literal);
  return new Constant(loc, literal);
}
private RegExpTerm parseQuantifierOpt(SourceLocation loc, RegExpTerm atom) {
if (this.match("*")) return this.finishTerm(new Star(loc, atom, !this.match("?")));
if (this.match("+")) return this.finishTerm(new Plus(loc, atom, !this.match("?")));
@@ -427,6 +467,12 @@ public class RegExpParser {
return this.finishTerm(new NamedBackReference(loc, name, "\\k<" + name + ">"));
}
if (this.match("q{")) {
RegExpTerm term = parseDisjunctionInsideQuotedString();
this.expectRBrace();
return this.finishTerm(new CharacterClassQuotedString(loc, term));
}
if (this.match("p{", "P{")) {
String name = this.readIdentifier();
if (this.match("=")) {