Added intersection support

This commit is contained in:
Napalys
2025-03-02 19:24:32 +01:00
parent fa5093f6ad
commit 381b5ebe8a
5 changed files with 250 additions and 226 deletions

View File

@@ -0,0 +1,22 @@
package com.semmle.js.ast.regexp;
import com.semmle.js.ast.SourceLocation;
import java.util.List;
/**
 * A regular expression term for an intersection inside a character class.
 *
 * <p>The node only stores the list of terms handed to its constructor; it does not copy or
 * validate that list.
 */
public class CharacterClassIntersection extends RegExpTerm {
  /** The terms taking part in the intersection, exactly as supplied by the creator. */
  private final List<RegExpTerm> intersections;

  /**
   * Creates an intersection node.
   *
   * @param loc the source location passed on to the {@code RegExpTerm} constructor
   * @param intersections the terms taking part in the intersection; the list is stored by
   *     reference
   */
  public CharacterClassIntersection(SourceLocation loc, List<RegExpTerm> intersections) {
    super(loc, "CharacterClassIntersection");
    this.intersections = intersections;
  }

  /**
   * Returns the terms taking part in this intersection.
   *
   * @return the same list instance that was passed to the constructor
   */
  public List<RegExpTerm> getIntersections() {
    return this.intersections;
  }

  /** Dispatches to {@code v.visit(this)}. */
  @Override
  public void accept(Visitor v) {
    v.visit(this);
  }
}

View File

@@ -63,4 +63,6 @@ public interface Visitor {
public void visit(UnicodePropertyEscape nd);
public void visit(CharacterClassQuotedString nd);
public void visit(CharacterClassIntersection nd);
}

View File

@@ -23,6 +23,7 @@ import com.semmle.js.ast.regexp.Error;
import com.semmle.js.ast.regexp.Group;
import com.semmle.js.ast.regexp.HexEscapeSequence;
import com.semmle.js.ast.regexp.IdentityEscape;
import com.semmle.js.ast.regexp.CharacterClassIntersection;
import com.semmle.js.ast.regexp.Literal;
import com.semmle.js.ast.regexp.NamedBackReference;
import com.semmle.js.ast.regexp.NonWordBoundary;
@@ -94,6 +95,7 @@ public class RegExpExtractor {
termkinds.put("ZeroWidthNegativeLookbehind", 26);
termkinds.put("UnicodePropertyEscape", 27);
termkinds.put("CharacterClassQuotedString", 28);
termkinds.put("CharacterClassIntersection", 29);
}
private static final String[] errmsgs =
@@ -352,6 +354,14 @@ public class RegExpExtractor {
Label lbl = extractTerm(nd, parent, idx);
visit(nd.getTerm(), lbl, 0);
}
/**
 * Extracts a character class intersection term and then visits each of its operands as a
 * child of that term, numbering the children consecutively from 0.
 */
@Override
public void visit(CharacterClassIntersection nd) {
  // Label for the intersection itself; every operand is visited with it as parent label.
  Label intersectionLabel = extractTerm(nd, parent, idx);
  int childIndex = 0;
  for (RegExpTerm operand : nd.getIntersections()) {
    visit(operand, intersectionLabel, childIndex);
    childIndex++;
  }
}
}
public void extract(String src, SourceMap sourceMap, Node parent, boolean isSpeculativeParsing, String flags) {

View File

@@ -19,6 +19,7 @@ import com.semmle.js.ast.regexp.Error;
import com.semmle.js.ast.regexp.Group;
import com.semmle.js.ast.regexp.HexEscapeSequence;
import com.semmle.js.ast.regexp.IdentityEscape;
import com.semmle.js.ast.regexp.CharacterClassIntersection;
import com.semmle.js.ast.regexp.NamedBackReference;
import com.semmle.js.ast.regexp.NonWordBoundary;
import com.semmle.js.ast.regexp.OctalEscape;
@@ -37,6 +38,7 @@ import com.semmle.js.ast.regexp.ZeroWidthPositiveLookahead;
import com.semmle.js.ast.regexp.ZeroWidthPositiveLookbehind;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/** A parser for ECMAScript 2018 regular expressions. */
@@ -561,10 +563,16 @@ public class RegExpParser {
return this.finishTerm(new CharacterClass(loc, elements, inverted));
}
/**
 * The kind of character class recognised while parsing a nested character class; it decides
 * which term the parser builds once the class has been read.
 */
private enum CharacterClassType {
// Initial value: the elements are put directly into a CharacterClass.
STANDARD,
// Chosen when "&&" is matched: the elements are wrapped in one CharacterClassIntersection.
INTERSECTION,
}
// The ECMAScript 2024 `v` flag allows nested character classes and `&&` intersections inside a character class.
private RegExpTerm parseNestedCharacterClass() {
SourceLocation loc = new SourceLocation(pos());
List<RegExpTerm> elements = new ArrayList<>();
CharacterClassType classType = CharacterClassType.STANDARD;
this.match("[");
boolean inverted = this.match("^");
@@ -576,11 +584,23 @@ public class RegExpParser {
if (lookahead("[")) {
elements.add(parseNestedCharacterClass());
}
else if (lookahead("&&")) {
this.match("&&");
classType = CharacterClassType.INTERSECTION;
}
else {
elements.add(this.parseCharacterClassElement());
}
}
return this.finishTerm(new CharacterClass(loc, elements, inverted));
// Create appropriate RegExpTerm based on the detected class type
switch (classType) {
case INTERSECTION:
return this.finishTerm(new CharacterClass(loc, Collections.singletonList(new CharacterClassIntersection(loc, elements)), inverted));
case STANDARD:
default:
return this.finishTerm(new CharacterClass(loc, elements, inverted));
}
}
private static final List<String> escapeClasses = Arrays.asList("d", "D", "s", "S", "w", "W");