Added support for character class union in regex processing

2026-02-12 21:21:16 +01:00 · 2025-03-03 08:26:48 +01:00
parent 1e05f327d6
commit fe6de2f672
6 changed files with 325 additions and 257 deletions
--- a/javascript/extractor/src/com/semmle/js/ast/regexp/CharacterClassUnion.java
+++ b/javascript/extractor/src/com/semmle/js/ast/regexp/CharacterClassUnion.java
@@ -0,0 +1,22 @@
+package com.semmle.js.ast.regexp;
+
+import com.semmle.js.ast.SourceLocation;
+import java.util.List;
+
+public class CharacterClassUnion extends RegExpTerm { // regexp AST node holding the list of terms that form a character class union
+    private final List<RegExpTerm> union; // member terms; the constructor's list is stored as-is (no defensive copy)
+
+    public CharacterClassUnion(SourceLocation loc, List<RegExpTerm> union) { // loc: source location forwarded to RegExpTerm; union: member terms
+        super(loc, "CharacterClassUnion"); // NOTE(review): second argument looks like the node's type name - confirm against RegExpTerm
+        this.union = union;
+    }
+
+    @Override
+    public void accept(Visitor v) { // hands this node to the visitor's CharacterClassUnion overload
+        v.visit(this);
+    }
+
+    public List<RegExpTerm> getUnion() { // returns the same list instance that was given to the constructor
+        return union;
+    }
+}
--- a/javascript/extractor/src/com/semmle/js/ast/regexp/Visitor.java
+++ b/javascript/extractor/src/com/semmle/js/ast/regexp/Visitor.java
@@ -67,4 +67,6 @@ public interface Visitor {
  public void visit(CharacterClassIntersection nd);

  public void visit(CharacterClassSubtraction nd);
+
+  public void visit(CharacterClassUnion nd);
 }
--- a/javascript/extractor/src/com/semmle/js/extractor/RegExpExtractor.java
+++ b/javascript/extractor/src/com/semmle/js/extractor/RegExpExtractor.java
@@ -13,6 +13,7 @@ import com.semmle.js.ast.regexp.CharacterClassEscape;
 import com.semmle.js.ast.regexp.CharacterClassQuotedString;
 import com.semmle.js.ast.regexp.CharacterClassRange;
 import com.semmle.js.ast.regexp.CharacterClassSubtraction;
+import com.semmle.js.ast.regexp.CharacterClassUnion;
 import com.semmle.js.ast.regexp.Constant;
 import com.semmle.js.ast.regexp.ControlEscape;
 import com.semmle.js.ast.regexp.ControlLetter;
@@ -98,6 +99,7 @@ public class RegExpExtractor {
    termkinds.put("CharacterClassQuotedString", 28);
    termkinds.put("CharacterClassIntersection", 29);
    termkinds.put("CharacterClassSubtraction", 30);
+    termkinds.put("CharacterClassUnion", 31);
  }

  private static final String[] errmsgs =
@@ -372,6 +374,14 @@ public class RegExpExtractor {
      for (RegExpTerm element : nd.getSubtraction())
        visit(element, lbl, i++);
    }
+
+    @Override
+    public void visit(CharacterClassUnion nd) { // visitor case for a character class union node
+      Label lbl = extractTerm(nd, parent, idx); // presumably the label assigned to this node itself - confirm against extractTerm
+      int i = 0; // 0-based position of the next member within the union
+      for (RegExpTerm element : nd.getUnion()) // walk the members in list order
+        visit(element, lbl, i++); // pass each member on together with lbl and its position
+    }
  }

  public void extract(String src, SourceMap sourceMap, Node parent, boolean isSpeculativeParsing, String flags) {
--- a/javascript/extractor/src/com/semmle/js/parser/RegExpParser.java
+++ b/javascript/extractor/src/com/semmle/js/parser/RegExpParser.java
@@ -9,6 +9,7 @@ import com.semmle.js.ast.regexp.CharacterClassEscape;
 import com.semmle.js.ast.regexp.CharacterClassQuotedString;
 import com.semmle.js.ast.regexp.CharacterClassRange;
 import com.semmle.js.ast.regexp.CharacterClassSubtraction;
+import com.semmle.js.ast.regexp.CharacterClassUnion;
 import com.semmle.js.ast.regexp.Constant;
 import com.semmle.js.ast.regexp.ControlEscape;
 import com.semmle.js.ast.regexp.ControlLetter;
@@ -568,6 +569,7 @@ public class RegExpParser {
    STANDARD,
    INTERSECTION,
    SUBTRACTION,
+    UNION
  }

  // ECMA 2024 `v` flag allows nested character classes.
@@ -599,12 +601,26 @@ public class RegExpParser {
      }
    }

+    boolean containsComplex = elements.stream().anyMatch(term -> term instanceof UnicodePropertyEscape ||
+                                                        term instanceof CharacterClassQuotedString ||
+                                                        term instanceof CharacterClass);
+
+    // Set type to UNION only if:
+    // 1. We haven't already determined a specific type (intersection/subtraction)
+    // 2. We have more than one element
+    // 3. We have at least one complex element (i.e. a nested character class, a CharacterClassQuotedString, or a UnicodePropertyEscape)
+    if (containsComplex && classType == CharacterClassType.STANDARD && elements.size() > 1) {
+      classType = CharacterClassType.UNION;
+    }
+
    // Create appropriate RegExpTerm based on the detected class type
    switch (classType) {
      case INTERSECTION:
        return this.finishTerm(new CharacterClass(loc, Collections.singletonList(new CharacterClassIntersection(loc, elements)), inverted));
      case SUBTRACTION:
        return this.finishTerm(new CharacterClass(loc, Collections.singletonList(new CharacterClassSubtraction(loc, elements)), inverted));
+      case UNION:
+        return this.finishTerm(new CharacterClass(loc, Collections.singletonList(new CharacterClassUnion(loc, elements)), inverted));
      case STANDARD:
      default:
        return this.finishTerm(new CharacterClass(loc, elements, inverted));