extract regexp literals from string concatenations

This commit is contained in:
Erik Krogh Kristensen
2021-09-24 18:41:03 +02:00
parent 9478faf040
commit 12305aae42
16 changed files with 3460 additions and 149 deletions

View File

@@ -3,6 +3,8 @@ package com.semmle.js.extractor;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.Stack;
@@ -164,6 +166,9 @@ import com.semmle.util.locations.SourceMap;
import com.semmle.util.trap.TrapWriter;
import com.semmle.util.trap.TrapWriter.Label;
import com.semmle.util.files.FileLineOffsetCache;
/** Extractor for AST-based information; invoked by the {@link JSExtractor}. */
public class ASTExtractor {
private final TrapWriter trapwriter;
@@ -567,12 +572,17 @@ public class ASTExtractor {
String valueString = nd.getStringValue();
trapwriter.addTuple("literals", valueString, source, key);
Position start = nd.getLoc().getStart();
com.semmle.util.locations.Position startPos = new com.semmle.util.locations.Position(start.getLine(), start.getColumn(), start.getOffset());
if (nd.isRegExp()) {
OffsetTranslation offsets = new OffsetTranslation();
offsets.set(0, 1); // skip the initial '/'
regexpExtractor.extract(source.substring(1, source.lastIndexOf('/')), offsets, nd, false);
SourceMap sourceMap = SourceMap.legacyWithStartPos(SourceMap.fromString(nd.getRaw()).offsetBy(0, offsets), startPos);
regexpExtractor.extract(source.substring(1, source.lastIndexOf('/')), sourceMap, nd, false);
} else if (nd.isStringLiteral() && !c.isInsideType() && nd.getRaw().length() < 1000) {
regexpExtractor.extract(valueString, makeStringLiteralOffsets(nd.getRaw()), nd, true);
SourceMap sourceMap = SourceMap.legacyWithStartPos(SourceMap.fromString(nd.getRaw()).offsetBy(0, makeStringLiteralOffsets(nd.getRaw())), startPos);
regexpExtractor.extract(valueString, sourceMap, nd, true);
// Scan the string for template tags, if we're in a context where such tags are relevant.
if (scopeManager.isInTemplateFile()) {
@@ -593,6 +603,48 @@ public class ASTExtractor {
return '0' <= ch && ch <= '7';
}
/**
 * Computes the constant string denoted by an expression built solely from string literals
 * joined with the {@code +} operator.
 *
 * @param exp the expression to evaluate
 * @return the concatenated string value, or {@code null} if {@code exp} is neither a string
 *     literal nor a {@code +} expression whose operands both evaluate to such a string
 */
private String getStringConcatResult(Expression exp) {
  if (exp instanceof Literal) {
    Literal literal = (Literal) exp;
    return literal.isStringLiteral() ? literal.getStringValue() : null;
  }
  if (!(exp instanceof BinaryExpression)) {
    return null;
  }
  BinaryExpression concat = (BinaryExpression) exp;
  if (!concat.getOperator().equals("+")) {
    return null;
  }
  // Both operands are evaluated before either result is inspected.
  String lhs = getStringConcatResult(concat.getLeft());
  String rhs = getStringConcatResult(concat.getRight());
  if (lhs == null || rhs == null) {
    return null;
  }
  return lhs + rhs;
}
/**
 * Builds an offset translation for an expression made up of string literals joined with
 * {@code +}, by combining the per-literal translations produced by
 * {@link #makeStringLiteralOffsets}.
 *
 * <p>The set of accepted expressions matches {@link #getStringConcatResult}: any other
 * operator, or any operand that is not a string literal, yields {@code null}.
 *
 * @param exp the expression to compute a translation for
 * @return the combined translation, or {@code null} if {@code exp} is not a concatenation of
 *     string literals
 */
private OffsetTranslation computeStringConcatOffset(Expression exp) {
  if (exp instanceof Literal) {
    Literal lit = (Literal) exp;
    return lit.isStringLiteral() ? makeStringLiteralOffsets(lit.getRaw()) : null;
  }
  if (!(exp instanceof BinaryExpression)) {
    return null;
  }
  BinaryExpression be = (BinaryExpression) exp;
  // Only '+' concatenates; without this guard a non-'+' operand nested further down would
  // make getStringConcatResult return null below and trigger a NullPointerException.
  if (!"+".equals(be.getOperator())) {
    return null;
  }
  OffsetTranslation left = computeStringConcatOffset(be.getLeft());
  OffsetTranslation right = computeStringConcatOffset(be.getRight());
  if (left == null || right == null) {
    return null;
  }
  String leftValue = getStringConcatResult(be.getLeft());
  if (leftValue == null) {
    return null;
  }
  // Distance in the source text between the start of the right and the left operand.
  int delta =
      be.getRight().getLoc().getStart().getOffset()
          - be.getLeft().getLoc().getStart().getOffset();
  // Length of the left operand's string value, i.e. where the right operand's value begins
  // in the concatenated result.
  int offset = leftValue.length();
  return left.append(right, offset, delta);
}
/**
* Builds a translation from offsets in a string value back to its original raw literal text
* (including quotes).
@@ -786,11 +838,31 @@ public class ASTExtractor {
return key;
}
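// Operands of the binary expressions visited so far. An expression in this set is part of
// an enclosing binary expression and is therefore not separately extracted as a regexp.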
private Set<Expression> extractedAsRegexp = new HashSet<>();
/**
 * Extracts a binary expression and its two operands, then speculatively extracts the
 * expression as a regular expression if it is a concatenation of string literals.
 *
 * <p>Both operands are recorded in {@code extractedAsRegexp} before they are visited, so an
 * expression that is itself an operand of a binary expression stops at the membership check
 * below and is not extracted as a regexp separately.
 *
 * @param nd the binary expression to extract
 * @param c the extraction context, passed on to the superclass visit
 * @return the label obtained from the superclass visit of {@code nd}
 */
@Override
public Label visit(BinaryExpression nd, Context c) {
  Label key = super.visit(nd, c);

  extractedAsRegexp.add(nd.getLeft());
  extractedAsRegexp.add(nd.getRight());
  visit(nd.getLeft(), key, 0);
  visit(nd.getRight(), key, 1);

  // The concatenation is only evaluated for expressions not already recorded as an operand.
  boolean isOperandOfVisitedExpression = extractedAsRegexp.contains(nd);
  String concatenated = isOperandOfVisitedExpression ? null : getStringConcatResult(nd);
  if (concatenated == null) {
    return key;
  }

  // Strings longer than 1000 characters are skipped unless they are empty after trimming.
  boolean skipped = concatenated.length() > 1000 && !concatenated.trim().isEmpty();
  if (!skipped) {
    OffsetTranslation translation = computeStringConcatOffset(nd);
    Position nodeStart = nd.getLoc().getStart();
    com.semmle.util.locations.Position mapStart =
        new com.semmle.util.locations.Position(
            nodeStart.getLine(), nodeStart.getColumn(), nodeStart.getOffset());
    SourceMap concatSourceMap =
        SourceMap.legacyWithStartPos(
            SourceMap.fromString(nd.getLoc().getSource()).offsetBy(0, translation), mapStart);
    // 'true' selects speculative parsing.
    regexpExtractor.extract(concatenated, concatSourceMap, nd, true);
  }
  return key;
}

View File

@@ -43,7 +43,7 @@ public class Main {
* A version identifier that should be updated every time the extractor changes in such a way that
* it may produce different tuples for the same file under the same {@link ExtractorConfig}.
*/
public static final String EXTRACTOR_VERSION = "2021-10-25";
public static final String EXTRACTOR_VERSION = "2021-10-28";
public static final Pattern NEWLINE = Pattern.compile("\n");

View File

@@ -43,7 +43,7 @@ import com.semmle.js.ast.regexp.ZeroWidthPositiveLookahead;
import com.semmle.js.ast.regexp.ZeroWidthPositiveLookbehind;
import com.semmle.js.parser.RegExpParser;
import com.semmle.js.parser.RegExpParser.Result;
import com.semmle.util.locations.OffsetTranslation;
import com.semmle.util.locations.SourceMap;
import com.semmle.util.trap.TrapWriter;
import com.semmle.util.trap.TrapWriter.Label;
@@ -52,8 +52,7 @@ public class RegExpExtractor {
private final TrapWriter trapwriter;
private final LocationManager locationManager;
private final RegExpParser parser = new RegExpParser();
private Position literalStart;
private OffsetTranslation offsets;
private SourceMap sourceMap;
public RegExpExtractor(TrapWriter trapwriter, LocationManager locationManager) {
this.trapwriter = trapwriter;
@@ -122,17 +121,14 @@ public class RegExpExtractor {
}
/**
 * Emits a snippet location for a source element of the regular expression currently being
 * extracted.
 *
 * <p>The start and end columns of {@code term} are looked up in the current
 * {@code sourceMap}, and the resulting line and column values are passed to the location
 * manager. Only the source-map lookup is used; the earlier single-line computation based on
 * {@code literalStart} and {@code offsets} declared the same local variables a second time.
 *
 * @param term the element whose location is emitted
 * @param lbl the label the location is attached to
 */
public void emitLocation(SourceElement term, Label lbl) {
  int startColumn = term.getLoc().getStart().getColumn();
  int endColumn = term.getLoc().getEnd().getColumn();
  int sl = sourceMap.getStart(startColumn).getLine();
  int sc = sourceMap.getStart(startColumn).getColumn() + 1; // convert to 1-based
  int el = sourceMap.getEnd(endColumn).getLine();
  int ec = sourceMap.getEnd(endColumn).getColumn() - 1; // convert to inclusive
  locationManager.emitSnippetLocation(lbl, sl, sc, el, ec);
}
private class V implements Visitor {
private Label parent;
private int idx;
@@ -348,16 +344,13 @@ public class RegExpExtractor {
}
}
public void extract(
String src, OffsetTranslation offsets, Node parent, boolean isSpeculativeParsing) {
public void extract(String src, SourceMap sourceMap, Node parent, boolean isSpeculativeParsing) {
Result res = parser.parse(src);
if (isSpeculativeParsing && res.getErrors().size() > 0) {
return;
}
this.literalStart = parent.getLoc().getStart();
this.offsets = offsets;
this.sourceMap = sourceMap;
RegExpTerm ast = res.getAST();
new V().visit(ast, trapwriter.localID(parent), 0);