JavaScript: switch to shared YamlPopulator

This commit is contained in:
Arthur Baars
2023-04-11 17:04:52 +01:00
parent 1caca21552
commit dcca0e0c6c

View File

@@ -1,284 +1,28 @@
package com.semmle.js.extractor;
import com.semmle.util.data.StringUtil;
import com.semmle.util.exception.CatastrophicError;
import com.semmle.util.exception.UserError;
import com.semmle.util.locations.LineTable;
import com.semmle.util.trap.TrapWriter;
import com.semmle.util.trap.TrapWriter.Label;
import com.semmle.util.trap.TrapWriter.Table;
import java.util.Collections;
import org.yaml.snakeyaml.composer.Composer;
import org.yaml.snakeyaml.error.Mark;
import org.yaml.snakeyaml.error.MarkedYAMLException;
import org.yaml.snakeyaml.events.AliasEvent;
import org.yaml.snakeyaml.events.Event;
import org.yaml.snakeyaml.events.MappingStartEvent;
import org.yaml.snakeyaml.events.NodeEvent;
import org.yaml.snakeyaml.events.ScalarEvent;
import org.yaml.snakeyaml.events.SequenceStartEvent;
import org.yaml.snakeyaml.nodes.NodeId;
import org.yaml.snakeyaml.parser.Parser;
import org.yaml.snakeyaml.parser.ParserImpl;
import org.yaml.snakeyaml.reader.ReaderException;
import org.yaml.snakeyaml.reader.StreamReader;
import org.yaml.snakeyaml.resolver.Resolver;
import com.semmle.extractor.yaml.YamlPopulator;
/**
* Extractor for populating YAML files.
*
* <p>The extractor uses <a href="http://www.snakeyaml.org/">SnakeYAML</a> to parse YAML.
* <p>
* The extractor uses <a href="http://www.snakeyaml.org/">SnakeYAML</a> to parse
* YAML.
*/
public class YAMLExtractor implements IExtractor {
/** The tables constituting the YAML dbscheme. */
private static enum YAMLTables implements Table {
YAML(6), // yaml (id: @yaml_node, kind: int ref, parent: @yaml_node_parent ref,
// idx: int ref, tag: string ref, tostring: string ref)
YAML_ANCHORS(2), // yaml_anchors (node: @yaml_node ref, anchor: string ref)
YAML_ALIASES(2), // yaml_aliases (alias: @yaml_alias_node ref, target: string ref)
YAML_SCALARS(
3), // yaml_scalars (scalar: @yaml_scalar_node ref, style: int ref, value: string ref)
YAML_ERRORS(2); // yaml_errors (id: @yaml_error, message: string ref)
private final int arity;
private YAMLTables(int arity) {
this.arity = arity;
}
@Override
public String getName() {
return StringUtil.lc(name());
}
@Override
public int getArity() {
return arity;
}
@Override
public boolean validate(Object... values) {
return true;
}
}
/*
* case @yaml_node.kind of
* 0 = @yaml_scalar_node
* | 1 = @yaml_mapping_node
* | 2 = @yaml_sequence_node
* | 3 = @yaml_alias_node
*/
private static enum NodeKind {
SCALAR,
MAPPING,
SEQUENCE,
ALIAS
};
private final boolean tolerateParseErrors;
private TextualExtractor textualExtractor;
private LocationManager locationManager;
private TrapWriter trapWriter;
private LineTable lineTable;
/**
* The underlying SnakeYAML parser; we use the relatively low-level {@linkplain Parser} instead of
* the more high-level {@linkplain Composer}, since our dbscheme represents YAML documents in AST
* form, with aliases left unresolved.
*/
private Parser parser;
/** The resolver used for resolving type tags. */
private Resolver resolver;
public YAMLExtractor(ExtractorConfig config) {
this.tolerateParseErrors = config.isTolerateParseErrors();
}
private LineTable getLineTable() {
if (lineTable == null) {
lineTable = new LineTable(this.textualExtractor.getSource());
}
return lineTable;
}
@Override
public ParseResultInfo extract(TextualExtractor textualExtractor) {
this.textualExtractor = textualExtractor;
locationManager = textualExtractor.getLocationManager();
trapWriter = textualExtractor.getTrapwriter();
Label fileLabel = locationManager.getFileLabel();
locationManager.setHasLocationTable("yaml_locations");
try {
parser = new ParserImpl(new StreamReader(textualExtractor.getSource()));
resolver = new Resolver();
int idx = 0;
while (!atStreamEnd())
extractDocument(fileLabel, idx++, textualExtractor.getSource().codePoints().toArray());
} catch (MarkedYAMLException e) {
int line = e.getProblemMark().getLine() + 1;
int column = e.getProblemMark().getColumn() + 1;
if (!this.tolerateParseErrors)
throw new UserError(e.getProblem() + ": " + line + ":" + column);
Label lbl = trapWriter.freshLabel();
trapWriter.addTuple(YAMLTables.YAML_ERRORS, lbl, e.getProblem());
locationManager.emitSnippetLocation(lbl, line, column, line, column);
} catch (ReaderException e) {
if (!this.tolerateParseErrors) throw new UserError(e.toString());
int c = e.getCodePoint();
String s = String.valueOf(Character.toChars(c));
trapWriter.addTuple(
YAMLTables.YAML_ERRORS,
trapWriter.freshLabel(),
"Unexpected character " + s + "(" + c + ")");
// unfortunately, SnakeYAML does not provide structured location information for
// ReaderExceptions
}
new YamlPopulator(textualExtractor.getExtractedFile(), textualExtractor.getSource(),
textualExtractor.getTrapwriter(),
this.tolerateParseErrors).extract();
return new ParseResultInfo(0, 0, Collections.emptyList());
}
/** Check whether the parser has encountered the end of the YAML input stream. */
private boolean atStreamEnd() {
if (parser.checkEvent(Event.ID.StreamStart)) parser.getEvent();
return parser.checkEvent(Event.ID.StreamEnd);
}
/** Extract a complete YAML document; cf. {@link Composer#getNode}. */
private void extractDocument(Label parent, int idx, int[] codepoints) {
// Drop the DOCUMENT-START event
parser.getEvent();
extractNode(parent, idx, codepoints);
// Drop the DOCUMENT-END event
parser.getEvent();
}
/** Extract a single YAML node; cf. {@link Composer#composeNode}. */
private void extractNode(Label parent, int idx, int[] codepoints) {
Label label = trapWriter.freshLabel();
NodeKind kind;
String tag = "";
Event start = parser.getEvent(), end = start;
if (start.is(Event.ID.Alias)) {
kind = NodeKind.ALIAS;
trapWriter.addTuple(YAMLTables.YAML_ALIASES, label, ((AliasEvent) start).getAnchor());
} else {
String anchor = start instanceof NodeEvent ? ((NodeEvent) start).getAnchor() : null;
if (anchor != null) trapWriter.addTuple(YAMLTables.YAML_ANCHORS, label, anchor);
if (start.is(Event.ID.Scalar)) {
kind = NodeKind.SCALAR;
ScalarEvent scalar = (ScalarEvent) start;
tag =
getTag(
scalar.getTag(),
NodeId.scalar,
scalar.getValue(),
scalar.getImplicit().canOmitTagInPlainScalar());
Character style = scalar.getStyle();
int styleCode = style == null ? 0 : (int) style;
trapWriter.addTuple(YAMLTables.YAML_SCALARS, label, styleCode, scalar.getValue());
} else if (start.is(Event.ID.SequenceStart)) {
kind = NodeKind.SEQUENCE;
SequenceStartEvent sequenceStart = (SequenceStartEvent) start;
tag = getTag(sequenceStart.getTag(), NodeId.sequence, null, sequenceStart.getImplicit());
int childIdx = 0;
while (!parser.checkEvent(Event.ID.SequenceEnd)) extractNode(label, childIdx++, codepoints);
end = parser.getEvent();
} else if (start.is(Event.ID.MappingStart)) {
kind = NodeKind.MAPPING;
MappingStartEvent mappingStart = (MappingStartEvent) start;
tag = getTag(mappingStart.getTag(), NodeId.mapping, null, mappingStart.getImplicit());
int childIdx = 1;
while (!parser.checkEvent(Event.ID.MappingEnd)) {
extractNode(label, childIdx, codepoints);
extractNode(label, -childIdx, codepoints);
++childIdx;
}
end = parser.getEvent();
} else {
throw new CatastrophicError("Unexpected YAML parser event: " + start);
}
}
trapWriter.addTuple(
YAMLTables.YAML,
label,
kind.ordinal(),
parent,
idx,
tag,
mkToString(start.getStartMark(), end.getEndMark(), codepoints));
extractLocation(label, start.getStartMark(), end.getEndMark());
}
/** Determine the type tag of a node. */
private String getTag(String explicitTag, NodeId kind, String value, boolean implicit) {
if (explicitTag == null || "!".equals(explicitTag))
return resolver.resolve(kind, value, implicit).getValue();
return explicitTag;
}
private static boolean isNewLine(int codePoint) {
switch (codePoint) {
case '\n':
case '\r':
case '\u0085':
case '\u2028':
case '\u2029':
return true;
default:
return false;
}
}
/**
* SnakeYAML doesn't directly expose the source text of nodes, but we also take the file contents
* as an array of Unicode code points. The start and end marks each contain an index into the code
* point stream (the end is exclusive), so we can reconstruct the snippet. For readability, we
* stop at the first encountered newline.
*/
private static String mkToString(Mark startMark, Mark endMark, int[] codepoints) {
StringBuilder b = new StringBuilder();
for (int i = startMark.getIndex(); i < endMark.getIndex() && !isNewLine(codepoints[i]); i++)
b.appendCodePoint(codepoints[i]);
return TextualExtractor.sanitiseToString(b.toString());
}
/** Emit a source location for a YAML node. */
private void extractLocation(Label label, Mark startMark, Mark endMark) {
int startLine, startColumn, endLine, endColumn;
// SnakeYAML uses 0-based indexing for both lines and columns, so need to +1
startLine = startMark.getLine() + 1;
startColumn = startMark.getColumn() + 1;
// SnakeYAML's end positions are exclusive, so only need to +1 for the line
endLine = endMark.getLine() + 1;
endColumn = endMark.getColumn();
// Avoid emitting column zero for non-empty locations
if (endColumn == 0 && !(startLine == endLine && startColumn == endColumn)) {
String source = textualExtractor.getSource();
int offset = getLineTable().getOffsetFromPoint(endMark.getLine(), endMark.getColumn()) - 1;
while (offset > 0 && isNewLine((int)source.charAt(offset))) {
--offset;
}
com.semmle.util.locations.Position adjustedEndPos = getLineTable().getEndPositionFromOffset(offset);
endLine = adjustedEndPos.getLine();
endColumn = adjustedEndPos.getColumn();
}
locationManager.emitSnippetLocation(label, startLine, startColumn, endLine, endColumn);
}
}