From eb77b86b03a2cd61cd19e4b2c02c6bd5f0e0fc3d Mon Sep 17 00:00:00 2001 From: Pavel Avgustinov Date: Tue, 13 Aug 2019 10:24:39 +0100 Subject: [PATCH] YAML extraction: Improve toString computation --- .../semmle/js/extractor/YAMLExtractor.java | 68 +++++++++---------- 1 file changed, 32 insertions(+), 36 deletions(-) diff --git a/javascript/extractor/src/com/semmle/js/extractor/YAMLExtractor.java b/javascript/extractor/src/com/semmle/js/extractor/YAMLExtractor.java index c9a9aacce9d..e1966e0a98e 100644 --- a/javascript/extractor/src/com/semmle/js/extractor/YAMLExtractor.java +++ b/javascript/extractor/src/com/semmle/js/extractor/YAMLExtractor.java @@ -103,9 +103,9 @@ public class YAMLExtractor implements IExtractor { try { parser = new ParserImpl(new StreamReader(textualExtractor.getSource())); resolver = new Resolver(); - int idx = 0; - while (!atStreamEnd()) extractDocument(fileLabel, idx++); + while (!atStreamEnd()) + extractDocument(fileLabel, idx++, textualExtractor.getSource().codePoints().toArray()); } catch (MarkedYAMLException e) { int line = e.getProblemMark().getLine() + 1; int column = e.getProblemMark().getColumn() + 1; @@ -136,16 +136,16 @@ public class YAMLExtractor implements IExtractor { } /** Extract a complete YAML document; cf. {@link Composer#composeDocument}. */ - private void extractDocument(Label parent, int idx) { + private void extractDocument(Label parent, int idx, int[] codepoints) { // Drop the DOCUMENT-START event parser.getEvent(); - extractNode(parent, idx); + extractNode(parent, idx, codepoints); // Drop the DOCUMENT-END event parser.getEvent(); } /** Extract a single YAML node; cf. {@link Composer#composeNode}. */ - private void extractNode(Label parent, int idx) { + private void extractNode(Label parent, int idx, int[] codepoints) { Label label = trapWriter.freshLabel(); NodeKind kind; String tag = ""; @@ -169,15 +169,14 @@ public class YAMLExtractor implements IExtractor { scalar.getImplicit().canOmitTagInPlainScalar()); Character style = scalar.getStyle(); int styleCode = style == null ? 0 : (int) style; - trapWriter.addTuple( - YAMLTables.YAML_SCALARS, label, styleCode, scalar.getValue()); + trapWriter.addTuple(YAMLTables.YAML_SCALARS, label, styleCode, scalar.getValue()); } else if (start.is(Event.ID.SequenceStart)) { kind = NodeKind.SEQUENCE; SequenceStartEvent sequenceStart = (SequenceStartEvent) start; tag = getTag(sequenceStart.getTag(), NodeId.sequence, null, sequenceStart.getImplicit()); int childIdx = 0; - while (!parser.checkEvent(Event.ID.SequenceEnd)) extractNode(label, childIdx++); + while (!parser.checkEvent(Event.ID.SequenceEnd)) extractNode(label, childIdx++, codepoints); end = parser.getEvent(); } else if (start.is(Event.ID.MappingStart)) { @@ -187,8 +186,8 @@ public class YAMLExtractor implements IExtractor { int childIdx = 1; while (!parser.checkEvent(Event.ID.MappingEnd)) { - extractNode(label, childIdx); - extractNode(label, -childIdx); + extractNode(label, childIdx, codepoints); + extractNode(label, -childIdx, codepoints); ++childIdx; } @@ -205,7 +204,7 @@ public class YAMLExtractor implements IExtractor { parent, idx, tag, - mkToString(start.getStartMark(), end.getEndMark())); + mkToString(start.getStartMark(), end.getEndMark(), codepoints)); extractLocation(label, start.getStartMark(), end.getEndMark()); } @@ -216,33 +215,30 @@ public class YAMLExtractor implements IExtractor { return explicitTag; } + private static boolean isNewLine(int codePoint) { + switch (codePoint) { + case '\n': + case '\r': + case '\u0085': + case '\u2028': + case '\u2029': + return true; + default: + return false; + } + } + /** - * SnakeYAML doesn't directly expose the source text of nodes, but we can get a decent - * approximation from the snippet associated with the node's start {@linkplain Mark}. - * - *

The snippet of a {@linkplain Mark} is meant to be used for diagnostic messages and consists - * of two lines: the first line contains some context around the source position represented by - * the mark, the second line contains a caret character positioned underneath the source position - * itself. - * - *

To approximate the source text, we take the text on the first line and strip off the first - * n characters, where n is the number of spaces preceding the caret character on - * the second line. - * - *

This is only an approximation, since the context is limited to relatively short strings that - * never extend across newlines, but it suffices for the purposes of toString. + * SnakeYAML doesn't directly expose the source text of nodes, but we also take the file contents + * as an array of Unicode code points. The start and end marks each contain an index into the code + * point stream (the end is exclusive), so we can reconstruct the snippet. For readability, we + * stop at the first encountered newline. */ - private String mkToString(Mark startMark, Mark endMark) { - String snippet = startMark.get_snippet(0, Integer.MAX_VALUE); - int nl = snippet.indexOf('\n'); - String context = snippet.substring(0, nl); - String src = context.substring(snippet.substring(nl + 1).indexOf('^')); - int desiredStringLength = endMark.getColumn() - startMark.getColumn(); - boolean hasAccessToDesiredString = src.length() >= desiredStringLength; - boolean isSingleLine = endMark.getLine() == startMark.getLine(); - if (isSingleLine && hasAccessToDesiredString) - src = src.substring(0, desiredStringLength); - return TextualExtractor.sanitiseToString(src); + private static String mkToString(Mark startMark, Mark endMark, int[] codepoints) { + StringBuilder b = new StringBuilder(); + for (int i = startMark.getIndex(); i < endMark.getIndex() && !isNewLine(codepoints[i]); i++) + b.appendCodePoint(codepoints[i]); + return TextualExtractor.sanitiseToString(b.toString()); } /** Emit a source location for a YAML node. */