Merge pull request #2858 from github/dbartol/long-strings

Use streaming when creating log symbols file.
This commit is contained in:
Dave Bartolomeo
2023-09-27 09:55:25 -04:00
committed by GitHub
4 changed files with 266 additions and 158 deletions

View File

@@ -6,7 +6,6 @@ import { dirname, join, delimiter } from "path";
import * as sarif from "sarif";
import { SemVer } from "semver";
import { Readable } from "stream";
import { StringDecoder } from "string_decoder";
import tk from "tree-kill";
import { promisify } from "util";
import { CancellationToken, Disposable, Uri } from "vscode";
@@ -31,6 +30,7 @@ import { CompilationMessage } from "../query-server/legacy-messages";
import { sarifParser } from "../common/sarif-parser";
import { App } from "../common/app";
import { QueryLanguage } from "../common/query-language";
import { LINE_ENDINGS, splitStreamAtSeparators } from "../common/split-stream";
/**
* The version of the SARIF format that we are using.
@@ -1649,120 +1649,13 @@ export async function runCodeQlCliCommand(
}
}
/**
* Buffer to hold state used when splitting a text stream into lines.
*/
class SplitBuffer {
private readonly decoder = new StringDecoder("utf8");
private readonly maxSeparatorLength: number;
private buffer = "";
private searchIndex = 0;
constructor(private readonly separators: readonly string[]) {
this.maxSeparatorLength = separators
.map((s) => s.length)
.reduce((a, b) => Math.max(a, b), 0);
}
/**
* Append new text data to the buffer.
* @param chunk The chunk of data to append.
*/
public addChunk(chunk: Buffer): void {
this.buffer += this.decoder.write(chunk);
}
/**
* Signal that the end of the input stream has been reached.
*/
public end(): void {
this.buffer += this.decoder.end();
this.buffer += this.separators[0]; // Append a separator to the end to ensure the last line is returned.
}
/**
* A version of startsWith that isn't overriden by a broken version of ms-python.
*
* The definition comes from
* https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/startsWith
* which is CC0/public domain
*
* See https://github.com/github/vscode-codeql/issues/802 for more context as to why we need it.
*/
private static startsWith(
s: string,
searchString: string,
position: number,
): boolean {
const pos = position > 0 ? position | 0 : 0;
return s.substring(pos, pos + searchString.length) === searchString;
}
/**
* Extract the next full line from the buffer, if one is available.
* @returns The text of the next available full line (without the separator), or `undefined` if no
* line is available.
*/
public getNextLine(): string | undefined {
while (this.searchIndex <= this.buffer.length - this.maxSeparatorLength) {
for (const separator of this.separators) {
if (SplitBuffer.startsWith(this.buffer, separator, this.searchIndex)) {
const line = this.buffer.slice(0, this.searchIndex);
this.buffer = this.buffer.slice(this.searchIndex + separator.length);
this.searchIndex = 0;
return line;
}
}
this.searchIndex++;
}
return undefined;
}
}
/**
* Splits a text stream into lines based on a list of valid line separators.
* @param stream The text stream to split. This stream will be fully consumed.
* @param separators The list of strings that act as line separators.
* @returns A sequence of lines (not including separators).
*/
async function* splitStreamAtSeparators(
stream: Readable,
separators: string[],
): AsyncGenerator<string, void, unknown> {
const buffer = new SplitBuffer(separators);
for await (const chunk of stream) {
buffer.addChunk(chunk);
let line: string | undefined;
do {
line = buffer.getNextLine();
if (line !== undefined) {
yield line;
}
} while (line !== undefined);
}
buffer.end();
let line: string | undefined;
do {
line = buffer.getNextLine();
if (line !== undefined) {
yield line;
}
} while (line !== undefined);
}
/**
* Standard line endings for splitting human-readable text.
*/
const lineEndings = ["\r\n", "\r", "\n"];
/**
* Log a text stream to a `Logger` interface.
* @param stream The stream to log.
* @param logger The logger that will consume the stream output.
*/
async function logStream(stream: Readable, logger: BaseLogger): Promise<void> {
for await (const line of splitStreamAtSeparators(stream, lineEndings)) {
for await (const line of splitStreamAtSeparators(stream, LINE_ENDINGS)) {
// Await the result of log here in order to ensure the logs are written in the correct order.
await logger.log(line);
}

View File

@@ -0,0 +1,125 @@
import { Readable } from "stream";
import { StringDecoder } from "string_decoder";
/**
* Buffer to hold state used when splitting a text stream into lines.
*/
export class SplitBuffer {
private readonly decoder = new StringDecoder("utf8");
private readonly maxSeparatorLength: number;
private buffer = "";
private searchIndex = 0;
private ended = false;
constructor(private readonly separators: readonly string[]) {
this.maxSeparatorLength = separators
.map((s) => s.length)
.reduce((a, b) => Math.max(a, b), 0);
}
/**
* Append new text data to the buffer.
* @param chunk The chunk of data to append.
*/
public addChunk(chunk: Buffer): void {
this.buffer += this.decoder.write(chunk);
}
/**
* Signal that the end of the input stream has been reached.
*/
public end(): void {
this.buffer += this.decoder.end();
this.ended = true;
}
/**
* A version of startsWith that isn't overriden by a broken version of ms-python.
*
* The definition comes from
* https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/startsWith
* which is CC0/public domain
*
* See https://github.com/github/vscode-codeql/issues/802 for more context as to why we need it.
*/
private static startsWith(
s: string,
searchString: string,
position: number,
): boolean {
const pos = position > 0 ? position | 0 : 0;
return s.substring(pos, pos + searchString.length) === searchString;
}
/**
* Extract the next full line from the buffer, if one is available.
* @returns The text of the next available full line (without the separator), or `undefined` if no
* line is available.
*/
public getNextLine(): string | undefined {
// If we haven't received all of the input yet, don't search too close to the end of the buffer,
// or we could match a separator that's split across two chunks. For example, we could see "\r"
// at the end of the buffer and match that, even though we were about to receive a "\n" right
// after it.
const maxSearchIndex = this.ended
? this.buffer.length - 1
: this.buffer.length - this.maxSeparatorLength;
while (this.searchIndex <= maxSearchIndex) {
for (const separator of this.separators) {
if (SplitBuffer.startsWith(this.buffer, separator, this.searchIndex)) {
const line = this.buffer.slice(0, this.searchIndex);
this.buffer = this.buffer.slice(this.searchIndex + separator.length);
this.searchIndex = 0;
return line;
}
}
this.searchIndex++;
}
if (this.ended && this.buffer.length > 0) {
// If we still have some text left in the buffer, return it as the last line.
const line = this.buffer;
this.buffer = "";
this.searchIndex = 0;
return line;
} else {
return undefined;
}
}
}
/**
* Splits a text stream into lines based on a list of valid line separators.
* @param stream The text stream to split. This stream will be fully consumed.
* @param separators The list of strings that act as line separators.
* @returns A sequence of lines (not including separators).
*/
export async function* splitStreamAtSeparators(
stream: Readable,
separators: string[],
): AsyncGenerator<string, void, unknown> {
const buffer = new SplitBuffer(separators);
for await (const chunk of stream) {
buffer.addChunk(chunk);
let line: string | undefined;
do {
line = buffer.getNextLine();
if (line !== undefined) {
yield line;
}
} while (line !== undefined);
}
buffer.end();
let line: string | undefined;
do {
line = buffer.getNextLine();
if (line !== undefined) {
yield line;
}
} while (line !== undefined);
}
/**
* Standard line endings for splitting human-readable text.
*/
export const LINE_ENDINGS = ["\r\n", "\r", "\n"];

View File

@@ -1,4 +1,5 @@
import { writeFile, promises } from "fs-extra";
import { createReadStream, writeFile } from "fs-extra";
import { LINE_ENDINGS, splitStreamAtSeparators } from "../common/split-stream";
/**
* Location information for a single pipeline invocation in the RA.
@@ -64,59 +65,64 @@ export async function generateSummarySymbolsFile(
async function generateSummarySymbols(
summaryPath: string,
): Promise<SummarySymbols> {
const summary = await promises.readFile(summaryPath, {
const stream = createReadStream(summaryPath, {
encoding: "utf-8",
});
const symbols: SummarySymbols = {
predicates: {},
};
try {
const lines = splitStreamAtSeparators(stream, LINE_ENDINGS);
const lines = summary.split(/\r?\n/);
let lineNumber = 0;
while (lineNumber < lines.length) {
const startLineNumber = lineNumber;
lineNumber++;
const startLine = lines[startLineNumber];
const nonRecursiveMatch = startLine.match(NON_RECURSIVE_TUPLE_COUNT_REGEXP);
let predicateName: string | undefined = undefined;
const symbols: SummarySymbols = {
predicates: {},
};
let lineNumber = 0;
let raStartLine = 0;
let iteration = 0;
if (nonRecursiveMatch) {
predicateName = nonRecursiveMatch.groups!.predicateName;
} else {
const recursiveMatch = startLine.match(RECURSIVE_TUPLE_COUNT_REGEXP);
if (recursiveMatch?.groups) {
predicateName = recursiveMatch.groups.predicateName;
iteration = parseInt(recursiveMatch.groups.iteration);
}
}
if (predicateName !== undefined) {
const raStartLine = lineNumber;
let raEndLine: number | undefined = undefined;
while (lineNumber < lines.length && raEndLine === undefined) {
const raLine = lines[lineNumber];
const returnMatch = raLine.match(RETURN_REGEXP);
let predicateName: string | undefined = undefined;
let startLine = 0;
for await (const line of lines) {
if (predicateName === undefined) {
// Looking for the start of the predicate.
const nonRecursiveMatch = line.match(NON_RECURSIVE_TUPLE_COUNT_REGEXP);
if (nonRecursiveMatch) {
iteration = 0;
predicateName = nonRecursiveMatch.groups!.predicateName;
} else {
const recursiveMatch = line.match(RECURSIVE_TUPLE_COUNT_REGEXP);
if (recursiveMatch?.groups) {
predicateName = recursiveMatch.groups.predicateName;
iteration = parseInt(recursiveMatch.groups.iteration);
}
}
if (predicateName !== undefined) {
startLine = lineNumber;
raStartLine = lineNumber + 1;
}
} else {
const returnMatch = line.match(RETURN_REGEXP);
if (returnMatch) {
raEndLine = lineNumber;
}
lineNumber++;
}
if (raEndLine !== undefined) {
let symbol = symbols.predicates[predicateName];
if (symbol === undefined) {
symbol = {
iterations: {},
let symbol = symbols.predicates[predicateName];
if (symbol === undefined) {
symbol = {
iterations: {},
};
symbols.predicates[predicateName] = symbol;
}
symbol.iterations[iteration] = {
startLine,
raStartLine,
raEndLine: lineNumber,
};
symbols.predicates[predicateName] = symbol;
}
symbol.iterations[iteration] = {
startLine: lineNumber,
raStartLine,
raEndLine,
};
}
}
}
return symbols;
predicateName = undefined;
}
}
lineNumber++;
}
return symbols;
} finally {
stream.close();
}
}

View File

@@ -0,0 +1,84 @@
import { LINE_ENDINGS, SplitBuffer } from "../../../src/common/split-stream";
interface Chunk {
chunk: string;
lines: string[];
}
function checkLines(
buffer: SplitBuffer,
expectedLinesForChunk: string[],
chunkIndex: number | "end",
): void {
expectedLinesForChunk.forEach((expectedLine, lineIndex) => {
const line = buffer.getNextLine();
const location = `[chunk ${chunkIndex}, line ${lineIndex}]: `;
expect(location + line).toEqual(location + expectedLine);
});
expect(buffer.getNextLine()).toBeUndefined();
}
function testSplitBuffer(chunks: Chunk[], endLines: string[]): void {
const buffer = new SplitBuffer(LINE_ENDINGS);
chunks.forEach((chunk, chunkIndex) => {
buffer.addChunk(Buffer.from(chunk.chunk, "utf-8"));
checkLines(buffer, chunk.lines, chunkIndex);
});
buffer.end();
checkLines(buffer, endLines, "end");
}
describe("split buffer", () => {
it("should handle a one-chunk string with no terminator", async () => {
// Won't return the line until we call `end()`.
testSplitBuffer([{ chunk: "some text", lines: [] }], ["some text"]);
});
it("should handle a one-chunk string with a one-byte terminator", async () => {
// Won't return the line until we call `end()` because the actual terminator is shorter than the
// longest terminator.
testSplitBuffer([{ chunk: "some text\n", lines: [] }], ["some text"]);
});
it("should handle a one-chunk string with a two-byte terminator", async () => {
testSplitBuffer([{ chunk: "some text\r\n", lines: ["some text"] }], []);
});
it("should handle a multi-chunk string with terminators at the end of each chunk", async () => {
testSplitBuffer(
[
{ chunk: "first line\n", lines: [] }, // Waiting for second potential terminator byte
{ chunk: "second line\r", lines: ["first line"] }, // Waiting for second potential terminator byte
{ chunk: "third line\r\n", lines: ["second line", "third line"] }, // No wait, because we're at the end
],
[],
);
});
it("should handle a multi-chunk string with terminators at random offsets", async () => {
testSplitBuffer(
[
{ chunk: "first line\nsecond", lines: ["first line"] },
{
chunk: " line\rthird line",
lines: ["second line"],
},
{ chunk: "\r\n", lines: ["third line"] },
],
[],
);
});
it("should handle a terminator split between chunks", async () => {
testSplitBuffer(
[
{ chunk: "first line\r", lines: [] },
{
chunk: "\nsecond line",
lines: ["first line"],
},
],
["second line"],
);
});
});