Merge pull request #2858 from github/dbartol/long-strings

Use streaming when creating log symbols file.
2023-09-27 09:55:25 -04:00
parent b1debee244 3c63df2221
commit f1533dde2d
4 changed files with 266 additions and 158 deletions
--- a/extensions/ql-vscode/src/codeql-cli/cli.ts
+++ b/extensions/ql-vscode/src/codeql-cli/cli.ts
@@ -6,7 +6,6 @@ import { dirname, join, delimiter } from "path";
 import * as sarif from "sarif";
 import { SemVer } from "semver";
 import { Readable } from "stream";
-import { StringDecoder } from "string_decoder";
 import tk from "tree-kill";
 import { promisify } from "util";
 import { CancellationToken, Disposable, Uri } from "vscode";
@@ -31,6 +30,7 @@ import { CompilationMessage } from "../query-server/legacy-messages";
 import { sarifParser } from "../common/sarif-parser";
 import { App } from "../common/app";
 import { QueryLanguage } from "../common/query-language";
+import { LINE_ENDINGS, splitStreamAtSeparators } from "../common/split-stream";

 /**
 * The version of the SARIF format that we are using.
@@ -1649,120 +1649,13 @@ export async function runCodeQlCliCommand(
  }
 }

-/**
- * Buffer to hold state used when splitting a text stream into lines.
- */
-class SplitBuffer {
-  private readonly decoder = new StringDecoder("utf8");
-  private readonly maxSeparatorLength: number;
-  private buffer = "";
-  private searchIndex = 0;
-
-  constructor(private readonly separators: readonly string[]) {
-    this.maxSeparatorLength = separators
-      .map((s) => s.length)
-      .reduce((a, b) => Math.max(a, b), 0);
-  }
-
-  /**
-   * Append new text data to the buffer.
-   * @param chunk The chunk of data to append.
-   */
-  public addChunk(chunk: Buffer): void {
-    this.buffer += this.decoder.write(chunk);
-  }
-
-  /**
-   * Signal that the end of the input stream has been reached.
-   */
-  public end(): void {
-    this.buffer += this.decoder.end();
-    this.buffer += this.separators[0]; // Append a separator to the end to ensure the last line is returned.
-  }
-
-  /**
-   * A version of startsWith that isn't overriden by a broken version of ms-python.
-   *
-   * The definition comes from
-   * https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/startsWith
-   * which is CC0/public domain
-   *
-   * See https://github.com/github/vscode-codeql/issues/802 for more context as to why we need it.
-   */
-  private static startsWith(
-    s: string,
-    searchString: string,
-    position: number,
-  ): boolean {
-    const pos = position > 0 ? position | 0 : 0;
-    return s.substring(pos, pos + searchString.length) === searchString;
-  }
-
-  /**
-   * Extract the next full line from the buffer, if one is available.
-   * @returns The text of the next available full line (without the separator), or `undefined` if no
-   * line is available.
-   */
-  public getNextLine(): string | undefined {
-    while (this.searchIndex <= this.buffer.length - this.maxSeparatorLength) {
-      for (const separator of this.separators) {
-        if (SplitBuffer.startsWith(this.buffer, separator, this.searchIndex)) {
-          const line = this.buffer.slice(0, this.searchIndex);
-          this.buffer = this.buffer.slice(this.searchIndex + separator.length);
-          this.searchIndex = 0;
-          return line;
-        }
-      }
-      this.searchIndex++;
-    }
-
-    return undefined;
-  }
-}
-
-/**
- * Splits a text stream into lines based on a list of valid line separators.
- * @param stream The text stream to split. This stream will be fully consumed.
- * @param separators The list of strings that act as line separators.
- * @returns A sequence of lines (not including separators).
- */
-async function* splitStreamAtSeparators(
-  stream: Readable,
-  separators: string[],
-): AsyncGenerator<string, void, unknown> {
-  const buffer = new SplitBuffer(separators);
-  for await (const chunk of stream) {
-    buffer.addChunk(chunk);
-    let line: string | undefined;
-    do {
-      line = buffer.getNextLine();
-      if (line !== undefined) {
-        yield line;
-      }
-    } while (line !== undefined);
-  }
-  buffer.end();
-  let line: string | undefined;
-  do {
-    line = buffer.getNextLine();
-    if (line !== undefined) {
-      yield line;
-    }
-  } while (line !== undefined);
-}
-
-/**
- *  Standard line endings for splitting human-readable text.
- */
-const lineEndings = ["\r\n", "\r", "\n"];
-
 /**
 * Log a text stream to a `Logger` interface.
 * @param stream The stream to log.
 * @param logger The logger that will consume the stream output.
 */
 async function logStream(stream: Readable, logger: BaseLogger): Promise<void> {
-  for await (const line of splitStreamAtSeparators(stream, lineEndings)) {
+  for await (const line of splitStreamAtSeparators(stream, LINE_ENDINGS)) {
    // Await the result of log here in order to ensure the logs are written in the correct order.
    await logger.log(line);
  }
--- a/extensions/ql-vscode/src/common/split-stream.ts
+++ b/extensions/ql-vscode/src/common/split-stream.ts
@@ -0,0 +1,125 @@
+import { Readable } from "stream";
+import { StringDecoder } from "string_decoder";
+
+/**
+ * Buffer to hold state used when splitting a text stream into lines.
+ */
+export class SplitBuffer {
+  private readonly decoder = new StringDecoder("utf8");
+  private readonly maxSeparatorLength: number;
+  private buffer = "";
+  private searchIndex = 0;
+  private ended = false;
+
+  constructor(private readonly separators: readonly string[]) {
+    this.maxSeparatorLength = separators
+      .map((s) => s.length)
+      .reduce((a, b) => Math.max(a, b), 0);
+  }
+
+  /**
+   * Append new text data to the buffer.
+   * @param chunk The chunk of data to append.
+   */
+  public addChunk(chunk: Buffer): void {
+    this.buffer += this.decoder.write(chunk);
+  }
+
+  /**
+   * Signal that the end of the input stream has been reached.
+   */
+  public end(): void {
+    this.buffer += this.decoder.end();
+    this.ended = true;
+  }
+
+  /**
+   * A version of startsWith that isn't overridden by a broken version of ms-python.
+   *
+   * The definition comes from
+   * https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/startsWith
+   * which is CC0/public domain
+   *
+   * See https://github.com/github/vscode-codeql/issues/802 for more context as to why we need it.
+   */
+  private static startsWith(
+    s: string,
+    searchString: string,
+    position: number,
+  ): boolean {
+    const pos = position > 0 ? position | 0 : 0;
+    return s.substring(pos, pos + searchString.length) === searchString;
+  }
+
+  /**
+   * Extract the next full line from the buffer, if one is available.
+   * @returns The text of the next available full line (without the separator), or `undefined` if no
+   * line is available.
+   */
+  public getNextLine(): string | undefined {
+    // If we haven't received all of the input yet, don't search too close to the end of the buffer,
+    // or we could match a separator that's split across two chunks. For example, we could see "\r"
+    // at the end of the buffer and match that, even though we were about to receive a "\n" right
+    // after it.
+    const maxSearchIndex = this.ended
+      ? this.buffer.length - 1
+      : this.buffer.length - this.maxSeparatorLength;
+    while (this.searchIndex <= maxSearchIndex) {
+      for (const separator of this.separators) {
+        if (SplitBuffer.startsWith(this.buffer, separator, this.searchIndex)) {
+          const line = this.buffer.slice(0, this.searchIndex);
+          this.buffer = this.buffer.slice(this.searchIndex + separator.length);
+          this.searchIndex = 0;
+          return line;
+        }
+      }
+      this.searchIndex++;
+    }
+
+    if (this.ended && this.buffer.length > 0) {
+      // If we still have some text left in the buffer, return it as the last line.
+      const line = this.buffer;
+      this.buffer = "";
+      this.searchIndex = 0;
+      return line;
+    } else {
+      return undefined;
+    }
+  }
+}
+
+/**
+ * Splits a text stream into lines based on a list of valid line separators.
+ * @param stream The text stream to split. This stream will be fully consumed.
+ * @param separators The list of strings that act as line separators.
+ * @returns A sequence of lines (not including separators).
+ */
+export async function* splitStreamAtSeparators(
+  stream: Readable,
+  separators: string[],
+): AsyncGenerator<string, void, unknown> {
+  const buffer = new SplitBuffer(separators);
+  for await (const chunk of stream) {
+    buffer.addChunk(chunk);
+    let line: string | undefined;
+    do {
+      line = buffer.getNextLine();
+      if (line !== undefined) {
+        yield line;
+      }
+    } while (line !== undefined);
+  }
+  buffer.end();
+  let line: string | undefined;
+  do {
+    line = buffer.getNextLine();
+    if (line !== undefined) {
+      yield line;
+    }
+  } while (line !== undefined);
+}
+
+/**
+ *  Standard line endings for splitting human-readable text.
+ */
+export const LINE_ENDINGS = ["\r\n", "\r", "\n"];
--- a/extensions/ql-vscode/src/log-insights/summary-parser.ts
+++ b/extensions/ql-vscode/src/log-insights/summary-parser.ts
@@ -1,4 +1,5 @@
-import { writeFile, promises } from "fs-extra";
+import { createReadStream, writeFile } from "fs-extra";
+import { LINE_ENDINGS, splitStreamAtSeparators } from "../common/split-stream";

 /**
 * Location information for a single pipeline invocation in the RA.
@@ -64,59 +65,64 @@ export async function generateSummarySymbolsFile(
 async function generateSummarySymbols(
  summaryPath: string,
 ): Promise<SummarySymbols> {
-  const summary = await promises.readFile(summaryPath, {
+  const stream = createReadStream(summaryPath, {
    encoding: "utf-8",
  });
-  const symbols: SummarySymbols = {
-    predicates: {},
-  };
+  try {
+    const lines = splitStreamAtSeparators(stream, LINE_ENDINGS);

-  const lines = summary.split(/\r?\n/);
-  let lineNumber = 0;
-  while (lineNumber < lines.length) {
-    const startLineNumber = lineNumber;
-    lineNumber++;
-    const startLine = lines[startLineNumber];
-    const nonRecursiveMatch = startLine.match(NON_RECURSIVE_TUPLE_COUNT_REGEXP);
-    let predicateName: string | undefined = undefined;
+    const symbols: SummarySymbols = {
+      predicates: {},
+    };
+
+    let lineNumber = 0;
+    let raStartLine = 0;
    let iteration = 0;
-    if (nonRecursiveMatch) {
-      predicateName = nonRecursiveMatch.groups!.predicateName;
-    } else {
-      const recursiveMatch = startLine.match(RECURSIVE_TUPLE_COUNT_REGEXP);
-      if (recursiveMatch?.groups) {
-        predicateName = recursiveMatch.groups.predicateName;
-        iteration = parseInt(recursiveMatch.groups.iteration);
-      }
-    }
-
-    if (predicateName !== undefined) {
-      const raStartLine = lineNumber;
-      let raEndLine: number | undefined = undefined;
-      while (lineNumber < lines.length && raEndLine === undefined) {
-        const raLine = lines[lineNumber];
-        const returnMatch = raLine.match(RETURN_REGEXP);
+    let predicateName: string | undefined = undefined;
+    let startLine = 0;
+    for await (const line of lines) {
+      if (predicateName === undefined) {
+        // Looking for the start of the predicate.
+        const nonRecursiveMatch = line.match(NON_RECURSIVE_TUPLE_COUNT_REGEXP);
+        if (nonRecursiveMatch) {
+          iteration = 0;
+          predicateName = nonRecursiveMatch.groups!.predicateName;
+        } else {
+          const recursiveMatch = line.match(RECURSIVE_TUPLE_COUNT_REGEXP);
+          if (recursiveMatch?.groups) {
+            predicateName = recursiveMatch.groups.predicateName;
+            iteration = parseInt(recursiveMatch.groups.iteration);
+          }
+        }
+        if (predicateName !== undefined) {
+          startLine = lineNumber;
+          raStartLine = lineNumber + 1;
+        }
+      } else {
+        const returnMatch = line.match(RETURN_REGEXP);
        if (returnMatch) {
-          raEndLine = lineNumber;
-        }
-        lineNumber++;
-      }
-      if (raEndLine !== undefined) {
-        let symbol = symbols.predicates[predicateName];
-        if (symbol === undefined) {
-          symbol = {
-            iterations: {},
+          let symbol = symbols.predicates[predicateName];
+          if (symbol === undefined) {
+            symbol = {
+              iterations: {},
+            };
+            symbols.predicates[predicateName] = symbol;
+          }
+          symbol.iterations[iteration] = {
+            startLine,
+            raStartLine,
+            raEndLine: lineNumber,
          };
-          symbols.predicates[predicateName] = symbol;
-        }
-        symbol.iterations[iteration] = {
-          startLine: lineNumber,
-          raStartLine,
-          raEndLine,
-        };
-      }
-    }
-  }

-  return symbols;
+          predicateName = undefined;
+        }
+      }
+
+      lineNumber++;
+    }
+
+    return symbols;
+  } finally {
+    stream.close();
+  }
 }
--- a/extensions/ql-vscode/test/unit-tests/common/split-buffer.test.ts
+++ b/extensions/ql-vscode/test/unit-tests/common/split-buffer.test.ts
@@ -0,0 +1,84 @@
+import { LINE_ENDINGS, SplitBuffer } from "../../../src/common/split-stream";
+
+interface Chunk {
+  chunk: string;
+  lines: string[];
+}
+
+function checkLines(
+  buffer: SplitBuffer,
+  expectedLinesForChunk: string[],
+  chunkIndex: number | "end",
+): void {
+  expectedLinesForChunk.forEach((expectedLine, lineIndex) => {
+    const line = buffer.getNextLine();
+    const location = `[chunk ${chunkIndex}, line ${lineIndex}]: `;
+    expect(location + line).toEqual(location + expectedLine);
+  });
+  expect(buffer.getNextLine()).toBeUndefined();
+}
+
+function testSplitBuffer(chunks: Chunk[], endLines: string[]): void {
+  const buffer = new SplitBuffer(LINE_ENDINGS);
+  chunks.forEach((chunk, chunkIndex) => {
+    buffer.addChunk(Buffer.from(chunk.chunk, "utf-8"));
+    checkLines(buffer, chunk.lines, chunkIndex);
+  });
+  buffer.end();
+  checkLines(buffer, endLines, "end");
+}
+
+describe("split buffer", () => {
+  it("should handle a one-chunk string with no terminator", async () => {
+    // Won't return the line until we call `end()`.
+    testSplitBuffer([{ chunk: "some text", lines: [] }], ["some text"]);
+  });
+
+  it("should handle a one-chunk string with a one-byte terminator", async () => {
+    // Won't return the line until we call `end()` because the actual terminator is shorter than the
+    // longest terminator.
+    testSplitBuffer([{ chunk: "some text\n", lines: [] }], ["some text"]);
+  });
+
+  it("should handle a one-chunk string with a two-byte terminator", async () => {
+    testSplitBuffer([{ chunk: "some text\r\n", lines: ["some text"] }], []);
+  });
+
+  it("should handle a multi-chunk string with terminators at the end of each chunk", async () => {
+    testSplitBuffer(
+      [
+        { chunk: "first line\n", lines: [] }, // Waiting for second potential terminator byte
+        { chunk: "second line\r", lines: ["first line"] }, // Waiting for second potential terminator byte
+        { chunk: "third line\r\n", lines: ["second line", "third line"] }, // No wait, because "\r\n" is already the longest possible terminator
+      ],
+      [],
+    );
+  });
+
+  it("should handle a multi-chunk string with terminators at random offsets", async () => {
+    testSplitBuffer(
+      [
+        { chunk: "first line\nsecond", lines: ["first line"] },
+        {
+          chunk: " line\rthird line",
+          lines: ["second line"],
+        },
+        { chunk: "\r\n", lines: ["third line"] },
+      ],
+      [],
+    );
+  });
+
+  it("should handle a terminator split between chunks", async () => {
+    testSplitBuffer(
+      [
+        { chunk: "first line\r", lines: [] },
+        {
+          chunk: "\nsecond line",
+          lines: ["first line"],
+        },
+      ],
+      ["second line"],
+    );
+  });
+});