Add streaming jsonl parser
This commit is contained in:
@@ -1,26 +1,54 @@
|
|||||||
import { readFile } from "fs-extra";
|
import { stat } from "fs/promises";
|
||||||
|
import { createReadStream } from "fs-extra";
|
||||||
|
|
||||||
|
/**
 * Separator between two consecutive JSON objects: a line feed followed by an optionally
 * CR-prefixed line feed. For CRLF files the first "\r" of the separator stays attached to the
 * preceding segment, which is harmless because `JSON.parse` ignores surrounding whitespace.
 */
const doubleLineBreakRegexp = /\n\r?\n/;

/**
 * Read a file consisting of multiple JSON objects. Each object is separated from the previous one
 * by a double newline sequence. This is basically a more human-readable form of JSONL.
 *
 * The file is streamed: only the chunk currently being processed plus the trailing, not yet
 * terminated object are kept in memory. The stream is not read any further while `handler` is
 * running, so objects are delivered strictly one at a time and in file order.
 *
 * Segments that contain only whitespace (for example caused by extra blank lines between objects
 * or at the end of the file) are skipped instead of being passed to `JSON.parse`.
 *
 * @param path The path to the file.
 * @param handler Callback to be invoked for each top-level JSON object in order.
 * @param logger Optional logger that receives one message before parsing starts (including the
 * file size in MB) and one message after the last object has been handled.
 * @returns A promise that resolves once every object has been handled.
 * @throws Rejects with the underlying error if the file cannot be stat'ed or read, if a segment
 * is not valid JSON (`SyntaxError`), or if `handler` rejects. The stream is closed in all cases.
 */
export async function readJsonlFile<T>(
  path: string,
  handler: (value: T) => Promise<void>,
  logger?: { log: (message: string) => void },
): Promise<void> {
  logger?.log(
    `Parsing ${path} (${(await stat(path)).size / 1024 / 1024} MB)...`,
  );

  // Parses one segment and hands it to the caller; blank segments carry no object.
  const dispatch = async (segment: string): Promise<void> => {
    if (segment.trim().length === 0) {
      return;
    }
    // NOTE: the parsed value is trusted to have shape T; it is not validated here.
    await handler(JSON.parse(segment) as T);
  };

  const stream = createReadStream(path, { encoding: "utf8" });

  // Text after the last separator seen so far. It may be an incomplete object (or even half of
  // a separator), so it is always re-joined with the next chunk before splitting again.
  let pending = "";

  // Async iteration only pulls the next chunk after the loop body has finished, and destroys
  // the stream if the body throws, so no manual pause/resume/destroy handling is required.
  for await (const chunk of stream) {
    const segments = (pending + (chunk as string)).split(doubleLineBreakRegexp);
    pending = segments.pop() ?? "";
    for (const segment of segments) {
      await dispatch(segment);
    }
  }

  // The last object is usually not followed by a separator.
  await dispatch(pending);

  logger?.log(`Finishing parsing ${path}`);
}
|
||||||
|
|||||||
Reference in New Issue
Block a user