Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions src/batch/services/batch-item-filename.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
const MAX_SLUG_LENGTH = 100;

function slugify(value: string): string {
return value
.replace(/[^a-zA-Z0-9.-]+/g, "-")
.replace(/^-+|-+$/g, "")
.slice(0, MAX_SLUG_LENGTH);
}

export function batchItemFilename(input: string, index: number): string {
const fallback = `item-${index}`;
try {
const url = new URL(input);
const slug = slugify(`${url.hostname}${url.pathname}`);
return slug.length > 0 ? slug : fallback;
} catch {
return fallback;
}
}
16 changes: 16 additions & 0 deletions src/batch/services/batch-record.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import type { BatchResult } from "../types/batch-result.js";

export interface BatchRecord {
error?: { class: string; message: string };
index: number;
input: string;
result?: unknown;
}

export function toBatchRecord(result: BatchResult): BatchRecord {
if (result.ok) {
return { index: result.index, input: result.input, result: result.data };
}

return { index: result.index, input: result.input, error: result.error };
}
33 changes: 33 additions & 0 deletions src/batch/services/directory-sink.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import { mkdirSync, writeFileSync } from "node:fs";
import { join } from "node:path";
import type { BatchResult } from "../types/batch-result.js";
import type { BatchSink } from "../types/batch-sink.js";
import { batchItemFilename } from "./batch-item-filename.js";
import { toBatchRecord } from "./batch-record.js";
import { uniqueName } from "./unique-name.js";

export interface DirectorySinkOptions {
pretty?: boolean;
}

export function createDirectorySink(
dir: string,
options: DirectorySinkOptions = {}
): BatchSink {
mkdirSync(dir, { recursive: true });
const used = new Set<string>();
const indent = options.pretty ? 2 : undefined;

return {
write(result: BatchResult): void {
const base = batchItemFilename(result.input, result.index);
const name = uniqueName(base, used);
const record = toBatchRecord(result);
writeFileSync(
join(dir, `${name}.json`),
JSON.stringify(record, null, indent),
"utf8"
);
},
};
}
11 changes: 11 additions & 0 deletions src/batch/services/ndjson-stdout-sink.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import type { BatchResult } from "../types/batch-result.js";
import type { BatchSink } from "../types/batch-sink.js";
import { toBatchRecord } from "./batch-record.js";

export function createNdjsonStdoutSink(): BatchSink {
return {
write(result: BatchResult): void {
process.stdout.write(`${JSON.stringify(toBatchRecord(result))}\n`);
},
};
}
15 changes: 15 additions & 0 deletions src/batch/services/unique-name.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
export function uniqueName(base: string, used: Set<string>): string {
if (!used.has(base)) {
used.add(base);
return base;
}

let suffix = 2;
while (used.has(`${base}-${suffix}`)) {
suffix++;
}

const candidate = `${base}-${suffix}`;
used.add(candidate);
return candidate;
}
6 changes: 6 additions & 0 deletions src/batch/types/batch-sink.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import type { BatchResult } from "./batch-result.js";

export interface BatchSink {
close?(): void | Promise<void>;
write(result: BatchResult): void | Promise<void>;
}
19 changes: 19 additions & 0 deletions tests/batch/batch-item-filename.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import { describe, expect, it } from "vitest";
import { batchItemFilename } from "../../src/batch/services/batch-item-filename.js";

describe("batchItemFilename", () => {
it("slugs a URL from host and path, dropping the query", () => {
expect(batchItemFilename("https://example.com/a/b?x=1", 0)).toBe(
"example.com-a-b"
);
});

it("falls back to the row index for non-URL input", () => {
expect(batchItemFilename("how to scrape", 3)).toBe("item-3");
});

it("truncates very long slugs", () => {
const long = `https://example.com/${"a".repeat(300)}`;
expect(batchItemFilename(long, 0).length).toBeLessThanOrEqual(100);
});
});
61 changes: 61 additions & 0 deletions tests/batch/directory-sink.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import { mkdtempSync, readdirSync, readFileSync, rmSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, beforeEach, describe, expect, it } from "vitest";
import { createDirectorySink } from "../../src/batch/services/directory-sink.js";

let dir: string;

beforeEach(() => {
dir = mkdtempSync(join(tmpdir(), "batch-dir-sink-"));
});

afterEach(() => {
rmSync(dir, { recursive: true, force: true });
});

describe("createDirectorySink", () => {
it("writes one record file per item, named from the input", () => {
const outDir = join(dir, "out");
const sink = createDirectorySink(outDir);

sink.write({ index: 0, input: "https://a.com/x", ok: true, data: "A" });
sink.write({
index: 1,
input: "https://b.com",
ok: false,
error: { class: "TimeoutError", message: "timed out" },
});

const files = readdirSync(outDir).sort();
expect(files).toEqual(["a.com-x.json", "b.com.json"]);

const success = JSON.parse(
readFileSync(join(outDir, "a.com-x.json"), "utf8")
);
expect(success).toEqual({
index: 0,
input: "https://a.com/x",
result: "A",
});

const failure = JSON.parse(
readFileSync(join(outDir, "b.com.json"), "utf8")
);
expect(failure).toEqual({
index: 1,
input: "https://b.com",
error: { class: "TimeoutError", message: "timed out" },
});
});

it("dedupes colliding names with a numeric suffix", () => {
const sink = createDirectorySink(dir);

sink.write({ index: 0, input: "not a url", ok: true, data: 1 });
sink.write({ index: 0, input: "also not a url", ok: true, data: 2 });

const files = readdirSync(dir).sort();
expect(files).toEqual(["item-0-2.json", "item-0.json"]);
});
});
38 changes: 38 additions & 0 deletions tests/batch/ndjson-stdout-sink.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import { afterEach, describe, expect, it, vi } from "vitest";
import { createNdjsonStdoutSink } from "../../src/batch/services/ndjson-stdout-sink.js";

afterEach(() => {
vi.restoreAllMocks();
});

describe("createNdjsonStdoutSink", () => {
it("writes one JSON line per result to stdout", () => {
const lines: string[] = [];
vi.spyOn(process.stdout, "write").mockImplementation((chunk: unknown) => {
lines.push(String(chunk));
return true;
});

const sink = createNdjsonStdoutSink();
sink.write({ index: 0, input: "https://a.com", ok: true, data: { ok: 1 } });
sink.write({
index: 1,
input: "https://b.com",
ok: false,
error: { class: "ValidationError", message: "nope" },
});

expect(lines).toHaveLength(2);
expect(lines[0].endsWith("\n")).toBe(true);
expect(JSON.parse(lines[0])).toEqual({
index: 0,
input: "https://a.com",
result: { ok: 1 },
});
expect(JSON.parse(lines[1])).toEqual({
index: 1,
input: "https://b.com",
error: { class: "ValidationError", message: "nope" },
});
});
});
Loading