From eee8afc00fe3b1d3be442194e2ad0d9f7fd3e761 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 10 Mar 2026 15:38:39 +0000 Subject: [PATCH 1/2] Initial plan From b510ed277ff5f31be60b8ece990d14182c7a5f58 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 10 Mar 2026 15:49:42 +0000 Subject: [PATCH 2/2] feat(SE-3-02): implement duplicate content detector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add tools/scripts/validators/content/check-duplicate-content.js with: - Exact-duplicate page detection via SHA-256 body content hash - Near-duplicate page detection via Jaccard shingle similarity (≥80%) - Duplicate block detection via paragraph-level hashing (≥3 pages) - CLI flags: --path, --file, --files, --strict, --min-block-pages, --json - Unit test: tests/unit/check-duplicate-content.test.js (30 cases) - Script index auto-updated via script-docs.test.js --write --rebuild-indexes Co-authored-by: DeveloperAlly <12529822+DeveloperAlly@users.noreply.github.com> --- docs-guide/indexes/scripts-index.mdx | 2 + tests/script-index.md | 1 + tests/unit/check-duplicate-content.test.js | 389 ++++++++++++ tools/script-index.md | 1 + .../content/check-duplicate-content.js | 568 ++++++++++++++++++ 5 files changed, 961 insertions(+) create mode 100644 tests/unit/check-duplicate-content.test.js create mode 100644 tools/scripts/validators/content/check-duplicate-content.js diff --git a/docs-guide/indexes/scripts-index.mdx b/docs-guide/indexes/scripts-index.mdx index 8eeb9499a..1432479d9 100644 --- a/docs-guide/indexes/scripts-index.mdx +++ b/docs-guide/indexes/scripts-index.mdx @@ -55,6 +55,7 @@ Run command: node tests/unit/script-docs.test.js --write --rebuild-indexes | `tests/run-all 2.js` | Utility script for tests/run-all.js. 
| `* node tests/run-all.js` | docs | | `tests/run-all.js` | Test orchestrator — dispatches all unit test suites. Called by pre-commit hook and npm test. | `node tests/run-all.js [flags]` | docs | | `tests/run-pr-checks.js` | PR orchestrator — runs changed-file scoped validation checks for pull request CI. Dispatches per-file validators based on PR diff. | `node tests/run-pr-checks.js [flags]` | docs | +| `tests/unit/check-duplicate-content.test.js` | Unit tests for check-duplicate-content.js — validates exact-duplicate, near-duplicate, and duplicate-block detection logic. | `node tests/unit/check-duplicate-content.test.js` | docs | | `tests/unit/codex-commit.test.js` | Tests codex-commit.js — validates commit message generation and contract compliance | `node tests/unit/codex-commit.test.js [flags]` | docs | | `tests/unit/codex-safe-merge-with-stash.test.js` | Tests codex-safe-merge-with-stash.js — validates safe merge logic with stash handling | `node tests/unit/codex-safe-merge-with-stash.test.js [flags]` | docs | | `tests/unit/codex-skill-sync.test.js` | Tests sync-codex-skills.js — validates skill file synchronisation between sources | `node tests/unit/codex-skill-sync.test.js [flags]` | docs | @@ -210,6 +211,7 @@ Run command: node tests/unit/script-docs.test.js --write --rebuild-indexes | `tools/scripts/validators/content/check-alt-text-quality.js` | Flags weak alt text in English v2 docs by detecting generic placeholders and single-word descriptions. 
| `node tools/scripts/validators/content/check-alt-text-quality.js [--path ] [--strict]` | docs | | `tools/scripts/validators/content/check-description-quality.js` | Validates English v2 frontmatter descriptions for SEO length, boilerplate openings, and duplicate reuse | `node tools/scripts/validators/content/check-description-quality.js [--path ] [--strict]` | docs | | `tools/scripts/validators/content/check-double-headers.js` | Detects duplicate body H1 headings and opening paragraphs that repeat frontmatter title or description content. | `node tools/scripts/validators/content/check-double-headers.js [--file ] [--files ] [--fix]` | docs | +| `tools/scripts/validators/content/check-duplicate-content.js` | Detects duplicate content across v2 MDX pages: exact-duplicate pages, near-duplicate pages, and shared duplicate paragraph blocks. | `node tools/scripts/validators/content/check-duplicate-content.js [--path ] [--strict] [--min-block-pages ] [--json]` | docs | | `tools/scripts/validators/content/check-grammar-en-gb.js` | Deterministic UK English grammar checker for prose content with optional conservative autofix for safe rules. | `node tools/scripts/validators/content/check-grammar-en-gb.js [--scope full\|changed] [--file ] [--fix] [--strict]` | docs | | `tools/scripts/validators/content/check-page-endings.js` | Checks English route-backed v2 docs for canonical resources or next-step endings in the visible last 20 lines. | `node tools/scripts/validators/content/check-page-endings.js [--path ] [--strict]` | docs | | `tools/scripts/validators/content/check-proper-nouns.js` | Detects and fixes incorrect proper noun capitalisation in prose while skipping code, frontmatter, URLs, and path-like tokens. 
| `node tools/scripts/validators/content/check-proper-nouns.js [--file ] [--fix]` | docs | diff --git a/tests/script-index.md b/tests/script-index.md index 899773569..d3ff608a2 100644 --- a/tests/script-index.md +++ b/tests/script-index.md @@ -15,6 +15,7 @@ | `tests/run-all 2.js` | Utility script for tests/run-all.js. | `* node tests/run-all.js` | docs | | `tests/run-all.js` | Test orchestrator — dispatches all unit test suites. Called by pre-commit hook and npm test. | `node tests/run-all.js [flags]` | docs | | `tests/run-pr-checks.js` | PR orchestrator — runs changed-file scoped validation checks for pull request CI. Dispatches per-file validators based on PR diff. | `node tests/run-pr-checks.js [flags]` | docs | +| `tests/unit/check-duplicate-content.test.js` | Unit tests for check-duplicate-content.js — validates exact-duplicate, near-duplicate, and duplicate-block detection logic. | `node tests/unit/check-duplicate-content.test.js` | docs | | `tests/unit/codex-commit.test.js` | Tests codex-commit.js — validates commit message generation and contract compliance | `node tests/unit/codex-commit.test.js [flags]` | docs | | `tests/unit/codex-safe-merge-with-stash.test.js` | Tests codex-safe-merge-with-stash.js — validates safe merge logic with stash handling | `node tests/unit/codex-safe-merge-with-stash.test.js [flags]` | docs | | `tests/unit/codex-skill-sync.test.js` | Tests sync-codex-skills.js — validates skill file synchronisation between sources | `node tests/unit/codex-skill-sync.test.js [flags]` | docs | diff --git a/tests/unit/check-duplicate-content.test.js b/tests/unit/check-duplicate-content.test.js new file mode 100644 index 000000000..0d2696293 --- /dev/null +++ b/tests/unit/check-duplicate-content.test.js @@ -0,0 +1,389 @@ +#!/usr/bin/env node +/** + * @script check-duplicate-content.test + * @category validator + * @purpose qa:content-quality + * @scope tests/unit + * @owner docs + * @needs SE-3-02 + * @purpose-statement Unit tests for 
check-duplicate-content.js — validates exact-duplicate, near-duplicate, and duplicate-block detection logic. + * @pipeline manual — run on-demand only + * @usage node tests/unit/check-duplicate-content.test.js + */ + +'use strict'; + +const assert = require('assert'); +const path = require('path'); + +const REPO_ROOT = path.resolve(__dirname, '..', '..'); +const detector = require(path.join( + REPO_ROOT, + 'tools/scripts/validators/content/check-duplicate-content.js' +)); + +const { + extractBody, + normalizeBody, + extractParagraphs, + buildShingles, + jaccardSimilarity, + hashString, + detectExactDuplicates, + detectNearDuplicates, + detectDuplicateBlocks +} = detector; + +let errors = []; + +function runCase(name, fn) { + try { + fn(); + console.log(` ✓ ${name}`); + } catch (error) { + errors.push({ name, message: error.message }); + console.error(` ✗ ${name}: ${error.message}`); + } +} + +// ── extractBody ────────────────────────────────────────────────────────────── + +function testExtractBody() { + console.log('\n📦 extractBody'); + + runCase('strips YAML frontmatter and returns body', () => { + const raw = '---\ntitle: Hello\ndescription: World\n---\n\nBody content here.'; + const body = extractBody(raw); + assert.ok(!body.includes('title:'), 'Should not include frontmatter key'); + assert.ok(body.includes('Body content here.'), 'Should include body text'); + }); + + runCase('returns full content when no frontmatter present', () => { + const raw = 'Just body content without frontmatter.'; + const body = extractBody(raw); + assert.ok(body.includes('Just body content'), 'Should return raw content unchanged'); + }); + + runCase('handles empty string', () => { + const body = extractBody(''); + assert.strictEqual(typeof body, 'string'); + }); +} + +// ── normalizeBody ──────────────────────────────────────────────────────────── + +function testNormalizeBody() { + console.log('\n🔧 normalizeBody'); + + runCase('strips JSX/HTML tags', () => { + const body = 'Some 
text'; + const normalized = normalizeBody(body); + assert.ok(!normalized.includes(''), 'Should remove JSX tags'); + assert.ok(normalized.includes('some text'), 'Should keep text content'); + }); + + runCase('strips MDX import/export lines', () => { + const body = 'import Foo from "./foo"\nexport const bar = 1\nActual prose here.'; + const normalized = normalizeBody(body); + assert.ok(!normalized.includes('import'), 'Should strip import lines'); + assert.ok(!normalized.includes('export'), 'Should strip export lines'); + assert.ok(normalized.includes('actual prose here'), 'Should keep prose'); + }); + + runCase('collapses whitespace and lowercases', () => { + const body = 'Hello World\n\nTest Content'; + const normalized = normalizeBody(body); + assert.ok(!normalized.includes(' '), 'Should collapse multiple spaces'); + assert.strictEqual(normalized, normalized.toLowerCase(), 'Should be lowercase'); + }); + + runCase('handles empty string', () => { + const normalized = normalizeBody(''); + assert.strictEqual(normalized, ''); + }); +} + +// ── extractParagraphs ──────────────────────────────────────────────────────── + +function testExtractParagraphs() { + console.log('\n📝 extractParagraphs'); + + runCase('splits body into paragraph chunks', () => { + // Build a body with two paragraphs each exceeding the 30-word minimum + const para1 = 'Lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor incididunt ut labore et dolore magna aliqua enim ad minim veniam quis nostrud exercitation ullamco laboris nisi aliquip ex ea commodo.'; + const para2 = 'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur excepteur sint occaecat cupidatat non proident sunt in culpa qui officia deserunt mollit anim id est.'; + const body = `${para1}\n\n${para2}`; + const paragraphs = extractParagraphs(body); + assert.ok(paragraphs.length >= 2, `Expected at least 2 paragraphs, got ${paragraphs.length}`); + }); + + runCase('skips 
code fence blocks', () => { + const body = '```javascript\nconst x = 1;\n```'; + const paragraphs = extractParagraphs(body); + assert.strictEqual(paragraphs.length, 0, 'Code fences should be excluded'); + }); + + runCase('skips paragraphs below minimum word count', () => { + const body = 'Short text.\n\nAnother short one.'; + const paragraphs = extractParagraphs(body); + assert.strictEqual(paragraphs.length, 0, 'Short paragraphs should be excluded'); + }); + + runCase('includes hash on each paragraph', () => { + const longPara = 'Lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam quis nostrud exercitation ullamco.'; + const paragraphs = extractParagraphs(longPara); + if (paragraphs.length > 0) { + assert.ok(typeof paragraphs[0].hash === 'string', 'Paragraph should have a hash'); + assert.ok(paragraphs[0].hash.length > 0, 'Hash should be non-empty'); + } + }); +} + +// ── buildShingles ──────────────────────────────────────────────────────────── + +function testBuildShingles() { + console.log('\n🔢 buildShingles'); + + runCase('returns a Set', () => { + const shingles = buildShingles('one two three four five six', 3); + assert.ok(shingles instanceof Set, 'Should return a Set'); + }); + + runCase('produces correct shingle count for short string', () => { + const words = 'a b c d e'; + const shingles = buildShingles(words, 3); + // words = [a, b, c, d, e], shingles of size 3 = [a b c, b c d, c d e] = 3 + assert.strictEqual(shingles.size, 3, `Expected 3 shingles, got ${shingles.size}`); + }); + + runCase('returns empty set for string shorter than shingle size', () => { + const shingles = buildShingles('one two', 5); + assert.strictEqual(shingles.size, 0, 'Should return empty set for insufficient words'); + }); +} + +// ── jaccardSimilarity ──────────────────────────────────────────────────────── + +function testJaccardSimilarity() { + console.log('\n📐 jaccardSimilarity'); + + 
runCase('identical sets return 1.0', () => { + const setA = new Set(['a b c', 'b c d', 'c d e']); + const setB = new Set(['a b c', 'b c d', 'c d e']); + const similarity = jaccardSimilarity(setA, setB); + assert.strictEqual(similarity, 1.0, 'Identical sets should have similarity 1.0'); + }); + + runCase('completely disjoint sets return 0.0', () => { + const setA = new Set(['a b c', 'b c d']); + const setB = new Set(['x y z', 'y z w']); + const similarity = jaccardSimilarity(setA, setB); + assert.strictEqual(similarity, 0.0, 'Disjoint sets should have similarity 0.0'); + }); + + runCase('partially overlapping sets return value between 0 and 1', () => { + const setA = new Set(['a b c', 'b c d', 'c d e']); + const setB = new Set(['a b c', 'x y z', 'y z w']); + const similarity = jaccardSimilarity(setA, setB); + assert.ok(similarity > 0 && similarity < 1, `Expected value between 0 and 1, got ${similarity}`); + }); + + runCase('two empty sets return 1.0', () => { + const similarity = jaccardSimilarity(new Set(), new Set()); + assert.strictEqual(similarity, 1.0); + }); +} + +// ── hashString ─────────────────────────────────────────────────────────────── + +function testHashString() { + console.log('\n#️⃣ hashString'); + + runCase('returns a hex string', () => { + const hash = hashString('hello world'); + assert.match(hash, /^[0-9a-f]+$/, 'Should be a hex string'); + }); + + runCase('same input produces same hash', () => { + const a = hashString('consistent input'); + const b = hashString('consistent input'); + assert.strictEqual(a, b, 'Same input should produce same hash'); + }); + + runCase('different inputs produce different hashes', () => { + const a = hashString('input one'); + const b = hashString('input two'); + assert.notStrictEqual(a, b, 'Different inputs should produce different hashes'); + }); +} + +// ── detectExactDuplicates ──────────────────────────────────────────────────── + +function testDetectExactDuplicates() { + console.log('\n🔍 
detectExactDuplicates'); + + runCase('detects two pages with identical content hash', () => { + const pages = [ + { absPath: '/repo/a.mdx', displayPath: 'a.mdx', error: '', contentHash: 'abc123', paragraphs: [], shingles: new Set() }, + { absPath: '/repo/b.mdx', displayPath: 'b.mdx', error: '', contentHash: 'abc123', paragraphs: [], shingles: new Set() }, + { absPath: '/repo/c.mdx', displayPath: 'c.mdx', error: '', contentHash: 'def456', paragraphs: [], shingles: new Set() } + ]; + const findings = detectExactDuplicates(pages); + assert.strictEqual(findings.length, 1, `Expected 1 finding, got ${findings.length}`); + assert.strictEqual(findings[0].rule, 'exact-duplicate-page'); + assert.ok(findings[0].files.includes('a.mdx'), 'Findings should include a.mdx'); + assert.ok(findings[0].files.includes('b.mdx'), 'Findings should include b.mdx'); + }); + + runCase('returns no findings when all pages are unique', () => { + const pages = [ + { absPath: '/repo/a.mdx', displayPath: 'a.mdx', error: '', contentHash: 'aaa', paragraphs: [], shingles: new Set() }, + { absPath: '/repo/b.mdx', displayPath: 'b.mdx', error: '', contentHash: 'bbb', paragraphs: [], shingles: new Set() } + ]; + const findings = detectExactDuplicates(pages); + assert.strictEqual(findings.length, 0, 'No findings expected for unique pages'); + }); + + runCase('skips pages with errors', () => { + const pages = [ + { absPath: '/repo/a.mdx', displayPath: 'a.mdx', error: 'File not found', contentHash: 'abc123' }, + { absPath: '/repo/b.mdx', displayPath: 'b.mdx', error: '', contentHash: 'abc123', paragraphs: [], shingles: new Set() } + ]; + const findings = detectExactDuplicates(pages); + assert.strictEqual(findings.length, 0, 'Pages with errors should not be compared'); + }); +} + +// ── detectNearDuplicates ───────────────────────────────────────────────────── + +function testDetectNearDuplicates() { + console.log('\n🔎 detectNearDuplicates'); + + runCase('detects near-duplicate page pairs above threshold', () 
=> { + const sharedShingles = new Set(['a b c d e', 'b c d e f', 'c d e f g', 'd e f g h', 'e f g h i']); + const aShingles = new Set([...sharedShingles, 'a b c d z']); + const bShingles = new Set([...sharedShingles, 'a b c d z']); + const pages = [ + { absPath: '/repo/a.mdx', displayPath: 'a.mdx', error: '', contentHash: 'aaa', paragraphs: [], shingles: aShingles }, + { absPath: '/repo/b.mdx', displayPath: 'b.mdx', error: '', contentHash: 'bbb', paragraphs: [], shingles: bShingles } + ]; + const findings = detectNearDuplicates(pages, []); + assert.ok(findings.length >= 1, `Expected at least 1 near-duplicate finding, got ${findings.length}`); + if (findings.length > 0) { + assert.strictEqual(findings[0].rule, 'near-duplicate-page'); + } + }); + + runCase('does not flag pages already in exact-duplicate groups', () => { + const shingles = new Set(['a b c', 'b c d', 'c d e', 'd e f', 'e f g']); + const pages = [ + { absPath: '/repo/a.mdx', displayPath: 'a.mdx', error: '', contentHash: 'same', paragraphs: [], shingles }, + { absPath: '/repo/b.mdx', displayPath: 'b.mdx', error: '', contentHash: 'same', paragraphs: [], shingles } + ]; + const exactFindings = [{ rule: 'exact-duplicate-page', files: ['a.mdx', 'b.mdx'], message: '', evidence: '' }]; + const nearFindings = detectNearDuplicates(pages, exactFindings); + assert.strictEqual(nearFindings.length, 0, 'Should not re-flag exact-duplicate pairs as near-duplicates'); + }); + + runCase('ignores pages with errors', () => { + const pages = [ + { absPath: '/repo/a.mdx', displayPath: 'a.mdx', error: 'File not found', shingles: null }, + { absPath: '/repo/b.mdx', displayPath: 'b.mdx', error: '', contentHash: 'bbb', paragraphs: [], shingles: new Set(['x y z']) } + ]; + const findings = detectNearDuplicates(pages, []); + assert.strictEqual(findings.length, 0, 'Pages with errors should be skipped'); + }); +} + +// ── detectDuplicateBlocks ──────────────────────────────────────────────────── + +function 
testDetectDuplicateBlocks() { + console.log('\n🧱 detectDuplicateBlocks'); + + const sharedPara = { + text: 'This shared paragraph appears in multiple pages and is long enough to be flagged.', + normalizedText: 'this shared paragraph appears in multiple pages and is long enough to be flagged.', + hash: hashString('this shared paragraph appears in multiple pages and is long enough to be flagged.') + }; + + runCase('flags paragraph shared by >= minBlockPages pages', () => { + const pages = [ + { absPath: '/repo/a.mdx', displayPath: 'a.mdx', error: '', paragraphs: [sharedPara] }, + { absPath: '/repo/b.mdx', displayPath: 'b.mdx', error: '', paragraphs: [sharedPara] }, + { absPath: '/repo/c.mdx', displayPath: 'c.mdx', error: '', paragraphs: [sharedPara] } + ]; + const findings = detectDuplicateBlocks(pages, 3); + assert.strictEqual(findings.length, 1, `Expected 1 block finding, got ${findings.length}`); + assert.strictEqual(findings[0].rule, 'duplicate-block'); + assert.strictEqual(findings[0].files.length, 3, 'All three pages should be listed'); + }); + + runCase('does not flag paragraph appearing in fewer pages than minBlockPages', () => { + const pages = [ + { absPath: '/repo/a.mdx', displayPath: 'a.mdx', error: '', paragraphs: [sharedPara] }, + { absPath: '/repo/b.mdx', displayPath: 'b.mdx', error: '', paragraphs: [sharedPara] } + ]; + const findings = detectDuplicateBlocks(pages, 3); + assert.strictEqual(findings.length, 0, 'Should not flag blocks shared by fewer than minBlockPages pages'); + }); + + runCase('skips pages with errors', () => { + const pages = [ + { absPath: '/repo/a.mdx', displayPath: 'a.mdx', error: 'File not found' }, + { absPath: '/repo/b.mdx', displayPath: 'b.mdx', error: '', paragraphs: [sharedPara] }, + { absPath: '/repo/c.mdx', displayPath: 'c.mdx', error: '', paragraphs: [sharedPara] } + ]; + const findings = detectDuplicateBlocks(pages, 3); + assert.strictEqual(findings.length, 0, 'Pages with errors should be excluded'); + }); +} + +// ── 
Main ───────────────────────────────────────────────────────────────────── + +function runTests() { + errors = []; + + console.log('🧪 check-duplicate-content unit tests'); + + testExtractBody(); + testNormalizeBody(); + testExtractParagraphs(); + testBuildShingles(); + testJaccardSimilarity(); + testHashString(); + testDetectExactDuplicates(); + testDetectNearDuplicates(); + testDetectDuplicateBlocks(); + + const total = + 3 + // extractBody + 4 + // normalizeBody + 4 + // extractParagraphs + 3 + // buildShingles + 4 + // jaccardSimilarity + 3 + // hashString + 3 + // detectExactDuplicates + 3 + // detectNearDuplicates + 3; // detectDuplicateBlocks + + return { + errors, + passed: errors.length === 0, + total + }; +} + +if (require.main === module) { + const result = runTests(); + + if (result.passed) { + console.log(`\n✅ check-duplicate-content unit tests passed (${result.total} cases)`); + process.exit(0); + } + + console.error(`\n❌ ${result.errors.length} check-duplicate-content unit test failure(s):`); + result.errors.forEach((err) => console.error(` - ${err.name}: ${err.message}`)); + process.exit(1); +} + +module.exports = { runTests }; diff --git a/tools/script-index.md b/tools/script-index.md index a4d79fb53..d8733979b 100644 --- a/tools/script-index.md +++ b/tools/script-index.md @@ -125,6 +125,7 @@ | `tools/scripts/validators/content/check-alt-text-quality.js` | Flags weak alt text in English v2 docs by detecting generic placeholders and single-word descriptions. 
| `node tools/scripts/validators/content/check-alt-text-quality.js [--path ] [--strict]` | docs | | `tools/scripts/validators/content/check-description-quality.js` | Validates English v2 frontmatter descriptions for SEO length, boilerplate openings, and duplicate reuse | `node tools/scripts/validators/content/check-description-quality.js [--path ] [--strict]` | docs | | `tools/scripts/validators/content/check-double-headers.js` | Detects duplicate body H1 headings and opening paragraphs that repeat frontmatter title or description content. | `node tools/scripts/validators/content/check-double-headers.js [--file ] [--files ] [--fix]` | docs | +| `tools/scripts/validators/content/check-duplicate-content.js` | Detects duplicate content across v2 MDX pages: exact-duplicate pages, near-duplicate pages, and shared duplicate paragraph blocks. | `node tools/scripts/validators/content/check-duplicate-content.js [--path ] [--strict] [--min-block-pages ] [--json]` | docs | | `tools/scripts/validators/content/check-grammar-en-gb.js` | Deterministic UK English grammar checker for prose content with optional conservative autofix for safe rules. | `node tools/scripts/validators/content/check-grammar-en-gb.js [--scope full\|changed] [--file ] [--fix] [--strict]` | docs | | `tools/scripts/validators/content/check-page-endings.js` | Checks English route-backed v2 docs for canonical resources or next-step endings in the visible last 20 lines. | `node tools/scripts/validators/content/check-page-endings.js [--path ] [--strict]` | docs | | `tools/scripts/validators/content/check-proper-nouns.js` | Detects and fixes incorrect proper noun capitalisation in prose while skipping code, frontmatter, URLs, and path-like tokens. 
| `node tools/scripts/validators/content/check-proper-nouns.js [--file ] [--fix]` | docs | diff --git a/tools/scripts/validators/content/check-duplicate-content.js b/tools/scripts/validators/content/check-duplicate-content.js new file mode 100644 index 000000000..e87e1c637 --- /dev/null +++ b/tools/scripts/validators/content/check-duplicate-content.js @@ -0,0 +1,568 @@ +#!/usr/bin/env node +/** + * @script check-duplicate-content + * @category validator + * @purpose qa:content-quality + * @scope tools/scripts/validators/content, v2, docs.json + * @owner docs + * @needs SE-3-02 + * @purpose-statement Detects duplicate content across v2 MDX pages: exact-duplicate pages, near-duplicate pages, and shared duplicate paragraph blocks. + * @pipeline manual — validator, run on-demand only + * @usage node tools/scripts/validators/content/check-duplicate-content.js [--path ] [--strict] [--min-block-pages ] [--json] + */ + +'use strict'; + +const crypto = require('crypto'); +const fs = require('fs'); +const path = require('path'); +const { execSync } = require('child_process'); +const matter = require('gray-matter'); +const { getMdxFiles } = require('../../../../tests/utils/file-walker'); + +// ── Constants ──────────────────────────────────────────────────────────────── + +const RULE_EXACT_DUPLICATE = 'exact-duplicate-page'; +const RULE_NEAR_DUPLICATE = 'near-duplicate-page'; +const RULE_DUPLICATE_BLOCK = 'duplicate-block'; + +/** Jaccard similarity threshold (0–1) above which pages are near-duplicates. */ +const NEAR_DUPLICATE_THRESHOLD = 0.8; + +/** Minimum word count for a paragraph to be eligible for block-duplicate detection. */ +const MIN_BLOCK_WORDS = 30; + +/** Default: flag blocks shared by this many or more pages. */ +const DEFAULT_MIN_BLOCK_PAGES = 3; + +/** Maximum characters shown in evidence snippet for duplicate blocks. 
*/ +const MAX_EVIDENCE_SNIPPET_LENGTH = 120; + +// ── Repo root ──────────────────────────────────────────────────────────────── + +function getRepoRoot() { + try { + return execSync('git rev-parse --show-toplevel', { encoding: 'utf8' }).trim(); + } catch (_error) { + return process.cwd(); + } +} + +const REPO_ROOT = getRepoRoot(); + +// ── CLI ────────────────────────────────────────────────────────────────────── + +function printHelp() { + process.stdout.write( + [ + 'Usage:', + ' node tools/scripts/validators/content/check-duplicate-content.js [options]', + '', + 'Options:', + ' --path Restrict scan to a specific directory or file (repeatable).', + ' --file Scan a single file. Accepts absolute or repo-relative paths.', + ' --files Comma-separated list of files to scan.', + ' --strict Exit 1 even when only near-duplicate or block findings exist.', + ' --min-block-pages Minimum pages sharing a paragraph to flag it (default: 3).', + ' --json Output a JSON report to stdout instead of human-readable text.', + ' --help Show this help message.', + '', + 'Default behaviour:', + ' Scans all routable v2 MDX pages from docs.json navigation.' 
+ ].join('\n') + ); + process.stdout.write('\n'); +} + +function parseArgs(argv) { + const args = { + help: false, + strict: false, + json: false, + minBlockPages: DEFAULT_MIN_BLOCK_PAGES, + files: [] + }; + + for (let i = 0; i < argv.length; i += 1) { + const token = argv[i]; + + if (token === '--help' || token === '-h') { + args.help = true; + continue; + } + if (token === '--strict') { + args.strict = true; + continue; + } + if (token === '--json') { + args.json = true; + continue; + } + if (token === '--min-block-pages') { + const value = parseInt(String(argv[i + 1] || ''), 10); + if (!Number.isFinite(value) || value < 2) throw new Error('--min-block-pages must be an integer >= 2.'); + args.minBlockPages = value; + i += 1; + continue; + } + if (token.startsWith('--min-block-pages=')) { + const value = parseInt(token.slice('--min-block-pages='.length), 10); + if (!Number.isFinite(value) || value < 2) throw new Error('--min-block-pages must be an integer >= 2.'); + args.minBlockPages = value; + continue; + } + if (token === '--file' || token === '--path') { + const value = String(argv[i + 1] || '').trim(); + if (!value) throw new Error(`Missing value for ${token}.`); + args.files.push(value); + i += 1; + continue; + } + if (token.startsWith('--file=') || token.startsWith('--path=')) { + const eqIdx = token.indexOf('='); + const value = token.slice(eqIdx + 1).trim(); + if (!value) throw new Error(`Missing value for ${token.slice(0, eqIdx)}.`); + args.files.push(value); + continue; + } + if (token === '--files') { + const value = String(argv[i + 1] || '').trim(); + if (!value) throw new Error('Missing value for --files.'); + parseCsvFiles(value).forEach((f) => args.files.push(f)); + i += 1; + continue; + } + if (token.startsWith('--files=')) { + const value = token.slice('--files='.length).trim(); + if (!value) throw new Error('Missing value for --files.'); + parseCsvFiles(value).forEach((f) => args.files.push(f)); + continue; + } + + throw new Error(`Unknown 
argument: ${token}`); + } + + args.files = dedupe(args.files.map(resolveInputPath)); + return args; +} + +function parseCsvFiles(value) { + return String(value || '') + .split(',') + .map((entry) => entry.trim()) + .filter(Boolean); +} + +// ── Path helpers ───────────────────────────────────────────────────────────── + +function resolveInputPath(filePath) { + if (!filePath) return ''; + return path.isAbsolute(filePath) ? path.normalize(filePath) : path.resolve(REPO_ROOT, filePath); +} + +function dedupe(values) { + return [...new Set(values.filter(Boolean))]; +} + +function formatDisplayPath(absPath) { + const relative = path.relative(REPO_ROOT, absPath); + if (!relative.startsWith('..') && !path.isAbsolute(relative)) { + return relative.split(path.sep).join('/'); + } + return absPath; +} + +// ── Content extraction ─────────────────────────────────────────────────────── + +/** + * Strip frontmatter and return a normalised body string suitable for hashing + * and similarity comparison. + */ +function extractBody(rawContent) { + const parsed = matter(String(rawContent || '')); + return parsed.content || ''; +} + +/** + * Collapse runs of whitespace, strip MDX import/export lines, JSX tags, and + * HTML comments so similarity is measured on prose content only. + */ +function normalizeBody(body) { + return String(body || '') + .replace(/^(import|export)\s+.*$/gm, '') // MDX import/export + .replace(/<[^>]+>/g, ' ') // JSX/HTML tags + .replace(/\{[^}]*\}/g, ' ') // JSX expressions + .replace(/\s+/g, ' ') + .trim() + .toLowerCase(); +} + +/** + * SHA-256 hash of a string (hex). + */ +function hashString(value) { + return crypto.createHash('sha256').update(String(value || '')).digest('hex'); +} + +// ── Paragraph extraction ───────────────────────────────────────────────────── + +/** + * Split body content into paragraph-like chunks (blank-line separated blocks + * that are not frontmatter, code fences, JSX tags, or import lines). 
/**
 * Replace every line that belongs to a fenced code block (the opening and
 * closing fence lines included) with an empty line.
 *
 * Blanking the lines, rather than deleting them, keeps the prose before and
 * after a fence separated by blank lines, so the paragraph splitter never
 * glues the two neighbours together. A fence that is never closed swallows
 * the rest of the text.
 *
 * @param {string} text - Body text using "\n" line endings.
 * @returns {string} The same text with fenced regions blanked out.
 */
function blankOutFencedCode(text) {
  const output = [];
  let openFence = null; // { char, length } while inside a fence, otherwise null

  for (const line of String(text || '').split('\n')) {
    const match = /^\s*(`{3,}|~{3,})(.*)$/.exec(line);

    if (openFence === null) {
      // A backtick fence cannot have backticks in its info string; a line such
      // as "```code```" is inline code and must be left alone.
      const opensFence = match !== null && !(match[1][0] === '`' && match[2].includes('`'));
      if (opensFence) {
        openFence = { char: match[1][0], length: match[1].length };
        output.push('');
      } else {
        output.push(line);
      }
      continue;
    }

    // A closing fence uses the same character, is at least as long as the
    // opening fence, and carries nothing but whitespace after it.
    const closesFence =
      match !== null &&
      match[1][0] === openFence.char &&
      match[1].length >= openFence.length &&
      match[2].trim() === '';
    if (closesFence) openFence = null;
    output.push('');
  }

  return output.join('\n');
}

/**
 * Split body content into paragraph-like chunks (blocks separated by blank
 * lines) and keep only those that look like prose.
 *
 * Dropped chunks: fenced code (including fences that contain blank lines),
 * chunks that start with a capitalised JSX tag or a `{` expression,
 * import/export statements, single-line headings, chunks that normalise to an
 * empty string, and chunks with fewer than MIN_BLOCK_WORDS words (a constant
 * defined elsewhere in this file).
 *
 * Line endings are normalised first so CRLF files split the same way as LF
 * files, and whitespace-only lines count as blank separators.
 *
 * @param {string} body - Page body with frontmatter already removed.
 * @returns {Array<{text: string, normalizedText: string, hash: string}>}
 *   One entry per retained paragraph; `hash` is hashString(normalizedText).
 */
function extractParagraphs(body) {
  const unixBody = String(body || '').replace(/\r\n?/g, '\n');
  const prose = blankOutFencedCode(unixBody);
  const paragraphs = [];

  for (const chunk of prose.split(/\n\s*\n/)) {
    const trimmed = chunk.trim();
    if (!trimmed) continue;

    // Skip chunks that open with a JSX component tag or an MDX expression
    if (/^<[A-Z]/.test(trimmed) || /^\{/.test(trimmed)) continue;

    // Skip import/export statements
    if (/^(import|export)\s/.test(trimmed)) continue;

    // Skip headings (single-line headings starting with #)
    if (/^#{1,6}\s/.test(trimmed) && !trimmed.includes('\n')) continue;

    const normalized = normalizeBody(trimmed);
    if (!normalized) continue;

    const wordCount = normalized.split(/\s+/).filter(Boolean).length;
    if (wordCount < MIN_BLOCK_WORDS) continue;

    paragraphs.push({
      text: trimmed,
      normalizedText: normalized,
      hash: hashString(normalized)
    });
  }

  return paragraphs;
}

// ── Jaccard similarity ───────────────────────────────────────────────────────

/**
 * Build the set of overlapping word n-grams (shingles) of a string.
 *
 * Words are whitespace-separated tokens. A text with fewer words than
 * `shingleSize` produces an empty set.
 *
 * @param {string} text - Text to shingle (callers pass normalised page text).
 * @param {number} [shingleSize=5] - Number of consecutive words per shingle.
 * @returns {Set<string>} Distinct shingles, each joined with single spaces.
 */
function buildShingles(text, shingleSize = 5) {
  const words = String(text || '').split(/\s+/).filter(Boolean);
  const shingles = new Set();
  const lastStart = words.length - shingleSize;

  for (let start = 0; start <= lastStart; start += 1) {
    shingles.add(words.slice(start, start + shingleSize).join(' '));
  }

  return shingles;
}

/**
 * Jaccard similarity |A ∩ B| / |A ∪ B| of two sets.
 *
 * Two empty sets count as identical (1); exactly one empty set yields 0.
 * The intersection is counted by walking the smaller set and probing the
 * larger one, which keeps the number of lookups minimal.
 *
 * @param {Set<*>} setA
 * @param {Set<*>} setB
 * @returns {number} Similarity in the range [0, 1].
 */
function jaccardSimilarity(setA, setB) {
  if (setA.size === 0 && setB.size === 0) return 1;
  if (setA.size === 0 || setB.size === 0) return 0;

  const aIsSmaller = setA.size <= setB.size;
  const smaller = aIsSmaller ? setA : setB;
  const larger = aIsSmaller ? setB : setA;

  let shared = 0;
  for (const item of smaller) {
    if (larger.has(item)) shared += 1;
  }

  const unionSize = setA.size + setB.size - shared;
  return shared / unionSize;
}

// ── Finding builder ──────────────────────────────────────────────────────────

/**
 * Assemble a finding record with a guaranteed string `evidence` field.
 *
 * @param {{rule: string, files: string[], message: string, evidence?: string}} parts
 * @returns {{rule: string, files: string[], message: string, evidence: string}}
 */
function makeFinding({ rule, files, message, evidence }) {
  return { rule, files, message, evidence: evidence || '' };
}

// ── Core detection ───────────────────────────────────────────────────────────

/**
 * Read every target file and derive the data the detectors work on.
 *
 * A file that cannot be read or parsed does not abort the scan: it is
 * returned as a page record carrying only `absPath`, `displayPath` and a
 * non-empty `error`. Successful pages have `error: ''` plus the raw body, the
 * normalised body, its hash, the extracted paragraphs and the shingle set.
 *
 * The file is read directly instead of being probed with existsSync first, so
 * a file that disappears between the probe and the read cannot slip through.
 * The parse step is guarded on the assumption that extractBody's frontmatter
 * parser throws on malformed frontmatter, so the failure is reported against
 * the offending file.
 *
 * @param {string[]} filePaths - Absolute paths of the files to load.
 * @returns {Array<object>} One page record per input path, in input order.
 */
function loadPages(filePaths) {
  const pages = [];

  for (const absPath of filePaths) {
    const displayPath = formatDisplayPath(absPath);

    let rawContent;
    try {
      rawContent = fs.readFileSync(absPath, 'utf8');
    } catch (err) {
      const isMissing = err && (err.code === 'ENOENT' || err.code === 'ENOTDIR');
      const error = isMissing ? 'File does not exist.' : `Unable to read file: ${err.message}`;
      pages.push({ absPath, displayPath, error });
      continue;
    }

    let body;
    try {
      body = extractBody(rawContent);
    } catch (err) {
      pages.push({ absPath, displayPath, error: `Unable to parse frontmatter: ${err.message}` });
      continue;
    }

    const normalized = normalizeBody(body);

    pages.push({
      absPath,
      displayPath,
      error: '',
      body,
      normalized,
      contentHash: hashString(normalized),
      paragraphs: extractParagraphs(body),
      shingles: buildShingles(normalized)
    });
  }

  return pages;
}
/**
 * Detect pages whose full normalised body content is identical.
 *
 * Pages are bucketed by `contentHash`; every bucket with two or more members
 * becomes one finding listing the members' display paths in sorted order.
 * Pages with an `error` or without a `contentHash` are ignored.
 *
 * NOTE(review): pages whose normalised body is empty all hash to the same
 * value and are therefore reported as one cluster — confirm that is intended.
 *
 * @param {Array<object>} pages - Page records as produced by loadPages.
 * @returns {Array<object>} One RULE_EXACT_DUPLICATE finding per cluster.
 */
function detectExactDuplicates(pages) {
  const clusters = new Map(); // contentHash -> pages sharing that hash

  for (const page of pages) {
    if (page.error || !page.contentHash) continue;

    const members = clusters.get(page.contentHash);
    if (members) {
      members.push(page);
    } else {
      clusters.set(page.contentHash, [page]);
    }
  }

  const findings = [];
  for (const members of clusters.values()) {
    if (members.length < 2) continue;

    const displayPaths = members.map((p) => p.displayPath).sort();
    findings.push(
      makeFinding({
        rule: RULE_EXACT_DUPLICATE,
        files: displayPaths,
        message: `${members.length} pages have identical normalised body content.`,
        evidence: displayPaths.join(', ')
      })
    );
  }

  return findings;
}

/**
 * Detect page pairs with high content overlap (Jaccard similarity of their
 * shingle sets ≥ NEAR_DUPLICATE_THRESHOLD).
 *
 * A pair is skipped only when both pages belong to the SAME exact-duplicate
 * cluster, because that relationship is already reported. Pages from two
 * different clusters are still compared, so near-duplication between
 * clusters is not lost.
 *
 * Pages with an `error` or an empty/missing shingle set are ignored. Every
 * remaining pair is compared, so the cost grows quadratically with the
 * number of pages.
 *
 * @param {Array<object>} pages - Page records as produced by loadPages.
 * @param {Array<{files: string[]}>} exactDuplicateGroups - Findings returned
 *   by detectExactDuplicates for the same pages.
 * @returns {Array<object>} One RULE_NEAR_DUPLICATE finding per matching pair.
 */
function detectNearDuplicates(pages, exactDuplicateGroups) {
  // Map each file to the index of the exact-duplicate cluster it belongs to.
  const clusterOf = new Map();
  (exactDuplicateGroups || []).forEach((group, index) => {
    for (const file of group.files) clusterOf.set(file, index);
  });

  const candidates = pages.filter((p) => !p.error && p.shingles && p.shingles.size > 0);
  const findings = [];

  for (let i = 0; i < candidates.length; i += 1) {
    const a = candidates[i];
    const clusterA = clusterOf.get(a.displayPath);

    for (let j = i + 1; j < candidates.length; j += 1) {
      const b = candidates[j];

      // Already reported together as exact duplicates
      if (clusterA !== undefined && clusterA === clusterOf.get(b.displayPath)) continue;

      const similarity = jaccardSimilarity(a.shingles, b.shingles);
      if (similarity < NEAR_DUPLICATE_THRESHOLD) continue;

      findings.push(
        makeFinding({
          rule: RULE_NEAR_DUPLICATE,
          files: [a.displayPath, b.displayPath].sort(),
          message: `Pages share ${Math.round(similarity * 100)}% content similarity (Jaccard shingle threshold: ${Math.round(NEAR_DUPLICATE_THRESHOLD * 100)}%).`,
          evidence: `Similarity score: ${similarity.toFixed(3)}`
        })
      );
    }
  }

  return findings;
}
verbatim (after normalisation) in + * minBlockPages or more distinct pages. + */ +function detectDuplicateBlocks(pages, minBlockPages) { + const findings = []; + const blockIndex = new Map(); // hash -> { text, pages: Set } + + for (const page of pages) { + if (page.error || !page.paragraphs) continue; + + for (const para of page.paragraphs) { + if (!blockIndex.has(para.hash)) { + blockIndex.set(para.hash, { text: para.text, pages: new Set() }); + } + blockIndex.get(para.hash).pages.add(page.displayPath); + } + } + + for (const [, entry] of blockIndex) { + if (entry.pages.size < minBlockPages) continue; + + const sortedPaths = [...entry.pages].sort(); + const snippet = entry.text.slice(0, MAX_EVIDENCE_SNIPPET_LENGTH).replace(/\s+/g, ' '); + const ellipsis = entry.text.length > MAX_EVIDENCE_SNIPPET_LENGTH ? '…' : ''; + + findings.push( + makeFinding({ + rule: RULE_DUPLICATE_BLOCK, + files: sortedPaths, + message: `A paragraph block appears verbatim in ${entry.pages.size} pages.`, + evidence: `"${snippet}${ellipsis}"` + }) + ); + } + + return findings; +} + +// ── Runner ─────────────────────────────────────────────────────────────────── + +function getDefaultTargets() { + return getMdxFiles(REPO_ROOT) + .filter((filePath) => filePath.endsWith('.mdx')) + .map((filePath) => path.resolve(filePath)); +} + +function run(options = {}) { + const explicitFiles = Array.isArray(options.files) ? options.files : []; + const targets = explicitFiles.length > 0 ? dedupe(explicitFiles.map(resolveInputPath)) : getDefaultTargets(); + const minBlockPages = Number.isFinite(options.minBlockPages) ? 
options.minBlockPages : DEFAULT_MIN_BLOCK_PAGES; + + const pages = loadPages(targets); + const errors = pages.filter((p) => p.error); + + const exactFindings = detectExactDuplicates(pages); + const nearFindings = detectNearDuplicates(pages, exactFindings); + const blockFindings = detectDuplicateBlocks(pages, minBlockPages); + + const findings = [...exactFindings, ...nearFindings, ...blockFindings]; + + const passed = errors.length === 0 && exactFindings.length === 0 && + (options.strict ? findings.length === 0 : true); + + return { + scanned: targets.length, + pages, + errors, + findings, + exactFindings, + nearFindings, + blockFindings, + passed + }; +} + +// ── Output ─────────────────────────────────────────────────────────────────── + +function printResults(summary, options = {}) { + if (options.json) { + process.stdout.write( + JSON.stringify( + { + scanned: summary.scanned, + errors: summary.errors.map((p) => ({ file: p.displayPath, error: p.error })), + findings: summary.findings + }, + null, + 2 + ) + ); + process.stdout.write('\n'); + return; + } + + for (const page of summary.pages) { + if (page.error) { + console.error(`${page.displayPath}:1 [error] ${page.error}`); + } + } + + const grouped = { + [RULE_EXACT_DUPLICATE]: summary.exactFindings, + [RULE_NEAR_DUPLICATE]: summary.nearFindings, + [RULE_DUPLICATE_BLOCK]: summary.blockFindings + }; + + for (const [rule, ruleFindings] of Object.entries(grouped)) { + if (ruleFindings.length === 0) continue; + + for (const finding of ruleFindings) { + const fileList = finding.files.join(', '); + console.error(`[${rule}] ${finding.message}`); + console.error(` Files: ${fileList}`); + if (finding.evidence) console.error(` Evidence: ${finding.evidence}`); + } + } + + const exactCount = summary.exactFindings.length; + const nearCount = summary.nearFindings.length; + const blockCount = summary.blockFindings.length; + const errorCount = summary.errors.length; + + const message = + `Scanned ${summary.scanned} file(s); ` 
+ + `${errorCount} error(s); ` + + `${exactCount} exact-duplicate page cluster(s); ` + + `${nearCount} near-duplicate page pair(s); ` + + `${blockCount} duplicate block(s).`; + + if (summary.passed) { + console.log(`✅ ${message}`); + } else { + console.error(`❌ ${message}`); + } +} + +// ── Entry point ────────────────────────────────────────────────────────────── + +if (require.main === module) { + try { + const args = parseArgs(process.argv.slice(2)); + if (args.help) { + printHelp(); + process.exit(0); + } + + const summary = run(args); + printResults(summary, args); + process.exit(summary.passed ? 0 : 1); + } catch (error) { + console.error(`❌ ${error.message}`); + process.exit(1); + } +} + +module.exports = { + run, + detectExactDuplicates, + detectNearDuplicates, + detectDuplicateBlocks, + extractBody, + normalizeBody, + extractParagraphs, + buildShingles, + jaccardSimilarity, + hashString +};