From 8b2128c1d2dbf4b5e99a637487d86b3a47b00f7d Mon Sep 17 00:00:00 2001 From: yushijie_27 Date: Thu, 4 Jun 2026 15:00:23 +0800 Subject: [PATCH 01/18] feat: add crawler-based in-site link and broken image checking --- .github/workflows/link-check-cron-crawler.yml | 17 +++++++ .github/workflows/link-check-pr-crawler.yml | 44 +++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 .github/workflows/link-check-cron-crawler.yml create mode 100644 .github/workflows/link-check-pr-crawler.yml diff --git a/.github/workflows/link-check-cron-crawler.yml b/.github/workflows/link-check-cron-crawler.yml new file mode 100644 index 0000000000000..c9da9df8ddfde --- /dev/null +++ b/.github/workflows/link-check-cron-crawler.yml @@ -0,0 +1,17 @@ +name: link-check-cron-crawler + +on: + schedule: + - cron: '0 2 * * *' + workflow_dispatch: + +permissions: + contents: read + +jobs: + cron-crawler: + runs-on: ubuntu-latest + steps: + - name: Run Crawler Link Checker + run: | + npx linkinator https://doris.apache.org --recurse --check-images --skip "^(?!https?://(www\.)?doris\.apache\.org)" diff --git a/.github/workflows/link-check-pr-crawler.yml b/.github/workflows/link-check-pr-crawler.yml new file mode 100644 index 0000000000000..60b4c53ee9877 --- /dev/null +++ b/.github/workflows/link-check-pr-crawler.yml @@ -0,0 +1,44 @@ +name: link-check-pr-crawler + +on: + pull_request: + paths: + - 'docs/**' + - 'i18n/**' + - 'src/**' + - 'static/**' + - 'docusaurus.config.js' + - 'sidebars.ts' + +concurrency: + group: link-check-pr-crawler-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + pr-crawler: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Use Node.js + uses: actions/setup-node@v4 + with: + node-version: 20 + cache: 'yarn' + + - name: Install dependencies + run: yarn install --frozen-lockfile + + - name: Build website + run: yarn build + + - 
name: Serve build folder and scan for 404s/broken images + run: | + npx serve build -l 3000 & + # Wait for server to start + sleep 5 + npx linkinator http://localhost:3000 --recurse --check-images --skip "^(?!http://localhost:3000)" From dfd5831e2015f1d15fd34f80d9346ced339b6b22 Mon Sep 17 00:00:00 2001 From: yushijie_27 Date: Thu, 4 Jun 2026 17:38:28 +0800 Subject: [PATCH 02/18] feat: add crawler-based in-site link and image checker workflows --- .github/workflows/link-check-cron-crawler.yml | 2 +- .github/workflows/link-check-pr-crawler.yml | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/link-check-cron-crawler.yml b/.github/workflows/link-check-cron-crawler.yml index c9da9df8ddfde..c3b5053b46333 100644 --- a/.github/workflows/link-check-cron-crawler.yml +++ b/.github/workflows/link-check-cron-crawler.yml @@ -14,4 +14,4 @@ jobs: steps: - name: Run Crawler Link Checker run: | - npx linkinator https://doris.apache.org --recurse --check-images --skip "^(?!https?://(www\.)?doris\.apache\.org)" + npx -y linkinator https://doris.apache.org --recurse --check-images --skip "^(?!https?://(www\.)?doris\.apache\.org)" diff --git a/.github/workflows/link-check-pr-crawler.yml b/.github/workflows/link-check-pr-crawler.yml index 60b4c53ee9877..3bbe63e4a62bf 100644 --- a/.github/workflows/link-check-pr-crawler.yml +++ b/.github/workflows/link-check-pr-crawler.yml @@ -4,11 +4,14 @@ on: pull_request: paths: - 'docs/**' + - 'versioned_docs/**' - 'i18n/**' - 'src/**' - 'static/**' - 'docusaurus.config.js' - 'sidebars.ts' + - 'versioned_sidebars/**' + - 'versions.json' concurrency: group: link-check-pr-crawler-${{ github.event.pull_request.number || github.ref }} @@ -38,7 +41,7 @@ jobs: - name: Serve build folder and scan for 404s/broken images run: | - npx serve build -l 3000 & + npx -y serve build -l 3000 & # Wait for server to start sleep 5 - npx linkinator http://localhost:3000 --recurse --check-images --skip "^(?!http://localhost:3000)" 
+ npx -y linkinator http://localhost:3000 --recurse --check-images --skip "^(?!http://localhost:3000)" From 864b50d5ff36aa35651f26329c62b4b537fb6329 Mon Sep 17 00:00:00 2001 From: yushijie_27 Date: Mon, 8 Jun 2026 19:01:07 +0800 Subject: [PATCH 03/18] feat(link-check): disable PR-level check and add Feishu notification on cron failure --- .github/workflows/link-check-cron-crawler.yml | 9 +- .github/workflows/link-check-pr-crawler.yml | 47 ------ .github/workflows/scripts/feishu-reporter.js | 143 ++++++++++++++++++ 3 files changed, 151 insertions(+), 48 deletions(-) delete mode 100644 .github/workflows/link-check-pr-crawler.yml create mode 100644 .github/workflows/scripts/feishu-reporter.js diff --git a/.github/workflows/link-check-cron-crawler.yml b/.github/workflows/link-check-cron-crawler.yml index c3b5053b46333..ca2f02fa26e34 100644 --- a/.github/workflows/link-check-cron-crawler.yml +++ b/.github/workflows/link-check-cron-crawler.yml @@ -14,4 +14,11 @@ jobs: steps: - name: Run Crawler Link Checker run: | - npx -y linkinator https://doris.apache.org --recurse --check-images --skip "^(?!https?://(www\.)?doris\.apache\.org)" + npx -y linkinator https://doris.apache.org --recurse --check-images --skip "^(?!https?://(www\.)?doris\.apache\.org)" --format json > link_results.json || echo "HAS_BROKEN=true" + + - name: Send Feishu Notification on failure + if: always() + env: + FEISHU_WEBHOOK: ${{ secrets.FEISHU_WEBHOOK_URL }} + run: | + node .github/workflows/scripts/feishu-reporter.js diff --git a/.github/workflows/link-check-pr-crawler.yml b/.github/workflows/link-check-pr-crawler.yml deleted file mode 100644 index 3bbe63e4a62bf..0000000000000 --- a/.github/workflows/link-check-pr-crawler.yml +++ /dev/null @@ -1,47 +0,0 @@ -name: link-check-pr-crawler - -on: - pull_request: - paths: - - 'docs/**' - - 'versioned_docs/**' - - 'i18n/**' - - 'src/**' - - 'static/**' - - 'docusaurus.config.js' - - 'sidebars.ts' - - 'versioned_sidebars/**' - - 'versions.json' - 
-concurrency: - group: link-check-pr-crawler-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - pr-crawler: - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Use Node.js - uses: actions/setup-node@v4 - with: - node-version: 20 - cache: 'yarn' - - - name: Install dependencies - run: yarn install --frozen-lockfile - - - name: Build website - run: yarn build - - - name: Serve build folder and scan for 404s/broken images - run: | - npx -y serve build -l 3000 & - # Wait for server to start - sleep 5 - npx -y linkinator http://localhost:3000 --recurse --check-images --skip "^(?!http://localhost:3000)" diff --git a/.github/workflows/scripts/feishu-reporter.js b/.github/workflows/scripts/feishu-reporter.js new file mode 100644 index 0000000000000..b324553a4419f --- /dev/null +++ b/.github/workflows/scripts/feishu-reporter.js @@ -0,0 +1,143 @@ +const fs = require('fs'); +const path = require('path'); + +const WEBHOOK_URL = process.env.FEISHU_WEBHOOK; +if (!WEBHOOK_URL) { + console.error('Error: FEISHU_WEBHOOK environment variable is not set.'); + process.exit(1); +} + +const resultsPath = path.join(process.cwd(), 'link_results.json'); +if (!fs.existsSync(resultsPath)) { + console.error('Error: link_results.json not found.'); + process.exit(1); +} + +let data; +try { + data = JSON.parse(fs.readFileSync(resultsPath, 'utf8')); +} catch (err) { + console.error('Error parsing link_results.json:', err.message); + process.exit(1); +} + +const brokenLinks = (data.links || []).filter(link => !link.success); + +if (brokenLinks.length === 0) { + console.log('No broken links found. 
Exiting with success.'); + process.exit(0); +} + +console.log(`Found ${brokenLinks.length} broken links.`); + +// Gather env variables from GitHub Actions +const repoName = process.env.GITHUB_REPOSITORY || 'doris-website'; +const runId = process.env.GITHUB_RUN_ID || ''; +const serverUrl = process.env.GITHUB_SERVER_URL || 'https://github.com'; +const runUrl = runId ? `${serverUrl}/${repoName}/actions/runs/${runId}` : ''; +const prNumber = process.env.GITHUB_EVENT_NAME === 'pull_request' ? (process.env.GITHUB_REF_NAME ? process.env.GITHUB_REF_NAME.split('/')[0] : '') : ''; +const actor = process.env.GITHUB_ACTOR || 'system'; +const eventName = process.env.GITHUB_EVENT_NAME === 'schedule' ? '每日例行巡检' : 'PR 提交'; + +// Format broken links to Markdown +const limit = 10; +const displayedBroken = brokenLinks.slice(0, limit); +const brokenListMd = displayedBroken.map((link, idx) => { + // Strip domain prefix for cleaner log + const cleanUrl = link.url.replace(/^https?:\/\/(www\.)?doris\.apache\.org/, ''); + const cleanParent = link.parent ? link.parent.replace(/^https?:\/\/(www\.)?doris\.apache\.org/, '') : 'Unknown'; + return `${idx + 1}. ❌ **[${link.status || 'Broken'}]** ${cleanUrl}\n 🔍 引用源文件: \`${cleanParent}\``; +}).join('\n'); + +const totalText = brokenLinks.length > limit ? `\n\n...以及其他 ${brokenLinks.length - limit} 个死链,请点击下方按钮查看完整日志。` : ''; + +// Construct Feishu Card Payload +const payload = { + msg_type: 'interactive', + card: { + header: { + template: 'red', + title: { + tag: 'plain_text', + content: `⚠️ 链接扫描失败警告 | ${repoName.split('/')[1] || repoName}` + } + }, + elements: [ + { + tag: 'div', + text: { + tag: 'lark_md', + content: `**触发场景**: ${eventName}\n**提交人**: @${actor}${prNumber ? `\n**PR号**: #${prNumber}` : ''}\n**总死链数**: **${brokenLinks.length}** 个` + } + }, + { + tag: 'hr' + }, + { + tag: 'div', + text: { + tag: 'lark_md', + content: `**检测到死链列表 (最多展示 ${limit} 条):**\n${brokenListMd}${totalText}` + } + }, + runUrl ? 
{ + tag: 'action', + actions: [ + { + tag: 'button', + text: { + tag: 'plain_text', + content: '查看 GitHub Actions 完整日志' + }, + type: 'primary', + url: runUrl + } + ] + } : null + ].filter(Boolean) + } +}; + +// Send to Feishu Webhook +const payloadStr = JSON.stringify(payload, null, 2); +if (process.env.DEBUG === 'true') { + console.log('--- Generated Feishu Card Payload ---'); + console.log(payloadStr); + console.log('------------------------------------'); +} + +const urlObj = new URL(WEBHOOK_URL); +const protocol = urlObj.protocol === 'https:' ? require('https') : require('http'); + +const options = { + hostname: urlObj.hostname, + port: urlObj.port, + path: urlObj.pathname + urlObj.search, + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'Content-Length': Buffer.byteLength(payloadStr) + } +}; + +console.log('Sending alert to Feishu...'); +const req = protocol.request(options, (res) => { + let body = ''; + res.setEncoding('utf8'); + res.on('data', (chunk) => { body += chunk; }); + res.on('end', () => { + console.log(`Feishu response status: ${res.statusCode}`); + console.log(`Feishu response body: ${body}`); + // Exit with 1 to indicate step failure to GitHub + process.exit(1); + }); +}); + +req.on('error', (e) => { + console.error(`Problem sending request to Feishu: ${e.message}`); + // Still exit with 1 to fail the check + process.exit(1); +}); + +req.write(payloadStr); +req.end(); From 921c343a6fae56dfa49f0bcfc51ad730451f5190 Mon Sep 17 00:00:00 2001 From: yushijie_27 Date: Tue, 9 Jun 2026 11:39:33 +0800 Subject: [PATCH 04/18] refactor(link-check): enhance Feishu report format with GITHUB_STEP_SUMMARY tables --- .github/workflows/scripts/feishu-reporter.js | 96 +++++++++++++++++--- 1 file changed, 84 insertions(+), 12 deletions(-) diff --git a/.github/workflows/scripts/feishu-reporter.js b/.github/workflows/scripts/feishu-reporter.js index b324553a4419f..ed7e9b42f8c38 100644 --- a/.github/workflows/scripts/feishu-reporter.js +++ 
b/.github/workflows/scripts/feishu-reporter.js @@ -23,6 +23,84 @@ try { const brokenLinks = (data.links || []).filter(link => !link.success); +function resolveSourceFile(cleanParent) { + if (!cleanParent || cleanParent === 'Unknown' || cleanParent === '/') { + return 'Unknown'; + } + const relativePath = cleanParent.replace(/^\/|\/$/g, ''); + const candidates = [ + relativePath + '.md', + relativePath + '.mdx', + relativePath + '/index.md', + relativePath + '/index.mdx', + 'docs/' + relativePath + '.md', + 'docs/' + relativePath + '.mdx', + 'docs/' + relativePath + '/index.md', + 'docs/' + relativePath + '/index.mdx', + ]; + + for (const cand of candidates) { + if (fs.existsSync(path.join(process.cwd(), cand))) { + return cand; + } + } + return relativePath; +} + +function writeStepSummary(brokenLinks) { + const summaryFile = process.env.GITHUB_STEP_SUMMARY; + if (!summaryFile) return; + + const repoName = process.env.GITHUB_REPOSITORY || 'doris-website'; + const commitSha = process.env.GITHUB_SHA || 'master'; + const serverUrl = process.env.GITHUB_SERVER_URL || 'https://github.com'; + const eventName = process.env.GITHUB_EVENT_NAME === 'schedule' ? 'Schedule (凌晨例行巡检)' : (process.env.GITHUB_EVENT_NAME || 'manual'); + const branchName = process.env.GITHUB_REF_NAME || 'master'; + + let markdown = `# 🔍 链路检测排错报告 (Link Checker Report)\n\n`; + + if (brokenLinks.length > 0) { + markdown += `> [!WARNING]\n`; + markdown += `> **本次检测共发现 ${brokenLinks.length} 处失效链接!** 请开发人员点击下表中【引用源文件】中的蓝色链接,直接跳转到 GitHub 代码行进行修复。\n\n`; + } else { + markdown += `> [!NOTE]\n`; + markdown += `> **本次检测未发现失效链接。** 链路状态良好!\n\n`; + } + + markdown += `| 📌 引用源文件 (Where Referenced) | 🔗 失效链接 (Broken Link) | ❌ 错误原因 (Error Reason) |\n`; + markdown += `| :--- | :--- | :--- |\n`; + + if (brokenLinks.length === 0) { + markdown += `| - | 无失效链接 | - |\n`; + } else { + for (const link of brokenLinks) { + const cleanUrl = link.url.replace(/^http:\/\/localhost:\d+/, ''); + const cleanParent = link.parent ? 
link.parent.replace(/^http:\/\/localhost:\d+/, '') : 'Unknown'; + const resolvedFile = resolveSourceFile(cleanParent); + const fileLink = resolvedFile !== 'Unknown' ? `${serverUrl}/${repoName}/blob/${commitSha}/${resolvedFile}` : ''; + const fileDisplay = resolvedFile !== 'Unknown' ? resolvedFile : 'Unknown'; + + const fileCol = fileLink ? `[\`${fileDisplay}\`](${fileLink})` : `\`${fileDisplay}\``; + markdown += `| ${fileCol} | \`${cleanUrl}\` | \`${link.status || 'Broken'}\` |\n`; + } + } + + markdown += `\n---\n`; + markdown += `**📊 运行元信息:**\n`; + markdown += `* **检测分支**: \`${branchName}\`\n`; + markdown += `* **触发类型**: \`${eventName}\`\n`; + markdown += `* **检测时间**: \`${new Date().toISOString().replace('T', ' ').substring(0, 19)} (UTC)\`\n`; + + try { + fs.appendFileSync(summaryFile, markdown); + } catch (err) { + console.error('Failed to write GITHUB_STEP_SUMMARY:', err.message); + } +} + +// Write Step Summary in all cases +writeStepSummary(brokenLinks); + if (brokenLinks.length === 0) { console.log('No broken links found. Exiting with success.'); process.exit(0); @@ -30,7 +108,6 @@ if (brokenLinks.length === 0) { console.log(`Found ${brokenLinks.length} broken links.`); -// Gather env variables from GitHub Actions const repoName = process.env.GITHUB_REPOSITORY || 'doris-website'; const runId = process.env.GITHUB_RUN_ID || ''; const serverUrl = process.env.GITHUB_SERVER_URL || 'https://github.com'; @@ -39,19 +116,17 @@ const prNumber = process.env.GITHUB_EVENT_NAME === 'pull_request' ? (process.env const actor = process.env.GITHUB_ACTOR || 'system'; const eventName = process.env.GITHUB_EVENT_NAME === 'schedule' ? '每日例行巡检' : 'PR 提交'; -// Format broken links to Markdown const limit = 10; const displayedBroken = brokenLinks.slice(0, limit); const brokenListMd = displayedBroken.map((link, idx) => { - // Strip domain prefix for cleaner log - const cleanUrl = link.url.replace(/^https?:\/\/(www\.)?doris\.apache\.org/, ''); - const cleanParent = link.parent ? 
link.parent.replace(/^https?:\/\/(www\.)?doris\.apache\.org/, '') : 'Unknown'; - return `${idx + 1}. ❌ **[${link.status || 'Broken'}]** ${cleanUrl}\n 🔍 引用源文件: \`${cleanParent}\``; + const cleanUrl = link.url.replace(/^http:\/\/localhost:\d+/, ''); + const cleanParent = link.parent ? link.parent.replace(/^http:\/\/localhost:\d+/, '') : 'Unknown'; + const resolvedFile = resolveSourceFile(cleanParent); + return `${idx + 1}. ❌ **[${link.status || 'Broken'}]** ${cleanUrl}\n 🔍 引用源文件: \`${resolvedFile}\``; }).join('\n'); -const totalText = brokenLinks.length > limit ? `\n\n...以及其他 ${brokenLinks.length - limit} 个死链,请点击下方按钮查看完整日志。` : ''; +const totalText = brokenLinks.length > limit ? `\n\n...以及其他 ${brokenLinks.length - limit} 个死链,请点击下方按钮查看完整排错报告。` : ''; -// Construct Feishu Card Payload const payload = { msg_type: 'interactive', card: { @@ -87,7 +162,7 @@ const payload = { tag: 'button', text: { tag: 'plain_text', - content: '查看 GitHub Actions 完整日志' + content: '查看详细排错报告' }, type: 'primary', url: runUrl @@ -98,7 +173,6 @@ const payload = { } }; -// Send to Feishu Webhook const payloadStr = JSON.stringify(payload, null, 2); if (process.env.DEBUG === 'true') { console.log('--- Generated Feishu Card Payload ---'); @@ -128,14 +202,12 @@ const req = protocol.request(options, (res) => { res.on('end', () => { console.log(`Feishu response status: ${res.statusCode}`); console.log(`Feishu response body: ${body}`); - // Exit with 1 to indicate step failure to GitHub process.exit(1); }); }); req.on('error', (e) => { console.error(`Problem sending request to Feishu: ${e.message}`); - // Still exit with 1 to fail the check process.exit(1); }); From 20705b69fe3abf3149d0e5af853650941636ea63 Mon Sep 17 00:00:00 2001 From: yushijie_27 Date: Tue, 9 Jun 2026 12:33:52 +0800 Subject: [PATCH 05/18] refactor(link-check): upgrade Feishu reporter with grouping, cross-repo docs mapping, and error categorization --- .github/workflows/scripts/feishu-reporter.js | 161 +++++++++++++------ 1 file changed, 
108 insertions(+), 53 deletions(-) diff --git a/.github/workflows/scripts/feishu-reporter.js b/.github/workflows/scripts/feishu-reporter.js index ed7e9b42f8c38..e5c7c7ff1e6c2 100644 --- a/.github/workflows/scripts/feishu-reporter.js +++ b/.github/workflows/scripts/feishu-reporter.js @@ -9,8 +9,8 @@ if (!WEBHOOK_URL) { const resultsPath = path.join(process.cwd(), 'link_results.json'); if (!fs.existsSync(resultsPath)) { - console.error('Error: link_results.json not found.'); - process.exit(1); + console.log('link_results.json not found. This is likely due to a build or copy docs failure in an earlier step. Exiting gracefully.'); + process.exit(0); } let data; @@ -18,16 +18,29 @@ try { data = JSON.parse(fs.readFileSync(resultsPath, 'utf8')); } catch (err) { console.error('Error parsing link_results.json:', err.message); - process.exit(1); + process.exit(0); } const brokenLinks = (data.links || []).filter(link => !link.success); +// Resolve repo name to determine specific doc mappings +const repoName = process.env.GITHUB_REPOSITORY || 'doris-website'; +const commitSha = process.env.GITHUB_SHA || 'master'; +const serverUrl = process.env.GITHUB_SERVER_URL || 'https://github.com'; +const runId = process.env.GITHUB_RUN_ID || ''; +const runUrl = runId ? `${serverUrl}/${repoName}/actions/runs/${runId}` : ''; +const prNumber = process.env.GITHUB_EVENT_NAME === 'pull_request' ? (process.env.GITHUB_REF_NAME ? process.env.GITHUB_REF_NAME.split('/')[0] : '') : ''; +const actor = process.env.GITHUB_ACTOR || 'system'; +const eventName = process.env.GITHUB_EVENT_NAME === 'schedule' ? 
'每日例行巡检' : 'PR 提交'; +const branchName = process.env.GITHUB_REF_NAME || 'master'; + function resolveSourceFile(cleanParent) { if (!cleanParent || cleanParent === 'Unknown' || cleanParent === '/') { - return 'Unknown'; + return { file: 'Unknown', link: '' }; } - const relativePath = cleanParent.replace(/^\/|\/$/g, ''); + const relativePath = decodeURIComponent(cleanParent.replace(/^\/|\/$/g, '')); + + // Default: look up in the current repository const candidates = [ relativePath + '.md', relativePath + '.mdx', @@ -41,47 +54,91 @@ function resolveSourceFile(cleanParent) { for (const cand of candidates) { if (fs.existsSync(path.join(process.cwd(), cand))) { - return cand; + return { + file: cand, + link: `${serverUrl}/${repoName}/blob/${commitSha}/${cand}` + }; } } - return relativePath; + + return { + file: relativePath, + link: `${serverUrl}/${repoName}/blob/${commitSha}/${relativePath}` + }; +} + +// Process and group broken links by URL +const urlMap = new Map(); +let cnt404 = 0; +let cntAnchor = 0; +let cntTimeout = 0; +let cntOther = 0; + +for (const link of brokenLinks) { + const cleanUrl = link.url.replace(/^http:\/\/localhost:\d+/, ''); + const cleanParent = link.parent ? link.parent.replace(/^http:\/\/localhost:\d+/, '') : 'Unknown'; + + let errorReason = link.status ? `HTTP ${link.status}` : 'Connection Error'; + if (link.status === 200 && link.url.includes('#')) { + const hashMatch = link.url.match(/#.*/); + const hash = hashMatch ? 
hashMatch[0] : ''; + errorReason = `200 (锚点 ${hash} 未找到)`; + cntAnchor++; + } else if (link.status === 404) { + cnt404++; + } else if (!link.status || link.status === 0) { + errorReason = 'Timeout / Network Error'; + cntTimeout++; + } else { + cntOther++; + } + + const { file: resolvedFile, link: fileLink } = resolveSourceFile(cleanParent); + + if (!urlMap.has(cleanUrl)) { + urlMap.set(cleanUrl, { + url: cleanUrl, + errorReason, + references: [] + }); + } + urlMap.get(cleanUrl).references.push({ + file: resolvedFile, + link: fileLink + }); } -function writeStepSummary(brokenLinks) { +const uniqueBrokenLinks = [...urlMap.values()]; + +function writeStepSummary() { const summaryFile = process.env.GITHUB_STEP_SUMMARY; if (!summaryFile) return; - const repoName = process.env.GITHUB_REPOSITORY || 'doris-website'; - const commitSha = process.env.GITHUB_SHA || 'master'; - const serverUrl = process.env.GITHUB_SERVER_URL || 'https://github.com'; - const eventName = process.env.GITHUB_EVENT_NAME === 'schedule' ? 
'Schedule (凌晨例行巡检)' : (process.env.GITHUB_EVENT_NAME || 'manual'); - const branchName = process.env.GITHUB_REF_NAME || 'master'; - let markdown = `# 🔍 链路检测排错报告 (Link Checker Report)\n\n`; if (brokenLinks.length > 0) { markdown += `> [!WARNING]\n`; - markdown += `> **本次检测共发现 ${brokenLinks.length} 处失效链接!** 请开发人员点击下表中【引用源文件】中的蓝色链接,直接跳转到 GitHub 代码行进行修复。\n\n`; + markdown += `> **本次定时巡检共发现 ${brokenLinks.length} 处失效链接!** 请开发人员点击下表【引用源文件】中的链接,直接跳转到对应的 GitHub 源码行进行修复。\n\n`; } else { markdown += `> [!NOTE]\n`; - markdown += `> **本次检测未发现失效链接。** 链路状态良好!\n\n`; + markdown += `> **本次定时巡检未发现失效链接。** 链路状态良好!\n\n`; } - markdown += `| 📌 引用源文件 (Where Referenced) | 🔗 失效链接 (Broken Link) | ❌ 错误原因 (Error Reason) |\n`; + markdown += `| 🔗 失效链接 (Broken Link) | ❌ 错误原因 (Error Reason) | 📌 引用源文件 (Where Referenced) |\n`; markdown += `| :--- | :--- | :--- |\n`; - if (brokenLinks.length === 0) { + if (uniqueBrokenLinks.length === 0) { markdown += `| - | 无失效链接 | - |\n`; } else { - for (const link of brokenLinks) { - const cleanUrl = link.url.replace(/^http:\/\/localhost:\d+/, ''); - const cleanParent = link.parent ? link.parent.replace(/^http:\/\/localhost:\d+/, '') : 'Unknown'; - const resolvedFile = resolveSourceFile(cleanParent); - const fileLink = resolvedFile !== 'Unknown' ? `${serverUrl}/${repoName}/blob/${commitSha}/${resolvedFile}` : ''; - const fileDisplay = resolvedFile !== 'Unknown' ? resolvedFile : 'Unknown'; - - const fileCol = fileLink ? `[\`${fileDisplay}\`](${fileLink})` : `\`${fileDisplay}\``; - markdown += `| ${fileCol} | \`${cleanUrl}\` | \`${link.status || 'Broken'}\` |\n`; + for (const item of uniqueBrokenLinks) { + const refLinks = item.references.map(ref => { + if (ref.link) { + return `[\`${ref.file}\`](${ref.link})`; + } + return `\`${ref.file}\``; + }).join('
'); + + markdown += `| \`${item.url}\` | \`${item.errorReason}\` | ${refLinks} |\n`; } } @@ -98,34 +155,25 @@ function writeStepSummary(brokenLinks) { } } -// Write Step Summary in all cases -writeStepSummary(brokenLinks); +// Write Step Summary +writeStepSummary(); if (brokenLinks.length === 0) { console.log('No broken links found. Exiting with success.'); process.exit(0); } -console.log(`Found ${brokenLinks.length} broken links.`); - -const repoName = process.env.GITHUB_REPOSITORY || 'doris-website'; -const runId = process.env.GITHUB_RUN_ID || ''; -const serverUrl = process.env.GITHUB_SERVER_URL || 'https://github.com'; -const runUrl = runId ? `${serverUrl}/${repoName}/actions/runs/${runId}` : ''; -const prNumber = process.env.GITHUB_EVENT_NAME === 'pull_request' ? (process.env.GITHUB_REF_NAME ? process.env.GITHUB_REF_NAME.split('/')[0] : '') : ''; -const actor = process.env.GITHUB_ACTOR || 'system'; -const eventName = process.env.GITHUB_EVENT_NAME === 'schedule' ? '每日例行巡检' : 'PR 提交'; +console.log(`Found ${brokenLinks.length} broken links. Sending Feishu notification...`); +// Format broken links summary for Feishu const limit = 10; -const displayedBroken = brokenLinks.slice(0, limit); -const brokenListMd = displayedBroken.map((link, idx) => { - const cleanUrl = link.url.replace(/^http:\/\/localhost:\d+/, ''); - const cleanParent = link.parent ? link.parent.replace(/^http:\/\/localhost:\d+/, '') : 'Unknown'; - const resolvedFile = resolveSourceFile(cleanParent); - return `${idx + 1}. ❌ **[${link.status || 'Broken'}]** ${cleanUrl}\n 🔍 引用源文件: \`${resolvedFile}\``; +const displayedBroken = uniqueBrokenLinks.slice(0, limit); +const brokenListMd = displayedBroken.map((item, idx) => { + const refsText = item.references.slice(0, 3).map(r => r.file).join(', ') + (item.references.length > 3 ? '等' : ''); + return `${idx + 1}. ❌ **[${item.errorReason}]** ${item.url}\n 🔍 引用源: \`${refsText}\``; }).join('\n'); -const totalText = brokenLinks.length > limit ? 
`\n\n...以及其他 ${brokenLinks.length - limit} 个死链,请点击下方按钮查看完整排错报告。` : ''; +const totalText = uniqueBrokenLinks.length > limit ? `\n\n...以及其他 ${uniqueBrokenLinks.length - limit} 个失效链接,请点击下方按钮查看完整排错报告。` : ''; const payload = { msg_type: 'interactive', @@ -142,7 +190,20 @@ const payload = { tag: 'div', text: { tag: 'lark_md', - content: `**触发场景**: ${eventName}\n**提交人**: @${actor}${prNumber ? `\n**PR号**: #${prNumber}` : ''}\n**总死链数**: **${brokenLinks.length}** 个` + content: `**触发场景**: ${eventName}\n**提交人**: @${actor}${prNumber ? `\n**PR号**: #${prNumber}` : ''}\n**总失效链接数**: **${brokenLinks.length}** 个` + } + }, + { + tag: 'hr' + }, + { + tag: 'div', + text: { + tag: 'lark_md', + content: `**📊 死链分类统计 (Classification):**\n` + + `• 🔴 **页面未找到 (404)**: **${cnt404}** 个\n` + + `• 🟡 **锚点失效 (Anchor)**: **${cntAnchor}** 个\n` + + `• 🔵 **网络超时/其他**: **${cntTimeout + cntOther}** 个` } }, { @@ -152,7 +213,7 @@ const payload = { tag: 'div', text: { tag: 'lark_md', - content: `**检测到死链列表 (最多展示 ${limit} 条):**\n${brokenListMd}${totalText}` + content: `**检测到失效链接列表 (最多展示 ${limit} 个):**\n${brokenListMd}${totalText}` } }, runUrl ? { @@ -174,11 +235,6 @@ const payload = { }; const payloadStr = JSON.stringify(payload, null, 2); -if (process.env.DEBUG === 'true') { - console.log('--- Generated Feishu Card Payload ---'); - console.log(payloadStr); - console.log('------------------------------------'); -} const urlObj = new URL(WEBHOOK_URL); const protocol = urlObj.protocol === 'https:' ? 
require('https') : require('http'); @@ -201,7 +257,6 @@ const req = protocol.request(options, (res) => { res.on('data', (chunk) => { body += chunk; }); res.on('end', () => { console.log(`Feishu response status: ${res.statusCode}`); - console.log(`Feishu response body: ${body}`); process.exit(1); }); }); From 81c0a23d05895a746c5b60274b614bc2b52e9619 Mon Sep 17 00:00:00 2001 From: yushijie_27 Date: Tue, 9 Jun 2026 13:00:10 +0800 Subject: [PATCH 06/18] chore: optimize link checker report resolveSourceFile --- .github/workflows/scripts/feishu-reporter.js | 58 +++++++++++++++----- 1 file changed, 45 insertions(+), 13 deletions(-) diff --git a/.github/workflows/scripts/feishu-reporter.js b/.github/workflows/scripts/feishu-reporter.js index e5c7c7ff1e6c2..37e30f224bddc 100644 --- a/.github/workflows/scripts/feishu-reporter.js +++ b/.github/workflows/scripts/feishu-reporter.js @@ -36,20 +36,37 @@ const branchName = process.env.GITHUB_REF_NAME || 'master'; function resolveSourceFile(cleanParent) { if (!cleanParent || cleanParent === 'Unknown' || cleanParent === '/') { - return { file: 'Unknown', link: '' }; + return { + file: 'src/pages/index.tsx (首页)', + link: `${serverUrl}/${repoName}/blob/${commitSha}/src/pages/index.tsx` + }; + } + + // 1. Remove query parameters and hash fragments + const pathOnly = cleanParent.split(/[?#]/)[0]; + const decodedPath = decodeURIComponent(pathOnly.replace(/^\/|\/$/g, '')); + + if (decodedPath === '') { + return { + file: 'src/pages/index.tsx (首页)', + link: `${serverUrl}/${repoName}/blob/${commitSha}/src/pages/index.tsx` + }; } - const relativePath = decodeURIComponent(cleanParent.replace(/^\/|\/$/g, '')); - // Default: look up in the current repository + // 2. 
Candidate files list for local lookup (markdown/docs/pages) const candidates = [ - relativePath + '.md', - relativePath + '.mdx', - relativePath + '/index.md', - relativePath + '/index.mdx', - 'docs/' + relativePath + '.md', - 'docs/' + relativePath + '.mdx', - 'docs/' + relativePath + '/index.md', - 'docs/' + relativePath + '/index.mdx', + decodedPath + '.md', + decodedPath + '.mdx', + decodedPath + '/index.md', + decodedPath + '/index.mdx', + 'docs/' + decodedPath + '.md', + 'docs/' + decodedPath + '.mdx', + 'docs/' + decodedPath + '/index.md', + 'docs/' + decodedPath + '/index.mdx', + 'src/pages/' + decodedPath + '.tsx', + 'src/pages/' + decodedPath + '/index.tsx', + 'src/pages/' + decodedPath + '.js', + 'src/pages/' + decodedPath + '/index.js', ]; for (const cand of candidates) { @@ -61,9 +78,24 @@ function resolveSourceFile(cleanParent) { } } + // 3. Special Docusaurus components mapping / fallbacks + if (decodedPath.startsWith('blog/detail')) { + return { + file: 'src/pages/blog/detail/index.tsx (博客详情页)', + link: `${serverUrl}/${repoName}/blob/${commitSha}/src/pages/blog/detail/index.tsx` + }; + } + if (decodedPath === 'blog') { + return { + file: 'src/pages/blog/index.tsx (博客列表页)', + link: `${serverUrl}/${repoName}/blob/${commitSha}/src/pages/blog/index.tsx` + }; + } + + // Return decoded path as fallback return { - file: relativePath, - link: `${serverUrl}/${repoName}/blob/${commitSha}/${relativePath}` + file: decodedPath, + link: `${serverUrl}/${repoName}/blob/${commitSha}/${decodedPath}` }; } From 4d9f653f8832bd36b60f1e6469c3a22e29def4ed Mon Sep 17 00:00:00 2001 From: yushijie_27 Date: Tue, 9 Jun 2026 13:13:39 +0800 Subject: [PATCH 07/18] chore: URL-encode file links in markdown report to support paths with spaces --- .github/workflows/scripts/feishu-reporter.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/scripts/feishu-reporter.js b/.github/workflows/scripts/feishu-reporter.js index 37e30f224bddc..3cf769b529385 
100644 --- a/.github/workflows/scripts/feishu-reporter.js +++ b/.github/workflows/scripts/feishu-reporter.js @@ -165,11 +165,12 @@ function writeStepSummary() { for (const item of uniqueBrokenLinks) { const refLinks = item.references.map(ref => { if (ref.link) { - return `[\`${ref.file}\`](${ref.link})`; + return `[\`${ref.file}\`](${encodeURI(ref.link)})`; } return `\`${ref.file}\``; }).join('
'); + markdown += `| \`${item.url}\` | \`${item.errorReason}\` | ${refLinks} |\n`; } } From a61170eda234e1ad04d504eb3aa36267a40e92f3 Mon Sep 17 00:00:00 2001 From: yushijie_27 Date: Tue, 9 Jun 2026 14:07:48 +0800 Subject: [PATCH 08/18] chore: locate exact line number for broken links in markdown and optimize error classifications --- .github/workflows/scripts/feishu-reporter.js | 81 ++++++++++++++++---- 1 file changed, 68 insertions(+), 13 deletions(-) diff --git a/.github/workflows/scripts/feishu-reporter.js b/.github/workflows/scripts/feishu-reporter.js index 3cf769b529385..55a265e2df532 100644 --- a/.github/workflows/scripts/feishu-reporter.js +++ b/.github/workflows/scripts/feishu-reporter.js @@ -38,7 +38,8 @@ function resolveSourceFile(cleanParent) { if (!cleanParent || cleanParent === 'Unknown' || cleanParent === '/') { return { file: 'src/pages/index.tsx (首页)', - link: `${serverUrl}/${repoName}/blob/${commitSha}/src/pages/index.tsx` + link: `${serverUrl}/${repoName}/blob/${commitSha}/src/pages/index.tsx`, + localPath: 'src/pages/index.tsx' }; } @@ -49,7 +50,8 @@ function resolveSourceFile(cleanParent) { if (decodedPath === '') { return { file: 'src/pages/index.tsx (首页)', - link: `${serverUrl}/${repoName}/blob/${commitSha}/src/pages/index.tsx` + link: `${serverUrl}/${repoName}/blob/${commitSha}/src/pages/index.tsx`, + localPath: 'src/pages/index.tsx' }; } @@ -73,7 +75,8 @@ function resolveSourceFile(cleanParent) { if (fs.existsSync(path.join(process.cwd(), cand))) { return { file: cand, - link: `${serverUrl}/${repoName}/blob/${commitSha}/${cand}` + link: `${serverUrl}/${repoName}/blob/${commitSha}/${cand}`, + localPath: cand }; } } @@ -82,23 +85,66 @@ function resolveSourceFile(cleanParent) { if (decodedPath.startsWith('blog/detail')) { return { file: 'src/pages/blog/detail/index.tsx (博客详情页)', - link: `${serverUrl}/${repoName}/blob/${commitSha}/src/pages/blog/detail/index.tsx` + link: 
`${serverUrl}/${repoName}/blob/${commitSha}/src/pages/blog/detail/index.tsx`, + localPath: 'src/pages/blog/detail/index.tsx' }; } if (decodedPath === 'blog') { return { file: 'src/pages/blog/index.tsx (博客列表页)', - link: `${serverUrl}/${repoName}/blob/${commitSha}/src/pages/blog/index.tsx` + link: `${serverUrl}/${repoName}/blob/${commitSha}/src/pages/blog/index.tsx`, + localPath: 'src/pages/blog/index.tsx' }; } // Return decoded path as fallback return { file: decodedPath, - link: `${serverUrl}/${repoName}/blob/${commitSha}/${decodedPath}` + link: `${serverUrl}/${repoName}/blob/${commitSha}/${decodedPath}`, + localPath: decodedPath }; } +function findLineNumber(localPath, targetUrl) { + if (!localPath || !fs.existsSync(localPath)) return 0; + try { + const content = fs.readFileSync(localPath, 'utf8'); + const lines = content.split('\n'); + + // 1. Search for the exact URL string + for (let i = 0; i < lines.length; i++) { + if (lines[i].includes(targetUrl)) { + return i + 1; + } + } + + // 2. Search for path segment + const urlObj = new URL(targetUrl, 'http://localhost:3000'); + const searchPath = urlObj.pathname.replace(/^\/|\/$/g, ''); + if (searchPath) { + const searchDecoded = decodeURIComponent(searchPath); + for (let i = 0; i < lines.length; i++) { + if (lines[i].includes(searchPath) || lines[i].includes(searchDecoded)) { + return i + 1; + } + } + // Try searching for the last segment of the path + const segments = searchDecoded.split('/'); + const lastSegment = segments[segments.length - 1]; + if (lastSegment && lastSegment.length > 3) { + for (let i = 0; i < lines.length; i++) { + if (lines[i].includes(lastSegment)) { + return i + 1; + } + } + } + } + } catch (err) { + // Silent catch + } + return 0; +} + // Process and group broken links by URL const urlMap = new Map(); let cnt404 = 0; @@ -114,18 +160,28 @@ for (const link of brokenLinks) { if (link.status === 200 && link.url.includes('#')) { const hashMatch = link.url.match(/#.*/); const hash = hashMatch ? 
hashMatch[0] : ''; - errorReason = `200 (锚点 ${hash} 未找到)`; + errorReason = `200 (锚点 ${hash} 未找到 / Anchor Not Found)`; cntAnchor++; } else if (link.status === 404) { + errorReason = '404 (页面不存在 / Page Not Found)'; cnt404++; + } else if (link.status === 403) { + errorReason = '403 (无访问权限 / Forbidden)'; + cntOther++; + } else if (link.status >= 500) { + errorReason = `${link.status} (服务器内部错误 / Internal Server Error)`; + cntOther++; } else if (!link.status || link.status === 0) { - errorReason = 'Timeout / Network Error'; + errorReason = '连接超时或网络异常 (Timeout / Network Error)'; cntTimeout++; } else { cntOther++; } - const { file: resolvedFile, link: fileLink } = resolveSourceFile(cleanParent); + const { file: resolvedFile, link: fileLink, localPath } = resolveSourceFile(cleanParent); + const line = findLineNumber(localPath, link.url); + const finalLink = line ? `${fileLink}#L${line}` : fileLink; + const displayFile = line ? `${resolvedFile}:${line}` : resolvedFile; if (!urlMap.has(cleanUrl)) { urlMap.set(cleanUrl, { @@ -135,8 +191,8 @@ for (const link of brokenLinks) { }); } urlMap.get(cleanUrl).references.push({ - file: resolvedFile, - link: fileLink + file: displayFile, + link: finalLink }); } @@ -163,14 +219,13 @@ function writeStepSummary() { markdown += `| - | 无失效链接 | - |\n`; } else { for (const item of uniqueBrokenLinks) { - const refLinks = item.references.map(ref => { + const refLinks = item.references.map(ref => { if (ref.link) { return `[\`${ref.file}\`](${encodeURI(ref.link)})`; } return `\`${ref.file}\``; }).join('
'); - markdown += `| \`${item.url}\` | \`${item.errorReason}\` | ${refLinks} |\n`; } } From 30e6fbcaf5f7679a8e177deefc041d0116e198b1 Mon Sep 17 00:00:00 2001 From: yushijie_27 Date: Tue, 9 Jun 2026 14:32:34 +0800 Subject: [PATCH 09/18] chore: use link.state for filtering broken links and enhance errorReason details --- .github/workflows/scripts/feishu-reporter.js | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/scripts/feishu-reporter.js b/.github/workflows/scripts/feishu-reporter.js index 55a265e2df532..778d3e59c3a20 100644 --- a/.github/workflows/scripts/feishu-reporter.js +++ b/.github/workflows/scripts/feishu-reporter.js @@ -21,7 +21,7 @@ try { process.exit(0); } -const brokenLinks = (data.links || []).filter(link => !link.success); +const brokenLinks = (data.links || []).filter(link => link.state === 'BROKEN'); // Resolve repo name to determine specific doc mappings const repoName = process.env.GITHUB_REPOSITORY || 'doris-website'; @@ -160,24 +160,26 @@ for (const link of brokenLinks) { if (link.status === 200 && link.url.includes('#')) { const hashMatch = link.url.match(/#.*/); const hash = hashMatch ? 
hashMatch[0] : ''; - errorReason = `200 (锚点 ${hash} 未找到 / Anchor Not Found)`; + errorReason = `锚点失效: 页面可访问但 ${hash} 锚点不存在 (Anchor Not Found)`; cntAnchor++; } else if (link.status === 404) { - errorReason = '404 (页面不存在 / Page Not Found)'; + errorReason = '404: 页面不存在,请检查链接拼写或目标是否已被删除 (Page Not Found)'; cnt404++; } else if (link.status === 403) { - errorReason = '403 (无访问权限 / Forbidden)'; + errorReason = '403: 服务器拒绝访问,可能是鉴权过期或有防爬虫限制 (Forbidden)'; cntOther++; } else if (link.status >= 500) { - errorReason = `${link.status} (服务器内部错误 / Internal Server Error)`; + errorReason = `${link.status}: 目标网站服务错误,请确认服务是否正常运行 (Server Error)`; cntOther++; } else if (!link.status || link.status === 0) { - errorReason = '连接超时或网络异常 (Timeout / Network Error)'; + errorReason = '网络超时/异常: 连接被拒绝或超时,请确认目标链接能否正常访问 (Timeout/Network)'; cntTimeout++; } else { + errorReason = `HTTP ${link.status}: 异常状态码,请点击死链地址确认 (Unexpected Status)`; cntOther++; } + const { file: resolvedFile, link: fileLink, localPath } = resolveSourceFile(cleanParent); const line = findLineNumber(localPath, link.url); const finalLink = line ? 
`${fileLink}#L${line}` : fileLink; From 04a9ec07a3b3fe26a3e1453fd8934614979fef9d Mon Sep 17 00:00:00 2001 From: yushijie_27 Date: Tue, 9 Jun 2026 17:57:48 +0800 Subject: [PATCH 10/18] feat: support external URL skip list using link-check-ignore.txt --- .github/workflows/link-check-cron-crawler.yml | 16 +++++++++++++++- .github/workflows/link-check-ignore.txt | 9 +++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/link-check-ignore.txt diff --git a/.github/workflows/link-check-cron-crawler.yml b/.github/workflows/link-check-cron-crawler.yml index ca2f02fa26e34..3d6483a2e5e90 100644 --- a/.github/workflows/link-check-cron-crawler.yml +++ b/.github/workflows/link-check-cron-crawler.yml @@ -12,9 +12,23 @@ jobs: cron-crawler: runs-on: ubuntu-latest steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Run Crawler Link Checker run: | - npx -y linkinator https://doris.apache.org --recurse --check-images --skip "^(?!https?://(www\.)?doris\.apache\.org)" --format json > link_results.json || echo "HAS_BROKEN=true" + # Read ignore list and build linkinator skips + skips=() + if [ -f .github/workflows/link-check-ignore.txt ]; then + while IFS= read -r line || [ -n "$line" ]; do + # Skip empty lines and comments + if [[ ! 
"$line" =~ ^# ]] && [[ -n "$line" ]]; then + skips+=("--skip" "$line") + fi + done < .github/workflows/link-check-ignore.txt + fi + + npx -y linkinator https://doris.apache.org --recurse --check-images --skip "^(?!https?://(www\.)?doris\.apache\.org)" "${skips[@]}" --format json > link_results.json || echo "HAS_BROKEN=true" - name: Send Feishu Notification on failure if: always() diff --git a/.github/workflows/link-check-ignore.txt b/.github/workflows/link-check-ignore.txt new file mode 100644 index 0000000000000..654c486018634 --- /dev/null +++ b/.github/workflows/link-check-ignore.txt @@ -0,0 +1,9 @@ +# Ignore Mozilla domains which often return 403 to crawlers +^https?://([^/]+\.)?mozilla\.org + +# Ignore LinkedIn which blocks GitHub Action runners +^https?://([^/]+\.)?linkedin\.com + +# Ignore Docker Hub / GitHub internal API URLs which might throttle/block automated requests +^https?://hub\.docker\.com +^https?://github\.com/apache/doris-website/pull/ From 0f8e5b227bba6df27b4aaddff61134b0023dec02 Mon Sep 17 00:00:00 2001 From: yushijie_27 Date: Tue, 9 Jun 2026 18:02:04 +0800 Subject: [PATCH 11/18] chore: limit GITHUB_STEP_SUMMARY size to prevent upload failure --- .github/workflows/scripts/feishu-reporter.js | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/scripts/feishu-reporter.js b/.github/workflows/scripts/feishu-reporter.js index 778d3e59c3a20..ccb75c76edd30 100644 --- a/.github/workflows/scripts/feishu-reporter.js +++ b/.github/workflows/scripts/feishu-reporter.js @@ -220,7 +220,9 @@ function writeStepSummary() { if (uniqueBrokenLinks.length === 0) { markdown += `| - | 无失效链接 | - |\n`; } else { - for (const item of uniqueBrokenLinks) { + const limit = 100; + const displayed = uniqueBrokenLinks.slice(0, limit); + for (const item of displayed) { const refLinks = item.references.map(ref => { if (ref.link) { return `[\`${ref.file}\`](${encodeURI(ref.link)})`; @@ -230,6 +232,10 @@ function writeStepSummary() { markdown 
+= `| \`${item.url}\` | \`${item.errorReason}\` | ${refLinks} |\n`; } + if (uniqueBrokenLinks.length > limit) { + markdown += `\n> [!NOTE]\n`; + markdown += `> 由于失效链接数量较多,GitHub 步骤总结(Step Summary)已做截断,仅显示前 ${limit} 个。其余 ${uniqueBrokenLinks.length - limit} 个失效链接已省略,请通过 Feishu 报警消息或运行日志获取完整列表。\n\n`; + } } markdown += `\n---\n`; From 5a2736c6ad0280f567c2899cc6d146662390238d Mon Sep 17 00:00:00 2001 From: yushijie_27 Date: Wed, 10 Jun 2026 16:23:37 +0800 Subject: [PATCH 12/18] fix: improve Doris crawler report source mapping --- .github/workflows/scripts/feishu-reporter.js | 56 +++++++++++++++++--- 1 file changed, 49 insertions(+), 7 deletions(-) diff --git a/.github/workflows/scripts/feishu-reporter.js b/.github/workflows/scripts/feishu-reporter.js index ccb75c76edd30..c4ddb28127f34 100644 --- a/.github/workflows/scripts/feishu-reporter.js +++ b/.github/workflows/scripts/feishu-reporter.js @@ -2,10 +2,6 @@ const fs = require('fs'); const path = require('path'); const WEBHOOK_URL = process.env.FEISHU_WEBHOOK; -if (!WEBHOOK_URL) { - console.error('Error: FEISHU_WEBHOOK environment variable is not set.'); - process.exit(1); -} const resultsPath = path.join(process.cwd(), 'link_results.json'); if (!fs.existsSync(resultsPath)) { @@ -45,7 +41,13 @@ function resolveSourceFile(cleanParent) { // 1. Remove query parameters and hash fragments const pathOnly = cleanParent.split(/[?#]/)[0]; - const decodedPath = decodeURIComponent(pathOnly.replace(/^\/|\/$/g, '')); + let decodedPath = decodeURIComponent(pathOnly.replace(/^\/|\/$/g, '')); + + // Normalize Docusaurus static-build URLs emitted by `serve build`. + decodedPath = decodedPath + .replace(/\.html\/index\.html$/, '') + .replace(/\/index\.html$/, '') + .replace(/\.html$/, ''); if (decodedPath === '') { return { @@ -55,7 +57,42 @@ function resolveSourceFile(cleanParent) { }; } - // 2. Candidate files list for local lookup (markdown/docs/pages) + // 2. Handle Docusaurus docs version routes. 
+ // /docs/dev/foo -> docs/foo.md(x) + // /docs/4.x/foo -> versioned_docs/version-4.x/foo.md(x) + // /zh-CN/docs/4.x/foo -> i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/foo.md(x) + const docsMatch = decodedPath.match(/^(?:(zh-CN)\/)?docs\/([^/]+)\/(.+)$/); + if (docsMatch) { + const [, locale, version, docPath] = docsMatch; + const dirPrefix = locale + ? `i18n/${locale}/docusaurus-plugin-content-docs/${version === 'dev' ? 'current' : `version-${version}`}` + : version === 'dev' + ? 'docs' + : `versioned_docs/version-${version}`; + const docCandidates = [ + `${dirPrefix}/${docPath}.md`, + `${dirPrefix}/${docPath}.mdx`, + `${dirPrefix}/${docPath}/index.md`, + `${dirPrefix}/${docPath}/index.mdx`, + ]; + for (const cand of docCandidates) { + if (fs.existsSync(path.join(process.cwd(), cand))) { + return { + file: cand, + link: `${serverUrl}/${repoName}/blob/${commitSha}/${cand}`, + localPath: cand + }; + } + } + const fallbackCand = `${dirPrefix}/${docPath}.md`; + return { + file: fallbackCand, + link: `${serverUrl}/${repoName}/blob/${commitSha}/${fallbackCand}`, + localPath: fallbackCand + }; + } + + // 3. Candidate files list for local lookup (markdown/docs/pages) const candidates = [ decodedPath + '.md', decodedPath + '.mdx', @@ -81,7 +118,7 @@ function resolveSourceFile(cleanParent) { } } - // 3. Special Docusaurus components mapping / fallbacks + // 4. Special Docusaurus components mapping / fallbacks if (decodedPath.startsWith('blog/detail')) { return { file: 'src/pages/blog/detail/index.tsx (博客详情页)', @@ -332,6 +369,11 @@ const payload = { const payloadStr = JSON.stringify(payload, null, 2); +if (!WEBHOOK_URL) { + console.error('Error: FEISHU_WEBHOOK environment variable is not set. Step summary has been written.'); + process.exit(1); +} + const urlObj = new URL(WEBHOOK_URL); const protocol = urlObj.protocol === 'https:' ? 
require('https') : require('http'); From f522a55bb18ba255f70fbd82afdd71b2ba735b31 Mon Sep 17 00:00:00 2001 From: yushijie_27 Date: Wed, 10 Jun 2026 16:37:26 +0800 Subject: [PATCH 13/18] chore: ignore non-target Doris crawler routes --- .github/workflows/link-check-ignore.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/link-check-ignore.txt b/.github/workflows/link-check-ignore.txt index 654c486018634..129a8cecf51ad 100644 --- a/.github/workflows/link-check-ignore.txt +++ b/.github/workflows/link-check-ignore.txt @@ -7,3 +7,9 @@ # Ignore Docker Hub / GitHub internal API URLs which might throttle/block automated requests ^https?://hub\.docker\.com ^https?://github\.com/apache/doris-website/pull/ + +# Ignore Japanese routes until the ja site is published on doris.apache.org +^https?://doris\.apache\.org/ja/ + +# Ignore generated 404 pages picked up during crawling +^https?://doris\.apache\.org/(zh-CN/)?404\.html/?$ From cd55977653285bfcdb4cb6c664e26213b412b91d Mon Sep 17 00:00:00 2001 From: yushijie_27 Date: Wed, 10 Jun 2026 17:02:06 +0800 Subject: [PATCH 14/18] fix: distinguish online crawler report target --- .github/workflows/scripts/feishu-reporter.js | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/scripts/feishu-reporter.js b/.github/workflows/scripts/feishu-reporter.js index c4ddb28127f34..04342a61c008a 100644 --- a/.github/workflows/scripts/feishu-reporter.js +++ b/.github/workflows/scripts/feishu-reporter.js @@ -29,6 +29,8 @@ const prNumber = process.env.GITHUB_EVENT_NAME === 'pull_request' ? (process.env const actor = process.env.GITHUB_ACTOR || 'system'; const eventName = process.env.GITHUB_EVENT_NAME === 'schedule' ? 
'每日例行巡检' : 'PR 提交'; const branchName = process.env.GITHUB_REF_NAME || 'master'; +const linkCheckMode = process.env.LINK_CHECK_MODE || 'online'; +const linkCheckTarget = process.env.LINK_CHECK_TARGET || 'https://doris.apache.org'; function resolveSourceFile(cleanParent) { if (!cleanParent || cleanParent === 'Unknown' || cleanParent === '/') { @@ -39,9 +41,8 @@ function resolveSourceFile(cleanParent) { }; } - // 1. Remove query parameters and hash fragments - const pathOnly = cleanParent.split(/[?#]/)[0]; - let decodedPath = decodeURIComponent(pathOnly.replace(/^\/|\/$/g, '')); + // 1. Normalize absolute or relative URLs to a pathname without query/hash. + let decodedPath = decodeURIComponent(new URL(cleanParent, 'http://localhost:3000').pathname.replace(/^\/|\/$/g, '')); // Normalize Docusaurus static-build URLs emitted by `serve build`. decodedPath = decodedPath @@ -277,6 +278,8 @@ function writeStepSummary() { markdown += `\n---\n`; markdown += `**📊 运行元信息:**\n`; + markdown += `* **扫描模式**: \`${linkCheckMode}\`\n`; + markdown += `* **扫描目标**: \`${linkCheckTarget}\`\n`; markdown += `* **检测分支**: \`${branchName}\`\n`; markdown += `* **触发类型**: \`${eventName}\`\n`; markdown += `* **检测时间**: \`${new Date().toISOString().replace('T', ' ').substring(0, 19)} (UTC)\`\n`; @@ -323,7 +326,7 @@ const payload = { tag: 'div', text: { tag: 'lark_md', - content: `**触发场景**: ${eventName}\n**提交人**: @${actor}${prNumber ? `\n**PR号**: #${prNumber}` : ''}\n**总失效链接数**: **${brokenLinks.length}** 个` + content: `**触发场景**: ${eventName}\n**扫描模式**: ${linkCheckMode}\n**扫描目标**: ${linkCheckTarget}\n**提交人**: @${actor}${prNumber ? 
`\n**PR号**: #${prNumber}` : ''}\n**总失效链接数**: **${brokenLinks.length}** 个` } }, { From f093fde9c232ffa8619e952885f923cd56dd7f05 Mon Sep 17 00:00:00 2001 From: yushijie_27 Date: Wed, 10 Jun 2026 17:15:40 +0800 Subject: [PATCH 15/18] fix: reduce online crawler rate-limit false positives --- .github/workflows/link-check-cron-crawler.yml | 2 +- .github/workflows/scripts/feishu-reporter.js | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/link-check-cron-crawler.yml b/.github/workflows/link-check-cron-crawler.yml index 3d6483a2e5e90..fc2c6b54e7feb 100644 --- a/.github/workflows/link-check-cron-crawler.yml +++ b/.github/workflows/link-check-cron-crawler.yml @@ -28,7 +28,7 @@ jobs: done < .github/workflows/link-check-ignore.txt fi - npx -y linkinator https://doris.apache.org --recurse --check-images --skip "^(?!https?://(www\.)?doris\.apache\.org)" "${skips[@]}" --format json > link_results.json || echo "HAS_BROKEN=true" + npx -y linkinator https://doris.apache.org --recurse --check-images --concurrency 5 --timeout 30000 --retry --retry-errors --retry-errors-count 3 --status-code "429:warn" --skip "^(?!https?://(www\.)?doris\.apache\.org)" "${skips[@]}" --format json > link_results.json || echo "HAS_BROKEN=true" - name: Send Feishu Notification on failure if: always() diff --git a/.github/workflows/scripts/feishu-reporter.js b/.github/workflows/scripts/feishu-reporter.js index 04342a61c008a..4929d8c7680df 100644 --- a/.github/workflows/scripts/feishu-reporter.js +++ b/.github/workflows/scripts/feishu-reporter.js @@ -206,6 +206,9 @@ for (const link of brokenLinks) { } else if (link.status === 403) { errorReason = '403: 服务器拒绝访问,可能是鉴权过期或有防爬虫限制 (Forbidden)'; cntOther++; + } else if (link.status === 429) { + errorReason = '429: 触发线上站点限流/防爬策略,浏览器访问可能正常 (Rate Limited)'; + cntOther++; } else if (link.status >= 500) { errorReason = `${link.status}: 目标网站服务错误,请确认服务是否正常运行 (Server Error)`; cntOther++; From 7947ad1261af1af16ebc79a19df642b1c726bd8b 
Mon Sep 17 00:00:00 2001 From: yushijie_27 Date: Wed, 10 Jun 2026 17:23:22 +0800 Subject: [PATCH 16/18] fix: report only 404 link check failures --- .github/workflows/scripts/feishu-reporter.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scripts/feishu-reporter.js b/.github/workflows/scripts/feishu-reporter.js index 4929d8c7680df..b5ecc5c69fa7f 100644 --- a/.github/workflows/scripts/feishu-reporter.js +++ b/.github/workflows/scripts/feishu-reporter.js @@ -17,7 +17,7 @@ try { process.exit(0); } -const brokenLinks = (data.links || []).filter(link => link.state === 'BROKEN'); +const brokenLinks = (data.links || []).filter(link => link.state === 'BROKEN' && link.status === 404); // Resolve repo name to determine specific doc mappings const repoName = process.env.GITHUB_REPOSITORY || 'doris-website'; From 59bedfd1cbc442f5be6dc6b037a40fe93c2319ce Mon Sep 17 00:00:00 2001 From: yushijie_27 Date: Mon, 15 Jun 2026 10:20:43 +0800 Subject: [PATCH 17/18] feat: report total checked URLs and send success Feishu notifications --- .github/workflows/scripts/feishu-reporter.js | 189 +++++++++++-------- 1 file changed, 114 insertions(+), 75 deletions(-) diff --git a/.github/workflows/scripts/feishu-reporter.js b/.github/workflows/scripts/feishu-reporter.js index b5ecc5c69fa7f..07d0024b24c6e 100644 --- a/.github/workflows/scripts/feishu-reporter.js +++ b/.github/workflows/scripts/feishu-reporter.js @@ -17,6 +17,8 @@ try { process.exit(0); } +const totalChecks = (data.links || []).length; +const uniqueUrls = new Set((data.links || []).map(link => link.url)).size; const brokenLinks = (data.links || []).filter(link => link.state === 'BROKEN' && link.status === 404); // Resolve repo name to determine specific doc mappings @@ -249,10 +251,10 @@ function writeStepSummary() { if (brokenLinks.length > 0) { markdown += `> [!WARNING]\n`; - markdown += `> **本次定时巡检共发现 ${brokenLinks.length} 处失效链接!** 请开发人员点击下表【引用源文件】中的链接,直接跳转到对应的 GitHub 源码行进行修复。\n\n`; + 
markdown += `> **本次定时巡检共发现 ${brokenLinks.length} 处失效链接!** 共检查了 ${totalChecks} 次链接,包含 ${uniqueUrls} 个独立 URL。请开发人员点击下表【引用源文件】中的链接,直接跳转到对应的 GitHub 源码行进行修复。\n\n`; } else { markdown += `> [!NOTE]\n`; - markdown += `> **本次定时巡检未发现失效链接。** 链路状态良好!\n\n`; + markdown += `> **本次定时巡检未发现失效链接。** 共检查了 ${totalChecks} 次链接,包含 ${uniqueUrls} 个独立 URL。链路状态良好!\n\n`; } markdown += `| 🔗 失效链接 (Broken Link) | ❌ 错误原因 (Error Reason) | 📌 引用源文件 (Where Referenced) |\n`; @@ -283,6 +285,7 @@ function writeStepSummary() { markdown += `**📊 运行元信息:**\n`; markdown += `* **扫描模式**: \`${linkCheckMode}\`\n`; markdown += `* **扫描目标**: \`${linkCheckTarget}\`\n`; + markdown += `* **检测总量**: \`${totalChecks}\` (独立 URL: \`${uniqueUrls}\`)\n`; markdown += `* **检测分支**: \`${branchName}\`\n`; markdown += `* **触发类型**: \`${eventName}\`\n`; markdown += `* **检测时间**: \`${new Date().toISOString().replace('T', ' ').substring(0, 19)} (UTC)\`\n`; @@ -297,87 +300,123 @@ function writeStepSummary() { // Write Step Summary writeStepSummary(); -if (brokenLinks.length === 0) { - console.log('No broken links found. Exiting with success.'); - process.exit(0); -} - -console.log(`Found ${brokenLinks.length} broken links. Sending Feishu notification...`); - -// Format broken links summary for Feishu -const limit = 10; -const displayedBroken = uniqueBrokenLinks.slice(0, limit); -const brokenListMd = displayedBroken.map((item, idx) => { - const refsText = item.references.slice(0, 3).map(r => r.file).join(', ') + (item.references.length > 3 ? '等' : ''); - return `${idx + 1}. ❌ **[${item.errorReason}]** ${item.url}\n 🔍 引用源: \`${refsText}\``; -}).join('\n'); - -const totalText = uniqueBrokenLinks.length > limit ? 
`\n\n...以及其他 ${uniqueBrokenLinks.length - limit} 个失效链接,请点击下方按钮查看完整排错报告。` : ''; - -const payload = { - msg_type: 'interactive', - card: { - header: { - template: 'red', - title: { - tag: 'plain_text', - content: `⚠️ 链接扫描失败警告 | ${repoName.split('/')[1] || repoName}` - } - }, - elements: [ - { - tag: 'div', - text: { - tag: 'lark_md', - content: `**触发场景**: ${eventName}\n**扫描模式**: ${linkCheckMode}\n**扫描目标**: ${linkCheckTarget}\n**提交人**: @${actor}${prNumber ? `\n**PR号**: #${prNumber}` : ''}\n**总失效链接数**: **${brokenLinks.length}** 个` +const hasIssues = brokenLinks.length > 0; +const exitCode = hasIssues ? 1 : 0; + +let payload; +if (!hasIssues) { + console.log('No broken links found. Sending success notification to Feishu.'); + payload = { + msg_type: 'interactive', + card: { + header: { + template: 'green', + title: { + tag: 'plain_text', + content: `✅ 链接扫描成功 | ${repoName.split('/')[1] || repoName}` } }, - { - tag: 'hr' - }, - { - tag: 'div', - text: { - tag: 'lark_md', - content: `**📊 死链分类统计 (Classification):**\n` + - `• 🔴 **页面未找到 (404)**: **${cnt404}** 个\n` + - `• 🟡 **锚点失效 (Anchor)**: **${cntAnchor}** 个\n` + - `• 🔵 **网络超时/其他**: **${cntTimeout + cntOther}** 个` - } - }, - { - tag: 'hr' - }, - { - tag: 'div', - text: { - tag: 'lark_md', - content: `**检测到失效链接列表 (最多展示 ${limit} 个):**\n${brokenListMd}${totalText}` + elements: [ + { + tag: 'div', + text: { + tag: 'lark_md', + content: `**触发场景**: ${eventName}\n**扫描模式**: ${linkCheckMode}\n**扫描目标**: ${linkCheckTarget}\n**检测状态**: 全部通过!\n**总检查量**: 共检查了 **${totalChecks}** 次链接,包含 **${uniqueUrls}** 个独立 URL。` + } + }, + runUrl ? { + tag: 'action', + actions: [ + { + tag: 'button', + text: { + tag: 'plain_text', + content: '查看详细排错报告' + }, + type: 'primary', + url: runUrl + } + ] + } : null + ].filter(Boolean) + } + }; +} else { + console.log(`Found ${brokenLinks.length} broken links. 
Sending Feishu notification...`); + const limit = 10; + const displayedBroken = uniqueBrokenLinks.slice(0, limit); + const brokenListMd = displayedBroken.map((item, idx) => { + const refsText = item.references.slice(0, 3).map(r => r.file).join(', ') + (item.references.length > 3 ? '等' : ''); + return `${idx + 1}. ❌ **[${item.errorReason}]** ${item.url}\n 🔍 引用源: \`${refsText}\``; + }).join('\n'); + + const totalText = uniqueBrokenLinks.length > limit ? `\n\n...以及其他 ${uniqueBrokenLinks.length - limit} 个失效链接,请点击下方按钮查看完整排错报告。` : ''; + + payload = { + msg_type: 'interactive', + card: { + header: { + template: 'red', + title: { + tag: 'plain_text', + content: `⚠️ 链接扫描失败警告 | ${repoName.split('/')[1] || repoName}` } }, - runUrl ? { - tag: 'action', - actions: [ - { - tag: 'button', - text: { - tag: 'plain_text', - content: '查看详细排错报告' - }, - type: 'primary', - url: runUrl + elements: [ + { + tag: 'div', + text: { + tag: 'lark_md', + content: `**触发场景**: ${eventName}\n**扫描模式**: ${linkCheckMode}\n**扫描目标**: ${linkCheckTarget}\n**提交人**: @${actor}${prNumber ? `\n**PR号**: #${prNumber}` : ''}\n**总失效链接数**: **${brokenLinks.length}** 个 (总共检查了 **${totalChecks}** 次链接,包含 **${uniqueUrls}** 个独立 URL)` } - ] - } : null - ].filter(Boolean) - } -}; + }, + { + tag: 'hr' + }, + { + tag: 'div', + text: { + tag: 'lark_md', + content: `**📊 死链分类统计 (Classification):**\n` + + `• 🔴 **页面未找到 (404)**: **${cnt404}** 个\n` + + `• 🟡 **锚点失效 (Anchor)**: **${cntAnchor}** 个\n` + + `• 🔵 **网络超时/其他**: **${cntTimeout + cntOther}** 个` + } + }, + { + tag: 'hr' + }, + { + tag: 'div', + text: { + tag: 'lark_md', + content: `**检测到失效链接列表 (最多展示 ${limit} 个):**\n${brokenListMd}${totalText}` + } + }, + runUrl ? 
{ + tag: 'action', + actions: [ + { + tag: 'button', + text: { + tag: 'plain_text', + content: '查看详细排错报告' + }, + type: 'primary', + url: runUrl + } + ] + } : null + ].filter(Boolean) + } + }; +} const payloadStr = JSON.stringify(payload, null, 2); if (!WEBHOOK_URL) { console.error('Error: FEISHU_WEBHOOK environment variable is not set. Step summary has been written.'); - process.exit(1); + process.exit(exitCode); } const urlObj = new URL(WEBHOOK_URL); @@ -401,13 +440,13 @@ const req = protocol.request(options, (res) => { res.on('data', (chunk) => { body += chunk; }); res.on('end', () => { console.log(`Feishu response status: ${res.statusCode}`); - process.exit(1); + process.exit(exitCode); }); }); req.on('error', (e) => { console.error(`Problem sending request to Feishu: ${e.message}`); - process.exit(1); + process.exit(exitCode); }); req.write(payloadStr); From 6f1a0e24a1b8929a20efc92fb74bcc74d5a72c96 Mon Sep 17 00:00:00 2001 From: yushijie_27 Date: Mon, 15 Jun 2026 11:10:30 +0800 Subject: [PATCH 18/18] feat: migrate advanced link checker features from selectdb-docs --- .github/workflows/scripts/feishu-reporter.js | 213 +++++++++++++++++-- 1 file changed, 191 insertions(+), 22 deletions(-) diff --git a/.github/workflows/scripts/feishu-reporter.js b/.github/workflows/scripts/feishu-reporter.js index 07d0024b24c6e..73330264256db 100644 --- a/.github/workflows/scripts/feishu-reporter.js +++ b/.github/workflows/scripts/feishu-reporter.js @@ -19,7 +19,126 @@ try { const totalChecks = (data.links || []).length; const uniqueUrls = new Set((data.links || []).map(link => link.url)).size; -const brokenLinks = (data.links || []).filter(link => link.state === 'BROKEN' && link.status === 404); + +function doesTargetDocExist(targetUrl, parentUrl) { + try { + const urlObj = new URL(targetUrl, 'http://localhost:3000'); + let pathname = decodeURIComponent(urlObj.pathname).replace(/^\/|\/$/g, ''); + pathname = pathname.replace(/\.html?$/, ''); + + if (!pathname) return false; + + 
const segments = pathname.split('/').filter(Boolean); + if (segments.length === 0) return false; + + // Check if first segment is a locale (e.g., 'zh-CN', 'ja') + let locale = ''; + if (segments[0] === 'zh-CN' || segments[0] === 'ja') { + locale = segments.shift(); + } + + // Now segments should start with 'docs' if it's a documentation route + if (segments[0] !== 'docs') { + return false; + } + + segments.shift(); // remove 'docs' + if (segments.length === 0) return false; + + // Check if the next segment is a version number + let version = ''; + if (segments[0] && (segments[0].match(/^\d/) || segments[0] === 'next')) { + version = segments.shift(); + } + + const docSubPath = segments.join('/'); + if (!docSubPath) return false; + + // Build candidate search tails + const searchTails = []; + const subSegs = docSubPath.split('/').filter(Boolean); + for (let len = Math.min(subSegs.length, 4); len >= 2; len--) { + const tail = subSegs.slice(-len).join('/'); + searchTails.push(tail + '.md'); + searchTails.push(tail + '.mdx'); + } + searchTails.push(docSubPath + '.md'); + searchTails.push(docSubPath + '.mdx'); + + // Determine base directories + const baseDirs = []; + if (locale) { + const versionFolder = version ? 
`version-${version}` : 'current'; + baseDirs.push(path.join('i18n', locale, 'docusaurus-plugin-content-docs', versionFolder)); + } else { + if (version) { + baseDirs.push(path.join('versioned_docs', `version-${version}`)); + } else { + baseDirs.push('docs'); + } + } + + // Read all markdown files in these base directories (cache it globally) + if (!global.allDocsFiles) { + global.allDocsFiles = []; + const walk = (dir) => { + const fullDir = path.join(process.cwd(), dir); + if (!fs.existsSync(fullDir)) return; + const list = fs.readdirSync(fullDir); + for (const file of list) { + const fullPath = path.join(fullDir, file); + const relPath = path.relative(process.cwd(), fullPath); + const stat = fs.statSync(fullPath); + if (stat.isDirectory()) { + if (file !== 'node_modules' && file !== '.git' && file !== 'build' && file !== '.docusaurus') { + walk(relPath); + } + } else if (file.endsWith('.md') || file.endsWith('.mdx')) { + global.allDocsFiles.push(relPath.replace(/\\/g, '/')); + } + } + }; + walk('docs'); + walk('versioned_docs'); + walk('i18n'); + } + + for (const file of global.allDocsFiles) { + const matchesDir = baseDirs.length === 0 || baseDirs.some(dir => { + const normalizedDir = dir.replace(/\\/g, '/'); + return file.startsWith(normalizedDir + '/'); + }); + if (matchesDir) { + for (const tail of searchTails) { + if (file.endsWith('/' + tail) || file === tail) { + return true; + } + } + } + } + } catch (e) { + console.error('Error in doesTargetDocExist:', e); + } + return false; +} + +const brokenLinks = (data.links || []).filter(link => { + if (link.state !== 'BROKEN' || link.status !== 404) { + return false; + } + + const isImage = link.url.match(/\.(png|jpe?g|gif|webp|svg|ico)(\?.*)?$/i); + if (isImage) { + return true; + } + + if (doesTargetDocExist(link.url, link.parent)) { + console.log(`Ignoring browser-navigable 404 link (target document exists): ${link.url}`); + return false; + } + + return true; +}); // Resolve repo name to determine specific doc 
mappings const repoName = process.env.GITHUB_REPOSITORY || 'doris-website'; @@ -168,13 +287,15 @@ function findLineNumber(localPath, targetUrl) { return i + 1; } } - // Try searching for the last segment of the path - const segments = searchDecoded.split('/'); - const lastSegment = segments[segments.length - 1]; - if (lastSegment && lastSegment.length > 3) { - for (let i = 0; i < lines.length; i++) { - if (lines[i].includes(lastSegment)) { - return i + 1; + // Try searching for progressive sub-paths from the end (e.g. 3 segments, then 2, then 1) to avoid false positives on common words + const segments = searchDecoded.split('/').filter(Boolean); + for (let len = Math.min(segments.length, 3); len >= 1; len--) { + const subPath = segments.slice(-len).join('/'); + if (subPath && subPath.length > 3) { + for (let i = 0; i < lines.length; i++) { + if (lines[i].includes(subPath)) { + return i + 1; + } } } } @@ -185,6 +306,30 @@ function findLineNumber(localPath, targetUrl) { return 0; } +// Helper to get line content from a file +function getLineContent(localPath, lineNum) { + if (!localPath || lineNum <= 0 || !fs.existsSync(localPath)) return ''; + try { + const content = fs.readFileSync(localPath, 'utf8'); + const lines = content.split('\n'); + if (lineNum <= lines.length) { + return lines[lineNum - 1].trim(); + } + } catch (err) { + // ignore + } + return ''; +} + +// Helper to escape special characters in markdown table cell +function escapeMarkdownTable(text) { + return text + .replace(/\|/g, '\\|') + .replace(/</g, '&lt;') + .replace(/>/g, '&gt;') + .replace(/\r?\n/g, ' '); +} + // Process and group broken links by URL const urlMap = new Map(); let cnt404 = 0; @@ -228,17 +373,34 @@ for (const link of brokenLinks) { const finalLink = line ? `${fileLink}#L${line}` : fileLink; const displayFile = line ? 
`${resolvedFile}:${line}` : resolvedFile; + + const fileExists = localPath && fs.existsSync(path.join(process.cwd(), localPath)); + if (!urlMap.has(cleanUrl)) { urlMap.set(cleanUrl, { url: cleanUrl, errorReason, - references: [] + references: [], + seenRefs: new Set() + }); + } + + const entry = urlMap.get(cleanUrl); + const refKey = `${resolvedFile}:${line}`; + if (!entry.seenRefs.has(refKey)) { + entry.seenRefs.add(refKey); + + let codeSnippet = ''; + if (fileExists && line > 0) { + codeSnippet = getLineContent(localPath, line); + } + + entry.references.push({ + file: displayFile, + link: finalLink, + fileExists: !!fileExists, + codeSnippet }); } - urlMap.get(cleanUrl).references.push({ - file: displayFile, - link: finalLink - }); } const uniqueBrokenLinks = [...urlMap.values()]; @@ -266,14 +428,18 @@ function writeStepSummary() { const limit = 100; const displayed = uniqueBrokenLinks.slice(0, limit); for (const item of displayed) { - const refLinks = item.references.map(ref => { - if (ref.link) { - return `[\`${ref.file}\`](${encodeURI(ref.link)})`; - } - return `\`${ref.file}\``; - }).join('<br>'); - - markdown += `| \`${item.url}\` | \`${item.errorReason}\` | ${refLinks} |\n`; + const validRefs = item.references.filter(r => r.fileExists); + const refLinks = validRefs.map(ref => { + let lineInfo = `[\`${ref.file}\`](${encodeURI(ref.link)})`; + if (ref.codeSnippet) { + lineInfo += `<br>${escapeMarkdownTable(ref.codeSnippet)}`; + } + return lineInfo; + }).join('<br>'); + + const finalRefCell = refLinks || '*无有效引用源 (可能为 404 页面产生的级联链接)*'; + + markdown += `| \`${item.url}\` | \`${item.errorReason}\` | ${finalRefCell} |\n`; } if (uniqueBrokenLinks.length > limit) { markdown += `\n> [!NOTE]\n`; @@ -346,7 +512,10 @@ if (!hasIssues) { const limit = 10; const displayedBroken = uniqueBrokenLinks.slice(0, limit); const brokenListMd = displayedBroken.map((item, idx) => { - const refsText = item.references.slice(0, 3).map(r => r.file).join(', ') + (item.references.length > 3 ? '等' : ''); + const validRefs = item.references.filter(r => r.fileExists); + const refsText = validRefs.length > 0 + ? (validRefs.slice(0, 3).map(r => r.file).join(', ') + (validRefs.length > 3 ? '等' : '')) + : '未知 (可能为404页面产生级联链接)'; return `${idx + 1}. ❌ **[${item.errorReason}]** ${item.url}\n 🔍 引用源: \`${refsText}\``; }).join('\n');