From a7c46a8ce891a43087b586a0c11a40eb007c65c7 Mon Sep 17 00:00:00 2001 From: Vorflux AI Date: Mon, 23 Mar 2026 22:53:48 +0000 Subject: [PATCH] refactor: deduplicate question ID validation, fix cross-benchmark pattern matching - Extract shared validateQuestionIds utility (src/utils/question-ids.ts) to deduplicate identical ~30-line validation blocks in batch.ts and index.ts - Rewrite expand-ids endpoint to use generic prefix matching instead of hardcoded regex (^[a-zA-Z]+-\d+$) that only worked for LoCoMo. Now works across all benchmarks (LoCoMo, ConvoMem, LongMemEval) by matching any pattern as a prefix against question IDs. - Extract shared QuestionIdSelector component and validation utility to deduplicate identical code between runs/new and compare/new pages - Remove redundant client-side re-validation that fetched all benchmark questions page-by-page (server already validates in expand-ids endpoint) --- src/orchestrator/batch.ts | 30 +---- src/orchestrator/index.ts | 30 +---- src/server/routes/benchmarks.ts | 53 +++----- src/utils/question-ids.ts | 46 +++++++ ui/app/compare/new/page.tsx | 176 ++---------------------- ui/app/runs/new/page.tsx | 180 ++----------------------- ui/components/question-id-selector.tsx | 123 +++++++++++++++++ ui/lib/question-id-validation.ts | 43 ++++++ 8 files changed, 259 insertions(+), 422 deletions(-) create mode 100644 src/utils/question-ids.ts create mode 100644 ui/components/question-id-selector.tsx create mode 100644 ui/lib/question-id-validation.ts diff --git a/src/orchestrator/batch.ts b/src/orchestrator/batch.ts index 5febfdd..0226b79 100644 --- a/src/orchestrator/batch.ts +++ b/src/orchestrator/batch.ts @@ -5,6 +5,7 @@ import type { BenchmarkResult } from "../types/unified" import { orchestrator, CheckpointManager } from "./index" import { createBenchmark } from "../benchmarks" import { logger } from "../utils/logger" +import { validateQuestionIds } from "../utils/question-ids" import { existsSync, mkdirSync, readFileSync, writeFileSync, rmSync } from "fs" import { join } from "path" import { startRun, endRun } from "../server/runState" @@ -157,35 +158,8 @@ export class BatchManager { let targetQuestionIds: string[] if (questionIds && questionIds.length > 0) { - // Validate that all provided IDs exist in the benchmark - const allQuestionIdsSet = new Set(allQuestions.map((q) => q.questionId)) - const validIds: string[] = [] - const invalidIds: string[] = [] - - for (const id of questionIds) { - if (allQuestionIdsSet.has(id)) { - validIds.push(id) - } else { - invalidIds.push(id) - } - } - - if (invalidIds.length > 0) { - logger.warn(`Invalid question IDs (will be skipped): ${invalidIds.join(", ")}`) - } - - if (validIds.length === 0) { - throw new Error( - `All provided questionIds are invalid. No matching questions found in benchmark "${benchmark}". ` + - `Invalid IDs: ${invalidIds.join(", ")}` - ) - } - + const { validIds } = validateQuestionIds(questionIds, allQuestions, benchmark) targetQuestionIds = validIds - logger.info( - `Using explicit questionIds: ${validIds.length} valid questions` + - (invalidIds.length > 0 ? ` (${invalidIds.length} invalid skipped)` : "") - ) } else if (sampling) { targetQuestionIds = selectQuestionsBySampling(allQuestions, sampling) } else { diff --git a/src/orchestrator/index.ts b/src/orchestrator/index.ts index f19123b..758067a 100644 --- a/src/orchestrator/index.ts +++ b/src/orchestrator/index.ts @@ -10,6 +10,7 @@ import { CheckpointManager } from "./checkpoint" import { getProviderConfig, getJudgeConfig } from "../utils/config" import { resolveModel } from "../utils/models" import { logger } from "../utils/logger" +import { validateQuestionIds } from "../utils/question-ids" import { runIngestPhase } from "./phases/ingest" import { runIndexingPhase } from "./phases/indexing" import { runSearchPhase } from "./phases/search" @@ -213,35 +214,8 @@ export class Orchestrator { effectiveLimit = limit if (questionIds && questionIds.length > 0) { - // Validate that all provided IDs exist in the benchmark - const allQuestionIdsSet = new Set(allQuestions.map((q) => q.questionId)) - const validIds: string[] = [] - const invalidIds: string[] = [] - - for (const id of questionIds) { - if (allQuestionIdsSet.has(id)) { - validIds.push(id) - } else { - invalidIds.push(id) - } - } - - if (invalidIds.length > 0) { - logger.warn(`Invalid question IDs (will be skipped): ${invalidIds.join(", ")}`) - } - - if (validIds.length === 0) { - throw new Error( - `All provided questionIds are invalid. No matching questions found in benchmark "${benchmarkName}". ` + - `Invalid IDs: ${invalidIds.join(", ")}` - ) - } - + const { validIds } = validateQuestionIds(questionIds, allQuestions, benchmarkName) targetQuestionIds = validIds - logger.info( - `Using explicit questionIds: ${validIds.length} valid questions` + - (invalidIds.length > 0 ? ` (${invalidIds.length} invalid skipped)` : "") - ) } else if (sampling) { logger.info(`Using sampling mode: ${sampling.mode}`) targetQuestionIds = selectQuestionsBySampling(allQuestions, sampling) diff --git a/src/server/routes/benchmarks.ts b/src/server/routes/benchmarks.ts index 6860315..4a46cb6 100644 --- a/src/server/routes/benchmarks.ts +++ b/src/server/routes/benchmarks.ts @@ -128,7 +128,8 @@ export async function handleBenchmarksRoutes(req: Request, url: URL): Promise - q.questionId.startsWith(trimmed + "-q") - ) - matchingQuestions.forEach((q) => { - expanded.push(q.questionId) - expandedIds.add(q.questionId) - }) - } - // Pattern 2: Session ID (e.g., "conv-26-session_1" or "001be529-session-0") - // Find all questions that reference this session - else if (trimmed.includes("-session")) { - const matchingQuestions = allQuestions.filter((q) => - q.haystackSessionIds.includes(trimmed) - ) - matchingQuestions.forEach((q) => { - expanded.push(q.questionId) - expandedIds.add(q.questionId) - }) - } - // Pattern 3: Direct question ID - add as-is if it exists - else { - const exactMatch = allQuestions.find((q) => q.questionId === trimmed) - if (exactMatch) { - expanded.push(trimmed) - expandedIds.add(trimmed) - } + const addMatch = (id: string) => { expanded.push(id); expandedIds.add(id) } + + // Priority: exact match > session lookup > prefix expansion + const exactMatch = allQuestions.find((q) => q.questionId === trimmed) + if (exactMatch) { + addMatch(trimmed) + } else if (trimmed.includes("-session")) { + // Session ID — find all questions that reference this session + allQuestions + .filter((q) => q.haystackSessionIds.includes(trimmed)) + .forEach((q) => addMatch(q.questionId)) + } else { + // Prefix match — works across all benchmarks (e.g., "conv-26" matches + // "conv-26-q0", "convomem-user_evidence" matches "convomem-user_evidence-0") + const prefix = trimmed.endsWith("-") ? trimmed : trimmed + "-" + allQuestions + .filter((q) => q.questionId.startsWith(prefix)) + .forEach((q) => addMatch(q.questionId)) } - patternResults[pattern] = expanded + patternResults[trimmed] = expanded } return json({ diff --git a/src/utils/question-ids.ts b/src/utils/question-ids.ts new file mode 100644 index 0000000..8db1430 --- /dev/null +++ b/src/utils/question-ids.ts @@ -0,0 +1,46 @@ +import { logger } from "./logger" + +export interface ValidateQuestionIdsResult { + validIds: string[] + invalidIds: string[] +} + +/** + * Validates a list of question IDs against the full set of questions in a benchmark. + * Returns valid and invalid IDs separately. Throws if all IDs are invalid. + */ +export function validateQuestionIds( + questionIds: string[], + allQuestions: { questionId: string }[], + benchmarkName: string +): ValidateQuestionIdsResult { + const allQuestionIdsSet = new Set(allQuestions.map((q) => q.questionId)) + const validIds: string[] = [] + const invalidIds: string[] = [] + + for (const id of questionIds) { + if (allQuestionIdsSet.has(id)) { + validIds.push(id) + } else { + invalidIds.push(id) + } + } + + if (invalidIds.length > 0) { + logger.warn(`Invalid question IDs (will be skipped): ${invalidIds.join(", ")}`) + } + + if (validIds.length === 0) { + throw new Error( + `All provided questionIds are invalid. No matching questions found in benchmark "${benchmarkName}". ` + + `Invalid IDs: ${invalidIds.join(", ")}` + ) + } + + logger.info( + `Using explicit questionIds: ${validIds.length} valid questions` + + (invalidIds.length > 0 ? ` (${invalidIds.length} invalid skipped)` : "") + ) + + return { validIds, invalidIds } +} diff --git a/ui/app/compare/new/page.tsx b/ui/app/compare/new/page.tsx index e095b84..a1abf73 100644 --- a/ui/app/compare/new/page.tsx +++ b/ui/app/compare/new/page.tsx @@ -8,14 +8,14 @@ import { getBenchmarks, getModels, startCompare, - expandQuestionIdPatterns, - getBenchmarkQuestions, type SelectionMode, type SampleType, type SamplingConfig, } from "@/lib/api" +import { type QuestionIdValidationResult } from "@/lib/question-id-validation" import { SingleSelect } from "@/components/single-select" import { MultiSelect } from "@/components/multi-select" +import { QuestionIdSelector } from "@/components/question-id-selector" export default function NewComparePage() { const router = useRouter() @@ -41,14 +41,7 @@ export default function NewComparePage() { }) const [editingCompareId, setEditingCompareId] = useState(false) - const [validatingQuestionIds, setValidatingQuestionIds] = useState(false) - const [questionIdValidation, setQuestionIdValidation] = useState<{ - valid: string[] - invalid: string[] - total: number - expanded: string[] - patternResults: Record - } | null>(null) + const [questionIdValidation, setQuestionIdValidation] = useState(null) const compareIdInputRef = useRef(null) useEffect(() => { @@ -83,70 +76,6 @@ export default function NewComparePage() { } } - async function validateQuestionIds( - benchmark: string, - questionIdsInput: string - ): Promise<{ - valid: string[] - invalid: string[] - total: number - expanded: string[] - patternResults: Record - }> { - // Parse input: split by comma, trim, remove duplicates - const inputPatterns = questionIdsInput - .split(",") - .map((id) => id.trim()) - .filter((id) => id.length > 0) - const uniquePatterns = [...new Set(inputPatterns)] - - // Call pattern expansion endpoint - const expansionResult = await expandQuestionIdPatterns(benchmark, uniquePatterns) - const expandedIds = expansionResult.expandedIds - - // Fetch all questions to validate expanded IDs exist - const allQuestionIds = new Set() - let page = 1 - let hasMore = true - - while (hasMore) { - const response = await getBenchmarkQuestions(benchmark, { - page, - limit: 100, - }) - response.questions.forEach((q) => allQuestionIds.add(q.questionId)) - hasMore = page < response.pagination.totalPages - page++ - } - - // Validate expanded IDs - const valid: string[] = [] - const invalid: string[] = [] - - expandedIds.forEach((id) => { - if (allQuestionIds.has(id)) { - valid.push(id) - } else { - invalid.push(id) - } - }) - - // Find patterns that didn't expand to anything - const patternsWithNoResults = uniquePatterns.filter( - (pattern) => - !expansionResult.patternResults[pattern] || - expansionResult.patternResults[pattern].length === 0 - ) - - return { - valid, - invalid: [...invalid, ...patternsWithNoResults], - total: uniquePatterns.length, - expanded: expandedIds, - patternResults: expansionResult.patternResults, - } - } - function generateCompareId() { const now = new Date() const date = now.toISOString().slice(0, 10).replace(/-/g, "") @@ -444,97 +373,14 @@ export default function NewComparePage() { )} {form.selectionMode === "questionIds" && ( -
-
- -