From 9a6024a78471662c0a6fb29bd5b44732057e6ce7 Mon Sep 17 00:00:00 2001 From: mbeaulne Date: Thu, 18 Jun 2026 13:05:29 -0400 Subject: [PATCH] add synonym groups --- src/services/componentSearchIndex.test.ts | 67 +++++++++++++++++++++++ src/services/componentSearchIndex.ts | 46 +++++++++++----- src/services/componentSearchSynonyms.ts | 35 ++++++++++++ 3 files changed, 134 insertions(+), 14 deletions(-) create mode 100644 src/services/componentSearchSynonyms.ts diff --git a/src/services/componentSearchIndex.test.ts b/src/services/componentSearchIndex.test.ts index 5f6a31ff3..4907a8262 100644 --- a/src/services/componentSearchIndex.test.ts +++ b/src/services/componentSearchIndex.test.ts @@ -318,6 +318,56 @@ describe("lexicalSearch", () => { expect(results[0]?.digest).toBe("stronger-concept-match"); }); + it("expands domain-neutral synonyms", () => { + const index = buildSearchIndex([ + makeSourced({ + digest: "storage", + spec: { + name: "upload_object", + description: "Upload files to a cloud storage bucket.", + inputs: [], + outputs: [], + implementation: { container: { image: "x" } }, + }, + }), + makeSourced({ + digest: "train", + spec: { + name: "train_model", + description: "Train a model on tabular data.", + inputs: [], + outputs: [], + implementation: { container: { image: "x" } }, + }, + }), + makeSourced({ + digest: "predict", + spec: { + name: "predict_labels", + description: "Predict labels for examples.", + inputs: [], + outputs: [], + implementation: { container: { image: "x" } }, + }, + }), + makeSourced({ + digest: "table", + spec: { + name: "clean_table", + description: "Clean tabular dataframe rows.", + inputs: [], + outputs: [], + implementation: { container: { image: "x" } }, + }, + }), + ]); + + expect(lexicalSearch(index, "gcs")[0]?.digest).toBe("storage"); + expect(lexicalSearch(index, "fit")[0]?.digest).toBe("train"); + expect(lexicalSearch(index, "infer")[0]?.digest).toBe("predict"); + expect(lexicalSearch(index, 
"df")[0]?.digest).toBe("table"); + }); + it("ignores natural-language filler words that would otherwise swamp intent", () => { const index = buildSearchIndex([ makeSourced({ @@ -480,6 +530,23 @@ describe("lexicalSearch", () => { expect(results[0]?.source).toEqual(USER); }); + it("expands dataframe synonyms to table-only metadata", () => { + const index = buildSearchIndex([ + makeSourced({ + digest: "table-only", + spec: { + name: "clean_rows", + description: "Clean table rows.", + inputs: [], + outputs: [], + implementation: { container: { image: "x" } }, + }, + }), + ]); + + expect(lexicalSearch(index, "df")[0]?.digest).toBe("table-only"); + }); + it("respects the limit option", () => { const many: SourcedReference[] = Array.from({ length: 10 }, (_, i) => makeSourced({ diff --git a/src/services/componentSearchIndex.ts b/src/services/componentSearchIndex.ts index fb2db8a86..76699d2e9 100644 --- a/src/services/componentSearchIndex.ts +++ b/src/services/componentSearchIndex.ts @@ -15,6 +15,8 @@ import type { ComponentReference } from "@/utils/componentSpec"; import { getComponentName } from "@/utils/getComponentName"; +import { expandSynonymTokens } from "./componentSearchSynonyms"; + /** Which field of a component matched the query. Surfaced in the UI. 
*/ export type MatchField = | "name" @@ -159,7 +161,7 @@ function stemToken(token: string): string { return token; } -function normalizeSearchText(text: string): string { +function baseSearchTokens(text: string): string[] { const splitText = splitIdentifierText(text).toLowerCase(); const tokens = splitText.split(/[^a-z0-9]+/).filter(isNonEmptyString); const expandedTokens: string[] = []; @@ -173,7 +175,17 @@ function normalizeSearchText(text: string): string { } } - return [text.toLowerCase(), splitText, expandedTokens.join(" ")].join(" "); + return expandedTokens; +} + +function normalizeSearchText(text: string): string { + const splitText = splitIdentifierText(text).toLowerCase(); + // Synonym expansion happens on the query side only (see `tokenizeQuery`). Expanding + // the index too would make a query token set intersect a ballooned index token + // set, surfacing components that match neither the literal text nor the intent. + return [text.toLowerCase(), splitText, baseSearchTokens(text).join(" ")].join( + " ", + ); } function extractAnnotationsText( @@ -354,21 +366,26 @@ const QUERY_STOP_WORDS = new Set([ ]); /** - * Split a query into meaningful lowercase alphanumeric tokens. Natural-language - * searches often include filler words ("I want to upload a component to GCS"). - * Dropping those words prevents common tokens like "a"/"to" from matching - * nearly every component and drowning out the useful intent terms. + * Drop filler words and de-duplicate query tokens. Natural-language searches + * often include filler ("I want to upload a component to GCS"); removing those + * prevents common tokens like "a"/"to" from matching nearly every component and + * drowning out the useful intent terms. 
*/ -function tokenizeConcepts(text: string): string[][] { +function tokenizeQuery(text: string): { + concepts: string[][]; + phraseTokens: string[]; +} { const splitText = splitIdentifierText(text).toLowerCase(); const rawTokens = splitText.split(/[^a-z0-9]+/).filter(isNonEmptyString); - const concepts: string[][] = []; + const phraseTokens: string[] = []; const seen = new Set(); + for (const token of rawTokens) { if (QUERY_STOP_WORDS.has(token)) continue; - const variants = Array.from(new Set([token, stemToken(token)])).filter( + phraseTokens.push(token); + const variants = expandSynonymTokens([token, stemToken(token)]).filter( (variant) => !QUERY_STOP_WORDS.has(variant), ); if (variants.length === 0) continue; @@ -380,7 +397,7 @@ function tokenizeConcepts(text: string): string[][] { seen.add(conceptKey); } - return concepts; + return { concepts, phraseTokens }; } /** @@ -430,6 +447,7 @@ interface SearchOptions { function scoreEntry( entry: IndexEntry, concepts: string[][], + phraseTokens: string[], ): { score: number; matchedFields: MatchField[] } { const matched = new Set(); let score = 0; @@ -447,9 +465,9 @@ function scoreEntry( // sides are normalized so the bonus also fires for snake_case names — // query "train test split" should match `train_test_split`, not just // names that happen to contain literal spaces. 
- if (concepts.length > 1) { + if (phraseTokens.length > 1) { const normalizedName = entry.searchable.name.replace(/[^a-z0-9]+/g, " "); - const normalizedQuery = concepts.map((concept) => concept[0]).join(" "); + const normalizedQuery = phraseTokens.join(" "); if (normalizedName.includes(normalizedQuery)) { score += 10; matched.add("name"); @@ -473,12 +491,12 @@ export function lexicalSearch( const trimmed = query.trim().toLowerCase(); if (trimmed.length < minLength) return []; - const concepts = tokenizeConcepts(trimmed); + const { concepts, phraseTokens } = tokenizeQuery(trimmed); if (concepts.length === 0) return []; const scored: Array = []; for (const entry of index) { - const { score, matchedFields } = scoreEntry(entry, concepts); + const { score, matchedFields } = scoreEntry(entry, concepts, phraseTokens); if (score === 0) continue; scored.push({ reference: entry.reference, diff --git a/src/services/componentSearchSynonyms.ts b/src/services/componentSearchSynonyms.ts new file mode 100644 index 000000000..83e25f2cb --- /dev/null +++ b/src/services/componentSearchSynonyms.ts @@ -0,0 +1,35 @@ +const SYNONYM_GROUPS = [ + ["gcs", "storage", "bucket"], + ["train", "fit", "training", "trainer"], + ["predict", "infer", "inference"], + ["df", "dataframe", "table"], + ["csv", "tabular"], + ["embed", "embedding", "vectorize"], + ["llm"], +] as const; + +const SYNONYM_TOKENS_BY_TOKEN = new Map(); + +for (const group of SYNONYM_GROUPS) { + for (const token of group) { + SYNONYM_TOKENS_BY_TOKEN.set(token, [...group]); + } +} + +export function expandSynonymTokens(tokens: string[]): string[] { + const expanded: string[] = []; + const seen = new Set(); + + for (const token of tokens) { + for (const variant of [ + token, + ...(SYNONYM_TOKENS_BY_TOKEN.get(token) ?? []), + ]) { + if (seen.has(variant)) continue; + seen.add(variant); + expanded.push(variant); + } + } + + return expanded; +}