From 9a6024a78471662c0a6fb29bd5b44732057e6ce7 Mon Sep 17 00:00:00 2001
From: mbeaulne <matt.beaulne@gmail.com>
Date: Thu, 18 Jun 2026 13:05:29 -0400
Subject: [PATCH] add synonym groups

---
 src/services/componentSearchIndex.test.ts | 67 +++++++++++++++++++++++
 src/services/componentSearchIndex.ts      | 46 +++++++++++-----
 src/services/componentSearchSynonyms.ts   | 35 ++++++++++++
 3 files changed, 134 insertions(+), 14 deletions(-)
 create mode 100644 src/services/componentSearchSynonyms.ts

diff --git a/src/services/componentSearchIndex.test.ts b/src/services/componentSearchIndex.test.ts
index 5f6a31ff3..4907a8262 100644
--- a/src/services/componentSearchIndex.test.ts
+++ b/src/services/componentSearchIndex.test.ts
@@ -318,6 +318,56 @@ describe("lexicalSearch", () => {
     expect(results[0]?.digest).toBe("stronger-concept-match");
   });
 
+  it("expands domain-neutral synonyms", () => {
+    const index = buildSearchIndex([
+      makeSourced({
+        digest: "storage",
+        spec: {
+          name: "upload_object",
+          description: "Upload files to a cloud storage bucket.",
+          inputs: [],
+          outputs: [],
+          implementation: { container: { image: "x" } },
+        },
+      }),
+      makeSourced({
+        digest: "train",
+        spec: {
+          name: "train_model",
+          description: "Train a model on tabular data.",
+          inputs: [],
+          outputs: [],
+          implementation: { container: { image: "x" } },
+        },
+      }),
+      makeSourced({
+        digest: "predict",
+        spec: {
+          name: "predict_labels",
+          description: "Predict labels for examples.",
+          inputs: [],
+          outputs: [],
+          implementation: { container: { image: "x" } },
+        },
+      }),
+      makeSourced({
+        digest: "table",
+        spec: {
+          name: "clean_table",
+          description: "Clean tabular dataframe rows.",
+          inputs: [],
+          outputs: [],
+          implementation: { container: { image: "x" } },
+        },
+      }),
+    ]);
+    // No query term below occurs in the fixture text; matches come via synonyms.
+    expect(lexicalSearch(index, "gcs")[0]?.digest).toBe("storage");
+    expect(lexicalSearch(index, "fit")[0]?.digest).toBe("train");
+    expect(lexicalSearch(index, "infer")[0]?.digest).toBe("predict");
+    expect(lexicalSearch(index, "df")[0]?.digest).toBe("table");
+  });
+
   it("ignores natural-language filler words that would otherwise swamp intent", () => {
     const index = buildSearchIndex([
       makeSourced({
@@ -480,6 +530,23 @@ describe("lexicalSearch", () => {
     expect(results[0]?.source).toEqual(USER);
   });
 
+  it("expands dataframe synonyms to table-only metadata", () => {
+    const index = buildSearchIndex([
+      makeSourced({
+        digest: "table-only",
+        spec: {
+          name: "clean_rows",
+          description: "Clean table rows.",
+          inputs: [],
+          outputs: [],
+          implementation: { container: { image: "x" } },
+        },
+      }),
+    ]);
+    // Neither "df" nor "dataframe" occurs here; only the synonym "table" does.
+    expect(lexicalSearch(index, "df")[0]?.digest).toBe("table-only");
+  });
+
   it("respects the limit option", () => {
     const many: SourcedReference[] = Array.from({ length: 10 }, (_, i) =>
       makeSourced({
diff --git a/src/services/componentSearchIndex.ts b/src/services/componentSearchIndex.ts
index fb2db8a86..76699d2e9 100644
--- a/src/services/componentSearchIndex.ts
+++ b/src/services/componentSearchIndex.ts
@@ -15,6 +15,8 @@
 import type { ComponentReference } from "@/utils/componentSpec";
 import { getComponentName } from "@/utils/getComponentName";
 
+import { expandSynonymTokens } from "./componentSearchSynonyms";
+
 /** Which field of a component matched the query. Surfaced in the UI. */
 export type MatchField =
   | "name"
@@ -159,7 +161,7 @@ function stemToken(token: string): string {
   return token;
 }
 
-function normalizeSearchText(text: string): string {
+function baseSearchTokens(text: string): string[] {
   const splitText = splitIdentifierText(text).toLowerCase();
   const tokens = splitText.split(/[^a-z0-9]+/).filter(isNonEmptyString);
   const expandedTokens: string[] = [];
@@ -173,7 +175,17 @@ function normalizeSearchText(text: string): string {
     }
   }
 
-  return [text.toLowerCase(), splitText, expandedTokens.join(" ")].join(" ");
+  return expandedTokens;
+}
+
+function normalizeSearchText(text: string): string {
+  const splitText = splitIdentifierText(text).toLowerCase();
+  // Synonym expansion is query-side only (see `tokenizeQuery`). Expanding
+  // the index too would make a query token set intersect a ballooned index token
+  // set, surfacing components that match neither the literal text nor the intent.
+  return [text.toLowerCase(), splitText, baseSearchTokens(text).join(" ")].join(
+    " ",
+  );
 }
 
 function extractAnnotationsText(
@@ -354,21 +366,26 @@ const QUERY_STOP_WORDS = new Set([
 ]);
 
 /**
- * Split a query into meaningful lowercase alphanumeric tokens. Natural-language
- * searches often include filler words ("I want to upload a component to GCS").
- * Dropping those words prevents common tokens like "a"/"to" from matching
- * nearly every component and drowning out the useful intent terms.
+ * Split a query into de-duplicated concepts (each token plus its stem and
+ * synonyms) and the ordered literal tokens used for phrase matching. Filler
+ * words ("I want to upload a component to GCS") are dropped so that common
+ * tokens like "a"/"to" cannot match nearly every component and swamp intent.
  */
-function tokenizeConcepts(text: string): string[][] {
+function tokenizeQuery(text: string): {
+  concepts: string[][];
+  phraseTokens: string[];
+} {
   const splitText = splitIdentifierText(text).toLowerCase();
   const rawTokens = splitText.split(/[^a-z0-9]+/).filter(isNonEmptyString);
-
   const concepts: string[][] = [];
+  const phraseTokens: string[] = [];
   const seen = new Set<string>();
+
   for (const token of rawTokens) {
     if (QUERY_STOP_WORDS.has(token)) continue;
 
-    const variants = Array.from(new Set([token, stemToken(token)])).filter(
+    phraseTokens.push(token);
+    const variants = expandSynonymTokens([token, stemToken(token)]).filter(
       (variant) => !QUERY_STOP_WORDS.has(variant),
     );
     if (variants.length === 0) continue;
@@ -380,7 +397,7 @@ function tokenizeConcepts(text: string): string[][] {
     seen.add(conceptKey);
   }
 
-  return concepts;
+  return { concepts, phraseTokens };
 }
 
 /**
@@ -430,6 +447,7 @@ interface SearchOptions {
 function scoreEntry(
   entry: IndexEntry,
   concepts: string[][],
+  phraseTokens: string[],
 ): { score: number; matchedFields: MatchField[] } {
   const matched = new Set<MatchField>();
   let score = 0;
@@ -447,9 +465,9 @@ function scoreEntry(
   // sides are normalized so the bonus also fires for snake_case names —
   // query "train test split" should match `train_test_split`, not just
   // names that happen to contain literal spaces.
-  if (concepts.length > 1) {
+  if (phraseTokens.length > 1) {
     const normalizedName = entry.searchable.name.replace(/[^a-z0-9]+/g, " ");
-    const normalizedQuery = concepts.map((concept) => concept[0]).join(" ");
+    const normalizedQuery = phraseTokens.join(" ");
     if (normalizedName.includes(normalizedQuery)) {
       score += 10;
       matched.add("name");
@@ -473,12 +491,12 @@ export function lexicalSearch(
   const trimmed = query.trim().toLowerCase();
   if (trimmed.length < minLength) return [];
 
-  const concepts = tokenizeConcepts(trimmed);
+  const { concepts, phraseTokens } = tokenizeQuery(trimmed);
   if (concepts.length === 0) return [];
 
   const scored: Array<LexicalMatch & { score: number }> = [];
   for (const entry of index) {
-    const { score, matchedFields } = scoreEntry(entry, concepts);
+    const { score, matchedFields } = scoreEntry(entry, concepts, phraseTokens);
     if (score === 0) continue;
     scored.push({
       reference: entry.reference,
diff --git a/src/services/componentSearchSynonyms.ts b/src/services/componentSearchSynonyms.ts
new file mode 100644
index 000000000..83e25f2cb
--- /dev/null
+++ b/src/services/componentSearchSynonyms.ts
@@ -0,0 +1,35 @@
+const SYNONYM_GROUPS = [
+  ["gcs", "storage", "bucket"],
+  ["train", "fit", "training", "trainer"],
+  ["predict", "infer", "inference"],
+  ["df", "dataframe", "table"],
+  ["csv", "tabular"],
+  ["embed", "embedding", "vectorize"],
+  ["llm"],
+] as const;
+// Maps every token to its full synonym group, the token itself included.
+const SYNONYM_TOKENS_BY_TOKEN = new Map<string, string[]>();
+// A token listed in more than one group keeps only the last group's entry.
+for (const group of SYNONYM_GROUPS) {
+  for (const token of group) {
+    SYNONYM_TOKENS_BY_TOKEN.set(token, [...group]);
+  }
+}
+/** Adds each token's synonyms, de-duplicated in first-seen order. */
+export function expandSynonymTokens(tokens: string[]): string[] {
+  const expanded: string[] = [];
+  const seen = new Set<string>();
+
+  for (const token of tokens) {
+    for (const variant of [
+      token,
+      ...(SYNONYM_TOKENS_BY_TOKEN.get(token) ?? []),
+    ]) {
+      if (seen.has(variant)) continue;
+      seen.add(variant);
+      expanded.push(variant);
+    }
+  }
+
+  return expanded;
+}