From 2f1abcc9ab3fc5efa1ba82e9cc7d2206eeae03b0 Mon Sep 17 00:00:00 2001 From: giriraj-singh-couchbase Date: Mon, 1 Dec 2025 15:15:13 +0530 Subject: [PATCH 1/5] updated tutorial based upon 8.0 requirements --- jinaai/gsi/frontmatter.md | 22 --- jinaai/{gsi => query_based}/.env.sample | 0 .../RAG_with_Couchbase_and_Jina_AI.ipynb | 182 +++++++++--------- jinaai/query_based/frontmatter.md | 24 +++ jinaai/{fts => search_based}/.env.sample | 0 .../RAG_with_Couchbase_and_Jina_AI.ipynb | 4 +- jinaai/{fts => search_based}/frontmatter.md | 4 +- jinaai/{fts => search_based}/jina_index.json | 0 8 files changed, 115 insertions(+), 121 deletions(-) delete mode 100644 jinaai/gsi/frontmatter.md rename jinaai/{gsi => query_based}/.env.sample (100%) rename jinaai/{gsi => query_based}/RAG_with_Couchbase_and_Jina_AI.ipynb (84%) create mode 100644 jinaai/query_based/frontmatter.md rename jinaai/{fts => search_based}/.env.sample (100%) rename jinaai/{fts => search_based}/RAG_with_Couchbase_and_Jina_AI.ipynb (98%) rename jinaai/{fts => search_based}/frontmatter.md (88%) rename jinaai/{fts => search_based}/jina_index.json (100%) diff --git a/jinaai/gsi/frontmatter.md b/jinaai/gsi/frontmatter.md deleted file mode 100644 index 8f6e50c3..00000000 --- a/jinaai/gsi/frontmatter.md +++ /dev/null @@ -1,22 +0,0 @@ ---- -# frontmatter -path: "/tutorial-jina-couchbase-rag-with-global-secondary-index" -title: Retrieval-Augmented Generation (RAG) with Couchbase and Jina AI using GSI -short_title: RAG with Couchbase and Jina -description: - - Learn how to build a semantic search engine using Couchbase and Jina. - - This tutorial demonstrates how to integrate Couchbase's vector search capabilities with Jina embeddings and language models. - - You'll understand how to perform Retrieval-Augmented Generation (RAG) using LangChain and Couchbase using GSI. 
-content_type: tutorial -filter: sdk -technology: - - vector search -tags: - - GSI - - Artificial Intelligence - - LangChain - - Jina AI -sdk_language: - - python -length: 60 Mins ---- diff --git a/jinaai/gsi/.env.sample b/jinaai/query_based/.env.sample similarity index 100% rename from jinaai/gsi/.env.sample rename to jinaai/query_based/.env.sample diff --git a/jinaai/gsi/RAG_with_Couchbase_and_Jina_AI.ipynb b/jinaai/query_based/RAG_with_Couchbase_and_Jina_AI.ipynb similarity index 84% rename from jinaai/gsi/RAG_with_Couchbase_and_Jina_AI.ipynb rename to jinaai/query_based/RAG_with_Couchbase_and_Jina_AI.ipynb index 6ee9b44f..3b512336 100644 --- a/jinaai/gsi/RAG_with_Couchbase_and_Jina_AI.ipynb +++ b/jinaai/query_based/RAG_with_Couchbase_and_Jina_AI.ipynb @@ -5,15 +5,7 @@ "id": "3459e9c0", "metadata": {}, "source": [ - "# Semantic Search with Couchbase GSI Vector Indexes and Jina AI" - ] - }, - { - "cell_type": "markdown", - "id": "7b0c7eae", - "metadata": {}, - "source": [ - "## Overview" + "## Introduction" ] }, { @@ -21,12 +13,12 @@ "id": "569c4838", "metadata": {}, "source": [ - "This tutorial demonstrates building a high-performance semantic search engine using Couchbase's GSI (Global Secondary Index) vector search and Jina AI for embeddings and language models. We'll show measurable performance improvements with GSI optimization and implement a complete RAG (Retrieval-Augmented Generation) system. Alternatively if you want to perform semantic search using the FTS, please take a look at [this.](https://developer.couchbase.com/tutorial-jina-couchbase-rag-with-fts)\n", + "This tutorial demonstrates building a high-performance semantic search engine using Couchbase's Hyperscale and Composite indexes with Jina AI for embeddings and language models. We'll show measurable performance improvements with Hyperscale and Composite indexes optimization and implement a complete RAG (Retrieval-Augmented Generation) system. 
For deep dive on the working of these indexes refer to the following [documentation](https://docs.couchbase.com/server/current/vector-index/use-vector-indexes.html). Alternatively if you want to perform semantic search using the Search Vector Index, please take a look at [this.](https://developer.couchbase.com/tutorial-jina-couchbase-rag-with-search-vector-index)\n", "\n", "**Key Features:**\n", - "- High-performance GSI vector search with BHIVE indexing\n", + "- High-performance vector search using Hyperscale/Composite indexes\n", "- Jina AI embeddings and language models\n", - "- Performance benchmarks showing GSI benefits\n", + "- Performance benchmarks showing Hyperscale/Composite index benefits\n", "- Complete RAG workflow with caching optimization\n", "\n", "**Requirements:** Couchbase Server 8.0+ or Capella with Query Service enabled." @@ -45,7 +37,7 @@ "id": "c1f64ee4", "metadata": {}, "source": [ - "This tutorial is available as a Jupyter Notebook that you can run interactively on [Google Colab](https://colab.research.google.com/) or locally by setting up the Python environment. You can access the original notebook [here](https://github.com/couchbase-examples/vector-search-cookbook/blob/main/jinaai/gsi/RAG_with_Couchbase_and_Jina_AI.ipynb)." + "This tutorial is available as a Jupyter Notebook that you can run interactively on [Google Colab](https://colab.research.google.com/) or locally by setting up the Python environment. You can access the original notebook [here](https://github.com/couchbase-examples/vector-search-cookbook/blob/main/jinaai/query_based/RAG_with_Couchbase_and_Jina_AI.ipynb)." 
] }, { @@ -70,7 +62,7 @@ "metadata": {}, "source": [ "- **Couchbase Server 8.0+** or Couchbase Capella\n", - "- **Query Service enabled** (required for GSI Vector Indexes)\n", + "- **Query Service enabled** (required for both Hyperscale and Composite Vector Indexes)\n", "- **Jina AI API credentials** ([Get them here](https://jina.ai/))\n", "- **JinaChat API credentials** ([Get them here](https://chat.jina.ai/api))" ] @@ -90,7 +82,7 @@ "source": [ "1. **Create Account:** Deploy a [free tier cluster](https://cloud.couchbase.com/sign-up)\n", "2. **Configure Access:** Set up database credentials and network security \n", - "3. **Enable Query Service:** Required for GSI vector search functionality" + "3. **Enable Query Service:** Required for vector search functionality using Hyperscale and Composite vector index" ] }, { @@ -114,7 +106,7 @@ "id": "317fdebc", "metadata": {}, "source": [ - "Install the necessary packages for Couchbase GSI vector search, Jina AI integration, and LangChain RAG capabilities." + "Install the necessary packages for Couchbase vector search, Jina AI integration, and LangChain RAG capabilities." ] }, { @@ -149,7 +141,7 @@ "id": "5ac90d29", "metadata": {}, "source": [ - "Import libraries for Couchbase GSI vector search, Jina AI models, and LangChain components." + "Import libraries for Couchbase vector search, Jina AI models, and LangChain components." ] }, { @@ -495,7 +487,7 @@ "id": "b61549cc", "metadata": {}, "source": [ - "### Create GSI Vector Store" + "### Create Couchbase Vector Store" ] }, { @@ -503,12 +495,12 @@ "id": "ac616279", "metadata": {}, "source": [ - "Set up the GSI vector store for high-performance vector storage and similarity search using Couchbase's Query Service." + "Set up the Couchbase vector store which enables both Hyperscale and Composite Vector Indexes for high-performance vector storage and similarity search using Couchbase's Query Service." 
] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "b16d7be6", "metadata": {}, "outputs": [ @@ -516,7 +508,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2025-10-08 11:18:57,341 - INFO - Successfully created GSI vector store\n" + "2025-10-08 11:18:57,341 - INFO - Successfully created Couchbase vector store\n" ] } ], @@ -530,9 +522,9 @@ " embedding=embeddings,\n", " distance_metric=DistanceStrategy.COSINE\n", " )\n", - " logging.info(\"Successfully created GSI vector store\")\n", + " logging.info(\"Successfully created Couchbase vector store\")\n", "except Exception as e:\n", - " raise ValueError(f\"Failed to create GSI vector store: {str(e)}\")" + " raise ValueError(f\"Failed to create Couchbase vector store: {str(e)}\")" ] }, { @@ -548,7 +540,7 @@ "id": "a2ab4b1b", "metadata": {}, "source": [ - "**Important**: GSI Vector Indexes must be created AFTER uploading vector data. The index creation process analyzes existing vectors to optimize search performance through clustering and quantization." + "**Important**: Hyperscale and Composite Vector Indexes must be created AFTER uploading vector data. The index creation process analyzes existing vectors to optimize search performance through clustering and quantization." ] }, { @@ -718,11 +710,11 @@ "lines_to_next_cell": 2 }, "source": [ - "Now let's demonstrate the performance benefits of GSI optimization by testing pure vector search performance. We'll compare three optimization levels:\n", + "Now let's demonstrate the performance benefits of Hyperscale/Composite Vector Index by testing pure vector search performance. We'll compare three optimization levels:\n", "\n", - "1. **Baseline Performance**: Vector search without GSI optimization\n", - "2. **GSI-Optimized Performance**: Same search with BHIVE GSI index\n", - "3. **Cache Benefits**: Show how caching can be applied on top of GSI for repeated queries" + "1. 
**Baseline Performance**: Vector search without Hyperscale/Composite Vector Index optimization\n", + "2. **Hyperscale/Composite Vector Index-Optimized Performance**: Same search with BHIVE Hyperscale/Composite Vector Index\n", + "3. **Cache Benefits**: Show how caching can be applied on top of Hyperscale/Composite Vector Index for repeated queries" ] }, { @@ -730,7 +722,7 @@ "id": "e200aa57", "metadata": {}, "source": [ - "### GSI Vector Index Types Overview" + "### Vector Index Types Overview" ] }, { @@ -810,13 +802,13 @@ "\n", "#### **Performance Considerations**\n", "\n", - "**Distance Interpretation**: In GSI vector search, lower distance values indicate higher similarity, while higher distance values indicate lower similarity.\n", + "**Distance Interpretation**: In vector search using Hyperscale and Composite vector indexes, lower distance values indicate higher similarity, while higher distance values indicate lower similarity.\n", "\n", "**Scalability**: BHIVE indexes can scale to billions of vectors with optimized concurrent operations, making them suitable for large-scale production deployments.\n", "\n", "For detailed configuration options, see the [Quantization & Centroid Settings](https://docs.couchbase.com/cloud/vector-index/hyperscale-vector-index.html#algo_settings).\n", "\n", - "For more information on GSI vector indexes, see [Couchbase GSI Vector Documentation](https://docs.couchbase.com/cloud/vector-index/use-vector-indexes.html)." + "For more information on Hyperscale and Composite vector indexes, see [Couchbase Hyperscale and Composite Vector Index Documentation](https://docs.couchbase.com/cloud/vector-index/use-vector-indexes.html)." 
] }, { @@ -868,7 +860,7 @@ "id": "cc751f3c", "metadata": {}, "source": [ - "### Test 1: Baseline Performance (No GSI Index)" + "### Test 1: Baseline Performance (No Hyperscale/Composite Vector Index)" ] }, { @@ -876,7 +868,7 @@ "id": "1e875b60", "metadata": {}, "source": [ - "Test pure vector search performance without GSI optimization." + "Test pure vector search performance without Hyperscale/Composite Vector Index optimization." ] }, { @@ -889,7 +881,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Testing baseline vector search performance without GSI optimization...\n", + "Testing baseline vector search performance without Hyperscale/Composite Vector Index optimization...\n", "\n", "[Baseline Search] Testing vector search performance\n", "[Baseline Search] Query: 'What was manchester city manager pep guardiola's reaction to the team's current form?'\n", @@ -900,17 +892,17 @@ "\n", "\"We have entirely changed the way we think about...\n", "\n", - "Baseline vector search time (without GSI): 0.8305 seconds\n", + "Baseline vector search time (without Hyperscale/Composite Vector Index): 0.8305 seconds\n", "\n" ] } ], "source": [ - "# Test baseline vector search performance without GSI index\n", + "# Test baseline vector search performance without Hyperscale/Composite Vector Index\n", "test_query = \"What was manchester city manager pep guardiola's reaction to the team's current form?\"\n", - "print(\"Testing baseline vector search performance without GSI optimization...\")\n", + "print(\"Testing baseline vector search performance without Hyperscale/Composite Vector Index optimization...\")\n", "baseline_time = test_vector_search_performance(vector_store, test_query, \"Baseline Search\")\n", - "print(f\"\\nBaseline vector search time (without GSI): {baseline_time:.4f} seconds\\n\")" + "print(f\"\\nBaseline vector search time (without Hyperscale/Composite Vector Index): {baseline_time:.4f} seconds\\n\")" ] }, { @@ -918,7 +910,7 @@ "id": "c0dc252f", 
"metadata": {}, "source": [ - "### Create BHIVE GSI Index" + "### Create BHIVE Hyperscale/Composite Vector Index" ] }, { @@ -926,12 +918,12 @@ "id": "926f3cb6", "metadata": {}, "source": [ - "Now let's create a BHIVE GSI vector index to enable high-performance vector searches. The index creation is done programmatically through the vector store." + "Now let's create a BHIVE Hyperscale/Composite vector index to enable high-performance vector searches. The index creation is done programmatically through the vector store." ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "0f67fd3a", "metadata": {}, "outputs": [ @@ -939,21 +931,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "Creating BHIVE GSI vector index...\n", - "GSI Vector index created successfully\n", + "Creating BHIVE Hyperscale/Composite vector index...\n", + "BHIVE Hyperscale/Composite vector index created successfully\n", "Waiting for index to become available...\n" ] } ], "source": [ - "# Create GSI Vector Index for high-performance searches\n", - "print(\"Creating BHIVE GSI vector index...\")\n", + "# Create BHIVE Vector Index for high-performance searches\n", + "print(\"Creating BHIVE Hyperscale/Composite vector index...\")\n", "try:\n", " vector_store.create_index(\n", " index_type=IndexType.BHIVE, # Use IndexType.COMPOSITE for Composite index\n", " index_description=\"IVF,SQ8\"\n", " )\n", - " print(\"GSI Vector index created successfully\")\n", + " print(\"BHIVE Hyperscale/Composite vector index created successfully\")\n", " \n", " # Wait for index to become available\n", " print(\"Waiting for index to become available...\")\n", @@ -961,9 +953,9 @@ " \n", "except Exception as e:\n", " if \"already exists\" in str(e).lower():\n", - " print(\"GSI Vector index already exists, proceeding...\")\n", + " print(\"BHIVE Hyperscale/Composite vector index already exists, proceeding...\")\n", " else:\n", - " print(f\"Error creating GSI index: {str(e)}\")" + " 
print(f\"Error creating BHIVE Hyperscale/Composite vector index: {str(e)}\")" ] }, { @@ -995,7 +987,7 @@ "id": "d3e24394", "metadata": {}, "source": [ - "### Test 2: GSI-Optimized Performance" + "### Test 2: Hyperscale and Composite vector indexes Optimized Performance" ] }, { @@ -1003,7 +995,7 @@ "id": "9c8032cb", "metadata": {}, "source": [ - "Test the same vector search with BHIVE GSI optimization." + "Test the same vector search with Hyperscale optimization." ] }, { @@ -1016,22 +1008,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "Testing vector search performance with BHIVE GSI optimization...\n", + "Testing vector search performance with Hyperscale optimization...\n", "\n", - "[GSI-Optimized Search] Testing vector search performance\n", - "[GSI-Optimized Search] Query: 'What happened in the latest Premier League matches?'\n", - "[GSI-Optimized Search] Vector search completed in 0.6452 seconds\n", - "[GSI-Optimized Search] Found 3 documents\n", - "[GSI-Optimized Search] Top result distance: 0.394714 (lower = more similar)\n", - "[GSI-Optimized Search] Top result preview: The latest updates and analysis from the BBC.\n" + "[Hyperscale-Optimized Search] Testing vector search performance\n", + "[Hyperscale-Optimized Search] Query: 'What happened in the latest Premier League matches?'\n", + "[Hyperscale-Optimized Search] Vector search completed in 0.6452 seconds\n", + "[Hyperscale-Optimized Search] Found 3 documents\n", + "[Hyperscale-Optimized Search] Top result distance: 0.394714 (lower = more similar)\n", + "[Hyperscale-Optimized Search] Top result preview: The latest updates and analysis from the BBC.\n" ] } ], "source": [ - "# Test vector search performance with GSI index\n", - "gsi_test_query = \"What happened in the latest Premier League matches?\"\n", - "print(\"Testing vector search performance with BHIVE GSI optimization...\")\n", - "gsi_time = test_vector_search_performance(vector_store, gsi_test_query, \"GSI-Optimized Search\")" + "# Test 
vector search performance with Hyperscale index\n", + "hyperscale_test_query = \"What happened in the latest Premier League matches?\"\n", + "print(\"Testing vector search performance with Hyperscale optimization...\")\n", + "hyperscale_time = test_vector_search_performance(vector_store, hyperscale_test_query, \"Hyperscale-Optimized Search\")" ] }, { @@ -1047,7 +1039,7 @@ "id": "5878b5fe", "metadata": {}, "source": [ - "Now let's demonstrate how caching can improve performance for repeated queries. **Note**: Caching benefits apply to both baseline and GSI-optimized searches." + "Now let's demonstrate how caching can improve performance for repeated queries. **Note**: Caching benefits apply to both baseline and Hyperscale-optimized searches." ] }, { @@ -1139,7 +1131,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "id": "8e87794a", "metadata": {}, "outputs": [ @@ -1151,8 +1143,8 @@ "================================================================================\n", "VECTOR SEARCH PERFORMANCE OPTIMIZATION SUMMARY\n", "================================================================================\n", - "Phase 1 - Baseline Search (No GSI): 0.8305 seconds\n", - "Phase 2 - GSI-Optimized Search: 0.6452 seconds\n", + "Phase 1 - Baseline Search (No Hyperscale): 0.8305 seconds\n", + "Phase 2 - Hyperscale-Optimized Search: 0.6452 seconds\n", "Phase 3 - Cache Benefits:\n", " First execution (cache miss): 0.9695 seconds\n", " Second execution (cache hit): 0.5252 seconds\n", @@ -1160,14 +1152,14 @@ "--------------------------------------------------------------------------------\n", "VECTOR SEARCH OPTIMIZATION IMPACT:\n", "--------------------------------------------------------------------------------\n", - "GSI Index Benefit: 1.29x faster (22.3% improvement)\n", + "Hyperscale Index Benefit: 1.29x faster (22.3% improvement)\n", "Cache Benefit: 1.85x faster (45.8% improvement)\n", "\n", "Key Insights for Vector Search Performance:\n", - "• 
GSI BHIVE indexes provide significant performance improvements for vector similarity search\n", + "• Hyperscale BHIVE indexes provide significant performance improvements for vector similarity search\n", "• Performance gains are most dramatic for complex semantic queries\n", "• BHIVE optimization is particularly effective for high-dimensional embeddings\n", - "• Combined with proper quantization (SQ8), GSI delivers production-ready performance\n", + "• Combined with proper quantization (SQ8), BHIVE Hyperscale delivers production-ready performance\n", "• These performance improvements directly benefit any application using the vector store\n" ] } @@ -1177,8 +1169,8 @@ "print(\"VECTOR SEARCH PERFORMANCE OPTIMIZATION SUMMARY\")\n", "print(\"=\"*80)\n", "\n", - "print(f\"Phase 1 - Baseline Search (No GSI): {baseline_time:.4f} seconds\")\n", - "print(f\"Phase 2 - GSI-Optimized Search: {gsi_time:.4f} seconds\")\n", + "print(f\"Phase 1 - Baseline Search (No Hyperscale): {baseline_time:.4f} seconds\")\n", + "print(f\"Phase 2 - Hyperscale-Optimized Search: {hyperscale_time:.4f} seconds\")\n", "if cache_time_1 and cache_time_2:\n", " print(f\"Phase 3 - Cache Benefits:\")\n", " print(f\" First execution (cache miss): {cache_time_1:.4f} seconds\")\n", @@ -1188,12 +1180,12 @@ "print(\"VECTOR SEARCH OPTIMIZATION IMPACT:\")\n", "print(\"-\"*80)\n", "\n", - "# GSI improvement analysis\n", - "if baseline_time and gsi_time:\n", - " speedup = baseline_time / gsi_time if gsi_time > 0 else float('inf')\n", - " time_saved = baseline_time - gsi_time\n", + "# Hyperscale improvement analysis\n", + "if baseline_time and hyperscale_time:\n", + " speedup = baseline_time / hyperscale_time if hyperscale_time > 0 else float('inf')\n", + " time_saved = baseline_time - hyperscale_time\n", " percent_improvement = (time_saved / baseline_time) * 100\n", - " print(f\"GSI Index Benefit: {speedup:.2f}x faster ({percent_improvement:.1f}% improvement)\")\n", + " print(f\"Hyperscale Index Benefit: 
{speedup:.2f}x faster ({percent_improvement:.1f}% improvement)\")\n", "\n", "# Cache improvement analysis\n", "if cache_time_1 and cache_time_2 and cache_time_2 < cache_time_1:\n", @@ -1204,10 +1196,10 @@ " print(f\"Cache Benefit: Variable (depends on query complexity and caching mechanism)\")\n", "\n", "print(f\"\\nKey Insights for Vector Search Performance:\")\n", - "print(f\"• GSI BHIVE indexes provide significant performance improvements for vector similarity search\")\n", + "print(f\"• Hyperscale BHIVE indexes provide significant performance improvements for vector similarity search\")\n", "print(f\"• Performance gains are most dramatic for complex semantic queries\")\n", "print(f\"• BHIVE optimization is particularly effective for high-dimensional embeddings\")\n", - "print(f\"• Combined with proper quantization (SQ8), GSI delivers production-ready performance\")\n", + "print(f\"• Combined with proper quantization (SQ8), Hyperscale delivers production-ready performance\")\n", "print(f\"• These performance improvements directly benefit any application using the vector store\")" ] }, @@ -1232,10 +1224,10 @@ "id": "c9d50d7a", "metadata": {}, "source": [ - "Now that we've optimized our vector search performance, let's demonstrate how to build a complete RAG system using Jina AI. RAG combines the power of our GSI-optimized semantic search with language model generation:\n", + "Now that we've optimized our vector search performance, let's demonstrate how to build a complete RAG system using Jina AI. RAG combines the power of our Hyperscale/Composite index optimized semantic search with language model generation:\n", "\n", "1. **Query Processing**: User question is converted to vector embedding using Jina AI\n", - "2. **Document Retrieval**: GSI BHIVE index finds most relevant documents (now with proven performance improvements)\n", + "2. 
**Document Retrieval**: BHIVE Hyperscale/Composite index finds most relevant documents (now with proven performance improvements)\n", "3. **Context Assembly**: Retrieved documents provide factual context for the language model\n", "4. **Response Generation**: Jina's language model generates intelligent answers grounded in the retrieved data\n", "\n", @@ -1255,7 +1247,7 @@ "id": "14e582ba", "metadata": {}, "source": [ - "Initialize Jina's chat model for generating intelligent responses based on our GSI-optimized retrieval system." + "Initialize Jina's chat model for generating intelligent responses based on our Hyperscale/Composite optimized retrieval system." ] }, { @@ -1305,12 +1297,12 @@ "id": "9a2f73b9", "metadata": {}, "source": [ - "Create the complete RAG pipeline that integrates our GSI-optimized vector search with Jina's language model." + "Create the complete RAG pipeline that integrates our Hyperscale/Composite optimized vector search with Jina's language model." ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "id": "44af7a43", "metadata": {}, "outputs": [ @@ -1319,7 +1311,7 @@ "output_type": "stream", "text": [ "Optimized RAG pipeline created successfully\n", - "Components: GSI BHIVE Vector Search → Context Assembly → Jina Language Model → Response\n" + "Components: BHIVE Hyperscale/Composite Vector Search → Context Assembly → Jina Language Model → Response\n" ] } ], @@ -1339,7 +1331,7 @@ " \n", " prompt = ChatPromptTemplate.from_template(template)\n", "\n", - " # Build the RAG chain: GSI-Optimized Retrieval → Context → Generation → Output\n", + " # Build the RAG chain: Hyperscale/Composite optimized Retrieval → Context → Generation → Output\n", " rag_chain = (\n", " {\n", " \"context\": vector_store.as_retriever(search_kwargs={\"k\": 2}), \n", @@ -1350,7 +1342,7 @@ " | StrOutputParser()\n", " )\n", " print(\"Optimized RAG pipeline created successfully\")\n", - " print(\"Components: GSI BHIVE Vector Search → Context 
Assembly → Jina Language Model → Response\")\n", + " print(\"Components: BHIVE Hyperscale/Composite Vector Search → Context Assembly → Jina Language Model → Response\")\n", "except Exception as e:\n", " raise ValueError(f\"Error creating RAG pipeline: {str(e)}\")" ] @@ -1368,12 +1360,12 @@ "id": "ba25dccc", "metadata": {}, "source": [ - "Test the complete RAG system leveraging our GSI performance optimizations." + "Test the complete RAG system leveraging our Hyperscale/Composite performance optimizations." ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "id": "915cd261", "metadata": {}, "outputs": [ @@ -1381,13 +1373,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Testing RAG System with GSI-Optimized Vector Search\n", + "Testing RAG System with BHIVE Hyperscale/Composite-Optimized Vector Search\n", "============================================================\n", "User Query: What are the new eligibility rules for transgender women competing in leading women's golf tours, and what prompted these changes?\n", "\n", "Processing with optimized pipeline...\n", "1. Converting query to vector embedding with Jina AI\n", - "2. Searching GSI BHIVE index for relevant documents (optimized)\n", + "2. Searching BHIVE Hyperscale/Composite vector index for relevant documents (optimized)\n", "3. Assembling context from retrieved documents\n", "4. Generating intelligent response with JinaChat\n", "\n", @@ -1398,7 +1390,7 @@ } ], "source": [ - "print(\"Testing RAG System with GSI-Optimized Vector Search\")\n", + "print(\"Testing RAG System with BHIVE Hyperscale/Composite-Optimized Vector Search\")\n", "print(\"=\" * 60)\n", "\n", "try:\n", @@ -1407,7 +1399,7 @@ " print(f\"User Query: {sample_query}\")\n", " print(\"\\nProcessing with optimized pipeline...\")\n", " print(\"1. Converting query to vector embedding with Jina AI\")\n", - " print(\"2. Searching GSI BHIVE index for relevant documents (optimized)\")\n", + " print(\"2. 
Searching BHIVE Hyperscale/Composite vector index for relevant documents (optimized)\")\n", " print(\"3. Assembling context from retrieved documents\")\n", " print(\"4. Generating intelligent response with JinaChat\")\n", " \n", @@ -1447,7 +1439,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "id": "fd59a9cc", "metadata": {}, "outputs": [ @@ -1474,7 +1466,7 @@ "- Explored the power station and visited stalls at the Curated Makers Market.\n", "\n", "✅ RAG demo completed successfully!\n", - "✅ The system leverages GSI BHIVE optimization for fast document retrieval!\n", + "✅ The system leverages BHIVE Hyperscale/Composite vector index optimization for fast document retrieval!\n", "✅ Jina AI provides high-quality embeddings and intelligent response generation!\n" ] } @@ -1506,7 +1498,7 @@ " print(f\"Error: {str(e)}\")\n", "\n", "print(f\"\\n✅ RAG demo completed successfully!\")\n", - "print(\"✅ The system leverages GSI BHIVE optimization for fast document retrieval!\")\n", + "print(\"✅ The system leverages BHIVE Hyperscale/Composite vector index optimization for fast document retrieval!\")\n", "print(\"✅ Jina AI provides high-quality embeddings and intelligent response generation!\")" ] }, @@ -1526,7 +1518,7 @@ }, "source": [ "You've successfully built a high-performance semantic search engine combining:\n", - "- **Couchbase GSI BHIVE indexes** for optimized vector search\n", + "- **Couchbase BHIVE Hyperscale/Composite indexes** for optimized vector search\n", "- **Jina AI embeddings and language models** for intelligent processing\n", "- **Complete RAG pipeline** with caching optimization" ] diff --git a/jinaai/query_based/frontmatter.md b/jinaai/query_based/frontmatter.md new file mode 100644 index 00000000..21f67dd8 --- /dev/null +++ b/jinaai/query_based/frontmatter.md @@ -0,0 +1,24 @@ +--- +# frontmatter +path: "/tutorial-jina-couchbase-rag-with-hyperscale-or-composite-vector-index" +title: Retrieval-Augmented Generation (RAG) 
with Jina AI using Couchbase Hyperscale and Composite Vector Index +short_title: RAG with Couchbase and Jina AI +description: + - Learn how to build a semantic search engine using Couchbase and Jina. + - This tutorial demonstrates how to integrate Couchbase's vector search capabilities with Jina embeddings and language models. + - You'll understand how to perform Retrieval-Augmented Generation (RAG) using LangChain, Couchbase Hyperscale and Composite Vector Index. +content_type: tutorial +filter: sdk +technology: + - vector search +tags: + - Hyperscale Vector Index + - Composite Vector Index + - Artificial Intelligence + - LangChain + - Jina AI +sdk_language: + - python +length: 60 Mins +alt_paths: ["/tutorial-jina-couchbase-rag-with-hyperscale-vector-index", "/tutorial-jina-couchbase-rag-with-composite-vector-index"] +--- diff --git a/jinaai/fts/.env.sample b/jinaai/search_based/.env.sample similarity index 100% rename from jinaai/fts/.env.sample rename to jinaai/search_based/.env.sample diff --git a/jinaai/fts/RAG_with_Couchbase_and_Jina_AI.ipynb b/jinaai/search_based/RAG_with_Couchbase_and_Jina_AI.ipynb similarity index 98% rename from jinaai/fts/RAG_with_Couchbase_and_Jina_AI.ipynb rename to jinaai/search_based/RAG_with_Couchbase_and_Jina_AI.ipynb index 434db0a8..6ac1ed74 100644 --- a/jinaai/fts/RAG_with_Couchbase_and_Jina_AI.ipynb +++ b/jinaai/search_based/RAG_with_Couchbase_and_Jina_AI.ipynb @@ -7,7 +7,7 @@ }, "source": [ "# Introduction\n", - "In this guide, we will walk you through building a powerful semantic search engine using Couchbase as the backend database and [Jina](https://jina.ai/) as the AI-powered embedding and language model provider, utilizing Full-Text Search (FTS). Semantic search goes beyond simple keyword matching by understanding the context and meaning behind the words in a query, making it an essential tool for applications that require intelligent information retrieval. 
This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using the GSI index, please take a look at [this.](https://developer.couchbase.com/tutorial-jina-couchbase-rag-with-global-secondary-index)" + "In this guide, we will walk you through building a powerful semantic search engine using Couchbase as the backend database and [Jina](https://jina.ai/) as the AI-powered embedding and language model provider, utilizing Full-Text Search using search vector index. Semantic search goes beyond simple keyword matching by understanding the context and meaning behind the words in a query, making it an essential tool for applications that require intelligent information retrieval. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using the GSI index, please take a look at [this.](https://developer.couchbase.com/tutorial-jina-couchbase-rag-with-global-secondary-index)" ] }, { @@ -522,7 +522,7 @@ }, "source": [ "# Setting Up the Couchbase Vector Store\n", - "A vector store is where we'll keep our embeddings. Unlike the FTS index, which is used for text-based search, the vector store is specifically designed to handle embeddings and perform similarity searches. When a user inputs a query, the search engine converts the query into an embedding and compares it against the embeddings stored in the vector store. This allows the engine to find documents that are semantically similar to the query, even if they don't contain the exact same words. 
By setting up the vector store in Couchbase, we create a powerful tool that enables our search engine to understand and retrieve information based on the meaning and context of the query, rather than just the specific words used." + "A vector store is where we'll keep our embeddings. Unlike the search vector index, which is used for text-based search, the vector store is specifically designed to handle embeddings and perform similarity searches. When a user inputs a query, the search engine converts the query into an embedding and compares it against the embeddings stored in the vector store. This allows the engine to find documents that are semantically similar to the query, even if they don't contain the exact same words. By setting up the vector store in Couchbase, we create a powerful tool that enables our search engine to understand and retrieve information based on the meaning and context of the query, rather than just the specific words used." ] }, { diff --git a/jinaai/fts/frontmatter.md b/jinaai/search_based/frontmatter.md similarity index 88% rename from jinaai/fts/frontmatter.md rename to jinaai/search_based/frontmatter.md index 5c28525d..fb6feba3 100644 --- a/jinaai/fts/frontmatter.md +++ b/jinaai/search_based/frontmatter.md @@ -1,6 +1,6 @@ --- # frontmatter -path: "/tutorial-jina-couchbase-rag-with-fts" +path: "/tutorial-jina-couchbase-rag-with-search-vector-index" title: Retrieval-Augmented Generation (RAG) with Couchbase and Jina AI using FTS short_title: RAG with Couchbase and Jina description: @@ -12,7 +12,7 @@ filter: sdk technology: - vector search tags: - - FTS + - Search Vector Index - Artificial Intelligence - LangChain - Jina AI diff --git a/jinaai/fts/jina_index.json b/jinaai/search_based/jina_index.json similarity index 100% rename from jinaai/fts/jina_index.json rename to jinaai/search_based/jina_index.json From fa5417fde11566b462a26f5bbee53b6f2aee729c Mon Sep 17 00:00:00 2001 From: giriraj-singh-couchbase Date: Mon, 1 Dec 2025 15:51:47 
+0530 Subject: [PATCH 2/5] removed BHIVE term --- .../RAG_with_Couchbase_and_Jina_AI.ipynb | 62 +++++++++---------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/jinaai/query_based/RAG_with_Couchbase_and_Jina_AI.ipynb b/jinaai/query_based/RAG_with_Couchbase_and_Jina_AI.ipynb index 3b512336..fee95f15 100644 --- a/jinaai/query_based/RAG_with_Couchbase_and_Jina_AI.ipynb +++ b/jinaai/query_based/RAG_with_Couchbase_and_Jina_AI.ipynb @@ -713,7 +713,7 @@ "Now let's demonstrate the performance benefits of Hyperscale/Composite Vector Index by testing pure vector search performance. We'll compare three optimization levels:\n", "\n", "1. **Baseline Performance**: Vector search without Hyperscale/Composite Vector Index optimization\n", - "2. **Hyperscale/Composite Vector Index-Optimized Performance**: Same search with BHIVE Hyperscale/Composite Vector Index\n", + "2. **Hyperscale/Composite Vector Index-Optimized Performance**: Same search with Hyperscale/Composite Vector Index\n", "3. **Cache Benefits**: Show how caching can be applied on top of Hyperscale/Composite Vector Index for repeated queries" ] }, @@ -732,7 +732,7 @@ "source": [ "Before we start testing, let's understand the index types available:\n", "\n", - "**Hyperscale Vector Indexes (BHIVE):**\n", + "**Hyperscale Vector Indexes:**\n", "- **Best for**: Pure vector searches - content discovery, recommendations, semantic search\n", "- **Performance**: High performance with low memory footprint, designed to scale to billions of vectors\n", "- **Optimization**: Optimized for concurrent operations, supports simultaneous searches and inserts\n", @@ -751,7 +751,7 @@ "- Use Composite Vector Index when scalar filters significantly reduce your search space\n", "- Consider your dataset size: Hyperscale scales to billions, Composite works well for tens of millions to billions\n", "\n", - "For this tutorial, we'll use **BHIVE** as it's optimized for pure semantic search scenarios." 
+ "For this tutorial, we'll use **Hyperscale Vector Indexes** as it's optimized for pure semantic search scenarios." ] }, { @@ -804,7 +804,7 @@ "\n", "**Distance Interpretation**: In vector search using Hyperscale and Composite vector indexes, lower distance values indicate higher similarity, while higher distance values indicate lower similarity.\n", "\n", - "**Scalability**: BHIVE indexes can scale to billions of vectors with optimized concurrent operations, making them suitable for large-scale production deployments.\n", + "**Scalability**: Hyperscale/Composite vector indexes can scale to billions of vectors with optimized concurrent operations, making them suitable for large-scale production deployments.\n", "\n", "For detailed configuration options, see the [Quantization & Centroid Settings](https://docs.couchbase.com/cloud/vector-index/hyperscale-vector-index.html#algo_settings).\n", "\n", @@ -910,7 +910,7 @@ "id": "c0dc252f", "metadata": {}, "source": [ - "### Create BHIVE Hyperscale/Composite Vector Index" + "### Create Hyperscale/Composite Vector Index" ] }, { @@ -918,7 +918,7 @@ "id": "926f3cb6", "metadata": {}, "source": [ - "Now let's create a BHIVE Hyperscale/Composite vector index to enable high-performance vector searches. The index creation is done programmatically through the vector store." + "Now let's create a Hyperscale/Composite vector index to enable high-performance vector searches. The index creation is done programmatically through the vector store."
] }, { @@ -931,21 +931,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "Creating BHIVE Hyperscale/Composite vector index...\n", - "BHIVE Hyperscale/Composite vector index created successfully\n", + "Creating Hyperscale/Composite vector index...\n", + "Hyperscale/Composite vector index created successfully\n", "Waiting for index to become available...\n" ] } ], "source": [ - "# Create BHIVE Vector Index for high-performance searches\n", - "print(\"Creating BHIVE Hyperscale/Composite vector index...\")\n", + "# Create Hyperscale Vector Index for high-performance searches\n", + "print(\"Creating Hyperscale/Composite vector index...\")\n", "try:\n", " vector_store.create_index(\n", - " index_type=IndexType.BHIVE, # Use IndexType.COMPOSITE for Composite index\n", + " index_type=IndexType.HYPERSCALE, # Use IndexType.COMPOSITE for Composite index\n", " index_description=\"IVF,SQ8\"\n", " )\n", - " print(\"BHIVE Hyperscale/Composite vector index created successfully\")\n", + " print(\"Hyperscale/Composite vector index created successfully\")\n", " \n", " # Wait for index to become available\n", " print(\"Waiting for index to become available...\")\n", @@ -953,9 +953,9 @@ " \n", "except Exception as e:\n", " if \"already exists\" in str(e).lower():\n", - " print(\"BHIVE Hyperscale/Composite vector index already exists, proceeding...\")\n", + " print(\"Hyperscale/Composite vector index already exists, proceeding...\")\n", " else:\n", - " print(f\"Error creating BHIVE Hyperscale/Composite vector index: {str(e)}\")" + " print(f\"Error creating Hyperscale/Composite vector index: {str(e)}\")" ] }, { @@ -976,7 +976,7 @@ "```python\n", "# Alternative: Create a Composite index for filtered searches\n", "vector_store.create_index(\n", - " index_type=IndexType.COMPOSITE, # Instead of IndexType.BHIVE\n", + " index_type=IndexType.COMPOSITE, # Instead of IndexType.HYPERSCALE\n", " index_description=\"IVF,SQ8\" # Same quantization settings\n", ")\n", "```" @@ -1156,10 
+1156,10 @@ "Cache Benefit: 1.85x faster (45.8% improvement)\n", "\n", "Key Insights for Vector Search Performance:\n", - "• Hyperscale BHIVE indexes provide significant performance improvements for vector similarity search\n", + "• Hyperscale indexes provide significant performance improvements for vector similarity search\n", "• Performance gains are most dramatic for complex semantic queries\n", - "• BHIVE optimization is particularly effective for high-dimensional embeddings\n", - "• Combined with proper quantization (SQ8), BHIVE Hyperscale delivers production-ready performance\n", + "• Hyperscale vector index optimization is particularly effective for high-dimensional embeddings\n", + "• Combined with proper quantization (SQ8), Hyperscale delivers production-ready performance\n", "• These performance improvements directly benefit any application using the vector store\n" ] } @@ -1196,9 +1196,9 @@ " print(f\"Cache Benefit: Variable (depends on query complexity and caching mechanism)\")\n", "\n", "print(f\"\\nKey Insights for Vector Search Performance:\")\n", - "print(f\"• Hyperscale BHIVE indexes provide significant performance improvements for vector similarity search\")\n", + "print(f\"• Hyperscale indexes provide significant performance improvements for vector similarity search\")\n", "print(f\"• Performance gains are most dramatic for complex semantic queries\")\n", - "print(f\"• BHIVE optimization is particularly effective for high-dimensional embeddings\")\n", + "print(f\"• Hyperscale vector index optimization is particularly effective for high-dimensional embeddings\")\n", "print(f\"• Combined with proper quantization (SQ8), Hyperscale delivers production-ready performance\")\n", "print(f\"• These performance improvements directly benefit any application using the vector store\")" ] @@ -1227,7 +1227,7 @@ "Now that we've optimized our vector search performance, let's demonstrate how to build a complete RAG system using Jina AI. 
RAG combines the power of our Hyperscale/Composite index optimized semantic search with language model generation:\n", "\n", "1. **Query Processing**: User question is converted to vector embedding using Jina AI\n", - "2. **Document Retrieval**: BHIVE Hyperscale/Composite index finds most relevant documents (now with proven performance improvements)\n", + "2. **Document Retrieval**: Hyperscale/Composite index finds most relevant documents (now with proven performance improvements)\n", "3. **Context Assembly**: Retrieved documents provide factual context for the language model\n", "4. **Response Generation**: Jina's language model generates intelligent answers grounded in the retrieved data\n", "\n", @@ -1252,7 +1252,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "id": "a9477c3c", "metadata": {}, "outputs": [ @@ -1311,7 +1311,7 @@ "output_type": "stream", "text": [ "Optimized RAG pipeline created successfully\n", - "Components: BHIVE Hyperscale/Composite Vector Search → Context Assembly → Jina Language Model → Response\n" + "Components: Hyperscale/Composite Vector Search → Context Assembly → Jina Language Model → Response\n" ] } ], @@ -1342,7 +1342,7 @@ " | StrOutputParser()\n", " )\n", " print(\"Optimized RAG pipeline created successfully\")\n", - " print(\"Components: BHIVE Hyperscale/Composite Vector Search → Context Assembly → Jina Language Model → Response\")\n", + " print(\"Components: Hyperscale/Composite Vector Search → Context Assembly → Jina Language Model → Response\")\n", "except Exception as e:\n", " raise ValueError(f\"Error creating RAG pipeline: {str(e)}\")" ] @@ -1373,13 +1373,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Testing RAG System with BHIVE Hyperscale/Composite-Optimized Vector Search\n", + "Testing RAG System with Hyperscale/Composite-Optimized Vector Search\n", "============================================================\n", "User Query: What are the new eligibility rules for 
transgender women competing in leading women's golf tours, and what prompted these changes?\n", "\n", "Processing with optimized pipeline...\n", "1. Converting query to vector embedding with Jina AI\n", - "2. Searching BHIVE Hyperscale/Composite vector index for relevant documents (optimized)\n", + "2. Searching Hyperscale/Composite vector index for relevant documents (optimized)\n", "3. Assembling context from retrieved documents\n", "4. Generating intelligent response with JinaChat\n", "\n", @@ -1390,7 +1390,7 @@ } ], "source": [ - "print(\"Testing RAG System with BHIVE Hyperscale/Composite-Optimized Vector Search\")\n", + "print(\"Testing RAG System with Hyperscale/Composite-Optimized Vector Search\")\n", "print(\"=\" * 60)\n", "\n", "try:\n", @@ -1399,7 +1399,7 @@ " print(f\"User Query: {sample_query}\")\n", " print(\"\\nProcessing with optimized pipeline...\")\n", " print(\"1. Converting query to vector embedding with Jina AI\")\n", - " print(\"2. Searching BHIVE Hyperscale/Composite vector index for relevant documents (optimized)\")\n", + " print(\"2. Searching Hyperscale/Composite vector index for relevant documents (optimized)\")\n", " print(\"3. Assembling context from retrieved documents\")\n", " print(\"4. 
Generating intelligent response with JinaChat\")\n", " \n", @@ -1466,7 +1466,7 @@ "- Explored the power station and visited stalls at the Curated Makers Market.\n", "\n", "✅ RAG demo completed successfully!\n", - "✅ The system leverages BHIVE Hyperscale/Composite vector index optimization for fast document retrieval!\n", + "✅ The system leverages Hyperscale/Composite vector index optimization for fast document retrieval!\n", "✅ Jina AI provides high-quality embeddings and intelligent response generation!\n" ] } @@ -1498,7 +1498,7 @@ " print(f\"Error: {str(e)}\")\n", "\n", "print(f\"\\n✅ RAG demo completed successfully!\")\n", - "print(\"✅ The system leverages BHIVE Hyperscale/Composite vector index optimization for fast document retrieval!\")\n", + "print(\"✅ The system leverages Hyperscale/Composite vector index optimization for fast document retrieval!\")\n", "print(\"✅ Jina AI provides high-quality embeddings and intelligent response generation!\")" ] }, @@ -1518,7 +1518,7 @@ }, "source": [ "You've successfully built a high-performance semantic search engine combining:\n", - "- **Couchbase BHIVE Hyperscale/Composite indexes** for optimized vector search\n", + "- **Couchbase Hyperscale/Composite indexes** for optimized vector search\n", "- **Jina AI embeddings and language models** for intelligent processing\n", "- **Complete RAG pipeline** with caching optimization" ] From 11b897f73a567d0141f689648aafb67230331220 Mon Sep 17 00:00:00 2001 From: Giriraj Singh Date: Mon, 1 Dec 2025 16:05:06 +0530 Subject: [PATCH 3/5] Updated GSI to hyperscale or composite indexes in search_based notebook Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- jinaai/search_based/RAG_with_Couchbase_and_Jina_AI.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jinaai/search_based/RAG_with_Couchbase_and_Jina_AI.ipynb b/jinaai/search_based/RAG_with_Couchbase_and_Jina_AI.ipynb index 6ac1ed74..f89a88be 100644 --- 
a/jinaai/search_based/RAG_with_Couchbase_and_Jina_AI.ipynb +++ b/jinaai/search_based/RAG_with_Couchbase_and_Jina_AI.ipynb @@ -7,7 +7,7 @@ }, "source": [ "# Introduction\n", - "In this guide, we will walk you through building a powerful semantic search engine using Couchbase as the backend database and [Jina](https://jina.ai/) as the AI-powered embedding and language model provider, utilizing Full-Text Search using search vector index. Semantic search goes beyond simple keyword matching by understanding the context and meaning behind the words in a query, making it an essential tool for applications that require intelligent information retrieval. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using the GSI index, please take a look at [this.](https://developer.couchbase.com/tutorial-jina-couchbase-rag-with-global-secondary-index)" + "In this guide, we will walk you through building a powerful semantic search engine using Couchbase as the backend database and [Jina](https://jina.ai/) as the AI-powered embedding and language model provider, utilizing Full-Text Search using search vector index. Semantic search goes beyond simple keyword matching by understanding the context and meaning behind the words in a query, making it an essential tool for applications that require intelligent information retrieval. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. 
Alternatively if you want to perform semantic search using Hyperscale or Composite indexes, please take a look at [this.](https://developer.couchbase.com/tutorial-jina-couchbase-rag-with-hyperscale-or-composite-vector-index)" ] }, { From 9162f927ea176bc3f0df1cf746ec0618be20ee2f Mon Sep 17 00:00:00 2001 From: giriraj-singh-couchbase Date: Tue, 2 Dec 2025 14:49:47 +0530 Subject: [PATCH 4/5] fixed old link, changed vector search index to search vector index --- .../RAG_with_Couchbase_and_Jina_AI.ipynb | 2158 ++++++++--------- 1 file changed, 1075 insertions(+), 1083 deletions(-) diff --git a/jinaai/search_based/RAG_with_Couchbase_and_Jina_AI.ipynb b/jinaai/search_based/RAG_with_Couchbase_and_Jina_AI.ipynb index f89a88be..e9f5e8d9 100644 --- a/jinaai/search_based/RAG_with_Couchbase_and_Jina_AI.ipynb +++ b/jinaai/search_based/RAG_with_Couchbase_and_Jina_AI.ipynb @@ -1,1110 +1,1102 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "kNdImxzypDlm" - }, - "source": [ - "# Introduction\n", - "In this guide, we will walk you through building a powerful semantic search engine using Couchbase as the backend database and [Jina](https://jina.ai/) as the AI-powered embedding and language model provider, utilizing Full-Text Search using search vector index. Semantic search goes beyond simple keyword matching by understanding the context and meaning behind the words in a query, making it an essential tool for applications that require intelligent information retrieval. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. 
Alternatively if you want to perform semantic search using Hyperscale or Composite indexes, please take a look at [this.](https://developer.couchbase.com/tutorial-jina-couchbase-rag-with-hyperscale-or-composite-vector-index)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# How to run this tutorial\n", - "\n", - "This tutorial is available as a Jupyter Notebook (`.ipynb` file) that you can run interactively. You can access the original notebook [here](https://github.com/couchbase-examples/vector-search-cookbook/blob/main/jinaai/fts/RAG_with_Couchbase_and_Jina_AI.ipynb).\n", - "\n", - "You can either download the notebook file and run it on [Google Colab](https://colab.research.google.com/) or run it on your system by setting up the Python environment." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Before you start\n", - "\n", - "## Get Credentials for Jina AI\n", - "\n", - "* Please follow the [instructions](https://jina.ai/) to generate the Jina AI credentials.\n", - "* Please follow the [instructions](https://chat.jina.ai/api) to generate the JinaChat credentials.\n", - "\n", - "## Create and Deploy Your Free Tier Operational cluster on Capella\n", - "\n", - "To get started with Couchbase Capella, create an account and use it to deploy a forever free tier operational cluster. 
This account provides you with a environment where you can explore and learn about Capella with no time constraint.\n", - "\n", - "To learn more, please follow the [instructions](https://docs.couchbase.com/cloud/get-started/create-account.html).\n", - "\n", - "### Couchbase Capella Configuration\n", - "\n", - "When running Couchbase using [Capella](https://cloud.couchbase.com/sign-in), the following prerequisites need to be met.\n", - "\n", - "* Create the [database credentials](https://docs.couchbase.com/cloud/clusters/manage-database-users.html) to access the required bucket (Read and Write) used in the application.\n", - "* [Allow access](https://docs.couchbase.com/cloud/clusters/allow-ip-address.html) to the Cluster from the IP on which the application is running." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NH2o6pqa69oG" - }, - "source": [ - "# Setting the Stage: Installing Necessary Libraries\n", - "To build our semantic search engine, we need a robust set of tools. The libraries we install handle everything from connecting to databases to performing complex machine learning tasks. Each library has a specific role: Couchbase libraries manage database operations, LangChain handles AI model integrations, and Jina provides advanced AI models for generating embeddings and understanding natural language. By setting up these libraries, we ensure our environment is equipped to handle the data-intensive and computationally complex tasks required for semantic search." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "DYhPj0Ta8l_A" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "# Jina doesnt support openai other than 0.27\n", - "%pip install --quiet datasets==3.6.0 langchain-couchbase==0.3.0 langchain-community==0.3.24 openai==0.27 python-dotenv==1.1.0 ipywidgets" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1pp7GtNg8mB9" - }, - "source": [ - "# Importing Necessary Libraries\n", - "The script starts by importing a series of libraries required for various tasks, including handling JSON, logging, time tracking, Couchbase connections, embedding generation, and dataset loading. These libraries provide essential functions for working with data, managing database connections, and processing machine learning models." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "8GzS6tfL8mFP" - }, - "outputs": [], - "source": [ - "import getpass\n", - "import json\n", - "import logging\n", - "import os\n", - "import time\n", - "from datetime import timedelta\n", - "\n", - "from couchbase.auth import PasswordAuthenticator\n", - "from couchbase.cluster import Cluster\n", - "from couchbase.exceptions import (CouchbaseException,\n", - " InternalServerFailureException,\n", - " QueryIndexAlreadyExistsException,\n", - " ServiceUnavailableException)\n", - "from couchbase.management.buckets import CreateBucketSettings\n", - "from couchbase.management.search import SearchIndex\n", - "from couchbase.options import ClusterOptions\n", - "from datasets import load_dataset\n", - "from dotenv import load_dotenv\n", - "from langchain_community.chat_models import JinaChat\n", - "from langchain_community.embeddings import JinaEmbeddings\n", - "from langchain_core.globals import set_llm_cache\n", - "from langchain_core.output_parsers import 
StrOutputParser\n", - "from langchain_core.prompts import ChatPromptTemplate\n", - "from langchain_core.prompts.chat import ChatPromptTemplate\n", - "from langchain_core.runnables import RunnablePassthrough\n", - "from langchain_couchbase.cache import CouchbaseCache\n", - "from langchain_couchbase.vectorstores import CouchbaseSearchVectorStore" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pBnMp5vb8mIb" - }, - "source": [ - "# Setup Logging\n", - "Logging is configured to track the progress of the script and capture any errors or warnings. This is crucial for debugging and understanding the flow of execution. The logging output includes timestamps, log levels (e.g., INFO, ERROR), and messages that describe what is happening in the script.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "Yv8kWcuf8mLx" - }, - "outputs": [], - "source": [ - "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s',force=True)\n", - "\n", - "# Suppress all logs from specific loggers\n", - "logging.getLogger('openai').setLevel(logging.WARNING)\n", - "logging.getLogger('httpx').setLevel(logging.WARNING)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "K9G5a0en8mPA" - }, - "source": [ - "# Loading Sensitive Informnation\n", - "In this section, we prompt the user to input essential configuration settings needed for integrating Couchbase with Cohere's API. These settings include sensitive information like API keys, database credentials, and specific configuration names. Instead of hardcoding these details into the script, we request the user to provide them at runtime, ensuring flexibility and security.\n", - "\n", - "The script also validates that all required inputs are provided, raising an error if any crucial information is missing. 
This approach ensures that your integration is both secure and correctly configured without hardcoding sensitive information, enhancing the overall security and maintainability of your code." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "PFGyHll18mSe" - }, - "outputs": [], - "source": [ - "load_dotenv(\"./.env\") \n", - "\n", - "JINA_API_KEY = os.getenv(\"JINA_API_KEY\")\n", - "JINACHAT_API_KEY = os.getenv(\"JINACHAT_API_KEY\")\n", - "\n", - "CB_HOST = os.getenv(\"CB_HOST\") or 'couchbase://localhost'\n", - "CB_USERNAME = os.getenv(\"CB_USERNAME\") or 'Administrator'\n", - "CB_PASSWORD = os.getenv(\"CB_PASSWORD\") or 'password'\n", - "CB_BUCKET_NAME = os.getenv(\"CB_BUCKET_NAME\") or 'vector-search-testing'\n", - "INDEX_NAME = os.getenv(\"INDEX_NAME\") or 'vector_search_jina'\n", - "\n", - "SCOPE_NAME = os.getenv(\"SCOPE_NAME\") or 'shared'\n", - "COLLECTION_NAME = os.getenv(\"COLLECTION_NAME\") or 'jina'\n", - "CACHE_COLLECTION = os.getenv(\"CACHE_COLLECTION\") or 'cache'\n", - "\n", - "# Check if the variables are correctly loaded\n", - "if not JINA_API_KEY:\n", - " raise ValueError(\"JINA_API_KEY environment variable is not set\")\n", - "if not JINACHAT_API_KEY:\n", - " raise ValueError(\"JINACHAT_API_KEY environment variable is not set\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qtGrYzUY8mV3" - }, - "source": [ - "# Connecting to the Couchbase Cluster\n", - "Connecting to a Couchbase cluster is the foundation of our project. Couchbase will serve as our primary data store, handling all the storage and retrieval operations required for our semantic search engine. By establishing this connection, we enable our application to interact with the database, allowing us to perform operations such as storing embeddings, querying data, and managing collections. 
This connection is the gateway through which all data will flow, so ensuring it's set up correctly is paramount.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "Zb3kK-7W8mZK" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-23 10:45:51,014 - INFO - Successfully connected to Couchbase\n" - ] - } - ], - "source": [ - "try:\n", - " auth = PasswordAuthenticator(CB_USERNAME, CB_PASSWORD)\n", - " options = ClusterOptions(auth)\n", - " cluster = Cluster(CB_HOST, options)\n", - " cluster.wait_until_ready(timedelta(seconds=5))\n", - " logging.info(\"Successfully connected to Couchbase\")\n", - "except Exception as e:\n", - " raise ConnectionError(f\"Failed to connect to Couchbase: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "C_Gpy32N8mcZ" - }, - "source": [ - "## Setting Up Collections in Couchbase\n", - "\n", - "The setup_collection() function handles creating and configuring the hierarchical data organization in Couchbase:\n", - "\n", - "1. Bucket Creation:\n", - " - Checks if specified bucket exists, creates it if not\n", - " - Sets bucket properties like RAM quota (1024MB) and replication (disabled)\n", - " - Note: You will not be able to create a bucket on Capella\n", - "\n", - "2. Scope Management: \n", - " - Verifies if requested scope exists within bucket\n", - " - Creates new scope if needed (unless it's the default \"_default\" scope)\n", - "\n", - "3. Collection Setup:\n", - " - Checks for collection existence within scope\n", - " - Creates collection if it doesn't exist\n", - " - Waits 2 seconds for collection to be ready\n", - "\n", - "Additional Tasks:\n", - "- Creates primary index on collection for query performance\n", - "- Clears any existing documents for clean state\n", - "- Implements comprehensive error handling and logging\n", - "\n", - "The function is called twice to set up:\n", - "1. 
Main collection for vector embeddings\n", - "2. Cache collection for storing results\n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "ACZcwUnG8mf2" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-23 10:45:56,608 - INFO - Bucket 'vector-search-testing' exists.\n", - "2025-09-23 10:45:59,312 - INFO - Collection 'jina' already exists. Skipping creation.\n", - "2025-09-23 10:46:02,683 - INFO - Primary index present or created successfully.\n", - "2025-09-23 10:46:03,447 - INFO - All documents cleared from the collection.\n", - "2025-09-23 10:46:03,449 - INFO - Bucket 'vector-search-testing' exists.\n", - "2025-09-23 10:46:06,152 - INFO - Collection 'jina_cache' already exists. Skipping creation.\n", - "2025-09-23 10:46:09,482 - INFO - Primary index present or created successfully.\n", - "2025-09-23 10:46:09,804 - INFO - All documents cleared from the collection.\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def setup_collection(cluster, bucket_name, scope_name, collection_name):\n", - " try:\n", - " # Check if bucket exists, create if it doesn't\n", - " try:\n", - " bucket = cluster.bucket(bucket_name)\n", - " logging.info(f\"Bucket '{bucket_name}' exists.\")\n", - " except Exception as e:\n", - " logging.info(f\"Bucket '{bucket_name}' does not exist. 
Creating it...\")\n", - " bucket_settings = CreateBucketSettings(\n", - " name=bucket_name,\n", - " bucket_type='couchbase',\n", - " ram_quota_mb=1024,\n", - " flush_enabled=True,\n", - " num_replicas=0\n", - " )\n", - " cluster.buckets().create_bucket(bucket_settings)\n", - " time.sleep(2) # Wait for bucket creation to complete and become available\n", - " bucket = cluster.bucket(bucket_name)\n", - " logging.info(f\"Bucket '{bucket_name}' created successfully.\")\n", - "\n", - " bucket_manager = bucket.collections()\n", - "\n", - " # Check if scope exists, create if it doesn't\n", - " scopes = bucket_manager.get_all_scopes()\n", - " scope_exists = any(scope.name == scope_name for scope in scopes)\n", - " \n", - " if not scope_exists and scope_name != \"_default\":\n", - " logging.info(f\"Scope '{scope_name}' does not exist. Creating it...\")\n", - " bucket_manager.create_scope(scope_name)\n", - " logging.info(f\"Scope '{scope_name}' created successfully.\")\n", - "\n", - " # Check if collection exists, create if it doesn't\n", - " collections = bucket_manager.get_all_scopes()\n", - " collection_exists = any(\n", - " scope.name == scope_name and collection_name in [col.name for col in scope.collections]\n", - " for scope in collections\n", - " )\n", - "\n", - " if not collection_exists:\n", - " logging.info(f\"Collection '{collection_name}' does not exist. Creating it...\")\n", - " bucket_manager.create_collection(scope_name, collection_name)\n", - " logging.info(f\"Collection '{collection_name}' created successfully.\")\n", - " else:\n", - " logging.info(f\"Collection '{collection_name}' already exists. 
Skipping creation.\")\n", - "\n", - " # Wait for collection to be ready\n", - " collection = bucket.scope(scope_name).collection(collection_name)\n", - " time.sleep(2) # Give the collection time to be ready for queries\n", - "\n", - " # Ensure primary index exists\n", - " try:\n", - " cluster.query(f\"CREATE PRIMARY INDEX IF NOT EXISTS ON `{bucket_name}`.`{scope_name}`.`{collection_name}`\").execute()\n", - " logging.info(\"Primary index present or created successfully.\")\n", - " except Exception as e:\n", - " logging.warning(f\"Error creating primary index: {str(e)}\")\n", - "\n", - " # Clear all documents in the collection\n", - " try:\n", - " query = f\"DELETE FROM `{bucket_name}`.`{scope_name}`.`{collection_name}`\"\n", - " cluster.query(query).execute()\n", - " logging.info(\"All documents cleared from the collection.\")\n", - " except Exception as e:\n", - " logging.warning(f\"Error while clearing documents: {str(e)}. The collection might be empty.\")\n", - "\n", - " return collection\n", - " except Exception as e:\n", - " raise RuntimeError(f\"Error setting up collection: {str(e)}\")\n", - " \n", - "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, COLLECTION_NAME)\n", - "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, CACHE_COLLECTION)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NMJ7RRYp8mjV" - }, - "source": [ - "# Loading Couchbase Vector Search Index\n", - "\n", - "Semantic search requires an efficient way to retrieve relevant documents based on a user's query. This is where the Couchbase **Vector Search Index** comes into play. In this step, we load the Vector Search Index definition from a JSON file, which specifies how the index should be structured. 
This includes the fields to be indexed, the dimensions of the vectors, and other parameters that determine how the search engine processes queries based on vector similarity.\n", - "\n", - "This Jina vector search index configuration requires specific default settings to function properly. This tutorial uses the bucket named `vector-search-testing` with the scope `shared` and collection `jina`. The configuration is set up for vectors with exactly `1024 dimensions`, using dot product similarity and optimized for recall. If you want to use a different bucket, scope, or collection, you will need to modify the index configuration accordingly.\n", - "\n", - "For more information on creating a vector search index, please follow the [instructions](https://docs.couchbase.com/cloud/vector-search/create-vector-search-index-ui.html).\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "y7xiCrOc8mmj" - }, - "outputs": [], - "source": [ - "# If you are running this script locally (not in Google Colab), uncomment the following line\n", - "# and provide the path to your index definition file.\n", - "\n", - "# index_definition_path = '/path_to_your_index_file/jina_index.json' # Local setup: specify your file path here\n", - "\n", - "# # Version for Google Colab\n", - "# def load_index_definition_colab():\n", - "# from google.colab import files\n", - "# print(\"Upload your index definition file\")\n", - "# uploaded = files.upload()\n", - "# index_definition_path = list(uploaded.keys())[0]\n", - "\n", - "# try:\n", - "# with open(index_definition_path, 'r') as file:\n", - "# index_definition = json.load(file)\n", - "# return index_definition\n", - "# except Exception as e:\n", - "# raise ValueError(f\"Error loading index definition from {index_definition_path}: {str(e)}\")\n", - "\n", - "# Version for Local Environment\n", - "def load_index_definition_local(index_definition_path):\n", - " try:\n", - " with open(index_definition_path, 'r') as 
file:\n", - " index_definition = json.load(file)\n", - " return index_definition\n", - " except Exception as e:\n", - " raise ValueError(f\"Error loading index definition from {index_definition_path}: {str(e)}\")\n", - "\n", - "# Usage\n", - "# Uncomment the appropriate line based on your environment\n", - "# index_definition = load_index_definition_colab()\n", - "index_definition = load_index_definition_local('jina_index.json')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "v_ddPQ_Y8mpm" - }, - "source": [ - "# Creating or Updating Search Indexes\n", - "\n", - "With the index definition loaded, the next step is to create or update the **Vector Search Index** in Couchbase. This step is crucial because it optimizes our database for vector similarity search operations, allowing us to perform searches based on the semantic content of documents rather than just keywords. By creating or updating a Vector Search Index, we enable our search engine to handle complex queries that involve finding semantically similar documents using vector embeddings, which is essential for a robust semantic search engine." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "bHEpUu1l8msx" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-23 10:47:03,763 - INFO - Index 'vector_search_jina' found\n", - "2025-09-23 10:47:04,742 - INFO - Index 'vector_search_jina' already exists. 
Skipping creation/update.\n" - ] - } - ], - "source": [ - "try:\n", - " scope_index_manager = cluster.bucket(CB_BUCKET_NAME).scope(SCOPE_NAME).search_indexes()\n", - "\n", - " # Check if index already exists\n", - " existing_indexes = scope_index_manager.get_all_indexes()\n", - " index_name = index_definition[\"name\"]\n", - "\n", - " if index_name in [index.name for index in existing_indexes]:\n", - " logging.info(f\"Index '{index_name}' found\")\n", - " else:\n", - " logging.info(f\"Creating new index '{index_name}'...\")\n", - "\n", - " # Create SearchIndex object from JSON definition\n", - " search_index = SearchIndex.from_json(index_definition)\n", - "\n", - " # Upsert the index (create if not exists, update if exists)\n", - " scope_index_manager.upsert_index(search_index)\n", - " logging.info(f\"Index '{index_name}' successfully created/updated.\")\n", - "\n", - "except QueryIndexAlreadyExistsException:\n", - " logging.info(f\"Index '{index_name}' already exists. Skipping creation/update.\")\n", - "except ServiceUnavailableException:\n", - " raise RuntimeError(\"Search service is not available. Please ensure the Search service is enabled in your Couchbase cluster.\")\n", - "except InternalServerFailureException as e:\n", - " logging.error(f\"Internal server error: {str(e)}\")\n", - " raise" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7FvxRsg38m3G" - }, - "source": [ - "# Creating Jina Embeddings\n", - "Embeddings are at the heart of semantic search. They are numerical representations of text that capture the semantic meaning of the words and phrases. Unlike traditional keyword-based search, which looks for exact matches, embeddings allow our search engine to understand the context and nuances of language, enabling it to retrieve documents that are semantically similar to the query, even if they don't contain the exact keywords. 
By creating embeddings using Jina, we equip our search engine with the ability to understand and process natural language in a way that's much closer to how humans understand language. This step transforms our raw text data into a format that the search engine can use to find and rank relevant documents.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "_75ZyCRh8m6m" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-23 10:47:06,326 - INFO - Successfully created JinaEmbeddings\n" - ] - } - ], - "source": [ - "try:\n", - " embeddings = JinaEmbeddings(\n", - " jina_api_key=JINA_API_KEY, model_name=\"jina-embeddings-v3\"\n", - " )\n", - " logging.info(\"Successfully created JinaEmbeddings\")\n", - "except Exception as e:\n", - " raise ValueError(f\"Error creating JinaEmbeddings: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8IwZMUnF8m-N" - }, - "source": [ - "# Setting Up the Couchbase Vector Store\n", - "A vector store is where we'll keep our embeddings. Unlike the search vector index, which is used for text-based search, the vector store is specifically designed to handle embeddings and perform similarity searches. When a user inputs a query, the search engine converts the query into an embedding and compares it against the embeddings stored in the vector store. This allows the engine to find documents that are semantically similar to the query, even if they don't contain the exact same words. By setting up the vector store in Couchbase, we create a powerful tool that enables our search engine to understand and retrieve information based on the meaning and context of the query, rather than just the specific words used." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "DwIJQjYT9RV_" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-23 10:47:12,343 - INFO - Successfully created vector store\n" - ] - } - ], - "source": [ - "try:\n", - " vector_store = CouchbaseSearchVectorStore(\n", - " cluster=cluster,\n", - " bucket_name=CB_BUCKET_NAME,\n", - " scope_name=SCOPE_NAME,\n", - " collection_name=COLLECTION_NAME,\n", - " embedding=embeddings,\n", - " index_name=INDEX_NAME,\n", - " )\n", - " logging.info(\"Successfully created vector store\")\n", - "except Exception as e:\n", - " raise ValueError(f\"Failed to create vector store: {str(e)}\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Load the BBC News Dataset\n", - "To build a search engine, we need data to search through. We use the BBC News dataset from RealTimeData, which provides real-world news articles. This dataset contains news articles from BBC covering various topics and time periods. Loading the dataset is a crucial step because it provides the raw material that our search engine will work with. The quality and diversity of the news articles make it an excellent choice for testing and refining our search engine, ensuring it can handle real-world news content effectively.\n", - "\n", - "The BBC News dataset allows us to work with authentic news articles, enabling us to build and test a search engine that can effectively process and retrieve relevant news content. The dataset is loaded using the Hugging Face datasets library, specifically accessing the \"RealTimeData/bbc_news_alltime\" dataset with the \"2024-12\" version." 
- ] - }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "kNdImxzypDlm" + }, + "source": [ + "# Introduction\n", + "In this guide, we will walk you through building a powerful semantic search engine using Couchbase as the backend database and [Jina](https://jina.ai/) as the AI-powered embedding and language model provider, utilizing Full-Text Search using search vector index. Semantic search goes beyond simple keyword matching by understanding the context and meaning behind the words in a query, making it an essential tool for applications that require intelligent information retrieval. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using Hyperscale or Composite indexes, please take a look at [this.](https://developer.couchbase.com/tutorial-jina-couchbase-rag-with-hyperscale-or-composite-vector-index)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# How to run this tutorial\n", + "\n", + "This tutorial is available as a Jupyter Notebook (`.ipynb` file) that you can run interactively. You can access the original notebook [here](https://github.com/couchbase-examples/vector-search-cookbook/blob/main/jinaai/search_based/RAG_with_Couchbase_and_Jina_AI.ipynb).\n", + "\n", + "You can either download the notebook file and run it on [Google Colab](https://colab.research.google.com/) or run it on your system by setting up the Python environment." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Before you start\n", + "\n", + "## Get Credentials for Jina AI\n", + "\n", + "* Please follow the [instructions](https://jina.ai/) to generate the Jina AI credentials.\n", + "* Please follow the [instructions](https://chat.jina.ai/api) to generate the JinaChat credentials.\n", + "\n", + "## Create and Deploy Your Free Tier Operational cluster on Capella\n", + "\n", + "To get started with Couchbase Capella, create an account and use it to deploy a forever free tier operational cluster. This account provides you with an environment where you can explore and learn about Capella with no time constraint.\n", + "\n", + "To learn more, please follow the [instructions](https://docs.couchbase.com/cloud/get-started/create-account.html).\n", + "\n", + "### Couchbase Capella Configuration\n", + "\n", + "When running Couchbase using [Capella](https://cloud.couchbase.com/sign-in), the following prerequisites need to be met.\n", + "\n", + "* Create the [database credentials](https://docs.couchbase.com/cloud/clusters/manage-database-users.html) to access the required bucket (Read and Write) used in the application.\n", + "* [Allow access](https://docs.couchbase.com/cloud/clusters/allow-ip-address.html) to the Cluster from the IP on which the application is running." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NH2o6pqa69oG" + }, + "source": [ + "# Setting the Stage: Installing Necessary Libraries\n", + "To build our semantic search engine, we need a robust set of tools. The libraries we install handle everything from connecting to databases to performing complex machine learning tasks. Each library has a specific role: Couchbase libraries manage database operations, LangChain handles AI model integrations, and Jina provides advanced AI models for generating embeddings and understanding natural language. 
By setting up these libraries, we ensure our environment is equipped to handle the data-intensive and computationally complex tasks required for semantic search." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DYhPj0Ta8l_A" + }, + "outputs": [], + "source": [ + "# Jina doesnt support openai other than 0.27\n", + "%pip install --quiet datasets==3.6.0 langchain-couchbase==0.3.0 langchain-community==0.3.24 openai==0.27 python-dotenv==1.1.0 ipywidgets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pp7GtNg8mB9" + }, + "source": [ + "# Importing Necessary Libraries\n", + "The script starts by importing a series of libraries required for various tasks, including handling JSON, logging, time tracking, Couchbase connections, embedding generation, and dataset loading. These libraries provide essential functions for working with data, managing database connections, and processing machine learning models." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8GzS6tfL8mFP" + }, + "outputs": [], + "source": [ + "import getpass\n", + "import json\n", + "import logging\n", + "import os\n", + "import time\n", + "from datetime import timedelta\n", + "\n", + "from couchbase.auth import PasswordAuthenticator\n", + "from couchbase.cluster import Cluster\n", + "from couchbase.exceptions import (CouchbaseException,\n", + " InternalServerFailureException,\n", + " QueryIndexAlreadyExistsException,\n", + " ServiceUnavailableException)\n", + "from couchbase.management.buckets import CreateBucketSettings\n", + "from couchbase.management.search import SearchIndex\n", + "from couchbase.options import ClusterOptions\n", + "from datasets import load_dataset\n", + "from dotenv import load_dotenv\n", + "from langchain_community.chat_models import JinaChat\n", + "from langchain_community.embeddings import JinaEmbeddings\n", + "from langchain_core.globals import set_llm_cache\n", + "from 
langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.prompts import ChatPromptTemplate\n", + "from langchain_core.prompts.chat import ChatPromptTemplate\n", + "from langchain_core.runnables import RunnablePassthrough\n", + "from langchain_couchbase.cache import CouchbaseCache\n", + "from langchain_couchbase.vectorstores import CouchbaseSearchVectorStore" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pBnMp5vb8mIb" + }, + "source": [ + "# Setup Logging\n", + "Logging is configured to track the progress of the script and capture any errors or warnings. This is crucial for debugging and understanding the flow of execution. The logging output includes timestamps, log levels (e.g., INFO, ERROR), and messages that describe what is happening in the script.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "Yv8kWcuf8mLx" + }, + "outputs": [], + "source": [ + "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s',force=True)\n", + "\n", + "# Suppress all logs from specific loggers\n", + "logging.getLogger('openai').setLevel(logging.WARNING)\n", + "logging.getLogger('httpx').setLevel(logging.WARNING)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "K9G5a0en8mPA" + }, + "source": [ + "# Loading Sensitive Information\n", + "In this section, we prompt the user to input essential configuration settings needed for integrating Couchbase with Jina AI's API. These settings include sensitive information like API keys, database credentials, and specific configuration names. Instead of hardcoding these details into the script, we request the user to provide them at runtime, ensuring flexibility and security.\n", + "\n", + "The script also validates that all required inputs are provided, raising an error if any crucial information is missing. 
This approach ensures that your integration is both secure and correctly configured without hardcoding sensitive information, enhancing the overall security and maintainability of your code." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "PFGyHll18mSe" + }, + "outputs": [], + "source": [ + "load_dotenv(\"./.env\") \n", + "\n", + "JINA_API_KEY = os.getenv(\"JINA_API_KEY\")\n", + "JINACHAT_API_KEY = os.getenv(\"JINACHAT_API_KEY\")\n", + "\n", + "CB_HOST = os.getenv(\"CB_HOST\") or 'couchbase://localhost'\n", + "CB_USERNAME = os.getenv(\"CB_USERNAME\") or 'Administrator'\n", + "CB_PASSWORD = os.getenv(\"CB_PASSWORD\") or 'password'\n", + "CB_BUCKET_NAME = os.getenv(\"CB_BUCKET_NAME\") or 'vector-search-testing'\n", + "INDEX_NAME = os.getenv(\"INDEX_NAME\") or 'vector_search_jina'\n", + "\n", + "SCOPE_NAME = os.getenv(\"SCOPE_NAME\") or 'shared'\n", + "COLLECTION_NAME = os.getenv(\"COLLECTION_NAME\") or 'jina'\n", + "CACHE_COLLECTION = os.getenv(\"CACHE_COLLECTION\") or 'cache'\n", + "\n", + "# Check if the variables are correctly loaded\n", + "if not JINA_API_KEY:\n", + " raise ValueError(\"JINA_API_KEY environment variable is not set\")\n", + "if not JINACHAT_API_KEY:\n", + " raise ValueError(\"JINACHAT_API_KEY environment variable is not set\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qtGrYzUY8mV3" + }, + "source": [ + "# Connecting to the Couchbase Cluster\n", + "Connecting to a Couchbase cluster is the foundation of our project. Couchbase will serve as our primary data store, handling all the storage and retrieval operations required for our semantic search engine. By establishing this connection, we enable our application to interact with the database, allowing us to perform operations such as storing embeddings, querying data, and managing collections. 
This connection is the gateway through which all data will flow, so ensuring it's set up correctly is paramount.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "Zb3kK-7W8mZK" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-23 10:47:18,035 - INFO - Successfully loaded the BBC News dataset with 2687 rows.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loaded the BBC News dataset with 2687 rows\n" - ] - } - ], - "source": [ - "try:\n", - " news_dataset = load_dataset(\n", - " \"RealTimeData/bbc_news_alltime\", \"2024-12\", split=\"train\"\n", - " )\n", - " print(f\"Loaded the BBC News dataset with {len(news_dataset)} rows\")\n", - " logging.info(f\"Successfully loaded the BBC News dataset with {len(news_dataset)} rows.\")\n", - "except Exception as e:\n", - " raise ValueError(f\"Error loading the BBC News dataset: {str(e)}\")" - ] - }, + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-23 10:45:51,014 - INFO - Successfully connected to Couchbase\n" + ] + } + ], + "source": [ + "try:\n", + " auth = PasswordAuthenticator(CB_USERNAME, CB_PASSWORD)\n", + " options = ClusterOptions(auth)\n", + " cluster = Cluster(CB_HOST, options)\n", + " cluster.wait_until_ready(timedelta(seconds=5))\n", + " logging.info(\"Successfully connected to Couchbase\")\n", + "except Exception as e:\n", + " raise ConnectionError(f\"Failed to connect to Couchbase: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "C_Gpy32N8mcZ" + }, + "source": [ + "## Setting Up Collections in Couchbase\n", + "\n", + "The setup_collection() function handles creating and configuring the hierarchical data organization in Couchbase:\n", + "\n", + "1. 
Bucket Creation:\n", + " - Checks if specified bucket exists, creates it if not\n", + " - Sets bucket properties like RAM quota (1024MB) and replication (disabled)\n", + " - Note: You will not be able to create a bucket on Capella\n", + "\n", + "2. Scope Management: \n", + " - Verifies if requested scope exists within bucket\n", + " - Creates new scope if needed (unless it's the default \"_default\" scope)\n", + "\n", + "3. Collection Setup:\n", + " - Checks for collection existence within scope\n", + " - Creates collection if it doesn't exist\n", + " - Waits 2 seconds for collection to be ready\n", + "\n", + "Additional Tasks:\n", + "- Creates primary index on collection for query performance\n", + "- Clears any existing documents for clean state\n", + "- Implements comprehensive error handling and logging\n", + "\n", + "The function is called twice to set up:\n", + "1. Main collection for vector embeddings\n", + "2. Cache collection for storing results\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "ACZcwUnG8mf2" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Cleaning up the Data\n", - "We will use the content of the news articles for our RAG system.\n", - "\n", - "The dataset contains a few duplicate records. We are removing them to avoid duplicate results in the retrieval stage of our RAG system." - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-23 10:45:56,608 - INFO - Bucket 'vector-search-testing' exists.\n", + "2025-09-23 10:45:59,312 - INFO - Collection 'jina' already exists. Skipping creation.\n", + "2025-09-23 10:46:02,683 - INFO - Primary index present or created successfully.\n", + "2025-09-23 10:46:03,447 - INFO - All documents cleared from the collection.\n", + "2025-09-23 10:46:03,449 - INFO - Bucket 'vector-search-testing' exists.\n", + "2025-09-23 10:46:06,152 - INFO - Collection 'jina_cache' already exists. 
Skipping creation.\n", + "2025-09-23 10:46:09,482 - INFO - Primary index present or created successfully.\n", + "2025-09-23 10:46:09,804 - INFO - All documents cleared from the collection.\n" + ] }, { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "We have 1749 unique articles in our database.\n" - ] - } - ], - "source": [ - "news_articles = news_dataset[\"content\"]\n", - "unique_articles = set()\n", - "for article in news_articles:\n", - " if article:\n", - " unique_articles.add(article)\n", - "unique_news_articles = list(unique_articles)\n", - "print(f\"We have {len(unique_news_articles)} unique articles in our database.\")" + "data": { + "text/plain": [ + "" ] - }, + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def setup_collection(cluster, bucket_name, scope_name, collection_name):\n", + " try:\n", + " # Check if bucket exists, create if it doesn't\n", + " try:\n", + " bucket = cluster.bucket(bucket_name)\n", + " logging.info(f\"Bucket '{bucket_name}' exists.\")\n", + " except Exception as e:\n", + " logging.info(f\"Bucket '{bucket_name}' does not exist. 
Creating it...\")\n", + " bucket_settings = CreateBucketSettings(\n", + " name=bucket_name,\n", + " bucket_type='couchbase',\n", + " ram_quota_mb=1024,\n", + " flush_enabled=True,\n", + " num_replicas=0\n", + " )\n", + " cluster.buckets().create_bucket(bucket_settings)\n", + " time.sleep(2) # Wait for bucket creation to complete and become available\n", + " bucket = cluster.bucket(bucket_name)\n", + " logging.info(f\"Bucket '{bucket_name}' created successfully.\")\n", + "\n", + " bucket_manager = bucket.collections()\n", + "\n", + " # Check if scope exists, create if it doesn't\n", + " scopes = bucket_manager.get_all_scopes()\n", + " scope_exists = any(scope.name == scope_name for scope in scopes)\n", + " \n", + " if not scope_exists and scope_name != \"_default\":\n", + " logging.info(f\"Scope '{scope_name}' does not exist. Creating it...\")\n", + " bucket_manager.create_scope(scope_name)\n", + " logging.info(f\"Scope '{scope_name}' created successfully.\")\n", + "\n", + " # Check if collection exists, create if it doesn't\n", + " collections = bucket_manager.get_all_scopes()\n", + " collection_exists = any(\n", + " scope.name == scope_name and collection_name in [col.name for col in scope.collections]\n", + " for scope in collections\n", + " )\n", + "\n", + " if not collection_exists:\n", + " logging.info(f\"Collection '{collection_name}' does not exist. Creating it...\")\n", + " bucket_manager.create_collection(scope_name, collection_name)\n", + " logging.info(f\"Collection '{collection_name}' created successfully.\")\n", + " else:\n", + " logging.info(f\"Collection '{collection_name}' already exists. 
Skipping creation.\")\n", + "\n", + " # Wait for collection to be ready\n", + " collection = bucket.scope(scope_name).collection(collection_name)\n", + " time.sleep(2) # Give the collection time to be ready for queries\n", + "\n", + " # Ensure primary index exists\n", + " try:\n", + " cluster.query(f\"CREATE PRIMARY INDEX IF NOT EXISTS ON `{bucket_name}`.`{scope_name}`.`{collection_name}`\").execute()\n", + " logging.info(\"Primary index present or created successfully.\")\n", + " except Exception as e:\n", + " logging.warning(f\"Error creating primary index: {str(e)}\")\n", + "\n", + " # Clear all documents in the collection\n", + " try:\n", + " query = f\"DELETE FROM `{bucket_name}`.`{scope_name}`.`{collection_name}`\"\n", + " cluster.query(query).execute()\n", + " logging.info(\"All documents cleared from the collection.\")\n", + " except Exception as e:\n", + " logging.warning(f\"Error while clearing documents: {str(e)}. The collection might be empty.\")\n", + "\n", + " return collection\n", + " except Exception as e:\n", + " raise RuntimeError(f\"Error setting up collection: {str(e)}\")\n", + " \n", + "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, COLLECTION_NAME)\n", + "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, CACHE_COLLECTION)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NMJ7RRYp8mjV" + }, + "source": [ + "# Loading Couchbase Search Vector Index\n", + "\n", + "Semantic search requires an efficient way to retrieve relevant documents based on a user's query. This is where the Couchbase **Search Vector Index** comes into play. In this step, we load the Search Vector Index definition from a JSON file, which specifies how the index should be structured. 
This includes the fields to be indexed, the dimensions of the vectors, and other parameters that determine how the search engine processes queries based on vector similarity.\n", + "\n", + "This Jina Search Vector Index configuration requires specific default settings to function properly. This tutorial uses the bucket named `vector-search-testing` with the scope `shared` and collection `jina`. The configuration is set up for vectors with exactly `1024 dimensions`, using dot product similarity and optimized for recall. If you want to use a different bucket, scope, or collection, you will need to modify the index configuration accordingly.\n", + "\n", + "For more information on creating a Search Vector Index, please follow the [instructions](https://docs.couchbase.com/cloud/vector-search/create-vector-search-index-ui.html).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "y7xiCrOc8mmj" + }, + "outputs": [], + "source": [ + "# If you are running this script locally (not in Google Colab), uncomment the following line\n", + "# and provide the path to your index definition file.\n", + "\n", + "# index_definition_path = '/path_to_your_index_file/jina_index.json' # Local setup: specify your file path here\n", + "\n", + "# # Version for Google Colab\n", + "# def load_index_definition_colab():\n", + "# from google.colab import files\n", + "# print(\"Upload your index definition file\")\n", + "# uploaded = files.upload()\n", + "# index_definition_path = list(uploaded.keys())[0]\n", + "\n", + "# try:\n", + "# with open(index_definition_path, 'r') as file:\n", + "# index_definition = json.load(file)\n", + "# return index_definition\n", + "# except Exception as e:\n", + "# raise ValueError(f\"Error loading index definition from {index_definition_path}: {str(e)}\")\n", + "\n", + "# Version for Local Environment\n", + "def load_index_definition_local(index_definition_path):\n", + " try:\n", + " with open(index_definition_path, 'r') as 
file:\n", + " index_definition = json.load(file)\n", + " return index_definition\n", + " except Exception as e:\n", + " raise ValueError(f\"Error loading index definition from {index_definition_path}: {str(e)}\")\n", + "\n", + "# Usage\n", + "# Uncomment the appropriate line based on your environment\n", + "# index_definition = load_index_definition_colab()\n", + "index_definition = load_index_definition_local('jina_index.json')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v_ddPQ_Y8mpm" + }, + "source": [ + "# Creating or Updating Search Indexes\n", + "\n", + "With the index definition loaded, the next step is to create or update the **Vector Search Index** in Couchbase. This step is crucial because it optimizes our database for vector similarity search operations, allowing us to perform searches based on the semantic content of documents rather than just keywords. By creating or updating a Vector Search Index, we enable our search engine to handle complex queries that involve finding semantically similar documents using vector embeddings, which is essential for a robust semantic search engine." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "bHEpUu1l8msx" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Saving Data to the Vector Store\n", - "To efficiently handle the large number of articles, we process them in batches of articles at a time. This batch processing approach helps manage memory usage and provides better control over the ingestion process.\n", - "\n", - "We first filter out any articles that exceed 50,000 characters to avoid potential issues with token limits. Then, using the vector store's add_texts method, we add the filtered articles to our vector database. The batch_size parameter controls how many articles are processed in each iteration.\n", - "\n", - "This approach offers several benefits:\n", - "1. 
Memory Efficiency: Processing in smaller batches prevents memory overload\n", - "2. Error Handling: If an error occurs, only the current batch is affected\n", - "3. Progress Tracking: Easier to monitor and track the ingestion progress\n", - "4. Resource Management: Better control over CPU and network resource utilization\n", - "\n", - "We use a conservative batch size of 50 to ensure reliable operation.\n", - "The optimal batch size depends on many factors including:\n", - "- Document sizes being inserted\n", - "- Available system resources\n", - "- Network conditions\n", - "- Concurrent workload\n", - "\n", - "Consider measuring performance with your specific workload before adjusting.\n" - ] - }, + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-23 10:47:03,763 - INFO - Index 'vector_search_jina' found\n", + "2025-09-23 10:47:04,742 - INFO - Index 'vector_search_jina' already exists. Skipping creation/update.\n" + ] + } + ], + "source": [ + "try:\n", + " scope_index_manager = cluster.bucket(CB_BUCKET_NAME).scope(SCOPE_NAME).search_indexes()\n", + "\n", + " # Check if index already exists\n", + " existing_indexes = scope_index_manager.get_all_indexes()\n", + " index_name = index_definition[\"name\"]\n", + "\n", + " if index_name in [index.name for index in existing_indexes]:\n", + " logging.info(f\"Index '{index_name}' found\")\n", + " else:\n", + " logging.info(f\"Creating new index '{index_name}'...\")\n", + "\n", + " # Create SearchIndex object from JSON definition\n", + " search_index = SearchIndex.from_json(index_definition)\n", + "\n", + " # Upsert the index (create if not exists, update if exists)\n", + " scope_index_manager.upsert_index(search_index)\n", + " logging.info(f\"Index '{index_name}' successfully created/updated.\")\n", + "\n", + "except QueryIndexAlreadyExistsException:\n", + " logging.info(f\"Index '{index_name}' already exists. 
Skipping creation/update.\")\n", + "except ServiceUnavailableException:\n", + " raise RuntimeError(\"Search service is not available. Please ensure the Search service is enabled in your Couchbase cluster.\")\n", + "except InternalServerFailureException as e:\n", + " logging.error(f\"Internal server error: {str(e)}\")\n", + " raise" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7FvxRsg38m3G" + }, + "source": [ + "# Creating Jina Embeddings\n", + "Embeddings are at the heart of semantic search. They are numerical representations of text that capture the semantic meaning of the words and phrases. Unlike traditional keyword-based search, which looks for exact matches, embeddings allow our search engine to understand the context and nuances of language, enabling it to retrieve documents that are semantically similar to the query, even if they don't contain the exact keywords. By creating embeddings using Jina, we equip our search engine with the ability to understand and process natural language in a way that's much closer to how humans understand language. 
This step transforms our raw text data into a format that the search engine can use to find and rank relevant documents.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "_75ZyCRh8m6m" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-23 10:50:03,866 - INFO - Document ingestion completed successfully\n" - ] - } - ], - "source": [ - "# Calculate 60% of the dataset size and round to nearest integer\n", - "dataset_size = len(unique_news_articles)\n", - "subset_size = round(dataset_size * 0.6)\n", - "\n", - "# Filter articles by length and create subset\n", - "filtered_articles = [article for article in unique_news_articles[:subset_size] \n", - " if article and len(article) <= 50000]\n", - "\n", - "# Process in batches\n", - "batch_size = 50\n", - "\n", - "try:\n", - " vector_store.add_texts(\n", - " texts=filtered_articles,\n", - " batch_size=batch_size\n", - " )\n", - " logging.info(\"Document ingestion completed successfully\")\n", - " \n", - "except CouchbaseException as e:\n", - " logging.error(f\"Couchbase error during ingestion: {str(e)}\")\n", - " raise RuntimeError(f\"Error performing document ingestion: {str(e)}\")\n", - "except Exception as e:\n", - " if \"Payment Required\" in str(e):\n", - " logging.error(\"Payment required for Jina AI API. Please check your subscription status and API key.\")\n", - " print(\"To resolve this error:\")\n", - " print(\"1. Visit 'https://jina.ai/reader/#pricing' to review subscription options\")\n", - " print(\"2. Ensure your API key is valid and has sufficient credits\") \n", - " print(\"3. 
Consider upgrading your subscription plan if needed\")\n", - " else:\n", - " logging.error(f\"Unexpected error during ingestion: {str(e)}\")\n", - " raise RuntimeError(f\"Failed to save documents to vector store: {str(e)}\")" - ] - }, + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-23 10:47:06,326 - INFO - Successfully created JinaEmbeddings\n" + ] + } + ], + "source": [ + "try:\n", + " embeddings = JinaEmbeddings(\n", + " jina_api_key=JINA_API_KEY, model_name=\"jina-embeddings-v3\"\n", + " )\n", + " logging.info(\"Successfully created JinaEmbeddings\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Error creating JinaEmbeddings: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8IwZMUnF8m-N" + }, + "source": [ + "# Setting Up the Couchbase Vector Store\n", + "A vector store is where we'll keep our embeddings. Unlike the search vector index, which is used for text-based search, the vector store is specifically designed to handle embeddings and perform similarity searches. When a user inputs a query, the search engine converts the query into an embedding and compares it against the embeddings stored in the vector store. This allows the engine to find documents that are semantically similar to the query, even if they don't contain the exact same words. By setting up the vector store in Couchbase, we create a powerful tool that enables our search engine to understand and retrieve information based on the meaning and context of the query, rather than just the specific words used." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "DwIJQjYT9RV_" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "8Pn8-dQw9RfQ" - }, - "source": [ - "# Setting Up a Couchbase Cache\n", - "To further optimize our system, we set up a Couchbase-based cache. 
A cache is a temporary storage layer that holds data that is frequently accessed, speeding up operations by reducing the need to repeatedly retrieve the same information from the database. In our setup, the cache will help us accelerate repetitive tasks, such as looking up similar documents. By implementing a cache, we enhance the overall performance of our search engine, ensuring that it can handle high query volumes and deliver results quickly.\n", - "\n", - "Caching is particularly valuable in scenarios where users may submit similar queries multiple times or where certain pieces of information are frequently requested. By storing these in a cache, we can significantly reduce the time it takes to respond to these queries, improving the user experience.\n" - ] - }, + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-23 10:47:12,343 - INFO - Successfully created vector store\n" + ] + } + ], + "source": [ + "try:\n", + " vector_store = CouchbaseSearchVectorStore(\n", + " cluster=cluster,\n", + " bucket_name=CB_BUCKET_NAME,\n", + " scope_name=SCOPE_NAME,\n", + " collection_name=COLLECTION_NAME,\n", + " embedding=embeddings,\n", + " index_name=INDEX_NAME,\n", + " )\n", + " logging.info(\"Successfully created vector store\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Failed to create vector store: {str(e)}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load the BBC News Dataset\n", + "To build a search engine, we need data to search through. We use the BBC News dataset from RealTimeData, which provides real-world news articles. This dataset contains news articles from BBC covering various topics and time periods. Loading the dataset is a crucial step because it provides the raw material that our search engine will work with. 
The quality and diversity of the news articles make it an excellent choice for testing and refining our search engine, ensuring it can handle real-world news content effectively.\n", + "\n", + "The BBC News dataset allows us to work with authentic news articles, enabling us to build and test a search engine that can effectively process and retrieve relevant news content. The dataset is loaded using the Hugging Face datasets library, specifically accessing the \"RealTimeData/bbc_news_alltime\" dataset with the \"2024-12\" version." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "id": "V2y7dyjf9Rid" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-23 10:50:21,526 - INFO - Successfully created cache\n" - ] - } - ], - "source": [ - "try:\n", - " cache = CouchbaseCache(\n", - " cluster=cluster,\n", - " bucket_name=CB_BUCKET_NAME,\n", - " scope_name=SCOPE_NAME,\n", - " collection_name=CACHE_COLLECTION,\n", - " )\n", - " logging.info(\"Successfully created cache\")\n", - " set_llm_cache(cache)\n", - "except Exception as e:\n", - " raise ValueError(f\"Failed to create cache: {str(e)}\")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-23 10:47:18,035 - INFO - Successfully loaded the BBC News dataset with 2687 rows.\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "uehAx36o9Rlm" - }, - "source": [ - "# Creating the Jina Language Model (LLM)\n", - "Language models are AI systems that are trained to understand and generate human language. We'll be using Jina's language model to process user queries and generate meaningful responses. This model is a key component of our semantic search engine, allowing it to go beyond simple keyword matching and truly understand the intent behind a query. 
By creating this language model, we equip our search engine with the ability to interpret complex queries, understand the nuances of language, and provide more accurate and contextually relevant responses.\n", - "\n", - "The language model's ability to understand context and generate coherent responses is what makes our search engine truly intelligent. It can not only find the right information but also present it in a way that is useful and understandable to the user.\n", - "\n" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded the BBC News dataset with 2687 rows\n" + ] + } + ], + "source": [ + "try:\n", + " news_dataset = load_dataset(\n", + " \"RealTimeData/bbc_news_alltime\", \"2024-12\", split=\"train\"\n", + " )\n", + " print(f\"Loaded the BBC News dataset with {len(news_dataset)} rows\")\n", + " logging.info(f\"Successfully loaded the BBC News dataset with {len(news_dataset)} rows.\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Error loading the BBC News dataset: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleaning up the Data\n", + "We will use the content of the news articles for our RAG system.\n", + "\n", + "The dataset contains a few duplicate records. We are removing them to avoid duplicate results in the retrieval stage of our RAG system." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "id": "yRAfBRLH9RpO" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-23 10:50:22,466 - INFO - Successfully created JinaChat\n" - ] - } - ], - "source": [ - "try:\n", - " llm = JinaChat(temperature=0.1, jinachat_api_key=JINACHAT_API_KEY)\n", - " logging.info(\"Successfully created JinaChat\")\n", - "except Exception as e:\n", - " logging.error(f\"Error creating JinaChat: {str(e)}. 
Please check your API key and network connection.\")\n", - " raise" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "We have 1749 unique articles in our database.\n" + ] + } + ], + "source": [ + "news_articles = news_dataset[\"content\"]\n", + "unique_articles = set()\n", + "for article in news_articles:\n", + " if article:\n", + " unique_articles.add(article)\n", + "unique_news_articles = list(unique_articles)\n", + "print(f\"We have {len(unique_news_articles)} unique articles in our database.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Saving Data to the Vector Store\n", + "To efficiently handle the large number of articles, we process them in batches of articles at a time. This batch processing approach helps manage memory usage and provides better control over the ingestion process.\n", + "\n", + "We first filter out any articles that exceed 50,000 characters to avoid potential issues with token limits. Then, using the vector store's add_texts method, we add the filtered articles to our vector database. The batch_size parameter controls how many articles are processed in each iteration.\n", + "\n", + "This approach offers several benefits:\n", + "1. Memory Efficiency: Processing in smaller batches prevents memory overload\n", + "2. Error Handling: If an error occurs, only the current batch is affected\n", + "3. Progress Tracking: Easier to monitor and track the ingestion progress\n", + "4. 
Resource Management: Better control over CPU and network resource utilization\n", + "\n", + "We use a conservative batch size of 50 to ensure reliable operation.\n", + "The optimal batch size depends on many factors including:\n", + "- Document sizes being inserted\n", + "- Available system resources\n", + "- Network conditions\n", + "- Concurrent workload\n", + "\n", + "Consider measuring performance with your specific workload before adjusting.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "afOOEECGiLuQ" - }, - "source": [ - "## Perform Semantic Search\n", - "Semantic search in Couchbase involves converting queries and documents into vector representations using an embeddings model. These vectors capture the semantic meaning of the text and are stored directly in Couchbase. When a query is made, Couchbase performs a similarity search by comparing the query vector against the stored document vectors. The similarity metric used for this comparison is configurable, allowing flexibility in how the relevance of documents is determined.\n", - "\n", - "In the provided code, the search process begins by recording the start time, followed by executing the similarity_search_with_score method of the CouchbaseSearchVectorStore. This method searches Couchbase for the most relevant documents based on the vector similarity to the query. The search results include the document content and a similarity score that reflects how closely each document aligns with the query in the defined semantic space. The time taken to perform this search is then calculated and logged, and the results are displayed, showing the most relevant documents along with their similarity scores. This approach leverages Couchbase as both a storage and retrieval engine for vector data, enabling efficient and scalable semantic searches. 
The integration of vector storage and search capabilities within Couchbase allows for sophisticated semantic search operations without relying on external services for vector storage or comparison.\n", - "\n", - "### Note on Retry Mechanism\n", - "The search implementation includes a retry mechanism to handle rate limiting and API errors gracefully. If a rate limit error (HTTP 429) is encountered, the system will automatically retry the request up to 3 times with exponential backoff, waiting 2 seconds initially and doubling the wait time between each retry. This helps manage API usage limits while maintaining service reliability. For other types of errors, such as payment requirements or general failures, appropriate error messages and troubleshooting steps are provided to help diagnose and resolve the issue." - ] - }, + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-23 10:50:03,866 - INFO - Document ingestion completed successfully\n" + ] + } + ], + "source": [ + "# Calculate 60% of the dataset size and round to nearest integer\n", + "dataset_size = len(unique_news_articles)\n", + "subset_size = round(dataset_size * 0.6)\n", + "\n", + "# Filter articles by length and create subset\n", + "filtered_articles = [article for article in unique_news_articles[:subset_size] \n", + " if article and len(article) <= 50000]\n", + "\n", + "# Process in batches\n", + "batch_size = 50\n", + "\n", + "try:\n", + " vector_store.add_texts(\n", + " texts=filtered_articles,\n", + " batch_size=batch_size\n", + " )\n", + " logging.info(\"Document ingestion completed successfully\")\n", + " \n", + "except CouchbaseException as e:\n", + " logging.error(f\"Couchbase error during ingestion: {str(e)}\")\n", + " raise RuntimeError(f\"Error performing document ingestion: {str(e)}\")\n", + "except Exception as e:\n", + " if \"Payment Required\" in str(e):\n", + " logging.error(\"Payment required for Jina AI API. 
Please check your subscription status and API key.\")\n", + " print(\"To resolve this error:\")\n", + " print(\"1. Visit 'https://jina.ai/reader/#pricing' to review subscription options\")\n", + " print(\"2. Ensure your API key is valid and has sufficient credits\") \n", + " print(\"3. Consider upgrading your subscription plan if needed\")\n", + " else:\n", + " logging.error(f\"Unexpected error during ingestion: {str(e)}\")\n", + " raise RuntimeError(f\"Failed to save documents to vector store: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8Pn8-dQw9RfQ" + }, + "source": [ + "# Setting Up a Couchbase Cache\n", + "To further optimize our system, we set up a Couchbase-based cache. A cache is a temporary storage layer that holds data that is frequently accessed, speeding up operations by reducing the need to repeatedly retrieve the same information from the database. In our setup, the cache will help us accelerate repetitive tasks, such as looking up similar documents. By implementing a cache, we enhance the overall performance of our search engine, ensuring that it can handle high query volumes and deliver results quickly.\n", + "\n", + "Caching is particularly valuable in scenarios where users may submit similar queries multiple times or where certain pieces of information are frequently requested. 
By storing these in a cache, we can significantly reduce the time it takes to respond to these queries, improving the user experience.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "id": "V2y7dyjf9Rid" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "id": "y3oO33_LiLxU" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-23 10:50:25,678 - INFO - Semantic search completed in 2.13 seconds\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Semantic Search Results (completed in 2.13 seconds):\n", - "--------------------------------------------------------------------------------\n", - "Score: 0.6798, Text: 'Self-doubt, errors & big changes' - inside the crisis at Man City\n", - "\n", - "Pep Guardiola has not been through a moment like this in his managerial career. Manchester City have lost nine matches in their past 12 - as many defeats as they had suffered in their previous 106 fixtures. At the end of October, City were still unbeaten at the top of the Premier League and favourites to win a fifth successive title. Now they are seventh, 12 points behind leaders Liverpool having played a game more. It has been an incredible fall from grace and left people trying to work out what has happened - and whether Guardiola can make it right. After discussing the situation with those who know him best, I have taken a closer look at the future - both short and long term - and how the current crisis at Man City is going to be solved.\n", - "\n", - "Pep Guardiola's Man City have lost nine of their past 12 matches\n", - "\n", - "Guardiola has also been giving it a lot of thought. He has not been sleeping very well, as he has said, and has not been himself at times when talking to the media. He has been talking to a lot of people about what is going on as he tries to work out the reasons for City's demise. 
Some reasons he knows, others he still doesn't. What people perhaps do not realise is Guardiola hugely doubts himself and always has. He will be thinking \"I'm not going to be able to get us out of this\" and needs the support of people close to him to push away those insecurities - and he has that. He is protected by his people who are very aware, like he is, that there are a lot of people that want City to fail. It has been a turbulent time for Guardiola. Remember those marks he had on his head after the 3-3 draw with Feyenoord in the Champions League? He always scratches his head, it is a gesture of nervousness. Normally nothing happens but on that day one of his nails was far too sharp so, after talking to the players in the changing room where he scratched his head because of his usual agitated gesturing, he went to the news conference. His right-hand man Manel Estiarte sent him photos in a message saying \"what have you got on your head?\", but by the time Guardiola returned to the coaching room there was hardly anything there again. He started that day with a cover on his nose after the same thing happened at the training ground the day before. Guardiola was having a footballing debate with Kyle Walker about positional stuff and marked his nose with that same nail. There was also that remarkable news conference after the Manchester derby when he said \"I don't know what to do\". That is partly true and partly not true. Ignore the fact Guardiola suggested he was \"not good enough\". He actually meant he was not good enough to resolve the situation with the group of players he has available and with all the other current difficulties. There are obviously logical explanations for the crisis and the first one has been talked about many times - the absence of injured midfielder Rodri. You know the game Jenga? When you take the wrong piece out, the whole tower collapses. That is what has happened here. 
It is normal for teams to have an over-reliance on one player if he is the best in the world in his position. And you cannot calculate the consequences of an injury that rules someone like Rodri out for the season. City are a team, like many modern ones, in which the holding midfielder is a key element to the construction. So, when you take Rodri out, it is difficult to hold it together. There were Plan Bs - John Stones, Manuel Akanji, even Nathan Ake - but injuries struck. The big injury list has been out of the ordinary and the busy calendar has also played a part in compounding the issues. However, one factor even Guardiola cannot explain is the big uncharacteristic errors in almost every game from international players. Why did Matheus Nunes make that challenge to give away the penalty against Manchester United? Jack Grealish is sent on at the end to keep the ball and cannot do that. There are errors from Walker and other defenders. These are some of the best players in the world. Of course the players' mindset is important, and confidence is diminishing. Wrong decisions get taken so there is almost panic on the pitch instead of calm. There are also players badly out of form who are having to play because of injuries. Walker is now unable to hide behind his pace, I'm not sure Kevin de Bruyne is ever getting back to the level he used to be at, Bernardo Silva and Ilkay Gundogan do not have time to rest, Grealish is not playing at his best. Some of these players were only meant to be playing one game a week but, because of injuries, have played 12 games in 40 days. It all has a domino effect. One consequence is that Erling Haaland isn't getting the service to score. But the Norwegian still remains City's top-scorer with 13. Defender Josko Gvardiol is next on the list with just four. The way their form has been analysed inside the City camp is there have only been three games where they deserved to lose (Liverpool, Bournemouth and Aston Villa). 
But of course it is time to change the dynamic.\n", - "\n", - "Guardiola has never protected his players so much. He has not criticised them and is not going to do so. They have won everything with him. Instead of doing more with them, he has tried doing less. He has sometimes given them more days off to clear their heads, so they can reset - two days this week for instance. Perhaps the time to change a team is when you are winning, but no-one was suggesting Man City were about to collapse when they were top and unbeaten after nine league games. Some people have asked how bad it has to get before City make a decision on Guardiola. The answer is that there is no decision to be made. Maybe if this was Real Madrid, Barcelona or Juventus, the pressure from outside would be massive and the argument would be made that Guardiola has to go. At City he has won the lot, so how can anyone say he is failing? Yes, this is a crisis. But given all their problems, City's renewed target is finishing in the top four. That is what is in all their heads now. The idea is to recover their essence by improving defensive concepts that are not there and re-establishing the intensity they are known for. Guardiola is planning to use the next two years of his contract, which is expected to be his last as a club manager, to prepare a new Manchester City. When he was at the end of his four years at Barcelona, he asked two managers what to do when you feel people are not responding to your instructions. Do you go or do the players go? Sir Alex Ferguson and Rafael Benitez both told him that the players need to go. Guardiola did not listen because of his emotional attachment to his players back then and he decided to leave the Camp Nou because he felt the cycle was over. He will still protect his players now but there is not the same emotional attachment - so it is the players who are going to leave this time. It is likely City will look to replace five or six regular starters. 
Guardiola knows it is the end of an era and the start of a new one. Changes will not be immediate and the majority of the work will be done in the summer. But they are open to any opportunities in January - and a holding midfielder is one thing they need. In the summer City might want to get Spain's Martin Zubimendi from Real Sociedad and they know 60m euros (£50m) will get him. He said no to Liverpool last summer even though everything was agreed, but he now wants to move on and the Premier League is the target. Even if they do not get Zubimendi, that is the calibre of footballer they are after. A new Manchester City is on its way - with changes driven by Guardiola, incoming sporting director Hugo Viana and the football department.\n", - "--------------------------------------------------------------------------------\n", - "Score: 0.6795, Text: 'Self-doubt, errors & big changes' - inside the crisis at Man City\n", - "\n", - "Pep Guardiola has not been through a moment like this in his managerial career. Manchester City have lost nine matches in their past 12 - as many defeats as they had suffered in their previous 106 fixtures. At the end of October, City were still unbeaten at the top of the Premier League and favourites to win a fifth successive title. Now they are seventh, 12 points behind leaders Liverpool having played a game more. It has been an incredible fall from grace and left people trying to work out what has happened - and whether Guardiola can make it right. After discussing the situation with those who know him best, I have taken a closer look at the future - both short and long term - and how the current crisis at Man City is going to be solved.\n", - "\n", - "Pep Guardiola's Man City have lost nine of their past 12 matches\n", - "\n", - "Guardiola has also been giving it a lot of thought. He has not been sleeping very well, as he has said, and has not been himself at times when talking to the media. 
He has been talking to a lot of people about what is going on as he tries to work out the reasons for City's demise. Some reasons he knows, others he still doesn't. What people perhaps do not realise is Guardiola hugely doubts himself and always has. He will be thinking \"I'm not going to be able to get us out of this\" and needs the support of people close to him to push away those insecurities - and he has that. He is protected by his people who are very aware, like he is, that there are a lot of people that want City to fail. It has been a turbulent time for Guardiola. Remember those marks he had on his head after the 3-3 draw with Feyenoord in the Champions League? He always scratches his head, it is a gesture of nervousness. Normally nothing happens but on that day one of his nails was far too sharp so, after talking to the players in the changing room where he scratched his head because of his usual agitated gesturing, he went to the news conference. His right-hand man Manel Estiarte sent him photos in a message saying \"what have you got on your head?\", but by the time Guardiola returned to the coaching room there was hardly anything there again. He started that day with a cover on his nose after the same thing happened at the training ground the day before. Guardiola was having a footballing debate with Kyle Walker about positional stuff and marked his nose with that same nail. There was also that remarkable news conference after the Manchester derby when he said \"I don't know what to do\". That is partly true and partly not true. Ignore the fact Guardiola suggested he was \"not good enough\". He actually meant he was not good enough to resolve the situation with the group of players he has available and with all the other current difficulties. There are obviously logical explanations for the crisis and the first one has been talked about many times - the absence of injured midfielder Rodri. You know the game Jenga? 
When you take the wrong piece out, the whole tower collapses. That is what has happened here. It is normal for teams to have an over-reliance on one player if he is the best in the world in his position. And you cannot calculate the consequences of an injury that rules someone like Rodri out for the season. City are a team, like many modern ones, in which the holding midfielder is a key element to the construction. So, when you take Rodri out, it is difficult to hold it together. There were Plan Bs - John Stones, Manuel Akanji, even Nathan Ake - but injuries struck. The big injury list has been out of the ordinary and the busy calendar has also played a part in compounding the issues. However, one factor even Guardiola cannot explain is the big uncharacteristic errors in almost every game from international players. Why did Matheus Nunes make that challenge to give away the penalty against Manchester United? Jack Grealish is sent on at the end to keep the ball and cannot do that. There are errors from Walker and other defenders. These are some of the best players in the world. Of course the players' mindset is important, and confidence is diminishing. Wrong decisions get taken so there is almost panic on the pitch instead of calm. There are also players badly out of form who are having to play because of injuries. Walker is now unable to hide behind his pace, I'm not sure Kevin de Bruyne is ever getting back to the level he used to be at, Bernardo Silva and Ilkay Gundogan do not have time to rest, Grealish is not playing at his best. Some of these players were only meant to be playing one game a week but, because of injuries, have played 12 games in 40 days. It all has a domino effect. One consequence is that Erling Haaland isn't getting the service to score. But the Norwegian still remains City's top-scorer with 13. Defender Josko Gvardiol is next on the list with just four. 
The way their form has been analysed inside the City camp is there have only been three games where they deserved to lose (Liverpool, Bournemouth and Aston Villa). But of course it is time to change the dynamic.\n", - "\n", - "Guardiola has never protected his players so much. He has not criticised them and is not going to do so. They have won everything with him. Instead of doing more with them, he has tried doing less. He has sometimes given them more days off to clear their heads, so they can reset - two days this week for instance. Perhaps the time to change a team is when you are winning, but no-one was suggesting Man City were about to collapse when they were top and unbeaten after nine league games. Some people have asked how bad it has to get before City make a decision on Guardiola. The answer is that there is no decision to be made. Maybe if this was Real Madrid, Barcelona or Juventus, the pressure from outside would be massive and the argument would be made that Guardiola has to go. At City he has won the lot, so how can anyone say he is failing? Yes, this is a crisis. But given all their problems, City's renewed target is finishing in the top four. That is what is in all their heads now. The idea is to recover their essence by improving defensive concepts that are not there and re-establishing the intensity they are known for. Guardiola is planning to use the next two years of his contract, which is expected to be his last as a club manager, to prepare a new Manchester City. When he was at the end of his four years at Barcelona, he asked two managers what to do when you feel people are not responding to your instructions. Do you go or do the players go? Sir Alex Ferguson and Rafael Benitez both told him that the players need to go. Guardiola did not listen because of his emotional attachment to his players back then and he decided to leave the Camp Nou because he felt the cycle was over. 
He will still protect his players now but there is not the same emotional attachment - so it is the players who are going to leave this time. It is likely City will look to replace five or six regular starters. Guardiola knows it is the end of an era and the start of a new one. Changes will not be immediate and the majority of the work will be done in the summer. But they are open to any opportunities in January - and a holding midfielder is one thing they need. In the summer City might want to get Spain's Martin Zubimendi from Real Sociedad and they know 60m euros (£50m) will get him. He said no to Liverpool last summer even though everything was agreed, but he now wants to move on and the Premier League is the target. Even if they do not get Zubimendi, that is the calibre of footballer they are after. A new Manchester City is on its way - with changes driven by Guardiola, incoming sporting director Hugo Viana and the football department.\n", - "--------------------------------------------------------------------------------\n", - "Score: 0.6207, Text: Manchester City boss Pep Guardiola has won 18 trophies since he arrived at the club in 2016\n", - "\n", - "\n", - "... 
(output truncated for brevity)\n" - ] - } - ], - "source": [ - "def perform_semantic_search(query, vector_store, max_retries=3, retry_delay=2): \n", - " for attempt in range(max_retries):\n", - " try:\n", - " start_time = time.time()\n", - " search_results = vector_store.similarity_search_with_score(query, k=5)\n", - " search_elapsed_time = time.time() - start_time\n", - " \n", - " logging.info(f\"Semantic search completed in {search_elapsed_time:.2f} seconds\")\n", - " return search_results, search_elapsed_time\n", - " \n", - " except Exception as e:\n", - " error_str = str(e)\n", - " \n", - " # Check if it's a rate limit error (HTTP 429)\n", - " if \"http_status: 429\" in error_str or \"query request rejected\" in error_str:\n", - " logging.warning(f\"Rate limit hit (attempt {attempt+1}/{max_retries}). Waiting {retry_delay} seconds...\")\n", - " time.sleep(retry_delay)\n", - " retry_delay *= 2 # Exponential backoff\n", - " \n", - " if attempt == max_retries - 1:\n", - " logging.error(\"Maximum retry attempts reached. API rate limit exceeded.\")\n", - " raise RuntimeError(\"API rate limit exceeded. Please try again later or check your subscription.\")\n", - " else:\n", - " # For other errors, don't retry\n", - " logging.error(f\"Search error: {error_str}\")\n", - " if \"Payment Required\" in error_str:\n", - " raise RuntimeError(\"Payment required for Jina AI API. 
Please check your subscription status and API key.\")\n", - " else:\n", - " raise RuntimeError(f\"Search failed: {error_str}\")\n", - "\n", - "try:\n", - " query = \"What was manchester city manager pep guardiola's reaction to the team's current form?\"\n", - " search_results, search_elapsed_time = perform_semantic_search(query, vector_store)\n", - " \n", - " # Display search results\n", - " print(f\"\\nSemantic Search Results (completed in {search_elapsed_time:.2f} seconds):\")\n", - " print(\"-\"*80)\n", - " for doc, score in search_results:\n", - " print(f\"Score: {score:.4f}, Text: {doc.page_content}\")\n", - " print(\"-\"*80)\n", - " \n", - "except RuntimeError as e:\n", - " print(f\"Error: {str(e)}\")\n", - " print(\"\\nTroubleshooting steps:\")\n", - " if \"API rate limit\" in str(e):\n", - " print(\"1. Wait a few minutes before trying again\")\n", - " print(\"2. Reduce the frequency of your requests\")\n", - " print(\"3. Consider upgrading your Jina AI plan for higher rate limits\")\n", - " elif \"Payment required\" in str(e):\n", - " print(\"1. Visit 'https://jina.ai/reader/#pricing' to review subscription options\")\n", - " print(\"2. Ensure your API key is valid and has sufficient credits\")\n", - " print(\"3. Update your API key configuration\")\n", - " else:\n", - " print(\"1. Check your network connection\")\n", - " print(\"2. Verify your Couchbase and Jina configurations\")\n", - " print(\"3. 
Review the vector store implementation for any bugs\")" - ] - }, + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-23 10:50:21,526 - INFO - Successfully created cache\n" + ] + } + ], + "source": [ + "try:\n", + " cache = CouchbaseCache(\n", + " cluster=cluster,\n", + " bucket_name=CB_BUCKET_NAME,\n", + " scope_name=SCOPE_NAME,\n", + " collection_name=CACHE_COLLECTION,\n", + " )\n", + " logging.info(\"Successfully created cache\")\n", + " set_llm_cache(cache)\n", + "except Exception as e:\n", + " raise ValueError(f\"Failed to create cache: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uehAx36o9Rlm" + }, + "source": [ + "# Creating the Jina Language Model (LLM)\n", + "Language models are AI systems that are trained to understand and generate human language. We'll be using Jina's language model to process user queries and generate meaningful responses. This model is a key component of our semantic search engine, allowing it to go beyond simple keyword matching and truly understand the intent behind a query. By creating this language model, we equip our search engine with the ability to interpret complex queries, understand the nuances of language, and provide more accurate and contextually relevant responses.\n", + "\n", + "The language model's ability to understand context and generate coherent responses is what makes our search engine truly intelligent. 
It can not only find the right information but also present it in a way that is useful and understandable to the user.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "id": "yRAfBRLH9RpO" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "6bp8YEEQiL0r" - }, - "source": [ - "# Retrieval-Augmented Generation (RAG) with Couchbase and Langchain\n", - "Couchbase and LangChain can be seamlessly integrated to create RAG (Retrieval-Augmented Generation) chains, enhancing the process of generating contextually relevant responses. In this setup, Couchbase serves as the vector store, where embeddings of documents are stored. When a query is made, LangChain retrieves the most relevant documents from Couchbase by comparing the query’s embedding with the stored document embeddings. These documents, which provide contextual information, are then passed to a generative language model within LangChain.\n", - "\n", - "The language model, equipped with the context from the retrieved documents, generates a response that is both informed and contextually accurate. This integration allows the RAG chain to leverage Couchbase’s efficient storage and retrieval capabilities, while LangChain handles the generation of responses based on the context provided by the retrieved documents. Together, they create a powerful system that can deliver highly relevant and accurate answers by combining the strengths of both retrieval and generation." - ] - }, + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-23 10:50:22,466 - INFO - Successfully created JinaChat\n" + ] + } + ], + "source": [ + "try:\n", + " llm = JinaChat(temperature=0.1, jinachat_api_key=JINACHAT_API_KEY)\n", + " logging.info(\"Successfully created JinaChat\")\n", + "except Exception as e:\n", + " logging.error(f\"Error creating JinaChat: {str(e)}. 
Please check your API key and network connection.\")\n",
+ " raise"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "afOOEECGiLuQ"
+ },
+ "source": [
+ "## Perform Semantic Search\n",
+ "Semantic search in Couchbase involves converting queries and documents into vector representations using an embeddings model. These vectors capture the semantic meaning of the text and are stored directly in Couchbase. When a query is made, Couchbase performs a similarity search by comparing the query vector against the stored document vectors. The similarity metric used for this comparison is configurable, allowing flexibility in how the relevance of documents is determined.\n",
+ "\n",
+ "In the provided code, the search process begins by recording the start time, followed by executing the similarity_search_with_score method of the CouchbaseQueryVectorStore. This method searches Couchbase for the most relevant documents based on the vector similarity to the query. The search results include the document content and a similarity score that reflects how closely each document aligns with the query in the defined semantic space. The time taken to perform this search is then calculated and logged, and the results are displayed, showing the most relevant documents along with their similarity scores. This approach leverages Couchbase as both a storage and retrieval engine for vector data, enabling efficient and scalable semantic searches. The integration of vector storage and search capabilities within Couchbase allows for sophisticated semantic search operations without relying on external services for vector storage or comparison.\n",
+ "\n",
+ "### Note on Retry Mechanism\n",
+ "The search implementation includes a retry mechanism to handle rate limiting and API errors gracefully. 
If a rate limit error (HTTP 429) is encountered, the system will automatically retry the request up to 3 times with exponential backoff, waiting 2 seconds initially and doubling the wait time between each retry. This helps manage API usage limits while maintaining service reliability. For other types of errors, such as payment requirements or general failures, appropriate error messages and troubleshooting steps are provided to help diagnose and resolve the issue." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "id": "y3oO33_LiLxU" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "id": "fTolIHFpiL30" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-23 10:50:26,937 - INFO - Successfully created RAG chain\n" - ] - } - ], - "source": [ - "try:\n", - " template = \"\"\"You are a helpful bot. If you cannot answer based on the context provided, respond with a generic answer. Answer the question as truthfully as possible using the context below:\n", - " {context}\n", - "\n", - " Question: {question}\"\"\"\n", - " prompt = ChatPromptTemplate.from_template(template)\n", - "\n", - " rag_chain = (\n", - " {\"context\": vector_store.as_retriever(search_kwargs={\"k\": 2}), \"question\": RunnablePassthrough()}\n", - " | prompt\n", - " | llm\n", - " | StrOutputParser()\n", - " )\n", - " logging.info(\"Successfully created RAG chain\")\n", - "except Exception as e:\n", - " raise ValueError(f\"Error creating RAG chain: {str(e)}\")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-23 10:50:25,678 - INFO - Semantic search completed in 2.13 seconds\n" + ] }, { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "id": "6GbtJzTEiL7M" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-23 10:50:47,733 - INFO - RAG response generated in 17.23 seconds using k=2\n" - ] - }, - { - "name": "stdout", 
- "output_type": "stream", - "text": [ - "RAG Response: Pep Guardiola has been grappling with self-doubt and seeking support to navigate Manchester City's current crisis.\n", - "Response generated in 17.23 seconds\n" - ] - } - ], - "source": [ - "try:\n", - " # Create chain with k=2\n", - " # Start with k=4 and gradually reduce if token limit exceeded\n", - " # k=4 -> k=3 -> k=2 based on token limit warnings\n", - " # Final k=2 produced valid response about Guardiola in 2.33 seconds\n", - " current_chain = (\n", - " {\n", - " \"context\": vector_store.as_retriever(search_kwargs={\"k\": 2}),\n", - " \"question\": RunnablePassthrough()\n", - " }\n", - " | prompt\n", - " | llm\n", - " | StrOutputParser()\n", - " )\n", - " \n", - " # Try to get response\n", - " start_time = time.time()\n", - " rag_response = current_chain.invoke(query)\n", - " elapsed_time = time.time() - start_time\n", - " \n", - " logging.info(f\"RAG response generated in {elapsed_time:.2f} seconds using k=2\")\n", - " print(f\"RAG Response: {rag_response}\")\n", - " print(f\"Response generated in {elapsed_time:.2f} seconds\")\n", - " \n", - "except Exception as e:\n", - " if \"Payment Required\" in str(e):\n", - " logging.error(\"Payment required for Jina AI API. Please check your subscription status and API key.\")\n", - " print(\"To resolve this error:\")\n", - " print(\"1. Visit 'https://jina.ai/reader/#pricing' to review subscription options\")\n", - " print(\"2. Ensure your API key is valid and has sufficient credits\")\n", - " print(\"3. 
Consider upgrading your subscription plan if needed\")\n", - " else:\n", - " raise RuntimeError(f\"Unexpected error: {str(e)}\")" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Semantic Search Results (completed in 2.13 seconds):\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.6798, Text: 'Self-doubt, errors & big changes' - inside the crisis at Man City\n", + "\n", + "Pep Guardiola has not been through a moment like this in his managerial career. Manchester City have lost nine matches in their past 12 - as many defeats as they had suffered in their previous 106 fixtures. At the end of October, City were still unbeaten at the top of the Premier League and favourites to win a fifth successive title. Now they are seventh, 12 points behind leaders Liverpool having played a game more. It has been an incredible fall from grace and left people trying to work out what has happened - and whether Guardiola can make it right. After discussing the situation with those who know him best, I have taken a closer look at the future - both short and long term - and how the current crisis at Man City is going to be solved.\n", + "\n", + "Pep Guardiola's Man City have lost nine of their past 12 matches\n", + "\n", + "Guardiola has also been giving it a lot of thought. He has not been sleeping very well, as he has said, and has not been himself at times when talking to the media. He has been talking to a lot of people about what is going on as he tries to work out the reasons for City's demise. Some reasons he knows, others he still doesn't. What people perhaps do not realise is Guardiola hugely doubts himself and always has. He will be thinking \"I'm not going to be able to get us out of this\" and needs the support of people close to him to push away those insecurities - and he has that. 
He is protected by his people who are very aware, like he is, that there are a lot of people that want City to fail. It has been a turbulent time for Guardiola. Remember those marks he had on his head after the 3-3 draw with Feyenoord in the Champions League? He always scratches his head, it is a gesture of nervousness. Normally nothing happens but on that day one of his nails was far too sharp so, after talking to the players in the changing room where he scratched his head because of his usual agitated gesturing, he went to the news conference. His right-hand man Manel Estiarte sent him photos in a message saying \"what have you got on your head?\", but by the time Guardiola returned to the coaching room there was hardly anything there again. He started that day with a cover on his nose after the same thing happened at the training ground the day before. Guardiola was having a footballing debate with Kyle Walker about positional stuff and marked his nose with that same nail. There was also that remarkable news conference after the Manchester derby when he said \"I don't know what to do\". That is partly true and partly not true. Ignore the fact Guardiola suggested he was \"not good enough\". He actually meant he was not good enough to resolve the situation with the group of players he has available and with all the other current difficulties. There are obviously logical explanations for the crisis and the first one has been talked about many times - the absence of injured midfielder Rodri. You know the game Jenga? When you take the wrong piece out, the whole tower collapses. That is what has happened here. It is normal for teams to have an over-reliance on one player if he is the best in the world in his position. And you cannot calculate the consequences of an injury that rules someone like Rodri out for the season. City are a team, like many modern ones, in which the holding midfielder is a key element to the construction. 
So, when you take Rodri out, it is difficult to hold it together. There were Plan Bs - John Stones, Manuel Akanji, even Nathan Ake - but injuries struck. The big injury list has been out of the ordinary and the busy calendar has also played a part in compounding the issues. However, one factor even Guardiola cannot explain is the big uncharacteristic errors in almost every game from international players. Why did Matheus Nunes make that challenge to give away the penalty against Manchester United? Jack Grealish is sent on at the end to keep the ball and cannot do that. There are errors from Walker and other defenders. These are some of the best players in the world. Of course the players' mindset is important, and confidence is diminishing. Wrong decisions get taken so there is almost panic on the pitch instead of calm. There are also players badly out of form who are having to play because of injuries. Walker is now unable to hide behind his pace, I'm not sure Kevin de Bruyne is ever getting back to the level he used to be at, Bernardo Silva and Ilkay Gundogan do not have time to rest, Grealish is not playing at his best. Some of these players were only meant to be playing one game a week but, because of injuries, have played 12 games in 40 days. It all has a domino effect. One consequence is that Erling Haaland isn't getting the service to score. But the Norwegian still remains City's top-scorer with 13. Defender Josko Gvardiol is next on the list with just four. The way their form has been analysed inside the City camp is there have only been three games where they deserved to lose (Liverpool, Bournemouth and Aston Villa). But of course it is time to change the dynamic.\n", + "\n", + "Guardiola has never protected his players so much. He has not criticised them and is not going to do so. They have won everything with him. Instead of doing more with them, he has tried doing less. 
He has sometimes given them more days off to clear their heads, so they can reset - two days this week for instance. Perhaps the time to change a team is when you are winning, but no-one was suggesting Man City were about to collapse when they were top and unbeaten after nine league games. Some people have asked how bad it has to get before City make a decision on Guardiola. The answer is that there is no decision to be made. Maybe if this was Real Madrid, Barcelona or Juventus, the pressure from outside would be massive and the argument would be made that Guardiola has to go. At City he has won the lot, so how can anyone say he is failing? Yes, this is a crisis. But given all their problems, City's renewed target is finishing in the top four. That is what is in all their heads now. The idea is to recover their essence by improving defensive concepts that are not there and re-establishing the intensity they are known for. Guardiola is planning to use the next two years of his contract, which is expected to be his last as a club manager, to prepare a new Manchester City. When he was at the end of his four years at Barcelona, he asked two managers what to do when you feel people are not responding to your instructions. Do you go or do the players go? Sir Alex Ferguson and Rafael Benitez both told him that the players need to go. Guardiola did not listen because of his emotional attachment to his players back then and he decided to leave the Camp Nou because he felt the cycle was over. He will still protect his players now but there is not the same emotional attachment - so it is the players who are going to leave this time. It is likely City will look to replace five or six regular starters. Guardiola knows it is the end of an era and the start of a new one. Changes will not be immediate and the majority of the work will be done in the summer. But they are open to any opportunities in January - and a holding midfielder is one thing they need. 
In the summer City might want to get Spain's Martin Zubimendi from Real Sociedad and they know 60m euros (£50m) will get him. He said no to Liverpool last summer even though everything was agreed, but he now wants to move on and the Premier League is the target. Even if they do not get Zubimendi, that is the calibre of footballer they are after. A new Manchester City is on its way - with changes driven by Guardiola, incoming sporting director Hugo Viana and the football department.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.6795, Text: 'Self-doubt, errors & big changes' - inside the crisis at Man City\n", + "\n", + "Pep Guardiola has not been through a moment like this in his managerial career. Manchester City have lost nine matches in their past 12 - as many defeats as they had suffered in their previous 106 fixtures. At the end of October, City were still unbeaten at the top of the Premier League and favourites to win a fifth successive title. Now they are seventh, 12 points behind leaders Liverpool having played a game more. It has been an incredible fall from grace and left people trying to work out what has happened - and whether Guardiola can make it right. After discussing the situation with those who know him best, I have taken a closer look at the future - both short and long term - and how the current crisis at Man City is going to be solved.\n", + "\n", + "Pep Guardiola's Man City have lost nine of their past 12 matches\n", + "\n", + "Guardiola has also been giving it a lot of thought. He has not been sleeping very well, as he has said, and has not been himself at times when talking to the media. He has been talking to a lot of people about what is going on as he tries to work out the reasons for City's demise. Some reasons he knows, others he still doesn't. What people perhaps do not realise is Guardiola hugely doubts himself and always has. 
He will be thinking \"I'm not going to be able to get us out of this\" and needs the support of people close to him to push away those insecurities - and he has that. He is protected by his people who are very aware, like he is, that there are a lot of people that want City to fail. It has been a turbulent time for Guardiola. Remember those marks he had on his head after the 3-3 draw with Feyenoord in the Champions League? He always scratches his head, it is a gesture of nervousness. Normally nothing happens but on that day one of his nails was far too sharp so, after talking to the players in the changing room where he scratched his head because of his usual agitated gesturing, he went to the news conference. His right-hand man Manel Estiarte sent him photos in a message saying \"what have you got on your head?\", but by the time Guardiola returned to the coaching room there was hardly anything there again. He started that day with a cover on his nose after the same thing happened at the training ground the day before. Guardiola was having a footballing debate with Kyle Walker about positional stuff and marked his nose with that same nail. There was also that remarkable news conference after the Manchester derby when he said \"I don't know what to do\". That is partly true and partly not true. Ignore the fact Guardiola suggested he was \"not good enough\". He actually meant he was not good enough to resolve the situation with the group of players he has available and with all the other current difficulties. There are obviously logical explanations for the crisis and the first one has been talked about many times - the absence of injured midfielder Rodri. You know the game Jenga? When you take the wrong piece out, the whole tower collapses. That is what has happened here. It is normal for teams to have an over-reliance on one player if he is the best in the world in his position. 
And you cannot calculate the consequences of an injury that rules someone like Rodri out for the season. City are a team, like many modern ones, in which the holding midfielder is a key element to the construction. So, when you take Rodri out, it is difficult to hold it together. There were Plan Bs - John Stones, Manuel Akanji, even Nathan Ake - but injuries struck. The big injury list has been out of the ordinary and the busy calendar has also played a part in compounding the issues. However, one factor even Guardiola cannot explain is the big uncharacteristic errors in almost every game from international players. Why did Matheus Nunes make that challenge to give away the penalty against Manchester United? Jack Grealish is sent on at the end to keep the ball and cannot do that. There are errors from Walker and other defenders. These are some of the best players in the world. Of course the players' mindset is important, and confidence is diminishing. Wrong decisions get taken so there is almost panic on the pitch instead of calm. There are also players badly out of form who are having to play because of injuries. Walker is now unable to hide behind his pace, I'm not sure Kevin de Bruyne is ever getting back to the level he used to be at, Bernardo Silva and Ilkay Gundogan do not have time to rest, Grealish is not playing at his best. Some of these players were only meant to be playing one game a week but, because of injuries, have played 12 games in 40 days. It all has a domino effect. One consequence is that Erling Haaland isn't getting the service to score. But the Norwegian still remains City's top-scorer with 13. Defender Josko Gvardiol is next on the list with just four. The way their form has been analysed inside the City camp is there have only been three games where they deserved to lose (Liverpool, Bournemouth and Aston Villa). But of course it is time to change the dynamic.\n", + "\n", + "Guardiola has never protected his players so much. 
He has not criticised them and is not going to do so. They have won everything with him. Instead of doing more with them, he has tried doing less. He has sometimes given them more days off to clear their heads, so they can reset - two days this week for instance. Perhaps the time to change a team is when you are winning, but no-one was suggesting Man City were about to collapse when they were top and unbeaten after nine league games. Some people have asked how bad it has to get before City make a decision on Guardiola. The answer is that there is no decision to be made. Maybe if this was Real Madrid, Barcelona or Juventus, the pressure from outside would be massive and the argument would be made that Guardiola has to go. At City he has won the lot, so how can anyone say he is failing? Yes, this is a crisis. But given all their problems, City's renewed target is finishing in the top four. That is what is in all their heads now. The idea is to recover their essence by improving defensive concepts that are not there and re-establishing the intensity they are known for. Guardiola is planning to use the next two years of his contract, which is expected to be his last as a club manager, to prepare a new Manchester City. When he was at the end of his four years at Barcelona, he asked two managers what to do when you feel people are not responding to your instructions. Do you go or do the players go? Sir Alex Ferguson and Rafael Benitez both told him that the players need to go. Guardiola did not listen because of his emotional attachment to his players back then and he decided to leave the Camp Nou because he felt the cycle was over. He will still protect his players now but there is not the same emotional attachment - so it is the players who are going to leave this time. It is likely City will look to replace five or six regular starters. Guardiola knows it is the end of an era and the start of a new one. 
Changes will not be immediate and the majority of the work will be done in the summer. But they are open to any opportunities in January - and a holding midfielder is one thing they need. In the summer City might want to get Spain's Martin Zubimendi from Real Sociedad and they know 60m euros (£50m) will get him. He said no to Liverpool last summer even though everything was agreed, but he now wants to move on and the Premier League is the target. Even if they do not get Zubimendi, that is the calibre of footballer they are after. A new Manchester City is on its way - with changes driven by Guardiola, incoming sporting director Hugo Viana and the football department.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.6207, Text: Manchester City boss Pep Guardiola has won 18 trophies since he arrived at the club in 2016\n", + "\n", + "\n", + "... (output truncated for brevity)\n" + ] + } + ], + "source": [ + "def perform_semantic_search(query, vector_store, max_retries=3, retry_delay=2): \n", + " for attempt in range(max_retries):\n", + " try:\n", + " start_time = time.time()\n", + " search_results = vector_store.similarity_search_with_score(query, k=5)\n", + " search_elapsed_time = time.time() - start_time\n", + " \n", + " logging.info(f\"Semantic search completed in {search_elapsed_time:.2f} seconds\")\n", + " return search_results, search_elapsed_time\n", + " \n", + " except Exception as e:\n", + " error_str = str(e)\n", + " \n", + " # Check if it's a rate limit error (HTTP 429)\n", + " if \"http_status: 429\" in error_str or \"query request rejected\" in error_str:\n", + " logging.warning(f\"Rate limit hit (attempt {attempt+1}/{max_retries}). Waiting {retry_delay} seconds...\")\n", + " time.sleep(retry_delay)\n", + " retry_delay *= 2 # Exponential backoff\n", + " \n", + " if attempt == max_retries - 1:\n", + " logging.error(\"Maximum retry attempts reached. 
API rate limit exceeded.\")\n", + " raise RuntimeError(\"API rate limit exceeded. Please try again later or check your subscription.\")\n", + " else:\n", + " # For other errors, don't retry\n", + " logging.error(f\"Search error: {error_str}\")\n", + " if \"Payment Required\" in error_str:\n", + " raise RuntimeError(\"Payment required for Jina AI API. Please check your subscription status and API key.\")\n", + " else:\n", + " raise RuntimeError(f\"Search failed: {error_str}\")\n", + "\n", + "try:\n", + " query = \"What was manchester city manager pep guardiola's reaction to the team's current form?\"\n", + " search_results, search_elapsed_time = perform_semantic_search(query, vector_store)\n", + " \n", + " # Display search results\n", + " print(f\"\\nSemantic Search Results (completed in {search_elapsed_time:.2f} seconds):\")\n", + " print(\"-\"*80)\n", + " for doc, score in search_results:\n", + " print(f\"Score: {score:.4f}, Text: {doc.page_content}\")\n", + " print(\"-\"*80)\n", + " \n", + "except RuntimeError as e:\n", + " print(f\"Error: {str(e)}\")\n", + " print(\"\\nTroubleshooting steps:\")\n", + " if \"API rate limit\" in str(e):\n", + " print(\"1. Wait a few minutes before trying again\")\n", + " print(\"2. Reduce the frequency of your requests\")\n", + " print(\"3. Consider upgrading your Jina AI plan for higher rate limits\")\n", + " elif \"Payment required\" in str(e):\n", + " print(\"1. Visit 'https://jina.ai/reader/#pricing' to review subscription options\")\n", + " print(\"2. Ensure your API key is valid and has sufficient credits\")\n", + " print(\"3. Update your API key configuration\")\n", + " else:\n", + " print(\"1. Check your network connection\")\n", + " print(\"2. Verify your Couchbase and Jina configurations\")\n", + " print(\"3. 
Review the vector store implementation for any bugs\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6bp8YEEQiL0r" + }, + "source": [ + "# Retrieval-Augmented Generation (RAG) with Couchbase and Langchain\n", + "Couchbase and LangChain can be seamlessly integrated to create RAG (Retrieval-Augmented Generation) chains, enhancing the process of generating contextually relevant responses. In this setup, Couchbase serves as the vector store, where embeddings of documents are stored. When a query is made, LangChain retrieves the most relevant documents from Couchbase by comparing the query’s embedding with the stored document embeddings. These documents, which provide contextual information, are then passed to a generative language model within LangChain.\n", + "\n", + "The language model, equipped with the context from the retrieved documents, generates a response that is both informed and contextually accurate. This integration allows the RAG chain to leverage Couchbase’s efficient storage and retrieval capabilities, while LangChain handles the generation of responses based on the context provided by the retrieved documents. Together, they create a powerful system that can deliver highly relevant and accurate answers by combining the strengths of both retrieval and generation." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "id": "fTolIHFpiL30" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "T8hCgpMyiL-J" - }, - "source": [ - "# Using Couchbase as a caching mechanism\n", - "Couchbase can be effectively used as a caching mechanism for RAG (Retrieval-Augmented Generation) responses by storing and retrieving precomputed results for specific queries. This approach enhances the system's efficiency and speed, particularly when dealing with repeated or similar queries. 
When a query is first processed, the RAG chain retrieves relevant documents, generates a response using the language model, and then stores this response in Couchbase, with the query serving as the key.\n", - "\n", - "For subsequent requests with the same query, the system checks Couchbase first. If a cached response is found, it is retrieved directly from Couchbase, bypassing the need to re-run the entire RAG process. This significantly reduces response time because the computationally expensive steps of document retrieval and response generation are skipped. Couchbase's role in this setup is to provide a fast and scalable storage solution for caching these responses, ensuring that frequently asked queries can be answered more quickly and efficiently." - ] - }, + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-23 10:50:26,937 - INFO - Successfully created RAG chain\n" + ] + } + ], + "source": [ + "try:\n", + " template = \"\"\"You are a helpful bot. If you cannot answer based on the context provided, respond with a generic answer. 
Answer the question as truthfully as possible using the context below:\n", + " {context}\n", + "\n", + " Question: {question}\"\"\"\n", + " prompt = ChatPromptTemplate.from_template(template)\n", + "\n", + " rag_chain = (\n", + " {\"context\": vector_store.as_retriever(search_kwargs={\"k\": 2}), \"question\": RunnablePassthrough()}\n", + " | prompt\n", + " | llm\n", + " | StrOutputParser()\n", + " )\n", + " logging.info(\"Successfully created RAG chain\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Error creating RAG chain: {str(e)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "id": "6GbtJzTEiL7M" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "id": "c10Qzeq2Q8N7" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Query 1: What happened in the match between Fullham and Liverpool?\n", - "Response: Fulham and Liverpool played to a 2-2 draw at Anfield, with both teams showcasing strong performances.\n", - "Time taken: 5.13 seconds\n", - "\n", - "Query 2: What was manchester city manager pep guardiola's reaction to the team's current form?\n", - "Response: Pep Guardiola has been grappling with self-doubt and seeking support to navigate Manchester City's current crisis.\n", - "Time taken: 2.16 seconds\n", - "\n", - "Query 3: What happened in the match between Fullham and Liverpool?\n", - "Response: Fulham and Liverpool played to a 2-2 draw at Anfield, with both teams showcasing strong performances.\n", - "Time taken: 1.95 seconds\n" - ] - } - ], - "source": [ - "try:\n", - " queries = [\n", - " \"What happened in the match between Fullham and Liverpool?\",\n", - " \"What was manchester city manager pep guardiola's reaction to the team's current form?\", # Repeated query\n", - " \"What happened in the match between Fullham and Liverpool?\", # Repeated query\n", - " ]\n", - "\n", - " for i, query in enumerate(queries, 1):\n", - " 
print(f\"\\nQuery {i}: {query}\")\n", - " start_time = time.time()\n", - " response = rag_chain.invoke(query)\n", - " elapsed_time = time.time() - start_time\n", - " print(f\"Response: {response}\")\n", - " \n", - " print(f\"Time taken: {elapsed_time:.2f} seconds\")\n", - "except Exception as e:\n", - " if \"Payment Required\" in str(e):\n", - " logging.error(\"Payment required for Jina AI API. Please check your subscription status and API key.\")\n", - " print(\"To resolve this error:\")\n", - " print(\"1. Visit 'https://jina.ai/reader/#pricing' to review subscription options\")\n", - " print(\"2. Ensure your API key is valid and has sufficient credits\")\n", - " print(\"3. Consider upgrading your subscription plan if needed\")\n", - " else:\n", - " raise RuntimeError(f\"Unexpected error: {str(e)}\")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-23 10:50:47,733 - INFO - RAG response generated in 17.23 seconds using k=2\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "yJQ5P8E29go1" - }, - "source": [ - "## Conclusion\n", - "By following these steps, you’ll have a fully functional semantic search engine that leverages the strengths of Couchbase and Jina. This guide is designed not just to show you how to build the system, but also to explain why each step is necessary, giving you a deeper understanding of the principles behind semantic search and how to implement it effectively. Whether you’re a newcomer to software development or an experienced developer looking to expand your skills, this guide will provide you with the knowledge and tools you need to create a powerful, AI-driven search engine." 
- ] + "name": "stdout", + "output_type": "stream", + "text": [ + "RAG Response: Pep Guardiola has been grappling with self-doubt and seeking support to navigate Manchester City's current crisis.\n", + "Response generated in 17.23 seconds\n" + ] } - ], - "metadata": { - "colab": { - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.7" + ], + "source": [ + "try:\n", + " # Create chain with k=2\n", + " # Start with k=4 and gradually reduce if token limit exceeded\n", + " # k=4 -> k=3 -> k=2 based on token limit warnings\n", + " # Final k=2 produced valid response about Guardiola in 2.33 seconds\n", + " current_chain = (\n", + " {\n", + " \"context\": vector_store.as_retriever(search_kwargs={\"k\": 2}),\n", + " \"question\": RunnablePassthrough()\n", + " }\n", + " | prompt\n", + " | llm\n", + " | StrOutputParser()\n", + " )\n", + " \n", + " # Try to get response\n", + " start_time = time.time()\n", + " rag_response = current_chain.invoke(query)\n", + " elapsed_time = time.time() - start_time\n", + " \n", + " logging.info(f\"RAG response generated in {elapsed_time:.2f} seconds using k=2\")\n", + " print(f\"RAG Response: {rag_response}\")\n", + " print(f\"Response generated in {elapsed_time:.2f} seconds\")\n", + " \n", + "except Exception as e:\n", + " if \"Payment Required\" in str(e):\n", + " logging.error(\"Payment required for Jina AI API. Please check your subscription status and API key.\")\n", + " print(\"To resolve this error:\")\n", + " print(\"1. Visit 'https://jina.ai/reader/#pricing' to review subscription options\")\n", + " print(\"2. Ensure your API key is valid and has sufficient credits\")\n", + " print(\"3. 
Consider upgrading your subscription plan if needed\")\n", + " else:\n", + " raise RuntimeError(f\"Unexpected error: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8hCgpMyiL-J" + }, + "source": [ + "# Using Couchbase as a caching mechanism\n", + "Couchbase can be effectively used as a caching mechanism for RAG (Retrieval-Augmented Generation) responses by storing and retrieving precomputed results for specific queries. This approach enhances the system's efficiency and speed, particularly when dealing with repeated or similar queries. When a query is first processed, the RAG chain retrieves relevant documents, generates a response using the language model, and then stores this response in Couchbase, with the query serving as the key.\n", + "\n", + "For subsequent requests with the same query, the system checks Couchbase first. If a cached response is found, it is retrieved directly from Couchbase, bypassing the need to re-run the entire RAG process. This significantly reduces response time because the computationally expensive steps of document retrieval and response generation are skipped. Couchbase's role in this setup is to provide a fast and scalable storage solution for caching these responses, ensuring that frequently asked queries can be answered more quickly and efficiently." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "id": "c10Qzeq2Q8N7" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Query 1: What happened in the match between Fullham and Liverpool?\n", + "Response: Fulham and Liverpool played to a 2-2 draw at Anfield, with both teams showcasing strong performances.\n", + "Time taken: 5.13 seconds\n", + "\n", + "Query 2: What was manchester city manager pep guardiola's reaction to the team's current form?\n", + "Response: Pep Guardiola has been grappling with self-doubt and seeking support to navigate Manchester City's current crisis.\n", + "Time taken: 2.16 seconds\n", + "\n", + "Query 3: What happened in the match between Fullham and Liverpool?\n", + "Response: Fulham and Liverpool played to a 2-2 draw at Anfield, with both teams showcasing strong performances.\n", + "Time taken: 1.95 seconds\n" + ] } + ], + "source": [ + "try:\n", + " queries = [\n", + " \"What happened in the match between Fullham and Liverpool?\",\n", + " \"What was manchester city manager pep guardiola's reaction to the team's current form?\", # Repeated query\n", + " \"What happened in the match between Fullham and Liverpool?\", # Repeated query\n", + " ]\n", + "\n", + " for i, query in enumerate(queries, 1):\n", + " print(f\"\\nQuery {i}: {query}\")\n", + " start_time = time.time()\n", + " response = rag_chain.invoke(query)\n", + " elapsed_time = time.time() - start_time\n", + " print(f\"Response: {response}\")\n", + " \n", + " print(f\"Time taken: {elapsed_time:.2f} seconds\")\n", + "except Exception as e:\n", + " if \"Payment Required\" in str(e):\n", + " logging.error(\"Payment required for Jina AI API. Please check your subscription status and API key.\")\n", + " print(\"To resolve this error:\")\n", + " print(\"1. Visit 'https://jina.ai/reader/#pricing' to review subscription options\")\n", + " print(\"2. 
Ensure your API key is valid and has sufficient credits\")\n", + " print(\"3. Consider upgrading your subscription plan if needed\")\n", + " else:\n", + " raise RuntimeError(f\"Unexpected error: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yJQ5P8E29go1" + }, + "source": [ + "## Conclusion\n", + "By following these steps, you’ll have a fully functional semantic search engine that leverages the strengths of Couchbase and Jina. This guide is designed not just to show you how to build the system, but also to explain why each step is necessary, giving you a deeper understanding of the principles behind semantic search and how to implement it effectively. Whether you’re a newcomer to software development or an experienced developer looking to expand your skills, this guide will provide you with the knowledge and tools you need to create a powerful, AI-driven search engine." + ] + } + ], + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.14.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } From 61aac0f1abe48967b6a20d23130ac5560c96224d Mon Sep 17 00:00:00 2001 From: giriraj-singh-couchbase Date: Tue, 2 Dec 2025 15:23:06 +0530 Subject: [PATCH 5/5] Increased short title length and updated frontmatter removing FTS term --- .../RAG_with_Couchbase_and_Jina_AI.ipynb | 18 +++++++++--------- jinaai/query_based/frontmatter.md | 2 +- jinaai/search_based/frontmatter.md | 6 +++--- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/jinaai/query_based/RAG_with_Couchbase_and_Jina_AI.ipynb 
b/jinaai/query_based/RAG_with_Couchbase_and_Jina_AI.ipynb index fee95f15..28bbfc92 100644 --- a/jinaai/query_based/RAG_with_Couchbase_and_Jina_AI.ipynb +++ b/jinaai/query_based/RAG_with_Couchbase_and_Jina_AI.ipynb @@ -910,7 +910,7 @@ "id": "c0dc252f", "metadata": {}, "source": [ - "### Create Hyperscale/Composite Vector Index" + "### Create Hyperscale Vector Index" ] }, { @@ -918,7 +918,7 @@ "id": "926f3cb6", "metadata": {}, "source": [ - "Now let's create a Hyperscale/Composite vector index to enable high-performance vector searches. The index creation is done programmatically through the vector store." + "Now let's create a Hyperscale vector index to enable high-performance vector searches. The index creation is done programmatically through the vector store." ] }, { @@ -931,21 +931,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "Creating Hyperscale/Composite vector index...\n", - "Hyperscale/Composite vector index created successfully\n", + "Creating Hyperscale vector index...\n", + "Hyperscale vector index created successfully\n", "Waiting for index to become available...\n" ] } ], "source": [ "# Create Hyperscale Vector Index for high-performance searches\n", - "print(\"Creating Hyperscale/Composite vector index...\")\n", + "print(\"Creating Hyperscale vector index...\")\n", "try:\n", " vector_store.create_index(\n", " index_type=IndexType.HYPERSCALE, # Use IndexType.COMPOSITE for Composite index\n", " index_description=\"IVF,SQ8\"\n", " )\n", - " print(\"Hyperscale/Composite vector index created successfully\")\n", + " print(\"Hyperscale vector index created successfully\")\n", " \n", " # Wait for index to become available\n", " print(\"Waiting for index to become available...\")\n", @@ -953,9 +953,9 @@ " \n", "except Exception as e:\n", " if \"already exists\" in str(e).lower():\n", - " print(\"Hyperscale/Composite vector index already exists, proceeding...\")\n", + " print(\"Hyperscale vector index already exists, proceeding...\")\n", " 
else:\n",
-    "        print(f\"Error creating Hyperscale/Composite vector index: {str(e)}\")"
+    "        print(f\"Error creating Hyperscale vector index: {str(e)}\")"
   ]
  },
  {
@@ -987,7 +987,7 @@
    "id": "d3e24394",
    "metadata": {},
    "source": [
-    "### Test 2: Hyperscale and Composite vector indexes Optimized Performance"
+    "### Test 2: Hyperscale vector index Optimized Performance"
   ]
  },
  {
diff --git a/jinaai/query_based/frontmatter.md b/jinaai/query_based/frontmatter.md
index 21f67dd8..6cf8605a 100644
--- a/jinaai/query_based/frontmatter.md
+++ b/jinaai/query_based/frontmatter.md
@@ -2,7 +2,7 @@
 # frontmatter
 path: "/tutorial-jina-couchbase-rag-with-hyperscale-or-composite-vector-index"
 title: Retrieval-Augmented Generation (RAG) with Jina AI using Couchbase Hyperscale and Composite Vector Index
-short_title: RAG with Couchbase and Jina AI
+short_title: RAG with Jina AI using Couchbase Hyperscale and Composite Vector Index
 description:
   - Learn how to build a semantic search engine using Couchbase and Jina.
   - This tutorial demonstrates how to integrate Couchbase's vector search capabilities with Jina embeddings and language models.
diff --git a/jinaai/search_based/frontmatter.md b/jinaai/search_based/frontmatter.md
index fb6feba3..f1b9c32a 100644
--- a/jinaai/search_based/frontmatter.md
+++ b/jinaai/search_based/frontmatter.md
@@ -1,12 +1,12 @@
 ---
 # frontmatter
 path: "/tutorial-jina-couchbase-rag-with-search-vector-index"
-title: Retrieval-Augmented Generation (RAG) with Couchbase and Jina AI using FTS
-short_title: RAG with Couchbase and Jina
+title: Retrieval-Augmented Generation (RAG) with Jina AI using Couchbase Search Vector Index
+short_title: RAG with Couchbase Search Vector Index and Jina
 description:
   - Learn how to build a semantic search engine using Couchbase and Jina.
   - This tutorial demonstrates how to integrate Couchbase's vector search capabilities with Jina embeddings and language models.
- - You'll understand how to perform Retrieval-Augmented Generation (RAG) using LangChain and Couchbase using FTS. + - You'll understand how to perform Retrieval-Augmented Generation (RAG) using LangChain and Couchbase using Search Vector Index. content_type: tutorial filter: sdk technology: