From 6eb8a5f3ebf8af79b326f998a44da80c83b4fbeb Mon Sep 17 00:00:00 2001
From: Shwetha Rao
Date: Fri, 5 Dec 2025 12:46:16 +0530
Subject: [PATCH 1/4] first commit

---
 .../manage-backup-and-restore/autovec.adoc | 248 ++++++++++++++++++
 preview/backup-restore-preview.yml         |  28 ++
 2 files changed, 276 insertions(+)
 create mode 100644 modules/manage/pages/manage-backup-and-restore/autovec.adoc
 create mode 100644 preview/backup-restore-preview.yml

diff --git a/modules/manage/pages/manage-backup-and-restore/autovec.adoc b/modules/manage/pages/manage-backup-and-restore/autovec.adoc
new file mode 100644
index 0000000000..55f2f70629
--- /dev/null
+++ b/modules/manage/pages/manage-backup-and-restore/autovec.adoc
@@ -0,0 +1,248 @@
+= Auto-Vectorization
+:description: Automatically generate vector embeddings for your documents in Couchbase Capella.
+:page-topic-type: concept
+
+[abstract]
+Auto-Vectorization (AutoVec) automatically generates vector embeddings for documents stored in Couchbase Capella, enabling semantic search and RAG (Retrieval-Augmented Generation) pipelines without manual embedding management.
+
+== Introduction
+
+Auto-Vectorization is a Couchbase Capella feature that automatically converts your structured and unstructured data into vector embeddings.
+These embeddings enable powerful semantic search capabilities and form the foundation for building RAG applications.
+
+=== Key benefits
+
+* *Automatic processing* - Documents are vectorized automatically as they are inserted or updated
+* *No code required* - Configure vectorization through the Capella UI without writing embedding code
+* *Scalable architecture* - Built on the Eventing Service for high-throughput processing
+* *Multiple embedding providers* - Support for OpenAI, AWS Bedrock, and AI Gateway
+
+=== Use cases
+
+* Semantic search across document collections
+* Building RAG pipelines for AI applications
+* Content similarity matching
+* Intelligent document retrieval
+
+== Vectorization
+
+Auto-Vectorization uses the Eventing Service to process document mutations and generate embeddings in real time.
+
+=== How it works
+
+. A document is inserted or updated in the source collection
+. The Eventing worker receives the mutation via DCP (Database Change Protocol)
+. The worker checks whether the document needs vectorization by comparing CRC checksums, so unchanged content is not re-embedded
+. Documents are batched together for efficient API calls
+. The embedding service generates vectors for the batch
+. Embeddings are written back to the source document
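+
+A short sketch can make this flow concrete.
+The following is not the actual worker (Eventing functions are written in JavaScript and run inside the Eventing Service); it is a hypothetical Python outline of the per-mutation logic, and the `flush_batch` helper, the `embedding_crc` field, and the module-level `batch` list are illustrative assumptions only.
+
+[source,python]
+----
+import zlib
+
+BATCH_SIZE = 16  # default batch size (see Data pre-processing options below)
+batch = []       # each worker keeps its own in-memory batch (worker affinity)
+
+def on_mutation(meta: dict, doc: dict) -> None:
+    """Hypothetical outline of the per-mutation logic."""
+    # Assumes the document's embeddable content lives in a "text" field.
+    checksum = zlib.crc32(doc["text"].encode("utf-8"))
+    if doc.get("embedding_crc") == checksum:
+        return  # content unchanged since the last embedding; skip it
+    batch.append((meta["id"], doc["text"], checksum))
+    if len(batch) >= BATCH_SIZE:
+        flush_batch(batch)  # one embedding API call for the whole batch
+        batch.clear()
+
+def flush_batch(items: list) -> None:
+    # 1. Call the embedding service once for all texts in the batch.
+    # 2. Write each vector (and its CRC) back to its source document.
+    ...
+----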
+
+=== Architecture
+
+Auto-Vectorization supports two architecture versions:
+
+==== V2 Architecture (Current)
+
+The V2 architecture provides significant performance improvements:
+
+* *Worker affinity* - Each worker maintains its own in-memory batch, ensuring correct, conflict-free batching
+* *Reduced I/O* - Approximately 2.6x I/O amplification (compared with 11x in V1)
+* *Crash recovery* - Batch state is persisted with timer-based recovery
+* *Topology aware* - Automatically handles scale-up, scale-down, and failover
+
+==== I/O estimation
+
+For every 100 document mutations:
+
+[cols="1,1"]
+|===
+|Operation |Count
+
+|GET (batch state)
+|100
+
+|UPSERT (batch persist)
+|100
+
+|Timer cancellations
+|18
+
+|Timer creations
+|30
+
+|Batch resets
+|6
+
+|*Total*
+|*254 operations*
+|===
+
+This works out to an I/O amplification factor of approximately 2.6x: 254 operations for every 100 mutations.
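+
+As a quick sanity check, the snippet below recomputes the amplification factor from the operation counts in the table above; nothing here is new data, only the arithmetic.
+
+[source,python]
+----
+# Operation counts per 100 document mutations, taken from the table above.
+ops = {
+    "GET (batch state)": 100,
+    "UPSERT (batch persist)": 100,
+    "timer cancellations": 18,
+    "timer creations": 30,
+    "batch resets": 6,
+}
+
+total = sum(ops.values())    # 254 operations
+factor = total / 100         # 2.54, which the text rounds up to ~2.6x
+print(f"{total} ops for 100 mutations -> {factor:.2f}x I/O amplification")
+----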
+
+== Data pre-processing options
+
+Before vectorization, you can configure how your data is prepared for the embedding service.
+
+=== Batch configuration
+
+* `BATCH_LIMIT_BYTES` - Controls the maximum size of data sent to the embedding service per request
+* *Batch size* - Number of documents processed together (default: 16)
+
+=== Field selection
+
+You can specify which document fields to include in the vectorization, as the sketch after this list illustrates:
+
+* Select specific text fields for embedding
+* Exclude metadata or system fields
+* Combine multiple fields into a single embedding
+
+NOTE: Reducing batch size or field content helps avoid token limit errors from embedding providers.
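+
+Here is a minimal sketch of what field selection and the byte budget amount to in practice.
+These options are configured through the Capella UI, not in code; the field names and the `BATCH_LIMIT_BYTES` value below are placeholder assumptions, not product defaults.
+
+[source,python]
+----
+BATCH_LIMIT_BYTES = 64 * 1024                # placeholder budget per request
+EMBED_FIELDS = ["title", "summary", "body"]  # hypothetical field selection
+
+def embedding_input(doc: dict) -> str:
+    """Combine the selected text fields into a single embedding input."""
+    parts = [str(doc[f]) for f in EMBED_FIELDS if f in doc]
+    return "\n".join(parts)
+
+def fits_budget(texts: list[str]) -> bool:
+    """Check that one request's payload stays under the byte budget."""
+    return sum(len(t.encode("utf-8")) for t in texts) <= BATCH_LIMIT_BYTES
+----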
+
+== Unstructured data processing limitations
+
+Auto-Vectorization has certain limitations when processing unstructured data:
+
+=== Token limits
+
+Most embedding services have token limits:
+
+* OpenAI: 8,192 tokens maximum context length
+* Exceeding the limit results in error code 400 (Bad Request)
+
+TIP: Reduce `BATCH_LIMIT_BYTES` if you encounter token limit errors.
+
+=== Content types
+
+* Text content is fully supported
+* Binary content (images, PDFs) requires pre-processing
+* Large documents may need chunking before vectorization, as sketched below
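+
+The chunking just mentioned can be as simple as the following sketch.
+It assumes the common rough heuristic of about four characters per token; real tokenizers vary by model, so the numbers are illustrative rather than exact.
+
+[source,python]
+----
+MAX_TOKENS = 8192      # OpenAI context length cited above
+CHARS_PER_TOKEN = 4    # rough heuristic; real tokenizers differ
+# Target half the cap so a chunk never lands near the 400-error boundary.
+CHUNK_CHARS = (MAX_TOKENS // 2) * CHARS_PER_TOKEN
+
+def chunk_text(text: str) -> list[str]:
+    """Split a long document into chunks that fit the token budget."""
+    return [text[i:i + CHUNK_CHARS] for i in range(0, len(text), CHUNK_CHARS)]
+
+# Each chunk can then be embedded separately and stored as its own vector.
+----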
+
+=== Processing constraints
+
+* Documents larger than the configured batch limit are skipped
+* Extremely large collections may require throttling configuration
+* Network timeouts can occur with slow embedding services
+
+== Workflow statuses
+
+Auto-Vectorization workflows have distinct status indicators:
+
+=== Health indicators
+
+[cols="1,3"]
+|===
+|Status |Description
+
+|🟒 *Healthy*
+|Workflow is running normally
+
+|🟑 *Warning*
+|Retryable errors are occurring (such as timeouts)
+
+|πŸ”΄ *Failed*
+|Fatal failure requiring intervention
+
+|⏸️ *Paused*
+|Workflow is paused by the user or the system
+|===
+
+=== Workflow state truth table
+
+The actual status of a workflow depends on both the controller state and the worker state, even when the control plane reports it as "running":
+
+[cols="1,1,1"]
+|===
+|Controller State |Worker State |Actual Status
+
+|Running
+|Running
+|Healthy
+
+|Running
+|Paused
+|Throttled or user paused
+
+|Running
+|Undeployed
+|Failed
+
+|Paused
+|Running
+|Failed
+
+|Paused
+|Paused
+|Paused
+
+|Undeployed
+|Any
+|Failed
+|===
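+
+The truth table translates directly into code.
+The helper below is hypothetical (there is no such API); it simply mirrors the table row by row, and treats the one combination the table does not list (Paused controller with an Undeployed worker) as Failed.
+
+[source,python]
+----
+def actual_status(controller: str, worker: str) -> str:
+    """Derive the actual workflow status per the truth table above."""
+    if controller == "Undeployed":
+        return "Failed"  # Undeployed + Any -> Failed
+    if controller == "Running":
+        return {
+            "Running": "Healthy",
+            "Paused": "Throttled or user paused",
+            "Undeployed": "Failed",
+        }[worker]
+    if controller == "Paused":
+        # Paused + Paused -> Paused; anything else is treated as Failed
+        return "Paused" if worker == "Paused" else "Failed"
+    raise ValueError(f"unknown controller state: {controller}")
+
+assert actual_status("Running", "Paused") == "Throttled or user paused"
+----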
+
+=== Error codes
+
+[cols="1,1,2"]
+|===
+|Code |Name |Description
+
+|400
+|Bad Request
+|Context length exceeded the token limit. Reduce `BATCH_LIMIT_BYTES`.
+
+|401
+|Unauthenticated
+|Invalid API key, or the account is not part of the organization.
+
+|403
+|Forbidden
+|The API was accessed from an unsupported region.
+
+|500
+|Internal Server Error
+|Embedding service internal error.
+
+|600
+|Data Service Error
+|An issue occurred while writing embeddings back to Couchbase.
+|===
+
+== Workflow billing
+
+Auto-Vectorization billing is based on token consumption from the embedding service.
+
+=== Metering metrics
+
+The following metrics are tracked for billing:
+
+* `tokens_processed_total` - Total tokens sent to the embedding service
+* `batch_requests_total` - Number of API calls made
+* `embedding_writes_total` - Successful embedding operations
+
+=== Cost factors
+
+Billing depends on:
+
+* Number of documents processed
+* Size of document content (affects token count)
+* Embedding model selected (pricing varies by provider)
+* Re-processing events (document updates trigger new embeddings)
+
+IMPORTANT: Monitor `tokens_processed_total` to estimate embedding service costs.
+
+== Conclusion
+
+Auto-Vectorization simplifies the process of adding semantic search capabilities to your Couchbase Capella applications.
+By automatically generating embeddings for your documents, you can focus on building AI-powered features without managing the complexity of embedding pipelines.
+
+=== Best practices
+
+* Start with a small collection to validate configuration
+* Monitor workflow health indicators regularly
+* Adjust batch settings based on your document sizes
+* Use appropriate embedding models for your use case
+
+== See also
+
+* xref:cloud:clusters:data-service/scopes-collections.adoc[Scopes and Collections]
+* xref:cloud:search:vector-search.adoc[Vector Search]
+* xref:cloud:eventing:eventing-overview.adoc[Eventing Service]
+* https://platform.openai.com/docs/guides/embeddings[OpenAI Embeddings Documentation]
diff --git a/preview/backup-restore-preview.yml b/preview/backup-restore-preview.yml
new file mode 100644
index 0000000000..95f3bb064f
--- /dev/null
+++ b/preview/backup-restore-preview.yml
@@ -0,0 +1,28 @@
+sources:
+  docs-server:
+    branches: DOC-13786-backup-and-restore
+  docs-analytics:
+    branches: release/8.0
+  docs-devex:
+    url: https://github.com/couchbaselabs/docs-devex.git
+    branches: master
+    startPaths: docs/
+  couchbase-cli:
+    # url: ../../docs-includes/couchbase-cli
+    url: https://github.com/couchbaselabs/couchbase-cli-doc
+    # branches: HEAD
+    branches: master
+    startPaths: docs/
+  backup:
+    # url: ../../docs-includes/backup
+    url: https://github.com/couchbaselabs/backup-docs.git
+    #branches: HEAD
+    branches: master
+    startPaths: docs/
+  #analytics:
+  #  url: ../../docs-includes/docs-analytics
+  #  branches: HEAD
+  #cb-swagger:
+  #  url: https://github.com/couchbaselabs/cb-swagger
+  #  branches: release/8.0
+  #  start_path: docs
\ No newline at end of file

From aeb86aa93ee7822c4ba8bdca57fe80ba89371f9c Mon Sep 17 00:00:00 2001
From: Shwetha Rao
Date: Fri, 5 Dec 2025 13:05:45 +0530
Subject: [PATCH 2/4] preview file

---
 preview/antora-playbook.preview.local.yml | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 preview/antora-playbook.preview.local.yml

diff --git a/preview/antora-playbook.preview.local.yml b/preview/antora-playbook.preview.local.yml
new file mode 100644
index 0000000000..3dd279e132
--- /dev/null
+++ b/preview/antora-playbook.preview.local.yml
@@ -0,0 +1,12 @@
+site:
+  title: Couchbase Docs Local Preview
+
+output:
+  dir: ./preview
+  clean: true
+asciidoc:
+  attributes:
+    kroki-server-url: null
+content:
+  sources:
+    - url: https://github.com/couchbase/docs-server

From de826950eaa12e615a3358eca6080d871849e568 Mon Sep 17 00:00:00 2001
From: Shwetha Rao
Date: Fri, 5 Dec 2025 16:38:45 +0530
Subject: [PATCH 3/4] Minor grammar fix

---
 .../manage-backup-and-restore/autovec.adoc | 248 ------------------
 1 file changed, 248 deletions(-)
 delete mode 100644 modules/manage/pages/manage-backup-and-restore/autovec.adoc

diff --git a/modules/manage/pages/manage-backup-and-restore/autovec.adoc b/modules/manage/pages/manage-backup-and-restore/autovec.adoc
deleted file mode 100644
index 55f2f70629..0000000000
--- a/modules/manage/pages/manage-backup-and-restore/autovec.adoc
+++ /dev/null
@@ -1,248 +0,0 @@
-= Auto-Vectorization
-:description: Automatically generate vector embeddings for your documents in Couchbase Capella.
-:page-topic-type: concept
-
-[abstract]
-Auto-Vectorization (AutoVec) automatically generates vector embeddings for documents stored in Couchbase Capella, enabling semantic search and RAG (Retrieval-Augmented Generation) pipelines without manual embedding management.
-
-== Introduction
-
-Auto-Vectorization is a Couchbase Capella feature that automatically converts your structured and unstructured data into vector embeddings.
-These embeddings enable powerful semantic search capabilities and form the foundation for building RAG applications.
-
-=== Key benefits
-
-* *Automatic processing* - Documents are vectorized automatically as they are inserted or updated
-* *No code required* - Configure vectorization through the Capella UI without writing embedding code
-* *Scalable architecture* - Built on the Eventing Service for high-throughput processing
-* *Multiple embedding providers* - Support for OpenAI, AWS Bedrock, and AI Gateway
-
-=== Use cases
-
-* Semantic search across document collections
-* Building RAG pipelines for AI applications
-* Content similarity matching
-* Intelligent document retrieval
-
-== Vectorization
-
-Auto-Vectorization uses the Eventing Service to process document mutations and generate embeddings in real time.
-
-=== How it works
-
-. A document is inserted or updated in the source collection
-. The Eventing worker receives the mutation via DCP (Database Change Protocol)
-. The worker checks whether the document needs vectorization by comparing CRC checksums, so unchanged content is not re-embedded
-. Documents are batched together for efficient API calls
-. The embedding service generates vectors for the batch
-. Embeddings are written back to the source document
-
-=== Architecture
-
-Auto-Vectorization supports two architecture versions:
-
-==== V2 Architecture (Current)
-
-The V2 architecture provides significant performance improvements:
-
-* *Worker affinity* - Each worker maintains its own in-memory batch, ensuring correct, conflict-free batching
-* *Reduced I/O* - Approximately 2.6x I/O amplification (compared with 11x in V1)
-* *Crash recovery* - Batch state is persisted with timer-based recovery
-* *Topology aware* - Automatically handles scale-up, scale-down, and failover
-
-==== I/O estimation
-
-For every 100 document mutations:
-
-[cols="1,1"]
-|===
-|Operation |Count
-
-|GET (batch state)
-|100
-
-|UPSERT (batch persist)
-|100
-
-|Timer cancellations
-|18
-
-|Timer creations
-|30
-
-|Batch resets
-|6
-
-|*Total*
-|*254 operations*
-|===
-
-This works out to an I/O amplification factor of approximately 2.6x: 254 operations for every 100 mutations.
-
-== Data pre-processing options
-
-Before vectorization, you can configure how your data is prepared for the embedding service.
-
-=== Batch configuration
-
-* `BATCH_LIMIT_BYTES` - Controls the maximum size of data sent to the embedding service per request
-* *Batch size* - Number of documents processed together (default: 16)
-
-=== Field selection
-
-You can specify which document fields to include in the vectorization:
-
-* Select specific text fields for embedding
-* Exclude metadata or system fields
-* Combine multiple fields into a single embedding
-
-NOTE: Reducing batch size or field content helps avoid token limit errors from embedding providers.
-
-== Unstructured data processing limitations
-
-Auto-Vectorization has certain limitations when processing unstructured data:
-
-=== Token limits
-
-Most embedding services have token limits:
-
-* OpenAI: 8,192 tokens maximum context length
-* Exceeding the limit results in error code 400 (Bad Request)
-
-TIP: Reduce `BATCH_LIMIT_BYTES` if you encounter token limit errors.
-
-=== Content types
-
-* Text content is fully supported
-* Binary content (images, PDFs) requires pre-processing
-* Large documents may need chunking before vectorization
-
-=== Processing constraints
-
-* Documents larger than the configured batch limit are skipped
-* Extremely large collections may require throttling configuration
-* Network timeouts can occur with slow embedding services
-
-== Workflow statuses
-
-Auto-Vectorization workflows have distinct status indicators:
-
-=== Health indicators
-
-[cols="1,3"]
-|===
-|Status |Description
-
-|🟒 *Healthy*
-|Workflow is running normally
-
-|🟑 *Warning*
-|Retryable errors are occurring (such as timeouts)
-
-|πŸ”΄ *Failed*
-|Fatal failure requiring intervention
-
-|⏸️ *Paused*
-|Workflow is paused by the user or the system
-|===
-
-=== Workflow state truth table
-
-The actual status of a workflow depends on both the controller state and the worker state, even when the control plane reports it as "running":
-
-[cols="1,1,1"]
-|===
-|Controller State |Worker State |Actual Status
-
-|Running
-|Running
-|Healthy
-
-|Running
-|Paused
-|Throttled or user paused
-
-|Running
-|Undeployed
-|Failed
-
-|Paused
-|Running
-|Failed
-
-|Paused
-|Paused
-|Paused
-
-|Undeployed
-|Any
-|Failed
-|===
-
-=== Error codes
-
-[cols="1,1,2"]
-|===
-|Code |Name |Description
-
-|400
-|Bad Request
-|Context length exceeded the token limit. Reduce `BATCH_LIMIT_BYTES`.
-
-|401
-|Unauthenticated
-|Invalid API key, or the account is not part of the organization.
-
-|403
-|Forbidden
-|The API was accessed from an unsupported region.
-
-|500
-|Internal Server Error
-|Embedding service internal error.
-
-|600
-|Data Service Error
-|An issue occurred while writing embeddings back to Couchbase.
-|===
-
-== Workflow billing
-
-Auto-Vectorization billing is based on token consumption from the embedding service.
-
-=== Metering metrics
-
-The following metrics are tracked for billing:
-
-* `tokens_processed_total` - Total tokens sent to the embedding service
-* `batch_requests_total` - Number of API calls made
-* `embedding_writes_total` - Successful embedding operations
-
-=== Cost factors
-
-Billing depends on:
-
-* Number of documents processed
-* Size of document content (affects token count)
-* Embedding model selected (pricing varies by provider)
-* Re-processing events (document updates trigger new embeddings)
-
-IMPORTANT: Monitor `tokens_processed_total` to estimate embedding service costs.
-
-== Conclusion
-
-Auto-Vectorization simplifies the process of adding semantic search capabilities to your Couchbase Capella applications.
-By automatically generating embeddings for your documents, you can focus on building AI-powered features without managing the complexity of embedding pipelines.
-
-=== Best practices
-
-* Start with a small collection to validate configuration
-* Monitor workflow health indicators regularly
-* Adjust batch settings based on your document sizes
-* Use appropriate embedding models for your use case
-
-== See also
-
-* xref:cloud:clusters:data-service/scopes-collections.adoc[Scopes and Collections]
-* xref:cloud:search:vector-search.adoc[Vector Search]
-* xref:cloud:eventing:eventing-overview.adoc[Eventing Service]
-* https://platform.openai.com/docs/guides/embeddings[OpenAI Embeddings Documentation]

From 3157dbfb2fa91e5067e451ba0e7f410ec2ce050c Mon Sep 17 00:00:00 2001
From: Shwetha Rao
Date: Fri, 5 Dec 2025 16:40:25 +0530
Subject: [PATCH 4/4] testing links

---
 preview/antora-playbook.preview.local.yml | 12 ----------
 preview/backup-restore-preview.yml        | 28 -----------------------
 2 files changed, 40 deletions(-)
 delete mode 100644 preview/antora-playbook.preview.local.yml
 delete mode 100644 preview/backup-restore-preview.yml

diff --git a/preview/antora-playbook.preview.local.yml b/preview/antora-playbook.preview.local.yml
deleted file mode 100644
index 3dd279e132..0000000000
--- a/preview/antora-playbook.preview.local.yml
+++ /dev/null
@@ -1,12 +0,0 @@
-site:
-  title: Couchbase Docs Local Preview
-
-output:
-  dir: ./preview
-  clean: true
-asciidoc:
-  attributes:
-    kroki-server-url: null
-content:
-  sources:
-    - url: https://github.com/couchbase/docs-server
diff --git a/preview/backup-restore-preview.yml b/preview/backup-restore-preview.yml
deleted file mode 100644
index 95f3bb064f..0000000000
--- a/preview/backup-restore-preview.yml
+++ /dev/null
@@ -1,28 +0,0 @@
-sources:
-  docs-server:
-    branches: DOC-13786-backup-and-restore
-  docs-analytics:
-    branches: release/8.0
-  docs-devex:
-    url: https://github.com/couchbaselabs/docs-devex.git
-    branches: master
-    startPaths: docs/
-  couchbase-cli:
-    # url: ../../docs-includes/couchbase-cli
-    url: https://github.com/couchbaselabs/couchbase-cli-doc
-    # branches: HEAD
-    branches: master
-    startPaths: docs/
-  backup:
-    # url: ../../docs-includes/backup
-    url: https://github.com/couchbaselabs/backup-docs.git
-    #branches: HEAD
-    branches: master
-    startPaths: docs/
-  #analytics:
-  #  url: ../../docs-includes/docs-analytics
-  #  branches: HEAD
-  #cb-swagger:
-  #  url: https://github.com/couchbaselabs/cb-swagger
-  #  branches: release/8.0
-  #  start_path: docs