From 6eb8a5f3ebf8af79b326f998a44da80c83b4fbeb Mon Sep 17 00:00:00 2001
From: Shwetha Rao
Date: Fri, 5 Dec 2025 12:46:16 +0530
Subject: [PATCH 1/4] first commit

---
 .../manage-backup-and-restore/autovec.adoc | 248 ++++++++++++++++++
 preview/backup-restore-preview.yml         |  28 ++
 2 files changed, 276 insertions(+)
 create mode 100644 modules/manage/pages/manage-backup-and-restore/autovec.adoc
 create mode 100644 preview/backup-restore-preview.yml

diff --git a/modules/manage/pages/manage-backup-and-restore/autovec.adoc b/modules/manage/pages/manage-backup-and-restore/autovec.adoc
new file mode 100644
index 0000000000..55f2f70629
--- /dev/null
+++ b/modules/manage/pages/manage-backup-and-restore/autovec.adoc
@@ -0,0 +1,248 @@
+= Auto-Vectorization
+:description: Automatically generate vector embeddings for your documents in Couchbase Capella.
+:page-topic-type: concept
+
+[abstract]
+Auto-Vectorization (AutoVec) automatically generates vector embeddings for documents stored in Couchbase Capella, enabling semantic search and RAG (Retrieval-Augmented Generation) pipelines without manual embedding management.
+
+== Introduction
+
+Auto-Vectorization is a Couchbase Capella feature that automatically converts your structured and unstructured data into vector embeddings.
+These embeddings enable powerful semantic search capabilities and form the foundation for building RAG applications.
+
+=== Key benefits
+
+* *Automatic processing* - Documents are vectorized automatically as they are inserted or updated
+* *No code required* - Configure vectorization through the Capella UI without writing embedding code
+* *Scalable architecture* - Built on the Eventing Service for high-throughput processing
+* *Multiple embedding providers* - Support for OpenAI, AWS Bedrock, and AI Gateway
+
+=== Use cases
+
+* Semantic search across document collections
+* Building RAG pipelines for AI applications
+* Content similarity matching
+* Intelligent document retrieval
+
+== Vectorization
+
+Auto-Vectorization uses the Eventing Service to process document mutations and generate embeddings in real time.
+
+=== How it works
+
+. A document is inserted or updated in the source collection
+. The Eventing worker receives the mutation via DCP (Database Change Protocol)
+. The worker checks whether the document needs vectorization by comparing CRC checksums, so unchanged content is not re-embedded
+. Documents are batched together for efficient API calls
+. The embedding service generates vectors for the batch
+. Embeddings are written back to the source document
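+
+A short sketch can make this flow concrete.
+The following is not the actual worker (Eventing functions are written in JavaScript and run inside the Eventing Service); it is a hypothetical Python outline of the per-mutation logic, and the `flush_batch` helper, the `embedding_crc` field, and the module-level `batch` list are illustrative assumptions only.
+
+[source,python]
+----
+import zlib
+
+BATCH_SIZE = 16  # default batch size (see Data pre-processing options below)
+batch = []       # each worker keeps its own in-memory batch (worker affinity)
+
+def on_mutation(meta: dict, doc: dict) -> None:
+    """Hypothetical outline of the per-mutation logic."""
+    # Assumes the document's embeddable content lives in a "text" field.
+    checksum = zlib.crc32(doc["text"].encode("utf-8"))
+    if doc.get("embedding_crc") == checksum:
+        return  # content unchanged since the last embedding; skip it
+    batch.append((meta["id"], doc["text"], checksum))
+    if len(batch) >= BATCH_SIZE:
+        flush_batch(batch)  # one embedding API call for the whole batch
+        batch.clear()
+
+def flush_batch(items: list) -> None:
+    # 1. Call the embedding service once for all texts in the batch.
+    # 2. Write each vector (and its CRC) back to its source document.
+    ...
+----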
+
+=== Architecture
+
+Auto-Vectorization supports two architecture versions:
+
+==== V2 Architecture (Current)
+
+The V2 architecture provides significant performance improvements:
+
+* *Worker affinity* - Each worker maintains its own in-memory batch, ensuring correct, conflict-free batching
+* *Reduced I/O* - Approximately 2.6x I/O amplification (compared with 11x in V1)
+* *Crash recovery* - Batch state is persisted with timer-based recovery
+* *Topology aware* - Automatically handles scale-up, scale-down, and failover
+
+==== I/O estimation
+
+For every 100 document mutations:
+
+[cols="1,1"]
+|===
+|Operation |Count
+
+|GET (batch state)
+|100
+
+|UPSERT (batch persist)
+|100
+
+|Timer cancellations
+|18
+
+|Timer creations
+|30
+
+|Batch resets
+|6
+
+|*Total*
+|*254 operations*
+|===
+
+This works out to an I/O amplification factor of approximately 2.6x: 254 operations for every 100 mutations.
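+
+As a quick sanity check, the snippet below recomputes the amplification factor from the operation counts in the table above; nothing here is new data, only the arithmetic.
+
+[source,python]
+----
+# Operation counts per 100 document mutations, taken from the table above.
+ops = {
+    "GET (batch state)": 100,
+    "UPSERT (batch persist)": 100,
+    "timer cancellations": 18,
+    "timer creations": 30,
+    "batch resets": 6,
+}
+
+total = sum(ops.values())    # 254 operations
+factor = total / 100         # 2.54, which the text rounds up to ~2.6x
+print(f"{total} ops for 100 mutations -> {factor:.2f}x I/O amplification")
+----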
+
+== Data pre-processing options
+
+Before vectorization, you can configure how your data is prepared for the embedding service.
+
+=== Batch configuration
+
+* `BATCH_LIMIT_BYTES` - Controls the maximum size of data sent to the embedding service per request
+* *Batch size* - Number of documents processed together (default: 16)
+
+=== Field selection
+
+You can specify which document fields to include in the vectorization, as the sketch after this list illustrates:
+
+* Select specific text fields for embedding
+* Exclude metadata or system fields
+* Combine multiple fields into a single embedding
+
+NOTE: Reducing batch size or field content helps avoid token limit errors from embedding providers.
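+
+Here is a minimal sketch of what field selection and the byte budget amount to in practice.
+These options are configured through the Capella UI, not in code; the field names and the `BATCH_LIMIT_BYTES` value below are placeholder assumptions, not product defaults.
+
+[source,python]
+----
+BATCH_LIMIT_BYTES = 64 * 1024                # placeholder budget per request
+EMBED_FIELDS = ["title", "summary", "body"]  # hypothetical field selection
+
+def embedding_input(doc: dict) -> str:
+    """Combine the selected text fields into a single embedding input."""
+    parts = [str(doc[f]) for f in EMBED_FIELDS if f in doc]
+    return "\n".join(parts)
+
+def fits_budget(texts: list[str]) -> bool:
+    """Check that one request's payload stays under the byte budget."""
+    return sum(len(t.encode("utf-8")) for t in texts) <= BATCH_LIMIT_BYTES
+----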
+
+== Unstructured data processing limitations
+
+Auto-Vectorization has certain limitations when processing unstructured data:
+
+=== Token limits
+
+Most embedding services have token limits:
+
+* OpenAI: 8,192 tokens maximum context length
+* Exceeding the limit results in error code 400 (Bad Request)
+
+TIP: Reduce `BATCH_LIMIT_BYTES` if you encounter token limit errors.
+
+=== Content types
+
+* Text content is fully supported
+* Binary content (images, PDFs) requires pre-processing
+* Large documents may need chunking before vectorization, as sketched below
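+
+The chunking just mentioned can be as simple as the following sketch.
+It assumes the common rough heuristic of about four characters per token; real tokenizers vary by model, so the numbers are illustrative rather than exact.
+
+[source,python]
+----
+MAX_TOKENS = 8192      # OpenAI context length cited above
+CHARS_PER_TOKEN = 4    # rough heuristic; real tokenizers differ
+# Target half the cap so a chunk never lands near the 400-error boundary.
+CHUNK_CHARS = (MAX_TOKENS // 2) * CHARS_PER_TOKEN
+
+def chunk_text(text: str) -> list[str]:
+    """Split a long document into chunks that fit the token budget."""
+    return [text[i:i + CHUNK_CHARS] for i in range(0, len(text), CHUNK_CHARS)]
+
+# Each chunk can then be embedded separately and stored as its own vector.
+----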
+
+=== Processing constraints
+
+* Documents larger than the configured batch limit are skipped
+* Extremely large collections may require throttling configuration
+* Network timeouts can occur with slow embedding services
+
+== Workflow statuses
+
+Auto-Vectorization workflows have distinct status indicators:
+
+=== Health indicators
+
+[cols="1,3"]
+|===
+|Status |Description
+
+|🟒 *Healthy*
+|Workflow is running normally
+
+|🟑 *Warning*
+|Retryable errors are occurring (such as timeouts)
+
+|πŸ”΄ *Failed*
+|Fatal failure requiring intervention
+
+|⏸️ *Paused*
+|Workflow is paused by the user or the system
+|===
+
+=== Workflow state truth table
+
+The actual status of a workflow depends on both the controller state and the worker state, even when the control plane reports it as "running":
+
+[cols="1,1,1"]
+|===
+|Controller State |Worker State |Actual Status
+
+|Running
+|Running
+|Healthy
+
+|Running
+|Paused
+|Throttled or user paused
+
+|Running
+|Undeployed
+|Failed
+
+|Paused
+|Running
+|Failed
+
+|Paused
+|Paused
+|Paused
+
+|Undeployed
+|Any
+|Failed
+|===
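+
+The truth table translates directly into code.
+The helper below is hypothetical (there is no such API); it simply mirrors the table row by row, and treats the one combination the table does not list (Paused controller with an Undeployed worker) as Failed.
+
+[source,python]
+----
+def actual_status(controller: str, worker: str) -> str:
+    """Derive the actual workflow status per the truth table above."""
+    if controller == "Undeployed":
+        return "Failed"  # Undeployed + Any -> Failed
+    if controller == "Running":
+        return {
+            "Running": "Healthy",
+            "Paused": "Throttled or user paused",
+            "Undeployed": "Failed",
+        }[worker]
+    if controller == "Paused":
+        # Paused + Paused -> Paused; anything else is treated as Failed
+        return "Paused" if worker == "Paused" else "Failed"
+    raise ValueError(f"unknown controller state: {controller}")
+
+assert actual_status("Running", "Paused") == "Throttled or user paused"
+----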
+
+=== Error codes
+
+[cols="1,1,2"]
+|===
+|Code |Name |Description
+
+|400
+|Bad Request
+|Context length exceeded the token limit. Reduce `BATCH_LIMIT_BYTES`.
+
+|401
+|Unauthenticated
+|Invalid API key, or the account is not part of the organization.
+
+|403
+|Forbidden
+|The API was accessed from an unsupported region.
+
+|500
+|Internal Server Error
+|Embedding service internal error.
+
+|600
+|Data Service Error
+|An issue occurred while writing embeddings back to Couchbase.
+|===
+
+== Workflow billing
+
+Auto-Vectorization billing is based on token consumption from the embedding service.
+
+=== Metering metrics
+
+The following metrics are tracked for billing:
+
+* `tokens_processed_total` - Total tokens sent to the embedding service
+* `batch_requests_total` - Number of API calls made
+* `embedding_writes_total` - Successful embedding operations
+
+=== Cost factors
+
+Billing depends on:
+
+* Number of documents processed
+* Size of document content (affects token count)
+* Embedding model selected (pricing varies by provider)
+* Re-processing events (document updates trigger new embeddings)
+
+IMPORTANT: Monitor `tokens_processed_total` to estimate embedding service costs.
+
+== Conclusion
+
+Auto-Vectorization simplifies the process of adding semantic search capabilities to your Couchbase Capella applications.
+By automatically generating embeddings for your documents, you can focus on building AI-powered features without managing the complexity of embedding pipelines.
+
+=== Best practices
+
+* Start with a small collection to validate configuration
+* Monitor workflow health indicators regularly
+* Adjust batch settings based on your document sizes
+* Use appropriate embedding models for your use case
+
+== See also
+
+* xref:cloud:clusters:data-service/scopes-collections.adoc[Scopes and Collections]
+* xref:cloud:search:vector-search.adoc[Vector Search]
+* xref:cloud:eventing:eventing-overview.adoc[Eventing Service]
+* https://platform.openai.com/docs/guides/embeddings[OpenAI Embeddings Documentation]
diff --git a/preview/backup-restore-preview.yml b/preview/backup-restore-preview.yml
new file mode 100644
index 0000000000..95f3bb064f
--- /dev/null
+++ b/preview/backup-restore-preview.yml
@@ -0,0 +1,28 @@
+sources:
+  docs-server:
+    branches: DOC-13786-backup-and-restore
+  docs-analytics:
+    branches: release/8.0
+  docs-devex:
+    url: https://github.com/couchbaselabs/docs-devex.git
+    branches: master
+    startPaths: docs/
+  couchbase-cli:
+    # url: ../../docs-includes/couchbase-cli
+    url: https://github.com/couchbaselabs/couchbase-cli-doc
+    # branches: HEAD
+    branches: master
+    startPaths: docs/
+  backup:
+    # url: ../../docs-includes/backup
+    url: https://github.com/couchbaselabs/backup-docs.git
+    #branches: HEAD
+    branches: master
+    startPaths: docs/
+  #analytics:
+  #  url: ../../docs-includes/docs-analytics
+  #  branches: HEAD
+  #cb-swagger:
+  #  url: https://github.com/couchbaselabs/cb-swagger
+  #  branches: release/8.0
+  #  start_path: docs
\ No newline at end of file

From aeb86aa93ee7822c4ba8bdca57fe80ba89371f9c Mon Sep 17 00:00:00 2001
From: Shwetha Rao
Date: Fri, 5 Dec 2025 13:05:45 +0530
Subject: [PATCH 2/4] preview file

---
 preview/antora-playbook.preview.local.yml | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 preview/antora-playbook.preview.local.yml

diff --git a/preview/antora-playbook.preview.local.yml b/preview/antora-playbook.preview.local.yml
new file mode 100644
index 0000000000..3dd279e132
--- /dev/null
+++ b/preview/antora-playbook.preview.local.yml
@@ -0,0 +1,12 @@
+site:
+  title: Couchbase Docs Local Preview
+
+output:
+  dir: ./preview
+  clean: true
+asciidoc:
+  attributes:
+    kroki-server-url: null
+content:
+  sources:
+    - url: https://github.com/couchbase/docs-server

From de826950eaa12e615a3358eca6080d871849e568 Mon Sep 17 00:00:00 2001
From: Shwetha Rao
Date: Fri, 5 Dec 2025 16:38:45 +0530
Subject: [PATCH 3/4] Minor grammar fix

---
 .../manage-backup-and-restore/autovec.adoc | 248 ------------------
 1 file changed, 248 deletions(-)
 delete mode 100644 modules/manage/pages/manage-backup-and-restore/autovec.adoc

diff --git a/modules/manage/pages/manage-backup-and-restore/autovec.adoc b/modules/manage/pages/manage-backup-and-restore/autovec.adoc
deleted file mode 100644
index 55f2f70629..0000000000
--- a/modules/manage/pages/manage-backup-and-restore/autovec.adoc
+++ /dev/null
@@ -1,248 +0,0 @@
-= Auto-Vectorization
-:description: Automatically generate vector embeddings for your documents in Couchbase Capella.
-:page-topic-type: concept
-
-[abstract]
-Auto-Vectorization (AutoVec) automatically generates vector embeddings for documents stored in Couchbase Capella, enabling semantic search and RAG (Retrieval-Augmented Generation) pipelines without manual embedding management.
-
-== Introduction
-
-Auto-Vectorization is a Couchbase Capella feature that automatically converts your structured and unstructured data into vector embeddings.
-These embeddings enable powerful semantic search capabilities and form the foundation for building RAG applications.
-
-=== Key benefits
-
-* *Automatic processing* - Documents are vectorized automatically as they are inserted or updated
-* *No code required* - Configure vectorization through the Capella UI without writing embedding code
-* *Scalable architecture* - Built on the Eventing Service for high-throughput processing
-* *Multiple embedding providers* - Support for OpenAI, AWS Bedrock, and AI Gateway
-
-=== Use cases
-
-* Semantic search across document collections
-* Building RAG pipelines for AI applications
-* Content similarity matching
-* Intelligent document retrieval
-
-== Vectorization
-
-Auto-Vectorization uses the Eventing Service to process document mutations and generate embeddings in real time.
-
-=== How it works
-
-. A document is inserted or updated in the source collection
-. The Eventing worker receives the mutation via DCP (Database Change Protocol)
-. The worker checks whether the document needs vectorization by comparing CRC checksums, so unchanged content is not re-embedded
-. Documents are batched together for efficient API calls
-. The embedding service generates vectors for the batch
-. Embeddings are written back to the source document
-
-=== Architecture
-
-Auto-Vectorization supports two architecture versions:
-
-==== V2 Architecture (Current)
-
-The V2 architecture provides significant performance improvements:
-
-* *Worker affinity* - Each worker maintains its own in-memory batch, ensuring correct, conflict-free batching
-* *Reduced I/O* - Approximately 2.6x I/O amplification (compared with 11x in V1)
-* *Crash recovery* - Batch state is persisted with timer-based recovery
-* *Topology aware* - Automatically handles scale-up, scale-down, and failover
-
-==== I/O estimation
-
-For every 100 document mutations:
-
-[cols="1,1"]
-|===
-|Operation |Count
-
-|GET (batch state)
-|100
-
-|UPSERT (batch persist)
-|100
-
-|Timer cancellations
-|18
-
-|Timer creations
-|30
-
-|Batch resets
-|6
-
-|*Total*
-|*254 operations*
-|===
-
-This works out to an I/O amplification factor of approximately 2.6x: 254 operations for every 100 mutations.
-
-== Data pre-processing options
-
-Before vectorization, you can configure how your data is prepared for the embedding service.
-
-=== Batch configuration
-
-* `BATCH_LIMIT_BYTES` - Controls the maximum size of data sent to the embedding service per request
-* *Batch size* - Number of documents processed together (default: 16)
-
-=== Field selection
-
-You can specify which document fields to include in the vectorization:
-
-* Select specific text fields for embedding
-* Exclude metadata or system fields
-* Combine multiple fields into a single embedding
-
-NOTE: Reducing batch size or field content helps avoid token limit errors from embedding providers.
-
-== Unstructured data processing limitations
-
-Auto-Vectorization has certain limitations when processing unstructured data:
-
-=== Token limits
-
-Most embedding services have token limits:
-
-* OpenAI: 8,192 tokens maximum context length
-* Exceeding the limit results in error code 400 (Bad Request)
-
-TIP: Reduce `BATCH_LIMIT_BYTES` if you encounter token limit errors.
-
-=== Content types
-
-* Text content is fully supported
-* Binary content (images, PDFs) requires pre-processing
-* Large documents may need chunking before vectorization
-
-=== Processing constraints
-
-* Documents larger than the configured batch limit are skipped
-* Extremely large collections may require throttling configuration
-* Network timeouts can occur with slow embedding services
-
-== Workflow statuses
-
-Auto-Vectorization workflows have distinct status indicators:
-
-=== Health indicators
-
-[cols="1,3"]
-|===
-|Status |Description
-
-|🟒 *Healthy*
-|Workflow is running normally
-
-|🟑 *Warning*
-|Retryable errors are occurring (such as timeouts)
-
-|πŸ”΄ *Failed*
-|Fatal failure requiring intervention
-
-|⏸️ *Paused*
-|Workflow is paused by the user or the system
-|===
-
-=== Workflow state truth table
-
-The actual status of a workflow depends on both the controller state and the worker state, even when the control plane reports it as "running":
-
-[cols="1,1,1"]
-|===
-|Controller State |Worker State |Actual Status
-
-|Running
-|Running
-|Healthy
-
-|Running
-|Paused
-|Throttled or user paused
-
-|Running
-|Undeployed
-|Failed
-
-|Paused
-|Running
-|Failed
-
-|Paused
-|Paused
-|Paused
-
-|Undeployed
-|Any
-|Failed
-|===
-
-=== Error codes
-
-[cols="1,1,2"]
-|===
-|Code |Name |Description
-
-|400
-|Bad Request
-|Context length exceeded the token limit. Reduce `BATCH_LIMIT_BYTES`.
-
-|401
-|Unauthenticated
-|Invalid API key, or the account is not part of the organization.
-
-|403
-|Forbidden
-|The API was accessed from an unsupported region.
-
-|500
-|Internal Server Error
-|Embedding service internal error.
-
-|600
-|Data Service Error
-|An issue occurred while writing embeddings back to Couchbase.
-|===
-
-== Workflow billing
-
-Auto-Vectorization billing is based on token consumption from the embedding service.
-
-=== Metering metrics
-
-The following metrics are tracked for billing:
-
-* `tokens_processed_total` - Total tokens sent to the embedding service
-* `batch_requests_total` - Number of API calls made
-* `embedding_writes_total` - Successful embedding operations
-
-=== Cost factors
-
-Billing depends on:
-
-* Number of documents processed
-* Size of document content (affects token count)
-* Embedding model selected (pricing varies by provider)
-* Re-processing events (document updates trigger new embeddings)
-
-IMPORTANT: Monitor `tokens_processed_total` to estimate embedding service costs.
-
-== Conclusion
-
-Auto-Vectorization simplifies the process of adding semantic search capabilities to your Couchbase Capella applications.
-By automatically generating embeddings for your documents, you can focus on building AI-powered features without managing the complexity of embedding pipelines.
-
-=== Best practices
-
-* Start with a small collection to validate configuration
-* Monitor workflow health indicators regularly
-* Adjust batch settings based on your document sizes
-* Use appropriate embedding models for your use case
-
-== See also
-
-* xref:cloud:clusters:data-service/scopes-collections.adoc[Scopes and Collections]
-* xref:cloud:search:vector-search.adoc[Vector Search]
-* xref:cloud:eventing:eventing-overview.adoc[Eventing Service]
-* https://platform.openai.com/docs/guides/embeddings[OpenAI Embeddings Documentation]

From 3157dbfb2fa91e5067e451ba0e7f410ec2ce050c Mon Sep 17 00:00:00 2001
From: Shwetha Rao
Date: Fri, 5 Dec 2025 16:40:25 +0530
Subject: [PATCH 4/4] testing links

---
 preview/antora-playbook.preview.local.yml | 12 ----------
 preview/backup-restore-preview.yml        | 28 -----------------------
 2 files changed, 40 deletions(-)
 delete mode 100644 preview/antora-playbook.preview.local.yml
 delete mode 100644 preview/backup-restore-preview.yml

diff --git a/preview/antora-playbook.preview.local.yml b/preview/antora-playbook.preview.local.yml
deleted file mode 100644
index 3dd279e132..0000000000
--- a/preview/antora-playbook.preview.local.yml
+++ /dev/null
@@ -1,12 +0,0 @@
-site:
-  title: Couchbase Docs Local Preview
-
-output:
-  dir: ./preview
-  clean: true
-asciidoc:
-  attributes:
-    kroki-server-url: null
-content:
-  sources:
-    - url: https://github.com/couchbase/docs-server
diff --git a/preview/backup-restore-preview.yml b/preview/backup-restore-preview.yml
deleted file mode 100644
index 95f3bb064f..0000000000
--- a/preview/backup-restore-preview.yml
+++ /dev/null
@@ -1,28 +0,0 @@
-sources:
-  docs-server:
-    branches: DOC-13786-backup-and-restore
-  docs-analytics:
-    branches: release/8.0
-  docs-devex:
-    url: https://github.com/couchbaselabs/docs-devex.git
-    branches: master
-    startPaths: docs/
-  couchbase-cli:
-    # url: ../../docs-includes/couchbase-cli
-    url: https://github.com/couchbaselabs/couchbase-cli-doc
-    # branches: HEAD
-    branches: master
-    startPaths: docs/
-  backup:
-    # url: ../../docs-includes/backup
-    url: https://github.com/couchbaselabs/backup-docs.git
-    #branches: HEAD
-    branches: master
-    startPaths: docs/
-  #analytics:
-  #  url: ../../docs-includes/docs-analytics
-  #  branches: HEAD
-  #cb-swagger:
-  #  url: https://github.com/couchbaselabs/cb-swagger
-  #  branches: release/8.0
-  #  start_path: docs