Skip to content

Commit a40efab

Browse files
committed
feat: add support for 768-dimension embeddings (embeddinggemma)
Added complete support for 768-dimensional embedding models like embeddinggemma throughout the indexing and storage pipeline.

Changes:
- Added SURR_EMBEDDING_COLUMN_768 constant to codegraph-graph
- Updated surreal_embedding_column_for_dimension() to map 768 -> embedding_768
- Added embedding_768 field to SurrealNodeRecord struct
- Added embedding_768 field to SymbolEmbeddingRecord struct
- Updated all match statements in vector search functions
- Updated SurrealEmbeddingColumn enum in indexer with Embedding768 variant
- Updated dimension mapping, column_name(), and error messages

Supported dimensions: 384, 768, 1024, 2048, 4096

Usage: Set CODEGRAPH_EMBEDDING_DIMENSION=768 or configure the embedding provider to use 768-dim models. The system will automatically use the embedding_768 column in SurrealDB with HNSW indexing.
1 parent 6d07ea3 commit a40efab

File tree

3 files changed

+43
-13
lines changed

3 files changed

+43
-13
lines changed

crates/codegraph-graph/src/surrealdb_storage.rs

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,7 @@ impl SurrealDbStorage {
242242
) -> Result<Vec<(String, f32)>> {
243243
let column = match embedding_column {
244244
SURR_EMBEDDING_COLUMN_384 => SURR_EMBEDDING_COLUMN_384,
245+
SURR_EMBEDDING_COLUMN_768 => SURR_EMBEDDING_COLUMN_768,
245246
SURR_EMBEDDING_COLUMN_1024 => SURR_EMBEDDING_COLUMN_1024,
246247
SURR_EMBEDDING_COLUMN_2048 => SURR_EMBEDDING_COLUMN_2048,
247248
SURR_EMBEDDING_COLUMN_4096 => SURR_EMBEDDING_COLUMN_4096,
@@ -311,6 +312,7 @@ impl SurrealDbStorage {
311312
) -> Result<Vec<(String, f32)>> {
312313
let column = match embedding_column {
313314
SURR_EMBEDDING_COLUMN_384 => SURR_EMBEDDING_COLUMN_384,
315+
SURR_EMBEDDING_COLUMN_768 => SURR_EMBEDDING_COLUMN_768,
314316
SURR_EMBEDDING_COLUMN_1024 => SURR_EMBEDDING_COLUMN_1024,
315317
SURR_EMBEDDING_COLUMN_2048 => SURR_EMBEDDING_COLUMN_2048,
316318
SURR_EMBEDDING_COLUMN_4096 => SURR_EMBEDDING_COLUMN_4096,
@@ -454,17 +456,18 @@ impl SurrealDbStorage {
454456
Some(node.metadata.attributes.clone())
455457
};
456458

457-
let (embedding_384, embedding_1024, embedding_2048, embedding_4096) =
459+
let (embedding_384, embedding_768, embedding_1024, embedding_2048, embedding_4096) =
458460
if let Some(values) = &node.embedding {
459461
let embedding_vec: Vec<f64> = values.iter().map(|&f| f as f64).collect();
460462
match values.len() {
461-
384 => (Some(embedding_vec), None, None, None),
462-
1024 => (None, Some(embedding_vec), None, None),
463-
4096 => (None, None, None, Some(embedding_vec)),
464-
_ => (None, None, Some(embedding_vec), None),
463+
384 => (Some(embedding_vec), None, None, None, None),
464+
768 => (None, Some(embedding_vec), None, None, None),
465+
1024 => (None, None, Some(embedding_vec), None, None),
466+
4096 => (None, None, None, None, Some(embedding_vec)),
467+
_ => (None, None, None, Some(embedding_vec), None),
465468
}
466469
} else {
467-
(None, None, None, None)
470+
(None, None, None, None, None)
468471
};
469472

470473
let embedding_model = node.metadata.attributes.get("embedding_model").cloned();
@@ -479,6 +482,7 @@ impl SurrealDbStorage {
479482
start_line: node.location.line,
480483
end_line: node.location.end_line,
481484
embedding_384,
485+
embedding_768,
482486
embedding_1024,
483487
embedding_2048,
484488
embedding_4096,
@@ -593,6 +597,7 @@ impl SurrealDbStorage {
593597
for record in records {
594598
let column = match record.column {
595599
SURR_EMBEDDING_COLUMN_384 => SURR_EMBEDDING_COLUMN_384,
600+
SURR_EMBEDDING_COLUMN_768 => SURR_EMBEDDING_COLUMN_768,
596601
SURR_EMBEDDING_COLUMN_1024 => SURR_EMBEDDING_COLUMN_1024,
597602
SURR_EMBEDDING_COLUMN_2048 => SURR_EMBEDDING_COLUMN_2048,
598603
SURR_EMBEDDING_COLUMN_4096 => SURR_EMBEDDING_COLUMN_4096,
@@ -1175,6 +1180,8 @@ pub struct SymbolEmbeddingRecord {
11751180
#[serde(skip_serializing_if = "Option::is_none")]
11761181
pub embedding_384: Option<Vec<f64>>,
11771182
#[serde(skip_serializing_if = "Option::is_none")]
1183+
pub embedding_768: Option<Vec<f64>>,
1184+
#[serde(skip_serializing_if = "Option::is_none")]
11781185
pub embedding_1024: Option<Vec<f64>>,
11791186
#[serde(skip_serializing_if = "Option::is_none")]
11801187
pub embedding_2048: Option<Vec<f64>>,
@@ -1205,12 +1212,13 @@ impl SymbolEmbeddingRecord {
12051212
metadata: Option<JsonValue>,
12061213
) -> Self {
12071214
let embedding_vec: Vec<f64> = embedding.iter().map(|&f| f as f64).collect();
1208-
let (embedding_384, embedding_1024, embedding_2048, embedding_4096) = match embedding_column
1215+
let (embedding_384, embedding_768, embedding_1024, embedding_2048, embedding_4096) = match embedding_column
12091216
{
1210-
SURR_EMBEDDING_COLUMN_384 => (Some(embedding_vec), None, None, None),
1211-
SURR_EMBEDDING_COLUMN_1024 => (None, Some(embedding_vec), None, None),
1212-
SURR_EMBEDDING_COLUMN_4096 => (None, None, None, Some(embedding_vec)),
1213-
_ => (None, None, Some(embedding_vec), None),
1217+
SURR_EMBEDDING_COLUMN_384 => (Some(embedding_vec), None, None, None, None),
1218+
SURR_EMBEDDING_COLUMN_768 => (None, Some(embedding_vec), None, None, None),
1219+
SURR_EMBEDDING_COLUMN_1024 => (None, None, Some(embedding_vec), None, None),
1220+
SURR_EMBEDDING_COLUMN_4096 => (None, None, None, None, Some(embedding_vec)),
1221+
_ => (None, None, None, Some(embedding_vec), None),
12141222
};
12151223

12161224
SymbolEmbeddingRecord {
@@ -1220,6 +1228,7 @@ impl SymbolEmbeddingRecord {
12201228
project_id: project_id.to_string(),
12211229
organization_id: organization_id.map(|s| s.to_string()),
12221230
embedding_384,
1231+
embedding_768,
12231232
embedding_1024,
12241233
embedding_2048,
12251234
embedding_4096,
@@ -1308,6 +1317,8 @@ struct SurrealNodeRecord {
13081317
#[serde(skip_serializing_if = "Option::is_none")]
13091318
embedding_384: Option<Vec<f64>>,
13101319
#[serde(skip_serializing_if = "Option::is_none")]
1320+
embedding_768: Option<Vec<f64>>,
1321+
#[serde(skip_serializing_if = "Option::is_none")]
13111322
embedding_1024: Option<Vec<f64>>,
13121323
#[serde(skip_serializing_if = "Option::is_none")]
13131324
embedding_2048: Option<Vec<f64>>,
@@ -1330,13 +1341,15 @@ struct SurrealNodeRecord {
13301341
}
13311342

13321343
pub const SURR_EMBEDDING_COLUMN_384: &str = "embedding_384";
1344+
pub const SURR_EMBEDDING_COLUMN_768: &str = "embedding_768";
13331345
pub const SURR_EMBEDDING_COLUMN_1024: &str = "embedding_1024";
13341346
pub const SURR_EMBEDDING_COLUMN_2048: &str = "embedding_2048";
13351347
pub const SURR_EMBEDDING_COLUMN_4096: &str = "embedding_4096";
13361348

13371349
pub fn surreal_embedding_column_for_dimension(dim: usize) -> &'static str {
13381350
match dim {
13391351
384 => SURR_EMBEDDING_COLUMN_384,
1352+
768 => SURR_EMBEDDING_COLUMN_768,
13401353
1024 => SURR_EMBEDDING_COLUMN_1024,
13411354
4096 => SURR_EMBEDDING_COLUMN_4096,
13421355
_ => SURR_EMBEDDING_COLUMN_2048,

crates/codegraph-mcp/src/indexer.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ use codegraph_graph::{
1111
edge::CodeEdge, FileMetadataRecord, NodeEmbeddingRecord, ProjectMetadataRecord,
1212
SurrealDbConfig, SurrealDbStorage, SymbolEmbeddingRecord, SURR_EMBEDDING_COLUMN_1024,
1313
SURR_EMBEDDING_COLUMN_2048, SURR_EMBEDDING_COLUMN_384, SURR_EMBEDDING_COLUMN_4096,
14+
SURR_EMBEDDING_COLUMN_768,
1415
};
1516
use codegraph_parser::{get_ai_pattern_learner, TreeSitterParser};
1617
#[cfg(feature = "ai-enhanced")]
@@ -58,6 +59,7 @@ pub struct FileChange {
5859
#[derive(Clone, Copy, Debug)]
5960
enum SurrealEmbeddingColumn {
6061
Embedding384,
62+
Embedding768,
6163
Embedding1024,
6264
Embedding2048,
6365
Embedding4096,
@@ -67,6 +69,7 @@ impl SurrealEmbeddingColumn {
6769
fn column_name(&self) -> &'static str {
6870
match self {
6971
SurrealEmbeddingColumn::Embedding384 => SURR_EMBEDDING_COLUMN_384,
72+
SurrealEmbeddingColumn::Embedding768 => SURR_EMBEDDING_COLUMN_768,
7073
SurrealEmbeddingColumn::Embedding1024 => SURR_EMBEDDING_COLUMN_1024,
7174
SurrealEmbeddingColumn::Embedding2048 => SURR_EMBEDDING_COLUMN_2048,
7275
SurrealEmbeddingColumn::Embedding4096 => SURR_EMBEDDING_COLUMN_4096,
@@ -76,6 +79,7 @@ impl SurrealEmbeddingColumn {
7679
fn dimension(&self) -> usize {
7780
match self {
7881
SurrealEmbeddingColumn::Embedding384 => 384,
82+
SurrealEmbeddingColumn::Embedding768 => 768,
7983
SurrealEmbeddingColumn::Embedding1024 => 1024,
8084
SurrealEmbeddingColumn::Embedding2048 => 2048,
8185
SurrealEmbeddingColumn::Embedding4096 => 4096,
@@ -494,7 +498,7 @@ impl ProjectIndexer {
494498
let vector_dim = env_vector_dim.unwrap_or(embedder_dimension);
495499
let embedding_column = resolve_surreal_embedding_column(vector_dim).with_context(|| {
496500
format!(
497-
"Unsupported embedding dimension {}. Supported dimensions: 384, 1024, 2048, 4096.",
501+
"Unsupported embedding dimension {}. Supported dimensions: 384, 768, 1024, 2048, 4096.",
498502
vector_dim
499503
)
500504
})?;
@@ -2731,11 +2735,12 @@ fn symbol_embedding_db_batch_size() -> usize {
27312735
fn resolve_surreal_embedding_column(dim: usize) -> Result<SurrealEmbeddingColumn> {
27322736
match dim {
27332737
384 => Ok(SurrealEmbeddingColumn::Embedding384),
2738+
768 => Ok(SurrealEmbeddingColumn::Embedding768),
27342739
1024 => Ok(SurrealEmbeddingColumn::Embedding1024),
27352740
2048 => Ok(SurrealEmbeddingColumn::Embedding2048),
27362741
4096 => Ok(SurrealEmbeddingColumn::Embedding4096),
27372742
other => Err(anyhow!(
2738-
"Unsupported embedding dimension {}. Supported: 384, 1024, 2048, 4096",
2743+
"Unsupported embedding dimension {}. Supported: 384, 768, 1024, 2048, 4096",
27392744
other
27402745
)),
27412746
}

schema/codegraph.surql

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ DEFINE FIELD IF NOT EXISTS start_line ON TABLE nodes TYPE option<int>;
4242
DEFINE FIELD IF NOT EXISTS end_line ON TABLE nodes TYPE option<int>;
4343
DEFINE FIELD IF NOT EXISTS embedding_384 ON TABLE nodes TYPE option<array<float>>
4444
ASSERT array::len($value) = 384;
45+
DEFINE FIELD IF NOT EXISTS embedding_768 ON TABLE nodes TYPE option<array<float>>
46+
ASSERT array::len($value) = 768;
4547
DEFINE FIELD IF NOT EXISTS embedding_1024 ON TABLE nodes TYPE option<array<float>>
4648
ASSERT array::len($value) = 1024;
4749
DEFINE FIELD IF NOT EXISTS embedding_1536 ON TABLE nodes TYPE option<array<float>>
@@ -72,6 +74,8 @@ DEFINE INDEX IF NOT EXISTS idx_nodes_file_path ON TABLE nodes COLUMNS file_path
7274
DEFINE INDEX IF NOT EXISTS idx_nodes_project ON TABLE nodes COLUMNS project_id;
7375
DEFINE INDEX IF NOT EXISTS idx_nodes_embedding_384
7476
ON TABLE nodes FIELDS embedding_384 HNSW DIMENSION 384 DIST COSINE EFC 200 M 16;
77+
DEFINE INDEX IF NOT EXISTS idx_nodes_embedding_768
78+
ON TABLE nodes FIELDS embedding_768 HNSW DIMENSION 768 DIST COSINE EFC 200 M 16;
7579
DEFINE INDEX IF NOT EXISTS idx_nodes_embedding_1024
7680
ON TABLE nodes FIELDS embedding_1024 HNSW DIMENSION 1024 DIST COSINE EFC 200 M 16;
7781
DEFINE INDEX IF NOT EXISTS idx_nodes_embedding_1536
@@ -217,6 +221,8 @@ DEFINE FIELD IF NOT EXISTS project_id ON symbol_embeddings TYPE option<s
217221
DEFINE FIELD IF NOT EXISTS organization_id ON symbol_embeddings TYPE option<string>;
218222
DEFINE FIELD IF NOT EXISTS embedding_384 ON TABLE symbol_embeddings TYPE option<array<float>>
219223
ASSERT array::len($value) = 384;
224+
DEFINE FIELD IF NOT EXISTS embedding_768 ON TABLE symbol_embeddings TYPE option<array<float>>
225+
ASSERT array::len($value) = 768;
220226
DEFINE FIELD IF NOT EXISTS embedding_1024 ON TABLE symbol_embeddings TYPE option<array<float>>
221227
ASSERT array::len($value) = 1024;
222228
DEFINE FIELD IF NOT EXISTS embedding_1536 ON TABLE symbol_embeddings TYPE option<array<float>>
@@ -228,8 +234,11 @@ DEFINE FIELD IF NOT EXISTS embedding_3072 ON TABLE symbol_embeddings TYPE opti
228234
DEFINE FIELD IF NOT EXISTS embedding_4096 ON TABLE symbol_embeddings TYPE option<array<float>>
229235
ASSERT array::len($value) = 4096;
230236
DEFINE FIELD IF NOT EXISTS embedding_384[*] ON symbol_embeddings TYPE float;
237+
DEFINE FIELD IF NOT EXISTS embedding_768[*] ON symbol_embeddings TYPE float;
231238
DEFINE FIELD IF NOT EXISTS embedding_1024[*] ON symbol_embeddings TYPE float;
239+
DEFINE FIELD IF NOT EXISTS embedding_1536[*] ON symbol_embeddings TYPE float;
232240
DEFINE FIELD IF NOT EXISTS embedding_2048[*] ON symbol_embeddings TYPE float;
241+
DEFINE FIELD IF NOT EXISTS embedding_3072[*] ON symbol_embeddings TYPE float;
233242
DEFINE FIELD IF NOT EXISTS embedding_4096[*] ON symbol_embeddings TYPE float;
234243
DEFINE FIELD IF NOT EXISTS embedding_model ON symbol_embeddings TYPE string DEFAULT 'jina-embeddings-v4';
235244
DEFINE FIELD IF NOT EXISTS last_computed_at ON symbol_embeddings TYPE datetime DEFAULT time::now() READONLY;
@@ -245,6 +254,9 @@ DEFINE INDEX IF NOT EXISTS idx_symbol_embeddings_project_symbol
245254
DEFINE INDEX IF NOT EXISTS idx_symbol_embeddings_vector_384
246255
ON symbol_embeddings FIELDS embedding_384
247256
HNSW DIMENSION 384 DIST COSINE EFC 200 M 16;
257+
DEFINE INDEX IF NOT EXISTS idx_symbol_embeddings_vector_768
258+
ON symbol_embeddings FIELDS embedding_768
259+
HNSW DIMENSION 768 DIST COSINE EFC 200 M 16;
248260
DEFINE INDEX IF NOT EXISTS idx_symbol_embeddings_vector_1024
249261
ON symbol_embeddings FIELDS embedding_1024
250262
HNSW DIMENSION 1024 DIST COSINE EFC 200 M 16;

0 commit comments

Comments
 (0)