Skip to content

Commit 156367e

Browse files
committed
Refactor embedding providers to utilize chunking and sanitization
- Introduced a new `prep` module for chunking logic.
- Updated `JinaEmbeddingProvider`, `LocalEmbeddingProvider`, and `OpenAiEmbeddingProvider` to use chunking strategies for preparing text for embeddings.
- Removed redundant text sanitization methods and replaced them with a unified approach in the new chunking logic.
- Enhanced the `prepare_text` method to leverage chunk plans for better token management.
- Updated SurrealDB schema to include new metadata and graph analysis functions.
- Added functions for transitive dependencies, circular dependency detection, and coupling metrics to improve graph analysis capabilities.
1 parent 1c3a3a6 commit 156367e

File tree

11 files changed

+609
-443
lines changed

11 files changed

+609
-443
lines changed

Cargo.lock

Lines changed: 10 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/codegraph-mcp/src/bin/codegraph.rs

Lines changed: 4 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -878,15 +878,15 @@ async fn handle_start(
878878
}
879879

880880
// Create session manager for stateful HTTP connections
881-
let session_manager = Arc::new(LocalSessionManager::new());
881+
let session_manager = Arc::new(LocalSessionManager::default());
882882

883883
// Service factory - creates new CodeGraphMCPServer for each session
884-
let service_factory = Arc::new(|| {
885-
let mut server = codegraph_mcp::official_server::CodeGraphMCPServer::new();
884+
let service_factory = || {
885+
let server = codegraph_mcp::official_server::CodeGraphMCPServer::new();
886886
// Note: initialize_qwen() is async, but service factory must be sync
887887
// Qwen initialization will happen on first use
888888
Ok(server)
889-
});
889+
};
890890

891891
// Configure HTTP server with SSE streaming
892892
let config = StreamableHttpServerConfig {

crates/codegraph-mcp/src/indexer.rs

Lines changed: 11 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -3,11 +3,11 @@
33
use anyhow::Result;
44
#[cfg(feature = "ai-enhanced")]
55
use codegraph_ai::SemanticSearchEngine;
6-
#[cfg(feature = "ai-enhanced")]
7-
use futures::{stream, StreamExt};
86
use codegraph_core::{CodeNode, EdgeRelationship, GraphStore, NodeId, NodeType};
97
use codegraph_graph::{edge::CodeEdge, CodeGraph};
108
use codegraph_parser::{get_ai_pattern_learner, TreeSitterParser};
9+
#[cfg(feature = "ai-enhanced")]
10+
use futures::{stream, StreamExt};
1111
use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
1212
use num_cpus;
1313
use rayon::prelude::*;
@@ -1303,6 +1303,9 @@ impl ProjectIndexer {
13031303
.map(|chunk| chunk.iter().cloned().collect())
13041304
.collect();
13051305

1306+
let max_concurrent = 4; // Parallel batch processing
1307+
let mut processed = 0; // Track progress
1308+
13061309
let mut batch_stream = stream::iter(batches.into_iter().map(|batch| {
13071310
let embedder = embedder;
13081311
async move {
@@ -1315,7 +1318,8 @@ impl ProjectIndexer {
13151318
while let Some((batch, result)) = batch_stream.next().await {
13161319
match result {
13171320
Ok(batch_embeddings) => {
1318-
for (symbol, embedding) in batch.iter().cloned().zip(batch_embeddings.into_iter())
1321+
for (symbol, embedding) in
1322+
batch.iter().cloned().zip(batch_embeddings.into_iter())
13191323
{
13201324
embeddings.insert(symbol, embedding);
13211325
processed += 1;
@@ -1408,6 +1412,8 @@ impl ProjectIndexer {
14081412
.map(|chunk| chunk.iter().cloned().collect())
14091413
.collect();
14101414

1415+
let max_concurrent = 4; // Parallel batch processing
1416+
14111417
let mut batch_stream = stream::iter(batches.into_iter().map(|batch| {
14121418
let embedder = embedder;
14131419
async move {
@@ -1420,7 +1426,8 @@ impl ProjectIndexer {
14201426
while let Some((batch, result)) = batch_stream.next().await {
14211427
match result {
14221428
Ok(batch_embeddings) => {
1423-
for (symbol, embedding) in batch.iter().cloned().zip(batch_embeddings.into_iter())
1429+
for (symbol, embedding) in
1430+
batch.iter().cloned().zip(batch_embeddings.into_iter())
14241431
{
14251432
embeddings.insert(symbol, embedding);
14261433
}

crates/codegraph-vector/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -46,6 +46,7 @@ reqwest = { workspace = true, optional = true }
4646
# ONNX Runtime provider (optional)
4747
ort = { version = "2.0.0-rc.10", optional = true, default-features = false, features = ["std", "ndarray", "download-binaries"] }
4848
semchunk-rs = "0.1.1"
49+
fxhash = "0.2.1"
4950

5051

5152

crates/codegraph-vector/src/embedding.rs

Lines changed: 70 additions & 92 deletions
Original file line number · Diff line number · Diff line change
@@ -1,14 +1,11 @@
11
#[cfg(any(feature = "local-embeddings", feature = "openai", feature = "onnx"))]
22
use crate::embeddings::generator::TextEmbeddingEngine;
3+
use crate::prep::chunker::{
4+
aggregate_chunk_embeddings, build_chunk_plan, ChunkPlan, ChunkerConfig, SanitizeMode,
5+
};
36
use codegraph_core::{CodeGraphError, CodeNode, Result};
4-
#[cfg(any(
5-
feature = "local-embeddings",
6-
feature = "openai",
7-
feature = "onnx",
8-
feature = "ollama",
9-
feature = "jina"
10-
))]
11-
use std::sync::Arc;
7+
use std::{path::PathBuf, sync::Arc};
8+
use tokenizers::Tokenizer;
129

1310
pub struct EmbeddingGenerator {
1411
model_config: ModelConfig,
@@ -18,6 +15,7 @@ pub struct EmbeddingGenerator {
1815
ollama_provider: Option<crate::ollama_embedding_provider::OllamaEmbeddingProvider>,
1916
#[cfg(feature = "jina")]
2017
jina_provider: Option<crate::jina_provider::JinaEmbeddingProvider>,
18+
tokenizer: Arc<Tokenizer>,
2119
}
2220

2321
#[derive(Debug, Clone)]
@@ -39,6 +37,17 @@ impl Default for ModelConfig {
3937

4038
impl EmbeddingGenerator {
4139
pub fn new(config: ModelConfig) -> Self {
40+
let tokenizer_path = PathBuf::from(concat!(
41+
env!("CARGO_MANIFEST_DIR"),
42+
"/tokenizers/qwen2.5-coder.json"
43+
));
44+
let tokenizer = Tokenizer::from_file(&tokenizer_path).unwrap_or_else(|e| {
45+
panic!(
46+
"Failed to load tokenizer from {:?}: {}. This tokenizer is required for chunking.",
47+
tokenizer_path, e
48+
)
49+
});
50+
4251
Self {
4352
model_config: config,
4453
#[cfg(any(feature = "local-embeddings", feature = "openai", feature = "onnx"))]
@@ -47,6 +56,7 @@ impl EmbeddingGenerator {
4756
ollama_provider: None,
4857
#[cfg(feature = "jina")]
4958
jina_provider: None,
59+
tokenizer: Arc::new(tokenizer),
5060
}
5161
}
5262

@@ -76,6 +86,16 @@ impl EmbeddingGenerator {
7686
}
7787
}
7888

89+
fn chunker_config(&self) -> ChunkerConfig {
90+
ChunkerConfig::new(self.model_config.max_tokens)
91+
.sanitize_mode(SanitizeMode::AsciiFastPath)
92+
.cache_capacity(2048)
93+
}
94+
95+
fn build_plan_for_nodes(&self, nodes: &[CodeNode]) -> ChunkPlan {
96+
build_chunk_plan(nodes, Arc::clone(&self.tokenizer), self.chunker_config())
97+
}
98+
7999
pub fn dimension(&self) -> usize {
80100
self.model_config.dimension
81101
}
@@ -247,11 +267,16 @@ impl EmbeddingGenerator {
247267
}
248268

249269
pub async fn generate_embedding(&self, node: &CodeNode) -> Result<Vec<f32>> {
250-
let text = self.prepare_text(node);
251-
self.encode_text(&text).await
270+
let mut embeddings = self.generate_embeddings(std::slice::from_ref(node)).await?;
271+
embeddings
272+
.pop()
273+
.ok_or_else(|| CodeGraphError::Vector("No embedding generated".to_string()))
252274
}
253275

254276
pub async fn generate_embeddings(&self, nodes: &[CodeNode]) -> Result<Vec<Vec<f32>>> {
277+
if nodes.is_empty() {
278+
return Ok(Vec::new());
279+
}
255280
// Prefer Jina provider for batch processing (cloud-based embeddings)
256281
#[cfg(feature = "jina")]
257282
if let Some(jina) = &self.jina_provider {
@@ -294,31 +319,51 @@ impl EmbeddingGenerator {
294319

295320
#[cfg(any(feature = "local-embeddings", feature = "openai", feature = "onnx"))]
296321
if let Some(engine) = &self.advanced {
297-
// Use provider's batched path when available
298-
let texts: Vec<String> = nodes.iter().map(|n| self.prepare_text(n)).collect();
322+
let plan = self.build_plan_for_nodes(nodes);
323+
tracing::info!(
324+
target: "codegraph_vector::embeddings",
325+
"Advanced engine chunk plan: {} nodes -> {} chunks",
326+
plan.stats.total_nodes,
327+
plan.stats.total_chunks
328+
);
329+
let chunk_to_node = plan.chunk_to_node();
330+
let chunk_texts: Vec<String> =
331+
plan.chunks.into_iter().map(|chunk| chunk.text).collect();
299332
tracing::info!(
300333
target: "codegraph_vector::embeddings",
301-
"Using advanced embedding engine for batch: {} items",
302-
texts.len()
334+
"Using advanced embedding engine for batch: {} chunks",
335+
chunk_texts.len()
303336
);
304-
let embs = engine.embed_many(&texts).await?;
305-
if embs.len() != texts.len() {
337+
let chunk_embeddings = engine.embed_many(&chunk_texts).await?;
338+
if chunk_embeddings.len() != chunk_texts.len() {
306339
return Err(CodeGraphError::Vector(format!(
307340
"provider returned {} embeddings for {} inputs",
308-
embs.len(),
309-
texts.len()
341+
chunk_embeddings.len(),
342+
chunk_texts.len()
310343
)));
311344
}
312-
return Ok(embs);
345+
let aggregated = aggregate_chunk_embeddings(
346+
nodes.len(),
347+
&chunk_to_node,
348+
chunk_embeddings,
349+
self.dimension(),
350+
);
351+
return Ok(aggregated);
313352
}
314353

315-
// Fallback: sequential deterministic embeddings
316-
let mut embeddings = Vec::with_capacity(nodes.len());
317-
for node in nodes {
318-
let embedding = self.generate_embedding(node).await?;
319-
embeddings.push(embedding);
354+
// Fallback: sequential deterministic embeddings with chunking
355+
let plan = self.build_plan_for_nodes(nodes);
356+
let chunk_to_node = plan.chunk_to_node();
357+
let mut chunk_embeddings = Vec::with_capacity(plan.chunks.len());
358+
for chunk in plan.chunks {
359+
chunk_embeddings.push(self.encode_text(&chunk.text).await?);
320360
}
321-
Ok(embeddings)
361+
Ok(aggregate_chunk_embeddings(
362+
nodes.len(),
363+
&chunk_to_node,
364+
chunk_embeddings,
365+
self.dimension(),
366+
))
322367
}
323368

324369
/// Generate an embedding directly from free text. Useful for query embeddings.
@@ -349,37 +394,6 @@ impl EmbeddingGenerator {
349394
Ok(embeddings)
350395
}
351396

352-
fn prepare_text(&self, node: &CodeNode) -> String {
353-
let mut text = format!(
354-
"{} {} {}",
355-
node.language
356-
.as_ref()
357-
.map_or("unknown".to_string(), language_to_string),
358-
node.node_type
359-
.as_ref()
360-
.map_or("unknown".to_string(), node_type_to_string),
361-
node.name.as_str()
362-
);
363-
364-
if let Some(content) = &node.content {
365-
text.push(' ');
366-
text.push_str(content);
367-
}
368-
369-
if text.len() > self.model_config.max_tokens * 4 {
370-
let mut new_len = self.model_config.max_tokens * 4;
371-
if new_len > text.len() {
372-
new_len = text.len();
373-
}
374-
while new_len > 0 && !text.is_char_boundary(new_len) {
375-
new_len -= 1;
376-
}
377-
text.truncate(new_len);
378-
}
379-
380-
text
381-
}
382-
383397
async fn encode_text(&self, text: &str) -> Result<Vec<f32>> {
384398
// Prefer Jina provider when available (cloud code embeddings with code.query task)
385399
#[cfg(feature = "jina")]
@@ -450,39 +464,3 @@ fn simple_hash(text: &str) -> u32 {
450464
}
451465
hash
452466
}
453-
454-
fn language_to_string(lang: &codegraph_core::Language) -> String {
455-
match lang {
456-
codegraph_core::Language::Rust => "rust".to_string(),
457-
codegraph_core::Language::TypeScript => "typescript".to_string(),
458-
codegraph_core::Language::JavaScript => "javascript".to_string(),
459-
codegraph_core::Language::Python => "python".to_string(),
460-
codegraph_core::Language::Go => "go".to_string(),
461-
codegraph_core::Language::Java => "java".to_string(),
462-
codegraph_core::Language::Cpp => "cpp".to_string(),
463-
// Revolutionary universal language support
464-
codegraph_core::Language::Swift => "swift".to_string(),
465-
codegraph_core::Language::Kotlin => "kotlin".to_string(),
466-
codegraph_core::Language::CSharp => "csharp".to_string(),
467-
codegraph_core::Language::Ruby => "ruby".to_string(),
468-
codegraph_core::Language::Php => "php".to_string(),
469-
codegraph_core::Language::Dart => "dart".to_string(),
470-
codegraph_core::Language::Other(name) => name.clone(),
471-
}
472-
}
473-
474-
fn node_type_to_string(node_type: &codegraph_core::NodeType) -> String {
475-
match node_type {
476-
codegraph_core::NodeType::Function => "function".to_string(),
477-
codegraph_core::NodeType::Struct => "struct".to_string(),
478-
codegraph_core::NodeType::Enum => "enum".to_string(),
479-
codegraph_core::NodeType::Trait => "trait".to_string(),
480-
codegraph_core::NodeType::Module => "module".to_string(),
481-
codegraph_core::NodeType::Variable => "variable".to_string(),
482-
codegraph_core::NodeType::Import => "import".to_string(),
483-
codegraph_core::NodeType::Class => "class".to_string(),
484-
codegraph_core::NodeType::Interface => "interface".to_string(),
485-
codegraph_core::NodeType::Type => "type".to_string(),
486-
codegraph_core::NodeType::Other(name) => name.clone(),
487-
}
488-
}

0 commit comments

Comments (0)