11#[ cfg( any( feature = "local-embeddings" , feature = "openai" , feature = "onnx" ) ) ]
22use crate :: embeddings:: generator:: TextEmbeddingEngine ;
3+ use crate :: prep:: chunker:: {
4+ aggregate_chunk_embeddings, build_chunk_plan, ChunkPlan , ChunkerConfig , SanitizeMode ,
5+ } ;
36use codegraph_core:: { CodeGraphError , CodeNode , Result } ;
4- #[ cfg( any(
5- feature = "local-embeddings" ,
6- feature = "openai" ,
7- feature = "onnx" ,
8- feature = "ollama" ,
9- feature = "jina"
10- ) ) ]
11- use std:: sync:: Arc ;
7+ use std:: { path:: PathBuf , sync:: Arc } ;
8+ use tokenizers:: Tokenizer ;
129
1310pub struct EmbeddingGenerator {
1411 model_config : ModelConfig ,
@@ -18,6 +15,7 @@ pub struct EmbeddingGenerator {
1815 ollama_provider : Option < crate :: ollama_embedding_provider:: OllamaEmbeddingProvider > ,
1916 #[ cfg( feature = "jina" ) ]
2017 jina_provider : Option < crate :: jina_provider:: JinaEmbeddingProvider > ,
18+ tokenizer : Arc < Tokenizer > ,
2119}
2220
2321#[ derive( Debug , Clone ) ]
@@ -39,6 +37,17 @@ impl Default for ModelConfig {
3937
4038impl EmbeddingGenerator {
4139 pub fn new ( config : ModelConfig ) -> Self {
40+ let tokenizer_path = PathBuf :: from ( concat ! (
41+ env!( "CARGO_MANIFEST_DIR" ) ,
42+ "/tokenizers/qwen2.5-coder.json"
43+ ) ) ;
44+ let tokenizer = Tokenizer :: from_file ( & tokenizer_path) . unwrap_or_else ( |e| {
45+ panic ! (
46+ "Failed to load tokenizer from {:?}: {}. This tokenizer is required for chunking." ,
47+ tokenizer_path, e
48+ )
49+ } ) ;
50+
4251 Self {
4352 model_config : config,
4453 #[ cfg( any( feature = "local-embeddings" , feature = "openai" , feature = "onnx" ) ) ]
@@ -47,6 +56,7 @@ impl EmbeddingGenerator {
4756 ollama_provider : None ,
4857 #[ cfg( feature = "jina" ) ]
4958 jina_provider : None ,
59+ tokenizer : Arc :: new ( tokenizer) ,
5060 }
5161 }
5262
@@ -76,6 +86,16 @@ impl EmbeddingGenerator {
7686 }
7787 }
7888
89+ fn chunker_config ( & self ) -> ChunkerConfig {
90+ ChunkerConfig :: new ( self . model_config . max_tokens )
91+ . sanitize_mode ( SanitizeMode :: AsciiFastPath )
92+ . cache_capacity ( 2048 )
93+ }
94+
95+ fn build_plan_for_nodes ( & self , nodes : & [ CodeNode ] ) -> ChunkPlan {
96+ build_chunk_plan ( nodes, Arc :: clone ( & self . tokenizer ) , self . chunker_config ( ) )
97+ }
98+
7999 pub fn dimension ( & self ) -> usize {
80100 self . model_config . dimension
81101 }
@@ -247,11 +267,16 @@ impl EmbeddingGenerator {
247267 }
248268
249269 pub async fn generate_embedding ( & self , node : & CodeNode ) -> Result < Vec < f32 > > {
250- let text = self . prepare_text ( node) ;
251- self . encode_text ( & text) . await
270+ let mut embeddings = self . generate_embeddings ( std:: slice:: from_ref ( node) ) . await ?;
271+ embeddings
272+ . pop ( )
273+ . ok_or_else ( || CodeGraphError :: Vector ( "No embedding generated" . to_string ( ) ) )
252274 }
253275
254276 pub async fn generate_embeddings ( & self , nodes : & [ CodeNode ] ) -> Result < Vec < Vec < f32 > > > {
277+ if nodes. is_empty ( ) {
278+ return Ok ( Vec :: new ( ) ) ;
279+ }
255280 // Prefer Jina provider for batch processing (cloud-based embeddings)
256281 #[ cfg( feature = "jina" ) ]
257282 if let Some ( jina) = & self . jina_provider {
@@ -294,31 +319,51 @@ impl EmbeddingGenerator {
294319
295320 #[ cfg( any( feature = "local-embeddings" , feature = "openai" , feature = "onnx" ) ) ]
296321 if let Some ( engine) = & self . advanced {
297- // Use provider's batched path when available
298- let texts: Vec < String > = nodes. iter ( ) . map ( |n| self . prepare_text ( n) ) . collect ( ) ;
322+ let plan = self . build_plan_for_nodes ( nodes) ;
323+ tracing:: info!(
324+ target: "codegraph_vector::embeddings" ,
325+ "Advanced engine chunk plan: {} nodes -> {} chunks" ,
326+ plan. stats. total_nodes,
327+ plan. stats. total_chunks
328+ ) ;
329+ let chunk_to_node = plan. chunk_to_node ( ) ;
330+ let chunk_texts: Vec < String > =
331+ plan. chunks . into_iter ( ) . map ( |chunk| chunk. text ) . collect ( ) ;
299332 tracing:: info!(
300333 target: "codegraph_vector::embeddings" ,
301- "Using advanced embedding engine for batch: {} items " ,
302- texts . len( )
334+ "Using advanced embedding engine for batch: {} chunks " ,
335+ chunk_texts . len( )
303336 ) ;
304- let embs = engine. embed_many ( & texts ) . await ?;
305- if embs . len ( ) != texts . len ( ) {
337+ let chunk_embeddings = engine. embed_many ( & chunk_texts ) . await ?;
338+ if chunk_embeddings . len ( ) != chunk_texts . len ( ) {
306339 return Err ( CodeGraphError :: Vector ( format ! (
307340 "provider returned {} embeddings for {} inputs" ,
308- embs . len( ) ,
309- texts . len( )
341+ chunk_embeddings . len( ) ,
342+ chunk_texts . len( )
310343 ) ) ) ;
311344 }
312- return Ok ( embs) ;
345+ let aggregated = aggregate_chunk_embeddings (
346+ nodes. len ( ) ,
347+ & chunk_to_node,
348+ chunk_embeddings,
349+ self . dimension ( ) ,
350+ ) ;
351+ return Ok ( aggregated) ;
313352 }
314353
315- // Fallback: sequential deterministic embeddings
316- let mut embeddings = Vec :: with_capacity ( nodes. len ( ) ) ;
317- for node in nodes {
318- let embedding = self . generate_embedding ( node) . await ?;
319- embeddings. push ( embedding) ;
354+ // Fallback: sequential deterministic embeddings with chunking
355+ let plan = self . build_plan_for_nodes ( nodes) ;
356+ let chunk_to_node = plan. chunk_to_node ( ) ;
357+ let mut chunk_embeddings = Vec :: with_capacity ( plan. chunks . len ( ) ) ;
358+ for chunk in plan. chunks {
359+ chunk_embeddings. push ( self . encode_text ( & chunk. text ) . await ?) ;
320360 }
321- Ok ( embeddings)
361+ Ok ( aggregate_chunk_embeddings (
362+ nodes. len ( ) ,
363+ & chunk_to_node,
364+ chunk_embeddings,
365+ self . dimension ( ) ,
366+ ) )
322367 }
323368
324369 /// Generate an embedding directly from free text. Useful for query embeddings.
@@ -349,37 +394,6 @@ impl EmbeddingGenerator {
349394 Ok ( embeddings)
350395 }
351396
352- fn prepare_text ( & self , node : & CodeNode ) -> String {
353- let mut text = format ! (
354- "{} {} {}" ,
355- node. language
356- . as_ref( )
357- . map_or( "unknown" . to_string( ) , language_to_string) ,
358- node. node_type
359- . as_ref( )
360- . map_or( "unknown" . to_string( ) , node_type_to_string) ,
361- node. name. as_str( )
362- ) ;
363-
364- if let Some ( content) = & node. content {
365- text. push ( ' ' ) ;
366- text. push_str ( content) ;
367- }
368-
369- if text. len ( ) > self . model_config . max_tokens * 4 {
370- let mut new_len = self . model_config . max_tokens * 4 ;
371- if new_len > text. len ( ) {
372- new_len = text. len ( ) ;
373- }
374- while new_len > 0 && !text. is_char_boundary ( new_len) {
375- new_len -= 1 ;
376- }
377- text. truncate ( new_len) ;
378- }
379-
380- text
381- }
382-
383397 async fn encode_text ( & self , text : & str ) -> Result < Vec < f32 > > {
384398 // Prefer Jina provider when available (cloud code embeddings with code.query task)
385399 #[ cfg( feature = "jina" ) ]
@@ -450,39 +464,3 @@ fn simple_hash(text: &str) -> u32 {
450464 }
451465 hash
452466}
453-
454- fn language_to_string ( lang : & codegraph_core:: Language ) -> String {
455- match lang {
456- codegraph_core:: Language :: Rust => "rust" . to_string ( ) ,
457- codegraph_core:: Language :: TypeScript => "typescript" . to_string ( ) ,
458- codegraph_core:: Language :: JavaScript => "javascript" . to_string ( ) ,
459- codegraph_core:: Language :: Python => "python" . to_string ( ) ,
460- codegraph_core:: Language :: Go => "go" . to_string ( ) ,
461- codegraph_core:: Language :: Java => "java" . to_string ( ) ,
462- codegraph_core:: Language :: Cpp => "cpp" . to_string ( ) ,
463- // Revolutionary universal language support
464- codegraph_core:: Language :: Swift => "swift" . to_string ( ) ,
465- codegraph_core:: Language :: Kotlin => "kotlin" . to_string ( ) ,
466- codegraph_core:: Language :: CSharp => "csharp" . to_string ( ) ,
467- codegraph_core:: Language :: Ruby => "ruby" . to_string ( ) ,
468- codegraph_core:: Language :: Php => "php" . to_string ( ) ,
469- codegraph_core:: Language :: Dart => "dart" . to_string ( ) ,
470- codegraph_core:: Language :: Other ( name) => name. clone ( ) ,
471- }
472- }
473-
474- fn node_type_to_string ( node_type : & codegraph_core:: NodeType ) -> String {
475- match node_type {
476- codegraph_core:: NodeType :: Function => "function" . to_string ( ) ,
477- codegraph_core:: NodeType :: Struct => "struct" . to_string ( ) ,
478- codegraph_core:: NodeType :: Enum => "enum" . to_string ( ) ,
479- codegraph_core:: NodeType :: Trait => "trait" . to_string ( ) ,
480- codegraph_core:: NodeType :: Module => "module" . to_string ( ) ,
481- codegraph_core:: NodeType :: Variable => "variable" . to_string ( ) ,
482- codegraph_core:: NodeType :: Import => "import" . to_string ( ) ,
483- codegraph_core:: NodeType :: Class => "class" . to_string ( ) ,
484- codegraph_core:: NodeType :: Interface => "interface" . to_string ( ) ,
485- codegraph_core:: NodeType :: Type => "type" . to_string ( ) ,
486- codegraph_core:: NodeType :: Other ( name) => name. clone ( ) ,
487- }
488- }
0 commit comments