Skip to content

Commit d2c4135

Browse files
committed
fix: allow multiple input sources and ensure sequence extraction in omics QA
1 parent 741ebd3 commit d2c4135

File tree

6 files changed

+192
-52
lines changed

6 files changed

+192
-52
lines changed

examples/generate/generate_omics_qa/omics_qa_config.yaml

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@ nodes:
1010
dependencies: []
1111
params:
1212
input_path:
13-
# For DNA: examples/input_examples/search_dna_demo.jsonl
14-
# For RNA: examples/input_examples/search_rna_demo.jsonl
15-
# For Protein: examples/input_examples/search_protein_demo.jsonl
16-
- examples/input_examples/search_protein_demo.jsonl # Change this to dna/rna/protein demo file as needed
13+
# Three input files to generate DNA, RNA, and protein data together
14+
- examples/input_examples/search_dna_demo.jsonl
15+
- examples/input_examples/search_rna_demo.jsonl
16+
- examples/input_examples/search_protein_demo.jsonl
1717

1818
- id: search_data
1919
op_name: search
@@ -24,25 +24,26 @@ nodes:
2424
replicas: 1
2525
batch_size: 10
2626
params:
27-
data_sources: [uniprot] # Change to [ncbi] for DNA or [rnacentral] for RNA
27+
data_sources: [ncbi, rnacentral, uniprot] # Multi-omics: use all three data sources
2828
# DNA search parameters
2929
ncbi_params:
3030
email: your_email@example.com # Required for NCBI
3131
tool: GraphGen
3232
use_local_blast: true
33-
local_blast_db: refseq_release/refseq_release
33+
local_blast_db: databases/refseq_232_old/refseq_232
3434
blast_num_threads: 2
3535
max_concurrent: 5
3636
# RNA search parameters
3737
rnacentral_params:
3838
use_local_blast: true
39-
local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD
39+
local_blast_db: databases/rnacentral_merged_20251213/rnacentral_merged_20251213
4040
blast_num_threads: 2
4141
max_concurrent: 5
4242
# Protein search parameters
4343
uniprot_params:
4444
use_local_blast: true
45-
local_blast_db: ${RELEASE}/uniprot_sprot
45+
# local_blast_db: ${RELEASE}/uniprot_sprot
46+
local_blast_db: databases/2025_04/uniprot_sprot
4647
blast_num_threads: 2
4748
max_concurrent: 5
4849

@@ -76,7 +77,7 @@ nodes:
7677
params:
7778
method: anchor_bfs # partition method
7879
method_params:
79-
anchor_type: protein # node type (dna, rna, or protein)
80+
anchor_type: [dna, rna, protein] # Multi-omics: support multiple anchor types (list or single string)
8081
max_units_per_community: 10 # max nodes and edges per community
8182

8283
- id: generate

graphgen/models/generator/omics_qa_generator.py

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -230,28 +230,36 @@ async def generate(
230230
# Detect molecule type from nodes
231231
molecule_type = self._detect_molecule_type(nodes)
232232

233-
# Extract caption for each node and attach to QA pairs
234-
# Only attach caption once per batch (from the first relevant node)
233+
# Extract captions for all molecule types from nodes
234+
captions = {"dna": None, "rna": None, "protein": None}
235235
caption_attached = False
236+
236237
for node in nodes:
237238
node_id, node_data = node
238-
caption = self._extract_caption(node_data, molecule_type)
239239

240-
if caption and not caption_attached:
241-
# Attach caption to all QA pairs
242-
for qa in qa_pairs.values():
243-
# Use molecule_type as the key (dna, rna, or protein)
244-
qa[molecule_type] = caption
245-
caption_attached = True
246-
break # Only need to attach once per batch
240+
# Check for pre-extracted captions (from partition_service)
241+
for mol_type in ["dna", "rna", "protein"]:
242+
caption_key = f"{mol_type}_caption"
243+
if caption_key in node_data and node_data[caption_key]:
244+
captions[mol_type] = node_data[caption_key]
245+
caption_attached = True
246+
247+
# If no pre-extracted captions, extract from node_data using the detected molecule_type
248+
if not caption_attached:
249+
caption = self._extract_caption(node_data, molecule_type)
250+
if caption:
251+
captions[molecule_type] = caption
252+
caption_attached = True
253+
break # Only need to extract once per batch
254+
255+
# Attach all captions to QA pairs
256+
for qa in qa_pairs.values():
257+
qa["dna"] = captions["dna"] if captions["dna"] else ""
258+
qa["rna"] = captions["rna"] if captions["rna"] else ""
259+
qa["protein"] = captions["protein"] if captions["protein"] else ""
247260

248261
if not caption_attached:
249262
logger.warning(f"No caption extracted for molecule_type={molecule_type}. Node data sample: {dict(list(nodes[0][1].items())[:5]) if nodes else 'No nodes'}")
250-
# Still attach empty captions to maintain format consistency
251-
for qa in qa_pairs.values():
252-
qa.setdefault("dna", "")
253-
qa.setdefault("rna", "")
254-
qa.setdefault("protein", "")
255263

256264
result.update(qa_pairs)
257265
return result

graphgen/models/kg_builder/omics_kg_builder.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,12 +179,17 @@ async def merge_nodes(
179179
set([dp["source_id"] for dp in node_data] + source_ids)
180180
)
181181

182-
node_data = {
182+
node_data_dict = {
183183
"entity_type": entity_type,
184184
"description": description,
185185
"source_id": source_id,
186186
}
187-
kg_instance.upsert_node(entity_name, node_data=node_data)
187+
188+
# Preserve sequence from existing node if present (e.g., added by partition_service)
189+
if node is not None and "sequence" in node and node["sequence"]:
190+
node_data_dict["sequence"] = node["sequence"]
191+
192+
kg_instance.upsert_node(entity_name, node_data=node_data_dict)
188193

189194
async def merge_edges(
190195
self,
@@ -194,6 +199,12 @@ async def merge_edges(
194199
"""Merge extracted edges into the knowledge graph."""
195200
(src_id, tgt_id), edge_data = edges_data
196201

202+
# Skip self-loops (edges where source and target are the same)
203+
# This can happen when the LLM extracts invalid relationships
204+
if src_id == tgt_id:
205+
logger.debug("Skipping self-loop edge: (%s, %s)", src_id, tgt_id)
206+
return
207+
197208
source_ids = []
198209
descriptions = []
199210

graphgen/models/partitioner/anchor_bfs_partitioner.py

Lines changed: 72 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import random
22
from collections import deque
3-
from typing import Any, Iterable, List, Literal, Set, Tuple
3+
from typing import Any, Iterable, List, Literal, Set, Tuple, Union
44

55
from graphgen.bases import BaseGraphStorage
66
from graphgen.bases.datatypes import Community
@@ -18,16 +18,26 @@ class AnchorBFSPartitioner(BFSPartitioner):
1818
2. Expand the community using BFS until the max unit size is reached.(A unit is a node or an edge.)
1919
3. Non-anchor units can only be "pulled" into a community and never become seeds themselves.
2020
For example, for VQA tasks, we may want to use image nodes as anchors and expand to nearby text nodes and edges.
21+
22+
Supports multiple anchor types for multi-omics data: anchor_type can be a single string or a list of strings.
23+
When a list is provided, nodes matching any of the types in the list can serve as anchors.
2124
"""
2225

2326
def __init__(
2427
self,
2528
*,
26-
anchor_type: Literal["image"] = "image",
29+
anchor_type: Union[
30+
Literal["image", "dna", "rna", "protein"],
31+
List[Literal["dna", "rna", "protein"]],
32+
] = "image",
2733
anchor_ids: Set[str] | None = None,
2834
) -> None:
2935
super().__init__()
30-
self.anchor_type = anchor_type
36+
# Normalize anchor_type to always be a list for internal processing
37+
if isinstance(anchor_type, str):
38+
self.anchor_types = [anchor_type]
39+
else:
40+
self.anchor_types = list(anchor_type)
3141
self.anchor_ids = anchor_ids
3242

3343
def partition(
@@ -68,10 +78,53 @@ def _pick_anchor_ids(
6878
return self.anchor_ids
6979

7080
anchor_ids: Set[str] = set()
81+
anchor_types_lower = [at.lower() for at in self.anchor_types]
82+
7183
for node_id, meta in nodes:
84+
# Check if node matches any of the anchor types
85+
matched = False
86+
87+
# Check 1: entity_type (for image, etc.)
7288
node_type = str(meta.get("entity_type", "")).lower()
73-
if self.anchor_type.lower() in node_type:
89+
for anchor_type_lower in anchor_types_lower:
90+
if anchor_type_lower in node_type:
91+
anchor_ids.add(node_id)
92+
matched = True
93+
break
94+
95+
if matched:
96+
continue
97+
98+
# Check 2: molecule_type (for omics data: dna, rna, protein)
99+
molecule_type = str(meta.get("molecule_type", "")).lower()
100+
if molecule_type in anchor_types_lower:
74101
anchor_ids.add(node_id)
102+
continue
103+
104+
# Check 3: source_id prefix (for omics data: dna-, rna-, protein-)
105+
source_id = str(meta.get("source_id", "")).lower()
106+
for anchor_type_lower in anchor_types_lower:
107+
if source_id.startswith(f"{anchor_type_lower}-"):
108+
anchor_ids.add(node_id)
109+
matched = True
110+
break
111+
112+
if matched:
113+
continue
114+
115+
# Check 4: Check if source_id contains multiple IDs separated by <SEP>
116+
if "<sep>" in source_id:
117+
source_ids = source_id.split("<sep>")
118+
for sid in source_ids:
119+
sid = sid.strip()
120+
for anchor_type_lower in anchor_types_lower:
121+
if sid.startswith(f"{anchor_type_lower}-"):
122+
anchor_ids.add(node_id)
123+
matched = True
124+
break
125+
if matched:
126+
break
127+
75128
return anchor_ids
76129

77130
@staticmethod
@@ -113,7 +166,21 @@ def _grow_community(
113166
if it in used_e:
114167
continue
115168
used_e.add(it)
116-
u, v = it
169+
# Convert frozenset to tuple for edge representation
170+
# Note: Self-loops should be filtered during graph construction,
171+
# but we handle edge cases defensively
172+
try:
173+
u, v = tuple(it)
174+
except ValueError:
175+
# Handle edge case: frozenset with unexpected number of elements
176+
# This should not happen if graph construction is correct
177+
edge_nodes = list(it)
178+
if len(edge_nodes) == 1:
179+
# Self-loop edge (should have been filtered during graph construction)
180+
u, v = edge_nodes[0], edge_nodes[0]
181+
else:
182+
# Invalid edge, skip it
183+
continue
117184
comm_e.append((u, v))
118185
cnt += 1
119186
for n in it:

graphgen/operators/partition/partition_service.py

Lines changed: 69 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,13 @@ def partition(self) -> Iterable[pd.DataFrame]:
6464
partitioner = LeidenPartitioner()
6565
elif method == "anchor_bfs":
6666
logger.info("Partitioning knowledge graph using Anchor BFS method.")
67+
anchor_type = method_params.get("anchor_type")
68+
if isinstance(anchor_type, list):
69+
logger.info("Using multiple anchor types: %s", anchor_type)
70+
else:
71+
logger.info("Using single anchor type: %s", anchor_type)
6772
partitioner = AnchorBFSPartitioner(
68-
anchor_type=method_params.get("anchor_type"),
73+
anchor_type=anchor_type,
6974
anchor_ids=set(method_params.get("anchor_ids", []))
7075
if method_params.get("anchor_ids")
7176
else None,
@@ -187,41 +192,86 @@ def _attach_additional_data_to_node(self, batch: tuple) -> tuple:
187192
logger.warning("No chunks found for node %s (type: %s) with source_ids: %s", node_id, molecule_type, source_ids)
188193
continue
189194

190-
first_chunk = omics_chunks[0]
191-
def get_chunk_value(field: str):
195+
def get_chunk_value(chunk: dict, field: str):
192196
# First check root level of chunk
193-
if field in first_chunk:
194-
return first_chunk[field]
197+
if field in chunk:
198+
return chunk[field]
195199
# Then check metadata if it exists and is a dict
196-
chunk_metadata = first_chunk.get("metadata")
200+
chunk_metadata = chunk.get("metadata")
197201
if isinstance(chunk_metadata, dict) and field in chunk_metadata:
198202
return chunk_metadata[field]
199203
return None
200204

201-
# Attach sequence if not already present
205+
# Group chunks by molecule type to preserve all types of sequences
206+
chunks_by_type = {"dna": [], "rna": [], "protein": []}
207+
for chunk in omics_chunks:
208+
chunk_id = chunk.get("_chunk_id", "").lower()
209+
if chunk_id.startswith("dna-"):
210+
chunks_by_type["dna"].append(chunk)
211+
elif chunk_id.startswith("rna-"):
212+
chunks_by_type["rna"].append(chunk)
213+
elif chunk_id.startswith("protein-"):
214+
chunks_by_type["protein"].append(chunk)
215+
216+
# Field mappings for each molecule type
217+
field_mapping = {
218+
"protein": ["protein_name", "gene_names", "organism", "function", "sequence", "id", "database", "entry_name", "uniprot_id"],
219+
"dna": ["gene_name", "gene_description", "organism", "chromosome", "genomic_location", "function", "gene_type", "sequence", "id", "database"],
220+
"rna": ["rna_type", "description", "organism", "related_genes", "gene_name", "so_term", "sequence", "id", "database", "rnacentral_id"],
221+
}
222+
223+
# Extract and store captions for each molecule type
224+
for mol_type in ["dna", "rna", "protein"]:
225+
type_chunks = chunks_by_type[mol_type]
226+
if not type_chunks:
227+
continue
228+
229+
# Use the first chunk of this type
230+
type_chunk = type_chunks[0]
231+
caption = {}
232+
233+
# Extract all relevant fields for this molecule type
234+
for field in field_mapping.get(mol_type, []):
235+
value = get_chunk_value(type_chunk, field)
236+
if value:
237+
caption[field] = value
238+
239+
# Store caption if it has any data
240+
if caption:
241+
caption_key = f"{mol_type}_caption"
242+
node_data[caption_key] = caption
243+
logger.debug("Stored %s caption for node %s with %d fields", mol_type, node_id, len(caption))
244+
245+
# For backward compatibility, also attach sequence and other fields from the primary molecule type
246+
# Use the detected molecule_type or default to the first available type
247+
primary_chunk = None
248+
if chunks_by_type.get(molecule_type):
249+
primary_chunk = chunks_by_type[molecule_type][0]
250+
elif chunks_by_type["dna"]:
251+
primary_chunk = chunks_by_type["dna"][0]
252+
elif chunks_by_type["rna"]:
253+
primary_chunk = chunks_by_type["rna"][0]
254+
elif chunks_by_type["protein"]:
255+
primary_chunk = chunks_by_type["protein"][0]
256+
else:
257+
primary_chunk = omics_chunks[0]
258+
259+
# Attach sequence if not already present (for backward compatibility)
202260
if "sequence" not in node_data:
203-
sequence = get_chunk_value("sequence")
261+
sequence = get_chunk_value(primary_chunk, "sequence")
204262
if sequence:
205263
node_data["sequence"] = sequence
206-
else:
207-
logger.warning("No sequence found in chunk for node %s. Chunk keys: %s", node_id, list(first_chunk.keys())[:15])
208264

209265
# Attach molecule_type if not present
210266
if "molecule_type" not in node_data:
211-
chunk_molecule_type = get_chunk_value("molecule_type")
267+
chunk_molecule_type = get_chunk_value(primary_chunk, "molecule_type")
212268
if chunk_molecule_type:
213269
node_data["molecule_type"] = chunk_molecule_type
214270

215-
# Attach molecule-specific fields
216-
field_mapping = {
217-
"protein": ["protein_name", "gene_names", "organism", "function", "id", "database", "entry_name", "uniprot_id"],
218-
"dna": ["gene_name", "gene_description", "organism", "chromosome", "genomic_location", "function", "gene_type", "id", "database"],
219-
"rna": ["rna_type", "description", "organism", "related_genes", "gene_name", "so_term", "id", "database", "rnacentral_id"],
220-
}
221-
271+
# Attach molecule-specific fields from primary chunk (for backward compatibility)
222272
for field in field_mapping.get(molecule_type, []):
223273
if field not in node_data:
224-
value = get_chunk_value(field)
274+
value = get_chunk_value(primary_chunk, field)
225275
if value:
226276
node_data[field] = value
227277

graphgen/operators/search/search_service.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -242,8 +242,11 @@ def process(self, batch: pd.DataFrame) -> pd.DataFrame:
242242
if doc_type not in ["text", "dna", "rna", "protein"]:
243243
doc_type = "text"
244244

245-
# Generate document ID from result ID or search query
246-
doc_id = result.get("id") or result.get("_search_query") or f"search-{len(result_rows)}"
245+
# Convert to string to handle Ray Data ListElement and other types
246+
raw_doc_id = result.get("id") or result.get("_search_query") or f"search-{len(result_rows)}"
247+
doc_id = str(raw_doc_id)
248+
249+
# Ensure doc_id starts with "doc-" prefix
247250
if not doc_id.startswith("doc-"):
248251
doc_id = f"doc-{doc_id}"
249252

0 commit comments

Comments
 (0)