Refactor logging setup and clean up whitespace in various files. Adjusted formatting in base_operator.py, base_searcher.py, and anchor_bfs_partitioner.py for consistency. Stripped trailing whitespace (including on otherwise blank lines) in json_reader.py, rnacentral_searcher.py, and several template files to improve readability.

CHERRY-ui8 · CHERRY-ui8 · commit 1e08c61cb51b · 2025-12-18T00:13:15.000+08:00
diff --git a/graphgen/bases/base_operator.py b/graphgen/bases/base_operator.py
@@ -30,7 +30,7 @@ def __init__(self, working_dir: str = "cache", op_name: str = None):
         log_file = os.path.join(log_dir, f"{self.op_name}_{worker_id_short}.log")
 
         self.logger = set_logger(
-            log_file=log_file, name=f"{self.op_name}.{worker_id_short}", 
+            log_file=log_file, name=f"{self.op_name}.{worker_id_short}",
             console_level=logging.ERROR, force=True
         )
 
diff --git a/graphgen/bases/base_searcher.py b/graphgen/bases/base_searcher.py
@@ -24,7 +24,7 @@ def __init__(self, working_dir: str = "cache"):
         log_file = os.path.join(log_dir, f"{searcher_name}.log")
 
         self.logger = set_logger(
-            log_file=log_file, name=searcher_name, 
+            log_file=log_file, name=searcher_name,
             console_level=logging.ERROR, force=True
         )
 
diff --git a/graphgen/models/partitioner/anchor_bfs_partitioner.py b/graphgen/models/partitioner/anchor_bfs_partitioner.py
@@ -18,7 +18,7 @@ class AnchorBFSPartitioner(BFSPartitioner):
     2. Expand the community using BFS until the max unit size is reached.(A unit is a node or an edge.)
     3. Non-anchor units can only be "pulled" into a community and never become seeds themselves.
     For example, for VQA tasks, we may want to use image nodes as anchors and expand to nearby text nodes and edges.
-    
+
     Supports multiple anchor types for multi-omics data: anchor_type can be a single string or a list of strings.
     When a list is provided, nodes matching any of the types in the list can serve as anchors.
     """
@@ -79,39 +79,39 @@ def _pick_anchor_ids(
 
         anchor_ids: Set[str] = set()
         anchor_types_lower = [at.lower() for at in self.anchor_types]
-        
+
         for node_id, meta in nodes:
             # Check if node matches any of the anchor types
             matched = False
-            
+
             # Check 1: entity_type (for image, etc.)
             node_type = str(meta.get("entity_type", "")).lower()
             for anchor_type_lower in anchor_types_lower:
                 if anchor_type_lower in node_type:
                     anchor_ids.add(node_id)
                     matched = True
                     break
-            
+
             if matched:
                 continue
-            
+
             # Check 2: molecule_type (for omics data: dna, rna, protein)
             molecule_type = str(meta.get("molecule_type", "")).lower()
             if molecule_type in anchor_types_lower:
                 anchor_ids.add(node_id)
                 continue
-            
+
             # Check 3: source_id prefix (for omics data: dna-, rna-, protein-)
             source_id = str(meta.get("source_id", "")).lower()
             for anchor_type_lower in anchor_types_lower:
                 if source_id.startswith(f"{anchor_type_lower}-"):
                     anchor_ids.add(node_id)
                     matched = True
                     break
-            
+
             if matched:
                 continue
-            
+
             # Check 4: Check if source_id contains multiple IDs separated by <SEP>
             if "<sep>" in source_id:
                 source_ids = source_id.split("<sep>")
@@ -124,7 +124,7 @@ def _pick_anchor_ids(
                             break
                     if matched:
                         break
-        
+
         return anchor_ids
 
     @staticmethod
diff --git a/graphgen/models/reader/json_reader.py b/graphgen/models/reader/json_reader.py
@@ -48,13 +48,13 @@ def read_stream(self, file_path: str) -> Iterator[Dict[str, Any]]:
         """
         Stream read JSONL files line by line without loading entire file into memory.
         Returns an iterator that yields filtered documents.
-        
+
         :param file_path: Path to the JSONL file.
         :return: Iterator of dictionaries containing the data.
         """
         if not file_path.endswith(".jsonl"):
             raise ValueError("read_stream only supports JSONL files, not JSON files")
-        
+
         with open(file_path, "r", encoding="utf-8") as f:
             for line in f:
                 try:
@@ -64,7 +64,7 @@ def read_stream(self, file_path: str) -> Iterator[Dict[str, Any]]:
                         raise ValueError(
                             f"Missing '{self.text_column}' in document: {doc}"
                         )
-                    
+
                     # Apply filtering logic inline (similar to BaseReader.filter)
                     if doc.get("type") == "text":
                         content = doc.get(self.text_column, "").strip()
diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py
@@ -35,9 +35,9 @@ class RNACentralSearch(BaseSearcher):
     """
 
     def __init__(
-        self, 
-        use_local_blast: bool = False, 
-        local_blast_db: str = "rna_db", 
+        self,
+        use_local_blast: bool = False,
+        local_blast_db: str = "rna_db",
         api_timeout: int = 30,
         blast_num_threads: int = 4,
         working_dir: str = "cache",
@@ -49,7 +49,7 @@ def __init__(
         self.local_blast_db = local_blast_db
         self.api_timeout = api_timeout
         self.blast_num_threads = blast_num_threads  # Number of threads for BLAST search
-        
+
         if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"):
             self.logger.error("Local BLAST database files not found. Please check the path.")
             self.use_local_blast = False
@@ -254,22 +254,22 @@ def _local_blast(self, seq: str, threshold: float) -> Optional[str]:
                 "-num_threads", str(self.blast_num_threads),
                 "-outfmt", "6 sacc"  # Only accession, tab-separated
             ]
-            self.logger.debug("Running local blastn for RNA (threads=%d): %s", 
+            self.logger.debug("Running local blastn for RNA (threads=%d): %s",
                         self.blast_num_threads, " ".join(cmd))
-            
+
             # Run BLAST with timeout to avoid hanging
             try:
                 out = subprocess.check_output(
-                    cmd, 
-                    text=True, 
+                    cmd,
+                    text=True,
                     timeout=300,  # 5 minute timeout for BLAST search
                     stderr=subprocess.DEVNULL  # Suppress BLAST warnings to reduce I/O
                 ).strip()
             except subprocess.TimeoutExpired:
                 self.logger.warning("BLAST search timed out after 5 minutes for sequence")
                 os.remove(tmp_name)
                 return None
-            
+
             os.remove(tmp_name)
             return out.split("\n", maxsplit=1)[0] if out else None
         except Exception as exc:
@@ -378,7 +378,7 @@ async def search(self, query: str, threshold: float = 0.1, **kwargs) -> Optional
         # check if RNA sequence (AUCG or ATCG characters, contains U or T)
         # Note: Sequences with T are also RNA sequences
         is_rna_sequence = query.startswith(">") or (
-            re.fullmatch(r"[AUCGTN\s]+", query, re.I) and 
+            re.fullmatch(r"[AUCGTN\s]+", query, re.I) and
             ("U" in query.upper() or "T" in query.upper())
         )
         if is_rna_sequence:
diff --git a/graphgen/models/splitter/sequence_splitter.py b/graphgen/models/splitter/sequence_splitter.py
@@ -20,7 +20,7 @@ def __init__(
     ):
         """
         Initialize sequence splitter.
-        
+
         :param chunk_size: Maximum length of each chunk (in sequence characters)
         :param chunk_overlap: Number of characters to overlap between chunks
         :param length_function: Function to calculate length (default: len)
@@ -37,83 +37,83 @@ def __init__(
     def split_text(self, text: str) -> List[str]:
         """
         Split a sequence into chunks of fixed size with overlap.
-        
+
         :param text: The sequence to split (may include FASTA header)
         :return: List of sequence chunks
         """
         # Remove FASTA header if present
         sequence = self._extract_sequence(text)
-        
+
         if not sequence:
             logger.warning("Empty sequence provided to SequenceSplitter")
             return []
-        
+
         # If sequence is shorter than chunk_size, return as single chunk
         if len(sequence) <= self.chunk_size:
             return [sequence]
-        
+
         chunks = []
         start = 0
         step = self.chunk_size - self.chunk_overlap
-        
+
         while start < len(sequence):
             end = min(start + self.chunk_size, len(sequence))
             chunk = sequence[start:end]
             chunks.append(chunk)
             start += step
-            
+
             # Avoid infinite loop if step is 0 or negative
             if step <= 0:
                 break
-        
+
         return chunks
 
     @staticmethod
     def _extract_sequence(text: str) -> str:
         """
         Extract sequence from text, removing FASTA headers and whitespace.
-        
+
         :param text: Input text (may contain FASTA header)
         :return: Clean sequence string
         """
         # Remove FASTA header lines (lines starting with >)
         lines = text.split("\n")
         sequence_lines = [line for line in lines if not line.strip().startswith(">")]
-        
+
         # Join and remove whitespace
         sequence = "".join(sequence_lines)
         sequence = re.sub(r"\s+", "", sequence)
-        
+
         return sequence.upper()  # Normalize to uppercase
 
     @staticmethod
     def detect_sequence_type(sequence: str) -> Optional[str]:
         """
         Detect the type of sequence (DNA, RNA, or protein).
-        
+
         :param sequence: The sequence string
         :return: "dna", "rna", "protein", or None if cannot determine
         """
         # Remove FASTA header and whitespace
         clean_seq = SequenceSplitter._extract_sequence(sequence)
-        
+
         if not clean_seq:
             return None
-        
+
         # Check for protein-specific amino acids
         protein_chars = set("EFILPQXZ")  # Amino acids not in DNA/RNA
         if any(char in clean_seq for char in protein_chars):
             return "protein"
-        
+
         # Check for RNA-specific character (U)
         if "U" in clean_seq.upper():
             return "rna"
-        
+
         # Check if contains only DNA/RNA characters (A, T, G, C, N)
         dna_rna_chars = set("ATGCUN")
         if all(char.upper() in dna_rna_chars for char in clean_seq):
             # Default to DNA if ambiguous (could be DNA or RNA without U)
             return "dna"
-        
+
         # If contains other characters, might be protein
         return "protein"
diff --git a/graphgen/models/storage/kv/json_storage.py b/graphgen/models/storage/kv/json_storage.py
@@ -54,7 +54,7 @@ def iter_items(self) -> Iterator[Tuple[str, dict]]:
     def get_batch(self, keys: list[str]) -> dict[str, dict]:
         """
         Get a batch of items by their keys.
-        
+
         :param keys: List of keys to retrieve.
         :return: Dictionary of {key: value} for the requested keys.
         """
@@ -63,7 +63,7 @@ def get_batch(self, keys: list[str]) -> dict[str, dict]:
     def iter_batches(self, batch_size: int = 10000) -> Iterator[dict[str, dict]]:
         """
         Iterate over items in batches to avoid loading everything into memory.
-        
+
         :param batch_size: Number of items per batch.
         :return: Iterator of dictionaries, each containing up to batch_size items.
         """
diff --git a/graphgen/operators/build_kg/build_omics_kg.py b/graphgen/operators/build_kg/build_omics_kg.py
@@ -15,7 +15,7 @@ def build_omics_kg(
 ):
     """
     Build knowledge graph from multi-omics chunks (DNA, RNA, protein).
-    
+
     :param llm_client: Synthesizer LLM model to extract entities and relationships
     :param kg_instance: Graph storage instance
     :param chunks: List of omics chunks
diff --git a/graphgen/templates/description_rephrasing.py b/graphgen/templates/description_rephrasing.py
@@ -19,7 +19,7 @@
 ################
 -Real Data-
 ################
-Input: 
+Input:
 {input_sentence}
 ################
 Please directly output the rewritten sentence without any additional information.
diff --git a/graphgen/templates/extraction/schema_guided_extraction.py b/graphgen/templates/extraction/schema_guided_extraction.py
@@ -7,7 +7,7 @@
 3. Present your findings in JSON format as specified below.
 
 Important Notes:
-- Extract only relevant information. 
+- Extract only relevant information.
 - Consider the context of the entire document when determining relevance.
 - Do not be verbose, only respond with the correct format and information.
 - Some docs may have multiple relevant excerpts -- include all that apply.
diff --git a/graphgen/templates/generation/atomic_generation.py b/graphgen/templates/generation/atomic_generation.py
@@ -1,6 +1,6 @@
 # pylint: disable=C0301
 TEMPLATE_EN: str = """You are given a text passage. Your task is to generate a question and answer (QA) pair based on the content of that text.
-The answer should be accurate and directly derived from the text. Make sure the QA pair is relevant to the main theme or important details of the given text. 
+The answer should be accurate and directly derived from the text. Make sure the QA pair is relevant to the main theme or important details of the given text.
 For example:
 Question: What is the effect of overexpressing the BG1 gene on grain size and development?
 Answer: Overexpression of the BG1 gene leads to significantly increased grain size, demonstrating its role in grain development.
diff --git a/graphgen/templates/generation/cot_generation.py b/graphgen/templates/generation/cot_generation.py
@@ -106,7 +106,7 @@
 
 5. 推理路径生成
 - 根据问题设计一个**可被后续模型直接执行的推理蓝图**。
-- 保持步骤最小化：每一步只解决一个“不可分割”的子问题。 
+- 保持步骤最小化：每一步只解决一个“不可分割”的子问题。
 
 -约束条件-
 1. 不要在回答中描述你的思考过程，直接给出回复，只给出问题和推理路径设计，不要生成无关信息。
@@ -155,7 +155,7 @@
 - The question must be verifiable directly within the graph through entities, relationships, or attributes; avoid subjective judgments.
 - The question should allow the model to think sufficiently, fully utilizing the entities and relationships in the graph, avoiding overly simple or irrelevant questions.
 
-5. Reasoning-Path Design 
+5. Reasoning-Path Design
 - Output a **blueprint that any later model can directly execute**.
 - Keep steps minimal: each step solves one indivisible sub-problem.
 
diff --git a/graphgen/templates/kg/kg_extraction.py b/graphgen/templates/kg/kg_extraction.py
@@ -92,7 +92,7 @@
    - entity_type：以下类型之一：[{entity_types}]
    - entity_summary：实体的属性与活动的全面总结
    将每个实体格式化为("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_summary>)
-   
+
 2. 从步骤1中识别的实体中，识别所有（源实体，目标实体）对，这些实体彼此之间*明显相关*。
    对于每对相关的实体，提取以下信息：
    - source_entity：步骤1中识别的源实体名称
diff --git a/graphgen/templates/kg/mm_kg_extraction.py b/graphgen/templates/kg/mm_kg_extraction.py
@@ -26,7 +26,7 @@
     - target_entity: The name of the target entity identified in Step 1
     - relationship_summary: Explain why you think the source entity and target entity are related to each other
     Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_summary>)
-    
+
 3. Return the output list of all entities and relationships identified in Steps 1 and 2 in English. Use **{record_delimiter}** as the list separator.
 
 4. Upon completion, output {completion_delimiter}
diff --git a/graphgen/templates/kg/protein_kg_extraction.py b/graphgen/templates/kg/protein_kg_extraction.py
diff --git a/graphgen/templates/search_judgement.py b/graphgen/templates/search_judgement.py
diff --git a/webui/base.py b/webui/base.py

Original file line number	Diff line number	Diff line change
`@@ -30,7 +30,7 @@ def __init__(self, working_dir: str = "cache", op_name: str = None):`
`30`	`30`	`log_file = os.path.join(log_dir, f"{self.op_name}_{worker_id_short}.log")`
`31`	`31`
`32`	`32`	`self.logger = set_logger(`
`33`		`- log_file=log_file, name=f"{self.op_name}.{worker_id_short}",`
	`33`	`+ log_file=log_file, name=f"{self.op_name}.{worker_id_short}",`
`34`	`34`	`console_level=logging.ERROR, force=True`
`35`	`35`	`)`
`36`	`36`
Original file line number	Diff line number	Diff line change
`@@ -24,7 +24,7 @@ def __init__(self, working_dir: str = "cache"):`
`24`	`24`	`log_file = os.path.join(log_dir, f"{searcher_name}.log")`
`25`	`25`
`26`	`26`	`self.logger = set_logger(`
`27`		`- log_file=log_file, name=searcher_name,`
	`27`	`+ log_file=log_file, name=searcher_name,`
`28`	`28`	`console_level=logging.ERROR, force=True`
`29`	`29`	`)`
`30`	`30`