Commit 1e08c61

Refactor logging setup and clean up whitespace in various files. Adjusted formatting in base_operator.py, base_searcher.py, and anchor_bfs_partitioner.py for consistency. Removed unnecessary blank lines in json_reader.py, rnacentral_searcher.py, and other template files to improve readability.
1 parent 35ffa68 commit 1e08c61

17 files changed: +181 −181 lines changed

graphgen/bases/base_operator.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -30,7 +30,7 @@ def __init__(self, working_dir: str = "cache", op_name: str = None):
         log_file = os.path.join(log_dir, f"{self.op_name}_{worker_id_short}.log")

         self.logger = set_logger(
-            log_file=log_file, name=f"{self.op_name}.{worker_id_short}",
+            log_file=log_file, name=f"{self.op_name}.{worker_id_short}",
             console_level=logging.ERROR, force=True
         )

```

In this and the hunks below, removed/added pairs that look identical differ only in trailing whitespace, per the commit message.
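For context, this call gives each operator a per-worker log file while only errors reach the console. A minimal sketch of what a `set_logger` helper with this signature might do; the actual graphgen implementation is not part of this diff, so everything below is an assumption:

```python
import logging

def set_logger(log_file: str, name: str,
               console_level: int = logging.ERROR,
               force: bool = False) -> logging.Logger:
    """Hypothetical sketch: file handler logs everything, console only errors."""
    logger = logging.getLogger(name)
    if force:
        # Drop handlers from any previous call so repeated setup
        # does not duplicate log lines.
        logger.handlers.clear()
    logger.setLevel(logging.DEBUG)

    file_handler = logging.FileHandler(log_file, encoding="utf-8")
    file_handler.setLevel(logging.DEBUG)

    console_handler = logging.StreamHandler()
    console_handler.setLevel(console_level)  # e.g. only ERROR on the console

    fmt = logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s")
    for handler in (file_handler, console_handler):
        handler.setFormatter(fmt)
        logger.addHandler(handler)
    return logger
```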

graphgen/bases/base_searcher.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -24,7 +24,7 @@ def __init__(self, working_dir: str = "cache"):
         log_file = os.path.join(log_dir, f"{searcher_name}.log")

         self.logger = set_logger(
-            log_file=log_file, name=searcher_name,
+            log_file=log_file, name=searcher_name,
             console_level=logging.ERROR, force=True
         )

```

graphgen/models/partitioner/anchor_bfs_partitioner.py

Lines changed: 9 additions & 9 deletions
```diff
@@ -18,7 +18,7 @@ class AnchorBFSPartitioner(BFSPartitioner):
     2. Expand the community using BFS until the max unit size is reached. (A unit is a node or an edge.)
     3. Non-anchor units can only be "pulled" into a community and never become seeds themselves.
        For example, for VQA tasks, we may want to use image nodes as anchors and expand to nearby text nodes and edges.
-
+
     Supports multiple anchor types for multi-omics data: anchor_type can be a single string or a list of strings.
     When a list is provided, nodes matching any of the types in the list can serve as anchors.
     """
@@ -79,39 +79,39 @@ def _pick_anchor_ids(

         anchor_ids: Set[str] = set()
         anchor_types_lower = [at.lower() for at in self.anchor_types]
-
+
         for node_id, meta in nodes:
             # Check if node matches any of the anchor types
             matched = False
-
+
             # Check 1: entity_type (for image, etc.)
             node_type = str(meta.get("entity_type", "")).lower()
             for anchor_type_lower in anchor_types_lower:
                 if anchor_type_lower in node_type:
                     anchor_ids.add(node_id)
                     matched = True
                     break
-
+
             if matched:
                 continue
-
+
             # Check 2: molecule_type (for omics data: dna, rna, protein)
             molecule_type = str(meta.get("molecule_type", "")).lower()
             if molecule_type in anchor_types_lower:
                 anchor_ids.add(node_id)
                 continue
-
+
             # Check 3: source_id prefix (for omics data: dna-, rna-, protein-)
             source_id = str(meta.get("source_id", "")).lower()
             for anchor_type_lower in anchor_types_lower:
                 if source_id.startswith(f"{anchor_type_lower}-"):
                     anchor_ids.add(node_id)
                     matched = True
                     break
-
+
             if matched:
                 continue
-
+
             # Check 4: Check if source_id contains multiple IDs separated by <SEP>
             if "<sep>" in source_id:
                 source_ids = source_id.split("<sep>")
@@ -124,7 +124,7 @@ def _pick_anchor_ids(
                             break
                     if matched:
                         break
-
+
         return anchor_ids

     @staticmethod
```
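To make the four checks concrete, here is a hypothetical set of node metadata and the check each node would match (IDs and field values are invented; the `<SEP>` case assumes the lines elided between the hunks test each segment's prefix):

```python
# Invented example inputs for _pick_anchor_ids above.
nodes = [
    ("n1", {"entity_type": "image"}),                   # Check 1: entity_type
    ("n2", {"molecule_type": "RNA"}),                   # Check 2: molecule_type
    ("n3", {"source_id": "protein-P12345"}),            # Check 3: source_id prefix
    ("n4", {"source_id": "chunk-001<SEP>rna-URS0000"}), # Check 4: <SEP>-joined ids
    ("n5", {"entity_type": "text"}),                    # matches nothing
]
# With anchor_types = ["image", "rna", "protein", "dna"], n1-n4 become
# anchor (seed) nodes; n5 can only be pulled into a community by BFS,
# as the class docstring describes.
```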

graphgen/models/reader/json_reader.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -48,13 +48,13 @@ def read_stream(self, file_path: str) -> Iterator[Dict[str, Any]]:
         """
         Stream read JSONL files line by line without loading entire file into memory.
         Returns an iterator that yields filtered documents.
-
+
         :param file_path: Path to the JSONL file.
         :return: Iterator of dictionaries containing the data.
         """
         if not file_path.endswith(".jsonl"):
             raise ValueError("read_stream only supports JSONL files, not JSON files")
-
+
         with open(file_path, "r", encoding="utf-8") as f:
             for line in f:
                 try:
@@ -64,7 +64,7 @@ def read_stream(self, file_path: str) -> Iterator[Dict[str, Any]]:
                         raise ValueError(
                             f"Missing '{self.text_column}' in document: {doc}"
                         )
-
+
                     # Apply filtering logic inline (similar to BaseReader.filter)
                     if doc.get("type") == "text":
                         content = doc.get(self.text_column, "").strip()
```
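A short usage sketch for the streaming path (the `JSONReader` constructor arguments are assumptions; only `read_stream` appears in this diff):

```python
# Hedged sketch: iterate a large JSONL corpus one document at a time.
reader = JSONReader(text_column="content")
for doc in reader.read_stream("corpus.jsonl"):
    # Each yielded doc has already passed the inline filter above;
    # filtering details beyond this hunk are assumptions.
    print(doc["content"][:80])
```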

graphgen/models/searcher/db/rnacentral_searcher.py

Lines changed: 10 additions & 10 deletions
```diff
@@ -35,9 +35,9 @@ class RNACentralSearch(BaseSearcher):
     """

     def __init__(
-        self,
-        use_local_blast: bool = False,
-        local_blast_db: str = "rna_db",
+        self,
+        use_local_blast: bool = False,
+        local_blast_db: str = "rna_db",
         api_timeout: int = 30,
         blast_num_threads: int = 4,
         working_dir: str = "cache",
@@ -49,7 +49,7 @@ def __init__(
         self.local_blast_db = local_blast_db
         self.api_timeout = api_timeout
         self.blast_num_threads = blast_num_threads  # Number of threads for BLAST search
-
+
         if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"):
             self.logger.error("Local BLAST database files not found. Please check the path.")
             self.use_local_blast = False
@@ -254,22 +254,22 @@ def _local_blast(self, seq: str, threshold: float) -> Optional[str]:
                 "-num_threads", str(self.blast_num_threads),
                 "-outfmt", "6 sacc"  # Only accession, tab-separated
             ]
-            self.logger.debug("Running local blastn for RNA (threads=%d): %s",
+            self.logger.debug("Running local blastn for RNA (threads=%d): %s",
                               self.blast_num_threads, " ".join(cmd))
-
+
             # Run BLAST with timeout to avoid hanging
             try:
                 out = subprocess.check_output(
-                    cmd,
-                    text=True,
+                    cmd,
+                    text=True,
                     timeout=300,  # 5 minute timeout for BLAST search
                     stderr=subprocess.DEVNULL  # Suppress BLAST warnings to reduce I/O
                 ).strip()
             except subprocess.TimeoutExpired:
                 self.logger.warning("BLAST search timed out after 5 minutes for sequence")
                 os.remove(tmp_name)
                 return None
-
+
             os.remove(tmp_name)
             return out.split("\n", maxsplit=1)[0] if out else None
         except Exception as exc:
@@ -378,7 +378,7 @@ async def search(self, query: str, threshold: float = 0.1, **kwargs) -> Optional
         # check if RNA sequence (AUCG or ATCG characters, contains U or T)
         # Note: Sequences with T are also RNA sequences
         is_rna_sequence = query.startswith(">") or (
-            re.fullmatch(r"[AUCGTN\s]+", query, re.I) and
+            re.fullmatch(r"[AUCGTN\s]+", query, re.I) and
             ("U" in query.upper() or "T" in query.upper())
         )
         if is_rna_sequence:
```
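The sequence check in the last hunk is easy to exercise in isolation. A standalone restatement of that condition (the function name is ours; the expression mirrors the diff):

```python
import re

def looks_like_rna(query: str) -> bool:
    # Mirrors the is_rna_sequence condition above: a FASTA header, or a
    # pure nucleotide string (A/U/C/G/T/N plus whitespace) that contains
    # U or T -- per the comment, T-containing sequences count as RNA too.
    return query.startswith(">") or bool(
        re.fullmatch(r"[AUCGTN\s]+", query, re.I)
        and ("U" in query.upper() or "T" in query.upper())
    )

assert looks_like_rna("AUGGCUACGUAA")          # RNA bases with U
assert looks_like_rna(">URS0000\nAUGGCU")      # FASTA-style input
assert not looks_like_rna("MKVLWAALLVTFLAG")   # protein-like, rejected
```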

graphgen/models/splitter/sequence_splitter.py

Lines changed: 17 additions & 17 deletions
```diff
@@ -20,7 +20,7 @@ def __init__(
     ):
         """
         Initialize sequence splitter.
-
+
         :param chunk_size: Maximum length of each chunk (in sequence characters)
         :param chunk_overlap: Number of characters to overlap between chunks
         :param length_function: Function to calculate length (default: len)
@@ -37,83 +37,83 @@ def __init__(
     def split_text(self, text: str) -> List[str]:
         """
         Split a sequence into chunks of fixed size with overlap.
-
+
         :param text: The sequence to split (may include FASTA header)
         :return: List of sequence chunks
         """
         # Remove FASTA header if present
         sequence = self._extract_sequence(text)
-
+
         if not sequence:
             logger.warning("Empty sequence provided to SequenceSplitter")
             return []
-
+
         # If sequence is shorter than chunk_size, return as single chunk
         if len(sequence) <= self.chunk_size:
             return [sequence]
-
+
         chunks = []
         start = 0
         step = self.chunk_size - self.chunk_overlap
-
+
         while start < len(sequence):
             end = min(start + self.chunk_size, len(sequence))
             chunk = sequence[start:end]
             chunks.append(chunk)
             start += step
-
+
             # Avoid infinite loop if step is 0 or negative
             if step <= 0:
                 break
-
+
         return chunks

     @staticmethod
     def _extract_sequence(text: str) -> str:
         """
         Extract sequence from text, removing FASTA headers and whitespace.
-
+
         :param text: Input text (may contain FASTA header)
         :return: Clean sequence string
         """
         # Remove FASTA header lines (lines starting with >)
         lines = text.split("\n")
         sequence_lines = [line for line in lines if not line.strip().startswith(">")]
-
+
         # Join and remove whitespace
         sequence = "".join(sequence_lines)
         sequence = re.sub(r"\s+", "", sequence)
-
+
         return sequence.upper()  # Normalize to uppercase

     @staticmethod
     def detect_sequence_type(sequence: str) -> Optional[str]:
         """
         Detect the type of sequence (DNA, RNA, or protein).
-
+
         :param sequence: The sequence string
         :return: "dna", "rna", "protein", or None if cannot determine
         """
         # Remove FASTA header and whitespace
         clean_seq = SequenceSplitter._extract_sequence(sequence)
-
+
         if not clean_seq:
             return None
-
+
         # Check for protein-specific amino acids
         protein_chars = set("EFILPQXZ")  # Amino acids not in DNA/RNA
         if any(char in clean_seq for char in protein_chars):
             return "protein"
-
+
         # Check for RNA-specific character (U)
         if "U" in clean_seq.upper():
             return "rna"
-
+
         # Check if contains only DNA/RNA characters (A, T, G, C, N)
         dna_rna_chars = set("ATGCUN")
         if all(char.upper() in dna_rna_chars for char in clean_seq):
             # Default to DNA if ambiguous (could be DNA or RNA without U)
             return "dna"
-
+
         # If contains other characters, might be protein
         return "protein"
```
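The chunking arithmetic is worth a worked example, since the stride is `step = chunk_size - chunk_overlap`. Parameter values below are illustrative:

```python
splitter = SequenceSplitter(chunk_size=10, chunk_overlap=2)

chunks = splitter.split_text(">seq1\nACGTACGTACGTACGTACGT")  # 20 bases
# step = 10 - 2 = 8, so chunk starts are 0, 8, 16:
#   ["ACGTACGTAC", "ACGTACGTAC", "ACGT"]
# Adjacent chunks share a 2-character overlap; the tail chunk is short.

# detect_sequence_type applies the checks above in order:
SequenceSplitter.detect_sequence_type("AUGGCU")  # -> "rna"     (contains U)
SequenceSplitter.detect_sequence_type("ATGGCT")  # -> "dna"     (A/T/G/C/N only)
SequenceSplitter.detect_sequence_type("MKVLQE")  # -> "protein" (E, Q, L present)
```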

graphgen/models/storage/kv/json_storage.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -54,7 +54,7 @@ def iter_items(self) -> Iterator[Tuple[str, dict]]:
     def get_batch(self, keys: list[str]) -> dict[str, dict]:
         """
         Get a batch of items by their keys.
-
+
         :param keys: List of keys to retrieve.
         :return: Dictionary of {key: value} for the requested keys.
         """
@@ -63,7 +63,7 @@ def get_batch(self, keys: list[str]) -> dict[str, dict]:
     def iter_batches(self, batch_size: int = 10000) -> Iterator[dict[str, dict]]:
         """
         Iterate over items in batches to avoid loading everything into memory.
-
+
         :param batch_size: Number of items per batch.
         :return: Iterator of dictionaries, each containing up to batch_size items.
         """
```

graphgen/operators/build_kg/build_omics_kg.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -15,7 +15,7 @@ def build_omics_kg(
 ):
     """
     Build knowledge graph from multi-omics chunks (DNA, RNA, protein).
-
+
     :param llm_client: Synthesizer LLM model to extract entities and relationships
     :param kg_instance: Graph storage instance
     :param chunks: List of omics chunks
```
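A call sketch based only on the parameters the docstring names (concrete client and storage objects are assumptions, and the function may take further parameters not shown in this hunk):

```python
# Hedged sketch: argument values are placeholders.
build_omics_kg(
    llm_client=llm_client,    # synthesizer LLM for entity/relation extraction
    kg_instance=kg_instance,  # graph storage instance to populate
    chunks=omics_chunks,      # DNA/RNA/protein chunks, e.g. from SequenceSplitter
)
```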

graphgen/templates/description_rephrasing.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -19,7 +19,7 @@
 ################
 -Real Data-
 ################
-Input:
+Input:
 {input_sentence}
 ################
 Please directly output the rewritten sentence without any additional information.
```

graphgen/templates/extraction/schema_guided_extraction.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -7,7 +7,7 @@
 3. Present your findings in JSON format as specified below.

 Important Notes:
-- Extract only relevant information.
+- Extract only relevant information.
 - Consider the context of the entire document when determining relevance.
 - Do not be verbose, only respond with the correct format and information.
 - Some docs may have multiple relevant excerpts -- include all that apply.
```
