@@ -20,7 +20,7 @@ def __init__(
2020 ):
2121 """
2222 Initialize sequence splitter.
23-
23+
2424 :param chunk_size: Maximum length of each chunk (in sequence characters)
2525 :param chunk_overlap: Number of characters to overlap between chunks
2626 :param length_function: Function to calculate length (default: len)
@@ -37,83 +37,83 @@ def __init__(
def split_text(self, text: str) -> List[str]:
    """
    Split a sequence into fixed-size chunks with overlap.

    The input is first cleaned with ``self._extract_sequence``. Windows of
    ``chunk_size`` characters are then taken every
    ``chunk_size - chunk_overlap`` characters. Splitting stops as soon as a
    window reaches the end of the sequence, so no trailing chunk consisting
    only of characters already contained in the previous chunk is emitted.

    :param text: The sequence to split (may include FASTA header)
    :return: List of sequence chunks; empty list if the cleaned sequence is empty
    """
    # Remove FASTA header if present
    sequence = self._extract_sequence(text)

    if not sequence:
        logger.warning("Empty sequence provided to SequenceSplitter")
        return []

    total = len(sequence)

    # A sequence that fits in one window is returned as a single chunk
    if total <= self.chunk_size:
        return [sequence]

    step = self.chunk_size - self.chunk_overlap
    if step <= 0:
        # The window cannot advance, so only the first window can be produced.
        # Report it instead of truncating silently.
        logger.warning(
            "chunk_overlap (%s) is not smaller than chunk_size (%s); "
            "returning only the first chunk",
            self.chunk_overlap,
            self.chunk_size,
        )
        return [sequence[: self.chunk_size]]

    chunks = []
    for start in range(0, total, step):
        end = min(start + self.chunk_size, total)
        chunks.append(sequence[start:end])
        # Once a window touches the end, any further window would only repeat
        # the tail of this one.
        if end >= total:
            break

    return chunks
7070
@staticmethod
def _extract_sequence(text: str) -> str:
    """
    Extract sequence from text, removing FASTA headers and whitespace.

    Lines whose first non-blank character is ``>`` are treated as FASTA
    headers and dropped. The remaining lines are concatenated, all
    whitespace is removed, and the result is upper-cased.

    :param text: Input text (may contain FASTA header)
    :return: Clean, upper-case sequence string (empty if nothing remains)
    """
    # splitlines() recognises LF, CRLF and bare CR line endings. Splitting on
    # LF alone would leave a CR-delimited record as one ">"-prefixed line,
    # and the whole record would be discarded as a header.
    sequence_lines = [
        line for line in text.splitlines() if not line.strip().startswith(">")
    ]

    # Join and remove whitespace left inside the lines
    sequence = re.sub(r"\s+", "", "".join(sequence_lines))

    return sequence.upper()  # Normalize to uppercase
8888
@staticmethod
def detect_sequence_type(sequence: str) -> Optional[str]:
    """
    Classify a sequence as DNA, RNA, or protein from the letters it contains.

    The checks run in order: any protein-only letter means protein; otherwise
    a ``U`` means RNA; otherwise a sequence made only of A/T/G/C/U/N is
    reported as DNA; anything else is reported as protein.

    :param sequence: The sequence string (FASTA header and whitespace are stripped)
    :return: "dna", "rna", "protein", or None if the cleaned sequence is empty
    """
    residues = SequenceSplitter._extract_sequence(sequence)
    if not residues:
        return None

    # Letters used as amino-acid codes that never appear in DNA/RNA
    protein_only = frozenset("EFILPQXZ")
    if not protein_only.isdisjoint(residues):
        return "protein"

    # U is taken as the marker for RNA
    if "U" in residues.upper():
        return "rna"

    # Any letter outside the nucleotide alphabet is treated as protein
    nucleotide_letters = frozenset("ATGCUN")
    for symbol in residues:
        if symbol.upper() not in nucleotide_letters:
            return "protein"

    # Only nucleotide letters and no U: ambiguous, reported as DNA by default
    return "dna"
0 commit comments