From 1eafbe6dca96986506d9496ab3cba9f93b1fd179 Mon Sep 17 00:00:00 2001 From: SamSi0322 Date: Fri, 10 Apr 2026 11:19:53 -0400 Subject: [PATCH] Fix NlpSentenceChunking destroying sentence order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit list(set(sens)) is unordered — sentences were returned in arbitrary order instead of document order. Replaced with dict.fromkeys() which deduplicates while preserving insertion order. Closes #1909 --- crawl4ai/chunking_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/chunking_strategy.py b/crawl4ai/chunking_strategy.py index f46cb667c..2973ceab7 100644 --- a/crawl4ai/chunking_strategy.py +++ b/crawl4ai/chunking_strategy.py @@ -86,7 +86,7 @@ def chunk(self, text: str) -> list: sentences = sent_tokenize(text) sens = [sent.strip() for sent in sentences] - return list(set(sens)) + return list(dict.fromkeys(sens)) # Topic-based segmentation using TextTiling