diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index d07e52ee..4a0817a6 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -27,7 +27,7 @@ from .paratext_project_settings import ParatextProjectSettings from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase from .paratext_project_terms_parser_base import KeyTerm, ParatextProjectTermsParserBase -from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase +from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase, filter_tokens_by_chapter from .paratext_project_versification_error_detector_base import ParatextProjectVersificationErrorDetectorBase from .paratext_text_corpus import ParatextTextCorpus from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler @@ -115,6 +115,7 @@ "FileParatextProjectTermsParser", "FileParatextProjectTextUpdater", "FileParatextProjectVersificationErrorDetector", + "filter_tokens_by_chapter", "flatten", "is_scripture", "KeyTerm", diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index 0e7bfdfd..7eee05fc 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -1,6 +1,7 @@ from abc import ABC -from typing import Callable, Iterable, Optional, Sequence, Union +from typing import Callable, Iterable, List, Optional, Sequence, Tuple, Union +from ..utils.string_utils import parse_integer from .paratext_project_file_handler import ParatextProjectFileHandler from .paratext_project_settings import ParatextProjectSettings from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase @@ -11,6 +12,8 @@ UpdateUsfmTextBehavior, ) from .usfm_parser import parse_usfm +from .usfm_token import UsfmTokenType +from .usfm_tokenizer import UsfmToken, UsfmTokenizer from 
.usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError @@ -30,6 +33,7 @@ def update_usfm( self, book_id: str, rows: Optional[Sequence[UpdateUsfmRow]] = None, + chapters: Optional[Sequence[int]] = None, full_name: Optional[str] = None, text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING, paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, @@ -37,7 +41,7 @@ def update_usfm( style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP, preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None, update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None, - remarks: Optional[Iterable[str]] = None, + remarks: Optional[Iterable[Tuple[int, str]]] = None, error_handler: Optional[Callable[[UsfmUpdateBlockHandlerError], bool]] = None, compare_segments: bool = False, ) -> Optional[str]: @@ -60,7 +64,10 @@ def update_usfm( compare_segments=compare_segments, ) try: - parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification) + tokenizer = UsfmTokenizer(self._settings.stylesheet) + tokens = tokenizer.tokenize(usfm) + tokens = filter_tokens_by_chapter(tokens, chapters) + parse_usfm(tokens, handler, self._settings.stylesheet, self._settings.versification) return handler.get_usfm(self._settings.stylesheet) except Exception as e: error_message = ( @@ -69,3 +76,30 @@ def update_usfm( f". 
def filter_tokens_by_chapter(
    tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None
) -> Sequence[UsfmToken]:
    """Keep only the tokens that belong to the requested chapters.

    Args:
        tokens: The tokenized USFM of one book, in document order.
        chapters: Chapter numbers to keep. ``None`` disables filtering.

    Returns:
        ``tokens`` itself when ``chapters`` is ``None``. Otherwise a new list
        containing:

        * a leading ``\\id`` token (only recognised at index 0) and the tokens
          after it, up to the first token that carries a marker other than
          ``id``;
        * every token from a chapter token whose number is in ``chapters`` up
          to, but not including, the next chapter token;
        * when ``1`` is in ``chapters`` and the first token is ``\\id``, the
          tokens between the id section and the first chapter token as well.

        A chapter token whose data is empty, or for which ``parse_integer``
        yields ``None``, is dropped together with the tokens that follow it.
    """
    if chapters is None:
        return tokens

    # Build the lookup once instead of scanning the sequence per chapter token.
    wanted_chapters = set(chapters)
    tokens_within_chapters: List[UsfmToken] = []
    in_chapter = False
    in_id_marker = False
    for index, token in enumerate(tokens):
        if index == 0 and token.marker == "id":
            in_id_marker = True
            # Material preceding the first chapter marker travels with chapter 1.
            if 1 in wanted_chapters:
                in_chapter = True
        elif in_id_marker and token.marker is not None and token.marker != "id":
            # The first marker after the id line closes the id section.
            in_id_marker = False

        # Deliberately an independent `if`, not part of the chain above: a
        # chapter token that directly follows the id line both closes the id
        # section AND has to be evaluated as a chapter. Otherwise a requested
        # first chapter is dropped, or an unrequested one leaks through when
        # chapter 1 was requested.
        if token.type == UsfmTokenType.CHAPTER:
            chapter_num = parse_integer(token.data) if token.data else None
            in_chapter = chapter_num is not None and chapter_num in wanted_chapters

        if in_id_marker or in_chapter:
            tokens_within_chapters.append(token)
    return tokens_within_chapters
text=remark)) + remark_tokens_by_chapter: Dict[int, List[UsfmToken]] = {} + for chapter_num, remark in self._remarks: + chapter_tokens = remark_tokens_by_chapter.setdefault(chapter_num, []) + chapter_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem")) + chapter_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark)) if len(tokens) > 0: - index = 0 - markers_to_skip = {"id", "ide", "rem"} - while tokens[index].marker in markers_to_skip: - index += 1 - if len(tokens) > index and tokens[index].type == UsfmTokenType.TEXT: + for chapter_num, remark_tokens in remark_tokens_by_chapter.items(): + if chapter_num == 0: + index = 0 + markers_to_skip = {"id", "ide", "rem"} + else: + index = next( + ( + i + for i, token in enumerate(tokens) + if token.type == UsfmTokenType.CHAPTER + and token.data is not None + and str(token.data).isdigit() + and int(token.data) == chapter_num + ), + -1, + ) + if index == -1: + continue index += 1 - for remark_token in reversed(remark_tokens): - tokens.insert(index, remark_token) + markers_to_skip = {"rem"} + + if index >= len(tokens): + tokens.extend(remark_tokens) + else: + while index < len(tokens) and tokens[index].marker in markers_to_skip: + index += 1 + if index < len(tokens) and tokens[index].type == UsfmTokenType.TEXT: + index += 1 + + tokens[index:index] = remark_tokens return tokenizer.detokenize(tokens) def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]: diff --git a/machine/corpora/usfm_parser.py b/machine/corpora/usfm_parser.py index 220ade28..4a31f6ff 100644 --- a/machine/corpora/usfm_parser.py +++ b/machine/corpora/usfm_parser.py @@ -17,7 +17,7 @@ def parse_usfm( - usfm: str, + usfm: Union[str, Sequence[UsfmToken]], handler: UsfmParserHandler, stylesheet: Union[StrPath, UsfmStylesheet] = "usfm.sty", versification: Optional[Versification] = None, diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index 
a9c1cdc1..278b795d 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -1,4 +1,4 @@ -from typing import Iterable, List, Optional, Sequence, Union +from typing import Iterable, List, Optional, Sequence, Tuple, Union from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH, ignore_line_endings @@ -9,9 +9,11 @@ UpdateUsfmParserHandler, UpdateUsfmRow, UpdateUsfmTextBehavior, + UsfmTokenizer, UsfmUpdateBlock, UsfmUpdateBlockElementType, UsfmUpdateBlockHandler, + filter_tokens_by_chapter, parse_usfm, ) @@ -1387,29 +1389,108 @@ def test_pass_remark(): \v 1 Some text \v 2 \v 3 Other text +\c 2 +\rem Existing remark +\v 1 More text +\c 3 """ - target = update_usfm(rows, usfm, text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, remarks=["New remark"]) + target = update_usfm( + rows, + usfm, + text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, + remarks=[(0, "New remark 0"), (1, "New remark 1"), (2, "New remark 2"), (3, "New remark 3")], + ) result = r"""\id MAT - Test \ide UTF-8 \rem Existing remark -\rem New remark +\rem New remark 0 \c 1 +\rem New remark 1 \v 1 Some text \v 2 Update 2 \v 3 Other text +\c 2 +\rem Existing remark +\rem New remark 2 +\v 1 More text +\c 3 +\rem New remark 3 """ assert_usfm_equals(target, result) - target = update_usfm(rows, target, text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, remarks=["New remark 2"]) + +def test_pass_remark_0_no_existing_remark(): + rows = [ + UpdateUsfmRow( + scr_ref("MAT 1:1"), + "Update 1", + ), + UpdateUsfmRow( + scr_ref("MAT 1:2"), + "Update 2", + ), + ] + usfm = r"""\id MAT - Test +\ide UTF-8 +\c 1 +\v 1 Some text +\v 2 +\v 3 Other text +""" + target = update_usfm( + rows, + usfm, + text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, + remarks=[(0, "New remark 0")], + ) result = r"""\id MAT - Test \ide UTF-8 +\rem New remark 0 +\c 1 +\v 1 Some text +\v 2 Update 2 +\v 3 Other text +""" + assert_usfm_equals(target, result) + + 
+def test_pass_multiple_remarks_same_chapter() -> None: + rows = [ + UpdateUsfmRow( + scr_ref("MAT 1:1"), + "Update 1", + ), + UpdateUsfmRow( + scr_ref("MAT 1:2"), + "Update 2", + ), + ] + usfm = r"""\id MAT - Test +\ide UTF-8 \rem Existing remark -\rem New remark -\rem New remark 2 \c 1 \v 1 Some text +\v 2 +\v 3 Other text +""" + + target = update_usfm( + rows, + usfm, + text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, + remarks=[(0, "New remark 0.1"), (0, "New remark 0.2"), (1, "New remark 1.1"), (1, "New remark 1.2")], + ) + result = r"""\id MAT - Test +\ide UTF-8 +\rem Existing remark +\rem New remark 0.1 +\rem New remark 0.2 +\c 1 +\rem New remark 1.1 +\rem New remark 1.2 +\v 1 Some text \v 2 Update 2 \v 3 Other text """ @@ -1494,6 +1575,79 @@ def test_update_block_footnote_at_start_of_chapter_with_preceding_text(): ) +def test_filter_chapters() -> None: + usfm = r"""\id MAT - Test +\h Matthew +\c 1 +\v 1 Some text +\v 2 +\v 3 Other text +\c 2 +\v 1 Some text +\c 3 +\v 1 Some text +\c 4 +\v 1 Some text +""" + chapters = [2, 4] + target = update_usfm(chapters=chapters, source=usfm) + result = r"""\id MAT - Test +\c 2 +\v 1 Some text +\c 4 +\v 1 Some text +""" + assert_usfm_equals(target, result) + + +def test_filter_chapters_with_chapter_1_and_header() -> None: + usfm = r"""\id MAT - Test +\h Matthew +\c 1 +\v 1 Some text +\v 2 +\v 3 Other text +\c 2 +\v 1 Some text +\c 3 +\v 1 Some text +\c 4 +\v 1 Some text +""" + chapters = [1, 3] + target = update_usfm(chapters=chapters, source=usfm) + result = r"""\id MAT - Test +\h Matthew +\c 1 +\v 1 Some text +\v 2 +\v 3 Other text +\c 3 +\v 1 Some text +""" + assert_usfm_equals(target, result) + + +def test_filter_chapters_with_bad_chapter_reference() -> None: + usfm = r"""\id MAT - Test +\c 1. +\v 1 Some text +\c 2. 
+\v 1 Some text +\c 3 +\v 1 Some text with good chapter reference +\c 4 +\v 1 Some text with good chapter reference +""" + chapters = [2, 4] + target = update_usfm(chapters=chapters, source=usfm) + result = r"""\id MAT - Test +\c 4 +\v 1 Some text with good chapter reference +""" + assert_usfm_equals(target, result) + + def scr_ref(*refs: str) -> List[ScriptureRef]: return [ScriptureRef.parse(ref) for ref in refs] @@ -1501,6 +1655,7 @@ def scr_ref(*refs: str) -> List[ScriptureRef]: def update_usfm( rows: Optional[Sequence[UpdateUsfmRow]] = None, source: Optional[str] = None, + chapters: Optional[Sequence[int]] = None, id_text: Optional[str] = None, text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_NEW, paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, @@ -1508,7 +1663,7 @@ def update_usfm( style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP, preserve_paragraph_styles: Optional[Iterable[str]] = None, update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None, - remarks: Optional[Iterable[str]] = None, + remarks: Optional[Iterable[Tuple[int, str]]] = None, compare_segments: bool = False, ) -> Optional[str]: if source is None: @@ -1516,6 +1671,7 @@ def update_usfm( return updater.update_usfm( "MAT", rows, + chapters, id_text, text_behavior, paragraph_behavior, @@ -1542,7 +1698,10 @@ def update_usfm( lambda _: False, compare_segments, ) - parse_usfm(source, updater) + tokenizer = UsfmTokenizer() + tokens = tokenizer.tokenize(source) + tokens = filter_tokens_by_chapter(tokens, chapters) + parse_usfm(tokens, updater) return updater.get_usfm()