@@ -41,14 +41,17 @@ def __str__(self) -> str:
4141class OcrPage :
4242 """OCR extraction for a single page."""
4343
44- all_words : List [OcrWord ]
45- """All the words on the page, in semi-random order."""
44+ _all_words : List [OcrWord ]
4645 _lines : List [OcrLine ]
4746
4847 def __init__ (self , prediction : TypeApiPrediction ):
49- self .all_words = [
48+ self ._all_words = [
5049 OcrWord (word_prediction ) for word_prediction in prediction ["all_words" ]
5150 ]
51+ # make sure words are sorted from top to bottom
52+ self ._all_words .sort (
53+ key = lambda item : get_min_max_y (item .polygon ).min , reverse = False
54+ )
5255 self ._lines = []
5356
5457 @staticmethod
@@ -70,14 +73,9 @@ def _to_lines(self) -> List[OcrLine]:
7073 indexes : List [int ] = []
7174 lines : List [OcrLine ] = []
7275
73- # make sure words are sorted from top to bottom
74- self .all_words .sort (
75- key = lambda item : get_min_max_y (item .polygon ).min , reverse = False
76- )
77-
78- for _ in self .all_words :
76+ for _ in self ._all_words :
7977 line : OcrLine = OcrLine ()
80- for idx , word in enumerate (self .all_words ):
78+ for idx , word in enumerate (self ._all_words ):
8179 if idx in indexes :
8280 continue
8381 if current is None :
@@ -102,6 +100,11 @@ def all_lines(self) -> List[OcrLine]:
102100 self ._lines = self ._to_lines ()
103101 return self ._lines
104102
103+ @property
104+ def all_words (self ) -> List [OcrWord ]:
105+ """All the words on the page, sorted from top to bottom (ascending minimum y of each word's polygon)."""
106+ return self ._all_words
107+
105108 def __str__ (self ) -> str :
106109 return "\n " .join (str (line ) for line in self .all_lines ) + "\n "
107110
0 commit comments