[MRG] Utils: optimise get_page_layout

karlowich · karlowich · commit db01b682fa8f · 2022-04-29T10:51:17.000+02:00
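Since the existing loop overwrites `layout` and `dim` on every iteration,
only a single page's values are ever returned, so processing every page
is wasted work. It is much more efficient to process only the first page
and return its `layout` and `dim`. (The old loop effectively returned the
values of the last page, so the result is identical whenever the document
holds a single page.)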

I tested the difference with a 455-page PDF: the optimisation reduces
the time spent from 50 seconds to 5 seconds.

Signed-off-by: Karl Bonde Torp <k.torp@samsung.com>
diff --git a/camelot/utils.py b/camelot/utils.py
@@ -889,12 +889,14 @@ def get_page_layout(
         rsrcmgr = PDFResourceManager()
         device = PDFPageAggregator(rsrcmgr, laparams=laparams)
         interpreter = PDFPageInterpreter(rsrcmgr, device)
-        for page in PDFPage.create_pages(document):
-            interpreter.process_page(page)
-            layout = device.get_result()
-            width = layout.bbox[2]
-            height = layout.bbox[3]
-            dim = (width, height)
+        page = next(PDFPage.create_pages(document), None)
+        if page is None:
+            raise PDFTextExtractionNotAllowed
+        interpreter.process_page(page)
+        layout = device.get_result()
+        width = layout.bbox[2]
+        height = layout.bbox[3]
+        dim = (width, height)
         return layout, dim