diff --git a/scraper/src/maps2zim/context.py b/scraper/src/maps2zim/context.py index e90ab20..3a5ddc8 100644 --- a/scraper/src/maps2zim/context.py +++ b/scraper/src/maps2zim/context.py @@ -111,6 +111,9 @@ class Context: # Geonames region to download (e.g. "allCountries", "FR", "US") geonames_region: str = "allCountries" + # Number of worker threads for the ZIM creator + zim_workers: int | None = None + @classmethod def setup(cls, **kwargs: Any): new_instance = cls(**kwargs) diff --git a/scraper/src/maps2zim/entrypoint.py b/scraper/src/maps2zim/entrypoint.py index c885655..d72d3ae 100644 --- a/scraper/src/maps2zim/entrypoint.py +++ b/scraper/src/maps2zim/entrypoint.py @@ -205,6 +205,13 @@ def prepare_context(raw_args: list[str], tmpdir: str) -> None: dest="geonames_region", ) + parser.add_argument( + "--zim-workers", + type=int, + help="Number of worker threads for the ZIM creator. Default: libzim default", + dest="zim_workers", + ) + args = parser.parse_args(raw_args) # Ignore unset values so they do not override the default specified in Context diff --git a/scraper/src/maps2zim/processor.py b/scraper/src/maps2zim/processor.py index 3f93276..091a388 100644 --- a/scraper/src/maps2zim/processor.py +++ b/scraper/src/maps2zim/processor.py @@ -6,6 +6,7 @@ import re import sqlite3 import tarfile +import threading import time import zipfile from importlib import resources @@ -20,6 +21,7 @@ from zimscraperlib.image import convert_image, resize_image from zimscraperlib.image.conversion import convert_svg2png from zimscraperlib.image.probing import format_for +from zimscraperlib.typing import Callback from zimscraperlib.zim import Creator, metadata from zimscraperlib.zim.dedup import Deduplicator from zimscraperlib.zim.filesystem import ( @@ -81,6 +83,9 @@ def __init__(self) -> None: # could happen in the loop in terms of exit conditions self.stats_items_total = 1 + # Semaphore for backpressure: limit items in-flight to 100 + self._inflight_semaphore = threading.Semaphore(100) + def run(self) -> Path: """Generates a zim for a single document. @@ -157,6 +162,8 @@ def _run_internal(self) -> Path: logger.debug(f"User-Agent: {context.wm_user_agent}") creator = Creator(zim_path, "index.html") + if context.zim_workers is not None: + creator.config_nbworkers(context.zim_workers) logger.info(" Fetching ZIM illustration...") zim_illustration = self._fetch_zim_illustration() @@ -198,7 +205,8 @@ def _run_internal(self) -> Path: # Start creator early to detect problems early. with creator as creator: try: - creator.add_item_for( + self._add_item_for( + creator, "favicon.ico", content=self._fetch_favicon_from_illustration( zim_illustration @@ -223,12 +231,36 @@ def _run_internal(self) -> Path: return zim_path + def _add_item_for( + self, creator: Creator, path: str, title: str | None = None, **kwargs: Any + ) -> None: + """Wrapper for creator.add_item_for with backpressure. + + Blocks when 100 items are already in-flight, releases a slot when + the item is finalized (garbage-collected by libzim). + """ + self._inflight_semaphore.acquire() + + existing_callbacks = kwargs.pop("callbacks", None) + callbacks: list[Callback] = [] + if existing_callbacks is not None: + if isinstance(existing_callbacks, list): + callbacks.extend( + existing_callbacks # pyright: ignore[reportUnknownArgumentType] + ) + else: + callbacks.append(existing_callbacks) + callbacks.append(Callback(func=self._inflight_semaphore.release)) + + creator.add_item_for(path, title, callbacks=callbacks, **kwargs) + def run_with_creator(self, creator: Creator): context.current_thread_workitem = "standard files" logger.info(" Storing configuration...") - creator.add_item_for( + self._add_item_for( + creator, "content/config.json", content=ConfigModel( secondary_color=self.zim_config.secondary_color, @@ -258,7 +290,8 @@ def run_with_creator(self, creator: Creator): logger.debug(f"Adding {path} to ZIM") if path == "index.html": # Change index.html title and add to ZIM index_html_path = context.zimui_dist / path - creator.add_item_for( + self._add_item_for( + creator, path=path, content=index_html_path.read_text(encoding="utf-8").replace( "