From 85aab64bdba183df92c5b49cc9bc84ede551a383 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 3 Mar 2026 10:47:34 +0000 Subject: [PATCH 1/2] Allow to customize number of libzim workers --- scraper/src/maps2zim/context.py | 3 +++ scraper/src/maps2zim/entrypoint.py | 7 +++++++ scraper/src/maps2zim/processor.py | 2 ++ 3 files changed, 12 insertions(+) diff --git a/scraper/src/maps2zim/context.py b/scraper/src/maps2zim/context.py index e90ab20..3a5ddc8 100644 --- a/scraper/src/maps2zim/context.py +++ b/scraper/src/maps2zim/context.py @@ -111,6 +111,9 @@ class Context: # Geonames region to download (e.g. "allCountries", "FR", "US") geonames_region: str = "allCountries" + # Number of worker threads for the ZIM creator + zim_workers: int | None = None + @classmethod def setup(cls, **kwargs: Any): new_instance = cls(**kwargs) diff --git a/scraper/src/maps2zim/entrypoint.py b/scraper/src/maps2zim/entrypoint.py index c885655..d72d3ae 100644 --- a/scraper/src/maps2zim/entrypoint.py +++ b/scraper/src/maps2zim/entrypoint.py @@ -205,6 +205,13 @@ def prepare_context(raw_args: list[str], tmpdir: str) -> None: dest="geonames_region", ) + parser.add_argument( + "--zim-workers", + type=int, + help="Number of worker threads for the ZIM creator. Default: libzim default", + dest="zim_workers", + ) + args = parser.parse_args(raw_args) # Ignore unset values so they do not override the default specified in Context diff --git a/scraper/src/maps2zim/processor.py b/scraper/src/maps2zim/processor.py index 3f93276..87190ec 100644 --- a/scraper/src/maps2zim/processor.py +++ b/scraper/src/maps2zim/processor.py @@ -157,6 +157,8 @@ def _run_internal(self) -> Path: logger.debug(f"User-Agent: {context.wm_user_agent}") creator = Creator(zim_path, "index.html") + if context.zim_workers is not None: + creator.config_nbworkers(context.zim_workers) logger.info(" Fetching ZIM illustration...") zim_illustration = self._fetch_zim_illustration() From 08ae6c2b23f6e4e2e0db34cf58e8bc8ec1a573e3 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 3 Mar 2026 10:48:25 +0000 Subject: [PATCH 2/2] Apply backpressure to add_item_for operations --- scraper/src/maps2zim/processor.py | 64 +++++++++++++++++++++++++------ 1 file changed, 52 insertions(+), 12 deletions(-) diff --git a/scraper/src/maps2zim/processor.py b/scraper/src/maps2zim/processor.py index 87190ec..091a388 100644 --- a/scraper/src/maps2zim/processor.py +++ b/scraper/src/maps2zim/processor.py @@ -6,6 +6,7 @@ import re import sqlite3 import tarfile +import threading import time import zipfile from importlib import resources @@ -20,6 +21,7 @@ from zimscraperlib.image import convert_image, resize_image from zimscraperlib.image.conversion import convert_svg2png from zimscraperlib.image.probing import format_for +from zimscraperlib.typing import Callback from zimscraperlib.zim import Creator, metadata from zimscraperlib.zim.dedup import Deduplicator from zimscraperlib.zim.filesystem import ( @@ -81,6 +83,9 @@ def __init__(self) -> None: # could happen in the loop in terms of exit conditions self.stats_items_total = 1 + # Semaphore for backpressure: limit items in-flight to 100 + self._inflight_semaphore = threading.Semaphore(100) + def run(self) -> Path: """Generates a zim for a single document. @@ -200,7 +205,8 @@ def _run_internal(self) -> Path: # Start creator early to detect problems early. with creator as creator: try: - creator.add_item_for( + self._add_item_for( + creator, "favicon.ico", content=self._fetch_favicon_from_illustration( zim_illustration @@ -225,12 +231,36 @@ def _run_internal(self) -> Path: return zim_path + def _add_item_for( + self, creator: Creator, path: str, title: str | None = None, **kwargs: Any + ) -> None: + """Wrapper for creator.add_item_for with backpressure. + + Blocks when 100 items are already in-flight, releases a slot when + the item is finalized (garbage-collected by libzim). + """ + self._inflight_semaphore.acquire() + + existing_callbacks = kwargs.pop("callbacks", None) + callbacks: list[Callback] = [] + if existing_callbacks is not None: + if isinstance(existing_callbacks, list): + callbacks.extend( + existing_callbacks # pyright: ignore[reportUnknownArgumentType] + ) + else: + callbacks.append(existing_callbacks) + callbacks.append(Callback(func=self._inflight_semaphore.release)) + + creator.add_item_for(path, title, callbacks=callbacks, **kwargs) + def run_with_creator(self, creator: Creator): context.current_thread_workitem = "standard files" logger.info(" Storing configuration...") - creator.add_item_for( + self._add_item_for( + creator, "content/config.json", content=ConfigModel( secondary_color=self.zim_config.secondary_color, @@ -260,7 +290,8 @@ def run_with_creator(self, creator: Creator): logger.debug(f"Adding {path} to ZIM") if path == "index.html": # Change index.html title and add to ZIM index_html_path = context.zimui_dist / path - creator.add_item_for( + self._add_item_for( + creator, path=path, content=index_html_path.read_text(encoding="utf-8").replace( "Vite App", @@ -270,7 +301,8 @@ def run_with_creator(self, creator: Creator): is_front=True, ) else: - creator.add_item_for( + self._add_item_for( + creator, path=path, fpath=file, is_front=False, @@ -786,7 +818,8 @@ def _write_sprites(self, creator: Creator): content = f.read() # Transform path from ofm_f384/... to sprites/ofm_f384/... zim_path = f"sprites/{member.name}" - creator.add_item_for( + self._add_item_for( + creator, path=zim_path, content=content, ) @@ -863,7 +896,8 @@ def _write_styles(self, creator: Creator): if relative_path.endswith(".json"): relative_path = relative_path[:-5] zim_path = f"styles/{relative_path}" - creator.add_item_for( + self._add_item_for( + creator, path=zim_path, content=content, ) @@ -1181,7 +1215,8 @@ def _write_dedupl_files( dedupl_path = self._dedupl_helper_path(dedupl_id) # Add to ZIM - creator.add_item_for( + self._add_item_for( + creator, path=f"dedupl/{dedupl_path}", content=tile_data, mimetype="application/x-protobuf", @@ -1368,7 +1403,8 @@ def _write_tilejson(self, creator: Creator): # Write TileJSON to ZIM tilejson_content = json.dumps(tilejson, ensure_ascii=False, indent=2) - creator.add_item_for( + self._add_item_for( + creator, path="planet", content=tilejson_content.encode("utf-8"), mimetype="application/json", @@ -1615,7 +1651,8 @@ def _write_places( # Add CSS file to ZIM assets = resources.files("maps2zim") / "assets" styles_path = Path(str(assets / "styles.css")) - creator.add_item_for( + self._add_item_for( + creator, path="content/styles.css", fpath=styles_path, mimetype="text/css", @@ -1648,7 +1685,8 @@ def _write_places( # Single place: create redirect place = places[0] redirect_html = self._create_redirect_html(place, root_prefix) - creator.add_item_for( + self._add_item_for( + creator, path=path, content=redirect_html.encode("utf-8"), mimetype="text/html", @@ -1660,7 +1698,8 @@ def _write_places( disamb_html = self._create_disambiguation_html( name, places, root_prefix ) - creator.add_item_for( + self._add_item_for( + creator, path=path, content=disamb_html.encode("utf-8"), mimetype="text/html", @@ -1931,7 +1970,8 @@ def _write_about_html(self, creator: Creator) -> None: ) # Add to ZIM - creator.add_item_for( + self._add_item_for( + creator, path="content/about.html", content=about_html.encode("utf-8"), mimetype="text/html",