Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions scraper/src/maps2zim/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,9 @@ class Context:
# Geonames region to download (e.g. "allCountries", "FR", "US")
geonames_region: str = "allCountries"

# Number of worker threads for the ZIM creator (None keeps the libzim default)
zim_workers: int | None = None

@classmethod
def setup(cls, **kwargs: Any):
new_instance = cls(**kwargs)
Expand Down
7 changes: 7 additions & 0 deletions scraper/src/maps2zim/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,13 @@ def prepare_context(raw_args: list[str], tmpdir: str) -> None:
dest="geonames_region",
)

parser.add_argument(
"--zim-workers",
type=int,
help="Number of worker threads for the ZIM creator. Default: libzim default",
dest="zim_workers",
)

args = parser.parse_args(raw_args)

# Ignore unset values so they do not override the default specified in Context
Expand Down
66 changes: 54 additions & 12 deletions scraper/src/maps2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import re
import sqlite3
import tarfile
import threading
import time
import zipfile
from importlib import resources
Expand All @@ -20,6 +21,7 @@
from zimscraperlib.image import convert_image, resize_image
from zimscraperlib.image.conversion import convert_svg2png
from zimscraperlib.image.probing import format_for
from zimscraperlib.typing import Callback
from zimscraperlib.zim import Creator, metadata
from zimscraperlib.zim.dedup import Deduplicator
from zimscraperlib.zim.filesystem import (
Expand Down Expand Up @@ -81,6 +83,9 @@ def __init__(self) -> None:
# could happen in the loop in terms of exit conditions
self.stats_items_total = 1

# Semaphore for backpressure: limit items in-flight to 100
self._inflight_semaphore = threading.Semaphore(100)

def run(self) -> Path:
"""Generates a zim for a single document.

Expand Down Expand Up @@ -157,6 +162,8 @@ def _run_internal(self) -> Path:
logger.debug(f"User-Agent: {context.wm_user_agent}")

creator = Creator(zim_path, "index.html")
if context.zim_workers is not None:
creator.config_nbworkers(context.zim_workers)

logger.info(" Fetching ZIM illustration...")
zim_illustration = self._fetch_zim_illustration()
Expand Down Expand Up @@ -198,7 +205,8 @@ def _run_internal(self) -> Path:
# Start creator early to detect problems early.
with creator as creator:
try:
creator.add_item_for(
self._add_item_for(
creator,
"favicon.ico",
content=self._fetch_favicon_from_illustration(
zim_illustration
Expand All @@ -223,12 +231,36 @@ def _run_internal(self) -> Path:

return zim_path

def _add_item_for(
    self, creator: Creator, path: str, title: str | None = None, **kwargs: Any
) -> None:
    """Add an item through ``creator.add_item_for`` with backpressure.

    Blocks until an in-flight slot is available on
    ``self._inflight_semaphore``, then forwards the call to the creator
    with an extra callback that gives the slot back once the creator is
    done with the item (presumably when the item is finalized after libzim
    has consumed it -- confirm against zimscraperlib).

    Args:
        creator: ZIM creator the item is added to.
        path: Path of the item inside the ZIM.
        title: Optional title, forwarded positionally to the creator.
        **kwargs: Any other keyword accepted by ``creator.add_item_for``.
            A caller-supplied ``callbacks`` value (a single callback or a
            list of callbacks) is preserved and runs before the slot
            release callback.

    Raises:
        Exception: Whatever ``creator.add_item_for`` raises is re-raised
            unchanged, after the in-flight slot has been given back.
    """
    self._inflight_semaphore.acquire()

    # The slot must be returned exactly once, whichever comes first: the
    # item callback or a failure of add_item_for below. The callback may be
    # invoked from another thread, hence the lock around the flag.
    release_lock = threading.Lock()
    released = False

    def release_once() -> None:
        nonlocal released
        with release_lock:
            if released:
                return
            released = True
        self._inflight_semaphore.release()

    existing_callbacks = kwargs.pop("callbacks", None)
    callbacks: list[Callback] = []
    if existing_callbacks is not None:
        if isinstance(existing_callbacks, list):
            callbacks.extend(
                existing_callbacks  # pyright: ignore[reportUnknownArgumentType]
            )
        else:
            callbacks.append(existing_callbacks)
    callbacks.append(Callback(func=release_once))

    try:
        creator.add_item_for(path, title, callbacks=callbacks, **kwargs)
    except Exception:
        # If the creator failed before taking ownership of the callbacks,
        # nothing would ever release the slot; do it here so that repeated
        # failures cannot exhaust the semaphore and deadlock later calls.
        release_once()
        raise

def run_with_creator(self, creator: Creator):

context.current_thread_workitem = "standard files"

logger.info(" Storing configuration...")
creator.add_item_for(
self._add_item_for(
creator,
"content/config.json",
content=ConfigModel(
secondary_color=self.zim_config.secondary_color,
Expand Down Expand Up @@ -258,7 +290,8 @@ def run_with_creator(self, creator: Creator):
logger.debug(f"Adding {path} to ZIM")
if path == "index.html": # Change index.html title and add to ZIM
index_html_path = context.zimui_dist / path
creator.add_item_for(
self._add_item_for(
creator,
path=path,
content=index_html_path.read_text(encoding="utf-8").replace(
"<title>Vite App</title>",
Expand All @@ -268,7 +301,8 @@ def run_with_creator(self, creator: Creator):
is_front=True,
)
else:
creator.add_item_for(
self._add_item_for(
creator,
path=path,
fpath=file,
is_front=False,
Expand Down Expand Up @@ -784,7 +818,8 @@ def _write_sprites(self, creator: Creator):
content = f.read()
# Transform path from ofm_f384/... to sprites/ofm_f384/...
zim_path = f"sprites/{member.name}"
creator.add_item_for(
self._add_item_for(
creator,
path=zim_path,
content=content,
)
Expand Down Expand Up @@ -861,7 +896,8 @@ def _write_styles(self, creator: Creator):
if relative_path.endswith(".json"):
relative_path = relative_path[:-5]
zim_path = f"styles/{relative_path}"
creator.add_item_for(
self._add_item_for(
creator,
path=zim_path,
content=content,
)
Expand Down Expand Up @@ -1179,7 +1215,8 @@ def _write_dedupl_files(
dedupl_path = self._dedupl_helper_path(dedupl_id)

# Add to ZIM
creator.add_item_for(
self._add_item_for(
creator,
path=f"dedupl/{dedupl_path}",
content=tile_data,
mimetype="application/x-protobuf",
Expand Down Expand Up @@ -1366,7 +1403,8 @@ def _write_tilejson(self, creator: Creator):

# Write TileJSON to ZIM
tilejson_content = json.dumps(tilejson, ensure_ascii=False, indent=2)
creator.add_item_for(
self._add_item_for(
creator,
path="planet",
content=tilejson_content.encode("utf-8"),
mimetype="application/json",
Expand Down Expand Up @@ -1613,7 +1651,8 @@ def _write_places(
# Add CSS file to ZIM
assets = resources.files("maps2zim") / "assets"
styles_path = Path(str(assets / "styles.css"))
creator.add_item_for(
self._add_item_for(
creator,
path="content/styles.css",
fpath=styles_path,
mimetype="text/css",
Expand Down Expand Up @@ -1646,7 +1685,8 @@ def _write_places(
# Single place: create redirect
place = places[0]
redirect_html = self._create_redirect_html(place, root_prefix)
creator.add_item_for(
self._add_item_for(
creator,
path=path,
content=redirect_html.encode("utf-8"),
mimetype="text/html",
Expand All @@ -1658,7 +1698,8 @@ def _write_places(
disamb_html = self._create_disambiguation_html(
name, places, root_prefix
)
creator.add_item_for(
self._add_item_for(
creator,
path=path,
content=disamb_html.encode("utf-8"),
mimetype="text/html",
Expand Down Expand Up @@ -1929,7 +1970,8 @@ def _write_about_html(self, creator: Creator) -> None:
)

# Add to ZIM
creator.add_item_for(
self._add_item_for(
creator,
path="content/about.html",
content=about_html.encode("utf-8"),
mimetype="text/html",
Expand Down
Loading