From fd483092a7bfa1054bb931833930b5dd5df75cd4 Mon Sep 17 00:00:00 2001 From: andreasntr Date: Fri, 3 Apr 2026 18:54:17 +0200 Subject: [PATCH 1/7] add duplicate images handling --- src/__init__.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/src/__init__.py b/src/__init__.py index 1256fc450..fcbdbd8a1 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -29,6 +29,7 @@ import warnings import weakref import zipfile +from operator import itemgetter from . import extra @@ -2923,6 +2924,8 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0 self.is_encrypted = False self.is_encrypted = False self.metadata = None + self.has_duplicate_images = False + self.images_xrefs_by_page = None self.FontInfos = [] self.Graftmaps = {} self.ShownPages = {} @@ -3047,6 +3050,26 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0 self.page_count2 = extra.page_count_pdf else: self.page_count2 = extra.page_count_fz + + if len(self.page_count) > 1: + self.has_duplicate_images = True + first_page_n_images = len(self.get_page_images(0)) + for page in self.pages(start=1): + # we need at least one page with a different number of images + # to exclude full document duplication + if len(page.get_images()) != first_page_n_images: + self.has_duplicate_images = False + break + + if self.has_duplicate_images: + self.images_xrefs_by_page = [] + for page in self.pages(): + # store only images referenced by page + page_xrefs = list(map( + itemgetter("xref"), + page.get_image_info(xrefs=True) + )) + self.images_xrefs_by_page = page_xrefs finally: JM_mupdf_show_errors = JM_mupdf_show_errors_old @@ -5090,7 +5113,14 @@ def get_page_images(self, pno: int, full: bool =False) -> list: return () val = self._getPageInfo(pno, 2) if not full: - return [v[:-1] for v in val] + val = [v[:-1] for v in val] + if self.has_duplicate_images: + deduplicated_val = [] + for v in val: + # v[0] is "xref" + if v[0] in self.images_xrefs_by_page[pno]: + deduplicated_val.append(v) + return deduplicated_val return val def get_page_labels(self): From b218bbfc1015c54562bd7acad390500e6187a8e2 Mon Sep 17 00:00:00 2001 From: andreasntr Date: Fri, 3 Apr 2026 19:10:11 +0200 Subject: [PATCH 2/7] resolve circular dependency on first images duplication check --- src/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/__init__.py b/src/__init__.py index fcbdbd8a1..1cb9e5c02 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -3052,14 +3052,15 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0 self.page_count2 = extra.page_count_fz if len(self.page_count) > 1: - self.has_duplicate_images = True + has_duplicate_images = True first_page_n_images = len(self.get_page_images(0)) for page in self.pages(start=1): # we need at least one page with a different number of images # to exclude full document duplication if len(page.get_images()) != first_page_n_images: - self.has_duplicate_images = False + has_duplicate_images = False break + self.has_duplicate_images = has_duplicate_images if self.has_duplicate_images: self.images_xrefs_by_page = [] From ba2e3beacd3047e638d2ba110d70a2986784e4c2 Mon Sep 17 00:00:00 2001 From: andreasntr Date: Tue, 14 Apr 2026 19:13:36 +0200 Subject: [PATCH 3/7] fix length check over page_count --- src/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/__init__.py b/src/__init__.py index 1cb9e5c02..1ab0bac00 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -3051,7 +3051,7 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0 else: self.page_count2 = extra.page_count_fz - if len(self.page_count) > 1: + if self.page_count > 1: has_duplicate_images = True first_page_n_images = len(self.get_page_images(0)) for page in self.pages(start=1): From cd3bf19ac4a4840b5eda64f71e7a4e24eb1cbd75 Mon Sep 17 00:00:00 2001 From: andreasntr Date: Tue, 14 Apr 2026 22:19:48 +0200 Subject: [PATCH 4/7] perform images duplication check only for pdfs --- pipcl.py | 6 +++--- setup.py | 6 +++--- src/__init__.py | 23 +++++++++++++---------- tests/.gitignore | 5 +++++ tests/README.md | 2 +- tests/conftest.py | 2 +- tests/test_pixmap.py | 2 +- tests/util.py | 2 +- 8 files changed, 28 insertions(+), 20 deletions(-) create mode 100644 tests/.gitignore diff --git a/pipcl.py b/pipcl.py index 9f3c93538..d15747eed 100644 --- a/pipcl.py +++ b/pipcl.py @@ -295,7 +295,7 @@ class Package: >>> print('Installing from wheel into venv using pip.', file=sys.stderr) >>> _ = subprocess.run( - ... f'. pipcl_test/pylocal/bin/activate && pip install pipcl_test/dist/*.whl', + ... f'. pipcl_test/pylocal/bin/activate && uv pip install pipcl_test/dist/*.whl', ... shell=1, check=1) >>> print('Running foo_cli.', file=sys.stderr) @@ -2675,9 +2675,9 @@ def _macos_fixup_platform_tag(tag): platform tags seem more restricted than platform tags from sysconfig.get_platform(). For example: - pip install ...-macosx_10_13_arm64.whl + uv pip install ...-macosx_10_13_arm64.whl ERROR: ...-macosx_10_13_arm64.whl is not a supported wheel on this platform. - pip install ...-macosx_10_13_universal2.whl + uv pip install ...-macosx_10_13_universal2.whl Ok. ''' m = re.match( '^macosx_([0-9_]+)_([^0-9].+)$', tag) diff --git a/setup.py b/setup.py index 651838312..b945fd0bc 100755 --- a/setup.py +++ b/setup.py @@ -1375,7 +1375,7 @@ def get_requires_for_build_wheel(config_settings=None): assert 0, f'Unrecognised {PYMUPDF_SETUP_FLAVOUR=}.' if os.environ.get('PYODIDE_ROOT'): - # We can't pip install pytest on pyodide, so specify it here. + # We can't uv pip install pytest on pyodide, so specify it here. requires_dist.append('pytest') p = pipcl.Package( @@ -1438,9 +1438,9 @@ def platform_release_tuple(): else: ret.append('libclang') if msys2: - print(f'msys2: pip install of swig does not build; assuming `pacman -S swig`.') + print(f'msys2: uv pip install of swig does not build; assuming `pacman -S swig`.') elif openbsd: - print(f'OpenBSD: pip install of swig does not build; assuming `pkg_add swig`.') + print(f'OpenBSD: uv pip install of swig does not build; assuming `pkg_add swig`.') elif PYMUPDF_SETUP_SWIG: pass elif darwin and python_version_tuple < (3, 13): diff --git a/src/__init__.py b/src/__init__.py index 1ab0bac00..46c921248 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -3051,7 +3051,10 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0 else: self.page_count2 = extra.page_count_fz - if self.page_count > 1: + # if the doc is a PDF, check for images duplication across pages + # this may happen, e.g., when converting from MS Office formats with external tools + # if the PDF has 1 page only, there is no possibility of duplication across pages + if self.is_pdf and self.page_count > 1: has_duplicate_images = True first_page_n_images = len(self.get_page_images(0)) for page in self.pages(start=1): @@ -3062,15 +3065,15 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0 break self.has_duplicate_images = has_duplicate_images - if self.has_duplicate_images: - self.images_xrefs_by_page = [] - for page in self.pages(): - # store only images referenced by page - page_xrefs = list(map( - itemgetter("xref"), - page.get_image_info(xrefs=True) - )) - self.images_xrefs_by_page = page_xrefs + if self.has_duplicate_images: + self.images_xrefs_by_page = [] + for page in self.pages(): + # store only images referenced by page + page_xrefs = list(map( + itemgetter("xref"), + page.get_image_info(xrefs=True) + )) + self.images_xrefs_by_page = page_xrefs finally: JM_mupdf_show_errors = JM_mupdf_show_errors_old diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 000000000..9b4cf8e95 --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1,5 @@ +# **/*.png +# **/*.jpg +# **/*.py +**/test_*.py +# **/*.pdf \ No newline at end of file diff --git a/tests/README.md b/tests/README.md index 3c0fb16fa..c6d49ddcc 100644 --- a/tests/README.md +++ b/tests/README.md @@ -11,7 +11,7 @@ To run these tests: For example, as of 2023-12-11: ``` -> python -m pip install pytest fontTools psutil pymupdf-fonts pillow +> python -m uv pip install pytest fontTools psutil pymupdf-fonts pillow > pytest PyMuPDF ============================= test session starts ============================== platform linux -- Python 3.11.2, pytest-7.4.3, pluggy-1.3.0 diff --git a/tests/conftest.py b/tests/conftest.py index c79e69906..685fb7bf0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -32,7 +32,7 @@ def install_required_packages(): pass else: packages += ' psutil' - command = f'pip install --upgrade {packages}' + command = f'uv pip install --upgrade {packages}' print(f'{__file__}:install_required_packages)(): Running: {command}', flush=1) subprocess.run(command, shell=1, check=1) diff --git a/tests/test_pixmap.py b/tests/test_pixmap.py index 81c764cb1..73464d39f 100644 --- a/tests/test_pixmap.py +++ b/tests/test_pixmap.py @@ -491,7 +491,7 @@ def test_4336(): venv = os.path.normpath(f'{__file__}/../../tests/resources/test_4336_venv') command = f'{sys.executable} -m venv {venv}' command += f' && . {venv}/bin/activate' - command += f' && pip install --force-reinstall pymupdf==1.23.8' + command += f' && uv pip install --force-reinstall pymupdf==1.23.8' command += f' && python {path_code}' print(f'Running: {command}', flush=1) subprocess.run(command, shell=1, check=1) diff --git a/tests/util.py b/tests/util.py index 32785f673..d3179c52b 100644 --- a/tests/util.py +++ b/tests/util.py @@ -16,7 +16,7 @@ def download(url, name, size=None): print(f'Using existing file {path=}.') else: print(f'Downloading from {url=}.') - subprocess.run(f'pip install -U requests', check=1, shell=1) + subprocess.run(f'uv pip install -U requests', check=1, shell=1) import requests r = requests.get(url, path, timeout=10) r.raise_for_status() From fdd21bf205cc10f4f6a4be0f1a95e0d886c66fa0 Mon Sep 17 00:00:00 2001 From: andreasntr Date: Tue, 14 Apr 2026 22:26:28 +0200 Subject: [PATCH 5/7] revert uv push --- pipcl.py | 6 +++--- setup.py | 6 +++--- tests/README.md | 2 +- tests/conftest.py | 2 +- tests/util.py | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pipcl.py b/pipcl.py index d15747eed..9f3c93538 100644 --- a/pipcl.py +++ b/pipcl.py @@ -295,7 +295,7 @@ class Package: >>> print('Installing from wheel into venv using pip.', file=sys.stderr) >>> _ = subprocess.run( - ... f'. pipcl_test/pylocal/bin/activate && uv pip install pipcl_test/dist/*.whl', + ... f'. pipcl_test/pylocal/bin/activate && pip install pipcl_test/dist/*.whl', ... shell=1, check=1) >>> print('Running foo_cli.', file=sys.stderr) @@ -2675,9 +2675,9 @@ def _macos_fixup_platform_tag(tag): platform tags seem more restricted than platform tags from sysconfig.get_platform(). For example: - uv pip install ...-macosx_10_13_arm64.whl + pip install ...-macosx_10_13_arm64.whl ERROR: ...-macosx_10_13_arm64.whl is not a supported wheel on this platform. - uv pip install ...-macosx_10_13_universal2.whl + pip install ...-macosx_10_13_universal2.whl Ok. ''' m = re.match( '^macosx_([0-9_]+)_([^0-9].+)$', tag) diff --git a/setup.py b/setup.py index b945fd0bc..651838312 100755 --- a/setup.py +++ b/setup.py @@ -1375,7 +1375,7 @@ def get_requires_for_build_wheel(config_settings=None): assert 0, f'Unrecognised {PYMUPDF_SETUP_FLAVOUR=}.' if os.environ.get('PYODIDE_ROOT'): - # We can't uv pip install pytest on pyodide, so specify it here. + # We can't pip install pytest on pyodide, so specify it here. requires_dist.append('pytest') p = pipcl.Package( @@ -1438,9 +1438,9 @@ def platform_release_tuple(): else: ret.append('libclang') if msys2: - print(f'msys2: uv pip install of swig does not build; assuming `pacman -S swig`.') + print(f'msys2: pip install of swig does not build; assuming `pacman -S swig`.') elif openbsd: - print(f'OpenBSD: uv pip install of swig does not build; assuming `pkg_add swig`.') + print(f'OpenBSD: pip install of swig does not build; assuming `pkg_add swig`.') elif PYMUPDF_SETUP_SWIG: pass elif darwin and python_version_tuple < (3, 13): diff --git a/tests/README.md b/tests/README.md index c6d49ddcc..3c0fb16fa 100644 --- a/tests/README.md +++ b/tests/README.md @@ -11,7 +11,7 @@ To run these tests: For example, as of 2023-12-11: ``` -> python -m uv pip install pytest fontTools psutil pymupdf-fonts pillow +> python -m pip install pytest fontTools psutil pymupdf-fonts pillow > pytest PyMuPDF ============================= test session starts ============================== platform linux -- Python 3.11.2, pytest-7.4.3, pluggy-1.3.0 diff --git a/tests/conftest.py b/tests/conftest.py index 685fb7bf0..c79e69906 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -32,7 +32,7 @@ def install_required_packages(): pass else: packages += ' psutil' - command = f'uv pip install --upgrade {packages}' + command = f'pip install --upgrade {packages}' print(f'{__file__}:install_required_packages)(): Running: {command}', flush=1) subprocess.run(command, shell=1, check=1) diff --git a/tests/util.py b/tests/util.py index d3179c52b..32785f673 100644 --- a/tests/util.py +++ b/tests/util.py @@ -16,7 +16,7 @@ def download(url, name, size=None): print(f'Using existing file {path=}.') else: print(f'Downloading from {url=}.') - subprocess.run(f'uv pip install -U requests', check=1, shell=1) + subprocess.run(f'pip install -U requests', check=1, shell=1) import requests r = requests.get(url, path, timeout=10) r.raise_for_status() From ceb4a8a31b105c5a831c1c937691405db5b0d811 Mon Sep 17 00:00:00 2001 From: andreasntr Date: Tue, 14 Apr 2026 22:27:25 +0200 Subject: [PATCH 6/7] revert tests gitignore addition --- tests/.gitignore | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 tests/.gitignore diff --git a/tests/.gitignore b/tests/.gitignore deleted file mode 100644 index 9b4cf8e95..000000000 --- a/tests/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -# **/*.png -# **/*.jpg -# **/*.py -**/test_*.py -# **/*.pdf \ No newline at end of file From 3b29476ac28c2d700750b244aa430ab035fa2c76 Mon Sep 17 00:00:00 2001 From: andreasntr Date: Tue, 14 Apr 2026 22:28:36 +0200 Subject: [PATCH 7/7] revert uv push --- tests/test_pixmap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_pixmap.py b/tests/test_pixmap.py index 73464d39f..81c764cb1 100644 --- a/tests/test_pixmap.py +++ b/tests/test_pixmap.py @@ -491,7 +491,7 @@ def test_4336(): venv = os.path.normpath(f'{__file__}/../../tests/resources/test_4336_venv') command = f'{sys.executable} -m venv {venv}' command += f' && . {venv}/bin/activate' - command += f' && uv pip install --force-reinstall pymupdf==1.23.8' + command += f' && pip install --force-reinstall pymupdf==1.23.8' command += f' && python {path_code}' print(f'Running: {command}', flush=1) subprocess.run(command, shell=1, check=1)