Skip to content

Commit 2128ef0

Browse files
chg: ✨ Use pikepdf to replace pymupdf (#41)
* chg: ✨ Use of pikepdf
* chg: ✅ Updated and added tests for pdfs Inputs
* chg: 🔧 Added pikepdf in requirements
* fix pylint and mypy

Co-authored-by: Ianaré Sévi <ianare@mindee.co>
1 parent fae922c commit 2128ef0

File tree

12 files changed

+80
-76
lines changed

12 files changed

+80
-76
lines changed

.github/workflows/test.yml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,11 @@ jobs:
1010
strategy:
1111
matrix:
1212
python-version:
13-
#- "3.5"
14-
- "3.6"
13+
#- "3.6"
1514
- "3.7"
1615
- "3.8"
1716
- "3.9"
18-
#- "3.10"
17+
- "3.10"
1918
steps:
2019
- uses: actions/checkout@v2
2120

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ The full documentation is available [here](https://developers.mindee.com/docs/ge
44

55
## Requirements
66

7-
This library is officially supported on Python 3.6 to 3.9.
7+
This library is officially supported on Python 3.7 to 3.10.
88

99
## Install
1010

mindee/__init__.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -93,14 +93,13 @@ def _wrap_response(self, input_file, response, document_type):
9393
"Receipt API %s HTTP error: %s"
9494
% (response.status_code, json.dumps(dict_response))
9595
)
96-
elif response.status_code > 201:
96+
if response.status_code > 201:
9797
return Response(
9898
http_response=dict_response,
9999
pages=[],
100100
document=None,
101101
document_type=document_type,
102102
)
103-
104103
return Response.format_response(dict_response, document_type, input_file)
105104

106105
def parse_passport(
@@ -264,8 +263,8 @@ def dump(self, path):
264263
:param path: file path for storing the response object
265264
:return: (void) save the json response
266265
"""
267-
with open(path, "w") as fp:
268-
json.dump(self.http_response, fp)
266+
with open(path, "w") as handle:
267+
json.dump(self.http_response, handle)
269268

270269
@staticmethod
271270
def load(json_path):
@@ -274,8 +273,8 @@ def load(json_path):
274273
:return: Full response object loaded from json file
275274
"""
276275
try:
277-
with open(json_path) as fp:
278-
json_response = json.load(fp)
276+
with open(json_path) as handle:
277+
json_response = json.load(handle)
279278

280279
file_input = Inputs.load(
281280
json_response["input_type"],

mindee/documents/passport.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -283,28 +283,28 @@ def __mrz_last_name_checksum(self):
283283
return True
284284

285285
@staticmethod
286-
def check_sum(s):
286+
def check_sum(to_check: str) -> str:
287287
"""
288288
https://en.wikipedia.org/wiki/Machine-readable_passport
289-
:param s: string
289+
:param to_check: string
290290
:return: checksum value for the string to_check
291291
"""
292292
checker = 0
293293
alpha_to_num = {c: 10 + i for i, c in enumerate("ABCDEFGHIJKLMNOPQRSTUVWXYZ")}
294-
for i, c in enumerate(s):
294+
for i, chk in enumerate(to_check):
295295
if i % 3 == 0:
296296
weight = 7
297297
elif i % 3 == 1:
298298
weight = 3
299299
else:
300300
weight = 1
301301

302-
if c == "<":
302+
if chk == "<":
303303
val = 0
304-
elif c.isalpha():
305-
val = alpha_to_num[c]
304+
elif chk.isalpha():
305+
val = alpha_to_num[chk]
306306
else:
307-
val = int(c)
307+
val = int(chk)
308308
checker += val * weight
309309
return str(checker % 10)
310310

mindee/inputs.py

Lines changed: 30 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import os
33
from base64 import decodebytes
44
from mimetypes import guess_type
5-
import fitz
5+
import pikepdf
66

77

88
class Inputs:
@@ -48,7 +48,7 @@ def __init__(
4848
self.file_extension = guess_type(file)[0]
4949

5050
if input_type == "dummy":
51-
self.file_object = ""
51+
self.file_object = None
5252
self.input_type = ""
5353
self.filename = ""
5454
self.filepath = ""
@@ -60,6 +60,8 @@ def __init__(
6060
)
6161

6262
if self.file_extension == "application/pdf":
63+
self.check_pdf_open()
64+
6365
count_pages = self.count_pdf_pages()
6466

6567
if cut_pdf is True:
@@ -99,47 +101,44 @@ def count_pdf_pages(self):
99101
:return: Number of pages in the Input file for pdfs
100102
"""
101103
self.file_object.seek(0)
102-
src = fitz.open(
103-
stream=self.file_object.read(),
104-
filetype=self.file_extension,
105-
filename=self.filename,
106-
)
107-
return len(src)
104+
with pikepdf.open(self.file_object) as pdf:
105+
return len(pdf.pages)
108106

109107
def merge_pdf_pages(self, pages_number):
110108
"""
111109
:param pages_number: List of page numbers to keep from the original pdf when merging
112110
:return: (void) Set the Input.file with the reconstructed pdf stream
113111
"""
114112
self.file_object.seek(0)
115-
src = fitz.open(stream=self.file_object.read(), filetype="pdf")
116-
doc = fitz.open()
117-
pdf_pages = [src[n] for n in pages_number]
118-
for spage in pdf_pages:
119-
width = spage.MediaBoxSize[0]
120-
height = spage.MediaBoxSize[1]
121-
r = fitz.Rect(0, 0, width, height)
122-
page = doc.new_page(-1, width=width, height=height)
123-
try:
124-
page.showPDFpage(r, src, spage.number)
125-
except:
126-
pass
113+
new_pdf = pikepdf.Pdf.new()
114+
with pikepdf.open(self.file_object) as pdf:
115+
for page_n in pages_number:
116+
new_pdf.pages.append(pdf.pages[page_n])
127117
self.file_object.close()
128-
self.file_object = io.BytesIO(doc.write())
118+
self.file_object = io.BytesIO()
119+
new_pdf.save(self.file_object)
129120

130121
def check_if_document_is_empty(self):
131122
"""
132123
:return: (void) Check if the document contains only empty pages
133124
"""
125+
self.file_object.seek(0)
126+
with pikepdf.open(self.file_object) as pdf:
127+
for _, page in enumerate(pdf.pages):
128+
if (
129+
"/Font" in page["/Resources"].keys()
130+
or "/XObject" in page["/Resources"].keys()
131+
or page["/Contents"]["/Length"] > 1000
132+
):
133+
return
134+
raise Exception("PDF pages are empty")
134135

136+
def check_pdf_open(self):
137+
"""
138+
:return: (void) Check if the document can be opened using pikepdf
139+
"""
135140
self.file_object.seek(0)
136-
src = fitz.open(stream=self.file_object.read(), filetype="pdf")
137-
fitz.open()
138-
for page in src:
139-
if (
140-
len(page.get_images()) > 0
141-
or len(page.get_cdrawings()) > 1
142-
or len(page.get_text()) > 0
143-
):
144-
return
145-
raise Exception("PDF pages are empty")
141+
try:
142+
pikepdf.open(self.file_object)
143+
except Exception as err:
144+
raise Exception("Couldn't open PDF file. %s" % err)

pyproject.toml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
[tool.black]
22
line-length = 88
3-
target-version = ['py35', 'py36', 'py37']
3+
target-version = ['py36', 'py37', 'py38']
44
include = '\.pyi?$'
55

66
[[tool.mypy.overrides]]
7-
module = ['fitz',]
7+
module = ['pikepdf',]
88
ignore_missing_imports = true
99

1010
[tool.pylint.'MESSAGES CONTROL']
@@ -23,9 +23,7 @@ disable=[
2323
'unidiomatic-typecheck',
2424
'arguments-differ',
2525
'inconsistent-return-statements',
26-
'invalid-name',
2726
'super-init-not-called',
28-
'no-else-raise',
2927
'raise-missing-from',
3028
'consider-iterating-dictionary',
3129
'unspecified-encoding',

requirements.txt

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,16 @@ chardet==4.0.0
1010
# via requests
1111
idna==2.10
1212
# via requests
13-
pymupdf==1.18.17
13+
lxml==4.7.1
14+
# via pikepdf
15+
packaging==21.3
16+
# via pikepdf
17+
pikepdf==4.3.1
1418
# via mindee (setup.py)
19+
pillow==9.0.0
20+
# via pikepdf
21+
pyparsing==3.0.6
22+
# via packaging
1523
pytz==2021.3
1624
# via mindee (setup.py)
1725
requests==2.25.1

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,13 @@
1616

1717

1818
requirements = [
19+
"pikepdf==4.3.1",
1920
"pytz==2021.3",
20-
"PyMuPDF==1.18.17",
2121
"requests==2.25.1",
2222
]
2323

2424
test_requirements = [
25-
"pytest==6.1.2",
25+
"pytest==6.2.5",
2626
"pytest-cov==2.11.1",
2727
]
2828

tests/data/pdfs/blank.pdf

6.48 KB
Binary file not shown.

tests/data/pdfs/blank_1.pdf

589 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)