1+ import os
2+ import random
3+ import string
4+ from datetime import datetime
5+ from pathlib import Path
6+ from typing import Optional , Union
7+ from urllib .parse import urlparse
8+
9+ import requests
10+
111from mindee .error .mindee_error import MindeeSourceError
12+ from mindee .input .sources .bytes_input import BytesInput
213from mindee .input .sources .local_input_source import InputType
314from mindee .logger import logger
415
@@ -13,7 +24,7 @@ def __init__(self, url: str) -> None:
1324 """
1425 Input document from a base64 encoded string.
1526
16- :param url: URL to send, must be HTTPS
27+ :param url: URL to send, must be HTTPS.
1728 """
1829 if not url .lower ().startswith ("https" ):
1930 raise MindeeSourceError ("URL must be HTTPS" )
@@ -23,3 +34,175 @@ def __init__(self, url: str) -> None:
2334 logger .debug ("URL input: %s" , url )
2435
2536 self .url = url
37+
def __fetch_file_content(
    self,
    username: Optional[str] = None,
    password: Optional[str] = None,
    token: Optional[str] = None,
    headers: Optional[dict] = None,
    max_redirects: int = 3,
) -> bytes:
    """
    Fetch the content of the file located at ``self.url``.

    Basic authentication is only used when both ``username`` and ``password``
    are provided. When ``token`` is provided, it is sent as a bearer token in
    the ``Authorization`` header, overriding any such entry from ``headers``.

    :param username: Optional username for authentication.
    :param password: Optional password for authentication.
    :param token: Optional token for authentication.
    :param headers: Optional additional headers for the request. The mapping
        passed by the caller is never modified.
    :param max_redirects: Maximum number of redirects to follow.
    :return: The content of the file as bytes.
    """
    # Work on a copy: adding the Authorization entry below must not leak into
    # a dict the caller may reuse for other requests.
    request_headers = dict(headers) if headers else {}
    if token:
        request_headers["Authorization"] = f"Bearer {token}"

    auth = (username, password) if username and password else None

    return UrlInputSource.__make_request(
        self.url, auth, request_headers, 0, max_redirects=max_redirects
    )
67+
def save_to_file(
    self,
    filepath: Union[Path, str],
    filename: Optional[str] = None,
    username: Optional[str] = None,
    password: Optional[str] = None,
    token: Optional[str] = None,
    headers: Optional[dict] = None,
    max_redirects: int = 3,
) -> Path:
    """
    Download the content of the URL and write it to disk.

    The content is fetched first; the destination is then built by joining
    ``filepath`` with the resolved filename.

    :param filepath: Directory path the file is written into.
    :param filename: Optional filename to give to the file.
    :param username: Optional username for authentication.
    :param password: Optional password for authentication.
    :param token: Optional token for authentication.
    :param headers: Optional additional headers for the request.
    :param max_redirects: Maximum number of redirects to follow.
    :return: The path to the saved file.
    """
    content = self.__fetch_file_content(
        username, password, token, headers, max_redirects
    )
    destination = Path(filepath) / self.__fill_filename(filename)
    # Binary write; any existing file at the destination is overwritten.
    destination.write_bytes(content)
    return destination
98+
def as_local_input_source(
    self,
    filename: Optional[str] = None,
    username: Optional[str] = None,
    password: Optional[str] = None,
    token: Optional[str] = None,
    headers: Optional[dict] = None,
    max_redirects: int = 3,
) -> BytesInput:
    """
    Download the content of the URL and wrap it in a BytesInput.

    :param filename: Optional filename for the BytesInput.
    :param username: Optional username for authentication.
    :param password: Optional password for authentication.
    :param token: Optional token for authentication.
    :param headers: Optional additional headers for the request.
    :param max_redirects: Maximum number of redirects to follow.
    :return: A BytesInput object containing the file content.
    """
    content = self.__fetch_file_content(
        username, password, token, headers, max_redirects
    )
    resolved_name = self.__fill_filename(filename)
    return BytesInput(content, resolved_name)
125+
126+ @staticmethod
127+ def __extract_filename_from_url (uri ) -> str :
128+ """
129+ Extract the filename from a given URL.
130+
131+ :param uri: The URL to extract the filename from.
132+ :return: The extracted filename or an empty string if not found.
133+ """
134+ filename = os .path .basename (urlparse (uri ).path )
135+ return filename if filename else ""
136+
137+ @staticmethod
138+ def __generate_file_name (extension = ".tmp" ) -> str :
139+ """
140+ Generate a unique filename with a timestamp and random string.
141+
142+ :param extension: The file extension to use (default is '.tmp').
143+ :return: A generated filename.
144+ """
145+ random_string = "" .join (
146+ random .choices (string .ascii_lowercase + string .digits , k = 8 )
147+ )
148+ timestamp = datetime .now ().strftime ("%Y-%m-%d_%H-%M-%S" )
149+ return f"mindee_temp_{ timestamp } _{ random_string } { extension } "
150+
151+ @staticmethod
152+ def __get_file_extension (filename ) -> Optional [str ]:
153+ """
154+ Get the extension from a filename.
155+
156+ :param filename: The filename to extract the extension from.
157+ :return: The lowercase file extension or None if not found.
158+ """
159+ ext = os .path .splitext (filename )[1 ]
160+ return ext .lower () if ext else None
161+
def __fill_filename(self, filename=None) -> str:
    """
    Fill in a filename if not provided or incomplete.

    When no filename is given, the last path segment of ``self.url`` is used.
    If the resulting name is empty or has no extension, it is replaced by a
    generated temporary name with a '.tmp' extension.

    :param filename: Optional filename to use.
    :return: A complete filename.
    """
    if filename is None:
        filename = UrlInputSource.__extract_filename_from_url(self.url)

    if not filename or not os.path.splitext(filename)[1]:
        # This branch only runs when the name has no extension, so looking the
        # extension up again always yields None, which used to be formatted
        # into the generated name as the text "None". Use '.tmp' explicitly.
        filename = self.__generate_file_name(extension=".tmp")

    return filename
178+
@staticmethod
def __make_request(url, auth, headers, redirects, max_redirects) -> bytes:
    """
    Makes an HTTP GET request to the given URL, while following redirections.

    Redirects are followed manually, one request per hop, so that
    ``max_redirects`` is actually enforced. Each request uses a 120 second
    timeout.

    :param url: The URL to request.
    :param auth: Authentication tuple (username, password).
    :param headers: Headers for the request.
    :param redirects: Current number of redirects.
    :param max_redirects: Maximum number of redirects to follow.
    :return: The content of the response.
    :raises MindeeSourceError: If max redirects are exceeded or the request fails.
    """
    from urllib.parse import urljoin, urlparse

    # Automatic redirect handling is disabled: otherwise the library resolves
    # redirects on its own and the 3xx branch below is almost never reached.
    result = requests.get(
        url, headers=headers, timeout=120, auth=auth, allow_redirects=False
    )

    if 299 < result.status_code < 400:
        if redirects >= max_redirects:
            raise MindeeSourceError(
                f"Can't reach URL after {redirects} out of {max_redirects} redirects, "
                f"aborting operation."
            )
        # The target lives in the response's Location header (the header
        # lookup on a requests response is case-insensitive).
        location = result.headers.get("location")
        if not location:
            # A 3xx without a target (e.g. 304) cannot be followed.
            raise MindeeSourceError(
                f"Couldn't retrieve file from server, error code {result.status_code}."
            )
        # Location may be relative to the URL that was just requested.
        next_url = urljoin(url, location)
        if urlparse(next_url).netloc != urlparse(url).netloc:
            # Do not forward credentials to a different host.
            auth = None
            headers = {
                key: value
                for key, value in (headers or {}).items()
                if key.lower() != "authorization"
            }
        return UrlInputSource.__make_request(
            next_url, auth, headers, redirects + 1, max_redirects
        )

    if result.status_code >= 400 or result.status_code < 200:
        raise MindeeSourceError(
            f"Couldn't retrieve file from server, error code {result.status_code}."
        )

    return result.content
0 commit comments