44"""
55import json
66import os
7+ import gzip
78from collections import Iterable
89import configparser
910from ..utils .logger import get_logger
@@ -79,7 +80,7 @@ def read_lines_lazy(filename, encoding=_ENCODING_UTF8, keep_end=False,
7980 file .close ()
8081
8182
def read_file(filename, encoding=_ENCODING_UTF8, default=None, is_gzip=False):
    """
    wrap open function to read the whole text of a file
    :param filename: file path
    :param encoding: text encoding used to decode the file content
    :param default: returned value when filename is not existed.
            If it's None, exception will be raised as usual.
    :param is_gzip: whether input file is gzip format
    :return: full content of the file as text, or ``default`` when the
            file is missing and ``default`` is not None
    """
    if not os.path.exists(filename) and default is not None:
        return default
    # Both openers return a text-mode file object decoded with `encoding`.
    if is_gzip:
        handle = gzip.open(filename, 'rt', encoding=encoding)
    else:
        handle = open(filename, encoding=encoding)
    # The context manager closes the handle even when read() raises
    # (e.g. decode error or corrupt gzip stream).
    with handle as f:
        return f.read()
95102
96103
97104def write_file (filename , data , encoding = _ENCODING_UTF8 ):
@@ -163,66 +170,79 @@ def write_json(filename, data, serialize_method=None):
163170 json .dump (data , f , ensure_ascii = False , default = serialize_method )
164171
165172
def read_jsonline(filename, encoding=_ENCODING_UTF8, default=None, is_gzip=False):
    """
    read jsonl file
    :param filename: source file path
    :param encoding: file encoding
    :param default: returned value when filename is not existed.
            If it's None, exception will be raised as usual.
    :param is_gzip: whether input file is gzip format
    :return: object list, an object corresponding a line
    """
    if not os.path.exists(filename) and default is not None:
        return default
    # Both openers return a text-mode file object that iterates line by line.
    if is_gzip:
        handle = gzip.open(filename, 'rt', encoding=encoding)
    else:
        handle = open(filename, encoding=encoding)
    # The context manager closes the handle even when json.loads raises
    # on a malformed line.
    with handle as lines:
        return [json.loads(line) for line in lines]
183194
184195
def read_jsonline_lazy(filename, encoding=_ENCODING_UTF8, default=None, is_gzip=False):
    """
    use generator to load jsonl one line every time
    :param filename: source file path
    :param encoding: file encoding
    :param default: when filename is not existed and default is not None,
            the generator finishes without yielding anything.
            If it's None, exception will be raised as usual.
    :param is_gzip: whether input file is gzip file
    :return: generator yielding one json object per line
    """
    # NOTE: this is a generator function, so nothing below runs until the
    # first item is requested, and `return default` only ends the iteration;
    # the value of `default` itself is not handed to the caller.
    if not os.path.exists(filename) and default is not None:
        return default
    if is_gzip:
        handle = gzip.open(filename, 'rt', encoding=encoding)
    else:
        handle = open(filename, encoding=encoding)
    # The context manager closes the handle when the generator is exhausted,
    # closed early by the consumer, or aborted by a json.loads error.
    with handle as lines:
        for line in lines:
            yield json.loads(line)
200215
201216
def get_jsonline_chunk_lazy(filename, chunk_size, encoding=_ENCODING_UTF8,
                            default=None, is_gzip=False):
    """
    lazily yield the items of a jsonline file grouped into chunks
    :param filename: source jsonline file
    :param chunk_size: chunk size, forwarded to get_chunk
    :param encoding: file encoding
    :param default: forwarded to read_jsonline_lazy for the missing-file case
    :param is_gzip: whether input file is gzip file
    :return: generator yielding the chunks produced by get_chunk
    """
    # Items are read one at a time; get_chunk does the grouping.
    records = read_jsonline_lazy(filename, encoding=encoding,
                                 default=default, is_gzip=is_gzip)
    batches = get_chunk(records, chunk_size)
    for batch in batches:
        yield batch
214231
215232
def get_jsonline_chunk(filename, chunk_size, encoding=_ENCODING_UTF8,
                       default=None, is_gzip=False):
    """
    read a jsonline file and return all of its chunks at once
    :param filename: source jsonline file
    :param chunk_size: chunk size, forwarded to get_chunk
    :param encoding: file encoding
    :param default: forwarded to read_jsonline_lazy for the missing-file case
    :param is_gzip: whether input file is gzip format
    :return: list of the chunks produced by get_chunk
    """
    records = read_jsonline_lazy(filename, encoding=encoding,
                                 default=default, is_gzip=is_gzip)
    # Materialise every chunk eagerly, unlike get_jsonline_chunk_lazy.
    return list(get_chunk(records, chunk_size))
227247
228248
0 commit comments