@@ -780,143 +780,195 @@ def parse(
780780 output [asheetname ] = DataFrame ()
781781 continue
782782
783- is_list_header = False
784- is_len_one_list_header = False
785- if is_list_like (header ):
786- assert isinstance (header , Sequence )
787- is_list_header = True
788- if len (header ) == 1 :
789- is_len_one_list_header = True
790-
791- if is_len_one_list_header :
792- header = cast (Sequence [int ], header )[0 ]
793-
794- # forward fill and pull out names for MultiIndex column
795- header_names = None
796- if header is not None and is_list_like (header ):
797- assert isinstance (header , Sequence )
798-
799- header_names = []
800- control_row = [True ] * len (data [0 ])
801-
802- for row in header :
803- if is_integer (skiprows ):
804- assert isinstance (skiprows , int )
805- row += skiprows
806-
807- if row > len (data ) - 1 :
808- raise ValueError (
809- f"header index { row } exceeds maximum index "
810- f"{ len (data ) - 1 } of data." ,
811- )
812-
813- data [row ], control_row = fill_mi_header (data [row ], control_row )
814-
815- if index_col is not None :
816- header_name , _ = pop_header_name (data [row ], index_col )
817- header_names .append (header_name )
818-
819- # If there is a MultiIndex header and an index then there is also
820- # a row containing just the index name(s)
821- has_index_names = False
822- if is_list_header and not is_len_one_list_header and index_col is not None :
823- index_col_list : Sequence [int ]
824- if isinstance (index_col , int ):
825- index_col_list = [index_col ]
826- else :
827- assert isinstance (index_col , Sequence )
828- index_col_list = index_col
829-
830- # We have to handle mi without names. If any of the entries in the data
831- # columns are not empty, this is a regular row
832- assert isinstance (header , Sequence )
833- if len (header ) < len (data ):
834- potential_index_names = data [len (header )]
835- potential_data = [
836- x
837- for i , x in enumerate (potential_index_names )
838- if not control_row [i ] and i not in index_col_list
839- ]
840- has_index_names = all (x == "" or x is None for x in potential_data )
841-
842- if is_list_like (index_col ):
843- # Forward fill values for MultiIndex index.
844- if header is None :
845- offset = 0
846- elif isinstance (header , int ):
847- offset = 1 + header
848- else :
849- offset = 1 + max (header )
783+ output = self ._parse_sheet (
784+ data = data ,
785+ output = output ,
786+ asheetname = asheetname ,
787+ header = header ,
788+ names = names ,
789+ index_col = index_col ,
790+ usecols = usecols ,
791+ dtype = dtype ,
792+ skiprows = skiprows ,
793+ nrows = nrows ,
794+ true_values = true_values ,
795+ false_values = false_values ,
796+ na_values = na_values ,
797+ parse_dates = parse_dates ,
798+ date_parser = date_parser ,
799+ date_format = date_format ,
800+ thousands = thousands ,
801+ decimal = decimal ,
802+ comment = comment ,
803+ skipfooter = skipfooter ,
804+ dtype_backend = dtype_backend ,
805+ ** kwds ,
806+ )
850807
851- # GH34673: if MultiIndex names present and not defined in the header,
852- # offset needs to be incremented so that forward filling starts
853- # from the first MI value instead of the name
854- if has_index_names :
855- offset += 1
808+ if last_sheetname is None :
809+ raise ValueError ("Sheet name is an empty list" )
856810
857- # Check if we have an empty dataset
858- # before trying to collect data.
859- if offset < len ( data ) :
860- assert isinstance ( index_col , Sequence )
811+ if ret_dict :
812+ return output
813+ else :
814+ return output [ last_sheetname ]
861815
862- for col in index_col :
863- last = data [offset ][col ]
816+ def _parse_sheet (
817+ self ,
818+ data : list ,
819+ output : dict ,
820+ asheetname : str | int | None = None ,
821+ header : int | Sequence [int ] | None = 0 ,
822+ names : SequenceNotStr [Hashable ] | range | None = None ,
823+ index_col : int | Sequence [int ] | None = None ,
824+ usecols = None ,
825+ dtype : DtypeArg | None = None ,
826+ skiprows : Sequence [int ] | int | Callable [[int ], object ] | None = None ,
827+ nrows : int | None = None ,
828+ true_values : Iterable [Hashable ] | None = None ,
829+ false_values : Iterable [Hashable ] | None = None ,
830+ na_values = None ,
831+ parse_dates : list | dict | bool = False ,
832+ date_parser : Callable | lib .NoDefault = lib .no_default ,
833+ date_format : dict [Hashable , str ] | str | None = None ,
834+ thousands : str | None = None ,
835+ decimal : str = "." ,
836+ comment : str | None = None ,
837+ skipfooter : int = 0 ,
838+ dtype_backend : DtypeBackend | lib .NoDefault = lib .no_default ,
839+ ** kwds ,
840+ ):
841+ is_list_header = False
842+ is_len_one_list_header = False
843+ if is_list_like (header ):
844+ assert isinstance (header , Sequence )
845+ is_list_header = True
846+ if len (header ) == 1 :
847+ is_len_one_list_header = True
848+
849+ if is_len_one_list_header :
850+ header = cast (Sequence [int ], header )[0 ]
851+
852+ # forward fill and pull out names for MultiIndex column
853+ header_names = None
854+ if header is not None and is_list_like (header ):
855+ assert isinstance (header , Sequence )
856+
857+ header_names = []
858+ control_row = [True ] * len (data [0 ])
859+
860+ for row in header :
861+ if is_integer (skiprows ):
862+ assert isinstance (skiprows , int )
863+ row += skiprows
864+
865+ if row > len (data ) - 1 :
866+ raise ValueError (
867+ f"header index { row } exceeds maximum index "
868+ f"{ len (data ) - 1 } of data." ,
869+ )
864870
865- for row in range (offset + 1 , len (data )):
866- if data [row ][col ] == "" or data [row ][col ] is None :
867- data [row ][col ] = last
868- else :
869- last = data [row ][col ]
871+ data [row ], control_row = fill_mi_header (data [row ], control_row )
870872
871- # GH 12292 : error when read one empty column from excel file
872- try :
873- parser = TextParser (
874- data ,
875- names = names ,
876- header = header ,
877- index_col = index_col ,
878- has_index_names = has_index_names ,
879- dtype = dtype ,
880- true_values = true_values ,
881- false_values = false_values ,
882- skiprows = skiprows ,
883- nrows = nrows ,
884- na_values = na_values ,
885- skip_blank_lines = False , # GH 39808
886- parse_dates = parse_dates ,
887- date_parser = date_parser ,
888- date_format = date_format ,
889- thousands = thousands ,
890- decimal = decimal ,
891- comment = comment ,
892- skipfooter = skipfooter ,
893- usecols = usecols ,
894- dtype_backend = dtype_backend ,
895- ** kwds ,
896- )
873+ if index_col is not None :
874+ header_name , _ = pop_header_name (data [row ], index_col )
875+ header_names .append (header_name )
897876
898- output [asheetname ] = parser .read (nrows = nrows )
877+ # If there is a MultiIndex header and an index then there is also
878+ # a row containing just the index name(s)
879+ has_index_names = False
880+ if is_list_header and not is_len_one_list_header and index_col is not None :
881+ index_col_list : Sequence [int ]
882+ if isinstance (index_col , int ):
883+ index_col_list = [index_col ]
884+ else :
885+ assert isinstance (index_col , Sequence )
886+ index_col_list = index_col
887+
888+ # We have to handle mi without names. If any of the entries in the data
889+ # columns are not empty, this is a regular row
890+ assert isinstance (header , Sequence )
891+ if len (header ) < len (data ):
892+ potential_index_names = data [len (header )]
893+ potential_data = [
894+ x
895+ for i , x in enumerate (potential_index_names )
896+ if not control_row [i ] and i not in index_col_list
897+ ]
898+ has_index_names = all (x == "" or x is None for x in potential_data )
899+
900+ if is_list_like (index_col ):
901+ # Forward fill values for MultiIndex index.
902+ if header is None :
903+ offset = 0
904+ elif isinstance (header , int ):
905+ offset = 1 + header
906+ else :
907+ offset = 1 + max (header )
908+
909+ # GH34673: if MultiIndex names present and not defined in the header,
910+ # offset needs to be incremented so that forward filling starts
911+ # from the first MI value instead of the name
912+ if has_index_names :
913+ offset += 1
914+
915+ # Check if we have an empty dataset
916+ # before trying to collect data.
917+ if offset < len (data ):
918+ assert isinstance (index_col , Sequence )
919+
920+ for col in index_col :
921+ last = data [offset ][col ]
922+
923+ for row in range (offset + 1 , len (data )):
924+ if data [row ][col ] == "" or data [row ][col ] is None :
925+ data [row ][col ] = last
926+ else :
927+ last = data [row ][col ]
928+
929+ # GH 12292 : error when read one empty column from excel file
930+ try :
931+ parser = TextParser (
932+ data ,
933+ names = names ,
934+ header = header ,
935+ index_col = index_col ,
936+ has_index_names = has_index_names ,
937+ dtype = dtype ,
938+ true_values = true_values ,
939+ false_values = false_values ,
940+ skiprows = skiprows ,
941+ nrows = nrows ,
942+ na_values = na_values ,
943+ skip_blank_lines = False , # GH 39808
944+ parse_dates = parse_dates ,
945+ date_parser = date_parser ,
946+ date_format = date_format ,
947+ thousands = thousands ,
948+ decimal = decimal ,
949+ comment = comment ,
950+ skipfooter = skipfooter ,
951+ usecols = usecols ,
952+ dtype_backend = dtype_backend ,
953+ ** kwds ,
954+ )
899955
900- if header_names :
901- output [asheetname ].columns = output [asheetname ].columns .set_names (
902- header_names
903- )
956+ output [asheetname ] = parser .read (nrows = nrows )
904957
905- except EmptyDataError :
906- # No Data, return an empty DataFrame
907- output [asheetname ] = DataFrame ()
958+ if header_names :
959+ output [asheetname ].columns = output [asheetname ].columns .set_names (
960+ header_names
961+ )
908962
909- except Exception as err :
910- err . args = ( f" { err . args [ 0 ] } (sheet: { asheetname } )" , * err . args [ 1 :])
911- raise err
963+ except EmptyDataError :
964+ # No Data, return an empty DataFrame
965+ output [ asheetname ] = DataFrame ()
912966
913- if last_sheetname is None :
914- raise ValueError ("Sheet name is an empty list" )
967+ except Exception as err :
968+ err .args = (f"{ err .args [0 ]} (sheet: { asheetname } )" , * err .args [1 :])
969+ raise err
915970
916- if ret_dict :
917- return output
918- else :
919- return output [last_sheetname ]
971+ return output
920972
921973
922974@doc (storage_options = _shared_docs ["storage_options" ])
0 commit comments