@@ -575,6 +575,47 @@ def test_write_column_index_nonstring(self, pa):
         msg = r"parquet must have string column names"
         self.check_error_on_write(df, engine, ValueError, msg)

+    def test_use_nullable_dtypes(self, engine):
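+        # read a pyarrow-written file with and without use_nullable_dtypes
+        # and check that the latter returns pandas' nullable extension dtypes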
+        import pyarrow.parquet as pq
+
+        if engine == "fastparquet":
+            # We are manually disabling fastparquet's
+            # nullable dtype support pending discussion
+            pytest.skip("Fastparquet nullable dtype support is disabled")
+
+        table = pyarrow.table(
+            {
+                "a": pyarrow.array([1, 2, 3, None], "int64"),
+                "b": pyarrow.array([1, 2, 3, None], "uint8"),
+                "c": pyarrow.array(["a", "b", "c", None]),
+                "d": pyarrow.array([True, False, True, None]),
+                # Test that nullable dtypes are used even in the absence of nulls
+                "e": pyarrow.array([1, 2, 3, 4], "int64"),
+            }
+        )
+        with tm.ensure_clean() as path:
+            # write manually with pyarrow to write integers
+            pq.write_table(table, path)
+            result1 = read_parquet(path, engine=engine)
+            result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True)
+
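+        # without use_nullable_dtypes, the nulls in "a" force a cast to float64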
+        assert result1["a"].dtype == np.dtype("float64")
+        expected = pd.DataFrame(
+            {
+                "a": pd.array([1, 2, 3, None], dtype="Int64"),
+                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
+                "c": pd.array(["a", "b", "c", None], dtype="string"),
+                "d": pd.array([True, False, True, None], dtype="boolean"),
+                "e": pd.array([1, 2, 3, 4], dtype="Int64"),
+            }
+        )
+        if engine == "fastparquet":
+            # Fastparquet doesn't support string columns yet,
+            # only int and boolean
+            result2 = result2.drop("c", axis=1)
+            expected = expected.drop("c", axis=1)
+        tm.assert_frame_equal(result2, expected)
+

 @pytest.mark.filterwarnings("ignore:CategoricalBlock is deprecated:DeprecationWarning")
 class TestParquetPyArrow(Base):
@@ -829,35 +870,6 @@ def test_additional_extension_types(self, pa):
         )
         check_round_trip(df, pa)

-    @td.skip_if_no("pyarrow")
-    def test_use_nullable_dtypes(self, pa):
-        import pyarrow.parquet as pq
-
-        table = pyarrow.table(
-            {
-                "a": pyarrow.array([1, 2, 3, None], "int64"),
-                "b": pyarrow.array([1, 2, 3, None], "uint8"),
-                "c": pyarrow.array(["a", "b", "c", None]),
-                "d": pyarrow.array([True, False, True, None]),
-            }
-        )
-        with tm.ensure_clean() as path:
-            # write manually with pyarrow to write integers
-            pq.write_table(table, path)
-            result1 = read_parquet(path)
-            result2 = read_parquet(path, use_nullable_dtypes=True)
-
-        assert result1["a"].dtype == np.dtype("float64")
-        expected = pd.DataFrame(
-            {
-                "a": pd.array([1, 2, 3, None], dtype="Int64"),
-                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
-                "c": pd.array(["a", "b", "c", None], dtype="string"),
-                "d": pd.array([True, False, True, None], dtype="boolean"),
-            }
-        )
-        tm.assert_frame_equal(result2, expected)
-
     def test_timestamp_nanoseconds(self, pa):
         # with version 2.0, pyarrow defaults to writing the nanoseconds, so
         # this should work without error
@@ -928,7 +940,9 @@ def test_duplicate_columns(self, fp):
     def test_bool_with_none(self, fp):
         df = pd.DataFrame({"a": [True, None, False]})
         expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
-        check_round_trip(df, fp, expected=expected)
+        # A fastparquet bug in 0.7.1 causes this dtype to come back
+        # as float64
+        check_round_trip(df, fp, expected=expected, check_dtype=False)

     def test_unsupported(self, fp):

@@ -1049,9 +1063,14 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list):
         expected.index.name = "index"
         check_round_trip(df, fp, expected=expected)

-    def test_use_nullable_dtypes_not_supported(self, fp):
+    def test_use_nullable_dtypes_not_supported(self, monkeypatch, fp):
         df = pd.DataFrame({"a": [1, 2]})

+        # use_nullable_dtypes is actually supported in fastparquet 0.7.1 and above;
+        # we still need to ensure that it raises for all versions below that
+        import fastparquet as fp
+
+        monkeypatch.setattr(fp, "__version__", "0.4")
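+        # with the reported version pinned below 0.7.1, reading with
+        # use_nullable_dtypes should still raise for the fastparquet engine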
         with tm.ensure_clean() as path:
             df.to_parquet(path)
             with pytest.raises(ValueError, match="not supported for the fastparquet"):