From 0d68f0af091148c846606dd12d396eefc9e8391b Mon Sep 17 00:00:00 2001 From: aviruthen <91846056+aviruthen@users.noreply.github.com> Date: Tue, 7 Apr 2026 15:41:34 -0400 Subject: [PATCH 1/2] fix: load_feature_definitions_from_dataframe() doesn't recognize pandas nullable dtypes (5675) --- .../mlops/feature_store/feature_utils.py | 15 +++- .../mlops/feature_store/test_feature_utils.py | 72 +++++++++++++++++++ 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/sagemaker-mlops/src/sagemaker/mlops/feature_store/feature_utils.py b/sagemaker-mlops/src/sagemaker/mlops/feature_store/feature_utils.py index 5ee04780be..d9aa432ae9 100644 --- a/sagemaker-mlops/src/sagemaker/mlops/feature_store/feature_utils.py +++ b/sagemaker-mlops/src/sagemaker/mlops/feature_store/feature_utils.py @@ -46,8 +46,19 @@ "float64": "Fractional", } -_INTEGER_TYPES = {"int_", "int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64"} -_FLOAT_TYPES = {"float_", "float16", "float32", "float64"} +_INTEGER_TYPES = { + "int_", "int8", "int16", "int32", "int64", + "uint8", "uint16", "uint32", "uint64", + # pandas nullable integer dtypes + "Int8", "Int16", "Int32", "Int64", + "UInt8", "UInt16", "UInt32", "UInt64", +} +_FLOAT_TYPES = { + "float_", "float16", "float32", "float64", + # pandas nullable float dtypes + "Float32", "Float64", +} +_STRING_TYPES = {"object", "string"} def _get_athena_client(session: Session): diff --git a/sagemaker-mlops/tests/unit/sagemaker/mlops/feature_store/test_feature_utils.py b/sagemaker-mlops/tests/unit/sagemaker/mlops/feature_store/test_feature_utils.py index 91098247a5..0e3c7e659f 100644 --- a/sagemaker-mlops/tests/unit/sagemaker/mlops/feature_store/test_feature_utils.py +++ b/sagemaker-mlops/tests/unit/sagemaker/mlops/feature_store/test_feature_utils.py @@ -49,6 +49,78 @@ def test_returns_correct_count(self, sample_dataframe): defs = load_feature_definitions_from_dataframe(sample_dataframe) assert len(defs) == 3 + def 
test_infers_integral_type_with_pandas_nullable_Int64(self): + df = pd.DataFrame({"id": pd.Series([1, 2, 3], dtype="Int64")}) + defs = load_feature_definitions_from_dataframe(df) + assert defs[0].feature_type == "Integral" + + def test_infers_integral_type_with_pandas_nullable_Int32(self): + df = pd.DataFrame({"id": pd.Series([1, 2, 3], dtype="Int32")}) + defs = load_feature_definitions_from_dataframe(df) + assert defs[0].feature_type == "Integral" + + def test_infers_integral_type_with_pandas_nullable_Int16(self): + df = pd.DataFrame({"id": pd.Series([1, 2, 3], dtype="Int16")}) + defs = load_feature_definitions_from_dataframe(df) + assert defs[0].feature_type == "Integral" + + def test_infers_integral_type_with_pandas_nullable_Int8(self): + df = pd.DataFrame({"id": pd.Series([1, 2, 3], dtype="Int8")}) + defs = load_feature_definitions_from_dataframe(df) + assert defs[0].feature_type == "Integral" + + def test_infers_integral_type_with_pandas_nullable_UInt64(self): + df = pd.DataFrame({"id": pd.Series([1, 2, 3], dtype="UInt64")}) + defs = load_feature_definitions_from_dataframe(df) + assert defs[0].feature_type == "Integral" + + def test_infers_integral_type_with_pandas_nullable_UInt32(self): + df = pd.DataFrame({"id": pd.Series([1, 2, 3], dtype="UInt32")}) + defs = load_feature_definitions_from_dataframe(df) + assert defs[0].feature_type == "Integral" + + def test_infers_fractional_type_with_pandas_nullable_Float64(self): + df = pd.DataFrame({"value": pd.Series([1.1, 2.2, 3.3], dtype="Float64")}) + defs = load_feature_definitions_from_dataframe(df) + assert defs[0].feature_type == "Fractional" + + def test_infers_fractional_type_with_pandas_nullable_Float32(self): + df = pd.DataFrame({"value": pd.Series([1.1, 2.2], dtype="Float32")}) + defs = load_feature_definitions_from_dataframe(df) + assert defs[0].feature_type == "Fractional" + + def test_infers_string_type_with_pandas_string_dtype(self): + df = pd.DataFrame({"name": pd.Series(["a", "b", "c"], 
dtype="string")}) + defs = load_feature_definitions_from_dataframe(df) + assert defs[0].feature_type == "String" + + def test_infers_correct_types_after_convert_dtypes(self): + df = pd.DataFrame({ + "id": [1, 2, 3], + "price": [1.1, 2.2, 3.3], + "name": ["a", "b", "c"], + }).convert_dtypes() + defs = load_feature_definitions_from_dataframe(df) + id_def = next(d for d in defs if d.feature_name == "id") + price_def = next(d for d in defs if d.feature_name == "price") + name_def = next(d for d in defs if d.feature_name == "name") + assert id_def.feature_type == "Integral" + assert price_def.feature_type == "Fractional" + assert name_def.feature_type == "String" + + def test_infers_correct_types_with_mixed_nullable_and_numpy_dtypes(self): + df = pd.DataFrame({ + "numpy_int": pd.Series([1, 2, 3], dtype="int64"), + "nullable_float": pd.Series([1.1, 2.2, 3.3], dtype="Float64"), + "nullable_int": pd.Series([10, 20, 30], dtype="Int64"), + "numpy_float": pd.Series([0.1, 0.2, 0.3], dtype="float64"), + }) + defs = load_feature_definitions_from_dataframe(df) + assert next(d for d in defs if d.feature_name == "numpy_int").feature_type == "Integral" + assert next(d for d in defs if d.feature_name == "nullable_float").feature_type == "Fractional" + assert next(d for d in defs if d.feature_name == "nullable_int").feature_type == "Integral" + assert next(d for d in defs if d.feature_name == "numpy_float").feature_type == "Fractional" + def test_collection_type_with_in_memory_storage(self): df = pd.DataFrame({ "id": pd.Series([1, 2], dtype="int64"), From 1e5ad13853b11cd6d117e4ce2abd4ce141af7ea1 Mon Sep 17 00:00:00 2001 From: aviruthen <91846056+aviruthen@users.noreply.github.com> Date: Tue, 7 Apr 2026 18:54:21 -0400 Subject: [PATCH 2/2] fix: address review comments (iteration #1) --- .../mlops/feature_store/feature_utils.py | 14 +++ .../mlops/feature_store/test_feature_utils.py | 94 ++++++++++--------- 2 files changed, 66 insertions(+), 42 deletions(-) diff --git 
a/sagemaker-mlops/src/sagemaker/mlops/feature_store/feature_utils.py b/sagemaker-mlops/src/sagemaker/mlops/feature_store/feature_utils.py index d9aa432ae9..f6147d306d 100644 --- a/sagemaker-mlops/src/sagemaker/mlops/feature_store/feature_utils.py +++ b/sagemaker-mlops/src/sagemaker/mlops/feature_store/feature_utils.py @@ -44,6 +44,18 @@ "string": "String", "int64": "Integral", "float64": "Fractional", + # pandas nullable integer dtypes + "Int8": "Integral", + "Int16": "Integral", + "Int32": "Integral", + "Int64": "Integral", + "UInt8": "Integral", + "UInt16": "Integral", + "UInt32": "Integral", + "UInt64": "Integral", + # pandas nullable float dtypes + "Float32": "Fractional", + "Float64": "Fractional", } _INTEGER_TYPES = { @@ -329,6 +341,8 @@ def _generate_feature_definition( return IntegralFeatureDefinition(series.name, collection_type) if dtype in _FLOAT_TYPES: return FractionalFeatureDefinition(series.name, collection_type) + if dtype in _STRING_TYPES: + return StringFeatureDefinition(series.name, collection_type) return StringFeatureDefinition(series.name, collection_type) diff --git a/sagemaker-mlops/tests/unit/sagemaker/mlops/feature_store/test_feature_utils.py b/sagemaker-mlops/tests/unit/sagemaker/mlops/feature_store/test_feature_utils.py index 0e3c7e659f..7fd55ceef6 100644 --- a/sagemaker-mlops/tests/unit/sagemaker/mlops/feature_store/test_feature_utils.py +++ b/sagemaker-mlops/tests/unit/sagemaker/mlops/feature_store/test_feature_utils.py @@ -49,43 +49,29 @@ def test_returns_correct_count(self, sample_dataframe): defs = load_feature_definitions_from_dataframe(sample_dataframe) assert len(defs) == 3 - def test_infers_integral_type_with_pandas_nullable_Int64(self): - df = pd.DataFrame({"id": pd.Series([1, 2, 3], dtype="Int64")}) - defs = load_feature_definitions_from_dataframe(df) - assert defs[0].feature_type == "Integral" - - def test_infers_integral_type_with_pandas_nullable_Int32(self): - df = pd.DataFrame({"id": pd.Series([1, 2, 3], dtype="Int32")}) - 
defs = load_feature_definitions_from_dataframe(df) - assert defs[0].feature_type == "Integral" - - def test_infers_integral_type_with_pandas_nullable_Int16(self): - df = pd.DataFrame({"id": pd.Series([1, 2, 3], dtype="Int16")}) - defs = load_feature_definitions_from_dataframe(df) - assert defs[0].feature_type == "Integral" - - def test_infers_integral_type_with_pandas_nullable_Int8(self): - df = pd.DataFrame({"id": pd.Series([1, 2, 3], dtype="Int8")}) - defs = load_feature_definitions_from_dataframe(df) - assert defs[0].feature_type == "Integral" - - def test_infers_integral_type_with_pandas_nullable_UInt64(self): - df = pd.DataFrame({"id": pd.Series([1, 2, 3], dtype="UInt64")}) - defs = load_feature_definitions_from_dataframe(df) - assert defs[0].feature_type == "Integral" - - def test_infers_integral_type_with_pandas_nullable_UInt32(self): - df = pd.DataFrame({"id": pd.Series([1, 2, 3], dtype="UInt32")}) + @pytest.mark.parametrize( + "dtype", + ["Int8", "Int16", "Int32", "Int64", + "UInt8", "UInt16", "UInt32", "UInt64"], + ) + def test_infers_integral_type_with_pandas_nullable_int( + self, dtype + ): + df = pd.DataFrame( + {"id": pd.Series([1, 2, 3], dtype=dtype)} + ) defs = load_feature_definitions_from_dataframe(df) assert defs[0].feature_type == "Integral" - def test_infers_fractional_type_with_pandas_nullable_Float64(self): - df = pd.DataFrame({"value": pd.Series([1.1, 2.2, 3.3], dtype="Float64")}) - defs = load_feature_definitions_from_dataframe(df) - assert defs[0].feature_type == "Fractional" - - def test_infers_fractional_type_with_pandas_nullable_Float32(self): - df = pd.DataFrame({"value": pd.Series([1.1, 2.2], dtype="Float32")}) + @pytest.mark.parametrize( + "dtype", ["Float32", "Float64"], + ) + def test_infers_fractional_type_with_pandas_nullable_float( + self, dtype + ): + df = pd.DataFrame( + {"value": pd.Series([1.1, 2.2, 3.3], dtype=dtype)} + ) defs = load_feature_definitions_from_dataframe(df) assert defs[0].feature_type == "Fractional" @@ 
-108,18 +94,42 @@ def test_infers_correct_types_after_convert_dtypes(self): assert price_def.feature_type == "Fractional" assert name_def.feature_type == "String" - def test_infers_correct_types_with_mixed_nullable_and_numpy_dtypes(self): + def test_infers_correct_types_with_mixed_nullable_and_numpy_dtypes( + self, + ): df = pd.DataFrame({ "numpy_int": pd.Series([1, 2, 3], dtype="int64"), - "nullable_float": pd.Series([1.1, 2.2, 3.3], dtype="Float64"), - "nullable_int": pd.Series([10, 20, 30], dtype="Int64"), - "numpy_float": pd.Series([0.1, 0.2, 0.3], dtype="float64"), + "nullable_float": pd.Series( + [1.1, 2.2, 3.3], dtype="Float64" + ), + "nullable_int": pd.Series( + [10, 20, 30], dtype="Int64" + ), + "numpy_float": pd.Series( + [0.1, 0.2, 0.3], dtype="float64" + ), }) defs = load_feature_definitions_from_dataframe(df) - assert next(d for d in defs if d.feature_name == "numpy_int").feature_type == "Integral" - assert next(d for d in defs if d.feature_name == "nullable_float").feature_type == "Fractional" - assert next(d for d in defs if d.feature_name == "nullable_int").feature_type == "Integral" - assert next(d for d in defs if d.feature_name == "numpy_float").feature_type == "Fractional" + + result = next( + d for d in defs if d.feature_name == "numpy_int" + ) + assert result.feature_type == "Integral" + + result = next( + d for d in defs if d.feature_name == "nullable_float" + ) + assert result.feature_type == "Fractional" + + result = next( + d for d in defs if d.feature_name == "nullable_int" + ) + assert result.feature_type == "Integral" + + result = next( + d for d in defs if d.feature_name == "numpy_float" + ) + assert result.feature_type == "Fractional" def test_collection_type_with_in_memory_storage(self): df = pd.DataFrame({