diff --git a/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte_cdk/sources/file_based/file_types/csv_parser.py index b8c720642..28dc2eb55 100644 --- a/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -73,9 +73,12 @@ def read_data( headers, raw_headers = self._read_and_validate_headers( fp, config_format, dialect_name ) - except UnicodeError: + except UnicodeError as e: raise AirbyteTracedException( - message=f"{FileBasedSourceError.ENCODING_ERROR.value} Expected encoding: {config_format.encoding}", + message=f"File contains bytes that cannot be decoded with the configured {config_format.encoding} encoding.", + internal_message=str(e), + failure_type=FailureType.config_error, + exception=e, ) rows_to_skip = ( diff --git a/unit_tests/sources/file_based/file_types/test_csv_parser.py b/unit_tests/sources/file_based/file_types/test_csv_parser.py index f12f6613e..a95ea74a0 100644 --- a/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -648,15 +648,17 @@ def test_read_data_with_encoding_error(self) -> None: self._stream_reader.open_file.return_value = ( CsvFileBuilder().with_data(["something"]).build() ) - self._csv_reader._read_and_validate_headers = Mock( - side_effect=UnicodeDecodeError("encoding", b"", 0, 1, "reason") - ) + unicode_error = UnicodeDecodeError("utf-8", b"\xff", 0, 1, "invalid start byte") + self._csv_reader._read_and_validate_headers = Mock(side_effect=unicode_error) with pytest.raises(AirbyteTracedException) as ate: data_generator = self._read_data() - assert len(list(data_generator)) == 0 + list(data_generator) - assert "encoding" in ate.value.message + assert "utf8" in ate.value.message + assert ate.value.failure_type == FailureType.config_error + assert ate.value.internal_message == str(unicode_error) + assert ate.value._exception is unicode_error assert self._csv_reader._read_and_validate_headers.called def _read_data(self) -> Generator[Dict[str, str], None, None]: