From b21062f263d80587ea46d3b16c8c34957e13e193 Mon Sep 17 00:00:00 2001
From: dittmar
Date: Mon, 8 Dec 2025 10:55:30 +0100
Subject: [PATCH 01/13] feat: improve type annotations for parameter values
 and mappings
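
RequestParameter, RequestMetadata and AnalyticsRequest now declare their
Mapping key/value types, and parameter values are typed as the new
ParameterValueType union instead of Any. A minimal sketch of the dict-like
access this enables (the parameter name "threshold" is illustrative only):

    value = params["threshold"]  # -> Optional[ParameterValueType]
    if value is not None:
        # value is an int, float, str, bool, datetime or shapely geometry
        ...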
""" return self._value @@ -125,7 +126,7 @@ def srs(self) -> Optional[str]: return self._srs - def _parse_value(self, value: Any, data_type: DataType) -> Any: + def _parse_value(self, value: Any, data_type: DataType) -> Optional[ParameterValueType]: """Parse and convert a parameter value according to its data type. Parameters @@ -137,7 +138,7 @@ def _parse_value(self, value: Any, data_type: DataType) -> Any: Returns ------- - Any + Optional[ParameterValueType] The parsed value with appropriate type, or None if input is None. """ if value is None: diff --git a/src/cadenzaanalytics/data/parameter_value_type.py b/src/cadenzaanalytics/data/parameter_value_type.py new file mode 100644 index 0000000..896177d --- /dev/null +++ b/src/cadenzaanalytics/data/parameter_value_type.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +from datetime import datetime +from typing import Union + +from shapely.geometry.base import BaseGeometry + +# A typed value for parameters: numbers, strings, boolean, shapely geometries, and datetime +ParameterValueType = Union[int, float, str, bool, datetime, BaseGeometry] diff --git a/src/cadenzaanalytics/request/analytics_request.py b/src/cadenzaanalytics/request/analytics_request.py index 66ab8e8..77c050d 100644 --- a/src/cadenzaanalytics/request/analytics_request.py +++ b/src/cadenzaanalytics/request/analytics_request.py @@ -5,7 +5,7 @@ from cadenzaanalytics.request.request_table import RequestTable -class AnalyticsRequest(collections.abc.Mapping): +class AnalyticsRequest(collections.abc.Mapping[str, RequestTable]): """Represents an incoming analytics request from Cadenza. Provides access to request parameters and data tables. Supports dict-like diff --git a/src/cadenzaanalytics/request/request_metadata.py b/src/cadenzaanalytics/request/request_metadata.py index 2e3304d..6a2ac53 100644 --- a/src/cadenzaanalytics/request/request_metadata.py +++ b/src/cadenzaanalytics/request/request_metadata.py @@ -6,7 +6,7 @@ # pylint: disable=protected-access -class RequestMetadata(collections.abc.Mapping): +class RequestMetadata(collections.abc.Mapping[str, ColumnMetadata]): """Metadata describing the columns in a request table. Provides access to column metadata by name and groupings by attribute group. diff --git a/src/cadenzaanalytics/request/request_parameter.py b/src/cadenzaanalytics/request/request_parameter.py index 438df8d..817b69e 100644 --- a/src/cadenzaanalytics/request/request_parameter.py +++ b/src/cadenzaanalytics/request/request_parameter.py @@ -1,11 +1,12 @@ import collections -from typing import Iterator, List, Optional, Any +from typing import Iterator, List, Optional from cadenzaanalytics.data.parameter_value import ParameterValue from cadenzaanalytics.request.view_parameter import ViewParameter +from cadenzaanalytics.data.parameter_value_type import ParameterValueType -class RequestParameter(collections.abc.Mapping): +class RequestParameter(collections.abc.Mapping[str, ParameterValueType]): """Provides access to parameters from an analytics request. Supports dict-like access to parameter values via `params["name"]` syntax. 
---
 examples/enrichment/extension/example_extensions.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/examples/enrichment/extension/example_extensions.py b/examples/enrichment/extension/example_extensions.py
index b53c460..ecc06d4 100644
--- a/examples/enrichment/extension/example_extensions.py
+++ b/examples/enrichment/extension/example_extensions.py
@@ -1,14 +1,12 @@
 """Example module for running a disy Cadenza analytics extension that will
 execute a basic enrichment."""
-import pandas as pd
-
 import cadenzaanalytics as ca
 
 
 def enrichment_basic_analytics_function(request: ca.AnalyticsRequest):
     # pylint: disable=unused-argument
-    df_data = pd.DataFrame()
+    df_data = request["table"].data
     df_data["new_value"] = "value"
 
     result_metadata = [

From 1a08ee0093bc6416a139018fde53f78a3f07067b Mon Sep 17 00:00:00 2001
From: dittmar
Date: Fri, 16 Jan 2026 17:29:34 +0100
Subject: [PATCH 03/13] CADENZA-42792 feat: add custom csv parser that handles
 all special value definitions and works around issues with pandas csv
 parsing and writing
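
The parser and writer implement the Cadenza CSV convention that pandas csv
handling cannot express directly: quoted fields are values (a quoted empty
field is the empty string), while unquoted empty fields are NULL/None. A
minimal sketch of the intended roundtrip (column names illustrative only):

    from cadenzaanalytics.util.csv import from_cadenza_csv, to_cadenza_csv

    csv_data = '"name";"comment"\r\n"Alice";""\r\n;"x"\r\n'
    df = from_cadenza_csv(csv_data)
    assert df.iloc[0, 1] == ""    # quoted empty field -> empty string
    assert df.iloc[1, 0] is None  # unquoted empty field -> None
    assert to_cadenza_csv(df) == csv_data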
---
 .github/workflows/ci.yml                           |   5 +
 pyproject.toml                                     |   1 +
 .../cadenza_analytics_extension.py                 |  44 +-
 src/cadenzaanalytics/response/csv_response.py      |  48 +-
 src/cadenzaanalytics/tests/__init__.py             |   1 +
 src/cadenzaanalytics/tests/test_csv.py             | 535 ++++++++++++++++
 .../tests/test_csv_roundtrip.py                    | 581 ++++++++++++++++++
 src/cadenzaanalytics/tests/test_csv_writer.py      | 435 +++++++++++++
 src/cadenzaanalytics/util/__init__.py              |   3 +
 src/cadenzaanalytics/util/csv.py                   | 283 +++++++++
 10 files changed, 1862 insertions(+), 74 deletions(-)
 create mode 100644 src/cadenzaanalytics/tests/__init__.py
 create mode 100644 src/cadenzaanalytics/tests/test_csv.py
 create mode 100644 src/cadenzaanalytics/tests/test_csv_roundtrip.py
 create mode 100644 src/cadenzaanalytics/tests/test_csv_writer.py
 create mode 100644 src/cadenzaanalytics/util/__init__.py
 create mode 100644 src/cadenzaanalytics/util/csv.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 6a3990a..062e7c9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -43,3 +43,8 @@
         run: |
           python -m pip install --upgrade pip
           pip install .
+          pip install pytest
+
+      - name: Run tests
+        run: |
+          pytest src/cadenzaanalytics/tests/ -v
diff --git a/pyproject.toml b/pyproject.toml
index c2cdb7d..eacbe01 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,6 +30,7 @@ requests-toolbelt = "1.0.0"
 pandas = " ^2.0.2"
 chardet = "5.2.0"
 Shapely = "2.1.2"
+pytest = "9.0.2"
 
 [project]
 name = "cadenzaanalytics"
diff --git a/src/cadenzaanalytics/cadenza_analytics_extension.py b/src/cadenzaanalytics/cadenza_analytics_extension.py
index f3c12b5..a2dd799 100644
--- a/src/cadenzaanalytics/cadenza_analytics_extension.py
+++ b/src/cadenzaanalytics/cadenza_analytics_extension.py
@@ -3,13 +3,9 @@
 invoked via HTTP POST on the relative path."""
 import json
 import logging
-from io import StringIO
 from typing import Callable, List, Optional
 
-import numpy as np
-import pandas as pd
 from flask import Response, request
-from shapely import from_wkt
 
 from cadenzaanalytics.data.analytics_extension import AnalyticsExtension
 from cadenzaanalytics.data.extension_type import ExtensionType
@@ -21,6 +17,7 @@
 from cadenzaanalytics.request.request_metadata import RequestMetadata
 from cadenzaanalytics.request.request_table import RequestTable
 from cadenzaanalytics.response.extension_response import ExtensionResponse
+from cadenzaanalytics.util.csv import from_cadenza_csv
 
 logger = logging.getLogger('cadenzaanalytics')
@@ -151,51 +148,26 @@ def _get_request_data(self, multipart_request) -> AnalyticsRequest:
         if len(metadata.columns) > 0:
             has_data = True
             type_mapping = {}
-            na_values_mapping = {}
             datetime_columns = []
             geometry_columns = []
 
             for column in metadata.columns:
                 if column.data_type == DataType.ZONEDDATETIME:
                     datetime_columns.append(column.name)
-                    # must be empty list, otherwise pd.read_csv interprets empty strings as NA which
-                    # is rejected by the parse_dates mechanism before it reaches the _parse_datetime function
-                    na_values_mapping[column.name] = []
-                elif column.data_type == DataType.STRING:
-                    # only empty strings must be considered as NA
-                    # unfortunately there does not seem to be a way to interpret empty quotes as empty string
-                    # and unquoted as None
-                    na_values_mapping[column.name] = ['']
-                else:
-                    # pandas default list of NA values, mostly relevant for numeric columns
-                    na_values_mapping[column.name] = ['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A',
-                                                      '#N/A', 'N/A', 'n/a', 'NA', '', '#NA', 'NULL', 'null',
-                                                      'NaN', '-NaN', 'nan', '-nan', 'None', '']
-
-                if column.data_type == DataType.GEOMETRY:
+                elif column.data_type == DataType.GEOMETRY:
                     geometry_columns.append(column.name)
 
                 type_mapping[column.name] = column.data_type.pandas_type()
 
-            csv_data = StringIO(self._get_from_request(multipart_request, 'data'))
-            # read_csv cannot distinguish None from empty strings
-            df_data = pd.read_csv(
+            csv_data = self._get_from_request(multipart_request, 'data')
+            # Use custom parser that properly handles quoted vs unquoted values
+            df_data = from_cadenza_csv(
                 csv_data,
-                sep=';',
-                dtype=type_mapping,
-                parse_dates=datetime_columns,
-                date_format='ISO8601',
-                na_values=na_values_mapping,
-                keep_default_na=False,
+                type_mapping=type_mapping,
+                datetime_columns=datetime_columns,
+                geometry_columns=geometry_columns
             )
-            # Parse WKT geometries into shapely geometry objects using vectorized from_wkt
-            for gcol in geometry_columns:
-                values = df_data[gcol].to_numpy()
-                # from_wkt handles None values; replace empty strings with None
-                values = np.where((values == '') | pd.isna(values), None, values)
-                df_data[gcol] = from_wkt(values, on_invalid='warn')
-
             logger.debug('Received data:\n%s', df_data.head())
         else:
             has_data = False
diff --git a/src/cadenzaanalytics/response/csv_response.py b/src/cadenzaanalytics/response/csv_response.py
index 7af9187..fb1c1ad 100644
--- a/src/cadenzaanalytics/response/csv_response.py
+++ b/src/cadenzaanalytics/response/csv_response.py
@@ -1,6 +1,3 @@
-import csv
-import re
-import sys
 from typing import List, Optional
 import logging
 
@@ -12,6 +9,7 @@
 from cadenzaanalytics.request.request_table import RequestTable
 from cadenzaanalytics.response.extension_data_response import ExtensionDataResponse
 from cadenzaanalytics.response.missing_metadata_strategy import MissingMetadataStrategy
+from cadenzaanalytics.util import to_cadenza_csv
 
 logger = logging.getLogger('cadenzaanalytics')
@@ -107,41 +105,15 @@ def get_response(self, request_table: Optional[RequestTable] = None) -> Response:
         leftover_metadata_column_names = self._apply_missing_metadata_strategy()
         self._validate_response(leftover_metadata_column_names)
 
-        python_3_12 = (3, 12)
-        if sys.version_info >= python_3_12 and len(self._data.columns) > 1:
-            # The quoting strategies QUOTE_NOTNULL or QUOTE_NULL would fail with the csv writer
-            # error "single empty field record must be quoted"
-            # if there is only one column and if there is any null-ish value available.
-            # Also refer to https://github.com/pandas-dev/pandas/issues/59116
-            # Thus we can only use this strategy if there is more than one column, else fallback to
-            # the fallback approach that always quotes and then removes quotes again.
-            # The limitation to python 3.12 comes from the option QUOTE_NOTNULL only becoming available on that version.
-            csv_data = self._data.to_csv(
-                sep=';',
-                encoding='utf-8',
-                quoting=csv.QUOTE_NOTNULL,
-                index=False,
-                na_rep=None,  # missing/None/Null values are sent without quotes
-                quotechar='"',
-                lineterminator='\r\n',
-                date_format='%Y-%m-%dT%H:%M:%SZ')
-        else:
-            # info: this approach cannot distinguish empty strings from NULL
-            csv_data = self._data.to_csv(
-                sep=';',
-                encoding='utf-8',
-                quoting=csv.QUOTE_ALL,
-                index=False,
-                quotechar='"',
-                lineterminator='\r\n',
-                date_format='%Y-%m-%dT%H:%M:%SZ')
-            # Needed to make sure to send NULL/None values (unquoted empty content) and not empty strings
-            # (quoted empty content)
-            # as empty strings would only be valid for DataType.STRING and cause errors for other DataTypes.
-            # regex searches and replaces double quotes that are surrounded by separators
-            # (start file, end file, semicolon or newline)
-            # this way double-quotes that represent a single escaped quote character within a string value are retained
-            csv_data = re.sub(r'(^|;|\r\n)""(?=;|\r\n|$)', r'\1', csv_data)
+        datetime_columns = [c.name for c in self._column_meta_data if c.data_type == DataType.ZONEDDATETIME]
+        geometry_columns = [c.name for c in self._column_meta_data if c.data_type == DataType.GEOMETRY]
+        float_columns = [c.name for c in self._column_meta_data if c.data_type == DataType.FLOAT64]
+        int_columns = [c.name for c in self._column_meta_data if c.data_type == DataType.INT64]
+
+        csv_data = to_cadenza_csv(self._data,
+                                  datetime_columns=datetime_columns,
+                                  geometry_columns=geometry_columns,
+                                  float_columns=float_columns,
+                                  int_columns=int_columns)
 
         return self._create_response(csv_data, self._column_meta_data)
diff --git a/src/cadenzaanalytics/tests/__init__.py b/src/cadenzaanalytics/tests/__init__.py
new file mode 100644
index 0000000..4bb48ee
--- /dev/null
+++ b/src/cadenzaanalytics/tests/__init__.py
@@ -0,0 +1 @@
+"""Test suite for cadenzaanalytics package."""
diff --git a/src/cadenzaanalytics/tests/test_csv.py b/src/cadenzaanalytics/tests/test_csv.py
new file mode 100644
index 0000000..15b8cf3
--- /dev/null
+++ b/src/cadenzaanalytics/tests/test_csv.py
@@ -0,0 +1,535 @@
+"""Unit tests for Cadenza CSV parser."""
+from shapely.geometry import Point, LineString, MultiPoint, Polygon
+import pandas as pd
+import pytest
+from cadenzaanalytics.util.csv import from_cadenza_csv
+
+
+# pylint: disable=too-many-public-methods
+class TestCadenzaCsvParser:
+    """Test suite for from_cadenza_csv function."""
+
+    def test_empty_input(self):
+        """Empty string should return empty DataFrame."""
+        result = from_cadenza_csv("")
+        assert result.empty
+        assert len(result.columns) == 0
+
+    def test_single_quoted_value(self):
+        """Single quoted value."""
+        csv = '"header"\r\n"value"'
+        result = from_cadenza_csv(csv)
+        assert list(result.columns) == ["header"]
+        assert result.iloc[0, 0] == "value"
+
+    def test_empty_quoted_string(self):
+        """Empty quoted string should be empty string, not None."""
+        csv = '"col1";"col2"\r\n"";""'
+        result = from_cadenza_csv(csv)
+        assert result.iloc[0, 0] == ""
+        assert result.iloc[0, 1] == ""
+
+    def test_unquoted_values_are_none(self):
+        """Unquoted values should be parsed as None."""
+        csv = '"col1";"col2";"col3"\r\n;"value2";'
+        result = from_cadenza_csv(csv)
+        assert result.iloc[0, 0] is None
+        assert result.iloc[0, 1] == "value2"
+        assert result.iloc[0, 2] is None
+
+    def test_mixed_quoted_and_unquoted(self):
+        """Mixed quoted and unquoted values in one row."""
+        csv = '"col1";"col2";"col3"\r\n"";;"def"'
+        result = from_cadenza_csv(csv)
+        assert result.iloc[0, 0] == ""
+        assert result.iloc[0, 1] is None
+        assert result.iloc[0, 2] == "def"
+
+    def test_multiple_rows(self):
+        """Multiple data rows."""
+        csv = '"name";"age"\r\n"Alice";"30"\r\n"Bob";'
+        result = from_cadenza_csv(csv)
+        assert len(result) == 2
+        assert result.iloc[0, 0] == "Alice"
+        assert result.iloc[0, 1] == "30"
+        assert result.iloc[1, 0] == "Bob"
+        assert result.iloc[1, 1] is None
+
+    def test_escaped_quotes(self):
+        """Double quotes inside quoted values should be unescaped."""
+        csv = '"text"\r\n"He said ""hello"""'
+        result = from_cadenza_csv(csv)
+        assert result.iloc[0, 0] == 'He said "hello"'
+
+    def test_semicolon_in_quoted_value(self):
+        """Semicolons inside quoted values should be preserved."""
+        csv = '"col1";"col2"\r\n"a;b;c";"normal"'
'"col1";"col2"\r\n"a;b;c";"normal"' + result = from_cadenza_csv(csv) + assert result.iloc[0, 0] == "a;b;c" + assert result.iloc[0, 1] == "normal" + + def test_newline_in_quoted_value(self): + """Newlines inside quoted values should be preserved.""" + csv = '"text"\r\n"line1\r\nline2"' + result = from_cadenza_csv(csv) + assert result.iloc[0, 0] == "line1\r\nline2" + + def test_trailing_empty_line(self): + """Trailing CRLF is just row terminator, not an additional row.""" + csv = '"col"\r\n"val"\r\n' + result = from_cadenza_csv(csv) + assert len(result) == 1 # Just one data row + assert result.iloc[0, 0] == "val" + + def test_actual_empty_row(self): + """An actual empty row between CRLFs should be parsed as [None].""" + csv = '"col"\r\n"val"\r\n\r\n"val2"' + result = from_cadenza_csv(csv) + assert len(result) == 3 + assert result.iloc[0, 0] == "val" + assert result.iloc[1, 0] is None # Empty row + assert result.iloc[2, 0] == "val2" + + def test_numbers_as_strings(self): + """Numbers are initially parsed as strings.""" + csv = '"int";"float"\r\n"123";"45.67"' + result = from_cadenza_csv(csv) + assert result.iloc[0, 0] == "123" + assert result.iloc[0, 1] == "45.67" + + def test_type_mapping_int(self): + """Type mapping should convert strings to Int64.""" + csv = '"number"\r\n"123"\r\n' + result = from_cadenza_csv(csv, type_mapping={"number": "Int64"}) + assert result["number"].dtype == "Int64" + assert result.iloc[0, 0] == 123 + + def test_type_mapping_float(self): + """Type mapping should convert strings to Float64.""" + csv = '"value"\r\n"3.14"' + result = from_cadenza_csv(csv, type_mapping={"value": "Float64"}) + assert result["value"].dtype == "Float64" + assert result.iloc[0, 0] == 3.14 + + def test_type_mapping_with_none(self): + """Type mapping should handle None values (unquoted) correctly.""" + csv = '"number"\r\n\r\n"42"' + result = from_cadenza_csv(csv, type_mapping={"number": "Int64"}) + assert result["number"].dtype == "Int64" + assert pd.isna(result.iloc[0, 0]) # None becomes pd.NA + assert result.iloc[1, 0] == 42 + + def test_iso_datetime_format(self): + """ISO8601 datetime strings are kept as strings by default.""" + csv = '"timestamp"\r\n"2023-01-03T15:29:13Z"' + result = from_cadenza_csv(csv) + assert result.iloc[0, 0] == "2023-01-03T15:29:13Z" + + def test_datetime_parsing(self): + """Datetime columns should be parsed when specified.""" + csv = '"timestamp";"value"\r\n"2023-01-03T15:29:13Z";"42"' + result = from_cadenza_csv(csv, datetime_columns=["timestamp"]) + assert pd.api.types.is_datetime64_any_dtype(result["timestamp"]) + assert result.iloc[0, 0] == pd.Timestamp("2023-01-03T15:29:13Z") + assert result.iloc[0, 1] == "42" + + def test_datetime_parsing_with_none(self): + """Datetime parsing should handle None values.""" + csv = '"timestamp"\r\n"2023-01-03T15:29:13Z"\r\n\r\n""' + result = from_cadenza_csv(csv, datetime_columns=["timestamp"]) + assert pd.api.types.is_datetime64_any_dtype(result["timestamp"]) + assert result.iloc[0, 0] == pd.Timestamp("2023-01-03T15:29:13Z") + assert pd.isna(result.iloc[1, 0]) # None string becomes None + assert pd.isna(result.iloc[2, 0]) # Empty string becomes None + + def test_all_none_row(self): + """Row with all unquoted values should be all None.""" + csv = '"a";"b";"c"\r\n;;' + result = from_cadenza_csv(csv) + assert result.iloc[0, 0] is None + assert result.iloc[0, 1] is None + assert result.iloc[0, 2] is None + + def test_single_column(self): + """Single column CSV.""" + csv = '"only"\r\n"val1"\r\n"val2"' + result = 
+        assert list(result.columns) == ["only"]
+        assert len(result) == 2
+        assert result.iloc[0, 0] == "val1"
+        assert result.iloc[1, 0] == "val2"
+
+    def test_leading_unquoted_value(self):
+        """Line starting with unquoted value."""
+        csv = '"col1";"col2"\r\nabc;"def"'
+        result = from_cadenza_csv(csv)
+        assert result.iloc[0, 0] is None
+        assert result.iloc[0, 1] == "def"
+
+    def test_trailing_unquoted_value(self):
+        """Line ending with unquoted value."""
+        csv = '"col1";"col2"\r\n"abc";xyz'
+        result = from_cadenza_csv(csv)
+        assert result.iloc[0, 0] == "abc"
+        assert result.iloc[0, 1] is None
+
+    def test_complex_real_world_example(self):
+        """Complex real-world example with mixed types."""
+        csv = (
+            '"id";"name";"score";"comment"\r\n'
+            '"1";"Alice";"95.5";"Excellent work""!"""\r\n'
+            '"2";;"75.0";""\r\n'
+            ';"Charlie";;"No comment"'
+        )
+        result = from_cadenza_csv(csv)
+
+        # Row 0
+        assert result.iloc[0, 0] == "1"
+        assert result.iloc[0, 1] == "Alice"
+        assert result.iloc[0, 2] == "95.5"
+        assert result.iloc[0, 3] == 'Excellent work"!"'
+
+        # Row 1
+        assert result.iloc[1, 0] == "2"
+        assert result.iloc[1, 1] is None
+        assert result.iloc[1, 2] == "75.0"
+        assert result.iloc[1, 3] == ""
+
+        # Row 2
+        assert result.iloc[2, 0] is None
+        assert result.iloc[2, 1] == "Charlie"
+        assert result.iloc[2, 2] is None
+        assert result.iloc[2, 3] == "No comment"
+
+    def test_type_mapping_multiple_columns(self):
+        """Type mapping for multiple columns with different types."""
+        csv = '"id";"score";"name"\r\n"1";"98.5";"Alice"\r\n;"75.0";"Bob"'
+        result = from_cadenza_csv(
+            csv,
+            type_mapping={"id": "Int64", "score": "Float64", "name": "string"}
+        )
+
+        assert result["id"].dtype == "Int64"
+        assert result["score"].dtype == "Float64"
+        assert result["name"].dtype == "string"
+
+        assert result.iloc[0, 0] == 1
+        assert result.iloc[0, 1] == 98.5
+        assert result.iloc[0, 2] == "Alice"
+
+        assert pd.isna(result.iloc[1, 0])
+        assert result.iloc[1, 1] == 75.0
+        assert result.iloc[1, 2] == "Bob"
+
+    def test_geometry_parsing(self):
+        """Geometry columns should be parsed from WKT when specified."""
+        csv = '"location";"name"\r\n"POINT (1 2)";"Place A"'
+        result = from_cadenza_csv(csv, geometry_columns=["location"])
+        assert result.iloc[0, 0] == Point(1, 2)
+        assert result.iloc[0, 1] == "Place A"
+
+    def test_geometry_parsing_with_none(self):
+        """Geometry parsing should handle None values."""
+        csv = '"location"\r\n"POINT (1 2)"\r\nx'
+        result = from_cadenza_csv(csv, geometry_columns=["location"])
+        assert result.iloc[0, 0] == Point(1, 2)
+        assert result.iloc[1, 0] is None  # Unquoted 'x' becomes None
+
+    def test_geometry_parsing_multiple_types(self):
+        """Geometry parsing should handle different geometry types."""
+        csv = '"geom"\r\n"POINT (1 2)"\r\n"LINESTRING (0 0, 1 1)"'
+        result = from_cadenza_csv(csv, geometry_columns=["geom"])
+        assert result.iloc[0, 0] == Point(1, 2)
+        assert result.iloc[1, 0] == LineString([(0, 0), (1, 1)])
+
+    def test_combined_type_datetime_geometry(self):
+        """Test combining type mapping, datetime and geometry parsing."""
+        csv = (
+            '"id";"timestamp";"location";"value"\r\n'
+            '"1";"2023-01-03T15:29:13Z";"POINT (10 20)";"42.5"'
+        )
+        result = from_cadenza_csv(
+            csv,
+            type_mapping={"id": "Int64", "value": "Float64"},
+            datetime_columns=["timestamp"],
+            geometry_columns=["location"]
+        )
+        assert result.iloc[0, 0] == 1
+        assert result.iloc[0, 1] == pd.Timestamp("2023-01-03T15:29:13Z")
+        assert result.iloc[0, 2] == Point(10, 20)
+        assert result.iloc[0, 3] == 42.5
+
+    def test_datetime_and_geometry_with_nulls(self):
+        """Test datetime and geometry parsing with mix of values and nulls."""
+        csv = (
+            '"timestamp";"location"\r\n'
+            '"2023-01-03T15:29:13Z";"POINT (10 20)"\r\n'
+            'x;"POINT (5 5)"\r\n'
+            '"2023-06-15T10:00:00Z";y\r\n'
+            'a;b'
+        )
+        result = from_cadenza_csv(
+            csv,
+            datetime_columns=["timestamp"],
+            geometry_columns=["location"]
+        )
+
+        # Row 0: both have values
+        assert result.iloc[0, 0] == pd.Timestamp("2023-01-03T15:29:13Z")
+        assert result.iloc[0, 1] == Point(10, 20)
+
+        # Row 1: timestamp is None, geometry has value
+        assert pd.isna(result.iloc[1, 0])
+        assert result.iloc[1, 1] == Point(5, 5)
+
+        # Row 2: timestamp has value, geometry is None
+        assert result.iloc[2, 0] == pd.Timestamp("2023-06-15T10:00:00Z")
+        assert result.iloc[2, 1] is None
+
+        # Row 3: both are None
+        assert pd.isna(result.iloc[3, 0])
+        assert result.iloc[3, 1] is None
+
+    def test_rows_with_all_nones_empty_strings_and_values(self):
+        """Test rows with only None values, only empty strings, and only real values."""
+        csv = (
+            '"col1";"col2";"col3"\r\n'
+            '"value1";"value2";"value3"\r\n'
+            '"";"";""\r\n'
+            ';;'
+        )
+        result = from_cadenza_csv(csv)
+
+        # Row 0: all real values
+        assert result.iloc[0, 0] == "value1"
+        assert result.iloc[0, 1] == "value2"
+        assert result.iloc[0, 2] == "value3"
+
+        # Row 1: all empty strings (quoted)
+        assert result.iloc[1, 0] == ""
+        assert result.iloc[1, 1] == ""
+        assert result.iloc[1, 2] == ""
+
+        # Row 2: all None values (unquoted)
+        assert result.iloc[2, 0] is None
+        assert result.iloc[2, 1] is None
+        assert result.iloc[2, 2] is None
+
+    def test_unicode_characters(self):
+        """Test handling of various Unicode characters."""
+        csv = '"text"\r\n"Hello 世界"\r\n"Emoji: 😀🎉"\r\n"Français: café"'
+        result = from_cadenza_csv(csv)
+        assert len(result) == 3
+        assert result.iloc[0, 0] == "Hello 世界"
+        assert result.iloc[1, 0] == "Emoji: 😀🎉"
+        assert result.iloc[2, 0] == "Français: café"
+
+    def test_very_long_value(self):
+        """Test handling of very long values."""
+        long_text = "a" * 10000
+        csv = f'"text"\r\n"{long_text}"'
+        result = from_cadenza_csv(csv)
+        assert result.iloc[0, 0] == long_text
+
+    def test_many_columns(self):
+        """Test handling of many columns."""
+        num_cols = 100
+        headers = ';'.join([f'"col{i}"' for i in range(num_cols)])
+        values = ';'.join([f'"val{i}"' for i in range(num_cols)])
+        csv = f'{headers}\r\n{values}'
+        result = from_cadenza_csv(csv)
+        assert len(result.columns) == num_cols
+        assert result.iloc[0, 0] == "val0"
+        assert result.iloc[0, 99] == "val99"
+
+    def test_multiple_consecutive_semicolons(self):
+        """Test handling of multiple consecutive semicolons (multiple None values)."""
+        csv = '"a";"b";"c";"d"\r\n"val1";;;"val4"'
+        result = from_cadenza_csv(csv)
+        assert result.iloc[0, 0] == "val1"
+        assert result.iloc[0, 1] is None
+        assert result.iloc[0, 2] is None
+        assert result.iloc[0, 3] == "val4"
+
+    def test_quoted_value_with_only_whitespace(self):
+        """Test that quoted whitespace is preserved."""
+        csv = '"col1";"col2"\r\n" ";" \t "'
+        result = from_cadenza_csv(csv)
+        assert result.iloc[0, 0] == " "
+        assert result.iloc[0, 1] == " \t "
+
+    def test_mixed_line_endings_in_quoted_values(self):
+        """Test handling of different line endings within quoted values."""
+        csv = '"text"\r\n"line1\nline2"\r\n"line3\rline4"'
+        result = from_cadenza_csv(csv)
+        assert result.iloc[0, 0] == "line1\nline2"
+        assert result.iloc[1, 0] == "line3\rline4"
+
+    def test_type_mapping_invalid_conversion(self):
+        """Test type mapping with invalid values."""
+        csv = '"number"\r\n"abc"\r\n"123"'
+        with pytest.raises(ValueError):
+            from_cadenza_csv(csv, type_mapping={"number": "Int64"})
+
+    def test_geometry_invalid_wkt(self):
+        """Test geometry parsing with invalid WKT."""
+        csv = '"location"\r\n"INVALID WKT"'
+        result = from_cadenza_csv(csv, geometry_columns=["location"])
+        # shapely's from_wkt with on_invalid='warn' returns None for invalid WKT
+        assert result.iloc[0, 0] is None
+
+    def test_polygon_and_multipoint_geometries(self):
+        """Test parsing of more complex geometry types."""
+        csv = '"geom"\r\n"POLYGON ((0 0, 1 0, 1 1, 0 1, 0 0))"\r\n"MULTIPOINT ((0 0), (1 1))"'
+        result = from_cadenza_csv(csv, geometry_columns=["geom"])
+        assert isinstance(result.iloc[0, 0], Polygon)
+        assert isinstance(result.iloc[1, 0], MultiPoint)
+
+    def test_datetime_with_different_timezones(self):
+        """Test datetime parsing with different timezone offset formats:
+        These inputs are unexpected and will currently not happen from Cadenza, yet we want
+        to document the current behavior which will not produce a datetime64 dtype but 'object' dtype."""
+        csv = (
+            '"timestamp"\r\n'
+            '"2023-01-03T15:29:13Z"\r\n'
+            '"2023-01-03T15:29:13+00:00"\r\n'
+            '"2023-01-03T10:29:13-05:00"'
+        )
+        result = from_cadenza_csv(csv, datetime_columns=["timestamp"])
+        assert not pd.api.types.is_datetime64_any_dtype(result["timestamp"])
+        assert result["timestamp"].dtype == "object"
+        assert len(result) == 3
+        assert pd.notna(result.iloc[0, 0])
+        assert pd.notna(result.iloc[1, 0])
+        assert pd.notna(result.iloc[2, 0])
+        # All three timestamps represent the same UTC moment (when compared)
+        # but may have different timezone info preserved
+        assert result.iloc[0, 0] == result.iloc[1, 0]
+        assert result.iloc[0, 0] == result.iloc[2, 0]
+
+    def test_datetime_with_timezone_and_none_values(self):
+        """Test datetime parsing with the same timezone format and None values."""
+        csv = (
+            '"timestamp";"value"\r\n'
+            '"2023-01-03T15:29:13+01:00";"a"\r\n'
+            ';""\r\n'
+            '"2023-06-15T08:00:00+01:00";"c"\r\n'
+            '"2023-12-01T00:00:00+01:00";"d"\r\n'
+            ';"e"'
+        )
+        result = from_cadenza_csv(csv, datetime_columns=["timestamp"])
+        assert pd.api.types.is_datetime64_any_dtype(result["timestamp"])
+        assert len(result) == 5
+        assert pd.notna(result.iloc[0, 0])
+        assert pd.isna(result.iloc[1, 0])  # None
+        assert pd.notna(result.iloc[2, 0])
+        assert pd.notna(result.iloc[3, 0])
+        assert pd.isna(result.iloc[4, 0])  # None
+
+    def test_special_characters_in_column_names(self):
+        """Test handling of special characters in column names."""
+        csv = '"col;1";"col""2";"col\r\n3"\r\n"a";"b";"c"'
+        result = from_cadenza_csv(csv)
+        assert list(result.columns) == ["col;1", 'col"2', "col\r\n3"]
+        assert result.iloc[0, 0] == "a"
+
+    def test_empty_dataframe_with_headers_only(self):
+        """Test DataFrame with headers but no data rows."""
+        csv = '"col1";"col2";"col3"'
+        result = from_cadenza_csv(csv)
+        assert list(result.columns) == ["col1", "col2", "col3"]
+        assert len(result) == 0
+
+    def test_numeric_string_preservation(self):
+        """Test that numeric-looking strings are preserved without conversion."""
+        csv = '"code"\r\n"00123"\r\n"001.5"\r\n"+123"\r\n"-456"'
+        result = from_cadenza_csv(csv)
+        assert result.iloc[0, 0] == "00123"
+        assert result.iloc[1, 0] == "001.5"
+        assert result.iloc[2, 0] == "+123"
+        assert result.iloc[3, 0] == "-456"
+
+    def test_boolean_like_values(self):
+        """Test handling of boolean-like values as strings."""
+        csv = '"flag"\r\n"true"\r\n"false"\r\n"True"\r\n"False"'
+        result = from_cadenza_csv(csv)
+        assert result.iloc[0, 0] == "true"
+        assert result.iloc[1, 0] == "false"
+        assert result.iloc[2, 0] == "True"
+        assert result.iloc[3, 0] == "False"
+
+    def test_large_numbers(self):
+        """Test handling of very large numbers with type mapping."""
+        csv = '"big_int";"big_float"\r\n"9999999999999999";"1.7976931348623157e+308"'
+        result = from_cadenza_csv(
+            csv,
+            type_mapping={"big_int": "Int64", "big_float": "Float64"}
+        )
+        assert result.iloc[0, 0] == 9999999999999999
+        assert result.iloc[0, 1] == pytest.approx(1.7976931348623157e+308)
+
+    def test_mixed_none_types(self):
+        """Test DataFrame with mix of None, np.nan, and pd.NA."""
+        csv = '"col1";"col2";"col3"\r\n"a";;"c"'
+        result = from_cadenza_csv(csv)
+        assert result.iloc[0, 0] == "a"
+        assert result.iloc[0, 1] is None
+        assert result.iloc[0, 2] == "c"
+
+    def test_datetime_precision(self):
+        """Test datetime parsing with milliseconds and microseconds."""
+        csv = (
+            '"timestamp"\r\n'
+            '"2023-01-03T15:29:13.123456Z"\r\n'
+            '"2023-01-03T15:29:13.000Z"'
+        )
+        result = from_cadenza_csv(csv, datetime_columns=["timestamp"])
+        assert pd.api.types.is_datetime64_any_dtype(result["timestamp"])
+        # Check that precision is maintained
+        assert result.iloc[0, 0].microsecond == 123456
+
+    def test_all_column_types_combined(self):
+        """Comprehensive test with all supported column types."""
+        csv = (
+            '"id";"name";"score";"active";"timestamp";"location";"notes"\r\n'
+            '"1";"Alice";"95.5";"true";"2023-01-03T15:29:13Z";"POINT (10 20)";"First student"\r\n'
+            ';"Bob";;;"2023-06-15T10:00:00+01";;""\r\n'
+            '"3";;"75.0";"false";;"POINT (30 40)";'
+        )
+        result = from_cadenza_csv(
+            csv,
+            type_mapping={"id": "Int64", "score": "Float64"},
+            datetime_columns=["timestamp"],
+            geometry_columns=["location"]
+        )
+
+        # Row 0
+        assert result.iloc[0, 0] == 1
+        assert result.iloc[0, 1] == "Alice"
+        assert result.iloc[0, 2] == 95.5
+        assert result.iloc[0, 3] == "true"
+        assert result.iloc[0, 4] == pd.Timestamp("2023-01-03T15:29:13Z")
+        assert result.iloc[0, 5] == Point(10, 20)
+        assert result.iloc[0, 6] == "First student"
+
+        # Row 1 - mix of None and empty string
+        assert pd.isna(result.iloc[1, 0])
+        assert result.iloc[1, 1] == "Bob"
+        assert pd.isna(result.iloc[1, 2])
+        assert result.iloc[1, 3] is None
+        assert result.iloc[1, 4] == pd.Timestamp("2023-06-15T10:00:00+01")
+        assert result.iloc[1, 5] is None
+        assert result.iloc[1, 6] == ""
+
+        # Row 2
+        assert result.iloc[2, 0] == 3
+        assert result.iloc[2, 1] is None
+        assert result.iloc[2, 2] == 75.0
+        assert result.iloc[2, 3] == "false"
+        assert pd.isna(result.iloc[2, 4])
+        assert result.iloc[2, 5] == Point(30, 40)
+        assert result.iloc[2, 6] is None
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/src/cadenzaanalytics/tests/test_csv_roundtrip.py b/src/cadenzaanalytics/tests/test_csv_roundtrip.py
new file mode 100644
index 0000000..e59c3f2
--- /dev/null
+++ b/src/cadenzaanalytics/tests/test_csv_roundtrip.py
@@ -0,0 +1,581 @@
+"""Roundtrip tests for Cadenza CSV reader and writer.
+
+Tests that write -> read -> write yields the same results.
+""" +from datetime import timedelta +from shapely.geometry import Point, LineString, Polygon, MultiPoint + +import pandas as pd +import numpy as np +import pytest +from cadenzaanalytics.util.csv import from_cadenza_csv, to_cadenza_csv + +#pylint: disable=too-many-public-methods +class TestCadenzaCsvRoundtrip: + """Test suite for CSV roundtrip (write-read-write).""" + + def test_simple_values_roundtrip(self): + """Simple values should roundtrip correctly.""" + df1 = pd.DataFrame({"col1": ["a", "b"], "col2": ["c", "d"]}) + csv1 = to_cadenza_csv(df1) + df2 = from_cadenza_csv(csv1) + csv2 = to_cadenza_csv(df2) + assert csv1 == csv2 + + def test_none_values_roundtrip(self): + """None values should roundtrip correctly.""" + df1 = pd.DataFrame({"col1": [None, "value"], "col2": ["value2", None]}) + csv1 = to_cadenza_csv(df1) + df2 = from_cadenza_csv(csv1) + csv2 = to_cadenza_csv(df2) + assert csv1 == csv2 + + def test_empty_strings_roundtrip(self): + """Empty strings should roundtrip correctly (different from None).""" + df1 = pd.DataFrame({"col1": ["", "value"], "col2": ["value2", ""]}) + csv1 = to_cadenza_csv(df1) + df2 = from_cadenza_csv(csv1) + csv2 = to_cadenza_csv(df2) + assert csv1 == csv2 + # Verify empty strings are preserved + assert df2.iloc[0, 0] == "" + assert df2.iloc[1, 1] == "" + + def test_mixed_none_and_empty_roundtrip(self): + """Mix of None and empty strings should roundtrip correctly.""" + df1 = pd.DataFrame({ + "col1": ["", None, "value"], + "col2": [None, "", "value2"], + "col3": ["v1", "v2", "v3"] + }) + csv1 = to_cadenza_csv(df1) + df2 = from_cadenza_csv(csv1) + csv2 = to_cadenza_csv(df2) + assert csv1 == csv2 + # Verify distinction is preserved + assert df2.iloc[0, 0] == "" # Empty string + assert df2.iloc[0, 1] is None # None + assert df2.iloc[1, 0] is None # None + assert df2.iloc[1, 1] == "" # Empty string + + def test_escaped_quotes_roundtrip(self): + """Escaped quotes should roundtrip correctly.""" + df1 = pd.DataFrame({"text": ['He said "hello"', 'She replied "hi"']}) + csv1 = to_cadenza_csv(df1) + df2 = from_cadenza_csv(csv1) + csv2 = to_cadenza_csv(df2) + assert csv1 == csv2 + assert df2.iloc[0, 0] == 'He said "hello"' + + def test_semicolons_roundtrip(self): + """Semicolons in values should roundtrip correctly.""" + df1 = pd.DataFrame({"col": ["a;b;c", "x;y"]}) + csv1 = to_cadenza_csv(df1) + df2 = from_cadenza_csv(csv1) + csv2 = to_cadenza_csv(df2) + assert csv1 == csv2 + + def test_newlines_roundtrip(self): + """Newlines in values should roundtrip correctly.""" + df1 = pd.DataFrame({"text": ["line1\r\nline2", "line3\r\nline4"]}) + csv1 = to_cadenza_csv(df1) + df2 = from_cadenza_csv(csv1) + csv2 = to_cadenza_csv(df2) + assert csv1 == csv2 + + def test_numbers_roundtrip(self): + """Numbers should roundtrip as strings (default behavior).""" + df1 = pd.DataFrame({"int": ["123", "456"], "float": ["45.67", "89.01"]}) + csv1 = to_cadenza_csv(df1) + df2 = from_cadenza_csv(csv1) + csv2 = to_cadenza_csv(df2) + assert csv1 == csv2 + + def test_typed_numbers_roundtrip(self): + """Typed numbers should roundtrip with type mapping.""" + df1 = pd.DataFrame({"int": [123, 456], "float": [45.67, 89.01]}) + df1 = df1.astype({"int": "Int64", "float": "Float64"}) + csv1 = to_cadenza_csv(df1) + df2 = from_cadenza_csv(csv1, type_mapping={"int": "Int64", "float": "Float64"}) + csv2 = to_cadenza_csv(df2) + assert csv1 == csv2 + # Verify types are preserved + assert df2["int"].dtype == "Int64" + assert df2["float"].dtype == "Float64" + + def test_datetime_roundtrip(self): + """Datetime 
+        df1 = pd.DataFrame({
+            "timestamp": [pd.Timestamp("2023-01-03T15:29:13Z"), pd.Timestamp("2024-06-15T10:00:00Z")],
+            "value": ["a", "b"]
+        })
+        csv1 = to_cadenza_csv(df1, datetime_columns=["timestamp"])
+        df2 = from_cadenza_csv(csv1, datetime_columns=["timestamp"])
+        csv2 = to_cadenza_csv(df2, datetime_columns=["timestamp"])
+        assert csv1 == csv2
+        # Verify datetime type is preserved (all same timezone -> pandas datetime dtype)
+        assert pd.api.types.is_datetime64_any_dtype(df2["timestamp"])
+
+    def test_datetime_with_none_roundtrip(self):
+        """Datetime with None values should roundtrip correctly."""
+        df1 = pd.DataFrame({
+            "timestamp": [pd.Timestamp("2023-01-03T15:29:13Z"), None]
+        })
+        csv1 = to_cadenza_csv(df1, datetime_columns=["timestamp"])
+        df2 = from_cadenza_csv(csv1, datetime_columns=["timestamp"])
+        csv2 = to_cadenza_csv(df2, datetime_columns=["timestamp"])
+        assert csv1 == csv2
+        assert pd.isna(df2.iloc[1, 0])
+
+    def test_datetime_with_none_and_timezoneoffset_roundtrip(self):
+        """Datetime with None values (and/or time zone offsets) should roundtrip correctly."""
+        df1 = pd.DataFrame({
+            "timestamp": [pd.Timestamp("2023-01-03T15:29:13Z"), None, pd.Timestamp("2024-06-15T10:00:00+01:00")]
+        })
+        csv1 = to_cadenza_csv(df1, datetime_columns=["timestamp"])
+        df2 = from_cadenza_csv(csv1, datetime_columns=["timestamp"])
+        csv2 = to_cadenza_csv(df2, datetime_columns=["timestamp"])
+        assert csv1 == csv2
+        assert pd.isna(df2.iloc[1, 0])
+
+    def test_datetime_with_none_and_same_timezoneoffset_roundtrip(self):
+        """Datetime with None values (and/or SAME time zone offsets) should roundtrip correctly."""
+        df1 = pd.DataFrame({
+            "timestamp": [pd.Timestamp("2023-01-03T15:29:13+01:00"), None, pd.Timestamp("2024-06-15T10:00:00+01:00")]
+        })
+        csv1 = to_cadenza_csv(df1, datetime_columns=["timestamp"])
+        df2 = from_cadenza_csv(csv1, datetime_columns=["timestamp"])
+        csv2 = to_cadenza_csv(df2, datetime_columns=["timestamp"])
+        assert csv1 == csv2
+        assert pd.isna(df2.iloc[1, 0])
+
+    def test_geometry_roundtrip(self):
+        """Geometry values should roundtrip correctly."""
+        df1 = pd.DataFrame({
+            "location": [Point(1, 2), Point(3, 4)],
+            "name": ["A", "B"]
+        })
+        csv1 = to_cadenza_csv(df1, geometry_columns=["location"])
+        df2 = from_cadenza_csv(csv1, geometry_columns=["location"])
+        csv2 = to_cadenza_csv(df2, geometry_columns=["location"])
+        assert csv1 == csv2
+        # Verify geometries are preserved
+        assert df2.iloc[0, 0] == Point(1, 2)
+        assert df2.iloc[1, 0] == Point(3, 4)
+
+    def test_geometry_with_none_roundtrip(self):
+        """Geometry with None values should roundtrip correctly."""
+        df1 = pd.DataFrame({
+            "location": [Point(1, 2), None, Point(5, 6)]
+        })
+        csv1 = to_cadenza_csv(df1, geometry_columns=["location"])
+        df2 = from_cadenza_csv(csv1, geometry_columns=["location"])
+        csv2 = to_cadenza_csv(df2, geometry_columns=["location"])
+        assert csv1 == csv2
+        assert df2.iloc[1, 0] is None
+
+    def test_complex_roundtrip(self):
+        """Complex data with all types should roundtrip correctly."""
+        df1 = pd.DataFrame({
+            "id": [1, 2, 3],
+            "name": ["Alice", "", None],
+            "timestamp": [pd.Timestamp("2023-01-03T15:29:13Z"), None, pd.Timestamp("2024-01-01T00:00:00Z")],
+            "location": [Point(10, 20), None, Point(30, 40)],
+            "score": [95.5, 75.0, None]
+        })
+        df1 = df1.astype({"id": "Int64", "score": "Float64"})
+
+        csv1 = to_cadenza_csv(
+            df1,
+            datetime_columns=["timestamp"],
+            geometry_columns=["location"]
+        )
+        df2 = from_cadenza_csv(
+            csv1,
+            type_mapping={"id": "Int64", "score": "Float64"},
"score": "Float64"}, + datetime_columns=["timestamp"], + geometry_columns=["location"] + ) + csv2 = to_cadenza_csv( + df2, + datetime_columns=["timestamp"], + geometry_columns=["location"] + ) + + assert csv1 == csv2 + + # Verify all values are preserved correctly + assert df2.iloc[0, 1] == "Alice" + assert df2.iloc[1, 1] == "" + assert df2.iloc[2, 1] is None + assert df2.iloc[1, 2] is pd.NaT or pd.isna(df2.iloc[1, 2]) + assert df2.iloc[1, 3] is None + assert pd.isna(df2.iloc[2, 4]) + + def test_all_none_row_roundtrip(self): + """Row with all None values should roundtrip correctly.""" + df1 = pd.DataFrame({ + "a": ["val1", None, "val3"], + "b": ["val2", None, "val4"], + "c": ["val3", None, "val5"] + }) + csv1 = to_cadenza_csv(df1) + df2 = from_cadenza_csv(csv1) + csv2 = to_cadenza_csv(df2) + assert csv1 == csv2 + # Verify all-None row + assert df2.iloc[1, 0] is None + assert df2.iloc[1, 1] is None + assert df2.iloc[1, 2] is None + + def test_single_column_roundtrip(self): + """Single column should roundtrip correctly.""" + df1 = pd.DataFrame({"only": ["val1", None, ""]}) + csv1 = to_cadenza_csv(df1) + df2 = from_cadenza_csv(csv1) + csv2 = to_cadenza_csv(df2) + assert csv1 == csv2 + assert df2.iloc[1, 0] is None + assert df2.iloc[2, 0] == "" + + def test_unicode_roundtrip(self): + """Unicode characters should roundtrip correctly.""" + df1 = pd.DataFrame({ + "text": ["Hello 世界", "Emoji: 😀🎉", "Français: café", "Русский: привет"] + }) + csv1 = to_cadenza_csv(df1) + df2 = from_cadenza_csv(csv1) + csv2 = to_cadenza_csv(df2) + assert csv1 == csv2 + assert df2.iloc[0, 0] == "Hello 世界" + assert df2.iloc[1, 0] == "Emoji: 😀🎉" + + def test_large_dataset_roundtrip(self): + """Large dataset should roundtrip correctly.""" + num_rows = 1000 + df1 = pd.DataFrame({ + "id": list(range(num_rows)), + "value": [f"val{i}" for i in range(num_rows)], + "nullable": [None if i % 3 == 0 else f"data{i}" for i in range(num_rows)] + }) + df1 = df1.astype({"id": "Int64"}) + csv1 = to_cadenza_csv(df1) + df2 = from_cadenza_csv(csv1, type_mapping={"id": "Int64"}) + csv2 = to_cadenza_csv(df2) + assert csv1 == csv2 + assert len(df2) == num_rows + + def test_many_columns_roundtrip(self): + """Many columns should roundtrip correctly.""" + num_cols = 100 + data = {f"col{i}": [f"val{i}", None, ""] for i in range(num_cols)} + df1 = pd.DataFrame(data) + csv1 = to_cadenza_csv(df1) + df2 = from_cadenza_csv(csv1) + csv2 = to_cadenza_csv(df2) + assert csv1 == csv2 + assert len(df2.columns) == num_cols + + def test_special_characters_in_headers_roundtrip(self): + """Special characters in column names should roundtrip correctly.""" + df1 = pd.DataFrame({ + "col;1": ["a", "b"], + 'col"2': ["c", "d"], + "col\r\n3": ["e", "f"] + }) + csv1 = to_cadenza_csv(df1) + df2 = from_cadenza_csv(csv1) + csv2 = to_cadenza_csv(df2) + assert csv1 == csv2 + assert list(df2.columns) == ["col;1", 'col"2', "col\r\n3"] + + def test_whitespace_values_roundtrip(self): + """Whitespace values should roundtrip correctly.""" + df1 = pd.DataFrame({ + "spaces": [" ", " a ", ""], + "tabs": ["\t\t", "a\tb", None] + }) + csv1 = to_cadenza_csv(df1) + df2 = from_cadenza_csv(csv1) + csv2 = to_cadenza_csv(df2) + assert csv1 == csv2 + assert df2.iloc[0, 0] == " " + assert df2.iloc[0, 1] == "\t\t" + + def test_multiple_consecutive_none_roundtrip(self): + """Multiple consecutive None values should roundtrip correctly.""" + df1 = pd.DataFrame({ + "col1": ["a", None, None, None, "b"], + "col2": [None, None, "c", None, None], + "col3": [None, None, None, None, None] + }) + csv1 = 
+        df2 = from_cadenza_csv(csv1)
+        csv2 = to_cadenza_csv(df2)
+        assert csv1 == csv2
+        assert df2.iloc[2, 2] is None
+        assert df2.iloc[4, 2] is None
+
+    def test_numeric_strings_roundtrip(self):
+        """Numeric-looking strings should roundtrip without type conversion."""
+        df1 = pd.DataFrame({
+            "codes": ["00123", "001.5", "+123", "-456"],
+            "scientific": ["1.5e-10", "3.14e+20", "1e5", "2E-3"]
+        })
+        csv1 = to_cadenza_csv(df1)
+        df2 = from_cadenza_csv(csv1)
+        csv2 = to_cadenza_csv(df2)
+        assert csv1 == csv2
+        assert df2.iloc[0, 0] == "00123"
+        assert df2.iloc[1, 1] == "3.14e+20"
+
+    def test_mixed_geometry_types_roundtrip(self):
+        """Mix of different geometry types should roundtrip correctly."""
+        df1 = pd.DataFrame({
+            "geom": [
+                Point(1, 2),
+                LineString([(0, 0), (1, 1)]),
+                Polygon([(0, 0), (1, 0), (1, 1), (0, 1), (0, 0)]),
+                MultiPoint([(0, 0), (1, 1)]),
+                None
+            ]
+        })
+        csv1 = to_cadenza_csv(df1, geometry_columns=["geom"])
+        df2 = from_cadenza_csv(csv1, geometry_columns=["geom"])
+        csv2 = to_cadenza_csv(df2, geometry_columns=["geom"])
+        assert csv1 == csv2
+        assert df2.iloc[0, 0] == Point(1, 2)
+        assert df2.iloc[4, 0] is None
+
+    def test_datetime_with_different_timezones_roundtrip(self):
+        """Datetime values with different timezone offsets should roundtrip correctly."""
+        # Input CSV with mixed timezone formats
+        # Note: +00:00 gets normalized to Z on output for consistency
+        csv_input = (
+            '"timestamp"\r\n'
+            '"2023-01-03T15:29:13Z"\r\n'
+            '"2023-01-03T15:30:13+00:00"\r\n'
+            '"2023-01-03T10:29:13-05:00"\r\n'
+            '\r\n'  # None value
+        )
+
+        # Parse preserves timezone information
+        df1 = from_cadenza_csv(csv_input, datetime_columns=["timestamp"])
+        csv1 = to_cadenza_csv(df1, datetime_columns=["timestamp"])
+
+        # Check that output normalizes +00:00 to Z but preserves other offsets
+        assert '"2023-01-03T15:30:13Z"' in csv1  # First two become Z
+        assert '"2023-01-03T10:29:13-05:00"' in csv1  # -05:00 preserved
+
+        # Second roundtrip should be stable
+        df2 = from_cadenza_csv(csv1, datetime_columns=["timestamp"])
+        csv2 = to_cadenza_csv(df2, datetime_columns=["timestamp"])
+
+        assert csv1 == csv2
+        assert len(df2) == 4
+        # Mixed timezones result in object dtype with Timestamp values (not pandas datetime dtype)
+        # This is expected behavior when not all values have the same timezone
+        assert df2["timestamp"].dtype == object
+        assert df2.iloc[1, 0] - df2.iloc[0, 0] == timedelta(minutes=1)
+        assert df2.iloc[0, 0] == df2.iloc[2, 0]
+        assert pd.isna(df2.iloc[3, 0])
+
+    def test_mixed_numeric_types_roundtrip(self):
+        """Mix of Int64, Float64, and None should roundtrip correctly."""
+        df1 = pd.DataFrame({
+            "int_col": [1, 2, None, 4, None],
+            "float_col": [1.1, None, 3.3, None, 5.5]
+        })
+        df1 = df1.astype({"int_col": "Int64", "float_col": "Float64"})
+        csv1 = to_cadenza_csv(df1)
+        df2 = from_cadenza_csv(csv1, type_mapping={"int_col": "Int64", "float_col": "Float64"})
+        csv2 = to_cadenza_csv(df2)
+        assert csv1 == csv2
+        assert df2["int_col"].dtype == "Int64"
+        assert df2["float_col"].dtype == "Float64"
+
+    def test_consecutive_quotes_roundtrip(self):
+        """Multiple consecutive quotes should roundtrip correctly."""
+        df1 = pd.DataFrame({
+            "text": ['He said ""hello""', '"""', 'A "word" here', '""""""']
+        })
+        csv1 = to_cadenza_csv(df1)
+        df2 = from_cadenza_csv(csv1)
+        csv2 = to_cadenza_csv(df2)
+        assert csv1 == csv2
+        assert df2.iloc[0, 0] == 'He said ""hello""'
+        assert df2.iloc[1, 0] == '"""'
+
+    def test_complex_mixed_data_roundtrip(self):
+        """Complex real-world scenario with all data types mixed."""
+        df1 = pd.DataFrame({
+            "id": [1, 2, None, 4, 5],
+            "name": ["Alice", "", None, "Bob", "Charlie"],
+            "score": [95.5, None, 88.0, 75.0, None],
+            "timestamp": [
+                pd.Timestamp("2023-01-03T15:29:13Z"),
+                None,
+                pd.Timestamp("2023-06-15T10:00:00Z"),
+                pd.Timestamp("2023-12-01T00:00:00Z"),
+                None
+            ],
+            "location": [Point(10, 20), None, LineString([(0, 0), (1, 1)]), Point(30, 40), None],
+            "notes": ['Contains "quotes"', "Has\r\nnewlines", "", None, "Normal text"],
+            "code": ["00123", "+456", None, "", "-789"]
+        })
+        df1 = df1.astype({"id": "Int64", "score": "Float64"})
+
+        csv1 = to_cadenza_csv(
+            df1,
+            datetime_columns=["timestamp"],
+            geometry_columns=["location"]
+        )
+        df2 = from_cadenza_csv(
+            csv1,
+            type_mapping={"id": "Int64", "score": "Float64"},
+            datetime_columns=["timestamp"],
+            geometry_columns=["location"]
+        )
+        csv2 = to_cadenza_csv(
+            df2,
+            datetime_columns=["timestamp"],
+            geometry_columns=["location"]
+        )
+
+        assert csv1 == csv2
+
+        # Verify specific values are preserved
+        assert df2.iloc[0, 0] == 1
+        assert df2.iloc[1, 1] == ""
+        assert df2.iloc[2, 1] is None
+        assert df2.iloc[1, 2] is pd.NA or pd.isna(df2.iloc[1, 2])
+        assert df2.iloc[3, 4] == Point(30, 40)
+        assert df2.iloc[0, 5] == 'Contains "quotes"'
+
+    def test_zero_and_negative_numbers_roundtrip(self):
+        """Zero and negative numbers should roundtrip correctly."""
+        df1 = pd.DataFrame({
+            "int_val": [0, -1, -999, None],
+            "float_val": [0.0, -1.5, -999.99, None]
+        })
+        df1 = df1.astype({"int_val": "Int64", "float_val": "Float64"})
+        csv1 = to_cadenza_csv(df1)
+        df2 = from_cadenza_csv(csv1, type_mapping={"int_val": "Int64", "float_val": "Float64"})
+        csv2 = to_cadenza_csv(df2)
+        assert csv1 == csv2
+        assert df2.iloc[0, 0] == 0
+        assert df2.iloc[1, 1] == -1.5
+
+    def test_empty_rows_pattern_roundtrip(self):
+        """Pattern of values and empty rows should roundtrip correctly."""
+        df1 = pd.DataFrame({
+            "col1": ["a", None, "c", None, "e"],
+            "col2": [None, "b", None, "d", None]
+        })
+        csv1 = to_cadenza_csv(df1)
+        df2 = from_cadenza_csv(csv1)
+        csv2 = to_cadenza_csv(df2)
+        assert csv1 == csv2
+        assert df2.iloc[1, 0] is None
+        assert df2.iloc[1, 1] == "b"
+
+    def test_very_long_value_roundtrip(self):
+        """Very long values should roundtrip correctly."""
+        long_text = "a" * 10000
+        df1 = pd.DataFrame({"text": [long_text, "short", None]})
+        csv1 = to_cadenza_csv(df1)
+        df2 = from_cadenza_csv(csv1)
+        csv2 = to_cadenza_csv(df2)
+        assert csv1 == csv2
+        assert df2.iloc[0, 0] == long_text
+
+    def test_datetime_all_none_roundtrip(self):
+        """Datetime column with all None values should roundtrip correctly."""
+        df1 = pd.DataFrame({
+            "timestamp": [None, None, None],
+            "value": ["a", "b", "c"]
+        })
+        df1["timestamp"] = pd.to_datetime(df1["timestamp"])
+
+        csv1 = to_cadenza_csv(df1, datetime_columns=["timestamp"])
+        df2 = from_cadenza_csv(csv1, datetime_columns=["timestamp"])
+        csv2 = to_cadenza_csv(df2, datetime_columns=["timestamp"])
+
+        assert csv1 == csv2
+        # Verify all datetime values are None/NaT
+        assert pd.isna(df2.iloc[0, 0])
+        assert pd.isna(df2.iloc[1, 0])
+        assert pd.isna(df2.iloc[2, 0])
+
+    def test_float_nan_roundtrip(self):
+        """Float NaN values should roundtrip as None (not preserved as NaN)."""
+        df1 = pd.DataFrame({
+            "values": [1.5, np.nan, 3.7, np.nan, 5.2]
+        })
+
+        csv1 = to_cadenza_csv(df1)
+        df2 = from_cadenza_csv(csv1, type_mapping={"values": "Float64"})
+        csv2 = to_cadenza_csv(df2)
+
+        # CSV roundtrip should be identical
+        assert csv1 == csv2
+
+        # Values should be preserved
+        assert df2.iloc[0, 0] == 1.5
+        assert pd.isna(df2.iloc[1, 0])  # NaN becomes pd.NA
+        assert df2.iloc[2, 0] == 3.7
+        assert pd.isna(df2.iloc[3, 0])  # NaN becomes pd.NA
+        assert df2.iloc[4, 0] == 5.2
+
+    def test_literal_nan_string_roundtrip(self):
+        """Literal string 'NaN' should be preserved as a string, not treated as missing value."""
+        # Start with CSV containing quoted "NaN" as a string value
+        csv1 = '"text";"number"\r\n"NaN";"123"\r\n"normal";"456"\r\n'
+
+        df1 = from_cadenza_csv(csv1)
+        csv2 = to_cadenza_csv(df1)
+
+        # CSV should roundtrip identically
+        assert csv1 == csv2
+
+        # "NaN" should be treated as a regular string, not as missing value
+        assert df1.iloc[0, 0] == "NaN"
+        assert df1.iloc[1, 0] == "normal"
+        assert df1.iloc[0, 1] == "123"
+        assert df1.iloc[1, 1] == "456"
+
+        # Verify it roundtrips again
+        df2 = from_cadenza_csv(csv2)
+        csv3 = to_cadenza_csv(df2)
+        assert csv2 == csv3
+
+    def test_nan_string_as_float_roundtrip(self):
+        """Quoted 'NaN' string parsed as Float64 should become actual NaN."""
+        # CSV with quoted "NaN" - should be treated as the string "NaN"
+        # When parsed as Float64, pandas converts the string "NaN" to actual np.nan
+        csv1 = '"values"\r\n"1.5"\r\n"NaN"\r\n"3.7"\r\n'
+
+        # Parse with Float64 type - pandas will convert string "NaN" to actual NaN
+        df1 = from_cadenza_csv(csv1, type_mapping={"values": "Float64"})
+
+        # First value should be 1.5, second should be NaN (from string "NaN"), third should be 3.7
+        assert df1.iloc[0, 0] == 1.5
+        assert pd.isna(df1.iloc[1, 0])  # String "NaN" becomes actual NaN when converting to Float64
+        assert df1.iloc[2, 0] == 3.7
+
+        # Write it back - NaN becomes unquoted (None in CSV)
+        # The expected CSV is different from input because "NaN" string became actual NaN
+        csv2 = to_cadenza_csv(df1)
+        expected_csv2 = '"values"\r\n"1.5"\r\n\r\n"3.7"\r\n'  # NaN becomes unquoted
+        assert csv2 == expected_csv2
+
+        # Read it back again with Float64
+        df2 = from_cadenza_csv(csv2, type_mapping={"values": "Float64"})
+        csv3 = to_cadenza_csv(df2)
+
+        # Should be stable after first conversion
+        assert csv2 == csv3
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/src/cadenzaanalytics/tests/test_csv_writer.py b/src/cadenzaanalytics/tests/test_csv_writer.py
new file mode 100644
index 0000000..f6ed5eb
--- /dev/null
+++ b/src/cadenzaanalytics/tests/test_csv_writer.py
@@ -0,0 +1,435 @@
+"""Unit tests for Cadenza CSV writer."""
+from datetime import datetime, timezone, timedelta
+
+from shapely.geometry import Point, LineString, MultiPoint, MultiLineString, Polygon
+import pandas as pd
+import numpy as np
+import pytest
+from cadenzaanalytics.util.csv import to_cadenza_csv
+
+
+#pylint: disable=too-many-public-methods
+class TestCadenzaCsvWriter:
+    """Test suite for to_cadenza_csv function."""
+
+    def test_empty_dataframe(self):
+        """Empty DataFrame should return empty string."""
+        df = pd.DataFrame()
+        result = to_cadenza_csv(df)
+        assert result == ""
+
+    def test_single_value(self):
+        """Single quoted value."""
+        df = pd.DataFrame({"header": ["value"]})
+        result = to_cadenza_csv(df)
+        assert result == '"header"\r\n"value"\r\n'
+
+    def test_empty_string(self):
+        """Empty string should be quoted."""
+        df = pd.DataFrame({"col1": ["", ""], "col2": ["a", "b"]})
+        result = to_cadenza_csv(df)
+        assert result == '"col1";"col2"\r\n"";"a"\r\n"";"b"\r\n'
+
+    def test_none_values(self):
+        """None values should be unquoted."""
+        df = pd.DataFrame({"col1": [None, "value"], "col2": ["value2", None]})
+        result = to_cadenza_csv(df)
+        assert result == '"col1";"col2"\r\n;"value2"\r\n"value";\r\n'
result == '"col1";"col2"\r\n;"value2"\r\n"value";\r\n' + + def test_nan_values(self): + """NaN values should be unquoted (treated as None).""" + df = pd.DataFrame({"col1": [np.nan, 1.5], "col2": [2.5, np.nan]}) + result = to_cadenza_csv(df) + assert result == '"col1";"col2"\r\n;"2.5"\r\n"1.5";\r\n' + + def test_multiple_rows(self): + """Multiple data rows.""" + df = pd.DataFrame({"name": ["Alice", "Bob"], "age": ["30", None]}) + result = to_cadenza_csv(df) + assert result == '"name";"age"\r\n"Alice";"30"\r\n"Bob";\r\n' + + def test_escaped_quotes(self): + """Double quotes should be escaped.""" + df = pd.DataFrame({"text": ['He said "hello"']}) + result = to_cadenza_csv(df) + assert result == '"text"\r\n"He said ""hello"""\r\n' + + def test_semicolon_in_value(self): + """Semicolons should be preserved in quoted values.""" + df = pd.DataFrame({"col1": ["a;b;c"], "col2": ["normal"]}) + result = to_cadenza_csv(df) + assert result == '"col1";"col2"\r\n"a;b;c";"normal"\r\n' + + def test_newline_in_value(self): + """Newlines should be preserved in quoted values.""" + df = pd.DataFrame({"text": ["line1\r\nline2"]}) + result = to_cadenza_csv(df) + assert result == '"text"\r\n"line1\r\nline2"\r\n' + + def test_numbers(self): + """Numbers should be quoted as strings.""" + df = pd.DataFrame({"int": ["123"], "float": ["45.67"]}) + result = to_cadenza_csv(df) + assert result == '"int";"float"\r\n"123";"45.67"\r\n' + + def test_datetime_formatting(self): + """Datetime columns should be formatted as ISO8601.""" + df = pd.DataFrame({ + "timestamp": [pd.Timestamp("2023-01-03T15:29:13Z")], + "value": [42] + }) + result = to_cadenza_csv(df, datetime_columns=["timestamp"]) + assert result == '"timestamp";"value"\r\n"2023-01-03T15:29:13Z";"42"\r\n' + + def test_datetime_with_none(self): + """Datetime formatting should handle None values.""" + df = pd.DataFrame({ + "timestamp": [pd.Timestamp("2023-01-03T15:29:13Z"), None] + }) + result = to_cadenza_csv(df, datetime_columns=["timestamp"]) + assert result == '"timestamp"\r\n"2023-01-03T15:29:13Z"\r\n\r\n' + + def test_geometry_formatting(self): + """Geometry columns should be converted to WKT.""" + df = pd.DataFrame({ + "location": [Point(1, 2)], + "name": ["Place A"] + }) + result = to_cadenza_csv(df, geometry_columns=["location"]) + assert result == '"location";"name"\r\n"POINT (1 2)";"Place A"\r\n' + + def test_geometry_with_none(self): + """Geometry formatting should handle None values.""" + df = pd.DataFrame({ + "location": [Point(1, 2), None] + }) + result = to_cadenza_csv(df, geometry_columns=["location"]) + assert result == '"location"\r\n"POINT (1 2)"\r\n\r\n' + + def test_geometry_multiple_types(self): + """Geometry formatting should handle different geometry types.""" + df = pd.DataFrame({ + "geom": [Point(1, 2), LineString([(0, 0), (1, 1)])] + }) + result = to_cadenza_csv(df, geometry_columns=["geom"]) + assert result == '"geom"\r\n"POINT (1 2)"\r\n"LINESTRING (0 0, 1 1)"\r\n' + + def test_combined_datetime_geometry(self): + """Test combining datetime and geometry formatting.""" + df = pd.DataFrame({ + "id": [1], + "timestamp": [pd.Timestamp("2023-01-03T15:29:13Z")], + "location": [Point(10, 20)], + "value": [42.5] + }) + result = to_cadenza_csv( + df, + datetime_columns=["timestamp"], + geometry_columns=["location"] + ) + assert result == '"id";"timestamp";"location";"value"\r\n"1";"2023-01-03T15:29:13Z";"POINT (10 20)";"42.5"\r\n' + + def test_all_none_row(self): + """Row with all None values.""" + df = pd.DataFrame({"a": [None], "b": [None], "c": 
[None]}) + result = to_cadenza_csv(df) + assert result == '"a";"b";"c"\r\n;;\r\n' + + def test_mixed_row(self): + """Row with mixed None and values.""" + df = pd.DataFrame({"col1": [""], "col2": [None], "col3": ["def"]}) + result = to_cadenza_csv(df) + assert result == '"col1";"col2";"col3"\r\n"";;"def"\r\n' + + def test_single_column(self): + """Single column CSV.""" + df = pd.DataFrame({"only": ["val1", "val2"]}) + result = to_cadenza_csv(df) + assert result == '"only"\r\n"val1"\r\n"val2"\r\n' + + def test_single_column_with_none(self): + """Single column with None value.""" + df = pd.DataFrame({"col": ["val", None]}) + result = to_cadenza_csv(df) + assert result == '"col"\r\n"val"\r\n\r\n' + + def test_pandas_na(self): + """pd.NA values should be unquoted (empty line for single column).""" + df = pd.DataFrame({"col": [pd.NA, "value"]}, dtype="string") + result = to_cadenza_csv(df) + # Single column with None creates empty line (no semicolon) + assert result == '"col"\r\n\r\n"value"\r\n' + + def test_numeric_dtypes(self): + """Numeric dtypes Int64 and Float64 should work correctly.""" + df = pd.DataFrame({"int_col": [123, 456, None], "float_col": [1.5, None, 3.7]}) + df = df.astype({"int_col": "Int64", "float_col": "Float64"}) + result = to_cadenza_csv(df) + assert result == '"int_col";"float_col"\r\n"123";"1.5"\r\n"456";\r\n;"3.7"\r\n' + + def test_unicode_characters(self): + """Unicode characters should be preserved.""" + df = pd.DataFrame({"text": ["Hello 世界", "Emoji: 😀🎉", "Français: café"]}) + result = to_cadenza_csv(df) + assert result == '"text"\r\n"Hello 世界"\r\n"Emoji: 😀🎉"\r\n"Français: café"\r\n' + + def test_very_long_value(self): + """Very long values should be handled correctly.""" + long_text = "a" * 10000 + df = pd.DataFrame({"text": [long_text]}) + result = to_cadenza_csv(df) + assert long_text in result + assert result.startswith('"text"\r\n"') + + def test_many_columns(self): + """Many columns should be handled correctly.""" + num_cols = 100 + data = {f"col{i}": [f"val{i}"] for i in range(num_cols)} + df = pd.DataFrame(data) + result = to_cadenza_csv(df) + assert result.count(';') == num_cols - 1 + num_cols - 1 # header + data row + + def test_whitespace_only_value(self): + """Whitespace-only values should be quoted and preserved.""" + df = pd.DataFrame({"col": [" ", " \t ", " \n "]}) + result = to_cadenza_csv(df) + assert result == '"col"\r\n" "\r\n" \t "\r\n" \n "\r\n' + + def test_special_characters_in_column_names(self): + """Special characters in column names should be quoted properly.""" + df = pd.DataFrame({"col;1": ["a"], 'col"2': ["b"], "col\r\n3": ["c"]}) + result = to_cadenza_csv(df) + assert result == '"col;1";"col""2";"col\r\n3"\r\n"a";"b";"c"\r\n' + + def test_numeric_string_values(self): + """Numeric-looking strings should be quoted.""" + df = pd.DataFrame({"code": ["00123", "001.5", "+123", "-456"]}) + result = to_cadenza_csv(df) + assert result == '"code"\r\n"00123"\r\n"001.5"\r\n"+123"\r\n"-456"\r\n' + + def test_boolean_values_as_strings(self): + """Boolean-like values should be converted to strings.""" + df = pd.DataFrame({"flag": ["true", "false", "True", "False"]}) + result = to_cadenza_csv(df) + assert result == '"flag"\r\n"true"\r\n"false"\r\n"True"\r\n"False"\r\n' + + def test_scientific_notation_numbers(self): + """Numbers in scientific notation should be preserved.""" + df = pd.DataFrame({"value": ["1.5e-10", "3.14e+20"]}) + result = to_cadenza_csv(df) + assert result == '"value"\r\n"1.5e-10"\r\n"3.14e+20"\r\n' + + def 
test_multiple_consecutive_none_values(self):
+        """Multiple consecutive None values should be unquoted."""
+        df = pd.DataFrame({"a": ["val1", None, None, None], "b": [None, None, None, "val2"]})
+        result = to_cadenza_csv(df)
+        assert result == '"a";"b"\r\n"val1";\r\n;\r\n;\r\n;"val2"\r\n'
+
+    def test_mixed_newlines_in_value(self):
+        """Different newline types in values should be preserved."""
+        df = pd.DataFrame({"text": ["line1\nline2", "line3\rline4", "line5\r\nline6"]})
+        result = to_cadenza_csv(df)
+        assert result == '"text"\r\n"line1\nline2"\r\n"line3\rline4"\r\n"line5\r\nline6"\r\n'
+
+    def test_datetime_with_microseconds(self):
+        """Microseconds should be truncated when formatting datetimes."""
+        df = pd.DataFrame({
+            "timestamp": [pd.Timestamp("2023-01-03T15:29:13.123456Z")]
+        })
+        result = to_cadenza_csv(df, datetime_columns=["timestamp"])
+        assert result == '"timestamp"\r\n"2023-01-03T15:29:13Z"\r\n'
+
+    def test_datetime_with_timezone_info(self):
+        """Datetime with timezone should format correctly."""
+        df = pd.DataFrame({
+            "timestamp": [pd.Timestamp("2023-01-03T15:29:13", tz="UTC")]
+        })
+        result = to_cadenza_csv(df, datetime_columns=["timestamp"])
+        assert '"2023-01-03T15:29:13Z' in result
+
+    def test_non_pandas_datetime_with_timezone_info(self):
+        """Datetime with non-pandas timezone should format correctly."""
+        df = pd.DataFrame({
+            "timestamp": [datetime(2023, 1, 3, 15, 29, 13, tzinfo=timezone(timedelta(hours=1)))]
+        })
+        result = to_cadenza_csv(df, datetime_columns=["timestamp"])
+        assert pd.api.types.is_datetime64_any_dtype(df["timestamp"])
+        assert '"2023-01-03T15:29:13+01:00' in result
+
+    def test_non_pandas_non_dtype_datetime_with_timezone_info(self):
+        """Datetimes with mixed timezone offsets (object dtype column) should format correctly."""
+        df = pd.DataFrame({
+            "timestamp": [datetime(2023, 1, 3, 15, 29, 13, tzinfo=timezone(timedelta(hours=1))),
+                          datetime(2024, 1, 3, 15, 29, 13, tzinfo=timezone(timedelta(hours=2)))]
+        })
+        assert not pd.api.types.is_datetime64_any_dtype(df["timestamp"])
+        result = to_cadenza_csv(df, datetime_columns=["timestamp"])
+        assert '"2023-01-03T15:29:13+01:00' in result
+
+    def test_geometry_linestring_and_polygon(self):
+        """Different geometry types should convert to WKT correctly."""
+        df = pd.DataFrame({
+            "geom": [
+                LineString([(0, 0), (1, 1)]),
+                Polygon([(0, 0), (1, 0), (1, 1), (0, 1), (0, 0)])
+            ]
+        })
+        result = to_cadenza_csv(df, geometry_columns=["geom"])
+        assert 'LINESTRING' in result
+        assert 'POLYGON' in result
+
+    def test_geometry_multipoint_and_multilinestring(self):
+        """Multi-geometry types should convert to WKT correctly."""
+        df = pd.DataFrame({
+            "geom": [
+                MultiPoint([(0, 0), (1, 1)]),
+                MultiLineString([[(0, 0), (1, 1)], [(2, 2), (3, 3)]])
+            ]
+        })
+        result = to_cadenza_csv(df, geometry_columns=["geom"])
+        assert 'MULTIPOINT' in result
+        assert 'MULTILINESTRING' in result
+
+    def test_empty_string_vs_none_distinction(self):
+        """Empty strings and None values must be clearly distinguished."""
+        df = pd.DataFrame({
+            "col1": ["", None, "", None],
+            "col2": [None, "", None, ""],
+        })
+        result = to_cadenza_csv(df)
+        expected = '"col1";"col2"\r\n"";\r\n;""\r\n"";\r\n;""\r\n'
+        assert result == expected
+
+    def test_large_dataframe(self):
+        """Large dataframe should be handled correctly."""
+        num_rows = 1000
+        df = pd.DataFrame({
+            "id": [str(i) for i in range(num_rows)],
+            "value": [f"val{i}" for i in range(num_rows)]
+        })
+        result = to_cadenza_csv(df)
+        # Check header + 1000 data rows + final CRLF
+        assert result.count('\r\n') == num_rows + 1
+
+    def 
def test_mixed_types_conversion(self): + """Mixed types should all be converted to strings.""" + df = pd.DataFrame({ + "mixed": [123, "text", 45.67, True, None] + }) + result = to_cadenza_csv(df) + lines = result.split('\r\n') + assert lines[0] == '"mixed"' + assert lines[1] == '"123"' + assert lines[2] == '"text"' + assert lines[3] == '"45.67"' + assert lines[4] == '"True"' + assert lines[5] == '' # None becomes unquoted (empty) + + def test_zero_values(self): + """Zero values should be quoted correctly.""" + df = pd.DataFrame({"int": ["0"], "float": ["0.0"], "text": ["000"]}) + result = to_cadenza_csv(df) + assert result == '"int";"float";"text"\r\n"0";"0.0";"000"\r\n' + + def test_negative_numbers(self): + """Negative numbers should be quoted correctly.""" + df = pd.DataFrame({"num": ["-123", "-45.67", "-0"]}) + result = to_cadenza_csv(df) + assert result == '"num"\r\n"-123"\r\n"-45.67"\r\n"-0"\r\n' + + def test_consecutive_quotes(self): + """Multiple consecutive quotes should be escaped correctly.""" + df = pd.DataFrame({"text": ['He said ""hello""', '"""']}) + result = to_cadenza_csv(df) + assert result == '"text"\r\n"He said """"hello"""""\r\n""""""""\r\n' + + def test_tab_characters(self): + """Tab characters should be preserved in quoted values.""" + df = pd.DataFrame({"text": ["col1\tcol2", "a\tb\tc"]}) + result = to_cadenza_csv(df) + assert result == '"text"\r\n"col1\tcol2"\r\n"a\tb\tc"\r\n' + + def test_datetime_timezone_aware_preservation(self): + """Timezone-aware datetime values should preserve timezone information.""" + df = pd.DataFrame({ + "timestamp": [ + pd.Timestamp("2023-01-03T15:29:13Z"), + pd.Timestamp("2023-01-03T15:29:13+00:00"), + ] + }) + result = to_cadenza_csv(df, datetime_columns=["timestamp"]) + lines = result.split('\r\n') + assert lines[0] == '"timestamp"' + # Both timestamps should be formatted with timezone info + # isoformat() outputs these consistently + assert '2023-01-03' in lines[1] + assert '2023-01-03' in lines[2] + + def test_datetime_column_all_none(self): + """Datetime column with all None values.""" + df = pd.DataFrame({ + "timestamp": [None, None, None], + "value": ["a", "b", "c"] + }) + df["timestamp"] = pd.to_datetime(df["timestamp"]) + result = to_cadenza_csv(df, datetime_columns=["timestamp"]) + assert result == '"timestamp";"value"\r\n;"a"\r\n;"b"\r\n;"c"\r\n' + + def test_float_nan_values(self): + """Float NaN values without float_columns specified should be treated as None (unquoted).""" + df = pd.DataFrame({ + "values": [1.5, np.nan, 3.7, np.nan] + }) + result = to_cadenza_csv(df) + assert result == '"values"\r\n"1.5"\r\n\r\n"3.7"\r\n\r\n' + + def test_float_nan_values_with_float_column_metadata(self): + """Float NaN values with float_columns specified should output literal "NaN".""" + df = pd.DataFrame({ + "values": [1.5, np.nan, 3.7, np.nan] + }) + result = to_cadenza_csv(df, float_columns=["values"]) + assert result == '"values"\r\n"1.5"\r\n"NaN"\r\n"3.7"\r\n"NaN"\r\n' + + def test_mixed_columns_with_nan(self): + """Mixed column types with NaN - only float columns should output "NaN".""" + df = pd.DataFrame({ + "int_col": [1, None, 3], + "float_col": [1.5, np.nan, 3.7], + "str_col": ["a", None, "c"] + }) + result = to_cadenza_csv(df, float_columns=["float_col"], int_columns=["int_col"]) + # int_col: pandas converts to float, but we convert back to int for output + assert result == '"int_col";"float_col";"str_col"\r\n"1";"1.5";"a"\r\n;"NaN";\r\n"3";"3.7";"c"\r\n' + + def test_int64_and_float64_with_nan(self): + """INT64 with None 
should output empty, FLOAT64 with None should output "NaN"."""
+        df = pd.DataFrame({
+            "int_col": pd.array([1, None, 3], dtype="Int64"),
+            "float_col": pd.array([1.5, None, 3.7], dtype="Float64")
+        })
+        result = to_cadenza_csv(df, float_columns=["float_col"], int_columns=["int_col"])
+        # INT64 None = empty, FLOAT64 None = "NaN"
+        assert result == '"int_col";"float_col"\r\n"1";"1.5"\r\n;"NaN"\r\n"3";"3.7"\r\n'
+
+    def test_int64_explicit_metadata(self):
+        """INT64 columns with int_columns specified should output empty for None."""
+        df = pd.DataFrame({
+            "int_col1": pd.array([1, None, 3], dtype="Int64"),
+            "int_col2": pd.array([10, 20, None], dtype="Int64")
+        })
+        result = to_cadenza_csv(df, int_columns=["int_col1", "int_col2"])
+        # Both INT64 columns should have empty for None
+        assert result == '"int_col1";"int_col2"\r\n"1";"10"\r\n;"20"\r\n"3";\r\n'
+
+    def test_int64_explicit_from_float(self):
+        """Float64 values in columns listed in int_columns should be truncated to integers."""
+        df = pd.DataFrame({
+            "int_col1": pd.array([1, 2, 3], dtype="Int64"),
+            "int_col2": pd.array([10.0, 20.5, 13.6], dtype="Float64")
+        })
+        result = to_cadenza_csv(df, int_columns=["int_col1", "int_col2"])
+        # The Float64 column is truncated to whole numbers on output
+        assert result == '"int_col1";"int_col2"\r\n"1";"10"\r\n"2";"20"\r\n"3";"13"\r\n'
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/src/cadenzaanalytics/util/__init__.py b/src/cadenzaanalytics/util/__init__.py
new file mode 100644
index 0000000..f9fe724
--- /dev/null
+++ b/src/cadenzaanalytics/util/__init__.py
@@ -0,0 +1,3 @@
+from cadenzaanalytics.util.csv import from_cadenza_csv, to_cadenza_csv
+
+__all__ = ['from_cadenza_csv', 'to_cadenza_csv']
diff --git a/src/cadenzaanalytics/util/csv.py b/src/cadenzaanalytics/util/csv.py
new file mode 100644
index 0000000..0ed2446
--- /dev/null
+++ b/src/cadenzaanalytics/util/csv.py
@@ -0,0 +1,283 @@
+from typing import Dict, List, Optional
+
+import numpy as np
+import pandas as pd
+from shapely import from_wkt, to_wkt
+
+
+def from_cadenza_csv(
+    csv_data: str,
+    type_mapping: Optional[Dict[str, str]] = None,
+    datetime_columns: Optional[List[str]] = None,
+    geometry_columns: Optional[List[str]] = None
+) -> pd.DataFrame:
+    """Parse Cadenza CSV format into a pandas DataFrame.
+
+    Cadenza CSV specs:
+    - Text encoding is UTF-8
+    - Values are separated by semicolons (;)
+    - Values enclosed by double quotes (") are considered present
+    - Values not quoted are considered None/Null/missing
+    - Numbers are decimal with dot (.) 
as decimal separator
+    - Lines separated by CRLF (\\r\\n)
+    - DateTimes follow ISO8601 format (e.g., 2023-01-03T15:29:13Z)
+    - Geometries are WKT strings that get parsed to shapely geometries
+
+    Parameters
+    ----------
+    csv_data : str
+        The CSV data as a string
+    type_mapping : Optional[Dict[str, str]]
+        Optional mapping of column names to pandas dtypes
+    datetime_columns : Optional[List[str]]
+        List of column names to parse as ISO8601 datetimes
+    geometry_columns : Optional[List[str]]
+        List of column names to parse as WKT geometries
+
+    Returns
+    -------
+    pd.DataFrame
+        Parsed dataframe with proper None values for unquoted fields
+    """
+    if not csv_data or not csv_data.strip():
+        return pd.DataFrame()
+
+    # Parse all rows (header + data) respecting quoted fields with embedded newlines
+    all_rows = _parse_csv(csv_data)
+    if not all_rows:
+        return pd.DataFrame()
+
+    # First row is headers
+    headers = all_rows[0]
+    parsed_rows = all_rows[1:]
+
+    # Create DataFrame
+    df = pd.DataFrame(parsed_rows, columns=headers)
+
+    # Apply type mappings if provided
+    if type_mapping:
+        for col, dtype in type_mapping.items():
+            if col in df.columns:
+                df[col] = df[col].astype(dtype)
+
+    # Parse datetime columns
+    if datetime_columns:
+        for col in datetime_columns:
+            if col in df.columns:
+                # Parse without format specification to handle various ISO8601 timezone formats
+                # This preserves the original timezone information
+                df[col] = pd.to_datetime(df[col], errors='coerce')
+
+    # Parse WKT geometries into shapely geometry objects
+    if geometry_columns:
+        for col in geometry_columns:
+            if col in df.columns:
+                values = df[col].to_numpy()
+                df[col] = from_wkt(values, on_invalid='warn')
+
+    return df
+
+
+# pylint: disable=too-many-branches,too-many-nested-blocks,too-many-locals
+def _parse_csv(csv_data: str):
+    """Parse entire CSV data respecting quoted fields with embedded newlines.
+
+    Returns list of rows, where each row is a list of values. 
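+
+    For example, '"a";"b"\\r\\n"1";\\r\\n' parses to [['a', 'b'], ['1', None]]:
+    quoted fields are present, the unquoted trailing field is None.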
+ """ + rows = [] + pos = 0 + + while pos < len(csv_data): + row = [] + last_was_semicolon = False + + # Parse one row (until we hit CRLF that's not inside a quoted field) + while pos < len(csv_data): + # Check for row terminator first + if csv_data[pos:pos+2] == '\r\n': + # If last character before CRLF was semicolon, add trailing None + if last_was_semicolon: + row.append(None) + break + + # Reset flag + last_was_semicolon = False + + # Check if value is quoted + if csv_data[pos] == '"': + # Quoted value - extract content (can contain newlines) + pos += 1 + value = [] + while pos < len(csv_data): + if csv_data[pos] == '"': + # Check for escaped quote + if pos + 1 < len(csv_data) and csv_data[pos + 1] == '"': + value.append('"') + pos += 2 + else: + # End of quoted value + pos += 1 + break + else: + value.append(csv_data[pos]) + pos += 1 + row.append(''.join(value)) + + # Move past semicolon if present + if pos < len(csv_data) and csv_data[pos] == ';': + pos += 1 + last_was_semicolon = True + elif csv_data[pos] == ';': + # Semicolon at current position = unquoted (None) value before it + row.append(None) + pos += 1 + last_was_semicolon = True + else: + # Any other character at start of field = unquoted = None + row.append(None) + while pos < len(csv_data) and csv_data[pos] != ';' and csv_data[pos:pos+2] != '\r\n': + pos += 1 + if pos < len(csv_data) and csv_data[pos] == ';': + pos += 1 + last_was_semicolon = True + + # If we ended at EOF and last was semicolon, add trailing None + if pos >= len(csv_data) and last_was_semicolon: + row.append(None) + + # Add row (even if empty - empty row means single None value) + if row or (pos < len(csv_data) and csv_data[pos:pos+2] == '\r\n'): + rows.append(row if row else [None]) + + # Skip CRLF + if pos < len(csv_data) and csv_data[pos:pos+2] == '\r\n': + pos += 2 + + return rows + + +def to_cadenza_csv( + df: pd.DataFrame, + datetime_columns: Optional[List[str]] = None, + geometry_columns: Optional[List[str]] = None, + float_columns: Optional[List[str]] = None, + int_columns: Optional[List[str]] = None +) -> str: + """Convert a pandas DataFrame to Cadenza CSV format. + + Cadenza CSV specs: + - Text encoding is UTF-8 + - Values are separated by semicolons (;) + - Values enclosed by double quotes (") are considered present + - Values not quoted are considered None/Null/missing + - Numbers are decimal with dot (.) 
as decimal separator + - Lines separated by CRLF (\\r\\n) + - DateTimes follow ISO8601 format (e.g., 2023-01-03T15:29:13Z) + - Geometries are converted to WKT strings + - Float columns with NaN values output as the literal string "NaN" (quoted) + - Int columns with None values output as empty/unquoted (not "NaN") + + Parameters + ---------- + df : pd.DataFrame + The DataFrame to convert + datetime_columns : Optional[List[str]] + List of column names to format as ISO8601 datetimes + geometry_columns : Optional[List[str]] + List of column names to convert from shapely geometries to WKT + float_columns : Optional[List[str]] + List of column names that are float types (NaN will be output as "NaN") + int_columns : Optional[List[str]] + List of column names that are int types (None will be output as empty) + + Returns + ------- + str + CSV data as a string + """ + if df.empty and len(df.columns) == 0: + return "" + + # Build CSV string + lines = [] + + # Write header (no special formatting for header row) + columns_list = df.columns.tolist() + lines.append(_format_row(columns_list, columns_list, None, None, None, None)) + + # Write data rows + for _, row in df.iterrows(): + lines.append(_format_row( + row.tolist(), + columns_list, + float_columns, + int_columns, + datetime_columns, + geometry_columns)) + + return '\r\n'.join(lines) + '\r\n' + + +def _format_row( + values: List, + columns: List[str], + float_columns: Optional[List[str]] = None, + int_columns: Optional[List[str]] = None, + datetime_columns: Optional[List[str]] = None, + geometry_columns: Optional[List[str]] = None +) -> str: + """Format a row of values according to Cadenza CSV rules. + + - None/NaN/pd.NA values are unquoted for non-float types (represented as empty) + - For float type columns, NaN values are output as the literal string "NaN" (quoted) + - For int type columns, None values are explicitly output as empty (unquoted) + - All other values are quoted + - Quotes within values are escaped by doubling them + - Datetime values are formatted as ISO8601 strings + - Geometry values are converted to WKT strings + """ + formatted_values = [] + float_cols_set = set(float_columns) if float_columns else set() + int_cols_set = set(int_columns) if int_columns else set() + datetime_cols_set = set(datetime_columns) if datetime_columns else set() + geometry_cols_set = set(geometry_columns) if geometry_columns else set() + + for i, value in enumerate(values): + col_name = columns[i] if i < len(columns) else None + + # Check for None/NaN/pd.NA first (before checking column type) + if value is None or (isinstance(value, float) and np.isnan(value)) or pd.isna(value): + # For float columns, output literal "NaN"; for int/others, unquoted empty + if col_name and col_name in float_cols_set: + formatted_values.append('"NaN"') + else: + # Int columns and all other types output empty for None/NaN + formatted_values.append('') + # Handle datetime columns + elif col_name and col_name in datetime_cols_set: + # Use isoformat() and replace microseconds + iso_str = value.isoformat(timespec='seconds') + # Normalize +00:00 to Z for consistency + if iso_str.endswith('+00:00'): + iso_str = iso_str[:-6] + 'Z' + formatted_values.append(f'"{iso_str}"') + # Handle geometry columns + elif col_name and col_name in geometry_cols_set: + str_value = to_wkt(value) + formatted_values.append(f'"{str_value}"') + # Handle int columns - convert float to int if needed + elif col_name and col_name in int_cols_set: + # Convert to int first (handles case where pandas 
converted int to float due to None)
+            if isinstance(value, float):
+                int_value = int(value)
+            else:
+                int_value = value
+            formatted_values.append(f'"{int_value}"')
+        else:
+            # Convert to string and quote it
+            str_value = str(value)
+            # Escape quotes by doubling them
+            str_value = str_value.replace('"', '""')
+            formatted_values.append(f'"{str_value}"')
+
+    return ';'.join(formatted_values)

From 6e64ade02bc90147474c3bfe7d404ea902e21a28 Mon Sep 17 00:00:00 2001
From: dittmar
Date: Mon, 26 Jan 2026 23:03:49 +0100
Subject: [PATCH 04/13] decrease required python version to 3.11

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index eacbe01..c03324a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,7 +22,7 @@ classifiers = [
 ]
 
 [tool.poetry.dependencies]
-python = "^3.12"
+python = "^3.11"
 Flask = "3.1.2"
 Werkzeug = "3.1.4"
 Flask-Cors = "6.0.1"

From 4e405cf1f9b434fe2b00da135ce17dfbacfa7ade Mon Sep 17 00:00:00 2001
From: dittmar
Date: Tue, 27 Jan 2026 10:16:58 +0100
Subject: [PATCH 05/13] add changelog entries

---
 CHANGELOG.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 74affde..5d21030 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Changed
 - Improved type annotations for mappings and parameter values
+- Switched to custom CSV handling instead of pandas CSV handling to fix various edge cases
+- Minimum required Python version reduced to 3.11
+
+### Fixed
+- The `basic-extension` example enrichment now actually enriches the data
 
 ## 10.4.0 - 2025-12-05
 ### Added

From 71b694c3f207df046222f7d4627350945c6b2233 Mon Sep 17 00:00:00 2001
From: dittmar
Date: Thu, 5 Feb 2026 09:38:20 +0100
Subject: [PATCH 06/13] adapt csv parsing to work with pandas 3.0.0

---
 src/cadenzaanalytics/util/csv.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/cadenzaanalytics/util/csv.py b/src/cadenzaanalytics/util/csv.py
index 0ed2446..ca7fd2c 100644
--- a/src/cadenzaanalytics/util/csv.py
+++ b/src/cadenzaanalytics/util/csv.py
@@ -51,8 +51,10 @@ def from_cadenza_csv(
     headers = all_rows[0]
     parsed_rows = all_rows[1:]
 
-    # Create DataFrame
-    df = pd.DataFrame(parsed_rows, columns=headers)
+    # Create DataFrame, use dtype=object to preserve None values (behavior changes with
+    # pandas 3.0.0, where None values without a specified dtype are converted to NaN
+    # when a specific dtype is inferred from the other values in the column)
+    df = pd.DataFrame(parsed_rows, columns=headers, dtype=object)
 
     # Apply type mappings if provided
     if type_mapping:

From b3c0dd3848d8236a99e9294a6cc06bb73931ab32 Mon Sep 17 00:00:00 2001
From: dittmar
Date: Thu, 5 Feb 2026 09:38:46 +0100
Subject: [PATCH 07/13] add test case with confirmed output from cadenza
 concerning newline handling

---
 src/cadenzaanalytics/tests/test_csv.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/cadenzaanalytics/tests/test_csv.py b/src/cadenzaanalytics/tests/test_csv.py
index 15b8cf3..b8bcac5 100644
--- a/src/cadenzaanalytics/tests/test_csv.py
+++ b/src/cadenzaanalytics/tests/test_csv.py
@@ -74,6 +74,14 @@ def test_newline_in_quoted_value(self):
         result = from_cadenza_csv(csv)
         assert result.iloc[0, 0] == "line1\r\nline2"
 
+    def test_newline_with_real_cadenza_output(self):
+        """Handles None and a newline within a quoted value correctly."""
+        csv = '"a";"b";"c"\r\n;"e\r\nxd";"f"\r\n'
+        result = 
from_cadenza_csv(csv) + assert result.iloc[0, 0] is None + assert result.iloc[0, 1] == "e\r\nxd" + assert result.iloc[0, 2] == "f" + def test_trailing_empty_line(self): """Trailing CRLF is just row terminator, not an additional row.""" csv = '"col"\r\n"val"\r\n' @@ -161,14 +169,14 @@ def test_single_column(self): def test_leading_unquoted_value(self): """Line starting with unquoted value.""" - csv = '"col1";"col2"\r\nabc;"def"' + csv = '"col1";"col2"\r\n;"def"' result = from_cadenza_csv(csv) assert result.iloc[0, 0] is None assert result.iloc[0, 1] == "def" def test_trailing_unquoted_value(self): """Line ending with unquoted value.""" - csv = '"col1";"col2"\r\n"abc";xyz' + csv = '"col1";"col2"\r\n"abc";' result = from_cadenza_csv(csv) assert result.iloc[0, 0] == "abc" assert result.iloc[0, 1] is None From 397bbfbb0db9b0f7d330e00fa8d47eeab287a360 Mon Sep 17 00:00:00 2001 From: dittmar Date: Thu, 5 Feb 2026 09:46:09 +0100 Subject: [PATCH 08/13] for python versions after 3.12 prefer to use standard csv.reader instead of custom reader --- src/cadenzaanalytics/util/csv.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/cadenzaanalytics/util/csv.py b/src/cadenzaanalytics/util/csv.py index ca7fd2c..d84f0d3 100644 --- a/src/cadenzaanalytics/util/csv.py +++ b/src/cadenzaanalytics/util/csv.py @@ -1,3 +1,6 @@ +import csv +import sys +from io import StringIO from typing import Dict, List, Optional import numpy as np @@ -42,8 +45,24 @@ def from_cadenza_csv( if not csv_data or not csv_data.strip(): return pd.DataFrame() - # Parse all rows (header + data) respecting quoted fields with embedded newlines - all_rows = _parse_csv(csv_data) + all_rows = [] + if sys.version_info >= (3, 13): + # QUOTE_NOTNULL was only fixed for Python 3.13+ in the csv reader + # see https://github.com/python/cpython/issues/113732 + class CadenzaDialect(csv.excel): + delimiter = ';' + quotechar = '"' + doublequote = True + lineterminator = '\r\n' + quoting = csv.QUOTE_NOTNULL + skipinitialspace = False + + reader = csv.reader(StringIO(csv_data), dialect=CadenzaDialect) + + for row in reader: + all_rows.append(row) + else: + all_rows = _parse_csv(csv_data) if not all_rows: return pd.DataFrame() From 9fb7c82f5891d761e88ce5b088a583f85f53ea03 Mon Sep 17 00:00:00 2001 From: dittmar Date: Thu, 5 Feb 2026 09:50:50 +0100 Subject: [PATCH 09/13] extract method for parsing CSV with two different implementations depending on version --- src/cadenzaanalytics/util/csv.py | 40 ++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/src/cadenzaanalytics/util/csv.py b/src/cadenzaanalytics/util/csv.py index d84f0d3..271d625 100644 --- a/src/cadenzaanalytics/util/csv.py +++ b/src/cadenzaanalytics/util/csv.py @@ -45,24 +45,10 @@ def from_cadenza_csv( if not csv_data or not csv_data.strip(): return pd.DataFrame() - all_rows = [] - if sys.version_info >= (3, 13): - # QUOTE_NOTNULL was only fixed for Python 3.13+ in the csv reader - # see https://github.com/python/cpython/issues/113732 - class CadenzaDialect(csv.excel): - delimiter = ';' - quotechar = '"' - doublequote = True - lineterminator = '\r\n' - quoting = csv.QUOTE_NOTNULL - skipinitialspace = False - - reader = csv.reader(StringIO(csv_data), dialect=CadenzaDialect) - - for row in reader: - all_rows.append(row) - else: - all_rows = _parse_csv(csv_data) + all_rows = _parse_csv_with_default_reader(csv_data) \ + if sys.version_info >= (3, 13) \ + else _parse_csv(csv_data) + if not all_rows: return 
pd.DataFrame()
 
@@ -99,6 +85,24 @@ class CadenzaDialect(csv.excel):
     return df
 
 
+def _parse_csv_with_default_reader(csv_data: str) -> List[List[Optional[str]]]:
+    # QUOTE_NOTNULL was only fixed for Python 3.13+ in the csv reader
+    # see https://github.com/python/cpython/issues/113732
+    all_rows = []
+    class CadenzaDialect(csv.excel):
+        delimiter = ';'
+        quotechar = '"'
+        doublequote = True
+        lineterminator = '\r\n'
+        quoting = csv.QUOTE_NOTNULL
+        skipinitialspace = False
+
+    reader = csv.reader(StringIO(csv_data), dialect=CadenzaDialect)
+
+    for row in reader:
+        all_rows.append(row)
+    return all_rows
+
 # pylint: disable=too-many-branches,too-many-nested-blocks,too-many-locals
 def _parse_csv(csv_data: str):
     """Parse entire CSV data respecting quoted fields with embedded newlines.

From af0f91603ff97ed30b450cf1cd0a0659802b81ef Mon Sep 17 00:00:00 2001
From: dittmar
Date: Thu, 5 Feb 2026 10:47:22 +0100
Subject: [PATCH 10/13] change behavior to normalize ZONED_DATE_TIME values
 read from cadenza to utc to have a stable output, always have pandas
 Timestamps and prevent issues with mixed timezone offsets

---
 CHANGELOG.md                           |  1 +
 src/cadenzaanalytics/tests/test_csv.py |  4 +---
 .../tests/test_csv_roundtrip.py        | 24 ++++++++++++++++---
 src/cadenzaanalytics/util/csv.py       |  3 +--
 4 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5d21030..c6cfdec 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - Improved type annotations for mappings and parameter values
 - Switched to custom CSV handling instead of pandas CSV handling to fix various edge cases
 - Minimum required Python version reduced to 3.11
+- Values of data type `ZONED_DATE_TIME` are now received as pandas Timestamps and normalized to UTC
 
 ### Fixed
 - The `basic-extension` example enrichment now actually enriches the data

diff --git a/src/cadenzaanalytics/tests/test_csv.py b/src/cadenzaanalytics/tests/test_csv.py
index b8bcac5..0769d02 100644
--- a/src/cadenzaanalytics/tests/test_csv.py
+++ b/src/cadenzaanalytics/tests/test_csv.py
@@ -404,14 +404,12 @@ def test_datetime_with_different_timezones(self):
             '"2023-01-03T10:29:13-05:00"'
         )
         result = from_cadenza_csv(csv, datetime_columns=["timestamp"])
-        assert not pd.api.types.is_datetime64_any_dtype(result["timestamp"])
-        assert result["timestamp"].dtype == "object"
+        assert pd.api.types.is_datetime64_any_dtype(result["timestamp"])
         assert len(result) == 3
         assert pd.notna(result.iloc[0, 0])
         assert pd.notna(result.iloc[1, 0])
         assert pd.notna(result.iloc[2, 0])
-        # All three timestamps represent the same UTC moment (when compared)
-        # but may have different timezone info preserved
+        # All three timestamps represent the same UTC moment
         assert result.iloc[0, 0] == result.iloc[1, 0]
         assert result.iloc[0, 0] == result.iloc[2, 0]

diff --git a/src/cadenzaanalytics/tests/test_csv_roundtrip.py b/src/cadenzaanalytics/tests/test_csv_roundtrip.py
index e59c3f2..f50f6b3 100644
--- a/src/cadenzaanalytics/tests/test_csv_roundtrip.py
+++ b/src/cadenzaanalytics/tests/test_csv_roundtrip.py
@@ -135,8 +135,12 @@ def test_datetime_with_none_and_timezoneoffset_roundtrip(self):
         csv1 = to_cadenza_csv(df1, datetime_columns=["timestamp"])
         df2 = from_cadenza_csv(csv1, datetime_columns=["timestamp"])
         csv2 = to_cadenza_csv(df2, datetime_columns=["timestamp"])
-        assert csv1 == csv2
+        df3 = from_cadenza_csv(csv2, datetime_columns=["timestamp"])
+        csv3 = to_cadenza_csv(df3, datetime_columns=["timestamp"])
+        assert '2024-06-15T10:00:00+01:00' in csv1  # still contains timezone offset when 
sending to cadenza
+        assert csv2 == csv3  # stabilizing on second roundtrip, as cadenzaanalytics converts timezone offsets to utc
         assert pd.isna(df2.iloc[1, 0])
+        assert pd.isna(df3.iloc[1, 0])
 
     def test_datetime_with_none_and_same_timezoneoffset_roundtrip(self):
         """Datetime with None values (and/or SAME time zone offsets) should roundtrip correctly."""
@@ -146,6 +150,20 @@ def test_datetime_with_none_and_same_timezoneoffset_roundtrip(self):
         csv1 = to_cadenza_csv(df1, datetime_columns=["timestamp"])
         df2 = from_cadenza_csv(csv1, datetime_columns=["timestamp"])
         csv2 = to_cadenza_csv(df2, datetime_columns=["timestamp"])
+        df3 = from_cadenza_csv(csv2, datetime_columns=["timestamp"])
+        csv3 = to_cadenza_csv(df3, datetime_columns=["timestamp"])
+        assert csv2 == csv3  # stabilizing on second roundtrip, as cadenzaanalytics converts timezone offsets to utc
+        assert pd.isna(df2.iloc[1, 0])
+        assert pd.isna(df3.iloc[1, 0])
+
+    def test_datetime_with_none_and_utc_roundtrip(self):
+        """Datetime with None values in UTC should roundtrip identically."""
+        df1 = pd.DataFrame({
+            "timestamp": [pd.Timestamp("2023-01-03T15:29:13Z"), None, pd.Timestamp("2024-06-15T10:00:00Z")]
+        })
+        csv1 = to_cadenza_csv(df1, datetime_columns=["timestamp"])
+        df2 = from_cadenza_csv(csv1, datetime_columns=["timestamp"])
+        csv2 = to_cadenza_csv(df2, datetime_columns=["timestamp"])
         assert csv1 == csv2
         assert pd.isna(df2.iloc[1, 0])
 
@@ -365,7 +383,7 @@ def test_datetime_with_different_timezones_roundtrip(self):
 
-        # Check that output normalizes +00:00 to Z but preserves other offsets
+        # Check that all offsets are normalized to Z (UTC) in the output
         assert '"2023-01-03T15:30:13Z"' in csv1  # First two become Z
-        assert '"2023-01-03T10:29:13-05:00"' in csv1  # -05:00 preserved
+        assert '"2023-01-03T15:29:13Z"' in csv1  # -05:00 converted to utc
 
         # Second roundtrip should be stable
         df2 = from_cadenza_csv(csv1, datetime_columns=["timestamp"])
@@ -375,7 +393,7 @@
         assert len(df2) == 4
-        # Mixed timezones result in object dtype with Timestamp values (not pandas datetime dtype)
-        # This is expected behavior when not all values have the same timezone
-        assert df2["timestamp"].dtype == object
+        # Mixed timezone offsets are normalized to UTC when parsing,
+        # so the column has a pandas datetime dtype
+        assert pd.api.types.is_datetime64_any_dtype(df2["timestamp"])
         assert df2.iloc[1, 0] - df2.iloc[0, 0] == timedelta(minutes=1)
         assert df2.iloc[0, 0] == df2.iloc[2, 0]
         assert pd.isna(df2.iloc[3, 0])

diff --git a/src/cadenzaanalytics/util/csv.py b/src/cadenzaanalytics/util/csv.py
index 271d625..3462853 100644
--- a/src/cadenzaanalytics/util/csv.py
+++ b/src/cadenzaanalytics/util/csv.py
@@ -72,8 +72,7 @@ def from_cadenza_csv(
         for col in datetime_columns:
             if col in df.columns:
                 # Parse without format specification to handle various ISO8601 timezone formats
-                # This preserves the original timezone information
-                df[col] = pd.to_datetime(df[col], errors='coerce')
+                df[col] = pd.to_datetime(df[col], errors='coerce', utc=True)

From 06465c1bed4b67b89ab55d28d41c1d35b4f68566 Mon Sep 17 00:00:00 2001
From: dittmar
Date: Thu, 5 Feb 2026 10:55:22 +0100
Subject: [PATCH 11/13] add cadenzaAnalyticsVersion to capabilities and
 discovery responses

---
 CHANGELOG.md                                                | 2 +-
 src/cadenzaanalytics/cadenza_analytics_extension_service.py | 2 +-
 src/cadenzaanalytics/data/analytics_extension.py            | 5 ++++-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c6cfdec..04b1f88 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@ The format is based on [Keep a 
Changelog](https://keepachangelog.com/en/1.0.0/). ### Added - Support for measure aggregation types MEDIAN and STANDARD_DEVIATION_SAMPLE - +- Added `cadenzaAnalyticsVersion` to capabilities responses to help identify issues when adding or using an extension in Cadenza ### Changed - Improved type annotations for mappings and parameter values diff --git a/src/cadenzaanalytics/cadenza_analytics_extension_service.py b/src/cadenzaanalytics/cadenza_analytics_extension_service.py index 44f387a..4856fa8 100644 --- a/src/cadenzaanalytics/cadenza_analytics_extension_service.py +++ b/src/cadenzaanalytics/cadenza_analytics_extension_service.py @@ -135,5 +135,5 @@ def _list_extensions(self) -> Response: result_dict['extensions'].append({'relativePath': extension.relative_path, 'extensionPrintName': extension.print_name, 'extensionType': extension.extension_type}) - + result_dict['cadenzaAnalyticsVersion'] = __version__ return Response(response=json.dumps(result_dict, default=str), status=200, mimetype="application/json") diff --git a/src/cadenzaanalytics/data/analytics_extension.py b/src/cadenzaanalytics/data/analytics_extension.py index 917f99f..9548e48 100644 --- a/src/cadenzaanalytics/data/analytics_extension.py +++ b/src/cadenzaanalytics/data/analytics_extension.py @@ -4,6 +4,7 @@ from cadenzaanalytics.data.data_object import DataObject from cadenzaanalytics.data.extension_type import ExtensionType from cadenzaanalytics.data.parameter import Parameter +from cadenzaanalytics.version import __version__ class AnalyticsExtension(DataObject): @@ -16,7 +17,8 @@ class AnalyticsExtension(DataObject): "printName": "_print_name", "extensionType": "_extension_type", "attributeGroups": "_attribute_groups", - "parameters": "_parameters" + "parameters": "_parameters", + "cadenzaAnalyticsVersion": "_cadenza_analytics_version" } def __init__(self, @@ -28,6 +30,7 @@ def __init__(self, self._extension_type = extension_type self._attribute_groups = attribute_groups self._parameters = parameters + self._cadenza_analytics_version = __version__ @property def print_name(self) -> str: From aaec5b1363b2392c5d28d9a4268807d715e57b95 Mon Sep 17 00:00:00 2001 From: dittmar Date: Thu, 5 Feb 2026 11:11:32 +0100 Subject: [PATCH 12/13] remove obsolete comment that python versions below 3.12 are not supported and attempt to lower required version to 3.10 --- .github/workflows/ci.yml | 1 - pyproject.toml | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 062e7c9..dc3c414 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -27,7 +27,6 @@ jobs: matrix: python-version: ${{ fromJSON(vars.CI_PYTHON_VERSIONS) }} # build for (last 5) major supported versions to ensure compatibility to some degree, but really recommended, linted and deployed is only the latest - # due to csv-writing issues with versions below 3.12, versions before 3.12 are not supported steps: diff --git a/pyproject.toml b/pyproject.toml index c03324a..c5ad53d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ ] [tool.poetry.dependencies] -python = "^3.11" +python = "^3.10" Flask = "3.1.2" Werkzeug = "3.1.4" Flask-Cors = "6.0.1" From 80d6d09600a2dca7641d1f30c0fc4d5c15e946bc Mon Sep 17 00:00:00 2001 From: dittmar Date: Thu, 5 Feb 2026 13:26:19 +0100 Subject: [PATCH 13/13] add timezone information to the AnalyticsRequest to get some context about the cadenza server timezone, use the python server timezone as fallback --- pyproject.toml | 1 + 
.../cadenza_analytics_extension.py | 21 +++++++++++- .../request/analytics_request.py | 33 ++++++++++++++++++- 3 files changed, 53 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c5ad53d..377dfa2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ pandas = " ^2.0.2" chardet = "5.2.0" Shapely = "2.1.2" pytest = "9.0.2" +tzlocal = "5.3.1" [project] name = "cadenzaanalytics" diff --git a/src/cadenzaanalytics/cadenza_analytics_extension.py b/src/cadenzaanalytics/cadenza_analytics_extension.py index a2dd799..8ec513f 100644 --- a/src/cadenzaanalytics/cadenza_analytics_extension.py +++ b/src/cadenzaanalytics/cadenza_analytics_extension.py @@ -3,7 +3,9 @@ invoked via HTTP POST on the relative path.""" import json import logging +from datetime import datetime from typing import Callable, List, Optional +from tzlocal import get_localzone_name from flask import Response, request @@ -174,7 +176,24 @@ def _get_request_data(self, multipart_request) -> AnalyticsRequest: df_data = None logger.debug('Received request without data') - analytics_request = AnalyticsRequest(parameters, cadenza_version=request.headers.get("X-Disy-Cadenza-Version")) + # use the analytics extension server timezone as a default, assuming they usually + # run in the same timezone as the Cadenza server. Cadenza versions after 10.4 will provide + # these timezone headers + analytics_extension_region = get_localzone_name() + analytics_extension_current_offset = datetime.now().astimezone().strftime('%z') + analytics_extension_current_offset_formatted = analytics_extension_current_offset[:3] + ':' + analytics_extension_current_offset_formatted += analytics_extension_current_offset[3:5] + if len(analytics_extension_current_offset) > 5: + # optional seconds and milliseconds (a dot already separates milliseconds) + analytics_extension_current_offset_formatted += ":" + analytics_extension_current_offset[5:] + + analytics_request = AnalyticsRequest( + parameters, + cadenza_version=request.headers.get("X-Disy-Cadenza-Version"), + cadenza_timezone_region=request.headers.get("X-Disy-Cadenza-Timezone-Region", + default=analytics_extension_region), + cadenza_timezone_current_offset=request.headers.get("X-Disy-Cadenza-Timezone-Current-Offset", + default=analytics_extension_current_offset_formatted)) if has_data: analytics_request[self._table_name] = RequestTable(df_data, metadata) diff --git a/src/cadenzaanalytics/request/analytics_request.py b/src/cadenzaanalytics/request/analytics_request.py index 77c050d..0321f5e 100644 --- a/src/cadenzaanalytics/request/analytics_request.py +++ b/src/cadenzaanalytics/request/analytics_request.py @@ -12,7 +12,11 @@ class AnalyticsRequest(collections.abc.Mapping[str, RequestTable]): access to tables via `request["table_name"]` syntax. """ - def __init__(self, parameters: RequestParameter, cadenza_version: str) -> None: + def __init__(self, + parameters: RequestParameter, + cadenza_version: str, + cadenza_timezone_region: str, + cadenza_timezone_current_offset: str) -> None: """Initialize an AnalyticsRequest. Parameters @@ -21,10 +25,16 @@ def __init__(self, parameters: RequestParameter, cadenza_version: str) -> None: The request parameters provided by Cadenza. cadenza_version : str Version string of the Cadenza instance sending the request. + cadenza_timezone_region : str + The timezone region (e.g. "Europe/Berlin") of the Cadenza instance sending the request. + cadenza_timezone_current_offset : str + The current timezone offset (e.g. 
"+01:00") of the Cadenza instance sending the request. """ self._parameters = parameters self._tables = {} self._cadenza_version = cadenza_version + self._cadenza_timezone_region = cadenza_timezone_region + self._cadenza_timezone_current_offset = cadenza_timezone_current_offset def __getitem__(self, key: str) -> RequestTable: """Returns the request table object by name. @@ -76,3 +86,24 @@ def cadenza_version(self) -> Optional[str]: The Cadenza version string, or None if not provided. """ return self._cadenza_version + + @property + def cadenza_timezone_region(self): + """Get the timezone region of the Cadenza instance that sent the request. If (an older version of) + Cadenza did not send a timezone region, this will be the region of this server. + + :return: Region identifier, such as "Europe/Berlin". + """ + return self._cadenza_timezone_region + + @property + def cadenza_timezone_current_offset(self): + """Get the current timezone offset of the Cadenza instance that sent the request. If (an older version of) + Cadenza did not send a timezone offset, this will be the offset of this server. + This information is purely informational and volatile as it will change with the daylight savings time. + It should not be used to convert datetime objects + to zone-aware datetimes, for that use the cadenza_timezone_region property. + + :return: Offset string, such as "+01:00" or "Z". + """ + return self._cadenza_timezone_current_offset