Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 59 additions & 22 deletions data_diff/abcs/database_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,24 @@
N = TypeVar("N")


@attrs.frozen(kw_only=True, eq=False, order=False, unsafe_hash=True)
@attrs.frozen(kw_only=True, eq=False, order=False, hash=False)
class Collation:
"""
A pre-parsed or pre-known record about db collation, per column.

The "greater" collation should be used as a target collation for textual PKs
on both sides of the diff — by coverting the "lesser" collation to self.
on both sides of the diff — by converting the "lesser" collation to self.

Snowflake easily absorbs the performance losses, so it has a boost to always
be greater than any other collation in non-Snowflake databases.
Snowflake easily absorbs the performance losses, so it is always the "lesser"
collation (the side that performs the conversion), ensuring the non-Snowflake
side is "greater" (the collation both sides converge to).
Other databases need to negotiate which side absorbs the performance impact.
"""

# A boost for special databases that are known to absorb the performance dmaage well.
# A boost for special databases that are known to absorb the performance damage well.
absorbs_damage: bool = False

# Ordinal soring by ASCII/UTF8 (True), or alphabetic as per locale/country/etc (False).
# Ordinal sorting by ASCII/UTF8 (True), or alphabetic as per locale/country/etc (False).
ordinal: bool | None = None

# Lowercase first (aAbBcC or abcABC). Otherwise, uppercase first (AaBbCc or ABCabc).
Expand All @@ -49,40 +50,76 @@ class Collation:
# Purely informational, for debugging:
_source: None | str | Collection[str] = None

def _comparison_key(self) -> tuple:
"""Key for equality and hashing — keeps __eq__/__hash__ consistent."""
if self.ordinal is True:
# Ordinal sorting is by code point; the key collapses to absorbs_damage, the ordinal flag, and language.
return (self.absorbs_damage, True, self.language)
return (
self.absorbs_damage,
self.ordinal, # None vs False are semantically distinct
self.language,
self.country,
self.case_sensitive,
self.accent_sensitive,
self.lower_first,
)

def _ordering_key(self) -> tuple:
"""Key for deterministic total ordering. Only meaningful when preceded by an
equality check, as in __gt__. Do not use as a standalone sort key.

Unlike _comparison_key, this does not collapse fields for ordinals, so two
ordinals that are equal by __eq__ may have different ordering keys. The __gt__
method guards against this by checking equality first.
"""

# (0,) for None sorts before (1, value) for any real value.
def _wrap(v: object) -> tuple:
return (0,) if v is None else (1, v)

return (
self.absorbs_damage,
_wrap(self.ordinal),
_wrap(self.language),
_wrap(self.country),
_wrap(self.case_sensitive),
_wrap(self.accent_sensitive),
_wrap(self.lower_first),
)

def __eq__(self, other: object) -> bool:
    """Return whether two collations have the same comparison key.

    Returns ``NotImplemented`` for anything that is not a ``Collation`` so
    Python can try the reflected operation.

    Equality is decided only by ``_comparison_key()``, the same tuple that
    ``__hash__`` hashes, so objects that compare equal always hash equal.
    The earlier field-by-field comparison (any two ordinals equal, a missing
    country matching any country) is intentionally not kept: it disagreed
    with the hash and the country wildcard was not transitive.
    """
    if not isinstance(other, Collation):
        return NotImplemented
    # TODO: does ordinal equality depend on language? what does Albanic_BIN mean in MS SQL?
    return self._comparison_key() == other._comparison_key()

def __hash__(self) -> int:
return hash(self._comparison_key())

def __ne__(self, other: object) -> bool:
    """Return the negation of ``self == other``.

    Returns ``NotImplemented`` for anything that is not a ``Collation``.
    The comparison goes through the ``==`` operator rather than calling
    ``__eq__`` directly, so the normal dispatch rules apply.
    """
    if not isinstance(other, Collation):
        return NotImplemented
    return not (self == other)

def __gt__(self, other: object) -> bool:
    """Return whether this collation is strictly "greater" than ``other``.

    Returns ``NotImplemented`` for anything that is not a ``Collation``.

    The checks run in this order:

    1. Equal collations are never greater than each other.
    2. If exactly one side has ``absorbs_damage`` set, that side is the
       "lesser" one and the other side is "greater".
    3. If exactly one side has ``ordinal`` set to ``True``, that side is
       "greater". ``None`` and ``False`` are both treated as "not True".
    4. Otherwise the ``_ordering_key()`` tuples are compared.
    """
    if not isinstance(other, Collation):
        return NotImplemented
    if self == other:
        return False
    # absorbs_damage=True means this db absorbs conversion cost — it is the "lesser"
    # (the side that converts), so the non-absorbing side is "greater" (the target).
    if self.absorbs_damage and not other.absorbs_damage:
        return False
    if other.absorbs_damage and not self.absorbs_damage:
        return True  # this one is preferred if it cannot absorb damage as its counterpart can
    # Identity checks against True keep None and False from being mistaken
    # for an ordinal collation, and keep both checks symmetric.
    if self.ordinal is True and other.ordinal is not True:
        return True
    if other.ordinal is True and self.ordinal is not True:
        return False
    # By this point absorbs_damage is the same on both sides and neither side
    # has a unilateral ordinal=True advantage, so the full tuple breaks the tie.
    # None sorts distinctly from "" / False via (0,) vs (1, value) wrapping.
    return self._ordering_key() > other._ordering_key()

def __ge__(self, other: object) -> bool:
if not isinstance(other, Collation):
Expand Down
Loading
Loading