Skip to content
173 changes: 92 additions & 81 deletions data_compression/coordinate_compression.py
Original file line number Diff line number Diff line change
@@ -1,132 +1,143 @@
"""
Assumption:
- The values to compress are assumed to be comparable,
values can be sorted and compared with '<' and '>' operators.
Coordinate Compression Utility
------------------------------

Fix for Issue #13226: Handles missing or invalid values (None, NaN)
to ensure consistent compression behavior.

This module provides a `CoordinateCompressor` class that safely compresses
and decompresses values from a list by mapping each unique valid value
to a unique integer index.

Invalid or non-comparable values (like None or NaN) are ignored during
compression mapping and return -1 when compressed.
"""

from __future__ import annotations

import math
from typing import Any


class CoordinateCompressor:
    """
    Compress comparable values to dense integer ranks and back.

    Every distinct valid value of the input list is mapped to its rank in
    ascending order (the smallest value gets 0). Two structures hold the
    mapping:

    - ``coordinate_map``: dict from original value to compressed coordinate.
    - ``reverse_map``: list where index ``i`` holds the original value whose
      compressed coordinate is ``i``.

    ``None`` and float NaN are treated as invalid: they are left out of the
    mapping and always compress to -1.

    Example:
    >>> arr = [100, 10, 52, 83]
    >>> cc = CoordinateCompressor(arr)
    >>> cc.compress(100)
    3
    >>> cc.compress(52)
    1
    >>> cc.decompress(1)
    52
    >>> cc.compress(None)
    -1
    """

    def __init__(self, arr: list[Any]) -> None:
        """
        Build the value <-> coordinate mappings from ``arr``.

        Args:
            arr: The list of values to be compressed. The valid entries must
                be hashable and mutually comparable with ``<``.

        Invalid or missing values (None, NaN) are skipped when building
        the mapping, ensuring consistent compression behavior.

        >>> arr = [100, None, 52, 83, float("nan")]
        >>> cc = CoordinateCompressor(arr)
        >>> cc.compress(100)
        2
        >>> cc.coordinate_map[83]
        1
        >>> cc.reverse_map[2]
        100
        >>> cc.compress(None)
        -1
        >>> cc.compress(float("nan"))
        -1
        """
        # Untouched copy of the input, invalid entries included.
        self.original = list(arr)

        # Deduplicate the valid values, then rank them in ascending order.
        unique_sorted = sorted({x for x in arr if self._is_valid(x)})

        self.coordinate_map: dict[Any, int] = {
            value: rank for rank, value in enumerate(unique_sorted)
        }
        self.reverse_map: list[Any] = unique_sorted

        # Entries that were skipped (kept for reference only).
        self.invalid_values: list[Any] = [x for x in arr if not self._is_valid(x)]

    @staticmethod
    def _is_valid(value: Any) -> bool:
        """Return False for None and float NaN, True for anything else."""
        if value is None:
            return False
        return not (isinstance(value, float) and math.isnan(value))

    def compress(self, original: Any) -> int:
        """
        Compress a single value to its coordinate index.

        Args:
            original: The value to compress.

        Returns:
            The compressed index, or -1 if the value is invalid (None, NaN,
            unhashable) or was not present in the original list.

        >>> arr = [100, 10, 52, 83]
        >>> cc = CoordinateCompressor(arr)
        >>> cc.compress(100)
        3
        >>> cc.compress(10)
        0
        >>> cc.compress(7)
        -1
        >>> cc.compress(None)
        -1
        >>> cc.compress([1, 2])
        -1
        """
        if not self._is_valid(original):
            return -1
        try:
            return self.coordinate_map.get(original, -1)
        except TypeError:
            # Unhashable values (e.g. lists) can never be keys of the map.
            return -1

    def decompress(self, num: int) -> Any:
        """
        Decompress an integer coordinate back to its original value.

        Args:
            num: Compressed index to decompress.

        Returns:
            The original value for valid indices, otherwise -1.

        >>> arr = [100, 10, 52, 83]
        >>> cc = CoordinateCompressor(arr)
        >>> cc.decompress(0)
        10
        >>> cc.decompress(5)
        -1
        """
        # Negative indices are rejected explicitly so that the -1 sentinel
        # returned by compress() does not wrap around to the last element.
        if 0 <= num < len(self.reverse_map):
            return self.reverse_map[num]
        return -1


if __name__ == "__main__":
    from doctest import testmod

    testmod()

    # Demo input deliberately includes the invalid values None and NaN.
    arr: list[Any] = [100, 10, 52, 83, None, float("nan")]
    cc = CoordinateCompressor(arr)

    print("Coordinate Compression Demo:\n")
    for original in arr:
        compressed = cc.compress(original)
        # Invalid values compress to -1, and decompress(-1) is -1 as well.
        decompressed = cc.decompress(compressed)
        print(
            f"Original: {original!r:>6} | "
            f"Compressed: {compressed:>2} | "
            f"Decompressed: {decompressed!r}"
        )
30 changes: 15 additions & 15 deletions searches/binary_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,37 +181,37 @@ def insort_right(
def binary_search(sorted_collection: list[int], item: int) -> int:
    """Pure implementation of a binary search algorithm in Python

    The collection must be sorted in ascending order; this is verified up
    front and a ValueError is raised otherwise.

    :param sorted_collection: some ascending sorted collection with comparable items
    :param item: item value to search
    :return: index of the found item or -1 if the item is not found; when the
        item occurs several times, the index of any one occurrence
    :raises ValueError: if sorted_collection is not sorted in ascending order

    Examples:
    >>> binary_search([1, 2, 2, 2, 3, 4], 2) in (1, 2, 3)
    True
    >>> binary_search([0, 5, 7, 10, 15], 0)
    0
    >>> binary_search([0, 5, 7, 10, 15], 15)
    4
    >>> binary_search([0, 5, 7, 10, 15], 5)
    1
    >>> binary_search([0, 5, 7, 10, 15], 6)
    -1
    >>> binary_search([], 1)
    -1
    >>> binary_search([3, 1, 2], 1)
    Traceback (most recent call last):
        ...
    ValueError: sorted_collection must be sorted in ascending order
    """
    if list(sorted_collection) != sorted(sorted_collection):
        raise ValueError("sorted_collection must be sorted in ascending order")

    left, right = 0, len(sorted_collection) - 1

    while left <= right:
        # Written as left + (right - left) // 2 to mirror the overflow-safe
        # midpoint formula used in fixed-width-integer languages.
        mid = left + (right - left) // 2
        current_item = sorted_collection[mid]
        if current_item == item:
            # With duplicates, whichever occurrence is hit first is returned;
            # no attempt is made to find the leftmost or rightmost one.
            return mid
        if current_item < item:
            left = mid + 1
        else:
            right = mid - 1
    return -1


Expand Down
66 changes: 38 additions & 28 deletions sorts/shell_sort.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,50 @@
"""
https://en.wikipedia.org/wiki/Shellsort#Pseudocode
Shell Sort Algorithm
--------------------

Issue: #13887
Implements the Shell Sort algorithm, which is a generalization of insertion sort.
It improves on insertion sort by first comparing elements that are far apart, then
reducing the gap between compared elements until the list is fully sorted.

Time Complexity (for the halving gap sequence n/2, n/4, ..., 1):
Worst case: O(n^2)
Best case: O(n log n)
Average: depends on the gap sequence; no tight general bound is known

Space Complexity: O(1)
"""

from __future__ import annotations

def shell_sort(arr: list[int]) -> list[int]:
    """
    Sort the given list in place using Shell Sort and return that same list.

    The gap starts at ``len(arr) // 2`` and is halved after every pass; each
    pass is a gapped insertion sort. The final pass (gap 1) is an ordinary
    insertion sort over an almost-sorted list.

    :param arr: mutable list of mutually comparable items; it is modified
    :return: the same list object, ordered ascending

    >>> shell_sort([5, 2, 9, 1])
    [1, 2, 5, 9]
    >>> shell_sort([0, 5, 3, 2, 2])
    [0, 2, 2, 3, 5]
    >>> shell_sort([])
    []
    >>> shell_sort([-2, -5, -45])
    [-45, -5, -2]
    >>> shell_sort([3])
    [3]
    >>> shell_sort([1, 2, 3])
    [1, 2, 3]
    >>> shell_sort([4, 3, 3, 1])
    [1, 3, 3, 4]
    """
    n = len(arr)
    gap = n // 2

    # Keep reducing the gap until it becomes 0
    while gap > 0:
        for i in range(gap, n):
            temp = arr[i]
            j = i
            # Shift larger elements of this gap-chain one slot to the right
            # until the insertion point for temp is found.
            while j >= gap and arr[j - gap] > temp:
                arr[j] = arr[j - gap]
                j -= gap
            if j != i:
                arr[j] = temp
        gap //= 2

    return arr


if __name__ == "__main__":
    from doctest import testmod

    testmod()
    user_input = input("Enter numbers separated by a comma:\n").strip()
    unsorted = [int(item) for item in user_input.split(",")]
    print(shell_sort(unsorted))
Empty file added tests/__init__.py
Empty file.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from data_compression.coordinate_compression import CoordinateCompressor


def test_basic_compression() -> None:
    """Distinct values are ranked in ascending order, starting at 0."""
    arr = [100, 10, 52, 83]
    cc = CoordinateCompressor(arr)
    assert cc.compress(10) == 0
    # 83 is the third-smallest value, so its rank is exactly 2.
    assert cc.compress(83) == 2
    assert cc.decompress(0) == 10


def test_with_none_and_nan() -> None:
    """None and NaN compress to -1 and do not take up a coordinate."""
    arr = [100, None, 52, 83, float("nan")]
    cc = CoordinateCompressor(arr)
    assert cc.compress(None) == -1
    assert cc.compress(float("nan")) == -1
    # Only 52, 83 and 100 are ranked, so 52 is rank 0 and 100 is rank 2.
    assert cc.compress(52) == 0
    assert cc.compress(100) == 2
    assert cc.decompress(5) == -1


def test_duplicate_values() -> None:
    """Repeated values collapse to a single coordinate."""
    arr = [10, 10, 10]
    cc = CoordinateCompressor(arr)
    assert cc.compress(10) == 0
    assert cc.decompress(0) == 10
    # Only one distinct value exists, so coordinate 1 is out of range.
    assert cc.decompress(1) == -1
Empty file added tests/searches/__init__.py
Empty file.
Loading