Skip to content

Commit 1e7cfa6

Browse files
authored
Ignore sequence_index column in the DataValidity checks for Diagnostic Report (#793)
1 parent e0a3a3f commit 1e7cfa6

File tree

2 files changed

+56
-0
lines changed

2 files changed

+56
-0
lines changed

sdmetrics/reports/single_table/_properties/data_validity.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,16 +42,22 @@ def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=No
4242
error_messages = []
4343
primary_key = metadata.get('primary_key')
4444
alternate_keys = metadata.get('alternate_keys', [])
45+
sequence_index = metadata.get('sequence_index')
46+
4547
for column_name in metadata['columns']:
4648
sdtype = metadata['columns'][column_name]['sdtype']
4749
primary_key_match = column_name == primary_key
4850
alternate_key_match = column_name in alternate_keys
4951
is_unique = primary_key_match or alternate_key_match
52+
is_sequence_index = column_name == sequence_index
5053

5154
try:
5255
if sdtype not in self._sdtype_to_metric and not is_unique:
5356
continue
5457

58+
if is_sequence_index and self._sdtype_to_metric.get(sdtype) == BoundaryAdherence:
59+
continue
60+
5561
metric = self._sdtype_to_metric.get(sdtype, KeyUniqueness)
5662
column_score = metric.compute(real_data[column_name], synthetic_data[column_name])
5763
error_message = None

tests/unit/reports/single_table/_properties/test_data_validity.py

Lines changed: 50 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -152,3 +152,53 @@ def test_get_visualization(self, mock_px):
152152
margin={'t': 150},
153153
font={'size': 18},
154154
)
155+
156+
# NOTE: stacked ``@patch`` decorators are applied bottom-up, so the mocks are passed
# to the test in reverse order: KeyUniqueness first, BoundaryAdherence last.
@patch('sdmetrics.reports.single_table._properties.data_validity.BoundaryAdherence.compute')
157+
@patch('sdmetrics.reports.single_table._properties.data_validity.CategoryAdherence.compute')
158+
@patch('sdmetrics.reports.single_table._properties.data_validity.KeyUniqueness.compute')
159+
def test__generate_details_skip_sequence_index_boundary_adherence(
160+
self, key_uniqueness_mock, category_a_compute_mock, boundary_a_compute_mock
161+
):
162+
"""Test that the ``sequence_index`` column is excluded from BoundaryAdherence checks.

The metadata marks ``date`` as the sequence index. The test expects
``_generate_details`` to compute ``BoundaryAdherence`` only for ``value``,
``CategoryAdherence`` only for ``category``, to never call ``KeyUniqueness``,
and to return no row for ``date``.
"""
163+
# Setup
164+
real_data = pd.DataFrame({
165+
'date': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03']),
166+
'value': [1, 2, 3],
167+
'category': ['a', 'b', 'c'],
168+
})
169+
synthetic_data = pd.DataFrame({
170+
'date': pd.to_datetime(['2020-01-04', '2020-01-05', '2020-01-06']),
171+
'value': [4, 5, 6],
172+
'category': ['d', 'e', 'f'],
173+
})
174+
# No primary or alternate keys are declared in this metadata; the test asserts
# below that KeyUniqueness is never called.
metadata = {
175+
'sequence_index': 'date',  # 'date' is the sequence index: BoundaryAdherence must not run on it
176+
'columns': {
177+
'date': {'sdtype': 'datetime'},
178+
'value': {'sdtype': 'numerical'},
179+
'category': {'sdtype': 'categorical'},
180+
},
181+
}
182+
183+
# The compute methods are mocked, so these scores are fixed regardless of the data above.
boundary_a_compute_mock.return_value = 0.8
184+
category_a_compute_mock.return_value = 0.9
185+
186+
# Run
187+
data_validity_property = DataValidity()
188+
result = data_validity_property._generate_details(real_data, synthetic_data, metadata)
189+
190+
# Assert
191+
# BoundaryAdherence must have been computed exactly once, for 'value' only (not for 'date').
expected_calls_ba = [call(real_data['value'], synthetic_data['value'])]
192+
boundary_a_compute_mock.assert_has_calls(expected_calls_ba)
193+
assert boundary_a_compute_mock.call_count == 1
194+
195+
# CategoryAdherence must have been computed exactly once, for 'category'.
expected_calls_ca = [call(real_data['category'], synthetic_data['category'])]
196+
category_a_compute_mock.assert_has_calls(expected_calls_ca)
197+
assert category_a_compute_mock.call_count == 1
198+
199+
key_uniqueness_mock.assert_not_called()
200+
201+
# The skipped sequence index column must not appear in the details table at all.
expected_columns = ['value', 'category']
202+
assert list(result['Column']) == expected_columns
203+
expected_metrics = ['BoundaryAdherence', 'CategoryAdherence']
204+
assert list(result['Metric']) == expected_metrics

0 commit comments

Comments
 (0)