Skip to content

Commit 9f9e109

Browse files
committed
make release-tag: Merge branch 'main' into stable
2 parents 7fc61c8 + 925a916 commit 9f9e109

File tree

15 files changed

+122 additions, -24 deletions (lines changed)

15 files changed

+122 additions, -24 deletions (lines changed)

HISTORY.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,15 @@
11
# History
22

3+
## v0.23.0 - 2025-08-14
4+
5+
### New Features
6+
7+
* Allow users to turn off or control any subsampling done within the quality report - Issue [#790](https://github.com/sdv-dev/SDMetrics/issues/790) by @R-Palazzo
8+
9+
### Bugs Fixed
10+
11+
* Diagnostic Report should ignore `sequence_index` column in the DataValidity checks - Issue [#731](https://github.com/sdv-dev/SDMetrics/issues/731) by @fealho
12+
313
## v0.22.0 - 2025-07-24
414

515
### New Features

latest_requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,5 @@ numpy==2.2.6
33
pandas==2.3.1
44
plotly==6.2.0
55
scikit-learn==1.7.1
6-
scipy==1.16.0
6+
scipy==1.16.1
77
tqdm==4.67.1

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ convention = 'google'
140140
add-ignore = ['D107', 'D407', 'D417']
141141

142142
[tool.bumpversion]
143-
current_version = "0.22.0"
143+
current_version = "0.23.0.dev0"
144144
parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
145145
serialize = [
146146
'{major}.{minor}.{patch}.{release}{candidate}',

sdmetrics/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
__author__ = 'MIT Data To AI Lab'
66
__email__ = 'dailabmit@gmail.com'
7-
__version__ = '0.22.0'
7+
__version__ = '0.23.0.dev0'
88

99
import sys
1010
import warnings as python_warnings

sdmetrics/reports/base_report.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import tqdm
1515

1616
from sdmetrics._utils_metadata import _convert_datetime_column, _validate_metadata
17+
from sdmetrics.reports.utils import DEFAULT_NUM_ROWS_SUBSAMPLE
1718
from sdmetrics.visualization import set_plotly_config
1819

1920

@@ -27,6 +28,7 @@ def __init__(self):
2728
self._overall_score = None
2829
self.is_generated = False
2930
self._properties = {}
31+
self.num_rows_subsample = DEFAULT_NUM_ROWS_SUBSAMPLE
3032
self.report_info = {
3133
'report_type': self.__class__.__name__,
3234
'generated_date': None,
@@ -163,6 +165,7 @@ def generate(self, real_data, synthetic_data, metadata, verbose=True):
163165
f'({ind + 1}/{len(self._properties)}) Evaluating {property_name}'
164166
)
165167

168+
self._properties[property_name].num_rows_subsample = self.num_rows_subsample
166169
score = self._properties[property_name].get_score(
167170
real_data, synthetic_data, metadata, progress_bar=progress_bar
168171
)

sdmetrics/reports/multi_table/_properties/base.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import numpy as np
44
import pandas as pd
55

6+
from sdmetrics.reports.utils import DEFAULT_NUM_ROWS_SUBSAMPLE
7+
68

79
class BaseMultiTableProperty:
810
"""Base class for multi table properties.
@@ -26,6 +28,7 @@ def __init__(self):
2628
self._properties = {}
2729
self.is_computed = False
2830
self.details = pd.DataFrame()
31+
self.num_rows_subsample = DEFAULT_NUM_ROWS_SUBSAMPLE
2932

3033
def _get_num_iterations(self, metadata):
3134
"""Get the number of iterations for the property."""

sdmetrics/reports/single_table/_properties/base.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
import pandas as pd
44

5+
from sdmetrics.reports.utils import DEFAULT_NUM_ROWS_SUBSAMPLE
6+
57

68
class BaseSingleTableProperty:
79
"""Base class for single table properties.
@@ -14,6 +16,7 @@ class BaseSingleTableProperty:
1416

1517
def __init__(self):
1618
self.details = pd.DataFrame()
19+
self.num_rows_subsample = DEFAULT_NUM_ROWS_SUBSAMPLE
1720

1821
def _compute_average(self):
1922
"""Average the scores for each column."""

sdmetrics/reports/single_table/_properties/column_pair_trends.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@
1010
from sdmetrics.reports.single_table._properties import BaseSingleTableProperty
1111
from sdmetrics.reports.utils import PlotConfig
1212

13-
DEFAULT_NUM_ROWS_SUBSAMPLE = 50000
14-
1513

1614
class ColumnPairTrends(BaseSingleTableProperty):
1715
"""Column pair trends property.
@@ -30,6 +28,7 @@ class ColumnPairTrends(BaseSingleTableProperty):
3028
}
3129

3230
def __init__(self):
31+
super().__init__()
3332
self._columns_datetime_conversion_failed = {}
3433
self._columns_discretization_failed = {}
3534

@@ -276,10 +275,12 @@ def _generate_details(
276275
)
277276

278277
metric_params = {}
279-
if (metric == ContingencySimilarity) and (
280-
max(len(col_real), len(col_synthetic)) > DEFAULT_NUM_ROWS_SUBSAMPLE
278+
if (
279+
self.num_rows_subsample
280+
and (metric == ContingencySimilarity)
281+
and (max(len(col_real), len(col_synthetic)) > self.num_rows_subsample)
281282
):
282-
metric_params['num_rows_subsample'] = DEFAULT_NUM_ROWS_SUBSAMPLE
283+
metric_params['num_rows_subsample'] = self.num_rows_subsample
283284

284285
try:
285286
error = self._preprocessing_failed(

sdmetrics/reports/single_table/_properties/data_validity.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,16 +42,22 @@ def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=No
4242
error_messages = []
4343
primary_key = metadata.get('primary_key')
4444
alternate_keys = metadata.get('alternate_keys', [])
45+
sequence_index = metadata.get('sequence_index')
46+
4547
for column_name in metadata['columns']:
4648
sdtype = metadata['columns'][column_name]['sdtype']
4749
primary_key_match = column_name == primary_key
4850
alternate_key_match = column_name in alternate_keys
4951
is_unique = primary_key_match or alternate_key_match
52+
is_sequence_index = column_name == sequence_index
5053

5154
try:
5255
if sdtype not in self._sdtype_to_metric and not is_unique:
5356
continue
5457

58+
if is_sequence_index and self._sdtype_to_metric.get(sdtype) == BoundaryAdherence:
59+
continue
60+
5561
metric = self._sdtype_to_metric.get(sdtype, KeyUniqueness)
5662
column_score = metric.compute(real_data[column_name], synthetic_data[column_name])
5763
error_message = None

sdmetrics/reports/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
CONTINUOUS_SDTYPES = ['numerical', 'datetime']
1919
DISCRETE_SDTYPES = ['categorical', 'boolean']
20+
DEFAULT_NUM_ROWS_SUBSAMPLE = 50000
2021

2122

2223
class PlotConfig:

0 commit comments

Comments
 (0)