Skip to content

Commit 49969a8

Browse files
authored
Mock response from the production server for dataset description (#1407)
1 parent 7fb265d commit 49969a8

File tree

3 files changed: 46 additions and 10 deletions

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ test=[
7979
"pytest-rerunfailures",
8080
"mypy",
8181
"ruff",
82+
"requests-mock",
8283
]
8384
examples=[
8485
"matplotlib",
mock_responses/datasets/data_description_61.xml (in the test files directory)

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
<oml:data_set_description xmlns:oml="http://openml.org/openml">
2+
<oml:id>61</oml:id>
3+
<oml:name>iris</oml:name>
4+
<oml:version>1</oml:version>
5+
<oml:description>**Author**: R.A. Fisher
6+
**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Iris) - 1936 - Donated by Michael Marshall
7+
**Please cite**:
8+
9+
**Iris Plants Database**
10+
This is perhaps the best known database to be found in the pattern recognition literature. Fisher's paper is a classic in the field and is referenced frequently to this day. (See Duda &amp; Hart, for example.) The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant. One class is linearly separable from the other 2; the latter are NOT linearly separable from each other.
11+
12+
Predicted attribute: class of iris plant.
13+
This is an exceedingly simple domain.
14+
15+
### Attribute Information:
16+
1. sepal length in cm
17+
2. sepal width in cm
18+
3. petal length in cm
19+
4. petal width in cm
20+
5. class:
21+
-- Iris Setosa
22+
-- Iris Versicolour
23+
-- Iris Virginica</oml:description>
24+
<oml:description_version>4</oml:description_version>
25+
<oml:format>ARFF</oml:format>
26+
<oml:creator>R.A. Fisher</oml:creator> <oml:collection_date>1936</oml:collection_date> <oml:upload_date>2014-04-06T23:23:39</oml:upload_date>
27+
<oml:language>English</oml:language> <oml:licence>Public</oml:licence> <oml:url>https://api.openml.org/data/v1/download/61/iris.arff</oml:url>
28+
<oml:parquet_url>https://data.openml.org/datasets/0000/0061/dataset_61.pq</oml:parquet_url> <oml:file_id>61</oml:file_id> <oml:default_target_attribute>class</oml:default_target_attribute> <oml:version_label>1</oml:version_label> <oml:citation>https://archive.ics.uci.edu/ml/citation_policy.html</oml:citation> <oml:tag>Botany</oml:tag><oml:tag>Ecology</oml:tag><oml:tag>Kaggle</oml:tag><oml:tag>Machine Learning</oml:tag><oml:tag>study_1</oml:tag><oml:tag>study_25</oml:tag><oml:tag>study_4</oml:tag><oml:tag>study_41</oml:tag><oml:tag>study_50</oml:tag><oml:tag>study_52</oml:tag><oml:tag>study_7</oml:tag><oml:tag>study_86</oml:tag><oml:tag>study_88</oml:tag><oml:tag>study_89</oml:tag><oml:tag>uci</oml:tag> <oml:visibility>public</oml:visibility> <oml:original_data_url>https://archive.ics.uci.edu/ml/datasets/Iris</oml:original_data_url> <oml:paper_url>http://digital.library.adelaide.edu.au/dspace/handle/2440/15227</oml:paper_url> <oml:minio_url>https://data.openml.org/datasets/0000/0061/dataset_61.pq</oml:minio_url> <oml:status>active</oml:status>
29+
<oml:processing_date>2020-11-20 19:02:18</oml:processing_date> <oml:md5_checksum>ad484452702105cbf3d30f8deaba39a9</oml:md5_checksum>
30+
</oml:data_set_description>

tests/test_datasets/test_dataset_functions.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import pandas as pd
1818
import pytest
1919
import requests
20+
import requests_mock
2021
import scipy.sparse
2122
from oslo_concurrency import lockutils
2223

@@ -1496,16 +1497,6 @@ def test_data_fork(self):
14961497
data_id=999999,
14971498
)
14981499

1499-
@pytest.mark.production()
1500-
def test_get_dataset_parquet(self):
1501-
# Parquet functionality is disabled on the test server
1502-
# There is no parquet-copy of the test server yet.
1503-
openml.config.server = self.production_server
1504-
dataset = openml.datasets.get_dataset(61, download_data=True)
1505-
assert dataset._parquet_url is not None
1506-
assert dataset.parquet_file is not None
1507-
assert os.path.isfile(dataset.parquet_file)
1508-
assert dataset.data_file is None # is alias for arff path
15091500

15101501
@pytest.mark.production()
15111502
def test_list_datasets_with_high_size_parameter(self):
@@ -1952,3 +1943,17 @@ def test_read_features_from_xml_with_whitespace() -> None:
19521943
)
19531944
dict = _read_features(features_file)
19541945
assert dict[1].nominal_values == [" - 50000.", " 50000+."]
1946+
1947+
1948+
def test_get_dataset_parquet(requests_mock, test_files_directory):
1949+
# Parquet functionality is disabled on the test server
1950+
# There is no parquet-copy of the test server yet.
1951+
content_file = (
1952+
test_files_directory / "mock_responses" / "datasets" / "data_description_61.xml"
1953+
)
1954+
requests_mock.get("https://www.openml.org/api/v1/xml/data/61", text=content_file.read_text())
1955+
dataset = openml.datasets.get_dataset(61, download_data=True)
1956+
assert dataset._parquet_url is not None
1957+
assert dataset.parquet_file is not None
1958+
assert os.path.isfile(dataset.parquet_file)
1959+
assert dataset.data_file is None # is alias for arff path

0 commit comments

Comments
 (0)