From f3d9cdab9f55441249df9d3fb6bc35a680e54a0b Mon Sep 17 00:00:00 2001 From: John Atanbori <36032013+Amotica@users.noreply.github.com> Date: Thu, 20 Mar 2025 09:57:21 +0000 Subject: [PATCH 1/2] Add SPOTS10 dataset --- .../image_classification/__init__.py | 1 + .../image_classification/spots10.py | 180 ++++++++++++++++++ .../image_classification/spots10_test.py | 53 ++++++ 3 files changed, 234 insertions(+) create mode 100644 tensorflow_datasets/image_classification/spots10.py create mode 100644 tensorflow_datasets/image_classification/spots10_test.py diff --git a/tensorflow_datasets/image_classification/__init__.py b/tensorflow_datasets/image_classification/__init__.py index 59df59b7bd8..012b7b8796e 100644 --- a/tensorflow_datasets/image_classification/__init__.py +++ b/tensorflow_datasets/image_classification/__init__.py @@ -96,3 +96,4 @@ from tensorflow_datasets.image_classification.svhn import SvhnCropped from tensorflow_datasets.image_classification.uc_merced import UcMerced from tensorflow_datasets.image_classification.visual_domain_decathlon import VisualDomainDecathlon +from tensorflow_datasets.image_classification.spots10 import spots10 \ No newline at end of file diff --git a/tensorflow_datasets/image_classification/spots10.py b/tensorflow_datasets/image_classification/spots10.py new file mode 100644 index 00000000000..ebdc7f9bde6 --- /dev/null +++ b/tensorflow_datasets/image_classification/spots10.py @@ -0,0 +1,180 @@ +# coding=utf-8 +# Copyright 2024 The TensorFlow Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""spots10""" + +import os + +import numpy as np +from six.moves import urllib +from tensorflow_datasets.core.utils.lazy_imports_utils import tensorflow as tf +import tensorflow_datasets.public_api as tfds + +# spots10 constants +_spots10_URL = "https://github.com/Amotica/spots-10/raw/refs/heads/main/dataset/" +_spots10_TRAIN_DATA_FILENAME = "train-images-idx3-ubyte.gz" +_spots10_TRAIN_LABELS_FILENAME = "train-labels-idx1-ubyte.gz" +_spots10_TEST_DATA_FILENAME = "test-images-idx3-ubyte.gz" +_spots10_TEST_LABELS_FILENAME = "test-labels-idx1-ubyte.gz" +_spots10_IMAGE_SIZE = 32 +spots10_IMAGE_SHAPE = (_spots10_IMAGE_SIZE, _spots10_IMAGE_SIZE, 1) +spots10_NUM_CLASSES = 10 +_TRAIN_EXAMPLES = 40000 +_TEST_EXAMPLES = 10000 + +_spots10_CITATION = """\ +@article{atanbori2024spots, + title={spots-10: Animal Pattern Benchmark Dataset for Machine Learning Algorithms}, + author={Atanbori, John}, + journal={arXiv preprint arXiv:2410.21044}, + year={2024} +} +""" + +class spots10(tfds.core.GeneratorBasedBuilder): + """spots10.""" + + URL = _spots10_URL + + VERSION = tfds.core.Version("1.0.0") + + NAME = "spots10" + + def _info(self): + return tfds.core.DatasetInfo( + builder=self, + description="spots10 dataset consisting of grayscale images featuring patterns from various animal species.", + features=tfds.features.FeaturesDict({ + "image": tfds.features.Image(shape=spots10_IMAGE_SHAPE), + "label": tfds.features.ClassLabel(num_classes=spots10_NUM_CLASSES), + }), + supervised_keys=("image", "label"), + homepage="https://github.com/Amotica/spots-10", + citation=_spots10_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + # Download the full spots10 Database + filenames = { + "train_data": _spots10_TRAIN_DATA_FILENAME, + "train_labels": _spots10_TRAIN_LABELS_FILENAME, + "test_data": _spots10_TEST_DATA_FILENAME, + "test_labels": _spots10_TEST_LABELS_FILENAME, + } + spots10_files = dl_manager.download_and_extract( + {k: urllib.parse.urljoin(self.URL, v) for k, v in filenames.items()} + ) + + # spots10 provides TRAIN and TEST splits, not a VALIDATION split, so we only + # write the TRAIN and TEST splits to disk. + return [ + tfds.core.SplitGenerator( + name=tfds.Split.TRAIN, + gen_kwargs=dict( + num_examples=_TRAIN_EXAMPLES, + data_path=spots10_files["train_data"], + label_path=spots10_files["train_labels"], + ), + ), + tfds.core.SplitGenerator( + name=tfds.Split.TEST, + gen_kwargs=dict( + num_examples=_TEST_EXAMPLES, + data_path=spots10_files["test_data"], + label_path=spots10_files["test_labels"], + ), + ), + ] + + def _generate_examples(self, num_examples, data_path, label_path): + """Generate spots10 examples as dicts. + + Args: + num_examples (int): The number of example. + data_path (str): Path to the data files + label_path (str): Path to the labels + + Yields: + Generator yielding the next examples + """ + images = _extract_spots10_images(data_path, num_examples) + labels = _extract_spots10_labels(label_path, num_examples) + data = list(zip(images, labels)) + + # Using index as key since data is always loaded in same order. + for index, (image, label) in enumerate(data): + record = {"image": image, "label": label} + yield index, record + + +def _extract_spots10_images(image_filepath, num_images): + with tf.io.gfile.GFile(image_filepath, "rb") as f: + f.read(16) # header + buf = f.read(_spots10_IMAGE_SIZE * _spots10_IMAGE_SIZE * num_images) + data = np.frombuffer( + buf, + dtype=np.uint8, + ).reshape(num_images, _spots10_IMAGE_SIZE, _spots10_IMAGE_SIZE, 1) + return data + + +def _extract_spots10_labels(labels_filepath, num_labels): + with tf.io.gfile.GFile(labels_filepath, "rb") as f: + f.read(8) # header + buf = f.read(num_labels) + labels = np.frombuffer(buf, dtype=np.uint8).astype(np.int64) + return labels + +''' +def main(): + print("Loading spots10 dataset...") + + # Load the spots10 dataset + (train_data, test_data), dataset_info = tfds.load( + 'spots10', + split=['train', 'test'], + with_info=True, + as_supervised=True + ) + + print("Dataset loaded successfully!") + print(f"Number of training examples: {len(list(train_data))}") + print(f"Number of test examples: {len(list(test_data))}") + + # Display dataset information + print("Dataset Info:") + print(dataset_info) + + # Example of displaying a few images + import matplotlib.pyplot as plt + + def show_images(dataset, title, num_images=5): + plt.figure(figsize=(10, 5)) + for i, (image, label) in enumerate(dataset.take(num_images)): + plt.subplot(1, num_images, i + 1) + plt.imshow(image.numpy().squeeze(), cmap='gray') + plt.title(f"Label: {label.numpy()}") + plt.axis('off') + plt.suptitle(title) + plt.show() + + show_images(train_data, "Training Data Samples") + show_images(test_data, "Test Data Samples") + + +if __name__ == "__main__": + main() +''' \ No newline at end of file diff --git a/tensorflow_datasets/image_classification/spots10_test.py b/tensorflow_datasets/image_classification/spots10_test.py new file mode 100644 index 00000000000..1bd8eb9f935 --- /dev/null +++ b/tensorflow_datasets/image_classification/spots10_test.py @@ -0,0 +1,53 @@ +# coding=utf-8 +# Copyright 2024 The TensorFlow Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for spots10 dataset module.""" + +from tensorflow_datasets import testing +from tensorflow_datasets.image_classification import spots10 +import pytest + +# testing/spots10.py generates fake input data + +spots10._TRAIN_EXAMPLES = 2 # pylint: disable=protected-access +spots10._TEST_EXAMPLES = 2 # pylint: disable=protected-access + + +class spots10Test(testing.DatasetBuilderTestCase): + DATASET_CLASS = spots10 + SPLITS = { + "train": 2, + "test": 2, + } + DL_EXTRACT_RESULT = { + "train_data": "train-image", + "train_labels": "train-label", + "test_data": "test-image", + "test_labels": "test-label", + } + + + """ + Skip the test_download_and_prepare_as_dataset test using + @pytest.mark.skip decorator because no dummy dataset + was included for spots10. + """ + @pytest.mark.skip(reason="Skipping this test temporarily.") + def test_download_and_prepare_as_dataset(self): + pass + + +if __name__ == "__main__": + testing.test_main() From 03b9bd0319c26410eaa1cb7d9a7b42983e9cf1ec Mon Sep 17 00:00:00 2001 From: John Atanbori <36032013+Amotica@users.noreply.github.com> Date: Fri, 21 Nov 2025 23:33:09 +0000 Subject: [PATCH 2/2] spot10 checksum and dummy data added --- .../spots10_checksums.tsv | 4 +++ .../image_classification/spots10_test.py | 24 +++++------------- .../spots10/test-images-idx3-ubyte.gz | Bin 0 -> 2110 bytes .../spots10/test-labels-idx1-ubyte.gz | Bin 0 -> 53 bytes .../spots10/train-images-idx3-ubyte.gz | Bin 0 -> 2111 bytes .../spots10/train-labels-idx1-ubyte.gz | Bin 0 -> 54 bytes 6 files changed, 10 insertions(+), 18 deletions(-) create mode 100644 tensorflow_datasets/image_classification/spots10_checksums.tsv create mode 100644 tensorflow_datasets/testing/test_data/fake_examples/spots10/test-images-idx3-ubyte.gz create mode 100644 tensorflow_datasets/testing/test_data/fake_examples/spots10/test-labels-idx1-ubyte.gz create mode 100644 tensorflow_datasets/testing/test_data/fake_examples/spots10/train-images-idx3-ubyte.gz create mode 100644 tensorflow_datasets/testing/test_data/fake_examples/spots10/train-labels-idx1-ubyte.gz diff --git a/tensorflow_datasets/image_classification/spots10_checksums.tsv b/tensorflow_datasets/image_classification/spots10_checksums.tsv new file mode 100644 index 00000000000..ff7c110555f --- /dev/null +++ b/tensorflow_datasets/image_classification/spots10_checksums.tsv @@ -0,0 +1,4 @@ +https://github.com/Amotica/spots-10/raw/refs/heads/main/dataset/train-images-idx3-ubyte.gz 219a7de4f14619757056c09710f43a57b1122233d85aa26b6b2137ebf4cedca1 train-images-idx3-ubyte.gz +https://github.com/Amotica/spots-10/raw/refs/heads/main/dataset/train-labels-idx1-ubyte.gz 5228a38a96f11a7d8fea4dc8096d49d2c7e41725fdf9645f8746e5acb96d9e0a train-labels-idx1-ubyte.gz +https://github.com/Amotica/spots-10/raw/refs/heads/main/dataset/test-images-idx3-ubyte.gz efcf680b245f52acad8053b710acf042eee54e182893353c91109cad65e9af2a test-images-idx3-ubyte.gz +https://github.com/Amotica/spots-10/raw/refs/heads/main/dataset/test-labels-idx1-ubyte.gz 7897ba88f7897f7424202688b35d0ca41a31501e67ccb18e17ca9aa9ddbcf18a test-labels-idx1-ubyte.gz diff --git a/tensorflow_datasets/image_classification/spots10_test.py b/tensorflow_datasets/image_classification/spots10_test.py index 1bd8eb9f935..eb2623b38f2 100644 --- a/tensorflow_datasets/image_classification/spots10_test.py +++ b/tensorflow_datasets/image_classification/spots10_test.py @@ -17,37 +17,25 @@ from tensorflow_datasets import testing from tensorflow_datasets.image_classification import spots10 -import pytest -# testing/spots10.py generates fake input data +# testing/mnist.py generates fake input data spots10._TRAIN_EXAMPLES = 2 # pylint: disable=protected-access spots10._TEST_EXAMPLES = 2 # pylint: disable=protected-access class spots10Test(testing.DatasetBuilderTestCase): - DATASET_CLASS = spots10 + DATASET_CLASS = spots10.spots10 SPLITS = { "train": 2, "test": 2, } DL_EXTRACT_RESULT = { - "train_data": "train-image", - "train_labels": "train-label", - "test_data": "test-image", - "test_labels": "test-label", + "train_data": "train-images-idx3-ubyte.gz", + "train_labels": "train-labels-idx1-ubyte.gz", + "test_data": "test-images-idx3-ubyte.gz", + "test_labels": "test-labels-idx1-ubyte.gz", } - - """ - Skip the test_download_and_prepare_as_dataset test using - @pytest.mark.skip decorator because no dummy dataset - was included for spots10. - """ - @pytest.mark.skip(reason="Skipping this test temporarily.") - def test_download_and_prepare_as_dataset(self): - pass - - if __name__ == "__main__": testing.test_main() diff --git a/tensorflow_datasets/testing/test_data/fake_examples/spots10/test-images-idx3-ubyte.gz b/tensorflow_datasets/testing/test_data/fake_examples/spots10/test-images-idx3-ubyte.gz new file mode 100644 index 0000000000000000000000000000000000000000..ddf790f234166a9c360230118190b9feadda4d6f GIT binary patch literal 2110 zcmV-E2*LLsiwFn+?I39a|8!+@bS-IZVP|D?Eoo$UGc9#ud30p}0T2l9_W%G00{{R3 z0ssI2AOHXWAb2EOKlSo{2g>r)iS}JYu?~WNXC+6asj51fZVS|SCGzd8rtD7}8Djkc zYcg?=Q%F09P-|+=;0|c#FdYadq^o4h7Vw|4MgH5w5NrZ0VBOK)hv#Ob&=Prb*Hjj3 zEaepdCNM50e`KPFbaHLoU6GCLv7_!46dX7xl=0Muf&>%u(?)S3k;Y|ZP6bYPq#4&w z7I7mUAQ6kK{`o=0Of$p2NaGR@pHV@Ok{jM--#NR zr+1LcYJ7#OZsei^&t4zj;dy?poPPNM+HG%lmo7Iu5`*SW4^X`!qzjAmM!2M|plkZ*eD6oX42H4Xkt;s>5EVOV#yNKj3Nx_k9?Yu_R2_@%YDw8D!Sl zapD_}r^g@7zG738VX|9eUXBMWwEk#Z=_psYLkf`At&X+vTpEc?8*{fo9X+@4>rCTd z$AgZhEi=2~zgIyv$u2gvNvs8LIjie(I2%GqnCMaRNGFbQrJ%3cwq8GaQvAz{Q_7p| z`b58Y*T6I&DX?u+bO9m^L-}jX>QhnGj3r56DE$Xxecs?X_bkqy8{nOKm+?$-C4;;0 ziLaYY^_WT0<+uGjNeR{AiMzRp@%HH&_hSMA6Qn>UR{Zxl*>5lk(RVP8a|*`PQ`=G0 zEa?Hd40uUY-vjMDFJ8~c_3&)y?h4VRlbC5}Lq1)~&wzpFqt)FS;-_p3yl3 zi^`Fv;3=8p8W+6eQAm-!s9)45hRA=&_Z2wlFIA|(lkA|lmBRysQ==zVlp z{2xi@oA8&^#Jqgwu*9{m7g}8x_EcAkv|d1xN(@C20j<ts>T5>~r-58GUr$1eZ*#t%TD@nLfAnvpF_g2%t?~ixq8mCxQ4Q#5nbf!;ig{ zc;$6ehg$HHi=20X?jT}*SeYO7m{p$4(>s5$+QIcJ=C=*BhTq7)JeFZ5kdGC_pyP8Y zwhw2z?_b+qu|C0(fYFJ3Q;*QcMP-*cUh_-}KL9&KsqP15CA?A%K#RvY_|vKJtr2(R z%c*zjSi_a38d94>e!QW$7VTst^acfu>;Ak=po%+Eh^pCM+hZVt1EN~15EqzozwbYH zA`dU$F{k4whHvkq)t2|Et_KT-$-foP!qbm_=Rs-OK|bKHKaliP6XpGv^%=;;xg}bP zUS2TqP4q<7znx3!918v4*{~gd$X;eaLxLwyGB}LJ$|L3>d_FYVr0gSY77KawubLhN zS&bQ6Bp3qWPOpavA(qz|;`jY(_v+hsTHP&Klu(bRahL}I?94zo-d~*u`>a%QnH=H* zi%Rzy;q=+wHSqm3&U^+8Z`h}ZJ39;=~HPsW#PHk43+QXZYn1rdUf{ z?LM~VTe9&H5~O98$XTQ~E3hkcdd9cDiqi(HeJl{dmhO=-0WK(#hEf>Z=K3Kjol?Dz z0?J-9{+2)Z-NDk^NUAhJ0tP4#5uPm>K+Csr0{N_vzG^@++AUa7&A#A#@qZ7UwR@6S zkwP5C8EhKD^M~l;o6V}U&2bad_cj^r=Ne{zQC2l7yIQ^-Kh+b@`uzt;k(GMJT6ilU zj&M8+o7~F1@hj&oRblt9>O2?)8W|VGDdSs!p^rHD< zTSgW=GTl1Ttl`bEK7;-;itjYGT;p^S*q!MgYn-vconweJF7gpQxN}8Kh(cczfI)#( zg-cAXh(1^Sc`U)wt5%w)z9k1n8iWhfuu$OHKcO{n<{Z`y?%nQX07&>lnAv=DqTT(gX8WcH`&>DNu>z! ztJy0qNCK;OaihR3Hh2{<#BR57d<+AQW`#oVYFq#rcHeS1-PKngyKb4PKyA$d&6GU2 zeW=mwo?MTCyNX?*{upI?3qh%}I2BaODH$tsM3UhouFo25ZzkGh)7_>eM z;dZ}k;usrZW9lw;5p_|7Y03DMx>}VsTVe(s-7;G`Rzh9Wo6B=~^Oj9#HH&W)W)Gi9 o2=$0Y0%In_#A~9tB}d`za+J|L^mffDK)29H#4QeP`5OxvLuxuIpIMKiPZDD6+b1i9Pcrz_^VtI6B009sP@Am)z2m=5B z00IC203ZMW03fwktxHQoc$neux@3&FpoayVFo_D#vNId*sU00#T+S>}(>wZ2h`%!8 z<+@I#K@}r8tgd)3oV_He5iJM!@l6w!UH;iq1>pp)eIGZ4L6-x*F)(f#^_GALpi!Q= zzui`q$IFi|BFkH9XnAq|qYF%=CsXtXCa^b;^#MBYv!(N$4?<%^M4kwGBHNcmtve0m zr7(eD?^zGKvMHY1&~+#$RQxy33vyXWPPf(|K73O`MYh-cnZ3OpnFCwmph&`_cG|RB zz$q5C1!t962vd`BRM3Oj+PPZ>3|+Q9htLob$?EBW3vXRJB!0Pie%r^%7VKoIzm0zl z#}4DRMWY+H@D!8Ku~yKD8H?BZ`6IB=UAO_$pcNP8Wqu>CMOez!>GwZ6TK-Sqit2mi z`?>raOpqjG@6FvzlWq`RG*KO< ziIN~tkwL8ztrvmu1=JQsEZK$uz@kxPG`t5b$g^q^4m89GJ2x1^GX8~={T9<+St257 zMfo-4=B_Wm^}_+-xG;e4bOkP26C)r=nplBO4=xn+^>3b6&2{rz5%VeP7na z{;f@?M@Oe@f*e9c1bMg{RmAq_;S4=Sc}1^g%M zrmCQ;hz5eKIEnOLjhJCQnpl=A8QHpYU6)u7x`_^iHko|s7Z5fjHzK`wh1}EnBP6Zs zF`^o?wB7)P;Z_fg*^50u9tIkxF5-6)yS_3S=GCW>g7qAQ%roVH{XZXg{NW#k*u(;v zi{?(^1^!m64_z6d$VF$+zs%S2WUU)oYT_Z18i*#bB-Ms^=+D~ zpxXWvw-r^0s!5W~0(gWqQ;_6v;|LoHf4UaZF?C6$EKzAg0FxCokdXom;)Vr20EuFN z%tB56(h|Dh+W?|W+O?d>L%;z4W zCJ&X&H+F_|R(Ka!LV|PTqMKFs;1An4+VIf2Q#}~EXvH;<>JCmLe_y;jkW;eF13Th< z+>*+ksg=CFpyrcX0*&&Wf@1;4`F5Pw-*xI>8#dr^5_`PlJ-M+;Z>ty-lMKp);pcuJ zEP+rBx0!g`*GfI4GiE0OfVzn$vA;W48;OPJ)YSW9-~p_y&BY-yvt&Pwel7OhFD+`! zB)KPEN(nW?q){EBV@ohxZplSHS+Zj(7J44yP%~E)M2xxYe{Oe+&V1)~5H9GKiJ&mF z%%{O>P&&T$J5*9q;7`dBoN*ny4%G?M=8^O&bCOVq0ULzzRut{x+^EKl@Y3)rw-G6w zD#Xi=A>iV*c?u+*waldNe4pq?G}#caQ0oCm#^OaA^8O->FnM<{gzx`FdXE)+0_bA z=rzrBZYLlOEPk!L2C&e*_p1zK=3=S`Z5w#MP7+qT{U?L+C>l!;R?I??M0WAP5(+$t zpz~Sb_U-Q=@UbwKD);RiJ_z?r8|YuCqIsNiKkRsJ75dHoMDghM6OU2s$Y>Fv!THmQ zJ}uc(L{TY8R3X!Qa^BR2@Qtaez$39i274njyCgLxWrJi%oEvhm;e6`UcE=q~%i#n% z-$0t0a0JzAeQ@6wnoDHx1g>ZK0b5V`FEaF3>@d+D> z*KGxU7Y;C5LRFx%FfT6wO;TqDv_AsvKZVv1o{v@N+9GGnKxpr!Zvk|k_JiMIeF*BHMfWdZXQl|aIZFLs6hncw>Qkgk!5P~NNeOMv_Nl!@5~V-vOZG3 zqJy7<=!`Zft=>nF_IgIK*lWBAU*xtWP6&;+;ZbUL7=K(1M5X-#NA%Hzf(YL0(*@K=RE@OY46{|n|i7qi9VuL`iDzI@y8qM8xWMnj)&#@ zZ3)Lk*)LMq-KN(fNIUOhCHj7&^~WaR>k7;flmmJFJ|pVOd_RvFOpJ3WvaNEB?Y!z0 zxSdldf_U10+NxddEUEJ2xJPRYwW=pxzD9_g)j%v^xdLV3^(yY|4n^+#Wm0PCBqhek zPm9?}AZ?T2;ZO)mf|zKCjG`#zsx<}Q%9UWWYD^@j2>8x{C=utB}d`zoaNJGfy`sF)1~tST{4J!cey~sj?)MAvxheLP|nHLQ(