From 64f38a5fca9d5a5b23fedc33407b15db1fccc121 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Wed, 22 Oct 2025 16:04:17 +0000 Subject: [PATCH 01/66] feat: add HF dependencies (as a group) --- pyproject.toml | 10 +-- uv.lock | 189 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 194 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f0e3d31..4eff574 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,8 @@ dev = [ "nltk", "unidecode", "captum", - "pyarrow" + "pyarrow", + "pre-commit>=4.3.0", ] docs = [ "sphinx>=5.0.0", @@ -42,6 +43,10 @@ docs = [ "myst-parser>=0.18.0", "sphinx-design>=0.3.0" ] +hf-dep = [ + "tokenizers>=0.22.1", + "transformers>=4.57.1", +] [project.optional-dependencies] explainability = ["unidecode", "nltk", "captum"] @@ -58,6 +63,3 @@ line-length = 100 [tool.uv.build-backend] module-name="torchTextClassifiers" module-root = "" - - - diff --git a/uv.lock b/uv.lock index ee61acd..af125e0 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.11" resolution-markers = [ "python_full_version >= '3.12'", @@ -146,6 +146,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/84/ae/320161bd181fc06471eed047ecce67b693fd7515b16d495d8932db763426/certifi-2025.6.15-py3-none-any.whl", hash = "sha256:2e0c7ce7cb5d8f8634ca55d2ba7e6ec2689a2fd6537d8dec1296a477a4910057", size = 157650, upload-time = "2025-06-15T02:45:49.977Z" }, ] +[[package]] +name = "cfgv" +version = "3.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/11/74/539e56497d9bd1d484fd863dd69cbbfa653cd2aa27abfe35653494d85e94/cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560", size = 7114, upload-time = "2023-08-12T20:38:17.776Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9", size = 7249, upload-time = "2023-08-12T20:38:16.269Z" }, +] + [[package]] name = "charset-normalizer" version = "3.4.2" @@ -278,6 +287,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload-time = "2023-10-07T05:32:16.783Z" }, ] +[[package]] +name = "distlib" +version = "0.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/96/8e/709914eb2b5749865801041647dc7f4e6d00b549cfe88b65ca192995f07c/distlib-0.4.0.tar.gz", hash = "sha256:feec40075be03a04501a973d81f633735b4b69f98b05450592310c0f401a4e0d", size = 614605, upload-time = "2025-07-17T16:52:00.465Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = "2025-07-17T16:51:58.613Z" }, +] + [[package]] name = "docutils" version = "0.21.2" @@ -420,6 +438,49 @@ http = [ { name = "aiohttp" }, ] +[[package]] +name = "hf-xet" +version = "1.1.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/74/31/feeddfce1748c4a233ec1aa5b7396161c07ae1aa9b7bdbc9a72c3c7dd768/hf_xet-1.1.10.tar.gz", hash = "sha256:408aef343800a2102374a883f283ff29068055c111f003ff840733d3b715bb97", size = 487910, upload-time = "2025-09-12T20:10:27.12Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f7/a2/343e6d05de96908366bdc0081f2d8607d61200be2ac802769c4284cc65bd/hf_xet-1.1.10-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:686083aca1a6669bc85c21c0563551cbcdaa5cf7876a91f3d074a030b577231d", size = 2761466, upload-time = "2025-09-12T20:10:22.836Z" }, + { url = "https://files.pythonhosted.org/packages/31/f9/6215f948ac8f17566ee27af6430ea72045e0418ce757260248b483f4183b/hf_xet-1.1.10-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:71081925383b66b24eedff3013f8e6bbd41215c3338be4b94ba75fd75b21513b", size = 2623807, upload-time = "2025-09-12T20:10:21.118Z" }, + { url = "https://files.pythonhosted.org/packages/15/07/86397573efefff941e100367bbda0b21496ffcdb34db7ab51912994c32a2/hf_xet-1.1.10-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b6bceb6361c80c1cc42b5a7b4e3efd90e64630bcf11224dcac50ef30a47e435", size = 3186960, upload-time = "2025-09-12T20:10:19.336Z" }, + { url = "https://files.pythonhosted.org/packages/01/a7/0b2e242b918cc30e1f91980f3c4b026ff2eedaf1e2ad96933bca164b2869/hf_xet-1.1.10-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:eae7c1fc8a664e54753ffc235e11427ca61f4b0477d757cc4eb9ae374b69f09c", size = 3087167, upload-time = "2025-09-12T20:10:17.255Z" }, + { url = "https://files.pythonhosted.org/packages/4a/25/3e32ab61cc7145b11eee9d745988e2f0f4fafda81b25980eebf97d8cff15/hf_xet-1.1.10-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:0a0005fd08f002180f7a12d4e13b22be277725bc23ed0529f8add5c7a6309c06", size = 3248612, upload-time = "2025-09-12T20:10:24.093Z" }, + { url = "https://files.pythonhosted.org/packages/2c/3d/ab7109e607ed321afaa690f557a9ada6d6d164ec852fd6bf9979665dc3d6/hf_xet-1.1.10-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f900481cf6e362a6c549c61ff77468bd59d6dd082f3170a36acfef2eb6a6793f", size = 3353360, upload-time = "2025-09-12T20:10:25.563Z" }, + { url = "https://files.pythonhosted.org/packages/ee/0e/471f0a21db36e71a2f1752767ad77e92d8cde24e974e03d662931b1305ec/hf_xet-1.1.10-cp37-abi3-win_amd64.whl", hash = "sha256:5f54b19cc347c13235ae7ee98b330c26dd65ef1df47e5316ffb1e87713ca7045", size = 2804691, upload-time = "2025-09-12T20:10:28.433Z" }, +] + +[[package]] +name = "huggingface-hub" +version = "0.35.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/10/7e/a0a97de7c73671863ca6b3f61fa12518caf35db37825e43d63a70956738c/huggingface_hub-0.35.3.tar.gz", hash = "sha256:350932eaa5cc6a4747efae85126ee220e4ef1b54e29d31c3b45c5612ddf0b32a", size = 461798, upload-time = "2025-09-29T14:29:58.625Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/31/a0/651f93d154cb72323358bf2bbae3e642bdb5d2f1bfc874d096f7cb159fa0/huggingface_hub-0.35.3-py3-none-any.whl", hash = "sha256:0e3a01829c19d86d03793e4577816fe3bdfc1602ac62c7fb220d593d351224ba", size = 564262, upload-time = "2025-09-29T14:29:55.813Z" }, +] 
+ +[[package]] +name = "identify" +version = "2.6.15" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ff/e7/685de97986c916a6d93b3876139e00eef26ad5bbbd61925d670ae8013449/identify-2.6.15.tar.gz", hash = "sha256:e4f4864b96c6557ef2a1e1c951771838f4edc9df3a72ec7118b338801b11c7bf", size = 99311, upload-time = "2025-10-02T17:43:40.631Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0f/1c/e5fd8f973d4f375adb21565739498e2e9a1e54c858a97b9a8ccfdc81da9b/identify-2.6.15-py2.py3-none-any.whl", hash = "sha256:1181ef7608e00704db228516541eb83a88a9f94433a8c80bb9b5bd54b1d81757", size = 99183, upload-time = "2025-10-02T17:43:39.137Z" }, +] + [[package]] name = "idna" version = "3.10" @@ -799,6 +860,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4d/66/7d9e26593edda06e8cb531874633f7c2372279c3b0f46235539fe546df8b/nltk-3.9.1-py3-none-any.whl", hash = "sha256:4fa26829c5b00715afe3061398a8989dc643b92ce7dd93fb4585a70930d168a1", size = 1505442, upload-time = "2024-08-18T19:48:21.909Z" }, ] +[[package]] +name = "nodeenv" +version = "1.9.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/16/fc88b08840de0e0a72a2f9d8c6bae36be573e475a6326ae854bcc549fc45/nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f", size = 47437, upload-time = "2024-06-04T18:44:11.171Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" }, +] + [[package]] name = "numpy" version = "2.2.6" @@ -1089,6 +1159,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/21/2c/5e05f58658cf49b6667762cca03d6e7d85cededde2caf2ab37b81f80e574/pillow-11.2.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:208653868d5c9ecc2b327f9b9ef34e0e42a4cdd172c2988fd81d62d2bc9bc044", size = 2674751, upload-time = "2025-04-12T17:49:59.628Z" }, ] +[[package]] +name = "platformdirs" +version = "4.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/61/33/9611380c2bdb1225fdef633e2a9610622310fed35ab11dac9620972ee088/platformdirs-4.5.0.tar.gz", hash = "sha256:70ddccdd7c99fc5942e9fc25636a8b34d04c24b335100223152c2803e4063312", size = 21632, upload-time = "2025-10-08T17:44:48.791Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/73/cb/ac7874b3e5d58441674fb70742e6c374b28b0c7cb988d37d991cde47166c/platformdirs-4.5.0-py3-none-any.whl", hash = "sha256:e578a81bb873cbb89a41fcc904c7ef523cc18284b7e3b3ccf06aca1403b7ebd3", size = 18651, upload-time = "2025-10-08T17:44:47.223Z" }, +] + [[package]] name = "pluggy" version = "1.6.0" @@ -1110,6 +1189,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/2f/a4583c70fbd8cd04910e2884bcc2bdd670e884061f7b4d70bc13e632a993/pockets-0.9.1-py2.py3-none-any.whl", hash = "sha256:68597934193c08a08eb2bf6a1d85593f627c22f9b065cc727a4f03f669d96d86", size = 26263, upload-time = "2019-11-02T14:46:17.814Z" }, ] +[[package]] +name = "pre-commit" +version = "4.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cfgv" }, + { name = "identify" }, + { name = "nodeenv" }, + { name = "pyyaml" }, + { name = "virtualenv" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/ff/29/7cf5bbc236333876e4b41f56e06857a87937ce4bf91e117a6991a2dbb02a/pre_commit-4.3.0.tar.gz", hash = "sha256:499fe450cc9d42e9d58e606262795ecb64dd05438943c62b66f6a8673da30b16", size = 193792, upload-time = "2025-08-09T18:56:14.651Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5b/a5/987a405322d78a73b66e39e4a90e4ef156fd7141bf71df987e50717c321b/pre_commit-4.3.0-py2.py3-none-any.whl", hash = "sha256:2b0747ad7e6e967169136edffee14c16e148a778a54e4f967921aa1ebf2308d8", size = 220965, upload-time = "2025-08-09T18:56:13.192Z" }, +] + [[package]] name = "propcache" version = "0.3.1" @@ -1412,6 +1507,28 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/53/97/d2cbbaa10c9b826af0e10fdf836e1bf344d9f0abb873ebc34d1f49642d3f/roman_numerals_py-3.1.0-py3-none-any.whl", hash = "sha256:9da2ad2fb670bcf24e81070ceb3be72f6c11c440d73bd579fbeca1e9f330954c", size = 7742, upload-time = "2025-02-22T07:34:52.422Z" }, ] +[[package]] +name = "safetensors" +version = "0.6.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ac/cc/738f3011628920e027a11754d9cae9abec1aed00f7ae860abbf843755233/safetensors-0.6.2.tar.gz", hash = "sha256:43ff2aa0e6fa2dc3ea5524ac7ad93a9839256b8703761e76e2d0b2a3fa4f15d9", size = 197968, upload-time = "2025-08-08T13:13:58.654Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4d/b1/3f5fd73c039fc87dba3ff8b5d528bfc5a32b597fea8e7a6a4800343a17c7/safetensors-0.6.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:9c85ede8ec58f120bad982ec47746981e210492a6db876882aa021446af8ffba", size = 454797, upload-time = "2025-08-08T13:13:52.066Z" }, + { url = "https://files.pythonhosted.org/packages/8c/c9/bb114c158540ee17907ec470d01980957fdaf87b4aa07914c24eba87b9c6/safetensors-0.6.2-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d6675cf4b39c98dbd7d940598028f3742e0375a6b4d4277e76beb0c35f4b843b", size = 432206, upload-time = "2025-08-08T13:13:50.931Z" }, + { url = "https://files.pythonhosted.org/packages/d3/8e/f70c34e47df3110e8e0bb268d90db8d4be8958a54ab0336c9be4fe86dac8/safetensors-0.6.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d2d2b3ce1e2509c68932ca03ab8f20570920cd9754b05063d4368ee52833ecd", size = 473261, upload-time = "2025-08-08T13:13:41.259Z" }, + { url = "https://files.pythonhosted.org/packages/2a/f5/be9c6a7c7ef773e1996dc214e73485286df1836dbd063e8085ee1976f9cb/safetensors-0.6.2-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:93de35a18f46b0f5a6a1f9e26d91b442094f2df02e9fd7acf224cfec4238821a", size = 485117, upload-time = "2025-08-08T13:13:43.506Z" }, + { url = "https://files.pythonhosted.org/packages/c9/55/23f2d0a2c96ed8665bf17a30ab4ce5270413f4d74b6d87dd663258b9af31/safetensors-0.6.2-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:89a89b505f335640f9120fac65ddeb83e40f1fd081cb8ed88b505bdccec8d0a1", size = 616154, upload-time = "2025-08-08T13:13:45.096Z" }, + { url = "https://files.pythonhosted.org/packages/98/c6/affb0bd9ce02aa46e7acddbe087912a04d953d7a4d74b708c91b5806ef3f/safetensors-0.6.2-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc4d0d0b937e04bdf2ae6f70cd3ad51328635fe0e6214aa1fc811f3b576b3bda", size = 520713, upload-time = "2025-08-08T13:13:46.25Z" }, + { url = "https://files.pythonhosted.org/packages/fe/5d/5a514d7b88e310c8b146e2404e0dc161282e78634d9358975fd56dfd14be/safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:8045db2c872db8f4cbe3faa0495932d89c38c899c603f21e9b6486951a5ecb8f", size = 485835, upload-time = "2025-08-08T13:13:49.373Z" }, + { url = "https://files.pythonhosted.org/packages/7a/7b/4fc3b2ba62c352b2071bea9cfbad330fadda70579f617506ae1a2f129cab/safetensors-0.6.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:81e67e8bab9878bb568cffbc5f5e655adb38d2418351dc0859ccac158f753e19", size = 521503, upload-time = "2025-08-08T13:13:47.651Z" }, + { url = "https://files.pythonhosted.org/packages/5a/50/0057e11fe1f3cead9254315a6c106a16dd4b1a19cd247f7cc6414f6b7866/safetensors-0.6.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b0e4d029ab0a0e0e4fdf142b194514695b1d7d3735503ba700cf36d0fc7136ce", size = 652256, upload-time = "2025-08-08T13:13:53.167Z" }, + { url = "https://files.pythonhosted.org/packages/e9/29/473f789e4ac242593ac1656fbece6e1ecd860bb289e635e963667807afe3/safetensors-0.6.2-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:fa48268185c52bfe8771e46325a1e21d317207bcabcb72e65c6e28e9ffeb29c7", size = 747281, upload-time = "2025-08-08T13:13:54.656Z" }, + { url = "https://files.pythonhosted.org/packages/68/52/f7324aad7f2df99e05525c84d352dc217e0fa637a4f603e9f2eedfbe2c67/safetensors-0.6.2-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:d83c20c12c2d2f465997c51b7ecb00e407e5f94d7dec3ea0cc11d86f60d3fde5", size = 692286, upload-time = "2025-08-08T13:13:55.884Z" }, + { url = "https://files.pythonhosted.org/packages/ad/fe/cad1d9762868c7c5dc70c8620074df28ebb1a8e4c17d4c0cb031889c457e/safetensors-0.6.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d944cea65fad0ead848b6ec2c37cc0b197194bec228f8020054742190e9312ac", size = 655957, upload-time = "2025-08-08T13:13:57.029Z" }, + { url = "https://files.pythonhosted.org/packages/59/a7/e2158e17bbe57d104f0abbd95dff60dda916cf277c9f9663b4bf9bad8b6e/safetensors-0.6.2-cp38-abi3-win32.whl", hash = "sha256:cab75ca7c064d3911411461151cb69380c9225798a20e712b102edda2542ddb1", size = 308926, upload-time = "2025-08-08T13:14:01.095Z" }, + { url = "https://files.pythonhosted.org/packages/2c/c3/c0be1135726618dc1e28d181b8c442403d8dbb9e273fd791de2d4384bcdd/safetensors-0.6.2-cp38-abi3-win_amd64.whl", hash = "sha256:c7b214870df923cbc1593c3faee16bec59ea462758699bd3fee399d00aac072c", size = 320192, upload-time = "2025-08-08T13:13:59.467Z" }, +] + [[package]] name = "scikit-learn" version = "1.6.1" @@ -1697,6 +1814,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" }, ] +[[package]] +name = "tokenizers" +version = "0.22.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1c/46/fb6854cec3278fbfa4a75b50232c77622bc517ac886156e6afbfa4d8fc6e/tokenizers-0.22.1.tar.gz", hash = "sha256:61de6522785310a309b3407bac22d99c4db5dba349935e99e4d15ea2226af2d9", size = 363123, upload-time = "2025-09-19T09:49:23.424Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bf/33/f4b2d94ada7ab297328fc671fed209368ddb82f965ec2224eb1892674c3a/tokenizers-0.22.1-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:59fdb013df17455e5f950b4b834a7b3ee2e0271e6378ccb33aa74d178b513c73", size = 3069318, upload-time = "2025-09-19T09:49:11.848Z" }, + { url = 
"https://files.pythonhosted.org/packages/1c/58/2aa8c874d02b974990e89ff95826a4852a8b2a273c7d1b4411cdd45a4565/tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:8d4e484f7b0827021ac5f9f71d4794aaef62b979ab7608593da22b1d2e3c4edc", size = 2926478, upload-time = "2025-09-19T09:49:09.759Z" }, + { url = "https://files.pythonhosted.org/packages/1e/3b/55e64befa1e7bfea963cf4b787b2cea1011362c4193f5477047532ce127e/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19d2962dd28bc67c1f205ab180578a78eef89ac60ca7ef7cbe9635a46a56422a", size = 3256994, upload-time = "2025-09-19T09:48:56.701Z" }, + { url = "https://files.pythonhosted.org/packages/71/0b/fbfecf42f67d9b7b80fde4aabb2b3110a97fac6585c9470b5bff103a80cb/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:38201f15cdb1f8a6843e6563e6e79f4abd053394992b9bbdf5213ea3469b4ae7", size = 3153141, upload-time = "2025-09-19T09:48:59.749Z" }, + { url = "https://files.pythonhosted.org/packages/17/a9/b38f4e74e0817af8f8ef925507c63c6ae8171e3c4cb2d5d4624bf58fca69/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1cbe5454c9a15df1b3443c726063d930c16f047a3cc724b9e6e1a91140e5a21", size = 3508049, upload-time = "2025-09-19T09:49:05.868Z" }, + { url = "https://files.pythonhosted.org/packages/d2/48/dd2b3dac46bb9134a88e35d72e1aa4869579eacc1a27238f1577270773ff/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e7d094ae6312d69cc2a872b54b91b309f4f6fbce871ef28eb27b52a98e4d0214", size = 3710730, upload-time = "2025-09-19T09:49:01.832Z" }, + { url = "https://files.pythonhosted.org/packages/93/0e/ccabc8d16ae4ba84a55d41345207c1e2ea88784651a5a487547d80851398/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:afd7594a56656ace95cdd6df4cca2e4059d294c5cfb1679c57824b605556cb2f", size = 3412560, upload-time = "2025-09-19T09:49:03.867Z" }, + { url = "https://files.pythonhosted.org/packages/d0/c6/dc3a0db5a6766416c32c034286d7c2d406da1f498e4de04ab1b8959edd00/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2ef6063d7a84994129732b47e7915e8710f27f99f3a3260b8a38fc7ccd083f4", size = 3250221, upload-time = "2025-09-19T09:49:07.664Z" }, + { url = "https://files.pythonhosted.org/packages/d7/a6/2c8486eef79671601ff57b093889a345dd3d576713ef047776015dc66de7/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ba0a64f450b9ef412c98f6bcd2a50c6df6e2443b560024a09fa6a03189726879", size = 9345569, upload-time = "2025-09-19T09:49:14.214Z" }, + { url = "https://files.pythonhosted.org/packages/6b/16/32ce667f14c35537f5f605fe9bea3e415ea1b0a646389d2295ec348d5657/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:331d6d149fa9c7d632cde4490fb8bbb12337fa3a0232e77892be656464f4b446", size = 9271599, upload-time = "2025-09-19T09:49:16.639Z" }, + { url = "https://files.pythonhosted.org/packages/51/7c/a5f7898a3f6baa3fc2685c705e04c98c1094c523051c805cdd9306b8f87e/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:607989f2ea68a46cb1dfbaf3e3aabdf3f21d8748312dbeb6263d1b3b66c5010a", size = 9533862, upload-time = "2025-09-19T09:49:19.146Z" }, + { url = "https://files.pythonhosted.org/packages/36/65/7e75caea90bc73c1dd8d40438adf1a7bc26af3b8d0a6705ea190462506e1/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a0f307d490295717726598ef6fa4f24af9d484809223bbc253b201c740a06390", size = 9681250, upload-time = 
"2025-09-19T09:49:21.501Z" }, + { url = "https://files.pythonhosted.org/packages/30/2c/959dddef581b46e6209da82df3b78471e96260e2bc463f89d23b1bf0e52a/tokenizers-0.22.1-cp39-abi3-win32.whl", hash = "sha256:b5120eed1442765cd90b903bb6cfef781fd8fe64e34ccaecbae4c619b7b12a82", size = 2472003, upload-time = "2025-09-19T09:49:27.089Z" }, + { url = "https://files.pythonhosted.org/packages/b3/46/e33a8c93907b631a99377ef4c5f817ab453d0b34f93529421f42ff559671/tokenizers-0.22.1-cp39-abi3-win_amd64.whl", hash = "sha256:65fd6e3fb11ca1e78a6a93602490f134d1fdeb13bcef99389d5102ea318ed138", size = 2674684, upload-time = "2025-09-19T09:49:24.953Z" }, +] + [[package]] name = "torch" version = "2.7.0" @@ -1784,6 +1926,7 @@ dev = [ { name = "captum" }, { name = "nltk" }, { name = "pandas" }, + { name = "pre-commit" }, { name = "pyarrow" }, { name = "pytest" }, { name = "scikit-learn" }, @@ -1798,6 +1941,10 @@ docs = [ { name = "sphinx-rtd-theme" }, { name = "sphinxcontrib-napoleon" }, ] +hf-dep = [ + { name = "tokenizers" }, + { name = "transformers" }, +] [package.metadata] requires-dist = [ @@ -1816,6 +1963,7 @@ dev = [ { name = "captum" }, { name = "nltk" }, { name = "pandas" }, + { name = "pre-commit", specifier = ">=4.3.0" }, { name = "pyarrow" }, { name = "pytest", specifier = ">=8.1.1,<9" }, { name = "scikit-learn" }, @@ -1830,6 +1978,10 @@ docs = [ { name = "sphinx-rtd-theme", specifier = ">=1.2.0" }, { name = "sphinxcontrib-napoleon", specifier = ">=0.7" }, ] +hf-dep = [ + { name = "tokenizers", specifier = ">=0.22.1" }, + { name = "transformers", specifier = ">=4.57.1" }, +] [[package]] name = "tqdm" @@ -1843,6 +1995,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" }, ] +[[package]] +name = "transformers" +version = "4.57.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "regex" }, + { name = "requests" }, + { name = "safetensors" }, + { name = "tokenizers" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d6/68/a39307bcc4116a30b2106f2e689130a48de8bd8a1e635b5e1030e46fcd9e/transformers-4.57.1.tar.gz", hash = "sha256:f06c837959196c75039809636cd964b959f6604b75b8eeec6fdfc0440b89cc55", size = 10142511, upload-time = "2025-10-14T15:39:26.18Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/71/d3/c16c3b3cf7655a67db1144da94b021c200ac1303f82428f2beef6c2e72bb/transformers-4.57.1-py3-none-any.whl", hash = "sha256:b10d05da8fa67dc41644dbbf9bc45a44cb86ae33da6f9295f5fbf5b7890bd267", size = 11990925, upload-time = "2025-10-14T15:39:23.085Z" }, +] + [[package]] name = "triton" version = "3.3.0" @@ -1893,6 +2066,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" }, ] +[[package]] +name = "virtualenv" +version = "20.35.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "distlib" }, + { name = "filelock" }, + { name = "platformdirs" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/a4/d5/b0ccd381d55c8f45d46f77df6ae59fbc23d19e901e2d523395598e5f4c93/virtualenv-20.35.3.tar.gz", hash = "sha256:4f1a845d131133bdff10590489610c98c168ff99dc75d6c96853801f7f67af44", size = 6002907, upload-time = "2025-10-10T21:23:33.178Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/27/73/d9a94da0e9d470a543c1b9d3ccbceb0f59455983088e727b8a1824ed90fb/virtualenv-20.35.3-py3-none-any.whl", hash = "sha256:63d106565078d8c8d0b206d48080f938a8b25361e19432d2c9db40d2899c810a", size = 5981061, upload-time = "2025-10-10T21:23:30.433Z" }, +] + [[package]] name = "yarl" version = "1.20.0" From 3703f4869f32a28b7f15e3b9007deb0ebbc5d12f Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Wed, 22 Oct 2025 16:04:51 +0000 Subject: [PATCH 02/66] feat: add WordPiece tokenize reorder files add first tests --- tests/test_wordpiece_tokenizer.py | 11 + torchTextClassifiers/tokenizers/WordPiece.py | 112 ++++++ torchTextClassifiers/tokenizers/__init__.py | 2 + torchTextClassifiers/tokenizers/base.py | 8 + torchTextClassifiers/tokenizers/tokenizer.py | 340 +++++++++++++++++++ 5 files changed, 473 insertions(+) create mode 100644 tests/test_wordpiece_tokenizer.py create mode 100644 torchTextClassifiers/tokenizers/WordPiece.py create mode 100644 torchTextClassifiers/tokenizers/__init__.py create mode 100644 torchTextClassifiers/tokenizers/base.py create mode 100644 torchTextClassifiers/tokenizers/tokenizer.py diff --git a/tests/test_wordpiece_tokenizer.py b/tests/test_wordpiece_tokenizer.py new file mode 100644 index 0000000..20dd0ed --- /dev/null +++ b/tests/test_wordpiece_tokenizer.py @@ -0,0 +1,11 @@ +from torchTextClassifiers.tokenizers import WordPieceTokenizer + + +class TestWordPieceTokenizer: + def test_init(self): + tokenizer = WordPieceTokenizer(1000) + assert tokenizer is not None + + def test_train(self, sample_text_data): + tokenizer = WordPieceTokenizer(1000) + tokenizer.train(sample_text_data) diff --git a/torchTextClassifiers/tokenizers/WordPiece.py b/torchTextClassifiers/tokenizers/WordPiece.py new file mode 100644 index 0000000..fb77388 --- /dev/null +++ b/torchTextClassifiers/tokenizers/WordPiece.py @@ -0,0 +1,112 @@ +import logging +import os +from typing import List + +from tokenizers import ( + Tokenizer, + decoders, + models, + normalizers, + pre_tokenizers, + processors, + trainers, +) +from transformers import PreTrainedTokenizerFast + +from torchTextClassifiers.tokenizers import BaseTokenizer + +logger = logging.getLogger(__name__) + + +class WordPieceTokenizer(BaseTokenizer): + def __init__(self, vocab_size: int, trained: bool = False): + """Largely inspired by https://huggingface.co/learn/llm-course/chapter6/8""" + + self.unk_token = "[UNK]" + self.pad_token = "[PAD]" + self.cls_token = "[CLS]" + self.sep_token = "[SEP]" + self.special_tokens = [ + self.unk_token, + self.pad_token, + self.cls_token, + self.sep_token, + ] + self.vocab_size = vocab_size + + self.tokenizer = Tokenizer(models.WordPiece(unk_token=self.unk_token)) + + self.tokenizer.normalizer = normalizers.BertNormalizer( + lowercase=True + ) # NFD, lowercase, strip accents - BERT style + + self.tokenizer.pre_tokenizer = ( + pre_tokenizers.BertPreTokenizer() + ) # split on whitespace and punctuation - BERT style + self.trained = trained + + def _post_training(self): + if not self.trained: + raise RuntimeError( + "Tokenizer must be trained before applying post-training configurations." 
+ ) + + self.tokenizer.post_processor = processors.BertProcessing( + (self.cls_token, self.tokenizer.token_to_id(self.cls_token)), + (self.sep_token, self.tokenizer.token_to_id(self.sep_token)), + ) + self.tokenizer.decoder = decoders.WordPiece(prefix="##") + self.tokenizer.enable_padding(pad_id=self.tokenizer.token_to_id("[PAD]"), pad_token="[PAD]") + self.tokenizer = PreTrainedTokenizerFast(tokenizer_object=self.tokenizer) + + def tokenize(self, text: str) -> list: + if not self.trained: + raise RuntimeError("Tokenizer must be trained before tokenization.") + + return self.tokenizer.encode(text).tokens + + def train( + self, training_corpus: List[str], save_path: str = None, filesystem=None, s3_save_path=None + ): + trainer = trainers.WordPieceTrainer( + vocab_size=self.vocab_size, + special_tokens=self.special_tokens, + ) + self.tokenizer.train_from_iterator(training_corpus, trainer=trainer) + self.trained = True + self._post_training() + + if save_path: + self.tokenizer.save(save_path) + logger.info(f"💾 Tokenizer saved at {save_path}") + if filesystem and s3_save_path: + parent_dir = os.path.dirname(save_path) + if not filesystem.exists(parent_dir): + filesystem.mkdirs(parent_dir) + filesystem.put(save_path, s3_save_path) + logger.info(f"💾 Tokenizer uploaded to S3 at {s3_save_path}") + + @classmethod + def load(cls, load_path: str): + loaded_tokenizer = PreTrainedTokenizerFast(tokenizer_file=load_path) + instance = cls(vocab_size=len(loaded_tokenizer), trained=True) + instance.tokenizer = loaded_tokenizer + instance._post_training() + return instance + + @classmethod + def load_from_s3(cls, s3_path: str, filesystem): + if filesystem.exists(s3_path) is False: + raise FileNotFoundError( + f"Tokenizer not found at {s3_path}. Please train it first (see src/train_tokenizers)." + ) + + with filesystem.open(s3_path, "rb") as f: + json_str = f.read().decode("utf-8") + + tokenizer_obj = Tokenizer.from_str(json_str) + tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer_obj) + instance = cls(vocab_size=len(tokenizer), trained=True) + instance.tokenizer = tokenizer + instance._post_training() + return instance diff --git a/torchTextClassifiers/tokenizers/__init__.py b/torchTextClassifiers/tokenizers/__init__.py new file mode 100644 index 0000000..c8975bf --- /dev/null +++ b/torchTextClassifiers/tokenizers/__init__.py @@ -0,0 +1,2 @@ +from .base import BaseTokenizer as BaseTokenizer +from .WordPiece import WordPieceTokenizer as WordPieceTokenizer diff --git a/torchTextClassifiers/tokenizers/base.py b/torchTextClassifiers/tokenizers/base.py new file mode 100644 index 0000000..0e445e9 --- /dev/null +++ b/torchTextClassifiers/tokenizers/base.py @@ -0,0 +1,8 @@ +from abc import ABC, abstractmethod + + +class BaseTokenizer(ABC): + @abstractmethod + def tokenize(self, text: str) -> list: + """Tokenizes the raw input text into a list of tokens.""" + pass diff --git a/torchTextClassifiers/tokenizers/tokenizer.py b/torchTextClassifiers/tokenizers/tokenizer.py new file mode 100644 index 0000000..285bd3c --- /dev/null +++ b/torchTextClassifiers/tokenizers/tokenizer.py @@ -0,0 +1,340 @@ +""" +NGramTokenizer class. +""" + +import ctypes +import json +from typing import List, Tuple, Type + +import torch + +from ...utilities.preprocess import clean_text_feature + + +class NGramTokenizer: + """ + NGramTokenizer class. 
+ """ + + def __init__( + self, + min_count: int, + min_n: int, + max_n: int, + num_tokens: int, + len_word_ngrams: int, + training_text: List[str], + **kwargs, + ): + """ + Constructor for the NGramTokenizer class. + + Args: + min_count (int): Minimum number of times a word has to be + in the training data to be given an embedding. + min_n (int): Minimum length of character n-grams. + max_n (int): Maximum length of character n-grams. + num_tokens (int): Number of rows in the embedding matrix. + word_ngrams (int): Maximum length of word n-grams. + training_text (List[str]): List of training texts. + + Raises: + ValueError: If `min_n` is 1 or smaller. + ValueError: If `max_n` is 7 or higher. + """ + if min_n < 2: + raise ValueError("`min_n` parameter must be greater than 1.") + if max_n > 6: + raise ValueError("`max_n` parameter must be smaller than 7.") + + self.min_count = min_count + self.min_n = min_n + self.max_n = max_n + self.num_tokens = num_tokens + self.word_ngrams = len_word_ngrams + + word_counts = {} + for sentence in training_text: + for word in sentence.split(" "): + word_counts[word] = word_counts.setdefault(word, 0) + 1 + + self.word_id_mapping = {} + i = 1 + for word, counts in word_counts.items(): + if word_counts[word] >= min_count: + self.word_id_mapping[word] = i + i += 1 + self.nwords = len(self.word_id_mapping) + + self.padding_index = self.num_tokens + self.get_nwords() + + def __str__(self) -> str: + """ + Returns description of the NGramTokenizer. + + Returns: + str: Description. + """ + return f"" + + def get_nwords(self) -> int: + """ + Return number of words kept in training data. + + Returns: + int: Number of words. + """ + return self.nwords + + def get_buckets(self) -> int: + """ + Return number of buckets for tokenizer. + + Returns: + int: Number of buckets. + """ + return self.num_tokens + + @staticmethod + def get_ngram_list(word: str, n: int) -> List[str]: + """ + Return the list of character n-grams for a word with a + given n. + + Args: + word (str): Word. + n (int): Length of the n-grams. + + Returns: + List[str]: List of character n-grams. + """ + return [word[i : i + n] for i in range(len(word) - n + 1)] + + @staticmethod + def get_hash(subword: str) -> int: + """ + Return hash for a given subword. + + Args: + subword (str): Character n-gram. + + Returns: + int: Corresponding hash. + """ + h = ctypes.c_uint32(2166136261).value + for c in subword: + c = ctypes.c_int8(ord(c)).value + h = ctypes.c_uint32(h ^ c).value + h = ctypes.c_uint32(h * 16777619).value + return h + + @staticmethod + def get_word_ngram_id(hashes: Tuple[int], bucket: int, nwords: int) -> int: + """ + Get word ngram index in the embedding matrix. + + Args: + hashes (Tuple[int]): Word hashes. + bucket (int): Number of rows in embedding matrix. + nwords (int): Number of words in the vocabulary. + + Returns: + int: Word ngram hash. + """ + hashes = [ctypes.c_int32(hash_value).value for hash_value in hashes] + h = ctypes.c_uint64(hashes[0]).value + for j in range(1, len(hashes)): + h = ctypes.c_uint64((h * 116049371)).value + h = ctypes.c_uint64(h + hashes[j]).value + return h % bucket + nwords + + def get_subword_index(self, subword: str) -> int: + """ + Return the row index from the embedding matrix which + corresponds to a character n-gram. + + Args: + subword (str): Character n-gram. + + Returns: + int: Index. 
+ """ + return self.get_hash(subword) % self.num_tokens + self.nwords + + def get_word_index(self, word: str) -> int: + """ + Return the row index from the embedding matrix which + corresponds to a word. + + Args: + word (str): Word. + + Returns: + int: Index. + """ + return self.word_id_mapping[word] + + def get_subwords(self, word: str) -> Tuple[List[str], List[int]]: + """ + Return all subwords tokens and indices for a given word. + Also adds the whole word token and indice if the word is in word_id_mapping + (==> the word is in initial vocabulary + seen at least MIN_COUNT times). + Adds tags "<" and ">" to the word. + + Args: + word (str): Word. + + Returns: + Tuple[List[str], List[int]]: Tuple of tokens and indices. + """ + tokens = [] + word_with_tags = "<" + word + ">" + + # Get subwords and associated indices WITHOUT the whole word + for n in range(self.min_n, self.max_n + 1): + ngrams = self.get_ngram_list(word_with_tags, n) + tokens += [ + ngram for ngram in ngrams if ngram != word_with_tags and ngram != word + ] # Exclude the full word + + indices = [self.get_subword_index(token) for token in tokens] + assert word not in tokens + + # Add word token and indice only if the word is in word_id_mapping + if word in self.word_id_mapping.keys(): + self.get_word_index(word) + tokens = [word] + tokens + indices = [self.get_word_index(word)] + indices + + return (tokens, indices) + + def indices_matrix(self, sentence: str) -> tuple[torch.Tensor, dict, dict]: + """ + Returns an array of token indices for a text description. + + Args: + sentence (str): Text description. + + Returns: + tuple: (torch.Tensor of indices, id_to_token dict, token_to_id dict) + """ + # Pre-split the sentence once + words = sentence.split() + words.append("") # Add end of string token + + indices = [] + all_tokens_id = {} + + # Process subwords in one batch + for word in words[:-1]: # Exclude from subword processing + tokens, ind = self.get_subwords(word) + indices.extend(ind) + # Update dictionary with zip for efficiency + all_tokens_id.update(zip(tokens, ind)) + + # Add token + indices.append(0) + all_tokens_id[""] = 0 + + # Compute word n-grams more efficiently + if self.word_ngrams > 1: + # Pre-compute hashes for all words to avoid repeated computation + word_hashes = [self.get_hash(word) for word in words] + + # Generate n-grams using sliding window + word_ngram_ids = [] + for n in range(2, self.word_ngrams + 1): + for i in range(len(words) - n + 1): + # Get slice of hashes for current n-gram + gram_hashes = tuple(word_hashes[i : i + n]) + + # Compute n-gram ID + word_ngram_id = int( + self.get_word_ngram_id(gram_hashes, self.num_tokens, self.nwords) + ) + + # Store gram and its ID + gram = " ".join(words[i : i + n]) + all_tokens_id[gram] = word_ngram_id + word_ngram_ids.append(word_ngram_id) + + # Extend indices with n-gram IDs + indices.extend(word_ngram_ids) + + # Create reverse mapping once at the end + id_to_token = {v: k for k, v in all_tokens_id.items()} + + # Convert to tensor directly + return torch.tensor(indices, dtype=torch.long), id_to_token, all_tokens_id + + def tokenize(self, text: list[str], text_tokens=True, preprocess=False): + """ + Tokenize a list of sentences. + + Args: + text (list[str]): List of sentences. + text_tokens (bool): If True, return tokenized text in tokens. + preprocess (bool): If True, preprocess text. Needs unidecode library. + + Returns: + np.array: Array of indices. 
+ """ + + if preprocess: + text = clean_text_feature(text) + + tokenized_text = [] + id_to_token_dicts = [] + token_to_id_dicts = [] + for sentence in text: + all_ind, id_to_token, token_to_id = self.indices_matrix( + sentence + ) # tokenize and convert to token indices + tokenized_text.append(all_ind) + id_to_token_dicts.append(id_to_token) + token_to_id_dicts.append(token_to_id) + + if text_tokens: + tokenized_text_tokens = self._tokenized_text_in_tokens( + tokenized_text, id_to_token_dicts + ) + return tokenized_text_tokens, tokenized_text, id_to_token_dicts, token_to_id_dicts + else: + return tokenized_text, id_to_token_dicts, token_to_id_dicts + + def _tokenized_text_in_tokens(self, tokenized_text, id_to_token_dicts): + """ + Convert tokenized text in int format to tokens in str format (given a mapping dictionary). + Private method. Used in tokenizer.tokenize and pytorch_model.predict() + + Args: + tokenized_text (list): List of tokenized text in int format. + id_to_token_dicts (list[Dict]): List of dictionaries mapping token indices to tokens. + + Both lists have the same length (number of sentences). + + Returns: + list[list[str]]: List of tokenized text in str format. + + """ + + return [ + [ + id_to_token_dicts[i][token_id.item()] + for token_id in tokenized_sentence + if token_id.item() not in {self.padding_index} + ] + for i, tokenized_sentence in enumerate(tokenized_text) + ] + + def get_vocab(self): + return self.word_id_mapping + + @classmethod + def from_json(cls: Type["NGramTokenizer"], filepath: str, training_text) -> "NGramTokenizer": + """ + Load a dataclass instance from a JSON file. + """ + with open(filepath, "r") as f: + data = json.load(f) + return cls(**data, training_text=training_text) From 1266287dce00e5f104086a1ca3de97cb3192afe8 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Mon, 27 Oct 2025 15:22:07 +0000 Subject: [PATCH 03/66] chore: rename file to ngram --- torchTextClassifiers/tokenizers/{tokenizer.py => ngram.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename torchTextClassifiers/tokenizers/{tokenizer.py => ngram.py} (100%) diff --git a/torchTextClassifiers/tokenizers/tokenizer.py b/torchTextClassifiers/tokenizers/ngram.py similarity index 100% rename from torchTextClassifiers/tokenizers/tokenizer.py rename to torchTextClassifiers/tokenizers/ngram.py From d2563ea95a1b2e7618a9d34fe64ba400b5bf9417 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Mon, 27 Oct 2025 15:23:24 +0000 Subject: [PATCH 04/66] feat: improve base tokenizer, add HF abstract --- torchTextClassifiers/tokenizers/__init__.py | 1 + torchTextClassifiers/tokenizers/base.py | 39 +++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/torchTextClassifiers/tokenizers/__init__.py b/torchTextClassifiers/tokenizers/__init__.py index c8975bf..8284ad7 100644 --- a/torchTextClassifiers/tokenizers/__init__.py +++ b/torchTextClassifiers/tokenizers/__init__.py @@ -1,2 +1,3 @@ from .base import BaseTokenizer as BaseTokenizer +from .base import HuggingFaceTokenizer as HuggingFaceTokenizer from .WordPiece import WordPieceTokenizer as WordPieceTokenizer diff --git a/torchTextClassifiers/tokenizers/base.py b/torchTextClassifiers/tokenizers/base.py index 0e445e9..07de50b 100644 --- a/torchTextClassifiers/tokenizers/base.py +++ b/torchTextClassifiers/tokenizers/base.py @@ -2,7 +2,46 @@ class BaseTokenizer(ABC): + def __init__(self, vocab_size: int): + self.vocab_size = vocab_size + @abstractmethod def tokenize(self, text: str) -> list: """Tokenizes the raw input text into a 
list of tokens.""" pass + + def __len__(self): + return self.vocab_size + + +class HuggingFaceTokenizer(BaseTokenizer, ABC): + def __init__(self, vocab_size: int): + super().__init__(vocab_size) + + self.trained = False + self.tokenizer = None + + def tokenize(self, text: str) -> list: + if not self.trained: + raise RuntimeError("Tokenizer must be trained before tokenization.") + + return self.tokenizer( + text, padding=True, return_tensors="pt" + ) # method from PreTrainedTokenizerFast + + @abstractmethod + def train( + self, + training_corpus: list, + save_path: str = None, + filesystem=None, + s3_save_path=None, + **kwargs, + ): + """Trains the tokenizer on the provided training corpus.""" + pass + + @abstractmethod + def _post_training(self): + """Applies post-training configurations to the tokenizer.""" + pass From ae045abfb02717f687bdb9ab02e77e649d42afc7 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Mon, 27 Oct 2025 15:25:14 +0000 Subject: [PATCH 05/66] feat: change inheritance to HFTokenizer add tests --- tests/test_wordpiece_tokenizer.py | 8 ++++++++ torchTextClassifiers/tokenizers/WordPiece.py | 16 +++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/tests/test_wordpiece_tokenizer.py b/tests/test_wordpiece_tokenizer.py index 20dd0ed..0c29d1b 100644 --- a/tests/test_wordpiece_tokenizer.py +++ b/tests/test_wordpiece_tokenizer.py @@ -9,3 +9,11 @@ def test_init(self): def test_train(self, sample_text_data): tokenizer = WordPieceTokenizer(1000) tokenizer.train(sample_text_data) + + def test_tokenize(self, sample_text_data): + tokenizer = WordPieceTokenizer(1000) + tokenizer.train(sample_text_data) + tokens = tokenizer.tokenize(sample_text_data[0]) + tokens = tokenizer.tokenize(list(sample_text_data)) + + assert tokens is not None diff --git a/torchTextClassifiers/tokenizers/WordPiece.py b/torchTextClassifiers/tokenizers/WordPiece.py index fb77388..c943c3b 100644 --- a/torchTextClassifiers/tokenizers/WordPiece.py +++ b/torchTextClassifiers/tokenizers/WordPiece.py @@ -13,15 +13,17 @@ ) from transformers import PreTrainedTokenizerFast -from torchTextClassifiers.tokenizers import BaseTokenizer +from torchTextClassifiers.tokenizers import HuggingFaceTokenizer logger = logging.getLogger(__name__) -class WordPieceTokenizer(BaseTokenizer): +class WordPieceTokenizer(HuggingFaceTokenizer): def __init__(self, vocab_size: int, trained: bool = False): """Largely inspired by https://huggingface.co/learn/llm-course/chapter6/8""" + super().__init__(vocab_size) + self.unk_token = "[UNK]" self.pad_token = "[PAD]" self.cls_token = "[CLS]" @@ -56,14 +58,10 @@ def _post_training(self): (self.sep_token, self.tokenizer.token_to_id(self.sep_token)), ) self.tokenizer.decoder = decoders.WordPiece(prefix="##") - self.tokenizer.enable_padding(pad_id=self.tokenizer.token_to_id("[PAD]"), pad_token="[PAD]") - self.tokenizer = PreTrainedTokenizerFast(tokenizer_object=self.tokenizer) + self.padding_idx = self.tokenizer.token_to_id("[PAD]") + self.tokenizer.enable_padding(pad_id=self.padding_idx, pad_token="[PAD]") - def tokenize(self, text: str) -> list: - if not self.trained: - raise RuntimeError("Tokenizer must be trained before tokenization.") - - return self.tokenizer.encode(text).tokens + self.tokenizer = PreTrainedTokenizerFast(tokenizer_object=self.tokenizer) def train( self, training_corpus: List[str], save_path: str = None, filesystem=None, s3_save_path=None From c6eac5841ec88396856100960d68de8ac6469477 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Mon, 27 Oct 2025 
15:25:37 +0000 Subject: [PATCH 06/66] feat(dataset): init --- torchTextClassifiers/dataset/__init__.py | 1 + torchTextClassifiers/dataset/dataset.py | 113 +++++++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 torchTextClassifiers/dataset/__init__.py create mode 100644 torchTextClassifiers/dataset/dataset.py diff --git a/torchTextClassifiers/dataset/__init__.py b/torchTextClassifiers/dataset/__init__.py new file mode 100644 index 0000000..246df81 --- /dev/null +++ b/torchTextClassifiers/dataset/__init__.py @@ -0,0 +1 @@ +from .dataset import TextClassificationDataset as TextClassificationDataset diff --git a/torchTextClassifiers/dataset/dataset.py b/torchTextClassifiers/dataset/dataset.py new file mode 100644 index 0000000..49c536e --- /dev/null +++ b/torchTextClassifiers/dataset/dataset.py @@ -0,0 +1,113 @@ +import os +from typing import List, Union + +import numpy as np +import torch +from torch.utils.data import DataLoader, Dataset + +from torchTextClassifiers.tokenizers import BaseTokenizer + +os.environ["TOKENIZERS_PARALLELISM"] = "false" + + +class TextClassificationDataset(Dataset): + def __init__( + self, + texts: List[str], + categorical_variables: Union[List[List[int]], np.array, None], + tokenizer: BaseTokenizer, + labels: Union[List[int], None] = None, + **kwargs, + ): + self.categorical_variables = categorical_variables + + self.texts = texts + + if hasattr(tokenizer, "trained") and not tokenizer.trained: + raise RuntimeError( + f"Tokenizer {type(tokenizer)} must be trained before creating dataset." + ) + + self.tokenizer = tokenizer + + self.texts = texts + self.tokenizer = tokenizer + self.labels = labels + + def __len__(self): + return len(self.texts) + + def __getitem__(self, idx): + if self.labels: + return ( + self.texts[idx], + ( + self.categorical_variables[idx] + if self.categorical_variables is not None + else None + ), + self.labels[idx], + ) + else: + return ( + self.texts[idx], + ( + self.categorical_variables[idx] + if self.categorical_variables is not None + else None + ), + None, + ) + + def collate_fn(self, batch): + text, *categorical_vars, y = zip(*batch) + + if self.labels: + labels_tensor = torch.tensor(y, dtype=torch.long) + else: + labels_tensor = None + + tokenize_output = self.tokenizer.tokenize(text) + + if self.categorical_variables is not None: + categorical_tensors = torch.stack( + [ + torch.tensor(cat_var, dtype=torch.float32) + for cat_var in categorical_vars[ + 0 + ] # Access first element since zip returns tuple + ] + ) + else: + categorical_tensors = torch.empty( + len(text), 1, dtype=torch.float32, device=labels_tensor.device + ) + + return { + "input_ids": tokenize_output["input_ids"], + "attention_mask": tokenize_output["attention_mask"], + "categorical_vars": categorical_tensors, + "labels": labels_tensor, + } + + def create_dataloader( + self, + batch_size: int, + shuffle: bool = False, + drop_last: bool = False, + num_workers: int = os.cpu_count() - 1, + pin_memory: bool = True, + persistent_workers: bool = True, + **kwargs, + ) -> torch.utils.data.DataLoader: + return DataLoader( + dataset=self, + batch_size=batch_size, + collate_fn=self.collate_fn, + shuffle=shuffle, + drop_last=drop_last, + pin_memory=pin_memory, + num_workers=num_workers, + persistent_workers=persistent_workers, + **kwargs, + ) From c25eb36e5431a6eaa369299c5087901cec4a961c Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Mon, 27 Oct 2025 16:22:42 +0000 Subject: [PATCH 07/66] fix: add update of vocab size in post training --- 
torchTextClassifiers/tokenizers/WordPiece.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torchTextClassifiers/tokenizers/WordPiece.py b/torchTextClassifiers/tokenizers/WordPiece.py index c943c3b..1efb075 100644 --- a/torchTextClassifiers/tokenizers/WordPiece.py +++ b/torchTextClassifiers/tokenizers/WordPiece.py @@ -62,6 +62,7 @@ def _post_training(self): self.tokenizer.enable_padding(pad_id=self.padding_idx, pad_token="[PAD]") self.tokenizer = PreTrainedTokenizerFast(tokenizer_object=self.tokenizer) + self.vocab_size = len(self.tokenizer) def train( self, training_corpus: List[str], save_path: str = None, filesystem=None, s3_save_path=None From d897befc311c8a410e794cce0d9f5f66d4b2c442 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Mon, 27 Oct 2025 16:23:16 +0000 Subject: [PATCH 08/66] fix: categorical tensors set to None instead of empty tensors when no cat var --- torchTextClassifiers/dataset/dataset.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/torchTextClassifiers/dataset/dataset.py b/torchTextClassifiers/dataset/dataset.py index 49c536e..c185a8f 100644 --- a/torchTextClassifiers/dataset/dataset.py +++ b/torchTextClassifiers/dataset/dataset.py @@ -17,7 +17,6 @@ def __init__( categorical_variables: Union[List[List[int]], np.array, None], tokenizer: BaseTokenizer, labels: Union[List[int], None] = None, - **kwargs, ): self.categorical_variables = categorical_variables @@ -79,9 +78,7 @@ def collate_fn(self, batch): ] ) else: - categorical_tensors = torch.empty( - len(text), 1, dtype=torch.float32, device=labels_tensor.device - ) + categorical_tensors = None return { "input_ids": tokenize_output["input_ids"], From 51be1d139265d899765cafbe3a4537a3c49bbbc8 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Fri, 31 Oct 2025 15:53:14 +0000 Subject: [PATCH 09/66] feat: add ruff and datasets dep --- pyproject.toml | 2 + uv.lock | 329 +++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 291 insertions(+), 40 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4eff574..9244177 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ dev = [ "captum", "pyarrow", "pre-commit>=4.3.0", + "ruff>=0.14.3", ] docs = [ "sphinx>=5.0.0", @@ -46,6 +47,7 @@ docs = [ hf-dep = [ "tokenizers>=0.22.1", "transformers>=4.57.1", + "datasets>=4.3.0", ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index af125e0..33cc712 100644 --- a/uv.lock +++ b/uv.lock @@ -104,6 +104,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7e/b3/6b4067be973ae96ba0d615946e314c5ae35f9f993eca561b356540bb0c2b/alabaster-1.0.0-py3-none-any.whl", hash = "sha256:fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b", size = 13929, upload-time = "2024-07-26T18:15:02.05Z" }, ] +[[package]] +name = "anyio" +version = "4.11.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "sniffio" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c6/78/7d432127c41b50bccba979505f272c16cbcadcc33645d5fa3a738110ae75/anyio-4.11.0.tar.gz", hash = "sha256:82a8d0b81e318cc5ce71a5f1f8b5c4e63619620b63141ef8c995fa0db95a57c4", size = 219094, upload-time = "2025-09-23T09:19:12.58Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/15/b3/9b1a8074496371342ec1e796a96f99c82c945a339cd81a8e73de28b4cf9e/anyio-4.11.0-py3-none-any.whl", hash = 
"sha256:0287e96f4d26d4149305414d4e3bc32f0dcd0862365a4bddea19d7a1ec38c4fc", size = 109097, upload-time = "2025-09-23T09:19:10.601Z" }, +] + [[package]] name = "attrs" version = "25.3.0" @@ -287,6 +301,40 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload-time = "2023-10-07T05:32:16.783Z" }, ] +[[package]] +name = "datasets" +version = "4.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dill" }, + { name = "filelock" }, + { name = "fsspec", extra = ["http"] }, + { name = "httpx" }, + { name = "huggingface-hub" }, + { name = "multiprocess" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pandas" }, + { name = "pyarrow" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "xxhash" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2a/47/325206ac160f7699ed9f1798afa8f8f8d5189b03bf3815654859ac1d5cba/datasets-4.3.0.tar.gz", hash = "sha256:bc9118ed9afd92346c5be7ed3aaa00177eb907c25467f9d072a0d22777efbd2b", size = 582801, upload-time = "2025-10-23T16:31:51.547Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ca/51/409a8184ed35453d9cbb3d6b20d524b1115c2c2d117b85d5e9b06cd70b45/datasets-4.3.0-py3-none-any.whl", hash = "sha256:0ea157e72138b3ca6c7d2415f19a164ecf7d4c4fa72da2a570da286882e96903", size = 506846, upload-time = "2025-10-23T16:31:49.965Z" }, +] + +[[package]] +name = "dill" +version = "0.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/12/80/630b4b88364e9a8c8c5797f4602d0f76ef820909ee32f0bacb9f90654042/dill-0.4.0.tar.gz", hash = "sha256:0633f1d2df477324f53a895b02c901fb961bdbf65a17122586ea7019292cbcf0", size = 186976, upload-time = "2025-04-16T00:41:48.867Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/50/3d/9373ad9c56321fdab5b41197068e1d8c25883b3fea29dd361f9b55116869/dill-0.4.0-py3-none-any.whl", hash = "sha256:44f54bf6412c2c8464c14e8243eb163690a9800dbe2c367330883b19c7561049", size = 119668, upload-time = "2025-04-16T00:41:47.671Z" }, +] + [[package]] name = "distlib" version = "0.4.0" @@ -438,6 +486,15 @@ http = [ { name = "aiohttp" }, ] +[[package]] +name = "h11" +version = "0.16.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, +] + [[package]] name = "hf-xet" version = "1.1.10" @@ -453,6 +510,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/0e/471f0a21db36e71a2f1752767ad77e92d8cde24e974e03d662931b1305ec/hf_xet-1.1.10-cp37-abi3-win_amd64.whl", hash = "sha256:5f54b19cc347c13235ae7ee98b330c26dd65ef1df47e5316ffb1e87713ca7045", size = 2804691, upload-time = "2025-09-12T20:10:28.433Z" }, ] +[[package]] +name = "httpcore" +version = "1.0.9" +source = { registry = "https://pypi.org/simple" } 
+dependencies = [ + { name = "certifi" }, + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" }, +] + +[[package]] +name = "httpx" +version = "0.28.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "certifi" }, + { name = "httpcore" }, + { name = "idna" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, +] + [[package]] name = "huggingface-hub" version = "0.35.3" @@ -819,6 +904,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/84/5d/e17845bb0fa76334477d5de38654d27946d5b5d3695443987a094a71b440/multidict-6.4.4-py3-none-any.whl", hash = "sha256:bd4557071b561a8b3b6075c3ce93cf9bfb6182cb241805c3d66ced3b75eff4ac", size = 10481, upload-time = "2025-05-19T14:16:36.024Z" }, ] +[[package]] +name = "multiprocess" +version = "0.70.16" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dill" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b5/ae/04f39c5d0d0def03247c2893d6f2b83c136bf3320a2154d7b8858f2ba72d/multiprocess-0.70.16.tar.gz", hash = "sha256:161af703d4652a0e1410be6abccecde4a7ddffd19341be0a7011b94aeb171ac1", size = 1772603, upload-time = "2024-01-28T18:52:34.85Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/f7/7ec7fddc92e50714ea3745631f79bd9c96424cb2702632521028e57d3a36/multiprocess-0.70.16-py310-none-any.whl", hash = "sha256:c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02", size = 134824, upload-time = "2024-01-28T18:52:26.062Z" }, + { url = "https://files.pythonhosted.org/packages/50/15/b56e50e8debaf439f44befec5b2af11db85f6e0f344c3113ae0be0593a91/multiprocess-0.70.16-py311-none-any.whl", hash = "sha256:af4cabb0dac72abfb1e794fa7855c325fd2b55a10a44628a3c1ad3311c04127a", size = 143519, upload-time = "2024-01-28T18:52:28.115Z" }, + { url = "https://files.pythonhosted.org/packages/0a/7d/a988f258104dcd2ccf1ed40fdc97e26c4ac351eeaf81d76e266c52d84e2f/multiprocess-0.70.16-py312-none-any.whl", hash = "sha256:fc0544c531920dde3b00c29863377f87e1632601092ea2daca74e4beb40faa2e", size = 146741, upload-time = "2024-01-28T18:52:29.395Z" }, + { url = "https://files.pythonhosted.org/packages/ea/89/38df130f2c799090c978b366cfdf5b96d08de5b29a4a293df7f7429fa50b/multiprocess-0.70.16-py38-none-any.whl", hash = "sha256:a71d82033454891091a226dfc319d0cfa8019a4e888ef9ca910372a446de4435", size = 132628, upload-time = "2024-01-28T18:52:30.853Z" }, + { url = 
"https://files.pythonhosted.org/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl", hash = "sha256:a0bafd3ae1b732eac64be2e72038231c1ba97724b60b09400d68f229fcc2fbf3", size = 133351, upload-time = "2024-01-28T18:52:31.981Z" }, +] + [[package]] name = "myst-parser" version = "4.0.1" @@ -1280,46 +1381,52 @@ wheels = [ [[package]] name = "pyarrow" -version = "20.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a2/ee/a7810cb9f3d6e9238e61d312076a9859bf3668fd21c69744de9532383912/pyarrow-20.0.0.tar.gz", hash = "sha256:febc4a913592573c8d5805091a6c2b5064c8bd6e002131f01061797d91c783c1", size = 1125187, upload-time = "2025-04-27T12:34:23.264Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/47/a2/b7930824181ceadd0c63c1042d01fa4ef63eee233934826a7a2a9af6e463/pyarrow-20.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:24ca380585444cb2a31324c546a9a56abbe87e26069189e14bdba19c86c049f0", size = 30856035, upload-time = "2025-04-27T12:28:40.78Z" }, - { url = "https://files.pythonhosted.org/packages/9b/18/c765770227d7f5bdfa8a69f64b49194352325c66a5c3bb5e332dfd5867d9/pyarrow-20.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:95b330059ddfdc591a3225f2d272123be26c8fa76e8c9ee1a77aad507361cfdb", size = 32309552, upload-time = "2025-04-27T12:28:47.051Z" }, - { url = "https://files.pythonhosted.org/packages/44/fb/dfb2dfdd3e488bb14f822d7335653092dde150cffc2da97de6e7500681f9/pyarrow-20.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f0fb1041267e9968c6d0d2ce3ff92e3928b243e2b6d11eeb84d9ac547308232", size = 41334704, upload-time = "2025-04-27T12:28:55.064Z" }, - { url = "https://files.pythonhosted.org/packages/58/0d/08a95878d38808051a953e887332d4a76bc06c6ee04351918ee1155407eb/pyarrow-20.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8ff87cc837601532cc8242d2f7e09b4e02404de1b797aee747dd4ba4bd6313f", size = 42399836, upload-time = "2025-04-27T12:29:02.13Z" }, - { url = "https://files.pythonhosted.org/packages/f3/cd/efa271234dfe38f0271561086eedcad7bc0f2ddd1efba423916ff0883684/pyarrow-20.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:7a3a5dcf54286e6141d5114522cf31dd67a9e7c9133d150799f30ee302a7a1ab", size = 40711789, upload-time = "2025-04-27T12:29:09.951Z" }, - { url = "https://files.pythonhosted.org/packages/46/1f/7f02009bc7fc8955c391defee5348f510e589a020e4b40ca05edcb847854/pyarrow-20.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a6ad3e7758ecf559900261a4df985662df54fb7fdb55e8e3b3aa99b23d526b62", size = 42301124, upload-time = "2025-04-27T12:29:17.187Z" }, - { url = "https://files.pythonhosted.org/packages/4f/92/692c562be4504c262089e86757a9048739fe1acb4024f92d39615e7bab3f/pyarrow-20.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6bb830757103a6cb300a04610e08d9636f0cd223d32f388418ea893a3e655f1c", size = 42916060, upload-time = "2025-04-27T12:29:24.253Z" }, - { url = "https://files.pythonhosted.org/packages/a4/ec/9f5c7e7c828d8e0a3c7ef50ee62eca38a7de2fa6eb1b8fa43685c9414fef/pyarrow-20.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:96e37f0766ecb4514a899d9a3554fadda770fb57ddf42b63d80f14bc20aa7db3", size = 44547640, upload-time = "2025-04-27T12:29:32.782Z" }, - { url = "https://files.pythonhosted.org/packages/54/96/46613131b4727f10fd2ffa6d0d6f02efcc09a0e7374eff3b5771548aa95b/pyarrow-20.0.0-cp311-cp311-win_amd64.whl", hash = 
"sha256:3346babb516f4b6fd790da99b98bed9708e3f02e734c84971faccb20736848dc", size = 25781491, upload-time = "2025-04-27T12:29:38.464Z" }, - { url = "https://files.pythonhosted.org/packages/a1/d6/0c10e0d54f6c13eb464ee9b67a68b8c71bcf2f67760ef5b6fbcddd2ab05f/pyarrow-20.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:75a51a5b0eef32727a247707d4755322cb970be7e935172b6a3a9f9ae98404ba", size = 30815067, upload-time = "2025-04-27T12:29:44.384Z" }, - { url = "https://files.pythonhosted.org/packages/7e/e2/04e9874abe4094a06fd8b0cbb0f1312d8dd7d707f144c2ec1e5e8f452ffa/pyarrow-20.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:211d5e84cecc640c7a3ab900f930aaff5cd2702177e0d562d426fb7c4f737781", size = 32297128, upload-time = "2025-04-27T12:29:52.038Z" }, - { url = "https://files.pythonhosted.org/packages/31/fd/c565e5dcc906a3b471a83273039cb75cb79aad4a2d4a12f76cc5ae90a4b8/pyarrow-20.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ba3cf4182828be7a896cbd232aa8dd6a31bd1f9e32776cc3796c012855e1199", size = 41334890, upload-time = "2025-04-27T12:29:59.452Z" }, - { url = "https://files.pythonhosted.org/packages/af/a9/3bdd799e2c9b20c1ea6dc6fa8e83f29480a97711cf806e823f808c2316ac/pyarrow-20.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c3a01f313ffe27ac4126f4c2e5ea0f36a5fc6ab51f8726cf41fee4b256680bd", size = 42421775, upload-time = "2025-04-27T12:30:06.875Z" }, - { url = "https://files.pythonhosted.org/packages/10/f7/da98ccd86354c332f593218101ae56568d5dcedb460e342000bd89c49cc1/pyarrow-20.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:a2791f69ad72addd33510fec7bb14ee06c2a448e06b649e264c094c5b5f7ce28", size = 40687231, upload-time = "2025-04-27T12:30:13.954Z" }, - { url = "https://files.pythonhosted.org/packages/bb/1b/2168d6050e52ff1e6cefc61d600723870bf569cbf41d13db939c8cf97a16/pyarrow-20.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:4250e28a22302ce8692d3a0e8ec9d9dde54ec00d237cff4dfa9c1fbf79e472a8", size = 42295639, upload-time = "2025-04-27T12:30:21.949Z" }, - { url = "https://files.pythonhosted.org/packages/b2/66/2d976c0c7158fd25591c8ca55aee026e6d5745a021915a1835578707feb3/pyarrow-20.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:89e030dc58fc760e4010148e6ff164d2f44441490280ef1e97a542375e41058e", size = 42908549, upload-time = "2025-04-27T12:30:29.551Z" }, - { url = "https://files.pythonhosted.org/packages/31/a9/dfb999c2fc6911201dcbf348247f9cc382a8990f9ab45c12eabfd7243a38/pyarrow-20.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6102b4864d77102dbbb72965618e204e550135a940c2534711d5ffa787df2a5a", size = 44557216, upload-time = "2025-04-27T12:30:36.977Z" }, - { url = "https://files.pythonhosted.org/packages/a0/8e/9adee63dfa3911be2382fb4d92e4b2e7d82610f9d9f668493bebaa2af50f/pyarrow-20.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:96d6a0a37d9c98be08f5ed6a10831d88d52cac7b13f5287f1e0f625a0de8062b", size = 25660496, upload-time = "2025-04-27T12:30:42.809Z" }, - { url = "https://files.pythonhosted.org/packages/9b/aa/daa413b81446d20d4dad2944110dcf4cf4f4179ef7f685dd5a6d7570dc8e/pyarrow-20.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:a15532e77b94c61efadde86d10957950392999503b3616b2ffcef7621a002893", size = 30798501, upload-time = "2025-04-27T12:30:48.351Z" }, - { url = "https://files.pythonhosted.org/packages/ff/75/2303d1caa410925de902d32ac215dc80a7ce7dd8dfe95358c165f2adf107/pyarrow-20.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = 
"sha256:dd43f58037443af715f34f1322c782ec463a3c8a94a85fdb2d987ceb5658e061", size = 32277895, upload-time = "2025-04-27T12:30:55.238Z" }, - { url = "https://files.pythonhosted.org/packages/92/41/fe18c7c0b38b20811b73d1bdd54b1fccba0dab0e51d2048878042d84afa8/pyarrow-20.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa0d288143a8585806e3cc7c39566407aab646fb9ece164609dac1cfff45f6ae", size = 41327322, upload-time = "2025-04-27T12:31:05.587Z" }, - { url = "https://files.pythonhosted.org/packages/da/ab/7dbf3d11db67c72dbf36ae63dcbc9f30b866c153b3a22ef728523943eee6/pyarrow-20.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6953f0114f8d6f3d905d98e987d0924dabce59c3cda380bdfaa25a6201563b4", size = 42411441, upload-time = "2025-04-27T12:31:15.675Z" }, - { url = "https://files.pythonhosted.org/packages/90/c3/0c7da7b6dac863af75b64e2f827e4742161128c350bfe7955b426484e226/pyarrow-20.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:991f85b48a8a5e839b2128590ce07611fae48a904cae6cab1f089c5955b57eb5", size = 40677027, upload-time = "2025-04-27T12:31:24.631Z" }, - { url = "https://files.pythonhosted.org/packages/be/27/43a47fa0ff9053ab5203bb3faeec435d43c0d8bfa40179bfd076cdbd4e1c/pyarrow-20.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:97c8dc984ed09cb07d618d57d8d4b67a5100a30c3818c2fb0b04599f0da2de7b", size = 42281473, upload-time = "2025-04-27T12:31:31.311Z" }, - { url = "https://files.pythonhosted.org/packages/bc/0b/d56c63b078876da81bbb9ba695a596eabee9b085555ed12bf6eb3b7cab0e/pyarrow-20.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9b71daf534f4745818f96c214dbc1e6124d7daf059167330b610fc69b6f3d3e3", size = 42893897, upload-time = "2025-04-27T12:31:39.406Z" }, - { url = "https://files.pythonhosted.org/packages/92/ac/7d4bd020ba9145f354012838692d48300c1b8fe5634bfda886abcada67ed/pyarrow-20.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e8b88758f9303fa5a83d6c90e176714b2fd3852e776fc2d7e42a22dd6c2fb368", size = 44543847, upload-time = "2025-04-27T12:31:45.997Z" }, - { url = "https://files.pythonhosted.org/packages/9d/07/290f4abf9ca702c5df7b47739c1b2c83588641ddfa2cc75e34a301d42e55/pyarrow-20.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:30b3051b7975801c1e1d387e17c588d8ab05ced9b1e14eec57915f79869b5031", size = 25653219, upload-time = "2025-04-27T12:31:54.11Z" }, - { url = "https://files.pythonhosted.org/packages/95/df/720bb17704b10bd69dde086e1400b8eefb8f58df3f8ac9cff6c425bf57f1/pyarrow-20.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:ca151afa4f9b7bc45bcc791eb9a89e90a9eb2772767d0b1e5389609c7d03db63", size = 30853957, upload-time = "2025-04-27T12:31:59.215Z" }, - { url = "https://files.pythonhosted.org/packages/d9/72/0d5f875efc31baef742ba55a00a25213a19ea64d7176e0fe001c5d8b6e9a/pyarrow-20.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:4680f01ecd86e0dd63e39eb5cd59ef9ff24a9d166db328679e36c108dc993d4c", size = 32247972, upload-time = "2025-04-27T12:32:05.369Z" }, - { url = "https://files.pythonhosted.org/packages/d5/bc/e48b4fa544d2eea72f7844180eb77f83f2030b84c8dad860f199f94307ed/pyarrow-20.0.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f4c8534e2ff059765647aa69b75d6543f9fef59e2cd4c6d18015192565d2b70", size = 41256434, upload-time = "2025-04-27T12:32:11.814Z" }, - { url = "https://files.pythonhosted.org/packages/c3/01/974043a29874aa2cf4f87fb07fd108828fc7362300265a2a64a94965e35b/pyarrow-20.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:3e1f8a47f4b4ae4c69c4d702cfbdfe4d41e18e5c7ef6f1bb1c50918c1e81c57b", size = 42353648, upload-time = "2025-04-27T12:32:20.766Z" }, - { url = "https://files.pythonhosted.org/packages/68/95/cc0d3634cde9ca69b0e51cbe830d8915ea32dda2157560dda27ff3b3337b/pyarrow-20.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:a1f60dc14658efaa927f8214734f6a01a806d7690be4b3232ba526836d216122", size = 40619853, upload-time = "2025-04-27T12:32:28.1Z" }, - { url = "https://files.pythonhosted.org/packages/29/c2/3ad40e07e96a3e74e7ed7cc8285aadfa84eb848a798c98ec0ad009eb6bcc/pyarrow-20.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:204a846dca751428991346976b914d6d2a82ae5b8316a6ed99789ebf976551e6", size = 42241743, upload-time = "2025-04-27T12:32:35.792Z" }, - { url = "https://files.pythonhosted.org/packages/eb/cb/65fa110b483339add6a9bc7b6373614166b14e20375d4daa73483755f830/pyarrow-20.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:f3b117b922af5e4c6b9a9115825726cac7d8b1421c37c2b5e24fbacc8930612c", size = 42839441, upload-time = "2025-04-27T12:32:46.64Z" }, - { url = "https://files.pythonhosted.org/packages/98/7b/f30b1954589243207d7a0fbc9997401044bf9a033eec78f6cb50da3f304a/pyarrow-20.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e724a3fd23ae5b9c010e7be857f4405ed5e679db5c93e66204db1a69f733936a", size = 44503279, upload-time = "2025-04-27T12:32:56.503Z" }, - { url = "https://files.pythonhosted.org/packages/37/40/ad395740cd641869a13bcf60851296c89624662575621968dcfafabaa7f6/pyarrow-20.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:82f1ee5133bd8f49d31be1299dc07f585136679666b502540db854968576faf9", size = 25944982, upload-time = "2025-04-27T12:33:04.72Z" }, +version = "22.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/30/53/04a7fdc63e6056116c9ddc8b43bc28c12cdd181b85cbeadb79278475f3ae/pyarrow-22.0.0.tar.gz", hash = "sha256:3d600dc583260d845c7d8a6db540339dd883081925da2bd1c5cb808f720b3cd9", size = 1151151, upload-time = "2025-10-24T12:30:00.762Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/b7/18f611a8cdc43417f9394a3ccd3eace2f32183c08b9eddc3d17681819f37/pyarrow-22.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:3e294c5eadfb93d78b0763e859a0c16d4051fc1c5231ae8956d61cb0b5666f5a", size = 34272022, upload-time = "2025-10-24T10:04:28.973Z" }, + { url = "https://files.pythonhosted.org/packages/26/5c/f259e2526c67eb4b9e511741b19870a02363a47a35edbebc55c3178db22d/pyarrow-22.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:69763ab2445f632d90b504a815a2a033f74332997052b721002298ed6de40f2e", size = 35995834, upload-time = "2025-10-24T10:04:35.467Z" }, + { url = "https://files.pythonhosted.org/packages/50/8d/281f0f9b9376d4b7f146913b26fac0aa2829cd1ee7e997f53a27411bbb92/pyarrow-22.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:b41f37cabfe2463232684de44bad753d6be08a7a072f6a83447eeaf0e4d2a215", size = 45030348, upload-time = "2025-10-24T10:04:43.366Z" }, + { url = "https://files.pythonhosted.org/packages/f5/e5/53c0a1c428f0976bf22f513d79c73000926cb00b9c138d8e02daf2102e18/pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:35ad0f0378c9359b3f297299c3309778bb03b8612f987399a0333a560b43862d", size = 47699480, upload-time = "2025-10-24T10:04:51.486Z" }, + { url = "https://files.pythonhosted.org/packages/95/e1/9dbe4c465c3365959d183e6345d0a8d1dc5b02ca3f8db4760b3bc834cf25/pyarrow-22.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:8382ad21458075c2e66a82a29d650f963ce51c7708c7c0ff313a8c206c4fd5e8", size = 48011148, upload-time = "2025-10-24T10:04:59.585Z" }, + { url = "https://files.pythonhosted.org/packages/c5/b4/7caf5d21930061444c3cf4fa7535c82faf5263e22ce43af7c2759ceb5b8b/pyarrow-22.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1a812a5b727bc09c3d7ea072c4eebf657c2f7066155506ba31ebf4792f88f016", size = 50276964, upload-time = "2025-10-24T10:05:08.175Z" }, + { url = "https://files.pythonhosted.org/packages/ae/f3/cec89bd99fa3abf826f14d4e53d3d11340ce6f6af4d14bdcd54cd83b6576/pyarrow-22.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:ec5d40dd494882704fb876c16fa7261a69791e784ae34e6b5992e977bd2e238c", size = 28106517, upload-time = "2025-10-24T10:05:14.314Z" }, + { url = "https://files.pythonhosted.org/packages/af/63/ba23862d69652f85b615ca14ad14f3bcfc5bf1b99ef3f0cd04ff93fdad5a/pyarrow-22.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:bea79263d55c24a32b0d79c00a1c58bb2ee5f0757ed95656b01c0fb310c5af3d", size = 34211578, upload-time = "2025-10-24T10:05:21.583Z" }, + { url = "https://files.pythonhosted.org/packages/b1/d0/f9ad86fe809efd2bcc8be32032fa72e8b0d112b01ae56a053006376c5930/pyarrow-22.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:12fe549c9b10ac98c91cf791d2945e878875d95508e1a5d14091a7aaa66d9cf8", size = 35989906, upload-time = "2025-10-24T10:05:29.485Z" }, + { url = "https://files.pythonhosted.org/packages/b4/a8/f910afcb14630e64d673f15904ec27dd31f1e009b77033c365c84e8c1e1d/pyarrow-22.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:334f900ff08ce0423407af97e6c26ad5d4e3b0763645559ece6fbf3747d6a8f5", size = 45021677, upload-time = "2025-10-24T10:05:38.274Z" }, + { url = "https://files.pythonhosted.org/packages/13/95/aec81f781c75cd10554dc17a25849c720d54feafb6f7847690478dcf5ef8/pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:c6c791b09c57ed76a18b03f2631753a4960eefbbca80f846da8baefc6491fcfe", size = 47726315, upload-time = "2025-10-24T10:05:47.314Z" }, + { url = "https://files.pythonhosted.org/packages/bb/d4/74ac9f7a54cfde12ee42734ea25d5a3c9a45db78f9def949307a92720d37/pyarrow-22.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c3200cb41cdbc65156e5f8c908d739b0dfed57e890329413da2748d1a2cd1a4e", size = 47990906, upload-time = "2025-10-24T10:05:58.254Z" }, + { url = "https://files.pythonhosted.org/packages/2e/71/fedf2499bf7a95062eafc989ace56572f3343432570e1c54e6599d5b88da/pyarrow-22.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ac93252226cf288753d8b46280f4edf3433bf9508b6977f8dd8526b521a1bbb9", size = 50306783, upload-time = "2025-10-24T10:06:08.08Z" }, + { url = "https://files.pythonhosted.org/packages/68/ed/b202abd5a5b78f519722f3d29063dda03c114711093c1995a33b8e2e0f4b/pyarrow-22.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:44729980b6c50a5f2bfcc2668d36c569ce17f8b17bccaf470c4313dcbbf13c9d", size = 27972883, upload-time = "2025-10-24T10:06:14.204Z" }, + { url = "https://files.pythonhosted.org/packages/a6/d6/d0fac16a2963002fc22c8fa75180a838737203d558f0ed3b564c4a54eef5/pyarrow-22.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:e6e95176209257803a8b3d0394f21604e796dadb643d2f7ca21b66c9c0b30c9a", size = 34204629, upload-time = "2025-10-24T10:06:20.274Z" }, + { url = "https://files.pythonhosted.org/packages/c6/9c/1d6357347fbae062ad3f17082f9ebc29cc733321e892c0d2085f42a2212b/pyarrow-22.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:001ea83a58024818826a9e3f89bf9310a114f7e26dfe404a4c32686f97bd7901", size = 35985783, upload-time = 
"2025-10-24T10:06:27.301Z" }, + { url = "https://files.pythonhosted.org/packages/ff/c0/782344c2ce58afbea010150df07e3a2f5fdad299cd631697ae7bd3bac6e3/pyarrow-22.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:ce20fe000754f477c8a9125543f1936ea5b8867c5406757c224d745ed033e691", size = 45020999, upload-time = "2025-10-24T10:06:35.387Z" }, + { url = "https://files.pythonhosted.org/packages/1b/8b/5362443737a5307a7b67c1017c42cd104213189b4970bf607e05faf9c525/pyarrow-22.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:e0a15757fccb38c410947df156f9749ae4a3c89b2393741a50521f39a8cf202a", size = 47724601, upload-time = "2025-10-24T10:06:43.551Z" }, + { url = "https://files.pythonhosted.org/packages/69/4d/76e567a4fc2e190ee6072967cb4672b7d9249ac59ae65af2d7e3047afa3b/pyarrow-22.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cedb9dd9358e4ea1d9bce3665ce0797f6adf97ff142c8e25b46ba9cdd508e9b6", size = 48001050, upload-time = "2025-10-24T10:06:52.284Z" }, + { url = "https://files.pythonhosted.org/packages/01/5e/5653f0535d2a1aef8223cee9d92944cb6bccfee5cf1cd3f462d7cb022790/pyarrow-22.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:252be4a05f9d9185bb8c18e83764ebcfea7185076c07a7a662253af3a8c07941", size = 50307877, upload-time = "2025-10-24T10:07:02.405Z" }, + { url = "https://files.pythonhosted.org/packages/2d/f8/1d0bd75bf9328a3b826e24a16e5517cd7f9fbf8d34a3184a4566ef5a7f29/pyarrow-22.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:a4893d31e5ef780b6edcaf63122df0f8d321088bb0dee4c8c06eccb1ca28d145", size = 27977099, upload-time = "2025-10-24T10:08:07.259Z" }, + { url = "https://files.pythonhosted.org/packages/90/81/db56870c997805bf2b0f6eeeb2d68458bf4654652dccdcf1bf7a42d80903/pyarrow-22.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:f7fe3dbe871294ba70d789be16b6e7e52b418311e166e0e3cba9522f0f437fb1", size = 34336685, upload-time = "2025-10-24T10:07:11.47Z" }, + { url = "https://files.pythonhosted.org/packages/1c/98/0727947f199aba8a120f47dfc229eeb05df15bcd7a6f1b669e9f882afc58/pyarrow-22.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:ba95112d15fd4f1105fb2402c4eab9068f0554435e9b7085924bcfaac2cc306f", size = 36032158, upload-time = "2025-10-24T10:07:18.626Z" }, + { url = "https://files.pythonhosted.org/packages/96/b4/9babdef9c01720a0785945c7cf550e4acd0ebcd7bdd2e6f0aa7981fa85e2/pyarrow-22.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:c064e28361c05d72eed8e744c9605cbd6d2bb7481a511c74071fd9b24bc65d7d", size = 44892060, upload-time = "2025-10-24T10:07:26.002Z" }, + { url = "https://files.pythonhosted.org/packages/f8/ca/2f8804edd6279f78a37062d813de3f16f29183874447ef6d1aadbb4efa0f/pyarrow-22.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:6f9762274496c244d951c819348afbcf212714902742225f649cf02823a6a10f", size = 47504395, upload-time = "2025-10-24T10:07:34.09Z" }, + { url = "https://files.pythonhosted.org/packages/b9/f0/77aa5198fd3943682b2e4faaf179a674f0edea0d55d326d83cb2277d9363/pyarrow-22.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a9d9ffdc2ab696f6b15b4d1f7cec6658e1d788124418cb30030afbae31c64746", size = 48066216, upload-time = "2025-10-24T10:07:43.528Z" }, + { url = "https://files.pythonhosted.org/packages/79/87/a1937b6e78b2aff18b706d738c9e46ade5bfcf11b294e39c87706a0089ac/pyarrow-22.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ec1a15968a9d80da01e1d30349b2b0d7cc91e96588ee324ce1b5228175043e95", size = 50288552, upload-time = "2025-10-24T10:07:53.519Z" }, + { url = 
"https://files.pythonhosted.org/packages/60/ae/b5a5811e11f25788ccfdaa8f26b6791c9807119dffcf80514505527c384c/pyarrow-22.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:bba208d9c7decf9961998edf5c65e3ea4355d5818dd6cd0f6809bec1afb951cc", size = 28262504, upload-time = "2025-10-24T10:08:00.932Z" }, + { url = "https://files.pythonhosted.org/packages/bd/b0/0fa4d28a8edb42b0a7144edd20befd04173ac79819547216f8a9f36f9e50/pyarrow-22.0.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:9bddc2cade6561f6820d4cd73f99a0243532ad506bc510a75a5a65a522b2d74d", size = 34224062, upload-time = "2025-10-24T10:08:14.101Z" }, + { url = "https://files.pythonhosted.org/packages/0f/a8/7a719076b3c1be0acef56a07220c586f25cd24de0e3f3102b438d18ae5df/pyarrow-22.0.0-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:e70ff90c64419709d38c8932ea9fe1cc98415c4f87ea8da81719e43f02534bc9", size = 35990057, upload-time = "2025-10-24T10:08:21.842Z" }, + { url = "https://files.pythonhosted.org/packages/89/3c/359ed54c93b47fb6fe30ed16cdf50e3f0e8b9ccfb11b86218c3619ae50a8/pyarrow-22.0.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:92843c305330aa94a36e706c16209cd4df274693e777ca47112617db7d0ef3d7", size = 45068002, upload-time = "2025-10-24T10:08:29.034Z" }, + { url = "https://files.pythonhosted.org/packages/55/fc/4945896cc8638536ee787a3bd6ce7cec8ec9acf452d78ec39ab328efa0a1/pyarrow-22.0.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:6dda1ddac033d27421c20d7a7943eec60be44e0db4e079f33cc5af3b8280ccde", size = 47737765, upload-time = "2025-10-24T10:08:38.559Z" }, + { url = "https://files.pythonhosted.org/packages/cd/5e/7cb7edeb2abfaa1f79b5d5eb89432356155c8426f75d3753cbcb9592c0fd/pyarrow-22.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:84378110dd9a6c06323b41b56e129c504d157d1a983ce8f5443761eb5256bafc", size = 48048139, upload-time = "2025-10-24T10:08:46.784Z" }, + { url = "https://files.pythonhosted.org/packages/88/c6/546baa7c48185f5e9d6e59277c4b19f30f48c94d9dd938c2a80d4d6b067c/pyarrow-22.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:854794239111d2b88b40b6ef92aa478024d1e5074f364033e73e21e3f76b25e0", size = 50314244, upload-time = "2025-10-24T10:08:55.771Z" }, + { url = "https://files.pythonhosted.org/packages/3c/79/755ff2d145aafec8d347bf18f95e4e81c00127f06d080135dfc86aea417c/pyarrow-22.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:b883fe6fd85adad7932b3271c38ac289c65b7337c2c132e9569f9d3940620730", size = 28757501, upload-time = "2025-10-24T10:09:59.891Z" }, + { url = "https://files.pythonhosted.org/packages/0e/d2/237d75ac28ced3147912954e3c1a174df43a95f4f88e467809118a8165e0/pyarrow-22.0.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:7a820d8ae11facf32585507c11f04e3f38343c1e784c9b5a8b1da5c930547fe2", size = 34355506, upload-time = "2025-10-24T10:09:02.953Z" }, + { url = "https://files.pythonhosted.org/packages/1e/2c/733dfffe6d3069740f98e57ff81007809067d68626c5faef293434d11bd6/pyarrow-22.0.0-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:c6ec3675d98915bf1ec8b3c7986422682f7232ea76cad276f4c8abd5b7319b70", size = 36047312, upload-time = "2025-10-24T10:09:10.334Z" }, + { url = "https://files.pythonhosted.org/packages/7c/2b/29d6e3782dc1f299727462c1543af357a0f2c1d3c160ce199950d9ca51eb/pyarrow-22.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:3e739edd001b04f654b166204fc7a9de896cf6007eaff33409ee9e50ceaff754", size = 45081609, upload-time = "2025-10-24T10:09:18.61Z" }, + { url = 
"https://files.pythonhosted.org/packages/8d/42/aa9355ecc05997915af1b7b947a7f66c02dcaa927f3203b87871c114ba10/pyarrow-22.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:7388ac685cab5b279a41dfe0a6ccd99e4dbf322edfb63e02fc0443bf24134e91", size = 47703663, upload-time = "2025-10-24T10:09:27.369Z" }, + { url = "https://files.pythonhosted.org/packages/ee/62/45abedde480168e83a1de005b7b7043fd553321c1e8c5a9a114425f64842/pyarrow-22.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f633074f36dbc33d5c05b5dc75371e5660f1dbf9c8b1d95669def05e5425989c", size = 48066543, upload-time = "2025-10-24T10:09:34.908Z" }, + { url = "https://files.pythonhosted.org/packages/84/e9/7878940a5b072e4f3bf998770acafeae13b267f9893af5f6d4ab3904b67e/pyarrow-22.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:4c19236ae2402a8663a2c8f21f1870a03cc57f0bef7e4b6eb3238cc82944de80", size = 50288838, upload-time = "2025-10-24T10:09:44.394Z" }, + { url = "https://files.pythonhosted.org/packages/7b/03/f335d6c52b4a4761bcc83499789a1e2e16d9d201a58c327a9b5cc9a41bd9/pyarrow-22.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:0c34fe18094686194f204a3b1787a27456897d8a2d62caf84b61e8dfbc0252ae", size = 29185594, upload-time = "2025-10-24T10:09:53.111Z" }, ] [[package]] @@ -1507,6 +1614,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/53/97/d2cbbaa10c9b826af0e10fdf836e1bf344d9f0abb873ebc34d1f49642d3f/roman_numerals_py-3.1.0-py3-none-any.whl", hash = "sha256:9da2ad2fb670bcf24e81070ceb3be72f6c11c440d73bd579fbeca1e9f330954c", size = 7742, upload-time = "2025-02-22T07:34:52.422Z" }, ] +[[package]] +name = "ruff" +version = "0.14.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/75/62/50b7727004dfe361104dfbf898c45a9a2fdfad8c72c04ae62900224d6ecf/ruff-0.14.3.tar.gz", hash = "sha256:4ff876d2ab2b161b6de0aa1f5bd714e8e9b4033dc122ee006925fbacc4f62153", size = 5558687, upload-time = "2025-10-31T00:26:26.878Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ce/8e/0c10ff1ea5d4360ab8bfca4cb2c9d979101a391f3e79d2616c9bf348cd26/ruff-0.14.3-py3-none-linux_armv6l.whl", hash = "sha256:876b21e6c824f519446715c1342b8e60f97f93264012de9d8d10314f8a79c371", size = 12535613, upload-time = "2025-10-31T00:25:44.302Z" }, + { url = "https://files.pythonhosted.org/packages/d3/c8/6724f4634c1daf52409fbf13fefda64aa9c8f81e44727a378b7b73dc590b/ruff-0.14.3-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:b6fd8c79b457bedd2abf2702b9b472147cd860ed7855c73a5247fa55c9117654", size = 12855812, upload-time = "2025-10-31T00:25:47.793Z" }, + { url = "https://files.pythonhosted.org/packages/de/03/db1bce591d55fd5f8a08bb02517fa0b5097b2ccabd4ea1ee29aa72b67d96/ruff-0.14.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:71ff6edca490c308f083156938c0c1a66907151263c4abdcb588602c6e696a14", size = 11944026, upload-time = "2025-10-31T00:25:49.657Z" }, + { url = "https://files.pythonhosted.org/packages/0b/75/4f8dbd48e03272715d12c87dc4fcaaf21b913f0affa5f12a4e9c6f8a0582/ruff-0.14.3-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:786ee3ce6139772ff9272aaf43296d975c0217ee1b97538a98171bf0d21f87ed", size = 12356818, upload-time = "2025-10-31T00:25:51.949Z" }, + { url = "https://files.pythonhosted.org/packages/ec/9b/506ec5b140c11d44a9a4f284ea7c14ebf6f8b01e6e8917734a3325bff787/ruff-0.14.3-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cd6291d0061811c52b8e392f946889916757610d45d004e41140d81fb6cd5ddc", size = 12336745, upload-time = 
"2025-10-31T00:25:54.248Z" }, + { url = "https://files.pythonhosted.org/packages/c7/e1/c560d254048c147f35e7f8131d30bc1f63a008ac61595cf3078a3e93533d/ruff-0.14.3-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a497ec0c3d2c88561b6d90f9c29f5ae68221ac00d471f306fa21fa4264ce5fcd", size = 13101684, upload-time = "2025-10-31T00:25:56.253Z" }, + { url = "https://files.pythonhosted.org/packages/a5/32/e310133f8af5cd11f8cc30f52522a3ebccc5ea5bff4b492f94faceaca7a8/ruff-0.14.3-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:e231e1be58fc568950a04fbe6887c8e4b85310e7889727e2b81db205c45059eb", size = 14535000, upload-time = "2025-10-31T00:25:58.397Z" }, + { url = "https://files.pythonhosted.org/packages/a2/a1/7b0470a22158c6d8501eabc5e9b6043c99bede40fa1994cadf6b5c2a61c7/ruff-0.14.3-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:469e35872a09c0e45fecf48dd960bfbce056b5db2d5e6b50eca329b4f853ae20", size = 14156450, upload-time = "2025-10-31T00:26:00.889Z" }, + { url = "https://files.pythonhosted.org/packages/0a/96/24bfd9d1a7f532b560dcee1a87096332e461354d3882124219bcaff65c09/ruff-0.14.3-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d6bc90307c469cb9d28b7cfad90aaa600b10d67c6e22026869f585e1e8a2db0", size = 13568414, upload-time = "2025-10-31T00:26:03.291Z" }, + { url = "https://files.pythonhosted.org/packages/a7/e7/138b883f0dfe4ad5b76b58bf4ae675f4d2176ac2b24bdd81b4d966b28c61/ruff-0.14.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2f8a0bbcffcfd895df39c9a4ecd59bb80dca03dc43f7fb63e647ed176b741e", size = 13315293, upload-time = "2025-10-31T00:26:05.708Z" }, + { url = "https://files.pythonhosted.org/packages/33/f4/c09bb898be97b2eb18476b7c950df8815ef14cf956074177e9fbd40b7719/ruff-0.14.3-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:678fdd7c7d2d94851597c23ee6336d25f9930b460b55f8598e011b57c74fd8c5", size = 13539444, upload-time = "2025-10-31T00:26:08.09Z" }, + { url = "https://files.pythonhosted.org/packages/9c/aa/b30a1db25fc6128b1dd6ff0741fa4abf969ded161599d07ca7edd0739cc0/ruff-0.14.3-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:1ec1ac071e7e37e0221d2f2dbaf90897a988c531a8592a6a5959f0603a1ecf5e", size = 12252581, upload-time = "2025-10-31T00:26:10.297Z" }, + { url = "https://files.pythonhosted.org/packages/da/13/21096308f384d796ffe3f2960b17054110a9c3828d223ca540c2b7cc670b/ruff-0.14.3-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:afcdc4b5335ef440d19e7df9e8ae2ad9f749352190e96d481dc501b753f0733e", size = 12307503, upload-time = "2025-10-31T00:26:12.646Z" }, + { url = "https://files.pythonhosted.org/packages/cb/cc/a350bac23f03b7dbcde3c81b154706e80c6f16b06ff1ce28ed07dc7b07b0/ruff-0.14.3-py3-none-musllinux_1_2_i686.whl", hash = "sha256:7bfc42f81862749a7136267a343990f865e71fe2f99cf8d2958f684d23ce3dfa", size = 12675457, upload-time = "2025-10-31T00:26:15.044Z" }, + { url = "https://files.pythonhosted.org/packages/cb/76/46346029fa2f2078826bc88ef7167e8c198e58fe3126636e52f77488cbba/ruff-0.14.3-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:a65e448cfd7e9c59fae8cf37f9221585d3354febaad9a07f29158af1528e165f", size = 13403980, upload-time = "2025-10-31T00:26:17.81Z" }, + { url = "https://files.pythonhosted.org/packages/9f/a4/35f1ef68c4e7b236d4a5204e3669efdeefaef21f0ff6a456792b3d8be438/ruff-0.14.3-py3-none-win32.whl", hash = "sha256:f3d91857d023ba93e14ed2d462ab62c3428f9bbf2b4fbac50a03ca66d31991f7", size = 12500045, upload-time = "2025-10-31T00:26:20.503Z" }, + { url = 
"https://files.pythonhosted.org/packages/03/15/51960ae340823c9859fb60c63301d977308735403e2134e17d1d2858c7fb/ruff-0.14.3-py3-none-win_amd64.whl", hash = "sha256:d7b7006ac0756306db212fd37116cce2bd307e1e109375e1c6c106002df0ae5f", size = 13594005, upload-time = "2025-10-31T00:26:22.533Z" }, + { url = "https://files.pythonhosted.org/packages/b7/73/4de6579bac8e979fca0a77e54dec1f1e011a0d268165eb8a9bc0982a6564/ruff-0.14.3-py3-none-win_arm64.whl", hash = "sha256:26eb477ede6d399d898791d01961e16b86f02bc2486d0d1a7a9bb2379d055dc1", size = 12590017, upload-time = "2025-10-31T00:26:24.52Z" }, +] + [[package]] name = "safetensors" version = "0.6.2" @@ -1627,6 +1760,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, ] +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, +] + [[package]] name = "snowballstemmer" version = "3.0.1" @@ -1929,6 +2071,7 @@ dev = [ { name = "pre-commit" }, { name = "pyarrow" }, { name = "pytest" }, + { name = "ruff" }, { name = "scikit-learn" }, { name = "unidecode" }, ] @@ -1942,6 +2085,7 @@ docs = [ { name = "sphinxcontrib-napoleon" }, ] hf-dep = [ + { name = "datasets" }, { name = "tokenizers" }, { name = "transformers" }, ] @@ -1966,6 +2110,7 @@ dev = [ { name = "pre-commit", specifier = ">=4.3.0" }, { name = "pyarrow" }, { name = "pytest", specifier = ">=8.1.1,<9" }, + { name = "ruff", specifier = ">=0.14.3" }, { name = "scikit-learn" }, { name = "unidecode" }, ] @@ -1979,6 +2124,7 @@ docs = [ { name = "sphinxcontrib-napoleon", specifier = ">=0.7" }, ] hf-dep = [ + { name = "datasets", specifier = ">=4.3.0" }, { name = "tokenizers", specifier = ">=0.22.1" }, { name = "transformers", specifier = ">=4.57.1" }, ] @@ -2080,6 +2226,109 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/27/73/d9a94da0e9d470a543c1b9d3ccbceb0f59455983088e727b8a1824ed90fb/virtualenv-20.35.3-py3-none-any.whl", hash = "sha256:63d106565078d8c8d0b206d48080f938a8b25361e19432d2c9db40d2899c810a", size = 5981061, upload-time = "2025-10-10T21:23:30.433Z" }, ] +[[package]] +name = "xxhash" +version = "3.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/02/84/30869e01909fb37a6cc7e18688ee8bf1e42d57e7e0777636bd47524c43c7/xxhash-3.6.0.tar.gz", hash = "sha256:f0162a78b13a0d7617b2845b90c763339d1f1d82bb04a4b07f4ab535cc5e05d6", size = 85160, upload-time = "2025-10-02T14:37:08.097Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/17/d4/cc2f0400e9154df4b9964249da78ebd72f318e35ccc425e9f403c392f22a/xxhash-3.6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b47bbd8cf2d72797f3c2772eaaac0ded3d3af26481a26d7d7d41dc2d3c46b04a", size = 32844, upload-time 
= "2025-10-02T14:34:14.037Z" }, + { url = "https://files.pythonhosted.org/packages/5e/ec/1cc11cd13e26ea8bc3cb4af4eaadd8d46d5014aebb67be3f71fb0b68802a/xxhash-3.6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2b6821e94346f96db75abaa6e255706fb06ebd530899ed76d32cd99f20dc52fa", size = 30809, upload-time = "2025-10-02T14:34:15.484Z" }, + { url = "https://files.pythonhosted.org/packages/04/5f/19fe357ea348d98ca22f456f75a30ac0916b51c753e1f8b2e0e6fb884cce/xxhash-3.6.0-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d0a9751f71a1a65ce3584e9cae4467651c7e70c9d31017fa57574583a4540248", size = 194665, upload-time = "2025-10-02T14:34:16.541Z" }, + { url = "https://files.pythonhosted.org/packages/90/3b/d1f1a8f5442a5fd8beedae110c5af7604dc37349a8e16519c13c19a9a2de/xxhash-3.6.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b29ee68625ab37b04c0b40c3fafdf24d2f75ccd778333cfb698f65f6c463f62", size = 213550, upload-time = "2025-10-02T14:34:17.878Z" }, + { url = "https://files.pythonhosted.org/packages/c4/ef/3a9b05eb527457d5db13a135a2ae1a26c80fecd624d20f3e8dcc4cb170f3/xxhash-3.6.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6812c25fe0d6c36a46ccb002f40f27ac903bf18af9f6dd8f9669cb4d176ab18f", size = 212384, upload-time = "2025-10-02T14:34:19.182Z" }, + { url = "https://files.pythonhosted.org/packages/0f/18/ccc194ee698c6c623acbf0f8c2969811a8a4b6185af5e824cd27b9e4fd3e/xxhash-3.6.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4ccbff013972390b51a18ef1255ef5ac125c92dc9143b2d1909f59abc765540e", size = 445749, upload-time = "2025-10-02T14:34:20.659Z" }, + { url = "https://files.pythonhosted.org/packages/a5/86/cf2c0321dc3940a7aa73076f4fd677a0fb3e405cb297ead7d864fd90847e/xxhash-3.6.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:297b7fbf86c82c550e12e8fb71968b3f033d27b874276ba3624ea868c11165a8", size = 193880, upload-time = "2025-10-02T14:34:22.431Z" }, + { url = "https://files.pythonhosted.org/packages/82/fb/96213c8560e6f948a1ecc9a7613f8032b19ee45f747f4fca4eb31bb6d6ed/xxhash-3.6.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dea26ae1eb293db089798d3973a5fc928a18fdd97cc8801226fae705b02b14b0", size = 210912, upload-time = "2025-10-02T14:34:23.937Z" }, + { url = "https://files.pythonhosted.org/packages/40/aa/4395e669b0606a096d6788f40dbdf2b819d6773aa290c19e6e83cbfc312f/xxhash-3.6.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:7a0b169aafb98f4284f73635a8e93f0735f9cbde17bd5ec332480484241aaa77", size = 198654, upload-time = "2025-10-02T14:34:25.644Z" }, + { url = "https://files.pythonhosted.org/packages/67/74/b044fcd6b3d89e9b1b665924d85d3f400636c23590226feb1eb09e1176ce/xxhash-3.6.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:08d45aef063a4531b785cd72de4887766d01dc8f362a515693df349fdb825e0c", size = 210867, upload-time = "2025-10-02T14:34:27.203Z" }, + { url = "https://files.pythonhosted.org/packages/bc/fd/3ce73bf753b08cb19daee1eb14aa0d7fe331f8da9c02dd95316ddfe5275e/xxhash-3.6.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:929142361a48ee07f09121fe9e96a84950e8d4df3bb298ca5d88061969f34d7b", size = 414012, upload-time = "2025-10-02T14:34:28.409Z" }, + { url = "https://files.pythonhosted.org/packages/ba/b3/5a4241309217c5c876f156b10778f3ab3af7ba7e3259e6d5f5c7d0129eb2/xxhash-3.6.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:51312c768403d8540487dbbfb557454cfc55589bbde6424456951f7fcd4facb3", size = 191409, upload-time = "2025-10-02T14:34:29.696Z" }, + { url = "https://files.pythonhosted.org/packages/c0/01/99bfbc15fb9abb9a72b088c1d95219fc4782b7d01fc835bd5744d66dd0b8/xxhash-3.6.0-cp311-cp311-win32.whl", hash = "sha256:d1927a69feddc24c987b337ce81ac15c4720955b667fe9b588e02254b80446fd", size = 30574, upload-time = "2025-10-02T14:34:31.028Z" }, + { url = "https://files.pythonhosted.org/packages/65/79/9d24d7f53819fe301b231044ea362ce64e86c74f6e8c8e51320de248b3e5/xxhash-3.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:26734cdc2d4ffe449b41d186bbeac416f704a482ed835d375a5c0cb02bc63fef", size = 31481, upload-time = "2025-10-02T14:34:32.062Z" }, + { url = "https://files.pythonhosted.org/packages/30/4e/15cd0e3e8772071344eab2961ce83f6e485111fed8beb491a3f1ce100270/xxhash-3.6.0-cp311-cp311-win_arm64.whl", hash = "sha256:d72f67ef8bf36e05f5b6c65e8524f265bd61071471cd4cf1d36743ebeeeb06b7", size = 27861, upload-time = "2025-10-02T14:34:33.555Z" }, + { url = "https://files.pythonhosted.org/packages/9a/07/d9412f3d7d462347e4511181dea65e47e0d0e16e26fbee2ea86a2aefb657/xxhash-3.6.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:01362c4331775398e7bb34e3ab403bc9ee9f7c497bc7dee6272114055277dd3c", size = 32744, upload-time = "2025-10-02T14:34:34.622Z" }, + { url = "https://files.pythonhosted.org/packages/79/35/0429ee11d035fc33abe32dca1b2b69e8c18d236547b9a9b72c1929189b9a/xxhash-3.6.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b7b2df81a23f8cb99656378e72501b2cb41b1827c0f5a86f87d6b06b69f9f204", size = 30816, upload-time = "2025-10-02T14:34:36.043Z" }, + { url = "https://files.pythonhosted.org/packages/b7/f2/57eb99aa0f7d98624c0932c5b9a170e1806406cdbcdb510546634a1359e0/xxhash-3.6.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:dc94790144e66b14f67b10ac8ed75b39ca47536bf8800eb7c24b50271ea0c490", size = 194035, upload-time = "2025-10-02T14:34:37.354Z" }, + { url = "https://files.pythonhosted.org/packages/4c/ed/6224ba353690d73af7a3f1c7cdb1fc1b002e38f783cb991ae338e1eb3d79/xxhash-3.6.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:93f107c673bccf0d592cdba077dedaf52fe7f42dcd7676eba1f6d6f0c3efffd2", size = 212914, upload-time = "2025-10-02T14:34:38.6Z" }, + { url = "https://files.pythonhosted.org/packages/38/86/fb6b6130d8dd6b8942cc17ab4d90e223653a89aa32ad2776f8af7064ed13/xxhash-3.6.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2aa5ee3444c25b69813663c9f8067dcfaa2e126dc55e8dddf40f4d1c25d7effa", size = 212163, upload-time = "2025-10-02T14:34:39.872Z" }, + { url = "https://files.pythonhosted.org/packages/ee/dc/e84875682b0593e884ad73b2d40767b5790d417bde603cceb6878901d647/xxhash-3.6.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f7f99123f0e1194fa59cc69ad46dbae2e07becec5df50a0509a808f90a0f03f0", size = 445411, upload-time = "2025-10-02T14:34:41.569Z" }, + { url = "https://files.pythonhosted.org/packages/11/4f/426f91b96701ec2f37bb2b8cec664eff4f658a11f3fa9d94f0a887ea6d2b/xxhash-3.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:49e03e6fe2cac4a1bc64952dd250cf0dbc5ef4ebb7b8d96bce82e2de163c82a2", size = 193883, upload-time = "2025-10-02T14:34:43.249Z" }, + { url = 
"https://files.pythonhosted.org/packages/53/5a/ddbb83eee8e28b778eacfc5a85c969673e4023cdeedcfcef61f36731610b/xxhash-3.6.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bd17fede52a17a4f9a7bc4472a5867cb0b160deeb431795c0e4abe158bc784e9", size = 210392, upload-time = "2025-10-02T14:34:45.042Z" }, + { url = "https://files.pythonhosted.org/packages/1e/c2/ff69efd07c8c074ccdf0a4f36fcdd3d27363665bcdf4ba399abebe643465/xxhash-3.6.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:6fb5f5476bef678f69db04f2bd1efbed3030d2aba305b0fc1773645f187d6a4e", size = 197898, upload-time = "2025-10-02T14:34:46.302Z" }, + { url = "https://files.pythonhosted.org/packages/58/ca/faa05ac19b3b622c7c9317ac3e23954187516298a091eb02c976d0d3dd45/xxhash-3.6.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:843b52f6d88071f87eba1631b684fcb4b2068cd2180a0224122fe4ef011a9374", size = 210655, upload-time = "2025-10-02T14:34:47.571Z" }, + { url = "https://files.pythonhosted.org/packages/d4/7a/06aa7482345480cc0cb597f5c875b11a82c3953f534394f620b0be2f700c/xxhash-3.6.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7d14a6cfaf03b1b6f5f9790f76880601ccc7896aff7ab9cd8978a939c1eb7e0d", size = 414001, upload-time = "2025-10-02T14:34:49.273Z" }, + { url = "https://files.pythonhosted.org/packages/23/07/63ffb386cd47029aa2916b3d2f454e6cc5b9f5c5ada3790377d5430084e7/xxhash-3.6.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:418daf3db71e1413cfe211c2f9a528456936645c17f46b5204705581a45390ae", size = 191431, upload-time = "2025-10-02T14:34:50.798Z" }, + { url = "https://files.pythonhosted.org/packages/0f/93/14fde614cadb4ddf5e7cebf8918b7e8fac5ae7861c1875964f17e678205c/xxhash-3.6.0-cp312-cp312-win32.whl", hash = "sha256:50fc255f39428a27299c20e280d6193d8b63b8ef8028995323bf834a026b4fbb", size = 30617, upload-time = "2025-10-02T14:34:51.954Z" }, + { url = "https://files.pythonhosted.org/packages/13/5d/0d125536cbe7565a83d06e43783389ecae0c0f2ed037b48ede185de477c0/xxhash-3.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:c0f2ab8c715630565ab8991b536ecded9416d615538be8ecddce43ccf26cbc7c", size = 31534, upload-time = "2025-10-02T14:34:53.276Z" }, + { url = "https://files.pythonhosted.org/packages/54/85/6ec269b0952ec7e36ba019125982cf11d91256a778c7c3f98a4c5043d283/xxhash-3.6.0-cp312-cp312-win_arm64.whl", hash = "sha256:eae5c13f3bc455a3bbb68bdc513912dc7356de7e2280363ea235f71f54064829", size = 27876, upload-time = "2025-10-02T14:34:54.371Z" }, + { url = "https://files.pythonhosted.org/packages/33/76/35d05267ac82f53ae9b0e554da7c5e281ee61f3cad44c743f0fcd354f211/xxhash-3.6.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:599e64ba7f67472481ceb6ee80fa3bd828fd61ba59fb11475572cc5ee52b89ec", size = 32738, upload-time = "2025-10-02T14:34:55.839Z" }, + { url = "https://files.pythonhosted.org/packages/31/a8/3fbce1cd96534a95e35d5120637bf29b0d7f5d8fa2f6374e31b4156dd419/xxhash-3.6.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7d8b8aaa30fca4f16f0c84a5c8d7ddee0e25250ec2796c973775373257dde8f1", size = 30821, upload-time = "2025-10-02T14:34:57.219Z" }, + { url = "https://files.pythonhosted.org/packages/0c/ea/d387530ca7ecfa183cb358027f1833297c6ac6098223fd14f9782cd0015c/xxhash-3.6.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d597acf8506d6e7101a4a44a5e428977a51c0fadbbfd3c39650cca9253f6e5a6", size = 194127, upload-time = "2025-10-02T14:34:59.21Z" }, + { url = 
"https://files.pythonhosted.org/packages/ba/0c/71435dcb99874b09a43b8d7c54071e600a7481e42b3e3ce1eb5226a5711a/xxhash-3.6.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:858dc935963a33bc33490128edc1c12b0c14d9c7ebaa4e387a7869ecc4f3e263", size = 212975, upload-time = "2025-10-02T14:35:00.816Z" }, + { url = "https://files.pythonhosted.org/packages/84/7a/c2b3d071e4bb4a90b7057228a99b10d51744878f4a8a6dd643c8bd897620/xxhash-3.6.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ba284920194615cb8edf73bf52236ce2e1664ccd4a38fdb543506413529cc546", size = 212241, upload-time = "2025-10-02T14:35:02.207Z" }, + { url = "https://files.pythonhosted.org/packages/81/5f/640b6eac0128e215f177df99eadcd0f1b7c42c274ab6a394a05059694c5a/xxhash-3.6.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4b54219177f6c6674d5378bd862c6aedf64725f70dd29c472eaae154df1a2e89", size = 445471, upload-time = "2025-10-02T14:35:03.61Z" }, + { url = "https://files.pythonhosted.org/packages/5e/1e/3c3d3ef071b051cc3abbe3721ffb8365033a172613c04af2da89d5548a87/xxhash-3.6.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:42c36dd7dbad2f5238950c377fcbf6811b1cdb1c444fab447960030cea60504d", size = 193936, upload-time = "2025-10-02T14:35:05.013Z" }, + { url = "https://files.pythonhosted.org/packages/2c/bd/4a5f68381939219abfe1c22a9e3a5854a4f6f6f3c4983a87d255f21f2e5d/xxhash-3.6.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f22927652cba98c44639ffdc7aaf35828dccf679b10b31c4ad72a5b530a18eb7", size = 210440, upload-time = "2025-10-02T14:35:06.239Z" }, + { url = "https://files.pythonhosted.org/packages/eb/37/b80fe3d5cfb9faff01a02121a0f4d565eb7237e9e5fc66e73017e74dcd36/xxhash-3.6.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b45fad44d9c5c119e9c6fbf2e1c656a46dc68e280275007bbfd3d572b21426db", size = 197990, upload-time = "2025-10-02T14:35:07.735Z" }, + { url = "https://files.pythonhosted.org/packages/d7/fd/2c0a00c97b9e18f72e1f240ad4e8f8a90fd9d408289ba9c7c495ed7dc05c/xxhash-3.6.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:6f2580ffab1a8b68ef2b901cde7e55fa8da5e4be0977c68f78fc80f3c143de42", size = 210689, upload-time = "2025-10-02T14:35:09.438Z" }, + { url = "https://files.pythonhosted.org/packages/93/86/5dd8076a926b9a95db3206aba20d89a7fc14dd5aac16e5c4de4b56033140/xxhash-3.6.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:40c391dd3cd041ebc3ffe6f2c862f402e306eb571422e0aa918d8070ba31da11", size = 414068, upload-time = "2025-10-02T14:35:11.162Z" }, + { url = "https://files.pythonhosted.org/packages/af/3c/0bb129170ee8f3650f08e993baee550a09593462a5cddd8e44d0011102b1/xxhash-3.6.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f205badabde7aafd1a31e8ca2a3e5a763107a71c397c4481d6a804eb5063d8bd", size = 191495, upload-time = "2025-10-02T14:35:12.971Z" }, + { url = "https://files.pythonhosted.org/packages/e9/3a/6797e0114c21d1725e2577508e24006fd7ff1d8c0c502d3b52e45c1771d8/xxhash-3.6.0-cp313-cp313-win32.whl", hash = "sha256:2577b276e060b73b73a53042ea5bd5203d3e6347ce0d09f98500f418a9fcf799", size = 30620, upload-time = "2025-10-02T14:35:14.129Z" }, + { url = "https://files.pythonhosted.org/packages/86/15/9bc32671e9a38b413a76d24722a2bf8784a132c043063a8f5152d390b0f9/xxhash-3.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:757320d45d2fbcce8f30c42a6b2f47862967aea7bf458b9625b4bbe7ee390392", size = 31542, upload-time = 
"2025-10-02T14:35:15.21Z" }, + { url = "https://files.pythonhosted.org/packages/39/c5/cc01e4f6188656e56112d6a8e0dfe298a16934b8c47a247236549a3f7695/xxhash-3.6.0-cp313-cp313-win_arm64.whl", hash = "sha256:457b8f85dec5825eed7b69c11ae86834a018b8e3df5e77783c999663da2f96d6", size = 27880, upload-time = "2025-10-02T14:35:16.315Z" }, + { url = "https://files.pythonhosted.org/packages/f3/30/25e5321c8732759e930c555176d37e24ab84365482d257c3b16362235212/xxhash-3.6.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a42e633d75cdad6d625434e3468126c73f13f7584545a9cf34e883aa1710e702", size = 32956, upload-time = "2025-10-02T14:35:17.413Z" }, + { url = "https://files.pythonhosted.org/packages/9f/3c/0573299560d7d9f8ab1838f1efc021a280b5ae5ae2e849034ef3dee18810/xxhash-3.6.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:568a6d743219e717b07b4e03b0a828ce593833e498c3b64752e0f5df6bfe84db", size = 31072, upload-time = "2025-10-02T14:35:18.844Z" }, + { url = "https://files.pythonhosted.org/packages/7a/1c/52d83a06e417cd9d4137722693424885cc9878249beb3a7c829e74bf7ce9/xxhash-3.6.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:bec91b562d8012dae276af8025a55811b875baace6af510412a5e58e3121bc54", size = 196409, upload-time = "2025-10-02T14:35:20.31Z" }, + { url = "https://files.pythonhosted.org/packages/e3/8e/c6d158d12a79bbd0b878f8355432075fc82759e356ab5a111463422a239b/xxhash-3.6.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:78e7f2f4c521c30ad5e786fdd6bae89d47a32672a80195467b5de0480aa97b1f", size = 215736, upload-time = "2025-10-02T14:35:21.616Z" }, + { url = "https://files.pythonhosted.org/packages/bc/68/c4c80614716345d55071a396cf03d06e34b5f4917a467faf43083c995155/xxhash-3.6.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3ed0df1b11a79856df5ffcab572cbd6b9627034c1c748c5566fa79df9048a7c5", size = 214833, upload-time = "2025-10-02T14:35:23.32Z" }, + { url = "https://files.pythonhosted.org/packages/7e/e9/ae27c8ffec8b953efa84c7c4a6c6802c263d587b9fc0d6e7cea64e08c3af/xxhash-3.6.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0e4edbfc7d420925b0dd5e792478ed393d6e75ff8fc219a6546fb446b6a417b1", size = 448348, upload-time = "2025-10-02T14:35:25.111Z" }, + { url = "https://files.pythonhosted.org/packages/d7/6b/33e21afb1b5b3f46b74b6bd1913639066af218d704cc0941404ca717fc57/xxhash-3.6.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fba27a198363a7ef87f8c0f6b171ec36b674fe9053742c58dd7e3201c1ab30ee", size = 196070, upload-time = "2025-10-02T14:35:26.586Z" }, + { url = "https://files.pythonhosted.org/packages/96/b6/fcabd337bc5fa624e7203aa0fa7d0c49eed22f72e93229431752bddc83d9/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:794fe9145fe60191c6532fa95063765529770edcdd67b3d537793e8004cabbfd", size = 212907, upload-time = "2025-10-02T14:35:28.087Z" }, + { url = "https://files.pythonhosted.org/packages/4b/d3/9ee6160e644d660fcf176c5825e61411c7f62648728f69c79ba237250143/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:6105ef7e62b5ac73a837778efc331a591d8442f8ef5c7e102376506cb4ae2729", size = 200839, upload-time = "2025-10-02T14:35:29.857Z" }, + { url = "https://files.pythonhosted.org/packages/0d/98/e8de5baa5109394baf5118f5e72ab21a86387c4f89b0e77ef3e2f6b0327b/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = 
"sha256:f01375c0e55395b814a679b3eea205db7919ac2af213f4a6682e01220e5fe292", size = 213304, upload-time = "2025-10-02T14:35:31.222Z" }, + { url = "https://files.pythonhosted.org/packages/7b/1d/71056535dec5c3177eeb53e38e3d367dd1d16e024e63b1cee208d572a033/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:d706dca2d24d834a4661619dcacf51a75c16d65985718d6a7d73c1eeeb903ddf", size = 416930, upload-time = "2025-10-02T14:35:32.517Z" }, + { url = "https://files.pythonhosted.org/packages/dc/6c/5cbde9de2cd967c322e651c65c543700b19e7ae3e0aae8ece3469bf9683d/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5f059d9faeacd49c0215d66f4056e1326c80503f51a1532ca336a385edadd033", size = 193787, upload-time = "2025-10-02T14:35:33.827Z" }, + { url = "https://files.pythonhosted.org/packages/19/fa/0172e350361d61febcea941b0cc541d6e6c8d65d153e85f850a7b256ff8a/xxhash-3.6.0-cp313-cp313t-win32.whl", hash = "sha256:1244460adc3a9be84731d72b8e80625788e5815b68da3da8b83f78115a40a7ec", size = 30916, upload-time = "2025-10-02T14:35:35.107Z" }, + { url = "https://files.pythonhosted.org/packages/ad/e6/e8cf858a2b19d6d45820f072eff1bea413910592ff17157cabc5f1227a16/xxhash-3.6.0-cp313-cp313t-win_amd64.whl", hash = "sha256:b1e420ef35c503869c4064f4a2f2b08ad6431ab7b229a05cce39d74268bca6b8", size = 31799, upload-time = "2025-10-02T14:35:36.165Z" }, + { url = "https://files.pythonhosted.org/packages/56/15/064b197e855bfb7b343210e82490ae672f8bc7cdf3ddb02e92f64304ee8a/xxhash-3.6.0-cp313-cp313t-win_arm64.whl", hash = "sha256:ec44b73a4220623235f67a996c862049f375df3b1052d9899f40a6382c32d746", size = 28044, upload-time = "2025-10-02T14:35:37.195Z" }, + { url = "https://files.pythonhosted.org/packages/7e/5e/0138bc4484ea9b897864d59fce9be9086030825bc778b76cb5a33a906d37/xxhash-3.6.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a40a3d35b204b7cc7643cbcf8c9976d818cb47befcfac8bbefec8038ac363f3e", size = 32754, upload-time = "2025-10-02T14:35:38.245Z" }, + { url = "https://files.pythonhosted.org/packages/18/d7/5dac2eb2ec75fd771957a13e5dda560efb2176d5203f39502a5fc571f899/xxhash-3.6.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a54844be970d3fc22630b32d515e79a90d0a3ddb2644d8d7402e3c4c8da61405", size = 30846, upload-time = "2025-10-02T14:35:39.6Z" }, + { url = "https://files.pythonhosted.org/packages/fe/71/8bc5be2bb00deb5682e92e8da955ebe5fa982da13a69da5a40a4c8db12fb/xxhash-3.6.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:016e9190af8f0a4e3741343777710e3d5717427f175adfdc3e72508f59e2a7f3", size = 194343, upload-time = "2025-10-02T14:35:40.69Z" }, + { url = "https://files.pythonhosted.org/packages/e7/3b/52badfb2aecec2c377ddf1ae75f55db3ba2d321c5e164f14461c90837ef3/xxhash-3.6.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4f6f72232f849eb9d0141e2ebe2677ece15adfd0fa599bc058aad83c714bb2c6", size = 213074, upload-time = "2025-10-02T14:35:42.29Z" }, + { url = "https://files.pythonhosted.org/packages/a2/2b/ae46b4e9b92e537fa30d03dbc19cdae57ed407e9c26d163895e968e3de85/xxhash-3.6.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:63275a8aba7865e44b1813d2177e0f5ea7eadad3dd063a21f7cf9afdc7054063", size = 212388, upload-time = "2025-10-02T14:35:43.929Z" }, + { url = "https://files.pythonhosted.org/packages/f5/80/49f88d3afc724b4ac7fbd664c8452d6db51b49915be48c6982659e0e7942/xxhash-3.6.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:3cd01fa2aa00d8b017c97eb46b9a794fbdca53fc14f845f5a328c71254b0abb7", size = 445614, upload-time = "2025-10-02T14:35:45.216Z" }, + { url = "https://files.pythonhosted.org/packages/ed/ba/603ce3961e339413543d8cd44f21f2c80e2a7c5cfe692a7b1f2cccf58f3c/xxhash-3.6.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0226aa89035b62b6a86d3c68df4d7c1f47a342b8683da2b60cedcddb46c4d95b", size = 194024, upload-time = "2025-10-02T14:35:46.959Z" }, + { url = "https://files.pythonhosted.org/packages/78/d1/8e225ff7113bf81545cfdcd79eef124a7b7064a0bba53605ff39590b95c2/xxhash-3.6.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c6e193e9f56e4ca4923c61238cdaced324f0feac782544eb4c6d55ad5cc99ddd", size = 210541, upload-time = "2025-10-02T14:35:48.301Z" }, + { url = "https://files.pythonhosted.org/packages/6f/58/0f89d149f0bad89def1a8dd38feb50ccdeb643d9797ec84707091d4cb494/xxhash-3.6.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:9176dcaddf4ca963d4deb93866d739a343c01c969231dbe21680e13a5d1a5bf0", size = 198305, upload-time = "2025-10-02T14:35:49.584Z" }, + { url = "https://files.pythonhosted.org/packages/11/38/5eab81580703c4df93feb5f32ff8fa7fe1e2c51c1f183ee4e48d4bb9d3d7/xxhash-3.6.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:c1ce4009c97a752e682b897aa99aef84191077a9433eb237774689f14f8ec152", size = 210848, upload-time = "2025-10-02T14:35:50.877Z" }, + { url = "https://files.pythonhosted.org/packages/5e/6b/953dc4b05c3ce678abca756416e4c130d2382f877a9c30a20d08ee6a77c0/xxhash-3.6.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:8cb2f4f679b01513b7adbb9b1b2f0f9cdc31b70007eaf9d59d0878809f385b11", size = 414142, upload-time = "2025-10-02T14:35:52.15Z" }, + { url = "https://files.pythonhosted.org/packages/08/a9/238ec0d4e81a10eb5026d4a6972677cbc898ba6c8b9dbaec12ae001b1b35/xxhash-3.6.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:653a91d7c2ab54a92c19ccf43508b6a555440b9be1bc8be553376778be7f20b5", size = 191547, upload-time = "2025-10-02T14:35:53.547Z" }, + { url = "https://files.pythonhosted.org/packages/f1/ee/3cf8589e06c2164ac77c3bf0aa127012801128f1feebf2a079272da5737c/xxhash-3.6.0-cp314-cp314-win32.whl", hash = "sha256:a756fe893389483ee8c394d06b5ab765d96e68fbbfe6fde7aa17e11f5720559f", size = 31214, upload-time = "2025-10-02T14:35:54.746Z" }, + { url = "https://files.pythonhosted.org/packages/02/5d/a19552fbc6ad4cb54ff953c3908bbc095f4a921bc569433d791f755186f1/xxhash-3.6.0-cp314-cp314-win_amd64.whl", hash = "sha256:39be8e4e142550ef69629c9cd71b88c90e9a5db703fecbcf265546d9536ca4ad", size = 32290, upload-time = "2025-10-02T14:35:55.791Z" }, + { url = "https://files.pythonhosted.org/packages/b1/11/dafa0643bc30442c887b55baf8e73353a344ee89c1901b5a5c54a6c17d39/xxhash-3.6.0-cp314-cp314-win_arm64.whl", hash = "sha256:25915e6000338999236f1eb68a02a32c3275ac338628a7eaa5a269c401995679", size = 28795, upload-time = "2025-10-02T14:35:57.162Z" }, + { url = "https://files.pythonhosted.org/packages/2c/db/0e99732ed7f64182aef4a6fb145e1a295558deec2a746265dcdec12d191e/xxhash-3.6.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c5294f596a9017ca5a3e3f8884c00b91ab2ad2933cf288f4923c3fd4346cf3d4", size = 32955, upload-time = "2025-10-02T14:35:58.267Z" }, + { url = "https://files.pythonhosted.org/packages/55/f4/2a7c3c68e564a099becfa44bb3d398810cc0ff6749b0d3cb8ccb93f23c14/xxhash-3.6.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1cf9dcc4ab9cff01dfbba78544297a3a01dafd60f3bde4e2bfd016cf7e4ddc67", size = 31072, upload-time = "2025-10-02T14:35:59.382Z" }, + 
{ url = "https://files.pythonhosted.org/packages/c6/d9/72a29cddc7250e8a5819dad5d466facb5dc4c802ce120645630149127e73/xxhash-3.6.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:01262da8798422d0685f7cef03b2bd3f4f46511b02830861df548d7def4402ad", size = 196579, upload-time = "2025-10-02T14:36:00.838Z" }, + { url = "https://files.pythonhosted.org/packages/63/93/b21590e1e381040e2ca305a884d89e1c345b347404f7780f07f2cdd47ef4/xxhash-3.6.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:51a73fb7cb3a3ead9f7a8b583ffd9b8038e277cdb8cb87cf890e88b3456afa0b", size = 215854, upload-time = "2025-10-02T14:36:02.207Z" }, + { url = "https://files.pythonhosted.org/packages/ce/b8/edab8a7d4fa14e924b29be877d54155dcbd8b80be85ea00d2be3413a9ed4/xxhash-3.6.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b9c6df83594f7df8f7f708ce5ebeacfc69f72c9fbaaababf6cf4758eaada0c9b", size = 214965, upload-time = "2025-10-02T14:36:03.507Z" }, + { url = "https://files.pythonhosted.org/packages/27/67/dfa980ac7f0d509d54ea0d5a486d2bb4b80c3f1bb22b66e6a05d3efaf6c0/xxhash-3.6.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:627f0af069b0ea56f312fd5189001c24578868643203bca1abbc2c52d3a6f3ca", size = 448484, upload-time = "2025-10-02T14:36:04.828Z" }, + { url = "https://files.pythonhosted.org/packages/8c/63/8ffc2cc97e811c0ca5d00ab36604b3ea6f4254f20b7bc658ca825ce6c954/xxhash-3.6.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:aa912c62f842dfd013c5f21a642c9c10cd9f4c4e943e0af83618b4a404d9091a", size = 196162, upload-time = "2025-10-02T14:36:06.182Z" }, + { url = "https://files.pythonhosted.org/packages/4b/77/07f0e7a3edd11a6097e990f6e5b815b6592459cb16dae990d967693e6ea9/xxhash-3.6.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:b465afd7909db30168ab62afe40b2fcf79eedc0b89a6c0ab3123515dc0df8b99", size = 213007, upload-time = "2025-10-02T14:36:07.733Z" }, + { url = "https://files.pythonhosted.org/packages/ae/d8/bc5fa0d152837117eb0bef6f83f956c509332ce133c91c63ce07ee7c4873/xxhash-3.6.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:a881851cf38b0a70e7c4d3ce81fc7afd86fbc2a024f4cfb2a97cf49ce04b75d3", size = 200956, upload-time = "2025-10-02T14:36:09.106Z" }, + { url = "https://files.pythonhosted.org/packages/26/a5/d749334130de9411783873e9b98ecc46688dad5db64ca6e04b02acc8b473/xxhash-3.6.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:9b3222c686a919a0f3253cfc12bb118b8b103506612253b5baeaac10d8027cf6", size = 213401, upload-time = "2025-10-02T14:36:10.585Z" }, + { url = "https://files.pythonhosted.org/packages/89/72/abed959c956a4bfc72b58c0384bb7940663c678127538634d896b1195c10/xxhash-3.6.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:c5aa639bc113e9286137cec8fadc20e9cd732b2cc385c0b7fa673b84fc1f2a93", size = 417083, upload-time = "2025-10-02T14:36:12.276Z" }, + { url = "https://files.pythonhosted.org/packages/0c/b3/62fd2b586283b7d7d665fb98e266decadf31f058f1cf6c478741f68af0cb/xxhash-3.6.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5c1343d49ac102799905e115aee590183c3921d475356cb24b4de29a4bc56518", size = 193913, upload-time = "2025-10-02T14:36:14.025Z" }, + { url = "https://files.pythonhosted.org/packages/9a/9a/c19c42c5b3f5a4aad748a6d5b4f23df3bed7ee5445accc65a0fb3ff03953/xxhash-3.6.0-cp314-cp314t-win32.whl", hash = 
"sha256:5851f033c3030dd95c086b4a36a2683c2ff4a799b23af60977188b057e467119", size = 31586, upload-time = "2025-10-02T14:36:15.603Z" }, + { url = "https://files.pythonhosted.org/packages/03/d6/4cc450345be9924fd5dc8c590ceda1db5b43a0a889587b0ae81a95511360/xxhash-3.6.0-cp314-cp314t-win_amd64.whl", hash = "sha256:0444e7967dac37569052d2409b00a8860c2135cff05502df4da80267d384849f", size = 32526, upload-time = "2025-10-02T14:36:16.708Z" }, + { url = "https://files.pythonhosted.org/packages/0f/c9/7243eb3f9eaabd1a88a5a5acadf06df2d83b100c62684b7425c6a11bcaa8/xxhash-3.6.0-cp314-cp314t-win_arm64.whl", hash = "sha256:bb79b1e63f6fd84ec778a4b1916dfe0a7c3fdb986c06addd5db3a0d413819d95", size = 28898, upload-time = "2025-10-02T14:36:17.843Z" }, + { url = "https://files.pythonhosted.org/packages/93/1e/8aec23647a34a249f62e2398c42955acd9b4c6ed5cf08cbea94dc46f78d2/xxhash-3.6.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0f7b7e2ec26c1666ad5fc9dbfa426a6a3367ceaf79db5dd76264659d509d73b0", size = 30662, upload-time = "2025-10-02T14:37:01.743Z" }, + { url = "https://files.pythonhosted.org/packages/b8/0b/b14510b38ba91caf43006209db846a696ceea6a847a0c9ba0a5b1adc53d6/xxhash-3.6.0-pp311-pypy311_pp73-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5dc1e14d14fa0f5789ec29a7062004b5933964bb9b02aae6622b8f530dc40296", size = 41056, upload-time = "2025-10-02T14:37:02.879Z" }, + { url = "https://files.pythonhosted.org/packages/50/55/15a7b8a56590e66ccd374bbfa3f9ffc45b810886c8c3b614e3f90bd2367c/xxhash-3.6.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:881b47fc47e051b37d94d13e7455131054b56749b91b508b0907eb07900d1c13", size = 36251, upload-time = "2025-10-02T14:37:04.44Z" }, + { url = "https://files.pythonhosted.org/packages/62/b2/5ac99a041a29e58e95f907876b04f7067a0242cb85b5f39e726153981503/xxhash-3.6.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c6dc31591899f5e5666f04cc2e529e69b4072827085c1ef15294d91a004bc1bd", size = 32481, upload-time = "2025-10-02T14:37:05.869Z" }, + { url = "https://files.pythonhosted.org/packages/7b/d9/8d95e906764a386a3d3b596f3c68bb63687dfca806373509f51ce8eea81f/xxhash-3.6.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:15e0dac10eb9309508bfc41f7f9deaa7755c69e35af835db9cb10751adebc35d", size = 31565, upload-time = "2025-10-02T14:37:06.966Z" }, +] + [[package]] name = "yarl" version = "1.20.0" From b53a10df6ad1b4b1ddd6e43e3dc778574f42658a Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Fri, 31 Oct 2025 15:54:09 +0000 Subject: [PATCH 10/66] feat: first working example for model/module --- torchTextClassifiers/model/__init__.py | 2 + torchTextClassifiers/model/lightning.py | 163 ++++++++++++ torchTextClassifiers/model/model.py | 320 ++++++++++++++++++++++++ 3 files changed, 485 insertions(+) create mode 100644 torchTextClassifiers/model/__init__.py create mode 100644 torchTextClassifiers/model/lightning.py create mode 100644 torchTextClassifiers/model/model.py diff --git a/torchTextClassifiers/model/__init__.py b/torchTextClassifiers/model/__init__.py new file mode 100644 index 0000000..29d16c9 --- /dev/null +++ b/torchTextClassifiers/model/__init__.py @@ -0,0 +1,2 @@ +from .lightning import TextClassificationModule as TextClassificationModule +from .model import TextClassificationModel as TextClassificationModel diff --git a/torchTextClassifiers/model/lightning.py b/torchTextClassifiers/model/lightning.py new file mode 100644 index 
0000000..1c2cce0
--- /dev/null
+++ b/torchTextClassifiers/model/lightning.py
@@ -0,0 +1,163 @@
+import pytorch_lightning as pl
+import torch
+from torchmetrics import Accuracy
+
+from .model import TextClassificationModel
+
+# ============================================================================
+# PyTorch Lightning Module
+# ============================================================================
+
+
+class TextClassificationModule(pl.LightningModule):
+    """PyTorch Lightning module for TextClassificationModel."""
+
+    def __init__(
+        self,
+        model: TextClassificationModel,
+        loss,
+        optimizer,
+        optimizer_params,
+        scheduler,
+        scheduler_params,
+        scheduler_interval="epoch",
+        **kwargs,
+    ):
+        """
+        Initialize TextClassificationModule.
+
+        Args:
+            model: Model.
+            loss: Loss function.
+            optimizer: Optimizer.
+            optimizer_params: Optimizer parameters.
+            scheduler: Scheduler.
+            scheduler_params: Scheduler parameters.
+            scheduler_interval: Scheduler interval.
+        """
+        super().__init__()
+        self.save_hyperparameters(ignore=["model", "loss"])
+
+        self.model = model
+        self.loss = loss
+        self.accuracy_fn = Accuracy(task="multiclass", num_classes=self.model.num_classes)
+        self.optimizer = optimizer
+        self.optimizer_params = optimizer_params
+        self.scheduler = scheduler
+        self.scheduler_params = scheduler_params
+        self.scheduler_interval = scheduler_interval
+
+    def forward(self, batch) -> torch.Tensor:
+        """
+        Perform forward-pass.
+
+        Args:
+            batch (List[torch.LongTensor]): Batch to perform forward-pass on.
+
+        Returns (torch.Tensor): Prediction.
+        """
+        return self.model(
+            encoded_text=batch["input_ids"],
+            attention_mask=batch["attention_mask"],
+            categorical_vars=batch.get("categorical_vars", None),
+        )
+
+    def training_step(self, batch, batch_idx: int) -> torch.Tensor:
+        """
+        Training step.
+
+        Args:
+            batch (List[torch.LongTensor]): Training batch.
+            batch_idx (int): Batch index.
+
+        Returns (torch.Tensor): Loss tensor.
+        """
+
+        targets = batch["labels"]
+
+        outputs = self.forward(batch)
+        loss = self.loss(outputs, targets)
+        self.log("train_loss", loss, on_epoch=True, on_step=True, prog_bar=True)
+        accuracy = self.accuracy_fn(outputs, targets)
+        self.log("train_accuracy", accuracy, on_epoch=True, on_step=False, prog_bar=True)
+
+        torch.cuda.empty_cache()
+
+        return loss
+
+    def validation_step(self, batch, batch_idx: int):
+        """
+        Validation step.
+
+        Args:
+            batch (List[torch.LongTensor]): Validation batch.
+            batch_idx (int): Batch index.
+
+        Returns (torch.Tensor): Loss tensor.
+        """
+        targets = batch["labels"]
+
+        outputs = self.forward(batch)
+        loss = self.loss(outputs, targets)
+        self.log("val_loss", loss, on_epoch=True, on_step=False, prog_bar=True, sync_dist=True)
+
+        accuracy = self.accuracy_fn(outputs, targets)
+        self.log("val_accuracy", accuracy, on_epoch=True, on_step=False, prog_bar=True)
+        return loss
+
+    def test_step(self, batch, batch_idx: int):
+        """
+        Test step.
+
+        Args:
+            batch (List[torch.LongTensor]): Test batch.
+            batch_idx (int): Batch index.
+
+        Returns (torch.Tensor): Loss tensor.
+        """
+        targets = batch["labels"]
+
+        outputs = self.forward(batch)
+        loss = self.loss(outputs, targets)
+
+        accuracy = self.accuracy_fn(outputs, targets)
+
+        return loss, accuracy
+
+    def predict_step(self, batch, batch_idx: int, dataloader_idx: int = 0):
+        """
+        Prediction step.
+
+        Args:
+            batch (List[torch.LongTensor]): Prediction batch.
+            batch_idx (int): Batch index.
+            dataloader_idx (int): Dataloader index.
+
+        Returns (torch.Tensor): Predictions.
+        """
+        outputs = self.forward(batch)
+        return outputs
+
+    def configure_optimizers(self):
+        """
+        Configure optimizer and scheduler for PyTorch Lightning.
+
+        Returns: Optimizer and scheduler for PyTorch Lightning.
+        """
+        optimizer = self.optimizer(self.parameters(), **self.optimizer_params)
+
+        # Only use scheduler if it's not ReduceLROnPlateau or if we can ensure val_loss is available
+        # For complex training setups, sometimes val_loss is not available every epoch
+        if hasattr(self.scheduler, "__name__") and "ReduceLROnPlateau" in self.scheduler.__name__:
+            # For ReduceLROnPlateau, use train_loss as it's always available
+            scheduler = self.scheduler(optimizer, **self.scheduler_params)
+            scheduler_config = {
+                "scheduler": scheduler,
+                "monitor": "train_loss",
+                "interval": self.scheduler_interval,
+            }
+            return [optimizer], [scheduler_config]
+        else:
+            # For other schedulers (StepLR, etc.), no monitoring needed
+            scheduler = self.scheduler(optimizer, **self.scheduler_params)
+            return [optimizer], [scheduler]
diff --git a/torchTextClassifiers/model/model.py b/torchTextClassifiers/model/model.py
new file mode 100644
index 0000000..ffb3485
--- /dev/null
+++ b/torchTextClassifiers/model/model.py
@@ -0,0 +1,320 @@
+"""FastText model components.
+
+This module contains the PyTorch model, Lightning module, and dataset classes
+for FastText classification. Consolidates what was previously in pytorch_model.py,
+lightning_module.py, and dataset.py.
+"""
+
+import logging
+from typing import Annotated, List, Union
+
+import torch
+from torch import nn
+
+try:
+    from captum.attr import LayerIntegratedGradients
+
+    HAS_CAPTUM = True
+except ImportError:
+    HAS_CAPTUM = False
+
+from torchTextClassifiers.utilities.checkers import validate_categorical_inputs
+
+logger = logging.getLogger(__name__)
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    handlers=[logging.StreamHandler()],
+)
+
+
+# ============================================================================
+# PyTorch Model
+# ============================================================================
+
+
+class TextClassificationModel(nn.Module):
+    """FastText Pytorch Model."""
+
+    def __init__(
+        self,
+        embedding_dim: int,
+        num_classes: int,
+        tokenizer=None,
+        num_rows: int = None,
+        categorical_vocabulary_sizes: List[int] = None,
+        categorical_embedding_dims: Union[List[int], int] = None,
+        sparse: bool = False,
+    ):
+        """
+        Constructor for the FastTextModel class.
+
+        Args:
+            embedding_dim (int): Dimension of the text embedding space.
+            buckets (int): Number of rows in the embedding matrix.
+            num_classes (int): Number of classes.
+            categorical_vocabulary_sizes (List[int]): List of the number of
+                modalities for additional categorical features.
+            padding_idx (int, optional): Padding index for the text
+                descriptions. Defaults to 0.
+            sparse (bool): Indicates if Embedding layer is sparse.
+        """
+        super().__init__()
+
+        if isinstance(categorical_embedding_dims, int):
+            # if provided categorical embedding dims is an int, average the categorical embeddings before concatenating to sentence embedding
+            self.average_cat_embed = True
+        else:
+            self.average_cat_embed = False
+
+        categorical_vocabulary_sizes, categorical_embedding_dims, num_categorical_features = (
+            validate_categorical_inputs(
+                categorical_vocabulary_sizes,
+                categorical_embedding_dims,
+                num_categorical_features=None,
+            )
+        )
+
+        assert (
+            isinstance(categorical_embedding_dims, list) or categorical_embedding_dims is None
+        ), "categorical_embedding_dims must be a list of int at this stage"
+
+        if categorical_embedding_dims is None:
+            self.average_cat_embed = False
+
+        if tokenizer is None:
+            if num_rows is None:
+                raise ValueError(
+                    "Either tokenizer or num_rows must be provided (number of rows in the embedding matrix)."
+                )
+        else:
+            if num_rows is not None:
+                if num_rows != tokenizer.vocab_size:
+                    logger.warning(
+                        "num_rows is different from the number of tokens in the tokenizer. Using provided num_rows."
+                    )
+            else:
+                num_rows = tokenizer.vocab_size
+
+        self.num_rows = num_rows
+
+        self.num_classes = num_classes
+        self.tokenizer = tokenizer
+        self.padding_idx = self.tokenizer.padding_idx
+        self.embedding_dim = embedding_dim
+        self.sparse = sparse
+
+        self.categorical_embedding_dims = categorical_embedding_dims
+
+        self.embeddings = nn.Embedding(
+            embedding_dim=embedding_dim,
+            num_embeddings=num_rows,
+            padding_idx=self.padding_idx,
+            sparse=sparse,
+        )
+
+        self.categorical_embedding_layers = {}
+
+        # Entry dim for the last layer:
+        # 1. embedding_dim if no categorical variables or summing the categorical embeddings to sentence embedding
+        # 2. embedding_dim + cat_embedding_dim if averaging the categorical embeddings before concatenating to sentence embedding (categorical_embedding_dims is an int)
+        # 3. embedding_dim + sum(categorical_embedding_dims) if concatenating individually the categorical embeddings to sentence embedding (no averaging, categorical_embedding_dims is a list)
+        dim_in_last_layer = embedding_dim
+        if self.average_cat_embed:
+            dim_in_last_layer += categorical_embedding_dims[0]
+
+        if categorical_vocabulary_sizes is not None:
+            self.categorical_variables = True
+            for var_idx, num_rows in enumerate(categorical_vocabulary_sizes):
+                if categorical_embedding_dims is not None:
+                    emb = nn.Embedding(
+                        embedding_dim=categorical_embedding_dims[var_idx], num_embeddings=num_rows
+                    )  # concatenate to sentence embedding
+                    if not self.average_cat_embed:
+                        dim_in_last_layer += categorical_embedding_dims[var_idx]
+                else:
+                    emb = nn.Embedding(
+                        embedding_dim=embedding_dim, num_embeddings=num_rows
+                    )  # sum to sentence embedding
+                self.categorical_embedding_layers[var_idx] = emb
+                setattr(self, "emb_{}".format(var_idx), emb)
+        else:
+            self.categorical_variables = False
+
+        self.fc = nn.Linear(dim_in_last_layer, num_classes)
+
+    def _get_sentence_embedding(
+        self, token_embeddings: torch.Tensor, attention_mask: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Compute sentence embedding from token embeddings.
+
+        Args:
+            token_embeddings (torch.Tensor), shape (batch_size, seq_len, embedding_dim): Embedded
+                padded text
+            attention_mask (torch.Tensor[Long]), shape (batch_size, seq_len): Mask with 1 for
+                non-pad tokens and 0 for pad positions
+
+        Returns:
+            torch.Tensor: Sentence embeddings, shape (batch_size, embedding_dim)
+        """
+
+        # average over non-pad token embeddings. PAD always has 0 vector so no influence in the sum
+        # attention mask has 1 for non-pad tokens and 0 for pad token positions
+        # TODO: add attention logic at some point
+        sentence_embedding = token_embeddings.sum(dim=1) / attention_mask.sum(
+            dim=1, keepdim=True
+        ).clamp(min=1.0)  # sum is over seq_len dim
+
+        sentence_embedding = torch.nan_to_num(sentence_embedding, 0.0)
+
+        return sentence_embedding
+
+    def forward(
+        self,
+        encoded_text: Annotated[torch.Tensor, "batch seq_len"],
+        attention_mask: Annotated[torch.Tensor, "batch seq_len"],
+        categorical_vars: Annotated[torch.Tensor, "batch num_cats"],
+    ) -> torch.Tensor:
+        """
+        Memory-efficient forward pass implementation.
+
+        Args:
+            encoded_text (torch.Tensor[Long]), shape (batch_size, seq_len): Tokenized
+                padded text
+            categorical_vars (torch.Tensor[Long]): Additional categorical features,
+                (batch_size, num_categorical_features)
+
+        Returns:
+            torch.Tensor: Model output scores for each class
+        """
+
+        # Ensure correct dtype and device once
+        if encoded_text.dtype != torch.long:
+            encoded_text = encoded_text.to(torch.long)
+
+        # Compute embeddings and averaging in a memory-efficient way
+        token_embeddings = self.embeddings(encoded_text)  # (batch_size, seq_len, embedding_dim)
+
+        x_text = self._get_sentence_embedding(
+            token_embeddings=token_embeddings, attention_mask=attention_mask
+        )
+
+        # Handle categorical variables efficiently
+        if self.categorical_variables and categorical_vars.numel() > 0:
+            cat_embeds = []
+            # Process categorical embeddings in batch
+            for i, (_, embed_layer) in enumerate(self.categorical_embedding_layers.items()):
+                cat_input = categorical_vars[:, i].long()
+
+                # Check if categorical values are within valid range
+                vocab_size = embed_layer.num_embeddings
+                max_val = cat_input.max().item()
+                min_val = cat_input.min().item()
+
+                if max_val >= vocab_size or min_val < 0:
+                    raise ValueError(
+                        f"Categorical feature {i}: values range [{min_val}, {max_val}] exceed vocabulary size {vocab_size}."
+                    )
+
+                cat_embed = embed_layer(cat_input)
+                if cat_embed.dim() > 2:
+                    cat_embed = cat_embed.squeeze(1)
+                cat_embeds.append(cat_embed)
+
+            if self.categorical_embedding_dims is not None:
+                if self.average_cat_embed:
+                    # Stack and average in one operation
+                    x_cat = torch.stack(cat_embeds, dim=0).mean(dim=0)
+                    x_combined = torch.cat([x_text, x_cat], dim=1)
+                else:
+                    # Optimize concatenation
+                    x_combined = torch.cat([x_text] + cat_embeds, dim=1)
+            else:
+                # Sum embeddings efficiently
+                x_combined = x_text + torch.stack(cat_embeds, dim=0).sum(dim=0)
+        else:
+            x_combined = x_text
+
+        # Final linear layer
+        return self.fc(x_combined)
+
+    @torch.no_grad()
+    def predict(
+        self,
+        text: List[str],
+        categorical_variables: List[List[int]] = None,
+        top_k=1,
+        explain=False,
+    ):
+        """
+        Args:
+            text (List[str]): A list of text observations.
+            categorical_variables (List[List[int]]): Additional categorical features,
+                one list per observation.
+            top_k (int): for each sentence, return the top_k most likely predictions (default: 1)
+            explain (bool): launch gradient integration to have an explanation of the prediction (default: False)
+
+        Returns:
+            if explain is False:
+                predictions (torch.Tensor, shape (len(text), top_k)): A tensor containing the top_k most likely codes to the query.
+                confidence (torch.Tensor, shape (len(text), top_k)): A tensor array containing the corresponding confidence scores.
+            if explain is True:
+                predictions (torch.Tensor, shape (len(text), top_k)): Containing the top_k most likely codes to the query.
+                confidence (torch.Tensor, shape (len(text), top_k)): Corresponding confidence scores.
+                all_attributions (torch.Tensor, shape (len(text), top_k, seq_len)): A tensor containing the attributions for each token in the text.
+                x (torch.Tensor): A tensor containing the token indices of the text.
+                id_to_token_dicts (List[Dict[int, str]]): A list of dictionaries mapping token indices to tokens (one for each sentence).
+                token_to_id_dicts (List[Dict[str, int]]): A list of dictionaries mapping tokens to token indices: the reverse of those in id_to_token_dicts.
+                text (list[str]): A list containing the preprocessed text (one line for each sentence).
+        """
+
+        if explain:
+            if not HAS_CAPTUM:
+                raise ImportError(
+                    "Captum is not installed and is required for explainability. Run 'pip install/uv add torchTextClassifiers[explainability]'."
+                )
+            lig = LayerIntegratedGradients(
+                self, self.embeddings
+            )  # initialize a Captum layer gradient integrator
+
+        self.eval()
+
+        tokenize_output = self.tokenizer.tokenize(text)
+
+        encoded_text = tokenize_output["input_ids"]  # (batch_size, seq_len)
+        attention_mask = tokenize_output["attention_mask"]  # (batch_size, seq_len)
+
+        if categorical_variables is not None:
+            categorical_vars = torch.tensor(
+                categorical_variables, dtype=torch.float32
+            )  # (batch_size, num_categorical_features)
+        else:
+            categorical_vars = torch.empty((encoded_text.shape[0], 0), dtype=torch.float32)
+
+        pred = self(
+            encoded_text, attention_mask, categorical_vars
+        )  # forward pass, contains the prediction scores (len(text), num_classes)
+        label_scores = pred.detach().cpu()
+        label_scores_topk = torch.topk(label_scores, k=top_k, dim=1)
+
+        predictions = label_scores_topk.indices  # get the top_k most likely predictions
+        confidence = torch.round(label_scores_topk.values, decimals=2)  # and their scores
+
+        if explain:
+            all_attributions = []
+            for k in range(top_k):
+                attributions = lig.attribute(
+                    (encoded_text, attention_mask, categorical_vars),
+                    target=torch.Tensor(predictions[:, k]).long(),
+                )  # (batch_size, seq_len)
+                attributions = attributions.sum(dim=-1)
+                all_attributions.append(attributions.detach().cpu())
+
+            all_attributions = torch.stack(all_attributions, dim=1)  # (batch_size, top_k, seq_len)
+
+            return {
+                "prediction": predictions,
+                "confidence": confidence,
+                "attributions": all_attributions,
+            }
+        else:
+            return predictions, confidence
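For orientation before the follow-up patches: a minimal usage sketch of the API the two files above introduce. The dummy tokenizer, data, and hyperparameters are illustrative assumptions, not part of the patch; the batch layout mirrors what TextClassificationModule.forward expects.

import pytorch_lightning as pl
import torch
from torch import nn
from torch.utils.data import DataLoader

from torchTextClassifiers.model import TextClassificationModel, TextClassificationModule


class DummyTokenizer:
    """Illustrative stand-in exposing the two attributes the model reads."""
    vocab_size = 100
    padding_idx = 0


model = TextClassificationModel(embedding_dim=32, num_classes=3, tokenizer=DummyTokenizer())
module = TextClassificationModule(
    model=model,
    loss=nn.CrossEntropyLoss(),
    optimizer=torch.optim.Adam,
    optimizer_params={"lr": 1e-3},
    scheduler=torch.optim.lr_scheduler.StepLR,
    scheduler_params={"step_size": 1},
)

# Batches are dicts with input_ids / attention_mask / labels keys.
train_data = [
    {
        "input_ids": torch.randint(1, 100, (12,)),
        "attention_mask": torch.ones(12, dtype=torch.long),
        "labels": torch.randint(0, 3, ()),
    }
    for _ in range(64)
]
trainer = pl.Trainer(max_epochs=1, logger=False, enable_checkpointing=False)
trainer.fit(module, DataLoader(train_data, batch_size=8))

Since StepLR is passed, configure_optimizers takes the no-monitor branch; a ReduceLROnPlateau class would instead be returned with "train_loss" as the monitored metric.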
From 6f3417c8de73e962ef9c4f397725e833c7f8f7d8 Mon Sep 17 00:00:00 2001
From: meilame-tayebjee
Date: Fri, 31 Oct 2025 15:54:34 +0000
Subject: [PATCH 11/66] chore: fix signature

---
 torchTextClassifiers/tokenizers/base.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/torchTextClassifiers/tokenizers/base.py b/torchTextClassifiers/tokenizers/base.py
index 07de50b..d672f77 100644
--- a/torchTextClassifiers/tokenizers/base.py
+++ b/torchTextClassifiers/tokenizers/base.py
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from typing import List, Union


 class BaseTokenizer(ABC):
@@ -6,7 +7,7 @@ def __init__(self, vocab_size: int):
         self.vocab_size = vocab_size

     @abstractmethod
-    def tokenize(self, text: str) -> list:
+    def tokenize(self, text: Union[str, List[str]]) -> list:
         """Tokenizes the raw input text into a list of tokens."""
         pass

@@ -21,7 +22,7 @@ def __init__(self, vocab_size: int):
         self.trained = False
         self.tokenizer = None

-    def tokenize(self, text: str) -> list:
+    def tokenize(self, text: Union[str, List[str]]) -> list:
         if not self.trained:
             raise RuntimeError("Tokenizer must be trained before tokenization.")

From c600f18472a10a10e59e639a6c416aa1606baae0 Mon Sep 17 00:00:00 2001
From: meilame-tayebjee
Date: Mon, 3 Nov 2025 18:47:59 +0000
Subject: [PATCH 12/66] chore: default value for batch_idx in predict

---
 torchTextClassifiers/model/lightning.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchTextClassifiers/model/lightning.py b/torchTextClassifiers/model/lightning.py
index 1c2cce0..b6f6e01 100644
--- a/torchTextClassifiers/model/lightning.py
+++ b/torchTextClassifiers/model/lightning.py
@@ -124,7 +124,7 @@ def test_step(self, batch, batch_idx: int):

         return loss, accuracy

-    def predict_step(self, batch, batch_idx: int, dataloader_idx: int = 0):
+    def predict_step(self, batch, batch_idx: int = 0, dataloader_idx: int = 0):
         """
         Prediction step.

From 85cb8b844a9099af31efa5bcf88581b006d82850 Mon Sep 17 00:00:00 2001
From: meilame-tayebjee
Date: Mon, 3 Nov 2025 18:56:52 +0000
Subject: [PATCH 13/66] feat!: violently modularize and simplify
 forward+checking

ClassificationHead and CatVarNet objects
maximum flexibility
TextFeaturizer TODO
---
 .../model/categorical_var_net.py            | 122 ++++++++++++++++++
 .../model/classification_heads.py           |  36 ++++++
 torchTextClassifiers/model/model.py         | 108 +++------------
 torchTextClassifiers/utilities/checkers.py  |  60 +--------
 4 files changed, 184 insertions(+), 142 deletions(-)
 create mode 100644 torchTextClassifiers/model/categorical_var_net.py
 create mode 100644 torchTextClassifiers/model/classification_heads.py

diff --git a/torchTextClassifiers/model/categorical_var_net.py b/torchTextClassifiers/model/categorical_var_net.py
new file mode 100644
index 0000000..c217902
--- /dev/null
+++ b/torchTextClassifiers/model/categorical_var_net.py
@@ -0,0 +1,122 @@
+from enum import Enum
+from typing import List, Optional, Union
+
+import torch
+from torch import nn
+
+
+class ForwardType(Enum):
+    SUM_TO_TEXT = "EMBEDDING_SUM_TO_TEXT"
+    AVERAGE_AND_CONCAT = "EMBEDDING_AVERAGE_AND_CONCAT"
+    CONCATENATE_ALL = "EMBEDDING_CONCATENATE_ALL"
+
+
+class CategoricalVariableNet(nn.Module):
+    def __init__(
+        self,
+        categorical_vocabulary_sizes: List[int],
+        categorical_embedding_dims: Optional[Union[List[int], int]] = None,
+        text_embedding_dim: Optional[int] = None,
+    ):
+        super().__init__()
+
+        self.categorical_vocabulary_sizes = categorical_vocabulary_sizes
+        self.categorical_embedding_dims = categorical_embedding_dims
+        self.text_embedding_dim = text_embedding_dim
+
+        self._validate_categorical_inputs()
+
+        self.categorical_embedding_layers = {}
+
+        for var_idx, num_rows in enumerate(self.categorical_vocabulary_sizes):
+            emb_layer = nn.Embedding(
+                num_embeddings=num_rows,
+                embedding_dim=self.categorical_embedding_dims[var_idx],
+            )
+            self.categorical_embedding_layers[var_idx] = emb_layer
+            setattr(self, f"categorical_embedding_{var_idx}", emb_layer)
+
+    def forward(self, categorical_vars_tensor: torch.Tensor) -> torch.Tensor:
+        cat_embeds = self._get_cat_embeds(categorical_vars_tensor)
+        if self.forward_type == ForwardType.SUM_TO_TEXT:
+            x_combined = torch.stack(cat_embeds, dim=0).sum(dim=0)  # (bs, text_embed_dim)
+        elif self.forward_type == ForwardType.AVERAGE_AND_CONCAT:
+            x_combined = torch.stack(cat_embeds, dim=0).mean(dim=0)  # (bs, embed_dim)
+        elif self.forward_type == ForwardType.CONCATENATE_ALL:
+            x_combined = torch.cat(cat_embeds, dim=1)  # (bs, sum of all cat embed dims)
+        else:
+            raise ValueError(f"Unknown 
forward type: {self.forward_type}") + + assert ( + x_combined.dim() == 2 + ), "Output combined tensor must be 2-dimensional (batch_size, embed_dim)" + assert x_combined.size(1) == self.output_dim + + return x_combined + + def _get_cat_embeds(self, categorical_vars_tensor: torch.Tensor): + cat_embeds = [] + + for i, embed_layer in self.categorical_embedding_layers.items(): + cat_var_tensor = categorical_vars_tensor[:, i] + + # Check if categorical values are within valid range + vocab_size = embed_layer.num_embeddings + max_val = cat_var_tensor.max().item() + min_val = cat_var_tensor.min().item() + + if max_val >= vocab_size or min_val < 0: + raise ValueError( + f"Categorical feature {i}: values range [{min_val}, {max_val}] exceed vocabulary size {vocab_size}." + ) + + cat_embed = embed_layer(cat_var_tensor) + if cat_embed.dim() > 2: + cat_embed = cat_embed.squeeze(1) + cat_embeds.append(cat_embed) + + return cat_embeds + + def _validate_categorical_inputs(self): + categorical_vocabulary_sizes = self.categorical_vocabulary_sizes + categorical_embedding_dims = self.categorical_embedding_dims + + if not isinstance(categorical_vocabulary_sizes, list): + raise TypeError("categorical_vocabulary_sizes must be a list of int") + + if isinstance(categorical_embedding_dims, list): + if len(categorical_vocabulary_sizes) != len(categorical_embedding_dims): + raise ValueError( + "Categorical vocabulary sizes and their embedding dimensions must have the same length" + ) + + num_categorical_features = len(categorical_vocabulary_sizes) + + # "Transform" embedding dims into a suitable list, or stay None + if categorical_embedding_dims is not None: + if isinstance(categorical_embedding_dims, int): + self.forward_type = ForwardType.AVERAGE_AND_CONCAT + self.output_dim = categorical_embedding_dims + categorical_embedding_dims = [categorical_embedding_dims] * num_categorical_features + + elif isinstance(categorical_embedding_dims, list): + self.forward_type = ForwardType.CONCATENATE_ALL + self.output_dim = sum(categorical_embedding_dims) + else: + raise TypeError("categorical_embedding_dims must be an int, a list of int or None") + else: + if self.text_embedding_dim is None: + raise ValueError( + "If categorical_embedding_dims is None, text_embedding_dim must be provided" + ) + self.forward_type = ForwardType.SUM_TO_TEXT + self.output_dim = self.text_embedding_dim + categorical_embedding_dims = [self.text_embedding_dim] * num_categorical_features + + assert ( + isinstance(categorical_embedding_dims, list) or categorical_embedding_dims is None + ), "categorical_embedding_dims must be a list of int at this point" + + self.categorical_vocabulary_sizes = categorical_vocabulary_sizes + self.categorical_embedding_dims = categorical_embedding_dims + self.num_categorical_features = num_categorical_features diff --git a/torchTextClassifiers/model/classification_heads.py b/torchTextClassifiers/model/classification_heads.py new file mode 100644 index 0000000..a5b29c3 --- /dev/null +++ b/torchTextClassifiers/model/classification_heads.py @@ -0,0 +1,36 @@ +import torch +from torch import nn + + +class ClassificationHead(nn.Module): + def __init__(self, input_dim=None, num_classes=None, net=None): + super().__init__() + if net is not None: + self.net = net + self.input_dim = net.in_features + self.num_classes = net.out_features + else: + assert ( + input_dim is not None and num_classes is not None + ), "Either net or both input_dim and num_classes must be provided." 
+ self.net = nn.Linear(input_dim, num_classes) + self.input_dim, self.num_classes = self._get_linear_input_output_dims(self.net) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.net(x) + + @staticmethod + def _get_linear_input_output_dims(module: nn.Module): + """ + Returns (input_dim, output_dim) for any module containing Linear layers. + Works for Linear, Sequential, or nested models. + """ + # Collect all Linear layers recursively + linears = [m for m in module.modules() if isinstance(m, nn.Linear)] + + if not linears: + raise ValueError("No Linear layers found in the given module.") + + input_dim = linears[0].in_features + output_dim = linears[-1].out_features + return input_dim, output_dim diff --git a/torchTextClassifiers/model/model.py b/torchTextClassifiers/model/model.py index ffb3485..a7633e6 100644 --- a/torchTextClassifiers/model/model.py +++ b/torchTextClassifiers/model/model.py @@ -18,7 +18,8 @@ except ImportError: HAS_CAPTUM = False -from torchTextClassifiers.utilities.checkers import validate_categorical_inputs +from torchTextClassifiers.model.categorical_var_net import CategoricalVariableNet, ForwardType +from torchTextClassifiers.model.classification_heads import ClassificationHead logger = logging.getLogger(__name__) @@ -41,7 +42,8 @@ class TextClassificationModel(nn.Module): def __init__( self, embedding_dim: int, - num_classes: int, + classification_head: ClassificationHead, + categorical_variable_net: CategoricalVariableNet = None, tokenizer=None, num_rows: int = None, categorical_vocabulary_sizes: List[int] = None, @@ -63,27 +65,6 @@ def __init__( """ super().__init__() - if isinstance(categorical_embedding_dims, int): - # if provided categorical embedding dims is an int, average the categorical embeddings before concatenating to sentence embedding - self.average_cat_embed = True - else: - self.average_cat_embed = False - - categorical_vocabulary_sizes, categorical_embedding_dims, num_categorical_features = ( - validate_categorical_inputs( - categorical_vocabulary_sizes, - categorical_embedding_dims, - num_categorical_features=None, - ) - ) - - assert ( - isinstance(categorical_embedding_dims, list) or categorical_embedding_dims is None - ), "categorical_embedding_dims must be a list of int at this stage" - - if categorical_embedding_dims is None: - self.average_cat_embed = False - if tokenizer is None: if num_rows is None: raise ValueError( @@ -99,8 +80,6 @@ def __init__( num_rows = tokenizer.vocab_size self.num_rows = num_rows - - self.num_classes = num_classes self.tokenizer = tokenizer self.padding_idx = self.tokenizer.padding_idx self.embedding_dim = embedding_dim @@ -115,35 +94,9 @@ def __init__( sparse=sparse, ) - self.categorical_embedding_layers = {} - - # Entry dim for the last layer: - # 1. embedding_dim if no categorical variables or summing the categrical embeddings to sentence embedding - # 2. embedding_dim + cat_embedding_dim if averaging the categorical embeddings before concatenating to sentence embedding (categorical_embedding_dims is a int) - # 3. 
embedding_dim + sum(categorical_embedding_dims) if concatenating individually the categorical embeddings to sentence embedding (no averaging, categorical_embedding_dims is a list) - dim_in_last_layer = embedding_dim - if self.average_cat_embed: - dim_in_last_layer += categorical_embedding_dims[0] - - if categorical_vocabulary_sizes is not None: - self.categorical_variables = True - for var_idx, num_rows in enumerate(categorical_vocabulary_sizes): - if categorical_embedding_dims is not None: - emb = nn.Embedding( - embedding_dim=categorical_embedding_dims[var_idx], num_embeddings=num_rows - ) # concatenate to sentence embedding - if not self.average_cat_embed: - dim_in_last_layer += categorical_embedding_dims[var_idx] - else: - emb = nn.Embedding( - embedding_dim=embedding_dim, num_embeddings=num_rows - ) # sum to sentence embedding - self.categorical_embedding_layers[var_idx] = emb - setattr(self, "emb_{}".format(var_idx), emb) - else: - self.categorical_variables = False - - self.fc = nn.Linear(dim_in_last_layer, num_classes) + self.classification_head = classification_head + self.categorical_variable_net = categorical_variable_net + self.num_classes = self.classification_head.num_classes def _get_sentence_embedding( self, token_embeddings: torch.Tensor, attention_mask: torch.Tensor @@ -197,44 +150,21 @@ def forward( token_embeddings=token_embeddings, attention_mask=attention_mask ) - # Handle categorical variables efficiently - if self.categorical_variables and categorical_vars.numel() > 0: - cat_embeds = [] - # Process categorical embeddings in batch - for i, (_, embed_layer) in enumerate(self.categorical_embedding_layers.items()): - cat_input = categorical_vars[:, i].long() - - # Check if categorical values are within valid range - vocab_size = embed_layer.num_embeddings - max_val = cat_input.max().item() - min_val = cat_input.min().item() - - if max_val >= vocab_size or min_val < 0: - raise ValueError( - f"Categorical feature {i}: values range [{min_val}, {max_val}] exceed vocabulary size {vocab_size}." 
- ) + if self.categorical_variable_net: + x_cat = self.categorical_variable_net(categorical_vars) - cat_embed = embed_layer(cat_input) - if cat_embed.dim() > 2: - cat_embed = cat_embed.squeeze(1) - cat_embeds.append(cat_embed) - - if self.categorical_embedding_dims is not None: - if self.average_cat_embed: - # Stack and average in one operation - x_cat = torch.stack(cat_embeds, dim=0).mean(dim=0) - x_combined = torch.cat([x_text, x_cat], dim=1) - else: - # Optimize concatenation - x_combined = torch.cat([x_text] + cat_embeds, dim=1) + if ( + self.categorical_variable_net.forward_type == ForwardType.AVERAGE_AND_CONCAT + or self.categorical_variable_net.forward_type == ForwardType.CONCATENATE_ALL + ): + x_combined = torch.cat((x_text, x_cat), dim=1) else: - # Sum embeddings efficiently - x_combined = x_text + torch.stack(cat_embeds, dim=0).sum(dim=0) - else: - x_combined = x_text + assert self.categorical_variable_net.forward_type == ForwardType.SUM_TO_TEXT + x_combined = x_text + x_cat + + logits = self.classification_head(x_combined) - # Final linear layer - return self.fc(x_combined) + return logits @torch.no_grad() def predict( diff --git a/torchTextClassifiers/utilities/checkers.py b/torchTextClassifiers/utilities/checkers.py index 1fef3fc..11fa6a7 100644 --- a/torchTextClassifiers/utilities/checkers.py +++ b/torchTextClassifiers/utilities/checkers.py @@ -1,6 +1,5 @@ -import logging import json -from typing import Optional, Union, Type, List +import logging import numpy as np @@ -8,9 +7,9 @@ def check_X(X): - assert isinstance(X, np.ndarray), ( - "X must be a numpy array of shape (N,d), with the first column being the text and the rest being the categorical variables." - ) + assert isinstance( + X, np.ndarray + ), "X must be a numpy array of shape (N,d), with the first column being the text and the rest being the categorical variables." try: if X.ndim > 1: @@ -40,9 +39,9 @@ def check_X(X): def check_Y(Y): assert isinstance(Y, np.ndarray), "Y must be a numpy array of shape (N,) or (N,1)." - assert len(Y.shape) == 1 or (len(Y.shape) == 2 and Y.shape[1] == 1), ( - "Y must be a numpy array of shape (N,) or (N,1)." - ) + assert len(Y.shape) == 1 or ( + len(Y.shape) == 2 and Y.shape[1] == 1 + ), "Y must be a numpy array of shape (N,) or (N,1)." try: Y = Y.astype(int) @@ -52,51 +51,6 @@ def check_Y(Y): return Y -def validate_categorical_inputs( - categorical_vocabulary_sizes: List[int], - categorical_embedding_dims: Union[List[int], int], - num_categorical_features: int = None, -): - if categorical_vocabulary_sizes is None: - logger.warning("No categorical_vocabulary_sizes. It will be inferred later.") - return None, None, None - - else: - if not isinstance(categorical_vocabulary_sizes, list): - raise TypeError("categorical_vocabulary_sizes must be a list of int") - - if isinstance(categorical_embedding_dims, list): - if len(categorical_vocabulary_sizes) != len(categorical_embedding_dims): - raise ValueError( - "Categorical vocabulary sizes and their embedding dimensions must have the same length" - ) - - if num_categorical_features is not None: - if len(categorical_vocabulary_sizes) != num_categorical_features: - raise ValueError( - "len(categorical_vocabulary_sizes) must be equal to num_categorical_features" - ) - else: - num_categorical_features = len(categorical_vocabulary_sizes) - - assert num_categorical_features is not None, ( - "num_categorical_features should be inferred at this point." 
- ) - - # "Transform" embedding dims into a suitable list, or stay None - if categorical_embedding_dims is not None: - if isinstance(categorical_embedding_dims, int): - categorical_embedding_dims = [categorical_embedding_dims] * num_categorical_features - elif not isinstance(categorical_embedding_dims, list): - raise TypeError("categorical_embedding_dims must be an int or a list of int") - - assert isinstance(categorical_embedding_dims, list) or categorical_embedding_dims is None, ( - "categorical_embedding_dims must be a list of int at this point" - ) - - return categorical_vocabulary_sizes, categorical_embedding_dims, num_categorical_features - - class NumpyJSONEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, np.integer): From dc863ff4945abfd6232b40585772ede3d8acf5ba Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Mon, 3 Nov 2025 18:57:28 +0000 Subject: [PATCH 14/66] chore: remove tokenizer (now it is ngram tokenizer) --- .../classifiers/fasttext/tokenizer.py | 346 ------------------ 1 file changed, 346 deletions(-) delete mode 100644 torchTextClassifiers/classifiers/fasttext/tokenizer.py diff --git a/torchTextClassifiers/classifiers/fasttext/tokenizer.py b/torchTextClassifiers/classifiers/fasttext/tokenizer.py deleted file mode 100644 index 7573952..0000000 --- a/torchTextClassifiers/classifiers/fasttext/tokenizer.py +++ /dev/null @@ -1,346 +0,0 @@ -""" -NGramTokenizer class. -""" - -import ctypes -import json -from typing import List, Tuple, Type, Dict - -import numpy as np -import torch -from torch import Tensor -from concurrent.futures import ThreadPoolExecutor -from dataclasses import dataclass -from queue import Queue -import multiprocessing - -from ...utilities.preprocess import clean_text_feature - - -class NGramTokenizer: - """ - NGramTokenizer class. - """ - - def __init__( - self, - min_count: int, - min_n: int, - max_n: int, - num_tokens: int, - len_word_ngrams: int, - training_text: List[str], - **kwargs, - ): - """ - Constructor for the NGramTokenizer class. - - Args: - min_count (int): Minimum number of times a word has to be - in the training data to be given an embedding. - min_n (int): Minimum length of character n-grams. - max_n (int): Maximum length of character n-grams. - num_tokens (int): Number of rows in the embedding matrix. - word_ngrams (int): Maximum length of word n-grams. - training_text (List[str]): List of training texts. - - Raises: - ValueError: If `min_n` is 1 or smaller. - ValueError: If `max_n` is 7 or higher. - """ - if min_n < 2: - raise ValueError("`min_n` parameter must be greater than 1.") - if max_n > 6: - raise ValueError("`max_n` parameter must be smaller than 7.") - - self.min_count = min_count - self.min_n = min_n - self.max_n = max_n - self.num_tokens = num_tokens - self.word_ngrams = len_word_ngrams - - word_counts = {} - for sentence in training_text: - for word in sentence.split(" "): - word_counts[word] = word_counts.setdefault(word, 0) + 1 - - self.word_id_mapping = {} - i = 1 - for word, counts in word_counts.items(): - if word_counts[word] >= min_count: - self.word_id_mapping[word] = i - i += 1 - self.nwords = len(self.word_id_mapping) - - self.padding_index = self.num_tokens + self.get_nwords() - - def __str__(self) -> str: - """ - Returns description of the NGramTokenizer. - - Returns: - str: Description. - """ - return f"" - - def get_nwords(self) -> int: - """ - Return number of words kept in training data. - - Returns: - int: Number of words. 
- """ - return self.nwords - - def get_buckets(self) -> int: - """ - Return number of buckets for tokenizer. - - Returns: - int: Number of buckets. - """ - return self.num_tokens - - @staticmethod - def get_ngram_list(word: str, n: int) -> List[str]: - """ - Return the list of character n-grams for a word with a - given n. - - Args: - word (str): Word. - n (int): Length of the n-grams. - - Returns: - List[str]: List of character n-grams. - """ - return [word[i : i + n] for i in range(len(word) - n + 1)] - - @staticmethod - def get_hash(subword: str) -> int: - """ - Return hash for a given subword. - - Args: - subword (str): Character n-gram. - - Returns: - int: Corresponding hash. - """ - h = ctypes.c_uint32(2166136261).value - for c in subword: - c = ctypes.c_int8(ord(c)).value - h = ctypes.c_uint32(h ^ c).value - h = ctypes.c_uint32(h * 16777619).value - return h - - @staticmethod - def get_word_ngram_id(hashes: Tuple[int], bucket: int, nwords: int) -> int: - """ - Get word ngram index in the embedding matrix. - - Args: - hashes (Tuple[int]): Word hashes. - bucket (int): Number of rows in embedding matrix. - nwords (int): Number of words in the vocabulary. - - Returns: - int: Word ngram hash. - """ - hashes = [ctypes.c_int32(hash_value).value for hash_value in hashes] - h = ctypes.c_uint64(hashes[0]).value - for j in range(1, len(hashes)): - h = ctypes.c_uint64((h * 116049371)).value - h = ctypes.c_uint64(h + hashes[j]).value - return h % bucket + nwords - - def get_subword_index(self, subword: str) -> int: - """ - Return the row index from the embedding matrix which - corresponds to a character n-gram. - - Args: - subword (str): Character n-gram. - - Returns: - int: Index. - """ - return self.get_hash(subword) % self.num_tokens + self.nwords - - def get_word_index(self, word: str) -> int: - """ - Return the row index from the embedding matrix which - corresponds to a word. - - Args: - word (str): Word. - - Returns: - int: Index. - """ - return self.word_id_mapping[word] - - def get_subwords(self, word: str) -> Tuple[List[str], List[int]]: - """ - Return all subwords tokens and indices for a given word. - Also adds the whole word token and indice if the word is in word_id_mapping - (==> the word is in initial vocabulary + seen at least MIN_COUNT times). - Adds tags "<" and ">" to the word. - - Args: - word (str): Word. - - Returns: - Tuple[List[str], List[int]]: Tuple of tokens and indices. - """ - tokens = [] - word_with_tags = "<" + word + ">" - - # Get subwords and associated indices WITHOUT the whole word - for n in range(self.min_n, self.max_n + 1): - ngrams = self.get_ngram_list(word_with_tags, n) - tokens += [ - ngram for ngram in ngrams if ngram != word_with_tags and ngram != word - ] # Exclude the full word - - indices = [self.get_subword_index(token) for token in tokens] - assert word not in tokens - - # Add word token and indice only if the word is in word_id_mapping - if word in self.word_id_mapping.keys(): - self.get_word_index(word) - tokens = [word] + tokens - indices = [self.get_word_index(word)] + indices - - return (tokens, indices) - - def indices_matrix(self, sentence: str) -> tuple[torch.Tensor, dict, dict]: - """ - Returns an array of token indices for a text description. - - Args: - sentence (str): Text description. 
-
-        Returns:
-            tuple: (torch.Tensor of indices, id_to_token dict, token_to_id dict)
-        """
-        # Pre-split the sentence once
-        words = sentence.split()
-        words.append("</s>")  # Add end of string token
-
-        indices = []
-        all_tokens_id = {}
-
-        # Process subwords in one batch
-        for word in words[:-1]:  # Exclude </s> from subword processing
-            tokens, ind = self.get_subwords(word)
-            indices.extend(ind)
-            # Update dictionary with zip for efficiency
-            all_tokens_id.update(zip(tokens, ind))
-
-        # Add </s> token
-        indices.append(0)
-        all_tokens_id["</s>"] = 0
-
-        # Compute word n-grams more efficiently
-        if self.word_ngrams > 1:
-            # Pre-compute hashes for all words to avoid repeated computation
-            word_hashes = [self.get_hash(word) for word in words]
-
-            # Generate n-grams using sliding window
-            word_ngram_ids = []
-            for n in range(2, self.word_ngrams + 1):
-                for i in range(len(words) - n + 1):
-                    # Get slice of hashes for current n-gram
-                    gram_hashes = tuple(word_hashes[i : i + n])
-
-                    # Compute n-gram ID
-                    word_ngram_id = int(
-                        self.get_word_ngram_id(gram_hashes, self.num_tokens, self.nwords)
-                    )
-
-                    # Store gram and its ID
-                    gram = " ".join(words[i : i + n])
-                    all_tokens_id[gram] = word_ngram_id
-                    word_ngram_ids.append(word_ngram_id)
-
-            # Extend indices with n-gram IDs
-            indices.extend(word_ngram_ids)
-
-        # Create reverse mapping once at the end
-        id_to_token = {v: k for k, v in all_tokens_id.items()}
-
-        # Convert to tensor directly
-        return torch.tensor(indices, dtype=torch.long), id_to_token, all_tokens_id
-
-    def tokenize(self, text: list[str], text_tokens=True, preprocess=False):
-        """
-        Tokenize a list of sentences.
-
-        Args:
-            text (list[str]): List of sentences.
-            text_tokens (bool): If True, return tokenized text in tokens.
-            preprocess (bool): If True, preprocess text. Needs unidecode library.
-
-        Returns:
-            np.array: Array of indices.
-        """
-
-        if preprocess:
-            text = clean_text_feature(text)
-
-        tokenized_text = []
-        id_to_token_dicts = []
-        token_to_id_dicts = []
-        for sentence in text:
-            all_ind, id_to_token, token_to_id = self.indices_matrix(
-                sentence
-            )  # tokenize and convert to token indices
-            tokenized_text.append(all_ind)
-            id_to_token_dicts.append(id_to_token)
-            token_to_id_dicts.append(token_to_id)
-
-        if text_tokens:
-            tokenized_text_tokens = self._tokenized_text_in_tokens(
-                tokenized_text, id_to_token_dicts
-            )
-            return tokenized_text_tokens, tokenized_text, id_to_token_dicts, token_to_id_dicts
-        else:
-            return tokenized_text, id_to_token_dicts, token_to_id_dicts
-
-    def _tokenized_text_in_tokens(self, tokenized_text, id_to_token_dicts):
-        """
-        Convert tokenized text in int format to tokens in str format (given a mapping dictionary).
-        Private method. Used in tokenizer.tokenize and pytorch_model.predict()
-
-        Args:
-            tokenized_text (list): List of tokenized text in int format.
-            id_to_token_dicts (list[Dict]): List of dictionaries mapping token indices to tokens.
-
-        Both lists have the same length (number of sentences).
-
-        Returns:
-            list[list[str]]: List of tokenized text in str format.
-
-        """
-
-        return [
-            [
-                id_to_token_dicts[i][token_id.item()]
-                for token_id in tokenized_sentence
-                if token_id.item() not in {self.padding_index}
-            ]
-            for i, tokenized_sentence in enumerate(tokenized_text)
-        ]
-
-    def get_vocab(self):
-        return self.word_id_mapping
-
-    @classmethod
-    def from_json(cls: Type["NGramTokenizer"], filepath: str, training_text) -> "NGramTokenizer":
-        """
-        Load a dataclass instance from a JSON file.
-        """
-        with open(filepath, "r") as f:
-            data = json.load(f)
-        return cls(**data, training_text=training_text)
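The key idea in the file deleted above is fastText-style bucket hashing: each character n-gram is FNV-1a-hashed into a fixed number of embedding rows, offset past the word vocabulary, so unseen n-grams still get a (shared, collision-prone) row. A standalone sketch of that mapping, with the hash loop copied from the removed get_hash; the bucket count and vocabulary size are illustrative:

import ctypes


def fnv1a_32(subword: str) -> int:
    # FNV-1a over the characters, as implemented by the removed NGramTokenizer.get_hash
    h = ctypes.c_uint32(2166136261).value
    for c in subword:
        c = ctypes.c_int8(ord(c)).value
        h = ctypes.c_uint32(h ^ c).value
        h = ctypes.c_uint32(h * 16777619).value
    return h


num_tokens, nwords = 2000, 50  # illustrative: hashing buckets and kept-word count
# A character n-gram lands in one of num_tokens rows placed after the nwords word rows,
# which is why the embedding matrix is sized num_tokens + nwords (plus a padding row).
row = fnv1a_32("<ab") % num_tokens + nwords
print(row)  # deterministic; collisions between n-grams are accepted by design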
- """ - with open(filepath, "r") as f: - data = json.load(f) - return cls(**data, training_text=training_text) From 064b73f33ae1b27d0a75f2a4323a8428413abca5 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Tue, 4 Nov 2025 19:14:03 +0000 Subject: [PATCH 15/66] feat!(components): first working example with full modularity components are text_embedder, categorical_var_net and classification_head text_embedder is optional, but tokenizer is not (TF-IDF is conceptually a tokenizer) all are customizable need to add doc at some point --- .../model/components/__init__.py | 8 + .../{ => components}/categorical_var_net.py | 20 ++- .../classification_head.py} | 9 +- .../model/components/text_embedder.py | 81 +++++++++ torchTextClassifiers/model/lightning.py | 2 +- torchTextClassifiers/model/model.py | 158 ++++++++---------- 6 files changed, 184 insertions(+), 94 deletions(-) create mode 100644 torchTextClassifiers/model/components/__init__.py rename torchTextClassifiers/model/{ => components}/categorical_var_net.py (85%) rename torchTextClassifiers/model/{classification_heads.py => components/classification_head.py} (86%) create mode 100644 torchTextClassifiers/model/components/text_embedder.py diff --git a/torchTextClassifiers/model/components/__init__.py b/torchTextClassifiers/model/components/__init__.py new file mode 100644 index 0000000..ad62cad --- /dev/null +++ b/torchTextClassifiers/model/components/__init__.py @@ -0,0 +1,8 @@ +from .categorical_var_net import ( + CategoricalForwardType as CategoricalForwardType, +) +from .categorical_var_net import ( + CategoricalVariableNet as CategoricalVariableNet, +) +from .classification_head import ClassificationHead as ClassificationHead +from .text_embedder import TextEmbedder as TextEmbedder diff --git a/torchTextClassifiers/model/categorical_var_net.py b/torchTextClassifiers/model/components/categorical_var_net.py similarity index 85% rename from torchTextClassifiers/model/categorical_var_net.py rename to torchTextClassifiers/model/components/categorical_var_net.py index c217902..973dd6c 100644 --- a/torchTextClassifiers/model/categorical_var_net.py +++ b/torchTextClassifiers/model/components/categorical_var_net.py @@ -5,7 +5,7 @@ from torch import nn -class ForwardType(Enum): +class CategoricalForwardType(Enum): SUM_TO_TEXT = "EMBEDDING_SUM_TO_TEXT" AVERAGE_AND_CONCAT = "EMBEDDING_AVERAGE_AND_CONCAT" CONCATENATE_ALL = "EMBEDDING_CONCATENATE_ALL" @@ -25,6 +25,10 @@ def __init__( self.text_embedding_dim = text_embedding_dim self._validate_categorical_inputs() + assert isinstance( + self.forward_type, CategoricalForwardType + ), "forward_type must be set after validation" + assert isinstance(self.output_dim, int), "output_dim must be set as int after validation" self.categorical_embedding_layers = {} @@ -38,11 +42,11 @@ def __init__( def forward(self, categorical_vars_tensor: torch.Tensor) -> torch.Tensor: cat_embeds = self._get_cat_embeds(categorical_vars_tensor) - if self.forward_type == ForwardType.SUM_TO_TEXT: + if self.forward_type == CategoricalForwardType.SUM_TO_TEXT: x_combined = torch.stack(cat_embeds, dim=0).sum(dim=0) # (bs, text_embed_dim) - elif self.forward_type == ForwardType.AVERAGE_AND_CONCAT: + elif self.forward_type == CategoricalForwardType.AVERAGE_AND_CONCAT: x_combined = torch.stack(cat_embeds, dim=0).mean(dim=0) # (bs, embed_dim) - elif self.forward_type == ForwardType.CONCATENATE_ALL: + elif self.forward_type == CategoricalForwardType.CONCATENATE_ALL: x_combined = torch.cat(cat_embeds, dim=1) # (bs, sum of all cat embed 
dims) else: raise ValueError(f"Unknown forward type: {self.forward_type}") @@ -55,6 +59,8 @@ def forward(self, categorical_vars_tensor: torch.Tensor) -> torch.Tensor: return x_combined def _get_cat_embeds(self, categorical_vars_tensor: torch.Tensor): + if categorical_vars_tensor.dtype != torch.long: + categorical_vars_tensor = categorical_vars_tensor.to(torch.long) cat_embeds = [] for i, embed_layer in self.categorical_embedding_layers.items(): @@ -95,12 +101,12 @@ def _validate_categorical_inputs(self): # "Transform" embedding dims into a suitable list, or stay None if categorical_embedding_dims is not None: if isinstance(categorical_embedding_dims, int): - self.forward_type = ForwardType.AVERAGE_AND_CONCAT + self.forward_type = CategoricalForwardType.AVERAGE_AND_CONCAT self.output_dim = categorical_embedding_dims categorical_embedding_dims = [categorical_embedding_dims] * num_categorical_features elif isinstance(categorical_embedding_dims, list): - self.forward_type = ForwardType.CONCATENATE_ALL + self.forward_type = CategoricalForwardType.CONCATENATE_ALL self.output_dim = sum(categorical_embedding_dims) else: raise TypeError("categorical_embedding_dims must be an int, a list of int or None") @@ -109,7 +115,7 @@ def _validate_categorical_inputs(self): raise ValueError( "If categorical_embedding_dims is None, text_embedding_dim must be provided" ) - self.forward_type = ForwardType.SUM_TO_TEXT + self.forward_type = CategoricalForwardType.SUM_TO_TEXT self.output_dim = self.text_embedding_dim categorical_embedding_dims = [self.text_embedding_dim] * num_categorical_features diff --git a/torchTextClassifiers/model/classification_heads.py b/torchTextClassifiers/model/components/classification_head.py similarity index 86% rename from torchTextClassifiers/model/classification_heads.py rename to torchTextClassifiers/model/components/classification_head.py index a5b29c3..4297d27 100644 --- a/torchTextClassifiers/model/classification_heads.py +++ b/torchTextClassifiers/model/components/classification_head.py @@ -1,9 +1,16 @@ +from typing import Optional + import torch from torch import nn class ClassificationHead(nn.Module): - def __init__(self, input_dim=None, num_classes=None, net=None): + def __init__( + self, + input_dim: Optional[int] = None, + num_classes: Optional[int] = None, + net: Optional[nn.Module] = None, + ): super().__init__() if net is not None: self.net = net diff --git a/torchTextClassifiers/model/components/text_embedder.py b/torchTextClassifiers/model/components/text_embedder.py new file mode 100644 index 0000000..17df9f7 --- /dev/null +++ b/torchTextClassifiers/model/components/text_embedder.py @@ -0,0 +1,81 @@ +import torch +from torch import nn + + +class TextEmbedder(nn.Module): + def __init__(self, vocab_size: int, embedding_dim: int, padding_idx: int): + super().__init__() + + self.vocab_size = vocab_size + self.embedding_dim = embedding_dim + self.padding_idx = padding_idx + + self.embedding_layer = nn.Embedding( + embedding_dim=embedding_dim, + num_embeddings=vocab_size, + padding_idx=self.padding_idx, + ) + + def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: + """Converts input token IDs to their corresponding embeddings.""" + + encoded_text = input_ids # clearer name + if encoded_text.dtype != torch.long: + encoded_text = encoded_text.to(torch.long) + + batch_size, seq_len = encoded_text.shape + batch_size_check, seq_len_check = attention_mask.shape + + if batch_size != batch_size_check or seq_len != seq_len_check: + raise 
ValueError( + f"Input IDs and attention mask must have the same batch size and sequence length. " + f"Got input_ids shape {encoded_text.shape} and attention_mask shape {attention_mask.shape}." + ) + + token_embeddings = self.embedding_layer( + encoded_text + ) # (batch_size, seq_len, embedding_dim) + + text_embedding = self._get_sentence_embedding( + token_embeddings=token_embeddings, attention_mask=attention_mask + ) + + return text_embedding + + def _get_sentence_embedding( + self, token_embeddings: torch.Tensor, attention_mask: torch.Tensor + ) -> torch.Tensor: + """ + Compute sentence embedding from embedded tokens - "remove" second dimension. + + Args (output from dataset collate_fn): + token_embeddings (torch.Tensor[Long]), shape (batch_size, seq_len, embedding_dim): Tokenized + padded text + attention_mask (torch.Tensor[Long]), shape (batch_size, seq_len): Attention mask indicating non-pad tokens + Returns: + torch.Tensor: Sentence embeddings, shape (batch_size, embedding_dim) + """ + + # average over non-pad token embeddings + # attention mask has 1 for non-pad tokens and 0 for pad token positions + # TODO: add attention logic at some point + + # mask pad-tokens + mask = attention_mask.unsqueeze(-1).float() # (batch_size, seq_len, 1) + masked_embeddings = token_embeddings * mask # (batch_size, seq_len, embedding_dim) + + sentence_embedding = masked_embeddings.sum(dim=1) / mask.sum(dim=1).clamp( + min=1.0 + ) # avoid division by zero + + sentence_embedding = torch.nan_to_num(sentence_embedding, 0.0) + + return sentence_embedding + + def __call__(self, *args, **kwargs): + out = super().__call__(*args, **kwargs) + if out.dim() != 2: + raise ValueError( + f"Output of {self.__class__.__name__}.forward must be 2D " + f"(got shape {tuple(out.shape)})" + ) + return out diff --git a/torchTextClassifiers/model/lightning.py b/torchTextClassifiers/model/lightning.py index b6f6e01..c1e8839 100644 --- a/torchTextClassifiers/model/lightning.py +++ b/torchTextClassifiers/model/lightning.py @@ -57,7 +57,7 @@ def forward(self, batch) -> torch.Tensor: Returns (torch.Tensor): Prediction. 
""" return self.model( - encoded_text=batch["input_ids"], + input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], categorical_vars=batch.get("categorical_vars", None), ) diff --git a/torchTextClassifiers/model/model.py b/torchTextClassifiers/model/model.py index a7633e6..adffe1d 100644 --- a/torchTextClassifiers/model/model.py +++ b/torchTextClassifiers/model/model.py @@ -6,7 +6,7 @@ """ import logging -from typing import Annotated, List, Union +from typing import Annotated, List, Optional import torch from torch import nn @@ -18,8 +18,12 @@ except ImportError: HAS_CAPTUM = False -from torchTextClassifiers.model.categorical_var_net import CategoricalVariableNet, ForwardType -from torchTextClassifiers.model.classification_heads import ClassificationHead +from torchTextClassifiers.model.components import ( + CategoricalForwardType, + CategoricalVariableNet, + ClassificationHead, + TextEmbedder, +) logger = logging.getLogger(__name__) @@ -41,131 +45,115 @@ class TextClassificationModel(nn.Module): def __init__( self, - embedding_dim: int, classification_head: ClassificationHead, - categorical_variable_net: CategoricalVariableNet = None, - tokenizer=None, - num_rows: int = None, - categorical_vocabulary_sizes: List[int] = None, - categorical_embedding_dims: Union[List[int], int] = None, - sparse: bool = False, + text_embedder: Optional[TextEmbedder] = None, + categorical_variable_net: Optional[CategoricalVariableNet] = None, ): """ Constructor for the FastTextModel class. Args: - embedding_dim (int): Dimension of the text embedding space. - buckets (int): Number of rows in the embedding matrix. - num_classes (int): Number of classes. - categorical_vocabulary_sizes (List[int]): List of the number of - modalities for additional categorical features. - padding_idx (int, optional): Padding index for the text - descriptions. Defaults to 0. - sparse (bool): Indicates if Embedding layer is sparse. + classification_head (ClassificationHead): The classification head module. + text_embedder (Optional[TextEmbedder]): The text embedding module. + If not provided, assumes that input text is already embedded (as tensors) and directly passed to the classification head. + categorical_variable_net (Optional[CategoricalVariableNet]): The categorical variable network module. + If not provided, assumes no categorical variables are used. """ super().__init__() - if tokenizer is None: - if num_rows is None: - raise ValueError( - "Either tokenizer or num_rows must be provided (number of rows in the embedding matrix)." - ) - else: - if num_rows is not None: - if num_rows != tokenizer.vocab_size: - logger.warning( - "num_rows is different from the number of tokens in the tokenizer. Using provided num_rows." 
- ) - else: - num_rows = tokenizer.vocab_size - - self.num_rows = num_rows - self.tokenizer = tokenizer - self.padding_idx = self.tokenizer.padding_idx - self.embedding_dim = embedding_dim - self.sparse = sparse - - self.categorical_embedding_dims = categorical_embedding_dims - - self.embeddings = nn.Embedding( - embedding_dim=embedding_dim, - num_embeddings=num_rows, - padding_idx=self.padding_idx, - sparse=sparse, - ) + self.text_embedder = text_embedder - self.classification_head = classification_head self.categorical_variable_net = categorical_variable_net - self.num_classes = self.classification_head.num_classes + if not self.categorical_variable_net: + logger.info("🔹 No categorical variable network provided; using only text embeddings.") - def _get_sentence_embedding( - self, token_embeddings: torch.Tensor, attention_mask: torch.Tensor - ) -> torch.Tensor: - """ - Compute sentence embedding from encoded text. + self.classification_head = classification_head - Args: - encoded_text (torch.Tensor[Long]), shape (batch_size, seq_len): Tokenized + padded text + self._validate_component_connections() - Returns: - torch.Tensor: Sentence embeddings, shape (batch_size, embedding_dim) - """ + self.num_classes = self.classification_head.num_classes - # average over non-pad token embeddings. PAD always has 0 vector so no influence in the sum - # attention mask has 1 for non-pad tokens and 0 for pad token positions - # TODO: add attention logic at some point - sentence_embedding = token_embeddings.sum(dim=1) / attention_mask.sum( - dim=1, keepdim=True - ).clamp(min=1.0) # sum is over seq_len dim + def _validate_component_connections(self): + def _check_text_categorical_connection(self, text_embedder, cat_var_net): + if cat_var_net.forward_type == CategoricalForwardType.SUM_TO_TEXT: + if text_embedder.embedding_dim != cat_var_net.output_dim: + raise ValueError( + "Text embedding dimension must match categorical variable embedding dimension." + ) + self.expected_classification_head_input_dim = text_embedder.embedding_dim + else: + self.expected_classification_head_input_dim = ( + text_embedder.embedding_dim + cat_var_net.output_dim + ) - sentence_embedding = torch.nan_to_num(sentence_embedding, 0.0) + if self.text_embedder: + if self.categorical_variable_net: + _check_text_categorical_connection( + self, self.text_embedder, self.categorical_variable_net + ) + else: + self.expected_classification_head_input_dim = self.text_embedder.embedding_dim - return sentence_embedding + if self.expected_classification_head_input_dim != self.classification_head.input_dim: + raise ValueError( + "Classification head input dimension does not match expected dimension from text embedder and categorical variable net." + ) + else: + logger.warning( + "⚠️ No text embedder provided; assuming input text is already embedded or vectorized. Take care that the classification head input dimension matches the input text dimension." + ) def forward( self, - encoded_text: Annotated[torch.Tensor, "batch seq_len"], + input_ids: Annotated[torch.Tensor, "batch seq_len"], attention_mask: Annotated[torch.Tensor, "batch seq_len"], categorical_vars: Annotated[torch.Tensor, "batch num_cats"], + **kwargs, ) -> torch.Tensor: """ Memory-efficient forward pass implementation. 
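+        If no text_embedder is configured, input_ids are assumed to already hold
+        dense float features and are passed to the classification head unchanged
+        (after a cast to float).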
-        Args:
-            encoded_text (torch.Tensor[Long]), shape (batch_size, seq_len): Tokenized + padded text
-            additional_inputs (torch.Tensor[Long]): Additional categorical features, (batch_size, num_categorical_features)
+        Args: output from dataset collate_fn
+            input_ids (torch.Tensor[Long]), shape (batch_size, seq_len): Tokenized + padded text
+            attention_mask (torch.Tensor[int]), shape (batch_size, seq_len): Attention mask indicating non-pad tokens
+            categorical_vars (torch.Tensor[Long]): Additional categorical features, (batch_size, num_categorical_features)

         Returns:
-            torch.Tensor: Model output scores for each class
+            torch.Tensor: Model output scores for each class - shape (batch_size, num_classes)
+                Raw, not softmaxed.
         """
-
-        # Ensure correct dtype and device once
-        if encoded_text.dtype != torch.long:
-            encoded_text = encoded_text.to(torch.long)
-
-        # Compute embeddings and averaging in a memory-efficient way
-        token_embeddings = self.embeddings(encoded_text)  # (batch_size, seq_len, embedding_dim)
-
-        x_text = self._get_sentence_embedding(
-            token_embeddings=token_embeddings, attention_mask=attention_mask
-        )
+        encoded_text = input_ids  # clearer name
+        if self.text_embedder is None:
+            x_text = encoded_text.float()
+        else:
+            x_text = self.text_embedder(input_ids=encoded_text, attention_mask=attention_mask)

         if self.categorical_variable_net:
             x_cat = self.categorical_variable_net(categorical_vars)

             if (
-                self.categorical_variable_net.forward_type == ForwardType.AVERAGE_AND_CONCAT
-                or self.categorical_variable_net.forward_type == ForwardType.CONCATENATE_ALL
+                self.categorical_variable_net.forward_type
+                == CategoricalForwardType.AVERAGE_AND_CONCAT
+                or self.categorical_variable_net.forward_type
+                == CategoricalForwardType.CONCATENATE_ALL
             ):
                 x_combined = torch.cat((x_text, x_cat), dim=1)
             else:
-                assert self.categorical_variable_net.forward_type == ForwardType.SUM_TO_TEXT
+                assert (
+                    self.categorical_variable_net.forward_type == CategoricalForwardType.SUM_TO_TEXT
+                )
                 x_combined = x_text + x_cat
+        else:
+            x_combined = x_text

         logits = self.classification_head(x_combined)

         return logits

+    # TODO: move to the wrapper class
+    # We should not have anything relating to tokenization in the model class
+    # A PyTorch model takes preprocessed tensors as input, not raw text,
+    # and it outputs raw logits, not predictions
     @torch.no_grad()
     def predict(
         self,

From 164cccff6d50066cf68dae2e53c45d51b449a7b4 Mon Sep 17 00:00:00 2001
From: meilame-tayebjee
Date: Wed, 5 Nov 2025 17:54:32 +0000
Subject: [PATCH 16/66] fix: avoid bugs with numpy arrays in boolean contexts

---
 torchTextClassifiers/dataset/dataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torchTextClassifiers/dataset/dataset.py b/torchTextClassifiers/dataset/dataset.py
index c185a8f..44dbaf2 100644
--- a/torchTextClassifiers/dataset/dataset.py
+++ b/torchTextClassifiers/dataset/dataset.py
@@ -37,7 +37,7 @@ def __len__(self):
         return len(self.texts)

     def __getitem__(self, idx):
-        if self.labels:
+        if self.labels is not None:
             return (
                 self.texts[idx],
                 (
@@ -61,7 +61,7 @@ def __getitem__(self, idx):
     def collate_fn(self, batch):
         text, *categorical_vars, y = zip(*batch)

-        if self.labels:
+        if self.labels is not None:
             labels_tensor = torch.tensor(y, dtype=torch.long)
         else:
             labels_tensor = None

From c5b9673f8fe4a46c881911669e2e183ccfb97445 Mon Sep 17 00:00:00 2001
From: meilame-tayebjee
Date: Wed, 5 Nov 2025 18:02:13 +0000
Subject: [PATCH 17/66] feat: add smooth imports for HF and output_dim field
 in anticipation of adding TF-IDF

---
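The output_vectorized/output_dim fields added in this patch prepare the tokenizer
interface for tokenizers that return dense vectors instead of token ids (the
anticipated TF-IDF case): such a tokenizer bypasses the TextEmbedder, and the wrapper
reads output_dim to size the classification head input. A minimal sketch of what a
subclass could look like, assuming scikit-learn is available; TfidfTokenizer itself is
hypothetical and not part of this series:

    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer

    from torchTextClassifiers.tokenizers import BaseTokenizer


    class TfidfTokenizer(BaseTokenizer):
        """Hypothetical vectorizing tokenizer: emits dense vectors, not token ids."""

        def __init__(self, max_features: int = 5000):
            # output_vectorized=True requires output_dim up front (see base.py below)
            super().__init__(
                vocab_size=max_features, output_vectorized=True, output_dim=max_features
            )
            self.vectorizer = TfidfVectorizer(max_features=max_features)
            self.trained = False  # the wrapper refuses untrained tokenizers

        def train(self, texts):
            self.vectorizer.fit(texts)
            # The fitted vocabulary can be smaller than max_features, so refresh
            # output_dim and vocab_size from the vectorizer.
            self.output_dim = len(self.vectorizer.vocabulary_)
            self.vocab_size = self.output_dim
            self.trained = True

        def tokenize(self, text):
            texts = [text] if isinstance(text, str) else list(text)
            # Dense float features of shape (batch, output_dim); how these are routed
            # through the dataset/collate_fn is settled later in the series.
            return self.vectorizer.transform(texts).toarray().astype(np.float32)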
 torchTextClassifiers/tokenizers/WordPiece.py | 54 ++++++-------------
 torchTextClassifiers/tokenizers/__init__.py  | 11 +++-
 torchTextClassifiers/tokenizers/base.py      | 57 ++++++++++++++++++--
 3 files changed, 80 insertions(+), 42 deletions(-)

diff --git a/torchTextClassifiers/tokenizers/WordPiece.py b/torchTextClassifiers/tokenizers/WordPiece.py
index 1efb075..6b33328 100644
--- a/torchTextClassifiers/tokenizers/WordPiece.py
+++ b/torchTextClassifiers/tokenizers/WordPiece.py
@@ -2,18 +2,23 @@
 import os
 from typing import List

-from tokenizers import (
-    Tokenizer,
-    decoders,
-    models,
-    normalizers,
-    pre_tokenizers,
-    processors,
-    trainers,
-)
-from transformers import PreTrainedTokenizerFast
-
-from torchTextClassifiers.tokenizers import HuggingFaceTokenizer
+from torchTextClassifiers.tokenizers import HAS_HF, HuggingFaceTokenizer
+
+if not HAS_HF:
+    raise ImportError(
+        "The HuggingFace dependencies are needed to use this tokenizer. Please run 'uv add torchTextClassifiers --group hf-dep'."
+    )
+else:
+    from tokenizers import (
+        Tokenizer,
+        decoders,
+        models,
+        normalizers,
+        pre_tokenizers,
+        processors,
+        trainers,
+    )
+    from transformers import PreTrainedTokenizerFast

 logger = logging.getLogger(__name__)

@@ -84,28 +89,3 @@ def train(
             filesystem.mkdirs(parent_dir)
             filesystem.put(save_path, s3_save_path)
             logger.info(f"💾 Tokenizer uploaded to S3 at {s3_save_path}")
-
-    @classmethod
-    def load(cls, load_path: str):
-        loaded_tokenizer = PreTrainedTokenizerFast(tokenizer_file=load_path)
-        instance = cls(vocab_size=len(loaded_tokenizer), trained=True)
-        instance.tokenizer = loaded_tokenizer
-        instance._post_training()
-        return instance
-
-    @classmethod
-    def load_from_s3(cls, s3_path: str, filesystem):
-        if filesystem.exists(s3_path) is False:
-            raise FileNotFoundError(
-                f"Tokenizer not found at {s3_path}. Please train it first (see src/train_tokenizers)."
-            )
-
-        with filesystem.open(s3_path, "rb") as f:
-            json_str = f.read().decode("utf-8")
-
-        tokenizer_obj = Tokenizer.from_str(json_str)
-        tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer_obj)
-        instance = cls(vocab_size=len(tokenizer), trained=True)
-        instance.tokenizer = tokenizer
-        instance._post_training()
-        return instance

diff --git a/torchTextClassifiers/tokenizers/__init__.py b/torchTextClassifiers/tokenizers/__init__.py
index 8284ad7..5fd489f 100644
--- a/torchTextClassifiers/tokenizers/__init__.py
+++ b/torchTextClassifiers/tokenizers/__init__.py
@@ -1,3 +1,10 @@
-from .base import BaseTokenizer as BaseTokenizer
-from .base import HuggingFaceTokenizer as HuggingFaceTokenizer
+from .base import (
+    HAS_HF as HAS_HF,
+)
+from .base import (
+    BaseTokenizer as BaseTokenizer,
+)
+from .base import (
+    HuggingFaceTokenizer as HuggingFaceTokenizer,
+)
 from .WordPiece import WordPieceTokenizer as WordPieceTokenizer

diff --git a/torchTextClassifiers/tokenizers/base.py b/torchTextClassifiers/tokenizers/base.py
index d672f77..f5c7c6e 100644
--- a/torchTextClassifiers/tokenizers/base.py
+++ b/torchTextClassifiers/tokenizers/base.py
@@ -1,10 +1,34 @@
 from abc import ABC, abstractmethod
-from typing import List, Union
+from typing import List, Optional, Union
+
+try:
+    from tokenizers import Tokenizer
+    from transformers import PreTrainedTokenizerFast
+
+    HAS_HF = True
+except ImportError:
+    HAS_HF = False
+
+
 class BaseTokenizer(ABC):
-    def __init__(self, vocab_size: int):
+    def __init__(
+        self, vocab_size: int, output_vectorized: bool = False, output_dim: Optional[int] = None
+    ):
+        """
+        Base class for tokenizers.
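+        A tokenizer either emits integer token ids to be embedded downstream
+        (output_vectorized=False) or dense vectors consumed directly by the model
+        (output_vectorized=True, in which case output_dim must be set).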
+        Args:
+            vocab_size (int): Size of the vocabulary.
+            output_vectorized (bool): Whether the tokenizer outputs vectorized tokens.
+                True for instance for a TF-IDF tokenizer.
+            output_dim (Optional[int]): Dimension of the vectorized output. Required
+                when output_vectorized is True.
+        """
+
+        self.vocab_size = vocab_size
+        self.output_vectorized = output_vectorized
+        self.output_dim = output_dim
+        if self.output_vectorized:
+            if output_dim is None:
+                raise ValueError(
+                    "Tokenizer's output_dim must be provided if output_vectorized is True."
+                )

     @abstractmethod
     def tokenize(self, text: Union[str, List[str]]) -> list:
@@ -17,7 +41,9 @@ def __len__(self):

 class HuggingFaceTokenizer(BaseTokenizer, ABC):
     def __init__(self, vocab_size: int):
-        super().__init__(vocab_size)
+        super().__init__(
+            vocab_size, output_vectorized=False
+        )  # it outputs token ids and not vectors
         self.trained = False
         self.tokenizer = None

@@ -30,6 +56,31 @@ def tokenize(self, text: Union[str, List[str]]) -> list:
             text, padding=True, return_tensors="pt"
         )  # method from PreTrainedTokenizerFast

+    @classmethod
+    def load(cls, load_path: str):
+        loaded_tokenizer = PreTrainedTokenizerFast(tokenizer_file=load_path)
+        instance = cls(vocab_size=len(loaded_tokenizer), trained=True)
+        instance.tokenizer = loaded_tokenizer
+        instance._post_training()
+        return instance
+
+    @classmethod
+    def load_from_s3(cls, s3_path: str, filesystem):
+        if filesystem.exists(s3_path) is False:
+            raise FileNotFoundError(
+                f"Tokenizer not found at {s3_path}. Please train it first (see src/train_tokenizers)."
+            )
+
+        with filesystem.open(s3_path, "rb") as f:
+            json_str = f.read().decode("utf-8")
+
+        tokenizer_obj = Tokenizer.from_str(json_str)
+        tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer_obj)
+        instance = cls(vocab_size=len(tokenizer), trained=True)
+        instance.tokenizer = tokenizer
+        instance._post_training()
+        return instance
+
     @abstractmethod
     def train(
         self,

From a0fe18cc7f80ffece86f68cb5937b81519975a0a Mon Sep 17 00:00:00 2001
From: meilame-tayebjee
Date: Wed, 5 Nov 2025 18:10:05 +0000
Subject: [PATCH 18/66] feat!(wrapper class): finalize orchestration

tokenizer, dataset, model, training, explainability
moved prediction logic here
---
 torchTextClassifiers/__init__.py             |  60 +-
 torchTextClassifiers/model/model.py          |  99 +--
 torchTextClassifiers/torchTextClassifiers.py | 811 ++++++++++---------
 torchTextClassifiers/utilities/checkers.py   |  45 -
 4 files changed, 424 insertions(+), 591 deletions(-)

diff --git a/torchTextClassifiers/__init__.py b/torchTextClassifiers/__init__.py
index 8e61d86..b8ce72b 100644
--- a/torchTextClassifiers/__init__.py
+++ b/torchTextClassifiers/__init__.py
@@ -11,58 +11,22 @@
 - Extensible architecture for adding new classifier types
 - Support for both text-only and mixed text/categorical features

-Quick Start:
-    >>> from torchTextClassifiers import create_fasttext
-    >>> import numpy as np
-    >>>
-    >>> # Create classifier
-    >>> classifier = create_fasttext(
-    ...     embedding_dim=100,
-    ...     sparse=False,
-    ...     num_tokens=10000,
-    ...     min_count=2,
-    ...     min_n=3,
-    ...     max_n=6,
-    ...     len_word_ngrams=2,
-    ...     num_classes=2
-    ...
) - >>> - >>> # Prepare data - >>> X_train = np.array(["positive text", "negative text"]) - >>> y_train = np.array([1, 0]) - >>> X_val = np.array(["validation text"]) - >>> y_val = np.array([1]) - >>> - >>> # Build and train - >>> classifier.build(X_train, y_train) - >>> classifier.train(X_train, y_train, X_val, y_val, num_epochs=10, batch_size=32) - >>> - >>> # Predict - >>> predictions = classifier.predict(np.array(["new text sample"])) """ -from .torchTextClassifiers import torchTextClassifiers - -# Convenience imports for FastText -try: - from .classifiers.fasttext.core import FastTextFactory - - # Expose FastText convenience methods at package level for easy access - create_fasttext = FastTextFactory.create_fasttext - build_fasttext_from_tokenizer = FastTextFactory.build_from_tokenizer - -except ImportError: - # FastText module not available - define placeholder functions - def create_fasttext(*args, **kwargs): - raise ImportError("FastText module not available") - - def build_fasttext_from_tokenizer(*args, **kwargs): - raise ImportError("FastText module not available") +from .torchTextClassifiers import ( + ModelConfig as ModelConfig, +) +from .torchTextClassifiers import ( + TrainingConfig as TrainingConfig, +) +from .torchTextClassifiers import ( + torchTextClassifiers as torchTextClassifiers, +) __all__ = [ "torchTextClassifiers", - "create_fasttext", - "build_fasttext_from_tokenizer", + "ModelConfig", + "TrainingConfig", ] -__version__ = "1.0.0" \ No newline at end of file +__version__ = "1.0.0" diff --git a/torchTextClassifiers/model/model.py b/torchTextClassifiers/model/model.py index adffe1d..e1f0715 100644 --- a/torchTextClassifiers/model/model.py +++ b/torchTextClassifiers/model/model.py @@ -6,18 +6,11 @@ """ import logging -from typing import Annotated, List, Optional +from typing import Annotated, Optional import torch from torch import nn -try: - from captum.attr import LayerIntegratedGradients - - HAS_CAPTUM = True -except ImportError: - HAS_CAPTUM = False - from torchTextClassifiers.model.components import ( CategoricalForwardType, CategoricalVariableNet, @@ -37,6 +30,9 @@ # ============================================================================ # PyTorch Model + +# It takes PyTorch tensors as input (not raw text!), +# and it outputs raw not-softmaxed logits, not predictions # ============================================================================ @@ -149,90 +145,3 @@ def forward( logits = self.classification_head(x_combined) return logits - - # TODO: move to the wrapper class - # We should not have anything relating to tokenization in the model class - # A PyTorch model takes preocessed tensors as input not raw text, - # and it outputs raw logits, not predictions - @torch.no_grad() - def predict( - self, - text: List[str], - categorical_variables: List[List[int]] = None, - top_k=1, - explain=False, - ): - """ - Args: - text (List[str]): A list of text observations. - params (Optional[Dict[str, Any]]): Additional parameters to - pass to the model for inference. - top_k (int): for each sentence, return the top_k most likely predictions (default: 1) - explain (bool): launch gradient integration to have an explanation of the prediction (default: False) - preprocess (bool): If True, preprocess text. Needs unidecode library. - - Returns: - if explain is False: - predictions (torch.Tensor, shape (len(text), top_k)): A tensor containing the top_k most likely codes to the query. 
- confidence (torch.Tensor, shape (len(text), top_k)): A tensor array containing the corresponding confidence scores. - if explain is True: - predictions (torch.Tensor, shape (len(text), top_k)): Containing the top_k most likely codes to the query. - confidence (torch.Tensor, shape (len(text), top_k)): Corresponding confidence scores. - all_attributions (torch.Tensor, shape (len(text), top_k, seq_len)): A tensor containing the attributions for each token in the text. - x (torch.Tensor): A tensor containing the token indices of the text. - id_to_token_dicts (List[Dict[int, str]]): A list of dictionaries mapping token indices to tokens (one for each sentence). - token_to_id_dicts (List[Dict[str, int]]): A list of dictionaries mapping tokens to token indices: the reverse of those in id_to_token_dicts. - text (list[str]): A plist containing the preprocessed text (one line for each sentence). - """ - - if explain: - if not HAS_CAPTUM: - raise ImportError( - "Captum is not installed and is required for explainability. Run 'pip install/uv add torchFastText[explainability]'." - ) - lig = LayerIntegratedGradients( - self, self.embeddings - ) # initialize a Captum layer gradient integrator - - self.eval() - - tokenize_output = self.tokenizer.tokenize(text) - - encoded_text = tokenize_output["input_ids"] # (batch_size, seq_len) - attention_mask = tokenize_output["attention_mask"] # (batch_size, seq_len) - - if categorical_variables is not None: - categorical_vars = torch.tensor( - categorical_variables, dtype=torch.float32 - ) # (batch_size, num_categorical_features) - else: - categorical_vars = torch.empty((encoded_text.shape[0], 0), dtype=torch.float32) - - pred = self( - encoded_text, attention_mask, categorical_vars - ) # forward pass, contains the prediction scores (len(text), num_classes) - label_scores = pred.detach().cpu() - label_scores_topk = torch.topk(label_scores, k=top_k, dim=1) - - predictions = label_scores_topk.indices # get the top_k most likely predictions - confidence = torch.round(label_scores_topk.values, decimals=2) # and their scores - - if explain: - all_attributions = [] - for k in range(top_k): - attributions = lig.attribute( - (encoded_text, attention_mask, categorical_vars), - target=torch.Tensor(predictions[:, k]).long(), - ) # (batch_size, seq_len) - attributions = attributions.sum(dim=-1) - all_attributions.append(attributions.detach().cpu()) - - all_attributions = torch.stack(all_attributions, dim=1) # (batch_size, top_k, seq_len) - - return { - "prediction": predictions, - "confidence": confidence, - "attributions": all_attributions, - } - else: - return predictions, confidence diff --git a/torchTextClassifiers/torchTextClassifiers.py b/torchTextClassifiers/torchTextClassifiers.py index 1629f4a..00c5284 100644 --- a/torchTextClassifiers/torchTextClassifiers.py +++ b/torchTextClassifiers/torchTextClassifiers.py @@ -1,7 +1,15 @@ import logging import time -import json -from typing import Optional, Union, Type, List, Dict, Any +from dataclasses import asdict, dataclass, field +from typing import Any, Dict, List, Optional, Tuple, Type, Union + +try: + from captum.attr import LayerIntegratedGradients + + HAS_CAPTUM = True +except ImportError: + HAS_CAPTUM = False + import numpy as np import pytorch_lightning as pl @@ -12,9 +20,15 @@ ModelCheckpoint, ) -from .utilities.checkers import check_X, check_Y, NumpyJSONEncoder -from .classifiers.base import BaseClassifierConfig, BaseClassifierWrapper - +from torchTextClassifiers.dataset import TextClassificationDataset +from 
torchTextClassifiers.model import TextClassificationModel, TextClassificationModule +from torchTextClassifiers.model.components import ( + CategoricalForwardType, + CategoricalVariableNet, + ClassificationHead, + TextEmbedder, +) +from torchTextClassifiers.tokenizers import BaseTokenizer logger = logging.getLogger(__name__) @@ -26,296 +40,248 @@ ) +@dataclass +class ModelConfig: + """Base configuration class for text classifiers.""" + + embedding_dim: int + categorical_vocabulary_sizes: Optional[List[int]] = None + categorical_embedding_dims: Optional[Union[List[int], int]] = None + num_classes: Optional[int] = None + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "ModelConfig": + return cls(**data) + + +@dataclass +class TrainingConfig: + num_epochs: int + batch_size: int + lr: float + loss: torch.nn.Module = field(default_factory=lambda: torch.nn.CrossEntropyLoss()) + optimizer: Type[torch.optim.Optimizer] = torch.optim.Adam + scheduler: Optional[Type[torch.optim.lr_scheduler._LRScheduler]] = None + cpu_run: bool = False + num_workers: int = 12 + patience_early_stopping: int = 3 + dataloader_params: Optional[dict] = None + trainer_params: Optional[dict] = None + optimizer_params: Optional[dict] = None + scheduler_params: Optional[dict] = None + + def to_dict(self) -> Dict[str, Any]: + data = asdict(self) + # Serialize loss and scheduler as their class names + data["loss"] = self.loss.__class__.__name__ + if self.scheduler is not None: + data["scheduler"] = self.scheduler.__name__ + return data class torchTextClassifiers: """Generic text classifier framework supporting multiple architectures. - - This is the main class that provides a unified interface for different types - of text classifiers. It acts as a high-level wrapper that delegates operations - to specific classifier implementations while providing a consistent API. - - The class supports the full machine learning workflow including: - - Building tokenizers from training data - - Model training with validation - - Prediction and evaluation - - Model serialization and loading - - Attributes: - config: Configuration object specific to the classifier type - classifier: The underlying classifier implementation - - Example: - >>> from torchTextClassifiers import torchTextClassifiers - >>> from torchTextClassifiers.classifiers.fasttext.config import FastTextConfig - >>> from torchTextClassifiers.classifiers.fasttext.wrapper import FastTextWrapper - >>> - >>> # Create configuration - >>> config = FastTextConfig( - ... embedding_dim=100, - ... num_tokens=10000, - ... min_count=1, - ... min_n=3, - ... max_n=6, - ... len_word_ngrams=2, - ... num_classes=2 - ... ) - >>> - >>> # Initialize classifier with wrapper - >>> wrapper = FastTextWrapper(config) - >>> classifier = torchTextClassifiers(wrapper) - >>> - >>> # Build and train - >>> classifier.build(X_train, y_train) - >>> classifier.train(X_train, y_train, X_val, y_val, num_epochs=10, batch_size=32) - >>> - >>> # Predict - >>> predictions = classifier.predict(X_test) + + Given a tokenizer and model configuration, this class initializes: + - Text embedding layer (if needed) + - Categorical variable embedding network (if categorical variables are provided) + - Classification head + The resulting model can be trained using PyTorch Lightning and used for predictions. + """ - - def __init__(self, classifier: BaseClassifierWrapper): - """Initialize the torchTextClassifiers instance. 
- - Args: - classifier: An instance of a classifier wrapper that implements BaseClassifierWrapper - - Example: - >>> from torchTextClassifiers.classifiers.fasttext.wrapper import FastTextWrapper - >>> from torchTextClassifiers.classifiers.fasttext.config import FastTextConfig - >>> config = FastTextConfig(embedding_dim=50, num_tokens=5000) - >>> wrapper = FastTextWrapper(config) - >>> classifier = torchTextClassifiers(wrapper) - """ - self.classifier = classifier - self.config = classifier.config - - - def build_tokenizer(self, training_text: np.ndarray) -> None: - """Build tokenizer from training text data. - - This method is kept for backward compatibility. It delegates to - prepare_text_features which handles the actual text preprocessing. - - Args: - training_text: Array of text strings to build the tokenizer from - - Example: - >>> import numpy as np - >>> texts = np.array(["Hello world", "This is a test", "Another example"]) - >>> classifier.build_tokenizer(texts) - """ - self.classifier.prepare_text_features(training_text) - - def prepare_text_features(self, training_text: np.ndarray) -> None: - """Prepare text features for the classifier. - - This method handles text preprocessing which could involve tokenization, - vectorization, or other approaches depending on the classifier type. - - Args: - training_text: Array of text strings to prepare features from - - Example: - >>> import numpy as np - >>> texts = np.array(["Hello world", "This is a test", "Another example"]) - >>> classifier.prepare_text_features(texts) - """ - self.classifier.prepare_text_features(training_text) - - def build( + + def __init__( self, - X_train: np.ndarray, - y_train: np.ndarray = None, - lightning=True, - **kwargs - ) -> None: - """Build the complete classifier from training data. - - This method handles the full model building process including: - - Input validation and preprocessing - - Tokenizer creation from training text - - Model architecture initialization - - Lightning module setup (if enabled) - + tokenizer: BaseTokenizer, + model_config: ModelConfig, + ): + """Initialize the torchTextClassifiers instance. + Args: - X_train: Training input data (text and optional categorical features) - y_train: Training labels (optional, can be inferred if num_classes is set) - lightning: Whether to initialize PyTorch Lightning components - **kwargs: Additional arguments passed to Lightning initialization - - Raises: - ValueError: If y_train is None and num_classes is not set in config - ValueError: If label values are outside expected range - + tokenizer: A tokenizer instance for text preprocessing + model_config: Configuration parameters for the text classification model + Example: - >>> X_train = np.array(["text sample 1", "text sample 2"]) - >>> y_train = np.array([0, 1]) - >>> classifier.build(X_train, y_train) + >>> from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers + >>> # Assume tokenizer is a trained BaseTokenizer instance + >>> model_config = ModelConfig( + ... embedding_dim=10, + ... categorical_vocabulary_sizes=[30, 25], + ... categorical_embedding_dims=[10, 5], + ... num_classes=10, + ... ) + >>> ttc = torchTextClassifiers( + ... tokenizer=tokenizer, + ... model_config=model_config, + ... 
) """ - training_text, categorical_variables, no_cat_var = check_X(X_train) - - if y_train is not None: - if self.config.num_classes is not None: - if self.config.num_classes != len(np.unique(y_train)): - logger.warning( - f"Updating num_classes from {self.config.num_classes} to {len(np.unique(y_train))}" - ) - - y_train = check_Y(y_train) - self.config.num_classes = len(np.unique(y_train)) - - if np.max(y_train) >= self.config.num_classes: - raise ValueError( - "y_train must contain values between 0 and num_classes-1" - ) + + self.model_config = model_config + self.tokenizer = tokenizer + + if not self.tokenizer.trained: + raise RuntimeError( + f"Tokenizer {type(self.tokenizer)} must be trained before initializing the classifier." + ) + + self.vocab_size = tokenizer.vocab_size + self.embedding_dim = model_config.embedding_dim + self.categorical_vocabulary_sizes = model_config.categorical_vocabulary_sizes + self.num_classes = model_config.num_classes + + if self.tokenizer.output_vectorized: + self.text_embedder = None + logger.info( + "Tokenizer outputs vectorized tokens; skipping TextEmbedder initialization." + ) + self.embedding_dim = self.tokenizer.output_dim else: - if self.config.num_classes is None: - raise ValueError( - "Either num_classes must be provided at init or y_train must be provided here." - ) - - # Handle categorical variables - if not no_cat_var: - if hasattr(self.config, 'num_categorical_features') and self.config.num_categorical_features is not None: - if self.config.num_categorical_features != categorical_variables.shape[1]: - logger.warning( - f"Updating num_categorical_features from {self.config.num_categorical_features} to {categorical_variables.shape[1]}" - ) - - if hasattr(self.config, 'num_categorical_features'): - self.config.num_categorical_features = categorical_variables.shape[1] - - categorical_vocabulary_sizes = np.max(categorical_variables, axis=0) + 1 - - if hasattr(self.config, 'categorical_vocabulary_sizes') and self.config.categorical_vocabulary_sizes is not None: - if self.config.categorical_vocabulary_sizes != list(categorical_vocabulary_sizes): - logger.warning( - "Overwriting categorical_vocabulary_sizes with values from training data." 
- ) - if hasattr(self.config, 'categorical_vocabulary_sizes'): - self.config.categorical_vocabulary_sizes = list(categorical_vocabulary_sizes) - - self.classifier.prepare_text_features(training_text) - self.classifier._build_pytorch_model() - - if lightning: - self.classifier._check_and_init_lightning(**kwargs) - + self.text_embedder = TextEmbedder( + vocab_size=self.vocab_size, + embedding_dim=self.embedding_dim, + padding_idx=tokenizer.padding_idx, + ) + + classif_head_input_dim = self.embedding_dim + if self.categorical_vocabulary_sizes: + self.categorical_var_net = CategoricalVariableNet( + categorical_vocabulary_sizes=self.categorical_vocabulary_sizes, + categorical_embedding_dims=model_config.categorical_embedding_dims, + text_embedding_dim=self.embedding_dim, + ) + + if self.categorical_var_net.forward_type != CategoricalForwardType.SUM_TO_TEXT: + classif_head_input_dim += self.categorical_var_net.output_dim + + else: + self.categorical_var_net = None + + self.classification_head = ClassificationHead( + input_dim=classif_head_input_dim, + num_classes=model_config.num_classes, + ) + + self.pytorch_model = TextClassificationModel( + text_embedder=self.text_embedder, + categorical_variable_net=self.categorical_var_net, + classification_head=self.classification_head, + ) + def train( self, X_train: np.ndarray, y_train: np.ndarray, X_val: np.ndarray, y_val: np.ndarray, - num_epochs: int, - batch_size: int, - cpu_run: bool = False, - num_workers: int = 12, - patience_train: int = 3, + training_config: TrainingConfig, verbose: bool = False, - trainer_params: Optional[dict] = None, - **kwargs ) -> None: """Train the classifier using PyTorch Lightning. - + This method handles the complete training process including: - Data validation and preprocessing - Dataset and DataLoader creation - PyTorch Lightning trainer setup with callbacks - Model training with early stopping - Best model loading after training - + Args: X_train: Training input data y_train: Training labels X_val: Validation input data y_val: Validation labels - num_epochs: Maximum number of training epochs - batch_size: Batch size for training and validation - cpu_run: If True, force training on CPU instead of GPU - num_workers: Number of worker processes for data loading - patience_train: Number of epochs to wait for improvement before early stopping - verbose: If True, print detailed training progress - trainer_params: Additional parameters to pass to PyTorch Lightning Trainer - **kwargs: Additional arguments passed to the build method - + training_config: Configuration parameters for training + verbose: Whether to print training progress information + + Example: - >>> classifier.train( - ... X_train, y_train, X_val, y_val, - ... num_epochs=50, - ... batch_size=32, - ... patience_train=5, - ... verbose=True - ... ) + + >>> training_config = TrainingConfig( + ... lr=1e-3, + ... batch_size=4, + ... num_epochs=1, + ... ) + >>> ttc.train( + ... X_train=X, + ... y_train=Y, + ... X_val=X, + ... y_val=Y, + ... training_config=training_config, + ... ) """ # Input validation - training_text, train_categorical_variables, train_no_cat_var = check_X(X_train) - val_text, val_categorical_variables, val_no_cat_var = check_X(X_val) - y_train = check_Y(y_train) - y_val = check_Y(y_val) - - # Consistency checks - assert train_no_cat_var == val_no_cat_var, ( - "X_train and X_val must have the same number of categorical variables." 
-        )
-        assert X_train.shape[0] == y_train.shape[0], (
-            "X_train and y_train must have the same number of observations."
-        )
-        assert X_train.ndim > 1 and X_train.shape[1] == X_val.shape[1] or X_val.ndim == 1, (
-            "X_train and X_val must have the same number of columns."
-        )
-
+        X_train, y_train = self._check_XY(X_train, y_train)
+        X_val, y_val = self._check_XY(X_val, y_val)
+
+        if X_train["categorical_variables"] is not None:
+            assert (
+                X_train["categorical_variables"].ndim > 1
+                and X_train["categorical_variables"].shape[1]
+                == X_val["categorical_variables"].shape[1]
+                or X_val["categorical_variables"].ndim == 1
+            ), "X_train and X_val must have the same number of columns."
+
         if verbose:
             logger.info("Starting training process...")
-
+
         # Device setup
-        if cpu_run:
+        if training_config.cpu_run:
             device = torch.device("cpu")
+            accelerator = "cpu"
         else:
             device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-        self.classifier.device = device
-
+            accelerator = "gpu" if torch.cuda.is_available() else "cpu"
+
+        self.device = device
+
+        optimizer_params = {"lr": training_config.lr}
+        if training_config.optimizer_params is not None:
+            optimizer_params.update(training_config.optimizer_params)
+
+        self.lightning_module = TextClassificationModule(
+            model=self.pytorch_model,
+            loss=training_config.loss,
+            optimizer=training_config.optimizer,
+            optimizer_params=optimizer_params,
+            scheduler=training_config.scheduler,
+            scheduler_params=training_config.scheduler_params
+            if training_config.scheduler_params
+            else {},
+            scheduler_interval="epoch",
+        )
+
+        self.pytorch_model.to(self.device)
+
         if verbose:
             logger.info(f"Running on: {device}")
-
-        # Build model if not already built
-        if self.classifier.pytorch_model is None:
-            if verbose:
-                start = time.time()
-                logger.info("Building the model...")
-            self.build(X_train, y_train, **kwargs)
-            if verbose:
-                end = time.time()
-                logger.info(f"Model built in {end - start:.2f} seconds.")
-
-        self.classifier.pytorch_model = self.classifier.pytorch_model.to(device)
-
-        # Create datasets and dataloaders using wrapper methods
-        train_dataset = self.classifier.create_dataset(
-            texts=training_text,
+
+        train_dataset = TextClassificationDataset(
+            texts=X_train["text"],
+            categorical_variables=X_train["categorical_variables"],  # None if no cat vars
+            tokenizer=self.tokenizer,
             labels=y_train,
-            categorical_variables=train_categorical_variables,
         )
-        val_dataset = self.classifier.create_dataset(
-            texts=val_text,
+        val_dataset = TextClassificationDataset(
+            texts=X_val["text"],
+            categorical_variables=X_val["categorical_variables"],  # None if no cat vars
+            tokenizer=self.tokenizer,
             labels=y_val,
-            categorical_variables=val_categorical_variables,
         )
-
-        train_dataloader = self.classifier.create_dataloader(
-            dataset=train_dataset,
-            batch_size=batch_size,
-            num_workers=num_workers,
-            shuffle=True
+
+        train_dataloader = train_dataset.create_dataloader(
+            batch_size=training_config.batch_size,
+            num_workers=training_config.num_workers,
+            shuffle=True,
+            **training_config.dataloader_params if training_config.dataloader_params else {},
         )
-        val_dataloader = self.classifier.create_dataloader(
-            dataset=val_dataset,
-            batch_size=batch_size,
-            num_workers=num_workers,
-            shuffle=False
+        val_dataloader = val_dataset.create_dataloader(
+            batch_size=training_config.batch_size,
+            num_workers=training_config.num_workers,
+            shuffle=False,
+            **training_config.dataloader_params if training_config.dataloader_params else {},
         )
-
+
         # Setup trainer
         callbacks = [
             ModelCheckpoint(
                 monitor="val_loss",
                 save_top_k=1,
                 save_last=False,
                 mode="min",
             ),
             EarlyStopping(
                 monitor="val_loss",
-                patience=patience_train,
+                patience=training_config.patience_early_stopping,
                 mode="min",
             ),
             LearningRateMonitor(logging_interval="step"),
         ]
-
-        train_params = {
+
+        trainer_params = {
             "callbacks": callbacks,
-            "max_epochs": num_epochs,
+            "max_epochs": training_config.num_epochs,
             "num_sanity_val_steps": 2,
             "strategy": "auto",
             "log_every_n_steps": 1,
             "enable_progress_bar": True,
         }
-
-        if trainer_params is not None:
-            train_params.update(trainer_params)
-
-        trainer = pl.Trainer(**train_params)
-
+
+        if training_config.trainer_params is not None:
+            trainer_params.update(training_config.trainer_params)
+
+        trainer = pl.Trainer(**trainer_params, accelerator=accelerator)
+
         torch.cuda.empty_cache()
         torch.set_float32_matmul_precision("medium")
-
+
         if verbose:
             logger.info("Launching training...")
             start = time.time()
-
-        trainer.fit(self.classifier.lightning_module, train_dataloader, val_dataloader)
-
+
+        trainer.fit(self.lightning_module, train_dataloader, val_dataloader)
+
         if verbose:
             end = time.time()
             logger.info(f"Training completed in {end - start:.2f} seconds.")
-
-        # Load best model using wrapper method
+
         best_model_path = trainer.checkpoint_callback.best_model_path
-        self.classifier.load_best_model(best_model_path)
-
-    def predict(self, X: np.ndarray, **kwargs) -> np.ndarray:
-        """Make predictions on input data.
-
-        Args:
-            X: Input data for prediction (text and optional categorical features)
-            **kwargs: Additional arguments passed to the underlying predictor
-
-        Returns:
-            np.ndarray: Predicted class labels
-
-        Example:
-            >>> X_test = np.array(["new text sample", "another sample"])
-            >>> predictions = classifier.predict(X_test)
-            >>> print(predictions)  # [0, 1]
-        """
-        return self.classifier.predict(X, **kwargs)
-
-    def validate(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        """Validate the model on test data.
-
-        Args:
-            X: Input data for validation
-            Y: True labels for validation
-            **kwargs: Additional arguments passed to the validator
-
-        Returns:
-            float: Validation accuracy score
-
-        Example:
-            >>> accuracy = classifier.validate(X_test, y_test)
-            >>> print(f"Accuracy: {accuracy:.3f}")
-        """
-        return self.classifier.validate(X, Y, **kwargs)
-
-    def predict_and_explain(self, X: np.ndarray, **kwargs):
-        """Make predictions with explanations (if supported).
-
-        This method provides both predictions and explanations for the model's
-        decisions. Availability depends on the specific classifier implementation.
-
+
+        self.lightning_module = TextClassificationModule.load_from_checkpoint(
+            best_model_path,
+            model=self.pytorch_model,
+            loss=training_config.loss,
+        )
+
+        self.pytorch_model = self.lightning_module.model.to(self.device)
+
+        self.lightning_module.eval()
+
+    def _check_XY(self, X: np.ndarray, Y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+        X = self._check_X(X)
+        Y = self._check_Y(Y)
+
+        if X["text"].shape[0] != Y.shape[0]:
+            raise ValueError("X_train and y_train must have the same number of observations.")
+
+        return X, Y
+
+    @staticmethod
+    def _check_text_col(X):
+        assert isinstance(
+            X, np.ndarray
+        ), "X must be a numpy array of shape (N,d), with the first column being the text and the rest being the categorical variables."
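+        # Expected layout (values illustrative): X[:, 0] is the raw text column and
+        # X[:, 1:] are integer-encoded categorical variables, e.g.
+        #   X = np.array([["great product", 0, 2], ["never arrived", 1, 0]], dtype=object)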
+
+        try:
+            if X.ndim > 1:
+                text = X[:, 0].astype(str)
+            else:
+                text = X[:].astype(str)
+        except ValueError:
+            logger.error("The first column of X must be castable in string format.")
+
+        return text
+
+    def _check_categorical_variables(self, X: np.ndarray) -> np.ndarray:
+        """Check if categorical variables in X match the training configuration.
+
+        Args:
+            X: Input data to check
+
+        Raises:
+            ValueError: If the number of categorical variables does not match
+                the training configuration
+
+        Returns:
+            np.ndarray: The integer-encoded categorical columns X[:, 1:].
+        """
+
+        assert self.categorical_var_net is not None
+
+        if X.ndim > 1:
+            num_cat_vars = X.shape[1] - 1
+        else:
+            num_cat_vars = 0
+
+        if num_cat_vars != self.categorical_var_net.num_categorical_features:
+            raise ValueError(
+                f"X must have the same number of categorical variables as the number of embedding layers in the categorical net: ({self.categorical_var_net.num_categorical_features})."
+            )
+
+        try:
+            categorical_variables = X[:, 1:].astype(int)
+        except ValueError:
+            logger.error(
+                f"Columns 1 to {X.shape[1] - 1} of X_train must be castable in integer format."
+            )
+
+        for j in range(1, X.shape[1]):
+            max_cat_value = categorical_variables[:, j - 1].max()
+            if max_cat_value >= self.categorical_var_net.categorical_vocabulary_sizes[j - 1]:
+                raise ValueError(
+                    f"Categorical variable at index {j} has value {max_cat_value} which exceeds the vocabulary size of {self.categorical_var_net.categorical_vocabulary_sizes[j - 1]}."
+                )
+
+        return categorical_variables
+
+    def _check_X(self, X: np.ndarray) -> dict:
+        text = self._check_text_col(X)
+
+        categorical_variables = None
+        if self.categorical_var_net is not None:
+            categorical_variables = self._check_categorical_variables(X)
+
+        return {"text": text, "categorical_variables": categorical_variables}
+
+    def _check_Y(self, Y):
+        assert isinstance(Y, np.ndarray), "Y must be a numpy array of shape (N,) or (N,1)."
+        assert len(Y.shape) == 1 or (
+            len(Y.shape) == 2 and Y.shape[1] == 1
+        ), "Y must be a numpy array of shape (N,) or (N,1)."
+
+        try:
+            Y = Y.astype(int)
+        except ValueError:
+            logger.error("Y must be castable in integer format.")
+
+        if Y.max() >= self.num_classes or Y.min() < 0:
+            raise ValueError(
+                f"Y contains class labels outside the range [0, {self.num_classes - 1}]."
+ ) + + return Y + + def predict( + self, + X_test: np.ndarray, + top_k=1, + explain=False, + ): """ - with open(filepath, "w") as f: - data = { - "config": self.config.to_dict(), - } - - # Try to get wrapper class info for reconstruction - if hasattr(self.classifier.__class__, 'get_wrapper_class_info'): - data["wrapper_class_info"] = self.classifier.__class__.get_wrapper_class_info() - else: - # Fallback: store module and class name - data["wrapper_class_info"] = { - "module": self.classifier.__class__.__module__, - "class_name": self.classifier.__class__.__name__ - } - - json.dump(data, f, cls=NumpyJSONEncoder, indent=4) - - @classmethod - def from_json(cls, filepath: str, wrapper_class: Optional[Type[BaseClassifierWrapper]] = None) -> "torchTextClassifiers": - """Load classifier configuration from JSON file. - - This method creates a new classifier instance from a previously saved - configuration file. The classifier will need to be built and trained again. - Args: - filepath: Path to the JSON configuration file - wrapper_class: Optional wrapper class to use. If not provided, will try to - reconstruct from saved wrapper_class_info - - Returns: - torchTextClassifiers: New classifier instance with loaded configuration - - Raises: - ImportError: If the wrapper class cannot be imported - FileNotFoundError: If the configuration file doesn't exist - - Example: - >>> # Using saved wrapper class info - >>> classifier = torchTextClassifiers.from_json('my_classifier_config.json') - >>> - >>> # Or providing wrapper class explicitly - >>> from torchTextClassifiers.classifiers.fasttext.wrapper import FastTextWrapper - >>> classifier = torchTextClassifiers.from_json('config.json', FastTextWrapper) + X_test (np.ndarray): input data to predict on, shape (N,d) where the first column is text and the rest are categorical variables + top_k (int): for each sentence, return the top_k most likely predictions (default: 1) + explain (bool): launch gradient integration to have an explanation of the prediction (default: False) + + Returns: A dictionary containing the following fields: + - predictions (torch.Tensor, shape (len(text), top_k)): A tensor containing the top_k most likely codes to the query. + - confidence (torch.Tensor, shape (len(text), top_k)): A tensor array containing the corresponding confidence scores. + - if explain is True: + - attributions (torch.Tensor, shape (len(text), top_k, seq_len)): A tensor containing the attributions for each token in the text. """ - with open(filepath, "r") as f: - data = json.load(f) - - if wrapper_class is None: - # Try to reconstruct wrapper class from saved info - if "wrapper_class_info" not in data: - raise ValueError("No wrapper_class_info found in config file and no wrapper_class provided") - - wrapper_info = data["wrapper_class_info"] - module_name = wrapper_info["module"] - class_name = wrapper_info["class_name"] - - # Dynamically import the wrapper class - import importlib - module = importlib.import_module(module_name) - wrapper_class = getattr(module, class_name) - - # Reconstruct config using wrapper class's config class - config_class = wrapper_class.get_config_class() - config = config_class.from_dict(data["config"]) - - # Create wrapper instance - wrapper = wrapper_class(config) - - return cls(wrapper) \ No newline at end of file + + if explain: + if self.pytorch_model.text_embedder is None: + raise RuntimeError( + "Explainability is not supported when the tokenizer outputs vectorized text directly. Please use a tokenizer that outputs token IDs." 
+                )
+            else:
+                if not HAS_CAPTUM:
+                    raise ImportError(
+                        "Captum is not installed and is required for explainability. Run 'pip install/uv add torchTextClassifiers[explainability]'."
+                    )
+                lig = LayerIntegratedGradients(
+                    self.pytorch_model, self.pytorch_model.text_embedder.embedding_layer
+                )  # initialize a Captum layer gradient integrator
+
+        X_test = self._check_X(X_test)
+        text = X_test["text"]
+        categorical_variables = X_test["categorical_variables"]
+
+        self.pytorch_model.eval()
+
+        tokenize_output = self.tokenizer.tokenize(text.tolist())
+
+        encoded_text = tokenize_output["input_ids"]  # (batch_size, seq_len)
+        attention_mask = tokenize_output["attention_mask"]  # (batch_size, seq_len)
+
+        if categorical_variables is not None:
+            categorical_vars = torch.tensor(
+                categorical_variables, dtype=torch.float32
+            )  # (batch_size, num_categorical_features)
+        else:
+            categorical_vars = torch.empty((encoded_text.shape[0], 0), dtype=torch.float32)
+
+        pred = self.pytorch_model(
+            encoded_text, attention_mask, categorical_vars
+        )  # forward pass, contains the prediction scores (len(text), num_classes)
+
+        label_scores = pred.detach().cpu().softmax(dim=1)  # convert to probabilities
+
+        label_scores_topk = torch.topk(label_scores, k=top_k, dim=1)
+
+        predictions = label_scores_topk.indices  # get the top_k most likely predictions
+        confidence = torch.round(label_scores_topk.values, decimals=2)  # and their scores
+
+        if explain:
+            all_attributions = []
+            for k in range(top_k):
+                attributions = lig.attribute(
+                    (encoded_text, attention_mask, categorical_vars),
+                    target=torch.Tensor(predictions[:, k]).long(),
+                )  # (batch_size, seq_len)
+                attributions = attributions.sum(dim=-1)
+                all_attributions.append(attributions.detach().cpu())
+
+            all_attributions = torch.stack(all_attributions, dim=1)  # (batch_size, top_k, seq_len)
+
+            return {
+                "prediction": predictions,
+                "confidence": confidence,
+                "attributions": all_attributions,
+            }
+        else:
+            return {
+                "prediction": predictions,
+                "confidence": confidence,
+            }

diff --git a/torchTextClassifiers/utilities/checkers.py b/torchTextClassifiers/utilities/checkers.py
index 11fa6a7..7474597 100644
--- a/torchTextClassifiers/utilities/checkers.py
+++ b/torchTextClassifiers/utilities/checkers.py
@@ -6,51 +6,6 @@
 logger = logging.getLogger(__name__)


-def check_X(X):
-    assert isinstance(
-        X, np.ndarray
-    ), "X must be a numpy array of shape (N,d), with the first column being the text and the rest being the categorical variables."
-
-    try:
-        if X.ndim > 1:
-            text = X[:, 0].astype(str)
-        else:
-            text = X[:].astype(str)
-    except ValueError:
-        logger.error("The first column of X must be castable in string format.")
-
-    if len(X.shape) == 1 or (len(X.shape) == 2 and X.shape[1] == 1):
-        no_cat_var = True
-    else:
-        no_cat_var = False
-
-    if not no_cat_var:
-        try:
-            categorical_variables = X[:, 1:].astype(int)
-        except ValueError:
-            logger.error(
-                f"Columns {1} to {X.shape[1] - 1} of X_train must be castable in integer format."
-            )
-    else:
-        categorical_variables = None
-
-    return text, categorical_variables, no_cat_var
-
-
-def check_Y(Y):
-    assert isinstance(Y, np.ndarray), "Y must be a numpy array of shape (N,) or (N,1)."
-    assert len(Y.shape) == 1 or (
-        len(Y.shape) == 2 and Y.shape[1] == 1
-    ), "Y must be a numpy array of shape (N,) or (N,1)."
- - try: - Y = Y.astype(int) - except ValueError: - logger.error("Y must be castable in integer format.") - - return Y - - class NumpyJSONEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, np.integer): From ddd7cec8a3cdf2f9d16b744abbf4dd2da806ac52 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Wed, 5 Nov 2025 18:10:35 +0000 Subject: [PATCH 19/66] fix: return only optimizer when scheduler is none --- torchTextClassifiers/model/lightning.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torchTextClassifiers/model/lightning.py b/torchTextClassifiers/model/lightning.py index c1e8839..e432082 100644 --- a/torchTextClassifiers/model/lightning.py +++ b/torchTextClassifiers/model/lightning.py @@ -146,6 +146,9 @@ def configure_optimizers(self): """ optimizer = self.optimizer(self.parameters(), **self.optimizer_params) + if self.scheduler is None: + return optimizer + # Only use scheduler if it's not ReduceLROnPlateau or if we can ensure val_loss is available # For complex training setups, sometimes val_loss is not available every epoch if hasattr(self.scheduler, "__name__") and "ReduceLROnPlateau" in self.scheduler.__name__: From 32e6805d31fb511754a852e2b060057325f3048b Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Wed, 5 Nov 2025 18:12:15 +0000 Subject: [PATCH 20/66] feat(test): clean tests (wip) --- tests/conftest.py | 71 ++- tests/test_base_classes.py | 256 ---------- tests/test_core_functionality.py | 214 --------- tests/test_fasttext_implementation.py | 445 ------------------ ...rdpiece_tokenizer.py => test_tokenizer.py} | 0 tests/test_torchTextClassifiers.py | 356 +------------- 6 files changed, 43 insertions(+), 1299 deletions(-) delete mode 100644 tests/test_base_classes.py delete mode 100644 tests/test_core_functionality.py delete mode 100644 tests/test_fasttext_implementation.py rename tests/{test_wordpiece_tokenizer.py => test_tokenizer.py} (100%) diff --git a/tests/conftest.py b/tests/conftest.py index cde1c4e..4023570 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,19 +1,22 @@ -import pytest +from unittest.mock import Mock + import numpy as np -from unittest.mock import Mock, MagicMock +import pytest @pytest.fixture def sample_text_data(): """Sample text data for testing.""" - return np.array([ - "This is a positive example", - "This is a negative example", - "Another positive case", - "Another negative case", - "Good example here", - "Bad example here" - ]) + return np.array( + [ + "This is a positive example", + "This is a negative example", + "Another positive case", + "Another negative case", + "Good example here", + "Bad example here", + ] + ) @pytest.fixture @@ -25,14 +28,7 @@ def sample_labels(): @pytest.fixture def sample_categorical_data(): """Sample categorical data for testing.""" - return np.array([ - [1, 2], - [2, 1], - [1, 3], - [3, 1], - [2, 2], - [3, 3] - ]) + return np.array([[1, 2], [2, 1], [1, 3], [3, 1], [2, 2], [3, 3]]) @pytest.fixture @@ -48,33 +44,32 @@ def sample_X_text_only(sample_text_data): @pytest.fixture -def fasttext_config(): - """Mock FastText configuration.""" - from torchTextClassifiers.classifiers.fasttext.core import FastTextConfig - - config = FastTextConfig( +def model_config(): + """Mock model configuration.""" + from torchTextClassifiers import ModelConfig + + config = ModelConfig( embedding_dim=10, - sparse=False, - num_tokens=1000, - min_count=1, - min_n=3, - max_n=6, - len_word_ngrams=2, - num_classes=2 + categorical_vocabulary_sizes=[4, 5], + categorical_embedding_dims=[3, 4], + 
num_classes=10, ) return config @pytest.fixture def mock_tokenizer(): - """Mock NGramTokenizer for testing.""" + """Mock BaseTokenizer for testing.""" tokenizer = Mock() - tokenizer.min_count = 1 - tokenizer.min_n = 3 - tokenizer.max_n = 6 - tokenizer.num_tokens = 1000 - tokenizer.word_ngrams = 2 - tokenizer.padding_index = 999 + tokenizer.vocab_size = 1000 + tokenizer.padding_idx = 1 + tokenizer.tokenize = Mock( + return_value={ + "input_ids": np.array([[1, 2, 3], [4, 5, 6]]), + "attention_mask": np.array([[1, 1, 1], [1, 1, 1]]), + } + ) + tokenizer.output_dim = 50 return tokenizer @@ -108,4 +103,4 @@ def mock_dataset(): @pytest.fixture def mock_dataloader(): """Mock dataloader for testing.""" - return Mock() \ No newline at end of file + return Mock() diff --git a/tests/test_base_classes.py b/tests/test_base_classes.py deleted file mode 100644 index 1d7d7dc..0000000 --- a/tests/test_base_classes.py +++ /dev/null @@ -1,256 +0,0 @@ -import pytest -import numpy as np -from abc import ABC -from unittest.mock import Mock - -from torchTextClassifiers.classifiers.base import BaseClassifierConfig, BaseClassifierWrapper - - -class TestBaseClassifierConfig: - """Test the BaseClassifierConfig abstract base class.""" - - def test_cannot_instantiate_abstract_class(self): - """Test that BaseClassifierConfig cannot be instantiated directly.""" - with pytest.raises(TypeError): - BaseClassifierConfig() - - def test_concrete_implementation_required_methods(self): - """Test that concrete implementations must provide required methods.""" - - class ConcreteConfig(BaseClassifierConfig): - def __init__(self, value): - self.value = value - - def to_dict(self): - return {"value": self.value} - - @classmethod - def from_dict(cls, data): - return cls(data["value"]) - - # Should work with all methods implemented - config = ConcreteConfig(42) - assert config.value == 42 - - # Test serialization - config_dict = config.to_dict() - assert config_dict == {"value": 42} - - # Test deserialization - restored_config = ConcreteConfig.from_dict(config_dict) - assert restored_config.value == 42 - - def test_incomplete_implementation_fails(self): - """Test that incomplete implementations cannot be instantiated.""" - - class IncompleteConfig(BaseClassifierConfig): - def to_dict(self): - return {} - # Missing from_dict method - - with pytest.raises(TypeError): - IncompleteConfig() - - -class TestBaseClassifierWrapper: - """Test the BaseClassifierWrapper abstract base class.""" - - def test_cannot_instantiate_abstract_class(self): - """Test that BaseClassifierWrapper cannot be instantiated directly.""" - mock_config = Mock() - with pytest.raises(TypeError): - BaseClassifierWrapper(mock_config) - - def test_concrete_implementation_initialization(self): - """Test that concrete implementations can be initialized.""" - - class ConcreteWrapper(BaseClassifierWrapper): - def prepare_text_features(self, training_text): - pass # Mock implementation - - def _build_pytorch_model(self): - self.pytorch_model = "mock_model" - - def _check_and_init_lightning(self, **kwargs): - self.lightning_module = "mock_lightning" - - def predict(self, X, **kwargs): - return np.array([1, 0, 1]) - - def validate(self, X, Y, **kwargs): - return 0.85 - - def create_dataset(self, texts, labels, categorical_variables=None): - return "mock_dataset" - - def create_dataloader(self, dataset, batch_size, num_workers=0, shuffle=True): - return "mock_dataloader" - - def load_best_model(self, checkpoint_path): - self.trained = True - - @classmethod - def 
get_config_class(cls): - return Mock - - mock_config = Mock() - wrapper = ConcreteWrapper(mock_config) - - # Test initialization - assert wrapper.config == mock_config - assert wrapper.pytorch_model is None - assert wrapper.lightning_module is None - assert wrapper.trained == False - assert wrapper.device is None - - def test_concrete_implementation_methods(self): - """Test that concrete implementations can use all methods.""" - - class ConcreteWrapper(BaseClassifierWrapper): - def prepare_text_features(self, training_text): - pass # Mock implementation - - def _build_pytorch_model(self): - self.pytorch_model = "pytorch_model" - - def _check_and_init_lightning(self, **kwargs): - self.lightning_module = f"lightning_with_{kwargs}" - - def predict(self, X, **kwargs): - return np.array([1] * len(X)) - - def validate(self, X, Y, **kwargs): - return float(np.mean(Y)) - - def create_dataset(self, texts, labels, categorical_variables=None): - return { - "texts": texts, - "labels": labels, - "categorical": categorical_variables - } - - def create_dataloader(self, dataset, batch_size, num_workers=0, shuffle=True): - return { - "dataset": dataset, - "batch_size": batch_size, - "num_workers": num_workers, - "shuffle": shuffle - } - - def load_best_model(self, checkpoint_path): - self.trained = True - self.pytorch_model = f"model_from_{checkpoint_path}" - - @classmethod - def get_config_class(cls): - return Mock - - mock_config = Mock() - wrapper = ConcreteWrapper(mock_config) - - # Test prepare_text_features - sample_texts = np.array(["text1", "text2", "text3"]) - wrapper.prepare_text_features(sample_texts) - # Mock implementation doesn't set anything, so just test it doesn't crash - - # Test _build_pytorch_model - wrapper._build_pytorch_model() - assert wrapper.pytorch_model == "pytorch_model" - - # Test _check_and_init_lightning - wrapper._check_and_init_lightning(learning_rate=0.01) - assert "learning_rate" in str(wrapper.lightning_module) - - # Test predict - X = np.array(["test1", "test2"]) - predictions = wrapper.predict(X) - np.testing.assert_array_equal(predictions, np.array([1, 1])) - - # Test validate - Y = np.array([0, 1, 1, 0]) - accuracy = wrapper.validate(X, Y) - assert accuracy == 0.5 - - # Test create_dataset - labels = np.array([1, 0]) - dataset = wrapper.create_dataset(sample_texts[:2], labels) - np.testing.assert_array_equal(dataset["texts"], sample_texts[:2]) - np.testing.assert_array_equal(dataset["labels"], labels) - assert dataset["categorical"] is None - - # Test create_dataset with categorical - categorical = np.array([[1, 2], [3, 4]]) - dataset_with_cat = wrapper.create_dataset(sample_texts[:2], labels, categorical) - np.testing.assert_array_equal(dataset_with_cat["categorical"], categorical) - - # Test create_dataloader - dataloader = wrapper.create_dataloader(dataset, batch_size=32, num_workers=4, shuffle=False) - assert dataloader["dataset"] == dataset - assert dataloader["batch_size"] == 32 - assert dataloader["num_workers"] == 4 - assert dataloader["shuffle"] == False - - # Test load_best_model - checkpoint_path = "/path/to/checkpoint" - wrapper.load_best_model(checkpoint_path) - assert wrapper.trained == True - assert wrapper.pytorch_model == f"model_from_{checkpoint_path}" - - def test_incomplete_implementation_fails(self): - """Test that incomplete implementations cannot be instantiated.""" - - class IncompleteWrapper(BaseClassifierWrapper): - def prepare_text_features(self, training_text): - pass - - def _build_pytorch_model(self): - pass - - def 
_check_and_init_lightning(self, **kwargs): - pass - - def predict(self, X, **kwargs): - return np.array([]) - - # Missing: validate, create_dataset, create_dataloader, load_best_model - - mock_config = Mock() - with pytest.raises(TypeError): - IncompleteWrapper(mock_config) - - def test_method_signatures(self): - """Test that abstract methods have correct signatures.""" - - class ConcreteWrapper(BaseClassifierWrapper): - def prepare_text_features(self, training_text: np.ndarray) -> None: - pass - - def _build_pytorch_model(self) -> None: - pass - - def _check_and_init_lightning(self, **kwargs) -> None: - pass - - def predict(self, X: np.ndarray, **kwargs) -> np.ndarray: - return np.array([]) - - def validate(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return 0.0 - - def create_dataset(self, texts: np.ndarray, labels: np.ndarray, categorical_variables=None): - return None - - def create_dataloader(self, dataset, batch_size: int, num_workers: int = 0, shuffle: bool = True): - return None - - def load_best_model(self, checkpoint_path: str) -> None: - pass - - @classmethod - def get_config_class(cls): - return Mock - - # Should be able to instantiate with all methods implemented - mock_config = Mock() - wrapper = ConcreteWrapper(mock_config) - assert wrapper is not None \ No newline at end of file diff --git a/tests/test_core_functionality.py b/tests/test_core_functionality.py deleted file mode 100644 index a307e64..0000000 --- a/tests/test_core_functionality.py +++ /dev/null @@ -1,214 +0,0 @@ -import pytest -import numpy as np -import json -import tempfile -import os -from unittest.mock import Mock, patch, MagicMock - - -def test_basic_imports(): - """Test that core modules can be imported without torch dependencies.""" - # Test that the main class can be imported - from torchTextClassifiers.torchTextClassifiers import torchTextClassifiers - assert torchTextClassifiers is not None - - -def test_wrapper_based_pattern(): - """Test the wrapper-based pattern without actual implementations.""" - from torchTextClassifiers.classifiers.base import BaseClassifierWrapper, BaseClassifierConfig - - # Test that base classes exist and have expected structure - assert hasattr(BaseClassifierWrapper, 'prepare_text_features') - assert hasattr(BaseClassifierWrapper, '_build_pytorch_model') - assert hasattr(BaseClassifierWrapper, 'get_config_class') - assert hasattr(BaseClassifierConfig, 'to_dict') - assert hasattr(BaseClassifierConfig, 'from_dict') - - -def test_class_structure(): - """Test that the main class has the expected class structure.""" - # We can test the class structure without instantiating - from torchTextClassifiers.torchTextClassifiers import torchTextClassifiers - - # Check that it has the expected methods defined - assert hasattr(torchTextClassifiers, '__init__') - assert hasattr(torchTextClassifiers, 'build_tokenizer') # backward compatibility - assert hasattr(torchTextClassifiers, 'prepare_text_features') # new method - assert hasattr(torchTextClassifiers, 'build') - assert hasattr(torchTextClassifiers, 'train') - assert hasattr(torchTextClassifiers, 'predict') - assert hasattr(torchTextClassifiers, 'validate') - assert hasattr(torchTextClassifiers, 'to_json') - assert hasattr(torchTextClassifiers, 'from_json') - - -def test_abstract_base_classes(): - """Test the abstract base class structure.""" - from torchTextClassifiers.classifiers.base import BaseClassifierConfig, BaseClassifierWrapper - - # Test that they are abstract - with pytest.raises(TypeError): - 
BaseClassifierConfig() - - # Test that BaseClassifierWrapper requires a config but still can't be instantiated - mock_config = Mock() - with pytest.raises(TypeError): - BaseClassifierWrapper(mock_config) - - -def test_utilities_import(): - """Test that utility functions can be imported.""" - from torchTextClassifiers.utilities.checkers import check_X, check_Y, NumpyJSONEncoder - - # Test basic functionality that doesn't depend on complex imports - assert callable(check_X) - assert callable(check_Y) - assert NumpyJSONEncoder is not None - - -def test_torchTextClassifiers_initialization_pattern(): - """Test the initialization pattern using mocks.""" - from torchTextClassifiers.torchTextClassifiers import torchTextClassifiers - - # Mock wrapper with config - mock_wrapper = Mock() - mock_config = Mock() - mock_wrapper.config = mock_config - - # Create instance directly with wrapper - classifier = torchTextClassifiers(mock_wrapper) - - # Verify initialization - assert classifier.classifier == mock_wrapper - assert classifier.config == mock_config - - -def test_numpy_json_encoder(): - """Test the custom JSON encoder for numpy arrays.""" - from torchTextClassifiers.utilities.checkers import NumpyJSONEncoder - - # Test with numpy array - test_data = { - "array": np.array([1, 2, 3]), - "scalar": np.int64(42), - "regular": "string" - } - - # Should not raise an error - json_str = json.dumps(test_data, cls=NumpyJSONEncoder) - assert isinstance(json_str, str) - - # Verify it can be loaded back - loaded_data = json.loads(json_str) - assert loaded_data["regular"] == "string" - - -def test_create_fasttext_classmethod(): - """Test the create_fasttext class method through FastTextFactory.""" - from torchTextClassifiers.classifiers.fasttext.core import FastTextFactory - from torchTextClassifiers.torchTextClassifiers import torchTextClassifiers - from torchTextClassifiers.classifiers.fasttext.wrapper import FastTextWrapper - - # Just test that it creates a real instance and config properly - result = FastTextFactory.create_fasttext( - embedding_dim=50, - sparse=True, - num_tokens=5000, - min_count=2, - min_n=2, - max_n=5, - len_word_ngrams=3, - num_classes=2 - ) - - # Verify the result is a proper torchTextClassifiers instance - assert isinstance(result, torchTextClassifiers) - assert isinstance(result.classifier, FastTextWrapper) - assert result.config.embedding_dim == 50 - assert result.config.sparse == True - assert result.config.num_tokens == 5000 - - -def test_method_delegation_pattern(): - """Test that the main class properly delegates to wrapper methods.""" - from torchTextClassifiers.torchTextClassifiers import torchTextClassifiers - - # Create a mock instance - classifier = Mock(spec=torchTextClassifiers) - classifier.classifier = Mock() - - # Test predict delegation - expected_result = np.array([1, 0, 1]) - classifier.classifier.predict.return_value = expected_result - - # Apply the real predict method to our mock - sample_X = np.array(["test1", "test2", "test3"]) - result = torchTextClassifiers.predict(classifier, sample_X) - - classifier.classifier.predict.assert_called_once_with(sample_X) - assert result is expected_result - - -def test_error_handling_patterns(): - """Test expected error handling without actual implementation.""" - - # Test that incomplete wrapper configurations raise appropriate errors - from torchTextClassifiers.torchTextClassifiers import torchTextClassifiers - - # Test with invalid wrapper (missing config attribute) - class InvalidWrapper: - pass - - invalid_wrapper = 
InvalidWrapper() - - # This should raise AttributeError for missing config - with pytest.raises(AttributeError): - torchTextClassifiers(invalid_wrapper) - - -@pytest.mark.parametrize("method_name,expected_args", [ - ("predict", ["X"]), - ("validate", ["X", "Y"]), - ("prepare_text_features", ["training_text"]), -]) -def test_wrapper_method_signatures(method_name, expected_args): - """Test that wrapper methods have expected signatures.""" - from torchTextClassifiers.classifiers.base import BaseClassifierWrapper - - # Get the method from the abstract class - method = getattr(BaseClassifierWrapper, method_name) - - # Check that it's abstract - assert hasattr(method, '__isabstractmethod__') - assert method.__isabstractmethod__ == True - - -def test_configuration_serialization_pattern(): - """Test the configuration serialization pattern.""" - from torchTextClassifiers.classifiers.base import BaseClassifierConfig - - # Verify abstract methods exist - assert hasattr(BaseClassifierConfig, 'to_dict') - assert hasattr(BaseClassifierConfig, 'from_dict') - - # Verify they are abstract - assert BaseClassifierConfig.to_dict.__isabstractmethod__ == True - assert BaseClassifierConfig.from_dict.__isabstractmethod__ == True - - -def test_sample_data_fixtures(sample_text_data, sample_labels, sample_categorical_data): - """Test that our test fixtures work correctly.""" - assert len(sample_text_data) == 6 - assert len(sample_labels) == 6 - assert sample_categorical_data.shape == (6, 2) - - # Verify data types - assert isinstance(sample_text_data, np.ndarray) - assert isinstance(sample_labels, np.ndarray) - assert isinstance(sample_categorical_data, np.ndarray) - - # Verify content makes sense - assert all(isinstance(text, str) for text in sample_text_data) - assert all(label in [0, 1] for label in sample_labels) - - diff --git a/tests/test_fasttext_implementation.py b/tests/test_fasttext_implementation.py deleted file mode 100644 index 770ae83..0000000 --- a/tests/test_fasttext_implementation.py +++ /dev/null @@ -1,445 +0,0 @@ -import pytest -import numpy as np -import torch -from unittest.mock import Mock, patch, MagicMock - -from torchTextClassifiers.classifiers.fasttext.wrapper import FastTextWrapper -from torchTextClassifiers.classifiers.fasttext.core import FastTextConfig -from torchTextClassifiers.classifiers.fasttext.tokenizer import NGramTokenizer -from torchTextClassifiers.classifiers.fasttext.model import FastTextModelDataset, FastTextModel, FastTextModule - - -class TestFastTextConfig: - """Test FastTextConfig class.""" - - def test_config_creation(self): - """Test basic config creation.""" - config = FastTextConfig( - embedding_dim=100, - sparse=True, - num_tokens=5000, - min_count=2, - min_n=3, - max_n=6, - len_word_ngrams=2, - num_classes=3 - ) - - assert config.embedding_dim == 100 - assert config.sparse == True - assert config.num_tokens == 5000 - assert config.min_count == 2 - assert config.min_n == 3 - assert config.max_n == 6 - assert config.len_word_ngrams == 2 - assert config.num_classes == 3 - - def test_config_to_dict(self): - """Test config serialization to dictionary.""" - config = FastTextConfig( - embedding_dim=100, - sparse=False, - num_tokens=1000, - min_count=1, - min_n=3, - max_n=6, - len_word_ngrams=2 - ) - - config_dict = config.to_dict() - - assert isinstance(config_dict, dict) - assert config_dict['embedding_dim'] == 100 - assert config_dict['sparse'] == False - assert config_dict['num_tokens'] == 1000 - - def test_config_from_dict(self): - """Test config deserialization from 
dictionary.""" - config_dict = { - 'embedding_dim': 50, - 'sparse': True, - 'num_tokens': 2000, - 'min_count': 3, - 'min_n': 2, - 'max_n': 5, - 'len_word_ngrams': 3, - 'num_classes': 4 - } - - config = FastTextConfig.from_dict(config_dict) - - assert config.embedding_dim == 50 - assert config.sparse == True - assert config.num_tokens == 2000 - assert config.num_classes == 4 - - -class TestFastTextWrapper: - """Test FastTextWrapper class.""" - - def test_wrapper_initialization(self, fasttext_config): - """Test wrapper initialization.""" - wrapper = FastTextWrapper(fasttext_config) - - assert wrapper.config == fasttext_config - assert wrapper.tokenizer is None - assert wrapper.pytorch_model is None - assert wrapper.lightning_module is None - assert wrapper.trained == False - assert wrapper.device is None - - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.NGramTokenizer') - def test_build_tokenizer(self, mock_tokenizer_class, fasttext_config, sample_text_data): - """Test tokenizer building.""" - mock_tokenizer = Mock() - mock_tokenizer_class.return_value = mock_tokenizer - - wrapper = FastTextWrapper(fasttext_config) - wrapper.build_tokenizer(sample_text_data) - - mock_tokenizer_class.assert_called_once_with( - fasttext_config.min_count, - fasttext_config.min_n, - fasttext_config.max_n, - fasttext_config.num_tokens, - fasttext_config.len_word_ngrams, - sample_text_data - ) - assert wrapper.tokenizer == mock_tokenizer - - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.FastTextModel') - def test_build_pytorch_model_with_tokenizer(self, mock_model_class, fasttext_config, mock_tokenizer): - """Test PyTorch model building with existing tokenizer.""" - mock_model = Mock() - mock_model_class.return_value = mock_model - - wrapper = FastTextWrapper(fasttext_config) - wrapper.tokenizer = mock_tokenizer - - wrapper._build_pytorch_model() - - # Verify model was created with correct parameters - mock_model_class.assert_called_once() - call_kwargs = mock_model_class.call_args[1] - assert call_kwargs['tokenizer'] == mock_tokenizer - assert call_kwargs['embedding_dim'] == fasttext_config.embedding_dim - assert call_kwargs['num_classes'] == fasttext_config.num_classes - - assert wrapper.pytorch_model == mock_model - - def test_build_pytorch_model_no_tokenizer_no_num_rows(self, fasttext_config): - """Test PyTorch model building fails without tokenizer or num_rows.""" - wrapper = FastTextWrapper(fasttext_config) - # No tokenizer and no num_rows in config - wrapper.tokenizer = None - wrapper.config.num_rows = None - - with pytest.raises(ValueError, match="Please provide a tokenizer or num_rows"): - wrapper._build_pytorch_model() - - @patch('torch.optim.lr_scheduler.ReduceLROnPlateau') - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.FastTextModule') - def test_check_and_init_lightning_basic(self, mock_module_class, mock_scheduler, fasttext_config, mock_pytorch_model): - """Test Lightning module initialization.""" - mock_module = Mock() - mock_module_class.return_value = mock_module - - wrapper = FastTextWrapper(fasttext_config) - wrapper.pytorch_model = mock_pytorch_model - - wrapper._check_and_init_lightning(lr=0.01) - - # Verify Lightning module was created - mock_module_class.assert_called_once() - assert wrapper.lightning_module == mock_module - assert wrapper.optimizer_params == {"lr": 0.01} - - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.FastTextModule') - def test_check_and_init_lightning_uses_config_lr(self, mock_module_class, fasttext_config, 
mock_pytorch_model): - """Test Lightning module initialization uses config learning rate as default.""" - wrapper = FastTextWrapper(fasttext_config) - wrapper.pytorch_model = mock_pytorch_model - mock_module = Mock() - mock_module_class.return_value = mock_module - - # Should not raise an error since learning_rate is in config - wrapper._check_and_init_lightning() - - # Check that the learning rate from config was used - assert wrapper.optimizer_params['lr'] == fasttext_config.learning_rate - assert wrapper.lightning_module == mock_module - - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.check_X') - def test_predict_not_trained(self, mock_check_X, fasttext_config, sample_text_data): - """Test prediction fails when model not trained.""" - wrapper = FastTextWrapper(fasttext_config) - wrapper.trained = False - - with pytest.raises(Exception, match="Model must be trained first"): - wrapper.predict(sample_text_data) - - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.check_X') - def test_predict_success(self, mock_check_X, fasttext_config, sample_text_data, mock_pytorch_model): - """Test successful prediction.""" - mock_check_X.return_value = (sample_text_data, None, True) - expected_predictions = np.array([[1], [0], [1]]) # With top_k dimension - expected_confidence = np.array([[0.9], [0.8], [0.95]]) - mock_pytorch_model.predict.return_value = (expected_predictions, expected_confidence) - mock_pytorch_model.no_cat_var = True - - wrapper = FastTextWrapper(fasttext_config) - wrapper.trained = True - wrapper.pytorch_model = mock_pytorch_model - wrapper.config.num_categorical_features = None - - result = wrapper.predict(sample_text_data) - - mock_pytorch_model.predict.assert_called_once() - # The wrapper should squeeze the top_k dimension for top_k=1 - expected_result = np.array([1, 0, 1]) - np.testing.assert_array_equal(result, expected_result) - - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.FastTextWrapper.predict') - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.check_Y') - def test_validate_success(self, mock_check_Y, mock_predict, fasttext_config, - sample_text_data, sample_labels): - """Test successful validation.""" - mock_predictions = np.array([1, 0, 1]) - mock_predict.return_value = mock_predictions - mock_check_Y.return_value = np.array([1, 0, 1]) # Perfect predictions - - wrapper = FastTextWrapper(fasttext_config) - wrapper.trained = True - - result = wrapper.validate(sample_text_data, sample_labels) - - mock_predict.assert_called_once_with(sample_text_data) - mock_check_Y.assert_called_once_with(sample_labels) - assert result == 1.0 # Perfect accuracy - - def test_create_dataset(self, fasttext_config, sample_text_data, sample_labels, mock_tokenizer): - """Test dataset creation.""" - wrapper = FastTextWrapper(fasttext_config) - wrapper.tokenizer = mock_tokenizer - - with patch('torchTextClassifiers.classifiers.fasttext.wrapper.FastTextModelDataset') as mock_dataset_class: - mock_dataset = Mock() - mock_dataset_class.return_value = mock_dataset - - result = wrapper.create_dataset(sample_text_data, sample_labels) - - mock_dataset_class.assert_called_once_with( - categorical_variables=None, - texts=sample_text_data, - outputs=sample_labels, - tokenizer=mock_tokenizer - ) - assert result == mock_dataset - - def test_create_dataset_with_categorical(self, fasttext_config, sample_text_data, sample_labels, - sample_categorical_data, mock_tokenizer): - """Test dataset creation with categorical variables.""" - wrapper = 
FastTextWrapper(fasttext_config) - wrapper.tokenizer = mock_tokenizer - - with patch('torchTextClassifiers.classifiers.fasttext.wrapper.FastTextModelDataset') as mock_dataset_class: - mock_dataset = Mock() - mock_dataset_class.return_value = mock_dataset - - result = wrapper.create_dataset(sample_text_data, sample_labels, sample_categorical_data) - - mock_dataset_class.assert_called_once_with( - categorical_variables=sample_categorical_data, - texts=sample_text_data, - outputs=sample_labels, - tokenizer=mock_tokenizer - ) - assert result == mock_dataset - - def test_create_dataloader(self, fasttext_config, mock_dataset): - """Test dataloader creation.""" - mock_dataloader = Mock() - mock_dataset.create_dataloader.return_value = mock_dataloader - - wrapper = FastTextWrapper(fasttext_config) - - result = wrapper.create_dataloader(mock_dataset, batch_size=32, num_workers=4, shuffle=True) - - mock_dataset.create_dataloader.assert_called_once_with( - batch_size=32, num_workers=4, shuffle=True - ) - assert result == mock_dataloader - - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.FastTextModule') - def test_load_best_model(self, mock_module_class, fasttext_config, mock_pytorch_model): - """Test loading best model from checkpoint.""" - mock_loaded_module = Mock() - mock_loaded_module.model = mock_pytorch_model - mock_module_class.load_from_checkpoint.return_value = mock_loaded_module - - wrapper = FastTextWrapper(fasttext_config) - wrapper.pytorch_model = mock_pytorch_model - wrapper.loss = Mock() - wrapper.optimizer = Mock() - wrapper.optimizer_params = {} - wrapper.scheduler = Mock() - wrapper.scheduler_params = {} - - mock_pytorch_model.to.return_value = mock_pytorch_model - mock_pytorch_model.eval = Mock() - - checkpoint_path = "/fake/checkpoint/path" - wrapper.load_best_model(checkpoint_path) - - # Verify checkpoint loading - mock_module_class.load_from_checkpoint.assert_called_once_with( - checkpoint_path, - model=mock_pytorch_model, - loss=wrapper.loss, - optimizer=wrapper.optimizer, - optimizer_params=wrapper.optimizer_params, - scheduler=wrapper.scheduler, - scheduler_params=wrapper.scheduler_params, - scheduler_interval="epoch" - ) - - # Verify model state updates - assert wrapper.lightning_module == mock_loaded_module - assert wrapper.pytorch_model == mock_pytorch_model - assert wrapper.trained == True - mock_pytorch_model.eval.assert_called_once() - - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.check_X') - def test_predict_and_explain_success(self, mock_check_X, fasttext_config, sample_text_data, mock_pytorch_model): - """Test successful predict_and_explain.""" - mock_check_X.return_value = (sample_text_data, None, True) - expected_result = (np.array([1, 0, 1]), np.array([0.8, 0.2, 0.9])) - mock_pytorch_model.predict_and_explain.return_value = expected_result - mock_pytorch_model.no_cat_var = True - - wrapper = FastTextWrapper(fasttext_config) - wrapper.trained = True - wrapper.pytorch_model = mock_pytorch_model - wrapper.config.num_categorical_features = None - - result = wrapper.predict_and_explain(sample_text_data) - - mock_pytorch_model.predict_and_explain.assert_called_once() - assert result == expected_result - - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.check_X') - def test_predict_and_explain_not_trained(self, mock_check_X, fasttext_config, sample_text_data): - """Test predict_and_explain fails when model not trained.""" - wrapper = FastTextWrapper(fasttext_config) - wrapper.trained = False - - with pytest.raises(Exception, 
match="Model must be trained first"): - wrapper.predict_and_explain(sample_text_data) - - -class TestFastTextModelDataset: - """Test FastTextModelDataset class.""" - - def test_dataset_initialization_text_only(self, sample_text_data, sample_labels, mock_tokenizer): - """Test dataset initialization with text only.""" - dataset = FastTextModelDataset( - categorical_variables=None, - texts=sample_text_data, - outputs=sample_labels, - tokenizer=mock_tokenizer - ) - - assert len(dataset) == len(sample_text_data) - assert dataset.texts is sample_text_data - assert dataset.outputs is sample_labels - assert dataset.tokenizer is mock_tokenizer - assert dataset.categorical_variables is None - - def test_dataset_initialization_with_categorical(self, sample_text_data, sample_labels, - sample_categorical_data, mock_tokenizer): - """Test dataset initialization with categorical variables.""" - dataset = FastTextModelDataset( - categorical_variables=sample_categorical_data, - texts=sample_text_data, - outputs=sample_labels, - tokenizer=mock_tokenizer - ) - - assert len(dataset) == len(sample_text_data) - assert dataset.categorical_variables is sample_categorical_data - - def test_dataset_length_mismatch_categorical(self, sample_text_data, sample_labels, mock_tokenizer): - """Test dataset initialization fails with mismatched lengths.""" - wrong_categorical = np.array([[1, 2]]) # Wrong length - - with pytest.raises(ValueError, match="Categorical variables and texts must have the same length"): - FastTextModelDataset( - categorical_variables=wrong_categorical, - texts=sample_text_data, - outputs=sample_labels, - tokenizer=mock_tokenizer - ) - - def test_dataset_length_mismatch_outputs(self, sample_text_data, mock_tokenizer): - """Test dataset initialization fails with mismatched output lengths.""" - wrong_outputs = np.array([1, 0]) # Wrong length - - with pytest.raises(ValueError, match="Outputs and texts must have the same length"): - FastTextModelDataset( - categorical_variables=None, - texts=sample_text_data, - outputs=wrong_outputs, - tokenizer=mock_tokenizer - ) - - def test_dataset_getitem_with_outputs(self, sample_text_data, sample_labels, mock_tokenizer): - """Test dataset __getitem__ with outputs.""" - dataset = FastTextModelDataset( - categorical_variables=None, - texts=sample_text_data, - outputs=sample_labels, - tokenizer=mock_tokenizer - ) - - text, categorical_vars, output = dataset[0] - - assert text == sample_text_data[0] - assert categorical_vars is None - assert output == sample_labels[0] - - def test_dataset_getitem_without_outputs(self, sample_text_data, mock_tokenizer): - """Test dataset __getitem__ without outputs.""" - dataset = FastTextModelDataset( - categorical_variables=None, - texts=sample_text_data, - outputs=None, - tokenizer=mock_tokenizer - ) - - text, categorical_vars = dataset[0] - - assert text == sample_text_data[0] - assert categorical_vars is None - - @patch('torch.utils.data.DataLoader') - def test_create_dataloader(self, mock_dataloader_class, sample_text_data, sample_labels, mock_tokenizer): - """Test dataloader creation.""" - mock_dataloader = Mock() - mock_dataloader_class.return_value = mock_dataloader - - dataset = FastTextModelDataset( - categorical_variables=None, - texts=sample_text_data, - outputs=sample_labels, - tokenizer=mock_tokenizer - ) - - result = dataset.create_dataloader(batch_size=16, shuffle=True, num_workers=2) - - mock_dataloader_class.assert_called_once() - call_kwargs = mock_dataloader_class.call_args[1] - assert call_kwargs['batch_size'] == 16 
- assert call_kwargs['shuffle'] == True - assert call_kwargs['num_workers'] == 2 - - assert result == mock_dataloader \ No newline at end of file diff --git a/tests/test_wordpiece_tokenizer.py b/tests/test_tokenizer.py similarity index 100% rename from tests/test_wordpiece_tokenizer.py rename to tests/test_tokenizer.py diff --git a/tests/test_torchTextClassifiers.py b/tests/test_torchTextClassifiers.py index b80ee21..6ecf388 100644 --- a/tests/test_torchTextClassifiers.py +++ b/tests/test_torchTextClassifiers.py @@ -1,353 +1,17 @@ -import pytest -import numpy as np -import json -from unittest.mock import Mock, patch, MagicMock -import tempfile -import os - -from torchTextClassifiers.torchTextClassifiers import torchTextClassifiers -from torchTextClassifiers.classifiers.fasttext.core import FastTextConfig, FastTextFactory -from torchTextClassifiers.classifiers.fasttext.wrapper import FastTextWrapper - - +from torchTextClassifiers import torchTextClassifiers +from torchTextClassifiers.model import TextClassificationModel +from torchTextClassifiers.model.components import ClassificationHead class TestTorchTextClassifiers: """Test the main torchTextClassifiers class.""" - - def test_initialization(self, fasttext_config): + + def test_initialization(self, model_config, mock_tokenizer): """Test basic initialization.""" - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - - assert classifier.config == fasttext_config - assert isinstance(classifier.classifier, FastTextWrapper) - assert classifier.classifier is wrapper - - def test_create_fasttext_classmethod(self): - """Test the create_fasttext class method.""" - classifier = FastTextFactory.create_fasttext( - embedding_dim=50, - sparse=True, - num_tokens=5000, - min_count=2, - min_n=2, - max_n=5, - len_word_ngrams=3, - num_classes=3 - ) - - assert isinstance(classifier.classifier, FastTextWrapper) - assert classifier.config.embedding_dim == 50 - assert classifier.config.sparse == True - assert classifier.config.num_tokens == 5000 - assert classifier.config.num_classes == 3 - - def test_build_from_tokenizer(self, mock_tokenizer): - """Test building classifier from existing tokenizer.""" - classifier = FastTextFactory.build_from_tokenizer( + ttc = torchTextClassifiers( tokenizer=mock_tokenizer, - embedding_dim=100, - num_classes=2, - sparse=False + model_config=model_config, ) - - assert isinstance(classifier.classifier, FastTextWrapper) - assert classifier.config.embedding_dim == 100 - assert classifier.config.num_classes == 2 - assert classifier.classifier.tokenizer == mock_tokenizer - - def test_build_from_tokenizer_missing_attributes(self): - """Test build_from_tokenizer with tokenizer missing attributes.""" - class IncompleteTokenizer: - def __init__(self): - self.min_count = 1 - # Missing: min_n, max_n, num_tokens, word_ngrams - - incomplete_tokenizer = IncompleteTokenizer() - - with pytest.raises(ValueError, match="Missing attributes in tokenizer"): - FastTextFactory.build_from_tokenizer( - tokenizer=incomplete_tokenizer, - embedding_dim=100, - num_classes=2 - ) - - - @patch('torchTextClassifiers.torchTextClassifiers.check_X') - def test_build_tokenizer(self, mock_check_X, fasttext_config, sample_text_data): - """Test build_tokenizer method.""" - mock_check_X.return_value = (sample_text_data, None, True) - - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - classifier.classifier.prepare_text_features = Mock() - - classifier.build_tokenizer(sample_text_data) - - 
classifier.classifier.prepare_text_features.assert_called_once_with(sample_text_data) - - @patch('torchTextClassifiers.torchTextClassifiers.check_X') - @patch('torchTextClassifiers.torchTextClassifiers.check_Y') - def test_build_method_with_labels(self, mock_check_Y, mock_check_X, fasttext_config, - sample_text_data, sample_labels): - """Test build method with training labels.""" - mock_check_X.return_value = (sample_text_data, None, True) - mock_check_Y.return_value = sample_labels - - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - classifier.classifier.prepare_text_features = Mock() - classifier.classifier._build_pytorch_model = Mock() - classifier.classifier._check_and_init_lightning = Mock() - - classifier.build(sample_text_data, sample_labels) - - # Verify methods were called - classifier.classifier.prepare_text_features.assert_called_once() - classifier.classifier._build_pytorch_model.assert_called_once() - classifier.classifier._check_and_init_lightning.assert_called_once() - - # Verify num_classes was updated - assert classifier.config.num_classes == len(np.unique(sample_labels)) - - @patch('torchTextClassifiers.torchTextClassifiers.check_X') - def test_build_method_without_labels(self, mock_check_X, sample_text_data): - """Test build method without training labels.""" - mock_check_X.return_value = (sample_text_data, None, True) - - # Config with pre-set num_classes - config = FastTextConfig( - embedding_dim=10, sparse=False, num_tokens=1000, - min_count=1, min_n=3, max_n=6, len_word_ngrams=2, - num_classes=3 # Pre-set - ) - - wrapper = FastTextWrapper(config) - classifier = torchTextClassifiers(wrapper) - classifier.classifier.prepare_text_features = Mock() - classifier.classifier._build_pytorch_model = Mock() - classifier.classifier._check_and_init_lightning = Mock() - - classifier.build(sample_text_data, y_train=None) - - # Should not raise error since num_classes is pre-set - assert classifier.config.num_classes == 3 - - @patch('torchTextClassifiers.torchTextClassifiers.check_X') - def test_build_method_no_labels_no_num_classes(self, mock_check_X, fasttext_config, sample_text_data): - """Test build method fails when no labels and no num_classes.""" - mock_check_X.return_value = (sample_text_data, None, True) - - # Config without num_classes - fasttext_config.num_classes = None - - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - - with pytest.raises(ValueError, match="Either num_classes must be provided"): - classifier.build(sample_text_data, y_train=None) - - @patch('torchTextClassifiers.torchTextClassifiers.check_X') - @patch('torchTextClassifiers.torchTextClassifiers.check_Y') - def test_build_invalid_labels_range(self, mock_check_Y, mock_check_X, fasttext_config, - sample_text_data): - """Test build method with invalid label range.""" - mock_check_X.return_value = (sample_text_data, None, True) - # Labels with values that don't start from 0 or have gaps (invalid) - invalid_labels = np.array([0, 1, 5]) # Max value 5 but only 3 unique values, so num_classes=3 but max=5 - mock_check_Y.return_value = invalid_labels - - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - - with pytest.raises(ValueError, match="y_train must contain values between 0 and num_classes-1"): - classifier.build(sample_text_data, invalid_labels) - - @patch('torchTextClassifiers.torchTextClassifiers.check_X') - @patch('torchTextClassifiers.torchTextClassifiers.check_Y') - 
@patch('torch.cuda.is_available') - @patch('pytorch_lightning.Trainer') - def test_train_method_basic(self, mock_trainer_class, mock_cuda, mock_check_Y, mock_check_X, - fasttext_config, sample_text_data, sample_labels, mock_dataset, mock_dataloader): - """Test basic train method functionality.""" - # Setup mocks - mock_check_X.return_value = (sample_text_data, None, True) - mock_check_Y.return_value = sample_labels - mock_cuda.return_value = True - - mock_trainer = Mock() - mock_trainer.checkpoint_callback.best_model_path = "/fake/path" - mock_trainer_class.return_value = mock_trainer - - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - - # Mock wrapper methods - classifier.classifier.create_dataset = Mock(return_value=mock_dataset) - classifier.classifier.create_dataloader = Mock(return_value=mock_dataloader) - classifier.classifier.load_best_model = Mock() - classifier.classifier.tokenizer = Mock() # Pretend it's built - classifier.classifier.pytorch_model = Mock() - classifier.classifier.pytorch_model.to = Mock(return_value=classifier.classifier.pytorch_model) - classifier.classifier.lightning_module = Mock() - - # Call train - classifier.train( - X_train=sample_text_data, - y_train=sample_labels, - X_val=sample_text_data[:3], - y_val=sample_labels[:3], - num_epochs=1, - batch_size=2 - ) - - # Verify dataset creation - assert classifier.classifier.create_dataset.call_count == 2 # train + val - assert classifier.classifier.create_dataloader.call_count == 2 # train + val - - # Verify trainer was called - mock_trainer.fit.assert_called_once() - classifier.classifier.load_best_model.assert_called_once() - - def test_predict_method(self, fasttext_config, sample_text_data): - """Test predict method.""" - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - classifier.classifier.predict = Mock(return_value=np.array([1, 0, 1])) - - result = classifier.predict(sample_text_data) - - classifier.classifier.predict.assert_called_once_with(sample_text_data) - np.testing.assert_array_equal(result, np.array([1, 0, 1])) - - def test_validate_method(self, fasttext_config, sample_text_data, sample_labels): - """Test validate method.""" - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - classifier.classifier.validate = Mock(return_value=0.85) - - result = classifier.validate(sample_text_data, sample_labels) - - classifier.classifier.validate.assert_called_once_with(sample_text_data, sample_labels) - assert result == 0.85 - - def test_predict_and_explain_method(self, fasttext_config, sample_text_data): - """Test predict_and_explain method.""" - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - expected_predictions = np.array([1, 0, 1]) - expected_explanations = np.array([0.8, 0.2, 0.9]) - classifier.classifier.predict_and_explain = Mock( - return_value=(expected_predictions, expected_explanations) - ) - - predictions, explanations = classifier.predict_and_explain(sample_text_data) - - classifier.classifier.predict_and_explain.assert_called_once_with(sample_text_data) - np.testing.assert_array_equal(predictions, expected_predictions) - np.testing.assert_array_equal(explanations, expected_explanations) - - def test_predict_and_explain_not_supported(self, fasttext_config, sample_text_data): - """Test predict_and_explain when not supported by wrapper.""" - - # Create a mock wrapper class that doesn't have predict_and_explain - class 
MockWrapperWithoutExplain: - pass - - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - classifier.classifier = MockWrapperWithoutExplain() - - with pytest.raises(NotImplementedError, match="Explanation not supported"): - classifier.predict_and_explain(sample_text_data) - - def test_to_json_method(self, fasttext_config): - """Test to_json serialization method.""" - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - - with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: - temp_path = f.name - - try: - classifier.to_json(temp_path) - - # Verify file was created and has correct content - assert os.path.exists(temp_path) - - with open(temp_path, 'r') as f: - data = json.load(f) - - assert 'wrapper_class_info' in data - assert 'config' in data - assert data['config']['embedding_dim'] == fasttext_config.embedding_dim - - finally: - if os.path.exists(temp_path): - os.unlink(temp_path) - - def test_from_json_method(self, fasttext_config): - """Test from_json deserialization method.""" - # First create a JSON file - wrapper = FastTextWrapper(fasttext_config) - original_classifier = torchTextClassifiers(wrapper) - - with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: - temp_path = f.name - - try: - original_classifier.to_json(temp_path) - - # Load from JSON - loaded_classifier = torchTextClassifiers.from_json(temp_path) - - assert isinstance(loaded_classifier.classifier, FastTextWrapper) - assert loaded_classifier.config.embedding_dim == fasttext_config.embedding_dim - assert loaded_classifier.config.sparse == fasttext_config.sparse - assert loaded_classifier.config.num_tokens == fasttext_config.num_tokens - - finally: - if os.path.exists(temp_path): - os.unlink(temp_path) - - def test_from_json_missing_wrapper_info(self): - """Test from_json with missing wrapper class info.""" - # Create a JSON without wrapper_class_info - fake_data = { - "config": {"embedding_dim": 50, "sparse": False, "num_tokens": 1000, - "min_count": 1, "min_n": 3, "max_n": 6, "len_word_ngrams": 2} - } - - with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: - json.dump(fake_data, f) - temp_path = f.name - - try: - with pytest.raises(ValueError, match="No wrapper_class_info found"): - torchTextClassifiers.from_json(temp_path) - finally: - if os.path.exists(temp_path): - os.unlink(temp_path) - - def test_from_json_with_explicit_wrapper_class(self, fasttext_config): - """Test from_json with explicitly provided wrapper class.""" - # First create a JSON file - wrapper = FastTextWrapper(fasttext_config) - original_classifier = torchTextClassifiers(wrapper) - - with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: - temp_path = f.name - - try: - original_classifier.to_json(temp_path) - - # Load from JSON with explicit wrapper class - loaded_classifier = torchTextClassifiers.from_json(temp_path, FastTextWrapper) - - assert isinstance(loaded_classifier.classifier, FastTextWrapper) - assert loaded_classifier.config.embedding_dim == fasttext_config.embedding_dim - - finally: - if os.path.exists(temp_path): - os.unlink(temp_path) \ No newline at end of file + + assert isinstance(ttc.pytorch_model, TextClassificationModel) + assert isinstance(ttc.classification_head, ClassificationHead) From 8fdaf0c61fa80e3a6bb3b1b160423f2e4474790b Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Wed, 5 Nov 2025 18:16:20 +0000 Subject: [PATCH 21/66] chore: clean --- 
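# For reference, the constructor-based API exercised by the rewritten
# test_torchTextClassifiers.py above can be sketched end to end using the
# same mock tokenizer and ModelConfig values that the new conftest.py
# fixtures define (a sketch under those fixtures' assumptions):

import numpy as np
from unittest.mock import Mock

from torchTextClassifiers import ModelConfig, torchTextClassifiers

tokenizer = Mock()
tokenizer.vocab_size = 1000
tokenizer.padding_idx = 1
tokenizer.output_dim = 50
tokenizer.tokenize = Mock(
    return_value={
        "input_ids": np.array([[1, 2, 3], [4, 5, 6]]),
        "attention_mask": np.array([[1, 1, 1], [1, 1, 1]]),
    }
)

model_config = ModelConfig(
    embedding_dim=10,
    categorical_vocabulary_sizes=[4, 5],
    categorical_embedding_dims=[3, 4],
    num_classes=10,
)

ttc = torchTextClassifiers(tokenizer=tokenizer, model_config=model_config)
# ttc.pytorch_model is a TextClassificationModel assembled from these settings.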
torchTextClassifiers/classifiers/base.py | 83 -- .../classifiers/fasttext/__init__.py | 25 - .../classifiers/fasttext/core.py | 269 ------- .../classifiers/fasttext/model.py | 752 ------------------ .../classifiers/fasttext/wrapper.py | 216 ----- .../classifiers/simple_text_classifier.py | 191 ----- torchTextClassifiers/factories.py | 34 - torchTextClassifiers/utilities/preprocess.py | 82 -- torchTextClassifiers/utilities/utils.py | 346 -------- 9 files changed, 1998 deletions(-) delete mode 100644 torchTextClassifiers/classifiers/base.py delete mode 100644 torchTextClassifiers/classifiers/fasttext/__init__.py delete mode 100644 torchTextClassifiers/classifiers/fasttext/core.py delete mode 100644 torchTextClassifiers/classifiers/fasttext/model.py delete mode 100644 torchTextClassifiers/classifiers/fasttext/wrapper.py delete mode 100644 torchTextClassifiers/classifiers/simple_text_classifier.py delete mode 100644 torchTextClassifiers/factories.py delete mode 100644 torchTextClassifiers/utilities/preprocess.py delete mode 100644 torchTextClassifiers/utilities/utils.py diff --git a/torchTextClassifiers/classifiers/base.py b/torchTextClassifiers/classifiers/base.py deleted file mode 100644 index 193fa0a..0000000 --- a/torchTextClassifiers/classifiers/base.py +++ /dev/null @@ -1,83 +0,0 @@ -from typing import Optional, Union, Type, List, Dict, Any -from dataclasses import dataclass, field, asdict -from abc import ABC, abstractmethod -import numpy as np - -class BaseClassifierConfig(ABC): - """Abstract base class for classifier configurations.""" - - @abstractmethod - def to_dict(self) -> Dict[str, Any]: - """Convert configuration to dictionary.""" - pass - - @classmethod - @abstractmethod - def from_dict(cls, data: Dict[str, Any]) -> "BaseClassifierConfig": - """Create configuration from dictionary.""" - pass - -class BaseClassifierWrapper(ABC): - """Abstract base class for classifier wrappers. - - Each classifier wrapper is responsible for its own text processing approach. - Some may use tokenizers, others may use different preprocessing methods. - """ - - def __init__(self, config: BaseClassifierConfig): - self.config = config - self.pytorch_model = None - self.lightning_module = None - self.trained: bool = False - self.device = None - # Remove tokenizer from base class - it's now wrapper-specific - - @abstractmethod - def prepare_text_features(self, training_text: np.ndarray) -> None: - """Prepare text features for the classifier. - - This could involve tokenization, vectorization, or other preprocessing. - Each classifier wrapper implements this according to its needs. 
- """ - pass - - @abstractmethod - def _build_pytorch_model(self) -> None: - """Build the PyTorch model.""" - pass - - @abstractmethod - def _check_and_init_lightning(self, **kwargs) -> None: - """Initialize Lightning module.""" - pass - - @abstractmethod - def predict(self, X: np.ndarray, **kwargs) -> np.ndarray: - """Make predictions.""" - pass - - @abstractmethod - def validate(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - """Validate the model.""" - pass - - @abstractmethod - def create_dataset(self, texts: np.ndarray, labels: np.ndarray, categorical_variables: Optional[np.ndarray] = None): - """Create dataset for training/validation.""" - pass - - @abstractmethod - def create_dataloader(self, dataset, batch_size: int, num_workers: int = 0, shuffle: bool = True): - """Create dataloader from dataset.""" - pass - - @abstractmethod - def load_best_model(self, checkpoint_path: str) -> None: - """Load best model from checkpoint.""" - pass - - @classmethod - @abstractmethod - def get_config_class(cls) -> Type[BaseClassifierConfig]: - """Return the configuration class for this wrapper.""" - pass \ No newline at end of file diff --git a/torchTextClassifiers/classifiers/fasttext/__init__.py b/torchTextClassifiers/classifiers/fasttext/__init__.py deleted file mode 100644 index c9da238..0000000 --- a/torchTextClassifiers/classifiers/fasttext/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -"""FastText classifier package. - -Provides FastText text classification with PyTorch Lightning integration. -This folder contains 4 main files: -- core.py: Configuration, losses, and factory methods -- tokenizer.py: NGramTokenizer implementation -- model.py: PyTorch model, Lightning module, and dataset -- wrapper.py: High-level wrapper interface -""" - -from .core import FastTextConfig, OneVsAllLoss, FastTextFactory -from .tokenizer import NGramTokenizer -from .model import FastTextModel, FastTextModule, FastTextModelDataset -from .wrapper import FastTextWrapper - -__all__ = [ - "FastTextConfig", - "OneVsAllLoss", - "FastTextFactory", - "NGramTokenizer", - "FastTextModel", - "FastTextModule", - "FastTextModelDataset", - "FastTextWrapper", -] \ No newline at end of file diff --git a/torchTextClassifiers/classifiers/fasttext/core.py b/torchTextClassifiers/classifiers/fasttext/core.py deleted file mode 100644 index c9c24b0..0000000 --- a/torchTextClassifiers/classifiers/fasttext/core.py +++ /dev/null @@ -1,269 +0,0 @@ -"""FastText classifier core components. - -This module contains the core components for FastText classification: -- Configuration dataclass -- Loss functions -- Factory methods for creating classifiers - -Consolidates what was previously in config.py, losses.py, and factory.py. 
-""" - -from dataclasses import dataclass, field, asdict -from abc import ABC, abstractmethod -from ..base import BaseClassifierConfig -from typing import Optional, List, TYPE_CHECKING, Union, Dict, Any -import numpy as np -import torch -import torch.nn.functional as F -from torch import nn - -if TYPE_CHECKING: - from ...torchTextClassifiers import torchTextClassifiers - - -# ============================================================================ -# Configuration -# ============================================================================ - -@dataclass -class FastTextConfig(BaseClassifierConfig): - """Configuration for FastText classifier.""" - # Embedding matrix - embedding_dim: int - sparse: bool - - # Tokenizer-related - num_tokens: int - min_count: int - min_n: int - max_n: int - len_word_ngrams: int - - # Optional parameters - num_classes: Optional[int] = None - num_rows: Optional[int] = None - - # Categorical variables - categorical_vocabulary_sizes: Optional[List[int]] = None - categorical_embedding_dims: Optional[Union[List[int], int]] = None - num_categorical_features: Optional[int] = None - - # Model-specific parameters - direct_bagging: Optional[bool] = True - - # Training parameters - learning_rate: float = 4e-3 - - def to_dict(self) -> Dict[str, Any]: - return asdict(self) - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "FastTextConfig": - return cls(**data) - - -# ============================================================================ -# Loss Functions -# ============================================================================ - -class OneVsAllLoss(nn.Module): - def __init__(self): - super(OneVsAllLoss, self).__init__() - - def forward(self, logits, targets): - """ - Compute One-vs-All loss - - Args: - logits: Tensor of shape (batch_size, num_classes) containing classification scores - targets: Tensor of shape (batch_size) containing true class indices - - Returns: - loss: Mean loss value across the batch - """ - - num_classes = logits.size(1) - - # Convert targets to one-hot encoding - targets_one_hot = F.one_hot(targets, num_classes=num_classes).float() - - # For each sample, treat the true class as positive and all others as negative - # Using binary cross entropy for each class - loss = F.binary_cross_entropy_with_logits( - logits, # Raw logits - targets_one_hot, # Target probabilities - reduction="none", # Don't reduce yet to allow for custom weighting if needed - ) - - # Sum losses across all classes for each sample, then take mean across batch - return loss.sum(dim=1).mean() - - -# ============================================================================ -# Factory Methods -# ============================================================================ - -class FastTextFactory: - """Factory class for creating FastText classifiers with convenience methods. - - This factory provides static methods for creating FastText classifiers with - common configurations. It handles the complexities of configuration creation - and classifier initialization, offering a simplified API for users. - - All methods return fully initialized torchTextClassifiers instances that are - ready for building and training. - """ - - @staticmethod - def create_fasttext( - embedding_dim: int, - sparse: bool, - num_tokens: int, - min_count: int, - min_n: int, - max_n: int, - len_word_ngrams: int, - **kwargs - ) -> "torchTextClassifiers": - """Create a FastText classifier with the specified configuration. 
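# Worked check of the OneVsAllLoss defined above: it is binary cross-entropy
# against one-hot targets, summed over classes and averaged over the batch.
# The logits and targets below are illustrative numbers only:

import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, -1.0, 0.5], [0.1, 1.5, -0.3]])
targets = torch.tensor([0, 1])

one_hot = F.one_hot(targets, num_classes=3).float()
per_class = F.binary_cross_entropy_with_logits(logits, one_hot, reduction="none")
loss = per_class.sum(dim=1).mean()  # same value OneVsAllLoss(logits, targets) returns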
- - This is the primary method for creating FastText classifiers. It creates - a configuration object with the provided parameters and initializes a - complete classifier instance. - - Args: - embedding_dim: Dimension of word embeddings - sparse: Whether to use sparse embeddings - num_tokens: Maximum number of tokens in vocabulary - min_count: Minimum count for tokens to be included in vocabulary - min_n: Minimum length of character n-grams - max_n: Maximum length of character n-grams - len_word_ngrams: Length of word n-grams to use - **kwargs: Additional configuration parameters (e.g., num_classes, - categorical_vocabulary_sizes, etc.) - - Returns: - torchTextClassifiers: Initialized FastText classifier instance - - Example: - >>> from torchTextClassifiers.classifiers.fasttext.core import FastTextFactory - >>> classifier = FastTextFactory.create_fasttext( - ... embedding_dim=100, - ... sparse=False, - ... num_tokens=10000, - ... min_count=2, - ... min_n=3, - ... max_n=6, - ... len_word_ngrams=2, - ... num_classes=3 - ... ) - """ - from ...torchTextClassifiers import torchTextClassifiers - from .wrapper import FastTextWrapper - - config = FastTextConfig( - embedding_dim=embedding_dim, - sparse=sparse, - num_tokens=num_tokens, - min_count=min_count, - min_n=min_n, - max_n=max_n, - len_word_ngrams=len_word_ngrams, - **kwargs - ) - wrapper = FastTextWrapper(config) - return torchTextClassifiers(wrapper) - - @staticmethod - def build_from_tokenizer( - tokenizer, # NGramTokenizer - embedding_dim: int, - num_classes: Optional[int], - categorical_vocabulary_sizes: Optional[List[int]] = None, - sparse: bool = False, - **kwargs - ) -> "torchTextClassifiers": - """Create FastText classifier from an existing trained tokenizer. - - This method is useful when you have a pre-trained tokenizer and want to - create a classifier that uses the same vocabulary and tokenization scheme. - The resulting classifier will have its tokenizer and model architecture - pre-built. - - Args: - tokenizer: Pre-trained NGramTokenizer instance - embedding_dim: Dimension of word embeddings - num_classes: Number of output classes - categorical_vocabulary_sizes: Sizes of categorical feature vocabularies - sparse: Whether to use sparse embeddings - **kwargs: Additional configuration parameters - - Returns: - torchTextClassifiers: Classifier with pre-built tokenizer and model - - Raises: - ValueError: If the tokenizer is missing required attributes - - Example: - >>> # Assume you have a pre-trained tokenizer - >>> classifier = FastTextFactory.build_from_tokenizer( - ... tokenizer=my_tokenizer, - ... embedding_dim=100, - ... num_classes=2, - ... sparse=False - ... ) - >>> # The classifier is ready for training without building - >>> classifier.train(X_train, y_train, X_val, y_val, ...) 
- """ - from ...torchTextClassifiers import torchTextClassifiers - from .wrapper import FastTextWrapper - - # Ensure the tokenizer has required attributes - required_attrs = ["min_count", "min_n", "max_n", "num_tokens", "word_ngrams"] - if not all(hasattr(tokenizer, attr) for attr in required_attrs): - missing_attrs = [attr for attr in required_attrs if not hasattr(tokenizer, attr)] - raise ValueError(f"Missing attributes in tokenizer: {missing_attrs}") - - config = FastTextConfig( - num_tokens=tokenizer.num_tokens, - embedding_dim=embedding_dim, - min_count=tokenizer.min_count, - min_n=tokenizer.min_n, - max_n=tokenizer.max_n, - len_word_ngrams=tokenizer.word_ngrams, - sparse=sparse, - num_classes=num_classes, - categorical_vocabulary_sizes=categorical_vocabulary_sizes, - **kwargs - ) - - wrapper = FastTextWrapper(config) - classifier = torchTextClassifiers(wrapper) - classifier.classifier.tokenizer = tokenizer - classifier.classifier._build_pytorch_model() - - return classifier - - @staticmethod - def from_dict(config_dict: dict) -> FastTextConfig: - """Create FastText configuration from dictionary. - - This method is used internally by the configuration factory system - to recreate FastText configurations from serialized data. - - Args: - config_dict: Dictionary containing configuration parameters - - Returns: - FastTextConfig: Reconstructed configuration object - - Example: - >>> config_dict = { - ... 'embedding_dim': 100, - ... 'num_tokens': 5000, - ... 'min_count': 1, - ... # ... other parameters - ... } - >>> config = FastTextFactory.from_dict(config_dict) - """ - return FastTextConfig.from_dict(config_dict) \ No newline at end of file diff --git a/torchTextClassifiers/classifiers/fasttext/model.py b/torchTextClassifiers/classifiers/fasttext/model.py deleted file mode 100644 index ac8cc7f..0000000 --- a/torchTextClassifiers/classifiers/fasttext/model.py +++ /dev/null @@ -1,752 +0,0 @@ -"""FastText model components. - -This module contains the PyTorch model, Lightning module, and dataset classes -for FastText classification. Consolidates what was previously in pytorch_model.py, -lightning_module.py, and dataset.py. -""" - -import os -import logging -from typing import List, Union -import torch -import pytorch_lightning as pl -from torch import nn -from torchmetrics import Accuracy - -try: - from captum.attr import LayerIntegratedGradients - HAS_CAPTUM = True -except ImportError: - HAS_CAPTUM = False - -from ...utilities.utils import ( - compute_preprocessed_word_score, - compute_word_score, - explain_continuous, -) -from ...utilities.checkers import validate_categorical_inputs - -logger = logging.getLogger(__name__) - -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(name)s - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - handlers=[logging.StreamHandler()], -) - - -# ============================================================================ -# PyTorch Model -# ============================================================================ - -class FastTextModel(nn.Module): - """FastText Pytorch Model.""" - - def __init__( - self, - embedding_dim: int, - num_classes: int, - tokenizer=None, - num_rows: int = None, - categorical_vocabulary_sizes: List[int] = None, - categorical_embedding_dims: Union[List[int], int] = None, - padding_idx: int = 0, - sparse: bool = True, - direct_bagging: bool = False, - ): - """ - Constructor for the FastTextModel class. - - Args: - embedding_dim (int): Dimension of the text embedding space. 
- buckets (int): Number of rows in the embedding matrix. - num_classes (int): Number of classes. - categorical_vocabulary_sizes (List[int]): List of the number of - modalities for additional categorical features. - padding_idx (int, optional): Padding index for the text - descriptions. Defaults to 0. - sparse (bool): Indicates if Embedding layer is sparse. - direct_bagging (bool): Use EmbeddingBag instead of Embedding for the text embedding. - """ - super(FastTextModel, self).__init__() - - if isinstance(categorical_embedding_dims, int): - self.average_cat_embed = True # if provided categorical embedding dims is an int, average the categorical embeddings before concatenating to sentence embedding - else: - self.average_cat_embed = False - - categorical_vocabulary_sizes, categorical_embedding_dims, num_categorical_features = ( - validate_categorical_inputs( - categorical_vocabulary_sizes, - categorical_embedding_dims, - num_categorical_features=None, - ) - ) - - assert isinstance(categorical_embedding_dims, list) or categorical_embedding_dims is None, ( - "categorical_embedding_dims must be a list of int at this stage" - ) - - if categorical_embedding_dims is None: - self.average_cat_embed = False - - if tokenizer is None: - if num_rows is None: - raise ValueError( - "Either tokenizer or num_rows must be provided (number of rows in the embedding matrix)." - ) - else: - if num_rows is not None: - if num_rows != tokenizer.num_tokens: - logger.warning( - "num_rows is different from the number of tokens in the tokenizer. Using provided num_rows." - ) - - self.num_rows = num_rows - - self.num_classes = num_classes - self.padding_idx = padding_idx - self.tokenizer = tokenizer - self.embedding_dim = embedding_dim - self.direct_bagging = direct_bagging - self.sparse = sparse - - self.categorical_embedding_dims = categorical_embedding_dims - - self.embeddings = ( - nn.Embedding( - embedding_dim=embedding_dim, - num_embeddings=num_rows, - padding_idx=padding_idx, - sparse=sparse, - ) - if not direct_bagging - else nn.EmbeddingBag( - embedding_dim=embedding_dim, - num_embeddings=num_rows, - padding_idx=padding_idx, - sparse=sparse, - mode="mean", - ) - ) - - self.categorical_embedding_layers = {} - - # Entry dim for the last layer: - # 1. embedding_dim if no categorical variables or summing the categrical embeddings to sentence embedding - # 2. embedding_dim + cat_embedding_dim if averaging the categorical embeddings before concatenating to sentence embedding (categorical_embedding_dims is a int) - # 3. 
embedding_dim + sum(categorical_embedding_dims) if concatenating individually the categorical embeddings to sentence embedding (no averaging, categorical_embedding_dims is a list) - dim_in_last_layer = embedding_dim - if self.average_cat_embed: - dim_in_last_layer += categorical_embedding_dims[0] - - if categorical_vocabulary_sizes is not None: - self.no_cat_var = False - for var_idx, num_rows in enumerate(categorical_vocabulary_sizes): - if categorical_embedding_dims is not None: - emb = nn.Embedding( - embedding_dim=categorical_embedding_dims[var_idx], num_embeddings=num_rows - ) # concatenate to sentence embedding - if not self.average_cat_embed: - dim_in_last_layer += categorical_embedding_dims[var_idx] - else: - emb = nn.Embedding( - embedding_dim=embedding_dim, num_embeddings=num_rows - ) # sum to sentence embedding - self.categorical_embedding_layers[var_idx] = emb - setattr(self, "emb_{}".format(var_idx), emb) - else: - self.no_cat_var = True - - self.fc = nn.Linear(dim_in_last_layer, num_classes) - - def forward(self, encoded_text: torch.Tensor, additional_inputs: torch.Tensor) -> torch.Tensor: - """ - Memory-efficient forward pass implementation. - - Args: - encoded_text (torch.Tensor[Long]), shape (batch_size, seq_len): Tokenized + padded text - additional_inputs (torch.Tensor[Long]): Additional categorical features, (batch_size, num_categorical_features) - - Returns: - torch.Tensor: Model output scores for each class - """ - batch_size = encoded_text.size(0) - - # Ensure correct dtype and device once - if encoded_text.dtype != torch.long: - encoded_text = encoded_text.to(torch.long) - - # Compute text embeddings - if self.direct_bagging: - x_text = self.embeddings(encoded_text) # (batch_size, embedding_dim) - else: - # Compute embeddings and averaging in a memory-efficient way - x_text = self.embeddings(encoded_text) # (batch_size, seq_len, embedding_dim) - # Calculate non-zero tokens mask once - non_zero_mask = (x_text.sum(-1) != 0).float() # (batch_size, seq_len) - token_counts = non_zero_mask.sum(-1, keepdim=True) # (batch_size, 1) - - # Sum and average in place - x_text = (x_text * non_zero_mask.unsqueeze(-1)).sum( - dim=1 - ) # (batch_size, embedding_dim) - x_text = torch.div(x_text, token_counts.clamp(min=1.0)) - x_text = torch.nan_to_num(x_text, 0.0) - - # Handle categorical variables efficiently - if not self.no_cat_var and additional_inputs.numel() > 0: - cat_embeds = [] - # Process categorical embeddings in batch - for i, (_, embed_layer) in enumerate(self.categorical_embedding_layers.items()): - cat_input = additional_inputs[:, i].long() - - # Check if categorical values are within valid range and clamp if needed - vocab_size = embed_layer.num_embeddings - max_val = cat_input.max().item() - min_val = cat_input.min().item() - - if max_val >= vocab_size or min_val < 0: - logger.warning(f"Categorical feature {i}: values range [{min_val}, {max_val}] exceed vocabulary size {vocab_size}. 
Clamping to valid range [0, {vocab_size - 1}]") - # Clamp values to valid range - cat_input = torch.clamp(cat_input, 0, vocab_size - 1) - - cat_embed = embed_layer(cat_input) - if cat_embed.dim() > 2: - cat_embed = cat_embed.squeeze(1) - cat_embeds.append(cat_embed) - - if cat_embeds: # If we have categorical embeddings - if self.categorical_embedding_dims is not None: - if self.average_cat_embed: - # Stack and average in one operation - x_cat = torch.stack(cat_embeds, dim=0).mean(dim=0) - x_combined = torch.cat([x_text, x_cat], dim=1) - else: - # Optimize concatenation - x_combined = torch.cat([x_text] + cat_embeds, dim=1) - else: - # Sum embeddings efficiently - x_combined = x_text + torch.stack(cat_embeds, dim=0).sum(dim=0) - else: - x_combined = x_text - else: - x_combined = x_text - - # Final linear layer - return self.fc(x_combined) - - def predict( - self, - text: List[str], - categorical_variables: List[List[int]], - top_k=1, - explain=False, - preprocess=True, - ): - """ - Args: - text (List[str]): A list of text observations. - params (Optional[Dict[str, Any]]): Additional parameters to - pass to the model for inference. - top_k (int): for each sentence, return the top_k most likely predictions (default: 1) - explain (bool): launch gradient integration to have an explanation of the prediction (default: False) - preprocess (bool): If True, preprocess text. Needs unidecode library. - - Returns: - if explain is False: - predictions (torch.Tensor, shape (len(text), top_k)): A tensor containing the top_k most likely codes to the query. - confidence (torch.Tensor, shape (len(text), top_k)): A tensor array containing the corresponding confidence scores. - if explain is True: - predictions (torch.Tensor, shape (len(text), top_k)): Containing the top_k most likely codes to the query. - confidence (torch.Tensor, shape (len(text), top_k)): Corresponding confidence scores. - all_attributions (torch.Tensor, shape (len(text), top_k, seq_len)): A tensor containing the attributions for each token in the text. - x (torch.Tensor): A tensor containing the token indices of the text. - id_to_token_dicts (List[Dict[int, str]]): A list of dictionaries mapping token indices to tokens (one for each sentence). - token_to_id_dicts (List[Dict[str, int]]): A list of dictionaries mapping tokens to token indices: the reverse of those in id_to_token_dicts. - text (list[str]): A plist containing the preprocessed text (one line for each sentence). - """ - - flag_change_embed = False - if explain: - if not HAS_CAPTUM: - raise ImportError( - "Captum is not installed and is required for explainability. Run 'pip install torchFastText[explainability]'." 
- ) - if self.direct_bagging: - # Get back the classical embedding layer for explainability - new_embed_layer = nn.Embedding( - embedding_dim=self.embedding_dim, - num_embeddings=self.num_rows, - padding_idx=self.padding_idx, - sparse=self.sparse, - ) - new_embed_layer.load_state_dict( - self.embeddings.state_dict() - ) # No issues, as exactly the same parameters - self.embeddings = new_embed_layer - self.direct_bagging = ( - False # To inform the forward pass that we are not using EmbeddingBag anymore - ) - flag_change_embed = True - - lig = LayerIntegratedGradients( - self, self.embeddings - ) # initialize a Captum layer gradient integrator - - self.eval() - batch_size = len(text) - - indices_batch, id_to_token_dicts, token_to_id_dicts = self.tokenizer.tokenize( - text, text_tokens=False, preprocess=preprocess - ) - - padding_index = ( - self.tokenizer.get_buckets() + self.tokenizer.get_nwords() - ) # padding index, the integer value of the padding token - - padded_batch = torch.nn.utils.rnn.pad_sequence( - indices_batch, - batch_first=True, - padding_value=padding_index, - ) # (batch_size, seq_len) - Tokenized (int) + padded text - - x = padded_batch - - if not self.no_cat_var: - other_features = [] - # Transpose categorical_variables to iterate over features instead of samples - categorical_variables_transposed = categorical_variables.T - for i, categorical_variable in enumerate(categorical_variables_transposed): - other_features.append( - torch.tensor(categorical_variable).reshape(batch_size, -1).to(torch.int64) - ) - - other_features = torch.stack(other_features).reshape(batch_size, -1).long() - else: - other_features = torch.empty(batch_size) - - pred = self( - x, other_features - ) # forward pass, contains the prediction scores (len(text), num_classes) - label_scores = pred.detach().cpu() - label_scores_topk = torch.topk(label_scores, k=top_k, dim=1) - - predictions = label_scores_topk.indices # get the top_k most likely predictions - confidence = torch.round(label_scores_topk.values, decimals=2) # and their scores - - if explain: - assert not self.direct_bagging, "Direct bagging should be False for explainability" - all_attributions = [] - for k in range(top_k): - attributions = lig.attribute( - (x, other_features), target=torch.Tensor(predictions[:, k]).long() - ) # (batch_size, seq_len) - attributions = attributions.sum(dim=-1) - all_attributions.append(attributions.detach().cpu()) - - all_attributions = torch.stack(all_attributions, dim=1) # (batch_size, top_k, seq_len) - - # Get back to initial embedding layer: - # EmbeddingBag -> Embedding -> EmbeddingBag - # or keep Embedding with no change - if flag_change_embed: - new_embed_layer = nn.EmbeddingBag( - embedding_dim=self.embedding_dim, - num_embeddings=self.num_rows, - padding_idx=self.padding_idx, - sparse=self.sparse, - ) - new_embed_layer.load_state_dict( - self.embeddings.state_dict() - ) # No issues, as exactly the same parameters - self.embeddings = new_embed_layer - self.direct_bagging = True - return ( - predictions, - confidence, - all_attributions, - x, - id_to_token_dicts, - token_to_id_dicts, - text, - ) - else: - return predictions, confidence - - def predict_and_explain(self, text, categorical_variables, top_k=1, n=5, cutoff=0.65): - """ - Args: - text (List[str]): A list of sentences. - params (Optional[Dict[str, Any]]): Additional parameters to - pass to the model for inference. 
- top_k (int): for each sentence, return the top_k most likely predictions (default: 1) - n (int): mapping processed to original words: max number of candidate processed words to consider per original word (default: 5) - cutoff (float): mapping processed to original words: minimum similarity score to consider a candidate processed word (default: 0.75) - - Returns: - predictions (torch.Tensor, shape (len(text), top_k)): Containing the top_k most likely codes to the query. - confidence (torch.Tensor, shape (len(text), top_k)): Corresponding confidence scores. - all_scores (List[List[List[float]]]): For each sentence, list of the top_k lists of attributions for each word in the sentence (one for each pred). - """ - - # Step 1: Get the predictions, confidence scores and attributions at token level - ( - pred, - confidence, - all_attr, - tokenized_text, - id_to_token_dicts, - token_to_id_dicts, - processed_text, - ) = self.predict( - text=text, categorical_variables=categorical_variables, top_k=top_k, explain=True - ) - - tokenized_text_tokens = self.tokenizer._tokenized_text_in_tokens( - tokenized_text, id_to_token_dicts - ) - - # Step 2: Map the attributions at token level to the processed words - processed_word_to_score_dicts, processed_word_to_token_idx_dicts = ( - compute_preprocessed_word_score( - processed_text, - tokenized_text_tokens, - all_attr, - id_to_token_dicts, - token_to_id_dicts, - min_n=self.tokenizer.min_n, - padding_index=self.padding_idx, - end_of_string_index=0, - ) - ) - - # Step 3: Map the processed words to the original words - all_scores, orig_to_processed_mappings = compute_word_score( - processed_word_to_score_dicts, text, n=n, cutoff=cutoff - ) - - # Step 2bis: Get the attributions at letter level - all_scores_letters = explain_continuous( - text, - processed_text, - tokenized_text_tokens, - orig_to_processed_mappings, - processed_word_to_token_idx_dicts, - all_attr, - top_k, - ) - - return pred, confidence, all_scores, all_scores_letters - - -# ============================================================================ -# PyTorch Lightning Module -# ============================================================================ - -class FastTextModule(pl.LightningModule): - """Pytorch Lightning Module for FastTextModel.""" - - def __init__( - self, - model: FastTextModel, - loss, - optimizer, - optimizer_params, - scheduler, - scheduler_params, - scheduler_interval="epoch", - **kwargs, - ): - """ - Initialize FastTextModule. - - Args: - model: Model. - loss: Loss - optimizer: Optimizer - optimizer_params: Optimizer parameters. - scheduler: Scheduler. - scheduler_params: Scheduler parameters. - scheduler_interval: Scheduler interval. - """ - super().__init__() - self.save_hyperparameters(ignore=["model", "loss"]) - - self.model = model - self.loss = loss - self.accuracy_fn = Accuracy(task="multiclass", num_classes=self.model.num_classes) - self.optimizer = optimizer - self.optimizer_params = optimizer_params - self.scheduler = scheduler - self.scheduler_params = scheduler_params - self.scheduler_interval = scheduler_interval - - def forward(self, inputs) -> torch.Tensor: - """ - Perform forward-pass. - - Args: - batch (List[torch.LongTensor]): Batch to perform forward-pass on. - - Returns (torch.Tensor): Prediction. - """ - return self.model(inputs[0], inputs[1]) - - def training_step(self, batch, batch_idx: int) -> torch.Tensor: - """ - Training step. - - Args: - batch (List[torch.LongTensor]): Training batch. - batch_idx (int): Batch index. 
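
For orientation, a minimal sketch of how this Lightning module is wired together (argument values are illustrative; `my_model` is assumed to be an existing FastTextModel):

    import torch

    module = FastTextModule(
        model=my_model,
        loss=torch.nn.CrossEntropyLoss(),
        optimizer=torch.optim.Adam,
        optimizer_params={"lr": 4e-3},
        scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau,
        scheduler_params={"mode": "min", "patience": 3},
        scheduler_interval="epoch",
    )
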
- - Returns (torch.Tensor): Loss tensor. - """ - - inputs, targets = batch[:-1], batch[-1] - outputs = self.forward(inputs) - loss = self.loss(outputs, targets) - self.log("train_loss", loss, on_epoch=True, on_step=True, prog_bar=True) - accuracy = self.accuracy_fn(outputs, targets) - self.log("train_accuracy", accuracy, on_epoch=True, on_step=False, prog_bar=True) - - torch.cuda.empty_cache() - - return loss - - def validation_step(self, batch, batch_idx: int): - """ - Validation step. - - Args: - batch (List[torch.LongTensor]): Validation batch. - batch_idx (int): Batch index. - - Returns (torch.Tensor): Loss tensor. - """ - inputs, targets = batch[:-1], batch[-1] - outputs = self.forward(inputs) - loss = self.loss(outputs, targets) - self.log("val_loss", loss, on_epoch=True, on_step=False, prog_bar=True, sync_dist=True) - - accuracy = self.accuracy_fn(outputs, targets) - self.log("val_accuracy", accuracy, on_epoch=True, on_step=False, prog_bar=True) - return loss - - def test_step(self, batch, batch_idx: int): - """ - Test step. - - Args: - batch (List[torch.LongTensor]): Test batch. - batch_idx (int): Batch index. - - Returns (torch.Tensor): Loss tensor. - """ - inputs, targets = batch[:-1], batch[-1] - outputs = self.forward(inputs) - loss = self.loss(outputs, targets) - - accuracy = self.accuracy_fn(outputs, targets) - - return loss, accuracy - - def configure_optimizers(self): - """ - Configure optimizer for Pytorch lighting. - - Returns: Optimizer and scheduler for pytorch lighting. - """ - optimizer = self.optimizer(self.parameters(), **self.optimizer_params) - - # Only use scheduler if it's not ReduceLROnPlateau or if we can ensure val_loss is available - # For complex training setups, sometimes val_loss is not available every epoch - if hasattr(self.scheduler, '__name__') and 'ReduceLROnPlateau' in self.scheduler.__name__: - # For ReduceLROnPlateau, use train_loss as it's always available - scheduler = self.scheduler(optimizer, **self.scheduler_params) - scheduler_config = { - "scheduler": scheduler, - "monitor": "train_loss", - "interval": self.scheduler_interval, - } - return [optimizer], [scheduler_config] - else: - # For other schedulers (StepLR, etc.), no monitoring needed - scheduler = self.scheduler(optimizer, **self.scheduler_params) - return [optimizer], [scheduler] - - -# ============================================================================ -# Dataset -# ============================================================================ - -class FastTextModelDataset(torch.utils.data.Dataset): - """FastTextModelDataset class.""" - - def __init__( - self, - categorical_variables: List[List[int]], - texts: List[str], - tokenizer, # NGramTokenizer - outputs: List[int] = None, - **kwargs, - ): - """ - Constructor for the TorchDataset class. - - Args: - categorical_variables (List[List[int]]): The elements of this list - are the values of each categorical variable across the dataset. - text (List[str]): List of text descriptions. - y (List[int]): List of outcomes. - tokenizer (Tokenizer): Tokenizer. 
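
The configure_optimizers branch above hands Lightning a monitored scheduler configuration; a sketch of the structure it returns in the ReduceLROnPlateau case (illustrative, mirroring the code above):

    scheduler_config = {
        "scheduler": scheduler,    # ReduceLROnPlateau instance built from self.scheduler
        "monitor": "train_loss",   # monitored because train_loss is always logged
        "interval": "epoch",
    }
    # return [optimizer], [scheduler_config]
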
-        """
-
-        if categorical_variables is not None and len(categorical_variables) != len(texts):
-            raise ValueError("Categorical variables and texts must have the same length.")
-
-        if outputs is not None and len(outputs) != len(texts):
-            raise ValueError("Outputs and texts must have the same length.")
-
-        self.categorical_variables = categorical_variables
-        self.texts = texts
-        self.outputs = outputs
-        self.tokenizer = tokenizer
-
-    def __len__(self) -> int:
-        """
-        Returns length of the data.
-
-        Returns:
-            int: Number of observations.
-        """
-        return len(self.texts)
-
-    def __str__(self) -> str:
-        """
-        Returns description of the Dataset.
-
-        Returns:
-            str: Description.
-        """
-        return f"<FastTextModelDataset(N={len(self)})>"
-
-    def __getitem__(self, index: int) -> List:
-        """
-        Returns observation for a given index.
-
-        Args:
-            index (int): Index.
-
-        Returns:
-            Tuple: Text, categorical variables and, if available, the label at the given index.
-        """
-        categorical_variables = (
-            self.categorical_variables[index] if self.categorical_variables is not None else None
-        )
-        text = self.texts[index]
-
-        if self.outputs is not None:
-            y = self.outputs[index]
-            return text, categorical_variables, y
-        else:
-            return text, categorical_variables
-
-    def collate_fn(self, batch):
-        """
-        Efficient batch processing without explicit loops.
-
-        Args:
-            batch: Data batch.
-
-        Returns:
-            Tuple[torch.LongTensor]: Padded tokenized texts, categorical tensors and, if available, labels.
-        """
-
-        # Unzip the batch in one go using zip(*batch)
-        if self.outputs is not None:
-            text, *categorical_vars, y = zip(*batch)
-        else:
-            text, *categorical_vars = zip(*batch)
-
-        # Convert text to indices in parallel using map
-        indices_batch = list(map(lambda x: self.tokenizer.indices_matrix(x)[0], text))
-
-        # Get padding index once
-        padding_index = self.tokenizer.get_buckets() + self.tokenizer.get_nwords()
-
-        # Pad sequences efficiently
-        padded_batch = torch.nn.utils.rnn.pad_sequence(
-            indices_batch,
-            batch_first=True,
-            padding_value=padding_index,
-        )
-
-        # Handle categorical variables efficiently
-        if self.categorical_variables is not None:
-            categorical_tensors = torch.stack(
-                [
-                    torch.tensor(cat_var, dtype=torch.float32)
-                    for cat_var in categorical_vars[
-                        0
-                    ]  # Access first element since zip returns tuple
-                ]
-            )
-        else:
-            categorical_tensors = torch.empty(
-                padded_batch.shape[0], 1, dtype=torch.float32, device=padded_batch.device
-            )
-
-        if self.outputs is not None:
-            # Convert labels to tensor in one go
-            y = torch.tensor(y, dtype=torch.long)
-            return (padded_batch, categorical_tensors, y)
-        else:
-            return (padded_batch, categorical_tensors)
-
-    def create_dataloader(
-        self,
-        batch_size: int,
-        shuffle: bool = False,
-        drop_last: bool = False,
-        num_workers: int = os.cpu_count() - 1,
-        pin_memory: bool = True,
-        persistent_workers: bool = True,
-        **kwargs,
-    ) -> torch.utils.data.DataLoader:
-        """
-        Creates a Dataloader from the FastTextModelDataset.
-        Use collate_fn() to tokenize and pad the sequences.
-
-        Args:
-            batch_size (int): Batch size.
-            shuffle (bool, optional): Shuffle option. Defaults to False.
-            drop_last (bool, optional): Drop last option. Defaults to False.
-            num_workers (int, optional): Number of workers. Defaults to os.cpu_count() - 1.
-            pin_memory (bool, optional): Set True if working on GPU, False if CPU. Defaults to True.
-            persistent_workers (bool, optional): Set True for training, False for inference. Defaults to True.
-            **kwargs: Additional arguments for PyTorch DataLoader.
-
-        Returns:
-            torch.utils.data.DataLoader: Dataloader.
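
Putting the dataset and loader above together; a hedged usage sketch (`tok` is assumed to be a trained NGramTokenizer):

    ds = FastTextModelDataset(
        categorical_variables=[[0], [1]],
        texts=["first sentence", "second sentence"],
        outputs=[0, 1],
        tokenizer=tok,
    )
    dl = ds.create_dataloader(batch_size=2, num_workers=0)
    padded_batch, categorical_tensors, y = next(iter(dl))
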
- """ - - logger.info(f"Creating DataLoader with {num_workers} workers.") - - # persistent_workers requires num_workers > 0 - if num_workers == 0: - persistent_workers = False - - return torch.utils.data.DataLoader( - dataset=self, - batch_size=batch_size, - collate_fn=self.collate_fn, - shuffle=shuffle, - drop_last=drop_last, - pin_memory=pin_memory, - num_workers=num_workers, - persistent_workers=persistent_workers, - **kwargs, - ) \ No newline at end of file diff --git a/torchTextClassifiers/classifiers/fasttext/wrapper.py b/torchTextClassifiers/classifiers/fasttext/wrapper.py deleted file mode 100644 index 3d1d559..0000000 --- a/torchTextClassifiers/classifiers/fasttext/wrapper.py +++ /dev/null @@ -1,216 +0,0 @@ -from typing import Optional -from ..base import BaseClassifierWrapper -from .core import FastTextConfig -from .tokenizer import NGramTokenizer -from .model import FastTextModel, FastTextModule, FastTextModelDataset -from ...utilities.checkers import check_X, check_Y -import logging -import numpy as np -import torch -from torch.optim import SGD, Adam - -logger = logging.getLogger() - - -class FastTextWrapper(BaseClassifierWrapper): - """Wrapper for FastText classifier.""" - - def __init__(self, config: FastTextConfig): - super().__init__(config) - self.config: FastTextConfig = config - self.tokenizer: Optional[NGramTokenizer] = None # FastText-specific tokenizer - - def prepare_text_features(self, training_text: np.ndarray) -> None: - """Build NGram tokenizer for FastText.""" - self.tokenizer = NGramTokenizer( - self.config.min_count, - self.config.min_n, - self.config.max_n, - self.config.num_tokens, - self.config.len_word_ngrams, - training_text, - ) - - def build_tokenizer(self, training_text: np.ndarray) -> None: - """Legacy method for backward compatibility.""" - self.prepare_text_features(training_text) - - def _build_pytorch_model(self) -> None: - """Build FastText PyTorch model.""" - if self.config.num_rows is None: - if self.tokenizer is None: - raise ValueError( - "Please provide a tokenizer or num_rows." - ) - else: - self.config.num_rows = self.tokenizer.padding_index + 1 - else: - if self.tokenizer is not None: - if self.config.num_rows != self.tokenizer.padding_index + 1: - logger.warning( - f"Divergent values for num_rows: {self.config.num_rows} and {self.tokenizer.padding_index + 1}. " - f"Using max value." 
- ) - self.config.num_rows = max(self.config.num_rows, self.tokenizer.padding_index + 1) - - self.padding_idx = self.config.num_rows - 1 - - # Update tokenizer padding index if necessary - if self.tokenizer is not None and self.padding_idx != self.tokenizer.padding_index: - self.tokenizer.padding_index = self.padding_idx - - self.pytorch_model = FastTextModel( - tokenizer=self.tokenizer, - embedding_dim=self.config.embedding_dim, - num_rows=self.config.num_rows, - num_classes=self.config.num_classes, - categorical_vocabulary_sizes=self.config.categorical_vocabulary_sizes, - categorical_embedding_dims=self.config.categorical_embedding_dims, - padding_idx=self.padding_idx, - sparse=self.config.sparse, - direct_bagging=self.config.direct_bagging, - ) - - def _check_and_init_lightning( - self, - optimizer=None, - optimizer_params=None, - lr=None, - scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau, - scheduler_params=None, - patience_scheduler=3, - loss=torch.nn.CrossEntropyLoss(), - ) -> None: - """Initialize Lightning module for FastText.""" - if optimizer is None: - if lr is None: - lr = getattr(self.config, 'learning_rate', 4e-3) # Use config or default - self.optimizer = SGD if self.config.sparse else Adam - self.optimizer_params = {"lr": lr} - else: - self.optimizer = optimizer - if optimizer_params is None: - if lr is not None: - self.optimizer_params = {"lr": lr} - else: - logger.warning("No optimizer parameters provided. Using defaults.") - self.optimizer_params = {} - - self.scheduler = scheduler - - if scheduler_params is None: - logger.warning("No scheduler parameters provided. Using defaults.") - self.scheduler_params = { - "mode": "min", - "patience": patience_scheduler, - } - else: - self.scheduler_params = scheduler_params - - self.loss = loss - - self.lightning_module = FastTextModule( - model=self.pytorch_model, - loss=self.loss, - optimizer=self.optimizer, - optimizer_params=self.optimizer_params, - scheduler=self.scheduler, - scheduler_params=self.scheduler_params, - scheduler_interval="epoch", - ) - - def predict(self, X: np.ndarray, top_k=1, preprocess=False, verbose=False) -> np.ndarray: - """Make predictions with FastText model.""" - if not self.trained: - raise Exception("Model must be trained first.") - - text, categorical_variables, no_cat_var = check_X(X) - if categorical_variables is not None: - if categorical_variables.shape[1] != self.config.num_categorical_features: - raise Exception( - f"X must have the same number of categorical variables as training data." 
- ) - else: - assert self.pytorch_model.no_cat_var == True - - predictions, confidence = self.pytorch_model.predict( - text, categorical_variables, top_k=top_k, preprocess=preprocess - ) - - # Return just predictions, squeeze out the top_k dimension if top_k=1 - if top_k == 1: - predictions = predictions.squeeze(-1) - - # Convert to numpy array for consistency - if hasattr(predictions, 'numpy'): - predictions = predictions.numpy() - - return predictions - - def validate(self, X: np.ndarray, Y: np.ndarray, batch_size=256, num_workers=12) -> float: - """Validate FastText model.""" - if not self.trained: - raise Exception("Model must be trained first.") - - # Use predict method which handles input validation and returns just predictions - predictions = self.predict(X) - y = check_Y(Y) - - # Convert predictions to numpy if it's a tensor - if hasattr(predictions, 'numpy'): - predictions = predictions.numpy() - - # Calculate accuracy - accuracy = (predictions == y).mean() - return float(accuracy) - - def predict_and_explain(self, X: np.ndarray, top_k=1): - """Predict and explain with FastText model.""" - if not self.trained: - raise Exception("Model must be trained first.") - - text, categorical_variables, no_cat_var = check_X(X) - if categorical_variables is not None: - if categorical_variables.shape[1] != self.config.num_categorical_features: - raise Exception( - f"X must have the same number of categorical variables as training data ({self.config.num_categorical_features})." - ) - else: - assert self.pytorch_model.no_cat_var == True - - return self.pytorch_model.predict_and_explain(text, categorical_variables, top_k=top_k) - - def create_dataset(self, texts: np.ndarray, labels: np.ndarray, categorical_variables: np.ndarray = None): - """Create FastText dataset.""" - return FastTextModelDataset( - categorical_variables=categorical_variables, - texts=texts, - outputs=labels, - tokenizer=self.tokenizer, - ) - - def create_dataloader(self, dataset, batch_size: int, num_workers: int = 0, shuffle: bool = True): - """Create FastText dataloader.""" - return dataset.create_dataloader(batch_size=batch_size, num_workers=num_workers, shuffle=shuffle) - - def load_best_model(self, checkpoint_path: str) -> None: - """Load best FastText model from checkpoint.""" - self.lightning_module = FastTextModule.load_from_checkpoint( - checkpoint_path, - model=self.pytorch_model, - loss=self.loss, - optimizer=self.optimizer, - optimizer_params=self.optimizer_params, - scheduler=self.scheduler, - scheduler_params=self.scheduler_params, - scheduler_interval="epoch", - ) - self.pytorch_model = self.lightning_module.model.to("cpu") - self.trained = True - self.pytorch_model.eval() - - @classmethod - def get_config_class(cls): - """Return the configuration class for FastText wrapper.""" - return FastTextConfig - diff --git a/torchTextClassifiers/classifiers/simple_text_classifier.py b/torchTextClassifiers/classifiers/simple_text_classifier.py deleted file mode 100644 index 12b2f15..0000000 --- a/torchTextClassifiers/classifiers/simple_text_classifier.py +++ /dev/null @@ -1,191 +0,0 @@ -""" -Simple text classifier example that doesn't require a tokenizer. - -This demonstrates how to create a classifier wrapper that uses -different text preprocessing approaches. 
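
The registry above makes config deserialization pluggable; a hedged sketch of registering a custom type (the "tfidf" key is hypothetical, and SimpleTextConfig is the dataclass defined just below):

    register_config_factory("tfidf", SimpleTextConfig.from_dict)
    config = create_config_from_dict("tfidf", {"hidden_dim": 64, "num_classes": 2})
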
-""" - -from typing import Optional, Dict, Any -from dataclasses import dataclass, asdict -import numpy as np -import torch -import torch.nn as nn -from sklearn.feature_extraction.text import TfidfVectorizer -from torch.utils.data import Dataset, DataLoader -import pytorch_lightning as pl -from torch.optim import Adam - -from .base import BaseClassifierWrapper, BaseClassifierConfig - - -@dataclass -class SimpleTextConfig(BaseClassifierConfig): - """Configuration for simple text classifier using TF-IDF.""" - - hidden_dim: int = 128 - num_classes: Optional[int] = None - max_features: int = 10000 - learning_rate: float = 1e-3 - dropout_rate: float = 0.1 - - def to_dict(self) -> Dict[str, Any]: - return asdict(self) - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "SimpleTextConfig": - return cls(**data) - - -class SimpleTextDataset(Dataset): - """Dataset for simple text classifier.""" - - def __init__(self, features: np.ndarray, labels: np.ndarray): - self.features = torch.FloatTensor(features) - self.labels = torch.LongTensor(labels) - - def __len__(self): - return len(self.features) - - def __getitem__(self, idx): - return self.features[idx], self.labels[idx] - - -class SimpleTextModel(nn.Module): - """Simple neural network for text classification using TF-IDF features.""" - - def __init__(self, input_dim: int, hidden_dim: int, num_classes: int, dropout_rate: float = 0.1): - super().__init__() - - self.network = nn.Sequential( - nn.Linear(input_dim, hidden_dim), - nn.ReLU(), - nn.Dropout(dropout_rate), - nn.Linear(hidden_dim, hidden_dim // 2), - nn.ReLU(), - nn.Dropout(dropout_rate), - nn.Linear(hidden_dim // 2, num_classes) - ) - - def forward(self, x): - return self.network(x) - - -class SimpleTextModule(pl.LightningModule): - """Lightning module for simple text classifier.""" - - def __init__(self, model: nn.Module, learning_rate: float = 1e-3): - super().__init__() - self.model = model - self.learning_rate = learning_rate - self.loss_fn = nn.CrossEntropyLoss() - - def forward(self, x): - return self.model(x) - - def training_step(self, batch, batch_idx): - features, labels = batch - logits = self(features) - loss = self.loss_fn(logits, labels) - self.log('train_loss', loss) - return loss - - def validation_step(self, batch, batch_idx): - features, labels = batch - logits = self(features) - loss = self.loss_fn(logits, labels) - self.log('val_loss', loss) - return loss - - def configure_optimizers(self): - return Adam(self.parameters(), lr=self.learning_rate) - - -class SimpleTextWrapper(BaseClassifierWrapper): - """Wrapper for simple text classifier that uses TF-IDF instead of tokenization.""" - - def __init__(self, config: SimpleTextConfig): - super().__init__(config) - self.config: SimpleTextConfig = config - self.vectorizer: Optional[TfidfVectorizer] = None - - def prepare_text_features(self, training_text: np.ndarray) -> None: - """Prepare TF-IDF vectorizer instead of tokenizer.""" - self.vectorizer = TfidfVectorizer( - max_features=self.config.max_features, - lowercase=True, - stop_words='english' - ) - # Fit the vectorizer on training text - self.vectorizer.fit(training_text) - - def _build_pytorch_model(self) -> None: - """Build the PyTorch model.""" - if self.vectorizer is None: - raise ValueError("Must call prepare_text_features first") - - input_dim = len(self.vectorizer.get_feature_names_out()) - - self.pytorch_model = SimpleTextModel( - input_dim=input_dim, - hidden_dim=self.config.hidden_dim, - num_classes=self.config.num_classes, - 
dropout_rate=self.config.dropout_rate - ) - - def _check_and_init_lightning(self, **kwargs) -> None: - """Initialize Lightning module.""" - self.lightning_module = SimpleTextModule( - model=self.pytorch_model, - learning_rate=self.config.learning_rate - ) - - def predict(self, X: np.ndarray, **kwargs) -> np.ndarray: - """Make predictions.""" - if not self.trained: - raise Exception("Model must be trained first.") - - # Extract text from X (assuming first column is text) - text_data = X[:, 0] if X.ndim > 1 else X - - # Transform text to TF-IDF features - features = self.vectorizer.transform(text_data).toarray() - features_tensor = torch.FloatTensor(features) - - self.pytorch_model.eval() - with torch.no_grad(): - logits = self.pytorch_model(features_tensor) - predictions = torch.argmax(logits, dim=1) - - return predictions.numpy() - - def validate(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - """Validate the model.""" - predictions = self.predict(X) - accuracy = (predictions == Y).mean() - return float(accuracy) - - def create_dataset(self, texts: np.ndarray, labels: np.ndarray, categorical_variables: Optional[np.ndarray] = None): - """Create dataset.""" - # Transform text to TF-IDF features - features = self.vectorizer.transform(texts).toarray() - return SimpleTextDataset(features, labels) - - def create_dataloader(self, dataset, batch_size: int, num_workers: int = 0, shuffle: bool = True): - """Create dataloader.""" - return DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=shuffle) - - def load_best_model(self, checkpoint_path: str) -> None: - """Load best model from checkpoint.""" - self.lightning_module = SimpleTextModule.load_from_checkpoint( - checkpoint_path, - model=self.pytorch_model, - learning_rate=self.config.learning_rate - ) - self.pytorch_model = self.lightning_module.model - self.trained = True - self.pytorch_model.eval() - - @classmethod - def get_config_class(cls): - """Return the configuration class.""" - return SimpleTextConfig \ No newline at end of file diff --git a/torchTextClassifiers/factories.py b/torchTextClassifiers/factories.py deleted file mode 100644 index e6a92a3..0000000 --- a/torchTextClassifiers/factories.py +++ /dev/null @@ -1,34 +0,0 @@ -"""Generic factories for different classifier types.""" - -from typing import Dict, Any, Optional, Type, Callable -from .classifiers.base import BaseClassifierConfig - -# Registry of config factories for different classifier types -CONFIG_FACTORIES: Dict[str, Callable[[dict], BaseClassifierConfig]] = {} - - -def register_config_factory(classifier_type: str, factory_func: Callable[[dict], BaseClassifierConfig]): - """Register a config factory for a classifier type.""" - CONFIG_FACTORIES[classifier_type] = factory_func - - -def create_config_from_dict(classifier_type: str, config_dict: dict) -> BaseClassifierConfig: - """Create a config object from dictionary based on classifier type.""" - if classifier_type not in CONFIG_FACTORIES: - raise ValueError(f"Unsupported classifier type: {classifier_type}") - - return CONFIG_FACTORIES[classifier_type](config_dict) - - -# Register FastText factory -def _register_fasttext_factory(): - """Register FastText config factory.""" - try: - from .classifiers.fasttext.core import FastTextFactory - register_config_factory("fasttext", FastTextFactory.from_dict) - except ImportError: - pass # FastText module not available - - -# Auto-register available factories -_register_fasttext_factory() \ No newline at end of file diff --git 
a/torchTextClassifiers/utilities/preprocess.py b/torchTextClassifiers/utilities/preprocess.py
deleted file mode 100644
index 900c427..0000000
--- a/torchTextClassifiers/utilities/preprocess.py
+++ /dev/null
@@ -1,82 +0,0 @@
-"""
-Text preprocessing functions.
-"""
-
-import string
-
-import numpy as np
-
-try:
-    import nltk
-    from nltk.corpus import stopwords as ntlk_stopwords
-    from nltk.stem.snowball import SnowballStemmer
-
-    HAS_NLTK = True
-except ImportError:
-    HAS_NLTK = False
-
-try:
-    import unidecode
-
-    HAS_UNIDECODE = True
-except ImportError:
-    HAS_UNIDECODE = False
-
-
-def clean_text_feature(text: list[str], remove_stop_words=True):
-    """
-    Cleans a text feature.
-
-    Args:
-        text (list[str]): List of text descriptions.
-        remove_stop_words (bool): If True, remove stopwords.
-
-    Returns:
-        list[str]: List of cleaned text descriptions.
-
-    """
-    if not HAS_NLTK:
-        raise ImportError(
-            "nltk is not installed and is required for preprocessing. Run 'pip install torchFastText[preprocess]'."
-        )
-    if not HAS_UNIDECODE:
-        raise ImportError(
-            "unidecode is not installed and is required for preprocessing. Run 'pip install torchFastText[preprocess]'."
-        )
-
-    # Define stopwords and stemmer
-
-    nltk.download("stopwords", quiet=True)
-    stopwords = tuple(ntlk_stopwords.words("french")) + tuple(string.ascii_lowercase)
-    stemmer = SnowballStemmer(language="french")
-
-    # Remove accented characters
-    text = np.vectorize(unidecode.unidecode)(np.array(text))
-
-    # To lowercase
-    text = np.char.lower(text)
-
-    # Remove one-letter words
-    def mylambda(x):
-        return " ".join([w for w in x.split() if len(w) > 1])
-
-    text = np.vectorize(mylambda)(text)
-
-    # Remove duplicate words and stopwords in texts
-    # Stem words
-    libs_token = [lib.split() for lib in text.tolist()]
-    libs_token = [
-        sorted(set(libs_token[i]), key=libs_token[i].index) for i in range(len(libs_token))
-    ]
-    if remove_stop_words:
-        text = [
-            " ".join([stemmer.stem(word) for word in libs_token[i] if word not in stopwords])
-            for i in range(len(libs_token))
-        ]
-    else:
-        text = [
-            " ".join([stemmer.stem(word) for word in libs_token[i]]) for i in range(len(libs_token))
-        ]
-
-    # Return the cleaned texts
-    return text
diff --git a/torchTextClassifiers/utilities/utils.py b/torchTextClassifiers/utilities/utils.py
deleted file mode 100644
index c8e216e..0000000
--- a/torchTextClassifiers/utilities/utils.py
+++ /dev/null
@@ -1,346 +0,0 @@
-"""
-Utility functions.
-"""
-
-import warnings
-import difflib
-from difflib import SequenceMatcher
-
-import torch
-import torch.nn.functional as F
-
-from .preprocess import clean_text_feature
-
-
-def preprocess_token(token):
-    preprocessed_token = token.replace("</s>", "")
-    preprocessed_token = preprocessed_token.replace("<", "")
-    preprocessed_token = preprocessed_token.replace(">", "")
-
-    preprocessed_token = preprocessed_token.split()
-
-    return preprocessed_token
-
-
-def map_processed_to_original(processed_words, original_words, n=1, cutoff=0.9):
-    """
-    Map processed words to original words based on similarity scores.
-
-    Args:
-        processed_words (List[str]): List of processed words.
-        original_words (List[str]): List of original words.
-        n (int): Number of closest processed words to consider for a given original word.
-        cutoff (float): Minimum similarity score for a match.
-
-    Returns:
-        Dict[str, str]: Mapping from original word to the corresponding closest processed word.
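
Indicatively, the French cleaning pipeline above strips accents, lowercases, drops one-letter words and stopwords, deduplicates, and stems; a sketch (output approximate, exact stems depend on the Snowball stemmer):

    cleaned = clean_text_feature(["Boulangerie et Pâtisserie à Paris"])
    # e.g. ["boulangeri patisseri pari"] — "et"/"à" removed, accents stripped, words stemmed
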
- """ - - # For each word in the original list, find the n closest matching processed words - word_mapping = {} - - for original_word in original_words: - original_word_prepro = clean_text_feature([original_word], remove_stop_words=False)[ - 0 - ] # Preprocess the original word - - if original_word_prepro == "": - continue - - max_similarity_score = 0 - best_processed_word = None - # Calculate the similarity score for each processed word with the current original word - for processed_word in processed_words: - similarity_score = difflib.SequenceMatcher( - None, processed_word, original_word_prepro - ).ratio() # Ratcliff-Obershelp algorithm - - # Only consider matches with similarity above the cutoff - if similarity_score > max_similarity_score and similarity_score >= cutoff: - max_similarity_score = similarity_score - best_processed_word = processed_word - - if best_processed_word is not None: - # original_word = original_word.replace(',', '') - # Add the tuple (list of closest words, list of similarity scores) to the mapping - word_mapping[original_word] = best_processed_word - - return word_mapping - - -def test_end_of_word(all_processed_words, word, target_token, next_token, min_n): - flag = False - if target_token[-1] == ">": - if next_token[0] == "<": - if word in target_token: - flag = True - if word in next_token: - flag = False - if next_token[1] != word[0]: - flag = True - if len(next_token) == min_n: - flag = True - if next_token in all_processed_words: - flag = True - - return flag - - -def match_word_to_token_indexes(sentence, tokenized_sentence_tokens, min_n): - """ - Match words to token indexes in a sentence. - - Args: - sentence (str): Preprocessed sentence. - tokenized_sentence_tokens (List[str]): List of tokenized sentence tokens. - - Returns: - Dict[str, List[int]]: Mapping from word to list of token indexes. 
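
The matching above relies on difflib's Ratcliff-Obershelp ratio; in isolation:

    from difflib import SequenceMatcher

    # 2 * n_matching_chars / (len(a) + len(b)) = 2 * 9 / (9 + 11) = 0.9
    score = SequenceMatcher(None, "boulanger", "boulangerie").ratio()
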
-
-    """
-
-    pointer_token = 0
-    res = {}
-    processed_sentence = clean_text_feature([sentence], remove_stop_words=False)[0]
-    processed_words = processed_sentence.split()
-    # we know the tokens are in the right order
-    for index_word, word in enumerate(processed_words):
-        if word not in res:
-            res[word] = []
-
-        start = pointer_token
-
-        # while we don't reach the end of the word, keep going
-        while not test_end_of_word(
-            processed_words,
-            word,
-            tokenized_sentence_tokens[pointer_token],
-            tokenized_sentence_tokens[pointer_token + 1],
-            min_n=min_n,
-        ):
-            pointer_token += 1
-            if pointer_token == len(tokenized_sentence_tokens) - 1:
-                warnings.warn("Error in the tokenization of the sentence")
-                # workaround to avoid error: each word is associated to regular ranges
-                chunk = len(tokenized_sentence_tokens) // len(processed_words)
-                for idx, word in enumerate(processed_words):
-                    res[word] = range(
-                        idx * chunk, min((idx + 1) * chunk, len(tokenized_sentence_tokens))
-                    )
-                return res
-
-        pointer_token += 1
-        end = pointer_token
-
-        res[word] += list(range(start, end))
-
-    # here we arrive at the end of the sentence
-    assert tokenized_sentence_tokens[pointer_token] == "</s>"
-    end_of_string_position = pointer_token
-
-    # starting word n-grams
-    pointer_token += 1
-    while pointer_token < len(tokenized_sentence_tokens):
-        token = tokenized_sentence_tokens[pointer_token]
-        for index_word, word in enumerate(processed_sentence.split()):
-            # now, the condition of matching changes: we need to find the word in the token
-            if word in token:
-                res[word].append(pointer_token)
-        pointer_token += 1
-
-    assert pointer_token == len(tokenized_sentence_tokens)
-    assert set(sum([v for v in res.values()], [end_of_string_position])) == set(
-        range(len(tokenized_sentence_tokens))
-    ), print(
-        set(range(len(tokenized_sentence_tokens)))
-        - set(sum([v for v in res.values()], [end_of_string_position]))
-    )  # verify that all tokens are used
-
-    return res
-
-
-# at text level
-def compute_preprocessed_word_score(
-    preprocessed_text,
-    tokenized_text_tokens,
-    scores,
-    id_to_token_dicts,
-    token_to_id_dicts,
-    min_n,
-    padding_index=2009603,
-    end_of_string_index=0,
-):
-    """
-    Compute preprocessed word scores based on token scores.
-
-    Args:
-        preprocessed_text (List[str]): List of preprocessed sentences.
-        tokenized_text_tokens (List[List[str]]): For each sentence, list of tokens in string form.
-        scores (List[torch.Tensor]): For each sentence, list of token scores.
-        id_to_token_dicts (List[Dict[int, str]]): For each sentence, mapping from token ID to token in string form.
-        token_to_id_dicts (List[Dict[str, int]]): For each sentence, mapping from token (string) to token ID.
-        padding_index (int): Index of the padding token.
-        end_of_string_index (int): Index of the end-of-string token.
-
-    Returns:
-        List[Dict[str, float]]: For each sentence, mapping from preprocessed word to score.
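
The core aggregation step described above (and implemented just below) sums token attributions per word; a toy illustration with made-up indices and scores:

    import torch

    token_scores = torch.tensor([0.1, 0.4, 0.2, 0.3])
    word_to_token_idx = {"boulangerie": [0, 1], "paris": [2, 3]}
    word_scores = {
        w: torch.sum(token_scores[torch.tensor(idx)]).item()
        for w, idx in word_to_token_idx.items()
    }
    # ≈ {'boulangerie': 0.5, 'paris': 0.5}
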
- """ - - word_to_score_dicts = [] - word_to_token_idx_dicts = [] - - for idx, sentence in enumerate(preprocessed_text): - tokenized_sentence_tokens = tokenized_text_tokens[idx] # sentence level, List[str] - word_to_token_idx = match_word_to_token_indexes(sentence, tokenized_sentence_tokens, min_n) - score_sentence_topk = scores[idx] # torch.Tensor, token scores, (top_k, seq_len) - - # Calculate the score for each token and map to words - word_to_score_topk = [] - for k in range(len(score_sentence_topk)): - # Initialize word-to-score dictionary with zero values - word_to_score = {word: 0 for word in sentence.split()} - - score_sentence = score_sentence_topk[k] - for word, associated_token_idx in word_to_token_idx.items(): - associated_token_idx = torch.tensor(associated_token_idx).int() - word_to_score[word] = torch.sum(score_sentence[associated_token_idx]).item() - - word_to_score_topk.append(word_to_score.copy()) - - word_to_score_dicts.append(word_to_score_topk) - word_to_token_idx_dicts.append(word_to_token_idx) - - return word_to_score_dicts, word_to_token_idx_dicts - - -def compute_word_score(word_to_score_dicts, text, n=5, cutoff=0.75): - """ - Compute word scores based on preprocessed word scores. - - Args: - word_to_score_dicts (List[List[Dict[str, float]]]): For each sentence, list of top_k mappings from preprocessed word to score. - text (List[str]): List of sentences. - n (int): Number of closest preprocessed words to consider for a given original word. - cutoff (float): Minimum similarity score for a match. - - Returns: - List[List[List[float]]]: For each sentence, list of top-k scores for each word. - """ - - all_scores_text = [] - mappings = [] - for idx, word_to_score_topk in enumerate(word_to_score_dicts): # iteration over sentences - all_scores_topk = [] - processed_words = list(word_to_score_topk[0].keys()) - original_words = text[idx].split() - original_words = list(filter(lambda x: x != ",", original_words)) - mapping = map_processed_to_original( - processed_words, original_words, n=n, cutoff=cutoff - ) # Dict[str, Tuple[List[str], List[float]]] - mappings.append(mapping) - for word_to_score in word_to_score_topk: # iteration over top_k (the preds) - scores = [] - stopwords_idx = [] - for pos_word, word in enumerate(original_words): - if word not in mapping: - scores.append(0) - stopwords_idx.append(pos_word) - continue - matching_processed_word = mapping[word] - word_score = word_to_score[matching_processed_word] - scores.append(word_score) - - scores = torch.tensor(scores) - scores = F.softmax( - scores, dim=-1 - ) # softmax normalization. Length = len(original_words) - scores[stopwords_idx] = 0 - - all_scores_topk.append(scores) # length top_k - - all_scores_text.append(all_scores_topk) # length = len(text) - - return all_scores_text, mappings - - -def explain_continuous( - text, processed_text, tokenized_text_tokens, mappings, word_to_token_idx_dicts, all_attr, top_k -): - """ - Score explanation at letter level. - - Args: - text (List[str]): List of original sentences. - processed_text (List[str]): List of preprocessed sentences. - tokenized_text_tokens (List[List[str]]): List of tokenized sentences. - mappings (List[Dict[str, str]]): List of mappings from original word to preprocessed word. - word_to_token_idx_dicts (List[Dict[str, List[int]]]): List of mappings from preprocessed word to token indexes. - all_attr (torch.Tensor): Tensor of token scores. - top_k (int): Number of top tokens to consider. 
- - Returns: - List[torch.Tensor]: List of letter scores for each sentence. - - - """ - all_scores_text = [] - for idx, processed_sentence in enumerate(processed_text): - tokenized_sentence_tokens = tokenized_text_tokens[idx] - mapping = mappings[idx] - word_to_token_idx = word_to_token_idx_dicts[idx] - original_words = text[idx].split() - original_words = list(filter(lambda x: x != ",", original_words)) - - original_to_token = {} - original_to_token_idxs = {} - - for original in original_words: - # original = original.replace(',', '') - if original not in mapping: - continue - - matching_processed_word = mapping[original] - associated_token_idx = word_to_token_idx[matching_processed_word] - original_to_token[original] = [ - tokenized_sentence_tokens[token_idx] for token_idx in associated_token_idx - ] - original_to_token_idxs[original] = associated_token_idx - - scores_for_k = [] - for k in range(top_k): - scores_for_words = [] - for xxx, original_word in enumerate(original_words): - original_word_prepro = clean_text_feature([original_word], remove_stop_words=False)[ - 0 - ] - - letters = list(original_word) - scores_letter = torch.zeros(len(letters), dtype=torch.float32) - - if original_word not in original_to_token: # if stopword, 0 - scores_for_words.append(scores_letter) - continue - - for pos, token in enumerate(original_to_token[original_word]): - pos_token = original_to_token_idxs[original_word][pos] - # tok = preprocess_token(token)[0] - tok = preprocess_token(token) - score_token = all_attr[idx, k, pos_token].item() - - # Embed the token at the right indexes of the word - sm = SequenceMatcher(None, original_word_prepro, tok) - a, _, size = sm.find_longest_match() - scores_letter[a : a + size] += score_token - - scores_for_words.append(scores_letter) - - all_scores_letter = torch.cat(scores_for_words) - scores = F.softmax(all_scores_letter, dim=-1) - scores[all_scores_letter == 0] = 0 - scores_for_k.append(scores) - - scores_for_sentence = torch.stack(scores_for_k) - all_scores_text.append(scores_for_sentence) - - return torch.stack(all_scores_text) From a7f71d3b7bb6b7253fc0deb17c792986784f65c0 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Wed, 5 Nov 2025 19:05:13 +0000 Subject: [PATCH 22/66] feat: enable to choose context size in tokenizer constant output dim if specified, otherwise pad to longest sequence in batch --- torchTextClassifiers/tokenizers/WordPiece.py | 7 ++++--- torchTextClassifiers/tokenizers/base.py | 13 ++++++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/torchTextClassifiers/tokenizers/WordPiece.py b/torchTextClassifiers/tokenizers/WordPiece.py index 6b33328..1222c80 100644 --- a/torchTextClassifiers/tokenizers/WordPiece.py +++ b/torchTextClassifiers/tokenizers/WordPiece.py @@ -1,6 +1,6 @@ import logging import os -from typing import List +from typing import List, Optional from torchTextClassifiers.tokenizers import HAS_HF, HuggingFaceTokenizer @@ -24,10 +24,10 @@ class WordPieceTokenizer(HuggingFaceTokenizer): - def __init__(self, vocab_size: int, trained: bool = False): + def __init__(self, vocab_size: int, trained: bool = False, output_dim: Optional[int] = None): """Largely inspired by https://huggingface.co/learn/llm-course/chapter6/8""" - super().__init__(vocab_size) + super().__init__(vocab_size=vocab_size, output_dim=output_dim) self.unk_token = "[UNK]" self.pad_token = "[PAD]" @@ -40,6 +40,7 @@ def __init__(self, vocab_size: int, trained: bool = False): self.sep_token, ] self.vocab_size = vocab_size + self.context_size = 
output_dim self.tokenizer = Tokenizer(models.WordPiece(unk_token=self.unk_token)) diff --git a/torchTextClassifiers/tokenizers/base.py b/torchTextClassifiers/tokenizers/base.py index f5c7c6e..8112d04 100644 --- a/torchTextClassifiers/tokenizers/base.py +++ b/torchTextClassifiers/tokenizers/base.py @@ -24,6 +24,7 @@ def __init__( self.vocab_size = vocab_size self.output_vectorized = output_vectorized + self.output_dim = output_dim if self.output_vectorized: if output_dim is None: raise ValueError( @@ -40,9 +41,9 @@ def __len__(self): class HuggingFaceTokenizer(BaseTokenizer, ABC): - def __init__(self, vocab_size: int): + def __init__(self, vocab_size: int, output_dim: Optional[int] = None): super().__init__( - vocab_size, output_vectorized=False + vocab_size, output_vectorized=False, output_dim=output_dim ) # it outputs token ids and not vectors self.trained = False @@ -52,8 +53,14 @@ def tokenize(self, text: Union[str, List[str]]) -> list: if not self.trained: raise RuntimeError("Tokenizer must be trained before tokenization.") + # Pad to longest sequence if no output_dim is specified + padding = True if self.output_dim is None else "max_length" + return self.tokenizer( - text, padding=True, return_tensors="pt" + text, + padding=padding, + return_tensors="pt", + max_length=self.output_dim, ) # method from PreTrainedTokenizerFast @classmethod From 0a9eda546018f3db7a7c4ec932e8daffa4e66298 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Fri, 7 Nov 2025 09:44:40 +0000 Subject: [PATCH 23/66] chore: pin_memory to default False (avoid warning on CPU run) --- torchTextClassifiers/dataset/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchTextClassifiers/dataset/dataset.py b/torchTextClassifiers/dataset/dataset.py index 44dbaf2..90d34dc 100644 --- a/torchTextClassifiers/dataset/dataset.py +++ b/torchTextClassifiers/dataset/dataset.py @@ -93,7 +93,7 @@ def create_dataloader( shuffle: bool = False, drop_last: bool = False, num_workers: int = os.cpu_count() - 1, - pin_memory: bool = True, + pin_memory: bool = False, persistent_workers: bool = True, **kwargs, ) -> torch.utils.data.DataLoader: From 6d951fed7446b02ad595067e19d1267edc030c41 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Fri, 7 Nov 2025 09:58:17 +0000 Subject: [PATCH 24/66] feat: ad __repr__ for all components --- torchTextClassifiers/tokenizers/WordPiece.py | 3 ++ torchTextClassifiers/tokenizers/base.py | 5 +-- torchTextClassifiers/torchTextClassifiers.py | 38 ++++++++++++++++---- 3 files changed, 38 insertions(+), 8 deletions(-) diff --git a/torchTextClassifiers/tokenizers/WordPiece.py b/torchTextClassifiers/tokenizers/WordPiece.py index 1222c80..646eb36 100644 --- a/torchTextClassifiers/tokenizers/WordPiece.py +++ b/torchTextClassifiers/tokenizers/WordPiece.py @@ -90,3 +90,6 @@ def train( filesystem.mkdirs(parent_dir) filesystem.put(save_path, s3_save_path) logger.info(f"💾 Tokenizer uploaded to S3 at {s3_save_path}") + + def __repr__(self): + return self.tokenizer.__repr__() diff --git a/torchTextClassifiers/tokenizers/base.py b/torchTextClassifiers/tokenizers/base.py index 8112d04..6f0ea5a 100644 --- a/torchTextClassifiers/tokenizers/base.py +++ b/torchTextClassifiers/tokenizers/base.py @@ -102,5 +102,6 @@ def train( @abstractmethod def _post_training(self): - """Applies post-training configurations to the tokenizer.""" - pass + + def __repr__(self): + return self.tokenizer.__repr__() diff --git a/torchTextClassifiers/torchTextClassifiers.py b/torchTextClassifiers/torchTextClassifiers.py 
index 00c5284..755ecc6 100644 --- a/torchTextClassifiers/torchTextClassifiers.py +++ b/torchTextClassifiers/torchTextClassifiers.py @@ -348,9 +348,9 @@ def _check_XY(self, X: np.ndarray, Y: np.ndarray) -> Tuple[np.ndarray, np.ndarra @staticmethod def _check_text_col(X): - assert isinstance( - X, np.ndarray - ), "X must be a numpy array of shape (N,d), with the first column being the text and the rest being the categorical variables." + assert isinstance(X, np.ndarray), ( + "X must be a numpy array of shape (N,d), with the first column being the text and the rest being the categorical variables." + ) try: if X.ndim > 1: @@ -412,9 +412,9 @@ def _check_X(self, X: np.ndarray) -> np.ndarray: def _check_Y(self, Y): assert isinstance(Y, np.ndarray), "Y must be a numpy array of shape (N,) or (N,1)." - assert len(Y.shape) == 1 or ( - len(Y.shape) == 2 and Y.shape[1] == 1 - ), "Y must be a numpy array of shape (N,) or (N,1)." + assert len(Y.shape) == 1 or (len(Y.shape) == 2 and Y.shape[1] == 1), ( + "Y must be a numpy array of shape (N,) or (N,1)." + ) try: Y = Y.astype(int) @@ -512,3 +512,29 @@ def predict( "prediction": predictions, "confidence": confidence, } + + def __repr__(self): + model_type = ( + self.lightning_module.__repr__() if hasattr(self, "lightning_module") else self.pytorch_model.__repr__() + ) + + tokenizer_info = ( + self.tokenizer.__repr__() + ) + + cat_forward_type = ( + self.categorical_var_net.forward_type.name + if self.categorical_var_net is not None + else "None" + ) + + lines = [ + "torchTextClassifiers(", + f" tokenizer = {tokenizer_info},", + f" model = {model_type},", + f" categorical_forward_type = {cat_forward_type},", + f" num_classes = {self.model_config.num_classes},", + f" embedding_dim = {self.embedding_dim},", + ")", + ] + return "\n".join(lines) \ No newline at end of file From c31ad43aa4a1eb32c4603a392b6bd5f3def29f50 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Fri, 7 Nov 2025 09:58:47 +0000 Subject: [PATCH 25/66] chore: format --- torchTextClassifiers/torchTextClassifiers.py | 22 ++++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/torchTextClassifiers/torchTextClassifiers.py b/torchTextClassifiers/torchTextClassifiers.py index 755ecc6..3630222 100644 --- a/torchTextClassifiers/torchTextClassifiers.py +++ b/torchTextClassifiers/torchTextClassifiers.py @@ -348,9 +348,9 @@ def _check_XY(self, X: np.ndarray, Y: np.ndarray) -> Tuple[np.ndarray, np.ndarra @staticmethod def _check_text_col(X): - assert isinstance(X, np.ndarray), ( - "X must be a numpy array of shape (N,d), with the first column being the text and the rest being the categorical variables." - ) + assert isinstance( + X, np.ndarray + ), "X must be a numpy array of shape (N,d), with the first column being the text and the rest being the categorical variables." try: if X.ndim > 1: @@ -412,9 +412,9 @@ def _check_X(self, X: np.ndarray) -> np.ndarray: def _check_Y(self, Y): assert isinstance(Y, np.ndarray), "Y must be a numpy array of shape (N,) or (N,1)." - assert len(Y.shape) == 1 or (len(Y.shape) == 2 and Y.shape[1] == 1), ( - "Y must be a numpy array of shape (N,) or (N,1)." - ) + assert len(Y.shape) == 1 or ( + len(Y.shape) == 2 and Y.shape[1] == 1 + ), "Y must be a numpy array of shape (N,) or (N,1)." 
try: Y = Y.astype(int) @@ -515,12 +515,12 @@ def predict( def __repr__(self): model_type = ( - self.lightning_module.__repr__() if hasattr(self, "lightning_module") else self.pytorch_model.__repr__() + self.lightning_module.__repr__() + if hasattr(self, "lightning_module") + else self.pytorch_model.__repr__() ) - tokenizer_info = ( - self.tokenizer.__repr__() - ) + tokenizer_info = self.tokenizer.__repr__() cat_forward_type = ( self.categorical_var_net.forward_type.name @@ -537,4 +537,4 @@ def __repr__(self): f" embedding_dim = {self.embedding_dim},", ")", ] - return "\n".join(lines) \ No newline at end of file + return "\n".join(lines) From 956b7a35b96466ec04aae785b3c7bf24f871ee7a Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Fri, 7 Nov 2025 09:59:53 +0000 Subject: [PATCH 26/66] feat!(HF): enable load from pretrained --- torchTextClassifiers/tokenizers/base.py | 49 ++++++++++++++++--------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/torchTextClassifiers/tokenizers/base.py b/torchTextClassifiers/tokenizers/base.py index 6f0ea5a..bb76beb 100644 --- a/torchTextClassifiers/tokenizers/base.py +++ b/torchTextClassifiers/tokenizers/base.py @@ -3,7 +3,7 @@ try: from tokenizers import Tokenizer - from transformers import PreTrainedTokenizerFast + from transformers import AutoTokenizer, PreTrainedTokenizerFast HAS_HF = True except ImportError: @@ -39,15 +39,26 @@ def tokenize(self, text: Union[str, List[str]]) -> list: def __len__(self): return self.vocab_size + def __repr__(self): + return f"{self.__class__.__name__}(vocab_size={self.vocab_size}, output_vectorized={self.output_vectorized}, output_dim={self.output_dim})" + -class HuggingFaceTokenizer(BaseTokenizer, ABC): - def __init__(self, vocab_size: int, output_dim: Optional[int] = None): +class HuggingFaceTokenizer(BaseTokenizer): + def __init__( + self, + vocab_size: int, + output_dim: Optional[int] = None, + padding_idx: Optional[int] = None, + trained: bool = False, + ): super().__init__( vocab_size, output_vectorized=False, output_dim=output_dim ) # it outputs token ids and not vectors - self.trained = False + self.trained = trained self.tokenizer = None + self.padding_idx = padding_idx + self.output_dim = output_dim # constant context size for all batch def tokenize(self, text: Union[str, List[str]]) -> list: if not self.trained: @@ -63,12 +74,22 @@ def tokenize(self, text: Union[str, List[str]]) -> list: max_length=self.output_dim, ) # method from PreTrainedTokenizerFast + @classmethod + def load_from_pretrained(cls, tokenizer_name: str, output_dim: Optional[int] = None): + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + padding_idx = tokenizer.pad_token_id + instance = cls( + vocab_size=len(tokenizer), trained=True, padding_idx=padding_idx, output_dim=output_dim + ) + instance.tokenizer = tokenizer + return instance + @classmethod def load(cls, load_path: str): loaded_tokenizer = PreTrainedTokenizerFast(tokenizer_file=load_path) instance = cls(vocab_size=len(loaded_tokenizer), trained=True) instance.tokenizer = loaded_tokenizer - instance._post_training() + # instance._post_training() return instance @classmethod @@ -88,20 +109,14 @@ def load_from_s3(cls, s3_path: str, filesystem): instance._post_training() return instance - @abstractmethod - def train( - self, - training_corpus: list, - save_path: str = None, - filesystem=None, - s3_save_path=None, - **kwargs, - ): - """Trains the tokenizer on the provided training corpus.""" - pass + def train(self, *args, **kwargs): + raise NotImplementedError( 
+            "This tokenizer cannot be trained directly. "
+            "Load it from pretrained or implement train() in a subclass."
+        )

-    @abstractmethod
     def _post_training(self):
+        raise NotImplementedError("_post_training() not implemented for HuggingFaceTokenizer.")

     def __repr__(self):
         return self.tokenizer.__repr__()

From a497697de1374c0ff66c603aa6c650a4b0508cc1 Mon Sep 17 00:00:00 2001
From: meilame-tayebjee
Date: Fri, 7 Nov 2025 10:09:34 +0000
Subject: [PATCH 27/66] chore: update description

---
 pyproject.toml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 9244177..60c7fd8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,11 +1,9 @@
 [project]
 name = "torchtextclassifiers"
-description = "An implementation of the https://github.com/facebookresearch/fastText supervised learning algorithm for text classification using Pytorch."
+description = "A text classification toolkit to easily build, train and evaluate deep learning text classifiers using PyTorch."
 authors = [
-    { name = "Tom Seimandi", email = "tom.seimandi@gmail.com" },
-    { name = "Julien Pramil", email = "julien.pramil@insee.fr" },
-    { name = "Meilame Tayebjee", email = "meilame.tayebjee@insee.fr" },
     { name = "Cédric Couralet", email = "cedric.couralet@insee.fr" },
+    { name = "Meilame Tayebjee", email = "meilame.tayebjee@insee.fr" },
 ]
 readme = "README.md"
 repository = "https://github.com/InseeFrLab/torchTextClassifiers"

From 2fda9c2d139c8e66147785515fbdd26712837da2 Mon Sep 17 00:00:00 2001
From: meilame-tayebjee
Date: Fri, 7 Nov 2025 10:14:18 +0000
Subject: [PATCH 28/66] feat: __call__ for tokenizers is tokenize

---
 torchTextClassifiers/tokenizers/base.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/torchTextClassifiers/tokenizers/base.py b/torchTextClassifiers/tokenizers/base.py
index bb76beb..92dfb7f 100644
--- a/torchTextClassifiers/tokenizers/base.py
+++ b/torchTextClassifiers/tokenizers/base.py
@@ -42,6 +42,9 @@ def __len__(self):
     def __repr__(self):
         return f"{self.__class__.__name__}(vocab_size={self.vocab_size}, output_vectorized={self.output_vectorized}, output_dim={self.output_dim})"

+    def __call__(self, text: Union[str, List[str]]) -> list:
+        return self.tokenize(text)
+

From 13b9de4eba484bf7da5aa9ed36ca8f759648ecbc Mon Sep 17 00:00:00 2001
From: meilame-tayebjee
Date: Fri, 7 Nov 2025 15:45:20 +0000
Subject: [PATCH 29/66] feat(tokenizers): clean __call__ and __repr__, add offset return for expl.
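
A minimal usage sketch (illustrative only; `tok` stands for any trained
HuggingFace-backed tokenizer from this package, e.g. a WordPieceTokenizer,
and the sample text is made up):

    enc = tok("hello world", return_offsets_mapping=True)
    # enc["offset_mapping"][0] holds one (start, end) character span per
    # token, so token-level attribution scores can be projected back onto
    # the characters of the raw input for explainability.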
--- torchTextClassifiers/tokenizers/WordPiece.py | 3 --- torchTextClassifiers/tokenizers/base.py | 11 +++++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/torchTextClassifiers/tokenizers/WordPiece.py b/torchTextClassifiers/tokenizers/WordPiece.py index 646eb36..1222c80 100644 --- a/torchTextClassifiers/tokenizers/WordPiece.py +++ b/torchTextClassifiers/tokenizers/WordPiece.py @@ -90,6 +90,3 @@ def train( filesystem.mkdirs(parent_dir) filesystem.put(save_path, s3_save_path) logger.info(f"💾 Tokenizer uploaded to S3 at {s3_save_path}") - - def __repr__(self): - return self.tokenizer.__repr__() diff --git a/torchTextClassifiers/tokenizers/base.py b/torchTextClassifiers/tokenizers/base.py index 92dfb7f..eec709a 100644 --- a/torchTextClassifiers/tokenizers/base.py +++ b/torchTextClassifiers/tokenizers/base.py @@ -42,8 +42,8 @@ def __len__(self): def __repr__(self): return f"{self.__class__.__name__}(vocab_size={self.vocab_size}, output_vectorized={self.output_vectorized}, output_dim={self.output_dim})" - def __call__(self, text: Union[str, List[str]]) -> list: - return self.tokenize(text) + def __call__(self, text: Union[str, List[str]], **kwargs) -> list: + return self.tokenize(text, **kwargs) class HuggingFaceTokenizer(BaseTokenizer): @@ -63,7 +63,9 @@ def __init__( self.padding_idx = padding_idx self.output_dim = output_dim # constant context size for all batch - def tokenize(self, text: Union[str, List[str]]) -> list: + def tokenize( + self, text: Union[str, List[str]], return_offsets_mapping: Optional[bool] = False + ) -> list: if not self.trained: raise RuntimeError("Tokenizer must be trained before tokenization.") @@ -75,6 +77,7 @@ def tokenize(self, text: Union[str, List[str]]) -> list: padding=padding, return_tensors="pt", max_length=self.output_dim, + return_offsets_mapping=return_offsets_mapping, ) # method from PreTrainedTokenizerFast @classmethod @@ -122,4 +125,4 @@ def _post_training(self): raise NotImplementedError("_post_training() not implemented for HuggingFaceTokenizer.") def __repr__(self): - return self.tokenizer.__repr__() + return f"{self.__class__.__name__} \n HuggingFace tokenizer: {self.tokenizer.__repr__()}" From f55452b4c9da227d29658b71766761300deac60d Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Fri, 7 Nov 2025 15:46:20 +0000 Subject: [PATCH 30/66] feat!(explainability): finalize explainability feature at word and char level plot functions to be fixed! --- torchTextClassifiers/torchTextClassifiers.py | 11 +- torchTextClassifiers/utilities/__init__.py | 3 - .../utilities/plot_explainability.py | 155 ++++++++++++++++++ 3 files changed, 165 insertions(+), 4 deletions(-) create mode 100644 torchTextClassifiers/utilities/plot_explainability.py diff --git a/torchTextClassifiers/torchTextClassifiers.py b/torchTextClassifiers/torchTextClassifiers.py index 3630222..974c067 100644 --- a/torchTextClassifiers/torchTextClassifiers.py +++ b/torchTextClassifiers/torchTextClassifiers.py @@ -448,6 +448,7 @@ def predict( """ if explain: + return_offsets_mapping = True # to be passed to the tokenizer if self.pytorch_model.text_embedder is None: raise RuntimeError( "Explainability is not supported when the tokenizer outputs vectorized text directly. Please use a tokenizer that outputs token IDs." 
@@ -460,6 +461,8 @@ def predict( lig = LayerIntegratedGradients( self.pytorch_model, self.pytorch_model.text_embedder.embedding_layer ) # initialize a Captum layer gradient integrator + else: + return_offsets_mapping = False X_test = self._check_X(X_test) text = X_test["text"] @@ -467,7 +470,9 @@ def predict( self.pytorch_model.eval() - tokenize_output = self.tokenizer.tokenize(text.tolist()) + tokenize_output = self.tokenizer.tokenize( + text.tolist(), return_offsets_mapping=return_offsets_mapping + ) encoded_text = tokenize_output["input_ids"] # (batch_size, seq_len) attention_mask = tokenize_output["attention_mask"] # (batch_size, seq_len) @@ -506,6 +511,10 @@ def predict( "prediction": predictions, "confidence": confidence, "attributions": all_attributions, + "offset_mapping": tokenize_output.get("offset_mapping", None), + "word_ids": np.array( + [tokenize_output.word_ids(i) for i in range(len(encoded_text))] + ), } else: return { diff --git a/torchTextClassifiers/utilities/__init__.py b/torchTextClassifiers/utilities/__init__.py index 84d1550..e69de29 100644 --- a/torchTextClassifiers/utilities/__init__.py +++ b/torchTextClassifiers/utilities/__init__.py @@ -1,3 +0,0 @@ -""" -Init script. -""" diff --git a/torchTextClassifiers/utilities/plot_explainability.py b/torchTextClassifiers/utilities/plot_explainability.py new file mode 100644 index 0000000..a2d1a04 --- /dev/null +++ b/torchTextClassifiers/utilities/plot_explainability.py @@ -0,0 +1,155 @@ +import numpy as np +import torch + +try: + from matplotlib import pyplot as plt + + HAS_PYPLOT = True +except ImportError: + HAS_PYPLOT = False + + +def map_attributions_to_char(attributions, offsets, text): + """ + Maps token-level attributions to character-level attributions based on token offsets. + Args: + attributions (np.ndarray): Array of shape (top_k, seq_len) or (seq_len,) containing token-level attributions. + Output from: + >>> ttc.predict(X, top_k=top_k, explain=True)["attributions"] + offsets (list of tuples): List of (start, end) offsets for each token in the original text. + Output from: + >>> ttc.predict(X, top_k=top_k, explain=True)["offset_mapping"] + Also from: + >>> ttc.tokenizer.tokenize(text, return_offsets_mapping=True)["offset_mapping"] + text (str): The original input text. + + Returns: + np.ndarray: Array of shape (top_k, text_len) containing character-level attributions. + text_len is the number of characters in the original text. + + """ + + if isinstance(text, list): + raise ValueError("text must be a single string, not a list of strings.") + + assert isinstance(text, str), "text must be a string." + + if isinstance(attributions, torch.Tensor): + attributions = attributions.cpu().numpy() + + if attributions.ndim == 1: + attributions = attributions[None, :] + + attributions_per_char = np.empty((attributions.shape[0], len(text))) # top_k, text_len + + for token_idx, (start, end) in enumerate(offsets): + if start == end: + continue + attributions_per_char[:, start:end] = attributions[:, token_idx][:, None] + + return attributions_per_char + + +def map_attributions_to_word(attributions, word_ids): + """ + Maps token-level attributions to word-level attributions based on word IDs. + Args: + attributions (np.ndarray): Array of shape (top_k, seq_len) or (seq_len,) containing token-level attributions. + Output from: + >>> ttc.predict(X, top_k=top_k, explain=True)["attributions"] + word_ids (list of int or None): List of word IDs for each token in the original text. 
+ Output from: + >>> ttc.predict(X, top_k=top_k, explain=True)["word_ids"] + + Returns: + np.ndarray: Array of shape (top_k, num_words) containing word-level attributions. + num_words is the number of unique words in the original text. + """ + + word_ids = np.array(word_ids) + + # Convert None to -1 for easier processing (PAD tokens) + word_ids_int = np.array([x if x is not None else -1 for x in word_ids], dtype=int) + + # Consider only tokens that belong to actual words (non-PAD) + unique_word_ids = np.unique(word_ids_int) + unique_word_ids = unique_word_ids[unique_word_ids != -1] + + top_k = attributions.shape[0] + attr_with_word_id = np.concat( + (attributions[:, :, None], np.tile(word_ids_int[None, :], reps=(top_k, 1))[:, :, None]), + axis=-1, + ) # top_k, seq_len, 2 + # last dim is 2: 0 is the attribution of the token, 1 is the word_id the token is associated to + + word_attributions = np.zeros((top_k, len(word_ids_int))) + for word_id in unique_word_ids: + mask = attr_with_word_id[:, :, 1] == word_id # top_k, seq_len + word_attributions[:, word_id] = (attr_with_word_id[:, :, 0] * mask).sum( + axis=1 + ) # zero-out non-matching tokens and sum attributions for all tokens belonging to the same word + + return word_attributions + + +def plot_attributions_at_char(text, attributions_per_char, title="Attributions", figsize=(10, 2)): + """ + Plots character-level attributions as a heatmap. + Args: + text (str): The original input text. + attributions_per_char (np.ndarray): Array of shape (top_k, text_len) containing character-level attributions. + Output from map_attributions_to_char function. + title (str): Title of the plot. + figsize (tuple): Figure size for the plot. + """ + + if not HAS_PYPLOT: + raise ImportError( + "matplotlib is required for plotting. Please install it to use this function." + ) + + plt.figure(figsize=figsize) + plt.imshow(attributions_per_char, aspect="auto", cmap="viridis") + plt.colorbar(label="Attribution Score") + plt.yticks( + ticks=np.arange(attributions_per_char.shape[0]), + labels=[f"Top {i+1}" for i in range(attributions_per_char.shape[0])], + ) + plt.xticks(ticks=np.arange(len(text)), labels=list(text), rotation=90) + plt.title(title) + plt.xlabel("Characters in Text") + plt.ylabel("Top Predictions") + plt.tight_layout() + plt.show() + + +def plot_attributions_at_word(text, attributions_per_word, title="Attributions", figsize=(10, 2)): + """ + Plots word-level attributions as a heatmap. + Args: + text (str): The original input text. + attributions_per_word (np.ndarray): Array of shape (top_k, num_words) containing word-level attributions. + Output from map_attributions_to_word function. + title (str): Title of the plot. + figsize (tuple): Figure size for the plot. + """ + + if not HAS_PYPLOT: + raise ImportError( + "matplotlib is required for plotting. Please install it to use this function." 
+        )
+
+    words = text.split()
+    plt.figure(figsize=figsize)
+    plt.imshow(attributions_per_word, aspect="auto", cmap="viridis")
+    plt.colorbar(label="Attribution Score")
+    plt.yticks(
+        ticks=np.arange(attributions_per_word.shape[0]),
+        labels=[f"Top {i+1}" for i in range(attributions_per_word.shape[0])],
+    )
+    plt.xticks(ticks=np.arange(len(words)), labels=words, rotation=90)
+    plt.title(title)
+    plt.xlabel("Words in Text")
+    plt.ylabel("Top Predictions")
+    plt.tight_layout()
+    plt.show()

From 0262109d49b86007d934a4b074dca57535f154ba Mon Sep 17 00:00:00 2001
From: meilame-tayebjee
Date: Fri, 7 Nov 2025 15:46:32 +0000
Subject: [PATCH 31/66] chore: remove useless file

---
 torchTextClassifiers/utilities/checkers.py | 17 -----------------
 1 file changed, 17 deletions(-)
 delete mode 100644 torchTextClassifiers/utilities/checkers.py

diff --git a/torchTextClassifiers/utilities/checkers.py b/torchTextClassifiers/utilities/checkers.py
deleted file mode 100644
index 7474597..0000000
--- a/torchTextClassifiers/utilities/checkers.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import json
-import logging
-
-import numpy as np
-
-logger = logging.getLogger(__name__)
-
-
-class NumpyJSONEncoder(json.JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, np.integer):
-            return int(obj)
-        if isinstance(obj, np.floating):
-            return float(obj)
-        if isinstance(obj, np.ndarray):
-            return obj.tolist()
-        return super().default(obj)

From 6bdb750e1bc3b21b6d0dd477915d4970b30b4487 Mon Sep 17 00:00:00 2001
From: Meilame Tayebjee <114609737+meilame-tayebjee@users.noreply.github.com>
Date: Mon, 10 Nov 2025 10:41:44 +0100
Subject: [PATCH 32/66] fix: typo in trainer_params max_epochs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Cédric Couralet
---
 torchTextClassifiers/torchTextClassifiers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchTextClassifiers/torchTextClassifiers.py b/torchTextClassifiers/torchTextClassifiers.py
index 974c067..55b4d27 100644
--- a/torchTextClassifiers/torchTextClassifiers.py
+++ b/torchTextClassifiers/torchTextClassifiers.py
@@ -300,7 +300,7 @@ def train(

         trainer_params = {
             "callbacks": callbacks,
-            "max_epochs": training_config.batch_size,
+            "max_epochs": training_config.num_epochs,
             "num_sanity_val_steps": 2,
             "strategy": "auto",
             "log_every_n_steps": 1,

From 830a45cadd9cd6d7e28ab7e44d17e9caa277b843 Mon Sep 17 00:00:00 2001
From: meilame-tayebjee
Date: Mon, 10 Nov 2025 10:56:12 +0000
Subject: [PATCH 33/66] feat!(tokenizer): ensure output is consistent across all tokenizers

---
 torchTextClassifiers/dataset/dataset.py      |  4 +-
 torchTextClassifiers/tokenizers/__init__.py  |  5 +-
 torchTextClassifiers/tokenizers/base.py      | 68 +++++++++++++++++++-
 torchTextClassifiers/torchTextClassifiers.py | 17 +++--
 4 files changed, 79 insertions(+), 15 deletions(-)

diff --git a/torchTextClassifiers/dataset/dataset.py b/torchTextClassifiers/dataset/dataset.py
index 90d34dc..460ffea 100644
--- a/torchTextClassifiers/dataset/dataset.py
+++ b/torchTextClassifiers/dataset/dataset.py
@@ -81,8 +81,8 @@ def collate_fn(self, batch):
             categorical_tensors = None

         return {
-            "input_ids": tokenize_output["input_ids"],
-            "attention_mask": tokenize_output["attention_mask"],
+            "input_ids": tokenize_output.input_ids,
+            "attention_mask": tokenize_output.attention_mask,
             "categorical_vars": categorical_tensors,
             "labels": labels_tensor,
         }
diff --git a/torchTextClassifiers/tokenizers/__init__.py b/torchTextClassifiers/tokenizers/__init__.py
index 5fd489f..476f879 
100644 --- a/torchTextClassifiers/tokenizers/__init__.py +++ b/torchTextClassifiers/tokenizers/__init__.py @@ -1,10 +1,9 @@ from .base import ( HAS_HF as HAS_HF, ) -from .base import ( - BaseTokenizer as BaseTokenizer, -) +from .base import BaseTokenizer as BaseTokenizer from .base import ( HuggingFaceTokenizer as HuggingFaceTokenizer, ) +from .base import TokenizerOutput as TokenizerOutput from .WordPiece import WordPieceTokenizer as WordPieceTokenizer diff --git a/torchTextClassifiers/tokenizers/base.py b/torchTextClassifiers/tokenizers/base.py index eec709a..d700b78 100644 --- a/torchTextClassifiers/tokenizers/base.py +++ b/torchTextClassifiers/tokenizers/base.py @@ -1,5 +1,9 @@ from abc import ABC, abstractmethod -from typing import List, Optional, Union +from dataclasses import asdict, dataclass +from typing import Any, Dict, List, Optional, Union + +import numpy as np +import torch try: from tokenizers import Tokenizer @@ -10,6 +14,55 @@ HAS_HF = False +@dataclass +class TokenizerOutput: + input_ids: torch.Tensor # shape: (batch_size, seq_len) + attention_mask: torch.Tensor # shape: (batch_size, seq_len) + offset_mapping: Optional[torch.Tensor] = None # shape: (batch_size, seq_len, 2) + word_ids: Optional[np.ndarray] = None # shape: (batch_size, seq_len) + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "TokenizerOutput": + return cls(**data) + + def __post_init__(self): + # --- Basic type checks --- + if not isinstance(self.input_ids, torch.Tensor): + raise TypeError(f"token_ids must be a torch.Tensor, got {type(self.input_ids)}") + if not isinstance(self.attention_mask, torch.Tensor): + raise TypeError( + f"attention_mask must be a torch.Tensor, got {type(self.attention_mask)}" + ) + if self.offset_mapping is not None and not isinstance(self.offset_mapping, torch.Tensor): + raise TypeError( + f"offset_mapping must be a torch.Tensor or None, got {type(self.offset_mapping)}" + ) + if self.word_ids is not None and not isinstance(self.word_ids, np.ndarray): + raise TypeError(f"word_ids must be a numpy.ndarray or None, got {type(self.word_ids)}") + + # --- Shape consistency checks --- + if self.input_ids.shape != self.attention_mask.shape: + raise ValueError( + f"Shape mismatch: token_ids {self.token_ids.shape} and attention_mask {self.attention_mask.shape}" + ) + + if self.offset_mapping is not None: + expected_shape = (*self.input_ids.shape, 2) + if self.offset_mapping.shape != expected_shape: + raise ValueError( + f"offset_mapping should have shape {expected_shape}, got {self.offset_mapping.shape}" + ) + + if self.word_ids is not None: + if self.word_ids.shape != self.input_ids.shape: + raise ValueError( + f"word_ids should have shape {self.input_ids.shape}, got {self.word_ids.shape}" + ) + + class BaseTokenizer(ABC): def __init__( self, vocab_size: int, output_vectorized: bool = False, output_dim: Optional[int] = None @@ -32,7 +85,7 @@ def __init__( ) @abstractmethod - def tokenize(self, text: Union[str, List[str]]) -> list: + def tokenize(self, text: Union[str, List[str]]) -> TokenizerOutput: """Tokenizes the raw input text into a list of tokens.""" pass @@ -72,7 +125,7 @@ def tokenize( # Pad to longest sequence if no output_dim is specified padding = True if self.output_dim is None else "max_length" - return self.tokenizer( + tokenize_output = self.tokenizer( text, padding=padding, return_tensors="pt", @@ -80,6 +133,15 @@ def tokenize( return_offsets_mapping=return_offsets_mapping, ) # method from 
PreTrainedTokenizerFast + encoded_text = tokenize_output["input_ids"] + + return TokenizerOutput( + input_ids=encoded_text, + attention_mask=tokenize_output["attention_mask"], + offset_mapping=tokenize_output.get("offset_mapping", None), + word_ids=np.array([tokenize_output.word_ids(i) for i in range(len(encoded_text))]), + ) + @classmethod def load_from_pretrained(cls, tokenizer_name: str, output_dim: Optional[int] = None): tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) diff --git a/torchTextClassifiers/torchTextClassifiers.py b/torchTextClassifiers/torchTextClassifiers.py index 55b4d27..0802157 100644 --- a/torchTextClassifiers/torchTextClassifiers.py +++ b/torchTextClassifiers/torchTextClassifiers.py @@ -28,7 +28,7 @@ ClassificationHead, TextEmbedder, ) -from torchTextClassifiers.tokenizers import BaseTokenizer +from torchTextClassifiers.tokenizers import BaseTokenizer, TokenizerOutput logger = logging.getLogger(__name__) @@ -474,8 +474,13 @@ def predict( text.tolist(), return_offsets_mapping=return_offsets_mapping ) - encoded_text = tokenize_output["input_ids"] # (batch_size, seq_len) - attention_mask = tokenize_output["attention_mask"] # (batch_size, seq_len) + if not isinstance(tokenize_output, TokenizerOutput): + raise TypeError( + f"Expected TokenizerOutput, got {type(tokenize_output)} from tokenizer.tokenize method." + ) + + encoded_text = tokenize_output.input_ids # (batch_size, seq_len) + attention_mask = tokenize_output.attention_mask # (batch_size, seq_len) if categorical_variables is not None: categorical_vars = torch.tensor( @@ -511,10 +516,8 @@ def predict( "prediction": predictions, "confidence": confidence, "attributions": all_attributions, - "offset_mapping": tokenize_output.get("offset_mapping", None), - "word_ids": np.array( - [tokenize_output.word_ids(i) for i in range(len(encoded_text))] - ), + "offset_mapping": tokenize_output.offset_mapping, + "word_ids": tokenize_output.word_ids, } else: return { From c7307f551708f3f0246ac3df35cd1c0144885832 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Mon, 10 Nov 2025 11:02:59 +0000 Subject: [PATCH 34/66] fix: move hf-dep to optional dependencies so that uv sync --all-extras installs it, otherwise the CI breaks --- pyproject.toml | 7 ++++--- uv.lock | 20 +++++++++----------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 60c7fd8..d3426a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,15 +42,16 @@ docs = [ "myst-parser>=0.18.0", "sphinx-design>=0.3.0" ] + +[project.optional-dependencies] +explainability = ["unidecode", "nltk", "captum"] +preprocess = ["unidecode", "nltk"] hf-dep = [ "tokenizers>=0.22.1", "transformers>=4.57.1", "datasets>=4.3.0", ] -[project.optional-dependencies] -explainability = ["unidecode", "nltk", "captum"] -preprocess = ["unidecode", "nltk"] [build-system] requires = ["uv_build>=0.8.3,<0.9.0"] diff --git a/uv.lock b/uv.lock index 33cc712..ac254fc 100644 --- a/uv.lock +++ b/uv.lock @@ -2058,6 +2058,11 @@ explainability = [ { name = "nltk" }, { name = "unidecode" }, ] +hf-dep = [ + { name = "datasets" }, + { name = "tokenizers" }, + { name = "transformers" }, +] preprocess = [ { name = "nltk" }, { name = "unidecode" }, @@ -2084,23 +2089,21 @@ docs = [ { name = "sphinx-rtd-theme" }, { name = "sphinxcontrib-napoleon" }, ] -hf-dep = [ - { name = "datasets" }, - { name = "tokenizers" }, - { name = "transformers" }, -] [package.metadata] requires-dist = [ { name = "captum", marker = "extra == 'explainability'" }, + { name = 
"datasets", marker = "extra == 'hf-dep'", specifier = ">=4.3.0" }, { name = "nltk", marker = "extra == 'explainability'" }, { name = "nltk", marker = "extra == 'preprocess'" }, { name = "numpy", specifier = ">=1.26.4" }, { name = "pytorch-lightning", specifier = ">=2.4.0" }, + { name = "tokenizers", marker = "extra == 'hf-dep'", specifier = ">=0.22.1" }, + { name = "transformers", marker = "extra == 'hf-dep'", specifier = ">=4.57.1" }, { name = "unidecode", marker = "extra == 'explainability'" }, { name = "unidecode", marker = "extra == 'preprocess'" }, ] -provides-extras = ["explainability", "preprocess"] +provides-extras = ["explainability", "preprocess", "hf-dep"] [package.metadata.requires-dev] dev = [ @@ -2123,11 +2126,6 @@ docs = [ { name = "sphinx-rtd-theme", specifier = ">=1.2.0" }, { name = "sphinxcontrib-napoleon", specifier = ">=0.7" }, ] -hf-dep = [ - { name = "datasets", specifier = ">=4.3.0" }, - { name = "tokenizers", specifier = ">=0.22.1" }, - { name = "transformers", specifier = ">=4.57.1" }, -] [[package]] name = "tqdm" From 934b041f580b5fca5429203207bea02efb0855f4 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Mon, 10 Nov 2025 17:28:37 +0000 Subject: [PATCH 35/66] feat!(attention): enable attention logic from nanochat attention_config to be passed to model_config init weights etc also update _get_sentence_embedding --- .../model/components/__init__.py | 4 + .../model/components/attention.py | 126 +++++++++++++++ .../model/components/text_embedder.py | 153 +++++++++++++++++- torchTextClassifiers/model/model.py | 4 + torchTextClassifiers/torchTextClassifiers.py | 9 +- 5 files changed, 288 insertions(+), 8 deletions(-) create mode 100644 torchTextClassifiers/model/components/attention.py diff --git a/torchTextClassifiers/model/components/__init__.py b/torchTextClassifiers/model/components/__init__.py index ad62cad..b14af0e 100644 --- a/torchTextClassifiers/model/components/__init__.py +++ b/torchTextClassifiers/model/components/__init__.py @@ -1,3 +1,6 @@ +from .attention import ( + AttentionConfig as AttentionConfig, +) from .categorical_var_net import ( CategoricalForwardType as CategoricalForwardType, ) @@ -6,3 +9,4 @@ ) from .classification_head import ClassificationHead as ClassificationHead from .text_embedder import TextEmbedder as TextEmbedder +from .text_embedder import TextEmbedderConfig as TextEmbedderConfig diff --git a/torchTextClassifiers/model/components/attention.py b/torchTextClassifiers/model/components/attention.py new file mode 100644 index 0000000..7c6474c --- /dev/null +++ b/torchTextClassifiers/model/components/attention.py @@ -0,0 +1,126 @@ +"""Largely inspired from Andrej Karpathy's nanochat, see here https://github.com/karpathy/nanochat/blob/master/nanochat/gpt.py""" + +from dataclasses import dataclass +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F + +### Some utils used in text_embedder.py for the attention blocks ### + + +def apply_rotary_emb(x, cos, sin): + assert x.ndim == 4 # multihead attention + + d = x.shape[3] // 2 + x1, x2 = x[..., :d], x[..., d:] # split up last time into two halves + y1 = x1 * cos + x2 * sin # rotate pairs of dims + y2 = x1 * (-sin) + x2 * cos + out = torch.cat([y1, y2], 3) # re-assemble + out = out.to(x.dtype) # ensure input/output dtypes match + return out + + +def norm(x): + # Purely functional rmsnorm with no learnable params + return F.rms_norm(x, (x.size(-1),)) + + +#### Config ##### +@dataclass +class AttentionConfig: + n_layers: int + n_head: int + 
n_kv_head: int + sequence_len: Optional[int] = None + positional_encoding: bool = True + aggregation_method: str = "mean" # or 'last', or 'first' + + +#### Attention Block ##### + +# Composed of SelfAttentionLayer and MLP with residual connections + + +class Block(nn.Module): + def __init__(self, config: AttentionConfig, layer_idx: int): + super().__init__() + + self.layer_idx = layer_idx + self.attn = SelfAttentionLayer(config, layer_idx) + self.mlp = MLP(config) + + def forward(self, x, cos_sin): + x = x + self.attn(norm(x), cos_sin) + x = x + self.mlp(norm(x)) + return x + + +##### Components of the Block ##### + + +class SelfAttentionLayer(nn.Module): + def __init__(self, config: AttentionConfig, layer_idx): + super().__init__() + self.layer_idx = layer_idx + self.n_head = config.n_head + self.n_kv_head = config.n_kv_head + self.enable_gqa = ( + self.n_head != self.n_kv_head + ) # Group Query Attention (GQA): duplicate key/value heads to match query heads if desired + self.n_embd = config.n_embd + self.head_dim = self.n_embd // self.n_head + assert self.n_embd % self.n_head == 0 + assert self.n_kv_head <= self.n_head and self.n_head % self.n_kv_head == 0 + self.c_q = nn.Linear(self.n_embd, self.n_head * self.head_dim, bias=False) + self.c_k = nn.Linear(self.n_embd, self.n_kv_head * self.head_dim, bias=False) + self.c_v = nn.Linear(self.n_embd, self.n_kv_head * self.head_dim, bias=False) + self.c_proj = nn.Linear(self.n_embd, self.n_embd, bias=False) + + self.apply_positional_encoding = config.positional_encoding + + def forward(self, x, cos_sin=None): + B, T, C = x.size() + + # Project the input to get queries, keys, and values + q = self.c_q(x).view(B, T, self.n_head, self.head_dim) + k = self.c_k(x).view(B, T, self.n_kv_head, self.head_dim) + v = self.c_v(x).view(B, T, self.n_kv_head, self.head_dim) + + if self.apply_positional_encoding: + assert cos_sin is not None, "Rotary embeddings require precomputed cos/sin tensors" + cos, sin = cos_sin + q, k = ( + apply_rotary_emb(q, cos, sin), + apply_rotary_emb(k, cos, sin), + ) # QK rotary embedding + + q, k = norm(q), norm(k) # QK norm + q, k, v = ( + q.transpose(1, 2), + k.transpose(1, 2), + v.transpose(1, 2), + ) # make head be batch dim, i.e. 
(B, T, H, D) -> (B, H, T, D) + + # is_causal=False for non-autoregressive models (BERT-like) + y = F.scaled_dot_product_attention(q, k, v, is_causal=False, enable_gqa=self.enable_gqa) + + # Re-assemble the heads side by side and project back to residual stream + y = y.transpose(1, 2).contiguous().view(B, T, -1) + y = self.c_proj(y) + + return y + + +class MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=False) + self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=False) + + def forward(self, x): + x = self.c_fc(x) + x = F.relu(x).square() + x = self.c_proj(x) + return x diff --git a/torchTextClassifiers/model/components/text_embedder.py b/torchTextClassifiers/model/components/text_embedder.py index 17df9f7..e9703b7 100644 --- a/torchTextClassifiers/model/components/text_embedder.py +++ b/torchTextClassifiers/model/components/text_embedder.py @@ -1,21 +1,107 @@ +import math +from dataclasses import dataclass +from typing import Optional + import torch from torch import nn +from torchTextClassifiers.model.components.attention import AttentionConfig, Block, norm + + +@dataclass +class TextEmbedderConfig: + vocab_size: int + embedding_dim: int + padding_idx: int + attention_config: Optional[AttentionConfig] = None + class TextEmbedder(nn.Module): - def __init__(self, vocab_size: int, embedding_dim: int, padding_idx: int): + def __init__(self, text_embedder_config: TextEmbedderConfig): super().__init__() - self.vocab_size = vocab_size - self.embedding_dim = embedding_dim - self.padding_idx = padding_idx + self.config = text_embedder_config + + self.attention_config = text_embedder_config.attention_config + if self.attention_config is not None: + self.attention_config.n_embd = text_embedder_config.embedding_dim + + self.vocab_size = text_embedder_config.vocab_size + self.embedding_dim = text_embedder_config.embedding_dim + self.padding_idx = text_embedder_config.padding_idx self.embedding_layer = nn.Embedding( - embedding_dim=embedding_dim, - num_embeddings=vocab_size, + embedding_dim=self.embedding_dim, + num_embeddings=self.vocab_size, padding_idx=self.padding_idx, ) + if self.attention_config is not None: + self.transformer = nn.ModuleDict( + { + "h": nn.ModuleList( + [ + Block(self.attention_config, layer_idx) + for layer_idx in range(self.attention_config.n_layers) + ] + ), + } + ) + + head_dim = self.attention_config.n_embd // self.attention_config.n_head + + if head_dim * self.attention_config.n_head != self.attention_config.n_embd: + raise ValueError("embedding_dim must be divisible by n_head.") + + if self.attention_config.positional_encoding: + if head_dim % 2 != 0: + raise ValueError( + "embedding_dim / n_head must be even for rotary positional embeddings." + ) + + if self.attention_config.sequence_len is None: + raise ValueError( + "sequence_len must be specified in AttentionConfig when positional_encoding is True." 
+ ) + + self.rotary_seq_len = self.attention_config.sequence_len * 10 + cos, sin = self._precompute_rotary_embeddings( + seq_len=self.rotary_seq_len, head_dim=head_dim + ) + + self.register_buffer( + "cos", cos, persistent=False + ) # persistent=False means it's not saved to the checkpoint + self.register_buffer("sin", sin, persistent=False) + + def init_weights(self): + self.apply(self._init_weights) + + # zero out c_proj weights in all blocks + if self.attention_config is not None: + for block in self.transformer.h: + torch.nn.init.zeros_(block.mlp.c_proj.weight) + torch.nn.init.zeros_(block.attn.c_proj.weight) + # init the rotary embeddings + head_dim = self.attention_config.n_embd // self.attention_config.n_head + cos, sin = self._precompute_rotary_embeddings(self.rotary_seq_len, head_dim) + self.cos, self.sin = cos, sin + # Cast the embeddings from fp32 to bf16: optim can tolerate it and it saves memory: both in the model and the activations + if self.embedding_layer.weight.device.type == "cuda": + self.embedding_layer.to(dtype=torch.bfloat16) + + def _init_weights(self, module): + if isinstance(module, nn.Linear): + # https://arxiv.org/pdf/2310.17813 + fan_out = module.weight.size(0) + fan_in = module.weight.size(1) + std = 1.0 / math.sqrt(fan_in) * min(1.0, math.sqrt(fan_out / fan_in)) + torch.nn.init.normal_(module.weight, mean=0.0, std=std) + if module.bias is not None: + torch.nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + torch.nn.init.normal_(module.weight, mean=0.0, std=1.0) + def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: """Converts input token IDs to their corresponding embeddings.""" @@ -36,6 +122,19 @@ def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torc encoded_text ) # (batch_size, seq_len, embedding_dim) + token_embeddings = norm(token_embeddings) + + if self.attention_config is not None: + if self.attention_config.positional_encoding: + cos_sin = self.cos[:, :seq_len], self.sin[:, :seq_len] + else: + cos_sin = None + + for block in self.transformer.h: + token_embeddings = block(token_embeddings, cos_sin) + + token_embeddings = norm(token_embeddings) + text_embedding = self._get_sentence_embedding( token_embeddings=token_embeddings, attention_mask=attention_mask ) @@ -57,9 +156,28 @@ def _get_sentence_embedding( # average over non-pad token embeddings # attention mask has 1 for non-pad tokens and 0 for pad token positions - # TODO: add attention logic at some point # mask pad-tokens + + if self.attention_config is not None: + if self.attention_config.aggregation_method is not None: + if self.attention_config.aggregation_method == "first": + return token_embeddings[:, 0, :] + elif self.attention_config.aggregation_method == "last": + lengths = attention_mask.sum(dim=1).clamp(min=1) # last non-pad token index + 1 + return token_embeddings[ + torch.arange(token_embeddings.size(0)), + lengths - 1, + :, + ] + else: + if self.attention_config.aggregation_method != "mean": + raise ValueError( + f"Unknown aggregation method: {self.attention_config.aggregation_method}. Supported methods are 'mean', 'first', 'last'." 
+ ) + + assert self.attention_config is None or self.attention_config.aggregation_method == "mean" + mask = attention_mask.unsqueeze(-1).float() # (batch_size, seq_len, 1) masked_embeddings = token_embeddings * mask # (batch_size, seq_len, embedding_dim) @@ -79,3 +197,24 @@ def __call__(self, *args, **kwargs): f"(got shape {tuple(out.shape)})" ) return out + + def _precompute_rotary_embeddings(self, seq_len, head_dim, base=10000, device=None): + # autodetect the device from model embeddings + if device is None: + device = next(self.parameters()).device + + # stride the channels + channel_range = torch.arange(0, head_dim, 2, dtype=torch.float32, device=device) + inv_freq = 1.0 / (base ** (channel_range / head_dim)) + # stride the time steps + t = torch.arange(seq_len, dtype=torch.float32, device=device) + # calculate the rotation frequencies at each (time, channel) pair + freqs = torch.outer(t, inv_freq) + cos, sin = freqs.cos(), freqs.sin() + cos, sin = cos.bfloat16(), sin.bfloat16() # keep them in bfloat16 + cos, sin = ( + cos[None, :, None, :], + sin[None, :, None, :], + ) # add batch and head dims for later broadcasting + + return cos, sin diff --git a/torchTextClassifiers/model/model.py b/torchTextClassifiers/model/model.py index e1f0715..d9cffbf 100644 --- a/torchTextClassifiers/model/model.py +++ b/torchTextClassifiers/model/model.py @@ -69,6 +69,10 @@ def __init__( self.num_classes = self.classification_head.num_classes + torch.nn.init.zeros_(self.classification_head.net.weight) + if self.text_embedder is not None: + self.text_embedder.init_weights() + def _validate_component_connections(self): def _check_text_categorical_connection(self, text_embedder, cat_var_net): if cat_var_net.forward_type == CategoricalForwardType.SUM_TO_TEXT: diff --git a/torchTextClassifiers/torchTextClassifiers.py b/torchTextClassifiers/torchTextClassifiers.py index 0802157..a15026d 100644 --- a/torchTextClassifiers/torchTextClassifiers.py +++ b/torchTextClassifiers/torchTextClassifiers.py @@ -23,10 +23,12 @@ from torchTextClassifiers.dataset import TextClassificationDataset from torchTextClassifiers.model import TextClassificationModel, TextClassificationModule from torchTextClassifiers.model.components import ( + AttentionConfig, CategoricalForwardType, CategoricalVariableNet, ClassificationHead, TextEmbedder, + TextEmbedderConfig, ) from torchTextClassifiers.tokenizers import BaseTokenizer, TokenizerOutput @@ -48,6 +50,7 @@ class ModelConfig: categorical_vocabulary_sizes: Optional[List[int]] = None categorical_embedding_dims: Optional[Union[List[int], int]] = None num_classes: Optional[int] = None + attention_config: Optional[AttentionConfig] = None def to_dict(self) -> Dict[str, Any]: return asdict(self) @@ -139,10 +142,14 @@ def __init__( ) self.embedding_dim = self.tokenizer.output_dim else: - self.text_embedder = TextEmbedder( + text_embedder_config = TextEmbedderConfig( vocab_size=self.vocab_size, embedding_dim=self.embedding_dim, padding_idx=tokenizer.padding_idx, + attention_config=model_config.attention_config, + ) + self.text_embedder = TextEmbedder( + text_embedder_config=text_embedder_config, ) classif_head_input_dim = self.embedding_dim From 5e150b22d66def21790de72595615ed90829bfa5 Mon Sep 17 00:00:00 2001 From: Meilame Tayebjee <114609737+meilame-tayebjee@users.noreply.github.com> Date: Mon, 10 Nov 2025 18:32:57 +0100 Subject: [PATCH 36/66] fix: check if categorical var are present before checking their arrays MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Co-authored-by: Cédric Couralet --- torchTextClassifiers/torchTextClassifiers.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/torchTextClassifiers/torchTextClassifiers.py b/torchTextClassifiers/torchTextClassifiers.py index a15026d..dd93602 100644 --- a/torchTextClassifiers/torchTextClassifiers.py +++ b/torchTextClassifiers/torchTextClassifiers.py @@ -227,7 +227,12 @@ def train( X_train["categorical_variables"].ndim > 1 and X_train["categorical_variables"].shape[1] == X_val["categorical_variables"].shape[1] or X_val["categorical_variables"].ndim == 1 - ), "X_train and X_val must have the same number of columns." + if X_train["categorical_variables"] is not None and X_val["categorical_variables"] is not None: + assert ( + X_train["categorical_variables"].ndim > 1 + and X_train["categorical_variables"].shape[1] == X_val["categorical_variables"].shape[1] + or X_val["categorical_variables"].ndim == 1 + ), "X_train and X_val must have the same number of columns." if verbose: logger.info("Starting training process...") From 162e2962f3ab2295e0089fbfcb89831aa9f046e1 Mon Sep 17 00:00:00 2001 From: Meilame Tayebjee <114609737+meilame-tayebjee@users.noreply.github.com> Date: Mon, 10 Nov 2025 18:36:00 +0100 Subject: [PATCH 37/66] fix: no persistent_workers if num_workers=0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Cédric Couralet --- torchTextClassifiers/dataset/dataset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torchTextClassifiers/dataset/dataset.py b/torchTextClassifiers/dataset/dataset.py index 460ffea..9461dc0 100644 --- a/torchTextClassifiers/dataset/dataset.py +++ b/torchTextClassifiers/dataset/dataset.py @@ -96,7 +96,10 @@ def create_dataloader( pin_memory: bool = False, persistent_workers: bool = True, **kwargs, - ) -> torch.utils.data.DataLoader: + # persistent_workers requires num_workers > 0 + if num_workers == 0: + persistent_workers = False + return DataLoader( dataset=self, batch_size=batch_size, From 1591bd9ce19748ded4d3a1a1a44619d5b7187ae1 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Wed, 12 Nov 2025 12:08:37 +0000 Subject: [PATCH 38/66] fix: closing parenthesis --- torchTextClassifiers/torchTextClassifiers.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/torchTextClassifiers/torchTextClassifiers.py b/torchTextClassifiers/torchTextClassifiers.py index dd93602..c747c5a 100644 --- a/torchTextClassifiers/torchTextClassifiers.py +++ b/torchTextClassifiers/torchTextClassifiers.py @@ -227,10 +227,16 @@ def train( X_train["categorical_variables"].ndim > 1 and X_train["categorical_variables"].shape[1] == X_val["categorical_variables"].shape[1] or X_val["categorical_variables"].ndim == 1 - if X_train["categorical_variables"] is not None and X_val["categorical_variables"] is not None: + ) + + if ( + X_train["categorical_variables"] is not None + and X_val["categorical_variables"] is not None + ): assert ( X_train["categorical_variables"].ndim > 1 - and X_train["categorical_variables"].shape[1] == X_val["categorical_variables"].shape[1] + and X_train["categorical_variables"].shape[1] + == X_val["categorical_variables"].shape[1] or X_val["categorical_variables"].ndim == 1 ), "X_train and X_val must have the same number of columns." 
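
For reference, the validation that patches 36 and 38 converge on, written as a
standalone sketch (the helper name is hypothetical; the inputs are the
`categorical_variables` arrays of X_train and X_val, each a numpy array or None):

    def check_categorical_columns(train_cats, val_cats):
        # Nothing to compare when either split carries no categorical variables
        if train_cats is None or val_cats is None:
            return
        # Both splits must expose the same number of categorical columns
        assert (
            train_cats.ndim > 1
            and train_cats.shape[1] == val_cats.shape[1]
            or val_cats.ndim == 1
        ), "X_train and X_val must have the same number of columns."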
From 1af9e531d6977246258b2dba2cc6e35eb2f4a3c1 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Wed, 12 Nov 2025 12:54:43 +0000 Subject: [PATCH 39/66] fix: truncation=True is needed --- torchTextClassifiers/dataset/dataset.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/torchTextClassifiers/dataset/dataset.py b/torchTextClassifiers/dataset/dataset.py index 9461dc0..34fe24e 100644 --- a/torchTextClassifiers/dataset/dataset.py +++ b/torchTextClassifiers/dataset/dataset.py @@ -39,7 +39,7 @@ def __len__(self): def __getitem__(self, idx): if self.labels is not None: return ( - self.texts[idx], + str(self.texts[idx]), ( self.categorical_variables[idx] if self.categorical_variables is not None @@ -49,7 +49,7 @@ def __getitem__(self, idx): ) else: return ( - self.texts[idx], + str(self.texts[idx]), ( self.categorical_variables[idx] if self.categorical_variables is not None @@ -61,12 +61,13 @@ def __getitem__(self, idx): def collate_fn(self, batch): text, *categorical_vars, y = zip(*batch) + print(text) if self.labels is not None: labels_tensor = torch.tensor(y, dtype=torch.long) else: labels_tensor = None - tokenize_output = self.tokenizer.tokenize(text) + tokenize_output = self.tokenizer.tokenize(list(text)) if self.categorical_variables is not None: categorical_tensors = torch.stack( @@ -96,6 +97,7 @@ def create_dataloader( pin_memory: bool = False, persistent_workers: bool = True, **kwargs, + ): # persistent_workers requires num_workers > 0 if num_workers == 0: persistent_workers = False From 4ca1807477664444c6aa1f49c88d842ff3c39857 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Wed, 12 Nov 2025 12:54:58 +0000 Subject: [PATCH 40/66] add ipywidgets --- pyproject.toml | 1 + uv.lock | 213 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 214 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 79438a7..a9ca049 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ dev = [ "pyarrow", "pre-commit>=4.3.0", "ruff>=0.14.3", + "ipywidgets>=8.1.8", ] docs = [ "sphinx>=5.0.0", diff --git a/uv.lock b/uv.lock index ac254fc..f65812b 100644 --- a/uv.lock +++ b/uv.lock @@ -118,6 +118,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/15/b3/9b1a8074496371342ec1e796a96f99c82c945a339cd81a8e73de28b4cf9e/anyio-4.11.0-py3-none-any.whl", hash = "sha256:0287e96f4d26d4149305414d4e3bc32f0dcd0862365a4bddea19d7a1ec38c4fc", size = 109097, upload-time = "2025-09-23T09:19:10.601Z" }, ] +[[package]] +name = "asttokens" +version = "3.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4a/e7/82da0a03e7ba5141f05cce0d302e6eed121ae055e0456ca228bf693984bc/asttokens-3.0.0.tar.gz", hash = "sha256:0dcd8baa8d62b0c1d118b399b2ddba3c4aff271d0d7a9e0d4c1681c79035bbc7", size = 61978, upload-time = "2024-11-30T04:30:14.439Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2", size = 26918, upload-time = "2024-11-30T04:30:10.946Z" }, +] + [[package]] name = "attrs" version = "25.3.0" @@ -238,6 +247,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = 
"2022-10-25T02:36:20.889Z" }, ] +[[package]] +name = "comm" +version = "0.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4c/13/7d740c5849255756bc17888787313b61fd38a0a8304fc4f073dfc46122aa/comm-0.2.3.tar.gz", hash = "sha256:2dc8048c10962d55d7ad693be1e7045d891b7ce8d999c97963a5e3e99c055971", size = 6319, upload-time = "2025-07-25T14:02:04.452Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/60/97/891a0971e1e4a8c5d2b20bbe0e524dc04548d2307fee33cdeba148fd4fc7/comm-0.2.3-py3-none-any.whl", hash = "sha256:c615d91d75f7f04f095b30d1c1711babd43bdc6419c1be9886a85f2f4e489417", size = 7294, upload-time = "2025-07-25T14:02:02.896Z" }, +] + [[package]] name = "contourpy" version = "1.3.2" @@ -326,6 +344,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ca/51/409a8184ed35453d9cbb3d6b20d524b1115c2c2d117b85d5e9b06cd70b45/datasets-4.3.0-py3-none-any.whl", hash = "sha256:0ea157e72138b3ca6c7d2415f19a164ecf7d4c4fa72da2a570da286882e96903", size = 506846, upload-time = "2025-10-23T16:31:49.965Z" }, ] +[[package]] +name = "decorator" +version = "5.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711, upload-time = "2025-02-24T04:41:34.073Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190, upload-time = "2025-02-24T04:41:32.565Z" }, +] + [[package]] name = "dill" version = "0.4.0" @@ -353,6 +380,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408, upload-time = "2024-04-23T18:57:14.835Z" }, ] +[[package]] +name = "executing" +version = "2.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cc/28/c14e053b6762b1044f34a13aab6859bbf40456d37d23aa286ac24cfd9a5d/executing-2.2.1.tar.gz", hash = "sha256:3632cc370565f6648cc328b32435bd120a1e4ebb20c77e3fdde9a13cd1e533c4", size = 1129488, upload-time = "2025-09-01T09:48:10.866Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017", size = 28317, upload-time = "2025-09-01T09:48:08.5Z" }, +] + [[package]] name = "filelock" version = "3.18.0" @@ -593,6 +629,68 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050, upload-time = "2025-03-19T20:10:01.071Z" }, ] +[[package]] +name = "ipython" +version = "9.7.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "decorator" }, + { name = "ipython-pygments-lexers" }, + { name = "jedi" }, + { name = "matplotlib-inline" }, + { name = 
"pexpect", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" }, + { name = "prompt-toolkit" }, + { name = "pygments" }, + { name = "stack-data" }, + { name = "traitlets" }, + { name = "typing-extensions", marker = "python_full_version < '3.12'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/29/e6/48c74d54039241a456add616464ea28c6ebf782e4110d419411b83dae06f/ipython-9.7.0.tar.gz", hash = "sha256:5f6de88c905a566c6a9d6c400a8fed54a638e1f7543d17aae2551133216b1e4e", size = 4422115, upload-time = "2025-11-05T12:18:54.646Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/aa/62893d6a591d337aa59dcc4c6f6c842f1fe20cd72c8c5c1f980255243252/ipython-9.7.0-py3-none-any.whl", hash = "sha256:bce8ac85eb9521adc94e1845b4c03d88365fd6ac2f4908ec4ed1eb1b0a065f9f", size = 618911, upload-time = "2025-11-05T12:18:52.484Z" }, +] + +[[package]] +name = "ipython-pygments-lexers" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ef/4c/5dd1d8af08107f88c7f741ead7a40854b8ac24ddf9ae850afbcf698aa552/ipython_pygments_lexers-1.1.1.tar.gz", hash = "sha256:09c0138009e56b6854f9535736f4171d855c8c08a563a0dcd8022f78355c7e81", size = 8393, upload-time = "2025-01-17T11:24:34.505Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c", size = 8074, upload-time = "2025-01-17T11:24:33.271Z" }, +] + +[[package]] +name = "ipywidgets" +version = "8.1.8" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "comm" }, + { name = "ipython" }, + { name = "jupyterlab-widgets" }, + { name = "traitlets" }, + { name = "widgetsnbextension" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4c/ae/c5ce1edc1afe042eadb445e95b0671b03cee61895264357956e61c0d2ac0/ipywidgets-8.1.8.tar.gz", hash = "sha256:61f969306b95f85fba6b6986b7fe45d73124d1d9e3023a8068710d47a22ea668", size = 116739, upload-time = "2025-11-01T21:18:12.393Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/56/6d/0d9848617b9f753b87f214f1c682592f7ca42de085f564352f10f0843026/ipywidgets-8.1.8-py3-none-any.whl", hash = "sha256:ecaca67aed704a338f88f67b1181b58f821ab5dc89c1f0f5ef99db43c1c2921e", size = 139808, upload-time = "2025-11-01T21:18:10.956Z" }, +] + +[[package]] +name = "jedi" +version = "0.19.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "parso" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/72/3a/79a912fbd4d8dd6fbb02bf69afd3bb72cf0c729bb3063c6f4498603db17a/jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0", size = 1231287, upload-time = "2024-11-11T01:41:42.873Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278, upload-time = "2024-11-11T01:41:40.175Z" }, +] + [[package]] name = "jinja2" version = "3.1.6" @@ -614,6 +712,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7d/4f/1195bbac8e0c2acc5f740661631d8d750dc38d4a32b23ee5df3cde6f4e0d/joblib-1.5.1-py3-none-any.whl", hash = 
"sha256:4719a31f054c7d766948dcd83e9613686b27114f190f717cec7eaa2084f8a74a", size = 307746, upload-time = "2025-05-23T12:04:35.124Z" }, ] +[[package]] +name = "jupyterlab-widgets" +version = "3.0.16" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/26/2d/ef58fed122b268c69c0aa099da20bc67657cdfb2e222688d5731bd5b971d/jupyterlab_widgets-3.0.16.tar.gz", hash = "sha256:423da05071d55cf27a9e602216d35a3a65a3e41cdf9c5d3b643b814ce38c19e0", size = 897423, upload-time = "2025-11-01T21:11:29.724Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/b5/36c712098e6191d1b4e349304ef73a8d06aed77e56ceaac8c0a306c7bda1/jupyterlab_widgets-3.0.16-py3-none-any.whl", hash = "sha256:45fa36d9c6422cf2559198e4db481aa243c7a32d9926b500781c830c80f7ecf8", size = 914926, upload-time = "2025-11-01T21:11:28.008Z" }, +] + [[package]] name = "kiwisolver" version = "1.4.8" @@ -797,6 +904,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1b/92/9a45c91089c3cf690b5badd4be81e392ff086ccca8a1d4e3a08463d8a966/matplotlib-3.10.3-cp313-cp313t-win_amd64.whl", hash = "sha256:4f23ffe95c5667ef8a2b56eea9b53db7f43910fa4a2d5472ae0f72b64deab4d5", size = 8139044, upload-time = "2025-05-08T19:10:44.551Z" }, ] +[[package]] +name = "matplotlib-inline" +version = "0.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c7/74/97e72a36efd4ae2bccb3463284300f8953f199b5ffbc04cbbb0ec78f74b1/matplotlib_inline-0.2.1.tar.gz", hash = "sha256:e1ee949c340d771fc39e241ea75683deb94762c8fa5f2927ec57c83c4dffa9fe", size = 8110, upload-time = "2025-10-23T09:00:22.126Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/af/33/ee4519fa02ed11a94aef9559552f3b17bb863f2ecfe1a35dc7f548cde231/matplotlib_inline-0.2.1-py3-none-any.whl", hash = "sha256:d56ce5156ba6085e00a9d54fead6ed29a9c47e215cd1bba2e976ef39f5710a76", size = 9516, upload-time = "2025-10-23T09:00:20.675Z" }, +] + [[package]] name = "mdit-py-plugins" version = "0.4.2" @@ -1201,6 +1320,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436, upload-time = "2024-09-20T13:09:48.112Z" }, ] +[[package]] +name = "parso" +version = "0.8.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d4/de/53e0bcf53d13e005bd8c92e7855142494f41171b34c2536b86187474184d/parso-0.8.5.tar.gz", hash = "sha256:034d7354a9a018bdce352f48b2a8a450f05e9d6ee85db84764e9b6bd96dafe5a", size = 401205, upload-time = "2025-08-23T15:15:28.028Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/16/32/f8e3c85d1d5250232a5d3477a2a28cc291968ff175caeadaf3cc19ce0e4a/parso-0.8.5-py2.py3-none-any.whl", hash = "sha256:646204b5ee239c396d040b90f9e272e9a8017c630092bf59980beb62fd033887", size = 106668, upload-time = "2025-08-23T15:15:25.663Z" }, +] + +[[package]] +name = "pexpect" +version = "4.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ptyprocess" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450, upload-time = 
"2023-11-25T09:07:26.339Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772, upload-time = "2023-11-25T06:56:14.81Z" }, +] + [[package]] name = "pillow" version = "11.2.1" @@ -1306,6 +1446,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5b/a5/987a405322d78a73b66e39e4a90e4ef156fd7141bf71df987e50717c321b/pre_commit-4.3.0-py2.py3-none-any.whl", hash = "sha256:2b0747ad7e6e967169136edffee14c16e148a778a54e4f967921aa1ebf2308d8", size = 220965, upload-time = "2025-08-09T18:56:13.192Z" }, ] +[[package]] +name = "prompt-toolkit" +version = "3.0.52" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a1/96/06e01a7b38dce6fe1db213e061a4602dd6032a8a97ef6c1a862537732421/prompt_toolkit-3.0.52.tar.gz", hash = "sha256:28cde192929c8e7321de85de1ddbe736f1375148b02f2e17edd840042b1be855", size = 434198, upload-time = "2025-08-27T15:24:02.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl", hash = "sha256:9aac639a3bbd33284347de5ad8d68ecc044b91a762dc39b7c21095fcd6a19955", size = 391431, upload-time = "2025-08-27T15:23:59.498Z" }, +] + [[package]] name = "propcache" version = "0.3.1" @@ -1379,6 +1531,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b8/d3/c3cb8f1d6ae3b37f83e1de806713a9b3642c5895f0215a62e1a4bd6e5e34/propcache-0.3.1-py3-none-any.whl", hash = "sha256:9a8ecf38de50a7f518c21568c80f985e776397b902f1ce0b01f799aba1608b40", size = 12376, upload-time = "2025-03-26T03:06:10.5Z" }, ] +[[package]] +name = "ptyprocess" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762, upload-time = "2020-12-28T15:15:30.155Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993, upload-time = "2020-12-28T15:15:28.35Z" }, +] + +[[package]] +name = "pure-eval" +version = "0.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cd/05/0a34433a064256a578f1783a10da6df098ceaa4a57bbeaa96a6c0352786b/pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42", size = 19752, upload-time = "2024-07-21T12:58:21.801Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" }, +] + [[package]] name = "pyarrow" version = "22.0.0" @@ -1935,6 +2105,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/52/a7/d2782e4e3f77c8450f727ba74a8f12756d5ba823d81b941f1b04da9d033a/sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = 
"sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331", size = 92072, upload-time = "2024-07-29T01:10:08.203Z" }, ] +[[package]] +name = "stack-data" +version = "0.6.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "asttokens" }, + { name = "executing" }, + { name = "pure-eval" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/28/e3/55dcc2cfbc3ca9c29519eb6884dd1415ecb53b0e934862d3559ddcb7e20b/stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", size = 44707, upload-time = "2023-09-30T13:58:05.479Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521, upload-time = "2023-09-30T13:58:03.53Z" }, +] + [[package]] name = "sympy" version = "1.14.0" @@ -2071,6 +2255,7 @@ preprocess = [ [package.dev-dependencies] dev = [ { name = "captum" }, + { name = "ipywidgets" }, { name = "nltk" }, { name = "pandas" }, { name = "pre-commit" }, @@ -2108,6 +2293,7 @@ provides-extras = ["explainability", "preprocess", "hf-dep"] [package.metadata.requires-dev] dev = [ { name = "captum" }, + { name = "ipywidgets", specifier = ">=8.1.8" }, { name = "nltk" }, { name = "pandas" }, { name = "pre-commit", specifier = ">=4.3.0" }, @@ -2139,6 +2325,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" }, ] +[[package]] +name = "traitlets" +version = "5.14.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", size = 161621, upload-time = "2024-04-19T11:11:49.746Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" }, +] + [[package]] name = "transformers" version = "4.57.1" @@ -2224,6 +2419,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/27/73/d9a94da0e9d470a543c1b9d3ccbceb0f59455983088e727b8a1824ed90fb/virtualenv-20.35.3-py3-none-any.whl", hash = "sha256:63d106565078d8c8d0b206d48080f938a8b25361e19432d2c9db40d2899c810a", size = 5981061, upload-time = "2025-10-10T21:23:30.433Z" }, ] +[[package]] +name = "wcwidth" +version = "0.2.14" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/24/30/6b0809f4510673dc723187aeaf24c7f5459922d01e2f794277a3dfb90345/wcwidth-0.2.14.tar.gz", hash = "sha256:4d478375d31bc5395a3c55c40ccdf3354688364cd61c4f6adacaa9215d0b3605", size = 102293, upload-time = "2025-09-22T16:29:53.023Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/af/b5/123f13c975e9f27ab9c0770f514345bd406d0e8d3b7a0723af9d43f710af/wcwidth-0.2.14-py2.py3-none-any.whl", hash = "sha256:a7bb560c8aee30f9957e5f9895805edd20602f2d7f720186dfd906e82b4982e1", size = 37286, upload-time = 
"2025-09-22T16:29:51.641Z" }, +] + +[[package]] +name = "widgetsnbextension" +version = "4.0.15" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bd/f4/c67440c7fb409a71b7404b7aefcd7569a9c0d6bd071299bf4198ae7a5d95/widgetsnbextension-4.0.15.tar.gz", hash = "sha256:de8610639996f1567952d763a5a41af8af37f2575a41f9852a38f947eb82a3b9", size = 1097402, upload-time = "2025-11-01T21:15:55.178Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3f/0e/fa3b193432cfc60c93b42f3be03365f5f909d2b3ea410295cf36df739e31/widgetsnbextension-4.0.15-py3-none-any.whl", hash = "sha256:8156704e4346a571d9ce73b84bee86a29906c9abfd7223b7228a28899ccf3366", size = 2196503, upload-time = "2025-11-01T21:15:53.565Z" }, +] + [[package]] name = "xxhash" version = "3.6.0" From 927a5e7ca46169ae4be3c30bf66cb9dabf491737 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Wed, 12 Nov 2025 12:55:29 +0000 Subject: [PATCH 41/66] fix: check_Y problem of indexes --- torchTextClassifiers/torchTextClassifiers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torchTextClassifiers/torchTextClassifiers.py b/torchTextClassifiers/torchTextClassifiers.py index c747c5a..ecc8fae 100644 --- a/torchTextClassifiers/torchTextClassifiers.py +++ b/torchTextClassifiers/torchTextClassifiers.py @@ -410,9 +410,9 @@ def _check_categorical_variables(self, X: np.ndarray) -> None: f"Columns {1} to {X.shape[1] - 1} of X_train must be castable in integer format." ) - for j in range(1, X.shape[1]): - max_cat_value = categorical_variables.max() - if max_cat_value >= self.categorical_var_net.categorical_vocabulary_sizes[j - 1]: + for j in range(X.shape[1] - 1): + max_cat_value = categorical_variables[:, j].max() + if max_cat_value >= self.categorical_var_net.categorical_vocabulary_sizes[j]: raise ValueError( f"Categorical variable at index {j} has value {max_cat_value} which exceeds the vocabulary size of {self.categorical_var_net.categorical_vocabulary_sizes[j]}." 
) From 7fdb4e3a4d7c16c470862e40604284e7b3094719 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Wed, 12 Nov 2025 12:56:10 +0000 Subject: [PATCH 42/66] fix: truncation=True is needed --- torchTextClassifiers/tokenizers/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torchTextClassifiers/tokenizers/base.py b/torchTextClassifiers/tokenizers/base.py index d700b78..fbc77d4 100644 --- a/torchTextClassifiers/tokenizers/base.py +++ b/torchTextClassifiers/tokenizers/base.py @@ -129,6 +129,7 @@ def tokenize( text, padding=padding, return_tensors="pt", + truncation=True, max_length=self.output_dim, return_offsets_mapping=return_offsets_mapping, ) # method from PreTrainedTokenizerFast From 4e36940071e30ab6da1f6827a91e1e5be813dafa Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Wed, 12 Nov 2025 12:56:34 +0000 Subject: [PATCH 43/66] rmeove unncessary print --- torchTextClassifiers/dataset/dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchTextClassifiers/dataset/dataset.py b/torchTextClassifiers/dataset/dataset.py index 34fe24e..9e7b764 100644 --- a/torchTextClassifiers/dataset/dataset.py +++ b/torchTextClassifiers/dataset/dataset.py @@ -61,7 +61,6 @@ def __getitem__(self, idx): def collate_fn(self, batch): text, *categorical_vars, y = zip(*batch) - print(text) if self.labels is not None: labels_tensor = torch.tensor(y, dtype=torch.long) else: From d44d0515ddab9cad302ed28f685c3a187f1f99e6 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Wed, 12 Nov 2025 12:58:02 +0000 Subject: [PATCH 44/66] progress on doc --- notebooks/example.ipynb | 1712 +++++---------------------------------- xxx.py | 167 ++++ 2 files changed, 371 insertions(+), 1508 deletions(-) create mode 100644 xxx.py diff --git a/notebooks/example.ipynb b/notebooks/example.ipynb index 02c41b1..f2c7d6f 100644 --- a/notebooks/example.ipynb +++ b/notebooks/example.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "a01b1526-51df-4bf9-9fd4-11ef22ffcc79", + "id": "0", "metadata": {}, "source": [ "# Example usage of the `torchTextClassifiers` library\n", @@ -18,483 +18,48 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "a00a2856", + "execution_count": null, + "id": "1", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2mUsing Python 3.13.5 environment at: /opt/python\u001b[0m\n", - "\u001b[2K\u001b[2mResolved \u001b[1m42 packages\u001b[0m \u001b[2min 70ms\u001b[0m\u001b[0m \u001b[0m\n", - "\u001b[2K \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m torchtextclassifiers\u001b[2m @ file:///home/onyxia/work/torchTextClassifiers\u001b[0m\n", - "\u001b[2K\u001b[1A \u001b[32m\u001b[1mBuilt\u001b[0m\u001b[39m torchtextclassifiers\u001b[2m @ file:///home/onyxia/work/torchTextClassifiers\u001b[0m\n", - "\u001b[2K\u001b[2mPrepared \u001b[1m1 package\u001b[0m \u001b[2min 28ms\u001b[0m\u001b[0m \n", - "\u001b[2mUninstalled \u001b[1m1 package\u001b[0m \u001b[2min 1ms\u001b[0m\u001b[0m\n", - "\u001b[2K░░░░░░░░░░░░░░░░░░░░ [0/1] \u001b[2mInstalling wheels... \u001b[0m\u001b[1m\u001b[33mwarning\u001b[39m\u001b[0m\u001b[1m:\u001b[0m \u001b[1mFailed to hardlink files; falling back to full copy. 
This may lead to degraded performance.\n",
    "         If the cache and target directories are on different filesystems, hardlinking may not be supported.\n",
    "         If this is intentional, set `export UV_LINK_MODE=copy` or use `--link-mode=copy` to suppress this warning.\u001b[0m\n",
    "\u001b[2mInstalled \u001b[1m1 package\u001b[0m \u001b[2min 3ms\u001b[0m\n",
    " \u001b[33m~\u001b[39m \u001b[1mtorchtextclassifiers\u001b[0m\u001b[2m==0.0.0.dev0 (from file:///home/onyxia/work/torchTextClassifiers)\u001b[0m\n",
    "Note: you may need to restart the kernel to use updated packages.\n",
    "\u001b[2mUsing Python 3.13.5 environment at: /opt/python\u001b[0m\n",
    "\u001b[2mResolved \u001b[1m43 packages\u001b[0m \u001b[2min 229ms\u001b[0m\n",
[… remaining deleted lines of this stale notebook cell output elided: uv's per-package download counters, then a "Building numpy==1.26.4" progress line redrawn hundreds of times while numpy compiled from source …]
"\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[32m\u001b[1mBuilt\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[2mPrepared \u001b[1m2 packages\u001b[0m \u001b[2min 1m 24s\u001b[0m\u001b[0m \n", - "\u001b[2mUninstalled \u001b[1m1 package\u001b[0m \u001b[2min 59ms\u001b[0m\u001b[0m\n", - "\u001b[2K░░░░░░░░░░░░░░░░░░░░ [0/3] \u001b[2mInstalling wheels... \u001b[0m\u001b[1m\u001b[33mwarning\u001b[39m\u001b[0m\u001b[1m:\u001b[0m \u001b[1mFailed to hardlink files; falling back to full copy. This may lead to degraded performance.\n", - " If the cache and target directories are on different filesystems, hardlinking may not be supported.\n", - " If this is intentional, set `export UV_LINK_MODE=copy` or use `--link-mode=copy` to suppress this warning.\u001b[0m\n", - "\u001b[2K\u001b[2mInstalled \u001b[1m3 packages\u001b[0m \u001b[2min 6.98s\u001b[0m\u001b[0m \u001b[0m\n", - " \u001b[32m+\u001b[39m \u001b[1mcaptum\u001b[0m\u001b[2m==0.8.0\u001b[0m\n", - " \u001b[31m-\u001b[39m \u001b[1mnumpy\u001b[0m\u001b[2m==2.3.1\u001b[0m\n", - " \u001b[32m+\u001b[39m \u001b[1mnumpy\u001b[0m\u001b[2m==1.26.4\u001b[0m\n", - " \u001b[32m+\u001b[39m \u001b[1munidecode\u001b[0m\u001b[2m==1.4.0\u001b[0m\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], + "outputs": [], "source": [ "# Stable version\n", "%uv pip install --system .. 
\n", - "%uv pip install --system captum unidecode nltk\n" + "%uv pip install --system captum unidecode nltk scikit-learn\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2", + "metadata": {}, + "outputs": [], + "source": [ + "from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers\n", + "from torchTextClassifiers.dataset import TextClassificationDataset\n", + "from torchTextClassifiers.model import TextClassificationModel, TextClassificationModule\n", + "from torchTextClassifiers.model.components import (\n", + " AttentionConfig,\n", + " CategoricalVariableNet,\n", + " ClassificationHead,\n", + " TextEmbedder,\n", + " TextEmbedderConfig,\n", + ")\n", + "from torchTextClassifiers.tokenizers import HuggingFaceTokenizer, WordPieceTokenizer\n", + "from torchTextClassifiers.utilities.plot_explainability import (\n", + " map_attributions_to_char,\n", + " map_attributions_to_word,\n", + " plot_attributions_at_char,\n", + " plot_attributions_at_word,\n", + ")\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2" ] }, { "cell_type": "markdown", - "id": "b292ea76-57a1-4d4e-9bde-dcc9656dc447", + "id": "3", "metadata": {}, "source": [ "# Load and preprocess data\n", @@ -505,19 +70,30 @@ }, { "cell_type": "code", - "execution_count": 1, - "id": "37c042fe", + "execution_count": null, + "id": "4", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", - "df = pd.read_parquet(\"https://minio.lab.sspcloud.fr/projet-ape/extractions/20241027_sirene4.parquet\")\n", - "df = df.sample(10000)" + "\n", + "df = pd.read_parquet(\"https://minio.lab.sspcloud.fr/projet-ape/data/08112022_27102024/naf2008/split/df_train.parquet\")\n", + "df = df.sample(100000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5", + "metadata": {}, + "outputs": [], + "source": [ + "df" ] }, { "cell_type": "markdown", - "id": "c399b4b0-a9cb-450e-9a5e-480e0e657b8e", + "id": "6", "metadata": {}, "source": [ "Our goal will be to build multilabel classification for the `code`\n", @@ -533,16 +109,17 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "92402df7", + "execution_count": null, + "id": "7", "metadata": {}, "outputs": [], "source": [ - "import pandas as pd\n", "import numpy as np\n", + "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import LabelEncoder\n", "\n", + "\n", "def categorize_surface(\n", " df: pd.DataFrame, surface_feature_name: int, like_sirene_3: bool = True\n", ") -> pd.DataFrame:\n", @@ -593,23 +170,11 @@ "\n", "def clean_and_tokenize_df(\n", " df,\n", - " categorical_features=[\"EVT\", \"CJ\", \"NAT\", \"TYP\", \"CRT\"],\n", + " categorical_features=[\"CJ\", \"NAT\", \"TYP\", \"CRT\"],\n", " text_feature=\"libelle_processed\",\n", " label_col=\"apet_finale\",\n", "):\n", " df.fillna(\"nan\", inplace=True)\n", - "\n", - " df = df.rename(\n", - " columns={\n", - " \"evenement_type\": \"EVT\",\n", - " \"cj\": \"CJ\",\n", - " \"activ_nat_et\": \"NAT\",\n", - " \"liasse_type\": \"TYP\",\n", - " \"activ_surf_et\": \"SRF\",\n", - " \"activ_perm_et\": \"CRT\",\n", - " }\n", - " )\n", - "\n", " les = []\n", " for col in categorical_features:\n", " le = LabelEncoder()\n", @@ -617,131 +182,26 @@ " les.append(le)\n", "\n", " df = categorize_surface(df, \"SRF\", like_sirene_3=True)\n", - " df = df[[text_feature, \"EVT\", \"CJ\", \"NAT\", \"TYP\", \"SRF\", \"CRT\", label_col]]\n", - "\n", - " return df, les\n", - "\n", - "\n", - "def stratified_split_rare_labels(X, y, 
test_size=0.2, min_train_samples=1):\n", - " # Get unique labels and their frequencies\n", - " unique_labels, label_counts = np.unique(y, return_counts=True)\n", - "\n", - " # Separate rare and common labels\n", - " rare_labels = unique_labels[label_counts == 1]\n", - "\n", - " # Create initial mask for rare labels to go into training set\n", - " rare_label_mask = np.isin(y, rare_labels)\n", - "\n", - " # Separate data into rare and common label datasets\n", - " X_rare = X[rare_label_mask]\n", - " y_rare = y[rare_label_mask]\n", - " X_common = X[~rare_label_mask]\n", - " y_common = y[~rare_label_mask]\n", - "\n", - " # Split common labels stratified\n", - " X_common_train, X_common_test, y_common_train, y_common_test = train_test_split(\n", - " X_common, y_common, test_size=test_size, stratify=y_common\n", - " )\n", + " df = df[[text_feature, \"CJ\", \"NAT\", \"TYP\", \"SRF\", \"CRT\", label_col]]\n", "\n", - " # Combine rare labels with common labels split\n", - " X_train = np.concatenate([X_rare, X_common_train])\n", - " y_train = np.concatenate([y_rare, y_common_train])\n", - " X_test = X_common_test\n", - " y_test = y_common_test\n", - "\n", - " return X_train, X_test, y_train, y_test\n", - "\n", - "def add_libelles(\n", - " df: pd.DataFrame,\n", - " df_naf: pd.DataFrame,\n", - " y: str,\n", - " text_feature: str,\n", - " textual_features: list,\n", - " categorical_features: list,\n", - "):\n", - " missing_codes = set(df_naf[\"code\"])\n", - " fake_obs = df_naf[df_naf[\"code\"].isin(missing_codes)]\n", - " fake_obs[y] = fake_obs[\"code\"]\n", - " fake_obs[text_feature] = fake_obs[[text_feature]].apply(\n", - " lambda row: \" \".join(f\"[{col}] {val}\" for col, val in row.items() if val != \"\"), axis=1\n", - " )\n", - " df = pd.concat([df, fake_obs[[col for col in fake_obs.columns if col in df.columns]]])\n", - "\n", - " if textual_features is not None:\n", - " for feature in textual_features:\n", - " df[feature] = df[feature].fillna(value=\"\")\n", - " if categorical_features is not None:\n", - " for feature in categorical_features:\n", - " df[feature] = df[feature].fillna(value=\"NaN\")\n", - "\n", - " print(f\"\\t*** {len(missing_codes)} codes have been added in the database...\\n\")\n", - " return df" + " return df, les" ] }, { "cell_type": "code", - "execution_count": 3, - "id": "1fd02895", + "execution_count": null, + "id": "8", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\t*** 732 codes have been added in the database...\n", - "\n" - ] - } - ], + "outputs": [], "source": [ - "categorical_features = [\"evenement_type\", \"cj\", \"activ_nat_et\", \"liasse_type\", \"activ_surf_et\", \"activ_perm_et\"]\n", + "categorical_features = [ \"CJ\", \"NAT\", \"TYP\", \"SRF\", \"CRT\"]\n", "text_feature = \"libelle\"\n", - "y = \"apet_finale\"\n", - "textual_features = None\n", - "\n", - "naf2008 = pd.read_csv(\"https://minio.lab.sspcloud.fr/projet-ape/data/naf2008.csv\", sep=\";\")\n", - "df = add_libelles(df, naf2008, y, text_feature, textual_features, categorical_features)" - ] - }, - { - "cell_type": "markdown", - "id": "67f4160d-0c98-4700-80f4-1ba454e6a2df", - "metadata": {}, - "source": [ - "## Preprocessing\n", - "\n", - "To reduce noise in text fields, we recommend pre-processing before\n", - "training a model with our package. 
We assume this preprocessing is\n", - "handled by the package user : this gives him the opportunity to control\n", - "data cleansing.\n", - "\n", - "Here’s an example of the type of preprocessing that can be carried out\n", - "before moving on to the modeling phase" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "61b0252e", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/python/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "from torchTextClassifiers.utilities.preprocess import clean_text_feature\n", - "df[\"libelle_processed\"] = clean_text_feature(df[\"libelle\"])" + "y = \"apet_finale\"" ] }, { "cell_type": "markdown", - "id": "acde2929-fe92-4107-8066-a5c8ac5d6428", + "id": "9", "metadata": {}, "source": [ "Right now, the model requires the label (variable y) to be a numerical\n", @@ -755,8 +215,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "8c02a833", + "execution_count": null, + "id": "10", "metadata": {}, "outputs": [], "source": [ @@ -766,7 +226,7 @@ }, { "cell_type": "markdown", - "id": "25593e1a-1661-49e3-9734-272ec4745de1", + "id": "11", "metadata": {}, "source": [ "The function `clean_and_tokenize_df` requires special `DataFrame`\n", @@ -779,1063 +239,299 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "5fb5b0c7", + "execution_count": null, + "id": "12", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_159490/2075507147.py:60: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.\n", - " df.fillna(\"nan\", inplace=True)\n" - ] - } - ], + "outputs": [], "source": [ - "df, _ = clean_and_tokenize_df(df, text_feature=\"libelle_processed\")\n", - "X = df[[\"libelle_processed\", \"EVT\", \"CJ\", \"NAT\", \"TYP\", \"CRT\", \"SRF\"]].values\n", + "df, _ = clean_and_tokenize_df(df, text_feature=\"libelle\")\n", + "X = df[[\"libelle\", \"CJ\", \"NAT\", \"TYP\", \"CRT\", \"SRF\"]].values\n", "y = df[\"apet_finale\"].values" ] }, - { - "cell_type": "markdown", - "id": "e70de831-dbc9-49be-b0c4-d70dd6479d03", - "metadata": {}, - "source": [ - "## Splitting in train-test sets\n", - "\n", - "As usual in a learning approach, you need to break down your data into\n", - "learning and test/validation samples to obtain robust performance\n", - "statistics.\n", - "This work is the responsibility of the package’s users. Please make sure that np.max(y_train) == len(np.unique(y_train))-1 (i.e. your labels are well encoded, in a consecutive manner, starting from 0), and that all the possible labels appear at least once in the training set.\n", - "\n", - "We provide the function stratified_train_test_split to match these requirements here.." 
- ] - }, { "cell_type": "code", - "execution_count": 7, - "id": "b593fd75", + "execution_count": null, + "id": "13", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "True\n" - ] - } - ], + "outputs": [], "source": [ - "from sklearn.model_selection import train_test_split \n", - "X_train, X_test, y_train, y_test = stratified_split_rare_labels(X, y)\n", - "\n", - "print(np.max(y_train) == len(np.unique(y_train))-1)" + "X.shape, y.shape" ] }, { "cell_type": "markdown", - "id": "8729c5f4-9038-4437-929b-fc500dc0db7a", - "metadata": {}, - "source": [ - "# Build the torchTextClassifiers FastText model (without training it)\n", - "\n", - "There are several ways to define and train a FastText model in\n", - "this package.\n", - "\n", - "We first show how to initialize the model and then afterwards build it.\n", - "\n", - "`create_fasttext` function accepts the following parameters:\n", - "\n", - "| Parameter | Meaning | Example Value |\n", - "|---------------------|------------------------------------------|----------|\n", - "| `num_tokens` | Number of rows in the embedding matrix (size of the vocabulary) | 100000 |\n", - "| `embedding_dim` | Dimension of the embedding (number of columns in the matrix) | 50 |\n", - "| `sparse` | Use sparse embedding for fast computation (PyTorch) | False |\n", - "| `categorical_embedding_dims` | Dimension of the embedding for categorical features | 10 |\n", - "| `min_count` | Minimum occurrences of a word in the corpus to be included | 1 |\n", - "| `min_n` | Minimum length of character n-grams | 3 |\n", - "| `max_n` | Maximum length of character n-grams | 6 |\n", - "| `len_word_ngrams` | Length of word n-grams | 3 |" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "5879ca88", + "id": "14", "metadata": {}, - "outputs": [], "source": [ - "from torchTextClassifiers import create_fasttext\n", - "\n", - "parameters = {\n", - " \"num_tokens\": 100000,\n", - " \"embedding_dim\": 50,\n", - " \"sparse\": False,\n", - " \"categorical_embedding_dims\": 10,\n", - " \"min_count\": 1,\n", - " \"min_n\": 3,\n", - " \"max_n\": 6,\n", - " \"len_word_ngrams\": 3,\n", - "}\n", + "## Splitting in train-test sets\n", "\n", - "parameters_train = {\n", - " \"lr\": 0.004,\n", - " \"num_epochs\": 1,\n", - " \"batch_size\": 256,\n", - " \"patience\": 3 \n", - "}\n", + "As usual in a learning approach, you need to break down your data into\n", + "learning and test/validation samples to obtain robust performance\n", + "statistics.\n", + "This work is the responsibility of the package’s users. Please make sure that np.max(y_train) == len(np.unique(y_train))-1 (i.e. 
your labels are well encoded, in a consecutive manner, starting from 0), and that all the possible labels appear at least once in the training set.\n", "\n", - "model = create_fasttext(**parameters)" - ] - }, - { - "cell_type": "markdown", - "id": "05f9d26b-f08f-41be-93e4-b55a2c86690c", - "metadata": {}, - "source": [ - "`model` is then a special `torchTextClassifiers` object:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ebf5608b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torchTextClassifiers.classifiers.fasttext.wrapper.FastTextWrapper" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "type(model.classifier)" - ] - }, - { - "cell_type": "markdown", - "id": "dcbe8289-f506-48f9-b854-96f25974368f", - "metadata": {}, - "source": [ - "As any `PyTorch` model, it accepts being save as a JSON for later on\n", - "use:" + "We provide the function stratified_train_test_split to match these requirements here.." ] }, { "cell_type": "code", "execution_count": null, - "id": "6c3b2b85", + "id": "15", "metadata": {}, "outputs": [], "source": [ - "model.to_json('torchTextClassifiers_config.json')\n", - "# Loading from JSON now works with the new API:\n", - "# from torchTextClassifiers import torchTextClassifiers\n", - "# loaded_model = torchTextClassifiers.from_json('torchTextClassifiers_config.json')" - ] - }, - { - "cell_type": "markdown", - "id": "5f8b017f-66a1-413d-85e8-1981adf64823", - "metadata": {}, - "source": [ - "We can apply `build` to finally train our model. These are the\n", - "parameters accepted by the `build` method\n", - "\n", - "| Parameter | Meaning | Example Value |\n", - "|---------------------|------------------------------------------|----------|\n", - "| `lr` | Learning rate | 0.004 |\n", - "| `num_epochs` | Number of training epochs | 1 |\n", - "| `batch_size` | Batch size for training | 256 |\n", - "| `patience` | Early stopping patience (number of epochs without improvement) | 3 |\n", + "from sklearn.model_selection import train_test_split\n", "\n", - "We build the model using the training data. We have now access to the\n", - "tokenizer, the PyTorch model as well as a PyTorch Lightning module ready\n", - "to be trained. Note that Lightning is high-level framework for PyTorch\n", - "that simplifies the process of training, validating, and deploying\n", - "machine learning models." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "e2e43d0e", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-07-23 12:36:22 - torchTextClassifiers.classifiers.fasttext.model - num_rows is different from the number of tokens in the tokenizer. Using provided num_rows.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-07-23 12:36:22 - root - No scheduler parameters provided. 
Using defaults.\n" - ] - } - ], - "source": [ - "model.build(X_train, y_train, lightning=True, lr=parameters_train.get(\"lr\"))" + "X_train, X_test, y_train, y_test = train_test_split(X, y)" ] }, { "cell_type": "markdown", - "id": "b5a7d5fa-596a-470b-892e-e8fafdb8221a", + "id": "16", "metadata": {}, "source": [ - "One can retrieve different objects from `model` instance:\n", - "\n", - "- `model.pytorch_model`\n", - "- `model.tokenizer`\n", - "- `model.lightning_module`" + "## Tokenizer" ] }, { "cell_type": "code", "execution_count": null, - "id": "091024e6", + "id": "17", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "FastTextModel(\n", - " (embeddings): EmbeddingBag(108297, 50, mode='mean', padding_idx=108296)\n", - " (emb_0): Embedding(24, 10)\n", - " (emb_1): Embedding(42, 10)\n", - " (emb_2): Embedding(10, 10)\n", - " (emb_3): Embedding(13, 10)\n", - " (emb_4): Embedding(3, 10)\n", - " (emb_5): Embedding(5, 10)\n", - " (fc): Linear(in_features=60, out_features=732, bias=True)\n", - ")" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model.classifier.pytorch_model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d983b113", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "model.classifier.tokenizer" + "text = X_train[:, 0].tolist()" ] }, { "cell_type": "code", "execution_count": null, - "id": "9b23f1ba", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "FastTextModule(\n", - " (model): FastTextModel(\n", - " (embeddings): EmbeddingBag(108297, 50, mode='mean', padding_idx=108296)\n", - " (emb_0): Embedding(24, 10)\n", - " (emb_1): Embedding(42, 10)\n", - " (emb_2): Embedding(10, 10)\n", - " (emb_3): Embedding(13, 10)\n", - " (emb_4): Embedding(3, 10)\n", - " (emb_5): Embedding(5, 10)\n", - " (fc): Linear(in_features=60, out_features=732, bias=True)\n", - " )\n", - " (loss): CrossEntropyLoss()\n", - " (accuracy_fn): MulticlassAccuracy()\n", - ")" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model.classifier.lightning_module" - ] - }, - { - "cell_type": "markdown", - "id": "b804391a-979a-4a74-a5f7-d8e27550e20e", + "id": "18", "metadata": {}, + "outputs": [], "source": [ - "One can also retrieve more precise information regarding the tokenizer.\n", - "This can be useful to know how text is parsed before being given to the\n", - "neural network:" + "tokenizer = HuggingFaceTokenizer.load_from_pretrained(\"google-bert/bert-base-uncased\")\n", + "tokenizer.tokenize(text[0]).input_ids.shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "00c077b0", + "id": "19", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{0: '',\n", - " 369: 'sit',\n", - " 8402: 'lorem ipsum dolor',\n", - " 8477: '',\n", - " 9254: '',\n", - " 17674: 'ame',\n", - " 19233: '',\n", - " 21660: '',\n", - " 33686: 'or>',\n", - " 36146: 'ipsum dolor',\n", - " 37685: '',\n", - " 44699: '',\n", - " 46043: 'sit>',\n", - " 46176: 'ipsu',\n", - " 49083: 'psu',\n", - " 49236: 'orem>',\n", - " 50091: '',\n", - " 57650: 'it>',\n", - " 58295: 'olor>',\n", - " 60820: 'lor>',\n", - " 60827: 'ore',\n", - " 63114: 'sum>',\n", - " 65777: 'met',\n", - " 65864: '',\n", - " 68083: 'olor',\n", - " 68290: 'orem',\n", - " 68834: 'psum',\n", - " 
72463: 'rem',\n", - " 74123: 'ipsum>',\n", - " 74942: 'dolor sit',\n", - " 76898: 'lor',\n", - " 77899: '',\n", - " 87932: '',\n", - " 93076: 'psum>',\n", - " 93114: '', '', 'H '],\n", - " ['', '', 'e '],\n", - " ['', '', 'l '],\n", - " ['', '', 'l '],\n", - " ['', '', 'o '],\n", - " [''],\n", - " ['', '', 'w '],\n", - " ['', '', 'o '],\n", - " ['', '', 'r '],\n", - " ['', '', 'l '],\n", - " ['', '', 'd ']],\n", - " [tensor([40876, 0, 51965]),\n", - " tensor([51907, 0, 77296]),\n", - " tensor([74312, 0, 26137]),\n", - " tensor([74312, 0, 26137]),\n", - " tensor([ 9853, 0, 53786]),\n", - " tensor([0]),\n", - " tensor([29925, 0, 74978]),\n", - " tensor([ 9853, 0, 53786]),\n", - " tensor([ 8646, 0, 13223]),\n", - " tensor([74312, 0, 26137]),\n", - " tensor([ 89472, 0, 104945])],\n", - " [{40876: '', 0: '', 51965: 'H '},\n", - " {51907: '', 0: '', 77296: 'e '},\n", - " {74312: '', 0: '', 26137: 'l '},\n", - " {74312: '', 0: '', 26137: 'l '},\n", - " {9853: '', 0: '', 53786: 'o '},\n", - " {0: ''},\n", - " {29925: '', 0: '', 74978: 'w '},\n", - " {9853: '', 0: '', 53786: 'o '},\n", - " {8646: '', 0: '', 13223: 'r '},\n", - " {74312: '', 0: '', 26137: 'l '},\n", - " {89472: '', 0: '', 104945: 'd '}],\n", - " [{'': 40876, '': 0, 'H ': 51965},\n", - " {'': 51907, '': 0, 'e ': 77296},\n", - " {'': 74312, '': 0, 'l ': 26137},\n", - " {'': 74312, '': 0, 'l ': 26137},\n", - " {'': 9853, '': 0, 'o ': 53786},\n", - " {'': 0},\n", - " {'': 29925, '': 0, 'w ': 74978},\n", - " {'': 9853, '': 0, 'o ': 53786},\n", - " {'': 8646, '': 0, 'r ': 13223},\n", - " {'': 74312, '': 0, 'l ': 26137},\n", - " {'': 89472, '': 0, 'd ': 104945}])" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "tokenizer.tokenize(\"Hello world\")" - ] - }, - { - "cell_type": "markdown", - "id": "fd5b6899-7831-40a6-9841-bbc1b0804956", - "metadata": {}, - "source": [ - "However, there is a more straightforward way to do: creating directly\n", - "the `NGramTokenizer` instance:" + "num_classes = int(y.max() + 1)\n", + "expected_input_dim = embedding_dim + categorical_var_net.output_dim\n", + "classification_head = ClassificationHead(\n", + " input_dim=expected_input_dim,\n", + " num_classes=num_classes,\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "8a6ee96b", + "id": "26", "metadata": {}, "outputs": [], "source": [ - "tokenizer = NGramTokenizer(\n", - " **parameters,\n", - " training_text=training_text\n", - " )" + "model = TextClassificationModel(\n", + " text_embedder=text_embedder,\n", + " categorical_variable_net=categorical_var_net,\n", + " classification_head=classification_head,\n", + ")\n", + "model" ] }, { "cell_type": "code", - "execution_count": 24, - "id": "776636e6", + "execution_count": null, + "id": "27", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "([['', '', 'H '],\n", - " ['', '', 'e '],\n", - " ['', '', 'l '],\n", - " ['', '', 'l '],\n", - " ['', '', 'o '],\n", - " [''],\n", - " ['', '', 'w '],\n", - " ['', '', 'o '],\n", - " ['', '', 'r '],\n", - " ['', '', 'l '],\n", - " ['', '', 'd ']],\n", - " [tensor([40876, 0, 51965]),\n", - " tensor([51907, 0, 77296]),\n", - " tensor([74312, 0, 26137]),\n", - " tensor([74312, 0, 26137]),\n", - " tensor([ 9853, 0, 53786]),\n", - " tensor([0]),\n", - " tensor([29925, 0, 74978]),\n", - " tensor([ 9853, 0, 53786]),\n", - " tensor([ 8646, 0, 13223]),\n", - " tensor([74312, 0, 26137]),\n", - " tensor([ 89472, 0, 104945])],\n", - " [{40876: '', 0: '', 51965: 'H '},\n", - " {51907: '', 0: '', 77296: 'e 
'},\n", - " {74312: '', 0: '', 26137: 'l '},\n", - " {74312: '', 0: '', 26137: 'l '},\n", - " {9853: '', 0: '', 53786: 'o '},\n", - " {0: ''},\n", - " {29925: '', 0: '', 74978: 'w '},\n", - " {9853: '', 0: '', 53786: 'o '},\n", - " {8646: '', 0: '', 13223: 'r '},\n", - " {74312: '', 0: '', 26137: 'l '},\n", - " {89472: '', 0: '', 104945: 'd '}],\n", - " [{'': 40876, '': 0, 'H ': 51965},\n", - " {'': 51907, '': 0, 'e ': 77296},\n", - " {'': 74312, '': 0, 'l ': 26137},\n", - " {'': 74312, '': 0, 'l ': 26137},\n", - " {'': 9853, '': 0, 'o ': 53786},\n", - " {'': 0},\n", - " {'': 29925, '': 0, 'w ': 74978},\n", - " {'': 9853, '': 0, 'o ': 53786},\n", - " {'': 8646, '': 0, 'r ': 13223},\n", - " {'': 74312, '': 0, 'l ': 26137},\n", - " {'': 89472, '': 0, 'd ': 104945}])" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ - "tokenizer.tokenize(\"Hello world\")" + "import torch\n", + "\n", + "module = TextClassificationModule(\n", + " model=model,\n", + " loss=torch.nn.CrossEntropyLoss(),\n", + " optimizer=torch.optim.Adam,\n", + " optimizer_params={\"lr\": 1e-3},\n", + " scheduler=None,\n", + " scheduler_params=None,\n", + " scheduler_interval=\"epoch\",\n", + ")\n", + "module" ] }, { "cell_type": "markdown", - "id": "6b0fd6c0-9740-4a32-9bb2-4a3cfe174ea8", + "id": "28", "metadata": {}, "source": [ - "Why creating a `NGramTokenizer` separately ? Because model constructor\n", - "is now independent from training data:" + "## Using the wrapper" ] }, { "cell_type": "code", "execution_count": null, - "id": "ee5dbe0b", + "id": "29", "metadata": {}, "outputs": [], "source": [ - "from torchTextClassifiers import build_fasttext_from_tokenizer\n", - "\n", - "model = build_fasttext_from_tokenizer(\n", - " tokenizer, \n", - " embedding_dim=parameters[\"embedding_dim\"], \n", - " categorical_embedding_dims=parameters[\"categorical_embedding_dims\"], \n", - " sparse=parameters[\"sparse\"], \n", - " lr=parameters_train[\"lr\"], \n", - " num_classes=NUM_CLASSES, \n", - " num_categorical_features=NUM_CAT_VAR, \n", - " categorical_vocabulary_sizes=CAT_VOCAB_SIZE\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "f53080e9-9d78-479f-a446-2feb4a92b1de", - "metadata": {}, - "source": [ - "**Warning**:\n", - "\n", - "If the PyTorch model building did not use the training data, please keep\n", - "in mind that its architecture (that you customize here) should match the\n", - "vocabulary size of the categorical variables and the total number of\n", - "class, otherwise the model will raise an error during training.\n", + "model_config = ModelConfig(\n", + " embedding_dim=embedding_dim,\n", + " categorical_vocabulary_sizes=categorical_vocab_sizes,\n", + " categorical_embedding_dims=categorical_embedding_dims,\n", + " num_classes=num_classes,\n", + " attention_config=attention_config,\n", + ")\n", "\n", - "# Train a torchTextClassifiers FastText model directly\n", + "training_config = TrainingConfig(\n", + " lr=1e-3,\n", + " batch_size=256,\n", + " num_epochs=10,\n", + ")\n", "\n", - "If no advanced customization or PyTorch tuning is necessary, there is a\n", - "direct way of training model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "ce5dc4a1", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-07-23 12:38:57 - torchTextClassifiers.torchTextClassifiers - Starting training process...\n", - "2025-07-23 12:38:57 - torchTextClassifiers.torchTextClassifiers - Running on: cpu\n", - "2025-07-23 12:38:57 - torchTextClassifiers.classifiers.fasttext.model - Creating DataLoader with 12 workers.\n", - "2025-07-23 12:38:57 - torchTextClassifiers.classifiers.fasttext.model - Creating DataLoader with 12 workers.\n", - "GPU available: False, used: False\n", - "TPU available: False, using: 0 TPU cores\n", - "HPU available: False, using: 0 HPUs\n", - "/opt/python/lib/python3.13/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default\n", - "2025-07-23 12:38:57 - torchTextClassifiers.torchTextClassifiers - Launching training...\n", - "\n", - " | Name | Type | Params | Mode \n", - "-----------------------------------------------------------\n", - "0 | model | FastTextModel | 5.5 M | train\n", - "1 | loss | CrossEntropyLoss | 0 | train\n", - "2 | accuracy_fn | MulticlassAccuracy | 0 | train\n", - "-----------------------------------------------------------\n", - "5.5 M Trainable params\n", - "0 Non-trainable params\n", - "5.5 M Total params\n", - "21.842 Total estimated model params size (MB)\n", - "11 Modules in train mode\n", - "0 Modules in eval mode\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sanity Checking: | | 0/? [00:00 \u001b[39m\u001b[32m5\u001b[39m pred, conf = model.predict(X, top_k=TOP_K)\n\u001b[32m 6\u001b[39m pred_naf = encoder.inverse_transform(pred.reshape(-\u001b[32m1\u001b[39m))\n\u001b[32m 7\u001b[39m subset = naf2008.set_index(\u001b[33m\"\u001b[39m\u001b[33mcode\u001b[39m\u001b[33m\"\u001b[39m).loc[np.flip(pred_naf)]\n", - "\u001b[31mValueError\u001b[39m: not enough values to unpack (expected 2, got 1)" - ] - } - ], - "source": [ - "text = [\"coiffeur, boulangerie, pâtisserie\"] # one text description\n", - "X= np.array([[text[0], 0, 0, 0, 0, 0, 0]]) # our new entry\n", - "TOP_K = 5\n", - "\n", - "pred, conf = model.predict(X, top_k=TOP_K)\n", - "pred_naf = encoder.inverse_transform(pred.reshape(-1))\n", - "subset = naf2008.set_index(\"code\").loc[np.flip(pred_naf)]\n", - "\n", - "for i in range(TOP_K-1, -1, -1):\n", - " print(f\"Prediction: {pred_naf[i]}, confidence: {conf[0, i]}, description: {subset['libelle'][pred_naf[i]]}\")" - ] - }, - { - "cell_type": "markdown", - "id": "f84e6bff-8fa7-4896-b60a-005ae5f1d3eb", + "execution_count": null, + "id": "31", "metadata": {}, + "outputs": [], "source": [ - "# Explainability\n", - "\n", - "The `torchTextClassifiers` framework provides explainability features through the `predict_and_explain` method. This allows you to understand which parts of the input text contribute most to the model's predictions." 
+ "tokenizer.tokenize(X_train[:256, 0].tolist()).input_ids.shape" ] }, { "cell_type": "code", - "execution_count": 22, - "id": "58c46021", + "execution_count": null, + "id": "32", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Predictions with explanations:\n", - "Predicted classes: tensor([[727]])\n", - "Confidence scores: tensor([[0.5400]])\n", - "Word-level explanation scores shape: \n", - "Letter-level explanation scores shape: torch.Size([1, 1, 31])\n" - ] - } - ], + "outputs": [], "source": [ - "# Note: Visualization functions are not yet implemented in torchTextClassifiers\n", - "# The predict_and_explain method is available but visualization utilities need to be implemented\n", - "\n", - "pred, conf, all_scores, all_scores_letters = model.predict_and_explain(X)\n", - "\n", - "# TODO: Implement visualization functions\n", - "# visualize_word_scores(all_scores, text, pred_naf.reshape(1, -1))\n", - "# visualize_letter_scores(all_scores_letters, text, pred_naf.reshape(1, -1))\n", - "\n", - "print(\"Predictions with explanations:\")\n", - "print(f\"Predicted classes: {pred}\")\n", - "print(f\"Confidence scores: {conf}\")\n", - "print(f\"Word-level explanation scores shape: {all_scores.shape if hasattr(all_scores, 'shape') else type(all_scores)}\")\n", - "print(f\"Letter-level explanation scores shape: {all_scores_letters.shape if hasattr(all_scores_letters, 'shape') else type(all_scores_letters)}\")" + "ttc.train(\n", + " X_train=X_train,\n", + " y_train=y_train,\n", + " X_val=X_test,\n", + " y_val=y_test,\n", + " training_config=training_config,\n", + ")" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -1849,7 +545,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.5" + "version": "3.13.8" } }, "nbformat": 4, diff --git a/xxx.py b/xxx.py new file mode 100644 index 0000000..2350dba --- /dev/null +++ b/xxx.py @@ -0,0 +1,167 @@ +from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers +from torchTextClassifiers.dataset import TextClassificationDataset +from torchTextClassifiers.model import TextClassificationModel, TextClassificationModule +from torchTextClassifiers.model.components import ( + AttentionConfig, + CategoricalVariableNet, + ClassificationHead, + TextEmbedder, + TextEmbedderConfig, +) +from torchTextClassifiers.tokenizers import HuggingFaceTokenizer, WordPieceTokenizer +from torchTextClassifiers.utilities.plot_explainability import ( + map_attributions_to_char, + map_attributions_to_word, + plot_attributions_at_char, + plot_attributions_at_word, +) + +sample_text_data = [ + "This is a positive example", + "This is a negative example", + "Another positive case", + "Another negative case", + "Good example here", + "Bad example here", +] +categorical_data = [[1, 0], [0, 1], [1, 0], [0, 1], [1, 0], [0, 1]] +labels = [1, 0, 1, 0, 1, 5] + +### +tokenizer = WordPieceTokenizer(3, output_dim=None) +tokenizer.train(sample_text_data) +tokenizer.tokenize(sample_text_data).input_ids.shape + + +### +tokenizer = HuggingFaceTokenizer.load_from_pretrained( + "google-bert/bert-base-uncased", output_dim=126 +) +tokenizer.tokenize(sample_text_data).input_ids.shape + + +dataset = TextClassificationDataset( + texts=sample_text_data, categorical_variables=categorical_data, tokenizer=tokenizer, labels=None +) + +dataloader = dataset.create_dataloader(batch_size=4) + +batch = 
next(iter(dataloader)) + +vocab_size = tokenizer.vocab_size +padding_idx = tokenizer.padding_idx + +embedding_dim = 96 +n_layers = 2 +n_head = 4 +n_kv_head = n_head +sequence_len = tokenizer.output_dim + +attention_config = AttentionConfig( + n_layers=n_layers, + n_head=n_head, + n_kv_head=n_kv_head, + sequence_len=sequence_len, +) + +text_embedder_config = TextEmbedderConfig( + vocab_size=vocab_size, + embedding_dim=embedding_dim, + padding_idx=padding_idx, + attention_config=attention_config, +) + + +text_embedder = TextEmbedder( + text_embedder_config=text_embedder_config, +) +text_embedder.init_weights() + + +categorical_vocab_sizes = [2, 2] +categorical_embedding_dims = [4, 7] + +categorical_var_net = CategoricalVariableNet( + categorical_vocabulary_sizes=categorical_vocab_sizes, + categorical_embedding_dims=categorical_embedding_dims, +) + +num_classes = 10 +expected_input_dim = embedding_dim + categorical_var_net.output_dim +classification_head = ClassificationHead( + input_dim=expected_input_dim, + num_classes=num_classes, +) + +model = TextClassificationModel( + text_embedder=text_embedder, + categorical_variable_net=categorical_var_net, + classification_head=classification_head, +) + +model(**batch) + +import torch + +module = TextClassificationModule( + model=model, + loss=torch.nn.CrossEntropyLoss(), + optimizer=torch.optim.Adam, + optimizer_params={"lr": 1e-3}, + scheduler=None, + scheduler_params=None, + scheduler_interval="epoch", +) + +module.predict_step(batch) + +# Convert categorical data to numpy array +import numpy as np + +categorical_data = np.array(categorical_data).astype(int) + +# Combine text (as a column vector) with categorical data +X = np.column_stack([sample_text_data, categorical_data]) +Y = np.array(labels) + +model_config = ModelConfig( + embedding_dim=embedding_dim, + categorical_vocabulary_sizes=categorical_vocab_sizes, + categorical_embedding_dims=categorical_embedding_dims, + num_classes=num_classes, + attention_config=attention_config, +) + +training_config = TrainingConfig( + lr=1e-3, + batch_size=4, + num_epochs=1, +) + +ttc = torchTextClassifiers( + tokenizer=tokenizer, + model_config=model_config, +) + +ttc.train( + X_train=X, + y_train=Y, + X_val=X, + y_val=Y, + training_config=training_config, +) + +top_k = 5 +yyy = ttc.predict(X, top_k=top_k, explain=True) + +text_idx = 0 +text = sample_text_data[text_idx] +offsets = yyy["offset_mapping"][text_idx] # seq_len, 2 +attributions = yyy["attributions"][text_idx] # top_k, seq_len +word_ids = yyy["word_ids"][text_idx] # seq_len + +word_attributions = map_attributions_to_word(attributions, word_ids) +char_attributions = map_attributions_to_char(attributions, offsets, text) + +plot_attributions_at_char(text, char_attributions) +plot_attributions_at_word(text, word_attributions) From a179c3759f9a1dda0e243300f5f4c7c36ae6104e Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Wed, 12 Nov 2025 13:44:05 +0000 Subject: [PATCH 45/66] fix: load model on cpu to avoid pb after training --- torchTextClassifiers/torchTextClassifiers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchTextClassifiers/torchTextClassifiers.py b/torchTextClassifiers/torchTextClassifiers.py index ecc8fae..b6fd62d 100644 --- a/torchTextClassifiers/torchTextClassifiers.py +++ b/torchTextClassifiers/torchTextClassifiers.py @@ -486,7 +486,7 @@ def predict( text = X_test["text"] categorical_variables = X_test["categorical_variables"] - self.pytorch_model.eval() + self.pytorch_model.eval().cpu() tokenize_output = 
self.tokenizer.tokenize( text.tolist(), return_offsets_mapping=return_offsets_mapping From ea26799fea19d84cfa11b1fcd361402f57739fc2 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Wed, 12 Nov 2025 13:44:23 +0000 Subject: [PATCH 46/66] progress on docs --- notebooks/example.ipynb | 813 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 752 insertions(+), 61 deletions(-) diff --git a/notebooks/example.ipynb b/notebooks/example.ipynb index f2c7d6f..e77f89e 100644 --- a/notebooks/example.ipynb +++ b/notebooks/example.ipynb @@ -18,19 +18,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "1", - "metadata": {}, - "outputs": [], - "source": [ - "# Stable version\n", - "%uv pip install --system .. \n", - "%uv pip install --system captum unidecode nltk scikit-learn\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "2", "metadata": {}, "outputs": [], @@ -70,7 +58,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "4", "metadata": {}, "outputs": [], @@ -83,33 +71,198 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
apet_finalelibelleCJNATTYPSRFCRT
13906916201ZLe développement et l'exploitation d'un réseau...549999C0.0P
6676526831ZImmobilier : agent commercial en immobilier : ...NaNNaNR0.0NaN
811995320ZLe point relais est un service qui permet de s...NaNNaNX0.0P
6099237410ZCREATION OBJET PERSONNALISE MULTI SUPPORT, act...NaN99M0.0P
14940851071Dconfection de desserts, pâtisseries artisanale...NaN10M6.0P
........................
7755057311ZVente de prospect, Marketing publicitaire, Cré...549999C0.0P
3942299329ZDJ activité de servicesNaN99C0.0P
1494016820BL'acquisition, l'administration et la gestion ...654099G0.0P
4643647021ZAgence de communication, notamment le conseil ...549999C0.0P
2807075320ZLivraisons activité de servicesNaN99C0.0P
\n", + "

100000 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " apet_finale libelle CJ \\\n", + "1390691 6201Z Le développement et l'exploitation d'un réseau... 5499 \n", + "667652 6831Z Immobilier : agent commercial en immobilier : ... NaN \n", + "81199 5320Z Le point relais est un service qui permet de s... NaN \n", + "609923 7410Z CREATION OBJET PERSONNALISE MULTI SUPPORT, act... NaN \n", + "1494085 1071D confection de desserts, pâtisseries artisanale... NaN \n", + "... ... ... ... \n", + "775505 7311Z Vente de prospect, Marketing publicitaire, Cré... 5499 \n", + "394229 9329Z DJ activité de services NaN \n", + "149401 6820B L'acquisition, l'administration et la gestion ... 6540 \n", + "464364 7021Z Agence de communication, notamment le conseil ... 5499 \n", + "280707 5320Z Livraisons activité de services NaN \n", + "\n", + " NAT TYP SRF CRT \n", + "1390691 99 C 0.0 P \n", + "667652 NaN R 0.0 NaN \n", + "81199 NaN X 0.0 P \n", + "609923 99 M 0.0 P \n", + "1494085 10 M 6.0 P \n", + "... ... .. ... ... \n", + "775505 99 C 0.0 P \n", + "394229 99 C 0.0 P \n", + "149401 99 G 0.0 P \n", + "464364 99 C 0.0 P \n", + "280707 99 C 0.0 P \n", + "\n", + "[100000 rows x 7 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df" ] }, - { - "cell_type": "markdown", - "id": "6", - "metadata": {}, - "source": [ - "Our goal will be to build multilabel classification for the `code`\n", - "variable using `libelle` as feature.\n", - "\n", - "## Enriching our test dataset\n", - "\n", - "Unlike `Fasttext`, this package offers the possibility of having several\n", - "feature columns of different types (string for the text column and\n", - "additional variables in numeric form, for example). To illustrate that,\n", - "we propose the following enrichment of the example dataset:" - ] - }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "7", "metadata": {}, "outputs": [], @@ -189,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "8", "metadata": {}, "outputs": [], @@ -215,7 +368,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "10", "metadata": {}, "outputs": [], @@ -239,7 +392,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "12", "metadata": {}, "outputs": [], @@ -251,10 +404,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "13", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "((100000, 6), (100000,))" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "X.shape, y.shape" ] @@ -276,7 +440,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "15", "metadata": {}, "outputs": [], @@ -296,7 +460,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "17", "metadata": {}, "outputs": [], @@ -306,23 +470,75 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 54, "id": "18", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([1, 3])" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "tokenizer = HuggingFaceTokenizer.load_from_pretrained(\"google-bert/bert-base-uncased\")\n", "tokenizer.tokenize(text[0]).input_ids.shape" ] }, + { + "cell_type": "code", + "execution_count": 55, + "id": "0501c22e", + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "30522" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.vocab_size" + ] + }, { "cell_type": "code", "execution_count": null, "id": "19", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "torch.Size([256, 125])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "tokenizer = WordPieceTokenizer(5000, output_dim=125)\n", + "tokenizer = WordPieceTokenizer(vocab_size=5000, output_dim=125)\n", "tokenizer.train(text)\n", "tokenizer.tokenize(text[:256]).input_ids.shape" ] @@ -337,7 +553,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "21", "metadata": {}, "outputs": [], @@ -354,7 +570,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "22", "metadata": {}, "outputs": [], @@ -382,17 +598,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "23", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[75, 8, 14, 2, 4]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "X[:, 1:].max(axis=0).tolist()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "24", "metadata": {}, "outputs": [], @@ -408,7 +635,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "25", "metadata": {}, "outputs": [], @@ -423,10 +650,51 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "26", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "TextClassificationModel(\n", + " (text_embedder): TextEmbedder(\n", + " (embedding_layer): Embedding(5000, 96, padding_idx=1)\n", + " (transformer): ModuleDict(\n", + " (h): ModuleList(\n", + " (0): Block(\n", + " (attn): SelfAttentionLayer(\n", + " (c_q): Linear(in_features=96, out_features=96, bias=False)\n", + " (c_k): Linear(in_features=96, out_features=96, bias=False)\n", + " (c_v): Linear(in_features=96, out_features=96, bias=False)\n", + " (c_proj): Linear(in_features=96, out_features=96, bias=False)\n", + " )\n", + " (mlp): MLP(\n", + " (c_fc): Linear(in_features=96, out_features=384, bias=False)\n", + " (c_proj): Linear(in_features=384, out_features=96, bias=False)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (categorical_variable_net): CategoricalVariableNet(\n", + " (categorical_embedding_0): Embedding(76, 25)\n", + " (categorical_embedding_1): Embedding(9, 25)\n", + " (categorical_embedding_2): Embedding(15, 25)\n", + " (categorical_embedding_3): Embedding(3, 25)\n", + " (categorical_embedding_4): Embedding(5, 25)\n", + " )\n", + " (classification_head): ClassificationHead(\n", + " (net): Linear(in_features=121, out_features=583, bias=True)\n", + " )\n", + ")" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "model = TextClassificationModel(\n", " text_embedder=text_embedder,\n", @@ -438,10 +706,55 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "27", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "TextClassificationModule(\n", + " (model): TextClassificationModel(\n", + " (text_embedder): 
TextEmbedder(\n", + " (embedding_layer): Embedding(5000, 96, padding_idx=1)\n", + " (transformer): ModuleDict(\n", + " (h): ModuleList(\n", + " (0): Block(\n", + " (attn): SelfAttentionLayer(\n", + " (c_q): Linear(in_features=96, out_features=96, bias=False)\n", + " (c_k): Linear(in_features=96, out_features=96, bias=False)\n", + " (c_v): Linear(in_features=96, out_features=96, bias=False)\n", + " (c_proj): Linear(in_features=96, out_features=96, bias=False)\n", + " )\n", + " (mlp): MLP(\n", + " (c_fc): Linear(in_features=96, out_features=384, bias=False)\n", + " (c_proj): Linear(in_features=384, out_features=96, bias=False)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (categorical_variable_net): CategoricalVariableNet(\n", + " (categorical_embedding_0): Embedding(76, 25)\n", + " (categorical_embedding_1): Embedding(9, 25)\n", + " (categorical_embedding_2): Embedding(15, 25)\n", + " (categorical_embedding_3): Embedding(3, 25)\n", + " (categorical_embedding_4): Embedding(5, 25)\n", + " )\n", + " (classification_head): ClassificationHead(\n", + " (net): Linear(in_features=121, out_features=583, bias=True)\n", + " )\n", + " )\n", + " (loss): CrossEntropyLoss()\n", + " (accuracy_fn): MulticlassAccuracy()\n", + ")" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import torch\n", "\n", @@ -467,7 +780,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "29", "metadata": {}, "outputs": [], @@ -494,30 +807,253 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "30", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array(['livraisons a domicile repas et marchandises divers activité de services',\n", + " 75, 7, 1, 1, 0], dtype=object)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "X_train[1, :]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "31", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([256, 125])" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "tokenizer.tokenize(X_train[:256, 0].tolist()).input_ids.shape" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "32", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: True (cuda), used: True\n", + "TPU available: False, using: 0 TPU cores\n", + "HPU available: False, using: 0 HPUs\n", + "/home/onyxia/work/torchTextClassifiers/.venv/lib/python3.13/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. 
Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default\n", + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n", + "\n", + " | Name | Type | Params | Mode \n", + "----------------------------------------------------------------\n", + "0 | model | TextClassificationModel | 664 K | train\n", + "1 | loss | CrossEntropyLoss | 0 | train\n", + "2 | accuracy_fn | MulticlassAccuracy | 0 | train\n", + "----------------------------------------------------------------\n", + "664 K Trainable params\n", + "0 Non-trainable params\n", + "664 K Total params\n", + "2.658 Total estimated model params size (MB)\n", + "24 Modules in train mode\n", + "0 Modules in eval mode\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b076ed12bfcb48879c03faa917b21b30", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | | 0/? [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from matplotlib import pyplot as plt\n", + "top_k = char_attributions.shape[0]\n", + "\n", + "all_plots = []\n", + "for i in range(1):\n", + " plt.figure(figsize=(10, 2))\n", + " plt.bar(range(len(text)), char_attributions[i])\n", + " plt.xticks(ticks=np.arange(len(text)), labels=list(text), rotation=90)\n", + " plt.title(f\"Attributions at character level for prediction {encoder.inverse_transform(np.array([predictions[i]]))[0] }\")\n", + " plt.xlabel(\"Characters in Text\")\n", + " plt.ylabel(\"Top Predictions\")\n", + " plt.tight_layout()\n", + " all_plots.append(plt)" + ] } ], "metadata": { From 269c76a30aec83afbfd0d98832d9d4844772c4c3 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Wed, 12 Nov 2025 17:30:02 +0000 Subject: [PATCH 47/66] fix!(explainability): remove nan words and fix plotting --- .../utilities/plot_explainability.py | 103 +++++++++++------- 1 file changed, 66 insertions(+), 37 deletions(-) diff --git a/torchTextClassifiers/utilities/plot_explainability.py b/torchTextClassifiers/utilities/plot_explainability.py index a2d1a04..a5ad7f8 100644 --- a/torchTextClassifiers/utilities/plot_explainability.py +++ b/torchTextClassifiers/utilities/plot_explainability.py @@ -1,3 +1,5 @@ +from typing import List, Optional + import numpy as np import torch @@ -40,14 +42,16 @@ def map_attributions_to_char(attributions, offsets, text): if attributions.ndim == 1: attributions = attributions[None, :] - attributions_per_char = np.empty((attributions.shape[0], len(text))) # top_k, text_len + attributions_per_char = np.zeros((attributions.shape[0], len(text))) # top_k, text_len for token_idx, (start, end) in enumerate(offsets): - if start == end: + if start == end: # skip special tokens continue attributions_per_char[:, start:end] = attributions[:, token_idx][:, None] - return attributions_per_char + return np.exp(attributions_per_char) / np.sum( + np.exp(attributions_per_char), axis=1, keepdims=True + ) # softmax normalization def map_attributions_to_word(attributions, word_ids): @@ -71,9 +75,14 @@ def map_attributions_to_word(attributions, word_ids): # Convert None to -1 for easier processing (PAD tokens) word_ids_int = np.array([x if x is not None else -1 for x in word_ids], dtype=int) - # Consider only tokens that belong to actual words (non-PAD) + # Filter out PAD tokens from attributions and word_ids + attributions = attributions[ + torch.arange(attributions.shape[0])[:, None], + torch.tensor(np.where(word_ids_int != -1)[0])[None, :], + ] + word_ids_int = word_ids_int[word_ids_int != -1] 
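        # Editor's illustration (hypothetical values): with word_ids = [0, 0, 1, None]
        # and attributions of shape (top_k, 4), the None entry becomes -1, the fancy
        # indexing above keeps columns [0, 1, 2], and word_ids_int == [0, 0, 1]
        # going into the np.unique() call below.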
unique_word_ids = np.unique(word_ids_int) - unique_word_ids = unique_word_ids[unique_word_ids != -1] + num_unique_words = len(unique_word_ids) top_k = attributions.shape[0] attr_with_word_id = np.concat( @@ -82,17 +91,25 @@ def map_attributions_to_word(attributions, word_ids): ) # top_k, seq_len, 2 # last dim is 2: 0 is the attribution of the token, 1 is the word_id the token is associated to - word_attributions = np.zeros((top_k, len(word_ids_int))) + word_attributions = np.zeros((top_k, num_unique_words)) for word_id in unique_word_ids: mask = attr_with_word_id[:, :, 1] == word_id # top_k, seq_len word_attributions[:, word_id] = (attr_with_word_id[:, :, 0] * mask).sum( axis=1 ) # zero-out non-matching tokens and sum attributions for all tokens belonging to the same word - return word_attributions + # assert word_attributions.sum(axis=1) == attributions.sum(axis=1), "Sum of word attributions per top_k must equal sum of token attributions per top_k." + return np.exp(word_attributions) / np.sum( + np.exp(word_attributions), axis=1, keepdims=True + ) # softmax normalization -def plot_attributions_at_char(text, attributions_per_char, title="Attributions", figsize=(10, 2)): +def plot_attributions_at_char( + text: str, + attributions_per_char: np.ndarray, + figsize=(10, 2), + titles: Optional[List[str]] = None, +): """ Plots character-level attributions as a heatmap. Args: @@ -107,23 +124,26 @@ def plot_attributions_at_char(text, attributions_per_char, title="Attributions", raise ImportError( "matplotlib is required for plotting. Please install it to use this function." ) - - plt.figure(figsize=figsize) - plt.imshow(attributions_per_char, aspect="auto", cmap="viridis") - plt.colorbar(label="Attribution Score") - plt.yticks( - ticks=np.arange(attributions_per_char.shape[0]), - labels=[f"Top {i+1}" for i in range(attributions_per_char.shape[0])], - ) - plt.xticks(ticks=np.arange(len(text)), labels=list(text), rotation=90) - plt.title(title) - plt.xlabel("Characters in Text") - plt.ylabel("Top Predictions") - plt.tight_layout() - plt.show() - - -def plot_attributions_at_word(text, attributions_per_word, title="Attributions", figsize=(10, 2)): + top_k = attributions_per_char.shape[0] + + all_plots = [] + for i in range(top_k): + fig, ax = plt.subplots(figsize=figsize) + ax.bar(range(len(text)), attributions_per_char[i]) + ax.set_xticks(np.arange(len(text))) + ax.set_xticklabels(list(text), rotation=90) + title = titles[i] if titles is not None else f"Attributions for Top {i+1} Prediction" + ax.set_title(title) + ax.set_xlabel("Characters in Text") + ax.set_ylabel("Top Predictions") + all_plots.append(fig) + + return all_plots + + +def plot_attributions_at_word( + text, attributions_per_word, figsize=(10, 2), titles: Optional[List[str]] = None +): """ Plots word-level attributions as a heatmap. 
Args: @@ -140,16 +160,25 @@ def plot_attributions_at_word(text, attributions_per_word, title="Attributions", ) words = text.split() - plt.figure(figsize=figsize) - plt.imshow(attributions_per_word, aspect="auto", cmap="viridis") - plt.colorbar(label="Attribution Score") - plt.yticks( - ticks=np.arange(attributions_per_word.shape[0]), - labels=[f"Top {i+1}" for i in range(attributions_per_word.shape[0])], - ) - plt.xticks(ticks=np.arange(len(words)), labels=words, rotation=90) - plt.title(title) - plt.xlabel("Words in Text") - plt.ylabel("Top Predictions") - plt.tight_layout() + top_k = attributions_per_word.shape[0] + all_plots = [] + for i in range(top_k): + fig, ax = plt.subplots(figsize=figsize) + ax.bar(range(len(words)), attributions_per_word[i]) + ax.set_xticks(np.arange(len(words))) + ax.set_xticklabels(words, rotation=90) + title = titles[i] if titles is not None else f"Attributions for Top {i+1} Prediction" + ax.set_title(title) + ax.set_xlabel("Words in Text") + ax.set_ylabel("Attributions") + all_plots.append(fig) + + return all_plots + + +def figshow(figure): + # https://stackoverflow.com/questions/53088212/create-multiple-figures-in-pyplot-but-only-show-one + for i in plt.get_fignums(): + if figure != plt.figure(i): + plt.close(plt.figure(i)) plt.show() From 89cc8fe50b35517904a98ecefe130df62fdc297e Mon Sep 17 00:00:00 2001 From: micedre Date: Wed, 12 Nov 2025 13:19:04 +0000 Subject: [PATCH 48/66] examples : fix basic_classification after refactor --- examples/basic_classification.py | 58 ++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/examples/basic_classification.py b/examples/basic_classification.py index da0e5bc..dae4adf 100644 --- a/examples/basic_classification.py +++ b/examples/basic_classification.py @@ -6,7 +6,9 @@ """ import numpy as np -from torchTextClassifiers import create_fasttext +from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers +from torchTextClassifiers.tokenizers import WordPieceTokenizer + def main(): print("🚀 Basic Text Classification Example") @@ -48,43 +50,57 @@ def main(): print(f"Validation samples: {len(X_val)}") print(f"Test samples: {len(X_test)}") - # Create FastText classifier - print("\n🏗️ Creating FastText classifier...") - classifier = create_fasttext( + # Create and train tokenizer + print("\n🏗️ Creating and training WordPiece tokenizer...") + tokenizer = WordPieceTokenizer(vocab_size=5000, output_dim=128) + + # Train tokenizer on the training corpus + training_corpus = X_train.tolist() + tokenizer.train(training_corpus) + print("✅ Tokenizer trained successfully!") + + # Create model configuration + print("\n🔧 Creating model configuration...") + model_config = ModelConfig( embedding_dim=50, - sparse=False, - num_tokens=5000, - min_count=1, - min_n=3, - max_n=6, - len_word_ngrams=2, num_classes=2 ) - - # Build the model - print("\n🔨 Building model...") - classifier.build(X_train, y_train) - print("✅ Model built successfully!") + + # Create classifier + print("\n🔨 Creating classifier...") + classifier = torchTextClassifiers( + tokenizer=tokenizer, + model_config=model_config + ) + print("✅ Classifier created successfully!") # Train the model print("\n🎯 Training model...") - classifier.train( - X_train, y_train, X_val, y_val, + training_config = TrainingConfig( num_epochs=20, batch_size=4, - patience_train=5, + lr=1e-3, + patience_early_stopping=5, + num_workers=0 # Use 0 for simple examples to avoid multiprocessing issues + ) + classifier.train( + X_train, y_train, 
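        # training texts/labels come first, then the validation split; all
        # hyperparameters travel in the TrainingConfig constructed just above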
X_val, y_val, + training_config=training_config, verbose=True ) print("✅ Training completed!") # Make predictions print("\n🔮 Making predictions...") - predictions = classifier.predict(X_test) + result = classifier.predict(X_test) + predictions = result["prediction"].squeeze().numpy() # Extract predictions from dictionary + confidence = result["confidence"].squeeze().numpy() # Extract confidence scores print(f"Predictions: {predictions}") + print(f"Confidence: {confidence}") print(f"True labels: {y_test}") - + # Calculate accuracy - accuracy = classifier.validate(X_test, y_test) + accuracy = (predictions == y_test).mean() print(f"Test accuracy: {accuracy:.3f}") # Show detailed results From 1b62eeeba71be0541a35b11ea9b136fc9fc62f57 Mon Sep 17 00:00:00 2001 From: micedre Date: Wed, 12 Nov 2025 13:29:34 +0000 Subject: [PATCH 49/66] Fix check for categorical variable --- torchTextClassifiers/torchTextClassifiers.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/torchTextClassifiers/torchTextClassifiers.py b/torchTextClassifiers/torchTextClassifiers.py index b6fd62d..016a021 100644 --- a/torchTextClassifiers/torchTextClassifiers.py +++ b/torchTextClassifiers/torchTextClassifiers.py @@ -223,12 +223,6 @@ def train( X_train, y_train = self._check_XY(X_train, y_train) X_val, y_val = self._check_XY(X_val, y_val) - assert ( - X_train["categorical_variables"].ndim > 1 - and X_train["categorical_variables"].shape[1] == X_val["categorical_variables"].shape[1] - or X_val["categorical_variables"].ndim == 1 - ) - if ( X_train["categorical_variables"] is not None and X_val["categorical_variables"] is not None From 704fe1492056664956ac86d81fc68b41b37dbad9 Mon Sep 17 00:00:00 2001 From: micedre Date: Thu, 13 Nov 2025 12:45:32 +0000 Subject: [PATCH 50/66] Adapt examples to new package architecture --- examples/advanced_training.py | 214 +++++++++------ examples/basic_classification.py | 69 ++++- examples/multiclass_classification.py | 94 +++++-- examples/simple_explainability_example.py | 303 ++++++++++++++-------- examples/using_additional_features.py | 225 ++++++++-------- 5 files changed, 575 insertions(+), 330 deletions(-) diff --git a/examples/advanced_training.py b/examples/advanced_training.py index e03f075..a1a21bb 100644 --- a/examples/advanced_training.py +++ b/examples/advanced_training.py @@ -6,13 +6,44 @@ and training monitoring. 
""" +import os +import random +import warnings + import numpy as np -from torchTextClassifiers import create_fasttext +import torch +from pytorch_lightning import seed_everything + +from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers +from torchTextClassifiers.tokenizers import WordPieceTokenizer def main(): + # Set seed for reproducibility + SEED = 42 + + # Set environment variables for full reproducibility + os.environ['PYTHONHASHSEED'] = str(SEED) + os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' + + # Use PyTorch Lightning's seed_everything for comprehensive seeding + seed_everything(SEED, workers=True) + + # Make PyTorch operations deterministic + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + torch.use_deterministic_algorithms(True, warn_only=True) + + # Suppress PyTorch Lightning warnings for cleaner output + warnings.filterwarnings( + 'ignore', + message='.*', + category=UserWarning, + module='pytorch_lightning' + ) + print("⚙️ Advanced Training Configuration Example") print("=" * 50) - + # Create a larger dataset for demonstrating advanced training print("📝 Creating training dataset...") @@ -67,54 +98,63 @@ def main(): print(f"Training samples: {len(X_train)}") print(f"Validation samples: {len(X_val)}") print(f"Test samples: {len(X_test)}") - - # Create FastText classifier - print("\n🏗️ Creating FastText classifier...") - classifier = create_fasttext( + + # Create and train tokenizer (shared across all examples) + print("\n🏗️ Creating and training WordPiece tokenizer...") + tokenizer = WordPieceTokenizer(vocab_size=5000, output_dim=128) + training_corpus = X_train.tolist() + tokenizer.train(training_corpus) + print("✅ Tokenizer trained successfully!") + + # Example 1: Basic training with default settings + print("\n🎯 Example 1: Basic training with default settings...") + + model_config = ModelConfig( embedding_dim=100, - sparse=False, - num_tokens=10000, - min_count=1, - min_n=3, - max_n=6, - len_word_ngrams=2, num_classes=2 ) - - # Build the model - print("\n🔨 Building model...") - classifier.build(X_train, y_train) - print("✅ Model built successfully!") - - # Example 1: Basic training with default settings - print("\n🎯 Example 1: Basic training with default settings...") - classifier.train( - X_train, y_train, X_val, y_val, + + classifier = torchTextClassifiers( + tokenizer=tokenizer, + model_config=model_config + ) + print("✅ Classifier created successfully!") + + training_config = TrainingConfig( num_epochs=15, batch_size=8, - patience_train=5, + lr=1e-3, + patience_early_stopping=5, + num_workers=0, + trainer_params={'deterministic': True} + ) + + classifier.train( + X_train, y_train, X_val, y_val, + training_config=training_config, verbose=True ) - - basic_accuracy = classifier.validate(X_test, y_test) + + result = classifier.predict(X_test) + basic_predictions = result["prediction"].squeeze().numpy() + basic_accuracy = (basic_predictions == y_test).mean() print(f"✅ Basic training completed! 
Accuracy: {basic_accuracy:.3f}") # Example 2: Advanced training with custom Lightning trainer parameters print("\n🚀 Example 2: Advanced training with custom parameters...") - + # Create a new classifier for comparison - advanced_classifier = create_fasttext( + advanced_model_config = ModelConfig( embedding_dim=100, - sparse=False, - num_tokens=10000, - min_count=1, - min_n=3, - max_n=6, - len_word_ngrams=2, num_classes=2 ) - advanced_classifier.build(X_train, y_train) - + + advanced_classifier = torchTextClassifiers( + tokenizer=tokenizer, + model_config=advanced_model_config + ) + print("✅ Advanced classifier created successfully!") + # Custom trainer parameters for advanced features advanced_trainer_params = { 'accelerator': 'auto', # Use GPU if available, else CPU @@ -125,62 +165,77 @@ def main(): 'enable_progress_bar': True, # Show progress bar 'log_every_n_steps': 5, # Log every 5 steps } - - advanced_classifier.train( - X_train, y_train, X_val, y_val, + + advanced_training_config = TrainingConfig( num_epochs=20, batch_size=4, # Smaller batch size with grad accumulation - patience_train=7, - trainer_params=advanced_trainer_params, + lr=1e-3, + patience_early_stopping=7, + num_workers=0, + cpu_run=False, # Don't override accelerator from trainer_params + trainer_params=advanced_trainer_params + ) + + advanced_classifier.train( + X_train, y_train, X_val, y_val, + training_config=advanced_training_config, verbose=True ) - - advanced_accuracy = advanced_classifier.validate(X_test, y_test) + + advanced_result = advanced_classifier.predict(X_test) + advanced_predictions = advanced_result["prediction"].squeeze().numpy() + advanced_accuracy = (advanced_predictions == y_test).mean() print(f"✅ Advanced training completed! Accuracy: {advanced_accuracy:.3f}") # Example 3: Training with CPU-only (useful for small datasets or debugging) print("\n💻 Example 3: CPU-only training...") - - cpu_classifier = create_fasttext( + + cpu_model_config = ModelConfig( embedding_dim=64, # Smaller embedding for faster CPU training - sparse=True, # Sparse embeddings for efficiency - num_tokens=5000, - min_count=1, - min_n=3, - max_n=6, - len_word_ngrams=2, num_classes=2 ) - cpu_classifier.build(X_train, y_train) - - cpu_classifier.train( - X_train, y_train, X_val, y_val, + + cpu_classifier = torchTextClassifiers( + tokenizer=tokenizer, + model_config=cpu_model_config + ) + print("✅ CPU classifier created successfully!") + + cpu_training_config = TrainingConfig( num_epochs=10, batch_size=16, # Larger batch size for CPU - cpu_run=True, # Force CPU usage + lr=1e-3, + patience_early_stopping=3, + cpu_run=False, # Don't override accelerator from trainer_params num_workers=0, # No multiprocessing for CPU - patience_train=3, + trainer_params={'deterministic': True, 'accelerator': 'cpu'} + ) + + cpu_classifier.train( + X_train, y_train, X_val, y_val, + training_config=cpu_training_config, verbose=True ) - - cpu_accuracy = cpu_classifier.validate(X_test, y_test) + + cpu_result = cpu_classifier.predict(X_test) + cpu_predictions = cpu_result["prediction"].squeeze().numpy() + cpu_accuracy = (cpu_predictions == y_test).mean() print(f"✅ CPU training completed! 
Accuracy: {cpu_accuracy:.3f}") # Example 4: Custom training with specific Lightning callbacks print("\n🔧 Example 4: Training with custom callbacks...") - - custom_classifier = create_fasttext( + + custom_model_config = ModelConfig( embedding_dim=128, - sparse=False, - num_tokens=8000, - min_count=1, - min_n=3, - max_n=6, - len_word_ngrams=2, num_classes=2 ) - custom_classifier.build(X_train, y_train) - + + custom_classifier = torchTextClassifiers( + tokenizer=tokenizer, + model_config=custom_model_config + ) + print("✅ Custom classifier created successfully!") + # Custom trainer with specific monitoring and checkpointing custom_trainer_params = { 'max_epochs': 25, @@ -189,18 +244,27 @@ def main(): 'check_val_every_n_epoch': 2, # Validate every 2 epochs 'enable_checkpointing': True, 'enable_model_summary': True, + 'deterministic': True, } - - custom_classifier.train( - X_train, y_train, X_val, y_val, + + custom_training_config = TrainingConfig( num_epochs=25, batch_size=6, - patience_train=8, - trainer_params=custom_trainer_params, + lr=1e-3, + patience_early_stopping=8, + num_workers=0, + trainer_params=custom_trainer_params + ) + + custom_classifier.train( + X_train, y_train, X_val, y_val, + training_config=custom_training_config, verbose=True ) - - custom_accuracy = custom_classifier.validate(X_test, y_test) + + custom_result = custom_classifier.predict(X_test) + custom_predictions = custom_result["prediction"].squeeze().numpy() + custom_accuracy = (custom_predictions == y_test).mean() print(f"✅ Custom training completed! Accuracy: {custom_accuracy:.3f}") # Compare all training approaches diff --git a/examples/basic_classification.py b/examples/basic_classification.py index dae4adf..2d1de9d 100644 --- a/examples/basic_classification.py +++ b/examples/basic_classification.py @@ -2,18 +2,31 @@ Basic Text Classification Example This example demonstrates how to use torchTextClassifiers for binary -text classification using the FastText classifier. +text classification using the Wrapper. """ +import os +import random +import warnings + import numpy as np +import torch from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers from torchTextClassifiers.tokenizers import WordPieceTokenizer def main(): + # Suppress PyTorch Lightning batch_size inference warnings for cleaner output + warnings.filterwarnings( + 'ignore', + message='.*', + category=UserWarning, + module='pytorch_lightning' + ) + print("🚀 Basic Text Classification Example") print("=" * 50) - + # Create sample data print("📝 Creating sample data...") X_train = np.array([ @@ -26,25 +39,58 @@ def main(): "Perfect! Exactly what I was looking for.", "Waste of money. Should have read reviews first.", "Outstanding product with excellent build quality.", - "Cheap plastic, feels like it will break soon." + "Cheap plastic, feels like it will break soon.", + "Absolutely fantastic! Exceeded all my expectations.", + "Horrible experience. Customer service was rude and unhelpful.", + "Best purchase I've made this year. Five stars!", + "Defective item arrived. Packaging was also damaged.", + "Super impressed with the performance and durability.", + "Total disappointment. Doesn't match the description at all.", + "Wonderful product! My whole family loves it.", + "Avoid at all costs. Complete waste of time and money.", + "Remarkable quality for the price. Very satisfied!", + "Broke within a week. Clearly poor manufacturing.", + "Exceptional value! Would definitely buy again.", + "Misleading photos. 
Product looks nothing like advertised.", + "Works like a charm. Installation was easy too.", + "Returned it immediately. Not worth even half the price.", + "Beautiful design and sturdy construction. Love it!", + "Arrived late and damaged. Very frustrating experience.", + "Top-notch quality! Highly recommend to everyone.", + "Uncomfortable and poorly made. Regret buying this.", + "Perfect fit and great finish. Couldn't be happier!", + "Stopped working after two uses. Complete junk." ]) - - y_train = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0]) # 1=positive, 0=negative + + y_train = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]) # 1=positive, 0=negative # Validation data X_val = np.array([ "Good product, satisfied with purchase.", - "Not worth the money, poor quality." + "Not worth the money, poor quality.", + "Really happy with this purchase. Great item!", + "Disappointed with the quality. Expected better.", + "Solid product that does what it promises.", + "Don't waste your money on this. Very poor.", + "Impressive quality and quick delivery.", + "Malfunctioned right out of the box. Terrible." ]) - y_val = np.array([1, 0]) + y_val = np.array([1, 0, 1, 0, 1, 0, 1, 0]) # Test data X_test = np.array([ "This is an amazing product with great features!", "Completely disappointed with this purchase.", - "Excellent build quality and works as expected." + "Excellent build quality and works as expected.", + "Not recommended. Had issues from day one.", + "Fantastic product! Worth every penny.", + "Failed to meet basic expectations. Very poor.", + "Love it! Exactly as described and high quality.", + "Cheap materials and sloppy construction. Avoid.", + "Superb performance and easy to use. Highly satisfied!", + "Unreliable and frustrating. Should have bought elsewhere." ]) - y_test = np.array([1, 0, 1]) + y_test = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0]) print(f"Training samples: {len(X_train)}") print(f"Validation samples: {len(X_val)}") @@ -73,7 +119,7 @@ def main(): model_config=model_config ) print("✅ Classifier created successfully!") - + print(classifier) # Train the model print("\n🎯 Training model...") training_config = TrainingConfig( @@ -81,7 +127,7 @@ def main(): batch_size=4, lr=1e-3, patience_early_stopping=5, - num_workers=0 # Use 0 for simple examples to avoid multiprocessing issues + num_workers=0, # Use 0 for simple examples to avoid multiprocessing issues ) classifier.train( X_train, y_train, X_val, y_val, @@ -115,5 +161,6 @@ def main(): print("\n🎉 Example completed successfully!") + if __name__ == "__main__": main() \ No newline at end of file diff --git a/examples/multiclass_classification.py b/examples/multiclass_classification.py index e8863b6..8498e17 100644 --- a/examples/multiclass_classification.py +++ b/examples/multiclass_classification.py @@ -2,17 +2,48 @@ Multi-class Text Classification Example This example demonstrates multi-class text classification using -torchTextClassifiers with FastText for sentiment analysis with +torchTextClassifiers for sentiment analysis with 3 classes: positive, negative, and neutral. 
""" +import os +import random +import warnings + import numpy as np -from torchTextClassifiers import create_fasttext +import torch +from pytorch_lightning import seed_everything + +from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers +from torchTextClassifiers.tokenizers import WordPieceTokenizer def main(): + # Set seed for reproducibility + SEED = 42 + + # Set environment variables for full reproducibility + os.environ['PYTHONHASHSEED'] = str(SEED) + os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' + + # Use PyTorch Lightning's seed_everything for comprehensive seeding + seed_everything(SEED, workers=True) + + # Make PyTorch operations deterministic + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + torch.use_deterministic_algorithms(True, warn_only=True) + + # Suppress PyTorch Lightning warnings for cleaner output + warnings.filterwarnings( + 'ignore', + message='.*', + category=UserWarning, + module='pytorch_lightning' + ) + print("🎭 Multi-class Text Classification Example") print("=" * 50) - + # Create multi-class sample data (3 classes: 0=negative, 1=neutral, 2=positive) print("📝 Creating multi-class sentiment data...") X_train = np.array([ @@ -63,44 +94,55 @@ def main(): print(f"Training samples: {len(X_train)}") print(f"Class distribution: Negative={sum(y_train==0)}, Neutral={sum(y_train==1)}, Positive={sum(y_train==2)}") - - # Create FastText classifier for 3 classes - print("\n🏗️ Creating multi-class FastText classifier...") - classifier = create_fasttext( + + # Create and train tokenizer + print("\n🏗️ Creating and training WordPiece tokenizer...") + tokenizer = WordPieceTokenizer(vocab_size=5000, output_dim=128) + training_corpus = X_train.tolist() + tokenizer.train(training_corpus) + print("✅ Tokenizer trained successfully!") + + # Create model configuration for 3 classes + print("\n🔧 Creating model configuration...") + model_config = ModelConfig( embedding_dim=64, - sparse=False, - num_tokens=8000, - min_count=1, - min_n=3, - max_n=6, - len_word_ngrams=2, - num_classes=3 # 3 classes for sentiment + num_classes=3 # 3 classes for sentiment (negative, neutral, positive) ) - - # Build the model - print("\n🔨 Building model...") - classifier.build(X_train, y_train) - print("✅ Model built successfully!") - + + # Create classifier + print("\n🔨 Creating multi-class classifier...") + classifier = torchTextClassifiers( + tokenizer=tokenizer, + model_config=model_config + ) + print("✅ Classifier created successfully!") + # Train the model print("\n🎯 Training model...") - classifier.train( - X_train, y_train, X_val, y_val, + training_config = TrainingConfig( num_epochs=30, batch_size=8, - patience_train=7, + lr=1e-3, + patience_early_stopping=7, + num_workers=0, + trainer_params={'deterministic': True} + ) + classifier.train( + X_train, y_train, X_val, y_val, + training_config=training_config, verbose=True ) print("✅ Training completed!") # Make predictions print("\n🔮 Making predictions...") - predictions = classifier.predict(X_test) + result = classifier.predict(X_test) + predictions = result["prediction"].squeeze().numpy() print(f"Predictions: {predictions}") print(f"True labels: {y_test}") - + # Calculate accuracy - accuracy = classifier.validate(X_test, y_test) + accuracy = (predictions == y_test).mean() print(f"Test accuracy: {accuracy:.3f}") # Define class names for better output diff --git a/examples/simple_explainability_example.py b/examples/simple_explainability_example.py index c1c44c2..57bcadd 100644 --- 
a/examples/simple_explainability_example.py +++ b/examples/simple_explainability_example.py @@ -2,14 +2,48 @@ Simple Explainability Example with ASCII Visualization """ -import numpy as np +import os import sys -from torchTextClassifiers import create_fasttext +import warnings + +import numpy as np +import torch +from pytorch_lightning import seed_everything + +from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers +from torchTextClassifiers.tokenizers import WordPieceTokenizer +from torchTextClassifiers.utilities.plot_explainability import ( + map_attributions_to_char, + map_attributions_to_word, +) def main(): + # Set seed for reproducibility + SEED = 42 + + # Set environment variables for full reproducibility + os.environ['PYTHONHASHSEED'] = str(SEED) + os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' + + # Use PyTorch Lightning's seed_everything for comprehensive seeding + seed_everything(SEED, workers=True) + + # Make PyTorch operations deterministic + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + torch.use_deterministic_algorithms(True, warn_only=True) + + # Suppress PyTorch Lightning warnings for cleaner output + warnings.filterwarnings( + 'ignore', + message='.*', + category=UserWarning, + module='pytorch_lightning' + ) + print("🔍 Simple Explainability Example") - + # Enhanced training data with more diverse examples X_train = np.array([ # Positive examples @@ -55,29 +89,51 @@ def main(): ]) X_val = np.array([ - "Good product with decent quality", + "Good product with decent quality", "Bad quality and poor service", "Excellent value and great design", "Terrible experience and awful quality" ]) y_val = np.array([1, 0, 1, 0]) - - # Create classifier - classifier = create_fasttext( + + # Create and train tokenizer + print("\n🏗️ Creating and training WordPiece tokenizer...") + tokenizer = WordPieceTokenizer(vocab_size=5000, output_dim=128) + training_corpus = X_train.tolist() + tokenizer.train(training_corpus) + print("✅ Tokenizer trained successfully!") + + # Create model configuration + print("\n🔧 Creating model configuration...") + model_config = ModelConfig( embedding_dim=50, - sparse=False, - num_tokens=1000, - min_count=1, - min_n=3, - max_n=6, - len_word_ngrams=2, - num_classes=2, - direct_bagging=False # Required for explainability + num_classes=2 ) - - # Train - classifier.build(X_train, y_train) - classifier.train(X_train, y_train, X_val, y_val, num_epochs=25, batch_size=8, verbose=False) + + # Create classifier + print("\n🔨 Creating classifier...") + classifier = torchTextClassifiers( + tokenizer=tokenizer, + model_config=model_config + ) + print("✅ Classifier created successfully!") + + # Train the model + print("\n🎯 Training model...") + training_config = TrainingConfig( + num_epochs=25, + batch_size=8, + lr=1e-3, + patience_early_stopping=5, + num_workers=0, + trainer_params={'deterministic': True} + ) + classifier.train( + X_train, y_train, X_val, y_val, + training_config=training_config, + verbose=True + ) + print("✅ Training completed!") # Test examples with different sentiments test_texts = [ @@ -94,49 +150,70 @@ def main(): for i, test_text in enumerate(test_texts, 1): print(f"\n📝 Example {i}:") print(f"Text: '{test_text}'") - - # Get prediction - prediction = classifier.predict(np.array([test_text]))[0] - print(f"Prediction: {'Positive' if prediction == 1 else 'Negative'}") - - # Get explainability scores + + # Get prediction with explainability try: - pred, confidence, all_scores, 
all_scores_letters = classifier.predict_and_explain(np.array([test_text])) - - # Create ASCII histogram - if all_scores is not None and len(all_scores) > 0: - scores_data = all_scores[0][0] - if hasattr(scores_data, 'tolist'): - scores = scores_data.tolist() - else: - scores = [float(scores_data)] - - words = test_text.split() - - if len(words) == len(scores): - print("\n📊 Word Contribution Histogram:") - print("-" * 50) - - # Find max score for scaling - max_score = max(scores) if scores else 1 - bar_width = 30 # max bar width in characters - - for word, score in zip(words, scores): - # Calculate bar length - bar_length = int((score / max_score) * bar_width) - bar = "█" * bar_length - - # Format output - print(f"{word:>12} | {bar:<30} {score:.4f}") - - print("-" * 50) - else: - print(f"⚠️ Word/score mismatch: {len(words)} words vs {len(scores)} scores") - else: - print("⚠️ No explainability scores available") - + result = classifier.predict(np.array([test_text]), top_k=1, explain=True) + + # Extract prediction + prediction = result["prediction"][0][0].item() + confidence = result["confidence"][0][0].item() + print(f"Prediction: {'Positive' if prediction == 1 else 'Negative'} (confidence: {confidence:.4f})") + + # Extract attributions and mapping info + attributions = result["attributions"][0][0] # shape: (seq_len,) + offset_mapping = result["offset_mapping"][0] # List of (start, end) tuples + word_ids = result["word_ids"][0] # List of word IDs for each token + + # Map token-level attributions to character-level (for ASCII visualization) + char_attributions = map_attributions_to_char( + attributions.unsqueeze(0), # Add batch dimension: (1, seq_len) + offset_mapping, + test_text + )[0] # Get first result + + print("\n📊 Character-Level Contribution Visualization:") + print("-" * 60) + + # Create a simple ASCII visualization by character + max_attr = max(char_attributions) if len(char_attributions) > 0 else 1 + bar_width = 40 + + # Group characters into words for better readability + words = test_text.split() + char_idx = 0 + + for word in words: + word_len = len(word) + # Get attributions for this word + word_attrs = char_attributions[char_idx:char_idx + word_len] + if len(word_attrs) > 0: + avg_attr = sum(word_attrs) / len(word_attrs) + bar_length = int((avg_attr / max_attr) * bar_width) if max_attr > 0 else 0 + bar = "█" * bar_length + print(f"{word:>15} | {bar:<40} {avg_attr:.4f}") + char_idx += word_len + 1 # +1 for space + + print("-" * 60) + + # Show top contributing word + char_idx = 0 + word_scores = [] + for word in words: + word_len = len(word) + word_attrs = char_attributions[char_idx:char_idx + word_len] + if len(word_attrs) > 0: + word_scores.append((word, sum(word_attrs) / len(word_attrs))) + char_idx += word_len + 1 + + if word_scores: + top_word, top_score = max(word_scores, key=lambda x: x[1]) + print(f"💡 Most influential word: '{top_word}' (avg score: {top_score:.4f})") + except Exception as e: print(f"⚠️ Explainability failed: {e}") + import traceback + traceback.print_exc() # Analysis completed for this example print(f"✅ Analysis completed for example {i}") @@ -164,56 +241,72 @@ def main(): continue print(f"\n🔍 Analyzing: '{user_text}'") - - # Get prediction - prediction = classifier.predict(np.array([user_text]))[0] - sentiment = "Positive" if prediction == 1 else "Negative" - print(f"🎯 Prediction: {sentiment}") - - # Get explainability scores + + # Get prediction with explainability try: - pred, confidence, all_scores, all_scores_letters = 
classifier.predict_and_explain(np.array([user_text])) - - # Create ASCII histogram - if all_scores is not None and len(all_scores) > 0: - scores_data = all_scores[0][0] - if hasattr(scores_data, 'tolist'): - scores = scores_data.tolist() - else: - scores = [float(scores_data)] - - words = user_text.split() - - if len(words) == len(scores): - print("\n📊 Word Contribution Histogram:") - print("-" * 50) - - # Find max score for scaling - max_score = max(scores) if scores else 1 - bar_width = 30 # max bar width in characters - - for word, score in zip(words, scores): - # Calculate bar length - bar_length = int((score / max_score) * bar_width) - bar = "█" * bar_length - - # Format output - print(f"{word:>12} | {bar:<30} {score:.4f}") - - print("-" * 50) - - # Show interpretation - top_word = max(zip(words, scores), key=lambda x: x[1]) - print(f"💡 Most influential word: '{top_word[0]}' (score: {top_word[1]:.4f})") - - else: - print(f"⚠️ Word/score mismatch: {len(words)} words vs {len(scores)} scores") - else: - print("⚠️ No explainability scores available") - + result = classifier.predict(np.array([user_text]), top_k=1, explain=True) + + # Extract prediction + prediction = result["prediction"][0][0].item() + confidence = result["confidence"][0][0].item() + sentiment = "Positive" if prediction == 1 else "Negative" + print(f"🎯 Prediction: {sentiment} (confidence: {confidence:.4f})") + + # Extract attributions and mapping info + attributions = result["attributions"][0][0] # shape: (seq_len,) + offset_mapping = result["offset_mapping"][0] # List of (start, end) tuples + word_ids = result["word_ids"][0] # List of word IDs for each token + + # Map token-level attributions to character-level (for ASCII visualization) + char_attributions = map_attributions_to_char( + attributions.unsqueeze(0), # Add batch dimension: (1, seq_len) + offset_mapping, + user_text + )[0] # Get first result + + print("\n📊 Character-Level Contribution Visualization:") + print("-" * 60) + + # Create a simple ASCII visualization by character + max_attr = max(char_attributions) if len(char_attributions) > 0 else 1 + bar_width = 40 + + # Group characters into words for better readability + words = user_text.split() + char_idx = 0 + + for word in words: + word_len = len(word) + # Get attributions for this word + word_attrs = char_attributions[char_idx:char_idx + word_len] + if len(word_attrs) > 0: + avg_attr = sum(word_attrs) / len(word_attrs) + bar_length = int((avg_attr / max_attr) * bar_width) if max_attr > 0 else 0 + bar = "█" * bar_length + print(f"{word:>15} | {bar:<40} {avg_attr:.4f}") + char_idx += word_len + 1 # +1 for space + + print("-" * 60) + + # Show interpretation + char_idx = 0 + word_scores = [] + for word in words: + word_len = len(word) + word_attrs = char_attributions[char_idx:char_idx + word_len] + if len(word_attrs) > 0: + word_scores.append((word, sum(word_attrs) / len(word_attrs))) + char_idx += word_len + 1 + + if word_scores: + top_word, top_score = max(word_scores, key=lambda x: x[1]) + print(f"💡 Most influential word: '{top_word}' (avg score: {top_score:.4f})") + except Exception as e: print(f"⚠️ Explainability failed: {e}") print("🔍 Prediction available, but detailed explanation unavailable.") + import traceback + traceback.print_exc() print("\n" + "-"*50) diff --git a/examples/using_additional_features.py b/examples/using_additional_features.py index e22796c..854c0e4 100644 --- a/examples/using_additional_features.py +++ b/examples/using_additional_features.py @@ -2,19 +2,26 @@ Categorical Features 
Comparison Example This example demonstrates the performance difference between: -1. A FastText classifier using only text features -2. A FastText classifier using both text and categorical features +1. A classifier using only text features +2. A classifier using both text and categorical features """ +import os +import random +import time +import warnings + import numpy as np import pandas as pd +import torch +from pytorch_lightning import seed_everything from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder -from torchTextClassifiers import create_fasttext -from torchTextClassifiers.utilities.preprocess import clean_text_feature -from torchTextClassifiers import torchTextClassifiers -from torchTextClassifiers.classifiers.simple_text_classifier import SimpleTextWrapper, SimpleTextConfig -import time + +from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers +from torchTextClassifiers.tokenizers import WordPieceTokenizer +# Note: SimpleTextWrapper is not available in the current version +# from torchTextClassifiers.classifiers.simple_text_classifier import SimpleTextConfig, SimpleTextWrapper def stratified_split_rare_labels(X, y, test_size=0.2, min_train_samples=1): # Get unique labels and their frequencies @@ -95,131 +102,102 @@ def train_and_evaluate_model(X, y, model_name, use_categorical=False, use_simple X_temp, y_temp, test_size=0.5 # Split temp 50/50 into validation and test ) + # Note: SimpleTextWrapper is not available in the current version + # The use_simple branch has been disabled if use_simple: - start_time = time.time() - - simple_text_config = SimpleTextConfig( - hidden_dim=128, - num_classes=5, - max_features=5000, - learning_rate=1e-3, - dropout_rate=0.2 - ) - wrapper = SimpleTextWrapper(simple_text_config) - classifier = torchTextClassifiers(wrapper) - print(f"Classifier type: {type(classifier.classifier).__name__}") - print(f"Uses tokenizer: {hasattr(classifier.classifier, 'tokenizer')}") - print(f"Uses vectorizer: {hasattr(classifier.classifier, 'vectorizer')}") - - # Build the model (this will use TF-IDF vectorization instead of tokenization) - print("\n🔨 Building model with TF-IDF preprocessing...") - classifier.build(X_train, y_train) - print("✅ Model built successfully!") - print(f"TF-IDF features: {len(classifier.classifier.vectorizer.get_feature_names_out())}") - - # Train the model - print("\n🎯 Training model...") - classifier.train( - X_train, y_train, X_val, y_val, - num_epochs=10, - batch_size=4, - patience_train=3, - verbose=True + raise NotImplementedError( + "SimpleTextWrapper is not available in the current version. " + "Please use the standard torchTextClassifiers with WordPieceTokenizer instead." 
) - training_time = time.time() - start_time - accuracy = classifier.validate(X_test, y_test) - print(f"Test accuracy: {accuracy:.3f}") - - return { - 'model_name': model_name, - 'test_accuracy': accuracy, - 'training_time': training_time, - 'classifier': classifier - } - # Model parameters + # Create and train tokenizer + print(" 🏗️ Creating and training tokenizer...") + tokenizer = WordPieceTokenizer(vocab_size=5000, output_dim=128) + + # Extract text column for tokenizer training + if use_categorical: + text_data = X_train[:, 0].tolist() # First column is text + else: + text_data = X_train.tolist() # All data is text + + tokenizer.train(text_data) + print(" ✅ Tokenizer trained successfully!") + + # Model configuration if use_categorical: # For mixed model - get vocabulary sizes from data - # cat_data = X_train[:, 1:].astype(int) # Categorical features - # vocab_sizes = [int(np.max(cat_data[:, i]) + 1) for i in range(cat_data.shape[1])] - - model_params = { - "embedding_dim": 50, - "sparse": False, - "num_tokens": 50000, - "min_count": 1, - "min_n": 3, - "max_n": 6, - "len_word_ngrams": 2, - "categorical_embedding_dims": 10, - #"num_categorical_features": num_cat_var, - #"categorical_vocabulary_sizes": vocab_sizes, - #"categorical_embedding_dims": 10 - } - #print(f" Categorical vocabulary sizes: {vocab_sizes}") + cat_data = X_train[:, 1:].astype(int) # Categorical features + vocab_sizes = [int(np.max(cat_data[:, i]) + 1) for i in range(cat_data.shape[1])] + + model_config = ModelConfig( + embedding_dim=50, + categorical_vocabulary_sizes=vocab_sizes, + categorical_embedding_dims=10, + num_classes=5 + ) + print(f" Categorical vocabulary sizes: {vocab_sizes}") else: # For text-only model - model_params = { - "embedding_dim": 50, - "sparse": False, - - "num_tokens": 50000, - "min_count": 1, - "min_n": 3, - "max_n": 6, - "len_word_ngrams": 2 + model_config = ModelConfig( + embedding_dim=50, + num_classes=5 + ) + + # Create classifier + print(" 🔨 Creating classifier...") + classifier = torchTextClassifiers( + tokenizer=tokenizer, + model_config=model_config + ) + print(" ✅ Classifier created successfully!") + + # Training configuration + training_config = TrainingConfig( + num_epochs=50, + batch_size=128, + lr=0.001, + patience_early_stopping=3, + num_workers=0, + trainer_params={ + 'enable_progress_bar': True, + 'deterministic': True } - - # Training parameters - reduced to save disk space - train_params = { - "num_epochs": 50, - "batch_size": 128, - "patience_train": 3, - "lr": 0.001, - "verbose": True - } + ) - extra_trainer_params = { - "enable_progress_bar": True, - - } - # Create and build model start_time = time.time() - - classifier = create_fasttext(**model_params) - classifier.build(X_train, y_train) - - # Train model - disable logging to save disk space + + # Train model + print(" 🎯 Training model...") classifier.train( - X_train, y_train, X_val, y_val, **train_params, - trainer_params=extra_trainer_params + X_train, y_train, X_val, y_val, + training_config=training_config, + verbose=True ) training_time = time.time() - start_time - + # Handle predictions based on model type if use_categorical: - # Skip validation for mixed model due to categorical prediction bug print(" ✅ Running validation for text-with-categorical-variables model...") try: - test_accuracy = classifier.validate(X_test, y_test) - predictions = classifier.predict(X_test) + result = classifier.predict(X_test) + predictions = result["prediction"].squeeze().numpy() + test_accuracy = (predictions == y_test).mean() 
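            # editor's note: predict() returns a dict of tensors; "prediction" has
            # shape (n_samples, top_k), as in the explainability example, so
            # squeeze() drops the singleton top_k axis before the element-wise
            # comparison with y_test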
print(f" Test accuracy: {test_accuracy:.3f}") except Exception as e: print(f" ⚠️ Validation failed: {e}") - train_accuracy = 0.0 test_accuracy = 0.0 predictions = np.zeros(len(y_test)) else: # Text-only model works fine for predictions print(" ✅ Running validation for text-only model...") try: - test_accuracy = classifier.validate(X_test, y_test) - predictions = classifier.predict(X_test) + result = classifier.predict(X_test) + predictions = result["prediction"].squeeze().numpy() + test_accuracy = (predictions == y_test).mean() print(f" Test accuracy: {test_accuracy:.3f}") except Exception as e: print(f" ⚠️ Validation failed: {e}") - train_accuracy = 0.0 test_accuracy = 0.0 predictions = np.zeros(len(y_test)) @@ -236,11 +214,34 @@ def train_and_evaluate_model(X, y, model_name, use_categorical=False, use_simple def main(): - print("🔀 FastText Classifier: Categorical Features Comparison") + # Set seed for reproducibility + SEED = 42 + + # Set environment variables for full reproducibility + os.environ['PYTHONHASHSEED'] = str(SEED) + os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' + + # Use PyTorch Lightning's seed_everything for comprehensive seeding + seed_everything(SEED, workers=True) + + # Make PyTorch operations deterministic + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + torch.use_deterministic_algorithms(True, warn_only=True) + + # Suppress PyTorch Lightning warnings for cleaner output + warnings.filterwarnings( + 'ignore', + message='.*', + category=UserWarning, + module='pytorch_lightning' + ) + + print("🔀 Classifier: Categorical Features Comparison") print("=" * 60) - print("Comparing FastText performance with and without categorical features") + print("Comparing performance with and without categorical features") print() - + # Load and prepare data (same as notebook) X_text_only, X_mixed, y, encoder = load_and_prepare_data() @@ -250,17 +251,17 @@ def main(): # Text-only model results_text_only = train_and_evaluate_model( - X_text_only, y, "Text-Only FastText", use_categorical=False + X_text_only, y, "Text-Only Classifier", use_categorical=False ) - + # Mixed model (text + categorical) results_mixed = train_and_evaluate_model( - X_mixed, y, "Mixed Features FastText", use_categorical=True + X_mixed, y, "Mixed Features Classifier", use_categorical=True ) - # TF-IDF classifier - results_tfidf = train_and_evaluate_model(X_text_only, y, "TF-IDF classifier", use_categorical=False, use_simple=True) - + # Note: TF-IDF classifier (SimpleTextWrapper) is not available in the current version + # results_tfidf = train_and_evaluate_model(X_text_only, y, "TF-IDF classifier", use_categorical=False, use_simple=True) + # Compare results print(f"\n📊 Results Comparison:") print("=" * 50) @@ -270,8 +271,6 @@ def main(): f"{results_text_only['test_accuracy']:<11.3f} {results_text_only['training_time']:<10.1f}") print(f"{'Mixed Features':<25} " f"{results_mixed['test_accuracy']:<11.3f} {results_mixed['training_time']:<10.1f}") - print(f"{'TF-IDF':<25} " - f"{results_tfidf['test_accuracy']:<11.3f} {results_tfidf['training_time']:<10.1f}") # Calculate improvements acc_improvement = results_mixed['test_accuracy'] - results_text_only['test_accuracy'] time_overhead = results_mixed['training_time'] - results_text_only['training_time'] From be4acf24ac34d8294758d0da18dbeda2ab9e325d Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Thu, 13 Nov 2025 14:35:46 +0000 Subject: [PATCH 51/66] chore: first draft of example notebook. 
WIP --- notebooks/example.ipynb | 838 ++++++---------------------------------- 1 file changed, 114 insertions(+), 724 deletions(-) diff --git a/notebooks/example.ipynb b/notebooks/example.ipynb index e77f89e..c29d17d 100644 --- a/notebooks/example.ipynb +++ b/notebooks/example.ipynb @@ -18,8 +18,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "id": "2", + "execution_count": null, + "id": "1", "metadata": {}, "outputs": [], "source": [ @@ -47,7 +47,7 @@ }, { "cell_type": "markdown", - "id": "3", + "id": "2", "metadata": {}, "source": [ "# Load and preprocess data\n", @@ -58,212 +58,31 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "4", + "execution_count": null, + "id": "3", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "df = pd.read_parquet(\"https://minio.lab.sspcloud.fr/projet-ape/data/08112022_27102024/naf2008/split/df_train.parquet\")\n", - "df = df.sample(100000)" + "df = df.sample(10)" ] }, { "cell_type": "code", - "execution_count": 3, - "id": "5", + "execution_count": null, + "id": "4", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
apet_finalelibelleCJNATTYPSRFCRT
13906916201ZLe développement et l'exploitation d'un réseau...549999C0.0P
6676526831ZImmobilier : agent commercial en immobilier : ...NaNNaNR0.0NaN
811995320ZLe point relais est un service qui permet de s...NaNNaNX0.0P
6099237410ZCREATION OBJET PERSONNALISE MULTI SUPPORT, act...NaN99M0.0P
14940851071Dconfection de desserts, pâtisseries artisanale...NaN10M6.0P
........................
7755057311ZVente de prospect, Marketing publicitaire, Cré...549999C0.0P
3942299329ZDJ activité de servicesNaN99C0.0P
1494016820BL'acquisition, l'administration et la gestion ...654099G0.0P
4643647021ZAgence de communication, notamment le conseil ...549999C0.0P
2807075320ZLivraisons activité de servicesNaN99C0.0P
\n", - "

100000 rows × 7 columns

\n", - "
" - ], - "text/plain": [ - " apet_finale libelle CJ \\\n", - "1390691 6201Z Le développement et l'exploitation d'un réseau... 5499 \n", - "667652 6831Z Immobilier : agent commercial en immobilier : ... NaN \n", - "81199 5320Z Le point relais est un service qui permet de s... NaN \n", - "609923 7410Z CREATION OBJET PERSONNALISE MULTI SUPPORT, act... NaN \n", - "1494085 1071D confection de desserts, pâtisseries artisanale... NaN \n", - "... ... ... ... \n", - "775505 7311Z Vente de prospect, Marketing publicitaire, Cré... 5499 \n", - "394229 9329Z DJ activité de services NaN \n", - "149401 6820B L'acquisition, l'administration et la gestion ... 6540 \n", - "464364 7021Z Agence de communication, notamment le conseil ... 5499 \n", - "280707 5320Z Livraisons activité de services NaN \n", - "\n", - " NAT TYP SRF CRT \n", - "1390691 99 C 0.0 P \n", - "667652 NaN R 0.0 NaN \n", - "81199 NaN X 0.0 P \n", - "609923 99 M 0.0 P \n", - "1494085 10 M 6.0 P \n", - "... ... .. ... ... \n", - "775505 99 C 0.0 P \n", - "394229 99 C 0.0 P \n", - "149401 99 G 0.0 P \n", - "464364 99 C 0.0 P \n", - "280707 99 C 0.0 P \n", - "\n", - "[100000 rows x 7 columns]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df" ] }, { "cell_type": "code", - "execution_count": 4, - "id": "7", + "execution_count": null, + "id": "5", "metadata": {}, "outputs": [], "source": [ @@ -342,8 +161,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "8", + "execution_count": null, + "id": "6", "metadata": {}, "outputs": [], "source": [ @@ -354,7 +173,7 @@ }, { "cell_type": "markdown", - "id": "9", + "id": "7", "metadata": {}, "source": [ "Right now, the model requires the label (variable y) to be a numerical\n", @@ -368,8 +187,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "10", + "execution_count": null, + "id": "8", "metadata": {}, "outputs": [], "source": [ @@ -379,7 +198,7 @@ }, { "cell_type": "markdown", - "id": "11", + "id": "9", "metadata": {}, "source": [ "The function `clean_and_tokenize_df` requires special `DataFrame`\n", @@ -392,8 +211,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "12", + "execution_count": null, + "id": "10", "metadata": {}, "outputs": [], "source": [ @@ -404,28 +223,17 @@ }, { "cell_type": "code", - "execution_count": 8, - "id": "13", + "execution_count": null, + "id": "11", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "((100000, 6), (100000,))" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "X.shape, y.shape" ] }, { "cell_type": "markdown", - "id": "14", + "id": "12", "metadata": {}, "source": [ "## Splitting in train-test sets\n", @@ -440,8 +248,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "15", + "execution_count": null, + "id": "13", "metadata": {}, "outputs": [], "source": [ @@ -452,7 +260,7 @@ }, { "cell_type": "markdown", - "id": "16", + "id": "14", "metadata": {}, "source": [ "## Tokenizer" @@ -460,8 +268,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "id": "17", + "execution_count": null, + "id": "15", "metadata": {}, "outputs": [], "source": [ @@ -470,21 +278,10 @@ }, { "cell_type": "code", - "execution_count": 54, - "id": "18", + "execution_count": null, + "id": "16", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([1, 3])" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], + 
"outputs": [], "source": [ "tokenizer = HuggingFaceTokenizer.load_from_pretrained(\"google-bert/bert-base-uncased\")\n", "tokenizer.tokenize(text[0]).input_ids.shape" @@ -492,21 +289,10 @@ }, { "cell_type": "code", - "execution_count": 55, - "id": "0501c22e", + "execution_count": null, + "id": "17", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "30522" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "tokenizer.vocab_size" ] @@ -514,29 +300,9 @@ { "cell_type": "code", "execution_count": null, - "id": "19", + "id": "18", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "\n" - ] - }, - { - "data": { - "text/plain": [ - "torch.Size([256, 125])" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "tokenizer = WordPieceTokenizer(vocab_size=5000, output_dim=125)\n", "tokenizer.train(text)\n", @@ -545,7 +311,7 @@ }, { "cell_type": "markdown", - "id": "20", + "id": "19", "metadata": {}, "source": [ "## Consider each component indepedently" @@ -553,8 +319,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "id": "21", + "execution_count": null, + "id": "20", "metadata": {}, "outputs": [], "source": [ @@ -570,8 +336,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "id": "22", + "execution_count": null, + "id": "21", "metadata": {}, "outputs": [], "source": [ @@ -598,29 +364,18 @@ }, { "cell_type": "code", - "execution_count": 15, - "id": "23", + "execution_count": null, + "id": "22", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[75, 8, 14, 2, 4]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "X[:, 1:].max(axis=0).tolist()" ] }, { "cell_type": "code", - "execution_count": 16, - "id": "24", + "execution_count": null, + "id": "23", "metadata": {}, "outputs": [], "source": [ @@ -635,8 +390,8 @@ }, { "cell_type": "code", - "execution_count": 17, - "id": "25", + "execution_count": null, + "id": "24", "metadata": {}, "outputs": [], "source": [ @@ -650,51 +405,10 @@ }, { "cell_type": "code", - "execution_count": 18, - "id": "26", + "execution_count": null, + "id": "25", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "TextClassificationModel(\n", - " (text_embedder): TextEmbedder(\n", - " (embedding_layer): Embedding(5000, 96, padding_idx=1)\n", - " (transformer): ModuleDict(\n", - " (h): ModuleList(\n", - " (0): Block(\n", - " (attn): SelfAttentionLayer(\n", - " (c_q): Linear(in_features=96, out_features=96, bias=False)\n", - " (c_k): Linear(in_features=96, out_features=96, bias=False)\n", - " (c_v): Linear(in_features=96, out_features=96, bias=False)\n", - " (c_proj): Linear(in_features=96, out_features=96, bias=False)\n", - " )\n", - " (mlp): MLP(\n", - " (c_fc): Linear(in_features=96, out_features=384, bias=False)\n", - " (c_proj): Linear(in_features=384, out_features=96, bias=False)\n", - " )\n", - " )\n", - " )\n", - " )\n", - " )\n", - " (categorical_variable_net): CategoricalVariableNet(\n", - " (categorical_embedding_0): Embedding(76, 25)\n", - " (categorical_embedding_1): Embedding(9, 25)\n", - " (categorical_embedding_2): Embedding(15, 25)\n", - " (categorical_embedding_3): Embedding(3, 25)\n", - " (categorical_embedding_4): Embedding(5, 25)\n", - " )\n", - " (classification_head): ClassificationHead(\n", - " (net): 
Linear(in_features=121, out_features=583, bias=True)\n", - " )\n", - ")" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "model = TextClassificationModel(\n", " text_embedder=text_embedder,\n", @@ -706,55 +420,10 @@ }, { "cell_type": "code", - "execution_count": 19, - "id": "27", + "execution_count": null, + "id": "26", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "TextClassificationModule(\n", - " (model): TextClassificationModel(\n", - " (text_embedder): TextEmbedder(\n", - " (embedding_layer): Embedding(5000, 96, padding_idx=1)\n", - " (transformer): ModuleDict(\n", - " (h): ModuleList(\n", - " (0): Block(\n", - " (attn): SelfAttentionLayer(\n", - " (c_q): Linear(in_features=96, out_features=96, bias=False)\n", - " (c_k): Linear(in_features=96, out_features=96, bias=False)\n", - " (c_v): Linear(in_features=96, out_features=96, bias=False)\n", - " (c_proj): Linear(in_features=96, out_features=96, bias=False)\n", - " )\n", - " (mlp): MLP(\n", - " (c_fc): Linear(in_features=96, out_features=384, bias=False)\n", - " (c_proj): Linear(in_features=384, out_features=96, bias=False)\n", - " )\n", - " )\n", - " )\n", - " )\n", - " )\n", - " (categorical_variable_net): CategoricalVariableNet(\n", - " (categorical_embedding_0): Embedding(76, 25)\n", - " (categorical_embedding_1): Embedding(9, 25)\n", - " (categorical_embedding_2): Embedding(15, 25)\n", - " (categorical_embedding_3): Embedding(3, 25)\n", - " (categorical_embedding_4): Embedding(5, 25)\n", - " )\n", - " (classification_head): ClassificationHead(\n", - " (net): Linear(in_features=121, out_features=583, bias=True)\n", - " )\n", - " )\n", - " (loss): CrossEntropyLoss()\n", - " (accuracy_fn): MulticlassAccuracy()\n", - ")" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import torch\n", "\n", @@ -772,7 +441,7 @@ }, { "cell_type": "markdown", - "id": "28", + "id": "27", "metadata": {}, "source": [ "## Using the wrapper" @@ -780,8 +449,8 @@ }, { "cell_type": "code", - "execution_count": 20, - "id": "29", + "execution_count": null, + "id": "28", "metadata": {}, "outputs": [], "source": [ @@ -807,253 +476,30 @@ }, { "cell_type": "code", - "execution_count": 21, - "id": "30", + "execution_count": null, + "id": "29", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array(['livraisons a domicile repas et marchandises divers activité de services',\n", - " 75, 7, 1, 1, 0], dtype=object)" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "X_train[1, :]" ] }, { "cell_type": "code", - "execution_count": 22, - "id": "31", + "execution_count": null, + "id": "30", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([256, 125])" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "tokenizer.tokenize(X_train[:256, 0].tolist()).input_ids.shape" ] }, { "cell_type": "code", - "execution_count": 23, - "id": "32", + "execution_count": null, + "id": "31", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "GPU available: True (cuda), used: True\n", - "TPU available: False, using: 0 TPU cores\n", - "HPU available: False, using: 0 HPUs\n", - 
"/home/onyxia/work/torchTextClassifiers/.venv/lib/python3.13/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default\n", - "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n", - "\n", - " | Name | Type | Params | Mode \n", - "----------------------------------------------------------------\n", - "0 | model | TextClassificationModel | 664 K | train\n", - "1 | loss | CrossEntropyLoss | 0 | train\n", - "2 | accuracy_fn | MulticlassAccuracy | 0 | train\n", - "----------------------------------------------------------------\n", - "664 K Trainable params\n", - "0 Non-trainable params\n", - "664 K Total params\n", - "2.658 Total estimated model params size (MB)\n", - "24 Modules in train mode\n", - "0 Modules in eval mode\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b076ed12bfcb48879c03faa917b21b30", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Sanity Checking: | | 0/? [00:00" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ - "from matplotlib import pyplot as plt\n", - "top_k = char_attributions.shape[0]\n", - "\n", - "all_plots = []\n", - "for i in range(1):\n", - " plt.figure(figsize=(10, 2))\n", - " plt.bar(range(len(text)), char_attributions[i])\n", - " plt.xticks(ticks=np.arange(len(text)), labels=list(text), rotation=90)\n", - " plt.title(f\"Attributions at character level for prediction {encoder.inverse_transform(np.array([predictions[i]]))[0] }\")\n", - " plt.xlabel(\"Characters in Text\")\n", - " plt.ylabel(\"Top Predictions\")\n", - " plt.tight_layout()\n", - " all_plots.append(plt)" + "all_plots = plot_attributions_at_char(\n", + " text=text,\n", + " attributions_per_char=char_attributions,\n", + " titles = list(map(lambda x: f\"Attributions for code {x}\", encoder.inverse_transform(np.array([predictions]).reshape(-1)).tolist())),\n", + ")\n", + "figshow(all_plots[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39", + "metadata": {}, + "outputs": [], + "source": [ + "all_plots = plot_attributions_at_word(\n", + " text=text,\n", + " attributions_per_word=word_attributions,\n", + " titles = list(map(lambda x: f\"Attributions for code {x}\", encoder.inverse_transform(np.array([predictions]).reshape(-1)).tolist())),\n", + ")\n", + "figshow(all_plots[0])" ] } ], From 0f9b4b4c31e78d56cb441bf6f00887615a75ee0c Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Mon, 17 Nov 2025 22:47:32 +0000 Subject: [PATCH 52/66] refactor: replace cpu_run with accelerator in TrainingConfig --- torchTextClassifiers/torchTextClassifiers.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/torchTextClassifiers/torchTextClassifiers.py b/torchTextClassifiers/torchTextClassifiers.py index 016a021..b5a2090 100644 --- a/torchTextClassifiers/torchTextClassifiers.py +++ b/torchTextClassifiers/torchTextClassifiers.py @@ -68,7 +68,7 @@ class TrainingConfig: loss: torch.nn.Module = field(default_factory=lambda: torch.nn.CrossEntropyLoss()) optimizer: Type[torch.optim.Optimizer] = torch.optim.Adam scheduler: 
Optional[Type[torch.optim.lr_scheduler._LRScheduler]] = None - cpu_run: bool = False + accelerator: str = "auto" num_workers: int = 12 patience_early_stopping: int = 3 dataloader_params: Optional[dict] = None @@ -237,13 +237,10 @@ def train( if verbose: logger.info("Starting training process...") - # Device setup - if training_config.cpu_run: - device = torch.device("cpu") - accelerator = "cpu" - else: + if training_config.accelerator == "auto": device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - accelerator = "gpu" if torch.cuda.is_available() else "cpu" + else: + device = torch.device(training_config.accelerator) self.device = device @@ -311,6 +308,7 @@ def train( ] trainer_params = { + "accelerator": training_config.accelerator, "callbacks": callbacks, "max_epochs": training_config.num_epochs, "num_sanity_val_steps": 2, @@ -322,7 +320,7 @@ def train( if training_config.trainer_params is not None: trainer_params.update(training_config.trainer_params) - trainer = pl.Trainer(**trainer_params, accelerator=accelerator) + trainer = pl.Trainer(**trainer_params) torch.cuda.empty_cache() torch.set_float32_matmul_precision("medium") From 5102f829946e86434088ee1e4dc83d0ab26562db Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Tue, 18 Nov 2025 18:34:05 +0000 Subject: [PATCH 53/66] feat!(tokenizer-ngram): add very fast ngram tokenizer add also tests and comparison with WordPiece --- .gitignore | 4 +- tests/benchmark_suite.py | 263 ++++++++++ tests/test_tokenizer.py | 133 ++++- torchTextClassifiers/tokenizers/ngram.py | 629 ++++++++++++----------- 4 files changed, 726 insertions(+), 303 deletions(-) create mode 100644 tests/benchmark_suite.py diff --git a/.gitignore b/.gitignore index 3b63de3..31f2232 100644 --- a/.gitignore +++ b/.gitignore @@ -174,4 +174,6 @@ fastTextAttention.py poetry.lock # vscode -.vscode/ \ No newline at end of file +.vscode/ + +benchmark_results/ diff --git a/tests/benchmark_suite.py b/tests/benchmark_suite.py new file mode 100644 index 0000000..b59d7be --- /dev/null +++ b/tests/benchmark_suite.py @@ -0,0 +1,263 @@ +""" +Simplified benchmark suite for comparing tokenizers +""" + +import random +import time +from typing import Any, Dict, List + +import matplotlib.pyplot as plt +import numpy as np + +from torchTextClassifiers.tokenizers.ngram import NGramTokenizer +from torchTextClassifiers.tokenizers.WordPiece import WordPieceTokenizer + +# ============================================================================ +# Test Data Generation +# ============================================================================ + + +def generate_test_data(num_samples: int, avg_length: int = 50) -> List[str]: + """Generate synthetic text data.""" + words = [ + "the", + "quick", + "brown", + "fox", + "jumps", + "over", + "lazy", + "dog", + "machine", + "learning", + "artificial", + "intelligence", + "neural", + "network", + "tokenizer", + "optimization", + "performance", + "benchmark", + "testing", + "python", + "pytorch", + "numpy", + "data", + "processing", + "model", + ] + + sentences = [] + for _ in range(num_samples): + length = max(5, int(np.random.normal(avg_length, avg_length // 4))) + sentence = " ".join(random.choices(words, k=length)) + sentences.append(sentence) + + return sentences + + +# ============================================================================ +# Simple Benchmark +# ============================================================================ + + +def benchmark_tokenizer(tokenizer, data: List[str], name: str, runs: int = 3) -> 
Dict:
+    """Benchmark a single tokenizer on data."""
+
+    # Warmup
+    _ = tokenizer.tokenize(data[:10])
+
+    # Benchmark
+    times = []
+    for _ in range(runs):
+        start = time.perf_counter()
+        _ = tokenizer.tokenize(data)
+        elapsed = time.perf_counter() - start
+        times.append(elapsed)
+
+    mean_time = np.mean(times)
+    throughput = len(data) / mean_time
+
+    return {
+        "name": name,
+        "time": mean_time,
+        "std": np.std(times),
+        "throughput": throughput,
+        "times": times,
+    }
+
+
+def compare_tokenizers(tokenizers: Dict[str, Any], batch_sizes: List[int] = None):
+    """
+    Compare multiple tokenizers across different batch sizes.
+
+    Args:
+        tokenizers: Dict with {name: tokenizer_instance}
+        batch_sizes: List of batch sizes to test
+    """
+
+    if batch_sizes is None:
+        batch_sizes = [100, 500, 1000, 2000]
+
+    print("=" * 80)
+    print("TOKENIZER COMPARISON")
+    print("=" * 80)
+
+    results = {name: [] for name in tokenizers.keys()}
+
+    for batch_size in batch_sizes:
+        print(f"\n--- Batch Size: {batch_size} ---")
+        test_data = generate_test_data(batch_size)
+
+        batch_results = []
+        for name, tokenizer in tokenizers.items():
+            try:
+                result = benchmark_tokenizer(tokenizer, test_data, name)
+                results[name].append(result)
+
+                print(
+                    f"{name:20s}: {result['time']:.3f}s ± {result['std']:.3f}s "
+                    f"({result['throughput']:.0f} samples/sec)"
+                )
+                batch_results.append(result)
+
+            except Exception as e:
+                print(f"{name:20s}: FAILED - {e}")
+
+        # Show speedup
+        if len(batch_results) > 1:
+            fastest = min(batch_results, key=lambda x: x["time"])
+            slowest = max(batch_results, key=lambda x: x["time"])
+            speedup = slowest["time"] / fastest["time"]
+            print(f"\n  → {fastest['name']} is {speedup:.2f}x faster than {slowest['name']}")
+
+    return results
+
+
+def plot_comparison(results: Dict[str, List[Dict]], save_path: str = "comparison.png"):
+    """Plot comparison results."""
+
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
+
+    # Plot 1: Throughput vs Batch Size
+    for name, data in results.items():
+        if not data:
+            continue
+        batch_sizes = [round(d["throughput"] * d["time"]) for d in data]  # recovers len(data): throughput = N / time
+        throughputs = [d["throughput"] for d in data]
+        ax1.plot(batch_sizes, throughputs, marker="o", label=name, linewidth=2)
+
+    ax1.set_xlabel("Batch Size")
+    ax1.set_ylabel("Throughput (samples/sec)")
+    ax1.set_title("Throughput Comparison")
+    ax1.legend()
+    ax1.grid(True, alpha=0.3)
+
+    # Plot 2: Time comparison (last batch size)
+    names = []
+    times = []
+    colors = []
+
+    for i, (name, data) in enumerate(results.items()):
+        if data:
+            names.append(name)
+            times.append(data[-1]["time"])
+            colors.append(f"C{i}")
+
+    if times:
+        bars = ax2.barh(range(len(names)), times, color=colors, alpha=0.7)
+        ax2.set_yticks(range(len(names)))
+        ax2.set_yticklabels(names)
+        ax2.set_xlabel("Time (seconds)")
+        ax2.set_title("Processing Time Comparison")
+        ax2.grid(True, alpha=0.3, axis="x")
+
+        # Add value labels
+        for i, (bar, t) in enumerate(zip(bars, times)):
+            ax2.text(t + 0.01, i, f"{t:.3f}s", va="center")
+
+        # Mark fastest
+        fastest_idx = times.index(min(times))
+        ax2.get_yticklabels()[fastest_idx].set_weight("bold")
+        ax2.get_yticklabels()[fastest_idx].set_color("green")
+
+    plt.tight_layout()
+    plt.savefig(save_path, dpi=150, bbox_inches="tight")
+    print(f"\n✓ Plot saved to {save_path}")
+    plt.close()
+
+
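+def save_results_json(results: Dict[str, List[Dict]], save_path: str = "comparison.json"):
+    """Editor's sketch, not part of the original patch: persists the raw
+    benchmark numbers next to the plot so that runs can be compared over
+    time. Uses only the fields produced by benchmark_tokenizer() above;
+    numpy scalars are cast to float so that json.dump accepts them."""
+    import json
+
+    serializable = {
+        name: [
+            {"time": float(d["time"]), "std": float(d["std"]), "throughput": float(d["throughput"])}
+            for d in data
+        ]
+        for name, data in results.items()
+    }
+    with open(save_path, "w") as f:
+        json.dump(serializable, f, indent=2)
+    print(f"\n✓ Results saved to {save_path}")
+
+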
+def print_summary(results: Dict[str, List[Dict]]):
+    """Print summary statistics."""
+
+    print("\n" + "=" * 80)
+    print("SUMMARY")
+    print("=" * 80)
+
+    # Get last batch results (largest)
+    last_batch = {name: data[-1] for name, data in results.items() if data}
+
+    if not last_batch:
+        print("No results to summarize")
+        return
+
+    fastest = min(last_batch.items(), key=lambda x: x[1]["time"])
+    slowest = max(last_batch.items(), key=lambda x: x[1]["time"])
+
+    print(f"\n🏆 Winner: {fastest[0]}")
+    print(f"   Time: {fastest[1]['time']:.3f}s")
+    print(f"   Throughput: {fastest[1]['throughput']:.0f} samples/sec")
+
+    if len(last_batch) > 1:
+        speedup = slowest[1]["time"] / fastest[1]["time"]
+        print(f"\n   {speedup:.2f}x faster than {slowest[0]}")
+
+    print("\n" + "-" * 80)
+    print("All tokenizers (sorted by speed):")
+    for name, result in sorted(last_batch.items(), key=lambda x: x[1]["time"]):
+        speedup = slowest[1]["time"] / result["time"]
+        print(f"  {name:20s}: {result['time']:.3f}s ({speedup:.2f}x)")
+
+
+if __name__ == "__main__":
+    """
+    Simple usage example:
+
+    1. Train your tokenizers
+    2. Put them in a dict
+    3. Run comparison
+    """
+
+    print("Training tokenizers...")
+    training_data = generate_test_data(1000, avg_length=30)
+
+    # Create tokenizers
+    tokenizers = {}
+
+    # NGram tokenizer
+    tokenizers["NGram"] = NGramTokenizer(
+        min_count=2,
+        min_n=2,
+        max_n=4,
+        num_tokens=10000,
+        len_word_ngrams=2,
+        training_text=training_data,
+    )
+
+    # WordPiece tokenizer
+    wp = WordPieceTokenizer(vocab_size=10000)
+    wp.train(training_corpus=training_data)
+    tokenizers["WordPiece"] = wp
+
+    print(f"\n✓ Trained {len(tokenizers)} tokenizers\n")
+
+    # Run comparison
+    results = compare_tokenizers(tokenizers, batch_sizes=[100, 500, 1000])
+
+    # Plot results
+    plot_comparison(results)
+
+    # Print summary
+    print_summary(results)
+
+    print("\n✓ Benchmark complete!")
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 0c29d1b..fd5b9b1 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -1,19 +1,124 @@
-from torchTextClassifiers.tokenizers import WordPieceTokenizer
+"""
+tests/test_tokenizer.py
 
+Pytest integration for tokenizer benchmarks.
+
+Run with: pytest tests/test_tokenizer.py --benchmark
+"""
 
-class TestWordPieceTokenizer:
-    def test_init(self):
-        tokenizer = WordPieceTokenizer(1000)
-        assert tokenizer is not None
+from pathlib import Path
 
-    def test_train(self, sample_text_data):
-        tokenizer = WordPieceTokenizer(1000)
-        tokenizer.train(sample_text_data)
+import pytest
 
-    def test_tokenize(self, sample_text_data):
-        tokenizer = WordPieceTokenizer(1000)
-        tokenizer.train(sample_text_data)
-        tokens = tokenizer.tokenize(sample_text_data[0])
-        tokens = tokenizer.tokenize(list(sample_text_data))
+from tests.benchmark_suite import (
+    compare_tokenizers,
+    generate_test_data,
+    plot_comparison,
+)
+from torchTextClassifiers.tokenizers.ngram import NGramTokenizer
+from torchTextClassifiers.tokenizers.WordPiece import WordPieceTokenizer
 
-        assert tokens is not None
+
+@pytest.fixture(scope="module")
+def training_data():
+    """Generate training data once for all tests."""
+    return generate_test_data(1000, avg_length=30)
+
+
+@pytest.fixture(scope="module")
+def ngram_tokenizer(training_data):
+    """Create and train NGram tokenizer."""
+    tokenizer = NGramTokenizer(
+        min_count=2,
+        min_n=2,
+        max_n=4,
+        num_tokens=10000,
+        len_word_ngrams=2,
+        training_text=training_data,
+    )
+    return tokenizer
+
+
+@pytest.fixture(scope="module")
+def wordpiece_tokenizer(training_data):
+    """Create and train WordPiece tokenizer."""
+    wp = WordPieceTokenizer(vocab_size=10000)
+    wp.train(training_corpus=training_data)
+    return wp
+
+
+# ============================================================================
+# Regular Tests (Always Run)
+# ============================================================================
+
+
+def test_ngram_tokenizer_basic(ngram_tokenizer):
+    """Basic sanity test for NGram tokenizer."""
+    test_text = ["hello world", "machine learning is awesome"]
+    result = ngram_tokenizer.tokenize(test_text)
+
+    assert result.input_ids is not None
+    assert result.attention_mask is not None
+    assert result.input_ids.shape[0] == len(test_text)
+
+
+def test_wordpiece_tokenizer_basic(wordpiece_tokenizer):
+    """Basic sanity test for WordPiece tokenizer."""
+    test_text = ["hello world", "machine learning is awesome"]
+    result = wordpiece_tokenizer.tokenize(test_text)
+
+    assert result.input_ids is not None
+    assert result.attention_mask is not None
+    assert result.input_ids.shape[0] == len(test_text)
+
+
+# ============================================================================
+# Benchmark Tests (Run with --benchmark flag)
+# ============================================================================
+
+
+def test_tokenizer_comparison_small(ngram_tokenizer, wordpiece_tokenizer):
+    """Compare tokenizers on small batch (CI-friendly)."""
+    tokenizers = {
+        "NGram": ngram_tokenizer,
+        "WordPiece": wordpiece_tokenizer,
+    }
+
+    # Small batch sizes for CI
+    results = compare_tokenizers(tokenizers, batch_sizes=[100, 500])
+
+    # Ensure results were generated
+    assert len(results) == 2
+    for name, data in results.items():
+        assert len(data) > 0, f"{name} produced no results"
+
+
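+def test_ngram_save_load_roundtrip(ngram_tokenizer, tmp_path):
+    """Editor's sketch, not part of the original patch: round-trips the
+    NGram tokenizer through the save_pretrained/from_pretrained pair
+    defined in ngram.py below and checks that a reloaded tokenizer
+    reproduces the same encoding. `tmp_path` is the built-in pytest
+    fixture; everything else comes from this file."""
+    import torch
+
+    ngram_tokenizer.save_pretrained(str(tmp_path))
+    reloaded = NGramTokenizer.from_pretrained(str(tmp_path))
+
+    sample = ["hello world", "machine learning is awesome"]
+    original = ngram_tokenizer.tokenize(sample)
+    restored = reloaded.tokenize(sample)
+    assert torch.equal(original.input_ids, restored.input_ids)
+    assert torch.equal(original.attention_mask, restored.attention_mask)
+
+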
+def test_tokenizer_comparison_full(ngram_tokenizer, wordpiece_tokenizer):
+    """Full benchmark comparison (for local testing)."""
+    tokenizers = {
+        "NGram": ngram_tokenizer,
+        "WordPiece": wordpiece_tokenizer,
+    }
+
+    # Full benchmark
+    results = compare_tokenizers(tokenizers, batch_sizes=[100, 500, 1000])
+
+    # Save results
+    output_dir = Path("benchmark_results")
+    output_dir.mkdir(exist_ok=True)
+
+    # Save plot
+    plot_comparison(results, save_path=str(output_dir / "comparison.png"))
+
+    # Collect JSON-serializable results
+    results_json = {}
+    for name, data in results.items():
+        results_json[name] = [
+            {
+                "batch_size": round(d["throughput"] * d["time"]),
+                "time": d["time"],
+                "throughput": d["throughput"],
+            }
+            for d in data
+        ]
+
+    print(f"\n✓ Results: {results_json}")
diff --git a/torchTextClassifiers/tokenizers/ngram.py b/torchTextClassifiers/tokenizers/ngram.py
index 285bd3c..e7476ef 100644
--- a/torchTextClassifiers/tokenizers/ngram.py
+++ b/torchTextClassifiers/tokenizers/ngram.py
@@ -1,340 +1,393 @@
-"""
-NGramTokenizer class.
-"""
-
-import ctypes
 import json
+import re
+import unicodedata
+from functools import lru_cache
+from typing import List, Optional, Tuple, Union
 
 import torch
 
-from ...utilities.preprocess import clean_text_feature
-
+from torchTextClassifiers.tokenizers import BaseTokenizer, TokenizerOutput
 
-class NGramTokenizer:
-    """
-    NGramTokenizer class.
-    """
+# ============================================================================
+# Optimized normalization
+# ============================================================================
 
-    def __init__(
-        self,
-        min_count: int,
-        min_n: int,
-        max_n: int,
-        num_tokens: int,
-        len_word_ngrams: int,
-        training_text: List[str],
-        **kwargs,
-    ):
-        """
-        Constructor for the NGramTokenizer class.
-
-        Args:
-            min_count (int): Minimum number of times a word has to be
-                in the training data to be given an embedding.
-            min_n (int): Minimum length of character n-grams.
-            max_n (int): Maximum length of character n-grams.
-            num_tokens (int): Number of rows in the embedding matrix.
-            word_ngrams (int): Maximum length of word n-grams.
-            training_text (List[str]): List of training texts.
-
-        Raises:
-            ValueError: If `min_n` is 1 or smaller.
-            ValueError: If `max_n` is 7 or higher.
-        """
-        if min_n < 2:
-            raise ValueError("`min_n` parameter must be greater than 1.")
-        if max_n > 6:
-            raise ValueError("`max_n` parameter must be smaller than 7.")
+_fasttext_non_alnum = re.compile(r"[^a-z0-9]+")
+_fasttext_multi_space = re.compile(r"\s+")
 
-        self.min_count = min_count
-        self.min_n = min_n
-        self.max_n = max_n
-        self.num_tokens = num_tokens
-        self.word_ngrams = len_word_ngrams
+# Pre-compile translation table for faster character removal
+_COMBINING_MARKS = {c: None for c in range(0x0300, 0x0370)}
 
-        word_counts = {}
-        for sentence in training_text:
-            for word in sentence.split(" "):
-                word_counts[word] = word_counts.setdefault(word, 0) + 1
 
-        self.word_id_mapping = {}
-        i = 1
-        for word, counts in word_counts.items():
-            if word_counts[word] >= min_count:
-                self.word_id_mapping[word] = i
-                i += 1
-        self.nwords = len(self.word_id_mapping)
+@lru_cache(maxsize=10000)
+def _clean_single_text_cached(text: str) -> str:
+    """Cached version of text cleaning - major speedup for repeated texts."""
+    t = text.lower()
+    t = unicodedata.normalize("NFKD", t)
+    # Faster: use translate() instead of list comprehension
+    t = t.translate(_COMBINING_MARKS)
+    t = _fasttext_non_alnum.sub(" ", t)
+    t = _fasttext_multi_space.sub(" ", t)
+    return t.strip()
 
-        self.padding_index = self.num_tokens + self.get_nwords()
 
-    def __str__(self) -> str:
-        """
-        Returns description of the NGramTokenizer.
+def clean_text_feature(texts: List[str]) -> List[str]:
+    """Vectorized text cleaning with caching."""
+    return [_clean_single_text_cached(t) for t in texts]
 
-        Returns:
-            str: Description.
- """ - return f"" - def get_nwords(self) -> int: - """ - Return number of words kept in training data. +# ============================================================================ +# Optimized hash function +# ============================================================================ - Returns: - int: Number of words. - """ - return self.nwords - def get_buckets(self) -> int: - """ - Return number of buckets for tokenizer. +def fast_hash(s: str) -> int: + """FNV-1a hash - simple and fast.""" + h = 2166136261 + for c in s: + h ^= ord(c) + h = (h * 16777619) & 0xFFFFFFFF + return h - Returns: - int: Number of buckets. - """ - return self.num_tokens - @staticmethod - def get_ngram_list(word: str, n: int) -> List[str]: - """ - Return the list of character n-grams for a word with a - given n. +# ============================================================================ +# Pre-computed subword cache +# ============================================================================ - Args: - word (str): Word. - n (int): Length of the n-grams. - Returns: - List[str]: List of character n-grams. - """ - return [word[i : i + n] for i in range(len(word) - n + 1)] +class SubwordCache: + """Aggressive pre-computation cache for subwords.""" - @staticmethod - def get_hash(subword: str) -> int: - """ - Return hash for a given subword. + def __init__( + self, + word_to_id: dict, + min_n: int, + max_n: int, + num_tokens: int, + nwords: int, + unk_token_id: int, + ): + self.cache = {} + self.word_to_id = word_to_id + self.min_n = min_n + self.max_n = max_n + self.num_tokens = num_tokens + self.nwords = nwords + self.unk_token_id = unk_token_id - Args: - subword (str): Character n-gram. + # Pre-compute for all vocabulary words + self._precompute_vocab() - Returns: - int: Corresponding hash. - """ - h = ctypes.c_uint32(2166136261).value - for c in subword: - c = ctypes.c_int8(ord(c)).value - h = ctypes.c_uint32(h ^ c).value - h = ctypes.c_uint32(h * 16777619).value - return h - - @staticmethod - def get_word_ngram_id(hashes: Tuple[int], bucket: int, nwords: int) -> int: - """ - Get word ngram index in the embedding matrix. + def _precompute_vocab(self): + """Pre-compute subwords for entire vocabulary.""" + for word, word_id in self.word_to_id.items(): + self.cache[word] = self._compute_subwords(word, word_id) - Args: - hashes (Tuple[int]): Word hashes. - bucket (int): Number of rows in embedding matrix. - nwords (int): Number of words in the vocabulary. + def _compute_subwords(self, word: str, word_id: Optional[int] = None) -> List[int]: + """Compute subword indices for a word.""" + indices = [] - Returns: - int: Word ngram hash. - """ - hashes = [ctypes.c_int32(hash_value).value for hash_value in hashes] - h = ctypes.c_uint64(hashes[0]).value - for j in range(1, len(hashes)): - h = ctypes.c_uint64((h * 116049371)).value - h = ctypes.c_uint64(h + hashes[j]).value - return h % bucket + nwords - - def get_subword_index(self, subword: str) -> int: - """ - Return the row index from the embedding matrix which - corresponds to a character n-gram. + # Add word token if in vocab + if word_id is not None: + indices.append(word_id) - Args: - subword (str): Character n-gram. + # Extract character n-grams + word_tagged = f"<{word}>" + L = len(word_tagged) - Returns: - int: Index. 
- """ - return self.get_hash(subword) % self.num_tokens + self.nwords + for n in range(self.min_n, self.max_n + 1): + for i in range(L - n + 1): + ngram = word_tagged[i : i + n] + if ngram != word and ngram != word_tagged: + bucket_idx = fast_hash(ngram) % self.num_tokens + indices.append(3 + self.nwords + bucket_idx) + + return indices if indices else [self.unk_token_id] + + def get(self, word: str) -> List[int]: + """Get subwords with on-demand computation for OOV words.""" + if word not in self.cache: + word_id = self.word_to_id.get(word) + self.cache[word] = self._compute_subwords(word, word_id) + return self.cache[word] + + +# ============================================================================ +# Vectorized encoding +# ============================================================================ + + +def encode_batch_vectorized( + sentences: List[str], + subword_cache: SubwordCache, + eos_token_id: int, + pad_token_id: int, + max_length: Optional[int] = None, + truncation: bool = False, +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Vectorized batch encoding - processes all sentences together. + Returns padded tensors directly. + """ + all_ids = [] + max_len = 0 - def get_word_index(self, word: str) -> int: - """ - Return the row index from the embedding matrix which - corresponds to a word. + # First pass: encode all sentences + for sentence in sentences: + ids = [] + words = sentence.split() - Args: - word (str): Word. + for word in words: + ids.extend(subword_cache.get(word)) - Returns: - int: Index. - """ - return self.word_id_mapping[word] + ids.append(eos_token_id) - def get_subwords(self, word: str) -> Tuple[List[str], List[int]]: - """ - Return all subwords tokens and indices for a given word. - Also adds the whole word token and indice if the word is in word_id_mapping - (==> the word is in initial vocabulary + seen at least MIN_COUNT times). - Adds tags "<" and ">" to the word. + # Truncate if needed + if truncation and max_length and len(ids) > max_length: + ids = ids[:max_length] - Args: - word (str): Word. + all_ids.append(ids) + max_len = max(max_len, len(ids)) - Returns: - Tuple[List[str], List[int]]: Tuple of tokens and indices. 
- """ - tokens = [] - word_with_tags = "<" + word + ">" + # Determine final sequence length + if max_length and not truncation: + seq_len = min(max_len, max_length) + elif max_length: + seq_len = max_length + else: + seq_len = max_len - # Get subwords and associated indices WITHOUT the whole word - for n in range(self.min_n, self.max_n + 1): - ngrams = self.get_ngram_list(word_with_tags, n) - tokens += [ - ngram for ngram in ngrams if ngram != word_with_tags and ngram != word - ] # Exclude the full word + # Pre-allocate tensors + batch_size = len(sentences) + input_ids = torch.full((batch_size, seq_len), pad_token_id, dtype=torch.long) + attention_mask = torch.zeros((batch_size, seq_len), dtype=torch.long) - indices = [self.get_subword_index(token) for token in tokens] - assert word not in tokens + # Fill tensors + for i, ids in enumerate(all_ids): + length = min(len(ids), seq_len) + input_ids[i, :length] = torch.tensor(ids[:length], dtype=torch.long) + attention_mask[i, :length] = 1 - # Add word token and indice only if the word is in word_id_mapping - if word in self.word_id_mapping.keys(): - self.get_word_index(word) - tokens = [word] + tokens - indices = [self.get_word_index(word)] + indices + return input_ids, attention_mask - return (tokens, indices) - def indices_matrix(self, sentence: str) -> tuple[torch.Tensor, dict, dict]: - """ - Returns an array of token indices for a text description. +# ============================================================================ +# NGramTokenizer - Optimized +# ============================================================================ - Args: - sentence (str): Text description. - Returns: - tuple: (torch.Tensor of indices, id_to_token dict, token_to_id dict) - """ - # Pre-split the sentence once - words = sentence.split() - words.append("") # Add end of string token +class NGramTokenizer(BaseTokenizer): + """ + Heavily optimized FastText N-gram tokenizer with: + - Pre-computed subword cache for entire vocabulary + - Vectorized batch encoding + - Cached text normalization + - Direct tensor operations + - No multiprocessing overhead + - No Numba dependency + """ - indices = [] - all_tokens_id = {} - - # Process subwords in one batch - for word in words[:-1]: # Exclude from subword processing - tokens, ind = self.get_subwords(word) - indices.extend(ind) - # Update dictionary with zip for efficiency - all_tokens_id.update(zip(tokens, ind)) - - # Add token - indices.append(0) - all_tokens_id[""] = 0 - - # Compute word n-grams more efficiently - if self.word_ngrams > 1: - # Pre-compute hashes for all words to avoid repeated computation - word_hashes = [self.get_hash(word) for word in words] - - # Generate n-grams using sliding window - word_ngram_ids = [] - for n in range(2, self.word_ngrams + 1): - for i in range(len(words) - n + 1): - # Get slice of hashes for current n-gram - gram_hashes = tuple(word_hashes[i : i + n]) - - # Compute n-gram ID - word_ngram_id = int( - self.get_word_ngram_id(gram_hashes, self.num_tokens, self.nwords) - ) - - # Store gram and its ID - gram = " ".join(words[i : i + n]) - all_tokens_id[gram] = word_ngram_id - word_ngram_ids.append(word_ngram_id) - - # Extend indices with n-gram IDs - indices.extend(word_ngram_ids) - - # Create reverse mapping once at the end - id_to_token = {v: k for k, v in all_tokens_id.items()} - - # Convert to tensor directly - return torch.tensor(indices, dtype=torch.long), id_to_token, all_tokens_id - - def tokenize(self, text: list[str], text_tokens=True, preprocess=False): - """ - Tokenize 
a list of sentences. + PAD_TOKEN = "[PAD]" + UNK_TOKEN = "[UNK]" + EOS_TOKEN = "" - Args: - text (list[str]): List of sentences. - text_tokens (bool): If True, return tokenized text in tokens. - preprocess (bool): If True, preprocess text. Needs unidecode library. + def __init__( + self, + min_count: int, + min_n: int, + max_n: int, + num_tokens: int, + len_word_ngrams: int, + training_text: Optional[List[str]] = None, + preprocess: bool = True, + **kwargs, + ): + if min_n < 2: + raise ValueError("min_n must be >= 2") + if max_n > 6: + raise ValueError("max_n must be <= 6") - Returns: - np.array: Array of indices. - """ + self.min_count = min_count + self.min_n = min_n + self.max_n = max_n + self.num_tokens = num_tokens + self.word_ngrams = len_word_ngrams + self.preprocess = preprocess - if preprocess: - text = clean_text_feature(text) + self.pad_token_id = 0 + self.unk_token_id = 1 + self.eos_token_id = 2 - tokenized_text = [] - id_to_token_dicts = [] - token_to_id_dicts = [] - for sentence in text: - all_ind, id_to_token, token_to_id = self.indices_matrix( - sentence - ) # tokenize and convert to token indices - tokenized_text.append(all_ind) - id_to_token_dicts.append(id_to_token) - token_to_id_dicts.append(token_to_id) - - if text_tokens: - tokenized_text_tokens = self._tokenized_text_in_tokens( - tokenized_text, id_to_token_dicts - ) - return tokenized_text_tokens, tokenized_text, id_to_token_dicts, token_to_id_dicts + if training_text is not None: + self._build_vocab(training_text) else: - return tokenized_text, id_to_token_dicts, token_to_id_dicts - - def _tokenized_text_in_tokens(self, tokenized_text, id_to_token_dicts): - """ - Convert tokenized text in int format to tokens in str format (given a mapping dictionary). - Private method. Used in tokenizer.tokenize and pytorch_model.predict() + self.word_to_id = {} + self.id_to_word = {} + self.nwords = 0 + self.subword_cache = None - Args: - tokenized_text (list): List of tokenized text in int format. - id_to_token_dicts (list[Dict]): List of dictionaries mapping token indices to tokens. - - Both lists have the same length (number of sentences). - - Returns: - list[list[str]]: List of tokenized text in str format. + self.vocab_size = 3 + self.nwords + self.num_tokens + super().__init__(vocab_size=self.vocab_size) + def _build_vocab(self, training_text: List[str]): + """Build vocabulary from training text.""" + word_counts = {} + for sent in training_text: + for w in sent.split(): + word_counts[w] = word_counts.get(w, 0) + 1 + + self.word_to_id = {} + idx = 3 + for w, c in word_counts.items(): + if c >= self.min_count: + self.word_to_id[w] = idx + idx += 1 + + self.nwords = len(self.word_to_id) + + # Create reverse mapping + self.id_to_word = {v: k for k, v in self.word_to_id.items()} + self.id_to_word[self.pad_token_id] = self.PAD_TOKEN + self.id_to_word[self.unk_token_id] = self.UNK_TOKEN + self.id_to_word[self.eos_token_id] = self.EOS_TOKEN + + # Pre-compute all subwords for vocabulary + print(f"Pre-computing subwords for {self.nwords} vocabulary words...") + self.subword_cache = SubwordCache( + self.word_to_id, self.min_n, self.max_n, self.num_tokens, self.nwords, self.unk_token_id + ) + print("✓ Subword cache built") + + def tokenize( + self, + text: Union[str, List[str]], + padding: str = "longest", + max_length: Optional[int] = None, + truncation: bool = False, + return_offsets_mapping: bool = False, + return_word_ids: bool = False, + **kwargs, + ) -> TokenizerOutput: + """ + Optimized tokenization with vectorized operations. 
+ Note: return_offsets_mapping and return_word_ids removed for speed. """ + is_single = isinstance(text, str) + if is_single: + text = [text] - return [ - [ - id_to_token_dicts[i][token_id.item()] - for token_id in tokenized_sentence - if token_id.item() not in {self.padding_index} - ] - for i, tokenized_sentence in enumerate(tokenized_text) - ] + # Fast cached text cleaning + if self.preprocess: + text = clean_text_feature(text) + + # Vectorized encoding + input_ids, attention_mask = encode_batch_vectorized( + text, + self.subword_cache, + self.eos_token_id, + self.pad_token_id, + max_length=max_length if padding == "max_length" else None, + truncation=truncation, + ) + + return TokenizerOutput( + input_ids=input_ids, + attention_mask=attention_mask, + word_ids=None, + offset_mapping=None, + ) + + def decode( + self, token_ids: Union[List[int], torch.Tensor], skip_special_tokens: bool = True + ) -> str: + """Decode token IDs back to text.""" + if isinstance(token_ids, torch.Tensor): + token_ids = token_ids.tolist() - def get_vocab(self): - return self.word_id_mapping + tokens = [] + for id_ in token_ids: + if id_ == self.pad_token_id and skip_special_tokens: + continue + + if id_ == self.eos_token_id: + if not skip_special_tokens: + tokens.append(self.EOS_TOKEN) + continue + + if id_ in self.id_to_word: + tokens.append(self.id_to_word[id_]) + elif not skip_special_tokens: + tokens.append(f"[ID:{id_}]") + + return " ".join(tokens) + + def batch_decode( + self, sequences: Union[List[List[int]], torch.Tensor], skip_special_tokens: bool = True + ) -> List[str]: + """Decode multiple sequences.""" + if isinstance(sequences, torch.Tensor): + sequences = sequences.tolist() + return [self.decode(seq, skip_special_tokens) for seq in sequences] + + def save_pretrained(self, save_directory: str): + """Save tokenizer configuration and vocabulary.""" + import os + + os.makedirs(save_directory, exist_ok=True) + + config = { + "min_count": self.min_count, + "min_n": self.min_n, + "max_n": self.max_n, + "num_tokens": self.num_tokens, + "len_word_ngrams": self.word_ngrams, + "word_to_id": self.word_to_id, + "preprocess": self.preprocess, + "vocab_size": self.vocab_size, + "nwords": self.nwords, + } + + with open(f"{save_directory}/tokenizer.json", "w") as f: + json.dump(config, f, indent=2) + + print(f"✓ Tokenizer saved to {save_directory}") @classmethod - def from_json(cls: Type["NGramTokenizer"], filepath: str, training_text) -> "NGramTokenizer": - """ - Load a dataclass instance from a JSON file. 
- """ - with open(filepath, "r") as f: - data = json.load(f) - return cls(**data, training_text=training_text) + def from_pretrained(cls, directory: str): + """Load tokenizer from saved configuration.""" + with open(f"{directory}/tokenizer.json", "r") as f: + config = json.load(f) + + tokenizer = cls( + min_count=config["min_count"], + min_n=config["min_n"], + max_n=config["max_n"], + num_tokens=config["num_tokens"], + len_word_ngrams=config["len_word_ngrams"], + preprocess=config["preprocess"], + training_text=None, + ) + + tokenizer.word_to_id = config["word_to_id"] + tokenizer.nwords = config["nwords"] + tokenizer.vocab_size = config["vocab_size"] + + tokenizer.id_to_word = {v: k for k, v in tokenizer.word_to_id.items()} + tokenizer.id_to_word[tokenizer.pad_token_id] = cls.PAD_TOKEN + tokenizer.id_to_word[tokenizer.unk_token_id] = cls.UNK_TOKEN + tokenizer.id_to_word[tokenizer.eos_token_id] = cls.EOS_TOKEN + + # Rebuild subword cache + print("Rebuilding subword cache...") + tokenizer.subword_cache = SubwordCache( + tokenizer.word_to_id, + tokenizer.min_n, + tokenizer.max_n, + tokenizer.num_tokens, + tokenizer.nwords, + tokenizer.unk_token_id, + ) + print("✓ Subword cache built") + + print(f"✓ Tokenizer loaded from {directory}") + return tokenizer From ab58e2679ad2edc17b85cd31bcf2a6533493e104 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Wed, 19 Nov 2025 10:39:53 +0000 Subject: [PATCH 54/66] doc: clean example notebook --- notebooks/example.ipynb | 497 ++++++++++++++++++--------- notebooks/example.qmd | 499 ---------------------------- notebooks/torchFastText_config.json | 22 -- notebooks/utils.py | 76 +---- 4 files changed, 344 insertions(+), 750 deletions(-) delete mode 100644 notebooks/example.qmd delete mode 100644 notebooks/torchFastText_config.json diff --git a/notebooks/example.ipynb b/notebooks/example.ipynb index c29d17d..cd32c28 100644 --- a/notebooks/example.ipynb +++ b/notebooks/example.ipynb @@ -13,7 +13,14 @@ "regular look to for\n", "latest information.*\n", "\n", - "To install package, you can run the following snippet" + "To download the latest (development) version of the library, you can use\n", + "```bash\n", + "uv add git+https://github.com/InseeFrLab/torchTextClassifiers\n", + "```\n", + "or, if you prefer using `pip`:\n", + "```bash\n", + "pip install git git+https://github.com/InseeFrLab/torchTextClassifiers\n", + "```\n" ] }, { @@ -23,6 +30,11 @@ "metadata": {}, "outputs": [], "source": [ + "import torch\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import LabelEncoder\n", + "\n", + "from notebooks.utils import categorize_surface, clean_and_tokenize_df\n", "from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers\n", "from torchTextClassifiers.dataset import TextClassificationDataset\n", "from torchTextClassifiers.model import TextClassificationModel, TextClassificationModule\n", @@ -66,17 +78,17 @@ "import pandas as pd\n", "\n", "df = pd.read_parquet(\"https://minio.lab.sspcloud.fr/projet-ape/data/08112022_27102024/naf2008/split/df_train.parquet\")\n", - "df = df.sample(10)" + "df" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "id": "4", "metadata": {}, - "outputs": [], "source": [ - "df" + "It contains an extract from the French business register (SIRENE). 
The `apet_finale` column contains the activity code - out target - in the French version of the [NACE nomenclature](https://ec.europa.eu/eurostat/web/products-manuals-and-guidelines/-/ks-ra-07-015). The text to classify is the `libelle` column, which contains a short description of the activity.\n", + "\n", + "Other columns are additional, **categorical** features that will be also used as inputs to the model.\n" ] }, { @@ -86,77 +98,7 @@ "metadata": {}, "outputs": [], "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.preprocessing import LabelEncoder\n", - "\n", - "\n", - "def categorize_surface(\n", - " df: pd.DataFrame, surface_feature_name: int, like_sirene_3: bool = True\n", - ") -> pd.DataFrame:\n", - " \"\"\"\n", - " Categorize the surface of the activity.\n", - "\n", - " Args:\n", - " df (pd.DataFrame): DataFrame to categorize.\n", - " surface_feature_name (str): Name of the surface feature.\n", - " like_sirene_3 (bool): If True, categorize like Sirene 3.\n", - "\n", - " Returns:\n", - " pd.DataFrame: DataFrame with a new column \"surf_cat\".\n", - " \"\"\"\n", - " df_copy = df.copy()\n", - " df_copy[surface_feature_name] = df_copy[surface_feature_name].replace(\"nan\", np.nan)\n", - " df_copy[surface_feature_name] = df_copy[surface_feature_name].astype(float)\n", - " # Check surface feature exists\n", - " if surface_feature_name not in df.columns:\n", - " raise ValueError(f\"Surface feature {surface_feature_name} not found in DataFrame.\")\n", - " # Check surface feature is a float variable\n", - " if not (pd.api.types.is_float_dtype(df_copy[surface_feature_name])):\n", - " raise ValueError(f\"Surface feature {surface_feature_name} must be a float variable.\")\n", - "\n", - " if like_sirene_3:\n", - " # Categorize the surface\n", - " df_copy[\"surf_cat\"] = pd.cut(\n", - " df_copy[surface_feature_name],\n", - " bins=[0, 120, 400, 2500, np.inf],\n", - " labels=[\"1\", \"2\", \"3\", \"4\"],\n", - " ).astype(str)\n", - " else:\n", - " # Log transform the surface\n", - " df_copy[\"surf_log\"] = np.log(df[surface_feature_name])\n", - "\n", - " # Categorize the surface\n", - " df_copy[\"surf_cat\"] = pd.cut(\n", - " df_copy.surf_log,\n", - " bins=[0, 3, 4, 5, 12],\n", - " labels=[\"1\", \"2\", \"3\", \"4\"],\n", - " ).astype(str)\n", - "\n", - " df_copy[surface_feature_name] = df_copy[\"surf_cat\"].replace(\"nan\", \"0\")\n", - " df_copy[surface_feature_name] = df_copy[surface_feature_name].astype(int)\n", - " df_copy = df_copy.drop(columns=[\"surf_log\", \"surf_cat\"], errors=\"ignore\")\n", - " return df_copy\n", - "\n", - "\n", - "def clean_and_tokenize_df(\n", - " df,\n", - " categorical_features=[\"CJ\", \"NAT\", \"TYP\", \"CRT\"],\n", - " text_feature=\"libelle_processed\",\n", - " label_col=\"apet_finale\",\n", - "):\n", - " df.fillna(\"nan\", inplace=True)\n", - " les = []\n", - " for col in categorical_features:\n", - " le = LabelEncoder()\n", - " df[col] = le.fit_transform(df[col])\n", - " les.append(le)\n", - "\n", - " df = categorize_surface(df, \"SRF\", like_sirene_3=True)\n", - " df = df[[text_feature, \"CJ\", \"NAT\", \"TYP\", \"SRF\", \"CRT\", label_col]]\n", - "\n", - " return df, les" + "df" ] }, { @@ -205,8 +147,8 @@ "formatting:\n", "\n", "- First column contains the processed text (str)\n", - "- Next ones contain the “encoded” categorical (discrete) variables in\n", - " int format" + "- Next ones contain the **“encoded”** categorical (discrete) variables in\n", + " **int 
format**, as required by torchTextClassifiers." ] }, { @@ -218,7 +160,9 @@ "source": [ "df, _ = clean_and_tokenize_df(df, text_feature=\"libelle\")\n", "X = df[[\"libelle\", \"CJ\", \"NAT\", \"TYP\", \"CRT\", \"SRF\"]].values\n", - "y = df[\"apet_finale\"].values" + "y = df[\"apet_finale\"].values\n", + "\n", + "df" ] }, { @@ -236,14 +180,7 @@ "id": "12", "metadata": {}, "source": [ - "## Splitting in train-test sets\n", - "\n", - "As usual in a learning approach, you need to break down your data into\n", - "learning and test/validation samples to obtain robust performance\n", - "statistics.\n", - "This work is the responsibility of the package’s users. Please make sure that np.max(y_train) == len(np.unique(y_train))-1 (i.e. your labels are well encoded, in a consecutive manner, starting from 0), and that all the possible labels appear at least once in the training set.\n", - "\n", - "We provide the function stratified_train_test_split to match these requirements here.." + "We now split the data into train, val and test sets, as done classically in machine learning tasks.\n" ] }, { @@ -262,6 +199,14 @@ "cell_type": "markdown", "id": "14", "metadata": {}, + "source": [ + "# Let's dive into the different components of a text classification model" + ] + }, + { + "cell_type": "markdown", + "id": "15", + "metadata": {}, "source": [ "## Tokenizer" ] @@ -269,85 +214,190 @@ { "cell_type": "code", "execution_count": null, - "id": "15", + "id": "16", "metadata": {}, "outputs": [], "source": [ - "text = X_train[:, 0].tolist()" + "text = X_train[:, 0].tolist() # extract the text column as a list" + ] + }, + { + "cell_type": "markdown", + "id": "17", + "metadata": {}, + "source": [ + "You can directly load a pretrained tokenizer from Hugging Face. But you won't have control over its vocabulary size or other parameters!" ] }, { "cell_type": "code", "execution_count": null, - "id": "16", + "id": "18", "metadata": {}, "outputs": [], "source": [ "tokenizer = HuggingFaceTokenizer.load_from_pretrained(\"google-bert/bert-base-uncased\")\n", - "tokenizer.tokenize(text[0]).input_ids.shape" + "print(\"This tokenizer outputs tensors of size \", tokenizer.tokenize(text[0]).input_ids.shape)\n", + "print(\"The tokens are here \", tokenizer.tokenizer.convert_ids_to_tokens(tokenizer.tokenize(text[0]).input_ids.squeeze(0)))\n", + "print(\"The total number of tokens is \", tokenizer.vocab_size)" + ] + }, + { + "cell_type": "markdown", + "id": "19", + "metadata": {}, + "source": [ + "Or you can train your own tokenizer from scratch." ] }, { "cell_type": "code", "execution_count": null, - "id": "17", + "id": "20", "metadata": {}, "outputs": [], "source": [ - "tokenizer.vocab_size" + "tokenizer = WordPieceTokenizer(vocab_size=5000, output_dim=125)\n", + "tokenizer.train(text)\n", + "print(\"This tokenizer outputs tensors of size \", tokenizer.tokenize(text[0]).input_ids.shape)\n", + "print(\"The tokens are here \", tokenizer.tokenizer.convert_ids_to_tokens(tokenizer.tokenize(text[0]).input_ids.squeeze(0)))\n", + "print(\"The total number of tokens is \", tokenizer.vocab_size)" + ] + }, + { + "cell_type": "markdown", + "id": "21", + "metadata": {}, + "source": [ + "## The PyTorch Dataset" + ] + }, + { + "cell_type": "markdown", + "id": "22", + "metadata": {}, + "source": [ + "To train a text classification model using PyTorch, you need to create a Dataset object that will handle the data loading and preprocessing. The `TextClassificationDataset` class from the `torchTextClassifiers` library can be used for this purpose." 
] }, { "cell_type": "code", "execution_count": null, - "id": "18", + "id": "23", "metadata": {}, "outputs": [], "source": [ - "tokenizer = WordPieceTokenizer(vocab_size=5000, output_dim=125)\n", - "tokenizer.train(text)\n", - "tokenizer.tokenize(text[:256]).input_ids.shape" + "train_dataset = TextClassificationDataset(\n", + " texts=X_train[:, 0].tolist(),\n", + " categorical_variables=X_train[:, 1:].tolist(),\n", + " tokenizer=tokenizer,\n", + " labels=y_train,\n", + ")\n", + "train_dataset[0]" ] }, { "cell_type": "markdown", - "id": "19", + "id": "24", "metadata": {}, "source": [ - "## Consider each component indepedently" + "And then, you can create a ``DataLoader`` to iterate over the dataset during training. The DataLoader, via its `collate_fn` function, handles nicely the raw text and outputs tokenized, padded PyTorch tensors for immediate model ingestion." ] }, { "cell_type": "code", "execution_count": null, - "id": "20", + "id": "25", "metadata": {}, "outputs": [], "source": [ - "vocab_size = tokenizer.vocab_size\n", - "padding_idx = tokenizer.padding_idx\n", + "train_dataloader = train_dataset.create_dataloader(\n", + " batch_size=256,\n", + " num_workers=12,\n", + " shuffle=False,\n", + " )\n", + "batch = next(iter(train_dataloader))\n", "\n", - "embedding_dim = 96\n", - "n_layers = 1\n", - "n_head = 4\n", - "n_kv_head = n_head\n", - "sequence_len = tokenizer.output_dim" + "print(\"Input IDs shape: \", batch[\"input_ids\"].shape) # (batch_size, tokenizer.output_dim (=seq_length))" + ] + }, + { + "cell_type": "markdown", + "id": "26", + "metadata": {}, + "source": [ + "## The PyTorch Model" + ] + }, + { + "cell_type": "markdown", + "id": "27", + "metadata": {}, + "source": [ + "We provide three main components for the model architecture: \n", + "\n", + "- The `TextEmbedder` class, which handles the embedding of the token ids input into dense vectors\n", + "- The `CategoricalVariableNet` class, which handles the embedding of the categorical variables\n", + "- The `ClassificationHead`, that outputs the prediction vector\n", + "\n", + "Eventually, the `TextClassificationModel` class combines all these components into a single model that can be trained end-to-end.\n", + "\n", + "All of these four objects inherit from the `torch.nn.Module` class, so you can use them as you would do with any PyTorch model." + ] + }, + { + "cell_type": "markdown", + "id": "28", + "metadata": {}, + "source": [ + "### The TextEmbedder" ] }, { "cell_type": "code", "execution_count": null, - "id": "21", + "id": "29", "metadata": {}, "outputs": [], "source": [ + "### TextEmbedder parameters\n", + "\n", + "# size of the vocabulary - do not change, this must match the tokenizer used. It is the number of rows in the embedding matrix\n", + "vocab_size = tokenizer.vocab_size \n", + "padding_idx = tokenizer.padding_idx\n", + "embedding_dim = 96\n", + "\n", + "### Attention parameters - Optional ! If you want to add self-attention layer\n", + "n_layers = 1\n", + "n_head = 4\n", + "n_kv_head = n_head\n", + "sequence_len = tokenizer.output_dim\n", + "\n", + "# wrap them into AttentionConfig\n", "attention_config = AttentionConfig(\n", " n_layers=n_layers,\n", " n_head=n_head,\n", " n_kv_head=n_kv_head,\n", " sequence_len=sequence_len,\n", - ")\n", - "\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "30", + "metadata": {}, + "source": [ + "The first component is the embedding layer. It transforms input tokens (that comes from the tokenizer) into dense vectors of dimension `embedding_dim`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31", + "metadata": {}, + "outputs": [], + "source": [ + "# wrap all TextEmbedder parameters into TextEmbedderConfig\n", "text_embedder_config = TextEmbedderConfig(\n", " vocab_size=vocab_size,\n", " embedding_dim=embedding_dim,\n", @@ -355,7 +405,7 @@ " attention_config=attention_config,\n", ")\n", "\n", - "\n", + "# initialize the TextEmbedder\n", "text_embedder = TextEmbedder(\n", " text_embedder_config=text_embedder_config,\n", ")\n", @@ -365,38 +415,113 @@ { "cell_type": "code", "execution_count": null, - "id": "22", + "id": "32", "metadata": {}, "outputs": [], "source": [ - "X[:, 1:].max(axis=0).tolist()" + "text_embedder" ] }, { "cell_type": "code", "execution_count": null, - "id": "23", + "id": "33", + "metadata": {}, + "outputs": [], + "source": [ + "# test the TextEmbedder: it takes as input a tensor of token ids and outputs a tensor of embeddings\n", + "text_embedder_output = text_embedder(input_ids=batch[\"input_ids\"], attention_mask=batch[\"attention_mask\"])\n", + "\n", + "print(\"TextEmbedder input: \", text_embedder_input.input_ids)\n", + "print(\"TextEmbedder output shape: \", text_embedder_output.shape)" + ] + }, + { + "cell_type": "markdown", + "id": "34", + "metadata": {}, + "source": [ + "### The CategoricalVariableNet" + ] + }, + { + "cell_type": "markdown", + "id": "35", + "metadata": {}, + "source": [ + "The second component is the categorical variable layer. It transforms input ids from the encoded categorical variables into dense vectors of dimension `categorical_embedding_dims`. Depending on the type of `categorical_embedding_dims`, the output dimension will vary.\n", + "\n", + "This component is of course optional: you will not use it if you don't have categorical variables in your dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36", "metadata": {}, "outputs": [], "source": [ "categorical_vocab_sizes = (X[:, 1:].max(axis=0) + 1).tolist()\n", - "categorical_embedding_dims = 25\n", "\n", + "## categorical_embedding_dims can be a list of ints (one embedding dimension per categorical variable) - then they are concatenated\n", + "# the final average will be concatenated with text embedding (in the full model)\n", + "categorical_embedding_dims = [15, 10, 10, 10, 10]\n", "categorical_var_net = CategoricalVariableNet(\n", " categorical_vocabulary_sizes=categorical_vocab_sizes,\n", " categorical_embedding_dims=categorical_embedding_dims,\n", - ")" + ")\n", + "cat_var_net_output = categorical_var_net(batch[\"categorical_vars\"])\n", + "print(cat_var_net_output.shape)\n", + "print(\"How will the categorical embedding be merged with the text one ? \", categorical_var_net.forward_type)\n", + "\n", + "## it can also be None - the output dim will be text_embedding_dim that you should specify\n", + "# the final average will be ADDED TO text embedding (in the full model)\n", + "categorical_embedding_dims = None\n", + "categorical_var_net = CategoricalVariableNet(\n", + " categorical_vocabulary_sizes=categorical_vocab_sizes,\n", + " categorical_embedding_dims=categorical_embedding_dims,\n", + " text_embedding_dim=embedding_dim, # not putting this will raise an error\n", + ")\n", + "cat_var_net_output = categorical_var_net(batch[\"categorical_vars\"])\n", + "print(cat_var_net_output.shape)\n", + "print(\"How will the categorical embedding be merged with the text one ? 
\", categorical_var_net.forward_type)\n", + "\n", + "\n", + "## and finally, it can be an int (the same embedding dimension for all categorical variables and then averaging)\n", + "# the final average will be concatenated with text embedding (in the full model)\n", + "categorical_embedding_dims = 25\n", + "categorical_var_net = CategoricalVariableNet(\n", + " categorical_vocabulary_sizes=categorical_vocab_sizes,\n", + " categorical_embedding_dims=categorical_embedding_dims,\n", + ")\n", + "cat_var_net_output = categorical_var_net(batch[\"categorical_vars\"])\n", + "print(cat_var_net_output.shape)\n", + "print(\"How will the categorical embedding be merged with the text one ? \", categorical_var_net.forward_type)\n" + ] + }, + { + "cell_type": "markdown", + "id": "37", + "metadata": {}, + "source": [ + "### The ClassificationHead" ] }, { "cell_type": "code", "execution_count": null, - "id": "24", + "id": "38", "metadata": {}, "outputs": [], "source": [ "num_classes = int(y.max() + 1)\n", - "expected_input_dim = embedding_dim + categorical_var_net.output_dim\n", + "\n", + "# as discussed above, the input dim of the classification head depends on how categorical variables are handled\n", + "if type(categorical_embedding_dims) is int or type(categorical_embedding_dims) is list:\n", + " expected_input_dim = embedding_dim + categorical_var_net.output_dim\n", + "else:\n", + " expected_input_dim = embedding_dim\n", + "\n", "classification_head = ClassificationHead(\n", " input_dim=expected_input_dim,\n", " num_classes=num_classes,\n", @@ -406,7 +531,36 @@ { "cell_type": "code", "execution_count": null, - "id": "25", + "id": "39", + "metadata": {}, + "outputs": [], + "source": [ + "x_combined = torch.cat((text_embedder_output, cat_var_net_output), dim=1)\n", + "logits = classification_head(x_combined)\n", + "print(\"logits shape: \", logits.shape) # (batch_size, num_classes)" + ] + }, + { + "cell_type": "markdown", + "id": "40", + "metadata": {}, + "source": [ + "### The TextClassificationModel" + ] + }, + { + "cell_type": "markdown", + "id": "41", + "metadata": {}, + "source": [ + "Finally, the `TextClassificationModel` class combines all these components into a single model that can be trained end-to-end.\n", + "It checks if everything connects well together (input/output dimensions) and handles the forward pass." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42", "metadata": {}, "outputs": [], "source": [ @@ -421,7 +575,34 @@ { "cell_type": "code", "execution_count": null, - "id": "26", + "id": "43", + "metadata": {}, + "outputs": [], + "source": [ + "# Takes the same input as TextEmbedder + CategoricalVarNet -> same output as ClassificationHead (logits)\n", + "model(input_ids=batch[\"input_ids\"], attention_mask=batch[\"attention_mask\"], categorical_vars=batch[\"categorical_vars\"]).shape" + ] + }, + { + "cell_type": "markdown", + "id": "44", + "metadata": {}, + "source": [ + "### The TextClassificationModule" + ] + }, + { + "cell_type": "markdown", + "id": "45", + "metadata": {}, + "source": [ + "We provide a PyTorch Lightning wrapper, for easy training and checkpointing. We refer to [Lightning's doc](https://lightning.ai/docs/pytorch/stable/) for more details on how to use it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46", "metadata": {}, "outputs": [], "source": [ @@ -441,19 +622,29 @@ }, { "cell_type": "markdown", - "id": "27", + "id": "47", "metadata": {}, "source": [ - "## Using the wrapper" + "# `torchTextClassifiers`: a wrapper to handle them all" + ] + }, + { + "cell_type": "markdown", + "id": "48", + "metadata": {}, + "source": [ + "## Initialization" ] }, { "cell_type": "code", "execution_count": null, - "id": "28", + "id": "49", "metadata": {}, "outputs": [], "source": [ + "### Two main config objects, that mirror the parameters used above - and you're good to go !\n", + "\n", "model_config = ModelConfig(\n", " embedding_dim=embedding_dim,\n", " categorical_vocabulary_sizes=categorical_vocab_sizes,\n", @@ -471,33 +662,37 @@ "ttc = torchTextClassifiers(\n", " tokenizer=tokenizer,\n", " model_config=model_config,\n", - ")" + ")\n", + "\n", + "## Given those parameters, the TextClassificationModel is created internally, with the right connections between components\n", + "ttc" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "29", + "cell_type": "markdown", + "id": "50", "metadata": {}, - "outputs": [], "source": [ - "X_train[1, :]" + "## Training" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "30", + "cell_type": "markdown", + "id": "51", "metadata": {}, - "outputs": [], "source": [ - "tokenizer.tokenize(X_train[:256, 0].tolist()).input_ids.shape" + "`torchTextClassifiers` has a `.train()` method that handles the whole training process for you:\n", + "\n", + "- Init of dataset and dataloaders\n", + "- Init of Lightning module\n", + "- Training with early stopping and model checkpointing\n", + "- Using Lightning's Trainer under the hood" ] }, { "cell_type": "code", "execution_count": null, - "id": "31", + "id": "52", "metadata": {}, "outputs": [], "source": [ @@ -511,19 +706,17 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "32", + "cell_type": "markdown", + "id": "53", "metadata": {}, - "outputs": [], "source": [ - "X_test[0].shape" + "## Prediction and explainability" ] }, { "cell_type": "code", "execution_count": null, - "id": "33", + "id": "54", "metadata": {}, "outputs": [], "source": [ @@ -533,7 +726,7 @@ { "cell_type": "code", "execution_count": null, - "id": "34", + "id": "55", "metadata": {}, "outputs": [], "source": [ @@ -551,17 +744,7 @@ { "cell_type": "code", "execution_count": null, - "id": "35", - "metadata": {}, - "outputs": [], - "source": [ - "predictions.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "36", + "id": "56", "metadata": {}, "outputs": [], "source": [ @@ -572,7 +755,7 @@ { "cell_type": "code", "execution_count": null, - "id": "37", + "id": "57", "metadata": {}, "outputs": [], "source": [ @@ -582,7 +765,7 @@ { "cell_type": "code", "execution_count": null, - "id": "38", + "id": "58", "metadata": {}, "outputs": [], "source": [ @@ -597,7 +780,7 @@ { "cell_type": "code", "execution_count": null, - "id": "39", + "id": "59", "metadata": {}, "outputs": [], "source": [ diff --git a/notebooks/example.qmd b/notebooks/example.qmd deleted file mode 100644 index 666c33b..0000000 --- a/notebooks/example.qmd +++ /dev/null @@ -1,499 +0,0 @@ ---- -title: "Exemple d'utilisation de la librairie `TorchFastText`" ---- - - - -_Warning_ - -_`TorchFastText` library is still under active development. 
Have a regular look to [https://github.com/inseefrlab/torch-fastText](https://github.com/inseefrlab/torch-fastText) for latest information._ - -To install package, you can run the following snippet - -```{python} -#| output: false -#| eval: false - -# Stable version -pip install torchFastText -# Development version -# pip install !https://github.com/InseeFrLab/torch-fastText.git -``` - -# Load and preprocess data - -In that guide, we propose to illustrate main package functionalities using that `DataFrame`: - -```{python} -import pandas as pd -df = pd.read_parquet("https://minio.lab.sspcloud.fr/projet-ape/extractions/20241027_sirene4.parquet") -df = df.sample(10000) -``` - -Our goal will be to build multilabel classification for the `code` variable using `libelle` as feature. - -## Enriching our test dataset - -Unlike `Fasttext`, this package offers the possibility of having several feature columns of different types (string for the text column and additional variables in numeric form, for example). To illustrate that, we propose the following enrichment of the example dataset: - - -```{python} -import pandas as pd -import numpy as np -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import LabelEncoder - -def categorize_surface( - df: pd.DataFrame, surface_feature_name: int, like_sirene_3: bool = True -) -> pd.DataFrame: - """ - Categorize the surface of the activity. - - Args: - df (pd.DataFrame): DataFrame to categorize. - surface_feature_name (str): Name of the surface feature. - like_sirene_3 (bool): If True, categorize like Sirene 3. - - Returns: - pd.DataFrame: DataFrame with a new column "surf_cat". - """ - df_copy = df.copy() - df_copy[surface_feature_name] = df_copy[surface_feature_name].replace("nan", np.nan) - df_copy[surface_feature_name] = df_copy[surface_feature_name].astype(float) - # Check surface feature exists - if surface_feature_name not in df.columns: - raise ValueError(f"Surface feature {surface_feature_name} not found in DataFrame.") - # Check surface feature is a float variable - if not (pd.api.types.is_float_dtype(df_copy[surface_feature_name])): - raise ValueError(f"Surface feature {surface_feature_name} must be a float variable.") - - if like_sirene_3: - # Categorize the surface - df_copy["surf_cat"] = pd.cut( - df_copy[surface_feature_name], - bins=[0, 120, 400, 2500, np.inf], - labels=["1", "2", "3", "4"], - ).astype(str) - else: - # Log transform the surface - df_copy["surf_log"] = np.log(df[surface_feature_name]) - - # Categorize the surface - df_copy["surf_cat"] = pd.cut( - df_copy.surf_log, - bins=[0, 3, 4, 5, 12], - labels=["1", "2", "3", "4"], - ).astype(str) - - df_copy[surface_feature_name] = df_copy["surf_cat"].replace("nan", "0") - df_copy[surface_feature_name] = df_copy[surface_feature_name].astype(int) - df_copy = df_copy.drop(columns=["surf_log", "surf_cat"], errors="ignore") - return df_copy - - -def clean_and_tokenize_df( - df, - categorical_features=["EVT", "CJ", "NAT", "TYP", "CRT"], - text_feature="libelle_processed", - label_col="apet_finale", -): - df.fillna("nan", inplace=True) - - df = df.rename( - columns={ - "evenement_type": "EVT", - "cj": "CJ", - "activ_nat_et": "NAT", - "liasse_type": "TYP", - "activ_surf_et": "SRF", - "activ_perm_et": "CRT", - } - ) - - les = [] - for col in categorical_features: - le = LabelEncoder() - df[col] = le.fit_transform(df[col]) - les.append(le) - - df = categorize_surface(df, "SRF", like_sirene_3=True) - df = df[[text_feature, "EVT", "CJ", "NAT", "TYP", "SRF", "CRT", 
label_col]] - - return df, les - - -def stratified_split_rare_labels(X, y, test_size=0.2, min_train_samples=1): - # Get unique labels and their frequencies - unique_labels, label_counts = np.unique(y, return_counts=True) - - # Separate rare and common labels - rare_labels = unique_labels[label_counts == 1] - - # Create initial mask for rare labels to go into training set - rare_label_mask = np.isin(y, rare_labels) - - # Separate data into rare and common label datasets - X_rare = X[rare_label_mask] - y_rare = y[rare_label_mask] - X_common = X[~rare_label_mask] - y_common = y[~rare_label_mask] - - # Split common labels stratified - X_common_train, X_common_test, y_common_train, y_common_test = train_test_split( - X_common, y_common, test_size=test_size, stratify=y_common - ) - - # Combine rare labels with common labels split - X_train = np.concatenate([X_rare, X_common_train]) - y_train = np.concatenate([y_rare, y_common_train]) - X_test = X_common_test - y_test = y_common_test - - return X_train, X_test, y_train, y_test - -def add_libelles( - df: pd.DataFrame, - df_naf: pd.DataFrame, - y: str, - text_feature: str, - textual_features: list, - categorical_features: list, -): - missing_codes = set(df_naf["code"]) - fake_obs = df_naf[df_naf["code"].isin(missing_codes)] - fake_obs[y] = fake_obs["code"] - fake_obs[text_feature] = fake_obs[[text_feature]].apply( - lambda row: " ".join(f"[{col}] {val}" for col, val in row.items() if val != ""), axis=1 - ) - df = pd.concat([df, fake_obs[[col for col in fake_obs.columns if col in df.columns]]]) - - if textual_features is not None: - for feature in textual_features: - df[feature] = df[feature].fillna(value="") - if categorical_features is not None: - for feature in categorical_features: - df[feature] = df[feature].fillna(value="NaN") - - print(f"\t*** {len(missing_codes)} codes have been added in the database...\n") - return df -``` - -```{python} -categorical_features = ["evenement_type", "cj", "activ_nat_et", "liasse_type", "activ_surf_et", "activ_perm_et"] -text_feature = "libelle" -y = "apet_finale" -textual_features = None - -naf2008 = pd.read_csv("https://minio.lab.sspcloud.fr/projet-ape/data/naf2008.csv", sep=";") -df = add_libelles(df, naf2008, y, text_feature, textual_features, categorical_features) -``` - - -## Preprocessing - -To reduce noise in text fields, we recommend pre-processing before training a model with our package. We assume this preprocessing is handled by the package user : this gives him the opportunity to control data cleansing. - -Here's an example of the type of preprocessing that can be carried out before moving on to the modeling phase - -```{python} -from torchFastText.preprocess import clean_text_feature -df["libelle_processed"] = clean_text_feature(df["libelle"]) -``` - -Right now, the model requires the label (variable y) to be a numerical variable. If the label variable is a text variable, we recommend using Scikit Learn's [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html) to convert into a numeric variable. Using that function will give user the possibility to get back labels from the encoder after running predictions. 
- -```{python} -encoder = LabelEncoder() -df["apet_finale"] = encoder.fit_transform(df["apet_finale"]) -``` - -The function `clean_and_tokenize_df` requires special `DataFrame` formatting: - -- First column contains the processed text (str) -- Next ones contain the "encoded" categorical (discrete) variables in int format - - -```{python} -df, _ = clean_and_tokenize_df(df, text_feature="libelle_processed") -X = df[["libelle_processed", "EVT", "CJ", "NAT", "TYP", "CRT", "SRF"]].values -y = df["apet_finale"].values -``` - -## Splitting in train-test sets - -As usual in a learning approach, you need to break down your data into learning and test/validation samples to obtain robust performance statistics. - -This work is the responsibility of the package's users. Here's an example of how to do it, using the [`train_test_split`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) function in `Scikit`. - -```{python} -from sklearn.model_selection import train_test_split -X_train, X_test, y_train, y_test = train_test_split(X, y) -``` - -# Build the torch-fastText model (without training it) - -There are several ways to define and train a pytorch.fasttext model in this package. - -We first show how to initialize the model and then afterwars build it. - -`torchFastText` function accepts the following parameters: - -| Parameter | Meaning | Example Value | -|---------------------------------|---------------------------------------------------------------------|--------------| -| `num_tokens` | Number of rows in the embedding matrix (size of the vocabulary) | 100000 | -| `embedding_dim` | Dimension of the embedding (number of columns in the matrix) | 50 | -| `sparse` | Use sparse embedding for fast computation (PyTorch) | False | -| `categorical_embedding_dims` | Dimension of the embedding for categorical features | 10 | -| `min_count` | Minimum occurrences of a word in the corpus to be included | 1 | -| `min_n` | Minimum length of character n-grams | 3 | -| `max_n` | Maximum length of character n-grams | 6 | -| `len_word_ngrams` | Length of word n-grams | 3 | - - -```{python} -from torchFastText import torchFastText - -parameters = { - "num_tokens": 100000, - "embedding_dim": 50, - "sparse": False, - "categorical_embedding_dims": 10, - "min_count": 1, - "min_n": 3, - "max_n": 6, - "len_word_ngrams": 3, -} - -parameters_train = { - "lr": 0.004, - "num_epochs": 1, - "batch_size": 256, - "patience": 3 -} - -model = torchFastText(**parameters) -``` - -`model` is then a special `torchFastText` object: - -```{python} -type(model) -``` - -As any `PyTorch` model, it accepts being save as a JSON for later on use: - -```{python} -model.to_json('torchFastText_config.json') -# model = torchFastText.from_json('torchFastText_config.json') -``` - -We can apply `build` to finally train our model. These are the parameters accepted by the `build` method - -| Parameter | Meaning | Example Value | -|---------------------------------|---------------------------------------------------------------------|--------------| -| `lr` | Learning rate | 0.004 | -| `num_epochs` | Number of training epochs | 1 | -| `batch_size` | Batch size for training | 256 | -| `patience` | Early stopping patience (number of epochs without improvement) | 3 | - - -We build the model using the training data. -We have now access to the tokenizer, the PyTorch model as well as a PyTorch Lightning module ready to be trained. 
-Note that Lightning is high-level framework for PyTorch that simplifies the process of training, validating, and deploying machine learning models. - - -```{python} -model.build(X_train, y_train, lightning=True, lr=parameters_train.get("lr")) -``` - -One can retrieve different objects from `model` instance: - -* `model.pytorch_model` -* `model.tokenizer` -* `model.lightning_module` - - -```{python} -model.pytorch_model -``` - -```{python} -model.tokenizer -``` - -```{python} -model.lightning_module -``` - -One can also retrieve more precise information regarding the tokenizer. This can be useful to know how text is parsed before being given to the neural network: - - -```{python} -from pprint import pprint -sentence = ["lorem ipsum dolor sit amet"] -pprint(model.tokenizer.tokenize(sentence)[2][0]) -``` - - -Saving parameters to JSON can also be done after building, but the model needs to be rebuilt after loading. - -```{python} -model.to_json('torchFastText_config.json') -``` - - -## Alternative way to build torchFastText - -The training data is only useful to initialize the tokenizer, but X_train and y_train are not needed to initialize the PyTorch model, provided we give the right parameters to construct layer. - -To highlight this, we provide a lower-level process to build the model where one can first build the tokenizer, and then build the model with custom architecture parameters. - -The tokenizer can be loaded **from the same JSON file** as the model parameters, or initialized using the right arguments. - - -```{python} -del model -``` - -Let's decompose our features in two group: - -* We have our textual feature stored in the first column of the features matrix -* All other columns are categorical variables - -```{python} -training_text = X_train[:, 0].tolist() -categorical_variables = X_train[:, 1:] -``` - -We need to create a few variables that will be useful afterwards - -```{python} -CAT_VOCAB_SIZE = (np.max(categorical_variables, axis=0) + 1).astype(int).tolist() -NUM_CLASSES = len(np.unique(y_train)) -NUM_CAT_VAR = categorical_variables.shape[1] -``` - -Now let's come to the nitty gritty. There are several ways to create an instance of the tokenizer. - -First, we can create the tokenizer from : - -* model definition in the JSON file created beforehand -* textual data in training dataset - -```{python} -from torchFastText.datasets import NGramTokenizer -tokenizer = NGramTokenizer.from_json('torchFastText_config.json', training_text) -``` - -```{python} -tokenizer.tokenize("Hello world") -``` - -However, there is a more straightforward way to do: creating directly the `NGramTokenizer` instance: - - -```{python} -tokenizer = NGramTokenizer( - **parameters, - training_text=training_text - ) -``` - -```{python} -tokenizer.tokenize("Hello world") -``` - -Why creating a `NGramTokenizer` separately ? 
Because model constructor is now independent from training data: - -```{python} -#| echo: false -#| eval: false -# TODO : allow to do that -#torchFastText.build_from_tokenizer( - #tokenizer, - #**parameters, - #**parameters_build -# ) -``` - -```{python} -model = torchFastText.build_from_tokenizer( - tokenizer, - embedding_dim=parameters["embedding_dim"], - categorical_embedding_dims=parameters["categorical_embedding_dims"], - sparse=parameters["sparse"], - lr=parameters_train["lr"], - num_classes=NUM_CLASSES, - num_categorical_features=NUM_CAT_VAR, - categorical_vocabulary_sizes=CAT_VOCAB_SIZE -) -``` - -__Warning__: - -If the PyTorch model building did not use the training data, please keep in mind that its architecture (that you customize here) should match the vocabulary size of the categorical variables and the total number of class, otherwise the model will raise an error during training. - - -# Train a torchFastText model directly - -If no advanced customization or PyTorch tuning is necessary, there is a direct way of training model. - - -```{python} -#| eval: false -model.train( - X_train, - y_train, - X_test, - y_test, - num_epochs=parameters_train['num_epochs'], - batch_size=parameters_train['batch_size'], - patience_scheduler=parameters_train['patience'], - patience_train=parameters_train['patience'], - lr=parameters_train['lr'], - verbose = True -) -``` - -# Load a trained model from a Lightning checkpoint - -/!\ TOCOMPLETE - - -```{python} -#| eval: false -model.load_from_checkpoint(model.best_model_path) # or any other checkpoint path (string) -``` - -# Predicting from new labels - - -```{python} -#| eval: false -text = ["coiffeur, boulangerie, pâtisserie"] # one text description -X= np.array([[text[0], 0, 0, 0, 0, 0, 0]]) # our new entry -TOP_K = 5 - -pred, conf = model.predict(X, top_k=TOP_K) -pred_naf = encoder.inverse_transform(pred.reshape(-1)) -subset = naf2008.set_index("code").loc[np.flip(pred_naf)] - -for i in range(TOP_K-1, -1, -1): - print(f"Prediction: {pred_naf[i]}, confidence: {conf[0, i]}, description: {subset['libelle'][pred_naf[i]]}") - -``` - -# Explainability - - -```{python} -#| eval: false -from torchFastText.explainability.visualisation import ( - visualize_letter_scores, - visualize_word_scores, -) - -pred, conf, all_scores, all_scores_letters = model.predict_and_explain(X) -visualize_word_scores(all_scores, text, pred_naf.reshape(1, -1)) -visualize_letter_scores(all_scores_letters, text, pred_naf.reshape(1, -1)) -``` \ No newline at end of file diff --git a/notebooks/torchFastText_config.json b/notebooks/torchFastText_config.json deleted file mode 100644 index ff40f49..0000000 --- a/notebooks/torchFastText_config.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "embedding_dim": 50, - "sparse": false, - "num_tokens": 100000, - "min_count": 1, - "min_n": 3, - "max_n": 6, - "len_word_ngrams": 3, - "num_classes": 646, - "num_rows": 107992, - "categorical_vocabulary_sizes": [ - 24, - 40, - 8, - 13, - 3, - 4 - ], - "categorical_embedding_dims": 10, - "num_categorical_features": 6, - "direct_bagging": true -} \ No newline at end of file diff --git a/notebooks/utils.py b/notebooks/utils.py index 19c4734..f458581 100644 --- a/notebooks/utils.py +++ b/notebooks/utils.py @@ -1,8 +1,8 @@ -import pandas as pd import numpy as np -from sklearn.model_selection import train_test_split +import pandas as pd from sklearn.preprocessing import LabelEncoder + def categorize_surface( df: pd.DataFrame, surface_feature_name: int, like_sirene_3: bool = True ) -> pd.DataFrame: @@ -53,23 
+53,11 @@ def categorize_surface( def clean_and_tokenize_df( df, - categorical_features=["EVT", "CJ", "NAT", "TYP", "CRT"], + categorical_features=["CJ", "NAT", "TYP", "CRT"], text_feature="libelle_processed", label_col="apet_finale", ): df.fillna("nan", inplace=True) - - df = df.rename( - columns={ - "evenement_type": "EVT", - "cj": "CJ", - "activ_nat_et": "NAT", - "liasse_type": "TYP", - "activ_surf_et": "SRF", - "activ_perm_et": "CRT", - } - ) - les = [] for col in categorical_features: le = LabelEncoder() @@ -77,62 +65,6 @@ def clean_and_tokenize_df( les.append(le) df = categorize_surface(df, "SRF", like_sirene_3=True) - df = df[[text_feature, "EVT", "CJ", "NAT", "TYP", "SRF", "CRT", label_col]] + df = df[[text_feature, "CJ", "NAT", "TYP", "SRF", "CRT", label_col]] return df, les - - -def stratified_split_rare_labels(X, y, test_size=0.2, min_train_samples=1): - # Get unique labels and their frequencies - unique_labels, label_counts = np.unique(y, return_counts=True) - - # Separate rare and common labels - rare_labels = unique_labels[label_counts == 1] - - # Create initial mask for rare labels to go into training set - rare_label_mask = np.isin(y, rare_labels) - - # Separate data into rare and common label datasets - X_rare = X[rare_label_mask] - y_rare = y[rare_label_mask] - X_common = X[~rare_label_mask] - y_common = y[~rare_label_mask] - - # Split common labels stratified - X_common_train, X_common_test, y_common_train, y_common_test = train_test_split( - X_common, y_common, test_size=test_size, stratify=y_common - ) - - # Combine rare labels with common labels split - X_train = np.concatenate([X_rare, X_common_train]) - y_train = np.concatenate([y_rare, y_common_train]) - X_test = X_common_test - y_test = y_common_test - - return X_train, X_test, y_train, y_test - -def add_libelles( - df: pd.DataFrame, - df_naf: pd.DataFrame, - y: str, - text_feature: str, - textual_features: list, - categorical_features: list, -): - missing_codes = set(df_naf["code"]) - fake_obs = df_naf[df_naf["code"].isin(missing_codes)] - fake_obs[y] = fake_obs["code"] - fake_obs[text_feature] = fake_obs[[text_feature]].apply( - lambda row: " ".join(f"[{col}] {val}" for col, val in row.items() if val != ""), axis=1 - ) - df = pd.concat([df, fake_obs[[col for col in fake_obs.columns if col in df.columns]]]) - - if textual_features is not None: - for feature in textual_features: - df[feature] = df[feature].fillna(value="") - if categorical_features is not None: - for feature in categorical_features: - df[feature] = df[feature].fillna(value="NaN") - - print(f"\t*** {len(missing_codes)} codes have been added in the database...\n") - return df \ No newline at end of file From 45ace28bb3d9aa59c249f6464da5bfe434b52d29 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Wed, 19 Nov 2025 10:40:14 +0000 Subject: [PATCH 55/66] fix: better handling of truncation to avoid warning --- torchTextClassifiers/tokenizers/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchTextClassifiers/tokenizers/base.py b/torchTextClassifiers/tokenizers/base.py index fbc77d4..e46399f 100644 --- a/torchTextClassifiers/tokenizers/base.py +++ b/torchTextClassifiers/tokenizers/base.py @@ -124,12 +124,13 @@ def tokenize( # Pad to longest sequence if no output_dim is specified padding = True if self.output_dim is None else "max_length" + truncation = True if self.output_dim is not None else False tokenize_output = self.tokenizer( text, padding=padding, return_tensors="pt", - truncation=True, + truncation=truncation, 
             max_length=self.output_dim,
             return_offsets_mapping=return_offsets_mapping,
         )  # method from PreTrainedTokenizerFast

From b2e797b38c90b73369fe796ff3a289ca1c30067c Mon Sep 17 00:00:00 2001
From: meilame-tayebjee
Date: Wed, 19 Nov 2025 11:07:12 +0000
Subject: [PATCH 56/66] doc: fix readme

---
 README.md | 142 +++++------------------------------------------------
 1 file changed, 12 insertions(+), 130 deletions(-)

diff --git a/README.md b/README.md
index 73810a9..95fa297 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,18 @@
 # torchTextClassifiers
 
-A unified, extensible framework for text classification built on [PyTorch](https://pytorch.org/) and [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/).
+A unified, extensible framework for text classification with categorical variables built on [PyTorch](https://pytorch.org/) and [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/).
 
 ## 🚀 Features
 
-- **Unified API**: Consistent interface for different classifier wrappers
-- **Extensible**: Easy to add new classifier implementations through wrapper pattern
-- **FastText Support**: Built-in FastText classifier with n-gram tokenization
-- **Flexible Preprocessing**: Each classifier can implement its own text preprocessing approach
+- **Mixed input support**: Handle text data alongside categorical variables seamlessly.
+- **Unified yet highly customizable**:
+  - Use any tokenizer from HuggingFace or the original fastText n-gram tokenizer.
+  - Manipulate the components (`TextEmbedder`, `CategoricalVariableNet`, `ClassificationHead`) to easily create custom architectures - including **self-attention**. All of them are `torch.nn.Module`s!
+  - The `TextClassificationModel` class combines these components and can be extended for custom behavior.
 - **PyTorch Lightning**: Automated training with callbacks, early stopping, and logging
+- **Easy experimentation**: Simple API for training, evaluating, and predicting with minimal code:
+  - The `torchTextClassifiers` wrapper class orchestrates the tokenizer and the model for you
+- **Additional features**: explainability using Captum
 
 ## 📦 Installation
 
@@ -25,129 +29,9 @@
 uv sync
 pip install -e .
``` -## 🎯 Quick Start - -### Basic FastText Classification - -```python -import numpy as np -from torchTextClassifiers import create_fasttext - -# Create a FastText classifier -classifier = create_fasttext( - embedding_dim=100, - sparse=False, - num_tokens=10000, - min_count=2, - min_n=3, - max_n=6, - len_word_ngrams=2, - num_classes=2 -) - -# Prepare your data -X_train = np.array([ - "This is a positive example", - "This is a negative example", - "Another positive case", - "Another negative case" -]) -y_train = np.array([1, 0, 1, 0]) - -X_val = np.array([ - "Validation positive", - "Validation negative" -]) -y_val = np.array([1, 0]) - -# Build the model -classifier.build(X_train, y_train) - -# Train the model -classifier.train( - X_train, y_train, X_val, y_val, - num_epochs=50, - batch_size=32, - patience_train=5, - verbose=True -) - -# Make predictions -X_test = np.array(["This is a test sentence"]) -predictions = classifier.predict(X_test) -print(f"Predictions: {predictions}") - -# Validate on test set -accuracy = classifier.validate(X_test, np.array([1])) -print(f"Accuracy: {accuracy:.3f}") -``` - -### Custom Classifier Implementation - -```python -import numpy as np -from torchTextClassifiers import torchTextClassifiers -from torchTextClassifiers.classifiers.simple_text_classifier import SimpleTextWrapper, SimpleTextConfig - -# Example: TF-IDF based classifier (alternative to tokenization) -config = SimpleTextConfig( - hidden_dim=128, - num_classes=2, - max_features=5000, - learning_rate=1e-3, - dropout_rate=0.2 -) - -# Create classifier with TF-IDF preprocessing -wrapper = SimpleTextWrapper(config) -classifier = torchTextClassifiers(wrapper) - -# Text data -X_train = np.array(["Great product!", "Terrible service", "Love it!"]) -y_train = np.array([1, 0, 1]) - -# Build and train -classifier.build(X_train, y_train) -# ... continue with training -``` - - -### Training Customization - -```python -# Custom PyTorch Lightning trainer parameters -trainer_params = { - 'accelerator': 'gpu', - 'devices': 1, - 'precision': 16, # Mixed precision training - 'gradient_clip_val': 1.0, -} - -classifier.train( - X_train, y_train, X_val, y_val, - num_epochs=100, - batch_size=64, - patience_train=10, - trainer_params=trainer_params, - verbose=True -) -``` - -## 🔬 Testing - -Run the test suite: - -```bash -# Run all tests -uv run pytest - -# Run with coverage -uv run pytest --cov=torchTextClassifiers - -# Run specific test file -uv run pytest tests/test_torchTextClassifiers.py -v -``` +## 📝 Usage +Checkout the [notebook](notebooks/example.ipynb) for a quick start. 
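+
+Below is a minimal end-to-end sketch of the high-level API. It mirrors the pipeline
+used in the test suite; the hyperparameter values are illustrative only:
+
+```python
+import numpy as np
+
+from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers
+from torchTextClassifiers.model.components import AttentionConfig
+from torchTextClassifiers.tokenizers import WordPieceTokenizer
+
+texts = ["good product", "bad product", "great service", "poor service"]
+cats = np.array([[0], [1], [0], [1]])  # one categorical variable, vocab size 2
+X = np.column_stack([texts, cats])     # text in the first column, categoricals after
+y = np.array([1, 0, 1, 0])
+
+# Train a WordPiece tokenizer on the corpus
+tokenizer = WordPieceTokenizer(vocab_size=100, output_dim=32)
+tokenizer.train(texts)
+
+model_config = ModelConfig(
+    embedding_dim=64,
+    categorical_vocabulary_sizes=[2],
+    categorical_embedding_dims=[4],
+    num_classes=2,
+    attention_config=AttentionConfig(
+        n_layers=1, n_head=2, n_kv_head=2, sequence_len=tokenizer.output_dim
+    ),
+)
+
+# The wrapper builds the TextClassificationModel internally
+ttc = torchTextClassifiers(tokenizer=tokenizer, model_config=model_config)
+ttc.train(
+    X_train=X, y_train=y, X_val=X, y_val=y,
+    training_config=TrainingConfig(lr=1e-3, batch_size=2, num_epochs=1),
+)
+
+predictions = ttc.predict(X, top_k=1)
+```
+
+For the lower-level component API (`TextEmbedder`, `CategoricalVariableNet`,
+`ClassificationHead`), see the notebook linked above.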
## 📚 Examples @@ -155,10 +39,8 @@ See the [examples/](examples/) directory for: - Basic text classification - Multi-class classification - Mixed features (text + categorical) -- Custom classifier implementation - Advanced training configurations - - +- Prediction and explainability ## 📄 License From 84b118b58aaee1fde0fd5dd4bcfe1a50e9e8a9bb Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Thu, 20 Nov 2025 10:57:43 +0000 Subject: [PATCH 57/66] fix: allow tokenizer not to have train attribute --- torchTextClassifiers/torchTextClassifiers.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/torchTextClassifiers/torchTextClassifiers.py b/torchTextClassifiers/torchTextClassifiers.py index b5a2090..9509857 100644 --- a/torchTextClassifiers/torchTextClassifiers.py +++ b/torchTextClassifiers/torchTextClassifiers.py @@ -125,10 +125,11 @@ def __init__( self.model_config = model_config self.tokenizer = tokenizer - if not self.tokenizer.trained: - raise RuntimeError( - f"Tokenizer {type(self.tokenizer)} must be trained before initializing the classifier." - ) + if hasattr(self.tokenizer, "trained"): + if not self.tokenizer.trained: + raise RuntimeError( + f"Tokenizer {type(self.tokenizer)} must be trained before initializing the classifier." + ) self.vocab_size = tokenizer.vocab_size self.embedding_dim = model_config.embedding_dim From 3c0a85aac71b724a6f760b3536433840b9dc7311 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Thu, 20 Nov 2025 10:59:42 +0000 Subject: [PATCH 58/66] feat(ngram): add return offsets and word_ids + fix output_dim --- torchTextClassifiers/tokenizers/__init__.py | 1 + torchTextClassifiers/tokenizers/base.py | 7 +- torchTextClassifiers/tokenizers/ngram.py | 117 ++++++++++++++++---- 3 files changed, 104 insertions(+), 21 deletions(-) diff --git a/torchTextClassifiers/tokenizers/__init__.py b/torchTextClassifiers/tokenizers/__init__.py index 476f879..ecba309 100644 --- a/torchTextClassifiers/tokenizers/__init__.py +++ b/torchTextClassifiers/tokenizers/__init__.py @@ -6,4 +6,5 @@ HuggingFaceTokenizer as HuggingFaceTokenizer, ) from .base import TokenizerOutput as TokenizerOutput +from .ngram import NGramTokenizer as NGramTokenizer from .WordPiece import WordPieceTokenizer as WordPieceTokenizer diff --git a/torchTextClassifiers/tokenizers/base.py b/torchTextClassifiers/tokenizers/base.py index e46399f..75a200d 100644 --- a/torchTextClassifiers/tokenizers/base.py +++ b/torchTextClassifiers/tokenizers/base.py @@ -65,7 +65,11 @@ def __post_init__(self): class BaseTokenizer(ABC): def __init__( - self, vocab_size: int, output_vectorized: bool = False, output_dim: Optional[int] = None + self, + vocab_size: int, + padding_idx: int, + output_vectorized: bool = False, + output_dim: Optional[int] = None, ): """ Base class for tokenizers. 
@@ -78,6 +82,7 @@ def __init__( self.vocab_size = vocab_size self.output_vectorized = output_vectorized self.output_dim = output_dim + self.padding_idx = padding_idx if self.output_vectorized: if output_dim is None: raise ValueError( diff --git a/torchTextClassifiers/tokenizers/ngram.py b/torchTextClassifiers/tokenizers/ngram.py index e7476ef..b6911f9 100644 --- a/torchTextClassifiers/tokenizers/ngram.py +++ b/torchTextClassifiers/tokenizers/ngram.py @@ -4,6 +4,7 @@ from functools import lru_cache from typing import List, Optional, Tuple, Union +import numpy as np import torch from torchTextClassifiers.tokenizers import BaseTokenizer, TokenizerOutput @@ -113,7 +114,7 @@ def get(self, word: str) -> List[int]: # ============================================================================ -# Vectorized encoding +# Vectorized encoding with optional metadata # ============================================================================ @@ -124,33 +125,78 @@ def encode_batch_vectorized( pad_token_id: int, max_length: Optional[int] = None, truncation: bool = False, -) -> Tuple[torch.Tensor, torch.Tensor]: + return_offsets_mapping: bool = False, + return_word_ids: bool = False, + force_max_length: bool = False, +) -> Tuple[torch.Tensor, torch.Tensor, Optional[List], Optional[List]]: """ Vectorized batch encoding - processes all sentences together. - Returns padded tensors directly. + Returns padded tensors directly, with optional offset mappings and word IDs. + + Args: + force_max_length: If True and max_length is set, always return tensors of size max_length """ all_ids = [] + all_offsets = [] if return_offsets_mapping else None + all_word_ids = [] if return_word_ids else None max_len = 0 # First pass: encode all sentences for sentence in sentences: ids = [] + offsets = [] if return_offsets_mapping else None + word_ids = [] if return_word_ids else None + words = sentence.split() + char_offset = 0 + + for word_idx, word in enumerate(words): + # Find the actual position of this word in the original sentence + word_start = sentence.find(word, char_offset) + word_end = word_start + len(word) + char_offset = word_end + + # Get subword tokens for this word + subword_tokens = subword_cache.get(word) - for word in words: - ids.extend(subword_cache.get(word)) + for token_id in subword_tokens: + ids.append(token_id) + if return_offsets_mapping: + # All subword tokens of a word map to the word's character span + offsets.append((word_start, word_end)) + + if return_word_ids: + # All subword tokens of a word get the same word_id + word_ids.append(word_idx) + + # Add EOS token ids.append(eos_token_id) + if return_offsets_mapping: + offsets.append((len(sentence), len(sentence))) # EOS has no span + if return_word_ids: + word_ids.append(None) # EOS is not part of any word # Truncate if needed if truncation and max_length and len(ids) > max_length: ids = ids[:max_length] + if return_offsets_mapping: + offsets = offsets[:max_length] + if return_word_ids: + word_ids = word_ids[:max_length] all_ids.append(ids) + if return_offsets_mapping: + all_offsets.append(offsets) + if return_word_ids: + all_word_ids.append(word_ids) max_len = max(max_len, len(ids)) # Determine final sequence length - if max_length and not truncation: + if force_max_length and max_length: + # Always use max_length when force_max_length is True + seq_len = max_length + elif max_length and not truncation: seq_len = min(max_len, max_length) elif max_length: seq_len = max_length @@ -162,13 +208,22 @@ def encode_batch_vectorized( input_ids = 
torch.full((batch_size, seq_len), pad_token_id, dtype=torch.long) attention_mask = torch.zeros((batch_size, seq_len), dtype=torch.long) - # Fill tensors + # Fill tensors and pad metadata for i, ids in enumerate(all_ids): length = min(len(ids), seq_len) input_ids[i, :length] = torch.tensor(ids[:length], dtype=torch.long) attention_mask[i, :length] = 1 - return input_ids, attention_mask + # Pad offsets and word_ids to match sequence length + if return_offsets_mapping: + # Pad with (0, 0) for padding tokens + all_offsets[i] = all_offsets[i][:length] + [(0, 0)] * (seq_len - length) + + if return_word_ids: + # Pad with None for padding tokens + all_word_ids[i] = all_word_ids[i][:length] + [None] * (seq_len - length) + + return input_ids, attention_mask, all_offsets, all_word_ids # ============================================================================ @@ -183,8 +238,7 @@ class NGramTokenizer(BaseTokenizer): - Vectorized batch encoding - Cached text normalization - Direct tensor operations - - No multiprocessing overhead - - No Numba dependency + - Optional offset mapping and word ID tracking """ PAD_TOKEN = "[PAD]" @@ -200,6 +254,7 @@ def __init__( len_word_ngrams: int, training_text: Optional[List[str]] = None, preprocess: bool = True, + output_dim: Optional[int] = None, **kwargs, ): if min_n < 2: @@ -227,9 +282,11 @@ def __init__( self.subword_cache = None self.vocab_size = 3 + self.nwords + self.num_tokens - super().__init__(vocab_size=self.vocab_size) + super().__init__( + vocab_size=self.vocab_size, padding_idx=self.pad_token_id, output_dim=output_dim + ) - def _build_vocab(self, training_text: List[str]): + def train(self, training_text: List[str]): """Build vocabulary from training text.""" word_counts = {} for sent in training_text: @@ -261,16 +318,24 @@ def _build_vocab(self, training_text: List[str]): def tokenize( self, text: Union[str, List[str]], - padding: str = "longest", - max_length: Optional[int] = None, - truncation: bool = False, return_offsets_mapping: bool = False, return_word_ids: bool = False, **kwargs, ) -> TokenizerOutput: """ Optimized tokenization with vectorized operations. - Note: return_offsets_mapping and return_word_ids removed for speed. 
+
+        Args:
+            text: Single string or list of strings to tokenize
+            return_offsets_mapping: If True, return character offsets for each token
+            return_word_ids: If True, return word indices for each token
+
+        Note:
+            Padding and truncation are derived from ``self.output_dim``: when it is
+            set, sequences are padded/truncated to that length; otherwise they are
+            padded to the longest sequence in the batch.
+
+        Returns:
+            TokenizerOutput with input_ids, attention_mask, and optionally
+            offset_mapping and word_ids
         """
         is_single = isinstance(text, str)
         if is_single:
@@ -280,21 +345,33 @@
         if self.preprocess:
             text = clean_text_feature(text)
 
+        if self.output_dim is not None:
+            max_length = self.output_dim
+            truncation = True
+        else:
+            max_length = None
+            truncation = False
+
         # Vectorized encoding
-        input_ids, attention_mask = encode_batch_vectorized(
+        input_ids, attention_mask, offsets, word_ids = encode_batch_vectorized(
             text,
             self.subword_cache,
             self.eos_token_id,
            self.pad_token_id,
-            max_length=max_length if padding == "max_length" else None,
+            max_length=max_length,
             truncation=truncation,
+            return_offsets_mapping=return_offsets_mapping,
+            return_word_ids=return_word_ids,
         )
 
+        offsets = torch.tensor(offsets) if return_offsets_mapping else None
+        word_ids = np.array(word_ids) if return_word_ids else None
+
         return TokenizerOutput(
             input_ids=input_ids,
             attention_mask=attention_mask,
-            word_ids=None,
-            offset_mapping=None,
+            word_ids=word_ids,
+            offset_mapping=offsets,
         )
 
     def decode(
From ab704853a6d93b2e1eec9f8142dd4fa08f866514 Mon Sep 17 00:00:00 2001
From: meilame-tayebjee
Date: Thu, 20 Nov 2025 11:12:14 +0000
Subject: [PATCH 59/66] fix: update vocab_size after training

---
 torchTextClassifiers/tokenizers/base.py  | 2 +-
 torchTextClassifiers/tokenizers/ngram.py | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/torchTextClassifiers/tokenizers/base.py b/torchTextClassifiers/tokenizers/base.py
index 75a200d..3dfd71e 100644
--- a/torchTextClassifiers/tokenizers/base.py
+++ b/torchTextClassifiers/tokenizers/base.py
@@ -113,7 +113,7 @@ def __init__(
         trained: bool = False,
     ):
         super().__init__(
-            vocab_size, output_vectorized=False, output_dim=output_dim
+            vocab_size, output_vectorized=False, output_dim=output_dim, padding_idx=padding_idx
         )  # it outputs token ids and not vectors
         self.trained = trained

diff --git a/torchTextClassifiers/tokenizers/ngram.py b/torchTextClassifiers/tokenizers/ngram.py
index b6911f9..c534c3a 100644
--- a/torchTextClassifiers/tokenizers/ngram.py
+++ b/torchTextClassifiers/tokenizers/ngram.py
@@ -282,6 +282,7 @@ def __init__(
         self.subword_cache = None
 
         self.vocab_size = 3 + self.nwords + self.num_tokens
+        print("brrrrr ", self.vocab_size)
         super().__init__(
             vocab_size=self.vocab_size, padding_idx=self.pad_token_id, output_dim=output_dim
         )
@@ -301,6 +302,7 @@ def train(self, training_text: List[str]):
                 idx += 1
 
         self.nwords = len(self.word_to_id)
+        self.vocab_size = 3 + self.nwords + self.num_tokens
 
         # Create reverse mapping
         self.id_to_word = {v: k for k, v in self.word_to_id.items()}

From 27a11bb61485e432ca8ad283e4ceeef89b1d3a5e Mon Sep 17 00:00:00 2001
From: meilame-tayebjee
Date: Thu, 20 Nov 2025 11:17:06 +0000
Subject: [PATCH 60/66] fix: add a flag for return_word_ids aligning with
 NGramTokenizer

---
 torchTextClassifiers/tokenizers/base.py      | 12 ++++++++++--
 torchTextClassifiers/torchTextClassifiers.py |  6 +++++-
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/torchTextClassifiers/tokenizers/base.py b/torchTextClassifiers/tokenizers/base.py
index 3dfd71e..dee5546 100644
---
a/torchTextClassifiers/tokenizers/base.py +++ b/torchTextClassifiers/tokenizers/base.py @@ -122,7 +122,10 @@ def __init__( self.output_dim = output_dim # constant context size for all batch def tokenize( - self, text: Union[str, List[str]], return_offsets_mapping: Optional[bool] = False + self, + text: Union[str, List[str]], + return_offsets_mapping: Optional[bool] = False, + return_word_ids: Optional[bool] = False, ) -> list: if not self.trained: raise RuntimeError("Tokenizer must be trained before tokenization.") @@ -142,11 +145,16 @@ def tokenize( encoded_text = tokenize_output["input_ids"] + if return_word_ids: + word_ids = np.array([tokenize_output.word_ids(i) for i in range(len(encoded_text))]) + else: + word_ids = None + return TokenizerOutput( input_ids=encoded_text, attention_mask=tokenize_output["attention_mask"], offset_mapping=tokenize_output.get("offset_mapping", None), - word_ids=np.array([tokenize_output.word_ids(i) for i in range(len(encoded_text))]), + word_ids=word_ids, ) @classmethod diff --git a/torchTextClassifiers/torchTextClassifiers.py b/torchTextClassifiers/torchTextClassifiers.py index 9509857..66b285d 100644 --- a/torchTextClassifiers/torchTextClassifiers.py +++ b/torchTextClassifiers/torchTextClassifiers.py @@ -460,6 +460,7 @@ def predict( if explain: return_offsets_mapping = True # to be passed to the tokenizer + return_word_ids = True if self.pytorch_model.text_embedder is None: raise RuntimeError( "Explainability is not supported when the tokenizer outputs vectorized text directly. Please use a tokenizer that outputs token IDs." @@ -474,6 +475,7 @@ def predict( ) # initialize a Captum layer gradient integrator else: return_offsets_mapping = False + return_word_ids = False X_test = self._check_X(X_test) text = X_test["text"] @@ -482,7 +484,9 @@ def predict( self.pytorch_model.eval().cpu() tokenize_output = self.tokenizer.tokenize( - text.tolist(), return_offsets_mapping=return_offsets_mapping + text.tolist(), + return_offsets_mapping=return_offsets_mapping, + return_word_ids=return_word_ids, ) if not isinstance(tokenize_output, TokenizerOutput): From 823467b5b79bd28c7f1e1b2e91c61c23f5c12bd0 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Thu, 20 Nov 2025 11:17:06 +0000 Subject: [PATCH 61/66] fix: add a flag for return_word_ids aligning with NGramTokenizer --- torchTextClassifiers/tokenizers/base.py | 12 ++++++++++-- torchTextClassifiers/tokenizers/ngram.py | 2 +- torchTextClassifiers/torchTextClassifiers.py | 6 +++++- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/torchTextClassifiers/tokenizers/base.py b/torchTextClassifiers/tokenizers/base.py index 3dfd71e..dee5546 100644 --- a/torchTextClassifiers/tokenizers/base.py +++ b/torchTextClassifiers/tokenizers/base.py @@ -122,7 +122,10 @@ def __init__( self.output_dim = output_dim # constant context size for all batch def tokenize( - self, text: Union[str, List[str]], return_offsets_mapping: Optional[bool] = False + self, + text: Union[str, List[str]], + return_offsets_mapping: Optional[bool] = False, + return_word_ids: Optional[bool] = False, ) -> list: if not self.trained: raise RuntimeError("Tokenizer must be trained before tokenization.") @@ -142,11 +145,16 @@ def tokenize( encoded_text = tokenize_output["input_ids"] + if return_word_ids: + word_ids = np.array([tokenize_output.word_ids(i) for i in range(len(encoded_text))]) + else: + word_ids = None + return TokenizerOutput( input_ids=encoded_text, attention_mask=tokenize_output["attention_mask"], 
offset_mapping=tokenize_output.get("offset_mapping", None), - word_ids=np.array([tokenize_output.word_ids(i) for i in range(len(encoded_text))]), + word_ids=word_ids, ) @classmethod diff --git a/torchTextClassifiers/tokenizers/ngram.py b/torchTextClassifiers/tokenizers/ngram.py index c534c3a..0addfb3 100644 --- a/torchTextClassifiers/tokenizers/ngram.py +++ b/torchTextClassifiers/tokenizers/ngram.py @@ -282,7 +282,7 @@ def __init__( self.subword_cache = None self.vocab_size = 3 + self.nwords + self.num_tokens - print("brrrrr ", self.vocab_size) + super().__init__( vocab_size=self.vocab_size, padding_idx=self.pad_token_id, output_dim=output_dim ) diff --git a/torchTextClassifiers/torchTextClassifiers.py b/torchTextClassifiers/torchTextClassifiers.py index 9509857..66b285d 100644 --- a/torchTextClassifiers/torchTextClassifiers.py +++ b/torchTextClassifiers/torchTextClassifiers.py @@ -460,6 +460,7 @@ def predict( if explain: return_offsets_mapping = True # to be passed to the tokenizer + return_word_ids = True if self.pytorch_model.text_embedder is None: raise RuntimeError( "Explainability is not supported when the tokenizer outputs vectorized text directly. Please use a tokenizer that outputs token IDs." @@ -474,6 +475,7 @@ def predict( ) # initialize a Captum layer gradient integrator else: return_offsets_mapping = False + return_word_ids = False X_test = self._check_X(X_test) text = X_test["text"] @@ -482,7 +484,9 @@ def predict( self.pytorch_model.eval().cpu() tokenize_output = self.tokenizer.tokenize( - text.tolist(), return_offsets_mapping=return_offsets_mapping + text.tolist(), + return_offsets_mapping=return_offsets_mapping, + return_word_ids=return_word_ids, ) if not isinstance(tokenize_output, TokenizerOutput): From 4e2ffa54bf4043fe693852cc0c1bba84955dd831 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Thu, 20 Nov 2025 11:32:00 +0000 Subject: [PATCH 62/66] fix: replace _build_vocab by train for clarity and consistency --- torchTextClassifiers/tokenizers/ngram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchTextClassifiers/tokenizers/ngram.py b/torchTextClassifiers/tokenizers/ngram.py index 0addfb3..ed0d8cb 100644 --- a/torchTextClassifiers/tokenizers/ngram.py +++ b/torchTextClassifiers/tokenizers/ngram.py @@ -274,7 +274,7 @@ def __init__( self.eos_token_id = 2 if training_text is not None: - self._build_vocab(training_text) + self.train(training_text) else: self.word_to_id = {} self.id_to_word = {} From 519a32d48fd0fef7ab2b4ce5f44145b5e2a70f69 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Thu, 20 Nov 2025 11:32:25 +0000 Subject: [PATCH 63/66] feat(test): add test of all pipeline with different tokenizers --- tests/test_pipeline.py | 236 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100644 tests/test_pipeline.py diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py new file mode 100644 index 0000000..b272acc --- /dev/null +++ b/tests/test_pipeline.py @@ -0,0 +1,236 @@ +import numpy as np +import pytest +import torch + +from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers +from torchTextClassifiers.dataset import TextClassificationDataset +from torchTextClassifiers.model import TextClassificationModel, TextClassificationModule +from torchTextClassifiers.model.components import ( + AttentionConfig, + CategoricalVariableNet, + ClassificationHead, + TextEmbedder, + TextEmbedderConfig, +) +from torchTextClassifiers.tokenizers import HuggingFaceTokenizer, NGramTokenizer, 
WordPieceTokenizer +from torchTextClassifiers.utilities.plot_explainability import ( + map_attributions_to_char, + map_attributions_to_word, + plot_attributions_at_char, + plot_attributions_at_word, +) + + +@pytest.fixture +def sample_data(): + """Fixture providing sample data for all tests.""" + sample_text_data = [ + "This is a positive example", + "This is a negative example", + "Another positive case", + "Another negative case", + "Good example here", + "Bad example here", + ] + categorical_data = np.array([[1, 0], [0, 1], [1, 0], [0, 1], [1, 0], [0, 1]]).astype(int) + labels = np.array([1, 0, 1, 0, 1, 5]) + + return sample_text_data, categorical_data, labels + + +@pytest.fixture +def model_params(): + """Fixture providing common model parameters.""" + return { + "embedding_dim": 96, + "n_layers": 2, + "n_head": 4, + "num_classes": 10, + "categorical_vocab_sizes": [2, 2], + "categorical_embedding_dims": [4, 7], + } + + +def run_full_pipeline(tokenizer, sample_text_data, categorical_data, labels, model_params): + """Helper function to run the complete pipeline for a given tokenizer.""" + # Create dataset + dataset = TextClassificationDataset( + texts=sample_text_data, + categorical_variables=categorical_data.tolist(), + tokenizer=tokenizer, + labels=None, + ) + + dataloader = dataset.create_dataloader(batch_size=4) + batch = next(iter(dataloader)) + + # Get tokenizer parameters + vocab_size = tokenizer.vocab_size + padding_idx = tokenizer.padding_idx + sequence_len = tokenizer.output_dim + + # Create attention config + attention_config = AttentionConfig( + n_layers=model_params["n_layers"], + n_head=model_params["n_head"], + n_kv_head=model_params["n_head"], + sequence_len=sequence_len, + ) + + # Create text embedder + text_embedder_config = TextEmbedderConfig( + vocab_size=vocab_size, + embedding_dim=model_params["embedding_dim"], + padding_idx=padding_idx, + attention_config=attention_config, + ) + + text_embedder = TextEmbedder(text_embedder_config=text_embedder_config) + text_embedder.init_weights() + + # Create categorical variable net + categorical_var_net = CategoricalVariableNet( + categorical_vocabulary_sizes=model_params["categorical_vocab_sizes"], + categorical_embedding_dims=model_params["categorical_embedding_dims"], + ) + + # Create classification head + expected_input_dim = model_params["embedding_dim"] + categorical_var_net.output_dim + classification_head = ClassificationHead( + input_dim=expected_input_dim, + num_classes=model_params["num_classes"], + ) + + # Create model + model = TextClassificationModel( + text_embedder=text_embedder, + categorical_variable_net=categorical_var_net, + classification_head=classification_head, + ) + + # Test forward pass + model(**batch) + + # Create module + module = TextClassificationModule( + model=model, + loss=torch.nn.CrossEntropyLoss(), + optimizer=torch.optim.Adam, + optimizer_params={"lr": 1e-3}, + scheduler=None, + scheduler_params=None, + scheduler_interval="epoch", + ) + + # Test prediction + module.predict_step(batch) + + # Prepare data for training + X = np.column_stack([sample_text_data, categorical_data]) + Y = labels + + # Create model config + model_config = ModelConfig( + embedding_dim=model_params["embedding_dim"], + categorical_vocabulary_sizes=model_params["categorical_vocab_sizes"], + categorical_embedding_dims=model_params["categorical_embedding_dims"], + num_classes=model_params["num_classes"], + attention_config=attention_config, + ) + + # Create training config + training_config = TrainingConfig( + lr=1e-3, + 
batch_size=4, + num_epochs=1, + ) + + # Create classifier + ttc = torchTextClassifiers( + tokenizer=tokenizer, + model_config=model_config, + ) + + # Train + ttc.train( + X_train=X, + y_train=Y, + X_val=X, + y_val=Y, + training_config=training_config, + ) + + # Predict with explanations + top_k = 5 + predictions = ttc.predict(X, top_k=top_k, explain=True) + + # Test explainability functions + text_idx = 0 + text = sample_text_data[text_idx] + offsets = predictions["offset_mapping"][text_idx] + attributions = predictions["attributions"][text_idx] + word_ids = predictions["word_ids"][text_idx] + + word_attributions = map_attributions_to_word(attributions, word_ids) + char_attributions = map_attributions_to_char(attributions, offsets, text) + + # Note: We're not actually plotting in tests, just calling the functions + # to ensure they don't raise errors + plot_attributions_at_char(text, char_attributions) + plot_attributions_at_word(text, word_attributions) + + +def test_wordpiece_tokenizer(sample_data, model_params): + """Test the full pipeline with WordPieceTokenizer.""" + sample_text_data, categorical_data, labels = sample_data + + vocab_size = 100 + tokenizer = WordPieceTokenizer(vocab_size, output_dim=50) + tokenizer.train(sample_text_data) + + # Check tokenizer works + result = tokenizer.tokenize(sample_text_data) + assert result.input_ids.shape[0] == len(sample_text_data) + + # Run full pipeline + run_full_pipeline(tokenizer, sample_text_data, categorical_data, labels, model_params) + + +def test_huggingface_tokenizer(sample_data, model_params): + """Test the full pipeline with HuggingFaceTokenizer.""" + sample_text_data, categorical_data, labels = sample_data + + tokenizer = HuggingFaceTokenizer.load_from_pretrained( + "google-bert/bert-base-uncased", output_dim=50 + ) + + # Check tokenizer works + result = tokenizer.tokenize(sample_text_data) + assert result.input_ids.shape[0] == len(sample_text_data) + + # Run full pipeline + run_full_pipeline(tokenizer, sample_text_data, categorical_data, labels, model_params) + + +def test_ngram_tokenizer(sample_data, model_params): + """Test the full pipeline with NGramTokenizer.""" + sample_text_data, categorical_data, labels = sample_data + + tokenizer = NGramTokenizer( + min_count=3, min_n=2, max_n=5, num_tokens=100, len_word_ngrams=2, output_dim=76 + ) + tokenizer.train(sample_text_data) + + # Check tokenizer works + result = tokenizer.tokenize( + sample_text_data[0], return_offsets_mapping=True, return_word_ids=True + ) + assert result.input_ids is not None + + # Check batch decode + batch_result = tokenizer.tokenize(sample_text_data) + decoded = tokenizer.batch_decode(batch_result.input_ids.tolist()) + assert len(decoded) == len(sample_text_data) + + # Run full pipeline + run_full_pipeline(tokenizer, sample_text_data, categorical_data, labels, model_params) From 6017a22097a8520d5d385316b8f06642f0a23d2b Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Thu, 20 Nov 2025 12:41:13 +0000 Subject: [PATCH 64/66] chore: remove old file has been put into tests into a suitable format --- xxx.py | 167 --------------------------------------------------------- 1 file changed, 167 deletions(-) delete mode 100644 xxx.py diff --git a/xxx.py b/xxx.py deleted file mode 100644 index 2350dba..0000000 --- a/xxx.py +++ /dev/null @@ -1,167 +0,0 @@ -from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers -from torchTextClassifiers.dataset import TextClassificationDataset -from torchTextClassifiers.model import 
TextClassificationModel, TextClassificationModule -from torchTextClassifiers.model.components import ( - AttentionConfig, - CategoricalVariableNet, - ClassificationHead, - TextEmbedder, - TextEmbedderConfig, -) -from torchTextClassifiers.tokenizers import HuggingFaceTokenizer, WordPieceTokenizer -from torchTextClassifiers.utilities.plot_explainability import ( - map_attributions_to_char, - map_attributions_to_word, - plot_attributions_at_char, - plot_attributions_at_word, -) - -sample_text_data = [ - "This is a positive example", - "This is a negative example", - "Another positive case", - "Another negative case", - "Good example here", - "Bad example here", -] -categorical_data = [[1, 0], [0, 1], [1, 0], [0, 1], [1, 0], [0, 1]] -labels = [1, 0, 1, 0, 1, 5] - -### -tokenizer = WordPieceTokenizer(3, output_dim=None) -tokenizer.train(sample_text_data) -tokenizer.tokenize(sample_text_data).input_ids.shape - - -### -tokenizer = HuggingFaceTokenizer.load_from_pretrained( - "google-bert/bert-base-uncased", output_dim=126 -) -tokenizer.tokenize(sample_text_data).input_ids.shape - - -dataset = TextClassificationDataset( - texts=sample_text_data, categorical_variables=categorical_data, tokenizer=tokenizer, labels=None -) - -dataloader = dataset.create_dataloader(batch_size=4) - -batch = next(iter(dataloader)) - -vocab_size = tokenizer.vocab_size -padding_idx = tokenizer.padding_idx - -embedding_dim = 96 -n_layers = 2 -n_head = 4 -n_kv_head = n_head -sequence_len = tokenizer.output_dim - -attention_config = AttentionConfig( - n_layers=n_layers, - n_head=n_head, - n_kv_head=n_kv_head, - sequence_len=sequence_len, -) - -text_embedder_config = TextEmbedderConfig( - vocab_size=vocab_size, - embedding_dim=embedding_dim, - padding_idx=padding_idx, - attention_config=attention_config, -) - - -text_embedder = TextEmbedder( - text_embedder_config=text_embedder_config, -) -text_embedder.init_weights() - - -categorical_vocab_sizes = [2, 2] -categorical_embedding_dims = [4, 7] - -categorical_var_net = CategoricalVariableNet( - categorical_vocabulary_sizes=categorical_vocab_sizes, - categorical_embedding_dims=categorical_embedding_dims, -) - -num_classes = 10 -expected_input_dim = embedding_dim + categorical_var_net.output_dim -classification_head = ClassificationHead( - input_dim=expected_input_dim, - num_classes=num_classes, -) - -model = TextClassificationModel( - text_embedder=text_embedder, - categorical_variable_net=categorical_var_net, - classification_head=classification_head, -) - -model(**batch) - -import torch - -module = TextClassificationModule( - model=model, - loss=torch.nn.CrossEntropyLoss(), - optimizer=torch.optim.Adam, - optimizer_params={"lr": 1e-3}, - scheduler=None, - scheduler_params=None, - scheduler_interval="epoch", -) - -module.predict_step(batch) - -# Convert categorical data to numpy array -import numpy as np - -categorical_data = np.array(categorical_data).astype(int) - -# Combine text (as a column vector) with categorical data -X = np.column_stack([sample_text_data, categorical_data]) -Y = np.array(labels) - -model_config = ModelConfig( - embedding_dim=embedding_dim, - categorical_vocabulary_sizes=categorical_vocab_sizes, - categorical_embedding_dims=categorical_embedding_dims, - num_classes=num_classes, - attention_config=attention_config, -) - -training_config = TrainingConfig( - lr=1e-3, - batch_size=4, - num_epochs=1, -) - -ttc = torchTextClassifiers( - tokenizer=tokenizer, - model_config=model_config, -) - -ttc.train( - X_train=X, - y_train=Y, - X_val=X, - y_val=Y, - 
training_config=training_config,
-)
-
-top_k = 5
-yyy = ttc.predict(X, top_k=top_k, explain=True)
-
-text_idx = 0
-text = sample_text_data[text_idx]
-offsets = yyy["offset_mapping"][text_idx]  # seq_len, 2
-attributions = yyy["attributions"][text_idx]  # top_k, seq_len
-word_ids = yyy["word_ids"][text_idx]  # seq_len
-
-word_attributions = map_attributions_to_word(attributions, word_ids)
-char_attributions = map_attributions_to_char(attributions, offsets, text)
-
-plot_attributions_at_char(text, char_attributions)
-plot_attributions_at_word(text, word_attributions)

From aa709198a686c807e5104a9bb43150c9d75f1e70 Mon Sep 17 00:00:00 2001
From: Meilame Tayebjee <114609737+meilame-tayebjee@users.noreply.github.com>
Date: Thu, 20 Nov 2025 13:44:15 +0100
Subject: [PATCH 65/66] fix: correct command to install HF dependencies in
 warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Related to c7307f551708f3f0246ac3df35cd1c0144885832

Co-authored-by: Cédric Couralet
---
 torchTextClassifiers/tokenizers/WordPiece.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchTextClassifiers/tokenizers/WordPiece.py b/torchTextClassifiers/tokenizers/WordPiece.py
index 1222c80..280d11d 100644
--- a/torchTextClassifiers/tokenizers/WordPiece.py
+++ b/torchTextClassifiers/tokenizers/WordPiece.py
@@ -6,7 +6,7 @@
 
 if not HAS_HF:
     raise ImportError(
-        "The HuggingFace dependencies are needed to use this tokenizer. Please run 'uv add torchTextClassifiers --group hf-dep."
+        "The HuggingFace dependencies are needed to use this tokenizer. Please run 'uv add torchTextClassifiers --extra huggingface'."
     )
 else:
     from tokenizers import (

From 41a15f02603a667ac8d9654b110aaa2e5d7ff4fd Mon Sep 17 00:00:00 2001
From: meilame-tayebjee
Date: Thu, 20 Nov 2025 12:49:17 +0000
Subject: [PATCH 66/66] chore: change HF opt. dep. 
group name to huggingface --- pyproject.toml | 2 +- uv.lock | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a9ca049..47449ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,7 @@ docs = [ [project.optional-dependencies] explainability = ["unidecode", "nltk", "captum"] preprocess = ["unidecode", "nltk"] -hf-dep = [ +huggingface = [ "tokenizers>=0.22.1", "transformers>=4.57.1", "datasets>=4.3.0", diff --git a/uv.lock b/uv.lock index f65812b..627d823 100644 --- a/uv.lock +++ b/uv.lock @@ -2242,7 +2242,7 @@ explainability = [ { name = "nltk" }, { name = "unidecode" }, ] -hf-dep = [ +huggingface = [ { name = "datasets" }, { name = "tokenizers" }, { name = "transformers" }, @@ -2278,17 +2278,17 @@ docs = [ [package.metadata] requires-dist = [ { name = "captum", marker = "extra == 'explainability'" }, - { name = "datasets", marker = "extra == 'hf-dep'", specifier = ">=4.3.0" }, + { name = "datasets", marker = "extra == 'huggingface'", specifier = ">=4.3.0" }, { name = "nltk", marker = "extra == 'explainability'" }, { name = "nltk", marker = "extra == 'preprocess'" }, { name = "numpy", specifier = ">=1.26.4" }, { name = "pytorch-lightning", specifier = ">=2.4.0" }, - { name = "tokenizers", marker = "extra == 'hf-dep'", specifier = ">=0.22.1" }, - { name = "transformers", marker = "extra == 'hf-dep'", specifier = ">=4.57.1" }, + { name = "tokenizers", marker = "extra == 'huggingface'", specifier = ">=0.22.1" }, + { name = "transformers", marker = "extra == 'huggingface'", specifier = ">=4.57.1" }, { name = "unidecode", marker = "extra == 'explainability'" }, { name = "unidecode", marker = "extra == 'preprocess'" }, ] -provides-extras = ["explainability", "preprocess", "hf-dep"] +provides-extras = ["explainability", "preprocess", "huggingface"] [package.metadata.requires-dev] dev = [
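
To make the renamed extra concrete, a minimal usage sketch in Python follows. It leans only on what the diffs above already show (the `huggingface` extra pulling in tokenizers/transformers, and the `HuggingFaceTokenizer.load_from_pretrained` / `tokenize` calls exercised in the tests); the checkpoint name and `output_dim` value are illustrative assumptions, not something the patches prescribe.

# Install the optional HuggingFace dependencies via the renamed extra:
#   uv add torchTextClassifiers --extra huggingface
# or, equivalently:
#   pip install "torchTextClassifiers[huggingface]"
#
# Sketch assuming the APIs exercised in the tests above.
from torchTextClassifiers.tokenizers import HuggingFaceTokenizer

# Wrap a pretrained HF tokenizer; output_dim fixes the padded sequence length.
tokenizer = HuggingFaceTokenizer.load_from_pretrained(
    "google-bert/bert-base-uncased", output_dim=50
)

batch = tokenizer.tokenize(["a short example sentence"])
print(batch.input_ids.shape)  # expected: (1, 50), per the tests above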