From b5a5a05db9a69f3d318f79f5ad49f217734c6d56 Mon Sep 17 00:00:00 2001 From: rocky Date: Fri, 23 Jan 2026 22:25:22 -0500 Subject: [PATCH 1/7] Add basic WordTranslation Also remove linguistics_data directory. We don't docstring this properly yet. --- pymathics/natlang/__init__.py | 5 +- .../__init__.py => linguistic_data.py} | 0 .../natlang/linguistic_data/lang_trans.py | 3 - .../natlang/linguistic_data/translation.py | 54 --------- pymathics/natlang/translation.py | 106 ++++++++++++++++++ pyproject.toml | 4 +- 6 files changed, 111 insertions(+), 61 deletions(-) rename pymathics/natlang/{linguistic_data/__init__.py => linguistic_data.py} (100%) delete mode 100644 pymathics/natlang/linguistic_data/lang_trans.py delete mode 100644 pymathics/natlang/linguistic_data/translation.py create mode 100644 pymathics/natlang/translation.py diff --git a/pymathics/natlang/__init__.py b/pymathics/natlang/__init__.py index 9e8cb39..548bdcc 100644 --- a/pymathics/natlang/__init__.py +++ b/pymathics/natlang/__init__.py @@ -48,6 +48,7 @@ WordDefinition, WordList, ) + from pymathics.natlang.manipulate import Pluralize from pymathics.natlang.normalization import ( DeleteStopwords, @@ -65,8 +66,7 @@ WordSimilarity, WordStem, ) - -from pymathics.natlang.linguistic_data.translation import LanguageIdentify +from pymathics.natlang.translation import LanguageIdentify, WordTranslation from pymathics.natlang.version import __version__ pymathics_version_data = { @@ -99,6 +99,7 @@ "WordList", "WordSimilarity", "WordStem", + "WordTranslation", "__version__", "pymathics_version_data", ] diff --git a/pymathics/natlang/linguistic_data/__init__.py b/pymathics/natlang/linguistic_data.py similarity index 100% rename from pymathics/natlang/linguistic_data/__init__.py rename to pymathics/natlang/linguistic_data.py diff --git a/pymathics/natlang/linguistic_data/lang_trans.py b/pymathics/natlang/linguistic_data/lang_trans.py deleted file mode 100644 index e1d7f13..0000000 --- 
a/pymathics/natlang/linguistic_data/lang_trans.py +++ /dev/null @@ -1,3 +0,0 @@ -""" -Languages & Translations -""" diff --git a/pymathics/natlang/linguistic_data/translation.py b/pymathics/natlang/linguistic_data/translation.py deleted file mode 100644 index 94486c2..0000000 --- a/pymathics/natlang/linguistic_data/translation.py +++ /dev/null @@ -1,54 +0,0 @@ -# -*- coding: utf-8 -*- - - -""" -Language Translation - - -""" - -# This is under Text Normalization in WR. But also in Natural Language Processing, -# and Linguistic Data. I put here because is the only module that uses langid and pycountry -# modules. -# -# TODO: WordTranslation, TextTranslation - -from typing import Union - -import langid # see https://github.com/saffsd/langid.py -import pycountry -from mathics.core.atoms import String -from mathics.core.builtin import Builtin -from mathics.core.evaluation import Evaluation -from mathics.core.symbols import Symbol -from mathics.core.systemsymbols import SymbolFailed - -sort_order = "Language Translation" - - -class LanguageIdentify(Builtin): - """ - :WMA link: - https://reference.wolfram.com/language/ref/LanguageIdentify.html - -
-
'LanguageIdentify'[$text$] -
returns the name of the language used in $text$. -
- - >> LanguageIdentify["eins zwei drei"] - = German - """ - - summary_text = "determine the predominant human language in a string" - - def eval(self, text: String, evaluation: Evaluation) -> Union[Symbol, String]: - "LanguageIdentify[text_String]" - - # an alternative: https://github.com/Mimino666/langdetect - - code, _ = langid.classify(text.value) - language = pycountry.languages.get(alpha_2=code) - if language is None: - return SymbolFailed - return String(language.name) diff --git a/pymathics/natlang/translation.py b/pymathics/natlang/translation.py new file mode 100644 index 0000000..7b97372 --- /dev/null +++ b/pymathics/natlang/translation.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- +""" +Language Translation + +""" + +# This is under Text Normalization in WR. But also in Natural Language Processing, +# and Linguistic Data. I put here because is the only module that uses langid and pycountry +# modules. +# +# TODO: WordTranslation, TextTranslation + +from typing import Union + +import langid # see https://github.com/saffsd/langid.py +import nltk.langnames as lgn +import pycountry +from mathics.core.atoms import String +from mathics.core.builtin import Builtin +from mathics.core.evaluation import Evaluation +from mathics.core.list import ListExpression +from mathics.core.symbols import Symbol +from mathics.core.systemsymbols import SymbolFailed +from nltk.corpus import wordnet as wn +from nltk.corpus.reader.wordnet import WordNetError + +sort_order = "Language Translation" + + +class LanguageIdentify(Builtin): + """ + :WMA link: + https://reference.wolfram.com/language/ref/LanguageIdentify.html + +
+
'LanguageIdentify'[$text$] +
returns the name of the language used in $text$. +
+ + >> LanguageIdentify["eins zwei drei"] + = German + """ + + summary_text = "determine the predominant human language in a string" + + def eval(self, text: String, evaluation: Evaluation) -> Union[Symbol, String]: + "LanguageIdentify[text_String]" + + # an alternative: https://github.com/Mimino666/langdetect + + code, _ = langid.classify(text.value) + language = pycountry.languages.get(alpha_2=code) + if language is None: + return SymbolFailed + return String(language.name) + + +# FIXME generalize +class WordTranslation(Builtin): + """ + :WMA link: + https://reference.wolfram.com/language/ref/WordTranslation.html + +
+

 'WordTranslation'[$word$, $lang$]

 returns a list of translations of $word$ into $lang$.
+ + >> WordTranslation["dog", "French"] + = ... + """ + + summary_text = "give word translations" + + def eval( + self, word: String, lang: String, evaluation: Evaluation + ) -> ListExpression: + "WordTranslation[word_String, lang_String]" + return eval_WordTranslation(word.value, lang.value) + + +# FIXME generalize +def eval_WordTranslation(word: str, language_name: str): + """ + Return a list of translations of `word` in English to `language_name`. + """ + + # Convert "language_name" using NLTK's langnames utility + # to its 3-letter ISO 639-3 code. + iso_code = lgn.langcode(language_name, typ=3) + + if iso_code is None: + return SymbolFailed + + synsets = wn.synsets(word) + translations = set() + + for ss in synsets: + # Pass the converted code to WordNet + try: + for lemma in ss.lemmas(lang=iso_code): + translations.add(lemma.name()) + except WordNetError: + return SymbolFailed + + return ListExpression(*[String(word) for word in translations]) diff --git a/pyproject.toml b/pyproject.toml index 94f0278..19a93e8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ requires = [ "langid", # replace with a supported newer package, e.g. via spacy "matplotlib", "mpmath>=1.2.0", - "nltk>=3.8.0", + "nltk[bcp47]>=3.8.0", "numpy", "pycountry>=3.2.0", "pyenchant>=3.2.0", @@ -27,7 +27,7 @@ dependencies = [ "joblib>=1.0.1", "langid", # replace with a supported newer package, e.g. 
via spacy "llvmlite>=0.36", - "nltk>=3.8.0", + "nltk[bcp47]>=3.8.0", "mpmath>=1.2.0", "PatternLite", "pyenchant>=3.2.0", From 41c40ec355d888af379aecf0efc3452c24b1d091 Mon Sep 17 00:00:00 2001 From: rocky Date: Sat, 24 Jan 2026 05:56:25 -0500 Subject: [PATCH 2/7] Use github mathics-scanner in CI --- .github/workflows/consistency-checks.yml | 2 +- .github/workflows/ubuntu.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/consistency-checks.yml b/.github/workflows/consistency-checks.yml index 0dbf299..501f271 100644 --- a/.github/workflows/consistency-checks.yml +++ b/.github/workflows/consistency-checks.yml @@ -24,7 +24,7 @@ jobs: python -m pip install pytest python -m pip install Mathics3-Module-Base # Can comment out when next Mathics3 core and Mathics-scanner are released - # python -m pip install -e git+https://github.com/Mathics3/mathics-scanner#egg=Mathics-Scanner[full] + python -m pip install -e git+https://github.com/Mathics3/mathics-scanner#egg=Mathics-Scanner[full] # Until the next mathics-core release git clone https://github.com/Mathics3/mathics-core (cd mathics-core && python -m pip install -e .[full]) diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 8fea1ac..28a1fcc 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -23,7 +23,7 @@ jobs: python -m pip install --upgrade pip python -m pip install pytest # Go over and comment out stuff when next Mathics core and Mathics-scanner are released - # python -m pip install -e git+https://github.com/Mathics3/mathics-scanner#egg=Mathics-Scanner[full] + python -m pip install -e git+https://github.com/Mathics3/mathics-scanner#egg=Mathics-Scanner[full] # Until the next mathics-core release git clone https://github.com/Mathics3/mathics-core (cd mathics-core && python -m pip install -e .[full]) From 8a5a21ca05249cc2376b284dd624d9132c80dd91 Mon Sep 17 00:00:00 2001 From: rocky Date: Sat, 24 Jan 2026 06:07:33 -0500 Subject: 
[PATCH 3/7] Try to fix up build --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 19a93e8..3de6887 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,6 @@ requires = [ "langid", # replace with a supported newer package, e.g. via spacy "matplotlib", "mpmath>=1.2.0", - "nltk[bcp47]>=3.8.0", "numpy", "pycountry>=3.2.0", "pyenchant>=3.2.0", From f74dc1da75d20ca56e599290a0650712c0cb4bb0 Mon Sep 17 00:00:00 2001 From: rocky Date: Sat, 24 Jan 2026 06:41:58 -0500 Subject: [PATCH 4/7] Improve bcp47 loading? And another attempt to get CI working --- .github/workflows/ubuntu.yml | 3 ++ .gitignore | 1 + Makefile | 6 ++- admin-tools/download-bcp47.py | 21 ++++++++ pymathics/natlang/translation.py | 84 +++++++++++++++++--------------- pyproject.toml | 2 +- 6 files changed, 76 insertions(+), 41 deletions(-) create mode 100755 admin-tools/download-bcp47.py diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 28a1fcc..e70e152 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -9,6 +9,8 @@ on: jobs: build: runs-on: ubuntu-latest + env: + NLTK_DATA: "/tmp" strategy: matrix: python-version: ['3.12', '3.13'] @@ -28,6 +30,7 @@ jobs: git clone https://github.com/Mathics3/mathics-core (cd mathics-core && python -m pip install -e .[full]) (cd mathics-core && bash ./admin-tools/make-JSON-tables.sh) + python ./admin-tools/download-bcp47.py - name: Install Mathics3 Module nltk run: | diff --git a/.gitignore b/.gitignore index 61b4358..0da6c50 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *~ .python-version +/.nltk_data /.python-version /ChangeLog /ChangeLog-spell-corrected diff --git a/Makefile b/Makefile index 52cb4d6..0da1807 100644 --- a/Makefile +++ b/Makefile @@ -28,6 +28,10 @@ SPACY_DOWNLOAD ?= $(lang)_core_web_$(WORDLIST_SIZE) #: Default target - same as "develop" all: develop +#: Download bcp47 which is used needed to support Mathics3 builtin 
WordTranslation +download-bcp47: + $(PYTHON) ./admin-tools/download-bcp47.py + #: Word-list data. Customize with lang and eventually WORDLIST_SIZE variables wordlist: $(PYTHON) -m nltk.downloader wordnet2022 omw-1.4 @@ -38,7 +42,7 @@ pypi-setup: $(PIP) install --no-build-isolation -e . #: Set up to run from the source tree -develop: pypi-setup +develop: pypi-setup download-bcp47 $(MAKE) wordlist #: Install Mathics3 Module nltk diff --git a/admin-tools/download-bcp47.py b/admin-tools/download-bcp47.py new file mode 100755 index 0000000..df1d3a5 --- /dev/null +++ b/admin-tools/download-bcp47.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 +""" +Load bcp47 which is needed to support Mathics3 builtin-function WordTranslation. +""" +import os + +import nltk + +# choose a local data dir so we don't require system-wide write access +data_dir = os.environ.get("NLTK_DATA", os.path.join(os.getcwd(), ".nltk_data")) +os.makedirs(data_dir, exist_ok=True) + +# ensure nltk knows about it +if data_dir not in nltk.data.path: + nltk.data.path.append(data_dir) + +# only download if missing +try: + nltk.data.find("corpora/bcp47") +except LookupError: + nltk.download("bcp47", download_dir=data_dir, quiet=False) diff --git a/pymathics/natlang/translation.py b/pymathics/natlang/translation.py index 7b97372..afd1eda 100644 --- a/pymathics/natlang/translation.py +++ b/pymathics/natlang/translation.py @@ -13,7 +13,6 @@ from typing import Union import langid # see https://github.com/saffsd/langid.py -import nltk.langnames as lgn import pycountry from mathics.core.atoms import String from mathics.core.builtin import Builtin @@ -24,6 +23,13 @@ from nltk.corpus import wordnet as wn from nltk.corpus.reader.wordnet import WordNetError +try: + import nltk.langnames as lgn +except ImportError: + have_lng = False +else: + have_lng = True + sort_order = "Language Translation" @@ -55,52 +61,52 @@ def eval(self, text: String, evaluation: Evaluation) -> Union[Symbol, String]: return String(language.name) -# 
FIXME generalize -class WordTranslation(Builtin): - """ - :WMA link: - https://reference.wolfram.com/language/ref/WordTranslation.html +if have_lng: -
-

 'WordTranslation'[$word$, $lang$]

 returns a list of translations of $word$ into $lang$.
+ # FIXME generalize + class WordTranslation(Builtin): + """ + :WMA link: + https://reference.wolfram.com/language/ref/WordTranslation.html - >> WordTranslation["dog", "French"] - = ... - """ +
+

 'WordTranslation'[$word$, $lang$]

 returns a list of translations of $word$ into $lang$.
- summary_text = "give word translations" + >> WordTranslation["dog", "French"] + = ... + """ - def eval( - self, word: String, lang: String, evaluation: Evaluation - ) -> ListExpression: - "WordTranslation[word_String, lang_String]" - return eval_WordTranslation(word.value, lang.value) + summary_text = "give word translations" + def eval( + self, word: String, lang: String, evaluation: Evaluation + ) -> ListExpression: + "WordTranslation[word_String, lang_String]" + return eval_WordTranslation(word.value, lang.value) -# FIXME generalize -def eval_WordTranslation(word: str, language_name: str): - """ - Return a list of translations of `word` in English to `language_name`. - """ + def eval_WordTranslation(word: str, language_name: str): + """ + Return a list of translations of `word` in English to `language_name`. + """ - # Convert "language_name" using NLTK's langnames utility - # to its 3-letter ISO 639-3 code. - iso_code = lgn.langcode(language_name, typ=3) + # Convert "language_name" using NLTK's langnames utility + # to its 3-letter ISO 639-3 code. 
+ iso_code = lgn.langcode(language_name, typ=3) - if iso_code is None: - return SymbolFailed + if iso_code is None: + return SymbolFailed - synsets = wn.synsets(word) - translations = set() + synsets = wn.synsets(word) + translations = set() - for ss in synsets: - # Pass the converted code to WordNet - try: - for lemma in ss.lemmas(lang=iso_code): - translations.add(lemma.name()) - except WordNetError: - return SymbolFailed + for ss in synsets: + # Pass the converted code to WordNet + try: + for lemma in ss.lemmas(lang=iso_code): + translations.add(lemma.name()) + except WordNetError: + return SymbolFailed - return ListExpression(*[String(word) for word in translations]) + return ListExpression(*[String(word) for word in translations]) diff --git a/pyproject.toml b/pyproject.toml index 3de6887..e768c92 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ dependencies = [ "joblib>=1.0.1", "langid", # replace with a supported newer package, e.g. via spacy "llvmlite>=0.36", - "nltk[bcp47]>=3.8.0", + "nltk>=3.8.0", "mpmath>=1.2.0", "PatternLite", "pyenchant>=3.2.0", From d3a52ff9f5c0cacce751e12d5348bc562c180027 Mon Sep 17 00:00:00 2001 From: rocky Date: Sat, 24 Jan 2026 06:52:53 -0500 Subject: [PATCH 5/7] Fix up CI? 
--- .github/workflows/consistency-checks.yml | 4 +++- .github/workflows/ubuntu.yml | 3 +-- Makefile | 7 ++----- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/workflows/consistency-checks.yml b/.github/workflows/consistency-checks.yml index 501f271..b75dc28 100644 --- a/.github/workflows/consistency-checks.yml +++ b/.github/workflows/consistency-checks.yml @@ -9,6 +9,8 @@ on: jobs: build: runs-on: ubuntu-latest + env: + NLTK_DATA: "/tmp" strategy: matrix: python-version: ['3.13'] @@ -33,7 +35,7 @@ jobs: - name: Install Mathics3 Module nltk run: | python -m pip install --no-build-isolation setuptools Mathics3[full] nltk PatternLite enchant - make develop - name: Test Mathics Consistency and Style run: | + make develop make check-consistency-and-style diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index e70e152..ab67917 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -30,12 +30,11 @@ jobs: git clone https://github.com/Mathics3/mathics-core (cd mathics-core && python -m pip install -e .[full]) (cd mathics-core && bash ./admin-tools/make-JSON-tables.sh) - python ./admin-tools/download-bcp47.py - name: Install Mathics3 Module nltk run: | python -m pip install --no-build-isolation setuptools Mathics3[full] nltk PatternLite enchant - make develop - name: Test Mathics3 Module nltk run: | + make develop make -j3 check diff --git a/Makefile b/Makefile index 0da1807..e89d274 100644 --- a/Makefile +++ b/Makefile @@ -42,12 +42,9 @@ pypi-setup: $(PIP) install --no-build-isolation -e . 
#: Set up to run from the source tree -develop: pypi-setup download-bcp47 +develop: pypi-setup $(MAKE) wordlist - -#: Install Mathics3 Module nltk -install: pypi-setup - $(PYTHON) setup.py install + $(MAKE) download-bcp47 #: Run tests test check: pytest doctest From b4ec2e73ac686fc25b15ff975d823e1a91119426 Mon Sep 17 00:00:00 2001 From: rocky Date: Sat, 24 Jan 2026 10:06:40 -0500 Subject: [PATCH 6/7] Add argument checking to WordTranslation --- pymathics/natlang/translation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pymathics/natlang/translation.py b/pymathics/natlang/translation.py index afd1eda..6079559 100644 --- a/pymathics/natlang/translation.py +++ b/pymathics/natlang/translation.py @@ -78,6 +78,10 @@ class WordTranslation(Builtin): = ... """ + # Set checking that the number of arguments required is one. + eval_error = Builtin.generic_argument_error + expected_args = 2 + summary_text = "give word translations" def eval( From 4a57a05af4acac4d47f9edd9e9d65e548f67703e Mon Sep 17 00:00:00 2001 From: rocky Date: Sat, 24 Jan 2026 11:40:20 -0500 Subject: [PATCH 7/7] Use class variable requires --- pymathics/natlang/translation.py | 90 +++++++++++++++----------------- 1 file changed, 42 insertions(+), 48 deletions(-) diff --git a/pymathics/natlang/translation.py b/pymathics/natlang/translation.py index 6079559..168edc1 100644 --- a/pymathics/natlang/translation.py +++ b/pymathics/natlang/translation.py @@ -23,13 +23,6 @@ from nltk.corpus import wordnet as wn from nltk.corpus.reader.wordnet import WordNetError -try: - import nltk.langnames as lgn -except ImportError: - have_lng = False -else: - have_lng = True - sort_order = "Language Translation" @@ -61,56 +54,57 @@ def eval(self, text: String, evaluation: Evaluation) -> Union[Symbol, String]: return String(language.name) -if have_lng: +# FIXME generalize +class WordTranslation(Builtin): + """ + :WMA link: + https://reference.wolfram.com/language/ref/WordTranslation.html - # FIXME generalize - class 
WordTranslation(Builtin): - """ - :WMA link: - https://reference.wolfram.com/language/ref/WordTranslation.html +
+

 'WordTranslation'[$word$, $lang$]

 returns a list of translations of $word$ into $lang$.
-
-

 'WordTranslation'[$word$, $lang$]

 returns a list of translations of $word$ into $lang$.
+ >> WordTranslation["dog", "French"] + = ... + """ - >> WordTranslation["dog", "French"] - = ... - """ + requires = ("lgn",) - # Set checking that the number of arguments required is one. - eval_error = Builtin.generic_argument_error - expected_args = 2 + # Set checking that the number of arguments required to exactly two. + eval_error = Builtin.generic_argument_error + expected_args = 2 - summary_text = "give word translations" + summary_text = "give word translations" - def eval( - self, word: String, lang: String, evaluation: Evaluation - ) -> ListExpression: - "WordTranslation[word_String, lang_String]" - return eval_WordTranslation(word.value, lang.value) + def eval( + self, word: String, lang: String, evaluation: Evaluation + ) -> ListExpression: + "WordTranslation[word_String, lang_String]" + return eval_WordTranslation(word.value, lang.value) - def eval_WordTranslation(word: str, language_name: str): - """ - Return a list of translations of `word` in English to `language_name`. - """ - # Convert "language_name" using NLTK's langnames utility - # to its 3-letter ISO 639-3 code. - iso_code = lgn.langcode(language_name, typ=3) +def eval_WordTranslation(word: str, language_name: str): + """ + Return a list of translations of `word` in English to `language_name`. + """ - if iso_code is None: - return SymbolFailed + # Convert "language_name" using NLTK's langnames utility + # to its 3-letter ISO 639-3 code. 
+ iso_code = lgn.langcode(language_name, typ=3) - synsets = wn.synsets(word) - translations = set() + if iso_code is None: + return SymbolFailed - for ss in synsets: - # Pass the converted code to WordNet - try: - for lemma in ss.lemmas(lang=iso_code): - translations.add(lemma.name()) - except WordNetError: - return SymbolFailed + synsets = wn.synsets(word) + translations = set() + + for ss in synsets: + # Pass the converted code to WordNet + try: + for lemma in ss.lemmas(lang=iso_code): + translations.add(lemma.name()) + except WordNetError: + return SymbolFailed - return ListExpression(*[String(word) for word in translations]) + return ListExpression(*[String(word) for word in translations])