diff --git a/Jenkinsfile b/Jenkinsfile index 3d7a538ed..039504611 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -28,7 +28,7 @@ pipeline { MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0' - HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-17-26-0' + HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-25-26-0' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes_map.tsv b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes_map.tsv index 77139cff5..2abb5c492 100644 --- a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes_map.tsv +++ b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes_map.tsv @@ -1,2 +1 @@ -वे वें - +वे वें \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/roman/__init__.py b/nemo_text_processing/text_normalization/hi/data/roman/__init__.py new file mode 100644 index 000000000..4fc25d0d3 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/roman/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/hi/data/roman/roman_ordinal_exceptions.tsv b/nemo_text_processing/text_normalization/hi/data/roman/roman_ordinal_exceptions.tsv new file mode 100644 index 000000000..b298e13e6 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/roman/roman_ordinal_exceptions.tsv @@ -0,0 +1,10 @@ +Iला पहला +Iली पहली +IIरा दूसरा +IIरी दूसरी +IIIरा तीसरा +IIIरी तीसरी +IVथा चौथा +IVथी चौथी +VIठा छठा +VIठी छठी \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/roman/roman_to_spoken.tsv b/nemo_text_processing/text_normalization/hi/data/roman/roman_to_spoken.tsv new file mode 100644 index 000000000..69b760196 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/roman/roman_to_spoken.tsv @@ -0,0 +1,100 @@ +I एक +II दो +III तीन +IV चार +V पाँच +VI छह +VII सात +VIII आठ +IX नौ +X दस +XI ग्यारह +XII बारह +XIII तेरह +XIV चौदह +XV पंद्रह +XVI सोलह +XVII सत्रह +XVIII अठारह +XIX उन्नीस +XX बीस +XXI इक्कीस +XXII बाईस +XXIII तेईस +XXIV चौबीस +XXV पच्चीस +XXVI छब्बीस +XXVII सत्ताईस +XXVIII अट्ठाईस +XXIX उनतीस +XXX तीस +XXXI इकतीस +XXXII बत्तीस +XXXIII तैंतीस +XXXIV चौंतीस +XXXV पैंतीस +XXXVI छत्तीस +XXXVII सैंतीस +XXXVIII अड़तीस +XXXIX उनचालीस +XL चालीस +XLI इकतालीस +XLII बयालीस +XLIII तैंतालीस +XLIV चौंतालीस +XLV पैंतालीस +XLVI छियालीस +XLVII सैंतालीस +XLVIII अड़तालीस +XLIX उनचास +L पचास +LI इक्यावन +LII बावन +LIII तिरपन +LIV चौवन +LV पचपन +LVI छप्पन +LVII सत्तावन +LVIII अट्ठावन +LIX उनसठ +LX साठ +LXI इकसठ +LXII बासठ +LXIII तिरसठ +LXIV चौंसठ +LXV पैंसठ +LXVI छियासठ +LXVII सड़सठ +LXVIII अड़सठ +LXIX उनहत्तर +LXX सत्तर +LXXI इकहत्तर +LXXII बहत्तर +LXXIII तिहत्तर +LXXIV चौहत्तर +LXXV पचहत्तर +LXXVI छिहत्तर +LXXVII सतहत्तर +LXXVIII अठहत्तर +LXXIX उनासी +LXXX अस्सी +LXXXI इक्यासी +LXXXII बयासी +LXXXIII तिरासी +LXXXIV चौरासी +LXXXV पचासी +LXXXVI छियासी +LXXXVII सत्तासी +LXXXVIII अट्ठासी +LXXXIX नवासी +XC नब्बे +XCI इक्यानवे +XCII बानवे +XCIII तिरानवे +XCIV चौरानवे +XCV पचानवे +XCVI 
class RomanFst(GraphFst):
    """
    Finite state transducer for classifying Roman numerals in Hindi text.

    Three surface patterns are recognised (the numerals are those listed in
    data/roman/roman_to_spoken.tsv):

    * Devanagari key, then "-" or " ", then the numeral
        e.g. भास्कर-II -> tokens { roman { preserve_order: true key_cardinal: "भास्कर" integer: "II" } }
        e.g. कक्षा XII -> tokens { roman { preserve_order: true key_cardinal: "कक्षा" integer: "XII" } }
    * numeral, then "-" or " ", then a Devanagari key
        e.g. XC विद्यार्थी -> tokens { roman { preserve_order: true integer: "XC" key_cardinal: "विद्यार्थी" } }
    * numeral glued to an ordinal suffix, optionally followed by " " and a key
        e.g. XIIवीं कक्षा -> tokens { roman { preserve_order: true integer: "XII" default_ordinal: "बारहवीं" key_cardinal: "कक्षा" } }
        e.g. IVथी कक्षा -> tokens { roman { preserve_order: true integer: "IV" default_ordinal: "चौथी" key_cardinal: "कक्षा" } }

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)

    Raises:
        ValueError: if a row of data/roman/roman_ordinal_exceptions.tsv does not start
            with any numeral from data/roman/roman_to_spoken.tsv.
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="roman", kind="classify", deterministic=deterministic)

        roman_graph = pynini.string_file(get_abs_path("data/roman/roman_to_spoken.tsv")).optimize()
        # Acceptor over the written numerals only; the spoken form is produced by the verbalizer.
        roman_numeral_only = pynini.project(roman_graph, "input").optimize()

        devanagari_chars = pynini.project(
            pynini.string_file(get_abs_path("data/serial/chars.tsv")), "input"
        ).optimize()

        devanagari_word = pynini.closure(devanagari_chars, 1).optimize()

        # One or more Devanagari words joined by a single space or hyphen.
        devanagari_phrase = (
            devanagari_word + pynini.closure((pynini.accep(" ") | pynini.accep("-")) + devanagari_word)
        ).optimize()

        separator = (pynini.accep("-") | pynini.accep(" ")).optimize()

        key_before_numeral = (
            pynutil.insert("preserve_order: true ")
            + pynutil.insert('key_cardinal: "')
            + convert_space(devanagari_phrase)
            + pynutil.insert('"')
            + pynutil.delete(separator)
            + insert_space
            + pynutil.insert('integer: "')
            + roman_numeral_only
            + pynutil.insert('"')
        ).optimize()

        numeral_before_key = (
            pynutil.insert("preserve_order: true ")
            + pynutil.insert('integer: "')
            + roman_numeral_only
            + pynutil.insert('"')
            + pynutil.delete(separator)
            + insert_space
            + pynutil.insert('key_cardinal: "')
            + convert_space(devanagari_phrase)
            + pynutil.insert('"')
        ).optimize()

        # Blank rows (e.g. a trailing empty line in a TSV) are dropped up front so that the
        # indexing / unpacking below cannot fail on them.
        roman_rows = self._non_empty_rows(load_labels(get_abs_path("data/roman/roman_to_spoken.tsv")))
        # Longest numerals first, so that e.g. "IIIरा" is attributed to "III" rather than "I".
        numerals_by_len_desc = sorted((n for n, _ in roman_rows), key=len, reverse=True)

        exception_rows = self._non_empty_rows(
            load_labels(get_abs_path("data/roman/roman_ordinal_exceptions.tsv"))
        )
        exception_fused_set = {fused for fused, _ in exception_rows}

        suffix_rows_raw = self._non_empty_rows(
            load_labels(get_abs_path("data/ordinal/suffixes.tsv"))
            + load_labels(get_abs_path("data/ordinal/suffixes_map.tsv"))
        )

        # Irregular ordinals: the whole fused form maps to a fixed spoken word.
        exception_graphs = []
        for fused, spoken_word in exception_rows:
            matched_numeral = next((c for c in numerals_by_len_desc if fused.startswith(c)), None)
            if matched_numeral is None:
                # Fail with a message that names the offending row instead of a bare StopIteration.
                raise ValueError(
                    f"roman_ordinal_exceptions.tsv entry '{fused}' does not start with any numeral "
                    "listed in roman_to_spoken.tsv"
                )
            exception_graphs.append(self._glued_ordinal_entry(matched_numeral, spoken_word, fused))
        glued_ordinal_exceptions_graph = pynini.union(*exception_graphs).optimize()

        # Regular ordinals: spoken cardinal + (possibly remapped) suffix, for every
        # numeral/suffix pair that is not already covered by an exception.
        regular_row_graphs = []
        for numeral, spoken in roman_rows:
            for row in suffix_rows_raw:
                suffix_input = row[0]
                # Single-column rows keep the suffix as written; two-column rows remap it.
                suffix_output = row[1] if len(row) > 1 else row[0]

                fused = numeral + suffix_input
                if fused in exception_fused_set:
                    continue
                regular_row_graphs.append(self._glued_ordinal_entry(numeral, spoken + suffix_output, fused))
        glued_ordinal_regular_graph = pynini.union(*regular_row_graphs).optimize()

        roman_glued_ordinal_fields = pynini.union(
            pynutil.add_weight(glued_ordinal_exceptions_graph, -0.1),
            glued_ordinal_regular_graph,
        ).optimize()

        roman_glued_ordinal = (
            pynutil.insert("preserve_order: true ")
            + roman_glued_ordinal_fields
            + pynini.closure(
                pynutil.delete(" ")
                + insert_space
                + pynutil.insert('key_cardinal: "')
                + convert_space(devanagari_phrase)
                + pynutil.insert('"'),
                0,
                1,
            )
        ).optimize()

        graph = pynini.union(key_before_numeral, numeral_before_key, roman_glued_ordinal).optimize()

        self.fst = self.add_tokens(graph).optimize()

    @staticmethod
    def _non_empty_rows(rows):
        """Return the label rows that have a non-empty first column, in their original order."""
        return [row for row in rows if row and row[0]]

    @staticmethod
    def _glued_ordinal_entry(numeral, spoken_ordinal, fused):
        """Build the FST that consumes `fused` and emits the integer and default_ordinal fields."""
        return (
            pynutil.insert('integer: "' + numeral + '"')
            + insert_space
            + pynutil.insert('default_ordinal: "' + spoken_ordinal + '"')
            + pynutil.delete(fused)
        )
class RomanFst(GraphFst):
    """
    Finite state transducer for verbalizing Roman numerals in Hindi.

    The key and the numeral may arrive in either order; when a ready-made ordinal
    is present it is emitted and the raw numeral is discarded.
        roman { preserve_order: true key_cardinal: "भास्कर" integer: "II" } -> भास्कर दो
        roman { preserve_order: true key_cardinal: "कक्षा" integer: "XII" } -> कक्षा बारह
        roman { preserve_order: true integer: "XII" default_ordinal: "बारहवीं" key_cardinal: "कक्षा" } -> बारहवीं कक्षा
        roman { preserve_order: true integer: "IV" default_ordinal: "चौथी" key_cardinal: "कक्षा" } -> चौथी कक्षा

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="roman", kind="verbalize", deterministic=deterministic)

        def quoted_field(field_name, value):
            # Strip `<field_name>: "` and the closing quote, keeping whatever `value` outputs.
            return (pynutil.delete(field_name + ': "') + value + pynutil.delete('"')).optimize()

        numeral_to_words = pynini.string_file(get_abs_path("data/roman/roman_to_spoken.tsv")).optimize()

        # Fields whose quoted content is passed through unchanged.
        key_cardinal = quoted_field("key_cardinal", pynini.closure(NEMO_NOT_QUOTE, 1))
        default_ordinal = quoted_field("default_ordinal", pynini.closure(NEMO_NOT_QUOTE, 1))

        # The numeral is either spoken out via the lookup table or swallowed entirely.
        integer = quoted_field("integer", numeral_to_words)
        ignore_integer = quoted_field("integer", pynutil.delete(pynini.closure(NEMO_NOT_QUOTE, 1)))

        # Optional `preserve_order: true` marker with optional spaces around and inside it.
        preserve_order_marker = (
            delete_zero_or_one_space
            + pynutil.delete("preserve_order:")
            + delete_zero_or_one_space
            + pynutil.delete("true")
            + delete_zero_or_one_space
        )
        drop_preserve_order = pynini.closure(preserve_order_marker, 0, 1).optimize()

        # Collapse the optional space between two fields into exactly one output space.
        single_space = delete_zero_or_one_space + insert_space

        key_first = (drop_preserve_order + key_cardinal + single_space + integer + drop_preserve_order).optimize()

        numeral_first = (
            drop_preserve_order + integer + single_space + key_cardinal + drop_preserve_order
        ).optimize()

        optional_key = pynini.closure(single_space + key_cardinal, 0, 1)
        glued_ordinal = (
            drop_preserve_order
            + ignore_integer
            + delete_zero_or_one_space
            + default_ordinal
            + optional_key
            + drop_preserve_order
        ).optimize()

        graph = (key_first | numeral_first | glued_ordinal).optimize()

        self.fst = self.delete_tokens(graph).optimize()
XII~कक्षा बारह +अध्याय IV~अध्याय चार +भाग III~भाग तीन +खंड V~खंड पाँच +विश्व युद्ध II~विश्व युद्ध दो +विश्व युद्ध-II~विश्व युद्ध दो +प्रथम पंचवर्षीय योजना-I~प्रथम पंचवर्षीय योजना एक +राष्ट्रीय राजमार्ग-IV~राष्ट्रीय राजमार्ग चार +रोहिणी आर एस-I~रोहिणी आर एस एक +पीएसएलवी सी-IV~पीएसएलवी सी चार +ISRO मिशन-III~आई एस आर ओ मिशन तीन +कक्षा XII की परीक्षा~कक्षा बारह की परीक्षा +XIIवीं कक्षा की परीक्षा~बारहवीं कक्षा की परीक्षा +भाग II का सारांश~भाग दो का सारांश +अध्याय IV के प्रश्न~अध्याय चार के प्रश्न +IVथी कक्षा के विद्यार्थी~चौथी कक्षा के विद्यार्थी +XC विद्यार्थी~नब्बे विद्यार्थी +LIII वा गणतंत्र दिन~तिरपन वा गणतंत्र दिन +भाग-XCIX~भाग निन्यानवे \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/test_roman.py b/tests/nemo_text_processing/hi/test_roman.py new file mode 100644 index 000000000..041b88fd1 --- /dev/null +++ b/tests/nemo_text_processing/hi/test_roman.py @@ -0,0 +1,36 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class TestRoman:
    """Text-normalization tests for Roman numerals in Hindi.

    Cases are read from hi/data_text_normalization/test_cases_roman.txt, one
    `input~expected` pair per line.
    """

    # Built once when the class is defined and shared by every parameterized case.
    # Only the normalizer is constructed: this file has no inverse-normalization
    # cases, so building an InverseNormalizer here would be setup cost with no use.
    normalizer = Normalizer(
        input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=False
    )

    @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_roman.txt'))
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Normalize `test_input` and compare with `expected`, ignoring surrounding whitespace."""
        pred = self.normalizer.normalize(test_input, verbose=False)
        assert pred.strip() == expected.strip(), f"input: {test_input}"