From 88940da3193dd16e7e7e417b07c8308629e59f66 Mon Sep 17 00:00:00 2001 From: Shreyas Pawar Date: Tue, 21 Apr 2026 04:24:48 +0000 Subject: [PATCH 1/9] date class accuracy improvement Signed-off-by: Shreyas Pawar --- .../text_normalization/hi/data/date/days.tsv | 24 +- .../hi/data/date/months.tsv | 35 ++- .../hi/data/date/prefixes.tsv | 7 +- .../hi/data/date/unambiguous_days.tsv | 38 +++ .../text_normalization/hi/taggers/date.py | 284 +++++++++++++++--- .../hi/taggers/tokenize_and_classify.py | 4 +- .../test_cases_date.txt | 2 +- 7 files changed, 345 insertions(+), 49 deletions(-) create mode 100644 nemo_text_processing/text_normalization/hi/data/date/unambiguous_days.tsv diff --git a/nemo_text_processing/text_normalization/hi/data/date/days.tsv b/nemo_text_processing/text_normalization/hi/data/date/days.tsv index 633e2aec0..7d2dc7fbb 100644 --- a/nemo_text_processing/text_normalization/hi/data/date/days.tsv +++ b/nemo_text_processing/text_normalization/hi/data/date/days.tsv @@ -3,7 +3,7 @@ ०३ तीन ०४ चार ०५ पाँच -०६ छः +०६ छह ०७ सात ०८ आठ ०९ नौ @@ -34,7 +34,7 @@ 03 तीन 04 चार 05 पाँच -06 छः +06 छह 07 सात 08 आठ 09 नौ @@ -59,4 +59,22 @@ 28 अट्ठाईस 29 उनतीस 30 तीस -31 इकतीस \ No newline at end of file +31 इकतीस +१ एक +२ दो +३ तीन +४ चार +५ पाँच +६ छह +७ सात +८ आठ +९ नौ +1 एक +2 दो +3 तीन +4 चार +5 पाँच +6 छह +7 सात +8 आठ +9 नौ \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/date/months.tsv b/nemo_text_processing/text_normalization/hi/data/date/months.tsv index af770dafc..5eaafb648 100644 --- a/nemo_text_processing/text_normalization/hi/data/date/months.tsv +++ b/nemo_text_processing/text_normalization/hi/data/date/months.tsv @@ -21,4 +21,37 @@ 09 सितंबर 10 अक्टूबर 11 नवंबर -12 दिसंबर \ No newline at end of file +12 दिसंबर +जनवरी जनवरी +फ़रवरी फ़रवरी +फरवरी फरवरी +मार्च मार्च +अप्रैल अप्रैल +अप्रील अप्रील +मई मई +जून जून +जुलाई जुलाई +अगस्त अगस्त +सितंबर सितंबर +अक्टूबर अक्टूबर +अक्तूबर अक्तूबर +नवंबर नवंबर +दिसंबर दिसंबर +१ जनवरी +२ फ़रवरी +३ मार्च +४ अप्रैल +५ मई +६ जून +७ जुलाई +८ अगस्त +९ सितंबर +1 जनवरी +2 फ़रवरी +3 मार्च +4 अप्रैल +5 मई +6 जून +7 जुलाई +8 अगस्त +9 सितंबर \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv b/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv index d4c1ca0b1..6166ec327 100644 --- a/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv +++ b/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv @@ -1,3 +1,4 @@ -सन् -सन -साल \ No newline at end of file +सन् +सन +साल +दशक \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/date/unambiguous_days.tsv b/nemo_text_processing/text_normalization/hi/data/date/unambiguous_days.tsv new file mode 100644 index 000000000..7fb5f5380 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/date/unambiguous_days.tsv @@ -0,0 +1,38 @@ +१३ तेरह +१४ चौदह +१५ पंद्रह +१६ सोलह +१७ सत्रह +१८ अठारह +१९ उन्नीस +२० बीस +२१ इक्कीस +२२ बाईस +२३ तेईस +२४ चौबीस +२५ पच्चीस +२६ छब्बीस +२७ सत्ताईस +२८ अट्ठाईस +२९ उनतीस +३० तीस +३१ इकतीस +13 तेरह +14 चौदह +15 पंद्रह +16 सोलह +17 सत्रह +18 अठारह +19 उन्नीस +20 बीस +21 इक्कीस +22 बाईस +23 तेईस +24 चौबीस +25 पच्चीस +26 छब्बीस +27 सत्ताईस +28 अट्ठाईस +29 उनतीस +30 तीस +31 इकतीस \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index da917f3de..1dc1c86ba 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -25,6 +25,7 @@ from nemo_text_processing.text_normalization.hi.utils import get_abs_path days = pynini.string_file(get_abs_path("data/date/days.tsv")) +unambiguous_days = pynini.string_file(get_abs_path("data/date/unambiguous_days.tsv")) months = pynini.string_file(get_abs_path("data/date/months.tsv")) year_suffix = pynini.string_file(get_abs_path("data/date/year_suffix.tsv")) digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) @@ -33,23 +34,38 @@ teens_ties = pynini.union(teens_ties_hi, teens_ties_en) teens_and_ties = pynutil.add_weight(teens_ties, -0.1) -# Read suffixes from file into a list +digit_as_day = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + with open(get_abs_path("data/date/suffixes.tsv"), "r", encoding="utf-8") as f: - suffixes_list = f.read().splitlines() + suffixes_list = [line.rstrip("\n") for line in f if line.strip()] with open(get_abs_path("data/date/prefixes.tsv"), "r", encoding="utf-8") as f: - prefixes_list = f.read().splitlines() + prefixes_list = [line.rstrip("\n") for line in f if line.strip()] -# Create union of suffixes and prefixes suffix_union = pynini.union(*suffixes_list) prefix_union = pynini.union(*prefixes_list) +verbalized_hundreds = teens_ties_hi.project("output") +verbalized_unit = pynini.union( + teens_ties_hi.project("output"), + digit.project("output") +) + +verbalized_year_sou = ( + verbalized_hundreds + + pynini.accep(" सौ") + + pynini.closure(pynini.accep(" ") + verbalized_unit, 0, 1) +) + class DateFst(GraphFst): """ Finite state transducer for classifying date, e.g. - "०१-०४-२०२४" -> date { day: "एक" month: "अप्रैल" year: "दो हज़ार चौबीस" } - "०४-०१-२०२४" -> date { month: "अप्रैल" day: "एक" year: "दो हज़ार चौबीस" } - + "०१-०४-२०२४" -> date { day: "एक" month: "अप्रैल" year: "दो हज़ार चौबीस" } + "६ मार्च, २०१०" -> date { day: "छह" month: "मार्च" year: "दो हज़ार दस" } + "३१ मई, १९९० ई." -> date { day: "इकतीस" month: "मई" year: "उन्नीस सौ नब्बे" era: "ईसवी" } + "उन्नीस सौ बीस में" -> date { era: "उन्नीस सौ बीस में" } + "०३-२०१०" -> date { month: "मार्च" year: "दो हज़ार दस" } + "11-2024" -> date { month: "नवंबर" year: "दो हज़ार चौबीस" } Args: cardinal: cardinal GraphFst @@ -60,60 +76,230 @@ class DateFst(GraphFst): def __init__(self, cardinal: GraphFst): super().__init__(name="date", kind="classify") + # ── Year number graphs ──────────────────────────────────────────────── graph_year_thousands = pynini.compose( - (NEMO_ALL_DIGIT + NEMO_ALL_ZERO + NEMO_ALL_DIGIT + NEMO_ALL_DIGIT), cardinal.graph_thousands + (NEMO_ALL_DIGIT + NEMO_ALL_ZERO + NEMO_ALL_DIGIT + NEMO_ALL_DIGIT), + cardinal.graph_thousands ) graph_year_hundreds_as_thousands = pynini.compose( - (NEMO_ALL_DIGIT + NEMO_ALL_NON_ZERO + NEMO_ALL_DIGIT + NEMO_ALL_DIGIT), cardinal.graph_hundreds_as_thousand + (NEMO_ALL_DIGIT + NEMO_ALL_NON_ZERO + NEMO_ALL_DIGIT + NEMO_ALL_DIGIT), + cardinal.graph_hundreds_as_thousand ) cardinal_graph = pynini.union( - digit, teens_and_ties, cardinal.graph_hundreds, graph_year_thousands, graph_year_hundreds_as_thousands + digit, + teens_and_ties, + cardinal.graph_hundreds, + graph_year_thousands, + graph_year_hundreds_as_thousands, ) graph_year = pynini.union(graph_year_thousands, graph_year_hundreds_as_thousands) - delete_dash = pynutil.delete("-") - delete_slash = pynutil.delete("/") + graph_year_era = pynini.union( + graph_year_thousands, + graph_year_hundreds_as_thousands, + cardinal.graph_hundreds, + ) - days_graph = pynutil.insert("day: \"") + days + pynutil.insert("\"") + insert_space + # ── Separators ─────────────────────────────────────────────────────── + delete_dash = pynutil.delete("-") + delete_slash = pynutil.delete("/") + delete_comma = pynutil.delete(",") + delete_space = pynutil.delete(" ") + delete_optional_space = pynini.closure(pynutil.delete(" "), 0, 1) + delete_comma_sep = delete_comma + delete_optional_space + delete_numeric_sep = pynini.union(delete_dash, delete_slash) + + # ── Day graphs ─────────────────────────────────────────────────────── + # Full day graph — all days 1-31 (used in DD-MM graphs) + day_num = pynini.union( + days, + digit_as_day, + teens_and_ties, + ) - months_graph = pynutil.insert("month: \"") + months + pynutil.insert("\"") + insert_space + days_graph = ( + pynutil.insert("day: \"") + day_num + pynutil.insert("\"") + insert_space + ) - years_graph = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space + # Unambiguous day graph — only days 13-31 + # Used in MM-DD graphs so they only fire when day cannot be a month number + unambiguous_day_num = pynini.union( + unambiguous_days, + ) - graph_dd_mm = days_graph + delete_dash + months_graph + unambiguous_days_graph = ( + pynutil.insert("day: \"") + unambiguous_day_num + pynutil.insert("\"") + insert_space + ) - graph_mm_dd = months_graph + delete_dash + days_graph + # ── Month graph ────────────────────────────────────────────────────── + months_graph = ( + pynutil.insert("month: \"") + months + pynutil.insert("\"") + insert_space + ) - graph_mm_dd += pynutil.insert(" preserve_order: true ") + # ── Year graph ─────────────────────────────────────────────────────── + years_graph = ( + pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space + ) - # Graph for era - era_graph = pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space + # ── Era graph ──────────────────────────────────────────────────────── + era_graph = ( + pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space + ) + # ── Range graph (e.g. २९७-२७२ ई. पू.) ────────────────────────────── range_graph = pynini.cross("-", "से") - # Graph for year - century_number = pynini.compose(pynini.closure(NEMO_ALL_DIGIT, 1), cardinal_graph) + pynini.accep("वीं") - century_text = pynutil.insert("era: \"") + century_number + pynutil.insert("\"") + insert_space + # ── Century ordinal (e.g. २०वीं, १८वीं) ──────────────────────────── + century_number = ( + pynini.compose(pynini.closure(NEMO_ALL_DIGIT, 1), cardinal_graph) + + pynini.accep("वीं") + ) + century_text = ( + pynutil.insert("era: \"") + century_number + pynutil.insert("\"") + insert_space + ) - # Updated logic to use suffix_union + # ── Year + suffix (e.g. २०२० में, १९९० का) ────────────────────────── year_number = graph_year + suffix_union - year_text = pynutil.insert("era: \"") + year_number + pynutil.insert("\"") + insert_space + year_text = ( + pynutil.insert("era: \"") + year_number + pynutil.insert("\"") + insert_space + ) + + # ── Year + prefix (e.g. सन् २०२४, साल २०२०) ──────────────────────── + year_prefix = ( + pynutil.insert("era: \"") + + prefix_union + + pynini.accep(" ") + + graph_year + + pynutil.insert("\"") + ) + + # ── Year + prefix + suffix (e.g. सन २००८ में) ─────────────────────── + year_prefix_suffix = ( + pynutil.insert("era: \"") + + prefix_union + + pynini.accep(" ") + + graph_year + + suffix_union + + pynutil.insert("\"") + ) + + # ── Verbalized year passthrough graphs ─────────────────────────────── + graph_verbalized_year_suffix = ( + pynutil.insert("era: \"") + + verbalized_year_sou + + suffix_union + + pynutil.insert("\"") + + insert_space + ) + + graph_verbalized_year_bare = ( + pynutil.insert("era: \"") + + verbalized_year_sou + + pynutil.insert("\"") + + insert_space + ) - # Updated logic to use prefix_union - year_prefix = pynutil.insert("era: \"") + prefix_union + insert_space + graph_year + pynutil.insert("\"") + graph_verbalized_year_prefix = ( + pynutil.insert("era: \"") + + prefix_union + + pynini.accep(" ") + + verbalized_year_sou + + pynutil.insert("\"") + ) - delete_separator = pynini.union(delete_dash, delete_slash) - graph_dd_mm_yyyy = days_graph + delete_separator + months_graph + delete_separator + years_graph + graph_verbalized_year_prefix_suffix = ( + pynutil.insert("era: \"") + + prefix_union + + pynini.accep(" ") + + verbalized_year_sou + + suffix_union + + pynutil.insert("\"") + ) + + # ── Numeric separator date graphs ──────────────────────────────────── + # DD-MM: uses full day range (all 1-31) + graph_dd_mm = days_graph + delete_numeric_sep + months_graph + + # MM-DD: only fires when day is unambiguously > 12 + # This prevents 01-10 being read as MM-DD (January 10) + graph_mm_dd = months_graph + delete_numeric_sep + unambiguous_days_graph + graph_mm_dd += pynutil.insert(" preserve_order: true ") - graph_mm_dd_yyyy = months_graph + delete_separator + days_graph + delete_separator + years_graph + # DD-MM-YYYY: uses full day range + graph_dd_mm_yyyy = ( + days_graph + + delete_numeric_sep + + months_graph + + delete_numeric_sep + + years_graph + ) + # MM-DD-YYYY: only fires when day is unambiguously > 12 + graph_mm_dd_yyyy = ( + months_graph + + delete_numeric_sep + + unambiguous_days_graph + + delete_numeric_sep + + years_graph + ) graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ") - graph_mm_yyyy = months_graph + delete_dash + insert_space + years_graph + # ── Space-separated date graphs ────────────────────────────────────── + graph_dd_month = ( + days_graph + + delete_space + + months_graph + ) - graph_year_suffix = era_graph + graph_dd_month_comma_yyyy = ( + days_graph + + delete_space + + months_graph + + delete_comma_sep + + years_graph + ) + + graph_dd_month_comma_yyyy_era = ( + days_graph + + delete_space + + months_graph + + delete_comma_sep + + years_graph + + era_graph + ) + + graph_month_comma_yyyy = ( + months_graph + + delete_comma_sep + + years_graph + ) + + graph_month_comma_yyyy_era = ( + months_graph + + delete_comma_sep + + years_graph + + era_graph + ) + + # MM-YYYY: supports both space and dash separator + # e.g. "मार्च २००३", "०३-२०१०", "11-2024" + graph_mm_yyyy = ( + months_graph + + pynini.union(delete_space, delete_dash) + + years_graph + ) + + # ── Era-only graphs ────────────────────────────────────────────────── + graph_year_era_only = ( + pynutil.insert("era: \"") + + graph_year_era + + insert_space + + year_suffix + + pynutil.insert("\"") + + insert_space + ) graph_range = ( pynutil.insert("era: \"") @@ -126,21 +312,41 @@ def __init__(self, cardinal: GraphFst): + pynutil.insert(" preserve_order: true ") ) - # default assume dd_mm_yyyy + graph_year_suffix = era_graph + # ── Final graph ─────────────────────────────────────────────────────── final_graph = ( - pynutil.add_weight(graph_dd_mm, -0.001) - | graph_mm_dd + # Full date with era — most specific first + pynutil.add_weight(graph_dd_month_comma_yyyy_era, -0.003) + | pynutil.add_weight(graph_month_comma_yyyy_era, -0.003) + # Full numeric dates | pynutil.add_weight(graph_dd_mm_yyyy, -0.001) | graph_mm_dd_yyyy + # Full space/comma dates + | pynutil.add_weight(graph_dd_month_comma_yyyy, -0.001) + # Day + month only + | pynutil.add_weight(graph_dd_mm, -0.001) + | pynutil.add_weight(graph_dd_month, -0.001) + | graph_mm_dd + # Month + year — space or dash | pynutil.add_weight(graph_mm_yyyy, -0.2) - | pynutil.add_weight(graph_year_suffix, -0.001) + | pynutil.add_weight(graph_month_comma_yyyy, -0.2) + # Era graphs + | pynutil.add_weight(graph_year_era_only, -0.005) | pynutil.add_weight(graph_range, -0.005) + | pynutil.add_weight(graph_year_suffix, -0.001) + # Century ordinal | pynutil.add_weight(century_text, -0.001) - | pynutil.add_weight(year_text, -0.001) + # Verbalized year passthrough — more specific first + | pynutil.add_weight(graph_verbalized_year_prefix_suffix, -0.012) + | pynutil.add_weight(graph_verbalized_year_prefix, -0.011) + | pynutil.add_weight(graph_verbalized_year_suffix, -0.010) + | pynutil.add_weight(graph_verbalized_year_bare, -0.009) + # Numeric year with suffix/prefix + | pynutil.add_weight(year_prefix_suffix, -0.010) | pynutil.add_weight(year_prefix, -0.009) + | pynutil.add_weight(year_text, -0.001) ) self.final_graph = final_graph.optimize() - - self.fst = self.add_tokens(self.final_graph) + self.fst = self.add_tokens(self.final_graph) \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index 3e1ded4b1..45304738f 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -121,7 +121,7 @@ def __init__( pynutil.add_weight(whitelist_graph, 1.01) | pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(decimal_graph, 1.1) - | pynutil.add_weight(fraction_graph, 1.1) + | pynutil.add_weight(fraction_graph, 1.05) | pynutil.add_weight(date_graph, 1.1) | pynutil.add_weight(time_graph, 1.1) | pynutil.add_weight(measure_graph, 1.1) @@ -165,4 +165,4 @@ def __init__( if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) - logging.info(f"ClassifyFst grammars are saved to {far_file}.") + logging.info(f"ClassifyFst grammars are saved to {far_file}.") \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt index 86f1f6678..2df448456 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt @@ -1,4 +1,4 @@ -06-05~छः मई +06-05~छह मई ३१-०६~इकतीस जून 02-01~दो जनवरी ०४-०१~चार जनवरी From 017a615dd63cbe42d814576f5124b68e21a261be Mon Sep 17 00:00:00 2001 From: Shreyas Pawar Date: Fri, 24 Apr 2026 08:04:59 +0000 Subject: [PATCH 2/9] Jenkins file date update for Hi TN Signed-off-by: Shreyas Pawar --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 565f5df27..60aa14af9 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0' + HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-24-26-0' KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-22-26-0' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } From 64fb638f1ddd66e5bf6ba433d4f91c427348c7da Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Apr 2026 08:06:24 +0000 Subject: [PATCH 3/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/hi/taggers/date.py | 141 ++++-------------- .../hi/taggers/tokenize_and_classify.py | 2 +- 2 files changed, 32 insertions(+), 111 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 1dc1c86ba..42d266547 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -45,15 +45,10 @@ prefix_union = pynini.union(*prefixes_list) verbalized_hundreds = teens_ties_hi.project("output") -verbalized_unit = pynini.union( - teens_ties_hi.project("output"), - digit.project("output") -) +verbalized_unit = pynini.union(teens_ties_hi.project("output"), digit.project("output")) verbalized_year_sou = ( - verbalized_hundreds - + pynini.accep(" सौ") - + pynini.closure(pynini.accep(" ") + verbalized_unit, 0, 1) + verbalized_hundreds + pynini.accep(" सौ") + pynini.closure(pynini.accep(" ") + verbalized_unit, 0, 1) ) @@ -78,12 +73,10 @@ def __init__(self, cardinal: GraphFst): # ── Year number graphs ──────────────────────────────────────────────── graph_year_thousands = pynini.compose( - (NEMO_ALL_DIGIT + NEMO_ALL_ZERO + NEMO_ALL_DIGIT + NEMO_ALL_DIGIT), - cardinal.graph_thousands + (NEMO_ALL_DIGIT + NEMO_ALL_ZERO + NEMO_ALL_DIGIT + NEMO_ALL_DIGIT), cardinal.graph_thousands ) graph_year_hundreds_as_thousands = pynini.compose( - (NEMO_ALL_DIGIT + NEMO_ALL_NON_ZERO + NEMO_ALL_DIGIT + NEMO_ALL_DIGIT), - cardinal.graph_hundreds_as_thousand + (NEMO_ALL_DIGIT + NEMO_ALL_NON_ZERO + NEMO_ALL_DIGIT + NEMO_ALL_DIGIT), cardinal.graph_hundreds_as_thousand ) cardinal_graph = pynini.union( @@ -103,13 +96,13 @@ def __init__(self, cardinal: GraphFst): ) # ── Separators ─────────────────────────────────────────────────────── - delete_dash = pynutil.delete("-") - delete_slash = pynutil.delete("/") - delete_comma = pynutil.delete(",") - delete_space = pynutil.delete(" ") + delete_dash = pynutil.delete("-") + delete_slash = pynutil.delete("/") + delete_comma = pynutil.delete(",") + delete_space = pynutil.delete(" ") delete_optional_space = pynini.closure(pynutil.delete(" "), 0, 1) - delete_comma_sep = delete_comma + delete_optional_space - delete_numeric_sep = pynini.union(delete_dash, delete_slash) + delete_comma_sep = delete_comma + delete_optional_space + delete_numeric_sep = pynini.union(delete_dash, delete_slash) # ── Day graphs ─────────────────────────────────────────────────────── # Full day graph — all days 1-31 (used in DD-MM graphs) @@ -119,9 +112,7 @@ def __init__(self, cardinal: GraphFst): teens_and_ties, ) - days_graph = ( - pynutil.insert("day: \"") + day_num + pynutil.insert("\"") + insert_space - ) + days_graph = pynutil.insert("day: \"") + day_num + pynutil.insert("\"") + insert_space # Unambiguous day graph — only days 13-31 # Used in MM-DD graphs so they only fire when day cannot be a month number @@ -129,51 +120,30 @@ def __init__(self, cardinal: GraphFst): unambiguous_days, ) - unambiguous_days_graph = ( - pynutil.insert("day: \"") + unambiguous_day_num + pynutil.insert("\"") + insert_space - ) + unambiguous_days_graph = pynutil.insert("day: \"") + unambiguous_day_num + pynutil.insert("\"") + insert_space # ── Month graph ────────────────────────────────────────────────────── - months_graph = ( - pynutil.insert("month: \"") + months + pynutil.insert("\"") + insert_space - ) + months_graph = pynutil.insert("month: \"") + months + pynutil.insert("\"") + insert_space # ── Year graph ─────────────────────────────────────────────────────── - years_graph = ( - pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space - ) + years_graph = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space # ── Era graph ──────────────────────────────────────────────────────── - era_graph = ( - pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space - ) + era_graph = pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space # ── Range graph (e.g. २९७-२७२ ई. पू.) ────────────────────────────── range_graph = pynini.cross("-", "से") # ── Century ordinal (e.g. २०वीं, १८वीं) ──────────────────────────── - century_number = ( - pynini.compose(pynini.closure(NEMO_ALL_DIGIT, 1), cardinal_graph) - + pynini.accep("वीं") - ) - century_text = ( - pynutil.insert("era: \"") + century_number + pynutil.insert("\"") + insert_space - ) + century_number = pynini.compose(pynini.closure(NEMO_ALL_DIGIT, 1), cardinal_graph) + pynini.accep("वीं") + century_text = pynutil.insert("era: \"") + century_number + pynutil.insert("\"") + insert_space # ── Year + suffix (e.g. २०२० में, १९९० का) ────────────────────────── year_number = graph_year + suffix_union - year_text = ( - pynutil.insert("era: \"") + year_number + pynutil.insert("\"") + insert_space - ) + year_text = pynutil.insert("era: \"") + year_number + pynutil.insert("\"") + insert_space # ── Year + prefix (e.g. सन् २०२४, साल २०२०) ──────────────────────── - year_prefix = ( - pynutil.insert("era: \"") - + prefix_union - + pynini.accep(" ") - + graph_year - + pynutil.insert("\"") - ) + year_prefix = pynutil.insert("era: \"") + prefix_union + pynini.accep(" ") + graph_year + pynutil.insert("\"") # ── Year + prefix + suffix (e.g. सन २००८ में) ─────────────────────── year_prefix_suffix = ( @@ -187,26 +157,15 @@ def __init__(self, cardinal: GraphFst): # ── Verbalized year passthrough graphs ─────────────────────────────── graph_verbalized_year_suffix = ( - pynutil.insert("era: \"") - + verbalized_year_sou - + suffix_union - + pynutil.insert("\"") - + insert_space + pynutil.insert("era: \"") + verbalized_year_sou + suffix_union + pynutil.insert("\"") + insert_space ) graph_verbalized_year_bare = ( - pynutil.insert("era: \"") - + verbalized_year_sou - + pynutil.insert("\"") - + insert_space + pynutil.insert("era: \"") + verbalized_year_sou + pynutil.insert("\"") + insert_space ) graph_verbalized_year_prefix = ( - pynutil.insert("era: \"") - + prefix_union - + pynini.accep(" ") - + verbalized_year_sou - + pynutil.insert("\"") + pynutil.insert("era: \"") + prefix_union + pynini.accep(" ") + verbalized_year_sou + pynutil.insert("\"") ) graph_verbalized_year_prefix_suffix = ( @@ -228,68 +187,30 @@ def __init__(self, cardinal: GraphFst): graph_mm_dd += pynutil.insert(" preserve_order: true ") # DD-MM-YYYY: uses full day range - graph_dd_mm_yyyy = ( - days_graph - + delete_numeric_sep - + months_graph - + delete_numeric_sep - + years_graph - ) + graph_dd_mm_yyyy = days_graph + delete_numeric_sep + months_graph + delete_numeric_sep + years_graph # MM-DD-YYYY: only fires when day is unambiguously > 12 graph_mm_dd_yyyy = ( - months_graph - + delete_numeric_sep - + unambiguous_days_graph - + delete_numeric_sep - + years_graph + months_graph + delete_numeric_sep + unambiguous_days_graph + delete_numeric_sep + years_graph ) graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ") # ── Space-separated date graphs ────────────────────────────────────── - graph_dd_month = ( - days_graph - + delete_space - + months_graph - ) + graph_dd_month = days_graph + delete_space + months_graph - graph_dd_month_comma_yyyy = ( - days_graph - + delete_space - + months_graph - + delete_comma_sep - + years_graph - ) + graph_dd_month_comma_yyyy = days_graph + delete_space + months_graph + delete_comma_sep + years_graph graph_dd_month_comma_yyyy_era = ( - days_graph - + delete_space - + months_graph - + delete_comma_sep - + years_graph - + era_graph + days_graph + delete_space + months_graph + delete_comma_sep + years_graph + era_graph ) - graph_month_comma_yyyy = ( - months_graph - + delete_comma_sep - + years_graph - ) + graph_month_comma_yyyy = months_graph + delete_comma_sep + years_graph - graph_month_comma_yyyy_era = ( - months_graph - + delete_comma_sep - + years_graph - + era_graph - ) + graph_month_comma_yyyy_era = months_graph + delete_comma_sep + years_graph + era_graph # MM-YYYY: supports both space and dash separator # e.g. "मार्च २००३", "०३-२०१०", "11-2024" - graph_mm_yyyy = ( - months_graph - + pynini.union(delete_space, delete_dash) - + years_graph - ) + graph_mm_yyyy = months_graph + pynini.union(delete_space, delete_dash) + years_graph # ── Era-only graphs ────────────────────────────────────────────────── graph_year_era_only = ( @@ -349,4 +270,4 @@ def __init__(self, cardinal: GraphFst): ) self.final_graph = final_graph.optimize() - self.fst = self.add_tokens(self.final_graph) \ No newline at end of file + self.fst = self.add_tokens(self.final_graph) diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index 45304738f..88cb04727 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -165,4 +165,4 @@ def __init__( if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) - logging.info(f"ClassifyFst grammars are saved to {far_file}.") \ No newline at end of file + logging.info(f"ClassifyFst grammars are saved to {far_file}.") From 83770829d6a4e66e591f7094bcea4e5cd5505b7e Mon Sep 17 00:00:00 2001 From: Shreyas Pawar Date: Mon, 4 May 2026 08:01:00 +0000 Subject: [PATCH 4/9] Address Reviewer feedback regarding date tagger, tsv files and inclusion of more test cases Signed-off-by: Shreyas Pawar --- .../text_normalization/hi/data/date/days.tsv | 82 ++++++++--------- .../hi/data/date/months.tsv | 63 +++++--------- .../hi/data/date/unambiguous_days.tsv | 38 -------- .../text_normalization/hi/taggers/date.py | 87 ++++++++----------- .../test_cases_date.txt | 21 +++-- 5 files changed, 115 insertions(+), 176 deletions(-) delete mode 100644 nemo_text_processing/text_normalization/hi/data/date/unambiguous_days.tsv diff --git a/nemo_text_processing/text_normalization/hi/data/date/days.tsv b/nemo_text_processing/text_normalization/hi/data/date/days.tsv index 7d2dc7fbb..5bab5ee25 100644 --- a/nemo_text_processing/text_normalization/hi/data/date/days.tsv +++ b/nemo_text_processing/text_normalization/hi/data/date/days.tsv @@ -1,34 +1,3 @@ -०१ एक -०२ दो -०३ तीन -०४ चार -०५ पाँच -०६ छह -०७ सात -०८ आठ -०९ नौ -१० दस -११ ग्यारह -१२ बारह -१३ तेरह -१४ चौदह -१५ पंद्रह -१६ सोलह -१७ सत्रह -१८ अठारह -१९ उन्नीस -२० बीस -२१ इक्कीस -२२ बाईस -२३ तेईस -२४ चौबीस -२५ पच्चीस -२६ छब्बीस -२७ सत्ताईस -२८ अट्ठाईस -२९ उनतीस -३० तीस -३१ इकतीस 01 एक 02 दो 03 तीन @@ -60,15 +29,6 @@ 29 उनतीस 30 तीस 31 इकतीस -१ एक -२ दो -३ तीन -४ चार -५ पाँच -६ छह -७ सात -८ आठ -९ नौ 1 एक 2 दो 3 तीन @@ -77,4 +37,44 @@ 6 छह 7 सात 8 आठ -9 नौ \ No newline at end of file +9 नौ +०१ एक +०२ दो +०३ तीन +०४ चार +०५ पाँच +०६ छह +०७ सात +०८ आठ +०९ नौ +१० दस +११ ग्यारह +१२ बारह +१३ तेरह +१४ चौदह +१५ पंद्रह +१६ सोलह +१७ सत्रह +१८ अठारह +१९ उन्नीस +२० बीस +२१ इक्कीस +२२ बाईस +२३ तेईस +२४ चौबीस +२५ पच्चीस +२६ छब्बीस +२७ सत्ताईस +२८ अट्ठाईस +२९ उनतीस +३० तीस +३१ इकतीस +१ एक +२ दो +३ तीन +४ चार +५ पाँच +६ छह +७ सात +८ आठ +९ नौ \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/date/months.tsv b/nemo_text_processing/text_normalization/hi/data/date/months.tsv index 5eaafb648..ab984372f 100644 --- a/nemo_text_processing/text_normalization/hi/data/date/months.tsv +++ b/nemo_text_processing/text_normalization/hi/data/date/months.tsv @@ -1,17 +1,5 @@ -०१ जनवरी -०२ फ़रवरी -०३ मार्च -०४ अप्रैल -०५ मई -०६ जून -०७ जुलाई -०८ अगस्त -०९ सितंबर -१० अक्टूबर -११ नवंबर -१२ दिसंबर 01 जनवरी -02 फ़रवरी +02 फरवरी 03 मार्च 04 अप्रैल 05 मई @@ -22,36 +10,33 @@ 10 अक्टूबर 11 नवंबर 12 दिसंबर -जनवरी जनवरी -फ़रवरी फ़रवरी -फरवरी फरवरी -मार्च मार्च -अप्रैल अप्रैल -अप्रील अप्रील -मई मई -जून जून -जुलाई जुलाई -अगस्त अगस्त -सितंबर सितंबर -अक्टूबर अक्टूबर -अक्तूबर अक्तूबर -नवंबर नवंबर -दिसंबर दिसंबर -१ जनवरी -२ फ़रवरी -३ मार्च -४ अप्रैल -५ मई -६ जून -७ जुलाई -८ अगस्त -९ सितंबर 1 जनवरी -2 फ़रवरी +2 फरवरी 3 मार्च 4 अप्रैल 5 मई 6 जून 7 जुलाई 8 अगस्त -9 सितंबर \ No newline at end of file +9 सितंबर +०१ जनवरी +०२ फरवरी +०३ मार्च +०४ अप्रैल +०५ मई +०६ जून +०७ जुलाई +०८ अगस्त +०९ सितंबर +१० अक्टूबर +११ नवंबर +१२ दिसंबर +१ जनवरी +२ फरवरी +३ मार्च +४ अप्रैल +५ मई +६ जून +७ जुलाई +८ अगस्त +९ सितंबर \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/date/unambiguous_days.tsv b/nemo_text_processing/text_normalization/hi/data/date/unambiguous_days.tsv deleted file mode 100644 index 7fb5f5380..000000000 --- a/nemo_text_processing/text_normalization/hi/data/date/unambiguous_days.tsv +++ /dev/null @@ -1,38 +0,0 @@ -१३ तेरह -१४ चौदह -१५ पंद्रह -१६ सोलह -१७ सत्रह -१८ अठारह -१९ उन्नीस -२० बीस -२१ इक्कीस -२२ बाईस -२३ तेईस -२४ चौबीस -२५ पच्चीस -२६ छब्बीस -२७ सत्ताईस -२८ अट्ठाईस -२९ उनतीस -३० तीस -३१ इकतीस -13 तेरह -14 चौदह -15 पंद्रह -16 सोलह -17 सत्रह -18 अठारह -19 उन्नीस -20 बीस -21 इक्कीस -22 बाईस -23 तेईस -24 चौबीस -25 पच्चीस -26 छब्बीस -27 सत्ताईस -28 अट्ठाईस -29 उनतीस -30 तीस -31 इकतीस \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 42d266547..e3067952a 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -25,7 +25,6 @@ from nemo_text_processing.text_normalization.hi.utils import get_abs_path days = pynini.string_file(get_abs_path("data/date/days.tsv")) -unambiguous_days = pynini.string_file(get_abs_path("data/date/unambiguous_days.tsv")) months = pynini.string_file(get_abs_path("data/date/months.tsv")) year_suffix = pynini.string_file(get_abs_path("data/date/year_suffix.tsv")) digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) @@ -34,8 +33,6 @@ teens_ties = pynini.union(teens_ties_hi, teens_ties_en) teens_and_ties = pynutil.add_weight(teens_ties, -0.1) -digit_as_day = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) - with open(get_abs_path("data/date/suffixes.tsv"), "r", encoding="utf-8") as f: suffixes_list = [line.rstrip("\n") for line in f if line.strip()] with open(get_abs_path("data/date/prefixes.tsv"), "r", encoding="utf-8") as f: @@ -45,7 +42,7 @@ prefix_union = pynini.union(*prefixes_list) verbalized_hundreds = teens_ties_hi.project("output") -verbalized_unit = pynini.union(teens_ties_hi.project("output"), digit.project("output")) +verbalized_unit = pynini.union(verbalized_hundreds, digit.project("output")) verbalized_year_sou = ( verbalized_hundreds + pynini.accep(" सौ") + pynini.closure(pynini.accep(" ") + verbalized_unit, 0, 1) @@ -55,12 +52,13 @@ class DateFst(GraphFst): """ Finite state transducer for classifying date, e.g. - "०१-०४-२०२४" -> date { day: "एक" month: "अप्रैल" year: "दो हज़ार चौबीस" } - "६ मार्च, २०१०" -> date { day: "छह" month: "मार्च" year: "दो हज़ार दस" } - "३१ मई, १९९० ई." -> date { day: "इकतीस" month: "मई" year: "उन्नीस सौ नब्बे" era: "ईसवी" } - "उन्नीस सौ बीस में" -> date { era: "उन्नीस सौ बीस में" } - "०३-२०१०" -> date { month: "मार्च" year: "दो हज़ार दस" } - "11-2024" -> date { month: "नवंबर" year: "दो हज़ार चौबीस" } + "०१-०४-२०२४" -> date { day: "एक" month: "अप्रैल" year: "दो हज़ार चौबीस" } + "०४-०१-२०२४" -> date { month: "अप्रैल" day: "एक" year: "दो हज़ार चौबीस" } + "६ मार्च, २०१०" -> date { day: "छह" month: "मार्च" year: "दो हज़ार दस" } + "३१ मई, १९९० ई." -> date { day: "इकतीस" month: "मई" year: "उन्नीस सौ नब्बे" era: "ईसवी" } + "उन्नीस सौ बीस में" -> date { era: "उन्नीस सौ बीस में" } + "०३-२०१०" -> date { month: "मार्च" year: "दो हज़ार दस" } + "11-2024" -> date { month: "नवंबर" year: "दो हज़ार चौबीस" } Args: cardinal: cardinal GraphFst @@ -71,7 +69,6 @@ class DateFst(GraphFst): def __init__(self, cardinal: GraphFst): super().__init__(name="date", kind="classify") - # ── Year number graphs ──────────────────────────────────────────────── graph_year_thousands = pynini.compose( (NEMO_ALL_DIGIT + NEMO_ALL_ZERO + NEMO_ALL_DIGIT + NEMO_ALL_DIGIT), cardinal.graph_thousands ) @@ -95,7 +92,6 @@ def __init__(self, cardinal: GraphFst): cardinal.graph_hundreds, ) - # ── Separators ─────────────────────────────────────────────────────── delete_dash = pynutil.delete("-") delete_slash = pynutil.delete("/") delete_comma = pynutil.delete(",") @@ -104,48 +100,52 @@ def __init__(self, cardinal: GraphFst): delete_comma_sep = delete_comma + delete_optional_space delete_numeric_sep = pynini.union(delete_dash, delete_slash) - # ── Day graphs ─────────────────────────────────────────────────────── - # Full day graph — all days 1-31 (used in DD-MM graphs) day_num = pynini.union( days, - digit_as_day, teens_and_ties, ) days_graph = pynutil.insert("day: \"") + day_num + pynutil.insert("\"") + insert_space + days_graph_with_year = days_graph + + unambiguous_ascii = pynini.union(*[str(i) for i in range(13, 32)]) + unambiguous_deva = pynini.union( + "१३", "१४", "१५", "१६", "१७", "१८", "१९", + "२०", "२१", "२२", "२३", "२४", "२५", "२६", "२७", "२८", "२९", + "३०", "३१", + ) + unambiguous_inputs = pynini.union(unambiguous_ascii, unambiguous_deva) + unambiguous_day_num = pynini.compose(unambiguous_inputs, days) - # Unambiguous day graph — only days 13-31 - # Used in MM-DD graphs so they only fire when day cannot be a month number - unambiguous_day_num = pynini.union( - unambiguous_days, + unambiguous_days_graph = ( + pynutil.insert("day: \"") + unambiguous_day_num + pynutil.insert("\"") + insert_space ) - unambiguous_days_graph = pynutil.insert("day: \"") + unambiguous_day_num + pynutil.insert("\"") + insert_space + month_name_acceptor = pynini.project(months, "output") - # ── Month graph ────────────────────────────────────────────────────── - months_graph = pynutil.insert("month: \"") + months + pynutil.insert("\"") + insert_space + months_numeric_fst = months + + months_graph_numeric = ( + pynutil.insert("month: \"") + months_numeric_fst + pynutil.insert("\"") + insert_space + ) + + months_fst = pynini.union(months_numeric_fst, month_name_acceptor) + months_graph = pynutil.insert("month: \"") + months_fst + pynutil.insert("\"") + insert_space - # ── Year graph ─────────────────────────────────────────────────────── years_graph = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space - # ── Era graph ──────────────────────────────────────────────────────── era_graph = pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space - # ── Range graph (e.g. २९७-२७२ ई. पू.) ────────────────────────────── range_graph = pynini.cross("-", "से") - # ── Century ordinal (e.g. २०वीं, १८वीं) ──────────────────────────── century_number = pynini.compose(pynini.closure(NEMO_ALL_DIGIT, 1), cardinal_graph) + pynini.accep("वीं") century_text = pynutil.insert("era: \"") + century_number + pynutil.insert("\"") + insert_space - # ── Year + suffix (e.g. २०२० में, १९९० का) ────────────────────────── year_number = graph_year + suffix_union year_text = pynutil.insert("era: \"") + year_number + pynutil.insert("\"") + insert_space - # ── Year + prefix (e.g. सन् २०२४, साल २०२०) ──────────────────────── year_prefix = pynutil.insert("era: \"") + prefix_union + pynini.accep(" ") + graph_year + pynutil.insert("\"") - # ── Year + prefix + suffix (e.g. सन २००८ में) ─────────────────────── year_prefix_suffix = ( pynutil.insert("era: \"") + prefix_union @@ -155,7 +155,6 @@ def __init__(self, cardinal: GraphFst): + pynutil.insert("\"") ) - # ── Verbalized year passthrough graphs ─────────────────────────────── graph_verbalized_year_suffix = ( pynutil.insert("era: \"") + verbalized_year_sou + suffix_union + pynutil.insert("\"") + insert_space ) @@ -177,26 +176,21 @@ def __init__(self, cardinal: GraphFst): + pynutil.insert("\"") ) - # ── Numeric separator date graphs ──────────────────────────────────── - # DD-MM: uses full day range (all 1-31) graph_dd_mm = days_graph + delete_numeric_sep + months_graph - # MM-DD: only fires when day is unambiguously > 12 - # This prevents 01-10 being read as MM-DD (January 10) graph_mm_dd = months_graph + delete_numeric_sep + unambiguous_days_graph graph_mm_dd += pynutil.insert(" preserve_order: true ") - # DD-MM-YYYY: uses full day range - graph_dd_mm_yyyy = days_graph + delete_numeric_sep + months_graph + delete_numeric_sep + years_graph + graph_dd_mm_yyyy = ( + days_graph_with_year + delete_numeric_sep + months_graph + delete_numeric_sep + years_graph + ) - # MM-DD-YYYY: only fires when day is unambiguously > 12 graph_mm_dd_yyyy = ( months_graph + delete_numeric_sep + unambiguous_days_graph + delete_numeric_sep + years_graph ) graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ") - # ── Space-separated date graphs ────────────────────────────────────── - graph_dd_month = days_graph + delete_space + months_graph + graph_dd_month = days_graph + delete_space + months_graph_numeric graph_dd_month_comma_yyyy = days_graph + delete_space + months_graph + delete_comma_sep + years_graph @@ -208,11 +202,8 @@ def __init__(self, cardinal: GraphFst): graph_month_comma_yyyy_era = months_graph + delete_comma_sep + years_graph + era_graph - # MM-YYYY: supports both space and dash separator - # e.g. "मार्च २००३", "०३-२०१०", "11-2024" graph_mm_yyyy = months_graph + pynini.union(delete_space, delete_dash) + years_graph - # ── Era-only graphs ────────────────────────────────────────────────── graph_year_era_only = ( pynutil.insert("era: \"") + graph_year_era @@ -235,39 +226,29 @@ def __init__(self, cardinal: GraphFst): graph_year_suffix = era_graph - # ── Final graph ─────────────────────────────────────────────────────── final_graph = ( - # Full date with era — most specific first pynutil.add_weight(graph_dd_month_comma_yyyy_era, -0.003) | pynutil.add_weight(graph_month_comma_yyyy_era, -0.003) - # Full numeric dates | pynutil.add_weight(graph_dd_mm_yyyy, -0.001) | graph_mm_dd_yyyy - # Full space/comma dates | pynutil.add_weight(graph_dd_month_comma_yyyy, -0.001) - # Day + month only | pynutil.add_weight(graph_dd_mm, -0.001) | pynutil.add_weight(graph_dd_month, -0.001) | graph_mm_dd - # Month + year — space or dash | pynutil.add_weight(graph_mm_yyyy, -0.2) | pynutil.add_weight(graph_month_comma_yyyy, -0.2) - # Era graphs | pynutil.add_weight(graph_year_era_only, -0.005) | pynutil.add_weight(graph_range, -0.005) | pynutil.add_weight(graph_year_suffix, -0.001) - # Century ordinal | pynutil.add_weight(century_text, -0.001) - # Verbalized year passthrough — more specific first | pynutil.add_weight(graph_verbalized_year_prefix_suffix, -0.012) | pynutil.add_weight(graph_verbalized_year_prefix, -0.011) | pynutil.add_weight(graph_verbalized_year_suffix, -0.010) | pynutil.add_weight(graph_verbalized_year_bare, -0.009) - # Numeric year with suffix/prefix | pynutil.add_weight(year_prefix_suffix, -0.010) | pynutil.add_weight(year_prefix, -0.009) | pynutil.add_weight(year_text, -0.001) ) self.final_graph = final_graph.optimize() - self.fst = self.add_tokens(self.final_graph) + self.fst = self.add_tokens(self.final_graph) \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt index 2df448456..55f26c459 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt @@ -2,19 +2,24 @@ ३१-०६~इकतीस जून 02-01~दो जनवरी ०४-०१~चार जनवरी -01-10~एक अक्टूबर +01-10~एक अक्टूबर १२-०७~बारह जुलाई -02-27~फ़रवरी सत्ताईस -०४-०३~चार मार्च +02-27~फरवरी सत्ताईस +०२-२७~फरवरी सत्ताईस +०४-०३~चार मार्च 25-03-2020~पच्चीस मार्च दो हज़ार बीस ३०-०५-२०७०~तीस मई दो हज़ार सत्तर -12-07-1970~बारह जुलाई उन्नीस सौ सत्तर ०९-१२-२१०१~नौ दिसंबर इक्कीस सौ एक 23-08-2024~तेईस अगस्त दो हज़ार चौबीस १०-२९-२०००~अक्टूबर उनतीस दो हज़ार 11-14-1100~नवंबर चौदह ग्यारह सौ ०३-२०१०~मार्च दो हज़ार दस 11-2024~नवंबर दो हज़ार चौबीस +३ मार्च~तीन मार्च +६ मार्च, २०१०~छह मार्च दो हज़ार दस +३१ मई, १९९० ई.~इकतीस मई उन्नीस सौ नब्बे ईसवी +मार्च, २०२४~मार्च दो हज़ार चौबीस +जनवरी, १९९० ई.~जनवरी उन्नीस सौ नब्बे ईसवी २०७०~दो हज़ार सत्तर 2024~दो हज़ार चौबीस १२० ई. पू.~एक सौ बीस ईसा पूर्व @@ -31,4 +36,10 @@ सन 1999~सन उन्नीस सौ निन्यानबे सन् १९२०~सन् उन्नीस सौ बीस साल 1971~साल उन्नीस सौ इकहत्तर -१९२०-२६ तक~उन्नीस सौ बीस से छब्बीस तक \ No newline at end of file +सन 1999 में~सन उन्नीस सौ निन्यानबे में +सन् उन्नीस सौ बीस~सन् उन्नीस सौ बीस +सन उन्नीस सौ बीस में~सन उन्नीस सौ बीस में +१९२०-२६ तक~उन्नीस सौ बीस से छब्बीस तक +02-7-1970~दो जुलाई उन्नीस सौ सत्तर +2-7-1970~दो जुलाई उन्नीस सौ सत्तर +2-07-1970~दो जुलाई उन्नीस सौ सत्तर \ No newline at end of file From 36260d4b64fb7b1251935ddf33e406a429744331 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 May 2026 08:11:34 +0000 Subject: [PATCH 5/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/hi/taggers/date.py | 36 ++++++++++++------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index e3067952a..22a5e4f0a 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -110,24 +110,36 @@ def __init__(self, cardinal: GraphFst): unambiguous_ascii = pynini.union(*[str(i) for i in range(13, 32)]) unambiguous_deva = pynini.union( - "१३", "१४", "१५", "१६", "१७", "१८", "१९", - "२०", "२१", "२२", "२३", "२४", "२५", "२६", "२७", "२८", "२९", - "३०", "३१", + "१३", + "१४", + "१५", + "१६", + "१७", + "१८", + "१९", + "२०", + "२१", + "२२", + "२३", + "२४", + "२५", + "२६", + "२७", + "२८", + "२९", + "३०", + "३१", ) unambiguous_inputs = pynini.union(unambiguous_ascii, unambiguous_deva) unambiguous_day_num = pynini.compose(unambiguous_inputs, days) - unambiguous_days_graph = ( - pynutil.insert("day: \"") + unambiguous_day_num + pynutil.insert("\"") + insert_space - ) + unambiguous_days_graph = pynutil.insert("day: \"") + unambiguous_day_num + pynutil.insert("\"") + insert_space month_name_acceptor = pynini.project(months, "output") months_numeric_fst = months - months_graph_numeric = ( - pynutil.insert("month: \"") + months_numeric_fst + pynutil.insert("\"") + insert_space - ) + months_graph_numeric = pynutil.insert("month: \"") + months_numeric_fst + pynutil.insert("\"") + insert_space months_fst = pynini.union(months_numeric_fst, month_name_acceptor) months_graph = pynutil.insert("month: \"") + months_fst + pynutil.insert("\"") + insert_space @@ -181,9 +193,7 @@ def __init__(self, cardinal: GraphFst): graph_mm_dd = months_graph + delete_numeric_sep + unambiguous_days_graph graph_mm_dd += pynutil.insert(" preserve_order: true ") - graph_dd_mm_yyyy = ( - days_graph_with_year + delete_numeric_sep + months_graph + delete_numeric_sep + years_graph - ) + graph_dd_mm_yyyy = days_graph_with_year + delete_numeric_sep + months_graph + delete_numeric_sep + years_graph graph_mm_dd_yyyy = ( months_graph + delete_numeric_sep + unambiguous_days_graph + delete_numeric_sep + years_graph @@ -251,4 +261,4 @@ def __init__(self, cardinal: GraphFst): ) self.final_graph = final_graph.optimize() - self.fst = self.add_tokens(self.final_graph) \ No newline at end of file + self.fst = self.add_tokens(self.final_graph) From d8bdaa93c7eb91db514d782dc4d7800a02bf2e8e Mon Sep 17 00:00:00 2001 From: Shreyas Pawar Date: Mon, 4 May 2026 10:09:07 +0000 Subject: [PATCH 6/9] hi-tn-date: minor formatting fix Signed-off-by: Shreyas Pawar --- .../text_normalization/hi/taggers/date.py | 23 +++---------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 22a5e4f0a..875f1aab6 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -53,7 +53,6 @@ class DateFst(GraphFst): """ Finite state transducer for classifying date, e.g. "०१-०४-२०२४" -> date { day: "एक" month: "अप्रैल" year: "दो हज़ार चौबीस" } - "०४-०१-२०२४" -> date { month: "अप्रैल" day: "एक" year: "दो हज़ार चौबीस" } "६ मार्च, २०१०" -> date { day: "छह" month: "मार्च" year: "दो हज़ार दस" } "३१ मई, १९९० ई." -> date { day: "इकतीस" month: "मई" year: "उन्नीस सौ नब्बे" era: "ईसवी" } "उन्नीस सौ बीस में" -> date { era: "उन्नीस सौ बीस में" } @@ -110,25 +109,9 @@ def __init__(self, cardinal: GraphFst): unambiguous_ascii = pynini.union(*[str(i) for i in range(13, 32)]) unambiguous_deva = pynini.union( - "१३", - "१४", - "१५", - "१६", - "१७", - "१८", - "१९", - "२०", - "२१", - "२२", - "२३", - "२४", - "२५", - "२६", - "२७", - "२८", - "२९", - "३०", - "३१", + "१३", "१४", "१५", "१६", "१७", "१८", "१९", + "२०", "२१", "२२", "२३", "२४", "२५", "२६", "२७", "२८", "२९", + "३०", "३१", ) unambiguous_inputs = pynini.union(unambiguous_ascii, unambiguous_deva) unambiguous_day_num = pynini.compose(unambiguous_inputs, days) From 59230d33bdddb8879ad959993980ee047b6c6faf Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 May 2026 10:10:04 +0000 Subject: [PATCH 7/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/hi/taggers/date.py | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 875f1aab6..b893f6d72 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -109,9 +109,25 @@ def __init__(self, cardinal: GraphFst): unambiguous_ascii = pynini.union(*[str(i) for i in range(13, 32)]) unambiguous_deva = pynini.union( - "१३", "१४", "१५", "१६", "१७", "१८", "१९", - "२०", "२१", "२२", "२३", "२४", "२५", "२६", "२७", "२८", "२९", - "३०", "३१", + "१३", + "१४", + "१५", + "१६", + "१७", + "१८", + "१९", + "२०", + "२१", + "२२", + "२३", + "२४", + "२५", + "२६", + "२७", + "२८", + "२९", + "३०", + "३१", ) unambiguous_inputs = pynini.union(unambiguous_ascii, unambiguous_deva) unambiguous_day_num = pynini.compose(unambiguous_inputs, days) From b24baae8395dbe572ef78a008451d7c9d7a71765 Mon Sep 17 00:00:00 2001 From: Shreyas Pawar Date: Wed, 6 May 2026 08:56:54 +0000 Subject: [PATCH 8/9] date tagger modification according to feedback removed support for d-mm-yyyy, dd-m-yyyy and mm-yyyy Signed-off-by: Shreyas Pawar --- .../text_normalization/hi/data/date/days.tsv | 20 +--- .../hi/data/date/months.tsv | 20 +--- .../text_normalization/hi/taggers/date.py | 108 +++++++++--------- .../test_cases_date.txt | 9 +- 4 files changed, 58 insertions(+), 99 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/data/date/days.tsv b/nemo_text_processing/text_normalization/hi/data/date/days.tsv index 5bab5ee25..6df0fa3d4 100644 --- a/nemo_text_processing/text_normalization/hi/data/date/days.tsv +++ b/nemo_text_processing/text_normalization/hi/data/date/days.tsv @@ -29,15 +29,6 @@ 29 उनतीस 30 तीस 31 इकतीस -1 एक -2 दो -3 तीन -4 चार -5 पाँच -6 छह -7 सात -8 आठ -9 नौ ०१ एक ०२ दो ०३ तीन @@ -68,13 +59,4 @@ २८ अट्ठाईस २९ उनतीस ३० तीस -३१ इकतीस -१ एक -२ दो -३ तीन -४ चार -५ पाँच -६ छह -७ सात -८ आठ -९ नौ \ No newline at end of file +३१ इकतीस \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/date/months.tsv b/nemo_text_processing/text_normalization/hi/data/date/months.tsv index ab984372f..3667f07cf 100644 --- a/nemo_text_processing/text_normalization/hi/data/date/months.tsv +++ b/nemo_text_processing/text_normalization/hi/data/date/months.tsv @@ -10,15 +10,6 @@ 10 अक्टूबर 11 नवंबर 12 दिसंबर -1 जनवरी -2 फरवरी -3 मार्च -4 अप्रैल -5 मई -6 जून -7 जुलाई -8 अगस्त -9 सितंबर ०१ जनवरी ०२ फरवरी ०३ मार्च @@ -30,13 +21,4 @@ ०९ सितंबर १० अक्टूबर ११ नवंबर -१२ दिसंबर -१ जनवरी -२ फरवरी -३ मार्च -४ अप्रैल -५ मई -६ जून -७ जुलाई -८ अगस्त -९ सितंबर \ No newline at end of file +१२ दिसंबर \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index b893f6d72..43d5589fb 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -48,6 +48,13 @@ verbalized_hundreds + pynini.accep(" सौ") + pynini.closure(pynini.accep(" ") + verbalized_unit, 0, 1) ) +pad_latin = pynini.union( + *[pynini.cross(str(i), f"0{i}") for i in range(1, 10)] +) +pad_devanagari = pynini.union( + *[pynini.cross(d, f"०{d}") for d in "१२३४५६७८९"] +) + class DateFst(GraphFst): """ @@ -56,8 +63,7 @@ class DateFst(GraphFst): "६ मार्च, २०१०" -> date { day: "छह" month: "मार्च" year: "दो हज़ार दस" } "३१ मई, १९९० ई." -> date { day: "इकतीस" month: "मई" year: "उन्नीस सौ नब्बे" era: "ईसवी" } "उन्नीस सौ बीस में" -> date { era: "उन्नीस सौ बीस में" } - "०३-२०१०" -> date { month: "मार्च" year: "दो हज़ार दस" } - "11-2024" -> date { month: "नवंबर" year: "दो हज़ार चौबीस" } + "02-07-1970" -> date { day: "दो" month: "जुलाई" year: "उन्नीस सौ सत्तर" } Args: cardinal: cardinal GraphFst @@ -92,56 +98,50 @@ def __init__(self, cardinal: GraphFst): ) delete_dash = pynutil.delete("-") - delete_slash = pynutil.delete("/") delete_comma = pynutil.delete(",") delete_space = pynutil.delete(" ") delete_optional_space = pynini.closure(pynutil.delete(" "), 0, 1) delete_comma_sep = delete_comma + delete_optional_space - delete_numeric_sep = pynini.union(delete_dash, delete_slash) - day_num = pynini.union( + day_num_padded = pynini.union( days, teens_and_ties, ) - days_graph = pynutil.insert("day: \"") + day_num + pynutil.insert("\"") + insert_space - days_graph_with_year = days_graph - - unambiguous_ascii = pynini.union(*[str(i) for i in range(13, 32)]) - unambiguous_deva = pynini.union( - "१३", - "१४", - "१५", - "१६", - "१७", - "१८", - "१९", - "२०", - "२१", - "२२", - "२३", - "२४", - "२५", - "२६", - "२७", - "२८", - "२९", - "३०", - "३१", + day_num_bare = pynini.union( + pynini.compose(pad_latin, days), + pynini.compose(pad_devanagari, days), ) - unambiguous_inputs = pynini.union(unambiguous_ascii, unambiguous_deva) - unambiguous_day_num = pynini.compose(unambiguous_inputs, days) - unambiguous_days_graph = pynutil.insert("day: \"") + unambiguous_day_num + pynutil.insert("\"") + insert_space + days_graph_padded = pynutil.insert("day: \"") + day_num_padded + pynutil.insert("\"") + insert_space + days_graph_bare = pynutil.insert("day: \"") + day_num_bare + pynutil.insert("\"") + insert_space month_name_acceptor = pynini.project(months, "output") - months_numeric_fst = months + months_numeric_padded = months - months_graph_numeric = pynutil.insert("month: \"") + months_numeric_fst + pynutil.insert("\"") + insert_space + months_numeric_bare = pynini.union( + pynini.compose(pad_latin, months), + pynini.compose(pad_devanagari, months), + ) - months_fst = pynini.union(months_numeric_fst, month_name_acceptor) - months_graph = pynutil.insert("month: \"") + months_fst + pynutil.insert("\"") + insert_space + months_graph_numeric_padded = ( + pynutil.insert("month: \"") + months_numeric_padded + pynutil.insert("\"") + insert_space + ) + + months_fst_padded = pynini.union(months_numeric_padded, month_name_acceptor) + months_graph_padded = ( + pynutil.insert("month: \"") + months_fst_padded + pynutil.insert("\"") + insert_space + ) + + months_fst_bare = pynini.union(months_numeric_bare, month_name_acceptor) + months_graph_bare = ( + pynutil.insert("month: \"") + months_fst_bare + pynutil.insert("\"") + insert_space + ) + + month_name_graph = ( + pynutil.insert("month: \"") + month_name_acceptor + pynutil.insert("\"") + insert_space + ) years_graph = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space @@ -187,31 +187,33 @@ def __init__(self, cardinal: GraphFst): + pynutil.insert("\"") ) - graph_dd_mm = days_graph + delete_numeric_sep + months_graph + graph_dd_mm = days_graph_padded + delete_dash + months_graph_padded - graph_mm_dd = months_graph + delete_numeric_sep + unambiguous_days_graph - graph_mm_dd += pynutil.insert(" preserve_order: true ") + graph_d_m = days_graph_bare + delete_dash + months_graph_bare - graph_dd_mm_yyyy = days_graph_with_year + delete_numeric_sep + months_graph + delete_numeric_sep + years_graph + graph_dd_mm_yyyy = ( + days_graph_padded + delete_dash + months_graph_padded + delete_dash + years_graph + ) - graph_mm_dd_yyyy = ( - months_graph + delete_numeric_sep + unambiguous_days_graph + delete_numeric_sep + years_graph + graph_d_m_yyyy = ( + days_graph_bare + delete_dash + months_graph_bare + delete_dash + years_graph ) - graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ") - graph_dd_month = days_graph + delete_space + months_graph_numeric + graph_dd_month = days_graph_padded + delete_space + months_graph_numeric_padded - graph_dd_month_comma_yyyy = days_graph + delete_space + months_graph + delete_comma_sep + years_graph + graph_dd_month_comma_yyyy = ( + days_graph_padded + delete_space + months_graph_padded + delete_comma_sep + years_graph + ) graph_dd_month_comma_yyyy_era = ( - days_graph + delete_space + months_graph + delete_comma_sep + years_graph + era_graph + days_graph_padded + delete_space + months_graph_padded + delete_comma_sep + years_graph + era_graph ) - graph_month_comma_yyyy = months_graph + delete_comma_sep + years_graph + graph_month_comma_yyyy = months_graph_padded + delete_comma_sep + years_graph - graph_month_comma_yyyy_era = months_graph + delete_comma_sep + years_graph + era_graph + graph_month_comma_yyyy_era = months_graph_padded + delete_comma_sep + years_graph + era_graph - graph_mm_yyyy = months_graph + pynini.union(delete_space, delete_dash) + years_graph + graph_month_name_yyyy = month_name_graph + delete_space + years_graph graph_year_era_only = ( pynutil.insert("era: \"") @@ -239,12 +241,12 @@ def __init__(self, cardinal: GraphFst): pynutil.add_weight(graph_dd_month_comma_yyyy_era, -0.003) | pynutil.add_weight(graph_month_comma_yyyy_era, -0.003) | pynutil.add_weight(graph_dd_mm_yyyy, -0.001) - | graph_mm_dd_yyyy + | pynutil.add_weight(graph_d_m_yyyy, -0.001) | pynutil.add_weight(graph_dd_month_comma_yyyy, -0.001) | pynutil.add_weight(graph_dd_mm, -0.001) + | pynutil.add_weight(graph_d_m, -0.001) | pynutil.add_weight(graph_dd_month, -0.001) - | graph_mm_dd - | pynutil.add_weight(graph_mm_yyyy, -0.2) + | pynutil.add_weight(graph_month_name_yyyy, -0.2) | pynutil.add_weight(graph_month_comma_yyyy, -0.2) | pynutil.add_weight(graph_year_era_only, -0.005) | pynutil.add_weight(graph_range, -0.005) @@ -260,4 +262,4 @@ def __init__(self, cardinal: GraphFst): ) self.final_graph = final_graph.optimize() - self.fst = self.add_tokens(self.final_graph) + self.fst = self.add_tokens(self.final_graph) \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt index 55f26c459..97a71c9b7 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt @@ -4,17 +4,11 @@ ०४-०१~चार जनवरी 01-10~एक अक्टूबर १२-०७~बारह जुलाई -02-27~फरवरी सत्ताईस -०२-२७~फरवरी सत्ताईस ०४-०३~चार मार्च 25-03-2020~पच्चीस मार्च दो हज़ार बीस ३०-०५-२०७०~तीस मई दो हज़ार सत्तर ०९-१२-२१०१~नौ दिसंबर इक्कीस सौ एक 23-08-2024~तेईस अगस्त दो हज़ार चौबीस -१०-२९-२०००~अक्टूबर उनतीस दो हज़ार -11-14-1100~नवंबर चौदह ग्यारह सौ -०३-२०१०~मार्च दो हज़ार दस -11-2024~नवंबर दो हज़ार चौबीस ३ मार्च~तीन मार्च ६ मार्च, २०१०~छह मार्च दो हज़ार दस ३१ मई, १९९० ई.~इकतीस मई उन्नीस सौ नब्बे ईसवी @@ -40,6 +34,5 @@ सन् उन्नीस सौ बीस~सन् उन्नीस सौ बीस सन उन्नीस सौ बीस में~सन उन्नीस सौ बीस में १९२०-२६ तक~उन्नीस सौ बीस से छब्बीस तक -02-7-1970~दो जुलाई उन्नीस सौ सत्तर 2-7-1970~दो जुलाई उन्नीस सौ सत्तर -2-07-1970~दो जुलाई उन्नीस सौ सत्तर \ No newline at end of file +02-07-1970~दो जुलाई उन्नीस सौ सत्तर \ No newline at end of file From e22bab388d5b51d646cfa4f68f2c6c0d3e151b02 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 May 2026 08:57:47 +0000 Subject: [PATCH 9/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/hi/taggers/date.py | 30 +++++-------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 43d5589fb..451497ed0 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -48,12 +48,8 @@ verbalized_hundreds + pynini.accep(" सौ") + pynini.closure(pynini.accep(" ") + verbalized_unit, 0, 1) ) -pad_latin = pynini.union( - *[pynini.cross(str(i), f"0{i}") for i in range(1, 10)] -) -pad_devanagari = pynini.union( - *[pynini.cross(d, f"०{d}") for d in "१२३४५६७८९"] -) +pad_latin = pynini.union(*[pynini.cross(str(i), f"0{i}") for i in range(1, 10)]) +pad_devanagari = pynini.union(*[pynini.cross(d, f"०{d}") for d in "१२३४५६७८९"]) class DateFst(GraphFst): @@ -130,18 +126,12 @@ def __init__(self, cardinal: GraphFst): ) months_fst_padded = pynini.union(months_numeric_padded, month_name_acceptor) - months_graph_padded = ( - pynutil.insert("month: \"") + months_fst_padded + pynutil.insert("\"") + insert_space - ) + months_graph_padded = pynutil.insert("month: \"") + months_fst_padded + pynutil.insert("\"") + insert_space months_fst_bare = pynini.union(months_numeric_bare, month_name_acceptor) - months_graph_bare = ( - pynutil.insert("month: \"") + months_fst_bare + pynutil.insert("\"") + insert_space - ) + months_graph_bare = pynutil.insert("month: \"") + months_fst_bare + pynutil.insert("\"") + insert_space - month_name_graph = ( - pynutil.insert("month: \"") + month_name_acceptor + pynutil.insert("\"") + insert_space - ) + month_name_graph = pynutil.insert("month: \"") + month_name_acceptor + pynutil.insert("\"") + insert_space years_graph = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space @@ -191,13 +181,9 @@ def __init__(self, cardinal: GraphFst): graph_d_m = days_graph_bare + delete_dash + months_graph_bare - graph_dd_mm_yyyy = ( - days_graph_padded + delete_dash + months_graph_padded + delete_dash + years_graph - ) + graph_dd_mm_yyyy = days_graph_padded + delete_dash + months_graph_padded + delete_dash + years_graph - graph_d_m_yyyy = ( - days_graph_bare + delete_dash + months_graph_bare + delete_dash + years_graph - ) + graph_d_m_yyyy = days_graph_bare + delete_dash + months_graph_bare + delete_dash + years_graph graph_dd_month = days_graph_padded + delete_space + months_graph_numeric_padded @@ -262,4 +248,4 @@ def __init__(self, cardinal: GraphFst): ) self.final_graph = final_graph.optimize() - self.fst = self.add_tokens(self.final_graph) \ No newline at end of file + self.fst = self.add_tokens(self.final_graph)