From 42928b4fc409e6bb35ab6ead0ab3555bc306e5b8 Mon Sep 17 00:00:00 2001 From: "TF.Text Team" Date: Mon, 18 May 2026 11:10:55 -0700 Subject: [PATCH] Add kernels from the tensorflow_text to the tensorflow PiperOrigin-RevId: 917330647 --- tensorflow_text/BUILD | 21 +- tensorflow_text/core/kernels/BUILD | 1470 +++------- .../core/kernels/boise_offset_converter.cc | 254 -- .../core/kernels/boise_offset_converter.h | 108 +- .../kernels/boise_offset_converter_kernel.cc | 31 - .../kernels/boise_offset_converter_kernel.h | 21 +- .../boise_offset_converter_kernel_template.h | 621 +--- .../kernels/boise_offset_converter_test.cc | 561 ---- tensorflow_text/core/kernels/byte_splitter.cc | 76 - tensorflow_text/core/kernels/byte_splitter.h | 94 +- .../core/kernels/byte_splitter_kernel.cc | 31 - .../core/kernels/byte_splitter_kernel.h | 21 +- .../kernels/byte_splitter_kernel_template.h | 295 +- .../core/kernels/byte_splitter_test.cc | 77 - .../core/kernels/byte_splitter_tflite.cc | 39 - .../core/kernels/byte_splitter_tflite.h | 17 +- .../core/kernels/constrained_sequence.cc | 441 --- .../core/kernels/constrained_sequence.h | 80 +- .../kernels/constrained_sequence_kernel.cc | 259 -- ...d_sequence_kernel_input_validation_test.cc | 496 ---- .../core/kernels/darts_clone_trie_builder.cc | 100 - .../core/kernels/darts_clone_trie_builder.h | 35 +- .../core/kernels/darts_clone_trie_test.cc | 188 -- .../core/kernels/darts_clone_trie_wrapper.h | 150 +- .../core/kernels/disjoint_set_forest.h | 170 +- .../core/kernels/disjoint_set_forest_test.cc | 147 - .../core/kernels/edit_changes.proto | 15 - ...greedy_constrained_sequence_kernel_test.cc | 854 ------ ...iterbi_constrained_sequence_kernel_test.cc | 910 ------ .../core/kernels/fast_bert_normalizer.h | 348 +-- .../fast_bert_normalizer_kernel_template.h | 242 +- .../kernels/fast_bert_normalizer_model.fbs | 20 - .../fast_bert_normalizer_model_builder.cc | 243 -- .../fast_bert_normalizer_model_builder.h | 82 +- ...=> fast_bert_normalizer_model_generated.h} | 15 +- .../core/kernels/fast_bert_normalizer_test.cc | 224 -- .../kernels/fast_bert_normalizer_tf_kernel.h | 15 +- .../kernels/fast_bert_normalizer_tflite.cc | 35 - .../kernels/fast_bert_normalizer_tflite.h | 20 +- .../core/kernels/fast_wordpiece_tokenizer.cc | 773 ----- .../core/kernels/fast_wordpiece_tokenizer.h | 242 +- .../fast_wordpiece_tokenizer_kernel.cc | 31 - .../kernels/fast_wordpiece_tokenizer_kernel.h | 21 +- ...fast_wordpiece_tokenizer_kernel_template.h | 360 +-- .../fast_wordpiece_tokenizer_model.fbs | 68 - .../fast_wordpiece_tokenizer_model_builder.cc | 941 ------ .../fast_wordpiece_tokenizer_model_builder.h | 37 +- ...ast_wordpiece_tokenizer_model_generated.h} | 15 +- .../kernels/fast_wordpiece_tokenizer_test.cc | 2554 ----------------- .../fast_wordpiece_tokenizer_tflite.cc | 43 - .../kernels/fast_wordpiece_tokenizer_tflite.h | 23 +- .../kernels/fast_wordpiece_tokenizer_utils.h | 254 +- .../fast_wordpiece_tokenizer_utils_test.cc | 154 - ...greedy_constrained_sequence_kernel_test.cc | 799 ------ ...iterbi_constrained_sequence_kernel_test.cc | 815 ------ .../core/kernels/mst_op_kernels.cc | 190 -- tensorflow_text/core/kernels/mst_solver.h | 595 +--- .../mst_solver_random_comparison_test.cc | 176 -- .../core/kernels/mst_solver_test.cc | 273 -- tensorflow_text/core/kernels/ngrams_kernel.cc | 42 - tensorflow_text/core/kernels/ngrams_kernel.h | 30 +- .../core/kernels/ngrams_kernel_template.h | 264 +- .../core/kernels/ngrams_kernel_test.cc | 74 - tensorflow_text/core/kernels/ngrams_tflite.cc | 55 - tensorflow_text/core/kernels/ngrams_tflite.h | 35 +- .../core/kernels/ngrams_tflite_test.cc | 305 -- .../core/kernels/normalize_kernels.cc | 370 --- .../core/kernels/normalize_kernels_test.cc | 27 - .../core/kernels/phrase_tokenizer.cc | 224 -- .../core/kernels/phrase_tokenizer.h | 88 +- .../core/kernels/phrase_tokenizer_kernel.cc | 31 - .../core/kernels/phrase_tokenizer_kernel.h | 21 +- .../phrase_tokenizer_kernel_template.h | 344 +-- .../core/kernels/phrase_tokenizer_model.fbs | 38 - .../kernels/phrase_tokenizer_model_builder.cc | 143 - .../kernels/phrase_tokenizer_model_builder.h | 26 +- ....cc => phrase_tokenizer_model_generated.h} | 15 +- .../core/kernels/phrase_tokenizer_test.cc | 98 - .../kernels/ragged_tensor_to_tensor_tflite.cc | 745 ----- .../kernels/ragged_tensor_to_tensor_tflite.h | 15 +- .../ragged_tensor_to_tensor_tflite_test.cc | 317 -- tensorflow_text/core/kernels/regex_split.cc | 90 - tensorflow_text/core/kernels/regex_split.h | 30 +- .../core/kernels/regex_split_kernels.cc | 193 -- .../core/kernels/regex_split_test.cc | 77 - .../core/kernels/rouge_l_kernel.cc | 225 -- .../core/kernels/rouge_l_kernel_test.cc | 46 - .../core/kernels/round_robin_trimmer.h | 300 +- .../kernels/round_robin_trimmer_kernel.cc | 73 - .../core/kernels/round_robin_trimmer_kernel.h | 25 +- .../round_robin_trimmer_kernel_template.h | 306 +- .../core/kernels/round_robin_trimmer_test.cc | 230 -- .../kernels/round_robin_trimmer_tflite.cc | 70 - .../core/kernels/round_robin_trimmer_tflite.h | 17 +- .../core/kernels/sentence_breaking_kernels.cc | 267 -- .../core/kernels/sentence_breaking_utils.cc | 238 -- .../core/kernels/sentence_breaking_utils.h | 56 +- .../kernels/sentence_breaking_utils_test.cc | 576 ---- .../core/kernels/sentence_fragmenter.cc | 443 --- .../core/kernels/sentence_fragmenter.h | 212 +- .../core/kernels/sentence_fragmenter_v2.cc | 706 ----- .../core/kernels/sentence_fragmenter_v2.h | 186 +- .../kernels/sentence_fragmenter_v2_kernel.h | 15 +- .../sentence_fragmenter_v2_kernel_template.h | 146 +- .../kernels/sentence_fragmenter_v2_test.cc | 1092 ------- .../kernels/sentence_fragmenter_v2_tflite.cc | 34 - .../kernels/sentence_fragmenter_v2_tflite.h | 15 +- .../core/kernels/sentencepiece/BUILD | 405 +-- .../core/kernels/sentencepiece/config.fbs | 25 - .../kernels/sentencepiece/decoder_config.fbs | 43 - .../kernels/sentencepiece/double_array_trie.h | 120 +- .../double_array_trie_builder.cc | 93 - .../sentencepiece/double_array_trie_builder.h | 41 +- .../sentencepiece/double_array_trie_test.cc | 90 - .../kernels/sentencepiece/encoder_config.fbs | 52 - .../kernels/sentencepiece/model_converter.cc | 216 -- .../kernels/sentencepiece/model_converter.h | 52 +- .../core/kernels/sentencepiece/native.bzl | 89 - .../core/kernels/sentencepiece/native.bzl.oss | 87 - .../sentencepiece/optimized_decoder.cc | 75 - .../kernels/sentencepiece/optimized_decoder.h | 50 +- .../sentencepiece/optimized_decoder_test.cc | 107 - .../sentencepiece/optimized_encoder.cc | 259 -- .../kernels/sentencepiece/optimized_encoder.h | 52 +- .../sentencepiece/optimized_encoder_test.cc | 187 -- .../sentencepiece/py_tflite_registerer.cc | 57 - .../sentencepiece/py_tflite_registerer.h | 39 +- .../sentencepiece/sentencepiece_constants.h | 43 +- .../sentencepiece/sentencepiece_detokenizer.h | 33 +- .../sentencepiece_detokenizer_kernel.cc | 101 - .../sentencepiece_detokenizer_tflite.cc | 127 - .../sentencepiece/sentencepiece_tokenizer.h | 33 +- .../sentencepiece_tokenizer_kernel.cc | 108 - .../sentencepiece_tokenizer_tflite.cc | 150 - .../core/kernels/sentencepiece/utils.h | 66 +- .../core/kernels/sentencepiece_kernels.cc | 739 ----- .../core/kernels/spanning_tree_iterator.cc | 96 - .../core/kernels/spanning_tree_iterator.h | 66 +- .../kernels/spanning_tree_iterator_test.cc | 139 - .../kernels/split_merge_tokenize_kernel.cc | 222 -- tensorflow_text/core/kernels/string_vocab.cc | 50 - tensorflow_text/core/kernels/string_vocab.h | 30 +- .../core/kernels/text_kernels_test_util.cc | 72 - .../core/kernels/text_kernels_test_util.h | 111 +- .../kernels/tokenizer_from_logits_kernel.cc | 235 -- tensorflow_text/core/kernels/trimmer.h | 74 +- .../kernels/unicode_script_tokenize_kernel.cc | 194 -- .../unicode_script_tokenize_kernel_test.cc | 68 - tensorflow_text/core/kernels/utf8_binarize.cc | 53 - tensorflow_text/core/kernels/utf8_binarize.h | 17 +- .../core/kernels/utf8_binarize_kernel.h | 14 +- .../kernels/utf8_binarize_kernel_template.h | 166 +- .../core/kernels/utf8_binarize_test.cc | 59 - .../core/kernels/utf8_binarize_tflite.cc | 33 - .../core/kernels/utf8_binarize_tflite.h | 21 +- .../kernels/whitespace_tokenize_kernel.cc | 161 -- .../whitespace_tokenize_kernel_test.cc | 68 - .../core/kernels/whitespace_tokenizer.cc | 87 - .../core/kernels/whitespace_tokenizer.h | 96 +- .../whitespace_tokenizer_config_builder.cc | 67 - .../whitespace_tokenizer_config_builder.h | 26 +- ...hitespace_tokenizer_config_builder_test.cc | 89 - .../kernels/whitespace_tokenizer_kernel.cc | 27 - .../kernels/whitespace_tokenizer_kernel.h | 21 +- .../whitespace_tokenizer_kernel_template.h | 176 +- .../core/kernels/whitespace_tokenizer_test.cc | 90 - .../kernels/whitespace_tokenizer_tflite.cc | 34 - .../kernels/whitespace_tokenizer_tflite.h | 15 +- .../core/kernels/wordpiece_kernel.cc | 317 -- .../core/kernels/wordpiece_kernel_test.cc | 53 - .../core/kernels/wordpiece_tokenizer.cc | 246 -- .../core/kernels/wordpiece_tokenizer.h | 51 +- tensorflow_text/core/pybinds/BUILD | 21 +- ...wrap_fast_bert_normalizer_model_builder.cc | 2 +- ..._fast_wordpiece_tokenizer_model_builder.cc | 2 +- .../core/pybinds/pywrap_model_converter.cc | 2 +- .../pywrap_phrase_tokenizer_model_builder.cc | 2 +- ...rap_whitespace_tokenizer_config_builder.cc | 5 +- .../core/pybinds/tflite_registrar.cc | 20 +- ...izer_model_lower_case_nfd_strip_accents.fb | Bin 254136 -> 254164 bytes 180 files changed, 668 insertions(+), 32434 deletions(-) delete mode 100644 tensorflow_text/core/kernels/boise_offset_converter.cc delete mode 100644 tensorflow_text/core/kernels/boise_offset_converter_kernel.cc delete mode 100644 tensorflow_text/core/kernels/boise_offset_converter_test.cc delete mode 100644 tensorflow_text/core/kernels/byte_splitter.cc delete mode 100644 tensorflow_text/core/kernels/byte_splitter_kernel.cc delete mode 100644 tensorflow_text/core/kernels/byte_splitter_test.cc delete mode 100644 tensorflow_text/core/kernels/byte_splitter_tflite.cc delete mode 100644 tensorflow_text/core/kernels/constrained_sequence.cc delete mode 100644 tensorflow_text/core/kernels/constrained_sequence_kernel.cc delete mode 100644 tensorflow_text/core/kernels/constrained_sequence_kernel_input_validation_test.cc delete mode 100644 tensorflow_text/core/kernels/darts_clone_trie_builder.cc delete mode 100644 tensorflow_text/core/kernels/darts_clone_trie_test.cc delete mode 100644 tensorflow_text/core/kernels/disjoint_set_forest_test.cc delete mode 100644 tensorflow_text/core/kernels/edit_changes.proto delete mode 100644 tensorflow_text/core/kernels/exp_greedy_constrained_sequence_kernel_test.cc delete mode 100644 tensorflow_text/core/kernels/exp_viterbi_constrained_sequence_kernel_test.cc delete mode 100644 tensorflow_text/core/kernels/fast_bert_normalizer_model.fbs delete mode 100644 tensorflow_text/core/kernels/fast_bert_normalizer_model_builder.cc rename tensorflow_text/core/kernels/{fast_bert_normalizer_tf_kernel.cc => fast_bert_normalizer_model_generated.h} (63%) delete mode 100644 tensorflow_text/core/kernels/fast_bert_normalizer_test.cc delete mode 100644 tensorflow_text/core/kernels/fast_bert_normalizer_tflite.cc delete mode 100644 tensorflow_text/core/kernels/fast_wordpiece_tokenizer.cc delete mode 100644 tensorflow_text/core/kernels/fast_wordpiece_tokenizer_kernel.cc delete mode 100644 tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model.fbs delete mode 100644 tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.cc rename tensorflow_text/core/kernels/{sentence_fragmenter_v2_kernel.cc => fast_wordpiece_tokenizer_model_generated.h} (60%) delete mode 100644 tensorflow_text/core/kernels/fast_wordpiece_tokenizer_test.cc delete mode 100644 tensorflow_text/core/kernels/fast_wordpiece_tokenizer_tflite.cc delete mode 100644 tensorflow_text/core/kernels/fast_wordpiece_tokenizer_utils_test.cc delete mode 100644 tensorflow_text/core/kernels/log_greedy_constrained_sequence_kernel_test.cc delete mode 100644 tensorflow_text/core/kernels/log_viterbi_constrained_sequence_kernel_test.cc delete mode 100644 tensorflow_text/core/kernels/mst_op_kernels.cc delete mode 100644 tensorflow_text/core/kernels/mst_solver_random_comparison_test.cc delete mode 100644 tensorflow_text/core/kernels/mst_solver_test.cc delete mode 100644 tensorflow_text/core/kernels/ngrams_kernel.cc delete mode 100644 tensorflow_text/core/kernels/ngrams_kernel_test.cc delete mode 100644 tensorflow_text/core/kernels/ngrams_tflite.cc delete mode 100644 tensorflow_text/core/kernels/ngrams_tflite_test.cc delete mode 100644 tensorflow_text/core/kernels/normalize_kernels.cc delete mode 100644 tensorflow_text/core/kernels/normalize_kernels_test.cc delete mode 100644 tensorflow_text/core/kernels/phrase_tokenizer.cc delete mode 100644 tensorflow_text/core/kernels/phrase_tokenizer_kernel.cc delete mode 100644 tensorflow_text/core/kernels/phrase_tokenizer_model.fbs delete mode 100644 tensorflow_text/core/kernels/phrase_tokenizer_model_builder.cc rename tensorflow_text/core/kernels/{utf8_binarize_kernel.cc => phrase_tokenizer_model_generated.h} (64%) delete mode 100644 tensorflow_text/core/kernels/phrase_tokenizer_test.cc delete mode 100644 tensorflow_text/core/kernels/ragged_tensor_to_tensor_tflite.cc delete mode 100644 tensorflow_text/core/kernels/ragged_tensor_to_tensor_tflite_test.cc delete mode 100644 tensorflow_text/core/kernels/regex_split.cc delete mode 100644 tensorflow_text/core/kernels/regex_split_kernels.cc delete mode 100644 tensorflow_text/core/kernels/regex_split_test.cc delete mode 100644 tensorflow_text/core/kernels/rouge_l_kernel.cc delete mode 100644 tensorflow_text/core/kernels/rouge_l_kernel_test.cc delete mode 100644 tensorflow_text/core/kernels/round_robin_trimmer_kernel.cc delete mode 100644 tensorflow_text/core/kernels/round_robin_trimmer_test.cc delete mode 100644 tensorflow_text/core/kernels/round_robin_trimmer_tflite.cc delete mode 100644 tensorflow_text/core/kernels/sentence_breaking_kernels.cc delete mode 100644 tensorflow_text/core/kernels/sentence_breaking_utils.cc delete mode 100644 tensorflow_text/core/kernels/sentence_breaking_utils_test.cc delete mode 100644 tensorflow_text/core/kernels/sentence_fragmenter.cc delete mode 100644 tensorflow_text/core/kernels/sentence_fragmenter_v2.cc delete mode 100644 tensorflow_text/core/kernels/sentence_fragmenter_v2_test.cc delete mode 100644 tensorflow_text/core/kernels/sentence_fragmenter_v2_tflite.cc delete mode 100644 tensorflow_text/core/kernels/sentencepiece/config.fbs delete mode 100644 tensorflow_text/core/kernels/sentencepiece/decoder_config.fbs delete mode 100644 tensorflow_text/core/kernels/sentencepiece/double_array_trie_builder.cc delete mode 100644 tensorflow_text/core/kernels/sentencepiece/double_array_trie_test.cc delete mode 100644 tensorflow_text/core/kernels/sentencepiece/encoder_config.fbs delete mode 100644 tensorflow_text/core/kernels/sentencepiece/model_converter.cc delete mode 100644 tensorflow_text/core/kernels/sentencepiece/native.bzl delete mode 100644 tensorflow_text/core/kernels/sentencepiece/native.bzl.oss delete mode 100644 tensorflow_text/core/kernels/sentencepiece/optimized_decoder.cc delete mode 100644 tensorflow_text/core/kernels/sentencepiece/optimized_decoder_test.cc delete mode 100644 tensorflow_text/core/kernels/sentencepiece/optimized_encoder.cc delete mode 100644 tensorflow_text/core/kernels/sentencepiece/optimized_encoder_test.cc delete mode 100644 tensorflow_text/core/kernels/sentencepiece/py_tflite_registerer.cc delete mode 100644 tensorflow_text/core/kernels/sentencepiece/sentencepiece_detokenizer_kernel.cc delete mode 100644 tensorflow_text/core/kernels/sentencepiece/sentencepiece_detokenizer_tflite.cc delete mode 100644 tensorflow_text/core/kernels/sentencepiece/sentencepiece_tokenizer_kernel.cc delete mode 100644 tensorflow_text/core/kernels/sentencepiece/sentencepiece_tokenizer_tflite.cc delete mode 100644 tensorflow_text/core/kernels/sentencepiece_kernels.cc delete mode 100644 tensorflow_text/core/kernels/spanning_tree_iterator.cc delete mode 100644 tensorflow_text/core/kernels/spanning_tree_iterator_test.cc delete mode 100644 tensorflow_text/core/kernels/split_merge_tokenize_kernel.cc delete mode 100644 tensorflow_text/core/kernels/string_vocab.cc delete mode 100644 tensorflow_text/core/kernels/text_kernels_test_util.cc delete mode 100644 tensorflow_text/core/kernels/tokenizer_from_logits_kernel.cc delete mode 100644 tensorflow_text/core/kernels/unicode_script_tokenize_kernel.cc delete mode 100644 tensorflow_text/core/kernels/unicode_script_tokenize_kernel_test.cc delete mode 100644 tensorflow_text/core/kernels/utf8_binarize.cc delete mode 100644 tensorflow_text/core/kernels/utf8_binarize_test.cc delete mode 100644 tensorflow_text/core/kernels/utf8_binarize_tflite.cc delete mode 100644 tensorflow_text/core/kernels/whitespace_tokenize_kernel.cc delete mode 100644 tensorflow_text/core/kernels/whitespace_tokenize_kernel_test.cc delete mode 100644 tensorflow_text/core/kernels/whitespace_tokenizer.cc delete mode 100644 tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.cc delete mode 100644 tensorflow_text/core/kernels/whitespace_tokenizer_config_builder_test.cc delete mode 100644 tensorflow_text/core/kernels/whitespace_tokenizer_kernel.cc delete mode 100644 tensorflow_text/core/kernels/whitespace_tokenizer_test.cc delete mode 100644 tensorflow_text/core/kernels/whitespace_tokenizer_tflite.cc delete mode 100644 tensorflow_text/core/kernels/wordpiece_kernel.cc delete mode 100644 tensorflow_text/core/kernels/wordpiece_kernel_test.cc delete mode 100644 tensorflow_text/core/kernels/wordpiece_tokenizer.cc diff --git a/tensorflow_text/BUILD b/tensorflow_text/BUILD index 209285f54..d0331dbb7 100644 --- a/tensorflow_text/BUILD +++ b/tensorflow_text/BUILD @@ -5,11 +5,15 @@ load("//tensorflow_text:tftext.bzl", "extra_py_deps", "if_pywrap", "py_library", # [internal] load build_test.bzl load("//tools/build_defs/license:license.bzl", "license") +load("//tools/build_defs/testing:bzl_library.bzl", "bzl_library") # Visibility rules package( default_applicable_licenses = [":license"], - default_visibility = ["//visibility:public"], + default_visibility = [ + "//visibility:public", + "@org_tensorflow//tensorflow/core/kernels/text:__subpackages__", + ], ) license(name = "license") @@ -1728,3 +1732,18 @@ py_test( "//tensorflow_text/core/pybinds:pybinds_library", ]), ) + +bzl_library( + name = "tftext_bzl", + srcs = ["tftext.bzl"], + parse_tests = False, + visibility = ["//visibility:private"], + deps = [ + "//devtools/build_cleaner/skylark:build_defs_lib", + "//third_party/bazel_rules/rules_python/python:py_library_bzl", + "//third_party/gpus/cuda:build_defs_bzl", + "//third_party/pybind11/google3_utils:build_defs_bzl", + "//third_party/tensorflow:tensorflow_bzl", + "@rules_cc//cc:core_rules", + ], +) diff --git a/tensorflow_text/core/kernels/BUILD b/tensorflow_text/core/kernels/BUILD index f667b6174..95365ee66 100644 --- a/tensorflow_text/core/kernels/BUILD +++ b/tensorflow_text/core/kernels/BUILD @@ -1,210 +1,76 @@ -"""Kernels for tf.text ops.""" +"""Kernels for tf.text ops. +All implementation files moved to //third_party/tensorflow/core/kernels/text. +""" -load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library") load("@rules_cc//cc:cc_library.bzl", "cc_library") -load("@rules_cc//cc:cc_test.bzl", "cc_test") - -# Placeholder: load proto_library -load("//tensorflow_text:tftext.bzl", "tf_cc_library", "tflite_cc_library") -# [internal] load cc_proto_library.bzl licenses(["notice"]) -# Visibility rules package( default_applicable_licenses = ["//tensorflow_text:license"], + default_compatible_with = ["//buildenv/target:non_prod"], default_visibility = ["//visibility:public"], ) exports_files(["LICENSE"]) +# Aliases to relocated targets + cc_library( - name = "boise_offset_converter", - srcs = ["boise_offset_converter.cc"], - hdrs = ["boise_offset_converter.h"], - deps = [ - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - ], + name = "wordpiece_tokenizer", + hdrs = ["wordpiece_tokenizer.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:wordpiece_tokenizer"], ) -cc_test( - name = "boise_offset_converter_test", - size = "small", - srcs = ["boise_offset_converter_test.cc"], - deps = [ - ":boise_offset_converter", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:string_view", - "@com_google_googletest//:gtest_main", - ], +cc_library( + name = "boise_offset_converter", + hdrs = ["boise_offset_converter.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:boise_offset_converter"], ) -tf_cc_library( +cc_library( name = "boise_offset_converter_kernel", - srcs = ["boise_offset_converter_kernel.cc"], hdrs = ["boise_offset_converter_kernel.h"], - tf_deps = [ - # tf:framework tensorflow dep, - ], - deps = [ - ":boise_offset_converter_kernel_template", - # lite/kernels/shim:tf_op_shim tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:boise_offset_converter_kernel"], ) -tf_cc_library( +cc_library( name = "boise_offset_converter_kernel_template", hdrs = ["boise_offset_converter_kernel_template.h"], - tf_deps = [ - # tf/platform:tstring tensorflow dep, - ], - deps = [ - ":boise_offset_converter", - "@com_google_absl//absl/status", - # lite/kernels/shim:op_kernel tensorflow dep, - # lite/kernels/shim:shape tensorflow dep, - # lite/kernels/shim:status_macros tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:boise_offset_converter_kernel_template"], ) cc_library( name = "byte_splitter", - srcs = ["byte_splitter.cc"], hdrs = ["byte_splitter.h"], - deps = [ - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:byte_splitter"], ) -cc_test( - name = "byte_splitter_test", - size = "small", - srcs = ["byte_splitter_test.cc"], - deps = [ - ":byte_splitter", - "@com_google_googletest//:gtest_main", - ], -) - -tf_cc_library( +cc_library( name = "byte_splitter_kernel", - srcs = ["byte_splitter_kernel.cc"], hdrs = ["byte_splitter_kernel.h"], - tf_deps = [ - # tf:framework tensorflow dep, - ], - deps = [ - ":byte_splitter_kernel_template", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - # lite/kernels/shim:tf_op_shim tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:byte_splitter_kernel"], ) -tf_cc_library( +cc_library( name = "byte_splitter_kernel_template", hdrs = ["byte_splitter_kernel_template.h"], - tf_deps = [ - # tf/platform:tstring tensorflow dep, - ], - deps = [ - ":byte_splitter", - "@com_google_absl//absl/status", - # lite/kernels/shim:op_kernel tensorflow dep, - # lite/kernels/shim:shape tensorflow dep, - # lite/kernels/shim:status_macros tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:byte_splitter_kernel_template"], ) -tflite_cc_library( +cc_library( name = "byte_splitter_tflite", - srcs = ["byte_splitter_tflite.cc"], hdrs = ["byte_splitter_tflite.h"], - deps = [ - ":byte_splitter_kernel_template", - # lite:mutable_op_resolver tensorflow dep, - # lite/c:common tensorflow dep, - # lite/kernels/shim:tflite_op_shim tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:byte_splitter_tflite"], ) -tf_cc_library( +cc_library( name = "constrained_sequence", - srcs = ["constrained_sequence.cc"], hdrs = ["constrained_sequence.h"], - tf_deps = [ - # tf:framework tensorflow dep, - # tf:lib tensorflow dep, - # tf:protos_all_cc tensorflow dep, - ], -) - -tf_cc_library( - name = "constrained_sequence_kernel", - srcs = ["constrained_sequence_kernel.cc"], - tf_deps = [ - # tf:framework tensorflow dep, - # tf:lib tensorflow dep, - # tf:protos_all_cc tensorflow dep, - ], - deps = [ - ":constrained_sequence", - "@com_google_absl//absl/base:core_headers", - ], -) - -cc_test( - name = "constrained_sequence_kernel_input_validation_test", - srcs = ["constrained_sequence_kernel_input_validation_test.cc"], - deps = [ - ":text_kernels_test_util", - "@com_google_googletest//:gtest_main", - # tf:framework tensorflow dep, - # tf:lib tensorflow dep, - # tf:protos_all_cc tensorflow dep, - # tf:test tensorflow dep, - # tf:testlib tensorflow dep, - # tf/kernels:ops_testutil tensorflow dep, - "//tensorflow_text:constrained_sequence_op_cc", - ], -) - -cc_test( - name = "exp_greedy_constrained_sequence_kernel_test", - srcs = ["exp_greedy_constrained_sequence_kernel_test.cc"], - deps = [ - ":text_kernels_test_util", - "@com_google_googletest//:gtest_main", - # tf:framework tensorflow dep, - # tf:lib tensorflow dep, - # tf:protos_all_cc tensorflow dep, - # tf:test tensorflow dep, - # tf:testlib tensorflow dep, - # tf/kernels:ops_testutil tensorflow dep, - "//tensorflow_text:constrained_sequence_op_cc", - ], -) - -cc_test( - name = "exp_viterbi_constrained_sequence_kernel_test", - srcs = ["exp_viterbi_constrained_sequence_kernel_test.cc"], - deps = [ - ":text_kernels_test_util", - "@com_google_googletest//:gtest_main", - # tf:framework tensorflow dep, - # tf:lib tensorflow dep, - # tf:protos_all_cc tensorflow dep, - # tf:test tensorflow dep, - # tf:testlib tensorflow dep, - # tf/kernels:ops_testutil tensorflow dep, - "//tensorflow_text:constrained_sequence_op_cc", - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:constrained_sequence"], ) -tf_cc_library( +cc_library( name = "fast_bert_normalizer", hdrs = ["fast_bert_normalizer.h"], deps = [ @@ -216,831 +82,224 @@ tf_cc_library( "@com_google_absl//absl/status", "@com_google_absl//absl/strings", "@icu//:common", + "@org_tensorflow//tensorflow/core/kernels/text:fast_bert_normalizer", # lite/kernels/shim:status_macros tensorflow dep, ], ) -flatbuffer_cc_library( +cc_library( name = "fast_bert_normalizer_model", - srcs = [ - "fast_bert_normalizer_model.fbs", - ], + hdrs = ["fast_bert_normalizer_model_generated.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:fast_bert_normalizer_model"], ) -tf_cc_library( +cc_library( name = "fast_bert_normalizer_model_builder", - srcs = ["fast_bert_normalizer_model_builder.cc"], hdrs = ["fast_bert_normalizer_model_builder.h"], - tf_deps = [ - # tf:lib tensorflow dep, - ], - deps = [ - ":darts_clone_trie_builder", - ":fast_bert_normalizer", - ":fast_bert_normalizer_model", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@icu//:common", - "@icu//:nfkc_cf", # Needed for NFKC_Casefold Unicode Normalization form. - "@com_googlesource_code_re2//:re2", - # lite/kernels/shim:status_macros tensorflow dep, - ], -) - -cc_test( - name = "fast_bert_normalizer_test", - size = "small", - srcs = ["fast_bert_normalizer_test.cc"], - deps = [ - ":fast_bert_normalizer", - ":fast_bert_normalizer_model_builder", - "@com_google_googletest//:gtest_main", - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:fast_bert_normalizer_model_builder"], ) cc_library( name = "fast_bert_normalizer_kernel_template", hdrs = ["fast_bert_normalizer_kernel_template.h"], - deps = [ - ":fast_bert_normalizer", - "@com_google_absl//absl/status", - "@com_google_absl//absl/strings", - # lite/kernels/shim:op_kernel tensorflow dep, - # lite/kernels/shim:status_macros tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:fast_bert_normalizer_kernel_template"], ) -tf_cc_library( +cc_library( name = "fast_bert_normalizer_tf_kernel", - srcs = ["fast_bert_normalizer_tf_kernel.cc"], hdrs = ["fast_bert_normalizer_tf_kernel.h"], - tf_deps = [ - # tf:framework tensorflow dep, - ], - deps = [ - ":fast_bert_normalizer_kernel_template", - # lite/kernels/shim:tf_op_shim tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:fast_bert_normalizer_tf_kernel"], ) -tflite_cc_library( +cc_library( name = "fast_bert_normalizer_tflite", - srcs = ["fast_bert_normalizer_tflite.cc"], hdrs = ["fast_bert_normalizer_tflite.h"], - deps = [ - ":fast_bert_normalizer_kernel_template", - # lite:mutable_op_resolver tensorflow dep, - # lite/c:common tensorflow dep, - # lite/kernels/shim:tflite_op_shim tensorflow dep, - ], -) - -cc_test( - name = "log_greedy_constrained_sequence_kernel_test", - srcs = ["log_greedy_constrained_sequence_kernel_test.cc"], - deps = [ - ":text_kernels_test_util", - "@com_google_googletest//:gtest_main", - # tf:framework tensorflow dep, - # tf:lib tensorflow dep, - # tf:protos_all_cc tensorflow dep, - # tf:test tensorflow dep, - # tf:testlib tensorflow dep, - # tf/kernels:ops_testutil tensorflow dep, - "//tensorflow_text:constrained_sequence_op_cc", - ], -) - -cc_test( - name = "log_viterbi_constrained_sequence_kernel_test", - srcs = ["log_viterbi_constrained_sequence_kernel_test.cc"], - deps = [ - ":text_kernels_test_util", - "@com_google_googletest//:gtest_main", - # tf:framework tensorflow dep, - # tf:lib tensorflow dep, - # tf:protos_all_cc tensorflow dep, - # tf:test tensorflow dep, - # tf:testlib tensorflow dep, - # tf/kernels:ops_testutil tensorflow dep, - "//tensorflow_text:constrained_sequence_op_cc", - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:fast_bert_normalizer_tflite"], ) cc_library( name = "darts_clone_trie_builder", - srcs = [ - "darts_clone_trie_builder.cc", - ], - hdrs = [ - "darts_clone_trie_builder.h", - ], - deps = [ - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@darts_clone", - ], + hdrs = ["darts_clone_trie_builder.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:darts_clone_trie_builder"], ) cc_library( name = "darts_clone_trie_wrapper", - hdrs = [ - "darts_clone_trie_wrapper.h", - ], - deps = [ - "@com_google_absl//absl/status:statusor", - ], -) - -cc_test( - name = "darts_clone_trie_test", - size = "small", - srcs = ["darts_clone_trie_test.cc"], - deps = [ - ":darts_clone_trie_builder", - ":darts_clone_trie_wrapper", - "@com_google_absl//absl/status", - "@com_google_googletest//:gtest_main", - ], + hdrs = ["darts_clone_trie_wrapper.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:darts_clone_trie_wrapper"], ) -tf_cc_library( +cc_library( name = "disjoint_set_forest", hdrs = ["disjoint_set_forest.h"], - tf_deps = [ - # tf:lib tensorflow dep, - ], -) - -cc_test( - name = "disjoint_set_forest_test", - size = "small", - srcs = ["disjoint_set_forest_test.cc"], - deps = [ - ":disjoint_set_forest", - "@com_google_googletest//:gtest_main", - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:disjoint_set_forest"], ) -tf_cc_library( +cc_library( name = "fast_wordpiece_tokenizer", - srcs = ["fast_wordpiece_tokenizer.cc"], - hdrs = [ - "fast_wordpiece_tokenizer.h", - ], - deps = [ - ":darts_clone_trie_wrapper", - ":fast_wordpiece_tokenizer_model", - ":fast_wordpiece_tokenizer_utils", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@icu//:nfkc", - # lite/kernels/shim:status_macros tensorflow dep, - ], -) - -cc_test( - name = "fast_wordpiece_tokenizer_test", - srcs = ["fast_wordpiece_tokenizer_test.cc"], - data = [ - "//tensorflow_text:python/ops/test_data/fast_wordpiece_tokenizer_model.fb", - "//tensorflow_text:python/ops/test_data/fast_wordpiece_tokenizer_model_ver_15_1.fb", - "//tensorflow_text:python/ops/test_data/fast_wordpiece_tokenizer_model_ver_16_0.fb", - ], - deps = [ - ":fast_wordpiece_tokenizer", - ":fast_wordpiece_tokenizer_model_builder", - "@com_google_googletest//:gtest_main", - "@com_google_absl//absl/flags:flag", - "@icu//:headers", - # tf:lib tensorflow dep, - ], + hdrs = ["fast_wordpiece_tokenizer.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:fast_wordpiece_tokenizer"], ) -flatbuffer_cc_library( +cc_library( name = "fast_wordpiece_tokenizer_model", - srcs = [ - "fast_wordpiece_tokenizer_model.fbs", - ], + hdrs = ["fast_wordpiece_tokenizer_model_generated.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:fast_wordpiece_tokenizer_model"], ) -flatbuffer_cc_library( - name = "phrase_tokenizer_model", - srcs = [ - "phrase_tokenizer_model.fbs", - ], +cc_library( + name = "fast_wordpiece_tokenizer_model_builder", + hdrs = ["fast_wordpiece_tokenizer_model_builder.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:fast_wordpiece_tokenizer_model_builder"], ) -tf_cc_library( - name = "fast_wordpiece_tokenizer_model_builder", - srcs = ["fast_wordpiece_tokenizer_model_builder.cc"], - hdrs = [ - "fast_wordpiece_tokenizer_model_builder.h", - ], - deps = [ - ":darts_clone_trie_builder", - ":darts_clone_trie_wrapper", - ":fast_wordpiece_tokenizer_model", - ":fast_wordpiece_tokenizer_utils", - ":sentence_fragmenter_v2", - ":string_vocab", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@cppitertools", - "@icu//:nfkc", - # lite/kernels/shim:status_macros tensorflow dep, - ], +cc_library( + name = "phrase_tokenizer_model", + hdrs = ["phrase_tokenizer_model_generated.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:phrase_tokenizer_model"], ) -tf_cc_library( +cc_library( name = "phrase_tokenizer_model_builder", - srcs = ["phrase_tokenizer_model_builder.cc"], - hdrs = [ - "phrase_tokenizer_model_builder.h", - ], - deps = [ - ":darts_clone_trie_builder", - ":darts_clone_trie_wrapper", - ":fast_wordpiece_tokenizer_utils", - ":phrase_tokenizer_model", - ":sentence_fragmenter_v2", - ":string_vocab", - ":whitespace_tokenizer_config_builder", - ":wordpiece_tokenizer", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@icu//:nfkc", - # lite/kernels/shim:status_macros tensorflow dep, - "//tensorflow_text/core/kernels/sentencepiece:double_array_trie_builder", - ], + hdrs = ["phrase_tokenizer_model_builder.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:phrase_tokenizer_model_builder"], ) -tf_cc_library( +cc_library( name = "fast_wordpiece_tokenizer_kernel", - srcs = ["fast_wordpiece_tokenizer_kernel.cc"], hdrs = ["fast_wordpiece_tokenizer_kernel.h"], - tf_deps = [ - # tf:framework tensorflow dep, - ], - deps = [ - ":fast_wordpiece_tokenizer_kernel_template", - # lite/kernels/shim:tf_op_shim tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:fast_wordpiece_tokenizer_kernel"], ) cc_library( name = "fast_wordpiece_tokenizer_kernel_template", hdrs = ["fast_wordpiece_tokenizer_kernel_template.h"], - deps = [ - ":fast_wordpiece_tokenizer", - "@com_google_absl//absl/status", - "@com_google_absl//absl/strings", - # lite/kernels/shim:op_kernel tensorflow dep, - # lite/kernels/shim:status_macros tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:fast_wordpiece_tokenizer_kernel_template"], ) -tflite_cc_library( +cc_library( name = "fast_wordpiece_tokenizer_tflite", - srcs = ["fast_wordpiece_tokenizer_tflite.cc"], hdrs = ["fast_wordpiece_tokenizer_tflite.h"], - deps = [ - ":fast_wordpiece_tokenizer_kernel_template", - # lite:mutable_op_resolver tensorflow dep, - # lite/c:common tensorflow dep, - # lite/kernels/shim:tflite_op_shim tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:fast_wordpiece_tokenizer_tflite"], ) cc_library( name = "fast_wordpiece_tokenizer_utils", - hdrs = [ - "fast_wordpiece_tokenizer_utils.h", - ], - deps = [ - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@icu//:nfkc", - ], -) - -cc_test( - name = "fast_wordpiece_tokenizer_utils_test", - srcs = ["fast_wordpiece_tokenizer_utils_test.cc"], - deps = [ - ":fast_wordpiece_tokenizer_utils", - "@com_google_googletest//:gtest_main", - ], -) - -tf_cc_library( - name = "mst_op_kernels", - srcs = ["mst_op_kernels.cc"], - tf_deps = [ - # tf:framework tensorflow dep, - # tf:lib tensorflow dep, - ], - deps = [ - ":mst_solver", - ], + hdrs = ["fast_wordpiece_tokenizer_utils.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:fast_wordpiece_tokenizer_utils"], ) -tf_cc_library( +cc_library( name = "mst_solver", hdrs = ["mst_solver.h"], - tf_deps = [ - # tf:lib tensorflow dep, - ], - deps = [ - ":disjoint_set_forest", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:span", - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:mst_solver"], ) -cc_test( - name = "mst_solver_test", - size = "small", - srcs = ["mst_solver_test.cc"], - deps = [ - ":mst_solver", - "@com_google_googletest//:gtest", - "@com_google_googletest//:gtest_main", - # tf:test tensorflow dep, - ], +cc_library( + name = "ngrams_kernel_template", + hdrs = ["ngrams_kernel_template.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:ngrams_kernel_template"], ) -cc_test( - name = "mst_solver_random_comparison_test", - size = "small", - timeout = "long", - srcs = ["mst_solver_random_comparison_test.cc"], - tags = [ - "nofastbuild", # exclude from non-opt TAP projects - "optonly", # exclude from non-opt TAP projects - ], - deps = [ - ":mst_solver", - ":spanning_tree_iterator", - "@com_google_googletest//:gtest", # google-only - "@com_google_googletest//:gtest_main", - "@com_google_absl//absl/flags:flag", - # tf:lib tensorflow dep, - ], +cc_library( + name = "ngrams_kernel", + hdrs = ["ngrams_kernel.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:ngrams_kernel"], ) -proto_library( - name = "edit_changes_proto", - srcs = ["edit_changes.proto"], +cc_library( + name = "ngrams_tflite", + hdrs = ["ngrams_tflite.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:ngrams_tflite"], ) -cc_proto_library( - name = "edit_changes_cc_proto", - deps = [":edit_changes_proto"], +cc_library( + name = "ragged_tensor_to_tensor_tflite", + hdrs = ["ragged_tensor_to_tensor_tflite.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:ragged_tensor_to_tensor_tflite"], ) -tf_cc_library( - name = "ngrams_kernel_template", - hdrs = ["ngrams_kernel_template.h"], - tf_deps = [ - # tf/platform:tstring tensorflow dep, - ], - deps = [ - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - # lite/kernels/shim:op_kernel tensorflow dep, - # lite/kernels/shim:status_macros tensorflow dep, - # lite/kernels/shim:tensor_view tensorflow dep, - ], -) - -tf_cc_library( - name = "ngrams_kernel", - srcs = ["ngrams_kernel.cc"], - hdrs = ["ngrams_kernel.h"], - tf_deps = [ - # tf:framework tensorflow dep, - ], - deps = [ - ":ngrams_kernel_template", - # lite/kernels/shim:tf_op_shim tensorflow dep, - ], -) - -cc_test( - name = "ngrams_kernel_test", - srcs = ["ngrams_kernel_test.cc"], - deps = [ - # tf:framework tensorflow dep, - # tf:test tensorflow dep, - # tf:test_main tensorflow dep, - # tf/framework:shape_inference_testutil tensorflow dep, - # tf/framework:tensor_testutil tensorflow dep, - "//tensorflow_text:ngrams_op_cc", - ], -) - -tflite_cc_library( - name = "ngrams_tflite", - srcs = ["ngrams_tflite.cc"], - hdrs = ["ngrams_tflite.h"], - deps = [ - ":ngrams_kernel_template", - # lite:mutable_op_resolver tensorflow dep, - # lite/c:common tensorflow dep, - # lite/kernels/shim:tflite_op_shim tensorflow dep, - ], -) - -cc_test( - name = "ngrams_tflite_test", - srcs = ["ngrams_tflite_test.cc"], - deps = [ - ":ngrams_tflite", - "@com_google_googletest//:gtest_main", - "@flatbuffers", - # lite:string_util tensorflow dep, - # lite/c:common tensorflow dep, - # lite/kernels:test_util tensorflow dep, - # lite/schema:schema_fbs tensorflow dep, - ], -) - -tf_cc_library( - name = "normalize_kernels", - srcs = ["normalize_kernels.cc"], - tf_deps = [ - # tf:framework tensorflow dep, - ], - deps = [ - ":edit_changes_cc_proto", - "@com_google_absl//absl/strings", - "@icu//:nfkc", - "@icu//:nfkc_cf", - ], -) - -tflite_cc_library( - name = "ragged_tensor_to_tensor_tflite", - srcs = ["ragged_tensor_to_tensor_tflite.cc"], - hdrs = ["ragged_tensor_to_tensor_tflite.h"], - deps = [ - "@flatbuffers", - # tf/util:ragged_to_dense_util_common tensorflow dep, - # lite:framework tensorflow dep, - # lite/c:common tensorflow dep, - # lite/kernels:kernel_util tensorflow dep, - # lite/kernels/internal:tensor tensorflow dep, - # lite/kernels/internal:types tensorflow dep, - ], -) - -cc_test( - name = "ragged_tensor_to_tensor_tflite_test", - srcs = ["ragged_tensor_to_tensor_tflite_test.cc"], - deps = [ - ":ragged_tensor_to_tensor_tflite", - "@com_google_googletest//:gtest_main", - "@flatbuffers", - # lite:framework tensorflow dep, - # lite/c:common tensorflow dep, - # lite/kernels:test_util tensorflow dep, - # lite/kernels/internal:tensor tensorflow dep, - # lite/schema:schema_fbs tensorflow dep, - ], -) - -tf_cc_library( +cc_library( name = "regex_split", - srcs = ["regex_split.cc"], hdrs = ["regex_split.h"], - deps = [ - "@com_google_absl//absl/strings", - "@com_googlesource_code_re2//:re2", - ], -) - -tf_cc_library( - name = "regex_split_kernels", - srcs = ["regex_split_kernels.cc"], - tf_deps = [ - # tf:framework tensorflow dep, - # tf:lib tensorflow dep, - ], - deps = [ - ":regex_split", - "@com_google_absl//absl/memory", - ], -) - -cc_test( - name = "regex_split_test", - srcs = ["regex_split_test.cc"], - deps = [ - ":regex_split", - "@com_google_googletest//:gtest_main", - "@com_google_absl//absl/strings", - "@com_googlesource_code_re2//:re2", - # tf:lib tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:regex_split"], ) cc_library( name = "round_robin_trimmer", hdrs = ["round_robin_trimmer.h"], - deps = [ - ":trimmer", - "@com_google_absl//absl/types:span", - ], -) - -cc_test( - name = "round_robin_trimmer_test", - size = "small", - srcs = ["round_robin_trimmer_test.cc"], - deps = [ - ":round_robin_trimmer", - "@com_google_googletest//:gtest_main", - "@com_google_absl//absl/flags:flag", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - # tf:lib tensorflow dep, - # tf:test_main tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:round_robin_trimmer"], ) -tf_cc_library( +cc_library( name = "round_robin_trimmer_kernel", - srcs = ["round_robin_trimmer_kernel.cc"], hdrs = ["round_robin_trimmer_kernel.h"], - tf_deps = [ - # tf:framework tensorflow dep, - ], - deps = [ - ":round_robin_trimmer_kernel_template", - # lite/kernels/shim:op_kernel tensorflow dep, - # lite/kernels/shim:tf_op_shim tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:round_robin_trimmer_kernel"], ) -tf_cc_library( +cc_library( name = "round_robin_trimmer_kernel_template", hdrs = ["round_robin_trimmer_kernel_template.h"], - tf_deps = [ - # tf:framework tensorflow dep, - ], - deps = [ - ":round_robin_trimmer", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/types:span", - # lite/kernels/shim:op_kernel tensorflow dep, - # lite/kernels/shim:shape tensorflow dep, - # lite/kernels/shim:tensor_view tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:round_robin_trimmer_kernel_template"], ) -tflite_cc_library( +cc_library( name = "round_robin_trimmer_tflite", - srcs = ["round_robin_trimmer_tflite.cc"], hdrs = ["round_robin_trimmer_tflite.h"], - deps = [ - ":round_robin_trimmer_kernel_template", - # lite:mutable_op_resolver tensorflow dep, - # lite/c:common tensorflow dep, - # lite/kernels/shim:op_kernel tensorflow dep, - # lite/kernels/shim:tflite_op_shim tensorflow dep, - # lite/kernels/shim:tflite_op_wrapper tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:round_robin_trimmer_tflite"], ) -tf_cc_library( - name = "rouge_l_kernel", - srcs = ["rouge_l_kernel.cc"], - tf_deps = [ - # tf:framework tensorflow dep, - # tf:lib tensorflow dep, - ], - deps = [ - "@com_google_absl//absl/status", - "@com_google_absl//absl/strings", - ], -) - -cc_test( - name = "rouge_l_kernel_test", - size = "small", - srcs = ["rouge_l_kernel_test.cc"], - deps = [ - ":rouge_l_kernel", - # tf:framework tensorflow dep, - # tf:test tensorflow dep, - # tf:test_main tensorflow dep, - # tf:testlib tensorflow dep, - # tf/kernels:ops_testutil tensorflow dep, - "//tensorflow_text:text_similarity_metric_ops_cc", - ], -) - -tf_cc_library( - name = "sentence_breaking_kernels", - srcs = ["sentence_breaking_kernels.cc"], - tf_deps = [ - # tf:framework tensorflow dep, - ], - deps = [ - ":sentence_breaking_utils", - ":sentence_fragmenter", - "@com_google_absl//absl/strings", - "@icu//:common", - ], -) - -tf_cc_library( +cc_library( name = "sentence_breaking_utils", - srcs = ["sentence_breaking_utils.cc"], hdrs = ["sentence_breaking_utils.h"], - tf_deps = [ - # tf:lib tensorflow dep, - ], - deps = [ - "@com_google_absl//absl/strings", - "@icu//:common", - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:sentence_breaking_utils"], ) -cc_test( - name = "sentence_breaking_utils_test", - size = "small", - srcs = ["sentence_breaking_utils_test.cc"], - deps = [ - ":sentence_breaking_utils", - "@com_google_googletest//:gtest", - "@com_google_googletest//:gtest_main", - "@icu//:common", - ], -) - -tf_cc_library( +cc_library( name = "sentence_fragmenter", - srcs = ["sentence_fragmenter.cc"], hdrs = ["sentence_fragmenter.h"], - tf_deps = [ - # tf:lib tensorflow dep, - ], - deps = [ - ":sentence_breaking_utils", - "@com_google_absl//absl/status", - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:sentence_fragmenter"], ) -tf_cc_library( +cc_library( name = "sentence_fragmenter_v2", - srcs = ["sentence_fragmenter_v2.cc"], hdrs = ["sentence_fragmenter_v2.h"], - tf_deps = [ - # tf:lib tensorflow dep, - ], - deps = [ - "@com_google_absl//absl/status", - "@com_google_absl//absl/strings", - "@icu//:common", - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:sentence_fragmenter_v2"], ) -cc_test( - name = "sentence_fragmenter_v2_test", - srcs = ["sentence_fragmenter_v2_test.cc"], - deps = [ - ":sentence_fragmenter_v2", - "@com_google_absl//absl/strings", - "@com_google_googletest//:gtest_main", - "@icu//:common", - ], -) - -tf_cc_library( +cc_library( name = "sentence_fragmenter_v2_kernel", - srcs = ["sentence_fragmenter_v2_kernel.cc"], hdrs = ["sentence_fragmenter_v2_kernel.h"], - tf_deps = [ - # tf:framework tensorflow dep, - ], - deps = [ - ":sentence_fragmenter_v2_kernel_template", - # lite/kernels/shim:tf_op_shim tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:sentence_fragmenter_v2_kernel"], ) -tf_cc_library( +cc_library( name = "sentence_fragmenter_v2_kernel_template", hdrs = ["sentence_fragmenter_v2_kernel_template.h"], - tf_deps = [ - # tf:framework tensorflow dep, - ], - deps = [ - ":sentence_fragmenter_v2", - "@com_google_absl//absl/status", - # lite/kernels/shim:op_kernel tensorflow dep, - # lite/kernels/shim:shape tensorflow dep, - # lite/kernels/shim:status_macros tensorflow dep, - # lite/kernels/shim:tensor_view tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:sentence_fragmenter_v2_kernel_template"], ) -tflite_cc_library( +cc_library( name = "sentence_fragmenter_v2_tflite", - srcs = ["sentence_fragmenter_v2_tflite.cc"], hdrs = ["sentence_fragmenter_v2_tflite.h"], - deps = [ - ":sentence_fragmenter_v2_kernel_template", - # lite:mutable_op_resolver tensorflow dep, - # lite/c:common tensorflow dep, - # lite/kernels/shim:tflite_op_shim tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:sentence_fragmenter_v2_tflite"], ) -tf_cc_library( - name = "sentencepiece_kernels", - srcs = ["sentencepiece_kernels.cc"], - tf_deps = [ - # tf:framework tensorflow dep, - # tf:framework_headers_lib tensorflow dep, - # tf:lib tensorflow dep, - # tf:protos_all_cc tensorflow dep, - ], - deps = [ - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/meta:type_traits", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:span", - "@com_google_sentencepiece//:sentencepiece_cc_proto", - "@com_google_sentencepiece//:sentencepiece_model_cc_proto", - "@com_google_sentencepiece//:sentencepiece_processor", - ], -) - -tf_cc_library( +cc_library( name = "spanning_tree_iterator", testonly = 1, - srcs = ["spanning_tree_iterator.cc"], hdrs = ["spanning_tree_iterator.h"], - tf_deps = [ - # tf:lib tensorflow dep, - ], -) - -cc_test( - name = "spanning_tree_iterator_test", - size = "small", - srcs = ["spanning_tree_iterator_test.cc"], - deps = [ - ":spanning_tree_iterator", - "@com_google_googletest//:gtest_main", - # tf:lib tensorflow dep, - ], -) - -tf_cc_library( - name = "split_merge_tokenize_kernel", - srcs = ["split_merge_tokenize_kernel.cc"], - tf_deps = [ - # tf:framework tensorflow dep, - # tf:lib tensorflow dep, - ], - deps = [ - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/status", - "@com_google_absl//absl/strings", - "@icu//:common", - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:spanning_tree_iterator"], ) cc_library( name = "text_kernels_test_util", testonly = 1, - srcs = ["text_kernels_test_util.cc"], hdrs = ["text_kernels_test_util.h"], - deps = [ - "@com_google_googletest//:gtest", - # tf:framework tensorflow dep, - # tf:testlib tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:text_kernels_test_util"], ) -tflite_cc_library( +cc_library( name = "tflite_ops", hdrs = [ "byte_splitter_tflite.h", @@ -1052,204 +311,54 @@ tflite_cc_library( "sentence_fragmenter_v2_tflite.h", "utf8_binarize_tflite.h", "whitespace_tokenizer_tflite.h", - "//tensorflow_text/core/kernels/sentencepiece:sp_headers", - ], - deps = [ - ":byte_splitter_tflite", - ":fast_bert_normalizer_tflite", - ":fast_wordpiece_tokenizer_tflite", - ":ngrams_tflite", - ":ragged_tensor_to_tensor_tflite", - ":round_robin_trimmer_tflite", - ":sentence_fragmenter_v2_tflite", - ":utf8_binarize_tflite", - ":whitespace_tokenizer_tflite", - # lite:mutable_op_resolver tensorflow dep, - # lite/c:common tensorflow dep, - "//tensorflow_text/core/kernels/sentencepiece:py_tflite_registerer", - ], -) - -tf_cc_library( - name = "tokenizer_from_logits_kernel", - srcs = ["tokenizer_from_logits_kernel.cc"], - tf_deps = [ - # tf:framework tensorflow dep, - # tf:lib tensorflow dep, - ], - deps = [ - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/strings", - "@icu//:common", ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:tflite_ops"], ) cc_library( name = "trimmer", hdrs = ["trimmer.h"], - deps = [ - "@com_google_absl//absl/types:span", - ], -) - -tf_cc_library( - name = "unicode_script_tokenize_kernel", - srcs = ["unicode_script_tokenize_kernel.cc"], - tf_deps = [ - # tf:framework tensorflow dep, - # tf:lib tensorflow dep, - ], - deps = [ - "@icu//:common", - ], -) - -cc_test( - name = "unicode_script_tokenize_kernel_test", - srcs = ["unicode_script_tokenize_kernel_test.cc"], - deps = [ - ":text_kernels_test_util", - "@com_google_googletest//:gtest_main", - # tf:framework tensorflow dep, - # tf:lib tensorflow dep, - # tf:test tensorflow dep, - # tf:testlib tensorflow dep, - # tf/kernels:ops_testutil tensorflow dep, - "//tensorflow_text:unicode_script_tokenizer_cc", - ], -) - -tf_cc_library( - name = "whitespace_tokenize_kernel", - srcs = ["whitespace_tokenize_kernel.cc"], - tf_deps = [ - # tf:framework tensorflow dep, - # tf:lib tensorflow dep, - ], - deps = [ - "@icu//:common", - ], -) - -cc_test( - name = "whitespace_tokenize_kernel_test", - srcs = ["whitespace_tokenize_kernel_test.cc"], - deps = [ - ":text_kernels_test_util", - "@com_google_googletest//:gtest_main", - # tf:framework tensorflow dep, - # tf:lib tensorflow dep, - # tf:test tensorflow dep, - # tf:testlib tensorflow dep, - # tf/kernels:ops_testutil tensorflow dep, - "//tensorflow_text:whitespace_tokenizer_cc", - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:trimmer"], ) cc_library( name = "whitespace_tokenizer", - srcs = ["whitespace_tokenizer.cc"], hdrs = ["whitespace_tokenizer.h"], - deps = [ - "@com_google_absl//absl/strings", - "@icu//:common", - ], -) - -cc_test( - name = "whitespace_tokenizer_test", - size = "small", - srcs = ["whitespace_tokenizer_test.cc"], - deps = [ - ":whitespace_tokenizer", - ":whitespace_tokenizer_config_builder", - "@com_google_googletest//:gtest_main", - "@com_google_absl//absl/flags:flag", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - # tf:lib tensorflow dep, - # tf:test_main tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:whitespace_tokenizer"], ) -tf_cc_library( +cc_library( name = "whitespace_tokenizer_kernel", - srcs = ["whitespace_tokenizer_kernel.cc"], hdrs = ["whitespace_tokenizer_kernel.h"], - tf_deps = [ - # tf:framework tensorflow dep, - ], - deps = [ - ":whitespace_tokenizer_kernel_template", - # lite/kernels/shim:op_kernel tensorflow dep, - # lite/kernels/shim:tf_op_shim tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:whitespace_tokenizer_kernel"], ) -tf_cc_library( +cc_library( name = "whitespace_tokenizer_kernel_template", hdrs = ["whitespace_tokenizer_kernel_template.h"], - tf_deps = [ - # tf:framework tensorflow dep, - ], - deps = [ - ":whitespace_tokenizer", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - # lite/kernels/shim:op_kernel tensorflow dep, - # lite/kernels/shim:shape tensorflow dep, - # lite/kernels/shim:tensor_view tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:whitespace_tokenizer_kernel_template"], ) -tflite_cc_library( +cc_library( name = "whitespace_tokenizer_tflite", - srcs = ["whitespace_tokenizer_tflite.cc"], hdrs = ["whitespace_tokenizer_tflite.h"], - deps = [ - ":whitespace_tokenizer_kernel_template", - # lite:mutable_op_resolver tensorflow dep, - # lite/c:common tensorflow dep, - # lite/kernels/shim:tflite_op_shim tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:whitespace_tokenizer_tflite"], ) cc_library( name = "whitespace_tokenizer_config_builder", - srcs = ["whitespace_tokenizer_config_builder.cc"], hdrs = ["whitespace_tokenizer_config_builder.h"], - deps = [ - "@icu//:common", - ], -) - -cc_test( - name = "whitespace_tokenizer_config_builder_test", - size = "small", - srcs = ["whitespace_tokenizer_config_builder_test.cc"], - deps = [ - ":whitespace_tokenizer", - ":whitespace_tokenizer_config_builder", - "@com_google_googletest//:gtest_main", - "@icu//:common", - # tf:lib tensorflow dep, - # tf:test_main tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:whitespace_tokenizer_config_builder"], ) cc_library( name = "string_vocab", - srcs = ["string_vocab.cc"], hdrs = ["string_vocab.h"], - deps = [ - ":wordpiece_tokenizer", - "@com_google_absl//absl/container:flat_hash_map", - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:string_vocab"], ) cc_library( name = "phrase_tokenizer", - srcs = ["phrase_tokenizer.cc"], hdrs = ["phrase_tokenizer.h"], deps = [ ":phrase_tokenizer_model", @@ -1263,153 +372,264 @@ cc_library( "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", + "@org_tensorflow//tensorflow/core/kernels/text:phrase_tokenizer", # lite/kernels/shim:status_macros tensorflow dep, "//tensorflow_text/core/kernels/sentencepiece:double_array_trie", ], ) -cc_test( - name = "phrase_tokenizer_test", - size = "small", - srcs = ["phrase_tokenizer_test.cc"], - data = [ - "//tensorflow_text:python/ops/test_data/phrase_tokenizer_model.fb", - ], - deps = [ - ":phrase_tokenizer", - "@com_google_googletest//:gtest_main", - "@com_google_absl//absl/flags:flag", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - # tf:lib tensorflow dep, - # tf:test_main tensorflow dep, - ], -) - cc_library( name = "phrase_tokenizer_kernel_template", hdrs = ["phrase_tokenizer_kernel_template.h"], - deps = [ - ":phrase_tokenizer", - "@com_google_absl//absl/status", - "@com_google_absl//absl/strings", - # lite/kernels/shim:op_kernel tensorflow dep, - # lite/kernels/shim:status_macros tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:phrase_tokenizer_kernel_template"], ) -tf_cc_library( +cc_library( name = "phrase_tokenizer_kernel", - srcs = ["phrase_tokenizer_kernel.cc"], hdrs = ["phrase_tokenizer_kernel.h"], - tf_deps = [ - # tf:framework tensorflow dep, - ], - deps = [ - ":phrase_tokenizer_kernel_template", - # lite/kernels/shim:op_kernel tensorflow dep, - # lite/kernels/shim:tf_op_shim tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:phrase_tokenizer_kernel"], ) cc_library( name = "utf8_binarize", - srcs = ["utf8_binarize.cc"], hdrs = ["utf8_binarize.h"], - deps = [ - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:span", - "@icu//:common", - ], -) - -cc_test( - name = "utf8_binarize_test", - size = "small", - srcs = ["utf8_binarize_test.cc"], - deps = [ - ":utf8_binarize", - "@com_google_googletest//:gtest_main", - "@com_google_absl//absl/types:span", - # tf:lib tensorflow dep, - # tf:test_main tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:utf8_binarize"], ) cc_library( name = "utf8_binarize_kernel_template", hdrs = ["utf8_binarize_kernel_template.h"], - deps = [ - ":utf8_binarize", - "@com_google_absl//absl/status", - "@com_google_absl//absl/strings", - # tf/platform:tstring tensorflow dep, - # lite/kernels/shim:op_kernel tensorflow dep, - # lite/kernels/shim:shape tensorflow dep, - # lite/kernels/shim:status_macros tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:utf8_binarize_kernel_template"], ) -tf_cc_library( +cc_library( name = "utf8_binarize_kernel", - srcs = ["utf8_binarize_kernel.cc"], hdrs = ["utf8_binarize_kernel.h"], - tf_deps = [ - # tf:framework tensorflow dep, - ], - deps = [ - ":utf8_binarize_kernel_template", - # lite/kernels/shim:op_kernel tensorflow dep, - # lite/kernels/shim:tf_op_shim tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:utf8_binarize_kernel"], ) -tflite_cc_library( +cc_library( name = "utf8_binarize_tflite", - srcs = ["utf8_binarize_tflite.cc"], hdrs = ["utf8_binarize_tflite.h"], - deps = [ - ":utf8_binarize_kernel_template", - # lite:mutable_op_resolver tensorflow dep, - # lite/c:common tensorflow dep, - # lite/kernels/shim:tflite_op_shim tensorflow dep, - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text:utf8_binarize_tflite"], ) -tf_cc_library( - name = "wordpiece_kernel", - srcs = ["wordpiece_kernel.cc"], - tf_deps = [ - # tf:framework tensorflow dep, - # tf:lib tensorflow dep, - ], - deps = [ - ":wordpiece_tokenizer", - "@com_google_absl//absl/base:core_headers", - ], +alias( + name = "boise_offset_converter_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:boise_offset_converter_test", ) -tf_cc_library( - name = "wordpiece_tokenizer", - srcs = ["wordpiece_tokenizer.cc"], - hdrs = ["wordpiece_tokenizer.h"], - deps = [ - "@com_google_absl//absl/strings", - "@icu//:common", - ], +alias( + name = "byte_splitter_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:byte_splitter_test", +) + +alias( + name = "constrained_sequence_kernel", + actual = "@org_tensorflow//tensorflow/core/kernels/text:constrained_sequence_kernel", ) -cc_test( +alias( + name = "constrained_sequence_kernel_input_validation_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:constrained_sequence_kernel_input_validation_test", +) + +alias( + name = "exp_greedy_constrained_sequence_kernel_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:exp_greedy_constrained_sequence_kernel_test", +) + +alias( + name = "exp_viterbi_constrained_sequence_kernel_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:exp_viterbi_constrained_sequence_kernel_test", +) + +alias( + name = "fast_bert_normalizer_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:fast_bert_normalizer_test", +) + +alias( + name = "log_greedy_constrained_sequence_kernel_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:log_greedy_constrained_sequence_kernel_test", +) + +alias( + name = "log_viterbi_constrained_sequence_kernel_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:log_viterbi_constrained_sequence_kernel_test", +) + +alias( + name = "darts_clone_trie_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:darts_clone_trie_test", +) + +alias( + name = "disjoint_set_forest_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:disjoint_set_forest_test", +) + +alias( + name = "fast_wordpiece_tokenizer_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:fast_wordpiece_tokenizer_test", +) + +alias( + name = "fast_wordpiece_tokenizer_utils_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:fast_wordpiece_tokenizer_utils_test", +) + +alias( + name = "mst_op_kernels", + actual = "@org_tensorflow//tensorflow/core/kernels/text:mst_op_kernels", +) + +alias( + name = "mst_solver_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:mst_solver_test", +) + +alias( + name = "mst_solver_random_comparison_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:mst_solver_random_comparison_test", +) + +alias( + name = "edit_changes_proto", + actual = "@org_tensorflow//tensorflow/core/kernels/text:edit_changes_proto", +) + +alias( + name = "edit_changes_cc_proto", + actual = "@org_tensorflow//tensorflow/core/kernels/text:edit_changes_cc_proto", +) + +alias( + name = "ngrams_kernel_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:ngrams_kernel_test", +) + +alias( + name = "ngrams_tflite_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:ngrams_tflite_test", +) + +alias( + name = "normalize_kernels", + actual = "@org_tensorflow//tensorflow/core/kernels/text:normalize_kernels", +) + +alias( + name = "ragged_tensor_to_tensor_tflite_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:ragged_tensor_to_tensor_tflite_test", +) + +alias( + name = "regex_split_kernels", + actual = "@org_tensorflow//tensorflow/core/kernels/text:regex_split_kernels", +) + +alias( + name = "regex_split_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:regex_split_test", +) + +alias( + name = "round_robin_trimmer_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:round_robin_trimmer_test", +) + +alias( + name = "rouge_l_kernel", + actual = "@org_tensorflow//tensorflow/core/kernels/text:rouge_l_kernel", +) + +alias( + name = "rouge_l_kernel_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:rouge_l_kernel_test", +) + +alias( + name = "sentence_breaking_kernels", + actual = "@org_tensorflow//tensorflow/core/kernels/text:sentence_breaking_kernels", +) + +alias( + name = "sentence_breaking_utils_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:sentence_breaking_utils_test", +) + +alias( + name = "sentence_fragmenter_v2_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:sentence_fragmenter_v2_test", +) + +alias( + name = "sentencepiece_kernels", + actual = "@org_tensorflow//tensorflow/core/kernels/text:sentencepiece_kernels", +) + +alias( + name = "spanning_tree_iterator_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:spanning_tree_iterator_test", +) + +alias( + name = "split_merge_tokenize_kernel", + actual = "@org_tensorflow//tensorflow/core/kernels/text:split_merge_tokenize_kernel", +) + +alias( + name = "tokenizer_from_logits_kernel", + actual = "@org_tensorflow//tensorflow/core/kernels/text:tokenizer_from_logits_kernel", +) + +alias( + name = "unicode_script_tokenize_kernel", + actual = "@org_tensorflow//tensorflow/core/kernels/text:unicode_script_tokenize_kernel", +) + +alias( + name = "unicode_script_tokenize_kernel_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:unicode_script_tokenize_kernel_test", +) + +alias( + name = "whitespace_tokenize_kernel", + actual = "@org_tensorflow//tensorflow/core/kernels/text:whitespace_tokenize_kernel", +) + +alias( + name = "whitespace_tokenize_kernel_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:whitespace_tokenize_kernel_test", +) + +alias( + name = "whitespace_tokenizer_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:whitespace_tokenizer_test", +) + +alias( + name = "whitespace_tokenizer_config_builder_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:whitespace_tokenizer_config_builder_test", +) + +alias( + name = "phrase_tokenizer_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:phrase_tokenizer_test", +) + +alias( + name = "utf8_binarize_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text:utf8_binarize_test", +) + +alias( + name = "wordpiece_kernel", + actual = "@org_tensorflow//tensorflow/core/kernels/text:wordpiece_kernel", +) + +alias( name = "wordpiece_kernel_test", - size = "small", - srcs = ["wordpiece_kernel_test.cc"], - deps = [ - ":wordpiece_kernel", - # tf:framework tensorflow dep, - # tf:test tensorflow dep, - # tf:test_main tensorflow dep, - # tf:testlib tensorflow dep, - # tf/kernels:ops_testutil tensorflow dep, - "//tensorflow_text:wordpiece_tokenizer_cc", - ], + actual = "@org_tensorflow//tensorflow/core/kernels/text:wordpiece_kernel_test", ) diff --git a/tensorflow_text/core/kernels/boise_offset_converter.cc b/tensorflow_text/core/kernels/boise_offset_converter.cc deleted file mode 100644 index ac306d3e3..000000000 --- a/tensorflow_text/core/kernels/boise_offset_converter.cc +++ /dev/null @@ -1,254 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/boise_offset_converter.h" - -#include -#include -#include -#include - -#include "absl/status/status.h" -#include "absl/status/statusor.h" -#include "absl/strings/match.h" -#include "absl/strings/str_cat.h" -#include "absl/strings/string_view.h" - -namespace tensorflow { -namespace text { - -bool IsRightOutsideSpan(int token_start, int token_end, int span_start, - int span_end) { - // Token: |------) - // Span: |-----) - return token_start >= span_end; -} - -bool IsLeftOutsideSpan(int token_start, int token_end, int span_start, - int span_end) { - // Token: |------) - // Span: |-----) - return token_end <= span_start; -} - -bool IsStartOfSpan(int token_start, int token_end, int span_start, - int span_end) { - // Returns true if the token overlaps with the span from the - // left side (i.e. start) of the span, but not have the span inside. - // Token: |-------) - // Span: |-----) - return token_start <= span_start && token_end > span_start && - token_end <= span_end; -} - -bool IsEndOfSpan(int token_start, int token_end, int span_start, int span_end) { - // Returns true if the token overlaps with the span from the - // right side (i.e. end) of the span, but not have the span inside. - // Token: |------) - // Span: |------) - return token_start < span_end && token_end >= span_end && - token_start >= span_start; -} - -bool IsInsideSpan(int token_start, int token_end, int span_start, - int span_end) { - // Token: |------) - // Span: |-----------) - return token_start >= span_start && token_end <= span_end; -} - -absl::StatusOr> OffsetsToBoiseTags( - const std::vector& token_begin_offsets, - const std::vector& token_end_offsets, - const std::vector& span_begin_offsets, - const std::vector& span_end_offsets, - const std::vector& span_type, - const bool use_strict_boundary_mode) { - // Verify that token vectors are all the same size - if (token_begin_offsets.size() != token_end_offsets.size()) { - return absl::InvalidArgumentError("Token offsets must have the same size"); - } - if (span_begin_offsets.size() != span_end_offsets.size() || - span_begin_offsets.size() != span_type.size()) { - return absl::InvalidArgumentError("Span offsets must have the same size"); - } - - // Iterate through tokens - std::vector results; - int span_index = 0; - for (int i = 0; i < token_begin_offsets.size(); ++i) { - int token_start = token_begin_offsets[i]; - int token_end = token_end_offsets[i]; - std::string potential_span_type = "O"; - bool recorded = false; - - while (span_index < span_begin_offsets.size() && !recorded) { - int span_start = span_begin_offsets[span_index]; - int span_end = span_end_offsets[span_index]; - - if (IsLeftOutsideSpan(token_start, token_end, span_start, span_end)) { - results.push_back(potential_span_type); - recorded = true; - } else if (IsRightOutsideSpan(token_start, token_end, span_start, - span_end)) { - span_index++; - } else if (IsStartOfSpan(token_start, token_end, span_start, span_end)) { - if (IsEndOfSpan(token_start, token_end, span_start, span_end)) { - results.push_back(absl::StrCat("S-", span_type[span_index])); - span_index++; - recorded = true; - } else { - if (use_strict_boundary_mode && token_start != span_start) { - results.push_back(potential_span_type); - recorded = true; - } else { - results.push_back(absl::StrCat("B-", span_type[span_index])); - recorded = true; - } - } - } else if (IsEndOfSpan(token_start, token_end, span_start, span_end)) { - if (use_strict_boundary_mode && token_end != span_end) { - results.push_back(potential_span_type); - recorded = true; - } else { - potential_span_type = absl::StrCat("E-", span_type[span_index]); - } - span_index++; - } else if (IsInsideSpan(token_start, token_end, span_start, span_end)) { - // token: |--) - // span: |---------) - results.push_back(absl::StrCat("I-", span_type[span_index])); - recorded = true; - } else { - // token: |----------) - // span: |----) - potential_span_type = absl::StrCat("B-", span_type[span_index]); - span_index++; - } - } - if (!recorded) { - results.push_back(potential_span_type); - } - } - return results; -} - -std::string ExtractSpanType(const std::string& tag) { - return std::string(absl::ClippedSubstr(tag, 2).data()); -} - -absl::StatusOr< - std::tuple, std::vector, std::vector>> -BoiseTagsToOffsets(const std::vector& token_begin_offsets, - const std::vector& token_end_offsets, - const std::vector& per_token_boise_tags) { - // Verify that input vectors are all the same size - if (token_begin_offsets.size() != token_end_offsets.size()) { - return absl::InvalidArgumentError("Tokens must have the same size"); - } - if (token_begin_offsets.size() != per_token_boise_tags.size()) { - return absl::InvalidArgumentError( - "Tokens and BOISE tags must have the same size"); - } - - std::vector span_start, span_end; - std::vector span_type; - // Iterate through each token - int potential_span_start = -1; - std::string potential_span_type; - bool started_span = false; - - for (int i = 0; i < token_begin_offsets.size(); ++i) { - // If we find a (B)egin, (I)nside, (E)nd, or (S)ingleton tag then - // record a span start. - const std::string& tag = per_token_boise_tags[i]; - - if (!started_span) { - if (absl::StartsWith(tag, "B-") || absl::StartsWith(tag, "I-")) { - potential_span_start = token_begin_offsets[i]; - started_span = true; - potential_span_type = ExtractSpanType(tag); - } - - if (absl::StartsWith(tag, "E-") || absl::StartsWith(tag, "S-")) { - // Treat this as a singleton - span_start.push_back(token_begin_offsets[i]); - span_end.push_back(token_end_offsets[i]); - span_type.push_back(ExtractSpanType(tag)); - started_span = false; - potential_span_type.clear(); - } - } else { - // If we have found a Outside, but we previously had a span start (from - // a Begin, or Inside) then treat this as a singleton and record an span - // end - if (absl::StartsWith(tag, "O")) { - span_start.push_back(potential_span_start); - span_end.push_back(token_end_offsets[i - 1]); - span_type.push_back(potential_span_type); - started_span = false; - potential_span_type.clear(); - } - - // If we find a End or Singleton then also record an end. - if (absl::StartsWith(tag, "E-") || absl::StartsWith(tag, "S-")) { - span_start.push_back(potential_span_start); - span_end.push_back(token_end_offsets[i]); - // Also record a span type. - span_type.push_back(ExtractSpanType(tag)); - started_span = false; - } - - // If we find a Begin, - if (absl::StartsWith(tag, "B-") || absl::StartsWith(tag, "I-")) { - // potential_span_start = token_begin_offsets[i]; - started_span = true; - potential_span_type = ExtractSpanType(tag); - } - } - } - - // Record span that has started but not closed. - if (started_span) { - span_start.push_back(potential_span_start); - span_end.push_back(token_end_offsets.back()); - span_type.push_back(potential_span_type); - } - - return std::tuple, std::vector, - std::vector>(span_start, span_end, span_type); -} - -std::unordered_set GetAllBoiseTagsFromSpanType( - const std::vector& span_type) { - std::unordered_set res{"O"}; - const std::unordered_set deduped_span_type(span_type.begin(), - span_type.end()); - const std::vector boise_prefixes = {"B-", "I-", "S-", "E-"}; - - for (const std::string& cur_span_type : deduped_span_type) { - if (cur_span_type.empty() || cur_span_type == "O") { - continue; - } - for (const std::string& prefix : boise_prefixes) { - std::string tag = absl::StrCat(prefix, cur_span_type); - res.insert(tag); - } - } - - return res; -} - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/boise_offset_converter.h b/tensorflow_text/core/kernels/boise_offset_converter.h index cfe21d128..73ef142c2 100644 --- a/tensorflow_text/core/kernels/boise_offset_converter.h +++ b/tensorflow_text/core/kernels/boise_offset_converter.h @@ -15,112 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BOISE_OFFSET_CONVERTER_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BOISE_OFFSET_CONVERTER_H_ -#include -#include - -#include "absl/status/statusor.h" - -namespace tensorflow { -namespace text { -// Translates span begin/end offsets and token begin/end offsets into a -// BOISE scheme. -// -// In the BOISE scheme there is a set of 5 labels for each type: -// - (B)egin: meaning the beginning of the span type. -// - (O)utside: meaning the token is outside of any span type -// - (I)nside: the token is inside the span -// - (S)ingleton: the entire span consists of this single token. -// - (E)nd: this token is the end of the span. -// -// When given the span begin & end offsets along with a set of token begin & end -// offsets, this function helps translate which each token into one of the 5 -// labels. -// -// For example, given the following example inputs: -// -// std::string content = "Who let the dogs out"; -// std::string entity = "dogs"; -// std::vector tokens = { "Who", "let", "the", "dogs", "out" } -// std::vector token_begin_offsets = { 0, 4, 8, 12, 17 }; -// std::vector token_end_offsets = { 3, 7, 11, 16, 20 }; -// std::vector span_begin_offsets = { 12 }; -// std::vector span_end_offsets = { 16 }; -// std::vector span_type = { "animal" } -// -// Foo will produce the following labels: -// { "O", "O", "O", "S-animal", "O", } -// | | | | | -// Who let the dogs out -// -// Special Case 1: Loose or Strict Boundary Criteria: -// By default, loose boundary criteria are used to decide token start and end, -// given a entity span. In the above example, say if we have -// -// std::vector span_begin_offsets = { 13 }; -// std::vector span_end_offsets = { 16 }; -// -// we still get { "O", "O", "O", "S-animal", "O", }, even though the span -// begin offset (13) is not exactly aligned with the token begin offset (12). -// Partial overlap between a token and a BOISE tag still qualify the token to -// be labeled with this tag. -// -// You can choose to use strict boundary criteria by passing in -// use_strict_boundary_mode = false argument, with which Foo will produce -// { "O", "O", "O", "O", "O", } for the case described above. -// -// Special Case 2: One Token Mapped to Multiple BOISE Tags: -// In cases where a token is overlapped with multiple BOISE tags, we label the -// token with the last tag. For example, given the following example inputs: -// -// std::string content = "Getty Center"; -// std::vector tokens = { "Getty Center" }; -// std::vector token_begin_offsets = { 0 }; -// std::vector token_end_offsets = { 12 }; -// std::vector span_begin_offsets = { 0, 6 }; -// std::vector span_end_offsets = { 5, 12 }; -// std::vector span_type = { "per", "loc" } -// -// Foo will produce the following labels: -// { "B-loc", } -absl::StatusOr> OffsetsToBoiseTags( - const std::vector& token_begin_offsets, - const std::vector& token_end_offsets, - const std::vector& span_begin_offsets, - const std::vector& span_end_offsets, - const std::vector& span_type, - const bool use_strict_boundary_mode = false); - -// Given the token offsets and BOISE tags per token, perform a translation -// that marks start offset, end offset and span type per entity. -// -// For example, given the following example inputs: -// -// std::vector token_begin_offsets = { 0, 4, 8, 12, 17 }; -// std::vector token_end_offsets = { 3, 7, 11, 16, 20 }; -// std::vector per_token_boise_tags = { "O", "O", "O", "S-animal", -// "O" }; -// -// Foo will produce the following offsets and labels vectors: -// start offsets: { 12, } -// end offsets: { 16, } -// span types: { "animal", } -absl::StatusOr< - std::tuple, std::vector, std::vector>> -BoiseTagsToOffsets(const std::vector& token_begin_offsets, - const std::vector& token_end_offsets, - const std::vector& per_token_boise_tags); - -// Get all possible BOISE tags for given span types. For example, -// -// std::vector span_type = { "loc", "per" } -// -// Foo will produce an unordered set: -// { "O", "B-loc", "I-loc", "S-loc", "E-loc", "B-per", "I-per", "S-per", -// "E-per", }. -std::unordered_set GetAllBoiseTagsFromSpanType( - const std::vector& span_type); - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/boise_offset_converter.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BOISE_OFFSET_CONVERTER_H_ diff --git a/tensorflow_text/core/kernels/boise_offset_converter_kernel.cc b/tensorflow_text/core/kernels/boise_offset_converter_kernel.cc deleted file mode 100644 index d2ed8c42a..000000000 --- a/tensorflow_text/core/kernels/boise_offset_converter_kernel.cc +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/boise_offset_converter_kernel.h" - -#include "tensorflow/core/framework/op_kernel.h" - -namespace tensorflow { -namespace text { - -REGISTER_KERNEL_BUILDER( - Name(OffsetsToBoiseTagsOpKernel::OpName()).Device(tensorflow::DEVICE_CPU), - OffsetsToBoiseTagsOpKernel); - -REGISTER_KERNEL_BUILDER( - Name(BoiseTagsToOffsetsOpKernel::OpName()).Device(tensorflow::DEVICE_CPU), - BoiseTagsToOffsetsOpKernel); - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/boise_offset_converter_kernel.h b/tensorflow_text/core/kernels/boise_offset_converter_kernel.h index e8a978d36..873d03d05 100644 --- a/tensorflow_text/core/kernels/boise_offset_converter_kernel.h +++ b/tensorflow_text/core/kernels/boise_offset_converter_kernel.h @@ -15,25 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BOISE_OFFSET_CONVERTER_KERNEL_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BOISE_OFFSET_CONVERTER_KERNEL_H_ -#include "tensorflow/lite/kernels/shim/tf_op_shim.h" -#include "tensorflow_text/core/kernels/boise_offset_converter_kernel_template.h" - -namespace tensorflow { -namespace text { - -class OffsetsToBoiseTagsOpKernel - : public tflite::shim::TfOpKernel { - public: - using TfOpKernel::TfOpKernel; -}; - -class BoiseTagsToOffsetsOpKernel - : public tflite::shim::TfOpKernel { - public: - using TfOpKernel::TfOpKernel; -}; - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/boise_offset_converter_kernel.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BOISE_OFFSET_CONVERTER_KERNEL_H_ diff --git a/tensorflow_text/core/kernels/boise_offset_converter_kernel_template.h b/tensorflow_text/core/kernels/boise_offset_converter_kernel_template.h index f49f059aa..9867b6c2c 100644 --- a/tensorflow_text/core/kernels/boise_offset_converter_kernel_template.h +++ b/tensorflow_text/core/kernels/boise_offset_converter_kernel_template.h @@ -15,625 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BOISE_OFFSET_CONVERTER_KERNEL_TEMPLATE_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BOISE_OFFSET_CONVERTER_KERNEL_TEMPLATE_H_ -#include -#include -#include -#include - -#include "absl/status/status.h" -#include "tensorflow/core/platform/tstring.h" -#include "tensorflow/lite/kernels/shim/op_kernel.h" -#include "tensorflow/lite/kernels/shim/shape.h" -#include "tensorflow/lite/kernels/shim/status_macros.h" -#include "tensorflow_text/core/kernels/boise_offset_converter.h" - -namespace tensorflow { -namespace text { - -template -class OffsetsToBoiseTagsOp - : public tflite::shim::OpKernelShim { - private: - enum Inputs { - kInputTokenBeginOffsets = 0, - kInputTokenEndOffsets, - kInputSpanBeginOffsets, - kInputSpanEndOffsets, - kInputSpanType, - kInputTokenBeginRowSplits, - kInputTokenEndRowSplits, - kInputSpanBeginRowSplits, - kInputSpanEndRowSplits, - kInputSpanTypeRowSplits, - kInputUseStrictBoundaryMode - }; - enum Outputs { kOutputBoiseTags = 0 }; - - using typename tflite::shim::OpKernelShim::InitContext; - using typename tflite::shim::OpKernelShim::InvokeContext; - using typename tflite::shim::OpKernelShim::ShapeInferenceContext; - - public: - OffsetsToBoiseTagsOp() = default; - static constexpr char kOpName[] = "TFText>OffsetsToBoiseTags"; - static constexpr char kDoc[] = R"doc( - Converts token/span begin/end offsets into BOISE tags. - )doc"; - - static const char* OpName() { return kOpName; } - static const char* Doc() { return kDoc; } - - // Attributes declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Attrs() { return {}; } - - // Inputs declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Inputs(); - - // Outputs declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Outputs(); - - // Initializes the op - absl::Status Init(InitContext* context) { return absl::OkStatus(); } - - // Runs the operation - absl::Status Invoke(InvokeContext* context); - - // Shape inference - static absl::Status ShapeInference(ShapeInferenceContext* c); -}; - -////////////////////////// Implementation - -template -std::vector OffsetsToBoiseTagsOp::Inputs() { - return {"input_token_begin_offsets: int32", - "input_token_end_offsets: int32", - "input_span_begin_offsets: int32", - "input_span_end_offsets: int32", - "input_span_type: string", - "input_token_begin_row_splits: int64", - "input_token_end_row_splits: int64", - "input_span_begin_row_splits: int64", - "input_span_end_row_splits: int64", - "input_span_type_row_splits: int64", - "input_use_strict_boundary_mode: bool"}; -} - -template -std::vector OffsetsToBoiseTagsOp::Outputs() { - return {"output_boise_tags: string"}; -} - -template -absl::Status OffsetsToBoiseTagsOp::ShapeInference( - ShapeInferenceContext* c) { - using tflite::shim::Shape; - const auto rank_1_shape = Shape({Shape::kUnknownDim}); - - SH_ASSIGN_OR_RETURN(const Shape input_token_begin_shape, - c->GetInputShape(kInputTokenBeginOffsets)); - if (!input_token_begin_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Shape must be rank 1: ", input_token_begin_shape.ToString())); - } - - SH_ASSIGN_OR_RETURN(const Shape input_token_end_shape, - c->GetInputShape(kInputTokenEndOffsets)); - if (!input_token_end_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Shape must be rank 1: ", input_token_end_shape.ToString())); - } - - SH_ASSIGN_OR_RETURN(const Shape input_span_begin_shape, - c->GetInputShape(kInputSpanBeginOffsets)); - if (!input_span_begin_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Shape must be rank 1: ", input_span_begin_shape.ToString())); - } - - SH_ASSIGN_OR_RETURN(const Shape input_span_end_shape, - c->GetInputShape(kInputSpanEndOffsets)); - if (!input_span_end_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Shape must be rank 1: ", input_span_end_shape.ToString())); - } - - SH_ASSIGN_OR_RETURN(const Shape input_span_type_shape, - c->GetInputShape(kInputSpanType)); - if (!input_span_type_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Shape must be rank 1: ", input_span_type_shape.ToString())); - } - - SH_ASSIGN_OR_RETURN(const Shape input_token_begin_rs_shape, - c->GetInputShape(kInputTokenBeginRowSplits)); - if (!input_token_begin_rs_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Shape must be rank 1: ", input_token_begin_rs_shape.ToString())); - } - - SH_ASSIGN_OR_RETURN(const Shape input_token_end_rs_shape, - c->GetInputShape(kInputTokenEndRowSplits)); - if (!input_token_end_rs_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Shape must be rank 1: ", input_token_end_rs_shape.ToString())); - } - - SH_ASSIGN_OR_RETURN(const Shape input_span_begin_rs_shape, - c->GetInputShape(kInputSpanBeginRowSplits)); - if (!input_span_begin_rs_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Shape must be rank 1: ", input_span_begin_rs_shape.ToString())); - } - - SH_ASSIGN_OR_RETURN(const Shape input_span_end_rs_shape, - c->GetInputShape(kInputSpanEndRowSplits)); - if (!input_span_end_rs_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Shape must be rank 1: ", input_span_end_rs_shape.ToString())); - } - - SH_ASSIGN_OR_RETURN(const Shape input_span_type_rs_shape, - c->GetInputShape(kInputSpanTypeRowSplits)); - if (!input_span_type_rs_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Shape must be rank 1: ", input_span_type_rs_shape.ToString())); - } - - const int num_offsets = input_token_begin_shape.Dim(0); - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputBoiseTags, Shape({num_offsets}))); - - return absl::OkStatus(); -} - -template -absl::Status OffsetsToBoiseTagsOp::Invoke(InvokeContext* context) { - // Inputs - SH_ASSIGN_OR_RETURN(const auto input_token_begin_offsets, - context->GetInput(kInputTokenBeginOffsets)); - const auto& input_token_begin_offsets_vec = - input_token_begin_offsets->template As(); - - SH_ASSIGN_OR_RETURN(const auto input_token_end_offsets, - context->GetInput(kInputTokenEndOffsets)); - const auto& input_token_end_offsets_vec = - input_token_end_offsets->template As(); - - SH_ASSIGN_OR_RETURN(const auto input_span_begin_offsets, - context->GetInput(kInputSpanBeginOffsets)); - const auto& input_span_begin_offsets_vec = - input_span_begin_offsets->template As(); - - SH_ASSIGN_OR_RETURN(const auto input_span_end_offsets, - context->GetInput(kInputSpanEndOffsets)); - const auto& input_span_end_offsets_vec = - input_span_end_offsets->template As(); - - SH_ASSIGN_OR_RETURN(const auto input_span_type, - context->GetInput(kInputSpanType)); - const auto& input_span_type_vec = - input_span_type->template As(); - - SH_ASSIGN_OR_RETURN(const auto input_token_begin_row_splits, - context->GetInput(kInputTokenBeginRowSplits)); - const auto& input_token_begin_row_splits_vec = - input_token_begin_row_splits->template As(); - - SH_ASSIGN_OR_RETURN(const auto input_token_end_row_splits, - context->GetInput(kInputTokenEndRowSplits)); - const auto& input_token_end_row_splits_vec = - input_token_end_row_splits->template As(); - - SH_ASSIGN_OR_RETURN(const auto input_span_begin_row_splits, - context->GetInput(kInputSpanBeginRowSplits)); - const auto& input_span_begin_row_splits_vec = - input_span_begin_row_splits->template As(); - - SH_ASSIGN_OR_RETURN(const auto input_span_end_row_splits, - context->GetInput(kInputSpanEndRowSplits)); - const auto& input_span_end_row_splits_vec = - input_span_end_row_splits->template As(); - - SH_ASSIGN_OR_RETURN(const auto input_span_type_row_splits, - context->GetInput(kInputSpanTypeRowSplits)); - const auto& input_span_type_row_splits_vec = - input_span_type_row_splits->template As(); - - SH_ASSIGN_OR_RETURN(const auto input_use_strict_boundary_mode, - context->GetInput(kInputUseStrictBoundaryMode)); - const bool input_use_strict_boundary_mode_value = - input_use_strict_boundary_mode->template AsScalar(); - - // Check token begin and end offsets match in size. - // Check span begin/end offsets, span type match in size. - if (input_token_begin_offsets_vec.Dim(0) != - input_token_end_offsets_vec.Dim(0) || - input_span_begin_offsets_vec.Dim(0) != - input_span_end_offsets_vec.Dim(0) || - input_span_begin_offsets_vec.Dim(0) != input_span_type_vec.Dim(0)) { - return absl::InvalidArgumentError(absl::StrCat( - "Token begin/end offsets must have the same size. Span begin/end " - "offsets and span type must have the same size.", - " Token begin offsets shape: ", input_token_begin_offsets_vec.Dim(0), - " Token end offsets shape: ", input_token_end_offsets_vec.Dim(0), - " Span begin offsets shape: ", input_span_begin_offsets_vec.Dim(0), - " Span end offsets shape: ", input_span_end_offsets_vec.Dim(0), - " Span type shape: ", input_span_type_vec.Dim(0))); - } - - // Check row splits are the same for token begin, end offsets. - if (input_token_begin_row_splits_vec.Dim(0) != - input_token_end_row_splits_vec.Dim(0) || - input_span_begin_row_splits_vec.Dim(0) != - input_span_begin_row_splits_vec.Dim(0) || - input_span_begin_row_splits_vec.Dim(0) != - input_span_end_row_splits_vec.Dim(0) || - input_span_begin_row_splits_vec.Dim(0) != - input_span_type_row_splits_vec.Dim(0)) { - return absl::InvalidArgumentError(absl::StrCat( - "Row splits must have the same size for token and span. ", - " Token begin row splits shape: ", - input_token_begin_row_splits_vec.Dim(0), - " Token end row splits shape: ", input_token_end_row_splits_vec.Dim(0), - " Span begin row splits shape: ", - input_span_begin_row_splits_vec.Dim(0), " Span end row splits shape: ", - input_span_end_row_splits_vec.Dim(0), " Span type row splits shape: ", - input_span_type_row_splits_vec.Dim(0))); - } - - for (int i = 0; i < input_token_begin_row_splits_vec.Dim(0) - 1; ++i) { - if (input_token_begin_row_splits_vec(i) != - input_token_end_row_splits_vec(i)) { - return absl::InvalidArgumentError( - "Row splits must be the same for token begin and end offsets."); - } - } - - // Check row splits are the same for span begin, end offsets and span type. - for (int i = 0; i < input_span_begin_row_splits_vec.Dim(0) - 1; ++i) { - if (input_span_begin_row_splits_vec(i) != - input_span_end_row_splits_vec(i) || - input_span_begin_row_splits_vec(i) != - input_span_type_row_splits_vec(i)) { - return absl::InvalidArgumentError( - "Row splits must be the same for span begin, end offsets and span " - "type."); - } - } - - // Outputs - std::vector boise_tags; - std::vector input_token_begin_offsets_vec_i; - std::vector input_token_end_offsets_vec_i; - std::vector input_span_begin_offsets_vec_i; - std::vector input_span_end_offsets_vec_i; - std::vector input_span_type_vec_i; - - // Iterate through all the input values and split them. - for (int i = 0; i < input_token_begin_row_splits_vec.Dim(0) - 1; ++i) { - int token_start_index = input_token_begin_row_splits_vec(i); - int token_end_index = input_token_begin_row_splits_vec(i + 1); - int span_start_index = input_span_begin_row_splits_vec(i); - int span_end_index = input_span_begin_row_splits_vec(i + 1); - - input_token_begin_offsets_vec_i.clear(); - input_token_end_offsets_vec_i.clear(); - input_span_begin_offsets_vec_i.clear(); - input_span_end_offsets_vec_i.clear(); - input_span_type_vec_i.clear(); - - for (int j = token_start_index; j < token_end_index; ++j) { - input_token_begin_offsets_vec_i.push_back( - input_token_begin_offsets_vec(j)); - input_token_end_offsets_vec_i.push_back(input_token_end_offsets_vec(j)); - } - for (int j = span_start_index; j < span_end_index; ++j) { - input_span_begin_offsets_vec_i.push_back(input_span_begin_offsets_vec(j)); - input_span_end_offsets_vec_i.push_back(input_span_end_offsets_vec(j)); - input_span_type_vec_i.push_back(input_span_type_vec(j)); - } - - SH_ASSIGN_OR_RETURN( - std::vector boise_tags_i, - OffsetsToBoiseTags( - input_token_begin_offsets_vec_i, input_token_end_offsets_vec_i, - input_span_begin_offsets_vec_i, input_span_end_offsets_vec_i, - input_span_type_vec_i, input_use_strict_boundary_mode_value)); - - for (int j = 0; j < boise_tags_i.size(); ++j) { - boise_tags.push_back(boise_tags_i[j]); - } - } - - // Allocate output & fill output tensors. - SH_RETURN_IF_ERROR(this->template FillOutputTensor( - boise_tags, kOutputBoiseTags, context)); - - return absl::OkStatus(); -} - - -template -class BoiseTagsToOffsetsOp - : public tflite::shim::OpKernelShim { - private: - enum Inputs { - kInputTokenBeginOffsets = 0, - kInputTokenEndOffsets, - kInputBoiseTags, - kInputTokenBeginRowSplits, - kInputTokenEndRowSplits, - kInputBoiseTagsRowSplits, - }; - enum Outputs { - kOutputSpanBeginOffsets = 0, - kOutputSpanEndOffsets, - kOutputSpanType, - kOutputRowSplits, - }; - - using typename tflite::shim::OpKernelShim::InitContext; - using typename tflite::shim::OpKernelShim::InvokeContext; - using typename tflite::shim::OpKernelShim::ShapeInferenceContext; - - public: - BoiseTagsToOffsetsOp() = default; - static constexpr char kOpName[] = "TFText>BoiseTagsToOffsets"; - static constexpr char kDoc[] = R"doc( - Converts BOISE tags into span begin/end offsets and span type. - )doc"; - - static const char* OpName() { return kOpName; } - static const char* Doc() { return kDoc; } - - // Attributes declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Attrs() { return {}; } - - // Inputs declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Inputs(); - - // Outputs declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Outputs(); - - // Initializes the op - absl::Status Init(InitContext* context) { return absl::OkStatus(); } - - // Runs the operation - absl::Status Invoke(InvokeContext* context); - - // Shape inference - static absl::Status ShapeInference(ShapeInferenceContext* c); - - protected: - template - inline absl::Status FillOutputTensor(const std::vector& buffer, - int index, InvokeContext* context); -}; - -////////////////////////// Implementation - -template -std::vector BoiseTagsToOffsetsOp::Inputs() { - return {"input_token_begin_offsets: int32", - "input_token_end_offsets: int32", - "input_boise_tags: string", - "input_token_begin_row_splits: int64", - "input_token_end_row_splits: int64", - "input_boise_tags_row_splits: int64"}; -} - -template -std::vector BoiseTagsToOffsetsOp::Outputs() { - return {"output_span_begin_offsets: int32", "output_span_end_offsets: int32", - "output_span_type: string", "output_row_splits: int64"}; -} - -template -absl::Status BoiseTagsToOffsetsOp::ShapeInference( - ShapeInferenceContext* c) { - using tflite::shim::Shape; - const auto rank_1_shape = Shape({Shape::kUnknownDim}); - - SH_ASSIGN_OR_RETURN(const Shape input_token_begin_shape, - c->GetInputShape(kInputTokenBeginOffsets)); - if (!input_token_begin_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Shape must be rank 1: ", input_token_begin_shape.ToString())); - } - - SH_ASSIGN_OR_RETURN(const Shape input_token_end_shape, - c->GetInputShape(kInputTokenEndOffsets)); - if (!input_token_end_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Shape must be rank 1: ", input_token_end_shape.ToString())); - } - - SH_ASSIGN_OR_RETURN(const Shape input_boise_tags_shape, - c->GetInputShape(kInputBoiseTags)); - if (!input_boise_tags_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Shape must be rank 1: ", input_boise_tags_shape.ToString())); - } - - SH_ASSIGN_OR_RETURN(const Shape input_token_begin_rs_shape, - c->GetInputShape(kInputTokenBeginRowSplits)); - if (!input_token_begin_rs_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Shape must be rank 1: ", input_token_begin_rs_shape.ToString())); - } - - SH_ASSIGN_OR_RETURN(const Shape input_token_end_rs_shape, - c->GetInputShape(kInputTokenEndRowSplits)); - if (!input_token_end_rs_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Shape must be rank 1: ", input_token_end_rs_shape.ToString())); - } - - SH_ASSIGN_OR_RETURN(const Shape input_boise_tags_rs_shape, - c->GetInputShape(kInputBoiseTagsRowSplits)); - if (!input_boise_tags_rs_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Shape must be rank 1: ", input_boise_tags_rs_shape.ToString())); - } - - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputSpanBeginOffsets, rank_1_shape)); - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputSpanEndOffsets, rank_1_shape)); - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputSpanType, rank_1_shape)); - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputRowSplits, rank_1_shape)); - - return absl::OkStatus(); -} - -template -absl::Status BoiseTagsToOffsetsOp::Invoke(InvokeContext* context) { - // Inputs - SH_ASSIGN_OR_RETURN(const auto input_token_begin_offsets, - context->GetInput(kInputTokenBeginOffsets)); - const auto& input_token_begin_offsets_vec = - input_token_begin_offsets->template As(); - - SH_ASSIGN_OR_RETURN(const auto input_token_end_offsets, - context->GetInput(kInputTokenEndOffsets)); - const auto& input_token_end_offsets_vec = - input_token_end_offsets->template As(); - - SH_ASSIGN_OR_RETURN(const auto input_boise_tags, - context->GetInput(kInputBoiseTags)); - const auto& input_boise_tags_vec = - input_boise_tags->template As(); - - SH_ASSIGN_OR_RETURN(const auto input_token_begin_row_splits, - context->GetInput(kInputTokenBeginRowSplits)); - const auto& input_token_begin_row_splits_vec = - input_token_begin_row_splits->template As(); - - SH_ASSIGN_OR_RETURN(const auto input_token_end_row_splits, - context->GetInput(kInputTokenEndRowSplits)); - const auto& input_token_end_row_splits_vec = - input_token_end_row_splits->template As(); - - SH_ASSIGN_OR_RETURN(const auto input_boise_tags_row_splits, - context->GetInput(kInputBoiseTagsRowSplits)); - const auto& input_boise_tags_row_splits_vec = - input_boise_tags_row_splits->template As(); - - // Check token begin and end offsets, and boise tags match in size. - if (input_token_begin_offsets_vec.Dim(0) != - input_token_end_offsets_vec.Dim(0) || - input_token_begin_offsets_vec.Dim(0) != input_boise_tags_vec.Dim(0)) { - return absl::InvalidArgumentError(absl::StrCat( - "Token begin/end offsets and boise tags must have the same size. ", - " Token begin offsets shape: ", input_token_begin_offsets_vec.Dim(0), - " Token end offsets shape: ", input_token_end_offsets_vec.Dim(0), - " BOISE tags shape: ", input_boise_tags_vec.Dim(0))); - } - - // Check row splits are the same for token begin, end offsets and boise tags. - // First, check dimensions are the same. - if (input_token_begin_row_splits_vec.Dim(0) != - input_token_end_row_splits_vec.Dim(0) || - input_token_begin_row_splits_vec.Dim(0) != - input_boise_tags_row_splits_vec.Dim(0)) { - return absl::InvalidArgumentError(absl::StrCat( - "Row splits must have the same size for token begin/end offsets and " - "BOISE tags. ", - " Token begin row splits shape: ", - input_token_begin_row_splits_vec.Dim(0), - " Token end row splits shape: ", input_token_end_row_splits_vec.Dim(0), - " BOISE tags row splits shape: ", - input_boise_tags_row_splits_vec.Dim(0))); - } - // Second, check values are the same. - for (int i = 0; i < input_token_begin_row_splits_vec.Dim(0) - 1; ++i) { - if (input_token_begin_row_splits_vec(i) != - input_token_end_row_splits_vec(i) || - input_token_begin_row_splits_vec(i) != - input_boise_tags_row_splits_vec(i)) { - return absl::InvalidArgumentError( - "Row splits must be the same for token begin/end offsets ad BOISE " - "tags."); - } - } - - // Outputs - std::vector span_begin_offsets; - std::vector span_end_offsets; - std::vector span_type; - std::vector row_splits; - - row_splits.push_back(0); - - // Iterate through all the input values and split them. - std::vector input_token_begin_offsets_vec_i; - std::vector input_token_end_offsets_vec_i; - std::vector input_boise_tags_vec_i; - for (int i = 0; i < input_token_begin_row_splits_vec.Dim(0) - 1; ++i) { - int token_start_index = input_token_begin_row_splits_vec(i); - int token_end_index = input_token_begin_row_splits_vec(i + 1); - - input_token_begin_offsets_vec_i.clear(); - input_token_end_offsets_vec_i.clear(); - input_boise_tags_vec_i.clear(); - - for (int j = token_start_index; j < token_end_index; ++j) { - input_token_begin_offsets_vec_i.push_back( - input_token_begin_offsets_vec(j)); - input_token_end_offsets_vec_i.push_back(input_token_end_offsets_vec(j)); - input_boise_tags_vec_i.push_back(input_boise_tags_vec(j)); - } - - auto [span_begin_offsets_i, span_end_offsets_i, span_type_i] = - BoiseTagsToOffsets(input_token_begin_offsets_vec_i, - input_token_end_offsets_vec_i, - input_boise_tags_vec_i) - .value(); - - const int num_span_i = span_type_i.size(); - row_splits.push_back(row_splits.back() + num_span_i); - - for (int j = 0; j < span_type_i.size(); ++j) { - span_type.push_back(span_type_i[j]); - span_begin_offsets.push_back(span_begin_offsets_i[j]); - span_end_offsets.push_back(span_end_offsets_i[j]); - } - } - - // Allocate output & fill output tensors. - SH_RETURN_IF_ERROR(FillOutputTensor( - span_begin_offsets, kOutputSpanBeginOffsets, context)); - SH_RETURN_IF_ERROR(FillOutputTensor( - span_end_offsets, kOutputSpanEndOffsets, context)); - SH_RETURN_IF_ERROR(FillOutputTensor( - span_type, kOutputSpanType, context)); - SH_RETURN_IF_ERROR(FillOutputTensor( - row_splits, kOutputRowSplits, context)); - - return absl::OkStatus(); -} - -template -template -absl::Status BoiseTagsToOffsetsOp::FillOutputTensor( - const std::vector& buffer, const int index, - InvokeContext* context) { - SH_ASSIGN_OR_RETURN( - const auto tensorview, - context->GetOutput( - index, tflite::shim::Shape({static_cast(buffer.size())}))); - auto data = tensorview->template As(); - // TODO(broken): investigate using memcpy like previous WST - for (int i = 0; i < buffer.size(); ++i) data(i) = buffer.at(i); - return absl::OkStatus(); -} - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/boise_offset_converter_kernel_template.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BOISE_OFFSET_CONVERTER_KERNEL_TEMPLATE_H_ diff --git a/tensorflow_text/core/kernels/boise_offset_converter_test.cc b/tensorflow_text/core/kernels/boise_offset_converter_test.cc deleted file mode 100644 index 06c279d06..000000000 --- a/tensorflow_text/core/kernels/boise_offset_converter_test.cc +++ /dev/null @@ -1,561 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/boise_offset_converter.h" - -#include -#include -#include -#include -#include -#include - -#include -#include -#include "absl/strings/string_view.h" - -using ::testing::ContainerEq; - -namespace tensorflow { -namespace text { -namespace { - -// Helper function to extract texts based on the begin and end offsets. -// content = "Who let the dogs out" -// begin_offsets = {12, 17} -// end_offsets = {16, 20} -// Foo returns: {"dogs", "out"} -std::vector ExtractTextsFromOffsets( - const std::string content, const std::vector begin_offsets, - const std::vector end_offsets) { - absl::string_view content_sv = absl::string_view(content); - std::vector res; - for (int i = 0; i < begin_offsets.size(); ++i) { - int text_len = end_offsets[i] - begin_offsets[i]; - res.push_back(static_cast( - content_sv.substr(begin_offsets[i], text_len))); - } - return res; -} - -// Test that we can transform offsets into BOISE tags -TEST(OffsetsToBoiseTagsTest, ExtractSingleton) { - // 1 2 - // 012345678901234567890 - std::string content = "Who let the dogs out"; - std::string entity = "dogs"; - std::vector token_begin_offsets = {0, 4, 8, 12, 17}; - std::vector token_end_offsets = {3, 7, 11, 16, 20}; - std::vector entity_begin_offsets = {12}; - std::vector entity_end_offsets = {16}; - std::vector entity_type = {"animal"}; - std::vector boise_tags = - OffsetsToBoiseTags(token_begin_offsets, token_end_offsets, - entity_begin_offsets, entity_end_offsets, entity_type) - .ValueOrDie(); - EXPECT_THAT(boise_tags, ContainerEq(std::vector{ - "O", "O", "O", "S-animal", "O"})); -} - -TEST(OffsetsToBoiseTagsTest, ExtractSingletonStrictBoundary) { - // 1 - // 01234567890123456789 - std::string content = "Who let the dogs out"; - std::string entity = "dogs"; - std::vector token_begin_offsets = {0, 4, 8, 12, 17}; - std::vector token_end_offsets = {3, 7, 11, 16, 20}; - std::vector entity_begin_offsets = {13}; - std::vector entity_end_offsets = {16}; - std::vector entity_type = {"animal"}; - bool use_strict_boundary_mode = true; - std::vector boise_tags = - OffsetsToBoiseTags(token_begin_offsets, token_end_offsets, - entity_begin_offsets, entity_end_offsets, entity_type, - use_strict_boundary_mode) - .ValueOrDie(); - EXPECT_THAT(boise_tags, - ContainerEq(std::vector{"O", "O", "O", "O", "O"})); -} - -TEST(OffsetsToBoiseTagsTest, ExtractBEEntity) { - // 1 2 3 - // 0123456789012345678901234567890 - std::string content = "Who let the german shepherd out"; - std::string entity = "german shepherd"; - std::vector token_begin_offsets = {0, 4, 8, 12, 19, 28}; - std::vector token_end_offsets = {3, 7, 11, 18, 27, 31}; - std::vector entity_begin_offsets = {12}; - std::vector entity_end_offsets = {27}; - std::vector entity_type = {"animal"}; - - std::vector boise_tags = - OffsetsToBoiseTags(token_begin_offsets, token_end_offsets, - entity_begin_offsets, entity_end_offsets, entity_type) - .ValueOrDie(); - EXPECT_THAT(boise_tags, ContainerEq(std::vector{ - "O", "O", "O", "B-animal", "E-animal", "O"})); -} - -TEST(OffsetsToBoiseTagsTest, ExtractBIEEntity) { - // 1 2 3 - // 0123456789012345678901234567890 - std::string content = "How big is Los Angeles County?"; - std::string entity = "Los Angeles County"; - std::vector token_begin_offsets = {0, 4, 8, 11, 15, 23, 29}; - std::vector token_end_offsets = {3, 7, 10, 14, 22, 29, 30}; - std::vector entity_begin_offsets = {11}; - std::vector entity_end_offsets = {29}; - std::vector entity_type = {"loc"}; - - std::vector boise_tags = - OffsetsToBoiseTags(token_begin_offsets, token_end_offsets, - entity_begin_offsets, entity_end_offsets, entity_type) - .ValueOrDie(); - EXPECT_THAT(boise_tags, ContainerEq(std::vector{ - "O", "O", "O", "B-loc", "I-loc", "E-loc", "O"})); -} - -TEST(OffsetsToBoiseTagsTest, ExtractMutipleEntities) { - // 1 2 3 - // 01234567890123456789012345678901234567 - std::string content = "Getty Center is in Los Angeles County"; - std::vector token_begin_offsets = {0, 6, 13, 16, 19, 23, 31}; - std::vector token_end_offsets = {5, 12, 15, 18, 22, 30, 37}; - std::vector entity_begin_offsets = {0, 19}; - std::vector entity_end_offsets = {12, 37}; - std::vector entity_type = {"org", "loc"}; - - std::vector boise_tags = - OffsetsToBoiseTags(token_begin_offsets, token_end_offsets, - entity_begin_offsets, entity_end_offsets, entity_type) - .ValueOrDie(); - EXPECT_THAT(boise_tags, - ContainerEq(std::vector{"B-org", "E-org", "O", "O", - "B-loc", "I-loc", "E-loc"})); -} - -TEST(OffsetsToBoiseTagsTest, LooseBoundary) { - // 1 2 3 - // 01234567890123456789012345678901234567 - std::string content = "Getty Center is in Los Angeles County"; - std::vector token_begin_offsets = {0, 6, 13, 16, 19, 23, 31}; - std::vector token_end_offsets = {5, 12, 15, 18, 22, 30, 37}; - std::vector entity_begin_offsets = {3, 19}; - std::vector entity_end_offsets = {10, 32}; - std::vector entity_type = {"org", "loc"}; - - std::vector boise_tags = - OffsetsToBoiseTags(token_begin_offsets, token_end_offsets, - entity_begin_offsets, entity_end_offsets, entity_type) - .ValueOrDie(); - EXPECT_THAT(boise_tags, - ContainerEq(std::vector{"B-org", "E-org", "O", "O", - "B-loc", "I-loc", "E-loc"})); -} - -TEST(OffsetsToBoiseTagsTest, StrictBoundary) { - // 1 2 3 - // 01234567890123456789012345678901234567 - std::string content = "Getty Center is in Los Angeles County"; - std::vector token_begin_offsets = {0, 6, 13, 16, 19, 23, 31}; - std::vector token_end_offsets = {5, 12, 15, 18, 22, 30, 37}; - std::vector entity_begin_offsets = {3, 19}; - std::vector entity_end_offsets = {12, 32}; - std::vector entity_type = {"org", "loc"}; - bool use_strict_boundary_mode = true; - - std::vector boise_tags = - OffsetsToBoiseTags(token_begin_offsets, token_end_offsets, - entity_begin_offsets, entity_end_offsets, entity_type, - use_strict_boundary_mode) - .ValueOrDie(); - EXPECT_THAT(boise_tags, ContainerEq(std::vector{ - "O", "E-org", "O", "O", "B-loc", "I-loc", "O"})); -} - -TEST(OffsetsToBoiseTagsTest, OneTokenMultiEntitiesLastPrecedes) { - // 1 - // 0123456789012 - std::string content = "Getty Center"; - std::vector token_begin_offsets = {0}; - std::vector token_end_offsets = {12}; - std::vector entity_begin_offsets = {0, 6}; - std::vector entity_end_offsets = {5, 12}; - std::vector entity_type = {"per", "loc"}; - - std::vector boise_tags = - OffsetsToBoiseTags(token_begin_offsets, token_end_offsets, - entity_begin_offsets, entity_end_offsets, entity_type) - .ValueOrDie(); - EXPECT_THAT(boise_tags, ContainerEq(std::vector{"B-loc"})); -} - -TEST(OffsetsToBoiseTagsTest, OneTokenMultEntitiesPartialOverlapLastPrecedes) { - // 1 - // 0123456789012 - std::string content = "Getty Center"; - std::vector token_begin_offsets = {0, 6}; - std::vector token_end_offsets = {5, 12}; - std::vector entity_begin_offsets = {0, 9}; - std::vector entity_end_offsets = {8, 12}; - std::vector entity_type = {"per", "loc"}; - - std::vector boise_tags = - OffsetsToBoiseTags(token_begin_offsets, token_end_offsets, - entity_begin_offsets, entity_end_offsets, entity_type) - .ValueOrDie(); - EXPECT_THAT(boise_tags, - ContainerEq(std::vector{"B-per", "B-loc"})); -} - -TEST(OffsetsToBoiseTagsTest, MultiTokensOneEntityPartialOverlapLastPrecedes) { - // 1 2 3 - // 01234567890123456789012345678901234 - std::string content = "Getty Center, Los Angeles County"; - std::vector token_begin_offsets = {0, 6, 14, 18, 26}; - std::vector token_end_offsets = {5, 12, 17, 25, 32}; - std::vector entity_begin_offsets = {0, 15}; - std::vector entity_end_offsets = {14, 30}; - std::vector entity_type = {"org", "loc"}; - - std::vector boise_tags = - OffsetsToBoiseTags(token_begin_offsets, token_end_offsets, - entity_begin_offsets, entity_end_offsets, entity_type) - .ValueOrDie(); - EXPECT_THAT(boise_tags, ContainerEq(std::vector{ - "B-org", "I-org", "B-loc", "I-loc", "E-loc"})); -} - -TEST(OffsetsToBoiseTagsTest, EmptySpanOffsets) { - std::vector token_begin_offsets = {0, 6, 13, 16, 19, 23, 31}; - std::vector token_end_offsets = {5, 12, 15, 18, 22, 30, 37}; - std::vector entity_begin_offsets = {}; - std::vector entity_end_offsets = {}; - std::vector entity_type = {}; - - std::vector boise_tags = - OffsetsToBoiseTags(token_begin_offsets, token_end_offsets, - entity_begin_offsets, entity_end_offsets, entity_type) - .ValueOrDie(); - EXPECT_THAT(boise_tags, ContainerEq(std::vector{ - "O", "O", "O", "O", "O", "O", "O"})); -} - -TEST(OffsetsToBoiseTagsTest, InputSizeError) { - std::vector token_begin_offsets = {0, 4, 8, 12, 17}; - std::vector token_end_offsets = {3, 7, 11, 16, 20}; - std::vector entity_begin_offsets = {12}; - std::vector entity_end_offsets = {16}; - std::vector entity_type = {"animal", "extra_entity"}; - EXPECT_FALSE(OffsetsToBoiseTags(token_begin_offsets, token_end_offsets, - entity_begin_offsets, entity_end_offsets, - entity_type) - .ok()); -} - -// Test that BOISE tags can be transformed into offets -TEST(BoiseTagsToOffsetTest, BeginAndEndTagsAreConvertedToOffsets) { - // 1 2 3 - // 0123456789012345678901234567890 - std::string content = "Who let the german shepherd out"; - std::vector token_begin_offsets = {0, 4, 8, 12, 19, 28}; - std::vector token_end_offsets = {3, 7, 11, 18, 27, 31}; - std::vector boise_tags = {"O", "O", "O", - "B-animal", "E-animal", "O"}; - - auto [begin_offsets, end_offsets, span_types] = - BoiseTagsToOffsets(token_begin_offsets, token_end_offsets, boise_tags) - .ValueOrDie(); - - auto texts = ExtractTextsFromOffsets(content, begin_offsets, end_offsets); - EXPECT_THAT(texts, ContainerEq(std::vector{"german shepherd"})); - EXPECT_THAT(span_types, ContainerEq(std::vector{"animal"})); -} - -TEST(BoiseTagsToOffsetTest, SingletonTagsAreExtracted) { - // 1 2 - // 012345678901234567890 - std::string content = "Who let the dogs out"; - std::vector token_begin_offsets = {0, 4, 8, 12, 17}; - std::vector token_end_offsets = {3, 7, 11, 16, 20}; - std::vector boise_tags = {"O", "O", "O", "S-animal", "O"}; - - auto [begin_offsets, end_offsets, span_types] = - BoiseTagsToOffsets(token_begin_offsets, token_end_offsets, boise_tags) - .ValueOrDie(); - - auto texts = ExtractTextsFromOffsets(content, begin_offsets, end_offsets); - EXPECT_THAT(texts, ContainerEq(std::vector{"dogs"})); - EXPECT_THAT(span_types, ContainerEq(std::vector{"animal"})); -} - -TEST(BoiseTagsToOffsetTest, BeginInsideAndEndLabelsAreExtracted) { - // 1 2 3 - // 0123456789012345678901234567890 - std::string content = "How big is Los Angeles County?"; - std::vector token_begin_offsets = {0, 4, 8, 11, 15, 23, 29}; - std::vector token_end_offsets = {3, 7, 10, 14, 22, 29, 30}; - std::vector boise_tags = {"O", "O", "O", "B-loc", - "I-loc", "E-loc", "O"}; - - auto [begin_offsets, end_offsets, span_types] = - BoiseTagsToOffsets(token_begin_offsets, token_end_offsets, boise_tags) - .ValueOrDie(); - - auto texts = ExtractTextsFromOffsets(content, begin_offsets, end_offsets); - EXPECT_THAT(texts, - ContainerEq(std::vector{"Los Angeles County"})); - EXPECT_THAT(span_types, ContainerEq(std::vector{"loc"})); -} - -TEST(BoiseTagsToOffsetTest, InsideEndLabelsAreExtracted) { - // 1 2 3 - // 0123456789012345678901234567890 - std::string content = "Who let the german shepherd out"; - std::vector token_begin_offsets = {0, 4, 8, 12, 19, 28}; - std::vector token_end_offsets = {3, 7, 11, 18, 27, 31}; - std::vector boise_tags = {"O", "O", "O", - "I-animal", "E-animal", "O"}; - - auto [begin_offsets, end_offsets, span_types] = - BoiseTagsToOffsets(token_begin_offsets, token_end_offsets, boise_tags) - .ValueOrDie(); - - auto texts = ExtractTextsFromOffsets(content, begin_offsets, end_offsets); - EXPECT_THAT(texts, ContainerEq(std::vector{"german shepherd"})); - EXPECT_THAT(span_types, ContainerEq(std::vector{"animal"})); -} - -TEST(BoiseTagsToOffsetTest, BeginInsideLabelsAreExtracted) { - // 1 2 3 - // 0123456789012345678901234567890 - std::string content = "Who let the german shepherd out"; - std::vector token_begin_offsets = {0, 4, 8, 12, 19, 28}; - std::vector token_end_offsets = {3, 7, 11, 18, 27, 31}; - std::vector boise_tags = {"O", "O", "O", - "B-animal", "I-animal", "O"}; - - auto [begin_offsets, end_offsets, span_types] = - BoiseTagsToOffsets(token_begin_offsets, token_end_offsets, boise_tags) - .ValueOrDie(); - - auto texts = ExtractTextsFromOffsets(content, begin_offsets, end_offsets); - EXPECT_THAT(texts, ContainerEq(std::vector{"german shepherd"})); - EXPECT_THAT(span_types, ContainerEq(std::vector{"animal"})); -} - -TEST(BoiseTagsToOffsetTest, InsideOnlyLabelIsExtracted) { - // 1 2 3 - // 0123456789012345678901234567890 - std::string content = "Who let the shepherd out"; - std::vector token_begin_offsets = {0, 4, 8, 12, 21}; - std::vector token_end_offsets = {3, 7, 11, 20, 24}; - std::vector boise_tags = { - "O", "O", "O", "I-animal", "O", - }; - - auto [begin_offsets, end_offsets, span_types] = - BoiseTagsToOffsets(token_begin_offsets, token_end_offsets, boise_tags) - .ValueOrDie(); - - auto texts = ExtractTextsFromOffsets(content, begin_offsets, end_offsets); - EXPECT_THAT(texts, ContainerEq(std::vector{"shepherd"})); - EXPECT_THAT(span_types, ContainerEq(std::vector{"animal"})); -} - -TEST(BoiseTagsToOffsetTest, BeginOnlyLabelIsExtracted) { - // 1 2 3 - // 0123456789012345678901234567890 - std::string content = "Who let the shepherd out"; - std::vector token_begin_offsets = {0, 4, 8, 12, 21}; - std::vector token_end_offsets = {3, 7, 11, 20, 24}; - std::vector boise_tags = { - "O", "O", "O", "B-animal", "O", - }; - - auto [begin_offsets, end_offsets, span_types] = - BoiseTagsToOffsets(token_begin_offsets, token_end_offsets, boise_tags) - .ValueOrDie(); - - auto texts = ExtractTextsFromOffsets(content, begin_offsets, end_offsets); - EXPECT_THAT(texts, ContainerEq(std::vector{"shepherd"})); - EXPECT_THAT(span_types, ContainerEq(std::vector{"animal"})); -} - -TEST(BoiseTagsToOffsetTest, EndOnlyLabelIsExtracted) { - // 1 2 3 - // 0123456789012345678901234567890 - std::string content = "Who let the shepherd out"; - std::vector token_begin_offsets = {0, 4, 8, 12, 21}; - std::vector token_end_offsets = {3, 7, 11, 20, 24}; - std::vector boise_tags = { - "O", "O", "O", "E-animal", "O", - }; - - auto [begin_offsets, end_offsets, span_types] = - BoiseTagsToOffsets(token_begin_offsets, token_end_offsets, boise_tags) - .ValueOrDie(); - - auto texts = ExtractTextsFromOffsets(content, begin_offsets, end_offsets); - EXPECT_THAT(texts, ContainerEq(std::vector{"shepherd"})); - EXPECT_THAT(span_types, ContainerEq(std::vector{"animal"})); -} - -TEST(BoiseTagsToOffsetTest, MultipleEntitiesAreExtracted) { - // 1 2 3 - // 01234567890123456789012345678901234567 - std::string content = "Getty Center is in Los Angeles County"; - std::vector token_begin_offsets = {0, 6, 13, 16, 19, 23, 31}; - std::vector token_end_offsets = {5, 12, 15, 18, 22, 30, 37}; - std::vector boise_tags = {"B-org", "E-org", "O", "O", - "B-loc", "I-loc", "E-loc"}; - - auto [begin_offsets, end_offsets, span_types] = - BoiseTagsToOffsets(token_begin_offsets, token_end_offsets, boise_tags) - .ValueOrDie(); - - auto texts = ExtractTextsFromOffsets(content, begin_offsets, end_offsets); - EXPECT_THAT(texts, ContainerEq(std::vector{ - "Getty Center", "Los Angeles County"})); - EXPECT_THAT(span_types, ContainerEq(std::vector{"org", "loc"})); -} - -TEST(BoiseTagsToOffsetTest, MultipleBeginLabels) { - // 1 2 3 - // 0123456789012345678901234567890 - std::string content = "Who let the german shepherd out"; - std::vector token_begin_offsets = {0, 4, 8, 12, 19, 28}; - std::vector token_end_offsets = {3, 7, 11, 18, 27, 31}; - std::vector boise_tags = {"O", "O", "O", - "B-loc", "B-animal", "O"}; - - auto [begin_offsets, end_offsets, span_types] = - BoiseTagsToOffsets(token_begin_offsets, token_end_offsets, boise_tags) - .ValueOrDie(); - - auto texts = ExtractTextsFromOffsets(content, begin_offsets, end_offsets); - EXPECT_THAT(texts, ContainerEq(std::vector{"german shepherd"})); - EXPECT_THAT(span_types, ContainerEq(std::vector{"animal"})); -} - -TEST(BoiseTagsToOffsetTest, MultipleInsideLabels) { - // 1 2 3 - // 0123456789012345678901234567890 - std::string content = "Who let the german shepherd out"; - std::vector token_begin_offsets = {0, 4, 8, 12, 19, 28}; - std::vector token_end_offsets = {3, 7, 11, 18, 27, 31}; - std::vector boise_tags = {"O", "O", "O", - "I-loc", "I-animal", "O"}; - - auto [begin_offsets, end_offsets, span_types] = - BoiseTagsToOffsets(token_begin_offsets, token_end_offsets, boise_tags) - .ValueOrDie(); - - auto texts = ExtractTextsFromOffsets(content, begin_offsets, end_offsets); - EXPECT_THAT(texts, ContainerEq(std::vector{"german shepherd"})); - EXPECT_THAT(span_types, ContainerEq(std::vector{"animal"})); -} - -TEST(BoiseTagsToOffsetTest, MultipleEndLabels) { - // 1 2 3 - // 0123456789012345678901234567890 - std::string content = "Who let the german shepherd out"; - std::vector token_begin_offsets = {0, 4, 8, 12, 19, 28}; - std::vector token_end_offsets = {3, 7, 11, 18, 27, 31}; - std::vector boise_tags = {"O", "O", "O", - "E-loc", "E-animal", "O"}; - - auto [begin_offsets, end_offsets, span_types] = - BoiseTagsToOffsets(token_begin_offsets, token_end_offsets, boise_tags) - .ValueOrDie(); - - auto texts = ExtractTextsFromOffsets(content, begin_offsets, end_offsets); - EXPECT_THAT(texts, - ContainerEq(std::vector{"german", "shepherd"})); - EXPECT_THAT(span_types, - ContainerEq(std::vector{"loc", "animal"})); -} - -TEST(BoiseTagsToOffsetTest, MultipleSingleLabels) { - // 1 2 3 - // 0123456789012345678901234567890 - std::string content = "Who let the german shepherd out"; - std::vector token_begin_offsets = {0, 4, 8, 12, 19, 28}; - std::vector token_end_offsets = {3, 7, 11, 18, 27, 31}; - std::vector boise_tags = {"O", "O", "O", - "S-loc", "S-animal", "O"}; - - auto [begin_offsets, end_offsets, span_types] = - BoiseTagsToOffsets(token_begin_offsets, token_end_offsets, boise_tags) - .ValueOrDie(); - - auto texts = ExtractTextsFromOffsets(content, begin_offsets, end_offsets); - EXPECT_THAT(texts, - ContainerEq(std::vector{"german", "shepherd"})); - EXPECT_THAT(span_types, - ContainerEq(std::vector{"loc", "animal"})); -} - -TEST(BoiseTagsToOffsetTest, TrailingBeginLabels) { - // 1 2 3 - // 0123456789012345678901234567890 - std::string content = "Who own the german shepherd"; - std::vector token_begin_offsets = {0, 4, 8, 12, 19}; - std::vector token_end_offsets = {3, 7, 11, 18, 27}; - std::vector boise_tags = {"O", "O", "O", "B-loc", "B-animal"}; - - auto [begin_offsets, end_offsets, span_types] = - BoiseTagsToOffsets(token_begin_offsets, token_end_offsets, boise_tags) - .ValueOrDie(); - - auto texts = ExtractTextsFromOffsets(content, begin_offsets, end_offsets); - EXPECT_THAT(texts, ContainerEq(std::vector{"german shepherd"})); - EXPECT_THAT(span_types, ContainerEq(std::vector{"animal"})); -} - -TEST(BoiseTagsToOffsetTest, NoBoiseLabels) { - std::vector token_begin_offsets = {0, 4, 8, 12, 19}; - std::vector token_end_offsets = {3, 7, 11, 16, 20}; - std::vector boise_tags = {"O", "O", "O", "O", "O"}; - - auto [begin_offsets, end_offsets, span_types] = - BoiseTagsToOffsets(token_begin_offsets, token_end_offsets, boise_tags) - .ValueOrDie(); - - EXPECT_TRUE(begin_offsets.empty()); - EXPECT_TRUE(end_offsets.empty()); - EXPECT_TRUE(span_types.empty()); -} - -TEST(BoiseTagsToOffsetTest, InputSizeError) { - std::vector token_begin_offsets = {0, 4, 8, 12}; - std::vector token_end_offsets = {3, 7, 11, 16, 20}; - std::vector boise_tags = {"O", "O", "O", "B-loc", "B-animal"}; - EXPECT_FALSE( - BoiseTagsToOffsets(token_begin_offsets, token_end_offsets, boise_tags) - .ok()); -} - -TEST(GetAllBoiseTagsFromSpanTypeTest, GetAllTagsCorrect) { - std::vector span_type = {"loc", "O", "per", ""}; - std::unordered_set all_tags = - GetAllBoiseTagsFromSpanType(span_type); - EXPECT_THAT(all_tags, ContainerEq(std::unordered_set{ - "O", "B-loc", "I-loc", "S-loc", "E-loc", "B-per", - "I-per", "S-per", "E-per"})); -} - -} // namespace -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/byte_splitter.cc b/tensorflow_text/core/kernels/byte_splitter.cc deleted file mode 100644 index df7201717..000000000 --- a/tensorflow_text/core/kernels/byte_splitter.cc +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/byte_splitter.h" - -#include - -namespace tensorflow { -namespace text { - -void ByteSplitter::Split(const absl::string_view input, - std::vector* bytes, - std::vector* start_offsets, - std::vector* end_offsets) const { - if (input.empty()) return; - Split(input, bytes); - start_offsets->push_back(0); - for (int i = 1; i < input.size(); ++i) { - start_offsets->push_back(i); - end_offsets->push_back(i); - } - end_offsets->push_back(input.size()); -} - -void ByteSplitter::Split(const absl::string_view input, - std::vector* bytes, - std::vector* offsets) const { - if (input.empty()) return; - Split(input, bytes); - for (int i = 0; i <= input.size(); ++i) { - offsets->push_back(i); - } -} - -void ByteSplitter::Split(const absl::string_view input, - std::vector* bytes) const { - for (const auto& c : input) { - bytes->push_back(c); - } -} - -absl::StatusOr> ByteSplitter::SplitByOffsets( - absl::string_view input, - absl::Span start_offsets, - absl::Span end_offsets) const { - std::vector result; - int num = std::min(start_offsets.size(), end_offsets.size()); - for (int i = 0; i < num; ++i) { - if (start_offsets[i] < 0 || start_offsets[i] > input.size()) { - return absl::InvalidArgumentError("Start offsets out of range."); - } - if (end_offsets[i] < 0 || end_offsets[i] > input.size()) { - return absl::InvalidArgumentError("End offsets out of range."); - } - if (start_offsets[i] > end_offsets[i]) { - return absl::InvalidArgumentError("Start offset after end offset."); - } - result.push_back(input.substr(start_offsets[i], - end_offsets[i] - start_offsets[i])); - } - return result; -} - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/byte_splitter.h b/tensorflow_text/core/kernels/byte_splitter.h index 954b7ccfd..46582ac2a 100644 --- a/tensorflow_text/core/kernels/byte_splitter.h +++ b/tensorflow_text/core/kernels/byte_splitter.h @@ -12,95 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BYTE_TOKENIZER_H_ -#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BYTE_TOKENIZER_H_ +#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BYTE_SPLITTER_H_ +#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BYTE_SPLITTER_H_ -#include -#include +#include "tensorflow/core/kernels/text/byte_splitter.h" -#include "absl/status/statusor.h" -#include "absl/strings/string_view.h" - -namespace tensorflow { -namespace text { - -class ByteSplitter { - public: - // Creates an instance. - ByteSplitter() { } - - // Tokenizes a string into bytes. - // - // Example: - // input = "uñ" - // bytes = [117, 195, 177] - // start_offsets = [0, 1, 2] - // end_offsets = [1, 2, 3] - // - // Args: - // * input: The string of an input. - // * bytes: The output bytes. - // * start_offsets: The start offsets of output bytes in the input text. - // * end_offsets: The end offsets of output bytes in the input text. - // Note: the start offsets are inclusive and the end offsets are exclusive. - void Split(const absl::string_view input, - std::vector* bytes, - std::vector* start_offsets, - std::vector* end_offsets) const; - - // Tokenizes a string into bytes. - // - // Example: - // input = "uñ" - // bytes = [117, 195, 177] - // offsets = [0, 1, 2, 3] - // - // Args: - // * input: The string of an input. - // * bytes: The output bytes. - // * offsets: The offsets of output bytes in the input text. The size will - // be one plus the input. Each value is the mapped offset of each byte of - // the original input text. The final value maps the end. - // Note: the start offsets are inclusive and the end offsets are exclusive. - void Split(const absl::string_view input, - std::vector* bytes, - std::vector* offsets) const; - - // Tokenizes a string into bytes. - // - // Example: - // input = "uñ" - // bytes = [117, 195, 177] - // - // Args: - // * input: The string of an input. - // * bytes: The output bytes. - void Split(const absl::string_view input, - std::vector* bytes) const; - - // Splits a string by the given start and end offsets. - // - // Example: - // input = "uñ" - // start_offsets = [0, 1] - // end_offsets = [1, 3] - // string = ["u", "ñ"] - // - // Args: - // * input: The string of an input. - // * start_offsets: Input byte index where the new strings start (inclusive). - // * end_offsets: Input byte index where the new strings end. (exclusive) - // - // Return: - // The split substrings. - absl::StatusOr> SplitByOffsets( - absl::string_view input, - absl::Span start_offsets, - absl::Span end_offsets) const; -}; - -} // namespace text -} // namespace tensorflow - - -#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BYTE_TOKENIZER_H_ +#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BYTE_SPLITTER_H_ diff --git a/tensorflow_text/core/kernels/byte_splitter_kernel.cc b/tensorflow_text/core/kernels/byte_splitter_kernel.cc deleted file mode 100644 index b05d7d29c..000000000 --- a/tensorflow_text/core/kernels/byte_splitter_kernel.cc +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/byte_splitter_kernel.h" - -#include "tensorflow/core/framework/op_kernel.h" - -namespace tensorflow { -namespace text { - -REGISTER_KERNEL_BUILDER(Name(ByteSplitterWithOffsetsOpKernel::OpName()) - .Device(tensorflow::DEVICE_CPU), - ByteSplitterWithOffsetsOpKernel); - -REGISTER_KERNEL_BUILDER(Name(ByteSplitByOffsetsOpKernel::OpName()) - .Device(tensorflow::DEVICE_CPU), - ByteSplitByOffsetsOpKernel); - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/byte_splitter_kernel.h b/tensorflow_text/core/kernels/byte_splitter_kernel.h index c3e3df413..10e999f51 100644 --- a/tensorflow_text/core/kernels/byte_splitter_kernel.h +++ b/tensorflow_text/core/kernels/byte_splitter_kernel.h @@ -15,25 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BYTE_SPLITTER_KERNEL_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BYTE_SPLITTER_KERNEL_H_ -#include "tensorflow/lite/kernels/shim/tf_op_shim.h" -#include "tensorflow_text/core/kernels/byte_splitter_kernel_template.h" - -namespace tensorflow { -namespace text { - -class ByteSplitterWithOffsetsOpKernel - : public tflite::shim::TfOpKernel { - public: - using TfOpKernel::TfOpKernel; -}; - -class ByteSplitByOffsetsOpKernel - : public tflite::shim::TfOpKernel { - public: - using TfOpKernel::TfOpKernel; -}; - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/byte_splitter_kernel.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BYTE_SPLITTER_KERNEL_H_ diff --git a/tensorflow_text/core/kernels/byte_splitter_kernel_template.h b/tensorflow_text/core/kernels/byte_splitter_kernel_template.h index 77ab2b1ba..c61f6e7f9 100644 --- a/tensorflow_text/core/kernels/byte_splitter_kernel_template.h +++ b/tensorflow_text/core/kernels/byte_splitter_kernel_template.h @@ -15,299 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BYTE_SPLITTER_KERNEL_TEMPLATE_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BYTE_SPLITTER_KERNEL_TEMPLATE_H_ -#include -#include - -#include "absl/status/status.h" -#include "tensorflow/core/platform/tstring.h" -#include "tensorflow/lite/kernels/shim/op_kernel.h" -#include "tensorflow/lite/kernels/shim/shape.h" -#include "tensorflow/lite/kernels/shim/status_macros.h" -#include "tensorflow_text/core/kernels/byte_splitter.h" - -namespace tensorflow { -namespace text { - -template -class ByteSplitterWithOffsetsOp - : public tflite::shim::OpKernelShim { - private: - enum Inputs { - kInputValues = 0 - }; - enum Outputs { - kOutputBytes = 0, - kOutputRowSplits, - kOutputStartOffsets, - kOutputEndOffsets - }; - - using typename tflite::shim::OpKernelShim::InitContext; - using typename tflite::shim::OpKernelShim::InvokeContext; - using typename tflite::shim::OpKernelShim::ShapeInferenceContext; - - public: - ByteSplitterWithOffsetsOp() = default; - static constexpr char kOpName[] = "TFText>ByteSplitWithOffsets"; - static constexpr char kDoc[] = R"doc( - Splits a string into bytes - )doc"; - - static const char* OpName() { return kOpName; } - static const char* Doc() { return kDoc; } - - // Attributes declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Attrs() { return {}; } - - // Inputs declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Inputs(); - - // Outputs declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Outputs(); - - // Initializes the op - absl::Status Init(InitContext* context) { return absl::OkStatus(); } - - // Runs the operation - absl::Status Invoke(InvokeContext* context); - - // Shape inference - static absl::Status ShapeInference(ShapeInferenceContext* c); -}; - -template -std::vector ByteSplitterWithOffsetsOp::Inputs() { - return {"input_values: string"}; -} - -template -std::vector ByteSplitterWithOffsetsOp::Outputs() { - return {"output_bytes: uint8", "output_row_splits: int64", - "output_start_offsets: int32", "output_end_offsets: int32"}; -} - -template -absl::Status ByteSplitterWithOffsetsOp::ShapeInference( - ShapeInferenceContext* c) { - using tflite::shim::Shape; - const auto rank_1_shape = Shape({Shape::kUnknownDim}); - - SH_ASSIGN_OR_RETURN(const Shape& input_values_shape, - c->GetInputShape(kInputValues)); - if (!input_values_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Input values shape must be rank 1: ", - input_values_shape.ToString())); - } - - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputBytes, rank_1_shape)); - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputStartOffsets, rank_1_shape)); - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputEndOffsets, rank_1_shape)); - const int num_splits = Shape::AddDims(1, input_values_shape.Dim(0)); - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputRowSplits, Shape({num_splits}))); - - return absl::OkStatus(); -} - -template - absl::Status ByteSplitterWithOffsetsOp - ::Invoke(InvokeContext* context) { - // Inputs - SH_ASSIGN_OR_RETURN(const auto values_view, context->GetInput(kInputValues)); - const auto values = values_view->template As(); - - ByteSplitter splitter; - - // Outputs - std::vector bytes; - std::vector row_splits; - std::vector start_offsets; - std::vector end_offsets; - - // Iterate through all the string values and split them. - row_splits.push_back(0); - for (int i = 0; i < values.Dim(0); ++i) { - // Split into bytes and record the offset locations. - const int orig_num_bytes = bytes.size(); - splitter.Split(values(i), &bytes, &start_offsets, &end_offsets); - const int delta_num_bytes = bytes.size() - orig_num_bytes; - // Record the row splits. - row_splits.push_back(delta_num_bytes + row_splits.back()); - } - - // Allocate output & fill output tensors. - SH_RETURN_IF_ERROR(this->template FillOutputTensor( - bytes, kOutputBytes, context)); - SH_RETURN_IF_ERROR(this->template FillOutputTensor( - row_splits, kOutputRowSplits, context)); - SH_RETURN_IF_ERROR(this->template FillOutputTensor( - start_offsets, kOutputStartOffsets, context)); - SH_RETURN_IF_ERROR(this->template FillOutputTensor( - end_offsets, kOutputEndOffsets, context)); - - return absl::OkStatus(); -} - - -template -class ByteSplitByOffsetsOp - : public tflite::shim::OpKernelShim { - private: - enum Inputs { - kInputValues = 0, - kInputStartOffsets, - kInputEndOffsets, - kInputRowSplits - }; - enum Outputs { - kOutputValues = 0, - kOutputRowSplits, - }; - - using typename tflite::shim::OpKernelShim::InitContext; - using typename tflite::shim::OpKernelShim::InvokeContext; - using typename tflite::shim::OpKernelShim::ShapeInferenceContext; - - public: - ByteSplitByOffsetsOp() = default; - static constexpr char kOpName[] = "TFText>ByteSplitByOffsets"; - static constexpr char kDoc[] = R"doc( - Splits a string into bytes using the given start and end offsets. - )doc"; - - static const char* OpName() { return kOpName; } - static const char* Doc() { return kDoc; } - - // Attributes declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Attrs() { return {}; } - - // Inputs declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Inputs(); - - // Outputs declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Outputs(); - - // Initializes the op - absl::Status Init(InitContext* context) { return absl::OkStatus(); } - - // Runs the operation - absl::Status Invoke(InvokeContext* context); - - // Shape inference - static absl::Status ShapeInference(ShapeInferenceContext* c); -}; - -template -std::vector ByteSplitByOffsetsOp::Inputs() { - return {"input_values: string", "input_start_offsets: int32", - "input_end_offsets: int32", "input_row_splits: int64"}; -} - -template -std::vector ByteSplitByOffsetsOp::Outputs() { - return {"output_values: string", "output_row_splits: int64"}; -} - -template -absl::Status ByteSplitByOffsetsOp::ShapeInference( - ShapeInferenceContext* c) { - using tflite::shim::Shape; - const auto rank_1_shape = Shape({Shape::kUnknownDim}); - // input values shape - SH_ASSIGN_OR_RETURN(const Shape& input_values_shape, - c->GetInputShape(kInputValues)); - if (!input_values_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Input values shape must be rank 1: ", - input_values_shape.ToString())); - } - // input starts shape - SH_ASSIGN_OR_RETURN(const Shape& input_starts_shape, - c->GetInputShape(kInputStartOffsets)); - if (!input_starts_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Input start offsets shape must be rank 1: ", - input_starts_shape.ToString())); - } - // input ends shape - SH_ASSIGN_OR_RETURN(const Shape& input_ends_shape, - c->GetInputShape(kInputEndOffsets)); - if (!input_ends_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Input end offsets shape must be rank 1: ", - input_ends_shape.ToString())); - } - // input row splits shape - SH_ASSIGN_OR_RETURN(const Shape& input_row_splits_shape, - c->GetInputShape(kInputRowSplits)); - if (!input_row_splits_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Input row splits shape must be rank 1: ", - input_row_splits_shape.ToString())); - } - - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputValues, input_starts_shape)); - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputRowSplits, - input_row_splits_shape)); - - return absl::OkStatus(); -} - -template - absl::Status ByteSplitByOffsetsOp - ::Invoke(InvokeContext* context) { - // Inputs - SH_ASSIGN_OR_RETURN(const auto input_values_view, - context->GetInput(kInputValues)); - const auto input_values = - input_values_view->template As(); - SH_ASSIGN_OR_RETURN(const auto starts_view, - context->GetInput(kInputStartOffsets)); - const auto starts = starts_view->template As(); - SH_ASSIGN_OR_RETURN(const auto ends_view, - context->GetInput(kInputEndOffsets)); - const auto ends = ends_view->template As(); - SH_ASSIGN_OR_RETURN(const auto in_splits_view, - context->GetInput(kInputRowSplits)); - const auto in_splits = in_splits_view->template As(); - - ByteSplitter splitter; - - // Outputs - std::vector output_values; - std::vector out_splits; - - // Iterate through all the string values and split them. - out_splits.push_back(0); - for (int i = 0; i < input_values.Dim(0); ++i) { - SH_ASSIGN_OR_RETURN(auto batch, - splitter.SplitByOffsets( - input_values(i), - absl::MakeSpan(starts.Ptr() + in_splits(i), - in_splits(i+1) - in_splits(i)), - absl::MakeSpan(ends.Ptr() + in_splits(i), - in_splits(i+1) - in_splits(i)))); - output_values.insert(output_values.end(), batch.begin(), batch.end()); - out_splits.push_back(batch.size() + out_splits.back()); - } - - // Allocate output & fill output tensors. - SH_RETURN_IF_ERROR( - this->template FillOutputTensor( - output_values, kOutputValues, context)); - SH_RETURN_IF_ERROR(this->template FillOutputTensor( - out_splits, kOutputRowSplits, context)); - - return absl::OkStatus(); -} - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/byte_splitter_kernel_template.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BYTE_SPLITTER_KERNEL_TEMPLATE_H_ diff --git a/tensorflow_text/core/kernels/byte_splitter_test.cc b/tensorflow_text/core/kernels/byte_splitter_test.cc deleted file mode 100644 index c6b4d6e72..000000000 --- a/tensorflow_text/core/kernels/byte_splitter_test.cc +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/byte_splitter.h" - -#include - -#include - -namespace tensorflow { -namespace text { -namespace { - -using ::testing::ElementsAre; - -TEST(ByteSplitterTest, SplitAscii) { - const absl::string_view input_string("hello"); - std::vector output_bytes; - std::vector output_offsets; - ByteSplitter s; - s.Split(input_string, &output_bytes, &output_offsets); - EXPECT_THAT(output_bytes, ElementsAre(104, 101, 108, 108, 111)); - EXPECT_THAT(output_offsets, ElementsAre(0, 1, 2, 3, 4, 5)); -} - -TEST(ByteSplitterTest, SplitUnicode) { - const absl::string_view input_string("muñdʓ"); - std::vector output_bytes; - std::vector output_offsets; - ByteSplitter s; - s.Split(input_string, &output_bytes, &output_offsets); - EXPECT_THAT(output_bytes, ElementsAre(109, 117, 195, 177, 100, 202, 147)); - EXPECT_THAT(output_offsets, ElementsAre(0, 1, 2, 3, 4, 5, 6, 7)); -} - -TEST(ByteSplitterTest, SplitEmoji) { - const absl::string_view input_string("😀🙃"); - std::vector output_bytes; - std::vector output_offsets; - ByteSplitter s; - s.Split(input_string, &output_bytes, &output_offsets); - EXPECT_THAT(output_bytes, - ElementsAre(240, 159, 152, 128, 240, 159, 153, 131)); - EXPECT_THAT(output_offsets, ElementsAre(0, 1, 2, 3, 4, 5, 6, 7, 8)); -} - -TEST(ByteSplitterTest, SplitHanzi) { - const absl::string_view input_string("你好"); - std::vector output_bytes; - std::vector output_offsets; - ByteSplitter s; - s.Split(input_string, &output_bytes, &output_offsets); - EXPECT_THAT(output_bytes, ElementsAre(228, 189, 160, 229, 165, 189)); - EXPECT_THAT(output_offsets, ElementsAre(0, 1, 2, 3, 4, 5, 6)); -} - -TEST(ByteSplitterTest, SplitByBytesHanzi) { - ByteSplitter s; - auto output = s.SplitByOffsets("你好", {0, 3}, {3, 6}); - EXPECT_TRUE(output.ok()); - EXPECT_THAT(output.value(), ElementsAre("你", "好")); -} - -} // namespace -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/byte_splitter_tflite.cc b/tensorflow_text/core/kernels/byte_splitter_tflite.cc deleted file mode 100644 index a733467dd..000000000 --- a/tensorflow_text/core/kernels/byte_splitter_tflite.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/byte_splitter_tflite.h" - -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/kernels/shim/tflite_op_shim.h" -#include "tensorflow_text/core/kernels/byte_splitter_kernel_template.h" - -namespace tflite { -namespace ops { -namespace custom { -namespace text { - -extern "C" void AddByteSplit(tflite::MutableOpResolver* resolver) { - tflite::shim::TfLiteOpKernel< - tensorflow::text::ByteSplitterWithOffsetsOp>::Add(resolver); -} - -extern "C" void AddByteSplitByOffsets(tflite::MutableOpResolver* resolver) { - tflite::shim::TfLiteOpKernel< - tensorflow::text::ByteSplitByOffsetsOp>::Add(resolver); -} - -} // namespace text -} // namespace custom -} // namespace ops -} // namespace tflite diff --git a/tensorflow_text/core/kernels/byte_splitter_tflite.h b/tensorflow_text/core/kernels/byte_splitter_tflite.h index c1219ecc7..73304a13c 100644 --- a/tensorflow_text/core/kernels/byte_splitter_tflite.h +++ b/tensorflow_text/core/kernels/byte_splitter_tflite.h @@ -15,21 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BYTE_SPLITTER_TFLITE_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BYTE_SPLITTER_TFLITE_H_ -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/mutable_op_resolver.h" - -namespace tflite { -namespace ops { -namespace custom { -namespace text { - -extern "C" void AddByteSplit(::tflite::MutableOpResolver* resolver); - -extern "C" void AddByteSplitByOffsets(::tflite::MutableOpResolver* resolver); - -} // namespace text -} // namespace custom -} // namespace ops -} // namespace tflite +#include "tensorflow/core/kernels/text/byte_splitter_tflite.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_BYTE_SPLITTER_TFLITE_H_ diff --git a/tensorflow_text/core/kernels/constrained_sequence.cc b/tensorflow_text/core/kernels/constrained_sequence.cc deleted file mode 100644 index 2553472c1..000000000 --- a/tensorflow_text/core/kernels/constrained_sequence.cc +++ /dev/null @@ -1,441 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/constrained_sequence.h" - -#include -#include -#include -#include - -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/framework/tensor_types.h" -#include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/types.h" - -namespace tensorflow { -namespace text { - -// State index to use if the sequence in question requires an impossible -// transition. -constexpr int kErrorState = -1; - -ScoreAccessor::ScoreAccessor(const Tensor &score_tensor, - const Tensor &lengths_tensor) { - data_ = score_tensor.flat().data(); - if (lengths_tensor.dtype() == DT_INT64) { - use_long_lengths_ = true; - long_lengths_ = lengths_tensor.flat().data(); - } else { - use_long_lengths_ = false; - lengths_ = lengths_tensor.flat().data(); - } - has_explicit_batch_ = (score_tensor.shape().dims() == 3); - if (has_explicit_batch_) { - batch_size_ = score_tensor.shape().dim_size(0); - num_steps_ = score_tensor.shape().dim_size(1); - num_scores_ = score_tensor.shape().dim_size(2); - } else { - batch_size_ = 1; - num_steps_ = score_tensor.shape().dim_size(0); - num_scores_ = score_tensor.shape().dim_size(1); - } - batch_offset_ = num_scores_ * num_steps_; - step_offset_ = num_scores_; -} - -// Get a score out of the data tensor. -float ScoreAccessor::GetScore(int batch_idx, int step_idx, - int score_idx) const { - DCHECK_LE(batch_idx, batch_size_); - DCHECK_LE(step_idx, num_steps_); - DCHECK_LE(score_idx, num_scores_); - return data_[batch_offset_ * batch_idx + step_offset_ * step_idx + score_idx]; -} - -int64 ScoreAccessor::GetLength(int batch_idx) const { - DCHECK_LE(batch_idx, batch_size_); - if (use_long_lengths_) { - return long_lengths_[batch_idx]; - } else { - return lengths_[batch_idx]; - } -} - -int ScoreAccessor::batch_size() const { return batch_size_; } -int ScoreAccessor::num_steps() const { return num_steps_; } -int ScoreAccessor::num_scores() const { return num_scores_; } -bool ScoreAccessor::has_explicit_batch() const { return has_explicit_batch_; } - -// Perform Viterbi analysis on a single batch item. -void ViterbiAnalysis( - const ScoreAccessor &scores, - const tensorflow::TTypes::Matrix &transition_weights, - const tensorflow::TTypes::Matrix &allowed_transitions, - const int batch, bool use_log_space, bool use_start_end_states, - int32 *output_data) { - VLOG(2) << "Analyzing batch " << batch; - const bool has_transition_weights = transition_weights.size() != 0; - const bool has_allowed_transitions = allowed_transitions.size() != 0; - const int num_states = scores.num_scores(); - const int out_of_bounds_index = num_states; - - int64 num_steps = scores.GetLength(batch); - - // Create two vectors to hold scores. These will be bound to referents later - // so the names here are somewhat irrelevant. - std::vector scores_a(num_states, - std::numeric_limits::lowest()); - std::vector scores_b(num_states, - std::numeric_limits::lowest()); - - // Create a chart of backpointers. Include rows for [start] and [end] - // transitions. By initializing this to kErrorState, we ensure unreachable - // transitions get marked as errors. - std::vector> backpointers( - num_steps, std::vector(num_states, kErrorState)); - - // Set current and previous references for step 0 - std::vector *previous_scores = &scores_a; - std::vector *current_scores = &scores_b; - - const bool vlog3 = VLOG_IS_ON(3); - - if (backpointers.empty()) { - // We're done with this batch if there are no steps to analyze. - return; - } - for (int curr_state = 0; curr_state < num_states; ++curr_state) { - std::vector ¤t_bps = backpointers[0]; - if (use_start_end_states) { - // Initialize the zeroth step BPs to kOutOfBoundsIndex for all states - // where the OOB->state transition is valid, and set scores as needed. - if (has_allowed_transitions && - !allowed_transitions(out_of_bounds_index, curr_state)) { - if (vlog3) { - LOG(INFO) << "(" << batch << ", 0, [START]->" << curr_state - << "): disallowed."; - } - continue; - } - - // Because the backpointer vectors are initialized to kErrorState, we - // need only to set the valid transition paths to have come from the - // padding state. - current_bps[curr_state] = out_of_bounds_index; - - // For valid transitions, get the score (and adjust as appropriate). - const int step = 0; - float current_score = scores.GetScore(batch, step, curr_state); - if (has_transition_weights) { - if (use_log_space) { - current_score += transition_weights(out_of_bounds_index, curr_state); - } else { - current_score *= transition_weights(out_of_bounds_index, curr_state); - } - } - - if (vlog3) { - if (has_transition_weights) { - LOG(INFO) << "(" << batch << ", " << step << ", [START]->" - << curr_state << "): Total score: " << current_score - << " (raw: " << scores.GetScore(batch, step, curr_state) - << ", tw: " - << transition_weights(out_of_bounds_index, curr_state) - << ")"; - } else { - LOG(INFO) << "(" << batch << ", " << step << ", [START]->" - << curr_state << "): Total score: " << current_score - << " (raw: " << scores.GetScore(batch, step, curr_state) - << ")"; - } - } - - current_scores->at(curr_state) = current_score; - } else { - // If we don't have specific start and end states, all bp's are valid - // and all starting scores are the unadjusted step 0 scores. - current_bps[curr_state] = out_of_bounds_index; - const int step = 0; - current_scores->at(curr_state) = scores.GetScore(batch, step, curr_state); - } - } - - // Update the current scores (and normalize if we're not in log space). - if (!use_log_space) { - const double max_score = - *std::max_element(current_scores->begin(), current_scores->end()); - if (max_score > 0) { - for (double &score : *current_scores) score /= max_score; - } - } - - // Swap current and previous score arrays, as we are advancing a step. - std::vector *tmp = previous_scores; - previous_scores = current_scores; - current_scores = tmp; - - // Handle all steps save for the first and last in this loop. - for (int step = 1; step < num_steps; ++step) { - const std::vector &previous_bps = backpointers[step - 1]; - std::vector ¤t_bps = backpointers[step]; - - for (int curr_state = 0; curr_state < num_states; ++curr_state) { - int best_source_state = kErrorState; - float best_score = std::numeric_limits::lowest(); - for (int prev_state = 0; prev_state < num_states; ++prev_state) { - // If the previous state was an error state, pass to the next state. - if (previous_bps[prev_state] == kErrorState) { - if (vlog3) { - LOG(INFO) << "(" << batch << ", " << step << ", " << prev_state - << "->" << curr_state << "): prev state error."; - } - continue; - } - - // If this is not a permitted transition, continue. - if (has_allowed_transitions && - !allowed_transitions(prev_state, curr_state)) { - if (vlog3) { - LOG(INFO) << "(" << batch << ", " << step << ", " << prev_state - << "->" << curr_state << "): disallowed."; - } - continue; - } - - float current_score = scores.GetScore(batch, step, curr_state); - if (use_log_space) { - current_score += previous_scores->at(prev_state); - } else { - current_score *= previous_scores->at(prev_state); - } - if (has_transition_weights) { - if (use_log_space) { - current_score += transition_weights(prev_state, curr_state); - } else { - current_score *= transition_weights(prev_state, curr_state); - } - } - - if (vlog3) { - if (has_transition_weights) { - LOG(INFO) << "(" << batch << ", " << step << ", " << prev_state - << "->" << curr_state - << "): Total score: " << current_score - << " (prev: " << previous_scores->at(prev_state) - << ", raw: " << scores.GetScore(batch, step, curr_state) - << ", tw: " << transition_weights(prev_state, curr_state) - << ")"; - } else { - LOG(INFO) << "(" << batch << ", " << step << ", " << prev_state - << "->" << curr_state - << "): Total score: " << current_score - << " (prev: " << previous_scores->at(prev_state) - << ", raw: " << scores.GetScore(batch, step, curr_state) - << ")"; - } - } - - if (current_score >= best_score) { - best_source_state = prev_state; - best_score = current_score; - } - } - current_bps[curr_state] = best_source_state; - current_scores->at(curr_state) = best_score; - } - - // Normalize if we're not in log space. - if (!use_log_space) { - const double max_score = - *std::max_element(current_scores->begin(), current_scores->end()); - if (max_score > 0) { - for (double &score : *current_scores) score /= max_score; - } - } - - // After each step, switch the current scores to the previous scores and - // use the previous previous scores as the current scores. - std::vector *tmp = previous_scores; - previous_scores = current_scores; - current_scores = tmp; - } - - // Handle the final transition out of the sequence. - int final_state = out_of_bounds_index; - const std::vector &previous_bps = backpointers[num_steps - 1]; - int best_source_state = kErrorState; - float final_score = std::numeric_limits::lowest(); - - for (int prev_state = 0; prev_state < num_states; ++prev_state) { - // If the previous state was an error state, pass to the next state. - if (previous_bps[prev_state] == kErrorState) { - current_scores->at(prev_state) = std::numeric_limits::lowest(); - if (vlog3) { - LOG(INFO) << "(" << batch << ", " << num_steps << ", " << prev_state - << "->[END]): prev state error."; - } - continue; - } - - // If this is not a permitted transition, continue. - if (has_allowed_transitions && use_start_end_states && - !allowed_transitions(prev_state, final_state)) { - current_scores->at(prev_state) = std::numeric_limits::lowest(); - if (vlog3) { - LOG(INFO) << "(" << batch << ", " << num_steps << ", " << prev_state - << "->[END]): disallowed."; - } - continue; - } - - // Weight the final transition score by the probability of exiting the - // sequence as well. - float current_score = previous_scores->at(prev_state); - if (use_start_end_states) { - if (has_transition_weights) { - if (use_log_space) { - current_score += transition_weights(prev_state, final_state); - } else { - current_score *= transition_weights(prev_state, final_state); - } - } - - if (vlog3) { - if (has_transition_weights) { - LOG(INFO) << "(" << batch << ", " << num_steps << ", " << prev_state - << "->[END]): Total score: " << current_score - << " (prev: " << previous_scores->at(prev_state) - << ", tw: " << transition_weights(prev_state, final_state) - << ")"; - } else { - LOG(INFO) << "(" << batch << ", " << num_steps << ", " << prev_state - << "->[END]): Total score: " << current_score - << " (prev: " << previous_scores->at(prev_state) << ")"; - } - } - } - - current_scores->at(prev_state) = current_score; - if (current_score >= final_score) { - best_source_state = prev_state; - final_score = current_score; - } - } - - if (vlog3) { - LOG(INFO) << "Final score: " << final_score; - } - - // Calculate the path. - if (best_source_state == kErrorState) { - // If the best source is an error state, the path is unknowable. Report - // error states for the whole sequence. - for (int64 i = 0; i < scores.GetLength(batch); ++i) { - output_data[i] = kErrorState; - } - } else { - // If the best source is a 'real' state, report the state path. - int steps_to_report = scores.GetLength(batch); - int previous_state = best_source_state; - for (int64 i = steps_to_report - 1; i >= 0; --i) { - output_data[i] = previous_state; - previous_state = backpointers[i][previous_state]; - } - } -} - -void GreedyAnalysis( - const ScoreAccessor &scores, - const tensorflow::TTypes::Matrix &transition_weights, - const tensorflow::TTypes::Matrix &allowed_transitions, - int batch, bool use_log_space, bool use_start_end_states, - int32 *output_data) { - const bool has_transition_weights = transition_weights.size() != 0; - const bool has_allowed_transitions = allowed_transitions.size() != 0; - const int num_states = scores.num_scores(); - const int out_of_bounds_index = num_states; - int64 num_steps = scores.GetLength(batch); - - for (int step = 0; step < num_steps; ++step) { - // Do final step calculations if this is the final step in the sequence - // and we are calculating based on implicit start and end states. - bool do_final_step = - (step == scores.GetLength(batch) - 1) && use_start_end_states; - VLOG(2) << "is last step: " << do_final_step; - - const int previous_state = - (step == 0) ? (out_of_bounds_index) : (output_data[step - 1]); - - if (previous_state == kErrorState) { - // If the previous state is the error state, the current state must - // also be the error state. - output_data[step] = kErrorState; - continue; - } - - // If no transition is possible, this will stay the error state. - int best_new_state = kErrorState; - float best_new_score = std::numeric_limits::lowest(); - - for (int state = 0; state < num_states; ++state) { - float current_score = scores.GetScore(batch, step, state); - - // If we are not using start/end states AND step is 0, then - // current_score will not be altered. - if (use_start_end_states || step > 0) { - if (has_allowed_transitions) { - // If either the transition from the previous state to this state - // is disallowed, or we need to analyze the final step and the - // transition from this state to the final step is not allowed, - // disallow this transition. - if (!allowed_transitions(previous_state, state) || - (do_final_step && - !allowed_transitions(state, out_of_bounds_index))) { - continue; - } - } - - if (has_transition_weights) { - if (use_log_space) { - current_score += transition_weights(previous_state, state); - } else { - current_score *= transition_weights(previous_state, state); - } - // On the last step, also analyze by the weight value of - // transitioning from this state to the out-of-bounds state. - if (do_final_step) { - if (use_log_space) { - current_score += transition_weights(state, out_of_bounds_index); - } else { - current_score *= transition_weights(state, out_of_bounds_index); - } - } - } - } - if (current_score >= best_new_score) { - best_new_state = state; - best_new_score = current_score; - } - } - output_data[step] = best_new_state; - VLOG(2) << "Best state for step " << step << " is " << output_data[step] - << " with score " << best_new_score; - } -} - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/constrained_sequence.h b/tensorflow_text/core/kernels/constrained_sequence.h index 5f62f46b6..120f0ea65 100644 --- a/tensorflow_text/core/kernels/constrained_sequence.h +++ b/tensorflow_text/core/kernels/constrained_sequence.h @@ -12,81 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef TENSORFLOW_TEXT_CORE_KERNELS_CONSTRAINED_SEQUENCE_H_ -#define TENSORFLOW_TEXT_CORE_KERNELS_CONSTRAINED_SEQUENCE_H_ +#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_CONSTRAINED_SEQUENCE_H_ +#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_CONSTRAINED_SEQUENCE_H_ -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_types.h" -#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/kernels/text/constrained_sequence.h" -namespace tensorflow { -namespace text { - -class ScoreAccessor { - public: - explicit ScoreAccessor(const Tensor &score_tensor, - const Tensor &lengths_tensor); - - // Get a score out of the data tensor. - float GetScore(int batch_idx, int step_idx, int score_idx) const; - - int64 GetLength(int batch_idx) const; - - int batch_size() const; - int num_steps() const; - int num_scores() const; - bool has_explicit_batch() const; - - private: - // A pointer into the underlying data of the score tensor. Not owned. - const float *data_; - - // A pointer into the underlying data of the lengths tensor. Not owned. - const int *lengths_; - const int64 *long_lengths_; - - // Whether the passed lengths tensor is int32 or int64. - bool use_long_lengths_; - - // The batch size associated with the data tensor. - int batch_size_; - - // The number of steps in the data tensor. - int num_steps_; - - // The number of scores in the data tensor. - int num_scores_; - - // The amount to increase the offset within the flat data array if the batch - // index increases by 1. - int batch_offset_; - - // The amount to increase the offset within the flat data array if the step - // index increases by 1. - int step_offset_; - - // True if the original tensor had an explicit batch dimension (that is, - // it was of rank 3). - bool has_explicit_batch_; -}; - -// Perform Viterbi analysis on a single batch item. -void ViterbiAnalysis( - const ScoreAccessor &scores, - const tensorflow::TTypes::Matrix &transition_weights, - const tensorflow::TTypes::Matrix &allowed_transitions, - const int batch, bool use_log_space, bool use_start_end_states, - int32 *output_data); - -// Perform a greedy analysis on a single batch item. -void GreedyAnalysis( - const ScoreAccessor &scores, - const tensorflow::TTypes::Matrix &transition_weights, - const tensorflow::TTypes::Matrix &allowed_transitions, - int batch, bool use_log_space, bool use_start_end_states, - int32 *output_data); - -} // namespace text -} // namespace tensorflow - -#endif // TENSORFLOW_TEXT_CORE_KERNELS_CONSTRAINED_SEQUENCE_H_ +#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_CONSTRAINED_SEQUENCE_H_ diff --git a/tensorflow_text/core/kernels/constrained_sequence_kernel.cc b/tensorflow_text/core/kernels/constrained_sequence_kernel.cc deleted file mode 100644 index 339c202ce..000000000 --- a/tensorflow_text/core/kernels/constrained_sequence_kernel.cc +++ /dev/null @@ -1,259 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include - -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/core/threadpool.h" -#include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow_text/core/kernels/constrained_sequence.h" - -namespace tensorflow { - -using ::tensorflow::DataType; -using ::tensorflow::DEVICE_CPU; -using ::tensorflow::DT_BOOL; -using ::tensorflow::DT_FLOAT; -using ::tensorflow::OpKernel; -using ::tensorflow::OpKernelConstruction; -using ::tensorflow::OpKernelContext; -using ::tensorflow::Status; -using ::tensorflow::Tensor; -using ::tensorflow::TensorShape; -using ::tensorflow::errors::InvalidArgument; -using ::tensorflow::text::GreedyAnalysis; -using ::tensorflow::text::ScoreAccessor; -using ::tensorflow::text::ViterbiAnalysis; - -// State index to use if the sequence in question requires an impossible -// transition. -constexpr int kErrorState = -1; - -// State index to use when outputting a padded tensor and the sequence in -// question does not have a token for a given step. -constexpr int kPaddingState = -2; - -namespace { - -// Validate that a given constraint tensor is the proper shape (dimension -// 2, with shape [num_states + 1, num_states + 1]. -absl::Status ValidateConstraintTensor(const Tensor &tensor, - const int num_states, - const bool use_start_end_states, - const string &name) { - if (tensor.shape().dims() != 2) { - return InvalidArgument( - tensorflow::strings::StrCat(name, " must be of rank 2")); - } - int expected_size = use_start_end_states ? num_states + 1 : num_states; - if (tensor.shape().dim_size(0) != expected_size) { - return InvalidArgument(tensorflow::strings::StrCat( - name, " must have a zeroth dimension of size ", expected_size, - " when num_states is ", num_states, " and use_start_and_end_states is ", - use_start_end_states)); - } - if (tensor.shape().dim_size(1) != expected_size) { - return InvalidArgument(tensorflow::strings::StrCat( - name, " must have a first dimension of size ", expected_size, - " when num_states is ", num_states, " and use_start_and_end_states is ", - use_start_end_states)); - } - return absl::OkStatus(); -} - -} // namespace - -template -class ConstrainedSequence : public OpKernel { - public: - explicit ConstrainedSequence(OpKernelConstruction *context) - : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("use_viterbi", &use_viterbi_)); - OP_REQUIRES_OK(context, context->GetAttr("use_log_space", &use_log_space_)); - OP_REQUIRES_OK(context, context->GetAttr("use_start_and_end_states", - &use_start_end_states_)); - } - - void Compute(OpKernelContext *context) override { - const auto &score_tensor = context->input(0); - OP_REQUIRES(context, - (score_tensor.shape().dims() == 2) || - (score_tensor.shape().dims() == 3), - InvalidArgument("The score tensor must be of rank 2 or 3.")); - const auto &lengths_tensor = context->input(1); - - ScoreAccessor scores(score_tensor, lengths_tensor); - - // The scores tensor should be [batch, step, scores]. - const int batch_size = scores.batch_size(); - const int num_steps = scores.num_steps(); - const int num_scores = scores.num_scores(); - - OP_REQUIRES(context, lengths_tensor.NumElements() == batch_size, - InvalidArgument(tensorflow::strings::StrCat( - "There should be exactly one length for every batch " - "element. Found ", - lengths_tensor.NumElements(), - " length elements for a batch size of ", batch_size))); - - VLOG(2) << "batch: " << batch_size; - VLOG(2) << "steps: " << num_steps; - VLOG(2) << "score: " << num_scores; - - // Make sure there's enough data to advance every sequence. - int max_length = 0; - int total_length = 0; - for (int i = 0; i < batch_size; ++i) { - int64 length = scores.GetLength(i); - total_length += length; - if (length > max_length) { - max_length = length; - } - } - - OP_REQUIRES( - context, num_steps >= max_length, - InvalidArgument( - "The scores tensor is too short for the longest sequence length.")); - - // Validate the constraint tensors. - const auto &allowed_transitions_tensor = context->input(2); - bool has_allowed_transitions = - allowed_transitions_tensor.NumElements() != 0; - VLOG(4) << allowed_transitions_tensor.NumElements(); - if (has_allowed_transitions) { - OP_REQUIRES_OK(context, - ValidateConstraintTensor(allowed_transitions_tensor, - num_scores, use_start_end_states_, - "allowed_transitions")); - } - - const auto &transition_weights_tensor = context->input(3); - - VLOG(4) << transition_weights_tensor.NumElements(); - bool has_transition_weights = transition_weights_tensor.NumElements() != 0; - if (has_transition_weights) { - OP_REQUIRES_OK(context, ValidateConstraintTensor( - transition_weights_tensor, num_scores, - use_start_end_states_, "transition_weights")); - - // If we have transition weights in exp-space, all values must be non- - // negative. - if (!use_log_space_) { - for (int i = 0; i < transition_weights_tensor.NumElements(); ++i) { - OP_REQUIRES(context, transition_weights_tensor.flat()(i) >= 0, - InvalidArgument("The transition weights tensor must not " - "contain negative values.")); - } - } - } - - const tensorflow::Tensor empty_float(DT_FLOAT, TensorShape({0, 0})); - const tensorflow::Tensor empty_bool(DT_BOOL, TensorShape({0, 0})); - - const auto &transition_weights = - has_transition_weights ? transition_weights_tensor.matrix() - : empty_float.matrix(); - - const auto &allowed_transitions = - has_allowed_transitions ? allowed_transitions_tensor.matrix() - : empty_bool.matrix(); - - Tensor *output; - OP_REQUIRES_OK(context, context->allocate_output( - 0, TensorShape({total_length}), &output)); - int32 *output_data = output->flat().data(); - - Tensor *offsets; - OP_REQUIRES_OK(context, context->allocate_output( - 1, TensorShape({batch_size + 1}), &offsets)); - Tsplits *offset_data = offsets->flat().data(); - offset_data[0] = 0; - - for (int batch = 0; batch < batch_size; ++batch) { - int step_offset = offset_data[batch]; - int64 num_steps = scores.GetLength(batch); - offset_data[batch + 1] = step_offset + num_steps; - if (use_viterbi_) { - DoViterbiAnalysis(transition_weights, allowed_transitions, batch, - scores, &output_data[step_offset]); - } else { - DoGreedyAnalysis(transition_weights, allowed_transitions, batch, scores, - &output_data[step_offset]); - } - } - } - - private: - // Perform Viterbi analysis on a single batch item. - void DoViterbiAnalysis( - const tensorflow::TTypes::Matrix &transition_weights, - const tensorflow::TTypes::Matrix &allowed_transitions, - const int batch, const ScoreAccessor &scores, int32 *output_data) { - ViterbiAnalysis(scores, transition_weights, allowed_transitions, batch, - use_log_space_, use_start_end_states_, output_data); - } - - // Perform a greedy analysis on a single batch item. - void DoGreedyAnalysis( - const tensorflow::TTypes::Matrix &transition_weights, - const tensorflow::TTypes::Matrix &allowed_transitions, - int batch, const ScoreAccessor &scores, int32 *output_data) { - GreedyAnalysis(scores, transition_weights, allowed_transitions, batch, - use_log_space_, use_start_end_states_, output_data); - } - - // True if this op should perform calculations in log-space (using addition). - // If false, will perform calculations in normalized exp-space (using - // multiplication). - bool use_log_space_; - - // True if this op should calculate scores using the Viterbi algorithm. If - // false, will use a greedy algorithm. - bool use_viterbi_; - - // True if this op should calculate sequences based on an implicit start - // and end state. - bool use_start_end_states_; - - TF_DISALLOW_COPY_AND_ASSIGN(ConstrainedSequence); -}; - -#define REGISTER_KERNELS(Tin) \ - REGISTER_KERNEL_BUILDER(Name("ConstrainedSequence") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("Tin") \ - .TypeConstraint("Tsplits"), \ - ConstrainedSequence); \ - REGISTER_KERNEL_BUILDER(Name("ConstrainedSequence") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("Tin") \ - .TypeConstraint("Tsplits"), \ - ConstrainedSequence) - -REGISTER_KERNELS(int32); -REGISTER_KERNELS(int64); - -#undef REGISTER_KERNELS - -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/constrained_sequence_kernel_input_validation_test.cc b/tensorflow_text/core/kernels/constrained_sequence_kernel_input_validation_test.cc deleted file mode 100644 index 343d2d142..000000000 --- a/tensorflow_text/core/kernels/constrained_sequence_kernel_input_validation_test.cc +++ /dev/null @@ -1,496 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include -#include -#include "tensorflow/core/framework/fake_input.h" -#include "tensorflow/core/framework/node_def_builder.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/kernels/ops_testutil.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/platform/status.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow_text/core/kernels/text_kernels_test_util.h" - -namespace tensorflow { - -using tensorflow::DT_INT32; -using tensorflow::FakeInput; -using tensorflow::NodeDefBuilder; -using tensorflow::Status; -using tensorflow::TensorShape; -using tensorflow::text_kernels_test_util::MatrixEq; -using tensorflow::text_kernels_test_util::VectorEq; - -class ConstrainedSequenceInputValidationTest : public tensorflow::OpsTestBase { - public: - void SetUpOpWithDefaults(bool use_start_end, - tensorflow::DataType input_datatype) { - // Prepare graph. - TF_ASSERT_OK(NodeDefBuilder("tested_op", "ConstrainedSequence") - .Attr("Tin", input_datatype) - .Attr("use_viterbi", true) - .Attr("use_log_space", true) - .Attr("use_start_and_end_states", use_start_end) - .Input(FakeInput()) - .Input(FakeInput()) - .Input(FakeInput()) - .Input(FakeInput()) - .Finalize(node_def())); - TF_ASSERT_OK(InitOp()); - } - - void SetUpOpWithStartEnd() { SetUpOpWithDefaults(true, DT_INT32); } - - void SetUpOpWithNoStartEnd() { SetUpOpWithDefaults(false, DT_INT32); } -}; -// TODO(b/122968457): There are a bunch of tests that only validate !ok instead -// of looking for specific error messages; fix that. - -// This test examines evaluations with only a permissions matrix. -TEST_F(ConstrainedSequenceInputValidationTest, WorksWithInt64InputLengths) { - // Prepare graph. - SetUpOpWithDefaults(true, DT_INT64); - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - std::vector input_lengths({1, 1, 1}); - AddInputFromArray(TensorShape({3}), input_lengths); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The first sequence's highest score is 2, but OUT->2 is not ok, so it's 1. - // The second sequence's highest score is 3, which is ok. - // The third sequence's highest score is 0, which is ok. - - // Validate the output. - std::vector expected_transitions({1, 3, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -TEST_F(ConstrainedSequenceInputValidationTest, - FailsOnOuterWrongSizePermissionMatrix) { - // Prepare graph. - SetUpOpWithStartEnd(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 3.0, 4.0, // - 1.0, 12.0, 3.0, 4.0, // - 1.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({4, 5}), - { - true, true, true, true, true, // - true, true, true, true, true, // - true, true, true, true, true, // - true, true, true, true, true, // - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - auto result = RunOpKernel(); - EXPECT_FALSE(result.ok()); -} -TEST_F(ConstrainedSequenceInputValidationTest, - FailsOnInnerWrongSizePermissionMatrix) { - // Prepare graph. - SetUpOpWithStartEnd(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 3.0, 4.0, // - 1.0, 12.0, 3.0, 4.0, // - 1.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 4}), - { - true, true, true, true, true, // - true, true, true, true, true, // - true, true, true, true, true, // - true, true, true, true, true, // - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - auto result = RunOpKernel(); - EXPECT_FALSE(result.ok()); -} -TEST_F(ConstrainedSequenceInputValidationTest, - FailsOnWrongRankPermissionMatrix) { - // Prepare graph. - SetUpOpWithStartEnd(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 3.0, 4.0, // - 1.0, 12.0, 3.0, 4.0, // - 1.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({25}), - { - true, true, true, true, true, // - true, true, true, true, true, // - true, true, true, true, true, // - true, true, true, true, true, // - true, true, true, true, true, // - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - auto result = RunOpKernel(); - EXPECT_FALSE(result.ok()); -} - -TEST_F(ConstrainedSequenceInputValidationTest, - FailsOnOuterWrongSizeWeightMatrix) { - // Prepare graph. - SetUpOpWithStartEnd(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 3.0, 4.0, // - 1.0, 12.0, 3.0, 4.0, // - 1.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({4, 5}), {0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.1, 0.5, 0.5, 1.0, 1.0}); - auto result = RunOpKernel(); - EXPECT_FALSE(result.ok()); -} -TEST_F(ConstrainedSequenceInputValidationTest, - FailsOnInnerWrongSizeWeightMatrix) { - // Prepare graph. - SetUpOpWithStartEnd(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 3.0, 4.0, // - 1.0, 12.0, 3.0, 4.0, // - 1.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 4}), {0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.1, 0.5, 0.5, 1.0, 1.0}); - - auto result = RunOpKernel(); - EXPECT_FALSE(result.ok()); -} -TEST_F(ConstrainedSequenceInputValidationTest, FailsOnWrongRankWeightMatrix) { - // Prepare graph. - SetUpOpWithStartEnd(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 3.0, 4.0, // - 1.0, 12.0, 3.0, 4.0, // - 1.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({25}), {0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.1, 0.5, 0.5, 1.0, 1.0}); - auto result = RunOpKernel(); - EXPECT_FALSE(result.ok()); -} - -TEST_F(ConstrainedSequenceInputValidationTest, - PassesWithCorrectSizedWeightAndPermissionsMatrix) { - // Prepare graph. - SetUpOpWithNoStartEnd(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 3.0, 4.0, // - 1.0, 12.0, 3.0, 4.0, // - 1.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({4, 4}), { - true, true, true, true, // - true, true, true, true, // - true, true, true, true, // - true, true, true, true, // - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({4, 4}), {0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 1.0, 1.0}); - auto result = RunOpKernel(); - EXPECT_TRUE(result.ok()); -} - -TEST_F(ConstrainedSequenceInputValidationTest, - FailsOnOuterWrongSizePermissionMatrixWithNoStartEnd) { - // Prepare graph. - SetUpOpWithNoStartEnd(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 3.0, 4.0, // - 1.0, 12.0, 3.0, 4.0, // - 1.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({4, 5}), - { - true, true, true, true, true, // - true, true, true, true, true, // - true, true, true, true, true, // - true, true, true, true, true, // - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - auto result = RunOpKernel(); - EXPECT_FALSE(result.ok()); -} -TEST_F(ConstrainedSequenceInputValidationTest, - FailsOnInnerWrongSizePermissionMatrixWithNoStartEnd) { - // Prepare graph. - SetUpOpWithNoStartEnd(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 3.0, 4.0, // - 1.0, 12.0, 3.0, 4.0, // - 1.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 4}), - { - true, true, true, true, true, // - true, true, true, true, true, // - true, true, true, true, true, // - true, true, true, true, true, // - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - auto result = RunOpKernel(); - EXPECT_FALSE(result.ok()); -} -TEST_F(ConstrainedSequenceInputValidationTest, - FailsOnWrongRankPermissionMatrixWithNoStartEnd) { - // Prepare graph. - SetUpOpWithNoStartEnd(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 3.0, 4.0, // - 1.0, 12.0, 3.0, 4.0, // - 1.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({16}), { - true, true, true, true, // - true, true, true, true, // - true, true, true, true, // - true, true, true, true, // - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - auto result = RunOpKernel(); - EXPECT_FALSE(result.ok()); -} - -TEST_F(ConstrainedSequenceInputValidationTest, - FailsOnOuterWrongSizeWeightMatrixWithNoStartEnd) { - // Prepare graph. - SetUpOpWithNoStartEnd(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 3.0, 4.0, // - 1.0, 12.0, 3.0, 4.0, // - 1.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({4, 5}), {0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.1, 0.5, 0.5, 1.0, 1.0}); - auto result = RunOpKernel(); - EXPECT_FALSE(result.ok()); -} -TEST_F(ConstrainedSequenceInputValidationTest, - FailsOnInnerWrongSizeWeightMatrixWithNoStartEnd) { - // Prepare graph. - SetUpOpWithNoStartEnd(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 3.0, 4.0, // - 1.0, 12.0, 3.0, 4.0, // - 1.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 4}), {0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.1, 0.5, 0.5, 1.0, 1.0}); - - auto result = RunOpKernel(); - EXPECT_FALSE(result.ok()); -} -TEST_F(ConstrainedSequenceInputValidationTest, - FailsOnWrongRankWeightMatrixWithNoStartEnd) { - // Prepare graph. - SetUpOpWithNoStartEnd(); - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 3.0, 4.0, // - 1.0, 12.0, 3.0, 4.0, // - 1.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({16}), {0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 1.0}); - auto result = RunOpKernel(); - EXPECT_FALSE(result.ok()); -} - -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/darts_clone_trie_builder.cc b/tensorflow_text/core/kernels/darts_clone_trie_builder.cc deleted file mode 100644 index e204a9783..000000000 --- a/tensorflow_text/core/kernels/darts_clone_trie_builder.cc +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/darts_clone_trie_builder.h" - -#include -#include -#include - -#include "absl/container/flat_hash_set.h" -#include "absl/strings/str_cat.h" -#include "include/darts.h" - -namespace tensorflow { -namespace text { -namespace trie_utils { - -absl::StatusOr> BuildDartsCloneTrie( - const std::vector& keys) { - std::vector values(keys.size()); - std::iota(values.begin(), values.end(), 0); - return BuildDartsCloneTrie(keys, values); -} - -absl::StatusOr> BuildDartsCloneTrie( - const std::vector& keys, const std::vector& values) { - if (keys.size() != values.size()) { - return absl::InvalidArgumentError(absl::StrCat( - "The sizes of 'keys' and 'values' must be equal! Keys size: ", - keys.size(), " . Values size: ", values.size())); - } - - { - // Make sure there are no duplicated elements or empty strings in 'keys'. - absl::flat_hash_set unique_keys; - for (const auto& key : keys) { - if (key.empty()) { - return absl::InvalidArgumentError( - "The empty string \"\" is found in 'keys', which is not " - "supported."); - } - if (!unique_keys.insert(key).second) { - return absl::InvalidArgumentError( - absl::StrCat("Duplicated key: ", key, ".")); - } - } - } - - // Make sure all values are non-negative. - for (int i = 0; i < keys.size(); ++i) { - if (values[i] < 0) { - return absl::InvalidArgumentError(absl::StrCat( - "All values must be non-negative! Found value: ", values[i], - " for key: ", keys[i], ", at index: ", i)); - } - } - - // Create a vector to hold the indexes. - std::vector vocab_index_sorted(keys.size()); - std::iota(vocab_index_sorted.begin(), vocab_index_sorted.end(), 0); - - // Sort the index by keys. - std::sort( - vocab_index_sorted.begin(), vocab_index_sorted.end(), - [&keys](const int x, const int y) { return keys.at(x) < keys.at(y); }); - - // Create vectors to build the trie. - std::vector trie_keys; - std::vector trie_values; - trie_keys.reserve(keys.size()); - trie_values.reserve(keys.size()); - for (const auto index : vocab_index_sorted) { - trie_keys.push_back(keys.at(index).c_str()); - trie_values.push_back(values[index]); - } - - // Build the trie. - auto trie = std::make_unique(); - trie->build(trie_keys.size(), const_cast(&trie_keys[0]), nullptr, - const_cast(&trie_values[0])); - - // Return the data of darts_clone (an array of 32-bit unsigned int). - const uint32_t* trie_array = static_cast(trie->array()); - return std::vector(trie_array, trie_array + trie->size()); -} - -} // namespace trie_utils -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/darts_clone_trie_builder.h b/tensorflow_text/core/kernels/darts_clone_trie_builder.h index 2557134f0..c8d777c1d 100644 --- a/tensorflow_text/core/kernels/darts_clone_trie_builder.h +++ b/tensorflow_text/core/kernels/darts_clone_trie_builder.h @@ -12,42 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Builder utils for Darts-clone tries. -// -// Darts-clone is a compact and efficient implementation of Darts (Double-ARray -// Trie System). For more info, see https://github.com/s-yata/darts-clone. -// -// This header file contains utils that build a darts-clone trie. To access such -// a darts-clone trie, use the utils from the companion header file -// darts_clone_trie_wrapper.h. #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_DARTS_CLONE_TRIE_BUILDER_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_DARTS_CLONE_TRIE_BUILDER_H_ -#include -#include -#include - -#include "absl/status/statusor.h" - -namespace tensorflow { -namespace text { -namespace trie_utils { - -// Builds the trie given keys and values, and returns the darts_clone trie -// array data. `keys` and `values` should have the same size; `values[i]` is the -// value for `keys[i]`. `keys` should not contain duplicated elements. In -// addition, the empty string "" should not be in `keys`, because darts_clone -// does not support that. Furthermore, all `values` should be non-negative. -absl::StatusOr> BuildDartsCloneTrie( - const std::vector& keys, const std::vector& values); - -// A variant where the values are indexes in the keys: i.e., the value for -// `keys[i]` is the index `i`. -absl::StatusOr> BuildDartsCloneTrie( - const std::vector& keys); - -} // namespace trie_utils -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/darts_clone_trie_builder.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_DARTS_CLONE_TRIE_BUILDER_H_ diff --git a/tensorflow_text/core/kernels/darts_clone_trie_test.cc b/tensorflow_text/core/kernels/darts_clone_trie_test.cc deleted file mode 100644 index a80c28353..000000000 --- a/tensorflow_text/core/kernels/darts_clone_trie_test.cc +++ /dev/null @@ -1,188 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include "tensorflow_text/core/kernels/darts_clone_trie_builder.h" -#include "tensorflow_text/core/kernels/darts_clone_trie_wrapper.h" - -namespace tensorflow { -namespace text { -namespace trie_utils { - -using ::testing::status::StatusIs; - -TEST(DartsCloneTrieTest, CreateCursorPointToRootAndTryTraverseOneStep) { - // The test vocabulary. - std::vector vocab_tokens{"def", "\xe1\xb8\x8aZZ", "Abc"}; - - // Create the trie instance. - ASSERT_OK_AND_ASSIGN(std::vector trie_array, - BuildDartsCloneTrie(vocab_tokens)); - ASSERT_OK_AND_ASSIGN(DartsCloneTrieWrapper trie, - DartsCloneTrieWrapper::Create(trie_array.data())); - - DartsCloneTrieWrapper::TraversalCursor cursor; - int data; - - cursor = trie.CreateTraversalCursorPointToRoot(); // Create a cursor to point - // to the root. - EXPECT_TRUE(trie.TryTraverseOneStep(cursor, 'A')); - EXPECT_FALSE(trie.TryGetData(cursor, data)); - EXPECT_TRUE(trie.TryTraverseOneStep(cursor, 'b')); - EXPECT_FALSE(trie.TryGetData(cursor, data)); - EXPECT_TRUE(trie.TryTraverseOneStep(cursor, 'c')); - EXPECT_TRUE(trie.TryGetData(cursor, data)); - EXPECT_THAT(data, 2); - EXPECT_FALSE(trie.TryTraverseOneStep(cursor, 'c')); -} - -TEST(DartsCloneTrieTest, CreateCursorAndTryTraverseSeveralSteps) { - // The test vocabulary. - std::vector vocab_tokens{"def", "\xe1\xb8\x8aZZ", "Abc"}; - - // Create the trie instance. - ASSERT_OK_AND_ASSIGN(std::vector trie_array, - BuildDartsCloneTrie(vocab_tokens)); - ASSERT_OK_AND_ASSIGN(DartsCloneTrieWrapper trie, - DartsCloneTrieWrapper::Create(trie_array.data())); - - DartsCloneTrieWrapper::TraversalCursor cursor; - int data; - - cursor = trie.CreateTraversalCursor(trie.kRootNodeId); // Create a cursor to - // point to the root. - EXPECT_TRUE(trie.TryTraverseSeveralSteps(cursor, "def")); - EXPECT_TRUE(trie.TryGetData(cursor, data)); - EXPECT_THAT(data, 0); -} - -TEST(DartsCloneTrieTest, TraversePathNotExisted) { - // The test vocabulary. - std::vector vocab_tokens{"def", "\xe1\xb8\x8aZZ", "Abc"}; - - // Create the trie instance. - ASSERT_OK_AND_ASSIGN(std::vector trie_array, - BuildDartsCloneTrie(vocab_tokens)); - ASSERT_OK_AND_ASSIGN(DartsCloneTrieWrapper trie, - DartsCloneTrieWrapper::Create(trie_array.data())); - - DartsCloneTrieWrapper::TraversalCursor cursor; - - trie.SetTraversalCursor( - cursor, - trie.kRootNodeId); // Use SetTraversalCursor() to point to the root. - EXPECT_FALSE(trie.TryTraverseSeveralSteps(cursor, "dez")); -} - -TEST(DartsCloneTrieTest, TraverseOnUtf8Path) { - // The test vocabulary. - std::vector vocab_tokens{"def", "\xe1\xb8\x8aZZ", "Abc"}; - - // Create the trie instance. - ASSERT_OK_AND_ASSIGN(std::vector trie_array, - BuildDartsCloneTrie(vocab_tokens)); - ASSERT_OK_AND_ASSIGN(DartsCloneTrieWrapper trie, - DartsCloneTrieWrapper::Create(trie_array.data())); - - DartsCloneTrieWrapper::TraversalCursor cursor; - int data; - - trie.SetTraversalCursor( - cursor, - trie.kRootNodeId); // Use SetTraversalCursor() to point to the root. - EXPECT_TRUE(trie.TryTraverseSeveralSteps(cursor, "\xe1\xb8\x8aZZ")); - EXPECT_TRUE(trie.TryGetData(cursor, data)); - EXPECT_THAT(data, 1); -} - -TEST(DartsCloneTrieTest, TraverseOnPartialUtf8Path) { - // The test vocabulary. - std::vector vocab_tokens{"def", "\xe1\xb8\x8aZZ", "Abc"}; - - // Create the trie instance. - ASSERT_OK_AND_ASSIGN(std::vector trie_array, - BuildDartsCloneTrie(vocab_tokens)); - ASSERT_OK_AND_ASSIGN(DartsCloneTrieWrapper trie, - DartsCloneTrieWrapper::Create(trie_array.data())); - - DartsCloneTrieWrapper::TraversalCursor cursor; - int data; - - trie.SetTraversalCursor( - cursor, - trie.kRootNodeId); // Use SetTraversalCursor() to point to the root. - EXPECT_TRUE(trie.TryTraverseSeveralSteps(cursor, "\xe1\xb8")); - EXPECT_FALSE(trie.TryGetData(cursor, data)); -} - -TEST(DartsCloneTrieTest, TraverseOnUtf8PathNotExisted) { - // The test vocabulary. - std::vector vocab_tokens{"def", "\xe1\xb8\x8aZZ", "Abc"}; - - // Create the trie instance. - ASSERT_OK_AND_ASSIGN(std::vector trie_array, - BuildDartsCloneTrie(vocab_tokens)); - ASSERT_OK_AND_ASSIGN(DartsCloneTrieWrapper trie, - DartsCloneTrieWrapper::Create(trie_array.data())); - - DartsCloneTrieWrapper::TraversalCursor cursor; - - trie.SetTraversalCursor( - cursor, - trie.kRootNodeId); // Use SetTraversalCursor() to point to the root. - EXPECT_FALSE(trie.TryTraverseSeveralSteps(cursor, "\xe1\xb8\x84")); -} - -TEST(DartsCloneTrieBuildError, KeysValuesSizeDifferent) { - // The test vocabulary. - std::vector keys{"def", "\xe1\xb8\x8aZZ", "Abc"}; - std::vector values{1, 2, 3, 4}; - - // Create the trie instance. - ASSERT_THAT(BuildDartsCloneTrie(keys, values), - StatusIs(util::error::INVALID_ARGUMENT)); -} - -TEST(DartsCloneTrieBuildError, DuplicatedKeys) { - // The test vocabulary. - std::vector vocab_tokens{"def", "\xe1\xb8\x8aZZ", "Abc", "def"}; - - // Create the trie instance. - ASSERT_THAT(BuildDartsCloneTrie(vocab_tokens), - StatusIs(util::error::INVALID_ARGUMENT)); -} - -TEST(DartsCloneTrieBuildError, EmptyStringsInKeys) { - // The test vocabulary. - std::vector vocab_tokens{"def", "\xe1\xb8\x8aZZ", "Abc", ""}; - - // Create the trie instance. - ASSERT_THAT(BuildDartsCloneTrie(vocab_tokens), - StatusIs(util::error::INVALID_ARGUMENT)); -} - -TEST(DartsCloneTrieBuildError, NegativeValues) { - // The test vocabulary. - std::vector vocab_tokens{"def", "\xe1\xb8\x8aZZ", "Abc"}; - std::vector vocab_values{0, -1, 1}; - - // Create the trie instance. - ASSERT_THAT(BuildDartsCloneTrie(vocab_tokens, vocab_values), - StatusIs(util::error::INVALID_ARGUMENT)); -} - -} // namespace trie_utils -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/darts_clone_trie_wrapper.h b/tensorflow_text/core/kernels/darts_clone_trie_wrapper.h index 43067ec1b..371f28ef3 100644 --- a/tensorflow_text/core/kernels/darts_clone_trie_wrapper.h +++ b/tensorflow_text/core/kernels/darts_clone_trie_wrapper.h @@ -12,157 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Access utils for Darts-clone tries. -// -// Darts-clone is a compact and efficient implementation of Darts (Double-ARray -// Trie System). For more info, see https://github.com/s-yata/darts-clone. -// -// This header file contains utils that access a darts-clone trie. To build such -// a darts-clone trie, use the utils from the companion header file -// darts_clone_trie_builder.h. -// -// Note that although there is a 'traverse()' function in the original source -// (see https://github.com/s-yata/darts-clone/blob/master/include/darts.h), the -// utils in this header file are more efficient and the APIs are more flexible. #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_DARTS_CLONE_TRIE_WRAPPER_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_DARTS_CLONE_TRIE_WRAPPER_H_ -#include -#include - -#include "absl/status/statusor.h" - -namespace tensorflow { -namespace text { -namespace trie_utils { - -// A wrapper class of darts_clone trie for traversing and getting data on the -// trie. It does not own the actual 'trie_array'. -class DartsCloneTrieWrapper { - public: - // Represents the root node id. - static constexpr uint32_t kRootNodeId = 0; - - // A struct serving as the trie traversal cursor. It holds 'node_id' and - // 'unit' (which is 'trie_array_[node_id]'). The reason is to save and reuse - // the 'trie_array_[node_id]'. - struct TraversalCursor { - uint32_t node_id = 0; - uint32_t unit = 0; - }; - - // Constructs an instance by passing in the pointer to the trie array data. - // The caller needs to make sure that 'trie_array' points to a valid structure - // returned by darts_clone trie builder. The caller also needs to maintain the - // availability of 'trie_array' throughout the lifetime of this instance. - static absl::StatusOr Create( - const uint32_t* trie_array) { - if (trie_array == nullptr) { - return absl::InvalidArgumentError("trie_array is nullptr."); - } - return DartsCloneTrieWrapper(trie_array); - } - - // Creates a cursor pointing to the root. - TraversalCursor CreateTraversalCursorPointToRoot() { - return {kRootNodeId, trie_array_[kRootNodeId]}; - } - - // Creates a cursor pointing to the 'node_id'. - TraversalCursor CreateTraversalCursor(uint32_t node_id) { - return {node_id, trie_array_[node_id]}; - } - - // Sets the cursor to point to 'node_id'. - void SetTraversalCursor(TraversalCursor& cursor, uint32_t node_id) { - cursor.node_id = node_id; - cursor.unit = trie_array_[node_id]; - } - - // Traverses one step from 'cursor' following 'ch'. If successful (i.e., there - // exists such an edge), moves 'cursor' to the new node and returns true. - // Otherwise, does nothing (i.e., 'cursor' is not changed) and returns false. - bool TryTraverseOneStep(TraversalCursor& cursor, unsigned char ch) const { - const uint32_t next_node_id = cursor.node_id ^ offset(cursor.unit) ^ ch; - const uint32_t next_node_unit = trie_array_[next_node_id]; - if (label(next_node_unit) != ch) { - return false; - } - cursor.node_id = next_node_id; - cursor.unit = next_node_unit; - return true; - } - - // Traverses several steps from 'cursor' following the characters on 'path'. - // If *all* steps are successful, moves 'cursor' to the new node and returns - // true. Otherwise, does nothing (i.e., 'cursor' is not changed) and returns - // false. - bool TryTraverseSeveralSteps(TraversalCursor& cursor, - absl::string_view path) const { - return TryTraverseSeveralSteps(cursor, path.data(), path.size()); - } - - // If the node pointed by 'cursor' has data, read into 'out_data' and returns - // true; otherwise, does nothing and returns false. - bool TryGetData(const TraversalCursor& cursor, int& out_data) const { - if (!has_leaf(cursor.unit)) { - return false; - } - const uint32_t value_unit = - trie_array_[cursor.node_id ^ offset(cursor.unit)]; - out_data = value(value_unit); - return true; - } - - private: - // Use Create() instead of the constructor. - explicit DartsCloneTrieWrapper(const uint32_t* trie_array) - : trie_array_(trie_array) {} - - // The actual implementation of TryTraverseSeveralSteps. - bool TryTraverseSeveralSteps(TraversalCursor& cursor, const char* ptr, - int size) const { - uint32_t cur_id = cursor.node_id; - uint32_t cur_unit = cursor.unit; - for (; size > 0; --size, ++ptr) { - const unsigned char ch = static_cast(*ptr); - cur_id ^= offset(cur_unit) ^ ch; - cur_unit = trie_array_[cur_id]; - if (label(cur_unit) != ch) { - return false; - } - } - cursor.node_id = cur_id; - cursor.unit = cur_unit; - return true; - } - - // The helper functions below are based on - // https://github.com/tensorflow/tflite-support/blob/master/tensorflow_lite_support/custom_ops/kernel/sentencepiece/double_array_trie.h - - // Returns offset to children. - static uint32_t offset(uint32_t unit) { - return (unit >> 10) << ((unit & 0x200) >> 6); - } - - // Returns a label associated with a node. - // A leaf node will have the MSB set and thus return an invalid label. - static uint32_t label(uint32_t unit) { return unit & 0x800000ff; } - - // Returns whether a node has a leaf as a child. - static bool has_leaf(uint32_t unit) { return unit & 0x100; } - - // Returns a value associated with a node. Available when a node is a leaf. - static int value(uint32_t unit) { - return static_cast(unit & 0x7fffffff); - } - - // The pointer to the darts trie array. - const uint32_t* trie_array_; -}; - -} // namespace trie_utils -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/darts_clone_trie_wrapper.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_DARTS_CLONE_TRIE_WRAPPER_H_ diff --git a/tensorflow_text/core/kernels/disjoint_set_forest.h b/tensorflow_text/core/kernels/disjoint_set_forest.h index a0b163850..b152784b2 100644 --- a/tensorflow_text/core/kernels/disjoint_set_forest.h +++ b/tensorflow_text/core/kernels/disjoint_set_forest.h @@ -12,171 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef TENSORFLOW_TEXT_CORE_KERNELS_DISJOINT_SET_FOREST_H_ -#define TENSORFLOW_TEXT_CORE_KERNELS_DISJOINT_SET_FOREST_H_ +#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_DISJOINT_SET_FOREST_H_ +#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_DISJOINT_SET_FOREST_H_ -#include +#include "tensorflow/core/kernels/text/disjoint_set_forest.h" -#include -#include - -#include "tensorflow/core/platform/logging.h" - -namespace tensorflow { -namespace text { - -// An implementation of the disjoint-set forest data structure. The universe of -// elements is the dense range of indices [0,n). Thread-compatible. -// -// By default, this uses the path compression and union by rank optimizations, -// achieving near-constant runtime on all operations. However, the user may -// disable the union by rank optimization, which allows the user to control how -// roots are selected when a union occurs. When union by rank is disabled, the -// runtime of all operations increases to O(log n) amortized. -// -// Template args: -// Index: An unsigned integral type wide enough to hold n. -// kUseUnionByRank: Whether to use the union by rank optimization. -template -class DisjointSetForest { - public: - static_assert(std::is_integral::value, "Index must be integral"); - static_assert(!std::is_signed::value, "Index must be unsigned"); - using IndexType = Index; - - // Creates an empty forest. - DisjointSetForest() = default; - - // Initializes this to hold the elements [0,|size|), each initially in its own - // singleton set. Replaces existing state, if any. - void Init(Index size); - - // Returns the root of the set containing |element|, which uniquely identifies - // the set. Note that the root of a set may change as the set is merged with - // other sets; do not cache the return value of FindRoot(e) across calls to - // Union() or UnionOfRoots() that could merge the set containing e. - Index FindRoot(Index element); - - // For convenience, returns true if |element1| and |element2| are in the same - // set. When performing a large batch of queries it may be more efficient to - // cache the value of FindRoot(), modulo caveats regarding caching above. - bool SameSet(Index element1, Index element2); - - // Merges the sets rooted at |root1| and |root2|, which must be the roots of - // their respective sets. Either |root1| or |root2| will be the root of the - // merged set. If |kUseUnionByRank| is true, then it is unspecified whether - // |root1| or |root2| will be the root; otherwise, |root2| will be the root. - void UnionOfRoots(Index root1, Index root2); - - // As above, but for convenience finds the root of |element1| and |element2|. - void Union(Index element1, Index element2); - - // The number of elements in this. - Index size() const { return size_; } - - private: - // The number of elements in the universe underlying the sets. - Index size_ = 0; - - // The parent of each element, where self-loops are roots. - std::vector parents_; - - // The rank of each element, for the union by rank optimization. Only used if - // |kUseUnionByRank| is true. - std::vector ranks_; -}; - -// Implementation details below. - -template -void DisjointSetForest::Init(Index size) { - size_ = size; - parents_.resize(size_); - if (kUseUnionByRank) ranks_.resize(size_); - - // Create singleton sets. - for (Index i = 0; i < size_; ++i) { - parents_[i] = i; - if (kUseUnionByRank) ranks_[i] = 0; - } -} - -template -Index DisjointSetForest::FindRoot(Index element) { - DCHECK_LT(element, size()); - Index *const __restrict parents = parents_.data(); - - // Walk up to the root of the |element|. Unroll the first two comparisons - // because path compression ensures most FindRoot() calls end there. In - // addition, if a root is found within the first two comparisons, then the - // path compression updates can be skipped. - Index current = element; - Index parent = parents[current]; - if (current == parent) return current; // |element| is a root - current = parent; - parent = parents[current]; - if (current == parent) return current; // |element| is the child of a root - do { // otherwise, continue upwards until root - current = parent; - parent = parents[current]; - } while (current != parent); - const Index root = current; - - // Apply path compression on the traversed nodes. - current = element; - parent = parents[current]; // not root, thanks to unrolling above - do { - parents[current] = root; - current = parent; - parent = parents[current]; - } while (parent != root); - - return root; -} - -template -bool DisjointSetForest::SameSet(Index element1, - Index element2) { - return FindRoot(element1) == FindRoot(element2); -} - -template -void DisjointSetForest::UnionOfRoots(Index root1, - Index root2) { - DCHECK_LT(root1, size()); - DCHECK_LT(root2, size()); - DCHECK_EQ(root1, parents_[root1]); - DCHECK_EQ(root2, parents_[root2]); - if (root1 == root2) return; // already merged - Index *const __restrict parents = parents_.data(); - - if (kUseUnionByRank) { - // Attach the lesser-rank root to the higher-rank root. - Index *const __restrict ranks = ranks_.data(); - const Index rank1 = ranks[root1]; - const Index rank2 = ranks[root2]; - if (rank2 < rank1) { - parents[root2] = root1; - } else if (rank1 < rank2) { - parents[root1] = root2; - } else { - // Equal ranks; choose one arbitrarily and promote its rank. - parents[root1] = root2; - ranks[root2] = rank2 + 1; - } - } else { - // Always make |root2| the root of the merged set. - parents[root1] = root2; - } -} - -template -void DisjointSetForest::Union(Index element1, - Index element2) { - UnionOfRoots(FindRoot(element1), FindRoot(element2)); -} - -} // namespace text -} // namespace tensorflow - -#endif // TENSORFLOW_TEXT_CORE_KERNELS_DISJOINT_SET_FOREST_H_ +#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_DISJOINT_SET_FOREST_H_ diff --git a/tensorflow_text/core/kernels/disjoint_set_forest_test.cc b/tensorflow_text/core/kernels/disjoint_set_forest_test.cc deleted file mode 100644 index f63d92f08..000000000 --- a/tensorflow_text/core/kernels/disjoint_set_forest_test.cc +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/disjoint_set_forest.h" - -#include - -#include -#include -#include - -#include -#include - -namespace tensorflow { -namespace text { - -// Testing rig. -// -// Template args: -// Forest: An instantiation of the DisjointSetForest<> template. -template -class DisjointSetForestTest : public ::testing::Test { - protected: - using Index = typename Forest::IndexType; - - // Expects that the |expected_sets| and |forest| match. - void ExpectSets(const std::set> &expected_sets, - Forest *forest) { - std::set> expected_pairs; - for (const auto &expected_set : expected_sets) { - for (auto it = expected_set.begin(); it != expected_set.end(); ++it) { - for (auto jt = expected_set.begin(); jt != expected_set.end(); ++jt) { - expected_pairs.emplace(*it, *jt); - } - } - } - - for (Index lhs = 0; lhs < forest->size(); ++lhs) { - for (Index rhs = 0; rhs < forest->size(); ++rhs) { - if (expected_pairs.find({lhs, rhs}) != expected_pairs.end()) { - EXPECT_EQ(forest->FindRoot(lhs), forest->FindRoot(rhs)); - EXPECT_TRUE(forest->SameSet(lhs, rhs)); - } else { - EXPECT_NE(forest->FindRoot(lhs), forest->FindRoot(rhs)); - EXPECT_FALSE(forest->SameSet(lhs, rhs)); - } - } - } - } -}; - -using Forests = ::testing::Types< - DisjointSetForest, DisjointSetForest, - DisjointSetForest, DisjointSetForest, - DisjointSetForest, DisjointSetForest, - DisjointSetForest, DisjointSetForest>; -TYPED_TEST_SUITE(DisjointSetForestTest, Forests); - -TYPED_TEST(DisjointSetForestTest, DefaultEmpty) { - TypeParam forest; - EXPECT_EQ(0, forest.size()); -} - -TYPED_TEST(DisjointSetForestTest, InitEmpty) { - TypeParam forest; - forest.Init(0); - EXPECT_EQ(0, forest.size()); -} - -TYPED_TEST(DisjointSetForestTest, Populated) { - TypeParam forest; - forest.Init(5); - EXPECT_EQ(5, forest.size()); - this->ExpectSets({{0}, {1}, {2}, {3}, {4}}, &forest); - - forest.UnionOfRoots(1, 2); - this->ExpectSets({{0}, {1, 2}, {3}, {4}}, &forest); - - forest.Union(1, 2); - this->ExpectSets({{0}, {1, 2}, {3}, {4}}, &forest); - - forest.UnionOfRoots(0, 4); - this->ExpectSets({{0, 4}, {1, 2}, {3}}, &forest); - - forest.Union(3, 4); - this->ExpectSets({{0, 3, 4}, {1, 2}}, &forest); - - forest.Union(0, 3); - this->ExpectSets({{0, 3, 4}, {1, 2}}, &forest); - - forest.Union(2, 0); - this->ExpectSets({{0, 1, 2, 3, 4}}, &forest); - - forest.Union(1, 3); - this->ExpectSets({{0, 1, 2, 3, 4}}, &forest); -} - -// Testing rig for checking that when union by rank is disabled, the root of a -// merged set can be controlled. -class DisjointSetForestNoUnionByRankTest : public ::testing::Test { - protected: - using Forest = DisjointSetForest; - - // Expects that the roots of the |forest| match |expected_roots|. - void ExpectRoots(const std::vector &expected_roots, Forest *forest) { - ASSERT_EQ(expected_roots.size(), forest->size()); - for (uint32 i = 0; i < forest->size(); ++i) { - EXPECT_EQ(expected_roots[i], forest->FindRoot(i)); - } - } -}; - -TEST_F(DisjointSetForestNoUnionByRankTest, ManuallySpecifyRoot) { - Forest forest; - forest.Init(5); - ExpectRoots({0, 1, 2, 3, 4}, &forest); - - forest.UnionOfRoots(0, 1); // 1 is the root - ExpectRoots({1, 1, 2, 3, 4}, &forest); - - forest.Union(4, 3); // 3 is the root - ExpectRoots({1, 1, 2, 3, 3}, &forest); - - forest.Union(0, 2); // 2 is the root - ExpectRoots({2, 2, 2, 3, 3}, &forest); - - forest.Union(3, 3); // no effect - ExpectRoots({2, 2, 2, 3, 3}, &forest); - - forest.Union(4, 0); // 2 is the root - ExpectRoots({2, 2, 2, 2, 2}, &forest); -} - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/edit_changes.proto b/tensorflow_text/core/kernels/edit_changes.proto deleted file mode 100644 index 62d622b7a..000000000 --- a/tensorflow_text/core/kernels/edit_changes.proto +++ /dev/null @@ -1,15 +0,0 @@ -syntax = "proto2"; - -package tensorflow.text; - -// Protocol buffer for serializing a single icu::Edits object -// represented by a sequence of edit changes pairs: (old_length, new_length) -message EditChanges { - message Change { - optional int32 old_length = 1; - optional int32 new_length = 2; - } - - repeated Change change = 1; -} - diff --git a/tensorflow_text/core/kernels/exp_greedy_constrained_sequence_kernel_test.cc b/tensorflow_text/core/kernels/exp_greedy_constrained_sequence_kernel_test.cc deleted file mode 100644 index 0c91e24be..000000000 --- a/tensorflow_text/core/kernels/exp_greedy_constrained_sequence_kernel_test.cc +++ /dev/null @@ -1,854 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include -#include -#include "tensorflow/core/framework/fake_input.h" -#include "tensorflow/core/framework/node_def_builder.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/kernels/ops_testutil.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/platform/status.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow_text/core/kernels/text_kernels_test_util.h" - -namespace tensorflow { - -using tensorflow::DT_INT32; -using tensorflow::FakeInput; -using tensorflow::NodeDefBuilder; -using tensorflow::Status; -using tensorflow::TensorShape; -using tensorflow::text_kernels_test_util::MatrixEq; -using tensorflow::text_kernels_test_util::VectorEq; - -class ExpGreedyConstrainedSequenceTest : public tensorflow::OpsTestBase { - public: - void SetUpOpWithDefaults() { - // Prepare graph. - TF_ASSERT_OK(NodeDefBuilder("tested_op", "ConstrainedSequence") - .Attr("Tin", DT_INT32) - .Attr("use_viterbi", false) - .Attr("use_log_space", false) - .Attr("use_start_and_end_states", true) - .Input(FakeInput()) - .Input(FakeInput()) - .Input(FakeInput()) - .Input(FakeInput()) - .Finalize(node_def())); - TF_ASSERT_OK(InitOp()); - } -}; - -// TODO(b/122968457): There are a bunch of tests that only validate !ok instead -// of looking for specific error messages; fix that. - -// This test examines evaluations with only a permissions matrix. -TEST_F(ExpGreedyConstrainedSequenceTest, - ComputesSingleTransitionWithNoWeights) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The first sequence's highest score is 2, but OUT->2 is not ok, so it's 1. - // The second sequence's highest score is 3, which is ok. - // The third sequence's highest score is 0, which is ok. - - // Validate the output. - std::vector expected_transitions({1, 3, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with an empty weights matrix not of rank 2. -TEST_F(ExpGreedyConstrainedSequenceTest, - ComputesSingleTransitionWithNonMatrixEmptyWeights) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The first sequence's highest score is 2, but OUT->2 is not ok, so it's 1. - // The second sequence's highest score is 3, which is ok. - // The third sequence's highest score is 0, which is ok. - - // Validate the output. - std::vector expected_transitions({1, 3, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with a 2D score matrix (implicit batch 1). -TEST_F(ExpGreedyConstrainedSequenceTest, - ComputesSingleTransitionWithSingleBatchItem) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({1, 4}), // - { - 10.0, 12.0, 13.0, 4.0, // - }); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({1}), {1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The sequence's highest score is 2, but OUT->2 is not ok, so it's 1. - // Validate the output. - std::vector expected_transitions({1}); - std::vector expected_offsets({0, 1}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines int64 input type and int32 output type. -TEST_F(ExpGreedyConstrainedSequenceTest, int64inint32out) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The first sequence's highest score is 2, but OUT->2 is not ok, so it's 1. - // The second sequence's highest score is 3, which is ok. - // The third sequence's highest score is 0, which is ok. - // Validate the output. - // Validate the output. - std::vector expected_transitions({1, 3, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test ensures the op can take a sequence length of type {{X},{Y},{Z}} -// (with an outer batch dimension). -TEST_F(ExpGreedyConstrainedSequenceTest, TwoDimensionalSequenceLengths) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3, 1}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The first sequence's highest score is 2, but OUT->2 is not ok, so it's 1. - // The second sequence's highest score is 3, which is ok. - // The third sequence's highest score is 0, which is ok. - - // Validate the output. - std::vector expected_transitions({1, 3, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test ensures that final transitions that are forbidden by the permission -// matrix (final->null) are not taken. -TEST_F(ExpGreedyConstrainedSequenceTest, - ComputesSingleTransitionWithNoWeightsConstrainedByEnd) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, false, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The first sequence's highest score is 2, but OUT->2 is not ok; the next - // highest is 1, but 1->OUT is not OK; the next highest is 0, which is OK. - // The second sequence's highest score is 3, OUT->3 is OK and 3->OUT is OK. - // The third sequence's highest score is 0, OUT->0 is OK and 0->OUT is OK. - // Validate the output. - std::vector expected_transitions({0, 3, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with only a weight matrix. -TEST_F(ExpGreedyConstrainedSequenceTest, - ComputesSingleTransitionWithNoPermissions) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 7.0, 4.0, // - 1.0, 9.0, 11.0, 5.0, // - 100.0, 24.0, 3.0, 4.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.1, 0.5, 0.5, 1.0, 1.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // All scores should be multiplied by the last row in the weight tensor, so - // the 'real' scores are: - // 1: {1.0, 1.0, 3.5, 4.0} (max is 3) - // 2: {0.1, 4.5, 5.5, 5.0} (max is 2) - // 3: {10.0, 12.0, 1.5, 4.0} (max is 1) - // Validate the output. - std::vector expected_transitions({3, 2, 1}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with an empty not rank 2 permissions matrix. -TEST_F(ExpGreedyConstrainedSequenceTest, - ComputesSingleTransitionWithNonMatrixEmptyPermissions) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 7.0, 4.0, // - 1.0, 9.0, 11.0, 5.0, // - 100.0, 24.0, 3.0, 4.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.1, 0.5, 0.5, 1.0, 1.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // All scores should be multiplied by the last row in the weight tensor, so - // the 'real' scores are: - // 1: {1.0, 1.0, 3.5, 4.0} (max is 3) - // 2: {0.1, 4.5, 5.5, 5.0} (max is 2) - // 3: {10.0, 12.0, 1.5, 4.0} (max is 1) - // Validate the output. - std::vector expected_transitions({3, 2, 1}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test ensures that final transitions are scored with the probability -// of ending the sequence on the transition (x->final->null). -TEST_F(ExpGreedyConstrainedSequenceTest, - ComputesSingleTransitionWithNoPermissionsWeightedByEnd) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 7.0, 4.0, // - 1.0, 9.0, 11.0, 5.0, // - 100.0, 24.0, 3.0, 4.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 0.1, // - 0.1, 0.5, 0.5, 1.0, 1.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // All scores should be multiplied by the last row and the last column in the - // score tensor, so the real scores are: - // 1: {1.0, 1.0, 3.5, 0.4} (max is 2) - // 2: {0.1, 4.5, 5.5, 0.5} (max is 2) - // 3: {10.0, 12.0, 1.5, 0.4} (max is 1) - // Validate the output. - std::vector expected_transitions({2, 2, 1}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test ensures that final transitions are not scored with the probability -// of ending the sequence on the transition (x->final->null) if -// use_start_and_end_states is False. -TEST_F(ExpGreedyConstrainedSequenceTest, - ComputesSingleTransitionWithNoPermissionsNotWeightedByEnd) { - // Prepare graph. - TF_ASSERT_OK(NodeDefBuilder("tested_op", "ConstrainedSequence") - .Attr("Tin", DT_INT32) - .Attr("use_viterbi", false) - .Attr("use_log_space", false) - .Attr("use_start_and_end_states", false) - .Input(FakeInput()) - .Input(FakeInput()) - .Input(FakeInput()) - .Input(FakeInput()) - .Finalize(node_def())); - TF_ASSERT_OK(InitOp()); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 7.0, 4.0, // - 1.0, 9.0, 11.0, 5.0, // - 100.0, 24.0, 3.0, 4.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({4, 4}), {0.5, 0.5, 0.5, 0.5, // - 0.5, 0.5, 0.5, 0.5, // - 0.5, 0.5, 0.5, 0.5, // - 0.5, 0.5, 0.5, 0.5}); - - TF_ASSERT_OK(RunOpKernel()); - - // All scores should be multiplied by the last row and the last column in the - // score tensor, so the real scores are: - // 1: {5.0, 1.0, 3.5, 4.0} (max is 0) - // 2: {.5, 4.5, 5.5, 2.5} (max is 2) - // 3: {50.0, 12.0, 1.5,2.0} (max is 0) - // Validate the output. - std::vector expected_transitions({0, 2, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with both weight and permission matrices. -TEST_F(ExpGreedyConstrainedSequenceTest, - ComputesSingleTransitionWithWeightsAndPermissions) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 7.0, 4.0, // - 1.0, 9.0, 11.0, 5.0, // - 100.0, 24.0, 3.0, 4.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 'OUTSIDE' - true, false, true, true, false, // FROM 'NULL' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.1, 0.5, 0.5, 1.0, 1.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // All scores should be multiplied by the last row in the weight tensor, so - // the 'real' scores are: - // 1: {1.0, 1.0, 3.5, 4.0} (max is 3). OUT->3 is OK. - // 2: {0.1, 4.5, 5.5, 5.0} (max is 2). OUT->2 is OK. - // 3: {10.0, 12.0, 1.5, 4.0} (max is 1). OUT->1 is not OK, so go with 0. - // Note that X->OUT is set to always be OK here. - std::vector expected_transitions({3, 2, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines multiple evaluations with both weight and permission -// matrices. -TEST_F(ExpGreedyConstrainedSequenceTest, - ComputesMultipleTransitionsWithWeightsAndPermissions) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 2, 4}), // - {{ - 10.0, 12.0, 7.0, 4.0, // Batch 0, step 0 - 10.0, 10.0, 10.0, 10.0, // Batch 0, step 1 - 1.0, 9.0, 11.0, 5.0, // Batch 1, step 0 - 10.0, 15.0, 1.0, 12.0, // Batch 1, step 1 - 100.0, 24.0, 3.0, 4.0, // Batch 2, step 0 - 1.0, 11.0, 1.0, 10.0, // Batch 2, step 1 - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {2, 2, 2}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO NUL - true, true, true, true, true, // FROM 0 - true, true, true, true, false, // FROM 1 - true, false, true, false, true, // FROM 2 - true, true, true, true, true, // FROM 3 (OUT) - true, false, true, true, true, // FROM 'NULL' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.5, 0.5, 0.5, 0.5, 1.0, // 0 - 0.5, 0.5, 0.5, 0.5, 1.0, // 1 - 0.5, 0.5, 0.5, 0.5, 1.0, // 2 - 0.5, 0.5, 1.0, 0.5, 1.0, // 3 - 0.1, 0.5, 0.5, 1.0, 1.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // STEP 1: - // All scores should be multiplied by the last row in the weight tensor, so - // the 'real' scores are: - // 1: {0.1 6.0 3.5 4.0} (max is 3). OUT->3 is OK. - // 2: {0.1, 4.5, 5.5, 5.0} (max is 2). OUT->2 is OK. - // 3: {10.0, 12.0, 1.5, 4.0} (max is 1). OUT->1 is not OK, so go with 0. - // STEP 2: - // 1: In state '3', so use row 3 in the weight tensor. - // Weights are {5, 5, 10, 5}; 3->2 is OK and 2->OUT is OK; use 2. - // 2: In state '2', so use row 2 in the weight tensor. - // Weights are {5, 7.5, .5, 6.0}; 2->3 is not OK and 2->1 is not OK, so 0. - // 3: In state 0, so use row 0 in the weight tensor. - // Weights are {0.5, 5.5, 0.5, 5}; 0->1 is OK but 1->OUT is not, so 3. - - std::vector expected_transitions({3, 2, 2, 0, 0, 3}); - std::vector expected_offsets({0, 2, 4, 6}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} -// This test examines multiple evaluations with both weight and permission -// matrices. -TEST_F(ExpGreedyConstrainedSequenceTest, - ComputesMultipleTransitionsWithVaryingLengths) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 2, 4}), // - {{ - 10.0, 12.0, 7.0, 4.0, // Batch 0, step 0 - 10.0, 10.0, 10.0, 10.0, // Batch 0, step 1 - 1.0, 9.0, 11.0, 5.0, // Batch 1, step 0 - 10.0, 15.0, 1.0, 12.0, // Batch 1, step 1 - 100.0, 24.0, 3.0, 4.0, // Batch 2, step 0 - 1.0, 11.0, 1.0, 10.0, // Batch 2, step 1 - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {2, 1, 2}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO NUL - true, true, true, true, true, // FROM 0 - true, true, true, true, false, // FROM 1 - true, false, true, false, true, // FROM 2 - true, true, true, true, true, // FROM 3 (OUT) - true, false, true, true, true, // FROM 'NULL' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.5, 0.5, 0.5, 0.5, 1.0, // 0 - 0.5, 0.5, 0.5, 0.5, 1.0, // 1 - 0.5, 0.5, 0.5, 0.5, 1.0, // 2 - 0.5, 0.5, 1.0, 0.5, 1.0, // 3 - 0.1, 0.5, 0.5, 1.0, 1.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // STEP 1: - // All scores should be multiplied by the last row in the weight tensor, so - // the 'real' scores are: - // 1: {0.1 6.0 3.5 4.0} (max is 3). OUT->3 is OK. - // 2: {0.1, 4.5, 5.5, 5.0} (max is 2). OUT->2 and 2->OUT are OK. - // 3: {10.0, 12.0, 1.5, 4.0} (max is 1). OUT->1 is not OK, so go with 0. - // STEP 2: - // 1: In state '3', so use row 3 in the weight tensor. - // Weights are {5, 5, 10, 5}; 3->2 is OK and 2->OUT is OK; use 2. - // 2: End of sequence; no change. - // 3: In state 0, so use row 0 in the weight tensor. - // Weights are {0.5, 5.5, 0.5, 5}; 0->1 is OK but 1->OUT is not, so 3. - - std::vector expected_transitions({3, 2, 2, 0, 3}); - std::vector expected_offsets({0, 2, 3, 5}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with a fully negative input set. -TEST_F(ExpGreedyConstrainedSequenceTest, - ComputesSingleTransitionWithNegativeInputs) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - -10.0, -12.0, -13.0, -4.0, // - -1.0, -12.0, -13.0, -14.0, // - -15.0, -2.0, -3.0, -14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, true, true, true, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - std::vector expected_transitions({3, 0, 1}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with an all-zero weight matrix. -TEST_F(ExpGreedyConstrainedSequenceTest, - ComputesSingleTransitionWithZeroedWeights) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 7.0, 4.0, // - 1.0, 9.0, 11.0, 5.0, // - 100.0, 24.0, 3.0, 4.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), { - 0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, - }); - - TF_ASSERT_OK(RunOpKernel()); - - // In the case of a tie between weights, the higher state number wins; - // if all weights are zero, the states should all be 3. - - std::vector expected_transitions({3, 3, 3}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -TEST_F(ExpGreedyConstrainedSequenceTest, - ImpossibleSequencesResultInNegativeOnesIfAttrIsSet) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 2, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {2, 2, 2}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - false, false, false, false, false, // FROM 0 - false, false, false, false, false, // FROM 1 - false, false, false, false, false, // FROM 2 - false, false, false, false, false, // FROM 3 - false, false, false, false, false, // FROM 'OUT' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // Validate the output. - - std::vector expected_transitions({-1, -1, -1, -1, -1, -1}); - std::vector expected_offsets({0, 2, 4, 6}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test ensures the op will throw an error if there are too few scores to -// finalize all the sequences. -TEST_F(ExpGreedyConstrainedSequenceTest, ErrorsIfGivenInsufficientScores) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 2, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - auto result = RunOpKernel(); - EXPECT_FALSE(result.ok()); -} - -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/exp_viterbi_constrained_sequence_kernel_test.cc b/tensorflow_text/core/kernels/exp_viterbi_constrained_sequence_kernel_test.cc deleted file mode 100644 index 49cfa02be..000000000 --- a/tensorflow_text/core/kernels/exp_viterbi_constrained_sequence_kernel_test.cc +++ /dev/null @@ -1,910 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include -#include -#include "tensorflow/core/framework/fake_input.h" -#include "tensorflow/core/framework/node_def_builder.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/kernels/ops_testutil.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/platform/status.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow_text/core/kernels/text_kernels_test_util.h" - -namespace tensorflow { - -using tensorflow::DT_INT32; -using tensorflow::FakeInput; -using tensorflow::NodeDefBuilder; -using tensorflow::Status; -using tensorflow::TensorShape; -using tensorflow::text_kernels_test_util::MatrixEq; -using tensorflow::text_kernels_test_util::VectorEq; - -class ExpViterbiConstrainedSequenceTest : public tensorflow::OpsTestBase { - public: - void SetUpOpWithDefaults() { - // Prepare graph. - TF_ASSERT_OK(NodeDefBuilder("tested_op", "ConstrainedSequence") - .Attr("Tin", DT_INT32) - .Attr("use_viterbi", true) - .Attr("use_log_space", false) - .Attr("use_start_and_end_states", true) - .Input(FakeInput()) - .Input(FakeInput()) - .Input(FakeInput()) - .Input(FakeInput()) - .Finalize(node_def())); - TF_ASSERT_OK(InitOp()); - } -}; - -// TODO(b/122968457): There are a bunch of tests that only validate !ok instead -// of looking for specific error messages; fix that. - -// This test examines evaluations with only a permissions matrix. -TEST_F(ExpViterbiConstrainedSequenceTest, - ComputesSingleTransitionWithNoWeights) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The first sequence's highest score is 2, but OUT->2 is not ok, so it's 1. - // The second sequence's highest score is 3, which is ok. - // The third sequence's highest score is 0, which is ok. - - // Validate the output. - std::vector expected_transitions({1, 3, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with an empty weights matrix not of rank 2. -TEST_F(ExpViterbiConstrainedSequenceTest, - ComputesSingleTransitionWithNonMatrixEmptyWeights) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The first sequence's highest score is 2, but OUT->2 is not ok, so it's 1. - // The second sequence's highest score is 3, which is ok. - // The third sequence's highest score is 0, which is ok. - - // Validate the output. - std::vector expected_transitions({1, 3, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with a 2D score matrix (implicit batch 1). -TEST_F(ExpViterbiConstrainedSequenceTest, - ComputesSingleTransitionWithSingleBatchItem) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({1, 4}), // - { - 10.0, 12.0, 13.0, 4.0, // - }); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({1}), {1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The sequence's highest score is 2, but OUT->2 is not ok, so it's 1. - // Validate the output. - std::vector expected_transitions({1}); - std::vector expected_offsets({0, 1}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines int64 input type and int32 output type. -TEST_F(ExpViterbiConstrainedSequenceTest, int64inint32out) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The first sequence's highest score is 2, but OUT->2 is not ok, so it's 1. - // The second sequence's highest score is 3, which is ok. - // The third sequence's highest score is 0, which is ok. - // Validate the output. - // Validate the output. - std::vector expected_transitions({1, 3, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test ensures the op can take a sequence length of type {{X},{Y},{Z}} -// (with an outer batch dimension). -TEST_F(ExpViterbiConstrainedSequenceTest, TwoDimensionalSequenceLengths) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3, 1}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The first sequence's highest score is 2, but OUT->2 is not ok, so it's 1. - // The second sequence's highest score is 3, which is ok. - // The third sequence's highest score is 0, which is ok. - - // Validate the output. - std::vector expected_transitions({1, 3, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test ensures that final transitions that are forbidden by the permission -// matrix (final->null) are not taken. -TEST_F(ExpViterbiConstrainedSequenceTest, - ComputesSingleTransitionWithNoWeightsConstrainedByEnd) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, false, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The first sequence's highest score is 2, but OUT->2 is not ok; the next - // highest is 1, but 1->OUT is not OK; the next highest is 0, which is OK. - // The second sequence's highest score is 3, OUT->3 is OK and 3->OUT is OK. - // The third sequence's highest score is 0, OUT->0 is OK and 0->OUT is OK. - // Validate the output. - std::vector expected_transitions({0, 3, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with only a weight matrix. -TEST_F(ExpViterbiConstrainedSequenceTest, - ComputesSingleTransitionWithNoPermissions) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 7.0, 4.0, // - 1.0, 9.0, 11.0, 5.0, // - 100.0, 24.0, 3.0, 4.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.1, 0.5, 0.5, 1.0, 1.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // All scores should be multiplied by the last row in the weight tensor, so - // the 'real' scores are: - // 1: {1.0, 1.0, 3.5, 4.0} (max is 3) - // 2: {0.1, 4.5, 5.5, 5.0} (max is 2) - // 3: {10.0, 12.0, 1.5, 4.0} (max is 1) - // Validate the output. - std::vector expected_transitions({3, 2, 1}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with an empty not rank 2 permissions matrix. -TEST_F(ExpViterbiConstrainedSequenceTest, - ComputesSingleTransitionWithNonMatrixEmptyPermissions) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 7.0, 4.0, // - 1.0, 9.0, 11.0, 5.0, // - 100.0, 24.0, 3.0, 4.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.1, 0.5, 0.5, 1.0, 1.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // All scores should be multiplied by the last row in the weight tensor, so - // the 'real' scores are: - // 1: {1.0, 1.0, 3.5, 4.0} (max is 3) - // 2: {0.1, 4.5, 5.5, 5.0} (max is 2) - // 3: {10.0, 12.0, 1.5, 4.0} (max is 1) - // Validate the output. - std::vector expected_transitions({3, 2, 1}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test ensures that final transitions are scored with the probability -// of ending the sequence on the transition (x->final->null). -TEST_F(ExpViterbiConstrainedSequenceTest, - ComputesSingleTransitionWithNoPermissionsWeightedByEnd) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 7.0, 4.0, // - 1.0, 9.0, 11.0, 5.0, // - 100.0, 24.0, 3.0, 4.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 0.1, // - 0.1, 0.5, 0.5, 1.0, 1.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // All scores should be multiplied by the last row and the last column in the - // score tensor, so the real scores are: - // 1: {1.0, 1.0, 3.5, 0.4} (max is 2) - // 2: {0.1, 4.5, 5.5, 0.5} (max is 2) - // 3: {10.0, 12.0, 1.5, 0.4} (max is 1) - // Validate the output. - std::vector expected_transitions({2, 2, 1}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test ensures that final transitions are not scored with the probability -// of ending the sequence on the transition (x->final->null) if -// use_start_and_end_states is False. -TEST_F(ExpViterbiConstrainedSequenceTest, - ComputesSingleTransitionWithNoPermissionsNotWeightedByEnd) { - // Prepare graph. - TF_ASSERT_OK(NodeDefBuilder("tested_op", "ConstrainedSequence") - .Attr("Tin", DT_INT32) - .Attr("use_viterbi", true) - .Attr("use_log_space", false) - .Attr("use_start_and_end_states", false) - .Input(FakeInput()) - .Input(FakeInput()) - .Input(FakeInput()) - .Input(FakeInput()) - .Finalize(node_def())); - TF_ASSERT_OK(InitOp()); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 7.0, 4.0, // - 1.0, 9.0, 11.0, 5.0, // - 100.0, 24.0, 3.0, 4.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({4, 4}), {0.5, 0.5, 0.5, 0.5, // - 0.5, 0.5, 0.5, 0.5, // - 0.5, 0.5, 0.5, 0.5, // - 0.5, 0.5, 0.5, 0.5}); - - TF_ASSERT_OK(RunOpKernel()); - - // All scores should be multiplied by the last row and the last column in the - // score tensor, so the real scores are: - // 1: {5.0, 1.0, 3.5, 4.0} (max is 0) - // 2: {.5, 4.5, 5.5, 2.5} (max is 2) - // 3: {50.0, 12.0, 1.5,2.0} (max is 0) - // Validate the output. - std::vector expected_transitions({0, 2, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with both weight and permission matrices. -TEST_F(ExpViterbiConstrainedSequenceTest, - ComputesSingleTransitionWithWeightsAndPermissions) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 7.0, 4.0, // - 1.0, 9.0, 11.0, 5.0, // - 100.0, 24.0, 3.0, 4.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 'OUTSIDE' - true, false, true, true, false, // FROM 'NULL' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.1, 0.5, 0.5, 1.0, 1.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // All scores should be multiplied by the last row in the weight tensor, so - // the 'real' scores are: - // 1: {1.0, 1.0, 3.5, 4.0} (max is 3). OUT->3 is OK. - // 2: {0.1, 4.5, 5.5, 5.0} (max is 2). OUT->2 is OK. - // 3: {10.0, 12.0, 1.5, 4.0} (max is 1). OUT->1 is not OK, so go with 0. - // Note that X->OUT is set to always be OK here. - std::vector expected_transitions({3, 2, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines multiple evaluations with both weight and permission -// matrices. -TEST_F(ExpViterbiConstrainedSequenceTest, - ComputesMultipleTransitionsWithWeightsAndPermissions) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 2, 4}), // - {{ - 10.0, 12.0, 7.0, 4.0, // Batch 0, step 0 - 10.0, 10.0, 10.0, 10.0, // Batch 0, step 1 - 1.0, 9.0, 11.0, 5.0, // Batch 1, step 0 - 10.0, 15.0, 1.0, 12.0, // Batch 1, step 1 - 100.0, 24.0, 3.0, 4.0, // Batch 2, step 0 - 1.0, 11.0, 1.0, 10.0, // Batch 2, step 1 - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {2, 2, 2}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO NUL - true, true, true, true, true, // FROM 0 - true, true, true, true, false, // FROM 1 - true, false, true, false, true, // FROM 2 - true, true, true, true, true, // FROM 3 (OUT) - true, false, true, true, true, // FROM 'NULL' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.5, 0.5, 0.5, 0.5, 1.0, // 0 - 0.5, 0.5, 0.5, 0.5, 1.0, // 1 - 0.5, 0.5, 0.5, 0.5, 1.0, // 2 - 0.5, 0.5, 1.0, 0.5, 1.0, // 3 - 0.1, 0.5, 0.5, 1.0, 1.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // STEP 1: - // All scores should be multiplied by the last row in the weight tensor, so - // the 'real' scores are: - // B0: { 1.0, [NOTOK], 3.5, 4.0} - // B1: { 0.1, [NOTOK], 5.5, 5.0} - // B2: {10.0, [NOTOK], 1.5, 4.0} - // - // STEP 2: - // (Forbidden transitions are marked with '*') - // - // BATCH 0: - // Raw scores are: {10.0, 10.0, 10.0, 10.0} - // from 0: New scores are {5.0, 5.0, 5.0, 5.0}, totals: {5, 0, 17.5, 20} - // from 1: New scores are {5.0, 5.0, 0*, 5.0}, totals: {5, 0, 0, 20} - // from 2: New scores are {5.0, 5.0, 5.0, 10.0}, totals: {5, 0, 17.5, 40} - // from 3: New scores are {5.0, 5.0, 0*, 5.0}, totals: {5, 0, 0, 20} - // Top scores are 20, 20, 40, 20 from [3, 3, 3, 3]. - // 1->OUT is not valid. - // Final scores are [20, 0, 40, 20] for a - // final state of [2] with a sequence of [3->2]. - // - // BATCH 1: - // Raw scores are {10, 15, 1, 12} - // from 0: Weighted score is {5, 5, 5, 5}, totals: {0.5, 0, 27.5, 25} - // from 1: Weighted score is {7.5, 7.5, 0*, 7.5}, t: {0.75, 0, 0, 37.5} - // from 2: Weighted score is {0.5, 0.5, 0.5, 1.0}, t: {0.05, 0, 2.75, 5} - // from 3: Weighted score is {6, 6, 0*, 6}, totals: {0.6, 0, 0, 30} - // Top scores are {27.5, 37.5, 5, 30} from [2, 3, 3, 3] - // 1->OUT is not valid, so final scores are [27.5, 0, 5, 30] for a final - // state of [3] and a sequence of [3, 3] - // - // BATCH 2: - // Raw scores are {1.0, 11.0, 1.0, 10.0} - // 2/0: Weighted score is {.5, .5, .5, .5}. t: {5, 0, 0.75, 2} - // 2/1: Weighted score is {5.5, 5.5, 0*, 5.5}. t: {55, 0, 0, 22} - // 2/2: Weighted score is {.5, .5, .5, 1.0}. t: {5, 0, 0.75, 4} - // 2/3: Weighted score is {5, 5, 0*, 5}. t: {50, 0, 0, 20} - // Top scores are {5, 55, 5, 50} from [0, 0, 0, 0] - // 1->OUT is not valid, so final scores are [5, 0, 5, 50] for a final - // state of 3 and a sequence of [0, 3]. - - std::vector expected_transitions({3, 2, 3, 3, 0, 3}); - std::vector expected_offsets({0, 2, 4, 6}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines multiple evaluations with both weight and permission -// matrices. -TEST_F(ExpViterbiConstrainedSequenceTest, - ComputesMultipleTransitionsWithVaryingLengths) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 2, 4}), // - {{ - 10.0, 12.0, 7.0, 4.0, // Batch 0, step 0 - 10.0, 10.0, 10.0, 10.0, // Batch 0, step 1 - 1.0, 9.0, 11.0, 5.0, // Batch 1, step 0 - 10.0, 15.0, 1.0, 12.0, // Batch 1, step 1 - 100.0, 24.0, 3.0, 4.0, // Batch 2, step 0 - 1.0, 11.0, 1.0, 10.0, // Batch 2, step 1 - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {2, 1, 2}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO NUL - true, true, true, true, true, // FROM 0 - true, true, true, true, false, // FROM 1 - true, false, true, false, true, // FROM 2 - true, true, true, true, true, // FROM 3 (OUT) - true, false, true, true, true, // FROM 'NULL' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.5, 0.5, 0.5, 0.5, 1.0, // 0 - 0.5, 0.5, 0.5, 0.5, 1.0, // 1 - 0.5, 0.5, 0.5, 0.5, 1.0, // 2 - 0.5, 0.5, 1.0, 0.5, 1.0, // 3 - 0.1, 0.5, 0.5, 1.0, 1.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // STEP 1: - // All scores should be multiplied by the last row in the weight tensor, so - // the 'real' scores are: - // B0: { 1.0, [NOTOK], 3.5, 4.0} - // B1: { 0.1, [NOTOK], 5.5, 5.0} - // B2: {10.0, [NOTOK], 1.5, 4.0} - // - // STEP 2: - // (Forbidden transitions are marked with '*') - // - // BATCH 0: - // Raw scores are: {10.0, 10.0, 10.0, 10.0} - // from 0: New scores are {5.0, 5.0, 5.0, 5.0}, totals: {5, 0, 17.5, 20} - // from 1: New scores are {5.0, 5.0, 0*, 5.0}, totals: {5, 0, 0, 20} - // from 2: New scores are {5.0, 5.0, 5.0, 10.0}, totals: {5, 0, 17.5, 40} - // from 3: New scores are {5.0, 5.0, 0*, 5.0}, totals: {5, 0, 0, 20} - // Top scores are 20, 20, 40, 20 from [3, 3, 3, 3]. - // 1->OUT is not valid. - // Final scores are [20, 0, 40, 20] for a - // final state of [2] with a sequence of [3->2]. - // - // BATCH 1: - // End of sequence; no further action. - // - // BATCH 2: - // Raw scores are {1.0, 11.0, 1.0, 10.0} - // 2/0: Weighted score is {.5, .5, .5, .5}. t: {5, 0, 0.75, 2} - // 2/1: Weighted score is {5.5, 5.5, 0*, 5.5}. t: {55, 0, 0, 22} - // 2/2: Weighted score is {.5, .5, .5, 1.0}. t: {5, 0, 0.75, 4} - // 2/3: Weighted score is {5, 5, 0*, 5}. t: {50, 0, 0, 20} - // Top scores are {5, 55, 5, 50} from [0, 0, 0, 0] - // 1->OUT is not valid, so final scores are [5, 0, 5, 50] for a final - // state of 3 and a sequence of [0, 3]. - - std::vector expected_transitions({3, 2, 2, 0, 3}); - std::vector expected_offsets({0, 2, 3, 5}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with an all-zero weight matrix. -TEST_F(ExpViterbiConstrainedSequenceTest, - ComputesSingleTransitionWithZeroedWeights) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 7.0, 4.0, // - 1.0, 9.0, 11.0, 5.0, // - 100.0, 24.0, 3.0, 4.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), { - 0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, - }); - - TF_ASSERT_OK(RunOpKernel()); - - // In the case of a tie between weights, the higher state number wins; - // if all weights are zero, the states should all be 3. - - std::vector expected_transitions({3, 3, 3}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -TEST_F(ExpViterbiConstrainedSequenceTest, - ImpossibleSequencesResultInNegativeOnesIfAttrIsSet) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 2, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {2, 2, 2}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - false, false, false, false, false, // FROM 0 - false, false, false, false, false, // FROM 1 - false, false, false, false, false, // FROM 2 - false, false, false, false, false, // FROM 3 - false, false, false, false, false, // FROM 'OUT' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // Validate the output. - - std::vector expected_transitions({-1, -1, -1, -1, -1, -1}); - std::vector expected_offsets({0, 2, 4, 6}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test ensures the op will throw an error if there are too few scores to -// finalize all the sequences. -TEST_F(ExpViterbiConstrainedSequenceTest, ErrorsIfGivenInsufficientScores) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 2, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - auto result = RunOpKernel(); - EXPECT_FALSE(result.ok()); -} - -// This test ensures that the op correctly outputs a ragged tensor with type -// int32 -TEST_F(ExpViterbiConstrainedSequenceTest, OutputsInt32RaggedTensor) { - // Prepare graph. - SetUpOpWithDefaults(); - - AddInputFromArray( - TensorShape({3, 2, 4}), // - {{ - 10.0, 12.0, 7.0, 4.0, // Tr. to 3 - 10.0, 10.0, 10.0, 10.0, // Tr. 3 to 2 on wt. - 1.0, 9.0, 11.0, 5.0, // Tr. to 2 - 10.0, 15.0, 1.0, 12.0, // Irrelevant (past end of sequence) - 100.0, 24.0, 3.0, 4.0, // Tr. to 0 - 1.0, 10.0, 1.0, 10.0, // Tr. 0 to 3 (1 cannot tr. to NULL) - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {2, 1, 2}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO NUL - true, true, true, true, true, // FROM 0 - true, true, true, true, false, // FROM 1 - true, false, true, false, true, // FROM 2 - true, true, true, true, true, // FROM 3 (OUT) - true, false, true, true, true, // FROM 'NULL' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.5, 0.5, 0.5, 0.5, 1.0, // 0 - 0.5, 0.5, 0.5, 0.5, 1.0, // 1 - 0.5, 0.5, 0.5, 0.5, 1.0, // 2 - 0.5, 0.5, 1.0, 0.5, 1.0, // 3 - 0.1, 0.5, 0.5, 1.0, 1.0}); - - TF_ASSERT_OK(RunOpKernel()); - - std::vector expected_transitions({3, 2, 2, 0, 3}); - std::vector expected_offsets({0, 2, 3, 5}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/fast_bert_normalizer.h b/tensorflow_text/core/kernels/fast_bert_normalizer.h index c7b8f1849..52721e97a 100644 --- a/tensorflow_text/core/kernels/fast_bert_normalizer.h +++ b/tensorflow_text/core/kernels/fast_bert_normalizer.h @@ -15,352 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_BERT_NORMALIZER_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_BERT_NORMALIZER_H_ -#include -#include -#include - -#include "absl/base/optimization.h" -#include "absl/status/status.h" -#include "absl/strings/string_view.h" -#include "icu4c/source/common/unicode/utf8.h" -#include "tensorflow/lite/kernels/shim/status_macros.h" -#include "tensorflow_text/core/kernels/darts_clone_trie_wrapper.h" -#include "tensorflow_text/core/kernels/fast_bert_normalizer_model_generated.h" - -namespace tensorflow { -namespace text { -namespace text_norm { - -// Bit configurations to encode the mapped normalized value. Currently, -// - The 1st bit (from the left) is reserved by Darts-clone Trie. -// - The 2nd bit stores whether the normalized string is different from the -// codepoint itself. It is also used to differentiate from value 0, which is -// the value returned by `LookupData()` when the codepoint is not stored on -// the trie. -// - The next 24 bits (3 to 26) encode the offset of the normalized string in -// a shared pool. -// - The last 6 bits (27 to 32) encode the length of utf8 bytes of the -// normalized string. - -// The 2rd bit stores whether the normalized string is different from itself. -static constexpr unsigned int kIsNormalizedStringDifferentMask = 0x40000000; - -// Number of lowest bits to represent the length of utf8 bytes of mapped -// values. 6-bit is enough to encode the length of the normalized strings. -static constexpr unsigned int kBitsToEncodeUtf8LengthOfNormalizedString = 6; - -// The mask for getting the length of the normalized string. It equals to 0x3F -// when `kBitsToEncodeUtf8LengthOfNormalizedString = 6`. -static constexpr unsigned int kNormalizedStringLengthMask = - (1 << kBitsToEncodeUtf8LengthOfNormalizedString) - 1; - -// Maximum length of utf8 bytes of normalized strings. It equals to 63 -// when `kBitsToEncodeUtf8LengthOfNormalizedString = 6`. -static constexpr unsigned int kMaximumUtf8LengthOfNormalizedString = - (1 << kBitsToEncodeUtf8LengthOfNormalizedString) - 1; - -// The mask for getting the offset of the normalized string in the pool. It -// equals to 0x3FFFFFC0 when `kBitsToEncodeUtf8LengthOfNormalizedString = 6`. -static constexpr unsigned int kNormalizedStringOffsetMask = - (kIsNormalizedStringDifferentMask - 1) ^ kNormalizedStringLengthMask; - -// Each normalized string is represented as a continuous utf-8 substring in a -// pool. `kMaximumOffsetOfNormalizedString` denotes the maximum offset supported -// here. -static constexpr unsigned int kMaximumOffsetOfNormalizedString = - (1 << (32 - 2 - kBitsToEncodeUtf8LengthOfNormalizedString)) - 1; - -} // namespace text_norm - -// A fast text normalizer for BERT based on codepoint-wise mappings. -class FastBertNormalizer { - public: - // Creates an instance. - // - // Args: - // * trie_data: the pointer to the trie data, which is not owned by this - // instance and should be kept alive through the lifetime of the instance. - // * data_for_codepoint_zero: the mapped data for the codepoint zero. - // * normalized_string_pool: the pointer to the normalized string pool data, - // which is not owned by this instance and should be kept alive through the - // lifetime of the instance. - static absl::StatusOr Create( - const uint32_t* trie_data, int data_for_codepoint_zero, - const char* normalized_string_pool, - size_t normalized_string_pool_size = static_cast(-1)) { - if (trie_data == nullptr || normalized_string_pool == nullptr) { - return absl::InvalidArgumentError( - "trie_data or normalized_string_pool is null"); - } - FastBertNormalizer result; - SH_ASSIGN_OR_RETURN(auto trie, - trie_utils::DartsCloneTrieWrapper::Create(trie_data)); - result.trie_ = - std::make_unique(std::move(trie)); - result.data_for_codepoint_zero_ = data_for_codepoint_zero; - result.normalized_string_pool_ = - reinterpret_cast(normalized_string_pool); - result.normalized_string_pool_size_ = normalized_string_pool_size; - return result; - } - - // Creates an instance. - // - // Args: - // * model_flatbuffer: the pointer to the FastBertNormalizerModel - // flatbuffer, which is not owned by this instance and should be kept alive - // through the lifetime of the instance. - static absl::StatusOr Create( - const void* model_flatbuffer) { - if (model_flatbuffer == nullptr) { - return absl::InvalidArgumentError("model_flatbuffer is null"); - } - // `GetFastBertNormalizerModel()` is autogenerated by flatbuffer. - auto model = GetFastBertNormalizerModel(model_flatbuffer); - if (model == nullptr || model->trie_array() == nullptr || - model->normalized_string_pool() == nullptr) { - return absl::InvalidArgumentError( - "FastBertNormalizerModel or its required fields are null"); - } - return Create( - model->trie_array()->data(), model->data_for_codepoint_zero(), - reinterpret_cast(model->normalized_string_pool()->data()), - model->normalized_string_pool()->size()); - } - - // Normalizes the input based on config `lower_case_nfd_strip_accents`. - // - // It keeps track that, for each byte in the normalized string, which position - // in the original input it should best map to (see below notes). - // - // Here are a few examples (assuming `lower_case_nfd_strip_accents=true`): - // * Input: "ABC" - // Output: "abc" - // Mapping: 0,1,2,3 - // Explanation: "A" -> "a", "B" -> "b", "C" -> "c". The start position of - // "a" maps to position 0 in the input; its exclusive end position equals to - // the start position of "b", which maps to position 1 in the input. The - // start position of "c" maps to position 2 in the input. The exclusive end - // position of "c" (which is also the end of the normalized string) maps to - // position 3 in the input (i.e., the end of input). - // * Input: "B\x41\xCC\x80C" - // Output: "bac" - // Mapping: 0,1,4,5 - // Explanation: "\x41\xCC\x80" -> "a". So the start position of "a" maps to - // position 1 in the input; the exclusive end position of "a" (which is also - // the start position of "c") is position 4 in the input. The exclusive end - // position of "c" (which is also the end of the normalized string) maps to - // position 5 in the input (i.e., the end of input). - // * Input: "a\xCE\x89" - // Output: "a\xCE\xB7" - // Mapping: 0,1,1,3 - // Explanation: "\xCE\xB9" (2 bytes) -> "\xCE\xB7" (2 bytes). Because - // "\xCE\xB7" represents the normalized string of the codepoint U+0389 (i.e. - // "\xCE\x90"), their start positions both map to position 1 in the input - // (which is the start position of that codepoint). - // * Input: "a\xC2\xBC" - // Output = "a1\xE2\x81\x84""4" - // Mapping: 0,1,1,1,1,1,3 - // Explanation: "\xC2\xBC" (2 bytes) -> "1\xE2\x81\x84""4" (5 bytes). The - // start points of those 5 bytes all point to position 1 in the input, which - // is the start position of that codepoint. - // - // Note that if the input character is not changed after normalization, the - // bytes are mapped to their original byte locations. For example: - // * Input: "a\xCC\x80" - // Output: "a\xCC\x80" - // Mapping: 0,1,2,3 - // However, if a multibyte character is changed after normalization, all bytes - // of the result character map to the first byte of the character in the - // input. - // * Input: "a\xCE\x89" - // Output: "a\xCE\xB7" - // Mapping: 0,1,1,3 - // The reasons are two-folds: - // 1. When a multibyte character is changed after normalizatoon, it is not - // always feasible to map every internal byte in the output back to their - // corresponding byte in the input. For example, consider the cases where - // 2-bytes are normalized to 3-bytes or vice versa. - // 2. The mapping of the internal bytes in the normalized text is usually not - // used, because users work with UTF-8 output in unit of codepoints, and only - // the mapping of the first byte is important. - // - // - // This function does not check whether the input is valid utf-8. This - // behavior is consistent with the existing TF.Text::BertTokenizer. - // - // Args: - // * input_text: The input text. - // * is_output_identical_as_input: True if the normalized string is the - // same as the input. In this case, `output_normalized_text` is empty and - // `output_normalized_offset_mapping` is not changed. - // * output_normalized_text: The normalized text. - // * output_normalized_offset_mapping: In addition to the existing content, - // the extended new content has size 1 plus the size of `normalized_text`. - // Each value is the mapped offset of each byte of `normalized_text` in the - // original `input_text`. The final value maps the end of `normalized_text` - // to the end of `input_text`. - template - void NormalizeText(absl::string_view input_text, - bool* is_output_identical_as_input, - std::string* output_normalized_text, - std::vector* output_normalized_offset_mapping) const { - *output_normalized_text = ""; - // `output_normalized_offset_mapping` is not cleared so the existing content - // is kept. - int last_pos_to_copy_over = 0; // Mark where the copy stopped last time. - auto copy_unchanged_input_to_output = - [input_text, output_normalized_text, output_normalized_offset_mapping, - &last_pos_to_copy_over](int exclusive_copy_end) { - // Copy from `last_pos_to_copy_over` to `exclusive_copy_end` and - // update `last_pos_to_copy_over` accordingly. - if (last_pos_to_copy_over < exclusive_copy_end) { - absl::StrAppend( - output_normalized_text, - input_text.substr(last_pos_to_copy_over, - exclusive_copy_end - last_pos_to_copy_over)); - if constexpr (kGetOffsets) { - for (int i = last_pos_to_copy_over; i < exclusive_copy_end; ++i) { - output_normalized_offset_mapping->push_back(i); - } - } - last_pos_to_copy_over = exclusive_copy_end; - } - }; - int cur_pos = 0; // Current position in `input_text` to process. - while (cur_pos < input_text.size()) { - int next_pos = cur_pos; - U8_FWD_1(input_text.data(), next_pos, input_text.size()); - const int cp_byte_length = next_pos - cur_pos; - if (cp_byte_length == 0) { - // The codepoint here has length 0, which is probably invalid UTF-8. - // Copy the remaining unchanged text if any. - copy_unchanged_input_to_output(cur_pos); - // Output a whitespace here to replace the invalid UTF-8 byte. - absl::StrAppend(output_normalized_text, " "); - if constexpr (kGetOffsets) { - output_normalized_offset_mapping->push_back(cur_pos); - } - // Move by one byte. - ++cur_pos; - // Mark the next position to copy over. - last_pos_to_copy_over = cur_pos; - continue; - } - const int encoded_data = - LookupData(input_text.substr(cur_pos, cp_byte_length)); - if (!IsNormalizedStringDifferent(encoded_data)) { - // The codepoint is the same as the normalized. We skip here and copy - // over in an aggregation way for efficiency reasons. - cur_pos += cp_byte_length; // Now move by one codepoint. - continue; - } - absl::string_view normalized_codepoint = - GetNormalizedString(encoded_data); - // Copy the previous unchanged text if any. - copy_unchanged_input_to_output(cur_pos); - - // Output the normalized codepoint text. - absl::StrAppend(output_normalized_text, normalized_codepoint); - if constexpr (kGetOffsets) { - // Every byte of the normalized string should be map to the same start - // position of the current codepoint in the original `input_text`. - for (int i = 0; i < normalized_codepoint.size(); ++i) { - output_normalized_offset_mapping->push_back(cur_pos); - } - } - // Move by one codepoint. - cur_pos += cp_byte_length; - // Mark the next position to copy over. - last_pos_to_copy_over = cur_pos; - } - if (last_pos_to_copy_over == 0) { - // This means that the normalized string would be the same as the input. - *is_output_identical_as_input = true; - return; - } - *is_output_identical_as_input = false; - // Copy the remaining unchanged text if any. - copy_unchanged_input_to_output(input_text.size()); - // Push one more mapping from end_of_normalized to end_of_original. - if constexpr (kGetOffsets) { - output_normalized_offset_mapping->push_back(input_text.size()); - } - } - - private: - // Use the public Create() method. - FastBertNormalizer() {} - - // Returns true if the normalized string is different from the codepoint (from - // the encoded `data`). If `data`==0, it means the normalized string is the - // same; in that case, this function returns false correctly. - static bool IsNormalizedStringDifferent(int data) { - return static_cast(data & - text_norm::kIsNormalizedStringDifferentMask); - } - - // Calls this only when IsNormalizedStringDifferent(data) returns true. - absl::string_view GetNormalizedString(int data) const { - const int len = data & text_norm::kNormalizedStringLengthMask; - if (!len) { - return ""; - } - const int offset = (data & text_norm::kNormalizedStringOffsetMask) >> - text_norm::kBitsToEncodeUtf8LengthOfNormalizedString; - if (ABSL_PREDICT_FALSE( - offset < 0 || - (normalized_string_pool_size_ != static_cast(-1) && - offset + len > normalized_string_pool_size_))) { - return ""; - } - return absl::string_view(normalized_string_pool_ + offset, len); - } - - // Looks up the character in format of utf8 string format and returns the - // associated data. If not found, returns 0. Note that 0 also means the - // normalized string is the same as the codepoint itself (refer to - // `kIsNormalizedStringDifferentMask`). - int LookupData(absl::string_view utf8_view) const { - return LookupData(utf8_view.data(), utf8_view.size()); - } - - // The actual implementation of LookupData(). 'utf8_view_ptr' and 'size' - // should point to the utf8 view of a codepoint. Performance-critical. - // Implicitly inline. - int LookupData(const char* utf8_view_ptr, int size) const { - // Darts_clone trie cannot encode the empty input string, so we store and - // return this value separately. - if (size == 0 || *utf8_view_ptr == '\0') return data_for_codepoint_zero_; - auto cursor = trie_->CreateTraversalCursorPointToRoot(); - if (!trie_->TryTraverseSeveralSteps( - cursor, absl::string_view(utf8_view_ptr, size))) { - return 0; - } - int data; - if (!trie_->TryGetData(cursor, data)) { - return 0; - } - return data; - } - - // Provides traversal/data-accessing methods on the trie. It has a pointer - // that points to 'trie_array_'. - std::unique_ptr trie_; - - // The encoded data for the special codepoint '\0'. Darts_clone trie cannot - // encode the empty string, so we store this value separately. - int data_for_codepoint_zero_; - - // The string pool of normalized strings. Each normalized string is a - // substring denoted by (offset and length). - const char* normalized_string_pool_; - - // The size of normalized_string_pool_ if known, or -1. - size_t normalized_string_pool_size_ = static_cast(-1); -}; - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/fast_bert_normalizer.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_BERT_NORMALIZER_H_ diff --git a/tensorflow_text/core/kernels/fast_bert_normalizer_kernel_template.h b/tensorflow_text/core/kernels/fast_bert_normalizer_kernel_template.h index ae795b53b..d34da0e96 100644 --- a/tensorflow_text/core/kernels/fast_bert_normalizer_kernel_template.h +++ b/tensorflow_text/core/kernels/fast_bert_normalizer_kernel_template.h @@ -15,246 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_BERT_NORMALIZER_KERNEL_TEMPLATE_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_BERT_NORMALIZER_KERNEL_TEMPLATE_H_ -#include "absl/status/status.h" -#include "absl/strings/str_cat.h" -#include "tensorflow/lite/kernels/shim/op_kernel.h" -#include "tensorflow/lite/kernels/shim/status_macros.h" -#include "tensorflow_text/core/kernels/fast_bert_normalizer.h" +#include "tensorflow/core/kernels/text/fast_bert_normalizer_kernel_template.h" -namespace tensorflow { -namespace text { - -// See `kDoc` data member for the documentation on this op kernel. -// -// This template class can be instantiated into a kernel for either TF or -// TFLite. See go/tfshim for more info on how this works. -template -class FastBertNormalizeOp - : public tflite::shim::OpKernelShim { - private: - enum Inputs { kInputValues = 0, kFastBertNormalizerModel }; - enum Outputs { - kOutputValues = 0, - kOutputOffsets, - kOutputRowSplitsOfOffsets, - }; - - using Shape = tflite::shim::Shape; - using - typename tflite::shim::OpKernelShim::InitContext; - using typename tflite::shim::OpKernelShim::InvokeContext; - using typename tflite::shim::OpKernelShim::ShapeInferenceContext; - - static const char kGetOffsetsAttr[]; - - // The real work of the invoke operation. - template - absl::Status InvokeRealWork(InvokeContext* context); - - bool get_offsets_; - - public: - FastBertNormalizeOp() = default; - static constexpr char kOpName[] = "FastBertNormalize"; - static constexpr char kDoc[] = R"doc( - Normalizes texts. - - It returns the normalized texts and the relative offsets from the normalized - text to the original text. - - Args: - * input_values: 1D Tensor of strings to normalize. - * fast_bert_normalizer_model: Buffer tensor for the FastBertNormalizerModel - flatbuffer. - - Returns: - * output_values: 1D tensor containing the normalized text for all input - strings. The shape is the same as the input strings. - * output_offsets: 1D tensor containing the offset mapping from the - normalized text to the original text. A 2D RaggedTensor can be constructed - from this and output_row_splits. For example, if the input is - `input_values[i1...iN]` with `N` strings, the constructed 2D RaggedTensor - `offsets[i1...iN, k]` is the byte offset in `input_values[i1...iN]` for - the `kth` byte in `output_values[i1...iN]` after normalization. Note that - `offsets[i1...iN, ...]` also covers the position following the last byte - in the normalized `output_values[i1...iN]`, so that we know the byte - offset position in `input_values[i1...iN]` that corresponds to the end of - `output_values[i1...iN]`. - - - * output_row_splits: 1D int tensor with the row splits that allow us to - build RaggedTensors from output_offsets. - )doc"; - - static const char* OpName() { return kOpName; } - static const char* Doc() { return kDoc; } - - // Attributes declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Attrs(); - - // Input tensors declaration (syntax: - // https://www.tensorflow.org/guide/create_op) - static std::vector Inputs(); - - // Output tensors declaration (syntax: - // https://www.tensorflow.org/guide/create_op) - static std::vector Outputs(); - - // Initializes the op - absl::Status Init(InitContext* context); - - // Runs the operation - absl::Status Invoke(InvokeContext* context); - - // Shape inference - static absl::Status ShapeInference(ShapeInferenceContext* c); -}; - -////////////////////////// Implementation - -template -const char FastBertNormalizeOp::kGetOffsetsAttr[] = - "get_offsets"; - -template -std::vector FastBertNormalizeOp::Attrs() { - return { - absl::StrCat(kGetOffsetsAttr, ": bool = false"), - }; -} - -template -std::vector FastBertNormalizeOp::Inputs() { - return {"input_values: string", "fast_bert_normalizer_model: uint8"}; -} - -template -std::vector FastBertNormalizeOp::Outputs() { - return {"output_values: string", "output_offsets: int64", - "output_row_splits: int64"}; -} - -template -absl::Status FastBertNormalizeOp::Init(InitContext* context) { - SH_RETURN_IF_ERROR( - context->GetAttr(kGetOffsetsAttr, &get_offsets_)); - return absl::OkStatus(); -} - -template -absl::Status FastBertNormalizeOp::Invoke(InvokeContext* context) { - if (get_offsets_) { - return InvokeRealWork(context); - } else { - return InvokeRealWork(context); - } -} - -template -template -absl::Status FastBertNormalizeOp::InvokeRealWork(InvokeContext* context) { - SH_ASSIGN_OR_RETURN(const auto input_values, context->GetInput(kInputValues)); - const auto& values_vec = input_values->template As(); - - SH_ASSIGN_OR_RETURN(const auto fast_bert_normalizer_model, - context->GetInput(kFastBertNormalizerModel)); - // OK to create on every call because FastBertNormalizer is a lightweight, - // memory-mapped wrapper on `fast_bert_normalizer_model` tensor, and thus - // Create() is very cheap. - auto text_normalizer = FastBertNormalizer::Create( - fast_bert_normalizer_model->template Data().data()); - SH_RETURN_IF_ERROR(text_normalizer.status()); - - SH_ASSIGN_OR_RETURN( - auto output_values, - context->GetOutput(kOutputValues, Shape(input_values->Shape()))); - auto output_values_vec = output_values->template As(); - std::vector offsets; - std::vector row_splits; - - if constexpr (kGetOffsets) { - row_splits.push_back(0); - } - - // Iterate through all the values and normalize them. - for (int i = 0; i < values_vec.Dim(0); ++i) { - // Normalize and record the offset locations. - std::string normalized_string; - bool is_normalized_string_identical; - const int original_size = offsets.size(); - - text_normalizer->template NormalizeText( - values_vec(i), &is_normalized_string_identical, &normalized_string, - &offsets); - if (is_normalized_string_identical) { - // When the input string is not changed after normalization, - // `normalized_string` is empty and `offsets` is not changed by - // the above function. So here we construct the corresponding result and - // append to the final output. - output_values_vec(i) = values_vec(i); // The normalized text. - if constexpr (kGetOffsets) { - // The offset mapping will be the identy mapping. - for (int j = 0; j < values_vec(i).size(); ++j) { - offsets.push_back(j); - } - // The mapping from the end of the output to the end of the input. - offsets.push_back(values_vec(i).size()); - } - } else { - output_values_vec(i) = normalized_string; - } - - if constexpr (kGetOffsets) { - // Record the row splits. - const int delta_size = offsets.size() - original_size; - row_splits.push_back(delta_size + row_splits.back()); - } - } - - if constexpr (kGetOffsets) { - SH_RETURN_IF_ERROR(this->template FillOutputTensor( - offsets, kOutputOffsets, context)); - SH_RETURN_IF_ERROR(this->template FillOutputTensor( - row_splits, kOutputRowSplitsOfOffsets, context)); - } else { - SH_RETURN_IF_ERROR(this->template FillOutputTensor( - offsets, kOutputOffsets, context)); - row_splits.resize(1+values_vec.Dim(0)); - SH_RETURN_IF_ERROR(this->template FillOutputTensor( - row_splits, kOutputRowSplitsOfOffsets, context)); - } - return absl::OkStatus(); -} - -template -absl::Status FastBertNormalizeOp::ShapeInference(ShapeInferenceContext* c) { - using tflite::shim::Shape; - SH_ASSIGN_OR_RETURN(const Shape input_values_shape, - c->GetInputShape(kInputValues)); - SH_ASSIGN_OR_RETURN(const auto fast_bert_normalizer_model_shape, - c->GetInputShape(kFastBertNormalizerModel)); - const auto rank_1_shape = Shape({Shape::kUnknownDim}); - if (!input_values_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Input values shape must be rank 1: ", input_values_shape.ToString())); - } - if (!fast_bert_normalizer_model_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError( - absl::StrCat("Fast BERT normalizer model shape must be rank 1: ", - fast_bert_normalizer_model_shape.ToString())); - } - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputValues, rank_1_shape)); - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputOffsets, rank_1_shape)); - // row splits size - const int num_splits = Shape::AddDims(1, input_values_shape.Dim(0)); - SH_RETURN_IF_ERROR( - c->SetOutputShape(kOutputRowSplitsOfOffsets, Shape({num_splits}))); - - return absl::OkStatus(); -} - -} // namespace text -} // namespace tensorflow #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_BERT_NORMALIZER_KERNEL_TEMPLATE_H_ diff --git a/tensorflow_text/core/kernels/fast_bert_normalizer_model.fbs b/tensorflow_text/core/kernels/fast_bert_normalizer_model.fbs deleted file mode 100644 index 75b57d49a..000000000 --- a/tensorflow_text/core/kernels/fast_bert_normalizer_model.fbs +++ /dev/null @@ -1,20 +0,0 @@ -namespace tensorflow.text; - -table FastBertNormalizerModel { - // If true, a preprocessing step is added to lowercase the text, apply NFD - // normalization, and strip accents characters. - lower_case_nfd_strip_accents: bool; - - // The trie data, in the format of darts_clone trie, for input normalization. - trie_array: [uint32]; - - // The encoded data for the special codepoint '\0'. Darts_clone trie cannot - // encode the empty string, so we store this value separately. - data_for_codepoint_zero: int32; - - // The string pool of normalized strings. Each normalized string is a - // substring denoted by (offset and length). - normalized_string_pool: [ubyte]; -} - -root_type FastBertNormalizerModel; diff --git a/tensorflow_text/core/kernels/fast_bert_normalizer_model_builder.cc b/tensorflow_text/core/kernels/fast_bert_normalizer_model_builder.cc deleted file mode 100644 index 808d2a09c..000000000 --- a/tensorflow_text/core/kernels/fast_bert_normalizer_model_builder.cc +++ /dev/null @@ -1,243 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/fast_bert_normalizer_model_builder.h" - -#include -#include -#include -#include - -#include "absl/container/flat_hash_map.h" -#include "absl/memory/memory.h" -#include "absl/status/status.h" -#include "absl/status/statusor.h" -#include "absl/strings/string_view.h" -#include "icu4c/source/common/unicode/errorcode.h" -#include "icu4c/source/common/unicode/normalizer2.h" -#include "icu4c/source/common/unicode/utf.h" -#include "icu4c/source/common/unicode/utf8.h" -#include "re2/re2.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/lite/kernels/shim/status_macros.h" -#include "tensorflow_text/core/kernels/darts_clone_trie_builder.h" -#include "tensorflow_text/core/kernels/fast_bert_normalizer.h" -#include "tensorflow_text/core/kernels/fast_bert_normalizer_model_generated.h" - -namespace tensorflow { -namespace text { -namespace { -// Adapted from CaseFoldUTF8Op::Compute() in -// https://github.com/tensorflow/text/blob/master/tensorflow_text/core/kernels/normalize_kernels.cc. -absl::StatusOr case_fold_utf8(absl::string_view input) { - std::string output_text; - icu::ErrorCode icu_error; - const icu::Normalizer2* nfkc_cf = - icu::Normalizer2::getNFKCCasefoldInstance(icu_error); - if (!icu_error.isSuccess()) { - return absl::InternalError( - "Could not retrieve ICU NFKC_CaseFold normalizer"); - } - icu::StringByteSink byte_sink(&output_text); - nfkc_cf->normalizeUTF8(0, icu::StringPiece(input.data(), input.size()), - byte_sink, nullptr, icu_error); - if (!icu_error.isSuccess()) { - return absl::InternalError( - absl::StrCat("Could not normalize input string: ", input)); - } - return output_text; -} - -// Adapted from NormalizeUTF8Op::Compute() in -// https://github.com/tensorflow/text/blob/master/tensorflow_text/core/kernels/normalize_kernels.cc. -absl::StatusOr normalize_utf8_nfd(absl::string_view input) { - icu::ErrorCode icu_error; - const icu::Normalizer2* normalizer = - icu::Normalizer2::getNFDInstance(icu_error); - if (!icu_error.isSuccess()) { - return absl::InternalError(absl::StrCat( - icu_error.errorName(), ": Could not retrieve ICU NFD normalizer")); - } - std::string output_text; - icu::StringByteSink byte_sink(&output_text); - normalizer->normalizeUTF8(0, icu::StringPiece(input.data(), input.size()), - byte_sink, nullptr, icu_error); - if (!icu_error.isSuccess()) { - return absl::InternalError(absl::StrCat( - icu_error.errorName(), ": Could not normalize input string: ", input)); - } - return output_text; -} - -// Returns all valid Unicode codepoints. -std::vector AllValidUnicodeCodePoints() { - std::vector ret; - // The maximum codepoint in Unicode is 0x0010FFFF. - for (char32_t cp = 0; cp <= 0x0010FFFF; ++cp) { - if (!U_IS_UNICODE_CHAR(cp)) { - continue; - } - ret.push_back(cp); - } - return ret; -} - -// Calls the original methods as in BertTokenizer (e.g., icu lib, etc.) to -// normalize the input. Based on -// https://github.com/tensorflow/text/blob/master/tensorflow_text/python/ops/bert_tokenizer.py. -absl::StatusOr OriginalNormalizeText( - absl::string_view input, bool lower_case_nfd_strip_accents) { - static const RE2* const kMnRegex = new RE2("\\p{Mn}"); - static const RE2* const kControlRegex = new RE2("\\p{Cc}|\\p{Cf}"); - std::string output_text = std::string(input); - // Lowercase and strip accents (if option is set) - if (lower_case_nfd_strip_accents) { - SH_ASSIGN_OR_RETURN(output_text, case_fold_utf8(output_text)); - SH_ASSIGN_OR_RETURN(output_text, normalize_utf8_nfd(output_text)); - RE2::GlobalReplace(&output_text, *kMnRegex, ""); - } - - // Replace control characters with spaces. - RE2::GlobalReplace(&output_text, *kControlRegex, " "); - - return output_text; -} -} // namespace - -absl::StatusOr BuildFastBertNormalizerModelAndExportToFlatBuffer( - bool lower_case_nfd_strip_accents) { - const auto& text_normalizer = - FastBertNormalizerFactory::GetInstance(lower_case_nfd_strip_accents); - flatbuffers::FlatBufferBuilder builder; - const auto array = builder.CreateVector(text_normalizer.GetTrieData()); - const auto mapped_string_pool = builder.CreateVector( - std::vector(text_normalizer.GetMappedValuePool().begin(), - text_normalizer.GetMappedValuePool().end())); - auto text_normalizer_model = CreateFastBertNormalizerModel( - builder, lower_case_nfd_strip_accents, array, - text_normalizer.GetDataForCodepointZero(), mapped_string_pool); - builder.Finish(text_normalizer_model); - return std::string(reinterpret_cast(builder.GetBufferPointer()), - builder.GetSize()); -} - -/*static*/ absl::Status FastBertNormalizerFactory::BuildFastBertNormalizer( - bool lower_case_nfd_strip_accents, std::vector& trie_data, - int& data_for_codepoint_zero, std::string& mapped_value_string_pool) { - // Prepare the string keys and the encoded values. - std::vector keys; - std::vector values; - mapped_value_string_pool = ""; - data_for_codepoint_zero = 0; - // Memorize and reuse normalized strings. - absl::flat_hash_map norm_string_to_pool_offset; - - for (const auto cp : AllValidUnicodeCodePoints()) { - // Get the utf8 view of the codepoint. - char buf[4]; - int len = 0; - U8_APPEND_UNSAFE(buf, len, cp); - const absl::string_view cp_view(buf, len); - // Normalize. - SH_ASSIGN_OR_RETURN( - auto cp_norm, - OriginalNormalizeText(cp_view, lower_case_nfd_strip_accents)); - int data = 0; - if (cp_norm != cp_view) { - // The mapped value is different from the input. - data |= text_norm::kIsNormalizedStringDifferentMask; - // Encode the mapped value into `data`. - if (!cp_norm.empty()) { - const auto itr = norm_string_to_pool_offset.find(cp_norm); - int current_offset = 0; - if (itr == norm_string_to_pool_offset.end()) { - if (cp_norm.size() > - text_norm::kMaximumUtf8LengthOfNormalizedString) { - LOG(ERROR) << "The length of mapped value exceeds the maximum " - "supported. Codepoint: " - << uint32_t{cp} - << ". Mapped value length: " << cp_norm.size() - << ". Maximum supported length: " - << text_norm::kMaximumUtf8LengthOfNormalizedString; - } - current_offset = mapped_value_string_pool.size(); - if (current_offset > text_norm::kMaximumOffsetOfNormalizedString) { - LOG(ERROR) << "The offset of mapped value exceeds the maximum " - "supported. Codepoint: " - << uint32_t{cp} - << ". Mapped value offset: " << current_offset - << ". Maximum supported length: " - << text_norm::kMaximumOffsetOfNormalizedString; - } - norm_string_to_pool_offset[cp_norm] = current_offset; - absl::StrAppend(&mapped_value_string_pool, cp_norm); - } else { - current_offset = norm_string_to_pool_offset[cp_norm]; - } - data |= cp_norm.size(); - data |= (current_offset - << text_norm::kBitsToEncodeUtf8LengthOfNormalizedString); - } - } - // Store the encoded data. - if (cp == 0) { - data_for_codepoint_zero = data; - // Skip encoding it into the trie since Darts_clone cannot encode the - // empty string. - continue; - } - if (data == 0) { - // Data is not set when normalizing the codepoint doesn't change it. These - // characters aren't encoded to save space. - continue; - } - // Key is the utf8 view; value is the encoded data. - keys.emplace_back(buf, len); - values.push_back(data); - } - // Build the trie. - SH_ASSIGN_OR_RETURN(trie_data, trie_utils::BuildDartsCloneTrie(keys, values)); - LOG(INFO) << "CharacterSet built (lower_case_nfd_strip_accents=" - << lower_case_nfd_strip_accents - << "). Trie data size (int32): " << trie_data.size() - << ". Normalized string pool size (byte): " - << mapped_value_string_pool.size(); - return absl::OkStatus(); -} - -FastBertNormalizerFactory::FastBertNormalizerFactory( - bool lower_case_nfd_strip_accents) { - auto status = - BuildFastBertNormalizer(lower_case_nfd_strip_accents, trie_data_, - data_for_codepoint_zero_, mapped_value_pool_); - if (!status.ok()) { - // Should never happen since the same code must have passed the unit tests. - LOG(ERROR) << "Unexpected error. Failed to build the data for " - "FastBertNormalizer. Error message: " - << status.message(); - return; - } - auto char_set_recognizer_mapper = FastBertNormalizer::Create( - trie_data_.data(), data_for_codepoint_zero_, mapped_value_pool_.data()); - if (!char_set_recognizer_mapper.ok()) { - // Should never happen since the same code must have passed the unit tests. - LOG(ERROR) << "Unexpected error: Failed to initialize " - "FastBertNormalizer from the data."; - return; - } - char_set_normalizer_ = std::make_unique( - *std::move(char_set_recognizer_mapper)); -} -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/fast_bert_normalizer_model_builder.h b/tensorflow_text/core/kernels/fast_bert_normalizer_model_builder.h index a2e57b7c5..f8b8d3c8f 100644 --- a/tensorflow_text/core/kernels/fast_bert_normalizer_model_builder.h +++ b/tensorflow_text/core/kernels/fast_bert_normalizer_model_builder.h @@ -15,86 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_BERT_NORMALIZER_MODEL_BUILDER_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_BERT_NORMALIZER_MODEL_BUILDER_H_ -#include - -#include "absl/status/status.h" -#include "absl/status/statusor.h" -#include "tensorflow_text/core/kernels/fast_bert_normalizer.h" - -namespace tensorflow { -namespace text { - -// Builds a FastBertNormalizer model in flatbuffer format. -// -// Args: -// * lower_case_nfd_strip_accents: If true, a preprocessing step is added to -// lowercase the text, apply NFD normalization, and strip accents characters. -// -// Returns: -// The bytes of the flatbuffer that stores the model. -absl::StatusOr BuildFastBertNormalizerModelAndExportToFlatBuffer( - bool lower_case_nfd_strip_accents); - -/// A singleton class to initialize FastBertNormalizer and also to -/// own the data for it. -class FastBertNormalizerFactory { - public: - // Returns the singleton instance. - // - // Args: - // lower_case_nfd_strip_accents: bool - // - If true, it first lowercases the text, applies NFD normalization, - // strips accents characters, and then replaces control characters with - // whitespaces. - // - If false, it only replaces control characters with whitespaces. - static const FastBertNormalizerFactory& GetInstance( - bool lower_case_nfd_strip_accents) { - if (lower_case_nfd_strip_accents) { - return GetInstanceLowerCase(); - } else { - return GetInstanceNoLowerCase(); - } - } - - const FastBertNormalizer* GetNormalizer() const { - return char_set_normalizer_.get(); - } - - const std::vector& GetTrieData() const { return trie_data_; } - - int GetDataForCodepointZero() const { return data_for_codepoint_zero_; } - - absl::string_view GetMappedValuePool() const { return mapped_value_pool_; } - - private: - FastBertNormalizerFactory(bool lower_case_nfd_strip_accents); - - // Returns a singleton instance with lower_case_nfd_strip_accents = false. - static const FastBertNormalizerFactory& GetInstanceNoLowerCase() { - static const FastBertNormalizerFactory* const kInstance = - new FastBertNormalizerFactory(false); - return *kInstance; - } - - // Returns a singleton instance with lower_case_nfd_strip_accents = true. - static const FastBertNormalizerFactory& GetInstanceLowerCase() { - static const FastBertNormalizerFactory* const kInstance = - new FastBertNormalizerFactory(true); - return *kInstance; - } - - // Returns the data to build a FastBertNormalizer. - static absl::Status BuildFastBertNormalizer( - bool lower_case_nfd_strip_accents, std::vector& trie_data, - int& data_for_codepoint_zero, std::string& mapped_value_string_pool); - - std::vector trie_data_; - int data_for_codepoint_zero_ = 0; - std::string mapped_value_pool_ = ""; - std::unique_ptr char_set_normalizer_ = nullptr; -}; - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/fast_bert_normalizer_model_builder.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_BERT_NORMALIZER_MODEL_BUILDER_H_ diff --git a/tensorflow_text/core/kernels/fast_bert_normalizer_tf_kernel.cc b/tensorflow_text/core/kernels/fast_bert_normalizer_model_generated.h similarity index 63% rename from tensorflow_text/core/kernels/fast_bert_normalizer_tf_kernel.cc rename to tensorflow_text/core/kernels/fast_bert_normalizer_model_generated.h index 7faba4703..06160f4ec 100644 --- a/tensorflow_text/core/kernels/fast_bert_normalizer_tf_kernel.cc +++ b/tensorflow_text/core/kernels/fast_bert_normalizer_model_generated.h @@ -12,16 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "tensorflow_text/core/kernels/fast_bert_normalizer_tf_kernel.h" +#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_BERT_NORMALIZER_MODEL_GENERATED_H_ +#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_BERT_NORMALIZER_MODEL_GENERATED_H_ -#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/text/fast_bert_normalizer_model_generated.h" -namespace tensorflow { -namespace text { - -REGISTER_KERNEL_BUILDER( - Name(FastBertNormalizeOpKernel::OpName()).Device(tensorflow::DEVICE_CPU), - FastBertNormalizeOpKernel); - -} // namespace text -} // namespace tensorflow +#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_BERT_NORMALIZER_MODEL_GENERATED_H_ diff --git a/tensorflow_text/core/kernels/fast_bert_normalizer_test.cc b/tensorflow_text/core/kernels/fast_bert_normalizer_test.cc deleted file mode 100644 index 73b4a4c51..000000000 --- a/tensorflow_text/core/kernels/fast_bert_normalizer_test.cc +++ /dev/null @@ -1,224 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/fast_bert_normalizer.h" - -#include - -#include -#include -#include "tensorflow_text/core/kernels/fast_bert_normalizer_model_builder.h" - -namespace tensorflow { -namespace text { -namespace { - -template -std::string ListToString(const std::vector& list) { - return absl::StrCat("[", absl::StrJoin(list, ", "), "]"); -} - -// Testing spec struct for parameterized tests. -struct Spec { - friend std::ostream& operator<<(std::ostream& os, const Spec& s) { - return os << "input: " << s.input << ", " - << "lower_case_nfd_strip_accents:" - << s.lower_case_nfd_strip_accents << ", " - << "expected_output:" << s.expected_output << ", " - << "expected_offset_mapping:" - << ListToString(s.expected_offset_mapping) << std::endl; - } - - std::string input; - bool lower_case_nfd_strip_accents = false; - std::string expected_output; - std::vector expected_offset_mapping; -}; - -// Parameterized tests specs for FastBertNormalizer. -const std::vector& GetTestSpecs() { - static const std::vector& v = *new std::vector{ - // Test Suite 1: No lower case. - // Test 0: Empty input. - { - .input = "", - .lower_case_nfd_strip_accents = false, - .expected_output = "", - .expected_offset_mapping = {0}, - }, - // Test 1: All ascii, digit, and normal letters. - { - .input = "Test #1.", - .lower_case_nfd_strip_accents = false, - .expected_output = "Test #1.", - .expected_offset_mapping = {0, 1, 2, 3, 4, 5, 6, 7, 8}, - }, - // Test 2: Multi-byte letters. - // "\xC3\x80" is U+00C0 "Latin Capital Letter A with Grave". - // "\x41\xCC\x80" is the decomposition of U+00C0 "Latin Capital Letter A - // with Grave". - { - .input = "Test\xC3\x80\x41\xCC\x80 #1.", - .lower_case_nfd_strip_accents = false, - .expected_output = "Test\xC3\x80\x41\xCC\x80 #1.", - .expected_offset_mapping = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13}, - }, - // Test 3: Control chars normalized into whitespaces. - { - .input = "Te\x11st #1.", - .lower_case_nfd_strip_accents = false, - .expected_output = "Te st #1.", - .expected_offset_mapping = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, - }, - // Test 4: Tabs and newlines normalized into whitespaces. - { - .input = "Test \t\n#1.", - .lower_case_nfd_strip_accents = false, - .expected_output = "Test #1.", - .expected_offset_mapping = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, - }, - // Test Suite 2: Lower case. - // Test 5: Empty input. - { - .input = "", - .lower_case_nfd_strip_accents = true, - .expected_output = "", - .expected_offset_mapping = {0}, - }, - // Test 6: All ascii, digit, and normal letters. - { - .input = "Test #1.", - .lower_case_nfd_strip_accents = true, - .expected_output = "test #1.", - .expected_offset_mapping = {0, 1, 2, 3, 4, 5, 6, 7, 8}, - }, - // Test 7: Multi-byte letters. - // "\xC3\x80" is U+00C0 "Latin Capital Letter A with Grave", which is - // normalized to "a". "\x41\xCC\x80" is the decomposition of U+00C0 "Latin - // Capital Letter A with Grave", which is normalized to "a". - { - .input = "Test\xC3\x80\x41\xCC\x80 #1.", - .lower_case_nfd_strip_accents = true, - .expected_output = "testaa #1.", - .expected_offset_mapping = {0, 1, 2, 3, 4, 6, 9, 10, 11, 12, 13}, - }, - // Test 8: Control chars normalized into whitespaces. - { - .input = "Te\x11st #1.", - .lower_case_nfd_strip_accents = true, - .expected_output = "te st #1.", - .expected_offset_mapping = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, - }, - // Test 9: Tabs and newlines normalized into whitespaces. - { - .input = "Test \t\n#1.", - .lower_case_nfd_strip_accents = true, - .expected_output = "test #1.", - .expected_offset_mapping = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, - }, - // Test 10: Multibytes string normalized into multibytes string. - // "\xC2\xBC" (2 bytes) is normalized into "1\xE2\x81\x84""4" (5 bytes). - { - .input = "a\xC2\xBC", - .lower_case_nfd_strip_accents = true, - .expected_output = "a1\xE2\x81\x84" - "4", - .expected_offset_mapping = {0, 1, 1, 1, 1, 1, 3}, - }, - // Test 11: Multibytes string normalized into multibytes string. - // "\xC7\xB2" (2 bytes) is normalized into "dz" (2 bytes). - { - .input = "a\xC7\xB2", - .lower_case_nfd_strip_accents = true, - .expected_output = "adz", - .expected_offset_mapping = {0, 1, 1, 3}, - }, - // Test 12: Multibytes string normalized into multibytes string. - // "\xCE\xB9" (2 bytes) is normalized into "\xCE\xB7" (2 bytes). - { - .input = "a\xCE\x89", - .lower_case_nfd_strip_accents = true, - .expected_output = "a\xCE\xB7", - .expected_offset_mapping = {0, 1, 1, 3}, - }, - // Test 13: Invalid UTF8 input. lower_case_nfd_strip_accents = false. - { - .input = "a\x80 \xFF \xF8 a\xE0\x61 \xF3\x9C\x9D", - .lower_case_nfd_strip_accents = false, - .expected_output = "a\x80 \xFF \xF8 a\xE0\x61 \xF3\x9C\x9D", - .expected_offset_mapping = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14}, - }, - // Test 14: Invalid UTF8 input. lower_case_nfd_strip_accents = true. - { - .input = "a\x80 \xFF \xF8 a\xE0\x61 \xF3\x9C\x9D", - .lower_case_nfd_strip_accents = true, - .expected_output = "a\x80 \xFF \xF8 a\xE0\x61 \xF3\x9C\x9D", - .expected_offset_mapping = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14}, - }, - }; - return v; -} - -using TestNormalization = testing::TestWithParam; - -TEST_P(TestNormalization, TestGetOffsets) { - const auto spec = GetParam(); - const auto fast_bert_normalizer = - FastBertNormalizerFactory::GetInstance(spec.lower_case_nfd_strip_accents) - .GetNormalizer(); - - std::string output_normalized_text = "Something existing"; - std::vector output_normalized_offset_mapping; - bool is_normalized_identical; - fast_bert_normalizer->NormalizeText( - spec.input, &is_normalized_identical, &output_normalized_text, - &output_normalized_offset_mapping); - if (is_normalized_identical) { - ASSERT_THAT(output_normalized_text, ""); - ASSERT_THAT(spec.input, spec.expected_output); - ASSERT_THAT(output_normalized_offset_mapping, testing::ElementsAre()); - } else { - ASSERT_THAT(output_normalized_text, spec.expected_output); - ASSERT_THAT(output_normalized_offset_mapping, spec.expected_offset_mapping); - } -} - -TEST_P(TestNormalization, TestNoGetOffsets) { - const auto spec = GetParam(); - const auto fast_bert_normalizer = - FastBertNormalizerFactory::GetInstance(spec.lower_case_nfd_strip_accents) - .GetNormalizer(); - - std::string output_normalized_text; - std::vector output_normalized_offset_mapping; - bool is_normalized_identical; - fast_bert_normalizer->NormalizeText( - spec.input, &is_normalized_identical, &output_normalized_text, - /*output_normalized_offset_mapping=*/nullptr); - if (is_normalized_identical) { - ASSERT_THAT(spec.input, spec.expected_output); - ASSERT_THAT(output_normalized_text, ""); - } else { - ASSERT_THAT(output_normalized_text, spec.expected_output); - } -} - -INSTANTIATE_TEST_SUITE_P(FastBertNormalizerTest, TestNormalization, - testing::ValuesIn(GetTestSpecs())); -} // namespace -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/fast_bert_normalizer_tf_kernel.h b/tensorflow_text/core/kernels/fast_bert_normalizer_tf_kernel.h index 2c1547115..c3263a707 100644 --- a/tensorflow_text/core/kernels/fast_bert_normalizer_tf_kernel.h +++ b/tensorflow_text/core/kernels/fast_bert_normalizer_tf_kernel.h @@ -15,19 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_BERT_NORMALIZER_TF_KERNEL_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_BERT_NORMALIZER_TF_KERNEL_H_ -#include "tensorflow/lite/kernels/shim/tf_op_shim.h" -#include "tensorflow_text/core/kernels/fast_bert_normalizer_kernel_template.h" - -namespace tensorflow { -namespace text { - -class FastBertNormalizeOpKernel - : public tflite::shim::TfOpKernel { - public: - using TfOpKernel::TfOpKernel; -}; - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/fast_bert_normalizer_tf_kernel.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_BERT_NORMALIZER_TF_KERNEL_H_ diff --git a/tensorflow_text/core/kernels/fast_bert_normalizer_tflite.cc b/tensorflow_text/core/kernels/fast_bert_normalizer_tflite.cc deleted file mode 100644 index d61202bdc..000000000 --- a/tensorflow_text/core/kernels/fast_bert_normalizer_tflite.cc +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/fast_bert_normalizer_tflite.h" - -#include "tensorflow/lite/kernels/shim/tflite_op_shim.h" -#include "tensorflow_text/core/kernels/fast_bert_normalizer_kernel_template.h" - -namespace tflite { -namespace ops { -namespace custom { -namespace text { - -using FastBertNormalizeOpKernel = - tflite::shim::TfLiteOpKernel; - -extern "C" void AddFastBertNormalize(tflite::MutableOpResolver* resolver) { - FastBertNormalizeOpKernel::Add(resolver); -} - -} // namespace text -} // namespace custom -} // namespace ops -} // namespace tflite diff --git a/tensorflow_text/core/kernels/fast_bert_normalizer_tflite.h b/tensorflow_text/core/kernels/fast_bert_normalizer_tflite.h index d6e2de641..503277ab4 100644 --- a/tensorflow_text/core/kernels/fast_bert_normalizer_tflite.h +++ b/tensorflow_text/core/kernels/fast_bert_normalizer_tflite.h @@ -12,21 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef THIRD_PARTY_TENSORFLOW_TEXT_GOOGLE_CORE_KERNELS_FAST_BERT_NORMALIZER_TFLITE_H_ -#define THIRD_PARTY_TENSORFLOW_TEXT_GOOGLE_CORE_KERNELS_FAST_BERT_NORMALIZER_TFLITE_H_ +#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_BERT_NORMALIZER_TFLITE_H_ +#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_BERT_NORMALIZER_TFLITE_H_ -#include "tensorflow/lite/mutable_op_resolver.h" +#include "tensorflow/core/kernels/text/fast_bert_normalizer_tflite.h" -namespace tflite { -namespace ops { -namespace custom { -namespace text { - -extern "C" void AddFastBertNormalize(::tflite::MutableOpResolver* resolver); - -} // namespace text -} // namespace custom -} // namespace ops -} // namespace tflite - -#endif // THIRD_PARTY_TENSORFLOW_TEXT_GOOGLE_CORE_KERNELS_FAST_BERT_NORMALIZER_TFLITE_H_ +#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_BERT_NORMALIZER_TFLITE_H_ diff --git a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer.cc b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer.cc deleted file mode 100644 index ff10541f4..000000000 --- a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer.cc +++ /dev/null @@ -1,773 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/fast_wordpiece_tokenizer.h" - -#include - -#include "absl/base/attributes.h" -#include "absl/base/optimization.h" -#include "absl/status/status.h" -#include "absl/status/statusor.h" -#include "absl/strings/match.h" -#include "absl/strings/str_cat.h" -#include "absl/strings/str_join.h" -#include "absl/strings/string_view.h" -#include "icu4c/source/common/unicode/uchar.h" -#include "icu4c/source/common/unicode/utf8.h" -#include "tensorflow/lite/kernels/shim/status_macros.h" -#include "tensorflow_text/core/kernels/fast_wordpiece_tokenizer_utils.h" - -namespace tensorflow { -namespace text { -namespace { - -template -int GetCurrentOutputSize(std::vector* output_pieces, - std::vector* output_ids) { - if constexpr (kGetPieces) { - return output_pieces->size(); - } else { - return output_ids->size(); - } -} - -} // namespace - -/*static*/ absl::StatusOr -FastWordpieceTokenizer::Create(const void* config_flatbuffer) { - FastWordpieceTokenizer tokenizer; - // `GetFastWordpieceTokenizerConfig()` is autogenerated by flatbuffer. - tokenizer.config_ = GetFastWordpieceTokenizerConfig(config_flatbuffer); - if (tokenizer.config_ == nullptr || - tokenizer.config_->trie_array() == nullptr) { - return absl::InvalidArgumentError( - "FastWordpieceTokenizerConfig or its trie_array is null."); - } - auto trie_or = trie_utils::DartsCloneTrieWrapper::Create( - tokenizer.config_->trie_array()->data()); - if (!trie_or.ok()) { - return absl::InvalidArgumentError( - "Failed to create DartsCloneTrieWrapper from " - "FastWordpieceTokenizerConfig.trie_array."); - } - tokenizer.trie_ = - std::make_unique(*std::move(trie_or)); - return std::move(tokenizer); -} - -void FastWordpieceTokenizer::Tokenize(absl::string_view input, - std::vector* output_pieces, - std::vector* output_ids, - std::vector* output_start_offsets, - std::vector* output_end_offsets, - int input_word_offset_in_text, - bool* error) const { - if (config_->end_to_end()) { - TokenizeTextImpl(input, output_pieces, output_ids, - output_start_offsets, - output_end_offsets, error); - } else { - TokenizeSingleWordImpl( - input, input_word_offset_in_text, output_pieces, output_ids, - output_start_offsets, output_end_offsets); - } -} - -void FastWordpieceTokenizer::Tokenize(absl::string_view input, - std::vector* output_ids, - std::vector* output_start_offsets, - std::vector* output_end_offsets, - int input_word_offset_in_text) const { - if (config_->end_to_end()) { - TokenizeTextImpl( - input, /*output_pieces=*/nullptr, output_ids, output_start_offsets, - output_end_offsets, /*error=*/nullptr); - } else { - TokenizeSingleWordImpl( - input, input_word_offset_in_text, /*output_pieces=*/nullptr, output_ids, - output_start_offsets, output_end_offsets); - } -} - -void FastWordpieceTokenizer::Tokenize(absl::string_view input, - std::vector* output_ids, - int input_word_offset_in_text) const { - if (config_->end_to_end()) { - TokenizeTextImpl( - input, /*output_pieces=*/nullptr, output_ids, - /*output_start_offsets=*/nullptr, - /*output_end_offsets=*/nullptr, /*error=*/nullptr); - } else { - TokenizeSingleWordImpl( - input, input_word_offset_in_text, /*output_pieces=*/nullptr, output_ids, - /*output_start_offsets=*/nullptr, - /*output_end_offsets=*/nullptr); - } -} - -absl::StatusOr> -FastWordpieceTokenizer::DetokenizeToTokens( - const absl::Span input) const { - std::vector subwords; - std::vector output_tokens; - if (!config_->support_detokenization()) { - return absl::FailedPreconditionError( - "Detokenize function is only enabled when support_detokenization is " - "true in the config flatbuffer. Please rebuild the model flatbuffer " - "by setting support_detokenization=true."); - } - if (config_->vocab_array() == nullptr || - config_->vocab_is_suffix_array() == nullptr) { - return absl::InternalError( - "Missing vocab_array or vocab_is_suffix_array in config."); - } - const int vocab_size = config_->vocab_array()->size(); - const int is_suffix_size = config_->vocab_is_suffix_array()->size(); - for (int id : input) { - if (ABSL_PREDICT_FALSE(id < 0 || id >= vocab_size || - id >= is_suffix_size)) { - return absl::OutOfRangeError( - absl::StrCat("Token ID out of bounds: ", id)); - } - auto vocab = config_->vocab_array()->Get(id); - if (ABSL_PREDICT_FALSE(vocab == nullptr)) { - return absl::InternalError("Null vocab string in vocab_array."); - } - auto is_suffix = config_->vocab_is_suffix_array()->Get(id); - if (!subwords.empty() && !is_suffix) { - // When current subword is not a suffix token, it marks the start of a new - // word. We concatenate the subwords that compose the previous word and - // add it to the return list. - output_tokens.emplace_back(absl::StrJoin(subwords, "")); - subwords.clear(); - } - // Special case: when a suffix token e.g. "##a" appears at the start of the - // input ids, we preserve the suffix_indicator. - if (subwords.empty() && is_suffix) { - if (ABSL_PREDICT_FALSE(config_->suffix_indicator() == nullptr)) { - return absl::InternalError("Missing suffix_indicator in config."); - } - subwords.emplace_back(config_->suffix_indicator()->string_view()); - } - subwords.emplace_back(vocab->string_view()); - } - if (!subwords.empty()) { - output_tokens.emplace_back(absl::StrJoin(subwords, "")); - } - return output_tokens; -} - -absl::StatusOr FastWordpieceTokenizer::Detokenize( - const absl::Span input) const { - SH_ASSIGN_OR_RETURN(std::vector output_tokens, - DetokenizeToTokens(input)); - return absl::StrJoin(output_tokens, " "); -} - -int FastWordpieceTokenizer::SkipTheRemainingOfWordAndTrailingWhiteSpaces( - absl::string_view input, int& cur_pos) const { - const int input_size = input.size(); - UChar32 cur_unicode_char; - int next_pos; - int end_of_word = cur_pos; - while (cur_pos < input_size) { - next_pos = cur_pos; - U8_NEXT(input, next_pos, input_size, cur_unicode_char); - if (u_isUWhiteSpace(cur_unicode_char)) { - cur_pos = next_pos; // Skip the whitespace as well. - // Break and return since we've met a word boundary. - break; - } - if (fast_wordpiece_tokenizer_utils::IsPunctuationOrChineseChar( - cur_unicode_char)) { - // Break and return since we've met a word boundary. We do not skip the - // punctuation character: that character may be a token by itself. - break; - } - end_of_word = next_pos; // Mark the exclusive end. - cur_pos = next_pos; // Skip the character. - } - return end_of_word; -} - -template -void FastWordpieceTokenizer::TokenizeTextImpl( - absl::string_view input_text, std::vector* output_pieces, - std::vector* output_ids, std::vector* output_start_offsets, - std::vector* output_end_offsets, bool* error) const { - static_assert(kGetPieces || kGetIds, - "At least one of `kGetPieces` and `kGetIds` should be true."); - if (input_text.empty()) { - return; - } - const int input_size = input_text.size(); - int prev_pos = -1; - int next_pos = 0; - int cur_pos = 0; - int original_num_tokens = - GetCurrentOutputSize(output_pieces, output_ids); - UChar32 prev_unicode_char; - UChar32 cur_unicode_char; - while (cur_pos < input_size) { - // Prevent looping without progress in cur_pos. - if (prev_pos == cur_pos && error != nullptr) { - *error = true; - return; - } - prev_pos = cur_pos; - - int cur_offset_in_input_word = 0; - // Tokenize the word starting at the current position. - auto cur_node = trie_->CreateTraversalCursorPointToRoot(); - int word_byte_length_so_far = 0; - int input_word_offset_in_text = cur_pos; - absl::string_view input_substr = input_text.substr(cur_pos); - // The trie matching loop below tokenizes and recognizes word pieces until - // 1. it steps over the input boundary, or - // 2. the length of the current word reaches 'max_bytes_per_token', or - // 3. it sees a whitespace / punctuation / unknown character. - int prev_pos_inner = -1; - while (cur_pos < input_size) { - // Prevent looping without progress in cur_pos. - if (prev_pos_inner == cur_pos && error != nullptr) { - *error = true; - return; - } - prev_pos_inner = cur_pos; - - prev_unicode_char = cur_unicode_char; - next_pos = cur_pos; - U8_NEXT(input_text, next_pos, input_text.length(), cur_unicode_char); - - if (word_byte_length_so_far + next_pos - cur_pos > - config_->max_bytes_per_token()) - break; - // Try matching one Unicode character from here. - while (!trie_->TryTraverseSeveralSteps( - cur_node, input_text.substr(cur_pos, next_pos - cur_pos))) { - // Trie cannot consume the whole Unicode character. We need to pop one - // or more longest-matching tokens off the beginning of the string - // represented by the current node. We then transit to the node pointed - // by the failure link, which represents the remaining suffix string - // after popping those matching prefix tokens. - // - // For example, if the current node is "abcdef", and we need to pop - // "ab", and "##cd" off the beginning, the failure link points to the - // node that represents "##ef". - if (!TryFollowFailureLinkAndCollectTokens( - input_substr, input_word_offset_in_text, - cur_offset_in_input_word, cur_node, output_pieces, output_ids, - output_start_offsets, output_end_offsets)) { - goto outside_trie_match_loop; - } - } - // Trie consumed the whole Unicode char and was able to traverse to a - // new node. We move forward the cursor to match the next character. - word_byte_length_so_far += next_pos - cur_pos; - cur_pos = next_pos; - } - outside_trie_match_loop: - if (cur_pos >= input_size) { - // Collect the remaining tokens stored on a path on the trie. - HandleTheRemainingStringOnTriePath( - input_substr, input_word_offset_in_text, cur_node, - original_num_tokens, cur_offset_in_input_word, output_pieces, - output_ids, output_start_offsets, output_end_offsets); - // Break as we've finished all characters. - break; - } - bool is_white_space = u_isUWhiteSpace(cur_unicode_char); - if (is_white_space || - fast_wordpiece_tokenizer_utils::IsPunctuationOrChineseChar( - cur_unicode_char) || - (cur_pos && fast_wordpiece_tokenizer_utils::IsPunctuationOrChineseChar( - prev_unicode_char))) { - // If the current Unicode character is a valid word boundary, collect the - // remaining tokens stored on a path on the trie. - absl::string_view cur_str = absl::string_view( - input_substr.data(), cur_pos - input_word_offset_in_text); - HandleTheRemainingStringOnTriePath( - cur_str, input_word_offset_in_text, cur_node, original_num_tokens, - cur_offset_in_input_word, output_pieces, output_ids, - output_start_offsets, output_end_offsets); - if (is_white_space) { - // Skip the whitespace. - cur_pos = next_pos; - } else if (cur_str.empty()) { - // If the remaining tokens are empty, it means we encountered an - // unmappable separator, so output an unknown token and continue. - cur_pos = next_pos; - ResetOutputAppendUnknownToken( - input_word_offset_in_text, (cur_pos - input_word_offset_in_text), - original_num_tokens, output_pieces, output_ids, - output_start_offsets, output_end_offsets); - } - // Continue in the outer while loop to process the remaining input. - continue; - } - - // Note that even with the following line removed, the code is still correct - // (i.e., Mutants is right). We keep this line for efficiency reasons: We - // have tested the current char, and it is not a whitespace or punctuation - // char. Hence it's safe to skip the current char; we don't want to test it - // again in the subsequent function. - cur_pos = next_pos; - int end_of_word = - SkipTheRemainingOfWordAndTrailingWhiteSpaces(input_text, cur_pos); - - // The current character is not a word boundary. The case is simple: We are - // at the start or middle of some word with unknown characters or exceeding - // the length limit. We map the entire word unk_token, skip the remaining - // portion, and continue. - ResetOutputAppendUnknownToken( - input_word_offset_in_text, (end_of_word - input_word_offset_in_text), - original_num_tokens, output_pieces, output_ids, output_start_offsets, - output_end_offsets); - } -} -// This function implements the new linear WordPiece algorithm. The overall -// design is illustrated as follows: -// -// * WordPiece tokenization works in a left-to-right longest-matching-first -// greedy manner, known as maximum matching. -// -// * We use a trie containing all pieces from the vocabulary. -// -// * We iterate the input text left-to-right, following the trie in search of -// longer and longer matches. -// -// * Challenge: When we fall off the trie matching, the best match is usually -// several characters back. -// -// * For example, assume the vocabulary is {a, ab, ##cd, ##efz, abcdefg}. -// If the input is "abcdefz", the trie matching stops at the position of -// "z". However, the longest match is "ab", which is 5 characters back. -// -// * Straightforward solution: Remember the last match while iterating on the -// trie. That gives us the longest match. Then we roll our string iterator -// backwards and reprocess the characters that weren't part of the match. It -// can be proved that the time complexity is quadratic. -// -// * For the example above, it will backtrack to the 3rd position and -// restart matching from "c", resulting in repetitive, wasteful iterations. -// -// * Optimized solution (the novel linear algorithm): Instead of having to -// reprocess the letters that didn't match, we can have the trie record -// (1) the longest-matching tokens that we would have identified (called -// "failure pops") and (2) a link pointing to a node (called "failure link") -// representing the state from where we can continue to match the next -// character. When trie matching cannot consume an input character, we perform -// a "failure transition" by (a) appending the failure pops to the tokenization -// result and (b) transiting through the failure link to a new state to -// continue the process. Our string iterator never backtracks, and it can be -// proved that we make at most `n` failure transitions in total in processing a -// string of length `n`. Therefore, the time complexity is linear. -// -// * For the same example above, when the trie matching fails at the -// character "z", the optimized solution is smart enough to know that the -// longest-matching tokens we can collect are ["ab", "##cd"]. It is also -// smart enough to set itself into such a state as if it has only seen and -// matched "##ef" so far. Now given the next character being "z", it -// immediately identifies the next matching token as "##efz". -template -void FastWordpieceTokenizer::TokenizeSingleWordImpl( - absl::string_view input_word, int input_word_offset_in_text, - std::vector* output_pieces, std::vector* output_ids, - std::vector* output_start_offsets, - std::vector* output_end_offsets) const { - static_assert(kGetPieces || kGetIds, - "At least one of `kGetPieces` and `kGetIds` should be true."); - if (input_word.empty()) { - return; - } - const int input_size = input_word.size(); - - // `original_num_tokens` stores the number of tokens in the output before - // tokenizing this `input_word`. This is needed because we attempt to tokenize - // `input_word` into word piece tokens and append the recognized tokens to the - // outputs on the fly. If we later find out that `input_word` cannot be - // tokenized into sub-tokens with the current vocabulary, we roll-back the - // output vectors (by removing those tentative tokens) based on - // `original_num_tokens` and appends the "unk_token". - int original_num_tokens = - GetCurrentOutputSize(output_pieces, output_ids); - - if (input_word.size() > config_->max_bytes_per_token()) { - ResetOutputAppendUnknownToken( - input_word_offset_in_text, input_size, original_num_tokens, - output_pieces, output_ids, output_start_offsets, output_end_offsets); - return; - } - - // `cur_offset_in_input_word` tracks the offset of the remaining portion of - // `input_word`, for which the tokens are yet to be recognized and outputted. - // Initially it just points to the start of the input. And it gets moved - // when more tokens are outputed. - // - // For example, suppose the vocab is {a,abcd,##b,##bc,##z}, and the input is - // "abcz". First `cur_offset_in_input_word` points to position 0, since we - // haven't ouputted any tokens. After the first token "a" is recognized and - // outputted, it moves passing the substring "a" to position 1. Then after the - // second token "##bc" is recognized and put to the outputs, it moves passing - // the substring "bc" to position 3. - // - // This variable is used to calculate the offsets of each word piece token. - // And since knowing their offsets in the input word, we're also able to get - // the token string without looking it up in the vocabulary table. This saves - // an extra look-up in hash table (saving time), and we don't even need to - // save the vocabulary table anymore (saving memory). - int cur_offset_in_input_word = 0; - - // Here is an example to illustrate the inference process. - // - // Suppose the vocabulary is {a,abcd,##b,##bc,##z}, and the suffix indicator - // is ##. Below is the trie built from that vocabulary: - // - // (a) (b) (c) (d) - // 0 ----- 3 ----- 4 ----- 5 ----- 6 - // (#)| - // 1 - // (#)| (b) (c) - // 2 ----- 7 ----- 8 - // | (z) - // + ----- 9 - // - // The algorithm constructs auxiliary structures on top of the trie to enable - // linear inference, which consist of two parts (let v denote a node): - // * failure links f(v), pointing to another node, - // * failure pops F(v), a list of tokens stored on node v. - // - // The table of str(v) (which is the string along the trie path from the root - // to node v), f(v), and F(v) for the above trie is as follows: - // - // v | 0 1 2 3 4 5 6 7 8 9 - // str(v)| "" # ## a ab abc abcd ##b ##bc ##z - // F(v)| [] [] [] [a] [a] [a] [abcd] [##b] [##bc] [##z] - // f(v)| null null null 2 7 8 2 2 2 null - // - // Please refer to `FastWordpieceTokenizerBuilder.h|cc` for detailed - // information on how failure links and failure pops are constructed. - // - // Let the input word be "abcz". Below is the inference process that is - // carried out by this method. - // - // Step | Char | Node transition | Output - // 0 | | 0 | [] - // 1 | a | goto(0,a) -> 3 | [] - // 2 | b | goto(3,b) -> 4 | [] - // 3 | c | goto(4,c) -> 5 | [] - // 4 | z | f(5) -> 8 | [a] - // | z | f(8) -> 2 | [a, ##bc] - // | z | goto(2,z) -> 9 | [a, ##bc] - // final | f(9) -> 2 | [a, ##bc, ##z] - // - // Notes: - // * In each step we match and process one input character. - // * goto(u,c) -> v: following the trie link with label c to transit from node - // u to node v. - // * f(u) -> v: following the failure link to transit from node u to node v. - // * The "final" step means that after processing all input characters, we - // keep transiting through the failure links until arriving at the node 2 - // that represents the suffix indicator "##". - // - // Please refer to the below code and comments. - - // Start from the root of the trie. - auto cur_node = trie_->CreateTraversalCursorPointToRoot(); - - for (auto ch : input_word) { - // Although the matching is on Unicode codepoints, it is equivalent to - // directly work with the utf-8 encoding bytes. - while (!trie_->TryTraverseOneStep(cur_node, ch)) { - // Trie cannot consume `ch`. As explained earlier (see "Optimized - // solution" above) we need to (1) pop one or more longest-matching tokens - // (i.e., failure pops) off the start of the string represented by the - // current node, and (2) transit through the failure link to a node that - // represents the remaining suffix string after popping those - // longest-matching prefix tokens. - if (!TryFollowFailureLinkAndCollectTokens( - input_word, input_word_offset_in_text, cur_offset_in_input_word, - cur_node, output_pieces, output_ids, output_start_offsets, - output_end_offsets)) { - // If unable to follow the failure link, it means that the current trie - // node doesn't have any matching prefix vocab tokens to pop. Since the - // next character is not associated with a valid trie edge, the entire - // word cannot be tokenized. - ResetOutputAppendUnknownToken( - input_word_offset_in_text, input_size, original_num_tokens, - output_pieces, output_ids, output_start_offsets, - output_end_offsets); - return; - } - } - // Trie consumed `ch` and was able to traverse to a new node. Continue and - // process the next character. - } - // Segment the remaining string on the trie into tokens and collect them, or - // determine that the word cannot be tokenized. - HandleTheRemainingStringOnTriePath( - input_word, input_word_offset_in_text, cur_node, original_num_tokens, - cur_offset_in_input_word, output_pieces, output_ids, output_start_offsets, - output_end_offsets); -} - -template -ABSL_ATTRIBUTE_ALWAYS_INLINE bool -FastWordpieceTokenizer::TryFollowFailureLinkAndCollectTokens( - absl::string_view input_word, int input_word_offset_in_text, - int& cur_offset_in_input_word, - trie_utils::DartsCloneTrieWrapper::TraversalCursor& node, - std::vector* output_pieces, std::vector* output_ids, - std::vector* output_start_offsets, - std::vector* output_end_offsets) const { - int cur_node_data; - if (trie_->TryGetData(node, cur_node_data)) { - // A shortcut to get f(cur_node) (i.e., the failure link) and F(cur_node) - // (i.e., failure pops) when `cur_node` has data. This results in ~10% - // speedup (statistically significant). - AppendTokenToOutput( - input_word, input_word_offset_in_text, cur_offset_in_input_word, - cur_node_data, output_pieces, output_ids, output_start_offsets, - output_end_offsets); - // Transit through the failure link. - trie_->SetTraversalCursor( - node, - config_->failure_struct_array()->Get(node.node_id)->failure_link()); - return true; - } - - const auto& node_aux = config_->failure_struct_array()->Get(node.node_id); - - if (node_aux->failure_link() == fast_wordpiece_tokenizer_utils::kNullNode) { - // No failure_link can be followed. - return false; - } - - // Collect the tokens (i.e., failure pops), represented by (offset, length) in - // a failure_pops pool (held by the config flatbuffer). - int failure_pops_offset, failure_pops_length; - fast_wordpiece_tokenizer_utils::GetFailurePopsOffsetAndLength( - node_aux->failure_pops_offset_length(), failure_pops_offset, - failure_pops_length); - const int failure_pops_end_offset = failure_pops_offset + failure_pops_length; - for (int offset_in_pool = failure_pops_offset; - offset_in_pool < failure_pops_end_offset; ++offset_in_pool) { - AppendTokenToOutput( - input_word, input_word_offset_in_text, cur_offset_in_input_word, - config_->failure_pops_pool()->Get(offset_in_pool), output_pieces, - output_ids, output_start_offsets, output_end_offsets); - } - - // Transit through the failure link. - trie_->SetTraversalCursor(node, node_aux->failure_link()); - return true; -} - -template -void FastWordpieceTokenizer::AppendTokenToOutput( - absl::string_view input_word, int input_word_offset_in_text, - int& cur_offset_in_input_word, int encoded_token_value, - std::vector* output_pieces, std::vector* output_ids, - std::vector* output_start_offsets, - std::vector* output_end_offsets) const { - auto token_id = - fast_wordpiece_tokenizer_utils::GetTokenId(encoded_token_value); - if constexpr (kGetIds) { - output_ids->push_back(token_id); - } - if constexpr (kGetPieces || kGetOffsets) { - // For suffix tokens, the length below is without the suffix indicator. - int token_substr_length = - fast_wordpiece_tokenizer_utils::GetTokenLength(encoded_token_value); - if (!cur_offset_in_input_word && - fast_wordpiece_tokenizer_utils::IsSuffixToken(encoded_token_value)) { - // This is a special case where `input_word` happens to start with the - // suffix indicator (e.g., "##") and a suffix token is recognized at the - // start (since `cur_offset_input_word == 0`). In this case, we need - // to adjust and add the length of the suffix indicator string. - token_substr_length += config_->suffix_indicator()->size(); - } - if constexpr (kGetPieces) { - // If token id is unk_token_id, it means that it is a dummy node for - // punctuations that are not contained in the vocabulary, we append - // the unk_token in this case. Otherwise, we - // get the subword string from `input_word` by the offset and length. - auto unk_token = config_->unk_token()->string_view(); - auto subword_str = - (token_id == config_->unk_token_id()) - ? absl::string_view(unk_token.data(), unk_token.size()) - : absl::string_view(input_word.data() + cur_offset_in_input_word, - token_substr_length); - output_pieces->emplace_back( - cur_offset_in_input_word - ? absl::StrCat(config_->suffix_indicator()->str(), subword_str) - : subword_str); - } - if constexpr (kGetOffsets) { - // Record the offsets relative to the start of the whole text. - output_start_offsets->push_back(input_word_offset_in_text + - cur_offset_in_input_word); - output_end_offsets->push_back(input_word_offset_in_text + - cur_offset_in_input_word + - token_substr_length); - } - cur_offset_in_input_word += token_substr_length; - } -} - -template -ABSL_ATTRIBUTE_ALWAYS_INLINE void -FastWordpieceTokenizer::HandleTheRemainingStringOnTriePath( - absl::string_view input_word, int input_word_offset_in_text, - trie_utils::DartsCloneTrieWrapper::TraversalCursor& cur_node, - int& original_num_tokens, int& cur_offset_in_input_word, - std::vector* output_pieces, std::vector* output_ids, - std::vector* output_start_offsets, - std::vector* output_end_offsets) const { - if (cur_node.node_id == trie_utils::DartsCloneTrieWrapper::kRootNodeId) { - // We've seen an empty input word. Just return. - return; - } - // Try handling the special case where the entire input word happens to be the - // suffix indicator (e.g., "##") itself. - if (TryHandleTheInputWordBeingSuffixIndicatorItself( - input_word, input_word_offset_in_text, cur_node, - cur_offset_in_input_word, original_num_tokens, output_pieces, - output_ids, output_start_offsets, output_end_offsets)) { - original_num_tokens = - GetCurrentOutputSize(output_pieces, output_ids); - return; - } - - // Handle the normal case because we need to collect the remaining tokens from - // the string represented by `cur_node` (i.e., on the trie path from the trie - // root to `cur_node`), or find out the word cannot be tokenized. - // - // See the example in the comments of this function in the header file. - // - // The tokenization is successful if and only if the entire string represented - // by `cur_node` can be segmented into consecutive matching tokens, resulting - // in the empty suffix string (e.g., "##"), which is represented by - // `trie_suffix_root_`. So we keep following the failure links and collecting - // failure pops tokens until we arrive at `trie_suffix_root_` or encounter a - // null failure link in the middle. - while (cur_node.node_id != config_->trie_suffix_root() && - cur_node.node_id != config_->trie_punct_failure_link_node()) { - if (!TryFollowFailureLinkAndCollectTokens( - input_word, input_word_offset_in_text, cur_offset_in_input_word, - cur_node, output_pieces, output_ids, output_start_offsets, - output_end_offsets)) { - // The remaining string cannot be tokenized, neither can the input word. - ResetOutputAppendUnknownToken( - input_word_offset_in_text, input_word.size(), original_num_tokens, - output_pieces, output_ids, output_start_offsets, output_end_offsets); - return; - } - } - // Arrive at `trie_suffix_root_`. - - // Update the `original_num_tokens`. - original_num_tokens = - GetCurrentOutputSize(output_pieces, output_ids); - - // Succeed and exit. -} - -template -void FastWordpieceTokenizer::ResetOutputAppendUnknownToken( - int input_word_offset_in_text, int input_size, int& original_num_tokens, - std::vector* output_pieces, std::vector* output_ids, - std::vector* output_start_offsets, - std::vector* output_end_offsets) const { - if constexpr (kGetPieces) { - output_pieces->resize(original_num_tokens + 1); - output_pieces->back() = config_->unk_token()->str(); - } - if constexpr (kGetIds) { - output_ids->resize(original_num_tokens + 1); - output_ids->back() = config_->unk_token_id(); - } - if constexpr (kGetOffsets) { - output_start_offsets->resize(original_num_tokens + 1); - output_start_offsets->back() = input_word_offset_in_text; - - output_end_offsets->resize(original_num_tokens + 1); - output_end_offsets->back() = input_word_offset_in_text + input_size; - } - - // Update `original_num_tokens` (since we have appended the "unk_token"). - ++original_num_tokens; -} - -template -ABSL_ATTRIBUTE_ALWAYS_INLINE bool -FastWordpieceTokenizer::TryHandleTheInputWordBeingSuffixIndicatorItself( - absl::string_view input_word, int input_word_offset_in_text, - const trie_utils::DartsCloneTrieWrapper::TraversalCursor& cur_node, - int& cur_offset_in_input_word, int original_num_tokens, - std::vector* output_pieces, std::vector* output_ids, - std::vector* output_start_offsets, - std::vector* output_end_offsets) const { - // Handle the special case where the input word is the suffix indicator (e.g., - // "##") itself. This is because that, after all the characters of an input - // word were successfully processed, if we ended by standing at - // `trie_suffix_root_` but did not recognize any new tokens, it can only be - // the case that the word is the suffix indicator string (e.g., "##") itself. - // For this case we output the pre-computed result. - if (cur_node.node_id != config_->trie_suffix_root()) { - // The input word is not the suffix indicator itself. - return false; - } - int cur_num_tokens = - GetCurrentOutputSize(output_pieces, output_ids); - if (cur_num_tokens != original_num_tokens) { - // The input word is not the suffix indicator itself. - return false; - } - - // The input word is the suffix indicator itself. Next we handle two cases. - if (config_->precomputed_result_for_suffix_indicator()->size() == 1 && - fast_wordpiece_tokenizer_utils::GetTokenId( - config_->precomputed_result_for_suffix_indicator()->Get(0)) == - config_->unk_token_id()) { - // Case 1: The suffix indicator string cannot be tokenized but has to be - // mapped to unk_token. - ResetOutputAppendUnknownToken( - input_word_offset_in_text, input_word.size(), original_num_tokens, - output_pieces, output_ids, output_start_offsets, output_end_offsets); - return true; - } - - // Case 2: The suffix indicator can be tokenized normally. - for (int encoded_token_value : - *config_->precomputed_result_for_suffix_indicator()) { - AppendTokenToOutput( - input_word, input_word_offset_in_text, cur_offset_in_input_word, - encoded_token_value, output_pieces, output_ids, output_start_offsets, - output_end_offsets); - } - return true; -} -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer.h b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer.h index 562c2a495..20be8a497 100644 --- a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer.h +++ b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer.h @@ -15,246 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_H_ -#include -#include - -#include "absl/status/statusor.h" -#include "absl/strings/str_cat.h" -#include "tensorflow_text/core/kernels/darts_clone_trie_wrapper.h" -#include "tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_generated.h" -#include "tensorflow_text/core/kernels/fast_wordpiece_tokenizer_utils.h" - -namespace tensorflow { -namespace text { - -// Applies WordPiece tokenization with an existing WordPiece vocabulary. -// -// Example: -// input = unaffable -// output = un ##aff ##able -// -// One important edge case is that if the input word contains a Unicode -// character that is not seen in the vocabulary, the entire word is mapped -// to the unknown token, which is "" by default. Otherwise, in the "worst" -// case, the word is split into characters. -// -// This is based on the WordPiece/Subword tokenizer from tensor2tensor. -// https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/text_encoder.py -class FastWordpieceTokenizer { - public: - // Creates an instance. - // - // Args: - // * config_flatbuffer: the pointer to the FastWordpieceTokenizerConfig - // flatbuffer, which is not owned by this instance and should be kept alive - // through the lifetime of the instance. - static absl::StatusOr Create( - const void* config_flatbuffer); - - // Tokenizes `input` into its word pieces (i.e., subword tokens) and - // appends the new tokens to the end of the outputs. - // When `config_->end_to_end() is `false`, `input` should be a single - // word (after pre-tokenization by whitespaces and/or punctuations). - // Otherwise, `input` should be general text consisting of potentially many - // words. - // - // The input should be UTF-8 but the tokenization is performed on Unicode - // codepoints. - // - // - // Args: - // * input: The UTF-8 string of an input. - // * output_pieces: The output tokens. - // * output_ids: The output token ids. - // * output_start_offsets: The start offsets of output tokens in the input - // text, in utf-8 bytes. - // * output_end_offsets: The end offsets of output tokens in the input - // text, in utf-8 bytes. - // * input_word_offset_in_text: The relative offset of the input word in - // the whole text. Only used when not using end-to-end tokenizer. - // * error: If not null, this will be set to true if the tokenizer failed to - // make progress in decoding the input. - // Note: the start offsets are inclusive and the end offsets are exclusive. - void Tokenize(absl::string_view input, - std::vector* output_pieces, - std::vector* output_ids, - std::vector* output_start_offsets, - std::vector* output_end_offsets, - int input_word_offset_in_text = 0, bool* error = nullptr) const; - - // An override not returning `output_pieces`. - void Tokenize(absl::string_view input, std::vector* output_ids, - std::vector* output_start_offsets, - std::vector* output_end_offsets, - int input_word_offset_in_text = 0) const; - - // An override only returning `output_ids`. - void Tokenize(absl::string_view input, std::vector* output_ids, - int input_word_offset_in_text = 0) const; - - // Detokenizes wordpiece ids into a vector of tokens. - absl::StatusOr> DetokenizeToTokens( - const absl::Span input) const; - - // Detokenizes wordpiece ids to a text. If the input string to the tokenizer - // is normalized and the tokenized wordpieces don't contain ``, the - // detokenized result of the tokenized wordpieces is the same as the original - // input text. - absl::StatusOr Detokenize( - const absl::Span input) const; - - private: - // The actual implementation of `Tokenize` when configured for single words. - // - // The template parameters `kGetPieces`, `kGetIds', and `kGetOffsets` control - // which parts of the output we generate. At least one of `kGetPieces` and - // `kGetIds` should be true. - template - void TokenizeSingleWordImpl(absl::string_view input_word, - int input_word_offset_in_text, - std::vector* output_pieces, - std::vector* output_ids, - std::vector* output_start_offsets, - std::vector* output_end_offsets) const; - - // The actual implementation of `Tokenize` when configured for general texts. - // - // The work of this method is equivalent to first splitting `input_text` into - // words (by splitting on punctuation and whitespaces, and next running - // `TokenizeSingleWordImpl` on each word. - template - void TokenizeTextImpl(absl::string_view input_text, - std::vector* output_pieces, - std::vector* output_ids, - std::vector* output_start_offsets, - std::vector* output_end_offsets, - bool* error) const; - - // Try following the failure link to make the transition when trie matching - // fails. - // - // If f(node) (i.e., failure link) is not null, it does the following: - // (1) collects tokens F(node) (i.e., failure pops) and appends to the end of - // `output_ids`, `output_pieces`, and/or `output_start_offsets` and - // `output_end_offsets`, - // (2) moves `cur_offset_in_input_word` accordingly to pass the collected - // tokens when `kGetPieces=true` or `kGetOffsets=true`, in order to - // calculate the start/end offsets of tokens and to get the token - // strings. Otherwise, `cur_offset_in_input_word` is ignored. - // (3) transits `node` to f(node) following the failure link, - // (4) returns true. - // - // If f(node) is null, it does not change anything and returns false. - // - // Args: - // * cur_offset_in_input_word: The current offset in `input_word` that - // corresponds to the start offset of the tokens that are going to be - // collected in this function. This value is used if 'kGetPieces=true' or - // 'kGetOffsets=true', and when so, this value will be updated accordingly - // after the new word piece tokens have been appended to the output. - template - bool TryFollowFailureLinkAndCollectTokens( - absl::string_view input_word, int input_word_offset_in_text, - int& cur_offset_in_input_word, - trie_utils::DartsCloneTrieWrapper::TraversalCursor& node, - std::vector* output_pieces, std::vector* output_ids, - std::vector* output_start_offsets, - std::vector* output_end_offsets) const; - - // Appends a word piece token (represented by `encoded_token_value`) to the - // output. - // - // Args: - // * cur_offset_in_input_word: The current offset in `input_word` that - // corresponds to the start offset of the wordpiece token. This value - // is used if `kGetPieces=true` or `kGetOffsets=true`, and when so, this - // value will be updated accordingly after the wordpiece token has been - // appended to the output. - // * encoded_token_value: the encoded value of the word piece token to be - // appended. See EncodeToken() in fast_wordpiece_tokenizer_utils.h. - template - void AppendTokenToOutput(absl::string_view input_word, - int input_word_offset_in_text, - int& cur_offset_in_input_word, - int encoded_token_value, - std::vector* output_pieces, - std::vector* output_ids, - std::vector* output_start_offsets, - std::vector* output_end_offsets) const; - - // This method is called when the trie matching loop encounters a word - // boundary (e.g., the end-of-input). This method segments the remaining - // string on the trie path into pieces and appends them to the outputs. If - // that is not possible with the current vocabulary, this method resets the - // outputs and appends unk_token. - // - // Example 1: suppose the vocabulary is {ab, abcd}. If the input word is "ab", - // after matching "ab", we processed all input characters and now meets the - // end-of-input. Note that the string "ab" is stored on the trie path that we - // just traversed along. This function recognizes it as the token "ab" and - // puts the token into the output as expected. - // - // Example 2: for the same vocabulary {ab, abcd}, suppose the input word is - // "abc". After the trie matching loop, we matched "abc" and encountered the - // end-of-input. Now the string "abc" is stored on the trie path, which we - // haven't segmented into tokens yet. So this function closes it by trying to - // segment "abc" into tokens. It fails since the remaining string "abc" cannot - // be tokenized into tokens given the vocabulary. In this case, it resets the - // outputs and appends unk_token at the end as expected. - template - void HandleTheRemainingStringOnTriePath( - absl::string_view input_word, int input_word_offset_in_text, - trie_utils::DartsCloneTrieWrapper::TraversalCursor& cur_node, - int& original_num_tokens, int& cur_offset_in_input_word, - std::vector* output_pieces, std::vector* output_ids, - std::vector* output_start_offsets, - std::vector* output_end_offsets) const; - - // Resets the output and appends unk_token. - // - // We call this method when we find that the input word cannot be tokenized. - // We clear all new tokens recognized so far and replace them with a single - // unk_token. - // - // Args: - // * input_word_offset_in_text: The offset of the current word in the - // input text. - // * input_size: The length of the current input word, in utf-8 bytes. - // * original_num_tokens: The original number of tokens in the output before - // we started the tokenization of the current input word. It is updated - // after this method. - template - void ResetOutputAppendUnknownToken( - int input_word_offset_in_text, int input_size, int& original_num_tokens, - std::vector* output_pieces, std::vector* output_ids, - std::vector* output_start_offsets, - std::vector* output_end_offsets) const; - - // Try handling the special case when the input word is the suffix indicator - // itself. If so, appends the precomputed result to output_pieces and - // output_ids, and returns true. Otherwise, it does nothing and returns false. - template - bool TryHandleTheInputWordBeingSuffixIndicatorItself( - absl::string_view input_word, int input_word_offset_in_text, - const trie_utils::DartsCloneTrieWrapper::TraversalCursor& cur_node, - int& cur_offset_in_input_word, int original_num_tokens, - std::vector* output_pieces, std::vector* output_ids, - std::vector* output_start_offsets, - std::vector* output_end_offsets) const; - - // Returns the position (in bytes) immediately after the end of the word. - int SkipTheRemainingOfWordAndTrailingWhiteSpaces(absl::string_view input, - int& cur_pos) const; - - // Points to the FastWordpieceTokenizer config flatbuffer (not owned). - const FastWordpieceTokenizerConfig* config_ = nullptr; - - // A wrapper to access the trie encoded inside the flatbuffer that `config_` - // points to. - std::unique_ptr trie_ = nullptr; -}; - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/fast_wordpiece_tokenizer.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_H_ diff --git a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_kernel.cc b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_kernel.cc deleted file mode 100644 index ca3e49435..000000000 --- a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_kernel.cc +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/fast_wordpiece_tokenizer_kernel.h" - -#include "tensorflow/core/framework/op_kernel.h" - -namespace tensorflow { -namespace text { - -REGISTER_KERNEL_BUILDER(Name(FastWordpieceTokenizeWithOffsetsOpKernel::OpName()) - .Device(tensorflow::DEVICE_CPU), - FastWordpieceTokenizeWithOffsetsOpKernel); - -REGISTER_KERNEL_BUILDER(Name(FastWordpieceDetokenizeOpKernel::OpName()) - .Device(tensorflow::DEVICE_CPU), - FastWordpieceDetokenizeOpKernel); - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_kernel.h b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_kernel.h index 29542d92d..b2d0ae73a 100644 --- a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_kernel.h +++ b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_kernel.h @@ -15,25 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_KERNEL_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_KERNEL_H_ -#include "tensorflow/lite/kernels/shim/tf_op_shim.h" -#include "tensorflow_text/core/kernels/fast_wordpiece_tokenizer_kernel_template.h" - -namespace tensorflow { -namespace text { - -class FastWordpieceTokenizeWithOffsetsOpKernel - : public tflite::shim::TfOpKernel { - public: - using TfOpKernel::TfOpKernel; -}; - -class FastWordpieceDetokenizeOpKernel - : public tflite::shim::TfOpKernel { - public: - using TfOpKernel::TfOpKernel; -}; - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/fast_wordpiece_tokenizer_kernel.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_KERNEL_H_ diff --git a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_kernel_template.h b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_kernel_template.h index efc26197a..8d0075b56 100644 --- a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_kernel_template.h +++ b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_kernel_template.h @@ -15,364 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_KERNEL_TEMPLATE_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_KERNEL_TEMPLATE_H_ -#include "absl/status/status.h" -#include "absl/strings/str_cat.h" -#include "tensorflow/lite/kernels/shim/op_kernel.h" -#include "tensorflow/lite/kernels/shim/status_macros.h" -#include "tensorflow_text/core/kernels/fast_wordpiece_tokenizer.h" - -namespace tensorflow { -namespace text { - -// See `kDoc` data member for the documentation on this op kernel. -// -// This template class can be instantiated into a kernel for either TF or -// TFLite. See -// https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/kernels/shim -// for more info on how this works. -template -class FastWordpieceTokenizeWithOffsetsOp - : public tflite::shim::OpKernelShim { - private: - enum Inputs { kInputValues = 0, kWpModel }; - enum Outputs { - kOutputSubwords = 0, - kOutputIds, - kOutputRowSplits, - kStartValues, - kEndValues - }; - - using Shape = tflite::shim::Shape; - using typename tflite::shim::OpKernelShim::InitContext; - using typename tflite::shim::OpKernelShim::InvokeContext; - using typename tflite::shim::OpKernelShim::ShapeInferenceContext; - - public: - FastWordpieceTokenizeWithOffsetsOp() = default; - static constexpr char kOpName[] = "FastWordpieceTokenizeWithOffsets"; - static constexpr char kDoc[] = R"doc( - Tokenizes tokens into sub-word pieces based off of a vocabulary using the fast - linear WordPiece algorithm. - - `wordpiece_tokenize_with_offsets` returns the relative offsets. - - ### Example: - - ```python - >>> tokens = ['don', '\'t', 'treadness'] - >>> wordpiece, ids, row_splits, start, end = ( - ... fast_wordpiece_tokenize_with_offsets(tokens, model_buffer)) - >>> RaggedTensor.from_row_splits(wordpiece, row_splits) - [['don', '\'', 't'], ['tread', '##ness']] - >>> RaggedTensor.from_row_splits(ids, row_splits) - [[0, 1, 2], [3, 4]] # Dummy ids. - >>> RaggedTensor.from_row_splits(start, row_splits) - start = [[[0, 3, 4], [0, 5]]] - >>> RaggedTensor.from_row_splits(end, row_splits) - end = [[[3, 4, 5], [5, 10]]] - ``` - - Args: - input_values: 1D Tensor of strings to tokenize with. - wp_model: Buffer tensor for the FastWordpieceTokenizerConfig flatbuffer. - - Returns: - * output_values: 1D tensor containing the wordpieces for all input strings. - A 2D RaggedTensor can be constructed from this and output_row_splits. - * output_ids: 1D tensor containing the wordpiece ids for all input strings. - A 2D RaggedTensor can be constructed from this and output_row_splits. - * output_row_splits: 1D int tensor with the row splits that allow us to - build RaggedTensors from output_values, output_ids, start_values, and - end_values. - * start_values: 1D tensor containing the inclusive start byte offset for - each wordpiece in all input strings. Corresponds 1:1 with output_values. - A 2D RaggedTensor can be constructed from this and output_row_splits. - * end_values: 1D tensor containing the exclusive end byte offset for - each wordpiece in all input strings. Corresponds 1:1 with output_values. - A 2D RaggedTensor can be constructed from this and output_row_splits. - )doc"; - - static const char* OpName() { return kOpName; } - static const char* Doc() { return kDoc; } - - // Attributes declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Attrs() { return {}; } - - // Input tensors declaration (syntax: - // https://www.tensorflow.org/guide/create_op) - static std::vector Inputs(); - - // Output tensors declaration (syntax: - // https://www.tensorflow.org/guide/create_op) - static std::vector Outputs(); - - // Initializes the op - absl::Status Init(InitContext* context) { return absl::OkStatus(); } - - // Runs the operation - absl::Status Invoke(InvokeContext* context); - - // Shape inference - static absl::Status ShapeInference(ShapeInferenceContext* c); -}; - -////////////////////////// Implementation - -template -std::vector FastWordpieceTokenizeWithOffsetsOp::Inputs() { - return {"input_values: string", "wp_model: uint8"}; -} - -template -std::vector FastWordpieceTokenizeWithOffsetsOp::Outputs() { - return {"output_subwords: string", "output_ids: int64", - "output_row_splits: int64", "start_values: int64", - "end_values: int64"}; -} - -template -absl::Status FastWordpieceTokenizeWithOffsetsOp::Invoke( - InvokeContext* context) { - SH_ASSIGN_OR_RETURN(const auto input_values, context->GetInput(kInputValues)); - const auto& values_vec = input_values->template As(); - - SH_ASSIGN_OR_RETURN(const auto wp_model, context->GetInput(kWpModel)); - // OK to create on every call because FastWordpieceTokenizer is a - // lightweight, memory-mapped wrapper on `wp_model` tensor, and thus - // Create() is very cheap. - auto fast_wordpiece_tokenizer = - ::tensorflow::text::FastWordpieceTokenizer::Create( - wp_model->template Data().data()); - SH_RETURN_IF_ERROR(fast_wordpiece_tokenizer.status()); - - // TODO(xysong): Optimize based on which information below is requested. - std::vector subwords; - std::vector subword_ids; - std::vector begin_offset; - std::vector end_offset; - std::vector row_splits; - - row_splits.push_back(0); - - // Iterate through all the values and wordpiece tokenize them. - for (int i = 0; i < values_vec.Dim(0); ++i) { - // Tokenize into subwords and record the offset locations. - const int original_num_wordpieces = subwords.size(); - bool error = false; - fast_wordpiece_tokenizer->Tokenize(values_vec(i), &subwords, &subword_ids, - &begin_offset, &end_offset, - /*input_word_offset_in_text=*/0, &error); - if (error) { - return absl::InternalError( - "Failed to make any progress in tokenizing the input text."); - } - const int delta_num_wordpieces = subwords.size() - original_num_wordpieces; - - // Record the row splits. - row_splits.push_back(delta_num_wordpieces + row_splits.back()); - } - - SH_RETURN_IF_ERROR(this->template FillOutputTensor( - subwords, kOutputSubwords, context)); - SH_RETURN_IF_ERROR(this->template FillOutputTensor( - subword_ids, kOutputIds, context)); - SH_RETURN_IF_ERROR(this->template FillOutputTensor( - row_splits, kOutputRowSplits, context)); - SH_RETURN_IF_ERROR(this->template FillOutputTensor( - begin_offset, kStartValues, context)); - SH_RETURN_IF_ERROR(this->template FillOutputTensor( - end_offset, kEndValues, context)); - - return absl::OkStatus(); -} - -template -absl::Status FastWordpieceTokenizeWithOffsetsOp::ShapeInference( - ShapeInferenceContext* c) { - using tflite::shim::Shape; - SH_ASSIGN_OR_RETURN(const Shape input_values_shape, - c->GetInputShape(kInputValues)); - SH_ASSIGN_OR_RETURN(const auto wp_model_shape, c->GetInputShape(kWpModel)); - const auto rank_1_shape = Shape({Shape::kUnknownDim}); - if (!input_values_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError( - absl::StrCat("Shape must be rank 1: ", input_values_shape.ToString())); - } - if (!wp_model_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError( - absl::StrCat("Shape must be rank 1: ", wp_model_shape.ToString())); - } - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputSubwords, rank_1_shape)); - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputIds, rank_1_shape)); - // row splits size - const int num_splits = Shape::AddDims(1, input_values_shape.Dim(0)); - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputRowSplits, Shape({num_splits}))); - SH_RETURN_IF_ERROR(c->SetOutputShape(kStartValues, rank_1_shape)); - SH_RETURN_IF_ERROR(c->SetOutputShape(kEndValues, rank_1_shape)); - - return absl::OkStatus(); -} - - -// See `kDoc` data member for the documentation on this op kernel. -// -// This template class can be instantiated into a kernel for either TF or -// TFLite. See -// https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/kernels/shim -// for more info on how this works. -template -class FastWordpieceDetokenizeOp - : public tflite::shim::OpKernelShim { - private: - enum Inputs { kInputValues = 0, kInputRowSplits, kWpModel }; - enum Outputs { kOutputWords = 0 }; - - using Shape = tflite::shim::Shape; - using typename tflite::shim::OpKernelShim::InitContext; - using typename tflite::shim::OpKernelShim::InvokeContext; - using typename tflite::shim::OpKernelShim::ShapeInferenceContext; - - public: - FastWordpieceDetokenizeOp() = default; - static constexpr char kOpName[] = "TFText>FastWordpieceDetokenize"; - static constexpr char kDoc[] = R"doc( - Detokenizes sub-word ids into sentences. - - ### Example: - - ```python - >>> # Vocab of the model_buffer: ['a', 'ab', '##c', 'abc', '##d']. - >>> wordpiece_ids = [0, 1, 2, 3, 4] - >>> row_splits = [0, 3, 5] - >>> tokens = fast_wordpiece_tokenizer_detokenize(tokens, row_splits, model_buffer) - >>> tokens - ['a abc', 'abcd'] - ``` - - Args: - input_values: 1D Tensor of sub-word ids. - input_row_splits: 1D Tensor of row splits that denotes the boundary of each - sentence in the `input_values`. - wp_model: Buffer tensor for the FastWordpieceTokenizerConfig flatbuffer. - - Returns: - * output_values: 1D tensor containing all the sentences. - )doc"; - - static const char* OpName() { return kOpName; } - static const char* Doc() { return kDoc; } - - // Attributes declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Attrs() { return {}; } - - // Input tensors declaration (syntax: - // https://www.tensorflow.org/guide/create_op) - static std::vector Inputs(); - - // Output tensors declaration (syntax: - // https://www.tensorflow.org/guide/create_op) - static std::vector Outputs(); - - // Initializes the op - absl::Status Init(InitContext* context) { return absl::OkStatus(); } - - // Runs the operation - absl::Status Invoke(InvokeContext* context); - - // Shape inference - static absl::Status ShapeInference(ShapeInferenceContext* c); -}; - -////////////////////////// Implementation - -template -std::vector FastWordpieceDetokenizeOp::Inputs() { - return {"input_values: int32", "input_row_splits: int64", "wp_model: uint8"}; -} - -template -std::vector FastWordpieceDetokenizeOp::Outputs() { - return {"output_words: string"}; -} - -template -absl::Status FastWordpieceDetokenizeOp::Invoke(InvokeContext* context) { - SH_ASSIGN_OR_RETURN(const auto input_values, context->GetInput(kInputValues)); - const auto& values_vec = input_values->template As(); - - SH_ASSIGN_OR_RETURN(const auto input_row_splits, - context->GetInput(kInputRowSplits)); - const auto& row_splits_vec = input_row_splits->template As(); - - SH_ASSIGN_OR_RETURN(const auto wp_model, context->GetInput(kWpModel)); - // OK to create on every call because FastWordpieceTokenizer is a - // lightweight, memory-mapped wrapper on `wp_model` tensor, and thus - // Create() is very cheap. - auto fast_wordpiece_tokenizer = - ::tensorflow::text::FastWordpieceTokenizer::Create( - wp_model->template Data().data()); - SH_RETURN_IF_ERROR(fast_wordpiece_tokenizer.status()); - - std::vector sentences; - - // Iterate through row_splits to split input_values. - for (int i = 0; i < row_splits_vec.Dim(0) - 1; ++i) { - auto single_input = - absl::Span(values_vec.Ptr() + row_splits_vec(i), - row_splits_vec(i + 1) - row_splits_vec(i)); - SH_ASSIGN_OR_RETURN(auto sentence, - fast_wordpiece_tokenizer->Detokenize(single_input)); - sentences.push_back(sentence); - } - - const int words_size = sentences.size(); - SH_ASSIGN_OR_RETURN(auto output_words, - context->GetOutput(kOutputWords, Shape({words_size}))); - auto output_words_vec = output_words->template As(); - - for (int i = 0; i < words_size; ++i) { - output_words_vec(i) = sentences[i]; - } - - return absl::OkStatus(); -} - -template -absl::Status FastWordpieceDetokenizeOp::ShapeInference( - ShapeInferenceContext* c) { - using tflite::shim::Shape; - SH_ASSIGN_OR_RETURN(const Shape input_values_shape, - c->GetInputShape(kInputValues)); - SH_ASSIGN_OR_RETURN(const Shape input_row_splits_shape, - c->GetInputShape(kInputRowSplits)); - SH_ASSIGN_OR_RETURN(const auto wp_model_shape, c->GetInputShape(kWpModel)); - const auto rank_1_shape = Shape({Shape::kUnknownDim}); - if (!input_values_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError( - absl::StrCat("Shape must be rank 1: ", input_values_shape.ToString())); - } - if (!input_row_splits_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Shape must be rank 1: ", input_row_splits_shape.ToString())); - } - if (!wp_model_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError( - absl::StrCat("Shape must be rank 1: ", wp_model_shape.ToString())); - } - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputWords, rank_1_shape)); - return absl::OkStatus(); -} - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/fast_wordpiece_tokenizer_kernel_template.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_KERNEL_TEMPLATE_H_ diff --git a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model.fbs b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model.fbs deleted file mode 100644 index 3f508f677..000000000 --- a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model.fbs +++ /dev/null @@ -1,68 +0,0 @@ -namespace tensorflow.text; - -struct FailureStruct { - // The failure link of node v, denoted as f(v). - failure_link: uint32; - - // The failure pops of node v, denoted as F(v). It is an encoded value of - // (offset, length) that represents a consecutive subarray in - // 'failure_pops_pool' (see FastWordpieceTokenizerConfig). - failure_pops_offset_length: uint32; -} - -table FastWordpieceTokenizerConfig { - // The trie data, in the format of darts_clone trie, as accepted by - // DartsCloneTrieWrapper::Create(). - trie_array: [uint32]; - - // The array of the failure structures. - failure_struct_array: [FailureStruct]; - - // The array holding the failure pops. - failure_pops_pool: [int]; - - // The trie suffix root node id. - trie_suffix_root: uint32; - - // Max size of the input token. If the input length is longer than this, it - // will be mapped to unk_token. - max_bytes_per_token: int; - - // Characters prepended to a wordpiece to indicate that it is a suffix to - // another subword, such as "##". - suffix_indicator: string; - - // The unknown token string. - unk_token: string; - - // The unkown token id. - unk_token_id: int; - - // The precomputed result for the input being the suffix indicator itself. - precomputed_result_for_suffix_indicator: [int]; - - // The node id of every punctuation's failure link. It is only used when - // end_to_end=true. - trie_punct_failure_link_node: uint32; - - // Whether to build end-to-end tokenizer for tokenizing general texts (as - // opposed to splitted single words). When it is true, the input text is first - // split into words on "punctuation"/whitespaces, and each word is further - // tokenized into subwords. - // Note that our definition of "punctuation" includes some special Chinese - // characters for compatibility with Bert. More details are available in - // `fast_wordpiece_tokenizer_utils::IsPunctuationOrChineseChar`. - end_to_end: bool; - - // Whether the tokenizer supports detokenization function. - support_detokenization: bool; - - // WordPiece Vocabulary. Note that we remove suffix indicator from suffix - // tokens for saving space. - vocab_array: [string]; - - // Whether the corresponding token in the vocab_array is a suffix token. - vocab_is_suffix_array: [bool]; -} - -root_type FastWordpieceTokenizerConfig; diff --git a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.cc b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.cc deleted file mode 100644 index 9467c4d6e..000000000 --- a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.cc +++ /dev/null @@ -1,941 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.h" - -#include - -#include -#include -#include - -#include "absl/container/flat_hash_map.h" -#include "absl/container/flat_hash_set.h" -#include "absl/status/status.h" -#include "absl/strings/match.h" -#include "absl/strings/str_cat.h" -#include "absl/strings/str_join.h" -#include "absl/strings/str_split.h" -#include "absl/strings/string_view.h" -#include "absl/strings/strip.h" -#include /* cppitertools */ "imap.hpp" -#include "icu4c/source/common/unicode/umachine.h" -#include "icu4c/source/common/unicode/utf8.h" -#include "tensorflow/lite/kernels/shim/status_macros.h" -#include "tensorflow_text/core/kernels/darts_clone_trie_builder.h" -#include "tensorflow_text/core/kernels/darts_clone_trie_wrapper.h" -#include "tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_generated.h" -#include "tensorflow_text/core/kernels/fast_wordpiece_tokenizer_utils.h" -#include "tensorflow_text/core/kernels/sentence_fragmenter_v2.h" -#include "tensorflow_text/core/kernels/string_vocab.h" - -namespace tensorflow { -namespace text { -namespace { - -// A Unicode control char that never appears in the input as it is filtered -// during text normalization. It is used to build dummy nodes in the trie. -static constexpr char kInvalidControlChar = 0x11; - -// A wrapper of vocab tokens that will be used to build the trie. -class TrieVocabToken { - public: - TrieVocabToken(absl::string_view token, int token_id, - absl::string_view suffix_indicator) - : token_(std::string(token)), token_id_(token_id) { - if (!suffix_indicator.empty() && token_ != suffix_indicator && - absl::StartsWith(token_, suffix_indicator)) { - is_suffix_token_ = true; - actual_token_start_offset_ = suffix_indicator.size(); - } - // Iterate over the Unicode chars from the token, to initialize - // contains_punctuation_ and actual_token_unicode_len_. - int token_len = token.size(); - int cur_pos = actual_token_start_offset_; - UChar32 c; - while (cur_pos < token_len) { - U8_NEXT(token, cur_pos, token_len, c); - if (!contains_punctuation_ && - fast_wordpiece_tokenizer_utils::IsPunctuationOrChineseChar(c)) { - contains_punctuation_ = true; - } - ++actual_token_unicode_len_; - } - } - - absl::string_view Token() const { return token_; } - - int TokenId() const { return token_id_; } - - bool IsSuffixToken() const { return is_suffix_token_; } - - bool ContainsPunctuation() const { return contains_punctuation_; } - - int TokenUnicodeLengthWithoutSuffixIndicator() const { - return actual_token_unicode_len_; - } - - int TokenLengthWithoutSuffixIndicator() const { - return token_.size() - actual_token_start_offset_; - } - - private: - std::string token_; - - int token_id_ = -1; - - // By design, `is_suffix_token_`=false for the suffix indicator (e.g., "##") - // itself. - bool is_suffix_token_ = false; - - // The starting offset of the token string in `token_` without the suffix - // indicator. By design, `actual_token_start_offset_`=0 for the suffix - // indicator (e.g., "##") itself. - int actual_token_start_offset_ = 0; - - // Length of the actual token string in Unicode character. - int actual_token_unicode_len_ = 0; - - // True when the actual token string contains punctuation, e.g. "test.x", - // "##.", ".test", "...", "!", etc. - bool contains_punctuation_ = false; -}; - -// The failure struct to store failure links and failure pops. -struct FailureStruct { - // The failure link, denoted as f(v), of each node v. - // - // Null node is represented by fast_wordpiece_tokenizer_utils::kNullNode. - uint32_t failure_link = fast_wordpiece_tokenizer_utils::kNullNode; - - // The failure pop list, denoted as F(v), of a node v. - // - // It is stored as a pair of offset and length that represents a continuous - // vector in `failure_pops_pool_`. This pair is encoded using - // EncodeFailurePopList() in fast_wordpiece_tokenizer_utils.h. - uint32_t failure_pops_offset_length = - fast_wordpiece_tokenizer_utils::kNullFailurePopsList; -}; - -// Builds the FastWordpieceTokenizer model. -class FastWordpieceBuilder { - public: - // When no_pretokenization is false, we split the input string by punctuation - // chars (in addition to whitespaces) and then tokenize it to wordpieces. - absl::Status BuildModel(const std::vector& vocab, - int max_bytes_per_token, - absl::string_view suffix_indicator, - absl::string_view unk_token, - bool no_pretokenization, - bool support_detokenization); - - absl::StatusOr ExportToFlatBuffer() const; - - private: - absl::StatusOr> PrepareVocabTokensToBuildTrie(); - - absl::Status ConstructTrie( - const std::vector& tokens_to_build_trie); - - absl::Status BuildFailureStructure( - const std::vector& tokens_to_build_trie); - - // Builds the set of outgoing edge labels for each trie node and returns a - // mapping (node_id -> set). Used in BuildFailureStructure(). - absl::StatusOr>> - BuildOutgoingEdgeLabelsForTrie( - const std::vector& tokens_to_build_trie); - - // Builds the set of outgoing edge labels for nodes along the trie path of - // `vocab_token`. Used in BuildOutgoingEdgeLabelsForTrie(). - absl::Status BuildOutgoingEdgeLabelsAlongVocabToken( - const TrieVocabToken& vocab_token, - std::vector>& node_outgoing_edge_labels); - - // Assigns failure link f(cur_node) to `failure_link` and populates failure - // pops F(cur_node) (based on `one_step_pops` and - // `parent_failure_pops_offset_length`). - absl::Status AssignFailureLinkAndPops(uint32_t cur_node, - uint32_t failure_link, - const std::vector& one_step_pops, - int parent_failure_pops_offset_length); - - // If `failure_pops_offset_length` encodes a valid failure pop list, appends - // the failure pop list to the end of `out_failure_pops`. Otherwise, does - // nothing. - void GetFailurePopsAndAppendToOut(uint32_t failure_pops_offset_length, - std::vector& out_failure_pops); - - absl::Status PrecomputeResultForSuffixIndicator(); - - inline void BreakTrieLinkFromParentToChild(uint32_t child_node_id) { - // In trie, the least significant 8 bits encode the label of the trie link - // from the parent to the node itself. - // - // Reference: - // https://github.com/s-yata/darts-clone/blob/e40ce4627526985a7767444b6ed6893ab6ff8983/include/darts.h#L65-L70. - // - // For example, if there is a trie link `u` -> `v` with label (say) 'a' - // (ASCII 97 or 0x61), then the least significant 8 bits of node `v` will be - // 0x61. By erasing its least significant 8 bits to 0, it effectively - // prevents the node from being reachable from its parent, i.e. breaking the - // trie link from the parent to the node itself. - trie_array_[child_node_id] &= 0xFFFFFF00; - } - - inline void EraseValueOfNode(uint32_t node_id) { - // In trie, the 9th least significant bit of a node's value marks whether - // the node has a leaf node (i.e., having a value stored on the node). - // - // Reference: - // https://github.com/s-yata/darts-clone/blob/e40ce4627526985a7767444b6ed6893ab6ff8983/include/darts.h#L54-L58 - // - // By setting the 9th least significant bit to 0, it effectively erases any - // value (i.e., token id in our case) associated with the node. - trie_array_[node_id] &= 0xFFFFFEFF; - } - - std::unique_ptr vocab_; - - int max_bytes_per_token_ = -1; - - std::string suffix_indicator_; - - std::string unk_token_; - - int unk_token_id_ = -1; - - // A wrapper to access the trie encoded by `trie_array_`. - absl::optional trie_; - - // The actual data of the trie. - std::vector trie_array_; - - // The "suffix_root" node on the trie whose trie path (from the root to the - // node) is the suffix indicator string. - uint32_t trie_suffix_root_ = fast_wordpiece_tokenizer_utils::kNullNode; - - // The dummy node to serve as the failure link of punctuation nodes. - uint32_t trie_punct_failure_link_node_ = - fast_wordpiece_tokenizer_utils::kNullNode; - - // Whether to build the end-to-end tokenizer that tokenizes general texts. - // When set to false, it splits the input on punctuation/whitespace and treat - // each punctuation as an independent word. - bool no_pretokenization_; - - // Whether the tokenizer supports the detokenization function. - bool support_detokenization_; - - std::vector failure_struct_array_; - - // Each element in the failure pops pool is an encoded vocab token. - // See EncodeToken() in fast_wordpiece_tokenizer_utils.h. - std::vector failure_pops_pool_; - - // The precomputed result for the suffix indicator. Each element in the - // failure pops pool is an encoded vocab token. See EncodeToken() in - // fast_wordpiece_tokenizer_utils.h. - std::vector precomputed_result_for_suffix_indicator_; - - // The mapping from node id to whether the corresponding token is a - // punctuation char. - absl::flat_hash_map node_id_is_punc_map_; -}; - -absl::Status FastWordpieceBuilder::BuildModel( - const std::vector& vocab, int max_bytes_per_token, - absl::string_view suffix_indicator, absl::string_view unk_token, - bool no_pretokenization, bool support_detokenization) { - unk_token_ = std::string(unk_token); - suffix_indicator_ = std::string(suffix_indicator); - max_bytes_per_token_ = max_bytes_per_token; - no_pretokenization_ = no_pretokenization; - support_detokenization_ = support_detokenization; - - vocab_ = std::make_unique(vocab); - if (vocab_->Size() != vocab.size()) { - return absl::FailedPreconditionError( - "Tokens in the vocabulary must be unique."); - } - - // Determine `unk_token_id_`. - const absl::optional unk_token_id = vocab_->LookupId(unk_token_); - if (!unk_token_id.has_value()) { - return absl::FailedPreconditionError("Cannot find unk_token in the vocab!"); - } - unk_token_id_ = *unk_token_id; - - // Construct the trie and the failure structure. - SH_ASSIGN_OR_RETURN(auto tokens_to_build_trie, - PrepareVocabTokensToBuildTrie()); - SH_RETURN_IF_ERROR(ConstructTrie(tokens_to_build_trie)); - SH_RETURN_IF_ERROR(BuildFailureStructure(tokens_to_build_trie)); - - // Precompute the result when the input is the suffix indicator string itself. - SH_RETURN_IF_ERROR(PrecomputeResultForSuffixIndicator()); - - return absl::OkStatus(); -} - -absl::StatusOr> -FastWordpieceBuilder::PrepareVocabTokensToBuildTrie() { - // To simplify the inference (fewer corner cases), - // * We ensure that `trie_suffix_root_` is always available on the trie. - // * We ensure that `trie_suffix_root_` does not have data (i.e., the suffix - // indicator string is not in the set of the keys of the trie). - // * We don't actually add the end-of-input symbol "$" but use an alternative - // logic. See FastWordpieceTokenizer::HandleTheRemainingStringOnTriePath(). - - if (vocab_->Size() > fast_wordpiece_tokenizer_utils::kMaxSupportedVocabSize) { - return absl::FailedPreconditionError( - absl::StrCat("Vocab size exceeds the max supported (", - fast_wordpiece_tokenizer_utils::kMaxSupportedVocabSize, - "). Found vocab size: ", vocab_->Size(), ".")); - } - - // Collect a subset of tokens (and variations) to build the trie. - std::vector tokens_to_build_trie; - tokens_to_build_trie.reserve(vocab_->Size()); - for (int token_id = 0; token_id < vocab_->Size(); ++token_id) { - const absl::optional word = vocab_->LookupWord(token_id); - if (!word.has_value()) { - return absl::FailedPreconditionError( - "Impossible. `token_id` is definitely within the range of vocab " - "token ids; hence LookupWord() should always succeed."); - } - if (word->empty()) { - // It does not make sense to add the empty string "" to the vocabulary. In - // addition, darts_clone does not allow an empty Trie key. - // - // We allow this only for compatibility with the original Wordpiece - // algorithm. - LOG(WARNING) - << "The empty string is found in the vocabulary, which takes place " - "in the token id space but will never be used in the result. " - "Consider cleaning it from the vocabulary."; - continue; - } - if (*word == suffix_indicator_) { - // In real-life cases, no need to add the suffix indicator string (e.g., - // "##") to the vocabulary. - // - // We allow this only for compatibility with the original Wordpiece - // algorithm. - LOG(WARNING) - << "The empty suffix token is found in the vocabulary, which takes " - "place in token id space but will (almost) never be used in the " - "result. Consider cleaning it from the vocabulary."; - - // The token id of the suffix indicator is used only when the input is - // the suffix indicator itself. That case is handled elsewhere, in - // PrecomputeResultForSuffixIndicator(). - // - // Therefore, we don't insert the suffix indicator string as a key into - // the trie. As a result, `trie_suffix_root_` node will never have data. - - continue; - } - TrieVocabToken vocab_token(*word, token_id, suffix_indicator_); - if (vocab_token.TokenLengthWithoutSuffixIndicator() > - fast_wordpiece_tokenizer_utils::kMaxVocabTokenLengthInUTF8Bytes) { - return absl::FailedPreconditionError(absl::StrCat( - "Vocab token utf8 length (excluding suffix indicator) exceeds the " - "max supported (", - fast_wordpiece_tokenizer_utils::kMaxVocabTokenLengthInUTF8Bytes, - "). The vocab token is: ", *word, - " with utf8 length (excluding suffix indicator): ", - vocab_token.TokenLengthWithoutSuffixIndicator(), ".")); - } - // Skip word that contains punctuation but is not a punctuation itself. - // , , ##. are skipped in this step. - if (!no_pretokenization_ && vocab_token.ContainsPunctuation() && - (vocab_token.TokenUnicodeLengthWithoutSuffixIndicator() > 1 || - vocab_token.IsSuffixToken())) { - continue; - } - tokens_to_build_trie.emplace_back(vocab_token); - } - - if (tokens_to_build_trie.empty()) { - return absl::FailedPreconditionError( - "No valid vocab tokens were found to build the trie."); - } - if (!suffix_indicator_.empty()) { - const bool suffix_token_exists = std::any_of( - tokens_to_build_trie.begin(), tokens_to_build_trie.end(), - [](const TrieVocabToken& token) { return token.IsSuffixToken(); }); - if (!suffix_token_exists) { - // No suffix tokens in the vocab. That would lead to no trie node for - // the suffix indicator, which creates corner cases in the inference. - // To prevent that, we add a dummy suffix token, e.g., "##" + - // kInvalidControlChar (if the suffix indicator is "##"), which is never - // matched during inference. - tokens_to_build_trie.emplace_back(TrieVocabToken( - absl::StrCat(suffix_indicator_, std::string(1, kInvalidControlChar)), - unk_token_id_, suffix_indicator_)); - } - } - - if (!no_pretokenization_) { - // Special treatment for all Unicode punctuation chars that are not already - // in the trie. - // The maximum codepoint in Unicode is 0x0010FFFF. - for (UChar32 cp = 1; cp <= 0x0010FFFF; ++cp) { - if (!U_IS_UNICODE_CHAR(cp) || - !fast_wordpiece_tokenizer_utils::IsPunctuationOrChineseChar(cp)) { - continue; - } - // Get the UTF8 encoding of the codepoint cp. - char buf[4]; - int len = 0; - U8_APPEND_UNSAFE(buf, len, cp); - absl::string_view buf_view(buf, len); - // Set the token id of punctuation chars that don't exist in the vocab as - // unk_token_id_. - if (!vocab_->LookupId(buf_view)) { - TrieVocabToken vocab_token(buf_view, unk_token_id_, suffix_indicator_); - tokens_to_build_trie.emplace_back(vocab_token); - } - } - // Insert a dummy node to serve as the failure link targets for punctuation - // nodes. - tokens_to_build_trie.emplace_back(TrieVocabToken( - std::string(1, kInvalidControlChar), unk_token_id_, suffix_indicator_)); - } - return tokens_to_build_trie; -} - -absl::Status FastWordpieceBuilder::ConstructTrie( - const std::vector& tokens_to_build_trie) { - std::vector keys; - std::vector values; - for (const TrieVocabToken& vocab_token : tokens_to_build_trie) { - keys.emplace_back(vocab_token.Token()); - SH_ASSIGN_OR_RETURN(int encoded_value, - fast_wordpiece_tokenizer_utils::EncodeToken( - vocab_token.TokenId(), - vocab_token.TokenLengthWithoutSuffixIndicator(), - vocab_token.IsSuffixToken())); - values.push_back(encoded_value); - } - SH_ASSIGN_OR_RETURN(trie_array_, - trie_utils::BuildDartsCloneTrie(keys, values)); - SH_ASSIGN_OR_RETURN( - trie_utils::DartsCloneTrieWrapper trie, - trie_utils::DartsCloneTrieWrapper::Create(trie_array_.data())); - trie_.emplace(std::move(trie)); - - if (trie_array_.size() > - fast_wordpiece_tokenizer_utils::kMaxSupportedTrieSize) { - return absl::FailedPreconditionError(absl::StrCat( - "Not supported since the constructed Darts trie size (", - trie_array_.size(), ") is greater than the maximum supported size (", - fast_wordpiece_tokenizer_utils::kMaxSupportedTrieSize, ").")); - } - - // Locate the trie suffix root. - auto node = trie_->CreateTraversalCursorPointToRoot(); - if (!trie_->TryTraverseSeveralSteps(node, suffix_indicator_)) { - return absl::FailedPreconditionError( - "Cannot locate trie_suffix_root_. This should never happen."); - } - trie_suffix_root_ = node.node_id; - - if (!no_pretokenization_) { - // Locate the dummy node for the failure link for punctuation nodes. - node = trie_->CreateTraversalCursorPointToRoot(); - if (!trie_->TryTraverseSeveralSteps(node, - std::string(1, kInvalidControlChar))) { - return absl::FailedPreconditionError( - "Cannot locate the dummy node for the failure link for punctuation " - "nodes. This should never happen."); - } - trie_punct_failure_link_node_ = node.node_id; - - // We make `trie_punct_failure_link_node_` a standalone dummy node. - EraseValueOfNode(trie_punct_failure_link_node_); - BreakTrieLinkFromParentToChild(trie_punct_failure_link_node_); - } - return absl::OkStatus(); -} - -absl::Status FastWordpieceBuilder::BuildOutgoingEdgeLabelsAlongVocabToken( - const TrieVocabToken& vocab_token, - std::vector>& node_outgoing_edge_labels) { - const absl::string_view token = vocab_token.Token(); - trie_utils::DartsCloneTrieWrapper::TraversalCursor cur_node; - int char_pos = 0; - trie_->SetTraversalCursor(cur_node, trie_->kRootNodeId); - while (char_pos < token.size()) { - const char edge_label = token[char_pos]; - node_outgoing_edge_labels[cur_node.node_id].insert(edge_label); - if (!trie_->TryTraverseOneStep(cur_node, edge_label)) { - // Should never happen, since we built trie using all of `vocab_token`. - return absl::FailedPreconditionError(absl::StrCat( - "Cannot traverse from parent id ", cur_node.node_id, - " to child following the edge with label value of ", - static_cast(edge_label), - " when processing a vocabulary token with token ID ", - vocab_token.TokenId(), " (0-based). This error happened at ", - "position ", char_pos, " (0-based) of the token. Before that, ", - "the prefix \"", token.substr(0, char_pos), - "\" of the token had been processed. This should never happen. ", - "This probably indicates that there are some unicode ", - "issues (e.g., byte '\\x0' in the middle) for the above ", - "mentioned token in the vocabulary file. All bytes of this ", - "questionable token (ID ", vocab_token.TokenId(), ") are: [", - absl::StrJoin( - iter::imap([](auto ch) { return static_cast(ch); }, - vocab_token.Token()), - ", "), - "].")); - } - ++char_pos; - } - // Record whether the current node represents a punctuation char in the map. - node_id_is_punc_map_[cur_node.node_id] = - !vocab_token.IsSuffixToken() && vocab_token.ContainsPunctuation() && - vocab_token.TokenUnicodeLengthWithoutSuffixIndicator() == 1; - return absl::OkStatus(); -} - -absl::StatusOr>> -FastWordpieceBuilder::BuildOutgoingEdgeLabelsForTrie( - const std::vector& tokens_to_build_trie) { - std::vector> node_outgoing_edge_labels( - trie_array_.size()); - const std::string dummy_token_for_trie_punct_failure_link_node = - std::string(1, kInvalidControlChar); - for (const TrieVocabToken& vocab_token : tokens_to_build_trie) { - if (vocab_token.Token() == dummy_token_for_trie_punct_failure_link_node) - continue; - SH_RETURN_IF_ERROR(BuildOutgoingEdgeLabelsAlongVocabToken( - vocab_token, node_outgoing_edge_labels)); - } - return node_outgoing_edge_labels; -} - -// Computes failure links and failure pops using BFS traversal. -absl::Status FastWordpieceBuilder::BuildFailureStructure( - const std::vector& tokens_to_build_trie) { - // Build the set of outgoing edge labels for each trie node (node_id -> - // set). This is needed by BFS because darts-clone does not provide an - // API to enumerate the outgoing links for a node. - SH_ASSIGN_OR_RETURN( - std::vector> node_outgoing_edge_labels, - BuildOutgoingEdgeLabelsForTrie(tokens_to_build_trie)); - - failure_struct_array_.resize(trie_array_.size()); - // Initialize the BFS queue. - std::queue bfs_queue({trie_->kRootNodeId}); - if (trie_suffix_root_ != trie_->kRootNodeId) { - // When `suffix_indicator_` is empty, `trie_suffix_root_` will collapse - // with root. In this case, we don't visit it twice. - // - // In addition, we have ensured that `trie_suffix_root_` will never be null. - // See PrepareVocabTokensToBuildTrie(). - bfs_queue.push(trie_suffix_root_); - } - - // The BFS loop. - while (!bfs_queue.empty()) { - uint32_t parent_id = bfs_queue.front(); - bfs_queue.pop(); - - // Explore the children of the parent node. - // - // Fix the iteration order of the outgoing edges to ensure that the model is - // always built in the same way (i.e., visiting nodes in the same order). - std::vector outgoing_labels_sorted( - node_outgoing_edge_labels[parent_id].begin(), - node_outgoing_edge_labels[parent_id].end()); - std::sort(outgoing_labels_sorted.begin(), outgoing_labels_sorted.end()); - for (const char edge_label : outgoing_labels_sorted) { - auto child_node = trie_->CreateTraversalCursor(parent_id); - if (!trie_->TryTraverseOneStep(child_node, edge_label)) { - // Should never happen, due to how we built `node_outgoing_edge_labels`; - // see BuildOutgoingEdgeLabelsAlongVocabToken(). - return absl::FailedPreconditionError(absl::StrCat( - "Failed to traverse to child following edge ", - absl::string_view(&edge_label, 1), " at parent ", parent_id, ".")); - } - if (child_node.node_id == trie_suffix_root_) { - // Avoid visiting `trie_suffix_root_` twice. - continue; - } - - // For the child node v, compute failure link f(v) and failure pops F(v). - // - // In the comments below, str(v) is the string on the path from the trie - // root to the node v, and V is the vocabulary used to build the trie. - - int child_data_value = -1; - if (trie_->TryGetData(child_node, child_data_value)) { - uint32_t failure_link = trie_suffix_root_; - // Check whether the current node represents a punctuation char. - // Since the current node has data and thus corresponds to some token, - // it must be in the map `node_id_is_punc_map_` - if (!node_id_is_punc_map_.contains(child_node.node_id)) { - return absl::FailedPreconditionError( - "Failed to find if an end node in the trie is a punctuation char " - "in node_id_is_punc_map_. It should never happen."); - } - if (!no_pretokenization_ && - node_id_is_punc_map_.at(child_node.node_id)) { - // For end-to-end tokenizer, we set the failure link node of every - // punctuation char as a special node trie_punct_failure_link_node_ - // which is a dummy node (no parent, no descendants, failure link is - // null). Hence, by detecting the landing node, we know we just - // matched a punctuation char. We then split it as a single word. - failure_link = trie_punct_failure_link_node_; - } - // Case 1 (easy): str(v) is in V. Assume that during tokenization of a - // word, we reached node v, but can't continue further, because the - // current char from the input word does not match any of the edges - // outgoing from v. In that case, str(v) is already the max match, so - // it's the only wordpiece we add to the list of wordpieces we committed - // to. Hence, F(v) = [str(v)]. The next wordpiece from the current word - // is a suffix, so we move to node f(v) = trie_suffix_root_, which - // represents the suffix indicator (e.g., "##"), from where we continue - // the match process. In summary, we have: - // * f(v) = trie_suffix_root_. - // * F(v) = [str(v)]. - SH_RETURN_IF_ERROR(AssignFailureLinkAndPops( - /*cur_node=*/child_node.node_id, /*failure_link=*/failure_link, - /*one_step_pops=*/{child_data_value}, - /*parent_failure_pops_offset_length=*/ - fast_wordpiece_tokenizer_utils::kNullFailurePopsList)); - bfs_queue.push(child_node.node_id); - continue; - } - - // Case 2 (complex): str(v) is not in V. - // - // Consider the same scenario as in Case 1, where we can't continue - // further from v, but now, str(v) is not a valid wordpiece. Instead, - // we need to consider the wordpieces that the MaxMatch algorithm would - // generate for the beginning of str(v) (these wordpieces are stored in - // F(v)). f(v) (the state we transit to) should correspond to the trie - // node for the remaining suffix of str(v). - // - // We could compute F(v) and f(v) by running the original WordPiece - // algorithm. Instead, we do it even faster, by using F(u) and f(u) (the - // similar info for the parent node u). Intuitively F(v) consists of (1) - // the tokens from F(u) and (2) the possible tokens that the MaxMatch - // algorithm would generate for str(f(u)).c, where str(f(u)) is the suffix - // of str(u) not covered by the concatenation of the tokens from F(u), "." - // means concatenation, and c is the edge label character from u to v. - // - // - // Let u be the parent node, and c be the edge label from u to v. To - // compute f(v) and F(v), the loop below uses a node variable z (called - // `itr_node`) and a list G (called `one_steps_pops`). Initially, z is set - // to be f(u), and G is empty. - // 1. If z is null, f(v) will be null, too (see Note 2 below for what - // this means). We're done. - // 2. Check if there is a trie edge out of node z, for label c, leading - // to node goto(z, c). If so, set f(v) = goto(z,c) and F(v) = F(u) + G. - // We're done and break. - // 3. Otherwise, collect the pop tokens (by G = G + F(z)) and - // follows the failure link (by z = f(z)). - // 4. Goes to Step 1 and continue the loop. - // - // Note 1: processing node v depends on the info for nodes z that are - // closer to the root than v. Due to our use of the BFS traversal, that - // info is guaranteed to exist when we examine node v. - // - // Note 2: f(v) is null means that during the tokenization process of some - // input word, if the trie matching cannot continue at node v, there are - // no failure links that we can follow, and (it can be proved that in such - // a case) the input word can't be tokenized with the current vocab. - // - // For formal discussions and proofs, please refer to the academic paper - // https://arxiv.org/abs/2012.15524 - const FailureStruct& parent_fs = failure_struct_array_[parent_id]; - if (parent_fs.failure_link != fast_wordpiece_tokenizer_utils::kNullNode) { - std::vector one_step_pops; - auto itr_node = trie_->CreateTraversalCursor(parent_fs.failure_link); - while (true) { - if (trie_->TryTraverseOneStep(itr_node, edge_label)) { - // Set the failure link and failure pops for `child_node`. - SH_RETURN_IF_ERROR(AssignFailureLinkAndPops( - /*cur_node=*/child_node.node_id, - /*failure_link=*/itr_node.node_id, one_step_pops, - parent_fs.failure_pops_offset_length)); - break; - } - const FailureStruct& itr_node_fs = - failure_struct_array_[itr_node.node_id]; - if (itr_node_fs.failure_link == - fast_wordpiece_tokenizer_utils::kNullNode) { - // Cannot follow anymore: failure link of `child_node` will be null. - break; - } - // Append the failure pops of `itr_node` to `one_step_pops`. - GetFailurePopsAndAppendToOut(itr_node_fs.failure_pops_offset_length, - one_step_pops); - // Follow the failure link. - trie_->SetTraversalCursor(itr_node, itr_node_fs.failure_link); - } - } - - bfs_queue.push(child_node.node_id); - } - } - - if (!no_pretokenization_ && !suffix_indicator_.empty()) { - // Rewire trie links along suffix_indicator_. - // If the suffix indicator contains a punctuation char, let `u`--(`c`)-->`v` - // be the first trie edge along the suffix indicator such that the edge - // label (i.e. `c`) is a punctuation char. Note that `u`, `v` are trie - // nodes. `c` is the edge label. We make the following change: - // - // Case 1: if `u` is the root, we remove the trie edge from `v` to its child - // along the suffix indicator. - // Case 2: if `u` is not the root, we remove the trie edge from `u` to `v`. - // - // Example 1: if suffix_indicator_ is "##" (as in BERT), we remove the trie - // link from "#" to "##". The goal here is to make sure we match the - // punctuation character "#" as a token by itself, without matching "##" - // (as we split by punctuation, "##" is not a valid token). - // Example 2: if suffix_indicator is "foo#", we remove the trie link from - // "foo" to "foo#". - int cur_pos = 0; - int next_pos = 0; - bool prev_node_id_is_root = false; - auto node = trie_->CreateTraversalCursorPointToRoot(); - UChar32 c; - int suffix_indicator_length = suffix_indicator_.size(); - while (cur_pos < suffix_indicator_length) { - next_pos = cur_pos; - U8_NEXT(suffix_indicator_, next_pos, suffix_indicator_length, c); - prev_node_id_is_root = (node.node_id == trie_->kRootNodeId); - absl::string_view cur_unicode_char(suffix_indicator_.data() + cur_pos, - next_pos - cur_pos); - if (!trie_->TryTraverseSeveralSteps(node, cur_unicode_char)) { - return absl::FailedPreconditionError( - "Cannot locate a character in suffix_indicator_. It should never " - "happen."); - } - if (fast_wordpiece_tokenizer_utils::IsPunctuationOrChineseChar(c)) { - // If the previous node is a root node, read the next char to break the - // link from the current punctuation char to its next child node. - if (prev_node_id_is_root) { - cur_pos = next_pos; - U8_FWD_1(suffix_indicator_, next_pos, suffix_indicator_length); - const absl::string_view next_unicode_char( - suffix_indicator_.data() + cur_pos, next_pos - cur_pos); - auto child_node = node; - if (!trie_->TryTraverseSeveralSteps(child_node, next_unicode_char)) { - return absl::FailedPreconditionError( - "Cannot locate a character in suffix_indicator_. It should " - "never happen."); - } - BreakTrieLinkFromParentToChild(child_node.node_id); - } else { - BreakTrieLinkFromParentToChild(node.node_id); - } - break; - } - cur_pos = next_pos; - } - } - return absl::OkStatus(); -} - -absl::Status FastWordpieceBuilder::AssignFailureLinkAndPops( - uint32_t cur_node, uint32_t failure_link, - const std::vector& one_step_pops, - int parent_failure_pops_offset_length) { - if (failure_link == fast_wordpiece_tokenizer_utils::kNullNode) { - return absl::OkStatus(); - } - FailureStruct& cur_node_fs = failure_struct_array_[cur_node]; - cur_node_fs.failure_link = failure_link; - - // Let v be `cur_node` and u be the parent node. - if (one_step_pops.empty()) { - // Case 1: F(v) = F(u). So we just share the same vector. - cur_node_fs.failure_pops_offset_length = parent_failure_pops_offset_length; - } else { - // Case 2: F(v) = F(u) + `one_step_pops`. We need to create a new vector and - // append to `failure_pops_pool_`. - const int failure_pops_offset = failure_pops_pool_.size(); - if (failure_pops_offset > - fast_wordpiece_tokenizer_utils::kMaxSupportedFailurePoolOffset) { - return absl::FailedPreconditionError(absl::StrCat( - "Failure pops list offset is ", failure_pops_offset, - ", which exceeds maximum supported offset ", - fast_wordpiece_tokenizer_utils::kMaxSupportedFailurePoolOffset, - ". The vocabulary seems to be too large to be supported.")); - } - // First copy F(u). - GetFailurePopsAndAppendToOut(parent_failure_pops_offset_length, - failure_pops_pool_); - // Then append `one_step_pops`. - failure_pops_pool_.insert(failure_pops_pool_.end(), one_step_pops.begin(), - one_step_pops.end()); - const int failure_pops_length = - failure_pops_pool_.size() - failure_pops_offset; - if (failure_pops_length > - fast_wordpiece_tokenizer_utils::kMaxFailurePopsListSize) { - // This should not happen, because `kBitsToEncodeFailurePopsListSize` is - // set to be less than or equal to `kBitsToEncodeVocabTokenLength` (see - // fast_wordpiece_tokenizer_utils.h). - return absl::FailedPreconditionError(absl::StrCat( - "Failure pops list size is ", failure_pops_length, - ", which exceeds maximum supported size ", - fast_wordpiece_tokenizer_utils::kMaxFailurePopsListSize, ".")); - } - - cur_node_fs.failure_pops_offset_length = - fast_wordpiece_tokenizer_utils::EncodeFailurePopList( - failure_pops_offset, failure_pops_length); - } - return absl::OkStatus(); -} - -void FastWordpieceBuilder::GetFailurePopsAndAppendToOut( - uint32_t failure_pops_offset_length, std::vector& out_failure_pops) { - if (failure_pops_offset_length == - fast_wordpiece_tokenizer_utils::kNullFailurePopsList) { - return; - } - int failure_pops_offset, failure_pops_length; - fast_wordpiece_tokenizer_utils::GetFailurePopsOffsetAndLength( - failure_pops_offset_length, failure_pops_offset, failure_pops_length); - out_failure_pops.insert( - out_failure_pops.end(), failure_pops_pool_.begin() + failure_pops_offset, - failure_pops_pool_.begin() + failure_pops_offset + failure_pops_length); -} - -absl::Status FastWordpieceBuilder::PrecomputeResultForSuffixIndicator() { - std::vector subwords; - std::vector begin_offset; - std::vector end_offset; - int num_word_pieces; - // Use the original WordPiece implementation. - LookupStatus status = WordpieceTokenize( - suffix_indicator_, max_bytes_per_token_, /*max_chars_per_subtoken=*/-1, - suffix_indicator_, /*use_unknown_token=*/true, unk_token_, - /*split_unknown_characters=*/false, vocab_.get(), &subwords, - &begin_offset, &end_offset, &num_word_pieces); - precomputed_result_for_suffix_indicator_.reserve(subwords.size()); - if (!status.success) { - return absl::FailedPreconditionError(status.error_msg); - } - for (int i = 0; i < subwords.size(); ++i) { - const absl::optional subword_id = vocab_->LookupId(subwords[i]); - if (!subword_id.has_value()) { - return absl::FailedPreconditionError( - "Impossible because `subwords[i]` must be in the vocabulary!"); - } - TrieVocabToken token(subwords[i], *subword_id, suffix_indicator_); - SH_ASSIGN_OR_RETURN( - int encoded_value, - fast_wordpiece_tokenizer_utils::EncodeToken( - token.TokenId(), token.TokenLengthWithoutSuffixIndicator(), - token.IsSuffixToken())); - precomputed_result_for_suffix_indicator_.push_back(encoded_value); - } - return absl::OkStatus(); -} - -absl::StatusOr FastWordpieceBuilder::ExportToFlatBuffer() const { - flatbuffers::FlatBufferBuilder builder; - - const auto trie_array = builder.CreateVector(trie_array_); - std::vector failure_struct_fbs_vector; - failure_struct_fbs_vector.reserve(failure_struct_array_.size()); - for (const auto& item : failure_struct_array_) { - failure_struct_fbs_vector.emplace_back(item.failure_link, - item.failure_pops_offset_length); - } - const auto failure_structure_array = - builder.CreateVectorOfStructs(failure_struct_fbs_vector); - const auto failure_pops_pool = builder.CreateVector(failure_pops_pool_); - const auto precomputed_result_for_suffix_indicator = - builder.CreateVector(precomputed_result_for_suffix_indicator_); - const auto suffix_indicator = builder.CreateString(suffix_indicator_); - const auto unk_token = builder.CreateString(unk_token_); - - std::vector> vocab_fbs_vector; - std::vector vocab_is_suffix_fbs_vector; - - if (support_detokenization_) { - vocab_fbs_vector.reserve(vocab_->Size()); - for (int i = 0; i < vocab_->Size(); ++i) { - const absl::optional word = vocab_->LookupWord(i); - if (!word.has_value()) { - return absl::FailedPreconditionError( - "Impossible. `token_id` is definitely within the range of vocab " - "token ids; hence LookupWord() should always succeed."); - } - absl::string_view token = word.value(); - bool is_suffix_token = false; - if (!suffix_indicator_.empty() && token != suffix_indicator_ && - absl::StartsWith(token, suffix_indicator_)) { - is_suffix_token = true; - // For suffix tokens, we remove the suffix indicator to save spac and - // for ease of use in detokenization (where the suffix indicator will be - // stripped anyway). - token = token.substr(suffix_indicator_.size()); - } - vocab_fbs_vector.emplace_back(builder.CreateString(token)); - vocab_is_suffix_fbs_vector.emplace_back(is_suffix_token); - } - } - - auto vocab_array = builder.CreateVector(vocab_fbs_vector); - auto vocab_is_suffix_array = builder.CreateVector(vocab_is_suffix_fbs_vector); - - FastWordpieceTokenizerConfigBuilder wtcb(builder); - wtcb.add_trie_array(trie_array); - wtcb.add_failure_struct_array(failure_structure_array); - wtcb.add_failure_pops_pool(failure_pops_pool); - wtcb.add_trie_suffix_root(trie_suffix_root_); - wtcb.add_trie_punct_failure_link_node(trie_punct_failure_link_node_); - - wtcb.add_max_bytes_per_token(max_bytes_per_token_); - wtcb.add_suffix_indicator(suffix_indicator); - wtcb.add_unk_token(unk_token); - wtcb.add_unk_token_id(unk_token_id_); - wtcb.add_precomputed_result_for_suffix_indicator( - precomputed_result_for_suffix_indicator); - wtcb.add_end_to_end(!no_pretokenization_); - wtcb.add_support_detokenization(support_detokenization_); - wtcb.add_vocab_array(vocab_array); - wtcb.add_vocab_is_suffix_array(vocab_is_suffix_array); - FinishFastWordpieceTokenizerConfigBuffer(builder, wtcb.Finish()); - return std::string(reinterpret_cast(builder.GetBufferPointer()), - builder.GetSize()); -} -} // namespace - -absl::StatusOr BuildModelAndExportToFlatBuffer( - const std::vector& vocab, int max_bytes_per_token, - absl::string_view suffix_indicator, absl::string_view unk_token, - bool no_pretokenization, bool support_detokenization) { - FastWordpieceBuilder builder; - SH_RETURN_IF_ERROR(builder.BuildModel(vocab, max_bytes_per_token, - suffix_indicator, unk_token, - no_pretokenization, - support_detokenization)); - SH_ASSIGN_OR_RETURN(std::string flatbuffer, builder.ExportToFlatBuffer()); - return flatbuffer; -} - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.h b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.h index 080d4ab76..74fa28daa 100644 --- a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.h +++ b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.h @@ -15,39 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_MODEL_BUILDER_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_MODEL_BUILDER_H_ -#include -#include +#include "tensorflow/core/kernels/text/fast_wordpiece_tokenizer_model_builder.h" -#include "absl/status/statusor.h" - -namespace tensorflow { -namespace text { - -// Builds a FastWordpieceTokenizer model in flatbuffer format. -// -// Args: -// * vocab: The WordPiece vocabulary. -// * max_bytes_per_token: The max size of the input token. If the input -// length is longer than this, it will be mapped to unk_token. -// * suffix_indicator: Characters prepended to a wordpiece to indicate that -// it is a suffix to another subword, such as "##". -// * unk_token: The unknown token string. -// * no_pretokenization: Whether to pretokenize on punctuation & whitespace. -// Set to `false` when the model is used for general text end-to-end -// tokenization, which combines pre-tokenization (splitting text into words -// on punctuation/whitespaces) and WordPiece (breaking words into subwords) -// into one pass. -//. * support_detokenization: Whether to enable the detokenization function. -// Setting it to true expands the size of the flatbuffer. As a reference, -// When using 120k multilingual BERT WordPiece vocab, the flatbuffer's size -// increases from ~5MB to ~6MB. -// Returns: -// The bytes of the flatbuffer that stores the model. -absl::StatusOr BuildModelAndExportToFlatBuffer( - const std::vector& vocab, int max_bytes_per_token, - absl::string_view suffix_indicator, absl::string_view unk_token, - bool no_pretokenization = false, bool support_detokenization = false); -} // namespace text -} // namespace tensorflow - -#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FASt_WORDPIECE_TOKENIZER_MODEL_BUILDER_H_ +#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_MODEL_BUILDER_H_ diff --git a/tensorflow_text/core/kernels/sentence_fragmenter_v2_kernel.cc b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_generated.h similarity index 60% rename from tensorflow_text/core/kernels/sentence_fragmenter_v2_kernel.cc rename to tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_generated.h index f50d41a31..ddb91e0f3 100644 --- a/tensorflow_text/core/kernels/sentence_fragmenter_v2_kernel.cc +++ b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_generated.h @@ -12,16 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "tensorflow_text/core/kernels/sentence_fragmenter_v2_kernel.h" +#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_MODEL_GENERATED_H_ +#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_MODEL_GENERATED_H_ -#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/text/fast_wordpiece_tokenizer_model_generated.h" -namespace tensorflow { -namespace text { - -REGISTER_KERNEL_BUILDER(Name(SentenceFragmenterV2OpKernel::OpName()) - .Device(tensorflow::DEVICE_CPU), - SentenceFragmenterV2OpKernel); - -} // namespace text -} // namespace tensorflow +#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_MODEL_GENERATED_H_ diff --git a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_test.cc b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_test.cc deleted file mode 100644 index fea96e3ef..000000000 --- a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_test.cc +++ /dev/null @@ -1,2554 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/fast_wordpiece_tokenizer.h" - -#include -#include -#include "absl/flags/flag.h" -#include "icu4c/source/common/unicode/uchar.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.h" - -namespace tensorflow { -namespace text { -namespace { - -using ::testing::AnyOf; -using ::testing::ElementsAre; - -constexpr char kTestConfigPath[] = - "tensorflow_text/python/ops/test_data/" - "fast_wordpiece_tokenizer_model.fb"; - -TEST(FastWordpieceTokenizerTest, LoadAndTokenize) { - std::string config_flatbuffer; - auto status = tensorflow::ReadFileToString( - tensorflow::Env::Default(), kTestConfigPath, &config_flatbuffer); - ASSERT_TRUE(status.ok()); - - // The config_flatbuffer used here is built from the following config: - // * vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - // "##ghz", ""} - // * unk_token = "" - // * suffix_indicator = "##" - // * max_bytes_per_token = 100 - ASSERT_OK_AND_ASSIGN( - auto tokenizer, FastWordpieceTokenizer::Create(config_flatbuffer.data())); - - std::string input = "abcdefghz"; - std::vector output_tokens; - std::vector output_ids; - std::vector output_start_offsets; - std::vector output_end_offsets; - tokenizer.Tokenize(input, &output_tokens, &output_ids, &output_start_offsets, - &output_end_offsets); - EXPECT_THAT(output_tokens, ElementsAre("abc", "##de", "##f", "##ghz")); - EXPECT_THAT(output_ids, ElementsAre(1, 3, 6, 7)); - EXPECT_THAT(output_start_offsets, ElementsAre(0, 3, 5, 6)); - EXPECT_THAT(output_end_offsets, ElementsAre(3, 5, 6, 9)); -} - -using TestPunctuationVersionMismatch = testing::TestWithParam; - -TEST_P(TestPunctuationVersionMismatch, Test) { - // The config_flatbuffer used here is built from the following config: - // * vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - // "##ghz", ""} - // * unk_token = "" - // * suffix_indicator = "##" - // * max_bytes_per_token = 100 - // * end_to_end = True - - const std::string kTestConfigUnicodePath = GetParam(); - - // We test the new punctuation symbol: \341\255\277, which was available in - // Unicode 16: https://www.fileformat.info/info/unicode/char//1b7f/index.htm, - // but not in 15.1. - // We also test an existing punctuation symbol ">". - std::string input = "abc>abc\341\255\277abc"; - - std::string config_flatbuffer; - auto status = tensorflow::ReadFileToString( - tensorflow::Env::Default(), kTestConfigUnicodePath, &config_flatbuffer); - ASSERT_TRUE(status.ok()); - - ASSERT_OK_AND_ASSIGN( - auto tokenizer, FastWordpieceTokenizer::Create(config_flatbuffer.data())); - - std::vector output_tokens; - std::vector output_ids; - std::vector output_start_offsets; - std::vector output_end_offsets; - tokenizer.Tokenize(input, &output_tokens, &output_ids, &output_start_offsets, - &output_end_offsets); - - // If the runtime environment has unicode <=15.1, "\341\255\277" is not a - // punctuation, so "abc\341\255\277abc" is one token. - // If the runtime environment has unicode >=16.0, "\341\255\277" is a - // punctuation, so tokens are "abc", "", "abc" - EXPECT_THAT(output_tokens.size(), AnyOf(3, 5)); - if (!u_ispunct(0x1b7f)) { - // We have a runtime environment of unicode <= 15.1. - EXPECT_THAT(output_tokens, ElementsAre("abc", "", "")); - EXPECT_THAT(output_ids, ElementsAre(1, 8, 8)); - EXPECT_THAT(output_start_offsets, ElementsAre(0, 3, 4)); - EXPECT_THAT(output_end_offsets, ElementsAre(3, 4, 13)); - } else { - // We have a runtime environment of unicode >= 16.0. - EXPECT_THAT(output_tokens, - ElementsAre("abc", "", "abc", "", "abc")); - EXPECT_THAT(output_ids, ElementsAre(1, 8, 1, 8, 1)); - EXPECT_THAT(output_start_offsets, ElementsAre(0, 3, 4, 7, 10)); - EXPECT_THAT(output_end_offsets, ElementsAre(3, 4, 7, 10, 13)); - } -} - -INSTANTIATE_TEST_SUITE_P(FastWordpieceTokenizerPunctuationTest, - TestPunctuationVersionMismatch, - testing::Values( - // Unicode v 15.1 config - "tensorflow_text/python/ops/test_data/" - "fast_wordpiece_tokenizer_model_ver_15_1.fb", - // Unicode v 16.0 config - "tensorflow_text/python/ops/test_data/" - "fast_wordpiece_tokenizer_model_ver_16_0.fb")); - -template -std::string ListToString(const std::vector& list) { - return absl::StrCat("[", absl::StrJoin(list, ", "), "]"); -} - -// Testing spec struct for parameterized tests. -struct Spec { - friend std::ostream& operator<<(std::ostream& os, const Spec& s) { - return os << "vocab: " << ListToString(s.vocab) << ", " - << "unk_token:" << s.unk_token << ", " - << "suffix_indicator:" << s.suffix_indicator << ", " - << "max_bytes_per_token:" << s.max_bytes_per_token << ", " - << "input:" << s.input << ", " - << "expected_tokens:" << ListToString(s.expected_tokens) << ", " - << "expected_token_ids:" << ListToString(s.expected_token_ids) - << ", " - << "expected_token_start_offsets:" - << ListToString(s.expected_token_start_offsets) << ", " - << "expected_token_end_offsets:" - << ListToString(s.expected_token_end_offsets) << std::endl; - } - - std::vector vocab; - std::string unk_token; - std::string suffix_indicator; - int max_bytes_per_token; - std::string input; - std::vector expected_tokens; - std::vector expected_token_ids; - std::vector expected_token_start_offsets = {}; - std::vector expected_token_end_offsets = {}; - // Only used when detokenizing the tokenized ids back to text. - std::string expected_detokenized_text; -}; - -// Parameterized tests specs for Tokenize() when input is a single word. -const std::vector& GetTestSpecsForTokenizeSingleWord() { - static const std::vector& v = *new std::vector{ - // Test suite 1, normal vocabulary. - // Test 0: Empty input. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "", - .expected_tokens = {}, - .expected_token_ids = {}, - .expected_token_start_offsets = {}, - .expected_token_end_offsets = {}, - }, - // Test 1: Basic. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "abcdefghz", - .expected_tokens = {"abc", "##de", "##f", "##ghz"}, - .expected_token_ids = {1, 3, 6, 7}, - .expected_token_start_offsets = {0, 3, 5, 6}, - .expected_token_end_offsets = {3, 5, 6, 9}, - }, - // Test 2: Collect more tokens at the end. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "abcdef", - .expected_tokens = {"abc", "##de", "##f"}, - .expected_token_ids = {1, 3, 6}, - .expected_token_start_offsets = {0, 3, 5}, - .expected_token_end_offsets = {3, 5, 6}, - }, - // Test 3: Unseen character alone. Result is . - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "X", - .expected_tokens = {""}, - .expected_token_ids = {8}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {1}, - }, - // Test 4: Unseen character at the beginning. Result is . - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "Xde", - .expected_tokens = {""}, - .expected_token_ids = {8}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {3}, - }, - // Test 5: Unseen character in the middle. Result is . - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "abcXde", - .expected_tokens = {""}, - .expected_token_ids = {8}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {6}, - }, - // Test 6: Unseen character at the end. Result is . - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "abcX", - .expected_tokens = {""}, - .expected_token_ids = {8}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {4}, - }, - // Test 7: Input has leading suffix indicator. Result is normal. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "##deh", - .expected_tokens = {"##deh"}, - .expected_token_ids = {5}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {5}, - }, - // Test 8: Input has the leading suffix indicator. Vocab has "#" and - // "###". Result is normal. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "#", "###", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "##deh", - .expected_tokens = {"##deh"}, - .expected_token_ids = {5}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {5}, - }, - // Test 9: Input is the suffix indicator itself. Result is . - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "##", - .expected_tokens = {""}, - .expected_token_ids = {8}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {2}, - }, - // Test 10: [PAD] is in the vocabulary. Input is [PAD]. - { - .vocab = {"[pad]", "a", "abc", "abcdefghi", "##de", "##defgxy", - "##deh", "##f", "##ghz", "#", "###", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "[pad]", - .expected_tokens = {"[pad]"}, - .expected_token_ids = {0}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {5}, - }, - // Test 11: [PAD] is not in the vocabulary. Input is [PAD]. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "#", "###", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "[pad]", - .expected_tokens = {""}, - .expected_token_ids = {10}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {5}, - }, - - // Test suite 2, input contains #. - // Test 12: Input is #. Result is . - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "#", - .expected_tokens = {""}, - .expected_token_ids = {8}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {1}, - }, - // Test 13: Input is #. Result is not . - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "#", "###", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "#", - .expected_tokens = {"#"}, - .expected_token_ids = {8}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {1}, - }, - // Test 14: Input is #. The suffix indicator is in the vocab. Result is - // not . - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "#", "###", "##", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "#", - .expected_tokens = {"#"}, - .expected_token_ids = {8}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {1}, - }, - // Test 15: Input is the suffix indicator itself. Result is . - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "#", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "##", - .expected_tokens = {""}, - .expected_token_ids = {9}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {2}, - }, - // Test 16: Input is the suffix indicator itself. Result is not . - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "#", "###", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "##", - .expected_tokens = {"#", "###"}, - .expected_token_ids = {8, 9}, - .expected_token_start_offsets = {0, 1}, - .expected_token_end_offsets = {1, 2}, - }, - // Test 17: Input is the suffix indicator itself. The suffix indicator is - // in the vocab. Result is not . - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "#", "###", "##", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "##", - .expected_tokens = {"##"}, - .expected_token_ids = {10}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {2}, - }, - // Test 18: Input is ###. Result is . - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "#", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "###", - .expected_tokens = {""}, - .expected_token_ids = {9}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {3}, - }, - // Test 19: Input is ###. Result is not . - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "#", "###", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "###", - .expected_tokens = {"###"}, - .expected_token_ids = {9}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {3}, - }, - // Test 20: Input is ###. The suffix indicator is in the vocab. Result is - // not . - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "#", "###", "##", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "###", - .expected_tokens = {"###"}, - .expected_token_ids = {9}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {3}, - }, - // Test 21: Input is ####. Result is not . - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "#", "###", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "####", - .expected_tokens = {"###", "###"}, - .expected_token_ids = {9, 9}, - .expected_token_start_offsets = {0, 3}, - .expected_token_end_offsets = {3, 4}, - }, - // Test 22: Input is ####. The suffix indicator is in the vocab. Result - // is not . - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "#", "###", "##", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "####", - .expected_tokens = {"###", "###"}, - .expected_token_ids = {9, 9}, - .expected_token_start_offsets = {0, 3}, - .expected_token_end_offsets = {3, 4}, - }, - - // Test suite 3, the vocabulary contains empty tokens ("", "##"). - // Test 23: The empty prefix token ("") and the empty suffix token ("##") - // are in the vocabulary. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "", "##", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "abcdefghz", - .expected_tokens = {"abc", "##de", "##f", "##ghz"}, - .expected_token_ids = {1, 3, 6, 7}, - .expected_token_start_offsets = {0, 3, 5, 6}, - .expected_token_end_offsets = {3, 5, 6, 9}, - }, - // Test 24: The empty prefix token ("") and the empty suffix ("##") token - // are in the vocabulary. Input is empty. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "", "##", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "", - .expected_tokens = {}, - .expected_token_ids = {}, - .expected_token_start_offsets = {}, - .expected_token_end_offsets = {}, - }, - // Test 25: The empty prefix token ("") and the empty suffix token ("##") - // are in the vocabulary. Input is the suffix indicator. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "", "##", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "##", - .expected_tokens = {"##"}, - .expected_token_ids = {9}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {2}, - }, - // Test 26: The empty prefix token ("") and the empty suffix token ("##") - // are in the vocabulary. There are vocab tokens after the empty vocab - // tokens in the vocab. Result is one vocab token. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "", "##", "xyz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "xyz", - .expected_tokens = {"xyz"}, - .expected_token_ids = {10}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {3}, - }, - // Test 27: The empty prefix token ("") and the empty suffix ("##") token - // are in the vocabulary. There are vocab tokens after the empty vocab - // tokens in the vocab. Result has multiple tokens. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "", "##", "xy", "##z", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "xyz", - .expected_tokens = {"xy", "##z"}, - .expected_token_ids = {10, 11}, - .expected_token_start_offsets = {0, 2}, - .expected_token_end_offsets = {2, 3}, - }, - // Test 28: The empty prefix token ("") and the empty suffix token ("##") - // are in the vocabulary. Input has the leading suffix indicator. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "", "##", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "##deh", - .expected_tokens = {"##deh"}, - .expected_token_ids = {5}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {5}, - }, - - // Test suite 4, No suffix tokens in the vocabulary. - // Test 29: No suffix tokens in the vocabulary. Result is normal. - { - .vocab = {"a", "abc", "abcdefghi", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "abc", - .expected_tokens = {"abc"}, - .expected_token_ids = {1}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {3}, - }, - // Test 30: No suffix tokens in the vocabulary. Result is . - { - .vocab = {"a", "abc", "de", "abcdefghi", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "abcde", - .expected_tokens = {""}, - .expected_token_ids = {4}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {5}, - }, - // Test 31: No suffix tokens in the vocabulary. A different input. Result - // is . - { - .vocab = {"a", "abc", "de", "abcdefghi", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "abcdz", - .expected_tokens = {""}, - .expected_token_ids = {4}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {5}, - }, - // Test 32: No suffix tokens in the vocabulary. Input is #. Result is - // - { - .vocab = {"a", "abc", "de", "abcdefghi", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "#", - .expected_tokens = {""}, - .expected_token_ids = {4}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {1}, - }, - // Test 33: No suffix tokens in the vocabulary. Input is #. Result is not - // . - { - .vocab = {"a", "abc", "de", "abcdefghi", "", "#"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "#", - .expected_tokens = {"#"}, - .expected_token_ids = {5}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {1}, - }, - // Test 34: No suffix tokens in the vocabulary. Vocab has the suffix - // indicator. Input is #. - { - .vocab = {"a", "abc", "de", "abcdefghi", "", "##"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "#", - .expected_tokens = {""}, - .expected_token_ids = {4}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {1}, - }, - // Test 35: No suffix tokens in the vocabulary. Input is ##. Result is - // . - { - .vocab = {"a", "abc", "de", "abcdefghi", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "##", - .expected_tokens = {""}, - .expected_token_ids = {4}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {2}, - }, - // Test 36: No suffix tokens in the vocabulary. Vocab has the suffix - // indicator. Input is #. Result is . - { - .vocab = {"a", "abc", "de", "abcdefghi", "", "##"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "#", - .expected_tokens = {""}, - .expected_token_ids = {4}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {1}, - }, - // Test 37: No suffix tokens in the vocabulary. Vocab has the suffix - // indicator. Input is ##. - { - .vocab = {"a", "abc", "de", "abcdefghi", "", "##"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "##", - .expected_tokens = {"##"}, - .expected_token_ids = {5}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {2}, - }, - // Test 38: No suffix tokens in the vocabulary. Vocab has '#'. Input is - // ##. Result is . - { - .vocab = {"a", "abc", "de", "abcdefghi", "", "#"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "##", - .expected_tokens = {""}, - .expected_token_ids = {4}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {2}, - }, - // Test 39: No suffix tokens in the vocabulary. Vocab has the suffix - // indicator and "#". Input is ##. - { - .vocab = {"a", "abc", "de", "abcdefghi", "", "##", "#"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "##", - .expected_tokens = {"##"}, - .expected_token_ids = {5}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {2}, - }, - // Test 40: No suffix tokens in the vocabulary. Input is ###. Result is - // . - { - .vocab = {"a", "abc", "de", "abcdefghi", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "###", - .expected_tokens = {""}, - .expected_token_ids = {4}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {3}, - }, - // Test 41: No suffix tokens in the vocabulary. Vocab has '#'. Input is - // ###. Result is . - { - .vocab = {"a", "abc", "de", "abcdefghi", "", "#"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "###", - .expected_tokens = {""}, - .expected_token_ids = {4}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {3}, - }, - // Test 42: No suffix tokens in the vocabulary. Vocab has the suffix - // indicator. Input is ###. - { - .vocab = {"a", "abc", "de", "abcdefghi", "", "##"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "###", - .expected_tokens = {""}, - .expected_token_ids = {4}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {3}, - }, - // Test 43: There is only one suffix tokens "###" in the vocabulary. - // Input is ###. - { - .vocab = {"a", "abc", "de", "abcdefghi", "", "###"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "###", - .expected_tokens = {"###"}, - .expected_token_ids = {5}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {3}, - }, - - // Test suite 5, No prefix tokens in the vocabulary. - // Test 44: No prefix tokens in the vocabulary. Input is a prefix token. - { - .vocab = {"##a", "##abc", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "abc", - .expected_tokens = {""}, - .expected_token_ids = {2}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {3}, - }, - // Test 45: No prefix tokens in the vocabulary. Input is a suffix token. - { - .vocab = {"##a", "##abc", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "##abc", - .expected_tokens = {"##abc"}, - .expected_token_ids = {1}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {5}, - }, - - // Test suite 6, more tests. - // Test 46: Input is empty. - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "", - .expected_tokens = {}, - .expected_token_ids = {}, - .expected_token_start_offsets = {}, - .expected_token_end_offsets = {}, - }, - // Test 47: Normal input. - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "unwanted", - .expected_tokens = {"un", "##want", "##ed"}, - .expected_token_ids = {7, 4, 5}, - .expected_token_start_offsets = {0, 2, 6}, - .expected_token_end_offsets = {2, 6, 8}, - }, - // Test 48: Unseen character. - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "unwantedX", - .expected_tokens = {""}, - .expected_token_ids = {1}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {9}, - }, - - // Test suite 7. Testing on long inputs (kMaxInputCharPerWord = 100). The - // word length below means the number of utf-8 bytes. - // Test 49: Word length = 99 (i.e., kMaxInputCharPerWord-1). - { - .vocab = {"", "0123456789", "##0123456789", "##012345678"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "01234567890123456789012345678901234567890123456789012345678" - "9012345678901234567890123456789012345678", - .expected_tokens = {"0123456789", "##0123456789", "##0123456789", - "##0123456789", "##0123456789", "##0123456789", - "##0123456789", "##0123456789", "##0123456789", - "##012345678"}, - .expected_token_ids = {1, 2, 2, 2, 2, 2, 2, 2, 2, 3}, - .expected_token_start_offsets = {0, 10, 20, 30, 40, 50, 60, 70, 80, - 90}, - .expected_token_end_offsets = {10, 20, 30, 40, 50, 60, 70, 80, 90, - 99}, - }, - // Test 50: Word length = 100 (i.e., kMaxInputCharPerWord). Contains a - // multi-bytes Unicode char. - { - .vocab = {"", "0123456789", "##0123456789", "##01234567", - /*U+05C3*/ "##\xD7\x83", "##a"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "01234567890123456789012345678901234567890123456789012345678" - "901234567890123456789012345678901234567\xD7\x83", - .expected_tokens = {"0123456789", "##0123456789", "##0123456789", - "##0123456789", "##0123456789", "##0123456789", - "##0123456789", "##0123456789", "##0123456789", - "##01234567", "##\xD7\x83"}, - .expected_token_ids = {1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4}, - .expected_token_start_offsets = {0, 10, 20, 30, 40, 50, 60, 70, 80, - 90, 98}, - .expected_token_end_offsets = {10, 20, 30, 40, 50, 60, 70, 80, 90, 98, - 100}, - }, - // Test 51: Word length = 101 (i.e., kMaxInputCharPerWord+1). Contains a - // multi-bytes Unicode char. - { - .vocab = {"", "0123456789", "##0123456789", "##012345678", - /*U+05C3*/ "##\xD7\x83", "##a"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "01234567890123456789012345678901234567890123456789012345678" - "9012345678901234567890123456789012345678\xD7\x83", - .expected_tokens = {""}, - .expected_token_ids = {0}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {101}, - }, - // Test 52: Word length = 101 (i.e., kMaxInputCharPerWord+1). - { - .vocab = {"", "0123456789", "##0123456789", "##012345678", - "##a"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "01234567890123456789012345678901234567890123456789012345678" - "90123456789012345678901234567890123456789a", - .expected_tokens = {""}, - .expected_token_ids = {0}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {101}, - }, - // Test 53: Word length = 99 (i.e., kMaxInputCharPerWord-1). The word is - // not tokenizable. - { - .vocab = {"", "0123456789", "##0123456789", - "##012345678\xe2\x80\x8B", "##\xe2\x80\x8B"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "01234567890123456789012345678901234567890123456789012345678" - "9012345678901234567890123456789012345678", - .expected_tokens = {""}, - .expected_token_ids = {0}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {99}, - }, - - // Test suite 8. Normal vocab and inputs. - // Test 54. - { - .vocab = {"", "play", "see", "##ing", "##ed", "##es", "##ly", - "##on", "##s", "##able"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "play", - .expected_tokens = {"play"}, - .expected_token_ids = {1}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {4}, - }, - // Test 55. - { - .vocab = {"", "play", "see", "##ing", "##ed", "##es", "##ly", - "##on", "##s", "##able"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "playing", - .expected_tokens = {"play", "##ing"}, - .expected_token_ids = {1, 3}, - .expected_token_start_offsets = {0, 4}, - .expected_token_end_offsets = {4, 7}, - }, - // Test 56. - { - .vocab = {"", "play", "see", "##ing", "##ed", "##es", "##ly", - "##on", "##s", "##able"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "sees", - .expected_tokens = {"see", "##s"}, - .expected_token_ids = {2, 8}, - .expected_token_start_offsets = {0, 3}, - .expected_token_end_offsets = {3, 4}, - }, - // Test 57. - { - .vocab = {"", "play", "see", "##ing", "##ed", "##es", "##ly", - "##on", "##s", "##able", "u", "un", "##de", "##deni"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "undeniable", - .expected_tokens = {"un", "##deni", "##able"}, - .expected_token_ids = {11, 13, 9}, - .expected_token_start_offsets = {0, 2, 6}, - .expected_token_end_offsets = {2, 6, 10}, - }, - // Test 58. - { - .vocab = {"", "play", "see", "##ing", "##ed", "##es", "##ly", - "##on", "##s", "##able", "u", "un", "##de", "##deni", - "undeniable"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "undeniable", - .expected_tokens = {"undeniable"}, - .expected_token_ids = {14}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {10}, - }, - // Test 59. - { - .vocab = {"", "s", "su", "super", "##per", "##ca", - "##cali", "##f", "##fra", "##g", "##gil", "##i", - "##is", "##istic", "##e", "##ex", "##pi", "##pia", - "##li", "##lido", "##ci", "##cious", "##ous"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "supercalifragilisticexpialidocious", - .expected_tokens = {"super", "##cali", "##fra", "##gil", "##istic", - "##ex", "##pia", "##lido", "##cious"}, - .expected_token_ids = {3, 6, 8, 10, 13, 15, 17, 19, 21}, - .expected_token_start_offsets = {0, 5, 9, 12, 15, 20, 22, 25, 29}, - .expected_token_end_offsets = {5, 9, 12, 15, 20, 22, 25, 29, 34}, - }, - - // Test suite 9. Different unk_tokens. - // Test 60: Basic with a different unk_token. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "[unk]"}, - .unk_token = "[unk]", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "abcdefghz", - .expected_tokens = {"abc", "##de", "##f", "##ghz"}, - .expected_token_ids = {1, 3, 6, 7}, - .expected_token_start_offsets = {0, 3, 5, 6}, - .expected_token_end_offsets = {3, 5, 6, 9}, - }, - // Test 61: Untokenizable with a different unk_token. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "[unk]"}, - .unk_token = "[unk]", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "abcdefghzX", - .expected_tokens = {"[unk]"}, - .expected_token_ids = {8}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {10}, - }, - - // Test suite 10. Input is the unk_token. - // Test 62: Input is the unk_token. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "[unk]"}, - .unk_token = "[unk]", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "[unk]", - .expected_tokens = {"[unk]"}, - .expected_token_ids = {8}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {5}, - }, - - // Test suite 11. Input is the suffix indicator itself. - // Test 63: Suffix indicator is "##" and is tokenizable. - { - .vocab = {"#", "###", "a", "abc", "abcdefghi", "##de", "##defgxy", - "##deh", "##f", "##ghz", "[unk]"}, - .unk_token = "[unk]", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "##", - .expected_tokens = {"#", "###"}, - .expected_token_ids = {0, 1}, - .expected_token_start_offsets = {0, 1}, - .expected_token_end_offsets = {1, 2}, - }, - // Test 64: Suffix indicator is "##" but not tokenizable. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "[unk]"}, - .unk_token = "[unk]", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "##", - .expected_tokens = {"[unk]"}, - .expected_token_ids = {8}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {2}, - }, - // Test 65: Suffix indicator is "##" and "##" is in the vocabulary. - { - .vocab = {"#", "###", "##", "a", "abc", "abcdefghi", "##de", - "##defgxy", "##deh", "##f", "##ghz", "[unk]"}, - .unk_token = "[unk]", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "##", - .expected_tokens = {"##"}, - .expected_token_ids = {2}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {2}, - }, - // Test 66: Suffix indicator is "###" and is tokenizable. - { - .vocab = {"#", "####", "[unk]"}, - .unk_token = "[unk]", - .suffix_indicator = "###", - .max_bytes_per_token = 100, - .input = "###", - .expected_tokens = {"#", "####", "####"}, - .expected_token_ids = {0, 1, 1}, - .expected_token_start_offsets = {0, 1, 2}, - .expected_token_end_offsets = {1, 2, 3}, - }, - // Test 67: Suffix indicator is "###" and is tokenizable. A different - // vocab. - { - .vocab = {"#", "####", "##", "[unk]"}, - .unk_token = "[unk]", - .suffix_indicator = "###", - .max_bytes_per_token = 100, - .input = "###", - .expected_tokens = {"##", "####"}, - .expected_token_ids = {2, 1}, - .expected_token_start_offsets = {0, 2}, - .expected_token_end_offsets = {2, 3}, - }, - - // Test suite 12, different suffix indicators. - // Test 68: A different suffix indicator. - { - .vocab = {"a", "abc", "abcdefghi", "de", "defgxy", - "deh", "f", "ghz", ""}, - .unk_token = "", - .suffix_indicator = "", - .max_bytes_per_token = 100, - .input = "abcdefghz", - .expected_tokens = {"abc", "de", "f", "ghz"}, - .expected_token_ids = {1, 3, 6, 7}, - .expected_token_start_offsets = {0, 3, 5, 6}, - .expected_token_end_offsets = {3, 5, 6, 9}, - }, - // Test 69: The suffix indicator is empty. - { - .vocab = {"a", "abc", "abcdefghi", "de", "defgxy", "deh", "f", "ghz", - ""}, - .unk_token = "", - .suffix_indicator = "", - .max_bytes_per_token = 100, - .input = "abcdefghz", - .expected_tokens = {"abc", "de", "f", "ghz"}, - .expected_token_ids = {1, 3, 6, 7}, - .expected_token_start_offsets = {0, 3, 5, 6}, - .expected_token_end_offsets = {3, 5, 6, 9}, - }, - // Test 70: The suffix indicator is empty. Input is empty. - { - .vocab = {"a", "abc", "abcdefghi", "de", "defgxy", "deh", "f", "ghz", - ""}, - .unk_token = "", - .suffix_indicator = "", - .max_bytes_per_token = 100, - .input = "", - .expected_tokens = {}, - .expected_token_ids = {}, - .expected_token_start_offsets = {}, - .expected_token_end_offsets = {}, - }, - - // Test suite 13, multi-bytes chars in vocab and input. - // The following codepoints and their utf-8 encodings are used here: - // * U+03B1 (Greek Small Letter Alpha): "\xCE\xB1" - // * U+03B2 (Greek Small Letter Beta): "\xCE\xB2" - // * U+2EDA (Cjk Radical C-Simplified Leaf): b'\xE2\xBB\x9A' - // * U+2EDB (Cjk Radical C-Simplified Wind): b'\xE2\xBB\x9B' - // Test 71: multi-bytes chars in the vocab. - { - .vocab = {"", "abc", "a", "##bc", "a\xCE\xB1\xCE\xB2", - "\xCE\xB1", "##\xCE\xB1", "##\xCE\xB2", "\xE2\xBB\x9A"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "abc", - .expected_tokens = {"abc"}, - .expected_token_ids = {1}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {3}, - }, - // Test 72: input contains 2-bytes chars. - { - .vocab = {"", "abc", "a", "##bc", "a\xCE\xB1\xCE\xB2", - "\xCE\xB1", "##\xCE\xB1", "##\xCE\xB2", "\xE2\xBB\x9A"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "a\xCE\xB1\xCE\xB2\xCE\xB1\xCE\xB2", - .expected_tokens = {"a\xCE\xB1\xCE\xB2", "##\xCE\xB1", "##\xCE\xB2"}, - .expected_token_ids = {4, 6, 7}, - .expected_token_start_offsets = {0, 5, 7}, - .expected_token_end_offsets = {5, 7, 9}, - }, - // Test 73: input contains 3-bytes chars. - { - .vocab = {"", "abc", "a", "##bc", "a\xCE\xB1\xCE\xB2", - "\xCE\xB1", "##\xCE\xB1", "##\xCE\xB2", "\xE2\xBB\x9A"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "\xE2\xBB\x9A" - "bc\xCE\xB1", - .expected_tokens = {"\xE2\xBB\x9A", "##bc", "##\xCE\xB1"}, - .expected_token_ids = {8, 3, 6}, - .expected_token_start_offsets = {0, 3, 5}, - .expected_token_end_offsets = {3, 5, 7}, - }, - // Test 74: input contains unseen multi-bytes chars. - { - .vocab = {"", "abc", "a", "##bc", "a\xCE\xB1\xCE\xB2", - "\xCE\xB1", "##\xCE\xB1", "##\xCE\xB2", "\xE2\xBB\x9A"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "\xE2\xBB\x9B", - .expected_tokens = {""}, - .expected_token_ids = {0}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {3}, - }, - }; - return v; -} - -using TestTokenizeSingleWord = testing::TestWithParam; - -TEST_P(TestTokenizeSingleWord, Test) { - const Spec& spec = GetParam(); - ASSERT_OK_AND_ASSIGN( - std::string flatbuffer, - BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token, - spec.suffix_indicator, spec.unk_token, - /*no_pretokenization=*/true)); - ASSERT_OK_AND_ASSIGN(auto tokenizer, - FastWordpieceTokenizer::Create(flatbuffer.data())); - - std::vector output_tokens; - std::vector output_ids; - std::vector output_begin_offsets; - std::vector output_end_offsets; - tokenizer.Tokenize(spec.input, &output_tokens, &output_ids, - &output_begin_offsets, &output_end_offsets); - EXPECT_THAT(output_tokens, spec.expected_tokens); - EXPECT_THAT(output_ids, spec.expected_token_ids); - EXPECT_THAT(output_begin_offsets, spec.expected_token_start_offsets); - EXPECT_THAT(output_end_offsets, spec.expected_token_end_offsets); -} - -TEST_P(TestTokenizeSingleWord, TestNoOutputPieces) { - const Spec& spec = GetParam(); - ASSERT_OK_AND_ASSIGN( - std::string flatbuffer, - BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token, - spec.suffix_indicator, spec.unk_token, - true /* no_pretokenization */)); - ASSERT_OK_AND_ASSIGN(auto tokenizer, - FastWordpieceTokenizer::Create(flatbuffer.data())); - - std::vector output_ids; - std::vector output_begin_offsets; - std::vector output_end_offsets; - tokenizer.Tokenize(spec.input, &output_ids, &output_begin_offsets, - &output_end_offsets); - EXPECT_THAT(output_ids, spec.expected_token_ids); - EXPECT_THAT(output_begin_offsets, spec.expected_token_start_offsets); - EXPECT_THAT(output_end_offsets, spec.expected_token_end_offsets); -} - -TEST_P(TestTokenizeSingleWord, TestNoOutputPiecesOnlyOutputIds) { - const Spec& spec = GetParam(); - ASSERT_OK_AND_ASSIGN( - std::string flatbuffer, - BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token, - spec.suffix_indicator, spec.unk_token, - true /* no_pretokenization */)); - ASSERT_OK_AND_ASSIGN(auto tokenizer, - FastWordpieceTokenizer::Create(flatbuffer.data())); - - std::vector output_ids; - tokenizer.Tokenize(spec.input, &output_ids); - EXPECT_THAT(output_ids, spec.expected_token_ids); -} - -TEST_P(TestTokenizeSingleWord, TestNoOutputPiecesWithPositiveSentenceOffsets) { - const Spec& spec = GetParam(); - const int offset_in_sentence = 123; - ASSERT_OK_AND_ASSIGN( - std::string flatbuffer, - BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token, - spec.suffix_indicator, spec.unk_token, - true /* no_pretokenization */)); - ASSERT_OK_AND_ASSIGN(auto tokenizer, - FastWordpieceTokenizer::Create(flatbuffer.data())); - - std::vector output_ids; - std::vector output_begin_offsets; - std::vector output_end_offsets; - std::vector expected_token_start_offsets( - spec.expected_token_start_offsets); - std::vector expected_token_end_offsets(spec.expected_token_end_offsets); - - for (int& offset : expected_token_start_offsets) { - offset += offset_in_sentence; - } - for (int& offset : expected_token_end_offsets) { - offset += offset_in_sentence; - } - - tokenizer.Tokenize(spec.input, &output_ids, &output_begin_offsets, - &output_end_offsets, - /*input_word_offset_in_text=*/offset_in_sentence); - EXPECT_THAT(output_begin_offsets, expected_token_start_offsets); - EXPECT_THAT(output_end_offsets, expected_token_end_offsets); -} - -INSTANTIATE_TEST_SUITE_P( - FastWordpieceTokenizerParameterizedTest, TestTokenizeSingleWord, - testing::ValuesIn(GetTestSpecsForTokenizeSingleWord())); - -// Test End-to-end FastWordPieceTokenization for tokenizing general texts. -const std::vector& GetTestSpecsForTokenizeText() { - static const std::vector& v = *new std::vector{ - // Test suite 1. End-to-end test including whitespace tokenization. - // Test 0: Input is empty. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "", - .expected_tokens = {}, - .expected_token_ids = {}, - .expected_token_start_offsets = {}, - .expected_token_end_offsets = {}, - }, - // Test 1: Input has only spaces. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " \t ", - .expected_tokens = {}, - .expected_token_ids = {}, - .expected_token_start_offsets = {}, - .expected_token_end_offsets = {}, - }, - // Test 2: Input is a single word. Result is OK. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "abcdef", - .expected_tokens = {"abc", "##de", "##f"}, - .expected_token_ids = {1, 3, 6}, - .expected_token_start_offsets = {0, 3, 5}, - .expected_token_end_offsets = {3, 5, 6}, - }, - // Test 3: Input is a single word. Result is . - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "abcd", - .expected_tokens = {""}, - .expected_token_ids = {8}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {4}, - }, - // Test 4: Input contains multiple words, with several whitespaces in the - // middle. Result is OK. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "abcdef \t\t \tabcf", - .expected_tokens = {"abc", "##de", "##f", "abc", "##f"}, - .expected_token_ids = {1, 3, 6, 1, 6}, - .expected_token_start_offsets = {0, 3, 5, 11, 14}, - .expected_token_end_offsets = {3, 5, 6, 14, 15}, - }, - // Test 5: Input has multiple words, with leading and trailing spaces. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "\tabcdef abcf ", - .expected_tokens = {"abc", "##de", "##f", "abc", "##f"}, - .expected_token_ids = {1, 3, 6, 1, 6}, - .expected_token_start_offsets = {1, 4, 6, 9, 12}, - .expected_token_end_offsets = {4, 6, 7, 12, 13}, - }, - // Test 6: Input contains suffix indicator as words. Suffix indicator is - // in vocab. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "", "##"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "## abcde ## ##a", - .expected_tokens = {"", "", "abc", "##de", "", "", - "", "", "a"}, - .expected_token_ids = {8, 8, 1, 3, 8, 8, 8, 8, 0}, - .expected_token_start_offsets = {0, 1, 3, 6, 9, 10, 13, 14, 15}, - .expected_token_end_offsets = {1, 2, 6, 8, 10, 11, 14, 15, 16}, - }, - // Test 7: Input contains suffix indicator as words. Suffix indicator is - // in vocab. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", "", "##"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "## abcde ## ##a ##f", - .expected_tokens = {"", "", "abc", "##de", "", "", - "", "", "a", "", "", ""}, - .expected_token_ids = {8, 8, 1, 3, 8, 8, 8, 8, 0, 8, 8, 8}, - .expected_token_start_offsets = {0, 1, 3, 6, 9, 10, 13, 14, 15, 17, - 18, 19}, - .expected_token_end_offsets = {1, 2, 6, 8, 10, 11, 14, 15, 16, 18, 19, - 20}, - }, - // Test 8: Input contains suffix indicator as words. Suffix indicator is - // not in vocab. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "##", - .expected_tokens = {"", ""}, - .expected_token_ids = {8, 8}, - .expected_token_start_offsets = {0, 1}, - .expected_token_end_offsets = {1, 2}, - }, - // Test 9: Input contains unseen character words. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " a \tabcdeX \rabcdefghz abcdeXfghz Xabc abcd", - .expected_tokens = {"a", "", "abc", "##de", "##f", "##ghz", - "", "", ""}, - .expected_token_ids = {0, 8, 1, 3, 6, 7, 8, 8, 8}, - .expected_token_start_offsets = {1, 4, 12, 15, 17, 18, 22, 33, 38}, - .expected_token_end_offsets = {2, 10, 15, 17, 18, 21, 32, 37, 42}, - }, - // Test 10: Input contains untokenizable words. No spaces before or after. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "abcdefgx", - .expected_tokens = {""}, - .expected_token_ids = {8}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {8}, - }, - // Test 11: Input contains untokenizable words. One space before. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " abcdefgx", - .expected_tokens = {""}, - .expected_token_ids = {8}, - .expected_token_start_offsets = {1}, - .expected_token_end_offsets = {9}, - }, - // Test 12: Input contains untokenizable words. One space after. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "abcdefgx ", - .expected_tokens = {""}, - .expected_token_ids = {8}, - .expected_token_start_offsets = {0}, - .expected_token_end_offsets = {8}, - }, - // Test 13: Input has untokenizable words. One space before and after. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " abcdefgx ", - .expected_tokens = {""}, - .expected_token_ids = {8}, - .expected_token_start_offsets = {1}, - .expected_token_end_offsets = {9}, - }, - // Test 14: Input contains mix words with unseen characters. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " a \tabcdeX \rabcdefghz abcdeXfghz Xabc", - .expected_tokens = {"a", "", "abc", "##de", "##f", "##ghz", - "", ""}, - .expected_token_ids = {0, 8, 1, 3, 6, 7, 8, 8}, - .expected_token_start_offsets = {1, 4, 12, 15, 17, 18, 22, 33}, - .expected_token_end_offsets = {2, 10, 15, 17, 18, 21, 32, 37}, - }, - // Test 15: Another basic test. - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "unwanted running", - .expected_tokens = {"un", "##want", "##ed", "runn", "##ing"}, - .expected_token_ids = {7, 4, 5, 8, 9}, - .expected_token_start_offsets = {0, 2, 6, 9, 13}, - .expected_token_end_offsets = {2, 6, 8, 13, 16}, - }, - // Test 16: Input has unseen characters. - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "unwantedX running", - .expected_tokens = {"", "runn", "##ing"}, - .expected_token_ids = {0, 8, 9}, - .expected_token_start_offsets = {0, 10, 14}, - .expected_token_end_offsets = {9, 14, 17}, - }, - // Test 17: Input contains mix words with untokenizable words. - { - .vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f", - "##ghz", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " a \tabcdeX \rabcdefghz abcdeXfghz ab", - .expected_tokens = {"a", "", "abc", "##de", "##f", "##ghz", - "", ""}, - .expected_token_ids = {0, 8, 1, 3, 6, 7, 8, 8}, - .expected_token_start_offsets = {1, 4, 12, 15, 17, 18, 22, 33}, - .expected_token_end_offsets = {2, 10, 15, 17, 18, 21, 32, 35}, - }, - // Test 18: Input and vocab contains Unicode tokens. The Trie matching - // loop would stop at matching a partial word. - { - .vocab = {"\xE2\x82\xAC", "a", "abc", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " \xE2\x82\xAD abc", - .expected_tokens = {"", "abc"}, - .expected_token_ids = {3, 2}, - .expected_token_start_offsets = {1, 5}, - .expected_token_end_offsets = {4, 8}, - }, - // Test 19: Contains suffix indicator as a word. - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing", ".", "##.", "...", "#", "###"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "##", - .expected_tokens = {"#", "#"}, - .expected_token_ids = {13, 13}, - .expected_token_start_offsets = {0, 1}, - .expected_token_end_offsets = {1, 2}, - }, - // Test 20: unknown words. - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing", ".", "##.", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " X wantXwanted. \t ", - .expected_tokens = {"", "", "."}, - .expected_token_ids = {1, 1, 10}, - .expected_token_start_offsets = {1, 3, 14}, - .expected_token_end_offsets = {2, 14, 15}, - }, - // Test 21: After the loop, the next character is whitespace. - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing", ".", "##.", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " wanted. \t wa..", - .expected_tokens = {"want", "##ed", ".", "wa", ".", "."}, - .expected_token_ids = {3, 5, 10, 6, 10, 10}, - .expected_token_start_offsets = {2, 6, 8, 13, 15, 16}, - .expected_token_end_offsets = {6, 8, 9, 15, 16, 17}, - }, - // Test 22: After the loop, the next character is not a whitespace. - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing", ".", "##.", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " wanted.x \t wa..", - .expected_tokens = {"want", "##ed", ".", "", "wa", ".", "."}, - .expected_token_ids = {3, 5, 10, 1, 6, 10, 10}, - .expected_token_start_offsets = {2, 6, 8, 9, 14, 16, 17}, - .expected_token_end_offsets = {6, 8, 9, 10, 16, 17, 18}, - }, - // Test 23: After the loop, the next character is not a whitespace. And a - // trailing space. - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing", ".", "##.", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " wanted.x \t wa.. \n", - .expected_tokens = {"want", "##ed", ".", "", "wa", ".", "."}, - .expected_token_ids = {3, 5, 10, 1, 6, 10, 10}, - .expected_token_start_offsets = {2, 6, 8, 9, 14, 16, 17}, - .expected_token_end_offsets = {6, 8, 9, 10, 16, 17, 18}, - }, - // Test 24: After the loop, it's in the middle of a whitespace. The - // previous is tokenizable. - { - .vocab = {"", "want", "##want", "##ed", "wa", ".", "##.", "...", - "##\xc2\xa1"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " wanted\xc2\xa0\t wa", - .expected_tokens = {"want", "##ed", "wa"}, - .expected_token_ids = {1, 3, 4}, - .expected_token_start_offsets = {2, 6, 12}, - .expected_token_end_offsets = {6, 8, 14}, - }, - // Test 25: After the loop, it's in the middle of a whitespace. The - // previous is tokenizable (a punctuation). - { - .vocab = {"", "want", "##want", "##ed", "wa", ".", "##.", "...", - "\xc2\xa1", "##\xc2\xa1"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " wanted.\xc2\xa0\t wa", - .expected_tokens = {"want", "##ed", ".", "wa"}, - .expected_token_ids = {1, 3, 5, 4}, - .expected_token_start_offsets = {2, 6, 8, 13}, - .expected_token_end_offsets = {6, 8, 9, 15}, - }, - // Test 26: After the loop, it's in the middle of a whitespace. The - // previous is untokenizable. - { - .vocab = {"", "want", "##want", "##ed", "wa", ".", "##.", "...", - "##e\xC2\xA1", "##\xC2\xA1"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " wante\xc2\xa0\t wa", - .expected_tokens = {"", "wa"}, - .expected_token_ids = {0, 4}, - .expected_token_start_offsets = {2, 11}, - .expected_token_end_offsets = {7, 13}, - }, - - // Test suite 2. End-to-end test including whitespace tokenization and - // split on punctuation. - // Test 27. Basic case 1. - { - .vocab = - { - "", "don", "##'", "##t", "tread", "##ness", - "hel", "##lo", "there", "my", "na", "##me", - "is", "ter", "##ry", "what", "##cha", "##ma", - "##call", "##it?", "you", "said", - }, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "hello there my name is terry", - .expected_tokens = {"hel", "##lo", "there", "my", "na", "##me", "is", - "ter", "##ry"}, - .expected_token_ids = {6, 7, 8, 9, 10, 11, 12, 13, 14}, - .expected_token_start_offsets = {0, 3, 6, 12, 15, 17, 20, 23, 26}, - .expected_token_end_offsets = {3, 5, 11, 14, 17, 19, 22, 26, 28}, - }, - // Test 28. Basic case 2. - { - .vocab = - { - "", "don", "##'", "##t", "tread", "##ness", - "hel", "##lo", "there", "my", "na", "##me", - "is", "ter", "##ry", "what", "##cha", "##ma", - "##call", "##it?", "you", "said", - }, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "whatchamacallit? you said", - .expected_tokens = {"", "", "you", "said"}, - .expected_token_ids = {0, 0, 20, 21}, - .expected_token_start_offsets = {0, 15, 17, 21}, - .expected_token_end_offsets = {15, 16, 20, 25}, - }, - // Test 29. Basic case 3. Punctuation is an independant word in the vocab. - { - .vocab = - { - "", "don", "##'", "##t", "tread", "##ness", - "hel", "##lo", "there", "my", "na", "##me", - "is", "ter", "##ry", "what", "##cha", "##ma", - "##call", "##it?", "you", "said", "##it", "?", - }, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "whatchamacallit? you said", - .expected_tokens = {"what", "##cha", "##ma", "##call", "##it", "?", - "you", "said"}, - .expected_token_ids = {15, 16, 17, 18, 22, 23, 20, 21}, - .expected_token_start_offsets = {0, 4, 7, 9, 13, 15, 17, 21}, - .expected_token_end_offsets = {4, 7, 9, 13, 15, 16, 20, 25}, - }, - // Test 30. Basic case 4 with untokenizable words. - { - .vocab = - { - "", "don", "'", "t", "tread", "##ness", - "hel", "##lo", "there", "my", "na", "##me", - "is", "ter", "##ry", "what", "##cha", "##ma", - "##call", "##it?", "you", "said", - }, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "don't tread cantfindme treadcantfindme", - .expected_tokens = {"don", "'", "t", "tread", "", ""}, - .expected_token_ids = {1, 2, 3, 4, 0, 0}, - .expected_token_start_offsets = {0, 3, 4, 6, 12, 23}, - .expected_token_end_offsets = {3, 4, 5, 11, 22, 38}, - }, - // Test 31: Basic case 5. - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing", ".", "##."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "unwanted.", - .expected_tokens = {"un", "##want", "##ed", "."}, - .expected_token_ids = {7, 4, 5, 10}, - .expected_token_start_offsets = {0, 2, 6, 8}, - .expected_token_end_offsets = {2, 6, 8, 9}, - }, - // Test 32: Basic case 6. - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing", ".", "##.", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " want.wanted. \t ", - .expected_tokens = {"want", ".", "want", "##ed", "."}, - .expected_token_ids = {3, 10, 3, 5, 10}, - .expected_token_start_offsets = {2, 6, 7, 11, 13}, - .expected_token_end_offsets = {6, 7, 11, 13, 14}, - }, - // Test 33: Basic with unseen characters (as a single word). - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing", ".", "##.", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " X want.wanted. \t ", - .expected_tokens = {"", "want", ".", "want", "##ed", "."}, - .expected_token_ids = {1, 3, 10, 3, 5, 10}, - .expected_token_start_offsets = {1, 3, 7, 8, 12, 14}, - .expected_token_end_offsets = {2, 7, 8, 12, 14, 15}, - }, - // Test 34: Basic with unseen characters (in a word before a punctuation). - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing", ".", "##.", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " X wantX.wanted. \t ", - .expected_tokens = {"", "", ".", "want", "##ed", "."}, - .expected_token_ids = {1, 1, 10, 3, 5, 10}, - .expected_token_start_offsets = {1, 3, 8, 9, 13, 15}, - .expected_token_end_offsets = {2, 8, 9, 13, 15, 16}, - }, - // Test 35: Basic with unseen characters (in the middle of a word). - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing", ".", "##.", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " X wantXwanted. \t ", - .expected_tokens = {"", "", "."}, - .expected_token_ids = {1, 1, 10}, - .expected_token_start_offsets = {1, 3, 14}, - .expected_token_end_offsets = {2, 14, 15}, - }, - // Test 36: Basic with unseen characters and a leading period. - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing", ".", "##.", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " X .wantXwanted. \t ", - .expected_tokens = {"", ".", "", "."}, - .expected_token_ids = {1, 10, 1, 10}, - .expected_token_start_offsets = {1, 3, 4, 15}, - .expected_token_end_offsets = {2, 4, 15, 16}, - }, - // Test 37: Contains ellipsis (as "....."). - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing", ".", "##.", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " wanted. \t wa.....", - .expected_tokens = {"want", "##ed", ".", "wa", ".", ".", ".", ".", - "."}, - .expected_token_ids = {3, 5, 10, 6, 10, 10, 10, 10, 10}, - .expected_token_start_offsets = {2, 6, 8, 13, 15, 16, 17, 18, 19}, - .expected_token_end_offsets = {6, 8, 9, 15, 16, 17, 18, 19, 20}, - }, - // Test 38: After the loop, the next character is an unknown punctuation; - // the previous can be tokenized. - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing", ".", "##.", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " wanted, \t wa", - .expected_tokens = {"want", "##ed", "", "wa"}, - .expected_token_ids = {3, 5, 1, 6}, - .expected_token_start_offsets = {2, 6, 8, 13}, - .expected_token_end_offsets = {6, 8, 9, 15}, - }, - // Test 39: After the loop, the next character is an unknown punctuation; - // the previous can be tokenized. - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing", ".", "##.", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " wanted., \t wa", - .expected_tokens = {"want", "##ed", ".", "", "wa"}, - .expected_token_ids = {3, 5, 10, 1, 6}, - .expected_token_start_offsets = {2, 6, 8, 9, 14}, - .expected_token_end_offsets = {6, 8, 9, 10, 16}, - }, - // Test 40: After the loop, the next character is an unknown punctuation; - // the previous is empty. - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing", ".", "##.", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " , wanted, \t wa", - .expected_tokens = {"", "want", "##ed", "", "wa"}, - .expected_token_ids = {1, 3, 5, 1, 6}, - .expected_token_start_offsets = {1, 3, 7, 9, 14}, - .expected_token_end_offsets = {2, 7, 9, 10, 16}, - }, - // Test 41: After the loop, the next character is an unknown punctuation; - // the previous can not be tokenized. - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing", ".", "##.", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " wante, \t wa", - .expected_tokens = {"", "", "wa"}, - .expected_token_ids = {1, 1, 6}, - .expected_token_start_offsets = {2, 7, 12}, - .expected_token_end_offsets = {7, 8, 14}, - }, - // Test 42: After the loop, in the middle of an unknown punctuation. - // Previous is tokenizable. - { - .vocab = {"", "want", "##want", "##ed", "wa", ".", "##.", "...", - /*U+05C3*/ "\xD7\x83"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " wanted\xd7\x86xyz \t wa", - .expected_tokens = {"want", "##ed", "", "", "wa"}, - .expected_token_ids = {1, 3, 0, 0, 4}, - .expected_token_start_offsets = {2, 6, 8, 10, 17}, - .expected_token_end_offsets = {6, 8, 10, 13, 19}, - }, - // Test 43: After the loop, in the middle of an unknown punctuation. - // Previous is tokenizable. - { - .vocab = {"", "want", "##want", "##ed", "wa", ".", "##.", "...", - /*U+05C3*/ "\xD7\x83"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " wanted.\xd7\x86xyz \t wa", - .expected_tokens = {"want", "##ed", ".", "", "", "wa"}, - .expected_token_ids = {1, 3, 5, 0, 0, 4}, - .expected_token_start_offsets = {2, 6, 8, 9, 11, 18}, - .expected_token_end_offsets = {6, 8, 9, 11, 14, 20}, - }, - // Test 44: After the loop, in the middle of an unknown punctuation. - // Previous is not tokenizable. - { - .vocab = {"", "want", "##want", "##ed", "wa", ".", "##.", "...", - /*U+05C3*/ "##e\xD7\x83", - /*U+05C3*/ "\xD7\x83"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " wante\xd7\x86xyz \t wa", - .expected_tokens = {"", "", "", "wa"}, - .expected_token_ids = {0, 0, 0, 4}, - .expected_token_start_offsets = {2, 7, 9, 16}, - .expected_token_end_offsets = {7, 9, 12, 18}, - }, - // Test 45: Fails to match the first character in the beginning. - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing", ".", "##.", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "xyz \t wa", - .expected_tokens = {"", "wa"}, - .expected_token_ids = {1, 6}, - .expected_token_start_offsets = {0, 7}, - .expected_token_end_offsets = {3, 9}, - }, - // Test 46: After the loop, the next character is not a whitespace nor - // punctuation. Trie fails to recognize the first character. - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing", ".", "##.", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " wanted.xyz \t wa", - .expected_tokens = {"want", "##ed", ".", "", "wa"}, - .expected_token_ids = {3, 5, 10, 1, 6}, - .expected_token_start_offsets = {2, 6, 8, 9, 16}, - .expected_token_end_offsets = {6, 8, 9, 12, 18}, - }, - // Test 47: After the loop, the next character is not a whitespace nor - // punctuation. Previous is not tokenizable. - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing", ".", "##.", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " wantedxyz \t wa", - .expected_tokens = {"", "wa"}, - .expected_token_ids = {1, 6}, - .expected_token_start_offsets = {2, 15}, - .expected_token_end_offsets = {11, 17}, - }, - // Test 48: After the loop, the next character is not a whitespace nor - // punctuation. Previous is not tokenizable. - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing", ".", "##.", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = " wantexyz \t wa", - .expected_tokens = {"", "wa"}, - .expected_token_ids = {1, 6}, - .expected_token_start_offsets = {2, 14}, - .expected_token_end_offsets = {10, 16}, - }, - // Test 49: Unknown punctuation followed by unseen character. - { - .vocab = {"", "want", "##want", "##ed", "wa", ".", "##.", "...", - /*U+05C3*/ "##e\xD7\x83", - /*U+05C3*/ "\xD7\x83"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "wanted\xd7\x86xyz", - .expected_tokens = {"want", "##ed", "", ""}, - .expected_token_ids = {1, 3, 0, 0}, - .expected_token_start_offsets = {0, 4, 6, 8}, - .expected_token_end_offsets = {4, 6, 8, 11}, - }, - // Test 50: Ellipsis is mapped to ""s when "." is not in vocab. - { - .vocab = {"", "want", "##want", "##ed", "wa", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "wanted...", - .expected_tokens = {"want", "##ed", "", "", ""}, - .expected_token_ids = {1, 3, 0, 0, 0}, - .expected_token_start_offsets = {0, 4, 6, 7, 8}, - .expected_token_end_offsets = {4, 6, 7, 8, 9}, - }, - - // Test suite 3. End-to-end test including whitespace and punctuation - // tokenization on max_bytes_per_token = 10. - // Test 51: Word length = 9 (i.e., max_bytes_per_token-1). - { - .vocab = {"", "01234", "##5678", "##56789", - /*U+05C3*/ "##\xD7\x83"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 10, - .input = " 012345678 ", - .expected_tokens = {"01234", "##5678"}, - .expected_token_ids = {1, 2}, - .expected_token_start_offsets = {2, 7}, - .expected_token_end_offsets = {7, 11}, - }, - // Test 52: Word length = 10 (i.e., max_bytes_per_token). - { - .vocab = {"", "01234", "##5678", "##56789", - /*U+05C3*/ "##\xD7\x83"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 10, - .input = " 0123456789 ", - .expected_tokens = {"01234", "##56789"}, - .expected_token_ids = {1, 3}, - .expected_token_start_offsets = {2, 7}, - .expected_token_end_offsets = {7, 12}, - }, - // Test 53: Word length = 9, followed by a multi-bytes Unicode punctuation - // char, which is a hebrew punctuation "sof pasquq". - { - .vocab = {"", "01234", "##5678", "##56789", - /*U+05C3*/ "##\xD7\x83"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 10, - .input = " 012345678\xD7\x83 ", - .expected_tokens = {"01234", "##5678", ""}, - .expected_token_ids = {1, 2, 0}, - .expected_token_start_offsets = {2, 7, 11}, - .expected_token_end_offsets = {7, 11, 13}, - }, - // Test 54: Word length = 11 (i.e., max_bytes_per_token+1). The 10th - // char is on Unicode boundary. - { - .vocab = {"", "01234", "##5678", "##56789", - /*U+05C3*/ "##\xD7\x83", "##a"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 10, - .input = " 0123456789a ", - .expected_tokens = {""}, - .expected_token_ids = {0}, - .expected_token_start_offsets = {2}, - .expected_token_end_offsets = {13}, - }, - // Test 55: Word length = 10 (i.e., max_bytes_per_token). The next char - // (\xe2\x80\x80) is a whitespace. - { - .vocab = {"", "01234", "##5678", "##56789", - /*U+05C3*/ "##\xD7\x83", "##a"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 10, - .input = " 0123456789\xe2\x80\x80 ", - .expected_tokens = {"01234", "##56789"}, - .expected_token_ids = {1, 3}, - .expected_token_start_offsets = {2, 7}, - .expected_token_end_offsets = {7, 12}, - }, - // Test 56: Word length = 9 (i.e., max_bytes_per_token-1). The next is - // a multi-byte whitespace. The 10th char is in the middle of the - // whitespace. - { - .vocab = {"", "01234", "##5678", "##56789", - /*U+05C3*/ "##\xD7\x83", "##a", "##\xe2\x80\x8B"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 10, - .input = " 012345678\xe2\x80\x80 ", - .expected_tokens = {"01234", "##5678"}, - .expected_token_ids = {1, 2}, - .expected_token_start_offsets = {2, 7}, - .expected_token_end_offsets = {7, 11}, - }, - // Test 57: Word length = 9 (i.e., max_bytes_per_token-1). The next is a - // multi-byte whitespace. The 10th char is in the middle of the - // whitespace. The word is not tokenizable. - { - .vocab = {"", "01234", "##56789", "##5678\xe2\x80\x8B", - /*U+05C3*/ "##\xD7\x83", "##a", "##\xe2\x80\x8B"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 10, - .input = " 012345678\xe2\x80\x80 ", - .expected_tokens = {""}, - .expected_token_ids = {0}, - .expected_token_start_offsets = {2}, - .expected_token_end_offsets = {11}, - }, - // Test 58: Word length = 9 (i.e., max_bytes_per_token-1) plus a - // trailing punctuation. - { - .vocab = {"", "01234", "##5678", "##56789", - /*U+05C3*/ "##\xD7\x83", "##a", "."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 10, - .input = " .012345678. ", - .expected_tokens = {".", "01234", "##5678", "."}, - .expected_token_ids = {6, 1, 2, 6}, - .expected_token_start_offsets = {2, 3, 8, 12}, - .expected_token_end_offsets = {3, 8, 12, 13}, - }, - // Test 59: Word length = 9 (i.e., max_bytes_per_token-1) plus a - // trailing punctuation, followed by more words. - { - .vocab = {"", "01234", "##5678", "##56789", - /*U+05C3*/ "\xD7\x83", "##a", ".", "...", "a"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 10, - .input = " .012345678.a ", - .expected_tokens = {".", "01234", "##5678", ".", "a"}, - .expected_token_ids = {6, 1, 2, 6, 8}, - .expected_token_start_offsets = {2, 3, 8, 12, 13}, - .expected_token_end_offsets = {3, 8, 12, 13, 14}, - }, - // Test 60: Word length = 10 (i.e., max_bytes_per_token) plus a - // trailing punctuation, and the word is tokenizable. - { - .vocab = {"", "01234", "##5678", "##56789", - /*U+05C3*/ "\xD7\x83", "##a", ".", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 10, - .input = " .0123456789. ", - .expected_tokens = {".", "01234", "##56789", "."}, - .expected_token_ids = {6, 1, 3, 6}, - .expected_token_start_offsets = {2, 3, 8, 13}, - .expected_token_end_offsets = {3, 8, 13, 14}, - }, - // Test 61: Word length = 10 (i.e., max_bytes_per_token) plus a - // trailing unknown punctuation, and the word is tokenizable. - { - .vocab = {"", "01234", "##5678", "##56789", "##a", ".", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 10, - .input = " .0123456789\xD7\x83 ", - .expected_tokens = {".", "01234", "##56789", ""}, - .expected_token_ids = {5, 1, 3, 0}, - .expected_token_start_offsets = {2, 3, 8, 13}, - .expected_token_end_offsets = {3, 8, 13, 15}, - }, - // Test 62: Word length = 11 (i.e., max_bytes_per_token+1). - { - .vocab = {"", "01234", "##5678", "##56789", - /*U+05C3*/ "\xD7\x83", "##a", ".", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 10, - .input = " .0123456789Z ", - .expected_tokens = {".", ""}, - .expected_token_ids = {6, 0}, - .expected_token_start_offsets = {2, 3}, - .expected_token_end_offsets = {3, 14}, - }, - // Test 63: Word length = 11 (i.e., max_bytes_per_token+1). - // The input would be tokenizable if `max_byte_per_token` is set to be - // greater or equal to `word_length`. - { - .vocab = {"", "0123456789", "##0123456789", "##012345678abc", - /*U+05C3*/ "\xD7\x83", "##a", ".", "..."}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 10, - .input = " .012345678a. ", - .expected_tokens = {".", "", "."}, - .expected_token_ids = {6, 0, 6}, - .expected_token_start_offsets = {2, 3, 13}, - .expected_token_end_offsets = {3, 13, 14}, - }, - // Test 64: Input is "". - { - .vocab = {"", "0123456789", "##0123456789", "##012345678abc", - /*U+05C3*/ "\xD7\x83", "##a", ".", "...", ">"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = ".", - .expected_tokens = {"", "", ">", "."}, - .expected_token_ids = {0, 0, 8, 6}, - .expected_token_start_offsets = {0, 1, 4, 5}, - .expected_token_end_offsets = {1, 4, 5, 6}, - }, - - // Test suite 4: Test different suffix indicators. - // Test 65: Suffix indicator is "##". Input contains "##". - { - .vocab = {"", "", "", "want", "##want", "##ed", "wa", - "un", "runn", "##ing", ".", "##.", "...", "#", "##", "###"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "## running", - .expected_tokens = {"#", "#", "runn", "##ing"}, - .expected_token_ids = {13, 13, 8, 9}, - .expected_token_start_offsets = {0, 1, 3, 7}, - .expected_token_end_offsets = {1, 2, 7, 10}, - }, - // Test 66: Test suffix indicator "". - { - .vocab = {"", "want", "want", "ed", "wa", "un", - "runn", "ing", "#", "."}, - .unk_token = "", - .suffix_indicator = "", - .max_bytes_per_token = 100, - .input = "## running. <", - .expected_tokens = {"#", "#", "runn", "ing", ".", ""}, - .expected_token_ids = {8, 8, 6, 7, 9, 0}, - .expected_token_start_offsets = {0, 1, 3, 7, 10, 12}, - .expected_token_end_offsets = {1, 2, 7, 10, 11, 13}, - }, - // Test 67: Test suffix indicator "suffix>". Suffix indicator appears in - // the input as a single word after a punctuation. - { - .vocab = {"", "want", "suffix>want", "suffix>ed", "wa", "un", - "runn", "suffix>ing", "#", "su", "suffix>ffix", "suffix"}, - .unk_token = "", - .suffix_indicator = "suffix>", - .max_bytes_per_token = 100, - .input = "#suffix> running", - .expected_tokens = {"#", "suffix", "", "runn", "suffix>ing"}, - .expected_token_ids = {8, 11, 0, 6, 7}, - .expected_token_start_offsets = {0, 1, 7, 9, 13}, - .expected_token_end_offsets = {1, 7, 8, 13, 16}, - }, - // Test 68: Test suffix indicator "suffix>". Suffix indicator appears in - // the input as a single word after a punctuation. - { - .vocab = {"", "want", "suffix>want", "suffix>ed", "wa", "un", - "runn", "suffix>ing", "#", "su", "suffix>ffix"}, - .unk_token = "", - .suffix_indicator = "suffix>", - .max_bytes_per_token = 100, - .input = "#suffix> running", - .expected_tokens = {"#", "su", "suffix>ffix", "", "runn", - "suffix>ing"}, - .expected_token_ids = {8, 9, 10, 0, 6, 7}, - .expected_token_start_offsets = {0, 1, 3, 7, 9, 13}, - .expected_token_end_offsets = {1, 3, 7, 8, 13, 16}, - }, - // Test 69: Test suffix indicator "", "runn", "", "su", "", "runn", ">>". Suffix indicator appears in the - // input. - { - .vocab = {"", "want", ">>>want", ">>>ed", "wa", "un", "runn", - ">>>ing", "#", "su", ">>>ffix"}, - .unk_token = "", - .suffix_indicator = ">>>", - .max_bytes_per_token = 100, - .input = "#suffix>>> running", - .expected_tokens = {"#", "su", ">>>ffix", "", "", "", - "runn", ">>>ing"}, - .expected_token_ids = {8, 9, 10, 0, 0, 0, 6, 7}, - .expected_token_start_offsets = {0, 1, 3, 7, 8, 9, 11, 15}, - .expected_token_end_offsets = {1, 3, 7, 8, 9, 10, 15, 18}, - }, - // Test 72: Test suffix indicator "<", "runn", "<", "runn", "XYZing", "<", "X", "XYZYZ"}, - .unk_token = "", - .suffix_indicator = "XYZ", - .max_bytes_per_token = 100, - .input = "XYZ running", - .expected_tokens = {"X", "XYZYZ", "runn", "XYZing"}, - .expected_token_ids = {4, 5, 1, 2}, - .expected_token_start_offsets = {0, 1, 4, 8}, - .expected_token_end_offsets = {1, 3, 8, 11}, - }, - // Test 74: Test suffix indicator "XYZ", which appears in the - // vocab and input sentence as a single word. - { - .vocab = {"", "runn", "XYZing", "<", "X", "XYZYZ", "XYZ"}, - .unk_token = "", - .suffix_indicator = "XYZ", - .max_bytes_per_token = 100, - .input = "XYZ running", - .expected_tokens = {"XYZ", "runn", "XYZing"}, - .expected_token_ids = {6, 1, 2}, - .expected_token_start_offsets = {0, 4, 8}, - .expected_token_end_offsets = {3, 8, 11}, - }, - // Test suite 5: Test multi-byte punctuation and Chinese characters. - // Test 75: Contains a multi-bytes Unicode punctuation char "\xEF\xBC\x8C" - // followed by a tokenizable word. - { - .vocab = {"", "want", "##ed", "ABC", "\xEF\xBC\x8C", "##ABC"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 10, - .input = "wanted\xEF\xBC\x8C" - "ABC", - .expected_tokens = {"want", "##ed", "\xEF\xBC\x8C", "ABC"}, - .expected_token_ids = {1, 2, 4, 3}, - .expected_token_start_offsets = {0, 4, 6, 9}, - .expected_token_end_offsets = {4, 6, 9, 12}, - }, - // Test 76: Contains a multi-bytes Unicode punctuation char "\xEF\xBC\x8C" - // (absent in the vocab) followed by a tokenizable word. - { - .vocab = {"", "want", "##ed", "ABC", "\xEF\xBC\x8C", "##ABC"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 10, - .input = "wanted\xD7\x83" - "ABC", - .expected_tokens = {"want", "##ed", "", "ABC"}, - .expected_token_ids = {1, 2, 0, 3}, - .expected_token_start_offsets = {0, 4, 6, 8}, - .expected_token_end_offsets = {4, 6, 8, 11}, - }, - // Test 77: Contains a multi-bytes Unicode chinese character \xe4\xb8\x81, - // which is considered as a single word in Bert, so it's treated in the - // same way as punctuation characters by the tokenizer. - { - .vocab = {"", "want", "##ed", "ABC", "\xe4\xb8\x81", "##ABC"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 10, - .input = "wanted\xe4\xb8\x81" - "ABC", - .expected_tokens = {"want", "##ed", "\xe4\xb8\x81", "ABC"}, - .expected_token_ids = {1, 2, 4, 3}, - .expected_token_start_offsets = {0, 4, 6, 9}, - .expected_token_end_offsets = {4, 6, 9, 12}, - }, - // Test 78: Contains a multi-bytes Unicode chinese character \xe4\xb8\x81. - { - .vocab = {"", "want", "##ed", "ABC", "##ABC", - "wanted\xe4\xb8\x81"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 10, - .input = "wanted\xe4\xb8\x81" - "ABC", - .expected_tokens = {"want", "##ed", "", "ABC"}, - .expected_token_ids = {1, 2, 0, 3}, - .expected_token_start_offsets = {0, 4, 6, 9}, - .expected_token_end_offsets = {4, 6, 9, 12}, - }, - // Test 79: Contains a multi-bytes Unicode chinese character \xe4\xb8\x81, - // which is included in the vocab as the suffix of a word. - { - .vocab = {"", "want", "##ed", "ABC", "##ABC", - "wanted\xe4\xb8\x81"}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 10, - .input = "wanted\xe4\xb8\x81" - "ABC", - .expected_tokens = {"want", "##ed", "", "ABC"}, - .expected_token_ids = {1, 2, 0, 3}, - .expected_token_start_offsets = {0, 4, 6, 9}, - .expected_token_end_offsets = {4, 6, 9, 12}, - }}; - return v; -} - -using TestTokenizeText = testing::TestWithParam; - -TEST_P(TestTokenizeText, Test) { - const Spec& spec = GetParam(); - ASSERT_OK_AND_ASSIGN( - std::string flatbuffer, - BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token, - spec.suffix_indicator, spec.unk_token)); - ASSERT_OK_AND_ASSIGN(auto tokenizer, - FastWordpieceTokenizer::Create(flatbuffer.data())); - - std::vector output_tokens; - std::vector output_ids; - std::vector output_begin_offsets; - std::vector output_end_offsets; - tokenizer.Tokenize(spec.input, &output_tokens, &output_ids, - &output_begin_offsets, &output_end_offsets); - EXPECT_THAT(output_tokens, spec.expected_tokens); - EXPECT_THAT(output_ids, spec.expected_token_ids); - EXPECT_THAT(output_begin_offsets, spec.expected_token_start_offsets); - EXPECT_THAT(output_end_offsets, spec.expected_token_end_offsets); -} - -TEST_P(TestTokenizeText, TestNoOutputPieces) { - const Spec& spec = GetParam(); - ASSERT_OK_AND_ASSIGN( - std::string flatbuffer, - BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token, - spec.suffix_indicator, spec.unk_token)); - ASSERT_OK_AND_ASSIGN(auto tokenizer, - FastWordpieceTokenizer::Create(flatbuffer.data())); - - std::vector output_ids; - std::vector output_begin_offsets; - std::vector output_end_offsets; - tokenizer.Tokenize(spec.input, &output_ids, &output_begin_offsets, - &output_end_offsets); - EXPECT_THAT(output_ids, spec.expected_token_ids); - EXPECT_THAT(output_begin_offsets, spec.expected_token_start_offsets); - EXPECT_THAT(output_end_offsets, spec.expected_token_end_offsets); -} - -TEST_P(TestTokenizeText, TestNoOutputPiecesOnlyOutputIds) { - const Spec& spec = GetParam(); - ASSERT_OK_AND_ASSIGN( - std::string flatbuffer, - BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token, - spec.suffix_indicator, spec.unk_token)); - ASSERT_OK_AND_ASSIGN(auto tokenizer, - FastWordpieceTokenizer::Create(flatbuffer.data())); - - std::vector output_ids; - tokenizer.Tokenize(spec.input, &output_ids); - EXPECT_THAT(output_ids, spec.expected_token_ids); -} - -INSTANTIATE_TEST_SUITE_P(EndToEndFastWordpieceTokenizerParameterizedTest, - TestTokenizeText, - testing::ValuesIn(GetTestSpecsForTokenizeText())); - -// Test the detokenization function of FastWordPieceTokenizer. -const std::vector& GetTestSpecsForTokenizeDetokenize() { - static const std::vector& v = *new std::vector{ - // Test 0: Input is a single word. - { - .vocab = {"a", "abc", "##de", "##defgxy", "##deh", "##f", "##ghz", - ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "abcdefghz", - .expected_token_ids = {1, 2, 5, 6}, - .expected_detokenized_text = "abcdefghz", - }, - // Test 1: Input is a sentence. - { - .vocab = {"a", "abc", "##de", "##c", "##f", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "a abc abcde ab", - .expected_token_ids = {0, 1, 1, 2, 5}, - .expected_detokenized_text = "a abc abcde ", - }, - // Test 2: Input has the leading suffix indicator. - { - .vocab = {"a", "abc", "##de", "##deh", "##f", ""}, - .unk_token = "", - .suffix_indicator = "##", - .max_bytes_per_token = 100, - .input = "##deh abcde", - .expected_token_ids = {3, 1, 2}, - .expected_detokenized_text = "##deh abcde", - }, - }; - return v; -} -using TestTokenizeDetokenize = testing::TestWithParam; - -TEST_P(TestTokenizeDetokenize, Test) { - const Spec& spec = GetParam(); - ASSERT_OK_AND_ASSIGN( - std::string flatbuffer, - BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token, - spec.suffix_indicator, spec.unk_token, - /*no_pretokenization=*/true, - /*support_detokenization=*/true)); - ASSERT_OK_AND_ASSIGN(auto tokenizer, - FastWordpieceTokenizer::Create(flatbuffer.data())); - - // Test detokenization. - ASSERT_OK_AND_ASSIGN(auto output_text, - tokenizer.Detokenize(spec.expected_token_ids)); - EXPECT_THAT(output_text, spec.expected_detokenized_text); -} - -INSTANTIATE_TEST_SUITE_P( - FastWordpieceTokenizerDetokenizeParameterizedTest, TestTokenizeDetokenize, - testing::ValuesIn(GetTestSpecsForTokenizeDetokenize())); - -} // namespace -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_tflite.cc b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_tflite.cc deleted file mode 100644 index 18c58af04..000000000 --- a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_tflite.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/fast_wordpiece_tokenizer_tflite.h" - -#include "tensorflow/lite/kernels/shim/tflite_op_shim.h" -#include "tensorflow_text/core/kernels/fast_wordpiece_tokenizer_kernel_template.h" - -namespace tflite { -namespace ops { -namespace custom { -namespace text { - -using TokenizeOpKernel = tflite::shim::TfLiteOpKernel< - tensorflow::text::FastWordpieceTokenizeWithOffsetsOp>; - -using DetokenizeOpKernel = - tflite::shim::TfLiteOpKernel; - -extern "C" void AddFastWordpieceTokenize(tflite::MutableOpResolver* resolver) { - TokenizeOpKernel::Add(resolver); -} - -extern "C" void AddFastWordpieceDetokenize( - tflite::MutableOpResolver* resolver) { - DetokenizeOpKernel::Add(resolver); -} - -} // namespace text -} // namespace custom -} // namespace ops -} // namespace tflite diff --git a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_tflite.h b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_tflite.h index bd1c176c5..ad1f9b6fe 100644 --- a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_tflite.h +++ b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_tflite.h @@ -12,24 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef THIRD_PARTY_TENSORFLOW_TEXT_GOOGLE_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_TFLITE_H_ -#define THIRD_PARTY_TENSORFLOW_TEXT_GOOGLE_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_TFLITE_H_ +#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_TFLITE_H_ +#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_TFLITE_H_ -#include "tensorflow/lite/mutable_op_resolver.h" +#include "tensorflow/core/kernels/text/fast_wordpiece_tokenizer_tflite.h" -namespace tflite { -namespace ops { -namespace custom { -namespace text { - -extern "C" void AddFastWordpieceTokenize(::tflite::MutableOpResolver* resolver); - -extern "C" void AddFastWordpieceDetokenize( - ::tflite::MutableOpResolver* resolver); - -} // namespace text -} // namespace custom -} // namespace ops -} // namespace tflite - -#endif // THIRD_PARTY_TENSORFLOW_TEXT_GOOGLE_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_TFLITE_H_ +#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_TFLITE_H_ diff --git a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_utils.h b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_utils.h index a147654f0..a5f6df1bd 100644 --- a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_utils.h +++ b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_utils.h @@ -12,261 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -// To optimize speed/memory usage, we assume: -// * The WordPiece vocabulary has at most 2^22 = 4M tokens. -// * No token from the vocabulary has more than 256 bytes. -// -// The assumptions are adjustable by setting the constants defined in this file. -// -// Note: by recompiling the underlying trie library and the helper functions in -// this file to use 64-bit (or even larger) integers, we can support even a -// larger vocab size and longer vocab tokens. Still, we believe the current -// implementation covers all real cases. #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_UTILS_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_UTILS_H_ -#include - -#include - -#include "absl/status/statusor.h" -#include "absl/strings/str_cat.h" -#include "icu4c/source/common/unicode/uchar.h" -#include "icu4c/source/common/unicode/umachine.h" - -namespace tensorflow { -namespace text { -namespace fast_wordpiece_tokenizer_utils { - -// This header assumes that is 32-bit integer types. -static_assert(sizeof(int) == 4, "FastWordpieceTokenizer requires 4-byte int."); - -//////////////////////////////////////////////////////////////////////////////// -// Constants for token encoding. -// -// The constants below define a 32-bit compact token representation that encodes -// (1) the token id, (2) the token length (minus 1, and without the suffix -// indicator, in utf-8 bytes), and (3) is_suffix_token (i.e., the token starts -// with the suffix indicator (say) "##"). -// -// The encoded value is stored on the darts_clone trie as well as in the -// `failure_pops_pool` (see FastWordpieceTokenizerConfig in -// fast_wordpiece_tokenizer_model.fbs). As required by darts_clone_trie, the -// type of the encoded value should be 32-bit signed int, and the top bit is -// reserved to be always 0. -// -// Examples (given the existing constants; bits are numbered 0 to 31 from -// right/lower to left/upper; the top bit is reserved by darts_clone trie and is -// always 0): -// * Token "a", token id 0 -> The encoded value is 0x0: -// * bit 31: 0. -// * bit 30: 0, since token "a" is not a suffix token. -// * bits 29-8: 0, since the token id is 0. -// * bits 7-0: 0, since the encoded token length is 0 (see below comments). -// * Token "b", token id 1 -> The encoded value is 0x100: -// * bit 31: 0. -// * bit 30: 0, since token "b" is not a suffix token. -// * bits 29-8: 1, since the token id is 1. -// * bits 7-0: 0, since the encoded token length is 0 (see below comments). -// * Token "##b", token id 2 -> The encoded value is 0x40000200: -// * bit 31: 0. -// * bit 30: 1, since token "##b" is a suffix token. -// * bits 29-8: 2, since the token id is 2. -// * bits 7-0: 0, since the encoded token length is 0 (see below comments). -// * Token "bc", token id 3 -> The encoded value is 0x301: -// * bit 31: 0. -// * bit 30: 0, since token "bc" is not a suffix token. -// * bits 29-8: 3, since the token id is 3. -// * bits 7-0: 1, since the encoded token length is 1 (see below comments). -// * Token "##bcd", token id 5 -> The encoded value is 0x40000502: -// * bit 31: 0. -// * bit 30: 1, since token "##bcd" is a suffix token. -// * bits 29-8: 5, since the token id is 5. -// * bits 7-0: 2, since the encoded token length is 2 (see below comments). -// -// One special case is that when the suffix indicator is the empty string "". In -// this case, `is_suffix_token` is false for all tokens. -// -// Another special case is that when the suffix indicator string happens to be a -// token in the vocabulary. When encoding such a token like "##", by design, -// `is_suffix_token` is false, and the encoded token length is the full length -// of the suffix indicator string. -// -//////////////////////////////////////////////////////////////////////////////// - -// The (right-to-left 0-based) bit to encode whether the token is a suffix -// token. -static constexpr uint32_t kBitToIndicateSuffixToken = 30; - -// The number of low bits to encode the vocab token length into a compact -// representation. Technically, we encode the length of the token without the -// suffix indicator (if any) minus 1. Examples: -// * Token "a" -> we encode 1-1 = 0. -// * Token "abc" -> we encode 3-1 = 0. -// * Token "##abc" -> we encode 2, as before (we ignore the suffix indicator). -static constexpr uint32_t kBitsToEncodeVocabTokenLength = 8; - -// The bit mask to get the vocab token length from the compact representation. -static constexpr uint32_t kMaskToEncodeVocabTokenLength = - (1 << kBitsToEncodeVocabTokenLength) - 1; - -// Max vocab token length supported (given `kBitsToEncodeVocabTokenLength`). -static constexpr uint32_t kMaxVocabTokenLengthInUTF8Bytes = - (1 << kBitsToEncodeVocabTokenLength); - -// The maximum vocab size supported by our 32-bit encoding. Using right-to-left -// 0-based numbering, Bit 31 is reserved by darts_clone trie. Bit 30 indicates -// whether the token is a suffix token. The low `kBitsToEncodeVocabTokenLength` -// bits encode the token length. Given `kBitsToEncodeVocabTokenLength=8`, this -// leaves 32-1-1-8=22 bits for token ids, i.e., a max vocab size of 2^22 = 4M. -static constexpr uint32_t kMaxSupportedVocabSize = - (1 << (32 - 1 - 1 - kBitsToEncodeVocabTokenLength)); - -// The bit mask to get the vocab token id from the compact representation. -static constexpr uint32_t kMaskToEncodeVocabTokenId = - ((1 << kBitToIndicateSuffixToken) - 1) ^ kMaskToEncodeVocabTokenLength; - -//////////////////////////////////////////////////////////////////////////////// -// Helpers for encoding / decoding tokens. -//////////////////////////////////////////////////////////////////////////////// - -// Encodes a token into the encoded value. `token_length` is without the suffix -// indicator. The result is always a non-negative integer. Only used in building -// the model (in flatbuffer), not in doing WordPiece tokenization. -inline absl::StatusOr EncodeToken(int token_id, int token_length, - bool is_suffix_token) { - const int encoded_value = (is_suffix_token << kBitToIndicateSuffixToken) | - (token_id << kBitsToEncodeVocabTokenLength) | - (token_length - 1); - if (encoded_value < 0) { - return absl::FailedPreconditionError(absl::StrCat( - "EncodeToken() must return a non-negative value! Found encoded value: ", - encoded_value, " for input token id: ", token_id, ", token_length: ", - token_length, ", is_suffix_token: ", is_suffix_token)); - } - return encoded_value; -} - -// Gets whether it is a suffix token from the encoded value. -inline bool IsSuffixToken(int token_encoded_value) { - return static_cast(token_encoded_value >> kBitToIndicateSuffixToken); -} - -// Gets the token id from the encoded value. -inline int GetTokenId(int token_encoded_value) { - return (token_encoded_value & kMaskToEncodeVocabTokenId) >> - kBitsToEncodeVocabTokenLength; -} - -// Gets the token length (without the suffix indicator) from the encoded value. -inline int GetTokenLength(int token_encoded_value) { - return (token_encoded_value & kMaskToEncodeVocabTokenLength) + 1; -} - -//////////////////////////////////////////////////////////////////////////////// -// Constants for encoding failure pop lists. -// -// We put all failure pop lists into a common pool. The constants below define -// the compact representation that encodes (1) the offset, and (2) the length -// (minus 1) for a failure pop list in the common pool. -// -// Examples (given the existing constants; bits are numbered 0 to 31 from -// right/lower to left/upper): -// * failure pop list A, whose offset is 0 and length is 1 -> The encoded value -// is 0x0: -// * bits 31-8: 0, since the offset is 0. -// * bits 7-0: 0, since the encoded length is 0 (=1-1). -// * failure pop list B, whose offset is 0 and length is 3 -> The encoded value -// is 0x2: -// * bits 31-8: 0, since the offset is 0. -// * bits 7-0: 2, since the encoded length is 2 (=3-1). -// * failure pop list C, whose offset is 11 and the length is 10 -> The encoded -// value is 0xB09: -// * bits 31-8: 0xB, since the offset is 11. -// * bits 7-0: 9, since the encoded length is 9 (=10-1). -//////////////////////////////////////////////////////////////////////////////// - -// The number of low bits used to encode the length of failure pops minus 1 in -// the compact representation. This value should be less than or equal to -// `kBitsToEncodeVocabTokenLength`, since the size of failure pops is bounded by -// the maximum token length in the vocabulary. -static constexpr uint32_t kBitsToEncodeFailurePopsListSize = - kBitsToEncodeVocabTokenLength; - -// The bit mask to get the length of the failure pop list (without any suffix -// indicator, and minus 1) from the compact representation. -static constexpr uint32_t kMaskToEncodeFailurePopsListSize = - (1 << kBitsToEncodeFailurePopsListSize) - 1; - -// Max length of the failure pop list supported (given -// `kBitsToEncodeFailurePopsListSize`). -static constexpr uint32_t kMaxFailurePopsListSize = - (1 << kBitsToEncodeFailurePopsListSize); - -// The maximum valid offset in the failure pool, excluding the largest one -// (i.e., 0xFF...F), which is reserved to denote a null failure pop list (see -// `kNullFailurePopsList`). -static constexpr uint32_t kMaxSupportedFailurePoolOffset = - (1 << (32 - kBitsToEncodeFailurePopsListSize)) - 1 - 1; - -// Represents the null failure pops list, because 0xFF...F is not a valid of -// offset (see `kMaxSupportedFailurePoolOffset`). -static constexpr uint32_t kNullFailurePopsList = - std::numeric_limits::max(); - -//////////////////////////////////////////////////////////////////////////////// -// Helpers for encoding / decoding failure pop lists -//////////////////////////////////////////////////////////////////////////////// - -// Encodes the offset (in the failure pop pool) and the length of a failure pop -// list into an integer for a compact representation. -inline uint32_t EncodeFailurePopList(int offset, int length) { - return (offset << kBitsToEncodeFailurePopsListSize) | (length - 1); -} - -// Decodes the offset (in the failure pop pool) and the length of a failure pop -// list from the compact representation (an integer). -inline void GetFailurePopsOffsetAndLength(uint32_t offset_and_length, - int& out_offset, int& out_length) { - out_offset = offset_and_length >> kBitsToEncodeFailurePopsListSize; - out_length = (offset_and_length & kMaskToEncodeFailurePopsListSize) + 1; -} - -//////////////////////////////////////////////////////////////////////////////// -// Constants related to the Trie structure. -//////////////////////////////////////////////////////////////////////////////// - -// Represents the null node id. Different from any normal node. -static constexpr uint32_t kNullNode = std::numeric_limits::max(); - -// The maximum trie size supported. Because std::numeric_limits::max() -// (i.e., 0xFFFFFFFF) is reserved to represent the null node, the total trie -// size needs to be smaller or equal to 0xFFFFFFFF. -static constexpr uint32_t kMaxSupportedTrieSize = - std::numeric_limits::max(); - -//////////////////////////////////////////////////////////////////////////////// -// Helpers for analyzing Unicode characters. -//////////////////////////////////////////////////////////////////////////////// -inline bool IsPunctuationOrChineseChar(UChar32 char_value) { - uint32_t cp = static_cast(char_value); - // Chinese characters that are treated as punctuation in Bert. - if ((cp >= 0x4E00 && cp <= 0x9FFF) || (cp >= 0x3400 && cp <= 0x4DBF) || - (cp >= 0x20000 && cp <= 0x2A6DF) || (cp >= 0x2A700 && cp <= 0x2B73F) || - (cp >= 0x2B740 && cp <= 0x2B81F) || (cp >= 0x2B820 && cp <= 0x2CEAF) || - (cp >= 0xF900 && cp <= 0xFAFF) || (cp >= 0x2F800 && cp <= 0x2FA1F)) { - return true; - } - // Some special chars e.g. ">", "$" that are not covered by the u_ispunct are - // considered as punctuation chars. - if ((cp >= 33 && cp <= 47) || (cp >= 58 && cp <= 64) || - (cp >= 91 && cp <= 96) || (cp >= 123 && cp <= 126)) { - return true; - } - return u_ispunct(char_value); -} -} // namespace fast_wordpiece_tokenizer_utils -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/fast_wordpiece_tokenizer_utils.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_FAST_WORDPIECE_TOKENIZER_UTILS_H_ diff --git a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_utils_test.cc b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_utils_test.cc deleted file mode 100644 index a36900542..000000000 --- a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_utils_test.cc +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/fast_wordpiece_tokenizer_utils.h" - -#include -#include - -namespace tensorflow { -namespace text { -namespace fast_wordpiece_tokenizer_utils { -namespace { - -// Testing spec struct for token encoding / decoding. -struct TokenSpec { - friend std::ostream& operator<<(std::ostream& os, const TokenSpec& s) { - return os << "token_id:" << s.token_id << ", " - << "token_length:" << s.token_length << ", " - << "is_suffix_token:" << s.is_suffix_token << std::endl; - } - - int token_id; - int token_length; - bool is_suffix_token; -}; - -// Parameterized tests specs for token encoding / decoding. -const std::vector& GetTokenSpecs() { - static const std::vector& kSpecs = *new std::vector{ - // Test 0. - { - .token_id = 0, - .token_length = 1, - .is_suffix_token = false, - }, - // Test 1. - { - .token_id = 1, - .token_length = 1, - .is_suffix_token = false, - }, - // Test 2. - { - .token_id = 2, - .token_length = 1, - .is_suffix_token = true, - }, - // Test 3. - { - .token_id = 3, - .token_length = 10, - .is_suffix_token = false, - }, - // Test 4. - { - .token_id = 4, - .token_length = 10, - .is_suffix_token = true, - }, - // Test 5. - { - .token_id = kMaxSupportedVocabSize - 1, - .token_length = kMaxVocabTokenLengthInUTF8Bytes, - .is_suffix_token = true, - }, - }; - return kSpecs; -} - -using TokenEncodingDecodingTest = testing::TestWithParam; - -TEST_P(TokenEncodingDecodingTest, GeneralTest) { - const TokenSpec& spec = GetParam(); - ASSERT_OK_AND_ASSIGN( - auto encoded_value, - EncodeToken(spec.token_id, spec.token_length, spec.is_suffix_token)); - EXPECT_THAT(GetTokenId(encoded_value), spec.token_id); - EXPECT_THAT(GetTokenLength(encoded_value), spec.token_length); - EXPECT_THAT(IsSuffixToken(encoded_value), spec.is_suffix_token); -} - -INSTANTIATE_TEST_SUITE_P(TestTokenEncodingDecoding, TokenEncodingDecodingTest, - testing::ValuesIn(GetTokenSpecs())); - -struct FailurePopListSpec { - friend std::ostream& operator<<(std::ostream& os, - const FailurePopListSpec& s) { - return os << "offset:" << s.offset << ", " - << "length:" << s.length << std::endl; - } - - int offset; - int length; -}; - -// Parameterized tests specs for failure pop list encoding and decoding. -const std::vector& GetFailurePopListSpecs() { - static const std::vector& kSpecs = - *new std::vector{ - // Test 0. - { - .offset = 0, - .length = 1, - }, - // Test 1. - { - .offset = 0, - .length = 3, - }, - // Test 2. - { - .offset = 11, - .length = 10, - }, - // Test 3. - { - .offset = kMaxSupportedFailurePoolOffset, - .length = kMaxFailurePopsListSize, - }, - }; - return kSpecs; -} - -using FailurePopListEncodingDecodingTest = - testing::TestWithParam; - -TEST_P(FailurePopListEncodingDecodingTest, GeneralTest) { - const FailurePopListSpec& spec = GetParam(); - auto offset_and_length = EncodeFailurePopList(spec.offset, spec.length); - int offset, length; - GetFailurePopsOffsetAndLength(offset_and_length, offset, length); - EXPECT_THAT(offset, spec.offset); - EXPECT_THAT(length, spec.length); -} - -INSTANTIATE_TEST_SUITE_P(TestFailurePopListEncodingDecoding, - FailurePopListEncodingDecodingTest, - testing::ValuesIn(GetFailurePopListSpecs())); - -} // namespace -} // namespace fast_wordpiece_tokenizer_utils -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/log_greedy_constrained_sequence_kernel_test.cc b/tensorflow_text/core/kernels/log_greedy_constrained_sequence_kernel_test.cc deleted file mode 100644 index 6d9d89054..000000000 --- a/tensorflow_text/core/kernels/log_greedy_constrained_sequence_kernel_test.cc +++ /dev/null @@ -1,799 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include -#include -#include "tensorflow/core/framework/fake_input.h" -#include "tensorflow/core/framework/node_def_builder.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/kernels/ops_testutil.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/platform/status.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow_text/core/kernels/text_kernels_test_util.h" - -namespace tensorflow { - -using tensorflow::DT_INT32; -using tensorflow::FakeInput; -using tensorflow::NodeDefBuilder; -using tensorflow::Status; -using tensorflow::TensorShape; -using tensorflow::text_kernels_test_util::MatrixEq; -using tensorflow::text_kernels_test_util::VectorEq; - -class LogGreedyConstrainedSequenceTest : public tensorflow::OpsTestBase { - public: - void SetUpOpWithDefaults() { - // Prepare graph. - TF_ASSERT_OK(NodeDefBuilder("tested_op", "ConstrainedSequence") - .Attr("Tin", DT_INT32) - .Attr("use_viterbi", false) - .Attr("use_log_space", true) - .Attr("use_start_and_end_states", true) - .Input(FakeInput()) - .Input(FakeInput()) - .Input(FakeInput()) - .Input(FakeInput()) - .Finalize(node_def())); - TF_ASSERT_OK(InitOp()); - } -}; - -// TODO(b/122968457): There are a bunch of tests that only validate !ok instead -// of looking for specific error messages; fix that. - -// This test examines evaluations with only a permissions matrix. -TEST_F(LogGreedyConstrainedSequenceTest, - ComputesSingleTransitionWithNoWeights) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The first sequence's highest score is 2, but OUT->2 is not ok, so it's 1. - // The second sequence's highest score is 3, which is ok. - // The third sequence's highest score is 0, which is ok. - - // Validate the output. - std::vector expected_transitions({1, 3, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with an empty weights matrix not of rank 2. -TEST_F(LogGreedyConstrainedSequenceTest, - ComputesSingleTransitionWithNonMatrixEmptyWeights) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The first sequence's highest score is 2, but OUT->2 is not ok, so it's 1. - // The second sequence's highest score is 3, which is ok. - // The third sequence's highest score is 0, which is ok. - - // Validate the output. - std::vector expected_transitions({1, 3, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with a 2D score matrix (implicit batch 1). -TEST_F(LogGreedyConstrainedSequenceTest, - ComputesSingleTransitionWithSingleBatchItem) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({1, 4}), // - { - 10.0, 12.0, 13.0, 4.0, // - }); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({1}), {1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The sequence's highest score is 2, but OUT->2 is not ok, so it's 1. - // Validate the output. - std::vector expected_transitions({1}); - std::vector expected_offsets({0, 1}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines int64 input type and int32 output type. -TEST_F(LogGreedyConstrainedSequenceTest, int64inint32out) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The first sequence's highest score is 2, but OUT->2 is not ok, so it's 1. - // The second sequence's highest score is 3, which is ok. - // The third sequence's highest score is 0, which is ok. - // Validate the output. - // Validate the output. - std::vector expected_transitions({1, 3, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test ensures the op can take a sequence length of type {{X},{Y},{Z}} -// (with an outer batch dimension). -TEST_F(LogGreedyConstrainedSequenceTest, TwoDimensionalSequenceLengths) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3, 1}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The first sequence's highest score is 2, but OUT->2 is not ok, so it's 1. - // The second sequence's highest score is 3, which is ok. - // The third sequence's highest score is 0, which is ok. - - // Validate the output. - std::vector expected_transitions({1, 3, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test ensures that final transitions that are forbidden by the permission -// matrix (final->null) are not taken. -TEST_F(LogGreedyConstrainedSequenceTest, - ComputesSingleTransitionWithNoWeightsConstrainedByEnd) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, false, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The first sequence's highest score is 2, but OUT->2 is not ok; the next - // highest is 1, but 1->OUT is not OK; the next highest is 0, which is OK. - // The second sequence's highest score is 3, OUT->3 is OK and 3->OUT is OK. - // The third sequence's highest score is 0, OUT->0 is OK and 0->OUT is OK. - // Validate the output. - std::vector expected_transitions({0, 3, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with only a weight matrix. -TEST_F(LogGreedyConstrainedSequenceTest, - ComputesSingleTransitionWithNoPermissions) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 7.0, 4.0, // - 1.0, 9.0, 11.0, 5.0, // - 100.0, 24.0, 3.0, 4.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.1, 0.5, 0.5, 1.0, 1.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // All scores should be summed with the last row in the weight tensor, so - // the 'real' scores are: - // 1: {10.1, 2.5, 7.5, 5.0} (max is 0) - // 2: {1.1, 9.5, 11.5, 6.0} (max is 2) - // 3: {100.1, 24.5, 3.5, 5.0} (max is 0) - // Validate the output. - std::vector expected_transitions({0, 2, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with an empty not rank 2 permissions matrix. -TEST_F(LogGreedyConstrainedSequenceTest, - ComputesSingleTransitionWithNonMatrixEmptyPermissions) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 7.0, 4.0, // - 1.0, 9.0, 11.0, 5.0, // - 100.0, 24.0, 3.0, 4.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.1, 0.5, 0.5, 1.0, 1.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // All scores should be summed with the last row in the weight tensor, so - // the 'real' scores are: - // 1: {10.1, 2.5, 7.5, 5.0} (max is 0) - // 2: {1.1, 9.5, 11.5, 6.0} (max is 2) - // 3: {100.1, 24.5, 3.5, 5.0} (max is 0) - // Validate the output. - std::vector expected_transitions({0, 2, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test ensures that final transitions are scored with the probability -// of ending the sequence on the transition (x->final->null). -TEST_F(LogGreedyConstrainedSequenceTest, - ComputesSingleTransitionWithNoPermissionsWeightedByEnd) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 7.0, 4.0, // - 1.0, 9.0, 11.0, 5.0, // - 100.0, 24.0, 3.0, 4.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 0.1, // - 0.1, 0.5, 0.5, 1.0, 1.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // All scores should be summed with the last row and the last column in the - // score tensor, so the real scores are: - // 1: {10.1, 2.5, 7.5, 4.1} (max is 0) - // 2: {1.1, 9.5, 11.5, 6.0} (max is 2) - // 3: {100.1, 24.5, 3.5, 5.0} (max is 0) - // Validate the output. - std::vector expected_transitions({0, 2, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with both weight and permission matrices. -TEST_F(LogGreedyConstrainedSequenceTest, - ComputesSingleTransitionWithWeightsAndPermissions) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 7.0, 2.0, 7.0, 4.0, // - 1.0, 9.0, 11.0, 5.0, // - 100.0, 24.0, 3.0, 4.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, false, // FROM 2 - true, true, true, true, true, // FROM 3 - false, true, true, true, false, // FROM 'OUT' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 1.0, // - 0.5, 0.5, 0.5, 0.5, 0.1, // - 0.1, 0.5, 0.5, 1.0, 1.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // All scores should be summed with the last row and the last column in the - // score tensor, so the real scores are: - // 1: {7.1, 2.5, 7.5, 4.1} (max is 3, but 2->NUL/NUL->0 is not OK, so 3.) - // 2: {1.1, 9.5, 11.5, 6.0} (max is 2, but 2->NUL is not OK, so 1.) - // 3: {100.1, 24.5, 3.5, 5.0} (max is 0, but NUL->0 is not OK, so 1.) - // Validate the output. - std::vector expected_transitions({3, 1, 1}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines multiple evaluations with both weight and permission -// matrices. -TEST_F(LogGreedyConstrainedSequenceTest, - ComputesMultipleTransitionsWithWeightsAndPermissions) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 2, 4}), // - {{ - 10.0, 2.0, 7.0, 4.0, // Batch 0, step 0 - 10.0, 10.0, 10.0, 10.0, // Batch 0, step 1 - 1.0, 9.0, 11.0, 5.0, // Batch 1, step 0 - 10.0, 15.0, 1.0, 12.0, // Batch 1, step 1 - 100.0, 24.0, 3.0, 4.0, // Batch 2, step 0 - 1.0, 11.0, 1.0, 10.0, // Batch 2, step 1 - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {2, 2, 2}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO NUL - true, true, true, true, true, // FROM 0 - true, true, true, true, false, // FROM 1 - true, false, true, false, true, // FROM 2 - true, true, true, true, true, // FROM 3 (OUT) - false, true, true, true, true, // FROM 'NULL' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.5, 0.5, 0.5, 0.5, 1.0, // 0 - 0.5, 0.5, 0.5, 0.5, 1.0, // 1 - 0.5, 0.5, 1.0, 0.5, 1.0, // 2 - 0.5, 0.5, 0.5, 0.5, 1.0, // 3 - 0.1, 0.5, 0.5, 1.0, 1.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // STEP 1: - // All scores should be summed with the last row in the weight tensor, so - // the 'real' scores are: - // 1: {10.1, 2.5, 7.5, 5.0} (max is 2). OUT->2 is OK. - // 2: {1.1, 9.5, 11.5, 6.0} (max is 2). OUT->2 is OK. - // 3: {100.1, 11.5, 1.5, 11.0} (max is 0). OUT->0 is not OK, so go with 1. - // STEP 2: - // 1: In state '2', so use row 2 in the weight tensor. - // Weights are {11.5, 11.5, 12.0, 11.5}; 2->2 is OK and 2->OUT is OK; use 2. - // 2: In state '2', so use row 2 in the weight tensor. - // Weights are {10.5, 15.5, 2.0, 13.0}; 2->3 is not OK and 2->1 is not OK, so - // 0. 3: In state 0, so use row 0 in the weight tensor. Weights are - // {1.5, 11.5, 1.5, 11}; 0->1 is OK but 1->OUT is not, so 3. - - std::vector expected_transitions({2, 2, 2, 0, 1, 3}); - std::vector expected_offsets({0, 2, 4, 6}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} -// This test examines multiple evaluations with both weight and permission -// matrices. -TEST_F(LogGreedyConstrainedSequenceTest, - ComputesMultipleTransitionsWithVaryingLengths) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 2, 4}), // - {{ - 10.0, 2.0, 7.0, 4.0, // Batch 0, step 0 - 10.0, 10.0, 10.0, 10.0, // Batch 0, step 1 - 1.0, 9.0, 11.0, 5.0, // Batch 1, step 0 - 10.0, 15.0, 1.0, 12.0, // Batch 1, step 1 - 100.0, 24.0, 3.0, 4.0, // Batch 2, step 0 - 1.0, 11.0, 1.0, 10.0, // Batch 2, step 1 - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {2, 1, 2}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO NUL - true, true, true, true, true, // FROM 0 - true, true, true, true, false, // FROM 1 - true, false, true, false, true, // FROM 2 - true, true, true, true, true, // FROM 3 (OUT) - false, true, true, true, true, // FROM 'NULL' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.5, 0.5, 0.5, 0.5, 1.0, // 0 - 0.5, 0.5, 0.5, 0.5, 1.0, // 1 - 0.5, 0.5, 1.0, 0.5, 1.0, // 2 - 0.5, 0.5, 0.5, 0.5, 1.0, // 3 - 0.1, 0.5, 0.5, 1.0, 1.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // STEP 1: - // All scores should be summed with the last row in the weight tensor, so - // the 'real' scores are: - // 1: {10.1, 2.5, 7.5, 5.0} (max is 2). OUT->2 is OK. - // 2: {1.1, 9.5, 11.5, 6.0} (max is 2). OUT->2 and 2->OUT are OK. - // 3: {100.1, 11.5, 1.5, 11.0} (max is 0). OUT->0 is not OK, so go with 1. - // STEP 2: - // 1: In state '2', so use row 2 in the weight tensor. - // Weights are {11.5, 11.5, 12.0, 11.5}; 2->2 is OK and 2->OUT is OK; use 2. - // 2: End of sequence. - // 3: In state 0, so use row 0 in the weight tensor. - // Weights are {1.5, 11.5, 1.5, 11}; 0->1 is OK but 1->OUT is not, so 3. - - std::vector expected_transitions({2, 2, 2, 1, 3}); - std::vector expected_offsets({0, 2, 3, 5}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with a fully negative input set. -TEST_F(LogGreedyConstrainedSequenceTest, - ComputesSingleTransitionWithNegativeInputs) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - -10.0, -12.0, -13.0, -4.0, // - -1.0, -12.0, -13.0, -14.0, // - -15.0, -2.0, -3.0, -14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, true, true, true, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - std::vector expected_transitions({3, 0, 1}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with an all-zero weight matrix. -TEST_F(LogGreedyConstrainedSequenceTest, - ComputesSingleTransitionWithZeroedWeights) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 7.0, 4.0, // - 1.0, 9.0, 11.0, 5.0, // - 100.0, 24.0, 3.0, 4.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), { - 0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, - }); - - TF_ASSERT_OK(RunOpKernel()); - - // Because all weights are zero, the max values should be the max of the - // scores. - std::vector expected_transitions({0, 2, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -TEST_F(LogGreedyConstrainedSequenceTest, - ImpossibleSequencesResultInNegativeOnesIfAttrIsSet) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 2, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {2, 2, 2}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - false, false, false, false, false, // FROM 0 - false, false, false, false, false, // FROM 1 - false, false, false, false, false, // FROM 2 - false, false, false, false, false, // FROM 3 - false, false, false, false, false, // FROM 'OUT' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // Validate the output. - - std::vector expected_transitions({-1, -1, -1, -1, -1, -1}); - std::vector expected_offsets({0, 2, 4, 6}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test ensures the op will throw an error if there are too few scores to -// finalize all the sequences. -TEST_F(LogGreedyConstrainedSequenceTest, ErrorsIfGivenInsufficientScores) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 2, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - auto result = RunOpKernel(); - EXPECT_FALSE(result.ok()); -} - -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/log_viterbi_constrained_sequence_kernel_test.cc b/tensorflow_text/core/kernels/log_viterbi_constrained_sequence_kernel_test.cc deleted file mode 100644 index 7e444a496..000000000 --- a/tensorflow_text/core/kernels/log_viterbi_constrained_sequence_kernel_test.cc +++ /dev/null @@ -1,815 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include -#include -#include "tensorflow/core/framework/fake_input.h" -#include "tensorflow/core/framework/node_def_builder.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/kernels/ops_testutil.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/platform/status.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow_text/core/kernels/text_kernels_test_util.h" - -namespace tensorflow { - -using tensorflow::DT_INT32; -using tensorflow::FakeInput; -using tensorflow::NodeDefBuilder; -using tensorflow::Status; -using tensorflow::TensorShape; -using tensorflow::text_kernels_test_util::MatrixEq; -using tensorflow::text_kernels_test_util::VectorEq; - - -// TODO(b/122968457): There are a bunch of tests that only validate !ok instead -// of looking for specific error messages; fix that. - -class LogViterbiConstrainedSequenceTest : public tensorflow::OpsTestBase { - public: - void SetUpOpWithDefaults() { - // Prepare graph. - TF_ASSERT_OK(NodeDefBuilder("tested_op", "ConstrainedSequence") - .Attr("Tin", DT_INT32) - .Attr("use_viterbi", true) - .Attr("use_log_space", true) - .Attr("use_start_and_end_states", true) - .Input(FakeInput()) - .Input(FakeInput()) - .Input(FakeInput()) - .Input(FakeInput()) - .Finalize(node_def())); - TF_ASSERT_OK(InitOp()); - } -}; - -// This test examines evaluations with only a permissions matrix. -TEST_F(LogViterbiConstrainedSequenceTest, - ComputesSingleTransitionWithNoWeights) { - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The first sequence's highest score is 2, but OUT->2 is not ok, so it's 1. - // The second sequence's highest score is 3, which is ok. - // The third sequence's highest score is 0, which is ok. - - // Validate the output. - std::vector expected_transitions({1, 3, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with an empty weights matrix not of rank 2. -TEST_F(LogViterbiConstrainedSequenceTest, - ComputesSingleTransitionWithNonMatrixEmptyWeights) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The first sequence's highest score is 2, but OUT->2 is not ok, so it's 1. - // The second sequence's highest score is 3, which is ok. - // The third sequence's highest score is 0, which is ok. - - // Validate the output. - std::vector expected_transitions({1, 3, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with a 2D score matrix (implicit batch 1). -TEST_F(LogViterbiConstrainedSequenceTest, - ComputesSingleTransitionWithSingleBatchItem) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({1, 4}), // - { - 10.0, 12.0, 13.0, 4.0, // - }); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({1}), {1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The sequence's highest score is 2, but OUT->2 is not ok, so it's 1. - // Validate the output. - std::vector expected_transitions({1}); - std::vector expected_offsets({0, 1}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines int64 input type and int32 output type. -TEST_F(LogViterbiConstrainedSequenceTest, int64inint32out) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The first sequence's highest score is 2, but OUT->2 is not ok, so it's 1. - // The second sequence's highest score is 3, which is ok. - // The third sequence's highest score is 0, which is ok. - // Validate the output. - // Validate the output. - std::vector expected_transitions({1, 3, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test ensures the op can take a sequence length of type {{X},{Y},{Z}} -// (with an outer batch dimension). -TEST_F(LogViterbiConstrainedSequenceTest, TwoDimensionalSequenceLengths) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3, 1}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The first sequence's highest score is 2, but OUT->2 is not ok, so it's 1. - // The second sequence's highest score is 3, which is ok. - // The third sequence's highest score is 0, which is ok. - - // Validate the output. - std::vector expected_transitions({1, 3, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test ensures that final transitions that are forbidden by the permission -// matrix (final->null) are not taken. -TEST_F(LogViterbiConstrainedSequenceTest, - ComputesSingleTransitionWithNoWeightsConstrainedByEnd) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, false, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // The first sequence's highest score is 2, but OUT->2 is not ok; the next - // highest is 1, but 1->OUT is not OK; the next highest is 0, which is OK. - // The second sequence's highest score is 3, OUT->3 is OK and 3->OUT is OK. - // The third sequence's highest score is 0, OUT->0 is OK and 0->OUT is OK. - // Validate the output. - std::vector expected_transitions({0, 3, 0}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with only a weight matrix. -TEST_F(LogViterbiConstrainedSequenceTest, - ComputesSingleTransitionWithNoPermissions) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 7.0, 4.0, // - 1.0, 9.0, 11.0, 5.0, // - -12.0, 3.0, 3.0, 4.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, // - 10.0, 5.0, 3.0, 1.0, 0.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // All scores should be summed with the last row in the weight tensor, so - // the 'real' scores are: - // 1: {20.0, 7.0, 10.0, 5.0} (max is 0) - // 2: {11.0, 14.0, 14.0, 6.0} (max is 2, due to tiebreaker.) - // 3: {-2.0, 8.0, 6.0, 5.0} (max is 1) - // Validate the output. - std::vector expected_transitions({0, 2, 1}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with an empty not rank 2 permissions matrix. -TEST_F(LogViterbiConstrainedSequenceTest, - ComputesSingleTransitionWithNonMatrixEmptyPermissions) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 7.0, 4.0, // - 1.0, 9.0, 11.0, 5.0, // - -12.0, 3.0, 3.0, 4.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, // - 10.0, 5.0, 3.0, 1.0, 0.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // All scores should be summed with the last row in the weight tensor, so - // the 'real' scores are: - // 1: {20.0, 7.0, 10.0, 5.0} (max is 0) - // 2: {11.0, 14.0, 14.0, 6.0} (max is 2, due to tiebreaker.) - // 3: {-2.0, 8.0, 6.0, 5.0} (max is 1) - // Validate the output. - std::vector expected_transitions({0, 2, 1}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test ensures that final transitions are scored with the probability -// of ending the sequence on the transition (x->final->null). -TEST_F(LogViterbiConstrainedSequenceTest, - ComputesSingleTransitionWithNoPermissionsWeightedByEnd) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 7.0, 4.0, // - 1.0, 9.0, 11.0, 5.0, // - -12.0, 3.0, 3.0, 4.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({0, 0}), {}); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.0, 0.0, 0.0, 0.0, -15.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, // - 10.0, 5.0, 3.0, 1.0, 0.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // All scores should be summed with the last row in the weight tensor, so - // the 'real' scores are: - // 1: {5.0, 7.0, 10.0, 5.0} (max is 2 - state 1->null adds -15.) - // 2: {11.0, 14.0, 14.0, 6.0} (max is 2, due to tiebreaker.) - // 3: {-2.0, 8.0, 6.0, 5.0} (max is 1) - // Validate the output. - std::vector expected_transitions({2, 2, 1}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with both weight and permission matrices. -TEST_F(LogViterbiConstrainedSequenceTest, - ComputesSingleTransitionWithWeightsAndPermissions) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 2.0, 7.0, 4.0, // - 1.0, 9.0, 11.0, 5.0, // - -12.0, 3.0, 3.0, 4.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 'OUTSIDE' - false, true, true, true, false, // FROM 'NULL' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), {0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, 0.0, // - 10.0, 5.0, 3.0, 1.0, 0.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // All scores should be summed with the last row in the weight tensor, so - // the 'real' scores are: - // 1: {20.0, 7.0, 10.0, 5.0} (max is 0, but NUL->0 is forbidden, so 2.) - // 2: {11.0, 14.0, 14.0, 6.0} (max is 2, due to tiebreaker.) - // 3: {-2.0, 8.0, 6.0, 5.0} (max is 1) - // Validate the output. - std::vector expected_transitions({2, 2, 1}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines multiple evaluations with both weight and permission -// matrices. -TEST_F(LogViterbiConstrainedSequenceTest, - ComputesMultipleTransitionsWithWeightsAndPermissions) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({2, 2, 4}), // - {{ - 10.0, 12.0, 7.0, 4.0, // Batch 0, step 0 - 13.0, 12.0, 11.0, 10.0, // Batch 0, step 1 - 7.0, 9.0, 11.0, 5.0, // Batch 1, step 0 - 10.0, 15.0, 1.0, 12.0, // Batch 1, step 1 - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({2}), {2, 2}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO NUL - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, false, true, false, false, // FROM 2 - true, true, true, true, true, // FROM 3 (OUT) - true, false, true, true, true, // FROM 'NULL' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), - {-1.0, 1.0, -2.0, 2.0, 0.0, // 0 - 3.0, -3.0, 4.0, -4.0, 0.0, // 1 - 5.0, -5.0, 6.0, -6.0, 0.0, // 2 - -7.0, 7.0, -8.0, 8.0, 0.0, // 3 - 0.0, 1.0, 2.0, 3.0, 0.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // STEP 1: - // All scores should be summed with the last row in the weight tensor, so the - // 'real' scores are: - // B0: { 10.0, [NOTOK], 9.0, 7.0} - // B1: { 7.0, [NOTOK], 13.0, 8.0} - // - // STEP 2: - // (Forbidden transitions are marked with '*' and X stands for the lowest - // possible score.) - // - // BATCH 0: - // Raw scores are: {13.0, 12.0, 11.0, 10.0} - // - // Final state 0: (13.0) Weighted scores are {12.0, 16.0, 18.0, 6.0} - // New totals are {22, X, 27, 18} [max 27 from 2] - // - // Final state 1: (12.0) Weighted scores are {13.0, 9.0, X, 19.0}, - // New totals are {23, X, X, 26} [max 26 from 3] - // - // Final state 2: (11.0) Weighted scores are {9, 15, 21, 3}, - // New totals are {19, X, 30, 10} [max 30 from 2] - // - // Final state 3: (10.0) Weighted scores are {12, 6, X, 18}, - // New totals are {19, X, X, 25} [max 25 from 3] - // - // Top scores are [27, 26, 30, 25] from [2, 3, 2, 3]. - // 2->OUT is X, so final scores are [27, 26, X, 25] for a - // final state of [0] with a sequence of [2->0]. - // - // - // BATCH 1: - // Previous scores are {7, X, 13, 8} - // Raw scores are {10, 15, 1, 12} - // - // Final state 0: Weighted score is {9, 18, 15, 3} - // New totals are {16, X, 28, 11} [max 28 from 2] - // - // Final state 1: Weighted score is {16, 12, 10, 22} - // New totals are {23, X, X*, 30} [max 30 from 3] - // - // Final state 2: Weighted score is {-1, 5, 7, -7} - // New totals are {6, X, 20, 1} [max 20 from 2] - // - // Final state 3: Weighted score is {14, 8, 6, 20} - // New totals are {21, X, X*, 28} [max 28 from 3] - // - // Top scores are [28, 30, 20, 28] from [2, 3, 2, 3]. - // 2->OUT is not valid, so final scores are [28, 30, X*, 28] for a - // final state of [1] with a sequence of [3->1]. - // - - std::vector expected_transitions({2, 0, 3, 1}); - std::vector expected_offsets({0, 2, 4}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines multiple evaluations with both weight and permission -// matrices. -TEST_F(LogViterbiConstrainedSequenceTest, - ComputesMultipleTransitionsWithVaryingLengths) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({2, 2, 4}), // - {{ - 10.0, 12.0, 7.0, 4.0, // Batch 0, step 0 - 0.0, 0.0, 0.0, 0.0, // PAD - 7.0, 9.0, 11.0, 5.0, // Batch 1, step 0 - 10.0, 15.0, 1.0, 12.0, // Batch 1, step 1 - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({2}), {1, 2}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO NUL - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, false, true, false, false, // FROM 2 - true, true, true, true, true, // FROM 3 (OUT) - true, false, true, true, true, // FROM 'NULL' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({5, 5}), - {-1.0, 1.0, -2.0, 2.0, 0.0, // 0 - 3.0, -3.0, 4.0, -4.0, 0.0, // 1 - 5.0, -5.0, 6.0, -6.0, 0.0, // 2 - -7.0, 7.0, -8.0, 8.0, 0.0, // 3 - 0.0, 1.0, 2.0, 3.0, 0.0}); - - TF_ASSERT_OK(RunOpKernel()); - - // STEP 1: - // All scores should be summed with the last row in the weight tensor, so the - // 'real' scores are: - // B0: { 10.0, [NOTOK], 9.0, 7.0} - // B1: { 7.0, [NOTOK], 13.0, 8.0} - // - // STEP 2: - // (Forbidden transitions are marked with '*' and X stands for the lowest - // possible score.) - // - // BATCH 0: - // Batch 0 is complete. - // - // BATCH 1: - // Previous scores are {7, X, 13, 8} - // Raw scores are {10, 15, 1, 12} - // - // Final state 0: Weighted score is {9, 18, 15, 3} - // New totals are {16, X, 28, 11} [max 28 from 2] - // - // Final state 1: Weighted score is {16, 12, 10, 22} - // New totals are {23, X, X*, 30} [max 30 from 3] - // - // Final state 2: Weighted score is {-1, 5, 7, -7} - // New totals are {6, X, 20, 1} [max 20 from 2] - // - // Final state 3: Weighted score is {14, 8, 6, 20} - // New totals are {21, X, X*, 28} [max 28 from 3] - // - // Top scores are [28, 30, 20, 28] from [2, 3, 2, 3]. - // 2->OUT is not valid, so final scores are [28, 30, X*, 28] for a - // final state of [1] with a sequence of [3->1]. - // - - std::vector expected_transitions({0, 3, 1}); - std::vector expected_offsets({0, 1, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test examines evaluations with a fully negative input set. -TEST_F(LogViterbiConstrainedSequenceTest, - ComputesSingleTransitionWithNegativeInputs) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - -10.0, -12.0, -13.0, -4.0, // - -1.0, -12.0, -13.0, -14.0, // - -15.0, -2.0, -3.0, -14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 1, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, true, true, true, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - std::vector expected_transitions({3, 0, 1}); - std::vector expected_offsets({0, 1, 2, 3}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -TEST_F(LogViterbiConstrainedSequenceTest, - ImpossibleSequencesResultInNegativeOnesIfAttrIsSet) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 2, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {2, 2, 2}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - false, false, false, false, false, // FROM 0 - false, false, false, false, false, // FROM 1 - false, false, false, false, false, // FROM 2 - false, false, false, false, false, // FROM 3 - false, false, false, false, false, // FROM 'OUT' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - TF_ASSERT_OK(RunOpKernel()); - - // Validate the output. - - std::vector expected_transitions({-1, -1, -1, -1, -1, -1}); - std::vector expected_offsets({0, 2, 4, 6}); - - // Validate the output. - EXPECT_THAT(*GetOutput(0), VectorEq(expected_transitions)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_offsets)); -} - -// This test ensures the op will throw an error if there are too few scores to -// finalize all the sequences. -TEST_F(LogViterbiConstrainedSequenceTest, ErrorsIfGivenInsufficientScores) { - // Prepare graph. - SetUpOpWithDefaults(); - - // Add the scores input. - AddInputFromArray(TensorShape({3, 1, 4}), // - {{ - 10.0, 12.0, 13.0, 4.0, // - 1.0, 12.0, 13.0, 14.0, // - 15.0, 2.0, 3.0, 14.0, // - }}); - - // Add the sequence_lengths input. - AddInputFromArray(TensorShape({3}), {1, 2, 1}); - - // Add the allowed_transitions input. - AddInputFromArray(TensorShape({5, 5}), - { - // TO 0 TO 1 TO 2 TO 3 TO OUT - true, true, true, true, true, // FROM 0 - true, true, true, true, true, // FROM 1 - true, true, true, true, true, // FROM 2 - true, true, true, true, true, // FROM 3 - true, true, false, true, false, // FROM 'OUTSIDE' - }); - - // Add the transition_weights input. - AddInputFromArray(TensorShape({0, 0}), {}); - - auto result = RunOpKernel(); - EXPECT_FALSE(result.ok()); -} - -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/mst_op_kernels.cc b/tensorflow_text/core/kernels/mst_op_kernels.cc deleted file mode 100644 index 01eb5954b..000000000 --- a/tensorflow_text/core/kernels/mst_op_kernels.cc +++ /dev/null @@ -1,190 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include - -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/platform/thread_annotations.h" -#include "tensorflow/core/util/work_sharder.h" -#include "tensorflow_text/core/kernels/mst_solver.h" - -namespace tensorflow { -namespace text { - -// Op kernel implementation that wraps the |MstSolver|. -template -class MaxSpanningTreeOpKernel : public tensorflow::OpKernel { - public: - explicit MaxSpanningTreeOpKernel(tensorflow::OpKernelConstruction *context) - : tensorflow::OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("forest", &forest_)); - } - - void Compute(tensorflow::OpKernelContext *context) override { - const tensorflow::Tensor &num_nodes_tensor = context->input(0); - const tensorflow::Tensor &scores_tensor = context->input(1); - - // Check ranks. - OP_REQUIRES(context, num_nodes_tensor.dims() == 1, - tensorflow::errors::InvalidArgument( - "num_nodes must be a vector, got shape ", - num_nodes_tensor.shape().DebugString())); - OP_REQUIRES(context, scores_tensor.dims() == 3, - tensorflow::errors::InvalidArgument( - "scores must be rank 3, got shape ", - scores_tensor.shape().DebugString())); - - // Batch size and input dimension (B and M in the op docstring). - const int64 batch_size = scores_tensor.shape().dim_size(0); - const int64 input_dim = scores_tensor.shape().dim_size(1); - - // Check shapes. - const tensorflow::TensorShape shape_b({batch_size}); - const tensorflow::TensorShape shape_bxm({batch_size, input_dim}); - const tensorflow::TensorShape shape_bxmxm( - {batch_size, input_dim, input_dim}); - OP_REQUIRES( - context, num_nodes_tensor.shape() == shape_b, - tensorflow::errors::InvalidArgument( - "num_nodes misshapen: got ", num_nodes_tensor.shape().DebugString(), - " but expected ", shape_b.DebugString())); - OP_REQUIRES( - context, scores_tensor.shape() == shape_bxmxm, - tensorflow::errors::InvalidArgument( - "scores misshapen: got ", scores_tensor.shape().DebugString(), - " but expected ", shape_bxmxm.DebugString())); - - // Create outputs. - tensorflow::Tensor *max_scores_tensor = nullptr; - tensorflow::Tensor *argmax_sources_tensor = nullptr; - OP_REQUIRES_OK(context, - context->allocate_output(0, shape_b, &max_scores_tensor)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape_bxm, - &argmax_sources_tensor)); - - // Acquire shaped and typed references. - const BatchedSizes num_nodes_b = num_nodes_tensor.vec(); - const BatchedScores scores_bxmxm = scores_tensor.tensor(); - BatchedMaxima max_scores_b = max_scores_tensor->vec(); - BatchedSources argmax_sources_bxm = argmax_sources_tensor->matrix(); - - // Solve the batch of MST problems in parallel. Set a high cycles per unit - // to encourage finer sharding. - constexpr int64 kCyclesPerUnit = 1000 * 1000 * 1000; - std::vector statuses(batch_size); - context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor( - batch_size, kCyclesPerUnit, [&](int64 begin, int64 end) { - for (int64 problem = begin; problem < end; ++problem) { - statuses[problem] = RunSolver(problem, num_nodes_b, scores_bxmxm, - max_scores_b, argmax_sources_bxm); - } - }); - for (const absl::Status &status : statuses) { - OP_REQUIRES_OK(context, status); - } - } - - private: - using BatchedSizes = typename tensorflow::TTypes::ConstVec; - using BatchedScores = typename tensorflow::TTypes::ConstTensor; - using BatchedMaxima = typename tensorflow::TTypes::Vec; - using BatchedSources = typename tensorflow::TTypes::Matrix; - - // Solves for the maximum spanning tree of the digraph defined by the values - // at index |problem| in |num_nodes_b| and |scores_bxmxm|. On success, sets - // the values at index |problem| in |max_scores_b| and |argmax_sources_bxm|. - // On error, returns non-OK. - absl::Status RunSolver(int problem, BatchedSizes num_nodes_b, - BatchedScores scores_bxmxm, BatchedMaxima max_scores_b, - BatchedSources argmax_sources_bxm) const { - // Check digraph size overflow. - const int32 num_nodes = num_nodes_b(problem); - const int32 input_dim = argmax_sources_bxm.dimension(1); - if (num_nodes > input_dim) { - return tensorflow::errors::InvalidArgument( - "number of nodes in digraph ", problem, - " overflows input dimension: got ", num_nodes, - " but expected <= ", input_dim); - } - if (num_nodes >= std::numeric_limits::max()) { - return tensorflow::errors::InvalidArgument( - "number of nodes in digraph ", problem, " overflows index type: got ", - num_nodes, " but expected < ", std::numeric_limits::max()); - } - const Index num_nodes_index = static_cast(num_nodes); - - MstSolver solver; - TF_RETURN_IF_ERROR(solver.Init(forest_, num_nodes_index)); - - // Populate the solver with arcs and root selections. Note that non-finite - // scores are treated as nonexistent arcs or roots. - for (Index target = 0; target < num_nodes_index; ++target) { - for (Index source = 0; source < num_nodes_index; ++source) { - const Score score = scores_bxmxm(problem, target, source); - if (!std::isfinite(static_cast(score))) continue; - if (source == target) { // root - solver.AddRoot(target, score); - } else { // arc - solver.AddArc(source, target, score); - } - } - } - - std::vector argmax(num_nodes); - TF_RETURN_IF_ERROR(solver.Solve(&argmax)); - - // Output the tree and accumulate its score. - Score max_score = 0; - for (Index target = 0; target < num_nodes_index; ++target) { - const Index source = argmax[target]; - argmax_sources_bxm(problem, target) = source; - max_score += scores_bxmxm(problem, target, source); - } - max_scores_b(problem) = max_score; - - // Pad the source list with -1. - for (int32 i = num_nodes; i < input_dim; ++i) { - argmax_sources_bxm(problem, i) = -1; - } - - return absl::OkStatus(); - } - - private: - bool forest_ = false; -}; - -// Use Index=uint16, which allows digraphs containing up to 32,767 nodes. -REGISTER_KERNEL_BUILDER(Name("MaxSpanningTree") - .Device(tensorflow::DEVICE_CPU) - .TypeConstraint("T"), - MaxSpanningTreeOpKernel); -REGISTER_KERNEL_BUILDER(Name("MaxSpanningTree") - .Device(tensorflow::DEVICE_CPU) - .TypeConstraint("T"), - MaxSpanningTreeOpKernel); -REGISTER_KERNEL_BUILDER(Name("MaxSpanningTree") - .Device(tensorflow::DEVICE_CPU) - .TypeConstraint("T"), - MaxSpanningTreeOpKernel); - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/mst_solver.h b/tensorflow_text/core/kernels/mst_solver.h index b75e964bf..dbcc545ff 100644 --- a/tensorflow_text/core/kernels/mst_solver.h +++ b/tensorflow_text/core/kernels/mst_solver.h @@ -12,596 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef TENSORFLOW_TEXT_CORE_KERNELS_MST_SOLVER_H_ -#define TENSORFLOW_TEXT_CORE_KERNELS_MST_SOLVER_H_ +#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_MST_SOLVER_H_ +#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_MST_SOLVER_H_ -#include +#include "tensorflow/core/kernels/text/mst_solver.h" -#include -#include -#include -#include -#include -#include - -#include "absl/strings/str_cat.h" -#include "absl/types/span.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow_text/core/kernels/disjoint_set_forest.h" - -namespace tensorflow { -namespace text { - -// Maximum spanning tree solver for directed graphs. Thread-compatible. -// -// The solver operates on a digraph of n nodes and m arcs and outputs a maximum -// spanning tree rooted at any node. Scores can be associated with arcs and -// root selections, and the score of a tree is the sum of the relevant arc and -// root-selection scores. -// -// The implementation is based on: -// -// go/tarjan-1977 google-only -// R.E. Tarjan. 1977. Finding Optimum Branchings. Networks 7(1), pp. 25-35. -// [In particular, see Section 4 "a modification for dense graphs"] -// -// which itself is an improvement of the Chu-Liu-Edmonds algorithm. Note also -// the correction in: -// -// go/camerini-1979 google-only -// P.M. Camerini, L. Fratta, F. Maffioli. 1979. A Note on Finding Optimum -// Branchings. Networks 9(4), pp. 309-312. -// -// The solver runs in O(n^2) time, which is optimal for dense digraphs but slow -// for sparse digraphs where O(m + n log n) can be achieved. The solver uses -// O(n^2) space to store the digraph, which is also optimal for dense digraphs. -// -// Although this algorithm has an inferior asymptotic runtime on sparse graphs, -// it avoids high-constant-overhead data structures like Fibonacci heaps, which -// are required in the asymptotically faster algorithms. Therefore, this solver -// may still be competitive on small sparse graphs. -// -// TODO(terrykoo): If we start running on large sparse graphs, implement the -// following, which runs in O(m + n log n): -// -// go/tarjan-1986 google-only -// H.N. Gabow, Z. Galil, T. Spencer, and R.E. Tarjan. 1986. Efficient -// algorithms for finding minimum spanning trees in undirected and directed -// graphs. Combinatorica, 6(2), pp. 109-122. -// -// Template args: -// Index: An unsigned integral type wide enough to hold 2n. -// Score: A signed arithmetic (integral or floating-point) type. -template -class MstSolver { - public: - static_assert(std::is_integral::value, "Index must be integral"); - static_assert(!std::is_signed::value, "Index must be unsigned"); - static_assert(std::is_arithmetic::value, "Score must be arithmetic"); - static_assert(std::is_signed::value, "Score must be signed"); - using IndexType = Index; - using ScoreType = Score; - - // Creates an empty solver. Call Init() before use. - MstSolver() = default; - - // Initializes this for a digraph with |num_nodes| nodes, or returns non-OK on - // error. Discards existing state; call AddArc() and AddRoot() to add arcs - // and root selections. If |forest| is true, then this solves for a maximum - // spanning forest (i.e., a set of disjoint trees that span the digraph). - absl::Status Init(bool forest, Index num_nodes); - - // Adds an arc from the |source| node to the |target| node with the |score|. - // The |source| and |target| must be distinct node indices in [0,n), and the - // |score| must be finite. Calling this multiple times on the same |source| - // and |target| overwrites the score instead of adding parallel arcs. - void AddArc(Index source, Index target, Score score); - - // As above, but adds a root selection for the |root| node with the |score|. - void AddRoot(Index root, Score score); - - // Returns the score of the arc from |source| to |target|, which must have - // been added by a previous call to AddArc(). - Score ArcScore(Index source, Index target) const; - - // Returns the score of selecting the |root|, which must have been added by a - // previous call to AddRoot(). - Score RootScore(Index root) const; - - // Populates |argmax| with the maximum directed spanning tree of the current - // digraph, or returns non-OK on error. The |argmax| array must contain at - // least n elements. On success, argmax[t] is the source of the arc directed - // into t, or t itself if t is a root. - // - // NB: If multiple spanning trees achieve the maximum score, |argmax| will be - // set to one of the maximal trees, but it is unspecified which one. - absl::Status Solve(absl::Span argmax); - - // Convience method - absl::Status Solve(std::vector *argmax) { - return Solve(absl::MakeSpan(argmax->data(), argmax->size())); - } - - private: - // Implementation notes: - // - // The solver does not operate on the "original" digraph as specified by the - // user, but a "transformed" digraph that differs as follows: - // - // * The transformed digraph adds an "artificial root" node at index 0 and - // offsets all original node indices by +1 to make room. For each root - // selection, the artificial root has one outbound arc directed into the - // candidate root that carries the root-selection score. The artificial - // root has no inbound arcs. - // - // * When solving for a spanning tree (i.e., when |forest_| is false), the - // outbound arcs of the artificial root are penalized to ensure that the - // artificial root has exactly one child. - // - // In the remainder of this file, all mentions of nodes, arcs, etc., refer to - // the transformed digraph unless otherwise specified. - // - // The algorithm is divided into two phases, the "contraction phase" and the - // "expansion phase". The contraction phase finds the arcs that make up the - // maximum spanning tree by applying a series of "contractions" which further - // modify the digraph. The expansion phase "expands" these modifications and - // recovers the maximum spanning tree in the original digraph. - // - // During the contraction phase, the algorithm selects the best inbound arc - // for each node. These arcs can form cycles, which are "contracted" by - // removing the cycle nodes and replacing them with a new contracted node. - // Since each contraction removes 2 or more cycle nodes and adds 1 contracted - // node, at most n-1 contractions will occur. (The digraph initially contains - // n+1 nodes, but one is the artificial root, which cannot form a cycle). - // - // When contracting a cycle, nodes are not explicitly removed and replaced. - // Instead, a contracted node is appended to the digraph and the cycle nodes - // are remapped to the contracted node, which implicitly removes and replaces - // the cycle. As a result, each contraction actually increases the size of - // the digraph, up to a maximum of 2n nodes. One advantage of adding and - // remapping nodes is that it is convenient to recover the argmax spanning - // tree during the expansion phase. - // - // Note that contractions can be nested, because the best inbound arc for a - // contracted node may itelf form a cycle. During the expansion phase, the - // algorithm picks a root of the hierarchy of contracted nodes, breaks the - // cycle it represents, and repeats until all cycles are broken. - - // Constants, as enums to avoid the need for static variable definitions. - enum Constants : Index { - // An index reserved for "null" values. - kNullIndex = std::numeric_limits::max(), - }; - - // A possibly-nonexistent arc in the digraph. - struct Arc { - // Creates a nonexistent arc. - Arc() = default; - - // Returns true if this arc exists. - bool Exists() const { return target != 0; } - - // Returns true if this is a root-selection arc. - bool IsRoot() const { return source == 0; } - - // Returns a string representation of this arc. - std::string DebugString() const { - if (!Exists()) return "[null]"; - if (IsRoot()) { - return absl::StrCat("[*->", target, "=", score, "]"); - } - return absl::StrCat("[", source, "->", target, "=", score, "]"); - } - - // Score of this arc. - Score score; - - // Source of this arc in the initial digraph. - Index source; - - // Target of this arc in the initial digraph, or 0 if this is nonexistent. - Index target = 0; - }; - - // Returns the index, in |arcs_|, of the arc from |source| to |target|. The - // |source| must be one of the initial n+1 nodes. - size_t ArcIndex(size_t source, size_t target) const; - - // Penalizes the root arc scores to ensure that this finds a tree, or does - // nothing if |forest_| is true. Must be called before ContractionPhase(). - void MaybePenalizeRootScoresForTree(); - - // Returns the maximum inbound arc of the |node|, or null if there is none. - const Arc *MaximumInboundArc(Index node) const; - - // Merges the inbound arcs of the |cycle_node| into the inbound arcs of the - // |contracted_node|. Arcs are merged as follows: - // * If the source and target of the arc belong to the same strongly-connected - // component, it is ignored. - // * If exactly one of the nodes had an arc from some source, then on exit the - // |contracted_node| has that arc. - // * If both of the nodes had an arc from the same source, then on exit the - // |contracted_node| has the better-scoring arc. - // The |score_offset| is added to the arc scores of the |cycle_node| before - // they are merged into the |contracted_node|. - void MergeInboundArcs(Index cycle_node, Score score_offset, - Index contracted_node); - - // Contracts the cycle in |argmax_arcs_| that contains the |node|. - void ContractCycle(Index node); - - // Runs the contraction phase of the solver, or returns non-OK on error. This - // phase finds the best inbound arc for each node, contracting cycles as they - // are formed. Stops when every node has selected an inbound arc and there - // are no cycles. - absl::Status ContractionPhase(); - - // Runs the expansion phase of the solver, or returns non-OK on error. This - // phase expands each contracted node, breaks cycles, and populates |argmax| - // with the maximum spanning tree. - absl::Status ExpansionPhase(absl::Span argmax); - - // If true, solve for a spanning forest instead of a spanning tree. - bool forest_ = false; - - // The number of nodes in the original digraph; i.e., n. - Index num_original_nodes_ = 0; - - // The number of nodes in the initial digraph; i.e., n+1. - Index num_initial_nodes_ = 0; - - // The maximum number of possible nodes in the digraph; i.e., 2n. - Index num_possible_nodes_ = 0; - - // The number of nodes in the current digraph, which grows from n+1 to 2n. - Index num_current_nodes_ = 0; - - // Column-major |num_initial_nodes_| x |num_current_nodes_| matrix of arcs, - // where rows and columns correspond to source and target nodes. Columns are - // added as cycles are contracted into new nodes. - // - // TODO(terrykoo): It is possible to squeeze the nonexistent arcs out of each - // column and run the algorithm with each column being a sorted list (sorted - // by source node). This is in fact the suggested representation in Tarjan - // (1977). This won't improve the asymptotic runtime but still might improve - // speed in practice. I haven't done this because it adds complexity versus - // checking Arc::Exists() in a few loops. Try this out when we can benchmark - // this on real data. - std::vector arcs_; - - // Disjoint-set forests tracking the weakly-connected and strongly-connected - // components of the initial digraph, based on the arcs in |argmax_arcs_|. - // Weakly-connected components are used to detect cycles; strongly-connected - // components are used to detect self-loops. - DisjointSetForest weak_components_; - DisjointSetForest strong_components_; - - // A disjoint-set forest that maps each node to the top-most contracted node - // that contains it. Nodes that have not been contracted map to themselves. - // NB: This disjoint-set forest does not use union by rank so we can control - // the outcome of a set union. There will only be O(n) operations on this - // instance, so the increased O(log n) cost of each operation is acceptable. - DisjointSetForest contracted_nodes_; - - // An array that represents the history of cycle contractions, as follows: - // * If contracted_into_[t] is |kNullIndex|, then t is deleted. - // * If contracted_into_[t] is 0, then t is a "root" contracted node; i.e., t - // has not been contracted into another node. - // * Otherwise, contracted_into_[t] is the node into which t was contracted. - std::vector contracted_into_; - - // The maximum inbound arc for each node. The first element is null because - // the artificial root has no inbound arcs. - std::vector argmax_arcs_; - - // Workspace for ContractCycle(), which records the nodes and arcs in the - // cycle being contracted. - std::vector> cycle_; -}; - -// Implementation details below. - -template -absl::Status MstSolver::Init(bool forest, Index num_nodes) { - if (num_nodes <= 0) { - return tensorflow::errors::InvalidArgument("Non-positive number of nodes: ", - num_nodes); - } - - // Upcast to size_t to avoid overflow. - if (2 * static_cast(num_nodes) >= static_cast(kNullIndex)) { - return tensorflow::errors::InvalidArgument("Too many nodes: ", num_nodes); - } - - forest_ = forest; - num_original_nodes_ = num_nodes; - num_initial_nodes_ = num_original_nodes_ + 1; - num_possible_nodes_ = 2 * num_original_nodes_; - num_current_nodes_ = num_initial_nodes_; - - // Allocate the full n+1 x 2n matrix, but start with a n+1 x n+1 prefix. - const size_t num_initial_arcs = static_cast(num_initial_nodes_) * - static_cast(num_initial_nodes_); - const size_t num_possible_arcs = static_cast(num_initial_nodes_) * - static_cast(num_possible_nodes_); - arcs_.reserve(num_possible_arcs); - arcs_.assign(num_initial_arcs, {}); - - weak_components_.Init(num_initial_nodes_); - strong_components_.Init(num_initial_nodes_); - contracted_nodes_.Init(num_possible_nodes_); - contracted_into_.assign(num_possible_nodes_, 0); - argmax_arcs_.assign(num_possible_nodes_, nullptr); - - // This doesn't need to be cleared now; it will be cleared before use. - cycle_.reserve(num_original_nodes_); - - return absl::OkStatus(); -} - -template -void MstSolver::AddArc(Index source, Index target, Score score) { - DCHECK_NE(source, target); - DCHECK(std::isfinite(score)); - Arc &arc = arcs_[ArcIndex(source + 1, target + 1)]; - arc.score = score; - arc.source = source + 1; - arc.target = target + 1; -} - -template -void MstSolver::AddRoot(Index root, Score score) { - DCHECK(std::isfinite(score)); - Arc &arc = arcs_[ArcIndex(0, root + 1)]; - arc.score = score; - arc.source = 0; - arc.target = root + 1; -} - -template -Score MstSolver::ArcScore(Index source, Index target) const { - const Arc &arc = arcs_[ArcIndex(source + 1, target + 1)]; - DCHECK(arc.Exists()); - return arc.score; -} - -template -Score MstSolver::RootScore(Index root) const { - const Arc &arc = arcs_[ArcIndex(0, root + 1)]; - DCHECK(arc.Exists()); - return arc.score; -} - -template -absl::Status MstSolver::Solve(absl::Span argmax) { - MaybePenalizeRootScoresForTree(); - TF_RETURN_IF_ERROR(ContractionPhase()); - TF_RETURN_IF_ERROR(ExpansionPhase(argmax)); - return absl::OkStatus(); -} - -template -inline size_t MstSolver::ArcIndex(size_t source, - size_t target) const { - DCHECK_LT(source, num_initial_nodes_); - DCHECK_LT(target, num_current_nodes_); - return source + target * static_cast(num_initial_nodes_); -} - -template -void MstSolver::MaybePenalizeRootScoresForTree() { - if (forest_) return; - DCHECK_EQ(num_current_nodes_, num_initial_nodes_) - << "Root penalties must be applied before starting the algorithm."; - - // Find the minimum and maximum arc scores. These allow us to bound the range - // of possible tree scores. - Score max_score = std::numeric_limits::lowest(); - Score min_score = std::numeric_limits::max(); - for (const Arc &arc : arcs_) { - if (!arc.Exists()) continue; - max_score = std::max(max_score, arc.score); - min_score = std::min(min_score, arc.score); - } - - // Nothing to do, no existing arcs. - if (max_score < min_score) return; - - // A spanning tree or forest contains n arcs. The penalty below ensures that - // every structure with one root has a higher score than every structure with - // two roots, and so on. - const Score root_penalty = 1 + num_initial_nodes_ * (max_score - min_score); - for (Index root = 1; root < num_initial_nodes_; ++root) { - Arc &arc = arcs_[ArcIndex(0, root)]; - if (!arc.Exists()) continue; - arc.score -= root_penalty; - } -} - -template -const typename MstSolver::Arc * -MstSolver::MaximumInboundArc(Index node) const { - const Arc *__restrict arc = &arcs_[ArcIndex(0, node)]; - const Arc *arc_end = arc + num_initial_nodes_; - - Score max_score = std::numeric_limits::lowest(); - const Arc *argmax_arc = nullptr; - for (; arc < arc_end; ++arc) { - if (!arc->Exists()) continue; - const Score score = arc->score; - if (max_score <= score) { - max_score = score; - argmax_arc = arc; - } - } - return argmax_arc; -} - -template -void MstSolver::MergeInboundArcs(Index cycle_node, - Score score_offset, - Index contracted_node) { - const Arc *__restrict cycle_arc = &arcs_[ArcIndex(0, cycle_node)]; - const Arc *cycle_arc_end = cycle_arc + num_initial_nodes_; - Arc *__restrict contracted_arc = &arcs_[ArcIndex(0, contracted_node)]; - - for (; cycle_arc < cycle_arc_end; ++cycle_arc, ++contracted_arc) { - if (!cycle_arc->Exists()) continue; // nothing to merge - - // Skip self-loops; they are useless because they cannot be used to break - // the cycle represented by the |contracted_node|. - if (strong_components_.SameSet(cycle_arc->source, cycle_arc->target)) { - continue; - } - - // Merge the |cycle_arc| into the |contracted_arc|. - const Score cycle_score = cycle_arc->score + score_offset; - if (!contracted_arc->Exists() || contracted_arc->score < cycle_score) { - contracted_arc->score = cycle_score; - contracted_arc->source = cycle_arc->source; - contracted_arc->target = cycle_arc->target; - } - } -} - -template -void MstSolver::ContractCycle(Index node) { - // Append a new node for the contracted cycle. - const Index contracted_node = num_current_nodes_++; - DCHECK_LE(num_current_nodes_, num_possible_nodes_); - arcs_.resize(arcs_.size() + num_initial_nodes_); - - // We make two passes through the cycle. The first pass updates everything - // except the |arcs_|, and the second pass updates the |arcs_|. The |arcs_| - // must be updated in a second pass because MergeInboundArcs() requires that - // the |strong_components_| are updated with the newly-contracted cycle. - cycle_.clear(); - Index cycle_node = node; - do { - // Gather the nodes and arcs in |cycle_| for the second pass. - const Arc *cycle_arc = argmax_arcs_[cycle_node]; - DCHECK(!cycle_arc->IsRoot()) << cycle_arc->DebugString(); - cycle_.emplace_back(cycle_node, cycle_arc); - - // Mark the cycle nodes as members of a strongly-connected component. - strong_components_.Union(cycle_arc->source, cycle_arc->target); - - // Mark the cycle nodes as members of the new contracted node. Juggling is - // required because |contracted_nodes_| also determines the next cycle node. - const Index next_node = contracted_nodes_.FindRoot(cycle_arc->source); - contracted_nodes_.UnionOfRoots(cycle_node, contracted_node); - contracted_into_[cycle_node] = contracted_node; - cycle_node = next_node; - - // When the cycle repeats, |cycle_node| will be equal to |contracted_node|, - // not |node|, because the first iteration of this loop mapped |node| to - // |contracted_node| in |contracted_nodes_|. - } while (cycle_node != contracted_node); - - // Merge the inbound arcs of each cycle node into the |contracted_node|. - for (const auto &node_and_arc : cycle_) { - // Set the |score_offset| to the cost of breaking the cycle by replacing the - // arc currently directed into the |cycle_node|. - const Index cycle_node = node_and_arc.first; - const Score score_offset = -node_and_arc.second->score; - MergeInboundArcs(cycle_node, score_offset, contracted_node); - } -} - -template -absl::Status MstSolver::ContractionPhase() { - // Skip the artificial root since it has no inbound arcs. - for (Index target = 1; target < num_current_nodes_; ++target) { - // Find the maximum inbound arc for the current |target|, if any. - const Arc *arc = MaximumInboundArc(target); - if (arc == nullptr) { - return tensorflow::errors::FailedPrecondition("Infeasible digraph"); - } - argmax_arcs_[target] = arc; - - // The articifial root cannot be part of a cycle, so we do not need to check - // for cycles or even update its membership in the connected components. - if (arc->IsRoot()) continue; - - // Since every node has at most one selected inbound arc, cycles can be - // detected using weakly-connected components. - const Index source_component = weak_components_.FindRoot(arc->source); - const Index target_component = weak_components_.FindRoot(arc->target); - if (source_component == target_component) { - // Cycle detected; contract it into a new node. - ContractCycle(target); - } else { - // No cycles, just update the weakly-connected components. - weak_components_.UnionOfRoots(source_component, target_component); - } - } - - return absl::OkStatus(); -} - -template -absl::Status MstSolver::ExpansionPhase(absl::Span argmax) { - if (argmax.size() < num_original_nodes_) { - return tensorflow::errors::InvalidArgument( - "Argmax array too small: ", num_original_nodes_, - " elements required, but got ", argmax.size()); - } - - // Select and expand a root contracted node until no contracted nodes remain. - // Thanks to the (topological) order in which contracted nodes are appended, - // root contracted nodes are easily enumerated using a backward scan. After - // this loop, entries [1,n] of |argmax_arcs_| provide the arcs of the maximum - // spanning tree. - for (Index i = num_current_nodes_ - 1; i >= num_initial_nodes_; --i) { - if (contracted_into_[i] == kNullIndex) continue; // already deleted - const Index root = i; // if not deleted, must be a root due to toposorting - - // Copy the cycle-breaking arc to its specified target. - const Arc *arc = argmax_arcs_[root]; - argmax_arcs_[arc->target] = arc; - - // The |arc| not only breaks the cycle associated with the |root|, but also - // breaks every nested cycle between the |root| and the target of the |arc|. - // Delete the contracted nodes corresponding to all broken cycles. - Index node = contracted_into_[arc->target]; - while (node != kNullIndex && node != root) { - const Index parent = contracted_into_[node]; - contracted_into_[node] = kNullIndex; - node = parent; - } - } - - // Copy the spanning tree from |argmax_arcs_| to |argmax|. Also count roots - // for validation below. - Index num_roots = 0; - for (Index target = 0; target < num_original_nodes_; ++target) { - const Arc &arc = *argmax_arcs_[target + 1]; - DCHECK_EQ(arc.target, target + 1) << arc.DebugString(); - if (arc.IsRoot()) { - ++num_roots; - argmax[target] = target; - } else { - argmax[target] = arc.source - 1; - } - } - DCHECK_GE(num_roots, 1); - - // Even when |forest_| is false, |num_roots| can still be more than 1. While - // the root score penalty discourages structures with multiple root arcs, it - // is not a hard constraint. For example, if the original digraph contained - // one root selection per node and no other arcs, the solver would incorrectly - // produce an all-root structure in spite of the root score penalty. As this - // example illustrates, however, |num_roots| will be more than 1 if and only - // if the original digraph is infeasible for trees. - if (!forest_ && num_roots != 1) { - return tensorflow::errors::FailedPrecondition("Infeasible digraph"); - } - - return absl::OkStatus(); -} - -} // namespace text -} // namespace tensorflow - -#endif // TENSORFLOW_TEXT_CORE_KERNELS_MST_SOLVER_H_ +#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_MST_SOLVER_H_ diff --git a/tensorflow_text/core/kernels/mst_solver_random_comparison_test.cc b/tensorflow_text/core/kernels/mst_solver_random_comparison_test.cc deleted file mode 100644 index 9896801b5..000000000 --- a/tensorflow_text/core/kernels/mst_solver_random_comparison_test.cc +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include -#include -#include - -#include -#include -#include "absl/flags/flag.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow_text/core/kernels/mst_solver.h" -#include "tensorflow_text/core/kernels/spanning_tree_iterator.h" - -ABSL_FLAG(int64_t, seed, 0, - "Seed for random comparison tests, or 0 for a weak random seed."); -ABSL_FLAG(int, num_trials, 3, "Number of trials for random comparison tests."); - -namespace tensorflow { -namespace text { - -using ::testing::Contains; - -// Returns the random seed, or 0 for a weak random seed. -int64 GetSeed() { return absl::GetFlag(FLAGS_seed); } - -// Returns the number of trials to run for each random comparison. -int64 GetNumTrials() { return absl::GetFlag(FLAGS_num_trials); } - -// Testing rig. Runs a comparison between a brute-force MST solver and the -// MstSolver<> on random digraphs. When the first test parameter is true, -// solves for forests instead of trees. The second test parameter defines the -// size of the test digraph. -class MstSolverRandomComparisonTest - : public ::testing::TestWithParam<::testing::tuple> { - protected: - // Use integer scores so score comparisons are exact. - using Solver = MstSolver; - - // An array providing a source node for each node. Roots are self-loops. - using SourceList = SpanningTreeIterator::SourceList; - - // A row-major n x n matrix whose i,j entry gives the score of the arc from i - // to j, and whose i,i entry gives the score of selecting i as a root. - using ScoreMatrix = std::vector; - - // Returns true if this should be a forest. - bool forest() const { return ::testing::get<0>(GetParam()); } - - // Returns the number of nodes for digraphs. - uint32 num_nodes() const { return ::testing::get<1>(GetParam()); } - - // Returns the score of the arcs in |sources| based on the |scores|. - int32 ScoreArcs(const ScoreMatrix &scores, const SourceList &sources) const { - CHECK_EQ(num_nodes() * num_nodes(), scores.size()); - int32 score = 0; - for (uint32 target = 0; target < num_nodes(); ++target) { - const uint32 source = sources[target]; - score += scores[target + source * num_nodes()]; - } - return score; - } - - // Returns the score of the maximum spanning tree (or forest, if the first - // test parameter is true) of the dense digraph defined by the |scores|, and - // sets |argmax_trees| to contain all maximal trees. - int32 RunBruteForceMstSolver(const ScoreMatrix &scores, - std::set *argmax_trees) { - CHECK_EQ(num_nodes() * num_nodes(), scores.size()); - int32 max_score; - argmax_trees->clear(); - - iterator_.ForEachTree(num_nodes(), [&](const SourceList &sources) { - const int32 score = ScoreArcs(scores, sources); - if (argmax_trees->empty() || max_score < score) { - max_score = score; - argmax_trees->clear(); - argmax_trees->insert(sources); - } else if (max_score == score) { - argmax_trees->insert(sources); - } - }); - - return max_score; - } - - // As above, but uses the |solver_| and extracts only one |argmax_tree|. - int32 RunMstSolver(const ScoreMatrix &scores, SourceList *argmax_tree) { - CHECK_EQ(num_nodes() * num_nodes(), scores.size()); - TF_CHECK_OK(solver_.Init(forest(), num_nodes())); - - // Add all roots and arcs. - for (uint32 source = 0; source < num_nodes(); ++source) { - for (uint32 target = 0; target < num_nodes(); ++target) { - const int32 score = scores[target + source * num_nodes()]; - if (source == target) { - solver_.AddRoot(target, score); - } else { - solver_.AddArc(source, target, score); - } - } - } - - // Solve for the max spanning tree. - argmax_tree->resize(num_nodes()); - TF_CHECK_OK(solver_.Solve(argmax_tree)); - return ScoreArcs(scores, *argmax_tree); - } - - // Returns a random ScoreMatrix spanning num_nodes() nodes. - ScoreMatrix RandomScores() { - ScoreMatrix scores(num_nodes() * num_nodes()); - for (int32 &value : scores) value = static_cast(prng_() % 201) - 100; - return scores; - } - - // Runs a comparison between MstSolver and BruteForceMst on random digraphs of - // num_nodes() nodes, for the specified number of trials. - void RunComparison() { - // Seed the PRNG, possibly non-deterministically. Log the seed value so the - // test results can be reproduced, even when the seed is non-deterministic. - uint32 seed = GetSeed(); - if (seed == 0) seed = time(nullptr); - prng_.seed(seed); - LOG(INFO) << "seed = " << seed; - - const int num_trials = GetNumTrials(); - for (int trial = 0; trial < num_trials; ++trial) { - const ScoreMatrix scores = RandomScores(); - - std::set expected_argmax_trees; - const int32 expected_max_score = - RunBruteForceMstSolver(scores, &expected_argmax_trees); - - SourceList actual_argmax_tree; - const int32 actual_max_score = RunMstSolver(scores, &actual_argmax_tree); - - // In case of ties, MstSolver will find a maximal spanning tree, but we - // don't know which one. - EXPECT_EQ(expected_max_score, actual_max_score); - ASSERT_THAT(expected_argmax_trees, Contains(actual_argmax_tree)); - } - } - - // Tree iterator for brute-force solver. - SpanningTreeIterator iterator_{forest()}; - - // MstSolver<> instance used by the test. Reused across all MST invocations - // to exercise reuse. - Solver solver_; - - // Pseudo-random number generator. - std::mt19937 prng_; -}; - -INSTANTIATE_TEST_SUITE_P(AllowForest, MstSolverRandomComparisonTest, - ::testing::Combine(::testing::Bool(), - ::testing::Range(1, 9))); - -TEST_P(MstSolverRandomComparisonTest, Comparison) { RunComparison(); } - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/mst_solver_test.cc b/tensorflow_text/core/kernels/mst_solver_test.cc deleted file mode 100644 index 782f7817e..000000000 --- a/tensorflow_text/core/kernels/mst_solver_test.cc +++ /dev/null @@ -1,273 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/mst_solver.h" - -#include -#include -#include - -#include -#include -#include "tensorflow/core/lib/core/status_test_util.h" - -namespace tensorflow { -namespace text { - -// Testing rig. -// -// Template args: -// Solver: An instantiation of the MstSolver<> template. -template -class MstSolverTest : public ::testing::Test { - protected: - using Index = typename Solver::IndexType; - using Score = typename Solver::ScoreType; - - // Adds directed arcs for all |num_nodes| nodes to the |solver_| with the - // |score|. - void AddAllArcs(Index num_nodes, Score score) { - for (Index source = 0; source < num_nodes; ++source) { - for (Index target = 0; target < num_nodes; ++target) { - if (source == target) continue; - solver_.AddArc(source, target, score); - } - } - } - - // Adds root selections for all |num_nodes| nodes to the |solver_| with the - // |score|. - void AddAllRoots(Index num_nodes, Score score) { - for (Index root = 0; root < num_nodes; ++root) { - solver_.AddRoot(root, score); - } - } - - // Runs the |solver_| using an argmax array of size |argmax_array_size| and - // expects it to fail with an error message that matches |error_substr|. - void SolveAndExpectError(int argmax_array_size, - const std::string &error_message_substr) { - std::vector argmax(argmax_array_size); - EXPECT_TRUE(absl::StrContains(solver_.Solve(&argmax).ToString(), - error_message_substr)); - } - - // As above, but expects success. Does not assert anything about the solution - // produced by the solver. - void SolveAndExpectOk(int argmax_array_size) { - std::vector argmax(argmax_array_size); - TF_EXPECT_OK(solver_.Solve(&argmax)); - } - - // As above, but expects the solution to be |expected_argmax| and infers the - // argmax array size. - void SolveAndExpectArgmax(const std::vector &expected_argmax) { - std::vector actual_argmax(expected_argmax.size()); - TF_ASSERT_OK(solver_.Solve(&actual_argmax)); - EXPECT_EQ(expected_argmax, actual_argmax); - } - - // MstSolver<> instance used by the test. Reused across all MST problems in - // each test to exercise reuse. - Solver solver_; -}; - -using Solvers = - ::testing::Types, MstSolver, - MstSolver, MstSolver, - MstSolver>; -TYPED_TEST_SUITE(MstSolverTest, Solvers); - -TYPED_TEST(MstSolverTest, FailIfNoNodes) { - for (const bool forest : {false, true}) { - EXPECT_TRUE(absl::StrContains(this->solver_.Init(forest, 0).ToString(), - "Non-positive number of nodes")); - } -} - -TYPED_TEST(MstSolverTest, FailIfTooManyNodes) { - // Set to a value that would overflow when doubled. - const auto kNumNodes = - (std::numeric_limits::max() / 2) + 10; - for (const bool forest : {false, true}) { - EXPECT_TRUE(absl::StrContains( - this->solver_.Init(forest, kNumNodes).ToString(), "Too many nodes")); - } -} - -TYPED_TEST(MstSolverTest, InfeasibleIfNoRootsNoArcs) { - const int kNumNodes = 10; - for (const bool forest : {false, true}) { - TF_ASSERT_OK(this->solver_.Init(forest, kNumNodes)); - this->SolveAndExpectError(kNumNodes, "Infeasible digraph"); - } -} - -TYPED_TEST(MstSolverTest, InfeasibleIfNoRootsAllArcs) { - const int kNumNodes = 10; - for (const bool forest : {false, true}) { - TF_ASSERT_OK(this->solver_.Init(forest, kNumNodes)); - this->AddAllArcs(kNumNodes, 0); - this->SolveAndExpectError(kNumNodes, "Infeasible digraph"); - } -} - -TYPED_TEST(MstSolverTest, FeasibleForForestOnlyIfAllRootsNoArcs) { - const int kNumNodes = 10; - for (const bool forest : {false, true}) { - TF_ASSERT_OK(this->solver_.Init(forest, kNumNodes)); - this->AddAllRoots(kNumNodes, 0); - if (forest) { - this->SolveAndExpectOk(kNumNodes); // all roots is a valid forest - } else { - this->SolveAndExpectError(kNumNodes, "Infeasible digraph"); - } - } -} - -TYPED_TEST(MstSolverTest, FeasibleIfAllRootsAllArcs) { - const int kNumNodes = 10; - for (const bool forest : {false, true}) { - TF_ASSERT_OK(this->solver_.Init(forest, kNumNodes)); - this->AddAllRoots(kNumNodes, 0); - this->AddAllArcs(kNumNodes, 0); - this->SolveAndExpectOk(kNumNodes); - } -} - -TYPED_TEST(MstSolverTest, FailIfArgmaxArrayTooSmall) { - const int kNumNodes = 10; - for (const bool forest : {false, true}) { - TF_ASSERT_OK(this->solver_.Init(forest, kNumNodes)); - this->AddAllRoots(kNumNodes, 0); - this->AddAllArcs(kNumNodes, 0); - this->SolveAndExpectError(kNumNodes - 1, // too small - "Argmax array too small"); - } -} - -TYPED_TEST(MstSolverTest, OkIfArgmaxArrayTooLarge) { - const int kNumNodes = 10; - for (const bool forest : {false, true}) { - TF_ASSERT_OK(this->solver_.Init(forest, kNumNodes)); - this->AddAllRoots(kNumNodes, 0); - this->AddAllArcs(kNumNodes, 0); - this->SolveAndExpectOk(kNumNodes + 1); // too large - } -} - -TYPED_TEST(MstSolverTest, SolveForAllRootsForestOnly) { - const int kNumNodes = 10; - const bool forest = true; - TF_ASSERT_OK(this->solver_.Init(forest, kNumNodes)); - this->AddAllRoots(kNumNodes, 1); // favor all root selections - this->AddAllArcs(kNumNodes, 0); - this->SolveAndExpectArgmax({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); -} - -TYPED_TEST(MstSolverTest, SolveForLeftToRightChain) { - const int kNumNodes = 10; - for (const bool forest : {false, true}) { - TF_ASSERT_OK(this->solver_.Init(forest, kNumNodes)); - this->AddAllRoots(kNumNodes, 0); - this->AddAllArcs(kNumNodes, 0); - for (int target = 1; target < kNumNodes; ++target) { - this->solver_.AddArc(target - 1, target, 1); // favor left-to-right chain - } - this->SolveAndExpectArgmax({0, 0, 1, 2, 3, 4, 5, 6, 7, 8}); - } -} - -TYPED_TEST(MstSolverTest, SolveForRightToLeftChain) { - const int kNumNodes = 10; - for (const bool forest : {false, true}) { - TF_ASSERT_OK(this->solver_.Init(forest, kNumNodes)); - this->AddAllRoots(kNumNodes, 0); - this->AddAllArcs(kNumNodes, 0); - for (int source = 1; source < kNumNodes; ++source) { - this->solver_.AddArc(source, source - 1, 1); // favor right-to-left chain - } - this->SolveAndExpectArgmax({1, 2, 3, 4, 5, 6, 7, 8, 9, 9}); - } -} - -TYPED_TEST(MstSolverTest, SolveForAllFromFirstTree) { - const int kNumNodes = 10; - for (const bool forest : {false, true}) { - TF_ASSERT_OK(this->solver_.Init(forest, kNumNodes)); - this->AddAllRoots(kNumNodes, 0); - this->AddAllArcs(kNumNodes, 0); - for (int target = 1; target < kNumNodes; ++target) { - this->solver_.AddArc(0, target, 1); // favor first -> target - } - this->SolveAndExpectArgmax({0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); - } -} - -TYPED_TEST(MstSolverTest, SolveForAllFromLastTree) { - const int kNumNodes = 10; - for (const bool forest : {false, true}) { - TF_ASSERT_OK(this->solver_.Init(forest, kNumNodes)); - this->AddAllRoots(kNumNodes, 0); - this->AddAllArcs(kNumNodes, 0); - for (int target = 0; target + 1 < kNumNodes; ++target) { - this->solver_.AddArc(9, target, 1); // favor last -> target - } - this->SolveAndExpectArgmax({9, 9, 9, 9, 9, 9, 9, 9, 9, 9}); - } -} - -TYPED_TEST(MstSolverTest, SolveForBinaryTree) { - const int kNumNodes = 15; - for (const bool forest : {false, true}) { - TF_ASSERT_OK(this->solver_.Init(forest, kNumNodes)); - this->AddAllRoots(kNumNodes, 0); - this->AddAllArcs(kNumNodes, 0); - for (int target = 1; target < kNumNodes; ++target) { - this->solver_.AddArc((target - 1) / 2, target, 1); // like a binary heap - } - // clang-format off - this->SolveAndExpectArgmax({0, - 0, 0, - 1, 1, 2, 2, - 3, 3, 4, 4, 5, 5, 6, 6}); - // clang-format on - } -} - -TYPED_TEST(MstSolverTest, ScoreAccessors) { - for (const bool forest : {false, true}) { - TF_ASSERT_OK(this->solver_.Init(forest, 10)); - this->solver_.AddArc(0, 1, 0); - this->solver_.AddArc(1, 4, 1); - this->solver_.AddArc(7, 6, 2); - this->solver_.AddArc(9, 2, 3); - - this->solver_.AddRoot(0, 10); - this->solver_.AddRoot(2, 20); - this->solver_.AddRoot(8, 30); - - EXPECT_EQ(this->solver_.ArcScore(0, 1), 0); - EXPECT_EQ(this->solver_.ArcScore(1, 4), 1); - EXPECT_EQ(this->solver_.ArcScore(7, 6), 2); - EXPECT_EQ(this->solver_.ArcScore(9, 2), 3); - - EXPECT_EQ(this->solver_.RootScore(0), 10); - EXPECT_EQ(this->solver_.RootScore(2), 20); - EXPECT_EQ(this->solver_.RootScore(8), 30); - } -} - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/ngrams_kernel.cc b/tensorflow_text/core/kernels/ngrams_kernel.cc deleted file mode 100644 index de9486e6f..000000000 --- a/tensorflow_text/core/kernels/ngrams_kernel.cc +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow_text/core/kernels/ngrams_kernel.h" - -#include "tensorflow/core/framework/op_kernel.h" - -namespace tensorflow { -namespace text { - -REGISTER_KERNEL_BUILDER( - Name(NgramsStringJoinKernel::OpName()).Device(tensorflow::DEVICE_CPU), - NgramsStringJoinKernel); - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/ngrams_kernel.h b/tensorflow_text/core/kernels/ngrams_kernel.h index e8c13d603..963b7f956 100644 --- a/tensorflow_text/core/kernels/ngrams_kernel.h +++ b/tensorflow_text/core/kernels/ngrams_kernel.h @@ -12,37 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_NGRAMS_KERNEL_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_NGRAMS_KERNEL_H_ -#include "tensorflow/lite/kernels/shim/tf_op_shim.h" -#include "tensorflow_text/core/kernels/ngrams_kernel_template.h" - -namespace tensorflow { -namespace text { - -class NgramsStringJoinKernel - : public tflite::shim::TfOpKernel { - public: - using TfOpKernel::TfOpKernel; -}; - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/ngrams_kernel.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_NGRAMS_KERNEL_H_ diff --git a/tensorflow_text/core/kernels/ngrams_kernel_template.h b/tensorflow_text/core/kernels/ngrams_kernel_template.h index 1a8a3fc8f..0190a67a6 100644 --- a/tensorflow_text/core/kernels/ngrams_kernel_template.h +++ b/tensorflow_text/core/kernels/ngrams_kernel_template.h @@ -12,265 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_NGRAMS_KERNEL_TEMPLATE_H_ +#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_NGRAMS_KERNEL_TEMPLATE_H_ -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at +#include "tensorflow/core/kernels/text/ngrams_kernel_template.h" - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef TENSORFLOW_TEXT_CORE_KERNELS_NGRAMS_KERNEL_TEMPLATE_H_ -#define TENSORFLOW_TEXT_CORE_KERNELS_NGRAMS_KERNEL_TEMPLATE_H_ - -#include "absl/status/status.h" -#include "absl/strings/str_cat.h" -#include "absl/strings/str_join.h" -#include "absl/strings/string_view.h" -#include "tensorflow/core/platform/tstring.h" -#include "tensorflow/lite/kernels/shim/op_kernel.h" -#include "tensorflow/lite/kernels/shim/status_macros.h" -#include "tensorflow/lite/kernels/shim/tensor_view.h" - -namespace tensorflow { -namespace text { - -// text.ngrams op kernel. See `kDoc` for more info. -template -class NgramsStringJoin : public tflite::shim::OpKernelShim { - protected: - using Shape = tflite::shim::Shape; - - public: - using typename tflite::shim::OpKernelShim::InitContext; - using typename tflite::shim::OpKernelShim::InvokeContext; - using typename tflite::shim::OpKernelShim::ShapeInferenceContext; - - NgramsStringJoin() = default; - static constexpr char kOpName[] = "TFText>NgramsStringJoin"; - static constexpr char kDoc[] = R"doc( - Create a tensor of n-grams based on the string input data. - - Args: - input_values: A string tensor, or a ragged string tensor (a 1D string value - tensor and one or more 1D int64 row_split tensors). - row_splits: List of integer tensors representing the splits of the - input_values - width: scalar integer - The width of the ngram window. - axis: scalar integer - The axis to create ngrams along. Currently, it must be -1. - string_separator: scalar string - The separator string used to join tokens together. - - Returns: - output_values: A string tensor that matches the rank of 'data'. Will be a - ragged tensor if 'data' is a ragged tensor. - output_row_splits: Splits of above. - )doc"; - - static const char* OpName() { return kOpName; } - static const char* Doc() { return kDoc; } - - // Attributes declaration - static std::vector Attrs() { - return {"width: int", - "axis: int", - "string_separator: string", - "RAGGED_RANK: int >= 0", - "Tsplits: {int64} = DT_INT64"}; - } - // Input tensors declaration - static std::vector Inputs() { - return {"input_values: string", "input_row_splits: RAGGED_RANK * Tsplits"}; - } - // Output tensors declaration - static std::vector Outputs() { - return {"output_values: string", - "output_row_splits: RAGGED_RANK * Tsplits"}; - } - - // Initializes the op - absl::Status Init(InitContext* ctx) { - int64_t axis; - SH_RETURN_IF_ERROR(ctx->GetAttr("axis", &axis)); - if (axis != -1) { - return absl::InternalError(absl::StrCat("axis != -1: ", axis)); - } - SH_RETURN_IF_ERROR(ctx->GetAttr("width", &width_)); - absl::string_view string_separator; - SH_RETURN_IF_ERROR(ctx->GetAttr("string_separator", &string_separator)); - string_separator_ = std::string(string_separator); - return absl::OkStatus(); - } - - // Shape inference - static absl::Status ShapeInference(ShapeInferenceContext* ctx) { - if (ctx->NumOutputs() == 1) { - // Tensor Output - SH_ASSIGN_OR_RETURN(const auto input_shape, ctx->GetInputShape(kValues)); - int64_t width; - SH_RETURN_IF_ERROR(ctx->GetAttr("width", &width)); - SH_RETURN_IF_ERROR(ctx->SetOutputShape( - kValues, OutputValuesTensorShape(input_shape, width))); - } else { - // RaggedTensor Output - SH_ASSIGN_OR_RETURN(const auto input_shape, ctx->GetInputShape(kValues)); - Shape output_shape(input_shape); - const int last_dim = output_shape->size() - 1; - if (last_dim != -1) { - (*output_shape)[last_dim] = output_shape.kUnknownDim; - } - SH_RETURN_IF_ERROR(ctx->SetOutputShape(kValues, output_shape)); - - // The row_splits tensors maintain their shape, because only the - // innermost dimension will change. - for (int i = kRowSplitsStart; i < ctx->NumOutputs(); ++i) { - SH_ASSIGN_OR_RETURN(const Shape input_row_splits_shape, - ctx->GetInputShape(i)); - if (input_row_splits_shape.Rank() != 1) { - return absl::InvalidArgumentError( - absl::StrCat("expected rank == 1 for input index: ", i)); - } - SH_RETURN_IF_ERROR(ctx->SetOutputShape(i, input_row_splits_shape)); - } - } - return absl::OkStatus(); - } - - // Runs the operation - absl::Status Invoke(InvokeContext* ctx) { - using Tsplits = int64_t; - // Storage for the dummy input and output row_splits used in the tensor - // case. - std::vector tensor_input_row_splits; - std::vector tensor_output_row_splits; - - const Tsplits* input_row_splits; - Tsplits* output_row_splits; - int n_row_splits = 0; - - SH_ASSIGN_OR_RETURN(const auto input_values, ctx->GetInput(kValues)); - const Shape input_values_shape(input_values->Shape()); - - // Tensor output - if (ctx->NumOutputs() == 1) { - // Generate mock input and output innermost row_splits. - int64_t total_tokens = - input_values->template Data().size(); - int64_t tokens_per_element = - input_values_shape->at(input_values_shape->size() - 1); - tensor_output_row_splits.resize(total_tokens / tokens_per_element + 1); - for (int64_t i = 0; i <= total_tokens; i += tokens_per_element) { - tensor_input_row_splits.push_back(i); - } - input_row_splits = tensor_input_row_splits.data(); - output_row_splits = tensor_output_row_splits.data(); - n_row_splits = tensor_input_row_splits.size(); - } else { - // RaggedTensor output - int index = 0; - const int num_row_splits = ctx->NumInputs() - kRowSplitsStart; - // Copy all input splits except for innermost into output splits. - while (index < num_row_splits - 1) { - SH_ASSIGN_OR_RETURN(const auto input_tensor_row_splits, - ctx->GetInput(kRowSplitsStart + index)); - SH_ASSIGN_OR_RETURN( - const auto output_tensor_row_splits, - ctx->GetOutput(kRowSplitsStart + index, - Shape(input_tensor_row_splits->Shape()))); - const auto input_buffer = - input_tensor_row_splits->template Data(); - const auto output_buffer = - output_tensor_row_splits->template Data(); - std::memcpy(output_buffer.data(), input_buffer.data(), - input_buffer.size() * sizeof(Tsplits)); - ++index; - } - // Set row splits variables to the innermost - SH_ASSIGN_OR_RETURN(const auto input_tensor_row_splits, - ctx->GetInput(kRowSplitsStart + index)); - SH_ASSIGN_OR_RETURN( - const auto output_tensor_row_splits, - ctx->GetOutput(kRowSplitsStart + index, - Shape(input_tensor_row_splits->Shape()))); - input_row_splits = - input_tensor_row_splits->template Data().data(); - output_row_splits = - output_tensor_row_splits->template Data().data(); - n_row_splits = input_tensor_row_splits->Shape().at(0); - } - - const auto input_values_data = - input_values->template Data(); - - // Create ngrams by looping through the innermost input splits. - std::vector buffer; - for (int i = 0; i < n_row_splits - 1; ++i) { - // Set output splits using current number of created output values. - output_row_splits[i] = buffer.size(); - std::vector tokens; - for (int j = input_row_splits[i]; j < input_row_splits[i + 1]; ++j) { - tokens.emplace_back(input_values_data.at(j)); - if (tokens.size() < width_) continue; - tokens.erase(tokens.begin(), tokens.begin() + tokens.size() - width_); - buffer.push_back(absl::StrJoin(tokens, string_separator_)); - } - } - output_row_splits[n_row_splits - 1] = buffer.size(); - - // Set output values from the generated buffer. - tflite::shim::TensorViewOr output_values_or; - if (ctx->NumOutputs() == 1) { - output_values_or = ctx->GetOutput( - kValues, OutputValuesTensorShape(input_values_shape, width_)); - } else { - output_values_or = - ctx->GetOutput(kValues, Shape({static_cast(buffer.size())})); - } - if (!output_values_or.ok()) return output_values_or.status(); - auto& output_buffer = - output_values_or.value()->template Data(); - int i = 0; - for (const auto& v : buffer) output_buffer[i++] = v; - return absl::OkStatus(); - } - - protected: - inline static Shape OutputValuesTensorShape(const Shape& input_values_shape, - const int64_t width) { - // If the input shape is unknown, so is the output shape. - if (input_values_shape.Rank() == input_values_shape.kUnknownRank) - return input_values_shape; - - Shape output_shape(input_values_shape); - const int last_dim = output_shape->size() - 1; - if (input_values_shape->at(last_dim) == input_values_shape.kUnknownDim) - return output_shape; - (*output_shape)[last_dim] = - std::max(0, output_shape->at(last_dim) - static_cast(width) + 1); - return output_shape; - } - - // Both the input and output tensors use the same indices. - static constexpr int kValues = 0; - static constexpr int kRowSplitsStart = 1; - - int64_t width_; - std::string string_separator_; -}; - -} // namespace text -} // namespace tensorflow - -#endif // TENSORFLOW_TEXT_CORE_KERNELS_NGRAMS_KERNEL_TEMPLATE_H_ +#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_NGRAMS_KERNEL_TEMPLATE_H_ diff --git a/tensorflow_text/core/kernels/ngrams_kernel_test.cc b/tensorflow_text/core/kernels/ngrams_kernel_test.cc deleted file mode 100644 index a6b70925c..000000000 --- a/tensorflow_text/core/kernels/ngrams_kernel_test.cc +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); - -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/framework/node_def_builder.h" -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference_testutil.h" -#include "tensorflow/core/framework/tensor_testutil.h" -#include "tensorflow/core/platform/test.h" - -namespace tensorflow { - -TEST(NgramsStringJoin, UnknownRank) { - ShapeInferenceTestOp op("TFText>NgramsStringJoin"); - op.input_tensors.resize(1); - AddNodeAttr("RAGGED_RANK", 0, &op.node_def); - AddNodeAttr("width", 1, &op.node_def); - - INFER_OK(op, "?", "?"); -} - -TEST(NgramsStringJoin, KnownRankUnknownDims) { - ShapeInferenceTestOp op("TFText>NgramsStringJoin"); - op.input_tensors.resize(1); - AddNodeAttr("RAGGED_RANK", 0, &op.node_def); - AddNodeAttr("width", 1, &op.node_def); - - INFER_OK(op, "[1,?]", "[1,?]"); -} - -TEST(NgramsStringJoin, LastDimWidth) { - ShapeInferenceTestOp op("TFText>NgramsStringJoin"); - op.input_tensors.resize(1); - AddNodeAttr("RAGGED_RANK", 0, &op.node_def); - AddNodeAttr("width", 3, &op.node_def); - - INFER_OK(op, "[?,5]", "[?,3]"); -} - -TEST(NgramsStringJoin, LastDimWidthClampZero) { - ShapeInferenceTestOp op("TFText>NgramsStringJoin"); - op.input_tensors.resize(1); - AddNodeAttr("RAGGED_RANK", 0, &op.node_def); - AddNodeAttr("width", 3, &op.node_def); - - INFER_OK(op, "[?,1]", "[?,0]"); -} - -} // end namespace tensorflow diff --git a/tensorflow_text/core/kernels/ngrams_tflite.cc b/tensorflow_text/core/kernels/ngrams_tflite.cc deleted file mode 100644 index 71c34bb2b..000000000 --- a/tensorflow_text/core/kernels/ngrams_tflite.cc +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow_text/core/kernels/ngrams_tflite.h" - -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/kernels/shim/tflite_op_shim.h" -#include "tensorflow_text/core/kernels/ngrams_kernel_template.h" - -namespace tflite { -namespace ops { -namespace custom { -namespace text { - -using OpKernel = - tflite::shim::TfLiteOpKernel; - -extern "C" void AddNgramsStringJoin(tflite::MutableOpResolver* resolver) { - OpKernel::Add(resolver); -} - -TfLiteRegistration* Register_TFText_NgramsStringJoin() { - return OpKernel::GetTfLiteRegistration(); -} - -} // namespace text -} // namespace custom -} // namespace ops -} // namespace tflite diff --git a/tensorflow_text/core/kernels/ngrams_tflite.h b/tensorflow_text/core/kernels/ngrams_tflite.h index 0f0700ad0..02bfa93a8 100644 --- a/tensorflow_text/core/kernels/ngrams_tflite.h +++ b/tensorflow_text/core/kernels/ngrams_tflite.h @@ -15,39 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_NGRAMS_TFLITE_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_NGRAMS_TFLITE_H_ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/mutable_op_resolver.h" - -namespace tflite { -namespace ops { -namespace custom { -namespace text { - -// Adds the Ngrams custom op to an op resolver. -// This function can be loaded using dlopen. Since C++ function names get -// mangled, declare this function as extern C, so its name is unchanged. -extern "C" void AddNgramsStringJoin(MutableOpResolver* resolver); - -TfLiteRegistration* Register_TFText_NgramsStringJoin(); - -} // namespace text -} // namespace custom -} // namespace ops -} // namespace tflite +#include "tensorflow/core/kernels/text/ngrams_tflite.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_NGRAMS_TFLITE_H_ diff --git a/tensorflow_text/core/kernels/ngrams_tflite_test.cc b/tensorflow_text/core/kernels/ngrams_tflite_test.cc deleted file mode 100644 index 0e2e88e61..000000000 --- a/tensorflow_text/core/kernels/ngrams_tflite_test.cc +++ /dev/null @@ -1,305 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow_text/core/kernels/ngrams_tflite.h" - -#include -#include - -#include -#include -#include "flatbuffers/flexbuffers.h" -#include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/schema/schema_generated.h" -#include "tensorflow/lite/string_util.h" - -namespace tflite { -namespace ops { -namespace custom { -namespace text { -namespace { - -using ::testing::ElementsAre; -using ::testing::ElementsAreArray; - -class NgramsModel : public SingleOpModel { - public: - // Constructor for testing the op with a tf.Tensor - NgramsModel(int width, const std::string& string_separator, - const std::vector& input_values, - const std::vector& input_shape) { - input_values_ = AddInput(TensorType_STRING); - output_values_ = AddOutput(TensorType_STRING); - - BuildCustomOp(width, string_separator); - - BuildInterpreter({input_shape}); - PopulateStringTensor(input_values_, input_values); - Invoke(); - } - - // Constructor for the op with a tf.RaggedTensor - // Note: This interface uses row_lengths, as they're closer to the - // dimensions in a TensorShape, but internally everything is row_splits. - NgramsModel(int width, const std::string& string_separator, - const std::vector& input_values, - const std::vector> nested_row_lengths) { - std::vector> input_shapes; - input_shapes.reserve(nested_row_lengths.size() + 1); - - input_values_ = AddInput(TensorType_STRING); - input_shapes.push_back({static_cast(input_values.size())}); - output_values_ = AddOutput(TensorType_STRING); - - input_row_splits_.reserve(nested_row_lengths.size()); - output_row_splits_.reserve(nested_row_lengths.size()); - for (int i = 0; i < nested_row_lengths.size(); ++i) { - input_row_splits_.push_back(AddInput(TensorType_INT64)); - input_shapes.push_back( - {static_cast(nested_row_lengths[i].size() + 1)}); - output_row_splits_.push_back(AddOutput(TensorType_INT64)); - } - - BuildCustomOp(width, string_separator); - - BuildInterpreter(input_shapes); - PopulateStringTensor(input_values_, input_values); - for (int i = 0; i < nested_row_lengths.size(); ++i) { - std::vector row_splits; - row_splits.reserve(nested_row_lengths[i].size() + 1); - int64_t index = 0; - row_splits.push_back(index); - for (int64_t row_length : nested_row_lengths[i]) { - index += row_length; - row_splits.push_back(index); - } - PopulateTensor(input_row_splits_[i], row_splits); - } - Invoke(); - } - - std::vector GetValuesTensorShape() { - return GetTensorShape(output_values_); - } - - std::vector ExtractValuesTensorVector() { - std::vector r; - TfLiteTensor* tensor = interpreter_->tensor(output_values_); - int n = GetStringCount(tensor); - for (int i = 0; i < n; ++i) { - StringRef ref = GetString(tensor, i); - r.emplace_back(ref.str, ref.len); - } - return r; - } - - int GetNumNestedRowLengths() { return output_row_splits_.size(); } - - std::vector GetRowLengthsTensorShape(int i) { - std::vector shape = GetTensorShape(output_row_splits_[i]); - --shape[0]; - return shape; - } - - std::vector ExtractRowLengthsTensorVector(int i) { - std::vector row_splits = - ExtractVector(output_row_splits_[i]); - std::vector row_lengths; - row_lengths.reserve(row_splits.size() - 1); - int64_t head = row_splits[0]; - for (int i = 1; i < row_splits.size(); ++i) { - int64_t tail = row_splits[i]; - row_lengths.push_back(tail - head); - head = tail; - } - return row_lengths; - } - - private: - void BuildCustomOp(int width, const std::string& string_separator) { - flexbuffers::Builder fbb; - size_t start_map = fbb.StartMap(); - fbb.Int("width", width); - fbb.String("string_separator", string_separator); - fbb.Int("axis", -1); - fbb.String("reduction_type", "STRING_JOIN"); - fbb.EndMap(start_map); - fbb.Finish(); - - SetCustomOp("TFText>NgramsStringJoin", fbb.GetBuffer(), - Register_TFText_NgramsStringJoin); - } - - int input_values_; - std::vector input_row_splits_; - int output_values_; - std::vector output_row_splits_; -}; - -TEST(NgramsTest, TensorSingleSequenceWidthTwo) { - NgramsModel m(2, " ", {"this", "is", "a", "test"}, std::vector{4}); - EXPECT_THAT(m.GetValuesTensorShape(), ElementsAre(3)); - EXPECT_THAT(m.ExtractValuesTensorVector(), - ElementsAre("this is", "is a", "a test")); -} - -TEST(NgramsTest, TensorSingleSequenceWidthThree) { - NgramsModel m(3, " ", {"this", "is", "a", "test"}, std::vector{4}); - EXPECT_THAT(m.GetValuesTensorShape(), ElementsAre(2)); - EXPECT_THAT(m.ExtractValuesTensorVector(), - ElementsAre("this is a", "is a test")); -} - -TEST(NgramsTest, TensorSingleSequenceLongerSeparator) { - NgramsModel m(2, "...", {"this", "is", "a", "test"}, std::vector{4}); - EXPECT_THAT(m.GetValuesTensorShape(), ElementsAre(3)); - EXPECT_THAT(m.ExtractValuesTensorVector(), - ElementsAre("this...is", "is...a", "a...test")); -} - -TEST(NgramsTest, TensorSingleSequenceWidthTooLong) { - NgramsModel m(5, " ", {"this", "is", "a", "test"}, std::vector{4}); - EXPECT_THAT(m.GetValuesTensorShape(), ElementsAre(0)); - EXPECT_THAT(m.ExtractValuesTensorVector(), ElementsAre()); -} - -TEST(NgramsTest, TensorMultidimensionalInputWidthTwo) { - NgramsModel m(2, " ", - { - "0,0,0", "0,0,1", "0,0,2", "0,0,3", // - "0,1,0", "0,1,1", "0,1,2", "0,1,3", // - "0,2,0", "0,2,1", "0,2,2", "0,2,3", // - "1,0,0", "1,0,1", "1,0,2", "1,0,3", // - "1,1,0", "1,1,1", "1,1,2", "1,1,3", // - "1,2,0", "1,2,1", "1,2,2", "1,2,3", // - }, - std::vector{2, 3, 4}); - EXPECT_THAT(m.GetValuesTensorShape(), ElementsAre(2, 3, 3)); - EXPECT_THAT(m.ExtractValuesTensorVector(), - ElementsAreArray({ - "0,0,0 0,0,1", "0,0,1 0,0,2", "0,0,2 0,0,3", // - "0,1,0 0,1,1", "0,1,1 0,1,2", "0,1,2 0,1,3", // - "0,2,0 0,2,1", "0,2,1 0,2,2", "0,2,2 0,2,3", // - "1,0,0 1,0,1", "1,0,1 1,0,2", "1,0,2 1,0,3", // - "1,1,0 1,1,1", "1,1,1 1,1,2", "1,1,2 1,1,3", // - "1,2,0 1,2,1", "1,2,1 1,2,2", "1,2,2 1,2,3", // - })); -} - -TEST(NgramsTest, RaggedTensorSingleSequenceWidthTwo) { - std::vector> nested_row_lengths; - nested_row_lengths.push_back({4}); - NgramsModel m(2, " ", {"this", "is", "a", "test"}, - nested_row_lengths); - EXPECT_THAT(m.GetValuesTensorShape(), ElementsAre(3)); - EXPECT_THAT(m.ExtractValuesTensorVector(), - ElementsAre("this is", "is a", "a test")); - ASSERT_THAT(m.GetNumNestedRowLengths(), 1); - EXPECT_THAT(m.GetRowLengthsTensorShape(0), ElementsAre(1)); - EXPECT_THAT(m.ExtractRowLengthsTensorVector(0), ElementsAre(3)); -} - -TEST(NgramsTest, RaggedTensorSingleSequenceWidthThree) { - std::vector> nested_row_lengths; - nested_row_lengths.push_back({4}); - NgramsModel m(3, " ", {"this", "is", "a", "test"}, nested_row_lengths); - EXPECT_THAT(m.GetValuesTensorShape(), ElementsAre(2)); - EXPECT_THAT(m.ExtractValuesTensorVector(), - ElementsAre("this is a", "is a test")); - ASSERT_THAT(m.GetNumNestedRowLengths(), 1); - EXPECT_THAT(m.GetRowLengthsTensorShape(0), ElementsAre(1)); - EXPECT_THAT(m.ExtractRowLengthsTensorVector(0), ElementsAre(2)); -} - -TEST(NgramsTest, RaggedTensorSingleSequenceLongerSeparator) { - std::vector> nested_row_lengths; - nested_row_lengths.push_back({4}); - NgramsModel m(2, "<>", {"this", "is", "a", "test"}, nested_row_lengths); - EXPECT_THAT(m.GetValuesTensorShape(), ElementsAre(3)); - EXPECT_THAT(m.ExtractValuesTensorVector(), - ElementsAre("this<>is", "is<>a", "a<>test")); - ASSERT_THAT(m.GetNumNestedRowLengths(), 1); - EXPECT_THAT(m.GetRowLengthsTensorShape(0), ElementsAre(1)); - EXPECT_THAT(m.ExtractRowLengthsTensorVector(0), ElementsAre(3)); -} - -TEST(NgramsTest, RaggedTensorSingleSequenceWidthTooLong) { - std::vector> nested_row_lengths; - nested_row_lengths.push_back({4}); - NgramsModel m(5, " ", {"this", "is", "a", "test"}, nested_row_lengths); - EXPECT_THAT(m.GetValuesTensorShape(), ElementsAre(0)); - EXPECT_THAT(m.ExtractValuesTensorVector(), ElementsAre()); - ASSERT_THAT(m.GetNumNestedRowLengths(), 1); - EXPECT_THAT(m.GetRowLengthsTensorShape(0), ElementsAre(1)); - EXPECT_THAT(m.ExtractRowLengthsTensorVector(0), ElementsAre(0)); -} - -TEST(NgramsTest, RaggedTensorMultidimensionalInputWidthTwo) { - std::vector> nested_row_lengths; - nested_row_lengths.push_back({4, 2, 1}); - nested_row_lengths.push_back({5, 4, 3, 2, 2, 3, 4, 6}); - NgramsModel m(2, " ", - { - "0,0,0", "0,0,1", "0,0,2", "0,0,3", "0,0,4", // - "0,1,0", "0,1,1", "0,1,2", "0,1,3", // - "0,2,0", "0,2,1", "0,2,2", // - "0,3,0", "0,3,1", // - "1,0,0", "1,0,1", // - "1,1,0", "1,1,1", "1,1,2", // - "1,2,0", "1,2,1", "1,2,2", "1,2,3", // - "2,0,0", "2,0,1", "2,0,2", "2,0,3", "2,0,4", "2,0,5", // - }, - nested_row_lengths); - - std::vector expected_values = { - "0,0,0 0,0,1", "0,0,1 0,0,2", "0,0,2 0,0,3", "0,0,3 0,0,4", // - "0,1,0 0,1,1", "0,1,1 0,1,2", "0,1,2 0,1,3", // - "0,2,0 0,2,1", "0,2,1 0,2,2", // - "0,3,0 0,3,1", // - "1,0,0 1,0,1", // - "1,1,0 1,1,1", "1,1,1 1,1,2", // - "1,2,0 1,2,1", "1,2,1 1,2,2", "1,2,2 1,2,3", // - "2,0,0 2,0,1", "2,0,1 2,0,2", "2,0,2 2,0,3", "2,0,3 2,0,4", - "2,0,4 2,0,5", // - }; - EXPECT_THAT(m.GetValuesTensorShape(), ElementsAre(expected_values.size())); - EXPECT_THAT(m.ExtractValuesTensorVector(), ElementsAreArray(expected_values)); - ASSERT_THAT(m.GetNumNestedRowLengths(), 2); - EXPECT_THAT(m.GetRowLengthsTensorShape(0), ElementsAre(3)); - EXPECT_THAT(m.ExtractRowLengthsTensorVector(0), ElementsAre(4, 2, 1)); - EXPECT_THAT(m.GetRowLengthsTensorShape(1), ElementsAre(8)); - EXPECT_THAT(m.ExtractRowLengthsTensorVector(1), - ElementsAre(4, 3, 2, 1, 1, 2, 3, 5)); -} - -} // namespace -} // namespace text -} // namespace custom -} // namespace ops -} // namespace tflite diff --git a/tensorflow_text/core/kernels/normalize_kernels.cc b/tensorflow_text/core/kernels/normalize_kernels.cc deleted file mode 100644 index e011a4629..000000000 --- a/tensorflow_text/core/kernels/normalize_kernels.cc +++ /dev/null @@ -1,370 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include - -#include "absl/strings/ascii.h" -#include "absl/strings/str_cat.h" -#include "icu4c/source/common/unicode/edits.h" -#include "icu4c/source/common/unicode/errorcode.h" -#include "icu4c/source/common/unicode/normalizer2.h" -#include "icu4c/source/common/unicode/utypes.h" -#include "tensorflow/core/framework/node_def_util.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/variant.h" -#include "tensorflow/core/framework/variant_encode_decode.h" -#include "tensorflow_text/core/kernels/edit_changes.pb.h" - -namespace tensorflow { -namespace text { - -class CaseFoldUTF8Op : public tensorflow::OpKernel { - public: - explicit CaseFoldUTF8Op(tensorflow::OpKernelConstruction* context) - : tensorflow::OpKernel(context) {} - - void Compute(tensorflow::OpKernelContext* context) override { - const tensorflow::Tensor* input_tensor; - OP_REQUIRES_OK(context, context->input("input", &input_tensor)); - const auto& input_vec = input_tensor->flat(); - - // TODO(gregbillock): support forwarding - tensorflow::Tensor* output_tensor; - OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor->shape(), - &output_tensor)); - auto output_vec = output_tensor->flat(); - - icu::ErrorCode icu_error; - const icu::Normalizer2* nfkc_cf = - icu::Normalizer2::getNFKCCasefoldInstance(icu_error); - OP_REQUIRES(context, icu_error.isSuccess(), - errors::Internal(absl::StrCat( - icu_error.errorName(), - ": Could not retrieve ICU NFKC_CaseFold normalizer"))); - - for (int64 i = 0; i < input_vec.size(); ++i) { - string output_text; - icu::StringByteSink byte_sink(&output_text); - const auto& input = input_vec(i); - nfkc_cf->normalizeUTF8(0, icu::StringPiece(input.data(), input.size()), - byte_sink, nullptr, icu_error); - OP_REQUIRES(context, icu_error.isSuccess(), - errors::Internal("Could not normalize input string: " + - input_vec(i))); - output_vec(i) = output_text; - } - } -}; - -REGISTER_KERNEL_BUILDER(Name("CaseFoldUTF8").Device(tensorflow::DEVICE_CPU), - CaseFoldUTF8Op); - -namespace { - -string GetNormalizationForm(OpKernelConstruction* context) { - string normalization_form; - ([=](string* c) -> void { - OP_REQUIRES_OK(context, context->GetAttr("normalization_form", c)); - })(&normalization_form); - return absl::AsciiStrToUpper(normalization_form); -} - -} // namespace - -class NormalizeUTF8Op : public tensorflow::OpKernel { - public: - explicit NormalizeUTF8Op(tensorflow::OpKernelConstruction* context) - : tensorflow::OpKernel(context), - normalization_form_(GetNormalizationForm(context)) {} - - void Compute(tensorflow::OpKernelContext* context) override { - const tensorflow::Tensor* input_tensor; - OP_REQUIRES_OK(context, context->input("input", &input_tensor)); - const auto& input_vec = input_tensor->flat(); - - tensorflow::Tensor* output_tensor; - OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor->shape(), - &output_tensor)); - auto output_vec = output_tensor->flat(); - - icu::ErrorCode icu_error; - const icu::Normalizer2* normalizer = nullptr; - if (normalization_form_ == "NFKC") { - normalizer = icu::Normalizer2::getNFKCInstance(icu_error); - OP_REQUIRES(context, icu_error.isSuccess(), - errors::Internal(absl::StrCat( - icu_error.errorName(), - ": Could not retrieve ICU NFKC normalizer"))); - } else if (normalization_form_ == "NFC") { - normalizer = icu::Normalizer2::getNFCInstance(icu_error); - OP_REQUIRES(context, icu_error.isSuccess(), - errors::Internal( - absl::StrCat(icu_error.errorName(), - ": Could not retrieve ICU NFC normalizer"))); - } else if (normalization_form_ == "NFD") { - normalizer = icu::Normalizer2::getNFDInstance(icu_error); - OP_REQUIRES(context, icu_error.isSuccess(), - errors::Internal( - absl::StrCat(icu_error.errorName(), - ": Could not retrieve ICU NFD normalizer"))); - } else if (normalization_form_ == "NFKD") { - normalizer = icu::Normalizer2::getNFKDInstance(icu_error); - OP_REQUIRES(context, icu_error.isSuccess(), - errors::Internal(absl::StrCat( - icu_error.errorName(), - ": Could not retrieve ICU NFKd normalizer"))); - } else { - OP_REQUIRES( - context, false, - errors::InvalidArgument(absl::StrCat( - "Unknown normalization form requrested: ", normalization_form_))); - } - - for (int64 i = 0; i < input_vec.size(); ++i) { - string output_text; - icu::StringByteSink byte_sink(&output_text); - const auto& input = input_vec(i); - normalizer->normalizeUTF8(0, icu::StringPiece(input.data(), input.size()), - byte_sink, nullptr, icu_error); - OP_REQUIRES( - context, icu_error.isSuccess(), - errors::Internal(absl::StrCat(icu_error.errorName(), - ": Could not normalize input string: ", - absl::string_view(input_vec(i))))); - output_vec(i) = output_text; - } - } - - private: - string normalization_form_; -}; - -REGISTER_KERNEL_BUILDER(Name("NormalizeUTF8").Device(tensorflow::DEVICE_CPU), - NormalizeUTF8Op); - -namespace { - -// OffsetMapVariant is a tf.Variant object that stores a single icu::Edits -// object and providing encode/decode methods. -// The encode method is called to serialize the stored icu::Edits object when -// the variant is assigned to graph output. The decode method is called to -// reconstruct the icu::Edits object from the serialized `changes` string when -// the variant is at the graph input. -struct OffsetMapVariant { - string changes; - icu::Edits edits_; - - std::string TypeName() const { return "(anonymous)::OffsetMapVariant"; } - void Encode(tensorflow::VariantTensorData* data) const; - bool Decode(const tensorflow::VariantTensorData& data); -}; - -void OffsetMapVariant::Encode(tensorflow::VariantTensorData* data) const { - EditChanges changes; - icu::Edits::Iterator it = edits_.getFineIterator(); - icu::ErrorCode icu_error; - while (it.next(icu_error)) { - auto* change = changes.add_change(); - change->set_old_length(it.oldLength()); - change->set_new_length(it.newLength()); - } - string changes_str = changes.SerializeAsString(); - data->set_metadata(changes_str); -} - -bool OffsetMapVariant::Decode(const tensorflow::VariantTensorData& data) { - string serialized; - data.get_metadata(&serialized); - EditChanges changes; - changes.ParseFromString(serialized); - icu::Edits edit; - icu::ErrorCode icu_error; - for (int64 j = 0; j < changes.change_size(); ++j) { - auto* change = changes.mutable_change(j); - int old_length = change->old_length(); - int new_length = change->new_length(); - if (old_length == new_length) { - edit.addUnchanged(static_cast(old_length)); - } else { - edit.addReplace(static_cast(old_length), - static_cast(new_length)); - } - } - edits_ = edit; - return true; -} -} // namespace - -class NormalizeUTF8WithOffsetsMapOp : public tensorflow::OpKernel { - public: - explicit NormalizeUTF8WithOffsetsMapOp( - tensorflow::OpKernelConstruction* context) - : tensorflow::OpKernel(context), - normalization_form_(GetNormalizationForm(context)) {} - - void Compute(tensorflow::OpKernelContext* context) override { - const tensorflow::Tensor* input_tensor; - OP_REQUIRES_OK(context, context->input("input", &input_tensor)); - const auto& input_vec = input_tensor->flat(); - - tensorflow::Tensor* output_tensor; - OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor->shape(), - &output_tensor)); - tensorflow::Tensor* output_offsets_map_tensor; - OP_REQUIRES_OK(context, - context->allocate_output(1, input_tensor->shape(), - &output_offsets_map_tensor)); - - auto output_vec = output_tensor->flat(); - auto output_offsets_map_vec = output_offsets_map_tensor->flat(); - - icu::ErrorCode icu_error; - const icu::Normalizer2* normalizer = nullptr; - if (normalization_form_ == "NFKC") { - normalizer = icu::Normalizer2::getNFKCInstance(icu_error); - OP_REQUIRES(context, icu_error.isSuccess(), - errors::Internal(absl::StrCat( - icu_error.errorName(), - ": Could not retrieve ICU NFKC normalizer"))); - } else if (normalization_form_ == "NFC") { - normalizer = icu::Normalizer2::getNFCInstance(icu_error); - OP_REQUIRES(context, icu_error.isSuccess(), - errors::Internal( - absl::StrCat(icu_error.errorName(), - ": Could not retrieve ICU NFC normalizer"))); - } else if (normalization_form_ == "NFD") { - normalizer = icu::Normalizer2::getNFDInstance(icu_error); - OP_REQUIRES(context, icu_error.isSuccess(), - errors::Internal( - absl::StrCat(icu_error.errorName(), - ": Could not retrieve ICU NFD normalizer"))); - } else if (normalization_form_ == "NFKD") { - normalizer = icu::Normalizer2::getNFKDInstance(icu_error); - OP_REQUIRES(context, icu_error.isSuccess(), - errors::Internal(absl::StrCat( - icu_error.errorName(), - ": Could not retrieve ICU NFKD normalizer"))); - } else { - OP_REQUIRES(context, false, - errors::InvalidArgument(absl::StrCat( - "Offset not supported for this normalization form: ", - normalization_form_))); - } - - for (int64 i = 0; i < input_vec.size(); ++i) { - OffsetMapVariant variant; - string output_text; - icu::Edits edits; - icu::StringByteSink byte_sink(&output_text); - const auto& input = input_vec(i); - normalizer->normalizeUTF8(0, icu::StringPiece(input.data(), input.size()), - byte_sink, &edits, icu_error); - OP_REQUIRES( - context, icu_error.isSuccess(), - errors::Internal(absl::StrCat(icu_error.errorName(), - ": Could not normalize input string: ", - absl::string_view(input_vec(i))))); - - output_vec(i) = output_text; - variant.edits_ = std::move(edits); - output_offsets_map_vec(i) = variant; - } - } - - private: - string normalization_form_; -}; - -REGISTER_KERNEL_BUILDER( - Name("NormalizeUTF8WithOffsetsMap").Device(tensorflow::DEVICE_CPU), - NormalizeUTF8WithOffsetsMapOp); - -template -class FindSourceOffsetsOp : public tensorflow::OpKernel { - public: - explicit FindSourceOffsetsOp(tensorflow::OpKernelConstruction* context) - : tensorflow::OpKernel(context) {} - - void Compute(tensorflow::OpKernelContext* context) override { - const tensorflow::Tensor& edits_values = context->input(0); - const tensorflow::Tensor& input_offsets_values = context->input(1); - const tensorflow::Tensor& input_offsets_splits = context->input(2); - - const auto& input_offsets_values_vec = input_offsets_values.flat(); - const auto& input_offsets_splits_vec = - input_offsets_splits.flat(); - const auto& edits_vec = edits_values.flat(); - - icu::ErrorCode icu_error; - int64 cur_split_index_begin = 0; - int64 cur_split_index_end = 0; - std::vector output_offsets_values(input_offsets_values_vec.size()); - int64 idx_edits = 0; - int64 idx_output = 0; - for (int64 i = 0; i < input_offsets_splits_vec.size() - 1; ++i) { - cur_split_index_begin = input_offsets_splits_vec(i); - cur_split_index_end = input_offsets_splits_vec(i + 1); - if (cur_split_index_begin == cur_split_index_end) { - continue; - } - OP_REQUIRES(context, idx_edits < edits_vec.size(), - tensorflow::errors::InvalidArgument( - "Input offset tensor dimension did not match the offset " - "map dimension.")); - auto iter = edits_vec(idx_edits++) - .get() - ->edits_.getFineChangesIterator(); - for (int64 j = cur_split_index_begin; j < cur_split_index_end; ++j) { - output_offsets_values[idx_output++] = - iter.sourceIndexFromDestinationIndex(input_offsets_values_vec(j), - icu_error); - } - } - OP_REQUIRES(context, idx_edits == edits_vec.size(), - tensorflow::errors::InvalidArgument( - "Input offset tensor dimension did not match the offset " - "map dimension.")); - - int64 output_offsets_values_size = output_offsets_values.size(); - Tensor* output_offsets_values_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output( - "output_offsets_values", - TensorShape({output_offsets_values_size}), - &output_offsets_values_tensor)); - auto output_offsets_values_data = - output_offsets_values_tensor->flat().data(); - memcpy(output_offsets_values_data, output_offsets_values.data(), - output_offsets_values_size * sizeof(int64)); - } - - private: - TF_DISALLOW_COPY_AND_ASSIGN(FindSourceOffsetsOp); -}; - -REGISTER_KERNEL_BUILDER(Name("FindSourceOffsets") - .Device(tensorflow::DEVICE_CPU) - .TypeConstraint("Tsplits"), - FindSourceOffsetsOp); -REGISTER_KERNEL_BUILDER(Name("FindSourceOffsets") - .Device(tensorflow::DEVICE_CPU) - .TypeConstraint("Tsplits"), - FindSourceOffsetsOp); -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/normalize_kernels_test.cc b/tensorflow_text/core/kernels/normalize_kernels_test.cc deleted file mode 100644 index a3aa0207b..000000000 --- a/tensorflow_text/core/kernels/normalize_kernels_test.cc +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Copyright 2020 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. diff --git a/tensorflow_text/core/kernels/phrase_tokenizer.cc b/tensorflow_text/core/kernels/phrase_tokenizer.cc deleted file mode 100644 index 98abddd08..000000000 --- a/tensorflow_text/core/kernels/phrase_tokenizer.cc +++ /dev/null @@ -1,224 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/phrase_tokenizer.h" - -#include -#include -#include -#include -#include - -#include "absl/base/optimization.h" -#include "absl/status/status.h" -#include "absl/strings/match.h" -#include "absl/strings/str_cat.h" -#include "absl/strings/str_join.h" -#include "absl/strings/string_view.h" -#include "tensorflow/lite/kernels/shim/status_macros.h" -#include "tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.h" - -namespace tensorflow { -namespace text { - -/*static*/ absl::StatusOr PhraseTokenizer::Create( - const void* config_flatbuffer) { - PhraseTokenizer tokenizer; - // `GetPhraseTokenizerConfig()` is autogenerated by flatbuffer. - tokenizer.phrase_config_ = GetPhraseTokenizerConfig(config_flatbuffer); - if (tokenizer.phrase_config_ == nullptr || - tokenizer.phrase_config_->vocab_trie() == nullptr || - tokenizer.phrase_config_->whitespace_config() == nullptr) { - return absl::InvalidArgumentError( - "PhraseTokenizerConfig or required fields are null."); - } - tokenizer.trie_ = absl::make_unique( - tokenizer.phrase_config_->vocab_trie()->nodes()); - tokenizer.prob_ = static_cast(tokenizer.phrase_config_->prob()) / 100; - const auto& ws_config = tokenizer.phrase_config_->whitespace_config(); - tokenizer.whitespace_config_str_ = - absl::string_view(ws_config->c_str(), ws_config->size()); - tokenizer.whitespace_tokenizer_ = absl::make_unique( - WhitespaceTokenizerConfig(tokenizer.whitespace_config_str_)); - tokenizer.split_end_punctuation_ = - tokenizer.phrase_config_->split_end_punctuation(); - return std::move(tokenizer); -} - -void PhraseTokenizer::Tokenize(const absl::string_view input, - std::vector* result_tokens, - std::vector* result_token_ids) { - // Word level information. - std::vector tokens; - - whitespace_tokenizer_->Tokenize(input, &tokens); - - // Loop through tokens, considering 1-level punctuations. - std::string all_str; - int n = tokens.size(); - for (int i = 0; i < n; i++) { - if (tokens[i].empty()) { - continue; - } - if (split_end_punctuation_) { - bool contained_special_token = false; - for (const auto& special_token : special_tokens_) { - if (absl::EndsWith(tokens[i], special_token)) { - // Eg: split "can't" into "can 't" - all_str += - tokens[i].substr(0, tokens[i].size() - special_token.size()); - all_str += " "; - all_str += special_token; - contained_special_token = true; - break; - } - } - if (!contained_special_token) { - all_str += tokens[i]; - } - } else { - all_str += tokens[i]; - } - if (i < n - 1) { - all_str += " "; - } - } - - FindPhraseTokens(all_str, result_tokens, result_token_ids); -} - -void PhraseTokenizer::FindPhraseTokens(const std::string& cur_phrase, - std::vector* phrase_tokens, - std::vector* phrase_token_ids) { - // Do a simple left to right search to tokenize the input text. - int index = 0; - while (index < cur_phrase.size()) { - bool in_trie = false; - int token_id = phrase_config_->unk_token_id(); - int length = 0; - PhraseLookup(cur_phrase, index, &in_trie, &token_id, &length); - if (!in_trie) { - // fall back to using single token. - std::size_t found = cur_phrase.find_first_of(' ', index); - phrase_tokens->push_back(phrase_config_->unk_token()->str()); - phrase_token_ids->push_back(phrase_config_->unk_token_id()); - if (found == std::string::npos) { - break; - } - index = found + 1; - } else { - // Found a phrase. - phrase_tokens->push_back(cur_phrase.substr(index, length)); - phrase_token_ids->push_back(token_id); - index += (length + 1); - } - } -} - -void PhraseTokenizer::PhraseLookup(const std::string& token, int cur, - bool* in_trie, int* emitted_phrase_id, - int* emitted_phrase_length) { - int matched_phrase_id = -1; - int matched_phrase_length = 0; - bool phrase_emitted = false; - float prob = prob_; - absl::BitGen* gen = &gen_; - auto phrase_emit_func = - [&token /*the input string*/, - cur /*the current starting point for searching phrase*/, - prob /*the probability to emit the current found phrase*/, - in_trie /*whether a phrase in matched in the trie*/, - emitted_phrase_id /*the token id of the emitted phrase*/, - emitted_phrase_length /*the length of the emitted phrase*/, - &matched_phrase_id /*the token id of the matched phrase*/, - &matched_phrase_length /*the length of the matched phrase*/, - &phrase_emitted /*whether the phrase is emitted or not*/, - gen /*the random generator*/]( - const sentencepiece::DoubleArrayTrie::Match& m) { - if (phrase_emitted || (cur + m.match_length < token.size() && - token[cur + m.match_length] != ' ')) { - // We should continue search without going through this function if: - // 1: a phrase has already been emitted, or - // 2: We located a phrase that split one single word. - return; - } - - matched_phrase_id = m.id; - matched_phrase_length = m.match_length; - *in_trie = true; - if ((prob > 0) && absl::Bernoulli(*gen, prob)) { - // Emit the current phrase. - *emitted_phrase_id = m.id; - *emitted_phrase_length = m.match_length; - phrase_emitted = true; - } - }; - trie_->IteratePrefixMatches( - sentencepiece::utils::string_view(token.data() + cur, token.size() - cur), - phrase_emit_func); - if (*in_trie && !phrase_emitted) { - // We should use prev longest one as output as we prefer longer ones. - *emitted_phrase_id = matched_phrase_id; - *emitted_phrase_length = matched_phrase_length; - } -} - -absl::StatusOr> PhraseTokenizer::DetokenizeToTokens( - const absl::Span input) const { - std::vector output_tokens; - if (!phrase_config_->support_detokenization()) { - return absl::FailedPreconditionError( - "Detokenize function is only enabled when support_detokenization is " - "true in the config flatbuffer. Please rebuild the model flatbuffer " - "by setting support_detokenization=true."); - } - if (phrase_config_->vocab_array() == nullptr) { - return absl::InternalError("Missing vocab_array in config."); - } - const int vocab_size = phrase_config_->vocab_array()->size(); - for (int id : input) { - if (ABSL_PREDICT_FALSE(id < 0 || id >= vocab_size)) { - return absl::OutOfRangeError( - absl::StrCat("Token ID out of bounds: ", id)); - } - auto vocab = phrase_config_->vocab_array()->Get(id); - if (ABSL_PREDICT_FALSE(vocab == nullptr)) { - return absl::InternalError("Null vocab string in vocab_array."); - } - output_tokens.emplace_back(vocab->string_view()); - } - return output_tokens; -} - -absl::StatusOr PhraseTokenizer::Detokenize( - const absl::Span input) const { - SH_ASSIGN_OR_RETURN(std::vector output_tokens, - DetokenizeToTokens(input)); - if (split_end_punctuation_) { - std::string result; - for (const auto& token : output_tokens) { - if (special_tokens_.contains(token)) { - result += token; - } else { - result += ((result.empty() ? "" : " ") + token); - } - } - return result; - } else { - return absl::StrJoin(output_tokens, " "); - } -} - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/phrase_tokenizer.h b/tensorflow_text/core/kernels/phrase_tokenizer.h index b9f48eb72..f29774e7a 100644 --- a/tensorflow_text/core/kernels/phrase_tokenizer.h +++ b/tensorflow_text/core/kernels/phrase_tokenizer.h @@ -12,89 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_Phrase_TOKENIZER_H_ -#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_Phrase_TOKENIZER_H_ +#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_PHRASE_TOKENIZER_H_ +#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_PHRASE_TOKENIZER_H_ -#include -#include -#include +#include "tensorflow/core/kernels/text/phrase_tokenizer.h" -#include "absl/container/flat_hash_map.h" -#include "absl/container/flat_hash_set.h" -#include "absl/random/random.h" -#include "absl/status/statusor.h" -#include "absl/strings/string_view.h" -#include "tensorflow_text/core/kernels/phrase_tokenizer_model_generated.h" -#include "tensorflow_text/core/kernels/sentencepiece/double_array_trie.h" -#include "tensorflow_text/core/kernels/string_vocab.h" -#include "tensorflow_text/core/kernels/whitespace_tokenizer.h" - -namespace tensorflow { -namespace text { - -class PhraseTokenizer { - public: - // Creates an instance. - // - // Args: - // * config_flatbuffer: the pointer to the PhraseTokenizerConfig - // flatbuffer, which is not owned by this instance and should be kept - // alive through the lifetime of the instance. - static absl::StatusOr Create(const void* config_flatbuffer); - - // Tokenizes a string (or series of character codepoints) by Phrase. - // - // Example: - // input = "Show me the way." - // output = ["Show me", "the", "way."] - // - // The input should be UTF-8 but the tokenization is performed on Unicode - // codepoints. - // - // Args: - // * input: The UTF-8 string of an input. - // * tokens: The output tokens. - void Tokenize(const absl::string_view input, - std::vector* result_tokens, - std::vector* result_token_ids); - - // Detokenizer the input into a single string. - absl::StatusOr Detokenize( - const absl::Span input) const; - - private: - // Detokenizer the input into vector of strings. - absl::StatusOr> DetokenizeToTokens( - const absl::Span input) const; - - // Find the phrase tokens based on the current phrase. - void FindPhraseTokens(const std::string& cur_phrase, - std::vector* phrase_tokens, - std::vector* phrase_token_ids); - - // Lookup the phrase in the token string from current index. - // Args: - // * token: The input token string to find the next phrase. - // * cur: The starting point to search for the phrase. - // * in_trie: Whether there is a phrase in DoubleArrayTrie. - // * emitted_phrase_id: The emitted phrase id. - // * emitted_phrase_length: The length of the emitted phrase. - void PhraseLookup(const std::string& token, int cur, bool* in_trie, - int* emitted_phrase_id, int* emitted_phrase_length); - - std::unique_ptr vocab_ = nullptr; - const PhraseTokenizerConfig* phrase_config_; - absl::string_view whitespace_config_str_; - std::unique_ptr trie_ = nullptr; - float prob_; - absl::BitGen gen_; - std::unique_ptr whitespace_tokenizer_ = nullptr; - bool split_end_punctuation_ = false; - const absl::flat_hash_set special_tokens_ = { - "'t", "'s", ".", ",", "!", "?", "'m", "'re", "'ll", "'d", "'ve"}; -}; - -} // namespace text -} // namespace tensorflow - -#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_Phrase_TOKENIZER_H_ +#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_PHRASE_TOKENIZER_H_ diff --git a/tensorflow_text/core/kernels/phrase_tokenizer_kernel.cc b/tensorflow_text/core/kernels/phrase_tokenizer_kernel.cc deleted file mode 100644 index ac47ba777..000000000 --- a/tensorflow_text/core/kernels/phrase_tokenizer_kernel.cc +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/phrase_tokenizer_kernel.h" - -#include "tensorflow/core/framework/op_kernel.h" - -namespace tensorflow { -namespace text { - -REGISTER_KERNEL_BUILDER( - Name(PhraseTokenizeOpKernel::OpName()).Device(tensorflow::DEVICE_CPU), - PhraseTokenizeOpKernel); - -REGISTER_KERNEL_BUILDER( - Name(PhraseDetokenizeOpKernel::OpName()).Device(tensorflow::DEVICE_CPU), - PhraseDetokenizeOpKernel); - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/phrase_tokenizer_kernel.h b/tensorflow_text/core/kernels/phrase_tokenizer_kernel.h index 302b193df..61b876333 100644 --- a/tensorflow_text/core/kernels/phrase_tokenizer_kernel.h +++ b/tensorflow_text/core/kernels/phrase_tokenizer_kernel.h @@ -15,25 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_PHRASE_TOKENIZER_KERNEL_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_PHRASE_TOKENIZER_KERNEL_H_ -#include "tensorflow/lite/kernels/shim/tf_op_shim.h" -#include "tensorflow_text/core/kernels/phrase_tokenizer_kernel_template.h" - -namespace tensorflow { -namespace text { - -class PhraseTokenizeOpKernel - : public tflite::shim::TfOpKernel { - public: - using TfOpKernel::TfOpKernel; -}; - -class PhraseDetokenizeOpKernel - : public tflite::shim::TfOpKernel { - public: - using TfOpKernel::TfOpKernel; -}; - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/phrase_tokenizer_kernel.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_PHRASE_TOKENIZER_KERNEL_H_ diff --git a/tensorflow_text/core/kernels/phrase_tokenizer_kernel_template.h b/tensorflow_text/core/kernels/phrase_tokenizer_kernel_template.h index 67807a768..4f27754c0 100644 --- a/tensorflow_text/core/kernels/phrase_tokenizer_kernel_template.h +++ b/tensorflow_text/core/kernels/phrase_tokenizer_kernel_template.h @@ -15,346 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_PHRASE_TOKENIZER_KERNEL_TEMPLATE_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_PHRASE_TOKENIZER_KERNEL_TEMPLATE_H_ -#include "absl/status/status.h" -#include "absl/strings/str_cat.h" -#include "tensorflow/lite/kernels/shim/op_kernel.h" -#include "tensorflow/lite/kernels/shim/status_macros.h" -#include "tensorflow_text/core/kernels/phrase_tokenizer.h" +#include "tensorflow/core/kernels/text/phrase_tokenizer_kernel_template.h" -namespace tensorflow { -namespace text { - -// See `kDoc` data member for the documentation on this op kernel. -// -// This template class can be instantiated into a kernel for either TF or -// TFLite. See -// https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/kernels/shim -// for more info on how this works. -template -class PhraseTokenizeOp - : public tflite::shim::OpKernelShim { - private: - enum Inputs { kInputValues = 0, kPhraseModel }; - enum Outputs { - kOutputSubwords = 0, - kOutputIds, - kOutputRowSplits, - }; - - using Shape = tflite::shim::Shape; - using typename tflite::shim::OpKernelShim::InitContext; - using - typename tflite::shim::OpKernelShim::InvokeContext; - using typename tflite::shim::OpKernelShim::ShapeInferenceContext; - - public: - PhraseTokenizeOp() = default; - static constexpr char kOpName[] = "PhraseTokenize"; - static constexpr char kDoc[] = R"doc( - Tokenizes tokens into phrases based off of a vocabulary. - - ### Example: - - ```python - >>> tokens = ['I have a dream', 'I like coffee'] - >>> phrase, ids, row_splits = ( - ... phrase_tokenize(tokens, model_buffer)) - >>> RaggedTensor.from_row_splits(phrase, row_splits) - [['I', 'have', 'a dream'], ['I like', 'coffee']] - >>> RaggedTensor.from_row_splits(ids, row_splits) - [[0, 1, 2], [3, 4]] # Dummy ids. - ``` - - Args: - input_values: 1D Tensor of strings to tokenize with. - phrase_model: Buffer tensor for the PhraseTokenizerConfig flatbuffer. - - Returns: - * output_values: 1D tensor containing the phrases for all input strings. - A 2D RaggedTensor can be constructed from this and output_row_splits. - * output_ids: 1D tensor containing the phrase ids for all input strings. - A 2D RaggedTensor can be constructed from this and output_row_splits. - * output_row_splits: 1D int tensor with the row splits that allow us to - build RaggedTensors from output_values, output_ids. - )doc"; - - static const char* OpName() { return kOpName; } - static const char* Doc() { return kDoc; } - - // Attributes declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Attrs() { return {}; } - - // Input tensors declaration (syntax: - // https://www.tensorflow.org/guide/create_op) - static std::vector Inputs(); - - // Output tensors declaration (syntax: - // https://www.tensorflow.org/guide/create_op) - static std::vector Outputs(); - - // Initializes the op - absl::Status Init(InitContext* context) { return absl::OkStatus(); } - - // Runs the operation - absl::Status Invoke(InvokeContext* context); - - // Shape inference - static absl::Status ShapeInference(ShapeInferenceContext* c); -}; - -////////////////////////// Implementation - -template -std::vector PhraseTokenizeOp::Inputs() { - return {"input_values: string", "phrase_model: uint8"}; -} - -template -std::vector PhraseTokenizeOp::Outputs() { - return {"output_subwords: string", "output_ids: int64", - "output_row_splits: int64"}; -} - -template -absl::Status PhraseTokenizeOp::Invoke(InvokeContext* context) { - SH_ASSIGN_OR_RETURN(const auto input_values, context->GetInput(kInputValues)); - const auto& values_vec = input_values->template As(); - - SH_ASSIGN_OR_RETURN(const auto phrase_model, context->GetInput(kPhraseModel)); - // OK to create on every call because PhraseTokenizer is a - // lightweight, memory-mapped wrapper on `phrase_model` tensor, and thus - // Create() is very cheap. - auto phrase_tokenizer = ::tensorflow::text::PhraseTokenizer::Create( - phrase_model->template Data().data()); - SH_RETURN_IF_ERROR(phrase_tokenizer.status()); - - std::vector subwords; - std::vector subword_ids; - std::vector row_splits; - - row_splits.push_back(0); - - // Iterate through all the values and wordpiece tokenize them. - for (int i = 0; i < values_vec.Dim(0); ++i) { - // Tokenize into subwords and record the offset locations. - const int original_num_wordpieces = subwords.size(); - phrase_tokenizer->Tokenize(values_vec(i), &subwords, &subword_ids); - const int delta_num_wordpieces = subwords.size() - original_num_wordpieces; - - // Record the row splits. - row_splits.push_back(delta_num_wordpieces + row_splits.back()); - } - - const int subwords_size = subwords.size(); - SH_ASSIGN_OR_RETURN( - auto output_subwords, - context->GetOutput(kOutputSubwords, Shape({subwords_size}))); - auto output_subwords_vec = - output_subwords->template As(); - - SH_ASSIGN_OR_RETURN( - auto output_ids, - context->GetOutput( - kOutputIds, - Shape({static_cast( - subword_ids.size())}))); /* same shape as `output_subwords` */ - auto output_ids_vec = output_ids->template As(); - - SH_ASSIGN_OR_RETURN( - auto output_row_splits, - context->GetOutput(kOutputRowSplits, - Shape({static_cast(row_splits.size())}))); - auto output_row_splits_vec = output_row_splits->template As(); - - for (int i = 0; i < subwords.size(); ++i) { - output_subwords_vec(i) = subwords[i]; - } - - for (int i = 0; i < subword_ids.size(); ++i) { - output_ids_vec(i) = subword_ids[i]; - } - - for (int i = 0; i < row_splits.size(); ++i) { - output_row_splits_vec(i) = row_splits[i]; - } - - return absl::OkStatus(); -} - -template -absl::Status PhraseTokenizeOp::ShapeInference(ShapeInferenceContext* c) { - using tflite::shim::Shape; - SH_ASSIGN_OR_RETURN(const Shape input_values_shape, - c->GetInputShape(kInputValues)); - SH_ASSIGN_OR_RETURN(const auto phrase_model_shape, - c->GetInputShape(kPhraseModel)); - const auto rank_1_shape = Shape({Shape::kUnknownDim}); - if (!input_values_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError( - absl::StrCat("Shape must be rank 1: ", input_values_shape.ToString())); - } - if (!phrase_model_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError( - absl::StrCat("Shape must be rank 1: ", phrase_model_shape.ToString())); - } - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputSubwords, rank_1_shape)); - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputIds, rank_1_shape)); - // row splits size - const int num_splits = Shape::AddDims(1, input_values_shape.Dim(0)); - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputRowSplits, Shape({num_splits}))); - - return absl::OkStatus(); -} - -// See `kDoc` data member for the documentation on this op kernel. -// -// This template class can be instantiated into a kernel for either TF or -// TFLite. See -// https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/kernels/shim -// for more info on how this works. -template -class PhraseDetokenizeOp - : public tflite::shim::OpKernelShim { - private: - enum Inputs { kInputValues = 0, kInputRowSplits, kPhraseModel }; - enum Outputs { kOutputWords = 0 }; - - using Shape = tflite::shim::Shape; - using - typename tflite::shim::OpKernelShim::InitContext; - using typename tflite::shim::OpKernelShim::InvokeContext; - using typename tflite::shim::OpKernelShim::ShapeInferenceContext; - - public: - PhraseDetokenizeOp() = default; - static constexpr char kOpName[] = "TFText>PhraseDetokenize"; - static constexpr char kDoc[] = R"doc( - Detokenizes phrase ids into sentences. - - ### Example: - - ```python - >>> # Vocab of the model_buffer: ['I', 'have', 'a dream']. - >>> wordpiece_ids = [2, 3, 4] - >>> row_splits = [0, 2, 3] - >>> tokens = phrase_tokenizer_detokenize(tokens, row_splits, model_buffer) - >>> tokens - ['I have', 'a dream'] - ``` - - Args: - input_values: 1D Tensor of phrase ids. - input_row_splits: 1D Tensor of row splits that denotes the boundary of each - sentence in the `input_values`. - phrase_model: Buffer tensor for the PhraseTokenizerConfig flatbuffer. - - Returns: - * output_values: 1D tensor containing all the sentences. - )doc"; - - static const char* OpName() { return kOpName; } - static const char* Doc() { return kDoc; } - - // Attributes declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Attrs() { return {}; } - - // Input tensors declaration (syntax: - // https://www.tensorflow.org/guide/create_op) - static std::vector Inputs(); - - // Output tensors declaration (syntax: - // https://www.tensorflow.org/guide/create_op) - static std::vector Outputs(); - - // Initializes the op - absl::Status Init(InitContext* context) { return absl::OkStatus(); } - - // Runs the operation - absl::Status Invoke(InvokeContext* context); - - // Shape inference - static absl::Status ShapeInference(ShapeInferenceContext* c); -}; - -////////////////////////// Implementation - -template -std::vector PhraseDetokenizeOp::Inputs() { - return {"input_values: int32", "input_row_splits: int64", - "phrase_model: uint8"}; -} - -template -std::vector PhraseDetokenizeOp::Outputs() { - return {"output_words: string"}; -} - -template -absl::Status PhraseDetokenizeOp::Invoke(InvokeContext* context) { - SH_ASSIGN_OR_RETURN(const auto input_values, context->GetInput(kInputValues)); - const auto& values_vec = input_values->template As(); - - SH_ASSIGN_OR_RETURN(const auto input_row_splits, - context->GetInput(kInputRowSplits)); - const auto& row_splits_vec = input_row_splits->template As(); - - SH_ASSIGN_OR_RETURN(const auto phrase_model, context->GetInput(kPhraseModel)); - // OK to create on every call because PhraseTokenizer is a - // lightweight, memory-mapped wrapper on `phrase_model` tensor, and thus - // Create() is very cheap. - auto phrase_tokenizer = ::tensorflow::text::PhraseTokenizer::Create( - phrase_model->template Data().data()); - SH_RETURN_IF_ERROR(phrase_tokenizer.status()); - - std::vector sentences; - - // Iterate through row_splits to split input_values. - for (int i = 0; i < row_splits_vec.Dim(0) - 1; ++i) { - auto single_input = - absl::Span(values_vec.Ptr() + row_splits_vec(i), - row_splits_vec(i + 1) - row_splits_vec(i)); - SH_ASSIGN_OR_RETURN(auto sentence, - phrase_tokenizer->Detokenize(single_input)); - sentences.push_back(sentence); - } - - SH_RETURN_IF_ERROR(this->template FillOutputTensor( - sentences, kOutputWords, context)); - - return absl::OkStatus(); -} - -template -absl::Status PhraseDetokenizeOp::ShapeInference(ShapeInferenceContext* c) { - using tflite::shim::Shape; - SH_ASSIGN_OR_RETURN(const Shape input_values_shape, - c->GetInputShape(kInputValues)); - SH_ASSIGN_OR_RETURN(const Shape input_row_splits_shape, - c->GetInputShape(kInputRowSplits)); - SH_ASSIGN_OR_RETURN(const auto phrase_model_shape, - c->GetInputShape(kPhraseModel)); - const auto rank_1_shape = Shape({Shape::kUnknownDim}); - if (!input_values_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError( - absl::StrCat("Shape must be rank 1: ", input_values_shape.ToString())); - } - if (!input_row_splits_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError(absl::StrCat( - "Shape must be rank 1: ", input_row_splits_shape.ToString())); - } - if (!phrase_model_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError( - absl::StrCat("Shape must be rank 1: ", phrase_model_shape.ToString())); - } - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputWords, rank_1_shape)); - return absl::OkStatus(); -} - -} // namespace text -} // namespace tensorflow - -#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_phrase_TOKENIZER_KERNEL_TEMPLATE_H_ +#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_PHRASE_TOKENIZER_KERNEL_TEMPLATE_H_ diff --git a/tensorflow_text/core/kernels/phrase_tokenizer_model.fbs b/tensorflow_text/core/kernels/phrase_tokenizer_model.fbs deleted file mode 100644 index d2d891cba..000000000 --- a/tensorflow_text/core/kernels/phrase_tokenizer_model.fbs +++ /dev/null @@ -1,38 +0,0 @@ -namespace tensorflow.text; - -table Trie { - nodes: [uint32]; -} - - -table PhraseTokenizerConfig { - // Probability of emitting a phrase when there is a match. - // The larger value means preferring shorter phrases over longer ones. - // I.e. 0 means always emit the longest possible phrase. - prob: int; - - // The unknown token string. - unk_token: string; - - // The unkown token id. - unk_token_id: int; - - // Whether the tokenizer supports detokenization function. - support_detokenization: bool; - - // Phrases Vocabulary array, this is for storting the phrase tokens in order, - // mainly used for detokenization. - vocab_array: [string]; - - // The trie is used to construct DoubleArrayTrie to do efficient prefix - // matching during tokenization. - vocab_trie: Trie; - - // whilte space config used to initalize the whitespace tokenzier. - whitespace_config: string; - - // Whether to split the end_puctualtion for each token. - split_end_punctuation: bool; -} - -root_type PhraseTokenizerConfig; diff --git a/tensorflow_text/core/kernels/phrase_tokenizer_model_builder.cc b/tensorflow_text/core/kernels/phrase_tokenizer_model_builder.cc deleted file mode 100644 index 268aeb32c..000000000 --- a/tensorflow_text/core/kernels/phrase_tokenizer_model_builder.cc +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/phrase_tokenizer_model_builder.h" - -#include - -#include -#include -#include - -#include "absl/container/flat_hash_map.h" -#include "absl/container/flat_hash_set.h" -#include "absl/status/status.h" -#include "absl/strings/string_view.h" -#include "tensorflow/lite/kernels/shim/status_macros.h" -#include "tensorflow_text/core/kernels/phrase_tokenizer_model_generated.h" -#include "tensorflow_text/core/kernels/sentencepiece/double_array_trie_builder.h" -#include "tensorflow_text/core/kernels/string_vocab.h" -#include "tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.h" - -namespace tensorflow { -namespace text { -namespace { - -// Builds the PhraseTokenizer model. -class PhraseBuilder { - public: - absl::Status BuildModel(const std::vector& vocab, - const std::string& unk_token, - bool support_detokenization, int prob, - bool split_end_punctuation); - - absl::StatusOr ExportToFlatBuffer() const; - - private: - std::unique_ptr vocab_; - std::vector trie_data_; - std::string unk_token_; - int unk_token_id_; - // Whether the tokenizer supports the detokenization function. - bool support_detokenization_; - int prob_; - bool split_end_punctuation_; -}; - -absl::Status PhraseBuilder::BuildModel(const std::vector& vocab, - const std::string& unk_token, - bool support_detokenization, int prob, - bool split_end_punctuation) { - unk_token_ = std::string(unk_token); - support_detokenization_ = support_detokenization; - prob_ = prob; - split_end_punctuation_ = split_end_punctuation; - - vocab_ = std::make_unique(vocab); - if (vocab_->Size() != vocab.size()) { - return absl::FailedPreconditionError( - "Tokens in the vocabulary must be unique."); - } - - // Determine `unk_token_id_`. - const absl::optional unk_token_id = vocab_->LookupId(unk_token_); - if (!unk_token_id.has_value()) { - return absl::FailedPreconditionError("Cannot find unk_token in the vocab!"); - } - unk_token_id_ = *unk_token_id; - - // build trie. - trie_data_ = sentencepiece::BuildTrie(vocab); - - return absl::OkStatus(); -} - -absl::StatusOr PhraseBuilder::ExportToFlatBuffer() const { - flatbuffers::FlatBufferBuilder builder; - - const auto unk_token = builder.CreateString(unk_token_); - - std::vector> vocab_fbs_vector; - - if (support_detokenization_) { - vocab_fbs_vector.reserve(vocab_->Size()); - for (int i = 0; i < vocab_->Size(); ++i) { - const absl::optional word = vocab_->LookupWord(i); - if (!word.has_value()) { - return absl::FailedPreconditionError( - "Impossible. `token_id` is definitely within the range of vocab " - "token ids; hence LookupWord() should always succeed."); - } - absl::string_view token = word.value(); - vocab_fbs_vector.emplace_back(builder.CreateString(token)); - } - } - - auto vocab_array = builder.CreateVector(vocab_fbs_vector); - - std::string ws_config = BuildWhitespaceTokenizerConfig(); - auto whitespace_config = builder.CreateString(ws_config); - auto trie_data = builder.CreateVector(trie_data_); - - TrieBuilder trie_builder(builder); - trie_builder.add_nodes(trie_data); - const auto trie_fbs = trie_builder.Finish(); - - PhraseTokenizerConfigBuilder wtcb(builder); - wtcb.add_unk_token(unk_token); - wtcb.add_unk_token_id(unk_token_id_); - wtcb.add_support_detokenization(support_detokenization_); - wtcb.add_vocab_array(vocab_array); - wtcb.add_whitespace_config(whitespace_config); - wtcb.add_vocab_trie(trie_fbs); - wtcb.add_prob(prob_); - wtcb.add_split_end_punctuation(split_end_punctuation_); - FinishPhraseTokenizerConfigBuffer(builder, wtcb.Finish()); - return std::string(reinterpret_cast(builder.GetBufferPointer()), - builder.GetSize()); -} -} // namespace - -absl::StatusOr BuildPhraseModelAndExportToFlatBuffer( - const std::vector& vocab, const std::string& unk_token, - bool support_detokenization, int prob, bool split_end_punctuation) { - PhraseBuilder builder; - SH_RETURN_IF_ERROR(builder.BuildModel( - vocab, unk_token, support_detokenization, prob, split_end_punctuation)); - SH_ASSIGN_OR_RETURN(std::string flatbuffer, builder.ExportToFlatBuffer()); - return flatbuffer; -} - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/phrase_tokenizer_model_builder.h b/tensorflow_text/core/kernels/phrase_tokenizer_model_builder.h index 86cd35b20..2f89b19e7 100644 --- a/tensorflow_text/core/kernels/phrase_tokenizer_model_builder.h +++ b/tensorflow_text/core/kernels/phrase_tokenizer_model_builder.h @@ -15,30 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_PHRASE_TOKENIZER_MODEL_BUILDER_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_PHRASE_TOKENIZER_MODEL_BUILDER_H_ -#include -#include - -#include "absl/container/flat_hash_map.h" -#include "absl/status/statusor.h" - -namespace tensorflow { -namespace text { - -// Builds a PhraseTokenizer model in flatbuffer format. -// -// Args: -// * vocab: The phrase vocabulary. -// * unk_token: The unknown token string. -//. * support_detokenization: Whether to enable the detokenization function. -// Setting it to true expands the size of the flatbuffer. -// * prob: Probability of emitting a phrase when there is a match. -// Returns: -// The bytes of the flatbuffer that stores the model. -absl::StatusOr BuildPhraseModelAndExportToFlatBuffer( - const std::vector& vocab, const std::string& unk_token, - bool support_detokenization = false, int prob = 0, - bool split_end_punctuation = false); -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/phrase_tokenizer_model_builder.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_PHRASE_TOKENIZER_MODEL_BUILDER_H_ diff --git a/tensorflow_text/core/kernels/utf8_binarize_kernel.cc b/tensorflow_text/core/kernels/phrase_tokenizer_model_generated.h similarity index 64% rename from tensorflow_text/core/kernels/utf8_binarize_kernel.cc rename to tensorflow_text/core/kernels/phrase_tokenizer_model_generated.h index 80049896f..8b7067bef 100644 --- a/tensorflow_text/core/kernels/utf8_binarize_kernel.cc +++ b/tensorflow_text/core/kernels/phrase_tokenizer_model_generated.h @@ -12,16 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "tensorflow_text/core/kernels/utf8_binarize_kernel.h" +#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_PHRASE_TOKENIZER_MODEL_GENERATED_H_ +#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_PHRASE_TOKENIZER_MODEL_GENERATED_H_ -#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/text/phrase_tokenizer_model_generated.h" -namespace tensorflow { -namespace text { - -REGISTER_KERNEL_BUILDER( - Name(Utf8BinarizeOpKernel::OpName()).Device(tensorflow::DEVICE_CPU), - Utf8BinarizeOpKernel); - -} // namespace text -} // namespace tensorflow +#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_PHRASE_TOKENIZER_MODEL_GENERATED_H_ diff --git a/tensorflow_text/core/kernels/phrase_tokenizer_test.cc b/tensorflow_text/core/kernels/phrase_tokenizer_test.cc deleted file mode 100644 index e8ae06570..000000000 --- a/tensorflow_text/core/kernels/phrase_tokenizer_test.cc +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/phrase_tokenizer.h" - -#include -#include -#include "absl/status/status.h" -#include "absl/status/statusor.h" -#include "tensorflow/core/platform/env.h" - -namespace tensorflow { -namespace text { -namespace { - -using ::testing::ElementsAre; - -/* With the following vocab: - -I -heard -the -news -today -have -heard news today -the news today -*/ -constexpr char kTestConfigPath[] = - "tensorflow_text/python/ops/test_data/" - "phrase_tokenizer_model.fb"; - -TEST(PhraseTokenizerTest, Tokenize) { - absl::string_view input("I heard the news today"); - std::vector output_tokens; - std::vector output_token_ids; - - std::string config_flatbuffer; - auto status = tensorflow::ReadFileToString( - tensorflow::Env::Default(), kTestConfigPath, &config_flatbuffer); - ASSERT_TRUE(status.ok()); - - ASSERT_OK_AND_ASSIGN(auto tokenizer, - PhraseTokenizer::Create(config_flatbuffer.data())); - - tokenizer.Tokenize(input, &output_tokens, &output_token_ids); - EXPECT_THAT(output_tokens, ElementsAre("I", "heard", "the news today")); - EXPECT_THAT(output_token_ids, ElementsAre(1, 2, 8)); -} - -TEST(PhraseTokenizerTest, TokenizeLonger) { - absl::string_view input("I heard the news today I heard"); - std::vector output_tokens; - std::vector output_token_ids; - - std::string config_flatbuffer; - auto status = tensorflow::ReadFileToString( - tensorflow::Env::Default(), kTestConfigPath, &config_flatbuffer); - ASSERT_TRUE(status.ok()); - - ASSERT_OK_AND_ASSIGN(auto tokenizer, - PhraseTokenizer::Create(config_flatbuffer.data())); - - tokenizer.Tokenize(input, &output_tokens, &output_token_ids); - EXPECT_THAT(output_tokens, - ElementsAre("I", "heard", "the news today", "I", "heard")); - EXPECT_THAT(output_token_ids, ElementsAre(1, 2, 8, 1, 2)); -} - -TEST(PhraseTokenizerTest, DeTokenize) { - std::vector input({1, 2, 8}); - - std::string config_flatbuffer; - auto status = tensorflow::ReadFileToString( - tensorflow::Env::Default(), kTestConfigPath, &config_flatbuffer); - ASSERT_TRUE(status.ok()); - - ASSERT_OK_AND_ASSIGN(auto tokenizer, - PhraseTokenizer::Create(config_flatbuffer.data())); - - auto output_string = tokenizer.Detokenize(input); - EXPECT_EQ(output_string.value(), "I heard the news today"); -} - -} // namespace -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/ragged_tensor_to_tensor_tflite.cc b/tensorflow_text/core/kernels/ragged_tensor_to_tensor_tflite.cc deleted file mode 100644 index 977cf5836..000000000 --- a/tensorflow_text/core/kernels/ragged_tensor_to_tensor_tflite.cc +++ /dev/null @@ -1,745 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include -#include - -#include "flatbuffers/flexbuffers.h" -#include "tensorflow/core/util/ragged_to_dense_util_common.h" -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/mutable_op_resolver.h" -#include "tensorflow/lite/kernels/internal/types.h" -#include "tensorflow/lite/kernels/kernel_util.h" - -namespace tflite { -namespace ops { -namespace custom { -namespace text { -namespace ragged_tensor_to_tensor { -namespace { - -constexpr int kShapeInput = 0; -constexpr int kValuesInput = 1; -constexpr int kDefaultValueInput = 2; -constexpr int kFirstPartitionInputIndex = 3; - -constexpr int kOutputTensor = 0; - -constexpr char kRowPartitionTypesAttr[] = "row_partition_types"; - -// The following three functions are copied from -// .../tensorflow/lite/kernels/internal/tensor_ctypes.h -// This header is not available in tensorflow package when building. -template -inline T* GetTensorData(TfLiteTensor* tensor) { - return tensor != nullptr ? reinterpret_cast(tensor->data.raw) : nullptr; -} - -template -inline const T* GetTensorData(const TfLiteTensor* tensor) { - return tensor != nullptr ? reinterpret_cast(tensor->data.raw) - : nullptr; -} - -inline RuntimeShape GetTensorShape(const TfLiteTensor* tensor) { - if (tensor == nullptr) { - return RuntimeShape(); - } - - TfLiteIntArray* dims = tensor->dims; - const int dims_size = dims->size; - const int32_t* dims_data = reinterpret_cast(dims->data); - return RuntimeShape(dims_size, dims_data); -} - -struct ConversionAttributes { - std::vector partition_types; - int ragged_rank = 0; - - tensorflow::RowPartitionType GetRowPartitionTypeByDimension( - int dimension) const { - if (partition_types.front() == - tensorflow::RowPartitionType::FIRST_DIM_SIZE) { - return partition_types[dimension + 1]; - } else { - return partition_types[dimension]; - } - } -}; -template -int GetFirstDimensionSizeT(TfLiteContext* context, - const TfLiteTensor& first_partition_input, - const ConversionAttributes* attributes) { - const tensorflow::RowPartitionType first_partition_type = - attributes->partition_types.front(); - switch (first_partition_type) { - case tensorflow::RowPartitionType::FIRST_DIM_SIZE: - return *GetTensorData(&first_partition_input); - case tensorflow::RowPartitionType::VALUE_ROWIDS: - context->ReportError(context, - "Cannot handle VALUE_ROWIDS in first dimension."); - return -1; - case tensorflow::RowPartitionType::ROW_SPLITS: { - const auto shape = GetTensorShape(&first_partition_input); - return shape.Dims(0) - 1; - } - - default: - context->ReportError( - context, "Cannot handle type ", - RowPartitionTypeToString(first_partition_type).c_str()); - return -1; - } -} - -int GetFirstDimensionSize(TfLiteContext* context, - const TfLiteTensor& first_partition_input, - const ConversionAttributes* attributes) { - switch (first_partition_input.type) { - case kTfLiteInt32: - return GetFirstDimensionSizeT(context, first_partition_input, - attributes); - case kTfLiteInt64: - return GetFirstDimensionSizeT(context, first_partition_input, - attributes); - default: - context->ReportError(context, - "Not supported row partitioning tensor type"); - return -1; - } -} - -bool ValidateDefaultValueShape(TfLiteContext* context, - const RuntimeShape& default_value_shape, - const RuntimeShape& /*value_shape*/) { - // TF implementation also checks that shapes are not defined, not needed in - // TFLite. - // TODO(mgubin): Only scalar default value sizes are supported. - if (default_value_shape.FlatSize() != 1) { - context->ReportError(context, "Only scalar default value is supported"); - return false; - } - return true; -} - -RuntimeShape TensorShapeFromTensor(const TfLiteTensor& tensor) { - // TODO(mgubin): No checks, see - // third_party/tensorflow/core/kernels/list_kernels.cc - const RuntimeShape tensor_shape(tensor.dims->size, tensor.dims->data); - if (0 == tensor.dims->size) { - // If the input tensor is scalar then the shape is empty (also scalar). - return RuntimeShape{}; - } - RuntimeShape result(tensor_shape.FlatSize()); - switch (tensor.type) { - case kTfLiteInt32: { - for (int i = 0; i < tensor_shape.FlatSize(); ++i) { - result.SetDim(i, GetTensorData(&tensor)[i]); - } - } break; - case kTfLiteInt64: { - for (int i = 0; i < tensor_shape.FlatSize(); ++i) { - result.SetDim(i, GetTensorData(&tensor)[i]); - } - } break; - default: { - // Checked in Prepare. - } - } - return result; -} - -const TfLiteTensor* GetRowPartitionTensor( - const ConversionAttributes& conversion_attributes, TfLiteContext* context, - TfLiteNode* node, int dimension) { - if (conversion_attributes.partition_types.front() == - tensorflow::RowPartitionType::FIRST_DIM_SIZE) { - return &context->tensors[node->inputs->data[kFirstPartitionInputIndex + 1 + - dimension]]; - } else { - return &context->tensors[node->inputs - ->data[kFirstPartitionInputIndex + dimension]]; - } -} - -int GetMaxWidthValueRowID(const TfLiteTensor* tensor) { - const RuntimeShape tensor_shape(tensor->dims->size, tensor->dims->data); - const int index_length = tensor_shape.FlatSize(); - if (index_length == 0) { - return 0; - } - auto value_rowids = [tensor](int index) { - switch (tensor->type) { - case kTfLiteInt32: - return static_cast(tensor->data.i32[index]); - case kTfLiteInt64: - return static_cast(tensor->data.i64[index]); - default: - // TODO(mgubin): Add error checks. - return 0; - } - }; - int first_equal_index = 0; - int first_equal_index_value = value_rowids(0); - int max_width = 0; - for (int i = 0; i < index_length; ++i) { - const int value = value_rowids(i); - if (value != first_equal_index_value) { - first_equal_index_value = value; - max_width = std::max(i - first_equal_index, max_width); - first_equal_index = i; - } - } - return std::max(index_length - first_equal_index, max_width); -} - -int GetMaxWidthRowSplit(const TfLiteTensor* tensor) { - const RuntimeShape tensor_shape(tensor->dims->size, tensor->dims->data); - const int tensor_length = tensor_shape.FlatSize(); - if (tensor_length == 0 || tensor_length == 1) { - return 0; - } - auto value_rowsplit = [tensor](int index) { - switch (tensor->type) { - case kTfLiteInt32: - return static_cast(tensor->data.i32[index]); - case kTfLiteInt64: - return static_cast(tensor->data.i64[index]); - default: - // TODO(mgubin): Add error checks. - return 0; - } - }; - int max_width = 1; - int prev_split = value_rowsplit(0); - for (int i = 1; i < tensor_length; ++i) { - const int split = value_rowsplit(i); - max_width = std::max(max_width, split - prev_split); - prev_split = split; - } - return max_width; -} - -int GetMaxWidth(const ConversionAttributes& conversion_attributes, - TfLiteContext* context, TfLiteNode* node, int dimension) { - const TfLiteTensor* tensor = GetRowPartitionTensor( - conversion_attributes, context, node, dimension - 1); - switch (conversion_attributes.GetRowPartitionTypeByDimension(dimension - 1)) { - case tensorflow::RowPartitionType::VALUE_ROWIDS: - return GetMaxWidthValueRowID(tensor); - case tensorflow::RowPartitionType::ROW_SPLITS: - return GetMaxWidthRowSplit(tensor); - default: - context->ReportError(context, "Cannot handle partition type"); - return -1; - } -} - -RuntimeShape CombineRaggedTensorToTensorShapes( - int ragged_rank, const RuntimeShape& output_shape, - const RuntimeShape& value_shape) { - // TODO(mgubin): No checks, see - // third_party/tensorflow/core/ops/ragged_to_dense_util.cc - RuntimeShape result(output_shape); - if (output_shape.DimensionsCount() == 0) { - const int output_shape_rank = ragged_rank + value_shape.DimensionsCount(); - result.Resize(output_shape_rank); - for (int i = 0; i < output_shape_rank; ++i) { - result.SetDim(i, -1); - } - } - const int need_to_set = - output_shape.DimensionsCount() - value_shape.DimensionsCount(); - for (int i = 1; i < value_shape.DimensionsCount(); ++i) { - result.SetDim(need_to_set + i, value_shape.Dims(i)); - } - return result; -} - -RuntimeShape CalculateOutputSize( - const ConversionAttributes& conversion_attributes, TfLiteContext* context, - TfLiteNode* node, int first_dimension, int ragged_rank, - const TfLiteTensor& values, const TfLiteTensor& default_value, - const TfLiteTensor& output_shape) { - RuntimeShape values_shape(values.dims->size, values.dims->data); - RuntimeShape default_value_shape(default_value.dims->size, - default_value.dims->data); - - if (!ValidateDefaultValueShape(context, default_value_shape, values_shape)) { - return {}; - } - RuntimeShape output_shape_shape = TensorShapeFromTensor(output_shape); - - RuntimeShape result_shape = CombineRaggedTensorToTensorShapes( - ragged_rank, output_shape_shape, values_shape); - if (result_shape.Dims(0) < 0) { - result_shape.SetDim(0, first_dimension); - } - for (int i = 1; i <= ragged_rank; ++i) { - if (result_shape.Dims(i) < 0) { - result_shape.SetDim(i, - GetMaxWidth(conversion_attributes, context, node, i)); - } - } - return result_shape; -} - -TfLiteIntArray* IntArrayFromShape(const RuntimeShape& shape) { - TfLiteIntArray* result = TfLiteIntArrayCreate(shape.DimensionsCount()); - for (int i = 0; i < shape.DimensionsCount(); ++i) { - result->data[i] = shape.Dims(i); - } - return result; -} - -/** - * The output_index represents the index in the output tensor - * where the first element of a particular dimension would be written. - * If it is -1, it indicates that the index is out of scope. - * Example, given first_dimension = 10, first_dimension_output = 6, - * and output_index_multiplier = 100: - * result = [0 100 200 300 400 500 -1 -1 -1 -1] - * If first_dimension_output = 11 instead, then: - * result = [0 100 200 300 400 500 600 700 800 900] - */ -void CalculateFirstParentOutputIndex(int first_dimension, - int output_index_multiplier, - int first_dimension_output, - std::vector* result) { - const int min_dimension = std::min(first_dimension, first_dimension_output); - result->reserve(first_dimension); - int current_output_index = 0; - for (int i = 0; i < min_dimension; - ++i, current_output_index += output_index_multiplier) { - result->push_back(current_output_index); - } - for (int i = min_dimension; i < first_dimension; ++i) { - result->push_back(-1); - } -} -// Calculate the output index of the first element of a list. -// The parent_output_index is the same computation for the previous list. -// -1 indicates an element or list that is out of range. -// The output_index_multiplier is the number of output indices one moves -// forward for each column. -// E.g., given: -// value_rowids:[0 1 2 2 2 3 5 5 6] -// parent_output_index:[1000 1100 2000 2100 -1 3000 4000] -// output_index_multiplier: 10 -// output_size: 2 -// You get: -// result = [1000 1100 2000 2010 -1 2100 -1 -1 3000] -// result[0] = parent_output_index[value_rowids[0]] -// result[1] = parent_output_index[value_rowids[1]] -// result[2] = parent_output_index[value_rowids[2]] -// result[3] = parent_output_index[value_rowids[2] + 10] -// result[4] = -1 because it is the third element the size is 2. -// result[5] = parent_output_index[value_rowids[3]] -// result[6] = -1 because parent_output_index[value_rowids[6]] == -1 -// result[7] = -1 because parent_output_index[value_rowids[6]] == -1 -// result[8] = parent_output_index[value_rowids[7]] -void CalculateOutputIndexValueRowID(const TfLiteTensor& value_rowids, - const std::vector& parent_output_index, - int output_index_multiplier, - int output_size, std::vector* result) { - const RuntimeShape tensor_shape(value_rowids.dims->size, - value_rowids.dims->data); - const int index_size = tensor_shape.FlatSize(); - result->reserve(index_size); - if (index_size == 0) { - return; - } - - auto value_rowids_val = [value_rowids](int index) { - switch (value_rowids.type) { - case kTfLiteInt32: - return static_cast(value_rowids.data.i32[index]); - case kTfLiteInt64: - return static_cast(value_rowids.data.i64[index]); - default: - // TODO(mgubin): Add error checks. - return 0; - } - }; - int current_output_column = 0; - int current_value_rowid = value_rowids_val(0); - // DCHECK_LT(current_value_rowid, parent_output_index.size()); - int current_output_index = parent_output_index[current_value_rowid]; - result->push_back(current_output_index); - for (int i = 1; i < index_size; ++i) { - int next_value_rowid = value_rowids_val(i); - if (next_value_rowid == current_value_rowid) { - if (current_output_index >= 0) { - ++current_output_column; - if (current_output_column < output_size) { - current_output_index += output_index_multiplier; - } else { - current_output_index = -1; - } - } - } else { - current_output_column = 0; - current_value_rowid = next_value_rowid; - // DCHECK_LT(next_value_rowid, parent_output_index.size()); - current_output_index = parent_output_index[next_value_rowid]; - } - result->push_back(current_output_index); - } - // DCHECK_EQ(result->size(), value_rowids.size()); -} - -void CalculateOutputIndexRowSplit(const TfLiteTensor& row_split, - const std::vector& parent_output_index, - int output_index_multiplier, int output_size, - std::vector* result) { - const RuntimeShape row_split_shape(row_split.dims->size, - row_split.dims->data); - const int row_split_size = row_split_shape.FlatSize(); - auto row_split_val = [row_split](int index) { - switch (row_split.type) { - case kTfLiteInt32: - return static_cast(row_split.data.i32[index]); - case kTfLiteInt64: - return static_cast(row_split.data.i64[index]); - default: - // TODO(mgubin): Add error checks. - return 0; - } - }; - if (row_split_size > 0) { - result->reserve(row_split_val(row_split_size - 1)); - } - for (int i = 0; i < row_split_size - 1; ++i) { - const int row_length = row_split_val(i + 1) - row_split_val(i); - int real_length = std::min(output_size, row_length); - int parent_output_index_current = parent_output_index[i]; - - if (parent_output_index_current == -1) { - real_length = 0; - } - for (int j = 0; j < real_length; ++j) { - result->push_back(parent_output_index_current); - parent_output_index_current += output_index_multiplier; - } - for (int j = 0; j < row_length - real_length; ++j) { - result->push_back(-1); - } - } - // if (row_split_size > 0) { - // DCHECK_EQ(result->size(), row_split(row_split_size - 1)); - //} -} - -TfLiteStatus CalculateOutputIndex( - const ConversionAttributes& conversion_attributes, TfLiteContext* context, - TfLiteNode* node, int dimension, - const std::vector& parent_output_index, int output_index_multiplier, - int output_size, std::vector* result) { - const TfLiteTensor* row_partition_tensor = - GetRowPartitionTensor(conversion_attributes, context, node, dimension); - auto partition_type = - conversion_attributes.GetRowPartitionTypeByDimension(dimension); - switch (partition_type) { - case tensorflow::RowPartitionType::VALUE_ROWIDS: - CalculateOutputIndexValueRowID(*row_partition_tensor, parent_output_index, - output_index_multiplier, output_size, - result); - return kTfLiteOk; - case tensorflow::RowPartitionType::ROW_SPLITS: - CalculateOutputIndexRowSplit(*row_partition_tensor, parent_output_index, - output_index_multiplier, output_size, - result); - return kTfLiteOk; - default: - context->ReportError(context, "Unsupported partition type"); - return kTfLiteError; - } -} - -template -void SetOutputT(TfLiteContext* context, int ragged_rank, - const std::vector& output_index, - const TfLiteTensor& values_tensor, - const TfLiteTensor& default_value_tensor, - TfLiteTensor* output_tensor) { - const VALUE_TYPE* values_base = GetTensorData(&values_tensor); - VALUE_TYPE* output_base = GetTensorData(output_tensor); - const VALUE_TYPE* default_value = - GetTensorData(&default_value_tensor); - - RuntimeShape output_shape = GetTensorShape(output_tensor); - RuntimeShape element_shape = - RuntimeShape(output_shape.DimensionsCount() - ragged_rank - 1, - output_shape.DimsData() + ragged_rank + 1); - - // element_shape.RemoveDimRange(0, ragged_rank + 1); - const int value_element_size = element_shape.FlatSize(); - size_t output_index_size = output_index.size(); - - // Loop through the output_index vector, finding contiguous regions that - // should be copied. Once we find the end of a contiguous region, copy it - // and add any necessary padding (with default_value). - int src_start = 0; // Start of contiguous region (in values) - int dst_start = 0; // Destination for contiguous region (in output) - int dst_end = 0; // Destination for contiguous region (in output) - for (int src_i = 0; src_i <= output_index_size; ++src_i) { - // dst_i is the destination where the value at src_i should be copied. - int dst_i = src_i < output_index_size ? output_index[src_i] : -1; - - // If we're still in a contiguous region, then update dst_end go to the - // next src_i. - if (dst_i == dst_end) { - ++dst_end; - continue; - } - - // We found the end of contiguous region. This can be because we found - // a gap (dst_i > dst_end), or a source value that shouldn't be copied - // because it's out-of-bounds (dst_i == -1), or the end of the tensor - // (dst_i = -1). - if (dst_start < dst_end) { - // Copy the contiguous region. - const VALUE_TYPE* src = values_base + src_start * value_element_size; - VALUE_TYPE* dst = output_base + dst_start * value_element_size; - int nvals = (dst_end - dst_start) * value_element_size; - std::copy(src, src + nvals, dst); - // copy_array(dst, src, nvals); - } - - // Add any necessary padding (w/ default_value). - if (src_i >= output_index_size) { - // We reached the end of values: pad to the end of output. - const int output_size = output_shape.FlatSize(); - dst_i = output_size / value_element_size; - } - if (dst_i > dst_end) { - std::fill(output_base + dst_end * value_element_size, - output_base + dst_i * value_element_size, *default_value); - dst_end = dst_i; - } - - // Update indices. - if (dst_i < 0) { - // src_i should be skipped -- leave it out of the contiguous region. - src_start = src_i + 1; - dst_start = dst_end; - } else { - // src_i should be copied -- include it in the contiguous region. - src_start = src_i; - dst_start = dst_end; - dst_end = dst_start + 1; - } - } -} - -bool IsSupportedTensorType(TfLiteType type) { - // Should reflect SetOutput capabilities. - return type == kTfLiteInt32 || type == kTfLiteInt64 || type == kTfLiteFloat32; -} - -TfLiteStatus SetOutput(TfLiteContext* context, int ragged_rank, - const std::vector& output_index, - const TfLiteTensor& values_tensor, - const TfLiteTensor& default_value_tensor, - TfLiteTensor* output_tensor) { - switch (output_tensor->type) { - case kTfLiteInt32: - SetOutputT(context, ragged_rank, output_index, values_tensor, - default_value_tensor, output_tensor); - return kTfLiteOk; - case kTfLiteInt64: - SetOutputT(context, ragged_rank, output_index, values_tensor, - default_value_tensor, output_tensor); - return kTfLiteOk; - case kTfLiteFloat32: - SetOutputT(context, ragged_rank, output_index, values_tensor, - default_value_tensor, output_tensor); - return kTfLiteOk; - default: - // Should not happen, checked in Prepare. - // Left as a defensive programming artifact for future updates. - context->ReportError(context, "Not supported values type"); - return kTfLiteError; - } -} - -} // namespace - -void* Initialize(TfLiteContext* context, const char* buffer, size_t length) { - auto attributes = std::make_unique(); - - const uint8_t* buffer_t = reinterpret_cast(buffer); - - const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap(); - // TODO (mgubin): Converting flat buffer to a vector of strings looks not very - // effective but simple. A cleaner way is needed. - const flexbuffers::TypedVector row_partition_types_attr = - m[kRowPartitionTypesAttr].AsTypedVector(); - std::vector row_partition_types_attr_strings; - row_partition_types_attr_strings.reserve(row_partition_types_attr.size()); - for (int i = 0; i < row_partition_types_attr.size(); ++i) { - row_partition_types_attr_strings.emplace_back( - row_partition_types_attr[i].AsString().str()); - } - attributes->partition_types = - tensorflow::GetRowPartitionTypesHelper(row_partition_types_attr_strings); - if (attributes->partition_types.size() != - row_partition_types_attr_strings.size()) { - context->ReportError(context, "Can't parse partition type attribute"); - return nullptr; - } - attributes->ragged_rank = - tensorflow::GetRaggedRank(attributes->partition_types); - return attributes.release(); -} -void Free(TfLiteContext* /*context*/, void* buffer) { - ConversionAttributes* attributes = - reinterpret_cast(buffer); - delete attributes; -} - -TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - const ConversionAttributes* attributes = - reinterpret_cast(node->user_data); - if (attributes == nullptr) { - // Parsing attributes failed, can't prepare. - context->ReportError(context, "Attributes are not initialized"); - return kTfLiteError; - } - TfLiteTensor& output_tensor = - context->tensors[node->outputs->data[kOutputTensor]]; - if (!IsSupportedTensorType(output_tensor.type)) { - context->ReportError(context, "Unsupported ragged tensor type"); - return kTfLiteError; - } - // The output tensor needs to be set to dynamic because it can have different - // size. - SetTensorToDynamic(&output_tensor); - - // Check that input shape tensor is int32 or int64 - TfLiteTensor& input_shape = context->tensors[node->inputs->data[kShapeInput]]; - if (input_shape.type != kTfLiteInt32 && input_shape.type != kTfLiteInt64) { - context->ReportError(context, - "Input shape tensor could be only int32 or int64"); - return kTfLiteError; - } - return kTfLiteOk; -} - -TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - const ConversionAttributes* attributes = - reinterpret_cast(node->user_data); - TfLiteTensor& input_shape = context->tensors[node->inputs->data[kShapeInput]]; - TfLiteTensor& input_values = - context->tensors[node->inputs->data[kValuesInput]]; - TfLiteTensor& default_value = - context->tensors[node->inputs->data[kDefaultValueInput]]; - // TODO (mgubin): Only scallar default value is supported. - if (RuntimeShape(default_value.dims->size, default_value.dims->data) - .FlatSize() != 1) { - context->ReportError(context, "Only scallar default value is supported"); - return kTfLiteError; - } - TfLiteTensor& first_partition_input = - context->tensors[node->inputs->data[kFirstPartitionInputIndex]]; - - // Calculate dimensions. - const int first_dimension = - GetFirstDimensionSize(context, first_partition_input, attributes); - if (first_dimension < 0) { - return kTfLiteError; - } - RuntimeShape output_shape = CalculateOutputSize( - *attributes, context, node, first_dimension, attributes->ragged_rank, - input_values, default_value, input_shape); - if (output_shape.DimensionsCount() == 0) { - return kTfLiteError; - } - - std::vector multiplier; - multiplier.resize(attributes->ragged_rank + 1); - multiplier.back() = 1; - for (int i = multiplier.size() - 2; i >= 0; --i) { - multiplier[i] = multiplier[i + 1] * output_shape.Dims(i + 1); - } - - // Allocate output tensor. - TfLiteTensor& output_tensor = - context->tensors[node->outputs->data[kOutputTensor]]; - - TF_LITE_ENSURE_OK(context, - context->ResizeTensor(context, &output_tensor, - IntArrayFromShape(output_shape))); - - // Copy data. - const int full_size = multiplier.front() * output_shape.Dims(0); - if (full_size > 0) { - std::vector output_index, new_output_index; - int nvals = input_values.dims->data[0]; - output_index.reserve(nvals); - new_output_index.reserve(nvals); - - CalculateFirstParentOutputIndex(first_dimension, multiplier[0], - output_shape.Dims(0), &output_index); - for (int i = 1; i <= attributes->ragged_rank; ++i) { - TF_LITE_ENSURE_OK( - context, CalculateOutputIndex( - *attributes, context, node, i - 1, output_index, - multiplier[i], output_shape.Dims(i), &new_output_index)); - output_index.swap(new_output_index); - new_output_index.clear(); - } - - TF_LITE_ENSURE_OK(context, - SetOutput(context, attributes->ragged_rank, output_index, - input_values, default_value, &output_tensor)); - } - return kTfLiteOk; -} - -static TfLiteRegistration* GetTfLiteRegistration() { - static TfLiteRegistration r = {Initialize, Free, Prepare, Eval}; - return &r; -} - -} // namespace ragged_tensor_to_tensor - -extern "C" void AddRaggedTensorToTensor(tflite::MutableOpResolver* resolver) { - resolver->AddCustom("RaggedTensorToTensor", - ragged_tensor_to_tensor::GetTfLiteRegistration()); -} - -TfLiteRegistration* Register_RAGGED_TENSOR_TO_TENSOR() { - return ragged_tensor_to_tensor::GetTfLiteRegistration(); -} - -} // namespace text -} // namespace custom -} // namespace ops -} // namespace tflite diff --git a/tensorflow_text/core/kernels/ragged_tensor_to_tensor_tflite.h b/tensorflow_text/core/kernels/ragged_tensor_to_tensor_tflite.h index 30586a094..bf587ca35 100644 --- a/tensorflow_text/core/kernels/ragged_tensor_to_tensor_tflite.h +++ b/tensorflow_text/core/kernels/ragged_tensor_to_tensor_tflite.h @@ -15,19 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_RAGGED_TENSOR_TO_TENSOR_TFLITE_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_RAGGED_TENSOR_TO_TENSOR_TFLITE_H_ -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/mutable_op_resolver.h" - -namespace tflite { -namespace ops { -namespace custom { -namespace text { - -extern "C" void AddRaggedTensorToTensor(::tflite::MutableOpResolver* resolver); - -} // namespace text -} // namespace custom -} // namespace ops -} // namespace tflite +#include "tensorflow/core/kernels/text/ragged_tensor_to_tensor_tflite.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_RAGGED_TENSOR_TO_TENSOR_TFLITE_H_ diff --git a/tensorflow_text/core/kernels/ragged_tensor_to_tensor_tflite_test.cc b/tensorflow_text/core/kernels/ragged_tensor_to_tensor_tflite_test.cc deleted file mode 100644 index 347f47126..000000000 --- a/tensorflow_text/core/kernels/ragged_tensor_to_tensor_tflite_test.cc +++ /dev/null @@ -1,317 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include -#include -#include - -#include -#include -#include "flatbuffers/flexbuffers.h" -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/internal/tensor.h" -#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" -#include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/schema/schema_generated.h" - -namespace tflite { -namespace ops { -namespace custom { -namespace text { -TfLiteRegistration* Register_RAGGED_TENSOR_TO_TENSOR(); -} // namespace text -} // namespace custom -} // namespace ops - -namespace { - -class RaggedTensorToTensorOpModel : public SingleOpModel { - public: - RaggedTensorToTensorOpModel(int output_shape_dims, - std::initializer_list values_shape, - std::initializer_list> - partition_tensors_shapes, - std::vector partition_types, - TensorType value_type = TensorType_FLOAT32, - TensorType index_type = TensorType_INT32, - bool allocate_and_delegate = true) { - // A structure to collect shapes for the input. - std::vector> shapes; - input_shape_ = AddInput(index_type); - shapes.push_back({output_shape_dims}); - input_values_ = AddInput(value_type); - shapes.emplace_back(values_shape); - input_default_values_ = AddInput(value_type); - shapes.push_back({1}); - for (const auto& p : partition_tensors_shapes) { - partition_tensors_.push_back(AddInput(TensorType_INT32)); - shapes.emplace_back(p); - } - output_ = AddOutput(value_type); - - flexbuffers::Builder fbb; - size_t start = fbb.StartMap(); - { - size_t start = fbb.StartVector("row_partition_types"); - for (const auto& s : partition_types) { - fbb.String(s); - } - fbb.EndVector(start, /*typed=*/true, /*fixed=*/false); - } - fbb.Int("num_row_partition_tensors", partition_types.size()); - fbb.EndMap(start); - fbb.Finish(); - SetCustomOp("RaggedTensorToTensor", fbb.GetBuffer(), - ops::custom::text::Register_RAGGED_TENSOR_TO_TENSOR); - BuildInterpreter(shapes, /*num_threads=*/-1, - /*allow_fp32_relax_to_fp16=*/false, - /*apply_delegate=*/true, - /*allocate_and_delegate=*/allocate_and_delegate); - } - - std::vector GetOutputShape() { return GetTensorShape(output_); } - - std::vector GetOutputFloat() { return ExtractVector(output_); } - std::vector GetOutputInt() { - return ExtractVector(output_); - } - - void InvokeFloat(const std::vector& shape, - const std::vector& values, float default_value, - const std::vector>& partition_values) { - PopulateTensor(input_shape_, shape); - PopulateTensor(input_values_, values); - PopulateTensor(input_default_values_, {default_value}); - for (int i = 0; i < partition_values.size(); ++i) { - PopulateTensor(partition_tensors_[i], partition_values[i]); - } - SingleOpModel::Invoke(); - } - void InvokeInt(const std::vector& shape, - const std::vector& values, int32_t default_value, - const std::vector>& partition_values) { - PopulateTensor(input_shape_, shape); - PopulateTensor(input_values_, values); - PopulateTensor(input_default_values_, {default_value}); - for (int i = 0; i < partition_values.size(); ++i) { - PopulateTensor(partition_tensors_[i], partition_values[i]); - } - SingleOpModel::Invoke(); - } - TfLiteStatus TryAllocateTensors() { return interpreter_->AllocateTensors(); } - - private: - int input_shape_; - int input_values_; - int input_default_values_; - std::vector partition_tensors_; - int output_; -}; - -TEST(RaggedTensorToTensorTest, RaggedTensorToTensor) { - // indices = [2, 1, 0, 3] - // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]] - // params.shape = [4, None] - RaggedTensorToTensorOpModel model( - 2, // output_shape_dims - {9}, // values_shape - {{1}, {9}}, // partition_tensors_shapes - std::vector({"FIRST_DIM_SIZE", "VALUE_ROWIDS"})); - model.InvokeFloat({4, 4}, // shape - {.1, .2, .3, .4, .5, .6, .7, .8, .9}, // values - 1.5, // default_value - std::vector>( - {std::vector({4}), - std::vector({0, 0, 0, 2, 2, 2, 2, 3, 3})})); - EXPECT_THAT(model.GetOutputShape(), testing::ElementsAreArray({4, 4})); - EXPECT_THAT(model.GetOutputFloat(), - testing::ElementsAreArray({.1, .2, .3, 1.5, 1.5, 1.5, 1.5, 1.5, - .4, .5, .6, .7, .8, .9, 1.5, 1.5})); -} - -TEST(RaggedTensorToTensorTest, RaggedTensorToTensorRowSplits) { - // indices = [2, 1, 0, 3] - // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]] - RaggedTensorToTensorOpModel model(2, // output_shape_dims - {9}, // values_shape - {{5}}, // partition_tensors_shapes - std::vector({"ROW_SPLITS"})); - model.InvokeFloat( - {4, 4}, // shape - {.1, .2, .3, .4, .5, .6, .7, .8, .9}, // values - 1.5, // default_value - std::vector>({std::vector({0, 3, 3, 7, 9})})); - EXPECT_THAT(model.GetOutputShape(), testing::ElementsAreArray({4, 4})); - EXPECT_THAT(model.GetOutputFloat(), - testing::ElementsAreArray({.1, .2, .3, 1.5, 1.5, 1.5, 1.5, 1.5, - .4, .5, .6, .7, .8, .9, 1.5, 1.5})); -} - -TEST(RaggedTensorToTensorTest, RaggedTensorToTensor_3DParams) { - // params = [ - // [[]], - // [[.1, .2], [.3]], - // [], - // [[.4, .5], [.6, .7, .8]], - // [[.9]] - // ] - RaggedTensorToTensorOpModel model( - 3, // output_shape_dims - {9}, // values_shape - {{1}, {6}, {9}}, // partition_tensors_shapes - std::vector( - {"FIRST_DIM_SIZE", "VALUE_ROWIDS", "VALUE_ROWIDS"})); - model.InvokeFloat( - {5, 2, 3}, // shape - {.1, .2, .3, .4, .5, .6, .7, .8, .9}, // values - 1.5, // default_value - std::vector>( - {std::vector({5}), std::vector({0, 1, 1, 3, 3, 4}), - std::vector({1, 1, 2, 3, 3, 4, 4, 4, 5})})); - - EXPECT_THAT(model.GetOutputShape(), testing::ElementsAreArray({5, 2, 3})); - EXPECT_THAT(model.GetOutputFloat(), - testing::ElementsAreArray({1.5, 1.5, 1.5, 1.5, 1.5, 1.5, .1, .2, - 1.5, .3, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, - 1.5, 1.5, .4, .5, 1.5, .6, .7, .8, - .9, 1.5, 1.5, 1.5, 1.5, 1.5})); -} - -TEST(RaggedTensorToTensorOpTest, RaggedTensorToTensor_3DParamsRowSplits) { - // params = [ - // [[]], - // [[.1, .2], [.3]], - // [], - // [[.4, .5], [.6, .7, .8]], - // [[.9]] - // ] - RaggedTensorToTensorOpModel model( - 3, // output_shape_dims - {9}, // values_shape - {{6}, {7}}, // partition_tensors_shapes - std::vector({"ROW_SPLITS", "ROW_SPLITS"})); - model.InvokeFloat( - {5, 2, 3}, // shape - {.1, .2, .3, .4, .5, .6, .7, .8, .9}, // values - 1.5, // default_value - std::vector>({std::vector({0, 1, 3, 3, 5, 6}), - std::vector({0, 0, 2, 3, 5, 8, 9})})); - EXPECT_THAT(model.GetOutputShape(), testing::ElementsAreArray({5, 2, 3})); - EXPECT_THAT(model.GetOutputFloat(), - testing::ElementsAreArray({1.5, 1.5, 1.5, 1.5, 1.5, 1.5, .1, .2, - 1.5, .3, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, - 1.5, 1.5, .4, .5, 1.5, .6, .7, .8, - .9, 1.5, 1.5, 1.5, 1.5, 1.5})); -} - -TEST(RaggedTensorToTensorTest, RaggedTensorToTensor_3DParamsRowSplits2) { - // params = [ - // [[0, 1, 2], []], - // [], - // [[3]] - // ] - - RaggedTensorToTensorOpModel model( - 3, // output_shape_dims - {4}, // values_shape - {{4}, {4}}, // partition_tensors_shapes - std::vector({"ROW_SPLITS", "ROW_SPLITS"}), TensorType_INT32); - model.InvokeInt( - {3, 2, 3}, // shape - {0, 1, 2, 3}, // values - 5, // default_value - std::vector>( - {std::vector({0, 2, 2, 3}), std::vector({0, 3, 3, 4})})); - - EXPECT_THAT(model.GetOutputShape(), testing::ElementsAreArray({3, 2, 3})); - - EXPECT_THAT(model.GetOutputInt(), - testing::ElementsAreArray( - {0, 1, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5})); -} - -TEST(RaggedTensorToTensorTest, RaggedTensorToTensorContractExpanded) { - // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]] - RaggedTensorToTensorOpModel model( - 2, // output_shape_dims - {9}, // values_shape - {{1}, {9}}, // partition_tensors_shapes - std::vector({"FIRST_DIM_SIZE", "VALUE_ROWIDS"})); - model.InvokeFloat({3, 5}, // shape - {.1, .2, .3, .4, .5, .6, .7, .8, .9}, // values - 1.5, // default_value - std::vector>( - {std::vector({4}), - std::vector({0, 0, 0, 2, 2, 2, 2, 3, 3})})); - EXPECT_THAT(model.GetOutputShape(), testing::ElementsAreArray({3, 5})); - - EXPECT_THAT(model.GetOutputFloat(), - testing::ElementsAreArray({.1, .2, .3, 1.5, 1.5, // - 1.5, 1.5, 1.5, 1.5, 1.5, // - .4, .5, .6, .7, 1.5})); -} - -// Adds a dense dimension. -TEST(RaggedTensorToTensorTest, RaggedTensorToTensorContractExpandedDense) { - // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]] - RaggedTensorToTensorOpModel model( - 3, // output_shape_dims - {9, 2}, // values_shape - {{1}, {9}}, // partition_tensors_shapes - std::vector({"FIRST_DIM_SIZE", "VALUE_ROWIDS"})); - - model.InvokeFloat({3, 5, 2}, // shape - {.1, 1.1, .2, 1.2, .3, 1.3, .4, 1.4, .5, 1.5, .6, 1.6, .7, - 1.7, .8, 1.8, .9, 1.9}, // values - 1.5, // default_value - std::vector>( - {std::vector({4}), - std::vector({0, 0, 0, 2, 2, 2, 2, 3, 3})})); - - EXPECT_THAT(model.GetOutputShape(), testing::ElementsAreArray({3, 5, 2})); - EXPECT_THAT(model.GetOutputFloat(), - testing::ElementsAreArray( - {.1, 1.1, .2, 1.2, .3, 1.3, 1.5, 1.5, 1.5, 1.5, // - 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, // - .4, 1.4, .5, 1.5, .6, 1.6, .7, 1.7, 1.5, 1.5})); -} - -TEST(RaggedTensorToTensorTest, StringType) { - RaggedTensorToTensorOpModel model( - 2, // output_shape_dims - {9}, // values_shape - {{1}, {9}}, // partition_tensors_shapes - std::vector({"FIRST_DIM_SIZE", "VALUE_ROWIDS"}), - TensorType_STRING, TensorType_INT32, /*allocate_and_delegate=*/false); - EXPECT_EQ(model.TryAllocateTensors(), kTfLiteError); -} - -} // namespace -} // namespace tflite diff --git a/tensorflow_text/core/kernels/regex_split.cc b/tensorflow_text/core/kernels/regex_split.cc deleted file mode 100644 index add317d1a..000000000 --- a/tensorflow_text/core/kernels/regex_split.cc +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/regex_split.h" - -#include - -namespace tensorflow { -namespace text { -namespace { - -template -void RegexSplitImpl(absl::string_view input, const RE2& re2, - bool include_delimiter, const RE2& include_delim_regex, - std::vector* tokens, - std::vector* begin_offsets, - std::vector* end_offsets) { - absl::string_view leftover = input; - absl::string_view last_end = leftover; - - // Keep looking for split points until we have reached the end of the input. - absl::string_view extracted_delim_token; - while (RE2::FindAndConsume(&leftover, re2, &extracted_delim_token)) { - absl::string_view token(last_end.data(), - extracted_delim_token.data() - last_end.data()); - bool has_non_empty_token = token.length() > 0; - bool should_include_delim = - include_delimiter && include_delim_regex.FullMatch( - extracted_delim_token, include_delim_regex); - last_end = leftover; - - // Mark the end of the previous token, only if there was something. - if (has_non_empty_token) { - tokens->push_back(token); - // Mark the end of the last token - begin_offsets->push_back(token.data() - input.data()); - end_offsets->push_back(token.data() + token.length() - input.data()); - } - - if (should_include_delim) { - // If desired, include the deliminator as a token. - tokens->push_back(extracted_delim_token); - // Mark the end of the token at the end of the beginning of the delimiter. - begin_offsets->push_back(extracted_delim_token.data() - input.data()); - end_offsets->push_back(extracted_delim_token.data() + - extracted_delim_token.length() - input.data()); - } - } - - // Close the last token. - if (!leftover.empty()) { - tokens->push_back(leftover); - begin_offsets->push_back(leftover.data() - input.data()); - end_offsets->push_back(leftover.data() + leftover.length() - input.data()); - } -} - -} // namespace - -void RegexSplit(absl::string_view input, const RE2& re2, bool include_delimiter, - const RE2& include_delim_regex, - std::vector* tokens, - std::vector* begin_offsets, // NOLINT - std::vector* end_offsets) { // NOLINT - RegexSplitImpl(input, re2, include_delimiter, include_delim_regex, tokens, - begin_offsets, end_offsets); -} - -void RegexSplit(absl::string_view input, const RE2& re2, bool include_delimiter, - const RE2& include_delim_regex, - std::vector* tokens, - std::vector* begin_offsets, // NOLINT - std::vector* end_offsets) { // NOLINT - RegexSplitImpl(input, re2, include_delimiter, include_delim_regex, tokens, - begin_offsets, end_offsets); -} - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/regex_split.h b/tensorflow_text/core/kernels/regex_split.h index 770efaa7e..12867d4d3 100644 --- a/tensorflow_text/core/kernels/regex_split.h +++ b/tensorflow_text/core/kernels/regex_split.h @@ -12,31 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef TENSORFLOW_TEXT_CORE_KERNELS_REGEX_SPLIT_H_ -#define TENSORFLOW_TEXT_CORE_KERNELS_REGEX_SPLIT_H_ +#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_REGEX_SPLIT_H_ +#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_REGEX_SPLIT_H_ -#include -#include +#include "tensorflow/core/kernels/text/regex_split.h" -#include "absl/strings/string_view.h" -#include "re2/re2.h" - -namespace tensorflow { -namespace text { - -void RegexSplit(absl::string_view input, const RE2& re2, bool include_delimiter, - const RE2& include_delim_regex, - std::vector* tokens, - std::vector* begin_offsets, // NOLINT - std::vector* end_offsets); // NOLINT - -void RegexSplit(absl::string_view input, const RE2& re2, bool include_delimiter, - const RE2& include_delim_regex, - std::vector* tokens, - std::vector* begin_offsets, // NOLINT - std::vector* end_offsets); // NOLINT - -} // namespace text -} // namespace tensorflow - -#endif // TENSORFLOW_TEXT_CORE_KERNELS_REGEX_SPLIT_H_ +#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_REGEX_SPLIT_H_ diff --git a/tensorflow_text/core/kernels/regex_split_kernels.cc b/tensorflow_text/core/kernels/regex_split_kernels.cc deleted file mode 100644 index fd79a63d7..000000000 --- a/tensorflow_text/core/kernels/regex_split_kernels.cc +++ /dev/null @@ -1,193 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/tensor_types.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow_text/core/kernels/regex_split.h" - -namespace tensorflow { -namespace text { - -class RegexSplitOp : public tensorflow::OpKernel { - public: - explicit RegexSplitOp(tensorflow::OpKernelConstruction* ctx) - : tensorflow::OpKernel(ctx) {} - - void Compute(tensorflow::OpKernelContext* ctx) override { - bool should_keep_delim; - std::shared_ptr delim_re; - std::shared_ptr keep_delim_re; - - // get regular expressions from input - const Tensor* delim_regex_pattern_tensor; - OP_REQUIRES_OK( - ctx, ctx->input("delim_regex_pattern", &delim_regex_pattern_tensor)); - OP_REQUIRES(ctx, - TensorShapeUtils::IsScalar(delim_regex_pattern_tensor->shape()), - errors::InvalidArgument( - "Pattern must be scalar, but received ", - delim_regex_pattern_tensor->shape().DebugString())); - const string delim_regex_pattern = - delim_regex_pattern_tensor->flat()(0); - delim_re = CachedDelimRE2(delim_regex_pattern); - OP_REQUIRES( - ctx, delim_re->ok(), - errors::InvalidArgument("Invalid pattern: ", delim_regex_pattern, - ", error: ", delim_re->error())); - - const Tensor* keep_delim_regex_pattern_tensor; - OP_REQUIRES_OK(ctx, ctx->input("keep_delim_regex_pattern", - &keep_delim_regex_pattern_tensor)); - OP_REQUIRES( - ctx, - TensorShapeUtils::IsScalar(keep_delim_regex_pattern_tensor->shape()), - errors::InvalidArgument( - "Pattern must be scalar, but received ", - keep_delim_regex_pattern_tensor->shape().DebugString())); - const string keep_delim_regex_pattern = - keep_delim_regex_pattern_tensor->flat()(0); - keep_delim_re = CachedKeepDelimRE2(keep_delim_regex_pattern); - OP_REQUIRES( - ctx, keep_delim_re->ok(), - errors::InvalidArgument("Invalid pattern: ", keep_delim_regex_pattern, - ", error: ", keep_delim_re->error())); - - should_keep_delim = keep_delim_re->pattern().empty() ? false : true; - - const Tensor* input_tensor; - OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor)); - const auto& input_flat = input_tensor->flat(); - - std::vector begin_offsets; - std::vector end_offsets; - std::vector tokens; - std::vector row_splits; - row_splits.push_back(0); - - for (size_t i = 0; i < input_flat.size(); ++i) { - RegexSplit(absl::string_view(input_flat(i).data()), *delim_re, - should_keep_delim, *keep_delim_re, &tokens, &begin_offsets, - &end_offsets); - row_splits.push_back(begin_offsets.size()); - } - - // Emit the flat Tensors needed to construct RaggedTensors for tokens, - // start, end offsets. - std::vector tokens_shape; - tokens_shape.push_back(tokens.size()); - - std::vector offsets_shape; - offsets_shape.push_back(begin_offsets.size()); - - std::vector row_splits_shape; - row_splits_shape.push_back(row_splits.size()); - - Tensor* output_tokens_tensor = nullptr; - OP_REQUIRES_OK(ctx, - ctx->allocate_output("tokens", TensorShape(tokens_shape), - &output_tokens_tensor)); - auto output_tokens = output_tokens_tensor->flat(); - - Tensor* output_begin_offsets_tensor = nullptr; - OP_REQUIRES_OK( - ctx, ctx->allocate_output("begin_offsets", TensorShape(offsets_shape), - &output_begin_offsets_tensor)); - auto output_begin_offsets = output_begin_offsets_tensor->flat(); - - Tensor* output_end_offsets_tensor = nullptr; - OP_REQUIRES_OK( - ctx, ctx->allocate_output("end_offsets", TensorShape(offsets_shape), - &output_end_offsets_tensor)); - auto output_end_offsets = output_end_offsets_tensor->flat(); - - Tensor* output_row_splits_tensor = nullptr; - OP_REQUIRES_OK( - ctx, ctx->allocate_output("row_splits", TensorShape(row_splits_shape), - &output_row_splits_tensor)); - auto output_row_splits = output_row_splits_tensor->flat(); - - // Copy outputs to Tensors. - for (size_t i = 0; i < tokens.size(); ++i) { - const auto& token = tokens[i]; - output_tokens(i) = tstring(token.data(), token.length()); - } - - for (size_t i = 0; i < begin_offsets.size(); ++i) { - output_begin_offsets(i) = begin_offsets[i]; - } - - for (size_t i = 0; i < end_offsets.size(); ++i) { - output_end_offsets(i) = end_offsets[i]; - } - - for (size_t i = 0; i < row_splits.size(); ++i) { - output_row_splits(i) = row_splits[i]; - } - } - - private: - std::shared_ptr CachedDelimRE2(const string& pattern) { - { - tf_shared_lock l(delim_mu_); - if (delim_re_ != nullptr && delim_re_->pattern() == pattern) { - return delim_re_; - } - } - // Construct the new RE2 object before acquiring the lock. - auto regex = std::make_shared(pattern); - { - mutex_lock l(delim_mu_); - // Swap instead of assigning so that we destruct the old - // RE2 object (when necessary) after releasing the lock. - delim_re_.swap(regex); - return delim_re_; - } - } - - std::shared_ptr CachedKeepDelimRE2(const string& pattern) { - { - tf_shared_lock l(keep_delim_mu_); - if (keep_delim_re_ != nullptr && keep_delim_re_->pattern() == pattern) { - return keep_delim_re_; - } - } - // Construct the new RE2 object before acquiring the lock. - auto regex = std::make_shared(pattern); - { - mutex_lock l(keep_delim_mu_); - // Swap instead of assigning so that we destruct the old - // RE2 object (when necessary) after releasing the lock. - keep_delim_re_.swap(regex); - return keep_delim_re_; - } - } - - mutex delim_mu_; - std::shared_ptr delim_re_ TF_GUARDED_BY(delim_mu_); - - mutex keep_delim_mu_; - std::shared_ptr keep_delim_re_ TF_GUARDED_BY(keep_delim_mu_); - - TF_DISALLOW_COPY_AND_ASSIGN(RegexSplitOp); -}; - -REGISTER_KERNEL_BUILDER( - Name("RegexSplitWithOffsets").Device(tensorflow::DEVICE_CPU), RegexSplitOp); - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/regex_split_test.cc b/tensorflow_text/core/kernels/regex_split_test.cc deleted file mode 100644 index 045623746..000000000 --- a/tensorflow_text/core/kernels/regex_split_test.cc +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/regex_split.h" - -#include -#include -#include "absl/strings/string_view.h" -#include "re2/re2.h" -#include "tensorflow/core/platform/tstring.h" - -namespace tensorflow { -namespace text { -namespace { - -std::vector RunTest(const tstring& input, - const tstring& regex, - const tstring& delim_regex) { - RE2 re2((absl::string_view(regex))); - RE2 include_delim_re2((absl::string_view(delim_regex))); - - std::vector begin_offsets; - std::vector end_offsets; - std::vector tokens; - - RegexSplit(input, re2, true, include_delim_re2, &tokens, &begin_offsets, - &end_offsets); - return tokens; -} - -TEST(RegexSplitTest, JapaneseAndWhitespace) { - tstring regex = "(\\p{Hiragana}+|\\p{Katakana}+|\\s)"; - tstring delim_regex = "(\\p{Hiragana}+|\\p{Katakana}+)"; - tstring input = "He said フランスです"; - auto extracted_tokens = RunTest(input, regex, delim_regex); - EXPECT_THAT(extracted_tokens, testing::ElementsAreArray({ - "He", - "said", - "フランス", - "です", - })); -} - -TEST(RegexSplitTest, Japanese) { - tstring regex = "(\\p{Hiragana}+|\\p{Katakana}+)"; - tstring input = "He said フランスです"; - auto extracted_tokens = RunTest(input, regex, regex); - EXPECT_THAT(extracted_tokens, testing::ElementsAreArray({ - "He said ", - "フランス", - "です", - })); -} - -TEST(RegexSplitTest, ChineseHan) { - tstring regex = "(\\p{Han})"; - tstring input = "敵人變盟友背後盤算"; - auto extracted_tokens = RunTest(input, regex, regex); - EXPECT_THAT(extracted_tokens, - testing::ElementsAreArray( - {"敵", "人", "變", "盟", "友", "背", "後", "盤", "算"})); -} - -} // namespace -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/rouge_l_kernel.cc b/tensorflow_text/core/kernels/rouge_l_kernel.cc deleted file mode 100644 index 50a730f47..000000000 --- a/tensorflow_text/core/kernels/rouge_l_kernel.cc +++ /dev/null @@ -1,225 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include - -#include "tensorflow/core/framework/lookup_interface.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/core/threadpool.h" -#include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/platform/logging.h" - -namespace tensorflow { -namespace text { - -namespace { -} // namespace - - -// ROUGE-L implementation based on -// https://www.microsoft.com/en-us/research/publication/ -// rouge-a-package-for-automatic-evaluation-of-summaries/ -template -class RougeLOp : public OpKernel { - public: - using ConstFlatSplits = typename TTypes::ConstFlat; - using ConstFlatValues = typename TTypes::ConstFlat; - - explicit RougeLOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} - - void Compute(OpKernelContext* ctx) override { - const Tensor& hyp_tensor = ctx->input(0); - const auto hyp_tensor_flat = hyp_tensor.flat(); - const Tensor& hyp_splits = ctx->input(1); - const auto hyp_splits_flat = hyp_splits.flat(); - - const Tensor& ref_tensor = ctx->input(2); - const auto ref_tensor_flat = ref_tensor.flat(); - const Tensor& ref_splits = ctx->input(3); - const auto ref_splits_flat = ref_splits.flat(); - - const Tensor& alpha_tensor = ctx->input(4); - const auto alpha_scalar = alpha_tensor.scalar(); - const float alpha = alpha_scalar(); - - // Alpha must be <=1. - OP_REQUIRES(ctx, alpha <= 1, - errors::InvalidArgument("alpha must be <1 but was=", alpha)); - - // Ref and Hyp must have the same number of rows. - OP_REQUIRES(ctx, ref_splits_flat.size() == hyp_splits_flat.size(), - errors::InvalidArgument( - "ref splits len=", ref_splits_flat.size(), - "must equal hyp splits len=", hyp_splits_flat.size())); - - // All inputs must be vectors. - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(hyp_tensor.shape()), - errors::InvalidArgument("hypotheses values must be a vector")); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(ref_tensor.shape()), - errors::InvalidArgument("references values must be a vector")); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(hyp_splits.shape()), - errors::InvalidArgument("hypotheses splits must be a vector")); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(ref_splits.shape()), - errors::InvalidArgument("references splits must be a vector")); - // Ref and Hyp must have at least one split. - OP_REQUIRES(ctx, ref_splits_flat.size() > 0, - errors::InvalidArgument( - "ref splits len=0; must have at least 1 split")); - - // Output is a dense Tensor containing one row per input row. - TensorShape output_shape({ref_splits_flat.size() - 1}); - - // Allocate the F-Measure output tensor. - Tensor* f_measure_tensor; - OP_REQUIRES_OK(ctx, ctx->allocate_output("f_measure", output_shape, - &f_measure_tensor)); - auto f_measures_flat = f_measure_tensor->flat(); - - // Allocate the P-Measure output tensor. - Tensor* p_measure_tensor; - OP_REQUIRES_OK(ctx, ctx->allocate_output("p_measure", output_shape, - &p_measure_tensor)); - auto p_measures_flat = p_measure_tensor->flat(); - - // Allocate the R-Measure output tensor. - Tensor* r_measure_tensor; - OP_REQUIRES_OK(ctx, ctx->allocate_output("r_measure", output_shape, - &r_measure_tensor)); - auto r_measures_flat = r_measure_tensor->flat(); - - // Iterate over the splits, skipping the first split as it is always zero. - for (int i = 1; i < hyp_splits_flat.size(); i++) { - // Length of hyp and ref. - SPLITS_TYPE lhyp = hyp_splits_flat(i) - hyp_splits_flat(i-1); - SPLITS_TYPE lref = ref_splits_flat(i) - ref_splits_flat(i-1); - // Length of longest common substring. - int32 llcs = LongestCommonSubsequenceLength(hyp_splits_flat(i-1), - hyp_splits_flat(i), - hyp_tensor_flat, - ref_splits_flat(i-1), - ref_splits_flat(i), - ref_tensor_flat); - auto measures = ComputeMeasures(lhyp, lref, llcs, alpha); - f_measures_flat(i - 1) = std::get<0>(measures); - p_measures_flat(i - 1) = std::get<1>(measures); - r_measures_flat(i - 1) = std::get<2>(measures); - } - } - - private: - // By using LCS, the ROUGE-L algorithm does not require consecutive matches - // but rather credits the order of N-grams. - int32 LongestCommonSubsequenceLength( - const SPLITS_TYPE hyp_i, - const SPLITS_TYPE hyp_j, - const ConstFlatValues& hyp, - const SPLITS_TYPE ref_i, - const SPLITS_TYPE ref_j, - const ConstFlatValues& ref) { - SPLITS_TYPE lhyp = hyp_j - hyp_i; - SPLITS_TYPE lref = ref_j - ref_i; - // Create a scratch matrix to keep track of the LCS seen so far using DP. - // http://www.algorithmist.com/index.php/Longest_Common_Subsequence - Tensor scratch(DT_INT32, {lhyp + 2, lref + 2}); - auto scratch2d = scratch.matrix(); - for (SPLITS_TYPE x = hyp_i; x <= hyp_j + 1; x++) { - for (SPLITS_TYPE y = ref_i; y <= ref_j + 1; y++) { - SPLITS_TYPE a = x - hyp_i; - SPLITS_TYPE b = y - ref_i; - if (a == 0 || b == 0) { - // If in first row or column, we write a zero to the table. - scratch2d(a, b) = 0; - } else if (x == hyp_j+1 || y == ref_j+1 || hyp(x-1) != ref(y-1)) { - // If in the last row or column, or if the tokens are not equal, - // carry the largest score seen in the cell above or to the left of - // the current cell. - scratch2d(a, b) = - std::max({scratch2d(a - 1, b), scratch2d(a, b - 1)}); - } else { - // If tokens are equal, we are part of a subsequence, so increment the - // diagonal score. - scratch2d(a, b) = scratch2d(a - 1, b - 1) + 1; - } - } - } - return scratch2d(lhyp, lref); - } - - std::tuple ComputeMeasures(const SPLITS_TYPE lhyp_int, - const SPLITS_TYPE lref_int, - const int32 llcs_int, - const float alpha) { - const float lhyp = static_cast(lhyp_int); - const float lref = static_cast(lref_int); - const float llcs = static_cast(llcs_int); - const float p_lcs = llcs / (lhyp + 1e-12); - const float r_lcs = llcs / (lref + 1e-12); - // Use the tensor2tensor formulation if the alpha value is <0, - // which does not make sense as a weighted average term. - const float f_lcs = alpha < 0 ? - ComputeTensor2TensorF(p_lcs, r_lcs) : - ComputeOfficialF(p_lcs, r_lcs, alpha); - return std::make_tuple(f_lcs, p_lcs, r_lcs); - } - - float ComputeTensor2TensorF(const float p_lcs, const float r_lcs) { - const float beta = p_lcs / (r_lcs + 1e-12); - const float numerator = (1 + (beta * beta)) * r_lcs * p_lcs; - const float denominator = r_lcs + ((beta * beta) * p_lcs); - if (denominator > 0) { - return numerator / denominator; - } - return 0; - } - - float ComputeOfficialF(const float p_lcs, const float r_lcs, - const float alpha) { - float denominator = (alpha * r_lcs + (1 - alpha) * p_lcs); - if (denominator > 0) { - return (p_lcs * r_lcs) / denominator; - } - return denominator; - } - - TF_DISALLOW_COPY_AND_ASSIGN(RougeLOp); -}; - -#define REGISTER(VALUES_TYPE) \ - REGISTER_KERNEL_BUILDER(Name("RougeL") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("Tsplits") \ - .TypeConstraint("Tvalues"), \ - RougeLOp); \ - REGISTER_KERNEL_BUILDER(Name("RougeL") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("Tsplits") \ - .TypeConstraint("Tvalues"), \ - RougeLOp); - -TF_CALL_int32(REGISTER); -TF_CALL_int64(REGISTER); -TF_CALL_string(REGISTER); -#undef REGISTER - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/rouge_l_kernel_test.cc b/tensorflow_text/core/kernels/rouge_l_kernel_test.cc deleted file mode 100644 index 3dc00a120..000000000 --- a/tensorflow_text/core/kernels/rouge_l_kernel_test.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow/core/framework/fake_input.h" -#include "tensorflow/core/framework/node_def_builder.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/shape_inference_testutil.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/framework/tensor_testutil.h" -#include "tensorflow/core/kernels/ops_testutil.h" -#include "tensorflow/core/platform/test.h" - -namespace tensorflow { -namespace { - -TEST(RougeLFMeasureOpTest, ShapeFn) { - ShapeInferenceTestOp op("RougeL"); - - INFER_OK(op, "[?];[3];[?];[3];[]", "[2];[2];[2]"); - INFER_OK(op, "[5];[3];[?];[3];[]", "[2];[2];[2]"); - INFER_OK(op, "[?];[3];[8];[3];[]", "[2];[2];[2]"); - INFER_OK(op, "[5];[3];[8];[3];[]", "[2];[2];[2]"); - INFER_OK(op, "[5];[3];[8];?;[]", "[2];[2];[2]"); - INFER_OK(op, "[5];?;[8];[3];[]", "[2];[2];[2]"); - INFER_OK(op, "[5];[?];[8];[?];[]", "[?];[?];[?]"); - INFER_OK(op, "?;?;?;?;?", "[?];[?];[?]"); - INFER_ERROR("Dimension 0 in both shapes must be equal, but are 3 and 2.", op, - "[5];[3];[8];[2];[]"); - INFER_ERROR("Shape must be rank 0 but is rank 1", op, - "[5];[3];[8];[3];[1]"); -} - -} // namespace -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/round_robin_trimmer.h b/tensorflow_text/core/kernels/round_robin_trimmer.h index 5273dfa9e..7a8b7014f 100644 --- a/tensorflow_text/core/kernels/round_robin_trimmer.h +++ b/tensorflow_text/core/kernels/round_robin_trimmer.h @@ -15,304 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_ROUND_ROBIN_TRIMMER_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_ROUND_ROBIN_TRIMMER_H_ -#include -#include -#include -#include -#include "tensorflow_text/core/kernels/trimmer.h" - - -namespace tensorflow { -namespace text { - -template -class RoundRobinTrimmer : Trimmer, BatchTrimmer { - using Values_ = Values; - using ValuesSpan_ = ValuesSpan; - using RowSplits_ = RowSplits; - using RowSplitsSpan_ = RowSplitsSpan; - - public: - RoundRobinTrimmer(int max_sequence_length) - : max_sequence_length_(std::max(max_sequence_length, 0)) {} - virtual ~RoundRobinTrimmer() = default; - - // Generates masks for a single batch of values. - std::vector GenerateMasks( - const std::vector& values) const; - - // Generates masks for a batch of values row splits. - // - // Args: - // row_splits: Row splits of the values in the shape [batch, (num values)] - // - // Returns: - // The returned value is a flattened list of mask values which can be split - // into batches using the same input row splits. - std::vector GenerateMasksBatch( - const std::vector& row_splits) const; - std::vector GenerateMasksBatch( - const std::vector& row_splits) const; - - // Trims a single batch of values. - void Trim(std::vector* values) const; - - // Trims a batch of values given their flattened values and row splits. - // - // Args: - // flat_values: Flattened values in shape [batch, (num values)] - // row_splits: Row splits of the values in the shape [batch, (num values)] - // - // Returns: - // The returned values are the flattened trimmed values and new row splits. - std::pair, std::vector> TrimBatch( - const std::vector& flat_values, - const std::vector& row_splits) const; - std::pair, std::vector> TrimBatch( - const std::vector& flat_values, - const std::vector& row_splits) const; - - protected: - // Used for holding data about value sizes and how much of it is used. - struct Row { - Row() : idx(0), size(0), used(0) {} - Row(int idx, int size, int used) : idx(idx), size(size), used(used) {} - int idx; // Index into the list of values - Tsplits size; // Size of the row values - int used; // How much of the values is used - }; - - // Internal execution to share code for Span & Vector row_splits. - template - std::vector GenerateMasksInternal(Iterator begin, Iterator end) const; - - // Internal execution to share code for Span & Vector row_splits. - template - std::pair, std::vector> TrimInternal( - ValuesIterator flat_values_begin, - ValuesIterator flat_values_end, - RowSplitsIterator row_splits_begin, - RowSplitsIterator row_splits_end) const; - - // Main process of the timmer. Process row splits a batch at a time. Once each - // it is known how much each row in a batch is used, the callback is called - // with the row information. - // Algorithm to fill values: - // 1. Fill values that will max starting from smallest to largest. - // 2. Partially fill the rest up the same amount up to the sequence length. - // 3. Add the remainder to the available rows in order. - template - void ProcessBatch(Iterator values_begin, Iterator values_end, - std::function*)> callback) const; - void ProcessBatch(std::vector* value_row_sizes, - std::function*)> callback) const; - - template - void ProcessSplitsByBatch(Iterator begin, Iterator end, - std::function*)> callback) const; - - const int max_sequence_length_; -}; - -/******************************* Implementation *******************************/ - -template -std::vector RoundRobinTrimmer::GenerateMasks( - const std::vector& values) const { - std::vector masks(values.size()); - ProcessBatch(values.begin(), values.end(), - [&masks](std::vector* value_row_sizes) { - for (int i = 0; i < masks.size(); ++i) { - Mask& mask = masks[i]; - const Row& values_row = (*value_row_sizes)[i]; - mask.reserve(values_row.size); - mask.insert(mask.end(), values_row.used, true); - mask.insert(mask.end(), values_row.size - values_row.used, false); - } - }); - return masks; -} - -template -std::vector RoundRobinTrimmer::GenerateMasksBatch( - const std::vector& row_splits) const { - return GenerateMasksInternal(row_splits.begin(), row_splits.end()); -} - -template -std::vector RoundRobinTrimmer::GenerateMasksBatch( - const std::vector& row_splits) const { - return GenerateMasksInternal(row_splits.begin(), row_splits.end()); -} - -template -template -std::vector RoundRobinTrimmer::GenerateMasksInternal( - const Iterator begin, const Iterator end) const { - // First reserve necessary space for the masks - std::vector masks(end - begin); - auto m = masks.begin(); - for (auto it = begin; it != end; ++it, ++m) { - m->reserve(it->back()); - } - // Process all batches, updating the masks a batch at a time. - ProcessSplitsByBatch(begin, end, [&masks](std::vector* rows) { - for (int s = 0; s < masks.size(); ++s) { - const Row& row = (*rows)[s]; - masks[s].reserve(row.size); - masks[s].insert(masks[s].end(), row.used, true); - masks[s].insert(masks[s].end(), row.size - row.used, false); - } - }); - return masks; -} - -template -void RoundRobinTrimmer::Trim(std::vector* values) const { - ProcessBatch(values->begin(), values->end(), - [values] (std::vector* value_row_sizes) { - for (int s = 0; s < values->size(); ++s) { - (*values)[s].resize((*value_row_sizes)[s].used); - } - }); -} - -template -std::pair>, std::vector>> -RoundRobinTrimmer::TrimBatch( - const std::vector& flat_values, - const std::vector& row_splits) const { - return TrimInternal( - flat_values.begin(), flat_values.end(), - row_splits.begin(), row_splits.end()); -} - -template -std::pair>, std::vector>> -RoundRobinTrimmer::TrimBatch( - const std::vector& flat_values, - const std::vector& row_splits) const { - return TrimInternal( - flat_values.begin(), flat_values.end(), - row_splits.begin(), row_splits.end()); -} - -template -template -std::pair>, std::vector>> -RoundRobinTrimmer::TrimInternal( - ValuesIterator flat_values_begin, - ValuesIterator flat_values_end, - RowSplitsIterator splits_begin, - RowSplitsIterator splits_end) const { - std::pair, std::vector> trimmed( - {std::vector(flat_values_end - flat_values_begin), - std::vector(splits_end - splits_begin)}); - // All row splits start at index 0 - for (int i = 0; i < trimmed.second.size(); ++i) { - trimmed.second[i].push_back({0}); - } - ProcessSplitsByBatch(splits_begin, splits_end, - [&trimmed, flat_values_begin, splits_begin](std::vector* values_row) - { - auto values_it = flat_values_begin; - auto splits_it = splits_begin; - for (int s = 0; s < values_row->size(); ++s, ++values_it, ++splits_it) { - Values_* vals = &trimmed.first[s]; - RowSplits_* splits = &trimmed.second[s]; - auto start = values_it->begin() + (*splits_it)[splits->size()-1]; - vals->insert(vals->end(), start, start + (*values_row)[s].used); - splits->insert(splits->end(), splits->back() + (*values_row)[s].used); - } - }); - return trimmed; -} - -template -template -void RoundRobinTrimmer::ProcessBatch( - Iterator values_begin, Iterator values_end, - std::function*)> callback) const { - int num_values = values_end - values_begin; - // Get size of each segment - std::vector value_row_sizes(num_values); - int i = 0; - for (auto it = values_begin; it != values_end; ++it, ++i) { - value_row_sizes[i].idx = i; - value_row_sizes[i].size = it->size(); - } - // Process the values - ProcessBatch(&value_row_sizes, callback); -} - -template -void RoundRobinTrimmer::ProcessBatch( - std::vector* value_row_sizes, - std::function*)> callback) const { - int num_values = value_row_sizes->size(); - int sequence_left = max_sequence_length_; - - // Fill all values to the max (smallest first to largest) that we can - // without crossing the max_sequence_length - std::sort(value_row_sizes->begin(), value_row_sizes->end(), - [] (Row a, Row b) { return a.size < b.size; }); - int filled_value_rows = 0; - for (int i = 0; i < num_values; ++i) { - // Break if we will not be able to fill up the smallest unfilled value row - if ((*value_row_sizes)[i].size * (num_values - filled_value_rows) - > sequence_left) { - break; - } - (*value_row_sizes)[i].used = (*value_row_sizes)[i].size; - sequence_left -= (*value_row_sizes)[i].used; - ++filled_value_rows; - } - - // Fill the remaining value rows evenly - if (filled_value_rows < num_values) { - int count = sequence_left / (num_values - filled_value_rows); - for (int i = filled_value_rows; i < num_values; ++i) { - (*value_row_sizes)[i].used = count; - sequence_left -= count; - } - } - - // Finally add the remainder - index order - std::sort(value_row_sizes->begin(), value_row_sizes->end(), - [] (Row a, Row b) { return a.idx < b.idx; }); - for (int i = 0; i < num_values && sequence_left > 0; ++i) { - if ((*value_row_sizes)[i].used < (*value_row_sizes)[i].size) { - ++((*value_row_sizes)[i].used); - --sequence_left; - } - } - - // Usage of rows computed. Execute callback to process. - callback(value_row_sizes); -} - -template -template -void RoundRobinTrimmer::ProcessSplitsByBatch( - Iterator begin, Iterator end, - std::function*)> callback) const { - int num_in_batch = begin->size() - 1; - int num_values = end - begin; - // Process one batch at a time. - std::vector value_row_sizes(num_values); - for (int batch_idx = 0; batch_idx < num_in_batch; ++batch_idx) { - // First, get size of each row. - int idx = 0; - for (auto i = begin; i < end; ++i, ++idx) { - value_row_sizes[idx].idx = idx; - value_row_sizes[idx].size = (*i)[batch_idx + 1] - (*i)[batch_idx]; - } - // Perform the main processing of the batch - ProcessBatch(&value_row_sizes, callback); - } -} - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/round_robin_trimmer.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_ROUND_ROBIN_TRIMMER_H_ diff --git a/tensorflow_text/core/kernels/round_robin_trimmer_kernel.cc b/tensorflow_text/core/kernels/round_robin_trimmer_kernel.cc deleted file mode 100644 index 4635d7b45..000000000 --- a/tensorflow_text/core/kernels/round_robin_trimmer_kernel.cc +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/round_robin_trimmer_kernel.h" - -#include - -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/register_types.h" - -namespace tensorflow { -namespace text { - -using RoundRobinGenerateMasksOpKernelInstance = - RoundRobinGenerateMasksOpKernel; - -#define REGISTER_ROUND_ROBIN_GENERATE_MASKS_SPLITS(vals_type, splits_type) \ - REGISTER_KERNEL_BUILDER( \ - Name(RoundRobinGenerateMasksOpKernelInstance::OpName()) \ - .Device(tensorflow::DEVICE_CPU) \ - .TypeConstraint("T") \ - .TypeConstraint("Tsplits"), \ - RoundRobinGenerateMasksOpKernel); - -#define REGISTER_ROUND_ROBIN_GENERATE_MASKS(vals_type) \ - REGISTER_ROUND_ROBIN_GENERATE_MASKS_SPLITS(vals_type, int32_t) \ - REGISTER_ROUND_ROBIN_GENERATE_MASKS_SPLITS(vals_type, int64_t) - -TF_CALL_tstring(REGISTER_ROUND_ROBIN_GENERATE_MASKS) -TF_CALL_bool(REGISTER_ROUND_ROBIN_GENERATE_MASKS) -TF_CALL_float(REGISTER_ROUND_ROBIN_GENERATE_MASKS) -TF_CALL_double(REGISTER_ROUND_ROBIN_GENERATE_MASKS) -TF_CALL_INTEGRAL_TYPES(REGISTER_ROUND_ROBIN_GENERATE_MASKS) - -#undef REGISTER_ROUND_ROBIN_GENERATE_MASKS -#undef REGISTER_ROUND_ROBIN_GENERATE_MASKS_SPLITS - - using RoundRobinTrimOpKernelInstance = - RoundRobinTrimOpKernel; - -#define REGISTER_ROUND_ROBIN_TRIM_SPLITS(vals_type, splits_type) \ - REGISTER_KERNEL_BUILDER(Name(RoundRobinTrimOpKernelInstance::OpName()) \ - .Device(tensorflow::DEVICE_CPU) \ - .TypeConstraint("T") \ - .TypeConstraint("Tsplits"), \ - RoundRobinTrimOpKernel); - -#define REGISTER_ROUND_ROBIN_TRIM(vals_type) \ - REGISTER_ROUND_ROBIN_TRIM_SPLITS(vals_type, int32_t) \ - REGISTER_ROUND_ROBIN_TRIM_SPLITS(vals_type, int64_t) - -TF_CALL_tstring(REGISTER_ROUND_ROBIN_TRIM) -TF_CALL_bool(REGISTER_ROUND_ROBIN_TRIM) -TF_CALL_float(REGISTER_ROUND_ROBIN_TRIM) -TF_CALL_double(REGISTER_ROUND_ROBIN_TRIM) -TF_CALL_INTEGRAL_TYPES(REGISTER_ROUND_ROBIN_TRIM) - -#undef REGISTER_ROUND_ROBIN_TRIM -#undef REGISTER_ROUND_ROBIN_TRIM_SPLITS - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/round_robin_trimmer_kernel.h b/tensorflow_text/core/kernels/round_robin_trimmer_kernel.h index 69edec748..0383529ab 100644 --- a/tensorflow_text/core/kernels/round_robin_trimmer_kernel.h +++ b/tensorflow_text/core/kernels/round_robin_trimmer_kernel.h @@ -15,29 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_ROUND_ROBIN_TRIMMER_KERNEL_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_ROUND_ROBIN_TRIMMER_KERNEL_H_ -#include "tensorflow/lite/kernels/shim/tf_op_shim.h" -#include "tensorflow_text/core/kernels/round_robin_trimmer_kernel_template.h" - -namespace tensorflow { -namespace text { - -template -class RoundRobinGenerateMasksOpKernel - : public tflite::shim::TfOpKernel { - public: - using tflite::shim::TfOpKernel::TfOpKernel; -}; - -template -class RoundRobinTrimOpKernel - : public tflite::shim::TfOpKernel { - public: - using tflite::shim::TfOpKernel::TfOpKernel; -}; - -} // namespace text -} // namespace tensorflow - +#include "tensorflow/core/kernels/text/round_robin_trimmer_kernel.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_ROUND_ROBIN_TRIMMER_KERNEL_H_ diff --git a/tensorflow_text/core/kernels/round_robin_trimmer_kernel_template.h b/tensorflow_text/core/kernels/round_robin_trimmer_kernel_template.h index 51f17da43..b56b9c0e4 100644 --- a/tensorflow_text/core/kernels/round_robin_trimmer_kernel_template.h +++ b/tensorflow_text/core/kernels/round_robin_trimmer_kernel_template.h @@ -15,310 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_ROUND_ROBIN_TRIMMER_KERNEL_TEMPLATE_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_ROUND_ROBIN_TRIMMER_KERNEL_TEMPLATE_H_ -#include -#include -#include - -#include "absl/status/status.h" -#include "absl/types/span.h" -#include "tensorflow/lite/kernels/shim/op_kernel.h" -#include "tensorflow/lite/kernels/shim/shape.h" -#include "tensorflow/lite/kernels/shim/status_macros.h" -#include "tensorflow_text/core/kernels/round_robin_trimmer.h" - -namespace tensorflow { -namespace text { - -template -class RoundRobinTrimOp - : public tflite::shim::OpKernelShim { - private: - enum Inputs { - kMaxSeqLength = 0, - kInputValues, - kInputRowSplits - }; - enum Outputs { - kOutputValues = 0, - kOutputRowSplits - }; - int64_t number_of_segments_; - - using typename tflite::shim::OpKernelShim::InitContext; - using typename tflite::shim::OpKernelShim::InvokeContext; - using typename tflite::shim::OpKernelShim::ShapeInferenceContext; - - public: - RoundRobinTrimOp() = default; - static constexpr char kOpName[] = "TFText>RoundRobinTrim"; - static constexpr char kDoc[] = R"doc( - Trims a tensor. - )doc"; - - static const char* OpName() { return kOpName; } - static const char* Doc() { return kDoc; } - - // Attributes declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Attrs(); - - // Inputs declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Inputs(); - - // Outputs declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Outputs(); - - // Initializes the op - absl::Status Init(InitContext* context) { - // Attr - SH_RETURN_IF_ERROR(context->GetAttr("N", &number_of_segments_)); - return absl::OkStatus(); - } - - // Runs the operation - absl::Status Invoke(InvokeContext* context); - - // Shape inference - static absl::Status ShapeInference(ShapeInferenceContext* c); -}; - -template -std::vector RoundRobinTrimOp::Attrs() { - return {"N: int >= 1", "T: type", "Tsplits: {int32, int64}"}; -} - -template -std::vector RoundRobinTrimOp::Inputs() { - return {"max_sequence_length: int32", "input_values: N * T", - "input_row_splits: N * Tsplits"}; -} - -template -std::vector RoundRobinTrimOp::Outputs() { - return {"values: N * T", "row_splits: N * Tsplits"}; -} - -template -absl::Status RoundRobinTrimOp::ShapeInference( - ShapeInferenceContext* c) { - using tflite::shim::Shape; - const auto rank_1_shape = Shape({Shape::kUnknownDim}); - int64_t num_segments; - SH_RETURN_IF_ERROR(c->GetAttr("N", &num_segments)); - - SH_ASSIGN_OR_RETURN(const Shape& max_seq_shape, - c->GetInputShape(kMaxSeqLength)); - if (!max_seq_shape.Compatible(Shape({}))) { - return absl::FailedPreconditionError( - absl::StrCat("Shape must be a scalar: ", max_seq_shape.ToString())); - } - - for (int i = 0; i < num_segments; ++i) { - SH_ASSIGN_OR_RETURN( - const Shape& values_shape, - c->GetInputShape( - (kInputValues - 1) * num_segments + i + 1)); - if (!values_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError( - absl::StrCat("Shape must be rank 1: ", values_shape.ToString())); - } - - SH_ASSIGN_OR_RETURN( - const Shape& row_splits_shape, - c->GetInputShape( - (kInputRowSplits - 1) * num_segments + i + 1)); - if (!row_splits_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError( - absl::StrCat("Shape must be rank 1: ", row_splits_shape.ToString())); - } - - SH_RETURN_IF_ERROR(c->SetOutputShape( - kOutputRowSplits * num_segments + i, row_splits_shape)); - SH_RETURN_IF_ERROR(c->SetOutputShape( - kOutputValues * num_segments + i, rank_1_shape)); - } - - return absl::OkStatus(); -} - -template -absl::Status RoundRobinTrimOp::Invoke(InvokeContext* context) { - // Inputs - SH_ASSIGN_OR_RETURN(const auto msl, context->GetInput(kMaxSeqLength)); - const int max_sequence_length = msl->template AsScalar(); - - std::vector> list_of_values(number_of_segments_); - std::vector> list_of_splits(number_of_segments_); - for (int i = 0; i < number_of_segments_; ++i) { - SH_ASSIGN_OR_RETURN(const auto fv, context->GetInput(kInputValues + i)); - list_of_values[i] = fv->template Data(); - - int row_split_idx = kInputRowSplits + number_of_segments_ - 1 + i; - SH_ASSIGN_OR_RETURN(const auto rs, context->GetInput(row_split_idx)); - list_of_splits[i] = rs->template Data(); - } - - // Compute - RoundRobinTrimmer trimmer(max_sequence_length); - auto [trimmed_vals, trimmed_splits] = trimmer.TrimBatch( - list_of_values, list_of_splits); - - for (int i = 0; i < number_of_segments_; ++i) { - // Allocate output & fill output tensors. - SH_RETURN_IF_ERROR(this->template FillOutputTensor( - trimmed_vals[i], (kOutputValues * number_of_segments_) + i, context)); - SH_RETURN_IF_ERROR( - this->template FillOutputTensor(trimmed_splits[i], - (kOutputRowSplits * number_of_segments_) + i, context)); - } - - return absl::OkStatus(); -} - -template -class RoundRobinGenerateMasksOp - : public tflite::shim::OpKernelShim { - private: - enum Inputs { - kMaxSeqLength = 0, - kInputValues, - kInputRowSplits - }; - enum Outputs { - kOutputMasks = 0 - }; - int64_t number_of_segments_; - - using typename tflite::shim::OpKernelShim::InitContext; - using typename tflite::shim::OpKernelShim::InvokeContext; - using typename tflite::shim::OpKernelShim::ShapeInferenceContext; - - public: - RoundRobinGenerateMasksOp() = default; - static constexpr char kOpName[] = "TFText>RoundRobinGenerateMasks"; - static constexpr char kDoc[] = R"doc( - Generates a mask for trimming a tensor. - )doc"; - - static const char* OpName() { return kOpName; } - static const char* Doc() { return kDoc; } - - // Attributes declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Attrs(); - - // Inputs declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Inputs(); - - // Outputs declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector Outputs(); - - // Initializes the op - absl::Status Init(InitContext* context) { - // Attr - SH_RETURN_IF_ERROR(context->GetAttr("N", &number_of_segments_)); - return absl::OkStatus(); - } - - // Runs the operation - absl::Status Invoke(InvokeContext* context); - - // Shape inference - static absl::Status ShapeInference(ShapeInferenceContext* c); -}; - -template -std::vector RoundRobinGenerateMasksOp::Attrs() { - return {"N: int >= 1", "T: type", "Tsplits: {int32, int64}"}; -} - -template -std::vector RoundRobinGenerateMasksOp::Inputs() { - // TODO(broken): use templated value - return {"max_sequence_length: int32", "input_values: N * T", - "input_row_splits: N * Tsplits"}; -} - -template -std::vector RoundRobinGenerateMasksOp::Outputs() { - return {"masks: N * bool"}; -} - -template -absl::Status RoundRobinGenerateMasksOp::ShapeInference( - ShapeInferenceContext* c) { - using tflite::shim::Shape; - const auto rank_1_shape = Shape({Shape::kUnknownDim}); - int64_t num_segments; - SH_RETURN_IF_ERROR(c->GetAttr("N", &num_segments)); - - SH_ASSIGN_OR_RETURN(const Shape& max_seq_shape, - c->GetInputShape(kMaxSeqLength)); - if (!max_seq_shape.Compatible(Shape({}))) { - return absl::FailedPreconditionError( - absl::StrCat("Shape must be a scalar: ", max_seq_shape.ToString())); - } - - for (int i = 0; i < num_segments; ++i) { - SH_ASSIGN_OR_RETURN( - const Shape& values_shape, - c->GetInputShape( - (kInputValues - 1) * num_segments + i + 1)); - if (!values_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError( - absl::StrCat("Shape must be rank 1: ", values_shape.ToString())); - } - - SH_ASSIGN_OR_RETURN( - const Shape& row_splits_shape, - c->GetInputShape( - (kInputRowSplits - 1) * num_segments + i + 1)); - if (!row_splits_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError( - absl::StrCat("Shape must be rank 1: ", row_splits_shape.ToString())); - } - - SH_RETURN_IF_ERROR(c->SetOutputShape( - kOutputMasks * num_segments + i, values_shape)); - } - - return absl::OkStatus(); -} - -template -absl::Status RoundRobinGenerateMasksOp::Invoke( - InvokeContext* context) { - // Inputs - SH_ASSIGN_OR_RETURN(const auto msl, context->GetInput(kMaxSeqLength)); - const int max_sequence_length = msl->template AsScalar(); - - std::vector> list_of_splits(number_of_segments_); - for (int i = 0; i < number_of_segments_; ++i) { - int row_split_idx = kInputRowSplits + number_of_segments_ - 1 + i; - SH_ASSIGN_OR_RETURN(const auto rs, context->GetInput(row_split_idx)); - list_of_splits[i] = rs->template Data(); - } - - // Compute - RoundRobinTrimmer trimmer(max_sequence_length); - std::vector> masks = - trimmer.GenerateMasksBatch(list_of_splits); - - for (int i = 0; i < number_of_segments_; ++i) { - // Allocate output & fill output tensors. - SH_RETURN_IF_ERROR(this->template FillOutputTensor(masks[i], - (kOutputMasks * number_of_segments_) + i, context)); - } - - return absl::OkStatus(); -} - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/round_robin_trimmer_kernel_template.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_ROUND_ROBIN_TRIMMER_KERNEL_TEMPLATE_H_ diff --git a/tensorflow_text/core/kernels/round_robin_trimmer_test.cc b/tensorflow_text/core/kernels/round_robin_trimmer_test.cc deleted file mode 100644 index 50c21e32d..000000000 --- a/tensorflow_text/core/kernels/round_robin_trimmer_test.cc +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/round_robin_trimmer.h" - -#include -#include -#include - -#include -#include - -namespace tensorflow { -namespace text { -namespace { - -using ::testing::ElementsAreArray; - -struct TestSpec { - int max_sequence_length; - std::vector vals_a_row_1; - std::vector vals_a_row_2; - std::vector vals_b_row_1; - std::vector vals_b_row_2; - std::vector mask_a_row_1; - std::vector mask_a_row_2; - std::vector mask_b_row_1; - std::vector mask_b_row_2; -}; - -class RoundRobinTrimmerTest : public testing::TestWithParam { - protected: - using Segment = std::vector; - using SegmentBatch = std::vector; - using Splits = std::vector; - using Masks = std::vector; - using MasksBatch = std::vector; - - std::vector GetRaggedInput() { - SegmentBatch a = {input_a_row_1, input_a_row_2}; - SegmentBatch b = {input_b_row_1, input_b_row_2}; - - return {a, b}; - } - - std::vector GetFirstBatch() { - return {input_a_row_1, input_b_row_1}; - } - - std::vector GetSecondBatch() { - return {input_a_row_2, input_b_row_2}; - } - - std::pair, std::vector> GetFlatInput() { - Segment a_vals(input_a_row_1.begin(), input_a_row_1.end()); - a_vals.insert(a_vals.end(), input_a_row_2.begin(), input_a_row_2.end()); - Segment b_vals(input_b_row_1.begin(), input_b_row_1.end()); - b_vals.insert(b_vals.end(), input_b_row_2.begin(), input_b_row_2.end()); - - Splits a_splits = {0}; - a_splits.push_back(input_a_row_1.size()); - a_splits.push_back(a_splits.back() + input_a_row_2.size()); - Splits b_splits = {0}; - b_splits.push_back(input_b_row_1.size()); - b_splits.push_back(b_splits.back() + input_b_row_2.size()); - - std::vector vals = {a_vals, b_vals}; - std::vector splits = {a_splits, b_splits}; - return std::make_pair(vals, splits); - } - - template - std::vector Concat(std::vector a, std::vector b) { - std::vector result(a.begin(), a.end()); - result.insert(result.end(), b.begin(), b.end()); - return result; - } - - private: - const Segment input_a_row_1 = {1, 2, 3, 4, 5}; - const Segment input_a_row_2 = {6, 7}; - const Segment input_b_row_1 = {10, 20, 30, 40, 50}; - const Segment input_b_row_2 = {60, 70}; -}; - -static const std::vector& params = { - { - .max_sequence_length = 10, - .vals_a_row_1 = {1, 2, 3, 4, 5}, - .vals_a_row_2 = {6, 7}, - .vals_b_row_1 = {10, 20, 30, 40, 50}, - .vals_b_row_2 = {60, 70}, - .mask_a_row_1 = {true, true, true, true, true}, - .mask_a_row_2 = {true, true}, - .mask_b_row_1 = {true, true, true, true, true}, - .mask_b_row_2 = {true, true}, - }, - { - .max_sequence_length = 6, - .vals_a_row_1 = {1, 2, 3}, - .vals_a_row_2 = {6, 7}, - .vals_b_row_1 = {10, 20, 30}, - .vals_b_row_2 = {60, 70}, - .mask_a_row_1 = {true, true, true, false, false}, - .mask_a_row_2 = {true, true}, - .mask_b_row_1 = {true, true, true, false, false}, - .mask_b_row_2 = {true, true}, - }, - { - .max_sequence_length = 3, - .vals_a_row_1 = {1, 2}, - .vals_a_row_2 = {6, 7}, - .vals_b_row_1 = {10}, - .vals_b_row_2 = {60}, - .mask_a_row_1 = {true, true, false, false, false}, - .mask_a_row_2 = {true, true}, - .mask_b_row_1 = {true, false, false, false, false}, - .mask_b_row_2 = {true, false}, - }, - { - .max_sequence_length = 0, - .vals_a_row_1 = {}, - .vals_a_row_2 = {}, - .vals_b_row_1 = {}, - .vals_b_row_2 = {}, - .mask_a_row_1 = {false, false, false, false, false}, - .mask_a_row_2 = {false, false}, - .mask_b_row_1 = {false, false, false, false, false}, - .mask_b_row_2 = {false, false}, - } -}; - -TEST_P(RoundRobinTrimmerTest, GenerateMasks) { - TestSpec p = GetParam(); - RoundRobinTrimmer t(p.max_sequence_length); - std::vector masks1 = t.GenerateMasks(GetFirstBatch()); - EXPECT_THAT(masks1[0], ElementsAreArray(p.mask_a_row_1)); - EXPECT_THAT(masks1[1], ElementsAreArray(p.mask_b_row_1)); - std::vector masks2 = t.GenerateMasks(GetSecondBatch()); - EXPECT_THAT(masks2[0], ElementsAreArray(p.mask_a_row_2)); - EXPECT_THAT(masks2[1], ElementsAreArray(p.mask_b_row_2)); -} - -TEST_P(RoundRobinTrimmerTest, GenerateMasks_flat) { - TestSpec p = GetParam(); - RoundRobinTrimmer t(p.max_sequence_length); - std::vector masks = t.GenerateMasksBatch(GetFlatInput().second); - EXPECT_THAT(masks[0], - ElementsAreArray(Concat(p.mask_a_row_1, p.mask_a_row_2))); - EXPECT_THAT(masks[1], - ElementsAreArray(Concat(p.mask_b_row_1, p.mask_b_row_2))); -} - -TEST_P(RoundRobinTrimmerTest, Trim) { - TestSpec p = GetParam(); - RoundRobinTrimmer t(p.max_sequence_length); - std::vector vals1 = GetFirstBatch(); - t.Trim(&vals1); - EXPECT_THAT(vals1[0], ElementsAreArray(p.vals_a_row_1)); - EXPECT_THAT(vals1[1], ElementsAreArray(p.vals_b_row_1)); - std::vector vals2 = GetSecondBatch(); - t.Trim(&vals2); - EXPECT_THAT(vals2[0], ElementsAreArray(p.vals_a_row_2)); - EXPECT_THAT(vals2[1], ElementsAreArray(p.vals_b_row_2)); -} - -TEST_P(RoundRobinTrimmerTest, Trim_flat) { - TestSpec p = GetParam(); - RoundRobinTrimmer t(p.max_sequence_length); - auto [input_vals, input_splits] = GetFlatInput(); - auto [vals, splits] = t.TrimBatch(input_vals, input_splits); - EXPECT_THAT(vals[0], - ElementsAreArray(Concat(p.vals_a_row_1, p.vals_a_row_2))); - EXPECT_THAT(vals[1], - ElementsAreArray(Concat(p.vals_b_row_1, p.vals_b_row_2))); - std::vector result_splits = { 0 }; - result_splits.push_back(p.vals_a_row_1.size()); - result_splits.push_back(p.vals_a_row_1.size() + p.vals_a_row_2.size()); - EXPECT_THAT(splits[0], ElementsAreArray(result_splits)); - result_splits = { 0 }; - result_splits.push_back(p.vals_b_row_1.size()); - result_splits.push_back(p.vals_b_row_1.size() + p.vals_b_row_2.size()); - EXPECT_THAT(splits[1], ElementsAreArray(result_splits)); -} - -TEST_P(RoundRobinTrimmerTest, Trim_int64) { - TestSpec p = GetParam(); - RoundRobinTrimmer t(p.max_sequence_length); - auto [input_vals, input_splits] = GetFlatInput(); - std::vector> input_splits_64(input_splits.size()); - for (int i = 0; i < input_splits.size(); ++i) - input_splits_64[i].insert(input_splits_64[i].end(), - input_splits[i].begin(), input_splits[i].end()); - std::vector> input_vals_64(input_vals.size()); - for (int i = 0; i < input_vals.size(); ++i) - input_vals_64[i].insert(input_vals_64[i].end(), - input_vals[i].begin(), input_vals[i].end()); - auto [vals, splits] = t.TrimBatch(input_vals_64, input_splits_64); - EXPECT_THAT(vals[0], - ElementsAreArray(Concat(p.vals_a_row_1, p.vals_a_row_2))); - EXPECT_THAT(vals[1], - ElementsAreArray(Concat(p.vals_b_row_1, p.vals_b_row_2))); - std::vector result_splits = { 0 }; - result_splits.push_back(p.vals_a_row_1.size()); - result_splits.push_back(p.vals_a_row_1.size() + p.vals_a_row_2.size()); - EXPECT_THAT(splits[0], ElementsAreArray(result_splits)); - result_splits = { 0 }; - result_splits.push_back(p.vals_b_row_1.size()); - result_splits.push_back(p.vals_b_row_1.size() + p.vals_b_row_2.size()); - EXPECT_THAT(splits[1], ElementsAreArray(result_splits)); -} - -INSTANTIATE_TEST_SUITE_P(RoundRobinTrimmerTestSuite, - RoundRobinTrimmerTest, - testing::ValuesIn(params)); - -} // namespace -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/round_robin_trimmer_tflite.cc b/tensorflow_text/core/kernels/round_robin_trimmer_tflite.cc deleted file mode 100644 index a724beba6..000000000 --- a/tensorflow_text/core/kernels/round_robin_trimmer_tflite.cc +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/round_robin_trimmer_tflite.h" - -#include -#include -#include -#include -#include - -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/kernels/shim/op_kernel.h" -#include "tensorflow/lite/kernels/shim/tflite_op_shim.h" -#include "tensorflow/lite/kernels/shim/tflite_op_wrapper.h" -#include "tensorflow/lite/mutable_op_resolver.h" -#include "tensorflow_text/core/kernels/round_robin_trimmer_kernel_template.h" - -namespace tflite { -namespace ops { -namespace custom { -namespace text { -namespace { -const char splits_type[]("Tsplits"), vals_type[]("T"); -} // namespace - -using ::tflite::shim::op_wrapper::Attr; -using ::tflite::shim::op_wrapper::AttrName; -using ::tflite::shim::op_wrapper::OpWrapper; - -template -using GenerateMasksOp = - OpWrapper, ::tensorflow::tstring, float, double, - int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, - int64_t, uint64_t, bool>, - Attr, int32_t, int64_t>>; - -extern "C" void AddRoundRobinGenerateMasks( - tflite::MutableOpResolver* resolver) { - tflite::shim::TfLiteOpKernel::Add(resolver); -} - -template -using TrimOp = - OpWrapper, ::tensorflow::tstring, float, double, - int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, - int64_t, uint64_t, bool>, - Attr, int32_t, int64_t>>; - -extern "C" void AddRoundRobinTrim(tflite::MutableOpResolver* resolver) { - tflite::shim::TfLiteOpKernel::Add(resolver); -} - -} // namespace text -} // namespace custom -} // namespace ops -} // namespace tflite diff --git a/tensorflow_text/core/kernels/round_robin_trimmer_tflite.h b/tensorflow_text/core/kernels/round_robin_trimmer_tflite.h index 46ffe63b9..6c4459dae 100644 --- a/tensorflow_text/core/kernels/round_robin_trimmer_tflite.h +++ b/tensorflow_text/core/kernels/round_robin_trimmer_tflite.h @@ -15,21 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_ROUND_ROBIN_TRIMMER_TFLITE_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_ROUND_ROBIN_TRIMMER_TFLITE_H_ -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/mutable_op_resolver.h" - -namespace tflite { -namespace ops { -namespace custom { -namespace text { - -extern "C" void AddRoundRobinGenerateMasks(tflite::MutableOpResolver* resolver); - -extern "C" void AddRoundRobinTrim(::tflite::MutableOpResolver* resolver); - -} // namespace text -} // namespace custom -} // namespace ops -} // namespace tflite +#include "tensorflow/core/kernels/text/round_robin_trimmer_tflite.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_ROUND_ROBIN_TRIMMER_TFLITE_H_ diff --git a/tensorflow_text/core/kernels/sentence_breaking_kernels.cc b/tensorflow_text/core/kernels/sentence_breaking_kernels.cc deleted file mode 100644 index 0f4c34c82..000000000 --- a/tensorflow_text/core/kernels/sentence_breaking_kernels.cc +++ /dev/null @@ -1,267 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include - -#include "absl/strings/str_cat.h" -#include "icu4c/source/common/unicode/uchar.h" -#include "icu4c/source/common/unicode/ucnv_err.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/tensor_types.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow_text/core/kernels/sentence_breaking_utils.h" -#include "tensorflow_text/core/kernels/sentence_fragmenter.h" - -using ::tensorflow::tstring; -using ::tensorflow::errors::InvalidArgument; - -namespace tensorflow { -namespace text { - -// TODO(thuang513): This is copied from unicode_ops.cc, move this to a separate -// util lib in tensorflow and reuse it here instead. -namespace { -// Lifecycle wrapper for UConverter making it easier to use with thread_local. -// TODO(gregbillock): Consider whether to use the higher-level convert API and -// create a specialized fast code path for UTF8. -class WrappedConverter { - public: - WrappedConverter() {} - - ~WrappedConverter() { - if (converter_) { - ucnv_close(converter_); - } - } - - void init(const string& name) { - if (converter_ && name == name_) { - // Note: this reset is not typically needed, but if not done, then in some - // cases the cached converter will maintain state of input endianness - // which isn't valid from input to input in every batched case. - ucnv_reset(converter_); - return; - } - - if (converter_) { - ucnv_close(converter_); - converter_ = nullptr; - name_ = ""; - } - - UErrorCode status = U_ZERO_ERROR; - converter_ = ucnv_open(name.c_str(), &status); - if (U_FAILURE(status)) { - if (converter_) { - ucnv_close(converter_); - converter_ = nullptr; - } - } else { - name_ = name; - } - } - - UConverter* converter_ = nullptr; - string name_; -}; - -struct ErrorOptions { - UChar32 subst = 0xFFFD; - bool elide_replacement = false; - bool replace_control_chars = false; - bool error_on_malformatting = false; -}; - -absl::Status GetErrorOptions(OpKernelConstruction* context, ErrorOptions* out) { - *out = ErrorOptions(); - - string error_policy; - TF_RETURN_IF_ERROR(context->GetAttr("errors", &error_policy)); - - if (error_policy == "replace") { - out->elide_replacement = false; - } else if (error_policy == "ignore") { - out->elide_replacement = true; - } else if (error_policy == "strict") { - out->error_on_malformatting = true; - } else { - return InvalidArgument( - "errors policy must be one of 'strict', 'replace', or 'ignore'"); - } - - int32 replacement_char; - TF_RETURN_IF_ERROR(context->GetAttr("replacement_char", &replacement_char)); - - if (replacement_char >= UCHAR_MIN_VALUE && - replacement_char <= UCHAR_MAX_VALUE) { - out->subst = replacement_char; - } else { - return InvalidArgument("replacement_char out of unicode codepoint range"); - } - - if (context->HasAttr("replace_control_characters")) { - TF_RETURN_IF_ERROR(context->GetAttr("replace_control_characters", - &(out->replace_control_chars))); - } - - return absl::OkStatus(); -} - -inline bool ShouldHandleFormatError(const ErrorOptions& error_options, - UChar32 ch, bool format_error) { - return ((error_options.replace_control_chars && ch <= 0x1F) || format_error); -} - -} // namespace - -class SentenceFragmentsOp : public OpKernel { - public: - explicit SentenceFragmentsOp(OpKernelConstruction* context) - : OpKernel(context) { - OP_REQUIRES_OK(context, GetErrorOptions(context, &error_options_)); - - OP_REQUIRES_OK(context, - context->GetAttr("input_encoding", &input_encoding_)); - // Make a temporary UConverter to ensure it will create without error - // at execution time (and to warm any data caches the converter needs). - // This instance is not used. - std::unique_ptr input_encoder = - std::make_unique(); - input_encoder->init(input_encoding_); - OP_REQUIRES( - context, input_encoder->converter_, - InvalidArgument("Could not create converter for input encoding: " + - input_encoding_)); - } - - void Compute(::tensorflow::OpKernelContext* context) override { -#define DECLARE_AND_VALIDATE_INPUT_VECTOR(name, dtype) \ - const Tensor* name##_tensor; \ - OP_REQUIRES_OK(context, context->input(#name, &name##_tensor)); \ - OP_REQUIRES(context, TensorShapeUtils::IsVector(name##_tensor->shape()), \ - InvalidArgument( \ - absl::StrCat("'", #name, "' must be a vector, got shape: ", \ - name##_tensor->shape().DebugString()))); \ - const auto& name = name##_tensor->vec(); - - DECLARE_AND_VALIDATE_INPUT_VECTOR(row_lengths, int64); - DECLARE_AND_VALIDATE_INPUT_VECTOR(token_start, int64); - DECLARE_AND_VALIDATE_INPUT_VECTOR(token_end, int64); - DECLARE_AND_VALIDATE_INPUT_VECTOR(token_word, tstring); - DECLARE_AND_VALIDATE_INPUT_VECTOR(token_properties, int64); - -#undef DECLARE_AND_VALIDATE_INPUT_TENSOR - - static thread_local std::unique_ptr input_encoder; - if (!input_encoder) { - input_encoder = std::make_unique(); - } - input_encoder->init(input_encoding_); - OP_REQUIRES( - context, input_encoder->converter_, - InvalidArgument("Could not create converter for input encoding: " + - input_encoding_)); - - UConverter* converter = input_encoder->converter_; - UnicodeUtil util(converter); - - int num_elements = 0; - for (int i = 0; i < row_lengths.size(); ++i) { - num_elements += row_lengths(i); - } - OP_REQUIRES(context, - num_elements == token_start.size() && - token_start.size() == token_end.size() && - token_end.size() == token_word.size(), - InvalidArgument(absl::StrCat( - "num_elements(", num_elements, "), token_start(", - token_start.size(), "), token_end(", token_end.size(), - "), token_word(", token_word.size(), - ") must all be the same size."))); - - // Iterate through the text - int token_index = 0; - int num_fragments = 0; - std::vector> fragments; - for (int i = 0; i < row_lengths.size(); ++i) { - std::vector tokens; - Document doc(&tokens); - for (int j = 0; j < row_lengths(i); ++j) { - doc.AddToken( - token_word(token_index), token_start(token_index), - token_end(token_index), Token::SPACE_BREAK, - static_cast(token_properties(token_index))); - ++token_index; - } - - // Find fragments. - SentenceFragmenter fragmenter(&doc, &util); - std::vector frags; - OP_REQUIRES_OK(context, fragmenter.FindFragments(&frags)); - - num_fragments += frags.size(); - fragments.push_back(std::move(frags)); - } - - std::vector fragment_shape; - fragment_shape.push_back(num_fragments); - - std::vector doc_batch_shape; - doc_batch_shape.push_back(fragments.size()); - -#define DECLARE_OUTPUT_TENSOR(name, out_shape) \ - Tensor* name##_tensor = nullptr; \ - OP_REQUIRES_OK(context, context->allocate_output( \ - #name, TensorShape(out_shape), &name##_tensor)); \ - auto name = name##_tensor->vec(); - - DECLARE_OUTPUT_TENSOR(fragment_start, fragment_shape); - DECLARE_OUTPUT_TENSOR(fragment_end, fragment_shape); - DECLARE_OUTPUT_TENSOR(fragment_properties, fragment_shape); - DECLARE_OUTPUT_TENSOR(terminal_punc_token, fragment_shape); - DECLARE_OUTPUT_TENSOR(output_row_lengths, doc_batch_shape); - -#undef DECLARE_OUTPUT_TENSOR - - // output_row_splits should have shape of - // [number of fragments over the entire batch] - int element_index = 0; - // Iterate through all the documents - for (int i = 0; i < fragments.size(); ++i) { - const std::vector& fragments_in_doc = fragments[i]; - // Iterate through all the fragments of a document - for (int j = 0; j < fragments_in_doc.size(); ++j) { - const SentenceFragment& fragment = fragments_in_doc[j]; - fragment_start(element_index) = fragment.start; - fragment_end(element_index) = fragment.limit; - fragment_properties(element_index) = fragment.properties; - terminal_punc_token(element_index) = fragment.terminal_punc_token; - ++element_index; - } - output_row_lengths(i) = fragments_in_doc.size(); - } - } - - private: - string input_encoding_; - ErrorOptions error_options_; -}; - -REGISTER_KERNEL_BUILDER(Name("SentenceFragments").Device(DEVICE_CPU), - SentenceFragmentsOp); - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/sentence_breaking_utils.cc b/tensorflow_text/core/kernels/sentence_breaking_utils.cc deleted file mode 100644 index 5bdcc1c02..000000000 --- a/tensorflow_text/core/kernels/sentence_breaking_utils.cc +++ /dev/null @@ -1,238 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/sentence_breaking_utils.h" - -#include - -#include "absl/strings/str_cat.h" -#include "absl/strings/string_view.h" -#include "icu4c/source/common/unicode/uchar.h" -#include "icu4c/source/common/unicode/utypes.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/core/status.h" - -using ::tensorflow::Status; - -namespace tensorflow { -namespace text { - -absl::Status UnicodeUtil::GetOneUChar(const absl::string_view& input, - bool* has_more_than_one_char, - UChar32* result) const { - UErrorCode status = U_ZERO_ERROR; - const char* source = input.data(); - const char* limit = input.data() + input.length(); - if (!converter_) { - return tensorflow::errors::Internal( - absl::StrCat("Converter has not been initialized!")); - } - *result = ucnv_getNextUChar(converter_, &source, limit, &status); - - if (U_FAILURE(status)) { - return tensorflow::errors::Internal( - absl::StrCat("Failed to decode string, error status=", status)); - } - - if (source != limit) { - *has_more_than_one_char = true; - } else { - *has_more_than_one_char = false; - } - - return absl::OkStatus(); -} - -absl::Status UnicodeUtil::IsTerminalPunc(const absl::string_view& input, - bool* result) const { - *result = false; - const auto& ellipsis_status = IsEllipsis(input, result); - // If there was a error decoding, or if we found an ellipsis, then return. - if (!ellipsis_status.ok()) return ellipsis_status; - if (*result) return absl::OkStatus(); - - bool has_more_than_one_char = false; - UChar32 char_value; - const auto& status = GetOneUChar(input, &has_more_than_one_char, &char_value); - if (!status.ok()) return status; - if (has_more_than_one_char) { - *result = false; - return absl::OkStatus(); - } - - // These are unicode characters that should be considered in this category but - // are not covered by any of the ICU properties. - switch (char_value) { - case 0x055C: // Armenian exclamation mark - case 0x055E: // Armenian question mark - case 0x17d4: // Khmer sign khan - case 0x037E: // Greek question mark - case 0x2026: // ellipsis - *result = true; - return absl::OkStatus(); - } - - USentenceBreak sb_property = static_cast( - u_getIntPropertyValue(char_value, UCHAR_SENTENCE_BREAK)); - *result = sb_property == U_SB_ATERM || sb_property == U_SB_STERM; - return absl::OkStatus(); -} - -absl::Status UnicodeUtil::IsClosePunc(const absl::string_view& input, - bool* result) const { - *result = false; - if (input == "''") { - *result = true; - return absl::OkStatus(); - } - - bool has_more_than_one_char = false; - UChar32 char_value; - const auto& status = GetOneUChar(input, &has_more_than_one_char, &char_value); - if (!status.ok()) return status; - if (has_more_than_one_char) { - *result = false; - return absl::OkStatus(); - } - - // These are unicode characters that should be considered in this category but - // are not covered by any of the ICU properties. - switch (char_value) { - case '>': - case ']': - case '`': - case 64831: // Ornate right parenthesis - case 65282: // fullwidth quotation mark - case 65287: // fullwidth apostrophe - *result = true; - return absl::OkStatus(); - } - - ULineBreak lb_property = static_cast( - u_getIntPropertyValue(char_value, UCHAR_LINE_BREAK)); - - *result = lb_property == U_LB_CLOSE_PUNCTUATION || - lb_property == U_LB_CLOSE_PARENTHESIS || - lb_property == U_LB_QUOTATION; - return absl::OkStatus(); -} - -absl::Status UnicodeUtil::IsOpenParen(const absl::string_view& input, - bool* result) const { - *result = false; - bool has_more_than_one_char = false; - UChar32 char_value; - const auto& status = GetOneUChar(input, &has_more_than_one_char, &char_value); - if (!status.ok()) return status; - if (has_more_than_one_char) { - *result = false; - return absl::OkStatus(); - } - - // These are unicode characters that should be considered in this category but - // are not covered by any of the ICU properties. - switch (char_value) { - case '<': - case 64830: // Ornate left parenthesis - *result = true; - return absl::OkStatus(); - } - - ULineBreak lb_property = static_cast( - u_getIntPropertyValue(char_value, UCHAR_LINE_BREAK)); - *result = lb_property == U_LB_OPEN_PUNCTUATION; - return absl::OkStatus(); -} - -absl::Status UnicodeUtil::IsCloseParen(const absl::string_view& input, - bool* result) const { - *result = false; - bool has_more_than_one_char = false; - UChar32 char_value; - const auto& status = GetOneUChar(input, &has_more_than_one_char, &char_value); - if (!status.ok()) return status; - if (has_more_than_one_char) { - *result = false; - return absl::OkStatus(); - } - - // These are unicode characters that should be considered in this category but - // are not covered by any of the ICU properties. - switch (char_value) { - case '>': - case 64831: // Ornate right parenthesis - *result = true; - return absl::OkStatus(); - } - - ULineBreak lb_property = static_cast( - u_getIntPropertyValue(char_value, UCHAR_LINE_BREAK)); - *result = lb_property == U_LB_CLOSE_PUNCTUATION || - lb_property == U_LB_CLOSE_PARENTHESIS; - return absl::OkStatus(); -} - -absl::Status UnicodeUtil::IsPunctuationWord(const absl::string_view& input, - bool* result) const { - *result = false; - bool has_more_than_one_char = false; - UChar32 char_value; - const auto& status = GetOneUChar(input, &has_more_than_one_char, &char_value); - if (!status.ok()) return status; - if (has_more_than_one_char) { - *result = false; - return absl::OkStatus(); - } - - // These are unicode characters that should be considered in this category but - // are not covered by any of the ICU properties. - switch (char_value) { - case '`': - case '<': - case '>': - case '~': - case 5741: - *result = true; - return absl::OkStatus(); - } - - *result = u_ispunct(char_value) || - u_hasBinaryProperty(char_value, UCHAR_DASH) || - u_hasBinaryProperty(char_value, UCHAR_HYPHEN); - return absl::OkStatus(); -} - -absl::Status UnicodeUtil::IsEllipsis(const absl::string_view& input, - bool* result) const { - *result = false; - if (input == "...") { - *result = true; - return absl::OkStatus(); - } - - bool has_more_than_one_char = false; - UChar32 char_value; - const auto& status = GetOneUChar(input, &has_more_than_one_char, &char_value); - if (!status.ok()) return status; - if (has_more_than_one_char) { - *result = false; - return absl::OkStatus(); - } - - *result = char_value == 0x2026; - return absl::OkStatus(); -} - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/sentence_breaking_utils.h b/tensorflow_text/core/kernels/sentence_breaking_utils.h index 02534a7e3..b4588afae 100644 --- a/tensorflow_text/core/kernels/sentence_breaking_utils.h +++ b/tensorflow_text/core/kernels/sentence_breaking_utils.h @@ -12,57 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_BREAKING_UTILS_H_ -#define TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_BREAKING_UTILS_H_ +#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_BREAKING_UTILS_H_ +#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_BREAKING_UTILS_H_ -#include -#include "absl/strings/string_view.h" -#include "icu4c/source/common/unicode/ucnv.h" -#include "icu4c/source/common/unicode/ucnv_err.h" -#include "icu4c/source/common/unicode/utypes.h" -#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/kernels/text/sentence_breaking_utils.h" -namespace tensorflow { -namespace text { - -// A class of utils for identifying certain classes and properties of unicode -// characters. -class UnicodeUtil { - public: - // `converter` not owned. - explicit UnicodeUtil(UConverter* converter) : converter_(converter) {} - - // Returns true iff a string is terminal punctuation. - absl::Status IsTerminalPunc(const absl::string_view& input, - bool* result) const; - - // Returns true iff a string is close punctuation (close quote or close - // paren). - absl::Status IsClosePunc(const absl::string_view& input, bool* result) const; - - // Returns true iff a string is an open paren. - absl::Status IsOpenParen(const absl::string_view& input, bool* result) const; - - // Returns true iff a string is a close paren. - absl::Status IsCloseParen(const absl::string_view& input, bool* result) const; - - // Returns true iff a word is made of punctuation characters only. - absl::Status IsPunctuationWord(const absl::string_view& input, - bool* result) const; - - // Returns true iff a string is an ellipsis token ("..."). - absl::Status IsEllipsis(const absl::string_view& input, bool* result) const; - - private: - absl::Status GetOneUChar(const absl::string_view&, - bool* has_more_than_one_char, UChar32* result) const; - - // not owned. mutable because UConverter contains some internal options and - // buffer. - mutable UConverter* converter_; -}; - -} // namespace text -} // namespace tensorflow - -#endif // TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_BREAKING_UTILS_H_ +#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_BREAKING_UTILS_H_ diff --git a/tensorflow_text/core/kernels/sentence_breaking_utils_test.cc b/tensorflow_text/core/kernels/sentence_breaking_utils_test.cc deleted file mode 100644 index 7aee97091..000000000 --- a/tensorflow_text/core/kernels/sentence_breaking_utils_test.cc +++ /dev/null @@ -1,576 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/sentence_breaking_utils.h" - -#include -#include -#include - -#include -#include -#include "icu4c/source/common/unicode/uchar.h" -#include "icu4c/source/common/unicode/ucnv.h" -#include "icu4c/source/common/unicode/ucnv_err.h" -#include "icu4c/source/common/unicode/umachine.h" -#include "icu4c/source/common/unicode/uniset.h" -#include "icu4c/source/common/unicode/unistr.h" -#include "icu4c/source/common/unicode/uset.h" -#include "icu4c/source/common/unicode/utypes.h" - -namespace tensorflow { -namespace text { -namespace { - -class SentenceBreakingUtilsTest { - protected: - UConverter* GetUConverter() { - constexpr char name[] = "UTF-8"; - UErrorCode status = U_ZERO_ERROR; - UConverter* converter = ucnv_open(name, &status); - if (U_FAILURE(status)) { - if (converter) { - ucnv_close(converter); - } - return nullptr; - } - return converter; - } -}; - -class SentenceBreakingUtilsParamTest : public SentenceBreakingUtilsTest, - public ::testing::TestWithParam { - protected: - void SetUp() override { - converter_ = SentenceBreakingUtilsTest::GetUConverter(); - ASSERT_NE(converter_, nullptr); - } - - void TearDown() override { ucnv_close(converter_); } - - std::string StringFromUnicodeChar(UChar32 input) { - std::string result; - icu::UnicodeString test_unicode_string(input); - test_unicode_string.toUTF8String(result); - return result; - } - - UConverter* converter_; -}; - -class IsTerminalPuncParamTest : public SentenceBreakingUtilsParamTest {}; - -class IsTerminalPuncTest : public SentenceBreakingUtilsTest, - public ::testing::Test {}; - -const UChar is_terminal_punc_test_cases[] = { - 0x055C, // Armenian exclamation mark - 0x055E, // Armenian question mark - 0x0589, // Armenian full stop - 0x061F, // Arabic question mark - 0x06D4, // Arabic full stop - 0x0700, // Syriabc end of paragraph - 0x0701, // Syriac supralinear full stop - 0x0702, // Syriac sublinear full stop - 0x1362, // Ethiopic full stop - 0x1367, // Ethiopic question mark - 0x1368, // Ethiopic paragraph separator - 0x104A, // Myanmar sign little section - 0x104B, // Myanmar sign section - 0x166E, // Canadian syllabics full stop - 0x17d4, // Khmer sign khan - 0x1803, // Mongolian full stop - 0x1809, // Mongolian Manchu full stop - 0x1944, // Limbu exclamation mark - 0x1945, // Limbu question mark - 0x203C, // double exclamation mark - 0x203D, // interrobang - 0x2047, // double question mark - 0x2048, // question exclamation mark - 0x2049, // exclamation question mark - 0x3002, // ideographic full stop - 0x037E, // Greek question mark - 0xFE52, // small full stop - 0xFE56, // small question mark - 0xFE57, // small exclamation mark - 0xFF01, // fullwidth exclamation mark - 0xFF0E, // fullwidth full stop - 0xFF1F, // fullwidth question mark - 0xFF61, // halfwidth ideographic full stop - 0x2026, // ellipsis - 0x0964, - 0x0965, // Devanagari danda..Devanagari double -}; - -TEST_P(IsTerminalPuncParamTest, IsTerminalPunc) { - UnicodeUtil util(converter_); - std::string test_string = StringFromUnicodeChar(GetParam()); - bool result = false; - EXPECT_TRUE(util.IsTerminalPunc(test_string, &result).ok()); - EXPECT_TRUE(result); -} - -INSTANTIATE_TEST_SUITE_P(IsTerminalPuncTest, IsTerminalPuncParamTest, - ::testing::ValuesIn(is_terminal_punc_test_cases)); - -TEST_F(IsTerminalPuncTest, IsMultiCharEllipseTerminalPunc) { - UConverter* converter = SentenceBreakingUtilsTest::GetUConverter(); - ASSERT_NE(converter, nullptr); - UnicodeUtil util(converter); - std::string test_string = "..."; - bool result; - EXPECT_TRUE(util.IsTerminalPunc(test_string, &result).ok()); - EXPECT_TRUE(result); - ucnv_close(converter); -} - -TEST_F(IsTerminalPuncTest, TestMultiUnicodeChars) { - UConverter* converter = SentenceBreakingUtilsTest::GetUConverter(); - ASSERT_NE(converter, nullptr); - UnicodeUtil util(converter); - std::string test_string = "never gonna let you decode"; - bool result; - EXPECT_TRUE(util.IsTerminalPunc(test_string, &result).ok()); - EXPECT_FALSE(result); - ucnv_close(converter); -} - -TEST_F(IsTerminalPuncTest, TestInvalidConverter) { - UErrorCode status = U_ZERO_ERROR; - UConverter* converter = ucnv_open("cant find me", &status); - UnicodeUtil util(converter); - std::string test_string = "."; - bool result; - EXPECT_FALSE(util.IsTerminalPunc(test_string, &result).ok()); - ucnv_close(converter); -} - -class ClosePuncParamTest : public SentenceBreakingUtilsParamTest {}; - -const UChar close_punc_test_cases[] = { - 0x29, 0x5D, 0x3E, 0x7D, - 0x207E, // superscript right parenthesis - 0x208E, // subscript right parenthesis - 0x27E7, // mathematical right white square bracket - 0x27E9, // mathematical right angle bracket - 0x27EB, // mathematical right double angle bracket - 0x2984, // right white curly bracket - 0x2986, // right white parenthesis - 0x2988, // Z notation right image bracket - 0x298A, // Z notation right binding bracket - 0x298C, // right square bracket with underbar - 0x298E, // right square bracket with tick in top corner - 0x2990, // right square bracket with tick in bottom corner - 0x2992, // right angle bracket with dot - 0x2994, // right arc greater-than bracket - 0x2996, // double right arc less-than bracket - 0x2998, // right black tortoise shell bracket - 0x29D9, // right wiggly fence - 0x29DB, // right double wiggly fence - 0x29FD, // right-pointing curved angle bracket - 0x3009, // CJK right angle bracket - 0x300B, // CJK right double angle bracket - 0x3011, // CJK right black lenticular bracket - 0x3015, // CJK right tortoise shell bracket - 0x3017, // CJK right white lenticular bracket - 0x3019, // CJK right white tortoise shell bracket - 0x301B, // CJK right white square bracket - 0xFD3F, // Ornate right parenthesis - 0xFE5A, // small right parenthesis - 0xFE5C, // small right curly bracket - 0xFF09, // fullwidth right parenthesis - 0xFF3D, // fullwidth right square bracket - 0xFF5D, // fullwidth right curly bracket - 0x27, 0x60, 0x22, - 0xFF07, // fullwidth apostrophe - 0xFF02, // fullwidth quotation mark - 0x2019, // right single quotation mark (English, others) - 0x201D, // right double quotation mark (English, others) - 0x2018, // left single quotation mark (Czech, German, Slovak) - 0x201C, // left double quotation mark (Czech, German, Slovak) - 0x203A, // single right-pointing angle quotation mark (French, others) - 0x00BB, // right-pointing double angle quotation mark (French, others) - 0x2039, // single left-pointing angle quotation mark (Slovenian, others) - 0x00AB, // left-pointing double angle quotation mark (Slovenian, others) - 0x300D, // right corner bracket (East Asian languages) - 0xfe42, // presentation form for vertical right corner bracket - 0xFF63, // halfwidth right corner bracket (East Asian languages) - 0x300F, // right white corner bracket (East Asian languages) - 0xfe44, // presentation form for vertical right white corner bracket - 0x301F, // low double prime quotation mark (East Asian languages) - 0x301E, // close double prime (East Asian languages written horizontally) -}; - -TEST_P(ClosePuncParamTest, IsClosePunc) { - UnicodeUtil util(converter_); - std::string test_string = StringFromUnicodeChar(GetParam()); - bool result = false; - EXPECT_TRUE(util.IsClosePunc(test_string, &result).ok()); - EXPECT_TRUE(result); -} - -INSTANTIATE_TEST_SUITE_P(IsClosePuncParamTest, ClosePuncParamTest, - ::testing::ValuesIn(close_punc_test_cases)); - -class OpenParenParamTest : public SentenceBreakingUtilsParamTest {}; - -const UChar open_paren_test_cases[] = { - '(', '[', '<', '{', - 0x207D, // superscript left parenthesis - 0x208D, // subscript left parenthesis - 0x27E6, // mathematical left white square bracket - 0x27E8, // mathematical left angle bracket - 0x27EA, // mathematical left double angle bracket - 0x2983, // left white curly bracket - 0x2985, // left white parenthesis - 0x2987, // Z notation left image bracket - 0x2989, // Z notation left binding bracket - 0x298B, // left square bracket with underbar - 0x298D, // left square bracket with tick in top corner - 0x298F, // left square bracket with tick in bottom corner - 0x2991, // left angle bracket with dot - 0x2993, // left arc less-than bracket - 0x2995, // double left arc greater-than bracket - 0x2997, // left black tortoise shell bracket - 0x29D8, // left wiggly fence - 0x29DA, // left double wiggly fence - 0x29FC, // left-pointing curved angle bracket - 0x3008, // CJK left angle bracket - 0x300A, // CJK left double angle bracket - 0x3010, // CJK left black lenticular bracket - 0x3014, // CJK left tortoise shell bracket - 0x3016, // CJK left white lenticular bracket - 0x3018, // CJK left white tortoise shell bracket - 0x301A, // CJK left white square bracket - 0xFD3E, // Ornate left parenthesis - 0xFE59, // small left parenthesis - 0xFE5B, // small left curly bracket - 0xFF08, // fullwidth left parenthesis - 0xFF3B, // fullwidth left square bracket - 0xFF5B, // fullwidth left curly bracket -}; - -TEST_P(OpenParenParamTest, IsOpenParen) { - UnicodeUtil util(converter_); - std::string test_string = StringFromUnicodeChar(GetParam()); - bool result = false; - EXPECT_TRUE(util.IsOpenParen(test_string, &result).ok()); - EXPECT_TRUE(result); -} - -INSTANTIATE_TEST_SUITE_P(IsOpenParenParamTest, OpenParenParamTest, - ::testing::ValuesIn(open_paren_test_cases)); - -class CloseParenParamTest : public SentenceBreakingUtilsParamTest {}; - -const UChar close_paren_test_cases[] = { - ')', ']', '>', '}', - 0x207E, // superscript right parenthesis - 0x208E, // subscript right parenthesis - 0x27E7, // mathematical right white square bracket - 0x27E9, // mathematical right angle bracket - 0x27EB, // mathematical right double angle bracket - 0x2984, // right white curly bracket - 0x2986, // right white parenthesis - 0x2988, // Z notation right image bracket - 0x298A, // Z notation right binding bracket - 0x298C, // right square bracket with underbar - 0x298E, // right square bracket with tick in top corner - 0x2990, // right square bracket with tick in bottom corner - 0x2992, // right angle bracket with dot - 0x2994, // right arc greater-than bracket - 0x2996, // double right arc less-than bracket - 0x2998, // right black tortoise shell bracket - 0x29D9, // right wiggly fence - 0x29DB, // right double wiggly fence - 0x29FD, // right-pointing curved angle bracket - 0x3009, // CJK right angle bracket - 0x300B, // CJK right double angle bracket - 0x3011, // CJK right black lenticular bracket - 0x3015, // CJK right tortoise shell bracket - 0x3017, // CJK right white lenticular bracket - 0x3019, // CJK right white tortoise shell bracket - 0x301B, // CJK right white square bracket - 0xFD3F, // Ornate right parenthesis - 0xFE5A, // small right parenthesis - 0xFE5C, // small right curly bracket - 0xFF09, // fullwidth right parenthesis - 0xFF3D, // fullwidth right square bracket - 0xFF5D, // fullwidth right curly bracket -}; - -TEST_P(CloseParenParamTest, IsCloseParen) { - UnicodeUtil util(converter_); - std::string test_string = StringFromUnicodeChar(GetParam()); - bool result = false; - EXPECT_TRUE(util.IsCloseParen(test_string, &result).ok()); - EXPECT_TRUE(result); -} - -INSTANTIATE_TEST_SUITE_P(IsCloseParenParamTest, CloseParenParamTest, - ::testing::ValuesIn(close_paren_test_cases)); - -class IsPunctuationWordParamTest : public SentenceBreakingUtilsParamTest {}; - -const UChar punc_word_test_cases[] = { - '(', '[', '<', '{', - 0x207D, // superscript left parenthesis - 0x208D, // subscript left parenthesis - 0x27E6, // mathematical left white square bracket - 0x27E8, // mathematical left angle bracket - 0x27EA, // mathematical left double angle bracket - 0x2983, // left white curly bracket - 0x2985, // left white parenthesis - 0x2987, // Z notation left image bracket - 0x2989, // Z notation left binding bracket - 0x298B, // left square bracket with underbar - 0x298D, // left square bracket with tick in top corner - 0x298F, // left square bracket with tick in bottom corner - 0x2991, // left angle bracket with dot - 0x2993, // left arc less-than bracket - 0x2995, // double left arc greater-than bracket - 0x2997, // left black tortoise shell bracket - 0x29D8, // left wiggly fence - 0x29DA, // left double wiggly fence - 0x29FC, // left-pointing curved angle bracket - 0x3008, // CJK left angle bracket - 0x300A, // CJK left double angle bracket - 0x3010, // CJK left black lenticular bracket - 0x3014, // CJK left tortoise shell bracket - 0x3016, // CJK left white lenticular bracket - 0x3018, // CJK left white tortoise shell bracket - 0x301A, // CJK left white square bracket - 0xFD3E, // Ornate left parenthesis - 0xFE59, // small left parenthesis - 0xFE5B, // small left curly bracket - 0xFF08, // fullwidth left parenthesis - 0xFF3B, // fullwidth left square bracket - 0xFF5B, // fullwidth left curly bracket - '"', '\'', '`', - 0xFF07, // fullwidth apostrophe - 0xFF02, // fullwidth quotation mark - 0x2018, // left single quotation mark (English, others) - 0x201C, // left double quotation mark (English, others) - 0x201B, // single high-reveresed-9 quotation mark (PropList.txt) - 0x201A, // single low-9 quotation mark (Czech, German, Slovak) - 0x201E, // double low-9 quotation mark (Czech, German, Slovak) - 0x201F, // double high-reversed-9 quotation mark (PropList.txt) - 0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.) - 0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.) - 0x2039, // single left-pointing angle quotation mark (French, others) - 0x00AB, // left-pointing double angle quotation mark (French, others) - 0x203A, // single right-pointing angle quotation mark (Slovenian, others) - 0x00BB, // right-pointing double angle quotation mark (Slovenian, others) - 0x300C, // left corner bracket (East Asian languages) - 0xFE41, // presentation form for vertical left corner bracket - 0xFF62, // halfwidth left corner bracket (East Asian languages) - 0x300E, // left white corner bracket (East Asian languages) - 0xFE43, // presentation form for vertical left white corner bracket - 0x301D, // reversed double prime quotation mark (East Asian langs, horiz.) - ')', ']', '>', '}', - 0x207E, // superscript right parenthesis - 0x208E, // subscript right parenthesis - 0x27E7, // mathematical right white square bracket - 0x27E9, // mathematical right angle bracket - 0x27EB, // mathematical right double angle bracket - 0x2984, // right white curly bracket - 0x2986, // right white parenthesis - 0x2988, // Z notation right image bracket - 0x298A, // Z notation right binding bracket - 0x298C, // right square bracket with underbar - 0x298E, // right square bracket with tick in top corner - 0x2990, // right square bracket with tick in bottom corner - 0x2992, // right angle bracket with dot - 0x2994, // right arc greater-than bracket - 0x2996, // double right arc less-than bracket - 0x2998, // right black tortoise shell bracket - 0x29D9, // right wiggly fence - 0x29DB, // right double wiggly fence - 0x29FD, // right-pointing curved angle bracket - 0x3009, // CJK right angle bracket - 0x300B, // CJK right double angle bracket - 0x3011, // CJK right black lenticular bracket - 0x3015, // CJK right tortoise shell bracket - 0x3017, // CJK right white lenticular bracket - 0x3019, // CJK right white tortoise shell bracket - 0x301B, // CJK right white square bracket - 0xFD3F, // Ornate right parenthesis - 0xFE5A, // small right parenthesis - 0xFE5C, // small right curly bracket - 0xFF09, // fullwidth right parenthesis - 0xFF3D, // fullwidth right square bracket - 0xFF5D, // fullwidth right curly bracket - '\'', '"', '`', - 0xFF07, // fullwidth apostrophe - 0xFF02, // fullwidth quotation mark - 0x2019, // right single quotation mark (English, others) - 0x201D, // right double quotation mark (English, others) - 0x2018, // left single quotation mark (Czech, German, Slovak) - 0x201C, // left double quotation mark (Czech, German, Slovak) - 0x203A, // single right-pointing angle quotation mark (French, others) - 0x00BB, // right-pointing double angle quotation mark (French, others) - 0x2039, // single left-pointing angle quotation mark (Slovenian, others) - 0x00AB, // left-pointing double angle quotation mark (Slovenian, others) - 0x300D, // right corner bracket (East Asian languages) - 0xfe42, // presentation form for vertical right corner bracket - 0xFF63, // halfwidth right corner bracket (East Asian languages) - 0x300F, // right white corner bracket (East Asian languages) - 0xfe44, // presentation form for vertical right white corner bracket - 0x301F, // low double prime quotation mark (East Asian languages) - 0x301E, // close double prime (East Asian languages written horizontally) - 0x00A1, // Spanish inverted exclamation mark - 0x00BF, // Spanish inverted question mark - '.', '!', '?', - 0x055C, // Armenian exclamation mark - 0x055E, // Armenian question mark - 0x0589, // Armenian full stop - 0x061F, // Arabic question mark - 0x06D4, // Arabic full stop - 0x0700, // Syriac end of paragraph - 0x0701, // Syriac supralinear full stop - 0x0702, // Syriac sublinear full stop - 0x0964, // Devanagari danda..Devanagari double danda - 0x0965, - 0x1362, // Ethiopic full stop - 0x1367, // Ethiopic question mark - 0x1368, // Ethiopic paragraph separator - 0x104A, // Myanmar sign little section - 0x104B, // Myanmar sign section - 0x166E, // Canadian syllabics full stop - 0x17d4, // Khmer sign khan - 0x1803, // Mongolian full stop - 0x1809, // Mongolian Manchu full stop - 0x1944, // Limbu exclamation mark - 0x1945, // Limbu question mark - 0x203C, // double exclamation mark - 0x203D, // interrobang - 0x2047, // double question mark - 0x2048, // question exclamation mark - 0x2049, // exclamation question mark - 0x3002, // ideographic full stop - 0x037E, // Greek question mark - 0xFE52, // small full stop - 0xFE56, // small question mark - 0xFE57, // small exclamation mark - 0xFF01, // fullwidth exclamation mark - 0xFF0E, // fullwidth full stop - 0xFF1F, // fullwidth question mark - 0xFF61, // halfwidth ideographic full stop - 0x2026, // ellipsis - 0x30fb, // Katakana middle dot - 0xff65, // halfwidth Katakana middle dot - 0x2040, // character tie - '-', '~', - 0x058a, // Armenian hyphen - 0x1806, // Mongolian todo soft hyphen - 0x2010, // hyphen..horizontal bar - 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, - 0x2053, // swung dash -- from Table 6-3 of Unicode book - 0x207b, // superscript minus - 0x208b, // subscript minus - 0x2212, // minus sign - 0x301c, // wave dash - 0x3030, // wavy dash - 0xfe31, // presentation form for vertical em dash..en dash - 0xfe32, - 0xfe58, // small em dash - 0xfe63, // small hyphen-minus - 0xff0d, // fullwidth hyphen-minus - ',', ':', ';', - 0x00b7, // middle dot - 0x0387, // Greek ano teleia - 0x05c3, // Hebrew punctuation sof pasuq - 0x060c, // Arabic comma - 0x061b, // Arabic semicolon - 0x066b, // Arabic decimal separator - 0x066c, // Arabic thousands separator - 0x0703, // Syriac contraction and others - 0x0704, 0x0705, 0x0706, 0x0707, 0x0708, 0x0709, 0x70a, - 0x070c, // Syric harklean metobelus - 0x0e5a, // Thai character angkhankhu - 0x0e5b, // Thai character khomut - 0x0f08, // Tibetan mark sbrul shad - 0x0f0d, // Tibetan mark shad..Tibetan mark rgya gram shad - 0x0f0e, 0x0f0f, 0x0f10, 0x0f11, 0x0f12, - 0x1361, // Ethiopic wordspace - 0x1363, // other Ethiopic chars - 0x1364, 0x1365, 0x1366, - 0x166d, // Canadian syllabics chi sign - 0x16eb, // Runic single punctuation..Runic cross punctuation - 0x16ed, - 0x17d5, // Khmer sign camnuc pii huuh and other - 0x17d6, - 0x17da, // Khmer sign koomut - 0x1802, // Mongolian comma - 0x1804, // Mongolian four dots and other - 0x1805, - 0x1808, // Mongolian manchu comma - 0x3001, // ideographic comma - 0xfe50, // small comma and others - 0xfe51, - 0xfe54, // small semicolon and other - 0xfe55, - 0xff0c, // fullwidth comma - 0xff0e, // fullwidth stop..fullwidth solidus - 0xff0f, - 0xff1a, // fullwidth colon..fullwidth semicolon - 0xff1b, - 0xff64, // halfwidth ideographic comma - 0x2016, // double vertical line - 0x2032, 0x2033, - 0x2034, // prime..triple prime - 0xfe61, // small asterisk - 0xfe68, // small reverse solidus - 0xff3c, // fullwidth reverse solidus -}; - -TEST_P(IsPunctuationWordParamTest, IsPunctuation) { - UnicodeUtil util(converter_); - std::string test_string = StringFromUnicodeChar(GetParam()); - bool result = false; - EXPECT_TRUE(util.IsPunctuationWord(test_string, &result).ok()); - EXPECT_TRUE(result); -} - -INSTANTIATE_TEST_SUITE_P(IsPuncWordParamTest, IsPunctuationWordParamTest, - ::testing::ValuesIn(punc_word_test_cases)); - -class IsEllipsisTest : public SentenceBreakingUtilsTest, - public ::testing::Test { - protected: - void SetUp() override { - converter_ = SentenceBreakingUtilsTest::GetUConverter(); - } - - void TearDown() override { ucnv_close(converter_); } - - UConverter* converter_; -}; - -TEST_F(IsEllipsisTest, IsEllipsis) { - UnicodeUtil util(converter_); - bool result = false; - EXPECT_TRUE(util.IsEllipsis("...", &result).ok()); - EXPECT_TRUE(result); - - EXPECT_TRUE(util.IsEllipsis("…", &result).ok()); - EXPECT_TRUE(result); - - EXPECT_TRUE(util.IsEllipsis("@", &result).ok()); - EXPECT_FALSE(result); -} - -} // namespace -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/sentence_fragmenter.cc b/tensorflow_text/core/kernels/sentence_fragmenter.cc deleted file mode 100644 index c336b5cfa..000000000 --- a/tensorflow_text/core/kernels/sentence_fragmenter.cc +++ /dev/null @@ -1,443 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/sentence_fragmenter.h" -#include -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow_text/core/kernels/sentence_breaking_utils.h" - -using ::tensorflow::Status; - -namespace tensorflow { -namespace text { -namespace { - -// Sets a property of a sentence fragment. -void SetFragmentProperty(SentenceFragment::Property property, - SentenceFragment *fragment) { - fragment->properties = fragment->properties | property; -} - -// Returns true iff a token has any of the given properties. -bool TokenHasProperty(uint32 properties, const Token &token) { - return token.text_properties() & properties; -} - -// Returns true iff a token has the ACRONYM text property and token.word() -// ends with a period. -bool IsPeriodSeparatedAcronym(const Token &token) { - return TokenHasProperty(Token::ACRONYM, token) && - (!token.word().empty() && token.word().back() == '.'); -} - -// Returns true iff the token can appear after a space in a sentence-terminal -// token sequence. -absl::Status SpaceAllowedBeforeToken(const UnicodeUtil *util, - const Token &token, bool *result) { - const tstring &word = token.word(); - bool is_ellipsis = false; - TF_RETURN_IF_ERROR(util->IsEllipsis(word, &is_ellipsis)); - - bool is_terminal_punc = false; - TF_RETURN_IF_ERROR(util->IsTerminalPunc(word, &is_terminal_punc)); - - bool is_close_paren = false; - TF_RETURN_IF_ERROR(util->IsCloseParen(word, &is_close_paren)); - - *result = (TokenHasProperty(Token::EMOTICON, token) || - (is_ellipsis || is_terminal_punc || is_close_paren)); - return absl::OkStatus(); -} -} // namespace - -class SentenceFragmenter::FragmentBoundaryMatch { - public: - FragmentBoundaryMatch() { - Reset(); - } - - // Goes to initial state. - void Reset() { - state_ = INITIAL_STATE; - first_terminal_punc_index_ = -1; - first_close_punc_index_ = -1; - limit_index_ = -1; - } - - // Follows the state transition for the token at the given index. Returns - // true for success, or false if there was no valid transition. - absl::Status Advance(const UnicodeUtil *util, const Document &document, - int index, bool *result) { - const Token &token = document.tokens()[index]; - const tstring &word = token.word(); - bool no_transition = false; - - bool is_terminal_punc = false; - TF_RETURN_IF_ERROR(util->IsTerminalPunc(word, &is_terminal_punc)); - - bool is_ellipsis = false; - TF_RETURN_IF_ERROR(util->IsEllipsis(word, &is_ellipsis)); - - bool is_close_punc = false; - TF_RETURN_IF_ERROR(util->IsClosePunc(word, &is_close_punc)); - - switch (state_) { - case INITIAL_STATE: - if (is_terminal_punc || is_ellipsis || - IsPeriodSeparatedAcronym(token) || - TokenHasProperty(Token::EMOTICON, token)) { - first_terminal_punc_index_ = index; - state_ = COLLECTING_TERMINAL_PUNC; - } - break; - case COLLECTING_TERMINAL_PUNC: - - if (is_terminal_punc || is_ellipsis || - TokenHasProperty(Token::EMOTICON, token)) { - // Stay in COLLECTING_TERMINAL_PUNC state. - } else if (is_close_punc) { - first_close_punc_index_ = index; - state_ = COLLECTING_CLOSE_PUNC; - } else { - no_transition = true; - } - break; - case COLLECTING_CLOSE_PUNC: - if (is_close_punc || is_ellipsis || - TokenHasProperty(Token::EMOTICON, token)) { - // Stay in COLLECTING_CLOSE_PUNC state. We effectively ignore - // emoticons and ellipses and continue to accept closing punctuation - // after them. - } else { - no_transition = true; - } - break; - } - - if (no_transition) { - *result = false; - return absl::OkStatus(); - } else { - limit_index_ = index + 1; - if (state_ == COLLECTING_TERMINAL_PUNC) { - // We've gotten terminal punctuation, but no close punctuation yet. - first_close_punc_index_ = limit_index_; - } - *result = true; - return absl::OkStatus(); - } - } - - // Returns true iff we have matched at least one terminal punctuation - // character. - bool GotTerminalPunc() const { - return first_terminal_punc_index_ >= 0; - } - - // Field accessors. - int first_terminal_punc_index() const { - return first_terminal_punc_index_; - } - int first_close_punc_index() const { - return first_close_punc_index_; - } - int limit_index() const { - return limit_index_; - } - - private: - // Match state. - enum MatchState { - INITIAL_STATE = 0, - COLLECTING_TERMINAL_PUNC, - COLLECTING_CLOSE_PUNC - }; - MatchState state_ = INITIAL_STATE; - - // First terminal punctuation mark matched; may be an acronym. - // -1 for not found. - int first_terminal_punc_index_ = -1; - - // First closing punctuation mark matched. -1 for not found. - int first_close_punc_index_ = -1; - - // First token after the terminal sequence. - int limit_index_ = -1; -}; - -absl::Status SentenceFragmenter::FindFragments( - std::vector *result) { - // Partition tokens into sentence fragments. - for (int i_start = 0; i_start < document_->tokens().size();) { - SentenceFragment fragment; - - // Match regexp for fragment boundary. - FragmentBoundaryMatch match; - TF_RETURN_IF_ERROR(FindNextFragmentBoundary(i_start, &match)); - - // Update 'latest_open_paren_is_sentential_' for the tokens in this - // fragment. - TF_RETURN_IF_ERROR( - UpdateLatestOpenParenForFragment(i_start, match.limit_index())); - - // Add a new sentence fragment up to this boundary. - TF_RETURN_IF_ERROR(FillInFragmentFields(i_start, match, &fragment)); - - result->push_back(std::move(fragment)); - i_start = match.limit_index(); - } - return absl::OkStatus(); -} - -// This method is essentially a control layer on top of a simple state machine -// that matches an end-of-fragment regexp. This method finds the next token to -// feed to the state machine, and handles embedded whitespace. The main -// complexity is that a space may delimit end-of-match, or be embedded in the -// termination sequence. When we encounter a space, we record the match found so -// far, but also continue matching. We return the longer match if it succeeds, -// else fall back to the earlier one. Note that the lookahead can incur at most -// 2n cost. -// -// E.g., suppose we're given: x? !!!y. We encounter the space after "x?" and -// have to look ahead all the way to "y" before realizing that the longer match -// fails. We put a fragment boundary after "x?", and next time around, we again -// scan "!!!" looking for a fragment boundary. Since we failed to find one last -// time, we'll fail again this time and therefore continue past "y" to find the -// next boundary. We will not try to scan "!!!" a third time. -absl::Status SentenceFragmenter::FindNextFragmentBoundary( - int i_start, SentenceFragmenter::FragmentBoundaryMatch *result) const { - FragmentBoundaryMatch current_match; - FragmentBoundaryMatch previous_match; - - for (int i = i_start; i < static_cast(document_->tokens().size()); ++i) { - const auto &token = document_->tokens()[i]; - if (current_match.GotTerminalPunc() && i > i_start && - token.break_level() >= Token::SPACE_BREAK) { - // Got terminal punctuation and a space delimiter, so match is valid. - bool space_allowed_before_token = false; - TF_RETURN_IF_ERROR( - SpaceAllowedBeforeToken(util_, token, &space_allowed_before_token)); - if (space_allowed_before_token) { - // Remember this match. Try to extend it. - previous_match = current_match; - } else { - // Stop here. We're not allowed to extend the match in this case. - break; - } - } - bool got_transition = false; - TF_RETURN_IF_ERROR( - current_match.Advance(util_, *document_, i, &got_transition)); - if (!got_transition) { - if (previous_match.GotTerminalPunc()) { - // Extension failed. Return previous match. - *result = previous_match; - return absl::OkStatus(); - } else { - // Start matching again from scratch. - current_match.Reset(); - - // Reprocess current token since it might be terminal punctuation. No - // infinite loop, because can't be "no transition" from INITIAL_STATE. - --i; - } - } - } - *result = current_match; - return absl::OkStatus(); -} - -// Keep track of whether the latest open parenthesis seen so far appears to be -// sentence-initial. This is useful because if it is *non-sentence-initial*, -// then any terminal punctuation before the corresponding close paren is -// probably not a sentence boundary. Example: -// -// Mushrooms (they're fungi!!) are delicious. -// (Mushrooms are fungi!!) -// -// In the first case, the open paren is non-sentence-initial, and therefore -// the "!!)" is not a sentence boundary. In the second case, the open paren *is* -// sentence-initial, and so the "!!)" is a sentence boundary. -// -// Of course, we don't know true sentence boundaries, so we make the -// approximation that an open paren is sentence-initial iff it is -// fragment-initial. This will be wrong if the open paren occurs after terminal -// punctuation that turns out not to be a sentence boundary, e.g., -// "Yahoo! (known for search, etc.) blah", but this is not expected to happen -// often. -absl::Status SentenceFragmenter::UpdateLatestOpenParenForFragment(int i_start, - int i_end) { - for (int i = i_end; i > i_start; --i) { - const auto &token = document_->tokens()[i - 1]; - bool is_open_paren = false; - TF_RETURN_IF_ERROR(util_->IsOpenParen(token.word(), &is_open_paren)); - if (is_open_paren) { - // Make the approximation that this open paren is sentence-initial iff it - // is fragment-initial. - latest_open_paren_is_sentential_ = (i - 1 == i_start); - break; - } - } - - return absl::OkStatus(); -} - -absl::Status SentenceFragmenter::FillInFragmentFields( - int i_start, const FragmentBoundaryMatch &match, - SentenceFragment *fragment) const { - // Set the fragment's boundaries. - fragment->start = i_start; - fragment->limit = match.limit_index(); - - // Set the fragment's properties. - if (match.GotTerminalPunc()) { - // TERMINAL_PUNC. - SetFragmentProperty(SentenceFragment::TERMINAL_PUNC, fragment); - int terminal_punc_index = -1; - TF_RETURN_IF_ERROR( - GetAdjustedFirstTerminalPuncIndex(match, &terminal_punc_index)); - bool has_unattachable_terminal_punc = false; - TF_RETURN_IF_ERROR( - HasUnattachableTerminalPunc(match, &has_unattachable_terminal_punc)); - bool has_close_paren = false; - TF_RETURN_IF_ERROR(HasCloseParen(match, &has_close_paren)); - - fragment->terminal_punc_token = terminal_punc_index; - // MULTIPLE_TERMINAL_PUNC. - if (has_unattachable_terminal_punc) { - SetFragmentProperty(SentenceFragment::MULTIPLE_TERMINAL_PUNC, fragment); - } - - // HAS_CLOSE_PAREN & HAS_SENTENTIAL_CLOSE_PAREN. - if (has_close_paren) { - SetFragmentProperty(SentenceFragment::HAS_CLOSE_PAREN, fragment); - - if (latest_open_paren_is_sentential_) { - SetFragmentProperty(SentenceFragment::HAS_SENTENTIAL_CLOSE_PAREN, - fragment); - } - } - } - - return absl::OkStatus(); -} - -// The standard first terminal punctuation index is just -// match.first_terminal_punc_index(). But if there is an ambiguous terminal -// punctuation mark (ellipsis) followed by an unambiguous one (.!?), then we -// treat the ellipsis as part of the sentence, and return the index of the first -// unambiguous punctuation mark after it. Example: -// -// He agreed...! -// -// We treat "!" as the first terminal punctuation mark; the ellipsis acts as -// left context. -absl::Status SentenceFragmenter::GetAdjustedFirstTerminalPuncIndex( - const FragmentBoundaryMatch &match, int *result) const { - // Get terminal punctuation span. - int i1 = match.first_terminal_punc_index(); - if (i1 < 0) { - *result = i1; - return absl::OkStatus(); - } - int i2 = match.first_close_punc_index(); - - for (int i = i2; i > i1; --i) { - const auto &token = document_->tokens()[i - 1]; - bool is_ellipsis = false; - TF_RETURN_IF_ERROR(util_->IsEllipsis(token.word(), &is_ellipsis)); - if (is_ellipsis || TokenHasProperty(Token::EMOTICON, token)) { - if (i == i2) { - // Ellipsis is last terminal punctuation mark. No adjustment. - *result = i1; - return absl::OkStatus(); - } else { - // Ellipsis is not the last terminal punctuation mark. Return the index - // of the terminal punctuation mark after it. - *result = i; // current token = i - 1 - return absl::OkStatus(); - } - } - } - - // No ellipsis. - *result = i1; - return absl::OkStatus(); -} - -// Example of an an "unattachable" terminal punctuation mark: -// -// He agreed!? -// -// The "?" is "unattachable" in that it can't be part of the word "agreed" -// because of the intervening "!", and therefore strongly suggests this is a -// true sentence boundary. The terminal punctuation mark must be unambiguous -// (.!?), as ambiguous ones (ellipsis/emoticon) do not necessarily imply a -// sentence boundary. -absl::Status SentenceFragmenter::HasUnattachableTerminalPunc( - const FragmentBoundaryMatch &match, bool *result) const { - *result = false; - // Get terminal punctuation span. - int i1 = match.first_terminal_punc_index(); - if (i1 < 0) { - *result = false; - return absl::OkStatus(); - } - int i2 = match.first_close_punc_index(); - - // Iterate over the second and later punctuation marks. - for (int i = i1 + 1; i < i2; ++i) { - const auto &token = document_->tokens()[i]; - bool is_punctuation = false; - TF_RETURN_IF_ERROR(util_->IsPunctuationWord(token.word(), &is_punctuation)); - bool is_ellipsis = false; - TF_RETURN_IF_ERROR(util_->IsEllipsis(token.word(), &is_ellipsis)); - if (is_punctuation && !is_ellipsis && - !TokenHasProperty(Token::EMOTICON, token)) { - // Found an unattachable, unambiguous terminal punctuation mark. - *result = true; - return absl::OkStatus(); - } - } - - *result = false; - return absl::OkStatus(); -} - -absl::Status SentenceFragmenter::HasCloseParen( - const FragmentBoundaryMatch &match, bool *result) const { - *result = false; - // Get close punctuation span. - int i1 = match.first_close_punc_index(); - if (i1 < 0) { - *result = false; - return absl::OkStatus(); - } - int i2 = match.limit_index(); - - for (int i = i1; i < i2; ++i) { - const auto &token = document_->tokens()[i]; - bool is_close_paren = false; - TF_RETURN_IF_ERROR(util_->IsCloseParen(token.word(), &is_close_paren)); - if (is_close_paren) { - *result = true; - return absl::OkStatus(); - } - } - *result = false; - return absl::OkStatus(); -} - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/sentence_fragmenter.h b/tensorflow_text/core/kernels/sentence_fragmenter.h index 25f1038e1..c30f8ad1a 100644 --- a/tensorflow_text/core/kernels/sentence_fragmenter.h +++ b/tensorflow_text/core/kernels/sentence_fragmenter.h @@ -12,213 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -// A class to split up a document into sentence fragments. A sentence -// fragment is a token sequence whose end is potentially an end-of-sentence. -// -// Example: -// -// Document text: -// John said, "I.B.M. went up 5 points today." -// -// SentenceFragments: -// (1) John said, "I.B.M. -// (2) went up 5 points today." -// -// Fragment boundaries are induced by punctuation and paragraph breaks. - -#ifndef TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_FRAGMENTER_H_ -#define TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_FRAGMENTER_H_ - -#include -#include - -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow_text/core/kernels/sentence_breaking_utils.h" - -namespace tensorflow { -namespace text { - -class Token { - public: - enum BreakLevel { - NO_BREAK = 0, // No separation between tokens. - SPACE_BREAK = 1, // Tokens separated by space. - LINE_BREAK = 2, // Tokens separated by line break. - SENTENCE_BREAK = 3, // Tokens separated by sentence break. - PARAGRAPH_BREAK = 4, // Tokens separated by paragraph break. - SECTION_BREAK = 10, // Tokens separated by section break. - CHAPTER_BREAK = 20, // Tokens separated by chapter break. - }; - - // Bitmask for properties of the token text. - enum TextProperty { - NONE = 0x00, - - // Token is ill-formed if: - // - // All tokens in a paragraph are marked as ill-formed if it has too few - // non-punctuation tokens in a paragraph (currently, a heading must have - // at least 2 tokens, and a non-heading must have at least 8). - // - // All tokens in a paragraph are marked as ill-formed if it lacks terminal - // sentence ending punctuation(e.g.: . ! ? …) or an emoticon (e.g.: ':)', - // ':D'). - // Exception: If a paragraph ends in an introductory punctuation - // character (','':' ';'), we say that it is an introductory paragraph. - // If it is followed by a "simple" HTML list (one whose list items have - // no substructure, such as embedded tables), then we keep both the - // introductory paragraph and the entire list. If not, we keep the - // introductory paragraph if it is followed by a well-formed paragraph. - // - // All tokens in a paragraph are marked as ill-formed if it contains the - // copyright sign (C in a circle) as this usually indicates a copyright - // notice, and is therefore effectively boilerplate. - ILL_FORMED = 0x01, - - // Indicates that the token is a part of the page title ( tag) or - // a heading (<hN> tag). - TITLE = 0x40, - HEADING = 0x02, - - // Text style. Determined from HTML tags only (<b>, etc), not from CSS. - BOLD = 0x04, - ITALIC = 0x08, - UNDERLINED = 0x10, - - // Indicates that the token is a part of a list. Currently set only for - // "simple" HTML lists (have no embedded paragraph boundaries) that are - // preceded by an introductory paragraph (ends in colon or a few other - // characters). - LIST = 0x20, - - // Token is an emoticon. - EMOTICON = 0x80, - - // Token was identified by Lexer as an acronym. Lexer identifies period-, - // hyphen-, and space-separated acronyms: "U.S.", "U-S", and "U S". - // Lexer normalizes all three to "US", but the token.word field - // normalizes only space-separated acronyms. - ACRONYM = 0x100, - - // Indicates that the token (or part of the token) is a covered by at - // least one hyperlink. More information of the hyperlink is stored in the - // first token covered by the hyperlink. - HYPERLINK = 0x200, - }; - - Token(const tstring &word, uint32 start, uint32 end, BreakLevel break_level, - TextProperty text_properties) - : word_(word), - start_(start), - end_(end), - break_level_(break_level), - text_properties_(text_properties) {} - - const tstring &word() const { return word_; } - const uint32 start() const { return start_; } - const uint32 end() const { return end_; } - const BreakLevel break_level() const { return break_level_; } - const TextProperty text_properties() const { return text_properties_; } - - private: - const tstring &word_; - uint32 start_; - uint32 end_; - BreakLevel break_level_; - TextProperty text_properties_; -}; - -class Document { - public: - // Does NOT take ownership of 'tokens'. - Document(std::vector<Token> *tokens) : tokens_(tokens) {} - - void AddToken(const tstring &word, uint32 start, uint32 end, - Token::BreakLevel break_level, - Token::TextProperty text_properties) { - tokens_->emplace_back(word, start, end, break_level, text_properties); - } - - const std::vector<Token> &tokens() const { return *tokens_; } - - private: - // not owned - std::vector<Token> *tokens_; -}; - -struct SentenceFragment { - int start; - int limit; - - enum Property { - TERMINAL_PUNC = 0x0001, // ends with terminal punctuation - MULTIPLE_TERMINAL_PUNC = 0x0002, // e.g.: She said what?! - HAS_CLOSE_PAREN = 0x0004, // e.g.: Mushrooms (they're fungi!!) - HAS_SENTENTIAL_CLOSE_PAREN = 0x0008, // e.g.: (Mushrooms are fungi!) - }; - // A mask of the above listed properties. - uint32 properties = 0; - int terminal_punc_token = -1; -}; - -// Utility class for splitting documents into a list of sentence fragments. -class SentenceFragmenter { - public: - // Constructs a fragmenter to process a specific part of a document. - SentenceFragmenter(const Document *document, UnicodeUtil *util) - : document_(document), util_(util) {} - - // Finds sentence fragments in the [start_, limit_) range of the associated - // document. - absl::Status FindFragments(std::vector<SentenceFragment> *result); - - private: - // State for matching a fragment-boundary regexp against a token sequence. - // The regexp is: terminal_punc+ close_punc*. - class FragmentBoundaryMatch; - - // Matches a fragment-boundary regexp against the tokens starting at - // 'i_start'. Returns the longest match found; will be non-empty as long as - // 'i_start' was not already at the end of the associated token range. - absl::Status FindNextFragmentBoundary(int i_start, - FragmentBoundaryMatch *result) const; - - // Updates 'latest_open_paren_is_sentential_' for the tokens in the given - // fragment. - absl::Status UpdateLatestOpenParenForFragment(int i_start, int i_end); - - // Populates a sentence fragment with the tokens from 'i_start' to the end - // of the given FragmentBoundaryMatch. - absl::Status FillInFragmentFields(int i_start, - const FragmentBoundaryMatch &match, - SentenceFragment *fragment) const; - - // Returns the adjusted first terminal punctuation index in a - // FragmentBoundaryMatch. - absl::Status GetAdjustedFirstTerminalPuncIndex( - const FragmentBoundaryMatch &match, int *result) const; - - // Returns true iff a FragmentBoundaryMatch has an "unattachable" terminal - // punctuation mark. - absl::Status HasUnattachableTerminalPunc(const FragmentBoundaryMatch &match, - bool *result) const; - - // Returns true iff a FragmentBoundaryMatch has a close paren in its closing - // punctuation. - absl::Status HasCloseParen(const FragmentBoundaryMatch &match, - bool *result) const; - - // Whether the latest open paren seen so far appears to be sentence-initial. - // See UpdateLatestOpenParenForFragment() in the .cc file for details. - bool latest_open_paren_is_sentential_ = false; - - const Document *document_ = nullptr; // not owned - UnicodeUtil *util_ = nullptr; // not owned - - // TODO(thuang513): DISALLOW_COPY_AND_ASSIGN(SentenceFragmenter); -}; +#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_FRAGMENTER_H_ +#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_FRAGMENTER_H_ -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/sentence_fragmenter.h" -#endif // TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_FRAGMENTER_H_ +#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_FRAGMENTER_H_ diff --git a/tensorflow_text/core/kernels/sentence_fragmenter_v2.cc b/tensorflow_text/core/kernels/sentence_fragmenter_v2.cc deleted file mode 100644 index d917106c2..000000000 --- a/tensorflow_text/core/kernels/sentence_fragmenter_v2.cc +++ /dev/null @@ -1,706 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/sentence_fragmenter_v2.h" - -#include <string> - -#include "absl/status/status.h" -#include "absl/strings/match.h" -#include "absl/strings/string_view.h" -#include "icu4c/source/common/unicode/uchar.h" -#include "icu4c/source/common/unicode/utf8.h" -#include "tensorflow/core/lib/core/status.h" - -namespace tensorflow { -namespace text { - -void ConsumeOneUChar(const absl::string_view& input, UChar32* result, - int* offset) { - const char* source = input.data(); - - int input_length = input.length(); - U8_NEXT_OR_FFFD(source, *offset, input_length, *result); -} - -bool IsTerminalPunc(const absl::string_view& input, int* offset) { - *offset = 0; - bool is_ellipsis = IsEllipsis(input, offset); - if (is_ellipsis) return true; - - *offset = 0; - UChar32 char_value; - ConsumeOneUChar(input, &char_value, offset); - - // These are unicode characters that should be considered in this category but - // are not covered by any of the ICU properties. - switch (char_value) { - case 0x055C: // Armenian exclamation mark - case 0x055E: // Armenian question mark - case 0x17d4: // Khmer sign khan - case 0x037E: // Greek question mark - case 0x2026: // ellipsis - return true; - } - - USentenceBreak sb_property = static_cast<USentenceBreak>( - u_getIntPropertyValue(char_value, UCHAR_SENTENCE_BREAK)); - return sb_property == U_SB_ATERM || sb_property == U_SB_STERM; -} - -bool IsClosePunc(const absl::string_view& input, int* offset) { - *offset = 0; - - if (absl::StartsWith(input, "''")) { - *offset += absl::string_view("''").length(); - return true; - } - - UChar32 char_value; - ConsumeOneUChar(input, &char_value, offset); - - // These are unicode characters that should be considered in this category but - // are not covered by any of the ICU properties. - switch (char_value) { - case '>': - case ']': - case '`': - case 64831: // Ornate right parenthesis - case 65282: // fullwidth quotation mark - case 65287: // fullwidth apostrophe - return true; - } - - ULineBreak lb_property = static_cast<ULineBreak>( - u_getIntPropertyValue(char_value, UCHAR_LINE_BREAK)); - - return lb_property == U_LB_CLOSE_PUNCTUATION || - lb_property == U_LB_CLOSE_PARENTHESIS || lb_property == U_LB_QUOTATION; -} - -bool IsOpenParen(const absl::string_view& input) { - int offset = 0; - UChar32 char_value; - ConsumeOneUChar(input, &char_value, &offset); - - // These are unicode characters that should be considered in this category but - // are not covered by any of the ICU properties. - switch (char_value) { - case '<': - case 64830: // Ornate left parenthesis - return true; - } - - ULineBreak lb_property = static_cast<ULineBreak>( - u_getIntPropertyValue(char_value, UCHAR_LINE_BREAK)); - return lb_property == U_LB_OPEN_PUNCTUATION; -} - -bool IsCloseParen(const absl::string_view& input) { - int offset = 0; - - UChar32 char_value; - ConsumeOneUChar(input, &char_value, &offset); - - // These are unicode characters that should be considered in this category but - // are not covered by any of the ICU properties. - switch (char_value) { - case '>': - case 64831: // Ornate right parenthesis - return true; - } - - ULineBreak lb_property = static_cast<ULineBreak>( - u_getIntPropertyValue(char_value, UCHAR_LINE_BREAK)); - return lb_property == U_LB_CLOSE_PUNCTUATION || - lb_property == U_LB_CLOSE_PARENTHESIS; -} - -bool IsPunctuationWord(const absl::string_view& input) { - int offset = 0; - UChar32 char_value; - ConsumeOneUChar(input, &char_value, &offset); - - // These are unicode characters that should be considered in this category but - // are not covered by any of the ICU properties. - switch (char_value) { - case '`': - case '<': - case '>': - case '~': - case 5741: - return true; - } - - return u_ispunct(char_value) || u_hasBinaryProperty(char_value, UCHAR_DASH) || - u_hasBinaryProperty(char_value, UCHAR_HYPHEN); -} - -bool IsEllipsis(const absl::string_view& input, int* offset) { - *offset = 0; - if (absl::StartsWith(input, "...")) { - *offset += absl::string_view("...").length(); - return true; - } - - const UChar32 kEllipsisCharValue = 0x2026; - UChar32 char_value; - ConsumeOneUChar(input, &char_value, offset); - - return char_value == kEllipsisCharValue; -} - -inline bool IsAcronymComponent(const absl::string_view& input, int index) { - return (input.data()[index] >= 'A' && input.data()[index] <= 'Z') && - input.data()[index + 1] == '.'; -} - -bool IsPeriodSeparatedAcronym(const absl::string_view& input, int* offset) { - bool result = false; - - for (int i = 0; i < static_cast<int>(input.length()) - 1; i += 2) { - if (IsAcronymComponent(input, i)) { - *offset = i + 2; - if (*offset > 2) { - result = true; - } - } else { - break; - } - } - return result; -} - -bool IsEmoticon(const absl::string_view& input, int* offset) { - *offset = 0; - static std::vector<std::string> emoticon_list = {":(:)", - ":)", - ":(", - ":o)", - ":]", - ":3", - ":>", - "=]", - "=)", - ":}", - ":^)", - ":-D", - ":-)))))", - ":-))))", - ":-)))", - ":-))", - ":-)", - ">:[", - ":-(", - ":(", - ":-c", - ":c", - ":-<", - ":<", - ":-[", - ":[", - ":{", - ";(", - ":-||", - ":@", - ">:(", - ":'-(", - ":'(", - ":'-)", - ":')", - "D:<", - ">:O", - ":-O", - ":-o", - ":*", - ":-*", - ":^*", - ";-)", - ";)", - "*-)", - "*)", - ";-]", - ";]", - ";^)", - ":-,", - ">:P", - ":-P", - ":p", - "=p", - ":-p", - "=p", - ":P", - "=P", - ";p", - ";-p", - ";P", - ";-P", - ">:\\", - ">:/", - ":-/", - ":-.", - ":/", - ":\\", - "=/", - "=\\", - ":|", - ":-|", - ":$", - ":-#", - ":#", - "O:-)", - "0:-)", - "0:)", - "0;^)", - ">:)", - ">;)", - ">:-)", - "}:-)", - "}:)", - "3:-)", - ">_>^", - "^<_<", - "|;-)", - "|-O", - ":-J", - ":-&", - ":&", - "#-)", - "<3", - "8-)", - "^_^", - ":D", - ":-D", - "=D", - "^_^;;", - "O=)", - "}=)", - "B)", - "B-)", - "=|", - "-_-", - "o_o;", - "u_u", - ":-\\", - ":s", - ":S", - ":-s", - ":-S", - ";*", - ";-*" - "=(", - ">.<", - ">:-(", - ">:(", - ">=(", - ";_;", - "T_T", - "='(", - ">_<", - "D:", - ":o", - ":-o", - "=o", - "o.o", - ":O", - ":-O", - "=O", - "O.O", - "x_x", - "X-(", - "X(", - "X-o", - "X-O", - ":X)", - "(=^.^=)", - "(=^..^=)", - "=^_^=", - "-<@%", - ":(|)", - "(]:{", - "<\\3", - "~@~", - "8'(", - "XD", - "DX"}; - - for (int i = 0; i < static_cast<int>(emoticon_list.size()); ++i) { - if (absl::StartsWith(input, emoticon_list[i])) { - *offset = emoticon_list[i].length(); - return true; - } - } - return false; -} - -// Returns true iff the punctuation input can appear after a space in a -// sentence-terminal punctuation sequence. -bool SpaceAllowedBeforeChar(const absl::string_view& input) { - int offset = 0; - bool is_terminal_punc = IsTerminalPunc(input, &offset); - bool is_close_paren = IsCloseParen(input); - bool is_emoticon = IsEmoticon(input, &offset); - return is_terminal_punc || is_close_paren || is_emoticon; -} - -bool IsWhiteSpace(const absl::string_view& input) { - int offset = 0; - - if (absl::StartsWith(input, " ")) { - return true; - } else if (absl::StartsWith(input, "\n")) { - return true; - } else if (absl::StartsWith(input, " ")) { - return true; - } - - UChar32 char_value; - ConsumeOneUChar(input, &char_value, &offset); - - return u_isUWhiteSpace(char_value); -} - -// Follows the state transition for the slice at the given index. Returns true -// for success, or false if there was no valid transition. -bool FragmentBoundaryMatch::Advance(int index, absl::string_view slice) { - int temp_offset; - // By defualt offset is the next character. - int offset = 1; - bool no_transition = false; - bool is_terminal_punc = IsTerminalPunc(slice, &temp_offset); - if (is_terminal_punc) { - offset = temp_offset; - } - - bool is_ellipsis = IsEllipsis(slice, &temp_offset); - if (is_ellipsis) { - offset = temp_offset; - } - bool is_close_punc = IsClosePunc(slice, &temp_offset); - if (is_close_punc) { - offset = temp_offset; - } - bool is_acronym = IsPeriodSeparatedAcronym(slice, &temp_offset); - if (is_acronym) { - is_terminal_punc = false; - offset = temp_offset; - } - bool is_emoticon = IsEmoticon(slice, &temp_offset); - if (is_emoticon) { - is_terminal_punc = false; - offset = temp_offset; - } - - switch (state_) { - case INITIAL_STATE: - if (is_terminal_punc || is_acronym || is_emoticon) { - first_terminal_punc_index_ = index; - state_ = COLLECTING_TERMINAL_PUNC; - } - break; - case COLLECTING_TERMINAL_PUNC: - if (is_terminal_punc || is_emoticon) { - // Stay in COLLECTING_TERMINAL_PUNC state. - } else if (is_close_punc) { - first_close_punc_index_ = index; - state_ = COLLECTING_CLOSE_PUNC; - } else { - no_transition = true; - } - break; - case COLLECTING_CLOSE_PUNC: - if (is_close_punc || is_ellipsis || is_emoticon) { - // Stay in COLLECTING_CLOSE_PUNC state. We effectively ignore - // emoticons and ellipses and continue to accept closing punctuation - // after them. - } else { - no_transition = true; - } - break; - } - - if (no_transition) { - return false; - } else { - limit_index_ = index + offset; - if (state_ == COLLECTING_TERMINAL_PUNC) { - // We've gotten terminal punctuation, but no close punctuation yet. - first_close_punc_index_ = limit_index_; - } - return true; - } -} - -// Sets a property of a sentence fragment. -void SetFragmentProperty(SentenceFragment::Property property, - SentenceFragment* fragment) { - fragment->properties = fragment->properties | property; -} - -absl::Status SentenceFragmenterV2::FindFragments( - std::vector<SentenceFragment>* result) { - // Partition document into sentence fragments. - for (int i_start = 0; i_start < static_cast<int>(document_.size());) { - bool is_white_space = IsWhiteSpace(document_.substr(i_start)); - if (is_white_space) { - ++i_start; - continue; - } - - SentenceFragment fragment; - - // Match regexp for fragment boundary. - FragmentBoundaryMatch match = FindNextFragmentBoundary(i_start); - - // Update 'latest_open_paren_is_sentential_' for this fragment. - UpdateLatestOpenParenForFragment(i_start, match.limit_index()); - - // Add a new sentence fragment up to this boundary. - FillInFragmentFields(i_start, match, &fragment); - - result->push_back(std::move(fragment)); - i_start = match.limit_index(); - } - return absl::OkStatus(); -} - -// This method is essentially a control layer on top of a simple state machine -// that matches an end-of-fragment regexp. This method finds the next slice of -// text to feed to the state machine, and handles embedded whitespace. The main -// complexity is that a space may delimit end-of-match, or be embedded in the -// termination sequence. When we encounter a space, we record the match found so -// far, but also continue matching. We return the longer match if it succeeds, -// else fall back to the earlier one. Note that the lookahead can incur at most -// 2n cost. -// -// E.g., suppose we're given: x? !!!y. We encounter the space after "x?" and -// have to look ahead all the way to "y" before realizing that the longer match -// fails. We put a fragment boundary after "x?", and next time around, we again -// scan "!!!" looking for a fragment boundary. Since we failed to find one last -// time, we'll fail again this time and therefore continue past "y" to find the -// next boundary. We will not try to scan "!!!" a third time. - -FragmentBoundaryMatch SentenceFragmenterV2::FindNextFragmentBoundary( - int doc_index) const { - FragmentBoundaryMatch current_match; - FragmentBoundaryMatch previous_match; - - for (int i = doc_index; i < static_cast<int>(document_.size()); ++i) { - absl::string_view slice = document_.substr(i); - if (current_match.GotTerminalPunc() && i > doc_index) { - // Got terminal punctuation and a space delimiter, so match is valid. - bool space_allowed_before_char = SpaceAllowedBeforeChar(slice); - if (space_allowed_before_char) { - // Remember this match. Try to extend it. - previous_match = current_match; - } else { - // Stop here. We're not allowed to extend the match in this case. - break; - } - } - bool got_transition = current_match.Advance(i, slice); - if (!got_transition) { - if (previous_match.GotTerminalPunc()) { - // Extension failed. Return previous match. - return previous_match; - } else { - // Start matching again from scratch. - current_match.Reset(); - - // Reprocess current character since it might be terminal punctuation. - // No infinite loop, because can't be "no transition" from - // INITIAL_STATE. - --i; - } - } else { - i = current_match.limit_index() - 1; - } - } - return current_match; -} - -// Keep track of whether the latest open parenthesis seen so far appears to be -// sentence-initial. This is useful because if it is *non-sentence-initial*, -// then any terminal punctuation before the corresponding close paren is -// probably not a sentence boundary. Example: -// -// Mushrooms (they're fungi!!) are delicious. -// (Mushrooms are fungi!!) -// -// In the first case, the open paren is non-sentence-initial, and therefore -// the "!!)" is not a sentence boundary. In the second case, the open paren *is* -// sentence-initial, and so the "!!)" is a sentence boundary. -// -// Of course, we don't know true sentence boundaries, so we make the -// approximation that an open paren is sentence-initial iff it is -// fragment-initial. This will be wrong if the open paren occurs after terminal -// punctuation that turns out not to be a sentence boundary, e.g., -// "Yahoo! (known for search, etc.) blah", but this is not expected to happen -// often. -void SentenceFragmenterV2::UpdateLatestOpenParenForFragment(int i_start, - int i_end) { - for (int i = i_end; i > i_start; --i) { - absl::string_view slice = document_.substr(i); - if (slice.length() > 0 && IsOpenParen(slice)) { - // Make the approximation that this open paren is sentence-initial iff it - // is fragment-initial. - latest_open_paren_is_sentential_ = (i == i_start); - break; - } - } -} - -void SentenceFragmenterV2::FillInFragmentFields( - int i_start, const FragmentBoundaryMatch& match, - SentenceFragment* fragment) const { - // Set the fragment's boundaries. - fragment->start = i_start; - fragment->limit = match.limit_index(); - - // Set the fragment's properties. - if (match.GotTerminalPunc()) { - // TERMINAL_PUNC. - SetFragmentProperty(SentenceFragment::TERMINAL_PUNC, fragment); - int terminal_punc_index = GetAdjustedFirstTerminalPuncIndex(match); - - bool has_unattachable_terminal_punc = HasUnattachableTerminalPunc(match); - bool has_close_paren = HasCloseParen(match); - - fragment->terminal_punc_token = terminal_punc_index; - // MULTIPLE_TERMINAL_PUNC. - if (has_unattachable_terminal_punc) { - SetFragmentProperty(SentenceFragment::MULTIPLE_TERMINAL_PUNC, fragment); - } - - // HAS_CLOSE_PAREN & HAS_SENTENTIAL_CLOSE_PAREN. - if (has_close_paren) { - SetFragmentProperty(SentenceFragment::HAS_CLOSE_PAREN, fragment); - - if (latest_open_paren_is_sentential_) { - SetFragmentProperty(SentenceFragment::HAS_SENTENTIAL_CLOSE_PAREN, - fragment); - } - } - } -} - -// The standard first terminal punctuation index is just -// match.first_terminal_punc_index(). But if there is an ambiguous terminal -// punctuation mark (ellipsis) followed by an unambiguous one (.!?), then we -// treat the ellipsis as part of the sentence, and return the index of the first -// unambiguous punctuation mark after it. Example: -// -// He agreed...! -// -// We treat "!" as the first terminal punctuation mark; the ellipsis acts as -// left context. -int SentenceFragmenterV2::GetAdjustedFirstTerminalPuncIndex( - const FragmentBoundaryMatch& match) const { - // Get terminal punctuation span. - int i1 = match.first_terminal_punc_index(); - if (i1 < 0) { - return i1; - } - int i2 = match.first_close_punc_index(); - - for (int i = i2; i > i1; --i) { - absl::string_view slice = document_.substr(i); - int temp_offset = 0; - bool is_ellipsis = IsEllipsis(slice, &temp_offset); - bool is_emoticon = IsEmoticon(slice, &temp_offset); - if (is_ellipsis || is_emoticon) { - if (i == i2) { - // Ellipsis is last terminal punctuation mark. No adjustment. - return i1; - } else { - // Ellipsis is not the last terminal punctuation mark. Return the index - // of the terminal punctuation mark after it. - return i; // current character = i - 1 - } - } - } - // No ellipsis. - return i1; -} - -// Example of an an "unattachable" terminal punctuation mark: -// -// He agreed!? -// -// The "?" is "unattachable" in that it can't be part of the word "agreed" -// because of the intervening "!", and therefore strongly suggests this is a -// true sentence boundary. The terminal punctuation mark must be unambiguous -// (.!?), as ambiguous ones (ellipsis/emoticon) do not necessarily imply a -// sentence boundary. -bool SentenceFragmenterV2::HasUnattachableTerminalPunc( - const FragmentBoundaryMatch& match) const { - // Get terminal punctuation span. - int i1 = match.first_terminal_punc_index(); - if (i1 < 0) { - return false; - } - // Check where second and later punctuation marks start - absl::string_view start_slice = document_.substr(i1); - int temp_offset = 0; - bool is_ellipsis = IsEllipsis(start_slice, &temp_offset); - if (is_ellipsis) { - i1 += temp_offset - 1; - } - bool is_emoticon = IsEmoticon(start_slice, &temp_offset); - if (is_emoticon) { - i1 += temp_offset - 1; - } - - int i2 = match.first_close_punc_index(); - - // Iterate over the second and later punctuation marks. - for (int i = i1 + 1; i < i2; ++i) { - absl::string_view slice = document_.substr(i); - bool is_punctuation = IsPunctuationWord(slice); - is_ellipsis = IsEllipsis(slice, &temp_offset); - if (is_ellipsis) { - i += temp_offset - 1; - } - is_emoticon = IsEmoticon(slice, &temp_offset); - if (is_emoticon) { - i += temp_offset - 1; - } - if (is_punctuation && !is_ellipsis && !is_emoticon) { - // Found an unattachable, unambiguous terminal punctuation mark. - return true; - } - } - return false; -} - -bool SentenceFragmenterV2::HasCloseParen( - const FragmentBoundaryMatch& match) const { - // Get close punctuation span. - int i1 = match.first_close_punc_index(); - if (i1 < 0) { - return false; - } - int i2 = match.limit_index(); - - for (int i = i1; i < i2; ++i) { - absl::string_view slice = document_.substr(i); - if (IsCloseParen(slice)) { - return true; - } - } - return false; -} - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/sentence_fragmenter_v2.h b/tensorflow_text/core/kernels/sentence_fragmenter_v2.h index 6c06867eb..fec2ea0b3 100644 --- a/tensorflow_text/core/kernels/sentence_fragmenter_v2.h +++ b/tensorflow_text/core/kernels/sentence_fragmenter_v2.h @@ -12,189 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Updated version of sentence fragmenter and util functions to split up a -// document into sentence fragments. A sentence fragment is a string whose end -// is potentially an end-of-sentence. The original version of -// sentence_fragmenter operates on tokens and defines the start and end of -// fragments using token indices, while sentence_fragmenter_v2 operates on a -// string_view sliding window of the text and defines the start and end of a -// fragment based on the character offset. -// -// Example: -// -// Document text: -// John said, "I.B.M. went up 5 points today." -// -// SentenceFragments: -// (1) John said, "I.B.M. -// (2) went up 5 points today." -// -// Fragment boundaries are induced by punctuation and paragraph breaks. - -#ifndef TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_FRAGMENTER_V2_H_ -#define TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_FRAGMENTER_V2_H_ - -#include <vector> - -#include "absl/status/status.h" -#include "absl/strings/string_view.h" -#include "icu4c/source/common/unicode/utypes.h" -#include "tensorflow/core/lib/core/status.h" - -namespace tensorflow { -namespace text { - -// A class of utils for identifying certain classes and properties of unicode -// characters. These utils are included in the header for use in tests. - -// Returns true iff a string is terminal punctuation. -bool IsTerminalPunc(const absl::string_view& input, int* offset); - -// Returns true iff a string is close punctuation (close quote or close -// paren). -bool IsClosePunc(const absl::string_view& input, int* offset); - -// Returns true iff a string is an open paren. -bool IsOpenParen(const absl::string_view& input); - -// Returns true iff a string is a close paren. -bool IsCloseParen(const absl::string_view& input); - -// Returns true iff a word is made of punctuation characters only. -bool IsPunctuationWord(const absl::string_view& input); - -// Returns true iff a string is an ellipsis ("..."). -bool IsEllipsis(const absl::string_view& input, int* offset); - -// Returns true iff a string is a period separated acronym (ex: "A.B.C."). -bool IsPeriodSeparatedAcronym(const absl::string_view& input, int* offset); - -// Returns true iff a string is an emoticon (ex: ":-)"). -bool IsEmoticon(const absl::string_view& input, int* offset); - -bool SpaceAllowedBeforeChar(const absl::string_view& input); - -void ConsumeOneUChar(const absl::string_view& input, UChar32* result, - int* offset); - -// Returns true iff a string is white space. -bool IsWhiteSpace(const absl::string_view& input); - -class FragmentBoundaryMatch { - public: - FragmentBoundaryMatch() {} - - // Goes to initial state. - void Reset() { - state_ = INITIAL_STATE; - first_terminal_punc_index_ = -1; - first_close_punc_index_ = -1; - limit_index_ = -1; - } - - // Follows the state transition for the slice at - // the given index. Returns true for success, or - // false if there was no valid transition. - bool Advance(int index, absl::string_view slice); - - // Returns true iff we have matched at least one terminal punctuation - // character. - bool GotTerminalPunc() const { return first_terminal_punc_index_ >= 0; } - - // Field accessors. - int first_terminal_punc_index() const { return first_terminal_punc_index_; } - int first_close_punc_index() const { return first_close_punc_index_; } - int limit_index() const { return limit_index_; } - - // Match state. - enum MatchState { - INITIAL_STATE = 0, - COLLECTING_TERMINAL_PUNC, - COLLECTING_CLOSE_PUNC - }; - - MatchState state() const { return state_; } - - private: - MatchState state_ = INITIAL_STATE; - - // First terminal punctuation mark matched; may be an acronym. - // -1 for not found. - int first_terminal_punc_index_ = -1; - - // First closing punctuation mark matched. -1 for not found. - int first_close_punc_index_ = -1; - - // First character after the terminal sequence. - int limit_index_ = -1; -}; - -struct SentenceFragment { - int start; - int limit; - - enum Property { - TERMINAL_PUNC = 0x0001, // ends with terminal punctuation - MULTIPLE_TERMINAL_PUNC = 0x0002, // e.g.: She said what?! - HAS_CLOSE_PAREN = 0x0004, // e.g.: Mushrooms (they're fungi!!) - HAS_SENTENTIAL_CLOSE_PAREN = 0x0008, // e.g.: (Mushrooms are fungi!) - }; - // A mask of the above listed properties. - uint32 properties = 0; - int terminal_punc_token = -1; -}; - -// Utility class for splitting documents into a list of sentence fragments. -class SentenceFragmenterV2 { - public: - // Constructs a fragmenter to process a specific part of a document. - SentenceFragmenterV2(absl::string_view document) : document_(document) {} - - // Finds sentence fragments in the [start_, limit_) range of the associated - // document. - absl::Status FindFragments(std::vector<SentenceFragment>* result); - - private: - // State for matching a fragment-boundary regexp against a character sequence. - // The regexp is: terminal_punc+ close_punc*. - - // Matches a fragment-boundary regexp against a slice of the document starting - // at 'doc_index'. Returns the longest match found; will be non-empty as long - // as 'doc_index' was not already at the end of the associated document. - FragmentBoundaryMatch FindNextFragmentBoundary(int doc_index) const; - - // Updates 'latest_open_paren_is_sentential_' for the given - // fragment. - void UpdateLatestOpenParenForFragment(int i_start, int i_end); - - // Populates a sentence fragment with the text from 'i_start' to the end - // of the given FragmentBoundaryMatch. - void FillInFragmentFields(int i_start, const FragmentBoundaryMatch& match, - SentenceFragment* fragment) const; - - // Returns the adjusted first terminal punctuation index in a - // FragmentBoundaryMatch. - int GetAdjustedFirstTerminalPuncIndex( - const FragmentBoundaryMatch& match) const; - - // Returns true iff a FragmentBoundaryMatch has an "unattachable" terminal - // punctuation mark. - bool HasUnattachableTerminalPunc(const FragmentBoundaryMatch& match) const; - - // Returns true iff a FragmentBoundaryMatch has a close paren in its closing - // punctuation. - bool HasCloseParen(const FragmentBoundaryMatch& match) const; - - // Whether the latest open paren seen so far appears to be sentence-initial. - // See UpdateLatestOpenParenForFragment() in the .cc file for details. - bool latest_open_paren_is_sentential_ = false; - - absl::string_view document_ = {}; // not owned - - // TODO(thuang513): DISALLOW_COPY_AND_ASSIGN(SentenceFragmenter); -}; +#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_FRAGMENTER_V2_H_ +#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_FRAGMENTER_V2_H_ -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/sentence_fragmenter_v2.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_FRAGMENTER_V2_H_ diff --git a/tensorflow_text/core/kernels/sentence_fragmenter_v2_kernel.h b/tensorflow_text/core/kernels/sentence_fragmenter_v2_kernel.h index d36c7e9c5..fd0e910a2 100644 --- a/tensorflow_text/core/kernels/sentence_fragmenter_v2_kernel.h +++ b/tensorflow_text/core/kernels/sentence_fragmenter_v2_kernel.h @@ -15,19 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_FRAGMENTER_V2_KERNEL_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_FRAGMENTER_V2_KERNEL_H_ -#include "tensorflow/lite/kernels/shim/tf_op_shim.h" -#include "tensorflow_text/core/kernels/sentence_fragmenter_v2_kernel_template.h" - -namespace tensorflow { -namespace text { - -class SentenceFragmenterV2OpKernel - : public tflite::shim::TfOpKernel<SentenceFragmenterV2Op> { - public: - using TfOpKernel::TfOpKernel; -}; - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/sentence_fragmenter_v2_kernel.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_FRAGMENTER_V2_KERNEL_H_ diff --git a/tensorflow_text/core/kernels/sentence_fragmenter_v2_kernel_template.h b/tensorflow_text/core/kernels/sentence_fragmenter_v2_kernel_template.h index ea7be5862..954d03ac8 100644 --- a/tensorflow_text/core/kernels/sentence_fragmenter_v2_kernel_template.h +++ b/tensorflow_text/core/kernels/sentence_fragmenter_v2_kernel_template.h @@ -15,150 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_FRAGMENTER_V2_KERNEL_TEMPLATE_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_FRAGMENTER_V2_KERNEL_TEMPLATE_H_ - -#include <iostream> -#include <vector> - -#include "absl/status/status.h" -#include "tensorflow/lite/kernels/shim/op_kernel.h" -#include "tensorflow/lite/kernels/shim/shape.h" -#include "tensorflow/lite/kernels/shim/status_macros.h" -#include "tensorflow_text/core/kernels/sentence_fragmenter_v2.h" - -namespace tensorflow { -namespace text { - -template <tflite::shim::Runtime Rt> -class SentenceFragmenterV2Op - : public tflite::shim::OpKernelShim<SentenceFragmenterV2Op, Rt> { - private: - enum Inputs { - kInputValues = 0 - }; - enum Outputs { - kFragmentStart = 0, - kFragmentEnd, - kFragmentProperties, - kTerminalPuncToken, - kOutputRowLengths - }; - - using typename tflite::shim::OpKernelShim<SentenceFragmenterV2Op, - Rt>::InitContext; - using typename tflite::shim::OpKernelShim<SentenceFragmenterV2Op, - Rt>::InvokeContext; - using typename tflite::shim::OpKernelShim<SentenceFragmenterV2Op, - Rt>::ShapeInferenceContext; - - public: - SentenceFragmenterV2Op() = default; - static constexpr char kOpName[] = "SentenceFragmentsV2"; - static constexpr char kDoc[] = R"doc( - Splits a string into sentence fragments - )doc"; - - static const char* OpName() { return kOpName; } - static const char* Doc() { return kDoc; } - - // Attributes declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector<std::string> Attrs() { return {}; } - - // Inputs declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector<std::string> Inputs(); - - // Outputs declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector<std::string> Outputs(); - - // Initializes the op - absl::Status Init(InitContext* context) { return absl::OkStatus(); } - - // Runs the operation - absl::Status Invoke(InvokeContext* context); - - // Shape inference - static absl::Status ShapeInference(ShapeInferenceContext* c); -}; - -template <tflite::shim::Runtime Rt> -std::vector<std::string> SentenceFragmenterV2Op<Rt>::Inputs() { - return {"doc: string"}; -} - -template <tflite::shim::Runtime Rt> -std::vector<std::string> SentenceFragmenterV2Op<Rt>::Outputs() { - return {"fragment_start: int64", "fragment_end: int64", - "fragment_properties: int64", "terminal_punc_token: int64", - "output_row_lengths: int64"}; -} - -template <tflite::shim::Runtime Rt> -absl::Status SentenceFragmenterV2Op<Rt>::ShapeInference( - ShapeInferenceContext* c) { - using tflite::shim::Shape; - const auto rank_1_shape = Shape({Shape::kUnknownDim}); - - SH_ASSIGN_OR_RETURN(const Shape& input_values_shape, - c->GetInputShape(kInputValues)); - if (!input_values_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError( - absl::StrCat("Shape must be rank 1: ", input_values_shape.ToString())); - } - - SH_RETURN_IF_ERROR(c->SetOutputShape(kFragmentStart, rank_1_shape)); - SH_RETURN_IF_ERROR(c->SetOutputShape(kFragmentEnd, rank_1_shape)); - SH_RETURN_IF_ERROR(c->SetOutputShape(kFragmentProperties, rank_1_shape)); - SH_RETURN_IF_ERROR(c->SetOutputShape(kTerminalPuncToken, rank_1_shape)); - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputRowLengths, rank_1_shape)); - - return absl::OkStatus(); -} - -template <tflite::shim::Runtime Rt> -absl::Status SentenceFragmenterV2Op<Rt>::Invoke(InvokeContext* context) { - // Inputs - SH_ASSIGN_OR_RETURN(const auto input_values, context->GetInput(kInputValues)); - const auto document = input_values->template As<tensorflow::tstring, 1>(); - - // Outputs - std::vector<int64> fragment_start; - std::vector<int64> fragment_end; - std::vector<int64> fragment_properties; - std::vector<int64> terminal_punc_token; - std::vector<int64> output_row_lengths; - - // Iterate through all the documents and find fragments. - for (int i = 0; i < document.Dim(0); ++i) { - // Find fragments. - SentenceFragmenterV2 fragmenter(document(i)); - std::vector<SentenceFragment> frags; - - SH_RETURN_IF_ERROR(fragmenter.FindFragments(&frags)); - - for (const auto& f : frags) { - fragment_start.push_back(f.start); - fragment_end.push_back(f.limit); - fragment_properties.push_back(f.properties); - terminal_punc_token.push_back(f.terminal_punc_token); - } - output_row_lengths.push_back(frags.size()); - } - - // Allocate output & fill output tensors. - SH_RETURN_IF_ERROR(this->template FillOutputTensor<int64_t, int64_t>( - fragment_start, kFragmentStart, context)); - SH_RETURN_IF_ERROR(this->template FillOutputTensor<int64_t, int64_t>( - fragment_end, kFragmentEnd, context)); - SH_RETURN_IF_ERROR(this->template FillOutputTensor<int64_t, int64_t>( - fragment_properties, kFragmentProperties, context)); - SH_RETURN_IF_ERROR(this->template FillOutputTensor<int64_t, int64_t>( - terminal_punc_token, kTerminalPuncToken, context)); - SH_RETURN_IF_ERROR(this->template FillOutputTensor<int64_t, int64_t>( - output_row_lengths, kOutputRowLengths, context)); - - return absl::OkStatus(); -} - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/sentence_fragmenter_v2_kernel_template.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_FRAGMENTER_V2_KERNEL_TEMPLATE_H_ diff --git a/tensorflow_text/core/kernels/sentence_fragmenter_v2_test.cc b/tensorflow_text/core/kernels/sentence_fragmenter_v2_test.cc deleted file mode 100644 index 87cd49265..000000000 --- a/tensorflow_text/core/kernels/sentence_fragmenter_v2_test.cc +++ /dev/null @@ -1,1092 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/sentence_fragmenter_v2.h" - -#include <string> -#include <vector> - -#include <gtest/gtest.h> -#include "absl/strings/string_view.h" -#include "icu4c/source/common/unicode/uchar.h" -#include "icu4c/source/common/unicode/umachine.h" -#include "icu4c/source/common/unicode/unistr.h" - -namespace tensorflow { -namespace text { -namespace { - -class SentenceBreakingUtilsParamTest : public ::testing::TestWithParam<UChar> { - protected: - std::string StringFromUnicodeChar(UChar32 input) { - std::string result; - icu::UnicodeString test_unicode_string(input); - test_unicode_string.toUTF8String(result); - return result; - } -}; - -class SentenceBreakingUtilsStringParamTest - : public ::testing::TestWithParam<const char*> {}; - -class IsTerminalPuncParamTest : public SentenceBreakingUtilsParamTest {}; - -class IsTerminalPuncTest : public ::testing::Test {}; - -const UChar is_terminal_punc_test_cases[] = { - 0x055C, // Armenian exclamation mark - 0x055E, // Armenian question mark - 0x0589, // Armenian full stop - 0x061F, // Arabic question mark - 0x06D4, // Arabic full stop - 0x0700, // Syriabc end of paragraph - 0x0701, // Syriac supralinear full stop - 0x0702, // Syriac sublinear full stop - 0x1362, // Ethiopic full stop - 0x1367, // Ethiopic question mark - 0x1368, // Ethiopic paragraph separator - 0x104A, // Myanmar sign little section - 0x104B, // Myanmar sign section - 0x166E, // Canadian syllabics full stop - 0x17d4, // Khmer sign khan - 0x1803, // Mongolian full stop - 0x1809, // Mongolian Manchu full stop - 0x1944, // Limbu exclamation mark - 0x1945, // Limbu question mark - 0x203C, // double exclamation mark - 0x203D, // interrobang - 0x2047, // double question mark - 0x2048, // question exclamation mark - 0x2049, // exclamation question mark - 0x3002, // ideographic full stop - 0x037E, // Greek question mark - 0xFE52, // small full stop - 0xFE56, // small question mark - 0xFE57, // small exclamation mark - 0xFF01, // fullwidth exclamation mark - 0xFF0E, // fullwidth full stop - 0xFF1F, // fullwidth question mark - 0xFF61, // halfwidth ideographic full stop - 0x2026, // ellipsis - 0x0964, - 0x0965, // Devanagari danda..Devanagari double -}; - -TEST_P(IsTerminalPuncParamTest, IsTerminalPunc) { - std::string test_string = StringFromUnicodeChar(GetParam()); - int offset; - EXPECT_TRUE(IsTerminalPunc(test_string, &offset)); -} - -INSTANTIATE_TEST_SUITE_P(IsTerminalPuncTest, IsTerminalPuncParamTest, - ::testing::ValuesIn(is_terminal_punc_test_cases)); - -TEST_F(IsTerminalPuncTest, IsMultiCharEllipseTerminalPunc) { - std::string test_string = "..."; - int offset; - EXPECT_TRUE(IsTerminalPunc(test_string, &offset)); -} - -TEST_F(IsTerminalPuncTest, TestMultiUnicodeChars) { - std::string test_string = "never gonna let you decode"; - int offset; - EXPECT_FALSE(IsTerminalPunc(test_string, &offset)); -} - -struct ClosePuncOffsetPairs { - const UChar close_punc; - const int offset; -}; - -class SentenceBreakingUtilsClosePuncPairParamTest - : public ::testing::TestWithParam<ClosePuncOffsetPairs> { - protected: - std::string StringFromUnicodeChar(UChar32 input) { - std::string result; - icu::UnicodeString test_unicode_string(input); - test_unicode_string.toUTF8String(result); - return result; - } -}; - -class ClosePuncParamTest : public SentenceBreakingUtilsClosePuncPairParamTest { -}; - -const ClosePuncOffsetPairs close_punc_test_cases[] = { - {0x29, 1}, - {0x5D, 1}, - {0x3E, 1}, - {0x7D, 1}, - {0x207E, 3}, // superscript right parenthesis - {0x208E, 3}, // subscript right parenthesis - {0x27E7, 3}, // mathematical right white square bracket - {0x27E9, 3}, // mathematical right angle bracket - {0x27EB, 3}, // mathematical right double angle bracket - {0x2984, 3}, // right white curly bracket - {0x2986, 3}, // right white parenthesis - {0x2988, 3}, // Z notation right image bracket - {0x298A, 3}, // Z notation right binding bracket - {0x298C, 3}, // right square bracket with underbar - {0x298E, 3}, // right square bracket with tick in top corner - {0x2990, 3}, // right square bracket with tick in bottom corner - {0x2992, 3}, // right angle bracket with dot - {0x2994, 3}, // right arc greater-than bracket - {0x2996, 3}, // double right arc less-than bracket - {0x2998, 3}, // right black tortoise shell bracket - {0x29D9, 3}, // right wiggly fence - {0x29DB, 3}, // right double wiggly fence - {0x29FD, 3}, // right-pointing curved angle bracket - {0x3009, 3}, // CJK right angle bracket - {0x300B, 3}, // CJK right double angle bracket - {0x3011, 3}, // CJK right black lenticular bracket - {0x3015, 3}, // CJK right tortoise shell bracket - {0x3017, 3}, // CJK right white lenticular bracket - {0x3019, 3}, // CJK right white tortoise shell bracket - {0x301B, 3}, // CJK right white square bracket - {0xFD3F, 3}, // Ornate right parenthesis - {0xFE5A, 3}, // small right parenthesis - {0xFE5C, 3}, // small right curly bracket - {0xFF09, 3}, // fullwidth right parenthesis - {0xFF3D, 3}, // fullwidth right square bracket - {0xFF5D, 3}, // fullwidth right curly bracket - {0x27, 1}, - {0x60, 1}, - {0x22, 1}, - {0xFF07, 3}, // fullwidth apostrophe - {0xFF02, 3}, // fullwidth quotation mark - {0x2019, 3}, // right single quotation mark (English, others) - {0x201D, 3}, // right double quotation mark (English, others) - {0x2018, 3}, // left single quotation mark (Czech, German, Slovak) - {0x201C, 3}, // left double quotation mark (Czech, German, Slovak) - {0x203A, 3}, // single right-pointing angle quotation mark (French, others) - {0x00BB, 2}, // right-pointing double angle quotation mark (French, others) - {0x2039, 3}, // single left-pointing angle quotation mark (Slovenian, - // others) - {0x00AB, 2}, // left-pointing double angle quotation mark (Slovenian, - // others) - {0x300D, 3}, // right corner bracket (East Asian languages) - {0xfe42, 3}, // presentation form for vertical right corner bracket - {0xFF63, 3}, // halfwidth right corner bracket (East Asian languages) - {0x300F, 3}, // right white corner bracket (East Asian languages) - {0xfe44, 3}, // presentation form for vertical right white corner bracket - {0x301F, 3}, // low double prime quotation mark (East Asian languages) - {0x301E, 3} // close double prime (East Asian languages written - // horizontally) -}; - -TEST_P(ClosePuncParamTest, IsClosePunc) { - ClosePuncOffsetPairs test_punc = GetParam(); - std::string test_string = StringFromUnicodeChar(test_punc.close_punc); - int expected_offset = test_punc.offset; - int offset; - EXPECT_TRUE(IsClosePunc(test_string, &offset)); - EXPECT_EQ(offset, expected_offset); -} - -INSTANTIATE_TEST_SUITE_P(IsClosePuncParamTest, ClosePuncParamTest, - ::testing::ValuesIn(close_punc_test_cases)); - -class OpenParenParamTest : public SentenceBreakingUtilsParamTest {}; - -const UChar open_paren_test_cases[] = { - '(', '[', '<', '{', - 0x207D, // superscript left parenthesis - 0x208D, // subscript left parenthesis - 0x27E6, // mathematical left white square bracket - 0x27E8, // mathematical left angle bracket - 0x27EA, // mathematical left double angle bracket - 0x2983, // left white curly bracket - 0x2985, // left white parenthesis - 0x2987, // Z notation left image bracket - 0x2989, // Z notation left binding bracket - 0x298B, // left square bracket with underbar - 0x298D, // left square bracket with tick in top corner - 0x298F, // left square bracket with tick in bottom corner - 0x2991, // left angle bracket with dot - 0x2993, // left arc less-than bracket - 0x2995, // double left arc greater-than bracket - 0x2997, // left black tortoise shell bracket - 0x29D8, // left wiggly fence - 0x29DA, // left double wiggly fence - 0x29FC, // left-pointing curved angle bracket - 0x3008, // CJK left angle bracket - 0x300A, // CJK left double angle bracket - 0x3010, // CJK left black lenticular bracket - 0x3014, // CJK left tortoise shell bracket - 0x3016, // CJK left white lenticular bracket - 0x3018, // CJK left white tortoise shell bracket - 0x301A, // CJK left white square bracket - 0xFD3E, // Ornate left parenthesis - 0xFE59, // small left parenthesis - 0xFE5B, // small left curly bracket - 0xFF08, // fullwidth left parenthesis - 0xFF3B, // fullwidth left square bracket - 0xFF5B, // fullwidth left curly bracket -}; - -TEST_P(OpenParenParamTest, IsOpenParen) { - std::string test_string = StringFromUnicodeChar(GetParam()); - EXPECT_TRUE(IsOpenParen(test_string)); -} - -INSTANTIATE_TEST_SUITE_P(IsOpenParenParamTest, OpenParenParamTest, - ::testing::ValuesIn(open_paren_test_cases)); - -class CloseParenParamTest : public SentenceBreakingUtilsParamTest {}; - -const UChar close_paren_test_cases[] = { - ')', ']', '>', '}', - 0x207E, // superscript right parenthesis - 0x208E, // subscript right parenthesis - 0x27E7, // mathematical right white square bracket - 0x27E9, // mathematical right angle bracket - 0x27EB, // mathematical right double angle bracket - 0x2984, // right white curly bracket - 0x2986, // right white parenthesis - 0x2988, // Z notation right image bracket - 0x298A, // Z notation right binding bracket - 0x298C, // right square bracket with underbar - 0x298E, // right square bracket with tick in top corner - 0x2990, // right square bracket with tick in bottom corner - 0x2992, // right angle bracket with dot - 0x2994, // right arc greater-than bracket - 0x2996, // double right arc less-than bracket - 0x2998, // right black tortoise shell bracket - 0x29D9, // right wiggly fence - 0x29DB, // right double wiggly fence - 0x29FD, // right-pointing curved angle bracket - 0x3009, // CJK right angle bracket - 0x300B, // CJK right double angle bracket - 0x3011, // CJK right black lenticular bracket - 0x3015, // CJK right tortoise shell bracket - 0x3017, // CJK right white lenticular bracket - 0x3019, // CJK right white tortoise shell bracket - 0x301B, // CJK right white square bracket - 0xFD3F, // Ornate right parenthesis - 0xFE5A, // small right parenthesis - 0xFE5C, // small right curly bracket - 0xFF09, // fullwidth right parenthesis - 0xFF3D, // fullwidth right square bracket - 0xFF5D, // fullwidth right curly bracket -}; - -TEST_P(CloseParenParamTest, IsCloseParen) { - std::string test_string = StringFromUnicodeChar(GetParam()); - EXPECT_TRUE(IsCloseParen(test_string)); -} - -INSTANTIATE_TEST_SUITE_P(IsCloseParenParamTest, CloseParenParamTest, - ::testing::ValuesIn(close_paren_test_cases)); - -class IsPunctuationWordParamTest : public SentenceBreakingUtilsParamTest {}; - -const UChar punc_word_test_cases[] = { - '(', '[', '<', '{', - 0x207D, // superscript left parenthesis - 0x208D, // subscript left parenthesis - 0x27E6, // mathematical left white square bracket - 0x27E8, // mathematical left angle bracket - 0x27EA, // mathematical left double angle bracket - 0x2983, // left white curly bracket - 0x2985, // left white parenthesis - 0x2987, // Z notation left image bracket - 0x2989, // Z notation left binding bracket - 0x298B, // left square bracket with underbar - 0x298D, // left square bracket with tick in top corner - 0x298F, // left square bracket with tick in bottom corner - 0x2991, // left angle bracket with dot - 0x2993, // left arc less-than bracket - 0x2995, // double left arc greater-than bracket - 0x2997, // left black tortoise shell bracket - 0x29D8, // left wiggly fence - 0x29DA, // left double wiggly fence - 0x29FC, // left-pointing curved angle bracket - 0x3008, // CJK left angle bracket - 0x300A, // CJK left double angle bracket - 0x3010, // CJK left black lenticular bracket - 0x3014, // CJK left tortoise shell bracket - 0x3016, // CJK left white lenticular bracket - 0x3018, // CJK left white tortoise shell bracket - 0x301A, // CJK left white square bracket - 0xFD3E, // Ornate left parenthesis - 0xFE59, // small left parenthesis - 0xFE5B, // small left curly bracket - 0xFF08, // fullwidth left parenthesis - 0xFF3B, // fullwidth left square bracket - 0xFF5B, // fullwidth left curly bracket - '"', '\'', '`', - 0xFF07, // fullwidth apostrophe - 0xFF02, // fullwidth quotation mark - 0x2018, // left single quotation mark (English, others) - 0x201C, // left double quotation mark (English, others) - 0x201B, // single high-reveresed-9 quotation mark (PropList.txt) - 0x201A, // single low-9 quotation mark (Czech, German, Slovak) - 0x201E, // double low-9 quotation mark (Czech, German, Slovak) - 0x201F, // double high-reversed-9 quotation mark (PropList.txt) - 0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.) - 0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.) - 0x2039, // single left-pointing angle quotation mark (French, others) - 0x00AB, // left-pointing double angle quotation mark (French, others) - 0x203A, // single right-pointing angle quotation mark (Slovenian, others) - 0x00BB, // right-pointing double angle quotation mark (Slovenian, others) - 0x300C, // left corner bracket (East Asian languages) - 0xFE41, // presentation form for vertical left corner bracket - 0xFF62, // halfwidth left corner bracket (East Asian languages) - 0x300E, // left white corner bracket (East Asian languages) - 0xFE43, // presentation form for vertical left white corner bracket - 0x301D, // reversed double prime quotation mark (East Asian langs, horiz.) - ')', ']', '>', '}', - 0x207E, // superscript right parenthesis - 0x208E, // subscript right parenthesis - 0x27E7, // mathematical right white square bracket - 0x27E9, // mathematical right angle bracket - 0x27EB, // mathematical right double angle bracket - 0x2984, // right white curly bracket - 0x2986, // right white parenthesis - 0x2988, // Z notation right image bracket - 0x298A, // Z notation right binding bracket - 0x298C, // right square bracket with underbar - 0x298E, // right square bracket with tick in top corner - 0x2990, // right square bracket with tick in bottom corner - 0x2992, // right angle bracket with dot - 0x2994, // right arc greater-than bracket - 0x2996, // double right arc less-than bracket - 0x2998, // right black tortoise shell bracket - 0x29D9, // right wiggly fence - 0x29DB, // right double wiggly fence - 0x29FD, // right-pointing curved angle bracket - 0x3009, // CJK right angle bracket - 0x300B, // CJK right double angle bracket - 0x3011, // CJK right black lenticular bracket - 0x3015, // CJK right tortoise shell bracket - 0x3017, // CJK right white lenticular bracket - 0x3019, // CJK right white tortoise shell bracket - 0x301B, // CJK right white square bracket - 0xFD3F, // Ornate right parenthesis - 0xFE5A, // small right parenthesis - 0xFE5C, // small right curly bracket - 0xFF09, // fullwidth right parenthesis - 0xFF3D, // fullwidth right square bracket - 0xFF5D, // fullwidth right curly bracket - '\'', '"', '`', - 0xFF07, // fullwidth apostrophe - 0xFF02, // fullwidth quotation mark - 0x2019, // right single quotation mark (English, others) - 0x201D, // right double quotation mark (English, others) - 0x2018, // left single quotation mark (Czech, German, Slovak) - 0x201C, // left double quotation mark (Czech, German, Slovak) - 0x203A, // single right-pointing angle quotation mark (French, others) - 0x00BB, // right-pointing double angle quotation mark (French, others) - 0x2039, // single left-pointing angle quotation mark (Slovenian, others) - 0x00AB, // left-pointing double angle quotation mark (Slovenian, others) - 0x300D, // right corner bracket (East Asian languages) - 0xfe42, // presentation form for vertical right corner bracket - 0xFF63, // halfwidth right corner bracket (East Asian languages) - 0x300F, // right white corner bracket (East Asian languages) - 0xfe44, // presentation form for vertical right white corner bracket - 0x301F, // low double prime quotation mark (East Asian languages) - 0x301E, // close double prime (East Asian languages written horizontally) - 0x00A1, // Spanish inverted exclamation mark - 0x00BF, // Spanish inverted question mark - '.', '!', '?', - 0x055C, // Armenian exclamation mark - 0x055E, // Armenian question mark - 0x0589, // Armenian full stop - 0x061F, // Arabic question mark - 0x06D4, // Arabic full stop - 0x0700, // Syriac end of paragraph - 0x0701, // Syriac supralinear full stop - 0x0702, // Syriac sublinear full stop - 0x0964, // Devanagari danda..Devanagari double danda - 0x0965, - 0x1362, // Ethiopic full stop - 0x1367, // Ethiopic question mark - 0x1368, // Ethiopic paragraph separator - 0x104A, // Myanmar sign little section - 0x104B, // Myanmar sign section - 0x166E, // Canadian syllabics full stop - 0x17d4, // Khmer sign khan - 0x1803, // Mongolian full stop - 0x1809, // Mongolian Manchu full stop - 0x1944, // Limbu exclamation mark - 0x1945, // Limbu question mark - 0x203C, // double exclamation mark - 0x203D, // interrobang - 0x2047, // double question mark - 0x2048, // question exclamation mark - 0x2049, // exclamation question mark - 0x3002, // ideographic full stop - 0x037E, // Greek question mark - 0xFE52, // small full stop - 0xFE56, // small question mark - 0xFE57, // small exclamation mark - 0xFF01, // fullwidth exclamation mark - 0xFF0E, // fullwidth full stop - 0xFF1F, // fullwidth question mark - 0xFF61, // halfwidth ideographic full stop - 0x2026, // ellipsis - 0x30fb, // Katakana middle dot - 0xff65, // halfwidth Katakana middle dot - 0x2040, // character tie - '-', '~', - 0x058a, // Armenian hyphen - 0x1806, // Mongolian todo soft hyphen - 0x2010, // hyphen..horizontal bar - 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, - 0x2053, // swung dash -- from Table 6-3 of Unicode book - 0x207b, // superscript minus - 0x208b, // subscript minus - 0x2212, // minus sign - 0x301c, // wave dash - 0x3030, // wavy dash - 0xfe31, // presentation form for vertical em dash..en dash - 0xfe32, - 0xfe58, // small em dash - 0xfe63, // small hyphen-minus - 0xff0d, // fullwidth hyphen-minus - ',', ':', ';', - 0x00b7, // middle dot - 0x0387, // Greek ano teleia - 0x05c3, // Hebrew punctuation sof pasuq - 0x060c, // Arabic comma - 0x061b, // Arabic semicolon - 0x066b, // Arabic decimal separator - 0x066c, // Arabic thousands separator - 0x0703, // Syriac contraction and others - 0x0704, 0x0705, 0x0706, 0x0707, 0x0708, 0x0709, 0x70a, - 0x070c, // Syric harklean metobelus - 0x0e5a, // Thai character angkhankhu - 0x0e5b, // Thai character khomut - 0x0f08, // Tibetan mark sbrul shad - 0x0f0d, // Tibetan mark shad..Tibetan mark rgya gram shad - 0x0f0e, 0x0f0f, 0x0f10, 0x0f11, 0x0f12, - 0x1361, // Ethiopic wordspace - 0x1363, // other Ethiopic chars - 0x1364, 0x1365, 0x1366, - 0x166d, // Canadian syllabics chi sign - 0x16eb, // Runic single punctuation..Runic cross punctuation - 0x16ed, - 0x17d5, // Khmer sign camnuc pii huuh and other - 0x17d6, - 0x17da, // Khmer sign koomut - 0x1802, // Mongolian comma - 0x1804, // Mongolian four dots and other - 0x1805, - 0x1808, // Mongolian manchu comma - 0x3001, // ideographic comma - 0xfe50, // small comma and others - 0xfe51, - 0xfe54, // small semicolon and other - 0xfe55, - 0xff0c, // fullwidth comma - 0xff0e, // fullwidth stop..fullwidth solidus - 0xff0f, - 0xff1a, // fullwidth colon..fullwidth semicolon - 0xff1b, - 0xff64, // halfwidth ideographic comma - 0x2016, // double vertical line - 0x2032, 0x2033, - 0x2034, // prime..triple prime - 0xfe61, // small asterisk - 0xfe68, // small reverse solidus - 0xff3c, // fullwidth reverse solidus -}; - -TEST_P(IsPunctuationWordParamTest, IsPunctuation) { - std::string test_string = StringFromUnicodeChar(GetParam()); - EXPECT_TRUE(IsPunctuationWord(test_string)); -} - -INSTANTIATE_TEST_SUITE_P(IsPuncWordParamTest, IsPunctuationWordParamTest, - ::testing::ValuesIn(punc_word_test_cases)); - -class IsEllipsisTest : public ::testing::Test {}; - -TEST_F(IsEllipsisTest, IsEllipsis) { - int offset; - EXPECT_TRUE(IsEllipsis("...", &offset)); - EXPECT_EQ(offset, 3); - EXPECT_TRUE(IsEllipsis("…", &offset)); - EXPECT_EQ(offset, 3); - EXPECT_FALSE(IsEllipsis("@", &offset)); - EXPECT_EQ(offset, 1); -} - -class IsWhiteSpaceTest : public ::testing::Test {}; - -TEST_F(IsWhiteSpaceTest, IsWhiteSpace) { - EXPECT_TRUE(IsWhiteSpace(" ")); - - EXPECT_TRUE(IsWhiteSpace("\n")); - - EXPECT_TRUE(IsWhiteSpace(" ")); - - EXPECT_FALSE(IsWhiteSpace("@")); - - EXPECT_FALSE(IsWhiteSpace("w")); -} - -class IsAcronymTest : public ::testing::Test {}; - -TEST_F(IsAcronymTest, IsAcronym) { - int offset = 0; - EXPECT_TRUE(IsPeriodSeparatedAcronym("U.S.", &offset)); - EXPECT_EQ(offset, 4); - - offset = 0; - EXPECT_TRUE(IsPeriodSeparatedAcronym("E.A.T.", &offset)); - EXPECT_EQ(offset, 6); - - offset = 0; - EXPECT_TRUE(IsPeriodSeparatedAcronym("A.B.C.D.E.F.", &offset)); - EXPECT_EQ(offset, 12); - - offset = 0; - EXPECT_FALSE(IsPeriodSeparatedAcronym("X.", &offset)); - - EXPECT_FALSE(IsPeriodSeparatedAcronym("US", &offset)); - - EXPECT_FALSE(IsPeriodSeparatedAcronym("U-S", &offset)); -} - -class EmoticonParamTest : public SentenceBreakingUtilsStringParamTest {}; - -static const char* const emoticon_test_cases[] = {":(:)", - ":)", - ":(", - ":o)", - ":]", - ":3", - ":>", - "=]", - "=)", - ":}", - ":^)", - ":-D", - ":-)))))", - ":-))))", - ":-)))", - ":-))", - ":-)", - ">:[", - ":-(", - ":(", - ":-c", - ":c", - ":-<", - ":<", - ":-[", - ":[", - ":{", - ";(", - ":-||", - ":@", - ">:(", - ":'-(", - ":'(", - ":'-)", - ":')", - "D:<", - ">:O", - ":-O", - ":-o", - ":*", - ":-*", - ":^*", - ";-)", - ";)", - "*-)", - "*)", - ";-]", - ";]", - ";^)", - ":-,", - ">:P", - ":-P", - ":p", - "=p", - ":-p", - "=p", - ":P", - "=P", - ";p", - ";-p", - ";P", - ";-P", - ">:\\", - ">:/", - ":-/", - ":-.", - ":/", - ":\\", - "=/", - "=\\", - ":|", - ":-|", - ":$", - ":-#", - ":#", - "O:-)", - "0:-)", - "0:)", - "0;^)", - ">:)", - ">;)", - ">:-)", - "}:-)", - "}:)", - "3:-)", - ">_>^", - "^<_<", - "|;-)", - "|-O", - ":-J", - ":-&", - ":&", - "#-)", - "<3", - "8-)", - "^_^", - ":D", - ":-D", - "=D", - "^_^;;", - "O=)", - "}=)", - "B)", - "B-)", - "=|", - "-_-", - "o_o;", - "u_u", - ":-\\", - ":s", - ":S", - ":-s", - ":-S", - ";*", - ";-*" - "=(", - ">.<", - ">:-(", - ">:(", - ">=(", - ";_;", - "T_T", - "='(", - ">_<", - "D:", - ":o", - ":-o", - "=o", - "o.o", - ":O", - ":-O", - "=O", - "O.O", - "x_x", - "X-(", - "X(", - "X-o", - "X-O", - ":X)", - "(=^.^=)", - "(=^..^=)", - "=^_^=", - "-<@%", - ":(|)", - "(]:{", - "<\\3", - "~@~", - "8'(", - "XD", - "DX"}; - -TEST_P(EmoticonParamTest, IsEmoticon) { - int offset = 0; - EXPECT_TRUE(IsEmoticon(GetParam(), &offset)); -} - -INSTANTIATE_TEST_SUITE_P(IsEmoticonParamTest, EmoticonParamTest, - ::testing::ValuesIn(emoticon_test_cases)); - -class IsEmoticonTest : public ::testing::Test {}; - -TEST_F(IsEmoticonTest, IsEmoticon) { - int offset = 0; - - EXPECT_TRUE(IsEmoticon(">:-(", &offset)); - - EXPECT_FALSE(IsEmoticon("w", &offset)); - - EXPECT_FALSE(IsEmoticon(":", &offset)); -} - -TEST(SentenceFragmenterTest, Basic) { - // 1 - // 012345678901234 - string test_input = "Hello. Foo bar!"; - SentenceFragmenterV2 fragmenter(test_input); - std::vector<SentenceFragment> fragments; - EXPECT_TRUE(fragmenter.FindFragments(&fragments).ok()); - EXPECT_EQ(fragments[0].start, 0); - EXPECT_EQ(fragments[0].limit, 6); - EXPECT_EQ(fragments[1].start, 7); - EXPECT_EQ(fragments[1].limit, 15); -} - -TEST(SentenceFragmenterTest, BasicEllipsis) { - // 1 - // 012345678901234 - string test_input = "Hello...foo bar"; - SentenceFragmenterV2 fragmenter(test_input); - std::vector<SentenceFragment> fragments; - EXPECT_TRUE(fragmenter.FindFragments(&fragments).ok()); - - EXPECT_EQ(fragments[0].start, 0); - EXPECT_EQ(fragments[0].limit, 8); - EXPECT_EQ(fragments[1].start, 8); - EXPECT_EQ(fragments[1].limit, 15); -} - -TEST(SentenceFragmenterTest, Parentheses) { - // 1 2 - // 012345678901234567890123456789 - string test_input = "Hello (who are you...) foo bar"; - SentenceFragmenterV2 fragmenter(test_input); - std::vector<SentenceFragment> fragments; - EXPECT_TRUE(fragmenter.FindFragments(&fragments).ok()); - EXPECT_EQ(fragments[0].start, 0); - EXPECT_EQ(fragments[0].limit, 22); - EXPECT_EQ(fragments[1].start, 23); - EXPECT_EQ(fragments[1].limit, 30); -} - -TEST(SentenceFragmenterTest, MidFragmentParentheses) { - // 1 2 - // 012345678901234567890123456789 - string test_input = "Hello (who are you) world? Foo bar"; - SentenceFragmenterV2 fragmenter(test_input); - std::vector<SentenceFragment> fragments; - EXPECT_TRUE(fragmenter.FindFragments(&fragments).ok()); - EXPECT_EQ(fragments[0].start, 0); - EXPECT_EQ(fragments[0].limit, 26); - EXPECT_EQ(fragments[1].start, 27); - EXPECT_EQ(fragments[1].limit, 34); -} - -TEST(SentenceFragmenterTest, PunctuationAfterParentheses) { - // 1 2 - // 01234567890123456789012345678 - string test_input = "Hello (who are you)? Foo bar!"; - SentenceFragmenterV2 fragmenter(test_input); - std::vector<SentenceFragment> fragments; - EXPECT_TRUE(fragmenter.FindFragments(&fragments).ok()); - EXPECT_EQ(fragments[0].start, 0); - EXPECT_EQ(fragments[0].limit, 20); - EXPECT_EQ(fragments[1].start, 21); - EXPECT_EQ(fragments[1].limit, 29); -} - -TEST(SentenceFragmenterTest, ManyFinalPunctuations) { - // 1 2 - // 0123456789012345678901234 - string test_input = "Hello!!!!! Who are you??"; - SentenceFragmenterV2 fragmenter(test_input); - std::vector<SentenceFragment> fragments; - EXPECT_TRUE(fragmenter.FindFragments(&fragments).ok()); - EXPECT_EQ(fragments[0].start, 0); - EXPECT_EQ(fragments[0].limit, 10); - EXPECT_EQ(fragments[1].start, 11); - EXPECT_EQ(fragments[1].limit, 24); -} - -TEST(SentenceFragmenterTest, NewLine) { - // 1 2 3 - // 012345678901234567890 1 23456 7 89012 3 45678 - string test_input = "Who let the dogs out?\r\nWho?\r\nWho?\r\nWho?"; - SentenceFragmenterV2 fragmenter(test_input); - std::vector<SentenceFragment> fragments; - EXPECT_TRUE(fragmenter.FindFragments(&fragments).ok()); - EXPECT_EQ(fragments[0].start, 0); - EXPECT_EQ(fragments[0].limit, 21); - EXPECT_EQ(fragments[1].start, 23); - EXPECT_EQ(fragments[1].limit, 27); - EXPECT_EQ(fragments[2].start, 29); - EXPECT_EQ(fragments[2].limit, 33); - EXPECT_EQ(fragments[3].start, 35); - EXPECT_EQ(fragments[3].limit, 39); -} - -TEST(SentenceFragmenterTest, WhiteSpaceInPunctuation) { - // 1 2 - // 0123456789012345678901234 - string test_input = "Hello?? !!! Who are you??"; - SentenceFragmenterV2 fragmenter(test_input); - std::vector<SentenceFragment> fragments; - EXPECT_TRUE(fragmenter.FindFragments(&fragments).ok()); - EXPECT_EQ(fragments[0].start, 0); - EXPECT_EQ(fragments[0].limit, 7); - EXPECT_EQ(fragments[1].start, 8); - EXPECT_EQ(fragments[1].limit, 11); - EXPECT_EQ(fragments[2].start, 12); - EXPECT_EQ(fragments[2].limit, 25); -} - -} // namespace - -TEST(FragmentBoundaryMatchTest, NoStateChange) { - FragmentBoundaryMatch f; - // || - // 012345678901234 - string test_input = "Hello...foo bar"; - int index = 0; - EXPECT_TRUE(f.Advance(index, test_input)); - EXPECT_FALSE(f.GotTerminalPunc()); - EXPECT_EQ(f.first_terminal_punc_index(), -1); - EXPECT_EQ(f.first_close_punc_index(), -1); - EXPECT_EQ(f.limit_index(), 1); - EXPECT_EQ(f.state(), FragmentBoundaryMatch::INITIAL_STATE); -} - -TEST(FragmentBoundaryMatchTest, BasicEllipsis) { - FragmentBoundaryMatch f; - // | | - // 0123456789 - string test_input = "...foo bar"; - int index = 0; - EXPECT_TRUE(f.Advance(index, test_input)); - EXPECT_TRUE(f.GotTerminalPunc()); - EXPECT_EQ(f.first_terminal_punc_index(), 0); - EXPECT_EQ(f.first_close_punc_index(), 3); - EXPECT_EQ(f.limit_index(), 3); - EXPECT_EQ(f.state(), FragmentBoundaryMatch::COLLECTING_TERMINAL_PUNC); -} - -TEST(FragmentBoundaryMatchTest, BasicPeriod) { - FragmentBoundaryMatch f; - // || - // 0123456789 - string test_input = ". Foo bar"; - int index = 0; - EXPECT_TRUE(f.Advance(index, test_input)); - EXPECT_TRUE(f.GotTerminalPunc()); - EXPECT_EQ(f.first_terminal_punc_index(), 0); - EXPECT_EQ(f.first_close_punc_index(), 1); - EXPECT_EQ(f.limit_index(), 1); - EXPECT_EQ(f.state(), FragmentBoundaryMatch::COLLECTING_TERMINAL_PUNC); -} - -TEST(FragmentBoundaryMatchTest, BasicAcronym) { - FragmentBoundaryMatch f; - // | | - // 0123456789 - string test_input = "A.B. xyz"; - int index = 0; - EXPECT_TRUE(f.Advance(index, test_input)); - EXPECT_TRUE(f.GotTerminalPunc()); - EXPECT_EQ(f.first_terminal_punc_index(), 0); - EXPECT_EQ(f.first_close_punc_index(), 4); - EXPECT_EQ(f.limit_index(), 4); - EXPECT_EQ(f.state(), FragmentBoundaryMatch::COLLECTING_TERMINAL_PUNC); -} - -TEST(FragmentBoundaryMatchTest, LongerAcronym) { - FragmentBoundaryMatch f; - // | | - // 0123456789 - string test_input = "I.B.M. yo"; - int index = 0; - EXPECT_TRUE(f.Advance(index, test_input)); - EXPECT_TRUE(f.GotTerminalPunc()); - EXPECT_EQ(f.first_terminal_punc_index(), 0); - EXPECT_EQ(f.first_close_punc_index(), 6); - EXPECT_EQ(f.limit_index(), 6); - EXPECT_EQ(f.state(), FragmentBoundaryMatch::COLLECTING_TERMINAL_PUNC); -} - -TEST(FragmentBoundaryMatchTest, Emoticon) { - FragmentBoundaryMatch f; - // | | - // 0123456789012 - string test_input = ">:-( hello..."; - int index = 0; - EXPECT_TRUE(f.Advance(index, test_input)); - EXPECT_TRUE(f.GotTerminalPunc()); - EXPECT_EQ(f.first_terminal_punc_index(), 0); - EXPECT_EQ(f.first_close_punc_index(), 4); - EXPECT_EQ(f.limit_index(), 4); - EXPECT_EQ(f.state(), FragmentBoundaryMatch::COLLECTING_TERMINAL_PUNC); -} - -TEST(FragmentBoundaryMatchTest, ParensWithEllipsis) { - FragmentBoundaryMatch f; - // || - // 0123456789012345 - string test_input = ".foo...) foo bar"; - int index = 0; - EXPECT_TRUE(f.Advance(index, test_input)); - EXPECT_TRUE(f.GotTerminalPunc()); - EXPECT_EQ(f.first_terminal_punc_index(), 0); - EXPECT_EQ(f.first_close_punc_index(), 1); - EXPECT_EQ(f.limit_index(), 1); - EXPECT_EQ(f.state(), FragmentBoundaryMatch::COLLECTING_TERMINAL_PUNC); -} - -TEST(FragmentBoundaryMatchTest, ClosingParenWithEllipsis) { - FragmentBoundaryMatch f; - // | | - // 012345678901 - string test_input = "...) foo bar"; - int index = 0; - EXPECT_TRUE(f.Advance(index, test_input)); - EXPECT_TRUE(f.GotTerminalPunc()); - EXPECT_EQ(f.first_terminal_punc_index(), 0); - EXPECT_EQ(f.first_close_punc_index(), 3); - EXPECT_EQ(f.limit_index(), 3); - EXPECT_EQ(f.state(), FragmentBoundaryMatch::COLLECTING_TERMINAL_PUNC); -} - -TEST(FragmentBoundaryMatchTest, BeginAndEndParenWithEllipsis) { - FragmentBoundaryMatch f; - // || - // 0123456789012 - string test_input = "(...) foo bar"; - int index = 0; - EXPECT_TRUE(f.Advance(index, test_input)); - EXPECT_FALSE(f.GotTerminalPunc()); - EXPECT_EQ(f.first_terminal_punc_index(), -1); - EXPECT_EQ(f.first_close_punc_index(), -1); - EXPECT_EQ(f.limit_index(), 1); - EXPECT_EQ(f.state(), FragmentBoundaryMatch::INITIAL_STATE); - - // | | - // 0123456789012 - test_input = "...) foo bar"; - EXPECT_TRUE(f.Advance(index, test_input)); - EXPECT_TRUE(f.GotTerminalPunc()); - EXPECT_EQ(f.first_terminal_punc_index(), 0); - EXPECT_EQ(f.first_close_punc_index(), 3); - EXPECT_EQ(f.limit_index(), 3); - EXPECT_EQ(f.state(), FragmentBoundaryMatch::COLLECTING_TERMINAL_PUNC); -} - -TEST(FragmentBoundaryMatchTest, AcronymInSentence) { - FragmentBoundaryMatch f; - // | | - // 0123456789012 - string test_input = "U.S. don't be surprised."; - int index = 0; - EXPECT_TRUE(f.Advance(index, test_input)); - EXPECT_TRUE(f.GotTerminalPunc()); - EXPECT_EQ(f.first_terminal_punc_index(), 0); - EXPECT_EQ(f.first_close_punc_index(), 4); - EXPECT_EQ(f.limit_index(), 4); - EXPECT_EQ(f.state(), FragmentBoundaryMatch::COLLECTING_TERMINAL_PUNC); -} - -TEST(FragmentBoundaryMatchTest, HelloWithEllipsis) { - FragmentBoundaryMatch f; - // || - // 01234567890 - string test_input = "o...foo bar"; - int index = 0; - EXPECT_TRUE(f.Advance(index, test_input)); - EXPECT_FALSE(f.GotTerminalPunc()); - EXPECT_EQ(f.first_terminal_punc_index(), -1); - EXPECT_EQ(f.first_close_punc_index(), -1); - EXPECT_EQ(f.limit_index(), 1); - EXPECT_EQ(f.state(), FragmentBoundaryMatch::INITIAL_STATE); - - // | | - // 0123456789 - test_input = "...foo bar"; - EXPECT_TRUE(f.Advance(index, test_input)); - EXPECT_TRUE(f.GotTerminalPunc()); - EXPECT_EQ(f.first_terminal_punc_index(), 0); - EXPECT_EQ(f.first_close_punc_index(), 3); - EXPECT_EQ(f.limit_index(), 3); - EXPECT_EQ(f.state(), FragmentBoundaryMatch::COLLECTING_TERMINAL_PUNC); -} - -TEST(FragmentBoundaryMatchTest, ThreeStatesWithClosigParen) { - FragmentBoundaryMatch f; - // || - // 0123456789012 - string test_input = "w...) foo bar"; - int index = 0; - EXPECT_TRUE(f.Advance(index, test_input)); - EXPECT_FALSE(f.GotTerminalPunc()); - EXPECT_EQ(f.first_terminal_punc_index(), -1); - EXPECT_EQ(f.first_close_punc_index(), -1); - EXPECT_EQ(f.limit_index(), 1); - EXPECT_EQ(f.state(), FragmentBoundaryMatch::INITIAL_STATE); - - // | | - // 0123456789012 - test_input = "...) foo bar"; - EXPECT_TRUE(f.Advance(index, test_input)); - EXPECT_TRUE(f.GotTerminalPunc()); - EXPECT_EQ(f.first_terminal_punc_index(), 0); - EXPECT_EQ(f.first_close_punc_index(), 3); - EXPECT_EQ(f.limit_index(), 3); - EXPECT_EQ(f.state(), FragmentBoundaryMatch::COLLECTING_TERMINAL_PUNC); - - // || - // 0123456789012 - test_input = ") foo bar"; - EXPECT_TRUE(f.Advance(index, test_input)); - EXPECT_TRUE(f.GotTerminalPunc()); - EXPECT_EQ(f.first_terminal_punc_index(), 0); - EXPECT_EQ(f.first_close_punc_index(), 0); - EXPECT_EQ(f.limit_index(), 1); - EXPECT_EQ(f.state(), FragmentBoundaryMatch::COLLECTING_CLOSE_PUNC); - - // || - // 0123456789012 - test_input = " foo bar"; - EXPECT_FALSE(f.Advance(index, test_input)); - EXPECT_TRUE(f.GotTerminalPunc()); - EXPECT_EQ(f.first_terminal_punc_index(), 0); - EXPECT_EQ(f.first_close_punc_index(), 0); - EXPECT_EQ(f.limit_index(), 1); - EXPECT_EQ(f.state(), FragmentBoundaryMatch::COLLECTING_CLOSE_PUNC); -} - -TEST(FragmentBoundaryMatchTest, NoTransition) { - FragmentBoundaryMatch f; - // | | - // 0123456789012 - string test_input = "...foo bar"; - int index = 0; - EXPECT_TRUE(f.Advance(index, test_input)); - EXPECT_TRUE(f.GotTerminalPunc()); - EXPECT_EQ(f.first_terminal_punc_index(), 0); - EXPECT_EQ(f.first_close_punc_index(), 3); - EXPECT_EQ(f.limit_index(), 3); - EXPECT_EQ(f.state(), FragmentBoundaryMatch::COLLECTING_TERMINAL_PUNC); - - // || - // 0123456789012 - test_input = "foo bar"; - EXPECT_FALSE(f.Advance(index, test_input)); - EXPECT_TRUE(f.GotTerminalPunc()); - EXPECT_EQ(f.first_terminal_punc_index(), 0); - EXPECT_EQ(f.first_close_punc_index(), 3); - EXPECT_EQ(f.limit_index(), 3); - EXPECT_EQ(f.state(), FragmentBoundaryMatch::COLLECTING_TERMINAL_PUNC); -} - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/sentence_fragmenter_v2_tflite.cc b/tensorflow_text/core/kernels/sentence_fragmenter_v2_tflite.cc deleted file mode 100644 index 47cb94d1f..000000000 --- a/tensorflow_text/core/kernels/sentence_fragmenter_v2_tflite.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/sentence_fragmenter_v2_tflite.h" - -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/kernels/shim/tflite_op_shim.h" -#include "tensorflow_text/core/kernels/sentence_fragmenter_v2_kernel_template.h" - -namespace tflite { -namespace ops { -namespace custom { -namespace text { - -extern "C" void AddSentenceFragmenterV2(tflite::MutableOpResolver* resolver) { - tflite::shim::TfLiteOpKernel< - tensorflow::text::SentenceFragmenterV2Op>::Add(resolver); -} - -} // namespace text -} // namespace custom -} // namespace ops -} // namespace tflite diff --git a/tensorflow_text/core/kernels/sentence_fragmenter_v2_tflite.h b/tensorflow_text/core/kernels/sentence_fragmenter_v2_tflite.h index 7f0694eb2..091283d9d 100644 --- a/tensorflow_text/core/kernels/sentence_fragmenter_v2_tflite.h +++ b/tensorflow_text/core/kernels/sentence_fragmenter_v2_tflite.h @@ -15,19 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_FRAGMENTER_V2_TFLITE_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_FRAGMENTER_V2_TFLITE_H_ -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/mutable_op_resolver.h" - -namespace tflite { -namespace ops { -namespace custom { -namespace text { - -extern "C" void AddSentenceFragmenterV2(::tflite::MutableOpResolver* resolver); - -} // namespace text -} // namespace custom -} // namespace ops -} // namespace tflite +#include "tensorflow/core/kernels/text/sentence_fragmenter_v2_tflite.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_FRAGMENTER_V2_TFLITE_H_ diff --git a/tensorflow_text/core/kernels/sentencepiece/BUILD b/tensorflow_text/core/kernels/sentencepiece/BUILD index 833984106..8146ca63e 100644 --- a/tensorflow_text/core/kernels/sentencepiece/BUILD +++ b/tensorflow_text/core/kernels/sentencepiece/BUILD @@ -1,173 +1,173 @@ -# Memorymappable, WASM compilable, implementation of the encoder. -# +"""Sentencepiece kernels for tf.text ops. +All implementation files moved to //third_party/tensorflow/core/kernels/text/sentencepiece. +""" -load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library") load("@rules_cc//cc:cc_library.bzl", "cc_library") -load("@rules_cc//cc:cc_test.bzl", "cc_test") -load("//tensorflow_text:tftext.bzl", "tf_cc_library", "tflite_cc_library") licenses(["notice"]) -# Visibility rules package( default_applicable_licenses = ["//tensorflow_text:license"], + default_compatible_with = ["//buildenv/target:non_prod"], default_visibility = ["//visibility:public"], ) -filegroup( +# Aliases to relocated targets + +ALIAS_NAMES = [ + "testdata", + "config_fbs", + "sp_headers", + "config", + "encoder_config", + "decoder_config", + "double_array_trie_test", + "sentencepiece_tokenizer_kernel", + "sentencepiece_detokenizer_kernel", + "sentencepiece_tokenizer_tflite", + "sentencepiece_detokenizer_tflite", + "optimized_encoder_test", + "optimized_decoder_test", + "macos", + "apple", +] + +alias( name = "testdata", - srcs = [ - "//tensorflow_text:python/ops/test_data/fast_sentencepiece.model", - ], + actual = "@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:testdata", ) -filegroup( +alias( name = "config_fbs", - srcs = ["config.fbs"], + actual = "@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:config_fbs", ) -filegroup( +alias( name = "sp_headers", - srcs = [ - "py_tflite_registerer.h", - ], - visibility = ["//visibility:public"], + actual = "@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:sp_headers", ) -flatbuffer_cc_library( +alias( name = "config", - srcs = [ - "config.fbs", - ], + actual = "@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:config", ) -flatbuffer_cc_library( +alias( name = "encoder_config", - srcs = [ - "encoder_config.fbs", - ], - includes = [":config_fbs"], + actual = "@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:encoder_config", ) -flatbuffer_cc_library( +alias( name = "decoder_config", - srcs = [ - "decoder_config.fbs", - ], - includes = [":config_fbs"], + actual = "@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:decoder_config", +) + +alias( + name = "double_array_trie_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:double_array_trie_test", +) + +alias( + name = "sentencepiece_tokenizer_kernel", + actual = "@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:sentencepiece_tokenizer_kernel", +) + +alias( + name = "sentencepiece_detokenizer_kernel", + actual = "@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:sentencepiece_detokenizer_kernel", +) + +alias( + name = "sentencepiece_tokenizer_tflite", + actual = "@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:sentencepiece_tokenizer_tflite", +) + +alias( + name = "sentencepiece_detokenizer_tflite", + actual = "@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:sentencepiece_detokenizer_tflite", ) +alias( + name = "optimized_encoder_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:optimized_encoder_test", +) + +alias( + name = "optimized_decoder_test", + actual = "@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:optimized_decoder_test", +) + +alias( + name = "macos", + actual = "@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:macos", +) + +alias( + name = "apple", + actual = "@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:apple", +) + +LIBRARY_HEADERS = { + "utils": "utils.h", + "double_array_trie": "double_array_trie.h", + "double_array_trie_builder": "double_array_trie_builder.h", + "sentencepiece_constants": "sentencepiece_constants.h", + "model_converter": "model_converter.h", + "optimized_encoder": "optimized_encoder.h", + "optimized_decoder": "optimized_decoder.h", + "sentencepiece_tokenizer_h": "sentencepiece_tokenizer.h", + "sentencepiece_detokenizer_h": "sentencepiece_detokenizer.h", + "py_tflite_registerer": "py_tflite_registerer.h", +} + cc_library( name = "utils", - srcs = [ - ], - hdrs = [ - "utils.h", - ], + hdrs = ["utils.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:utils"], ) cc_library( name = "double_array_trie", - srcs = [ - ], - hdrs = [ - "double_array_trie.h", - ], - deps = [ - ":config", - ":utils", - ], + hdrs = ["double_array_trie.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:double_array_trie"], ) cc_library( name = "double_array_trie_builder", - srcs = [ - "double_array_trie_builder.cc", - ], - hdrs = [ - "double_array_trie_builder.h", - ], - deps = [ - ":config", - ":utils", - "@darts_clone", - ], -) - -cc_test( - name = "double_array_trie_test", - srcs = [ - "double_array_trie_test.cc", - ], - deps = [ - ":double_array_trie", - ":double_array_trie_builder", - ":encoder_config", - "@com_google_googletest//:gtest_main", - ], + hdrs = ["double_array_trie_builder.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:double_array_trie_builder"], ) cc_library( name = "sentencepiece_constants", - srcs = [], hdrs = ["sentencepiece_constants.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:sentencepiece_constants"], ) cc_library( name = "model_converter", - srcs = [ - "model_converter.cc", - ], - hdrs = [ - "model_converter.h", - ], - deps = [ - ":config", - ":decoder_config", - ":double_array_trie_builder", - ":encoder_config", - ":sentencepiece_constants", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_sentencepiece//:sentencepiece_model_cc_proto", - ], + hdrs = ["model_converter.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:model_converter"], ) cc_library( name = "optimized_encoder", - srcs = [ - "optimized_encoder.cc", - ], hdrs = [ "optimized_encoder.h", ], - deps = [ - ":double_array_trie", - ":encoder_config", - ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:optimized_encoder"], ) cc_library( name = "optimized_decoder", - srcs = [ - "optimized_decoder.cc", - ], - hdrs = [ - "optimized_decoder.h", - ], - deps = [ - "config", - ":decoder_config", - ":double_array_trie", - ], + hdrs = ["optimized_decoder.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:optimized_decoder"], ) cc_library( name = "sentencepiece_tokenizer_h", - hdrs = [ - "sentencepiece_tokenizer.h", - ], + hdrs = ["sentencepiece_tokenizer.h"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:sentencepiece_tokenizer_h"], ) cc_library( @@ -175,194 +175,11 @@ cc_library( hdrs = [ "sentencepiece_detokenizer.h", ], + deps = ["@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:sentencepiece_detokenizer_h"], ) -tf_cc_library( - name = "sentencepiece_tokenizer_kernel", - srcs = ["sentencepiece_tokenizer_kernel.cc"], - tf_deps = [ - # tf:lib tensorflow dep, - # tf:framework tensorflow dep, - ], - deps = [ - ":optimized_encoder", - ":sentencepiece_tokenizer_h", - ], -) - -tf_cc_library( - name = "sentencepiece_detokenizer_kernel", - srcs = ["sentencepiece_detokenizer_kernel.cc"], - tf_deps = [ - # tf:lib tensorflow dep, - # tf:framework tensorflow dep, - ], - deps = [ - ":optimized_decoder", - ":sentencepiece_detokenizer_h", - "@com_google_absl//absl/status", - ], -) - -tflite_cc_library( - name = "sentencepiece_tokenizer_tflite", - srcs = ["sentencepiece_tokenizer_tflite.cc"], - deps = - [ - ":optimized_encoder", - ":sentencepiece_tokenizer_h", - "@flatbuffers", - # lite:framework tensorflow dep, - # lite:string_util tensorflow dep, - # lite/c:common tensorflow dep, - # lite/kernels:kernel_util tensorflow dep, - # lite/kernels/internal:tensor tensorflow dep, - ], -) - -tflite_cc_library( - name = "sentencepiece_detokenizer_tflite", - srcs = ["sentencepiece_detokenizer_tflite.cc"], - deps = - [ - ":optimized_decoder", - ":sentencepiece_detokenizer_h", - "@flatbuffers", - # lite:framework tensorflow dep, - # lite:string_util tensorflow dep, - # lite/c:common tensorflow dep, - # lite/kernels:kernel_util tensorflow dep, - # lite/kernels/internal:tensor tensorflow dep, - ], -) - -cc_test( - name = "optimized_encoder_test", - srcs = [ - "optimized_encoder_test.cc", - ], - data = [ - ":testdata", - ], - deps = [ - ":double_array_trie_builder", - ":encoder_config", - ":model_converter", - ":optimized_encoder", - "//file/base:path", - "//file/localfile", - "@com_google_googletest//:gtest_main", - "@com_google_absl//absl/status", - "@com_google_absl//absl/strings:str_format", - "@com_google_sentencepiece//:sentencepiece_cc_proto", - "@com_google_sentencepiece//:sentencepiece_processor", - # tf:lib tensorflow dep, - # lite/kernels:test_util tensorflow dep, - ], -) - -cc_test( - name = "optimized_decoder_test", - srcs = [ - "optimized_decoder_test.cc", - ], - data = [ - ":testdata", - ], - deps = [ - ":model_converter", - ":optimized_decoder", - "//file/base:path", - "//file/localfile", - "@com_google_googletest//:gtest_main", - "@com_google_absl//absl/strings:str_format", - "@com_google_sentencepiece//:sentencepiece_cc_proto", - "@com_google_sentencepiece//:sentencepiece_processor", - # tf:lib tensorflow dep, - # lite/kernels:test_util tensorflow dep, - ], -) - -tflite_cc_library( +cc_library( name = "py_tflite_registerer", - srcs = ["py_tflite_registerer.cc"], hdrs = ["py_tflite_registerer.h"], - deps = [ - ":sentencepiece_detokenizer_tflite", - ":sentencepiece_tokenizer_tflite", - # lite:framework tensorflow dep, - # lite/kernels:builtin_ops tensorflow dep, - ], - alwayslink = 1, -) - -config_setting( - name = "armeabi_v7a_and_fastbuild", - constraint_values = ["//third_party/bazel_platforms/cpu:armv7"], - values = { - "compilation_mode": "fastbuild", - }, - visibility = ["//visibility:public"], -) - -config_setting( - name = "armeabi_v7a_and_dbg", - constraint_values = ["//third_party/bazel_platforms/cpu:armv7"], - values = { - "compilation_mode": "dbg", - }, - visibility = ["//visibility:public"], -) - -config_setting( - name = "android", - values = {"crosstool_top": "//external:android/crosstool"}, - visibility = ["//visibility:public"], -) - -config_setting( - name = "macos_i386", - values = { - "apple_platform_type": "macos", - "cpu": "darwin", - }, - visibility = ["//visibility:public"], -) - -config_setting( - name = "macos_x86_64", - values = { - "apple_platform_type": "macos", - "cpu": "darwin_x86_64", - }, - visibility = ["//visibility:public"], -) - -alias( - name = "macos", - actual = select({ - ":macos_i386": ":macos_i386", - ":macos_x86_64": ":macos_x86_64", - "//conditions:default": ":macos_i386", # Arbitrarily chosen from above. - }), - visibility = ["//visibility:public"], -) - -config_setting( - name = "ios", - values = { - "crosstool_top": "@bazel_tools//tools/cpp:toolchain", - "apple_platform_type": "ios", - }, - visibility = ["//visibility:public"], -) - -alias( - name = "apple", - actual = select({ - ":macos": ":macos", - ":ios": ":ios", - "//conditions:default": ":ios", # Arbitrarily chosen from above. - }), - visibility = ["//visibility:public"], + deps = ["@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:py_tflite_registerer"], ) diff --git a/tensorflow_text/core/kernels/sentencepiece/config.fbs b/tensorflow_text/core/kernels/sentencepiece/config.fbs deleted file mode 100644 index 4b9cc9a81..000000000 --- a/tensorflow_text/core/kernels/sentencepiece/config.fbs +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -namespace tensorflow.text.sentencepiece; - -table Trie { - nodes: [uint32]; -} - - -enum EncoderVersion: byte { - SENTENCE_PIECE = 0, -} diff --git a/tensorflow_text/core/kernels/sentencepiece/decoder_config.fbs b/tensorflow_text/core/kernels/sentencepiece/decoder_config.fbs deleted file mode 100644 index bcb787dbd..000000000 --- a/tensorflow_text/core/kernels/sentencepiece/decoder_config.fbs +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -include "config.fbs"; - -namespace tensorflow.text.sentencepiece; - - -table DecoderConfig { - version: EncoderVersion = SENTENCE_PIECE; - - // The offset for encoding, usually used when codes with low codes are reserved - // for some special needs. - encoding_offset: int32; - - // A vector of strings that represent sentencepieces. - decode_pieces: [string]; - - // TODO(mgubin): Currently is not populated, haven't seen any Sentencepiece - // model with a denormalizer. - denormalized_prefixes: Trie; - denormalized_replacements: [byte]; - - // During encoding a dummy prefix (a whitespace) can be added to the input string, - // if this flag is true, this prefix will be removed. - remove_dummy_prefix: bool; - -} - - -root_type DecoderConfig; diff --git a/tensorflow_text/core/kernels/sentencepiece/double_array_trie.h b/tensorflow_text/core/kernels/sentencepiece/double_array_trie.h index 0599cb641..a8ac801c2 100644 --- a/tensorflow_text/core/kernels/sentencepiece/double_array_trie.h +++ b/tensorflow_text/core/kernels/sentencepiece/double_array_trie.h @@ -12,121 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +#ifndef TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_DOUBLE_ARRAY_TRIE_H_ +#define TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_DOUBLE_ARRAY_TRIE_H_ -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at +#include "tensorflow/core/kernels/text/sentencepiece/double_array_trie.h" - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_DOUBLE_ARRAY_TRIE_H_ -#define TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_DOUBLE_ARRAY_TRIE_H_ - -#include <functional> -#include <vector> - -#include "tensorflow_text/core/kernels/sentencepiece/config_generated.h" -#include "tensorflow_text/core/kernels/sentencepiece/utils.h" - -namespace tensorflow { -namespace text { -namespace sentencepiece { - -// A trie node specifies a node in the tree, either an intermediate node or -// a leaf node. -// A leaf node contains the id as an int of the string match. This id is encoded -// in the lower 31 bits, thus the number of distinct ids is 2^31. -// An intermediate node has an associated label and an offset to its children. -// The label is encoded in the least significant byte and must match the input -// character during matching. - -// A memory mappable trie, compatible with Darts::DoubleArray. -class DoubleArrayTrie { - public: - struct Match { - Match() {} - Match(int id, int match_length) : id(id), match_length(match_length) {} - int id = -1; - int match_length = -1; - bool empty() const { return match_length == -1; } - bool operator==(const Match& m) const { - return m.id == id && m.match_length == match_length; - } - }; - - // nodes and nodes_length specify the array of the nodes of the trie. - explicit DoubleArrayTrie(const flatbuffers::Vector<uint32_t>* nodes) - : nodes_(nodes) {} - - // Finds matches that are prefixes of a string. - template <typename callback> - void IteratePrefixMatches(const utils::string_view& input, - callback update_fn) const; - - // Finds the longest prefix match of a string. - Match LongestPrefixMatch(const utils::string_view& input) const { - Match match; - IteratePrefixMatches(input, [&match](const Match& m) { match = m; }); - return match; - } - - private: - // Returns whether a node as a leaf as a child. - bool has_leaf(uint32_t i) const { return ((*nodes_)[i]) & 0x100; } - - // Returns a value associated with a node. Available when a node is a leaf. - int value(uint32_t i) const { - return static_cast<int>(((*nodes_)[i]) & 0x7fffffff); - } - - // Returns a label associated with a node. - // A leaf node will have the MSB set and thus return an invalid label. - int32_t label(uint32_t i) const { return ((*nodes_)[i]) & 0x800000ff; } - - // Returns offset to children. - int32_t offset(uint32_t i) const { - const uint32_t node = (*nodes_)[i]; - return (node >> 10) << ((node & 0x200) >> 6); - } - - const flatbuffers::Vector<uint32_t>* nodes_; -}; - -template <typename callback> -void DoubleArrayTrie::IteratePrefixMatches(const utils::string_view& input, - callback update_fn) const { - if (nodes_->size() == 0) { - return; - } - uint32_t pos = offset(0); - for (int i = 0; i < input.length(); ++i) { - pos ^= static_cast<unsigned char>(input.at(i)); - if (pos < 0 || pos >= nodes_->size() || label(pos) != input.at(i)) { - // No match, exit. - return; - } - const bool node_has_leaf = has_leaf(pos); - pos ^= offset(pos); - if (pos < 0 || pos >= nodes_->size()) { - // We can get here only if the structure is corrupted. - return; - } - if (node_has_leaf) { - update_fn(Match(value(pos), i + 1)); - } - } -} - -} // namespace sentencepiece -} // namespace text -} // namespace tensorflow - -#endif // TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_DOUBLE_ARRAY_TRIE_H_ +#endif // TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_DOUBLE_ARRAY_TRIE_H_ diff --git a/tensorflow_text/core/kernels/sentencepiece/double_array_trie_builder.cc b/tensorflow_text/core/kernels/sentencepiece/double_array_trie_builder.cc deleted file mode 100644 index 7e5bdae64..000000000 --- a/tensorflow_text/core/kernels/sentencepiece/double_array_trie_builder.cc +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow_text/core/kernels/sentencepiece/double_array_trie_builder.h" - -#include <algorithm> -#include <memory> - -#include "include/darts.h" - -namespace tensorflow { -namespace text { -namespace sentencepiece { - -std::vector<uint32_t> BuildTrie(const std::vector<std::string>& data) { - std::vector<int> ids; - ids.reserve(data.size()); - for (int i = 0; i < data.size(); ++i) { - ids.push_back(i); - } - return BuildTrie(data, ids); -} - -std::vector<uint32_t> BuildTrie(const std::vector<std::string>& data, - const std::vector<int>& ids) { - // We make strong assumptions about binary structure of trie. - struct OneElement { - OneElement(const std::string* key_, int index_) - : key(key_), index(index_) {} - const std::string* key; - int index; - bool operator<(const OneElement& el) const { return *key < *el.key; } - }; - std::vector<OneElement> elements; - elements.reserve(data.size()); - auto data_iterator = std::begin(data); - auto ids_iterator = std::begin(ids); - for (; data_iterator != std::end(data) && ids_iterator != std::end(ids); - ++data_iterator, ++ids_iterator) { - elements.emplace_back(&(*data_iterator), *ids_iterator); - } - // Sort by keys. - std::sort(elements.begin(), elements.end()); - - // Create vectors to build the trie. - std::vector<const char*> strings; - std::vector<int32_t> indexes; - strings.reserve(data.size()); - indexes.reserve(data.size()); - for (const auto& el : elements) { - strings.push_back(el.key->c_str()); - indexes.push_back(el.index); - } - auto trie = std::make_unique<Darts::DoubleArray>(); - trie->build(data.size(), const_cast<char**>(&strings[0]), nullptr, - &indexes[0]); - // We make strong assumptions about internal Darts trie structure: - // - it is a vector of 32 bit signed integers - // - the "array" is the only one structure that contains all information about - // the trie. - const uint32_t* trie_data = static_cast<const uint32_t*>(trie->array()); - return std::vector<uint32_t>(trie_data, trie_data + trie->size()); -} - -} // namespace sentencepiece -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/sentencepiece/double_array_trie_builder.h b/tensorflow_text/core/kernels/sentencepiece/double_array_trie_builder.h index 1e585f99a..a3c444398 100644 --- a/tensorflow_text/core/kernels/sentencepiece/double_array_trie_builder.h +++ b/tensorflow_text/core/kernels/sentencepiece/double_array_trie_builder.h @@ -12,42 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +#ifndef TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_DOUBLE_ARRAY_TRIE_BUILDER_H_ +#define TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_DOUBLE_ARRAY_TRIE_BUILDER_H_ -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at +#include "tensorflow/core/kernels/text/sentencepiece/double_array_trie_builder.h" - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_DOUBLE_ARRAY_TRIE_BUILDER_H_ -#define TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_DOUBLE_ARRAY_TRIE_BUILDER_H_ - -#include <string> -#include <vector> - -#include "tensorflow_text/core/kernels/sentencepiece/config_generated.h" -#include "tensorflow_text/core/kernels/sentencepiece/utils.h" - -namespace tensorflow { -namespace text { -namespace sentencepiece { - -std::vector<uint32_t> BuildTrie(const std::vector<std::string>& data, - const std::vector<int>& ids); - -// A variant where ids are indexes in data. -std::vector<uint32_t> BuildTrie(const std::vector<std::string>& data); - -} // namespace sentencepiece -} // namespace text -} // namespace tensorflow - -#endif // TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_DOUBLE_ARRAY_TRIE_BUILDER_H_ +#endif // TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_DOUBLE_ARRAY_TRIE_BUILDER_H_ diff --git a/tensorflow_text/core/kernels/sentencepiece/double_array_trie_test.cc b/tensorflow_text/core/kernels/sentencepiece/double_array_trie_test.cc deleted file mode 100644 index 118a0573a..000000000 --- a/tensorflow_text/core/kernels/sentencepiece/double_array_trie_test.cc +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow_text/core/kernels/sentencepiece/double_array_trie.h" - -#include <gmock/gmock.h> -#include <gtest/gtest.h> -#include "tensorflow_text/core/kernels/sentencepiece/double_array_trie_builder.h" -#include "tensorflow_text/core/kernels/sentencepiece/encoder_config_generated.h" - -namespace tensorflow { -namespace text { -namespace sentencepiece { - -TEST(DoubleArrayTrieTest, Match) { - flatbuffers::FlatBufferBuilder builder(1024); - const std::vector<std::string> test_strings = {"A", "AAX", "AA", "B"}; - const auto trie_vector = builder.CreateVector(BuildTrie(test_strings)); - TrieBuilder trie_builder(builder); - trie_builder.add_nodes(trie_vector); - const auto pieces = trie_builder.Finish(); - EncoderConfigBuilder ecb(builder); - ecb.add_pieces(pieces); - FinishEncoderConfigBuffer(builder, ecb.Finish()); - const EncoderConfig* config = GetEncoderConfig(builder.GetBufferPointer()); - DoubleArrayTrie dat(config->pieces()->nodes()); - EXPECT_EQ(dat.LongestPrefixMatch(utils::string_view("AAL")), - DoubleArrayTrie::Match(2, 2)); - - std::vector<DoubleArrayTrie::Match> matches; - dat.IteratePrefixMatches( - utils::string_view("AAXL"), - [&matches](const DoubleArrayTrie::Match& m) { matches.push_back(m); }); - EXPECT_THAT(matches, testing::ElementsAre(DoubleArrayTrie::Match(0, 1), - DoubleArrayTrie::Match(2, 2), - DoubleArrayTrie::Match(1, 3))); -} - -TEST(DoubleArrayTrieTest, ComplexMatch) { - flatbuffers::FlatBufferBuilder builder(1024); - const std::vector<std::string> test_strings = {"\xe2\x96\x81the", ",", "s", - "\xe2\x96\x81Hello"}; - const std::vector<int> test_ids = {0, 5, 10, 15}; - const auto trie_vector = - builder.CreateVector(BuildTrie(test_strings, test_ids)); - TrieBuilder trie_builder(builder); - trie_builder.add_nodes(trie_vector); - const auto pieces = trie_builder.Finish(); - EncoderConfigBuilder ecb(builder); - ecb.add_pieces(pieces); - FinishEncoderConfigBuffer(builder, ecb.Finish()); - const EncoderConfig* config = GetEncoderConfig(builder.GetBufferPointer()); - DoubleArrayTrie dat(config->pieces()->nodes()); - - std::vector<DoubleArrayTrie::Match> matches; - dat.IteratePrefixMatches( - utils::string_view("\xe2\x96\x81Hello"), - [&matches](const DoubleArrayTrie::Match& m) { matches.push_back(m); }); - EXPECT_THAT(matches, testing::ElementsAre(DoubleArrayTrie::Match(15, 8))); -} - -} // namespace sentencepiece -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/sentencepiece/encoder_config.fbs b/tensorflow_text/core/kernels/sentencepiece/encoder_config.fbs deleted file mode 100644 index 1c98ddde1..000000000 --- a/tensorflow_text/core/kernels/sentencepiece/encoder_config.fbs +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2020 The TensorFlow Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -include "config.fbs"; - -namespace tensorflow.text.sentencepiece; - -table EncoderConfig { - // Version of the encoder. - version: EncoderVersion = SENTENCE_PIECE; - start_code: int32 = 0; - end_code: int32 = 0; - - unknown_code: int32 = -1; - // Weight of "unknown code" when encoding. "Penalty" because it usually has a - // big negative weight,less than any other sentencepiece. - unknown_penalty: float = 0; - - // The offset for encoding, usually used when codes with low codes are reserved - // for some special needs. - encoding_offset: int32; - - // String pieces for encoding. - pieces: Trie; - pieces_scores: [float]; - - // Normalization related parameters. - remove_extra_whitespaces: bool; - - // Add a whitespace prefix before encoding. - add_dummy_prefix: bool; - - // Escape whitespaces during encoding so the decoder can restore them exactly as - // in the input. - escape_whitespaces: bool; - - // Normalization parameters. - normalized_prefixes: Trie; - normalized_replacements: [byte]; -} - -root_type EncoderConfig; diff --git a/tensorflow_text/core/kernels/sentencepiece/model_converter.cc b/tensorflow_text/core/kernels/sentencepiece/model_converter.cc deleted file mode 100644 index bdaaff375..000000000 --- a/tensorflow_text/core/kernels/sentencepiece/model_converter.cc +++ /dev/null @@ -1,216 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow_text/core/kernels/sentencepiece/model_converter.h" -#include <tuple> - -#include "absl/status/status.h" -#include "absl/strings/str_replace.h" -#include "src/sentencepiece_model.pb.h" -#include "tensorflow_text/core/kernels/sentencepiece/decoder_config_generated.h" -#include "tensorflow_text/core/kernels/sentencepiece/double_array_trie_builder.h" -#include "tensorflow_text/core/kernels/sentencepiece/encoder_config_generated.h" -#include "tensorflow_text/core/kernels/sentencepiece/sentencepiece_constants.h" - -namespace tensorflow { -namespace text { -namespace sentencepiece { - -std::tuple<std::vector<uint32_t>, std::vector<int8_t>> -DecodePrecompiledCharsmap( - const ::sentencepiece::NormalizerSpec& normalizer_spec) { - // This function "undoes" encoding done by - // sentencepiece::normalizer::Normalizer::EncodePrecompiledCharsMap. - if (normalizer_spec.precompiled_charsmap().empty()) { - return std::make_tuple(std::vector<uint32_t>(), std::vector<int8_t>()); - } - const char* precompiled_map = normalizer_spec.precompiled_charsmap().data(); - const uint32_t trie_size = - *reinterpret_cast<const uint32_t*>(precompiled_map); - const uint32_t* trie_ptr = - reinterpret_cast<const uint32_t*>(precompiled_map + sizeof(uint32_t)); - const int8_t* normalized_ptr = reinterpret_cast<const int8_t*>( - precompiled_map + sizeof(uint32_t) + trie_size); - const int normalized_size = normalizer_spec.precompiled_charsmap().length() - - sizeof(uint32_t) - trie_size; - return std::make_tuple( - std::vector<uint32_t>(trie_ptr, trie_ptr + trie_size / sizeof(uint32_t)), - std::vector<int8_t>(normalized_ptr, normalized_ptr + normalized_size)); -} - -absl::StatusOr<std::string> ConvertSentencepieceModelToFlatBuffer( - const std::string& model_config_str, int encoding_offset) { - ::sentencepiece::ModelProto model_config; - if (!model_config.ParseFromString(model_config_str)) { - return absl::InvalidArgumentError( - "Invalid configuration, can't parse SentencePiece model config " + - model_config.InitializationErrorString()); - } - // Convert sentencepieces. - std::vector<std::string> pieces; - pieces.reserve(model_config.pieces_size()); - std::vector<float> scores; - scores.reserve(model_config.pieces_size()); - std::vector<int> ids; - ids.reserve(model_config.pieces_size()); - float min_score = 0.0; - int index = 0; - for (const auto& piece : model_config.pieces()) { - switch (piece.type()) { - case ::sentencepiece::ModelProto::SentencePiece::NORMAL: - case ::sentencepiece::ModelProto::SentencePiece::USER_DEFINED: - pieces.push_back(piece.piece()); - ids.push_back(index); - if (piece.score() < min_score) { - min_score = piece.score(); - } - break; - case ::sentencepiece::ModelProto::SentencePiece::UNKNOWN: - case ::sentencepiece::ModelProto::SentencePiece::CONTROL: - case ::sentencepiece::ModelProto::SentencePiece::BYTE: - // Ignore unknown and control codes. - break; - default: - return absl::InvalidArgumentError("Invalid SentencePiece piece type " + - piece.piece()); - } - scores.push_back(piece.score()); - ++index; - } - flatbuffers::FlatBufferBuilder builder(1024); - const auto pieces_trie_vector = builder.CreateVector(BuildTrie(pieces, ids)); - const auto pieces_score_vector = builder.CreateVector(scores); - TrieBuilder pieces_trie_builder(builder); - pieces_trie_builder.add_nodes(pieces_trie_vector); - const auto pieces_trie_fbs = pieces_trie_builder.Finish(); - - // Converting normalization. - const auto normalization = - DecodePrecompiledCharsmap(model_config.normalizer_spec()); - const auto normalization_trie = std::get<0>(normalization); - const auto normalization_strings = std::get<1>(normalization); - const auto normalization_trie_vector = - builder.CreateVector(normalization_trie); - TrieBuilder normalization_trie_builder(builder); - normalization_trie_builder.add_nodes(normalization_trie_vector); - const auto normalization_trie_fbs = normalization_trie_builder.Finish(); - const auto normalization_strings_fbs = - builder.CreateVector(normalization_strings); - - EncoderConfigBuilder ecb(builder); - ecb.add_version(EncoderVersion::EncoderVersion_SENTENCE_PIECE); - ecb.add_start_code(model_config.trainer_spec().bos_id()); - ecb.add_end_code(model_config.trainer_spec().eos_id()); - ecb.add_unknown_code(model_config.trainer_spec().unk_id()); - ecb.add_unknown_penalty(min_score - kUnkPenalty); - ecb.add_encoding_offset(encoding_offset); - ecb.add_pieces(pieces_trie_fbs); - ecb.add_pieces_scores(pieces_score_vector); - ecb.add_remove_extra_whitespaces( - model_config.normalizer_spec().remove_extra_whitespaces()); - ecb.add_add_dummy_prefix(model_config.normalizer_spec().add_dummy_prefix()); - ecb.add_escape_whitespaces( - model_config.normalizer_spec().escape_whitespaces()); - ecb.add_normalized_prefixes(normalization_trie_fbs); - ecb.add_normalized_replacements(normalization_strings_fbs); - FinishEncoderConfigBuffer(builder, ecb.Finish()); - return std::string(reinterpret_cast<const char*>(builder.GetBufferPointer()), - builder.GetSize()); -} - -absl::StatusOr<std::string> -ConvertSentencepieceModelToFlatBufferForDecoder( - const std::string& model_config_str, int encoding_offset) { - ::sentencepiece::ModelProto model_config; - if (!model_config.ParseFromString(model_config_str)) { - return absl::InvalidArgumentError( - "Invalid configuration, can't parse SentencePiece model config " + - model_config.InitializationErrorString()); - } - flatbuffers::FlatBufferBuilder builder(1024); - // Collect sentencepieces. - std::vector<std::string> pieces; - for (const auto& piece : model_config.pieces()) { - // In the original library all pieces processing is done during decoding. - // Because it is independent from context or parameters we can do it in - // advance here. - switch (piece.type()) { - case ::sentencepiece::ModelProto::SentencePiece::NORMAL: - case ::sentencepiece::ModelProto::SentencePiece::USER_DEFINED: - pieces.push_back( - absl::StrReplaceAll(piece.piece(), {{kSpaceSymbol, " "}})); - break; - case ::sentencepiece::ModelProto::SentencePiece::UNKNOWN: - pieces.push_back( - kDefaultUnknownSymbol); // Always decode with the default unknown. - break; - default: - pieces.push_back(""); - } - } - const auto pieces_fbs = builder.CreateVectorOfStrings(pieces); - DecoderConfigBuilder decb(builder); - - decb.add_version(EncoderVersion::EncoderVersion_SENTENCE_PIECE); - decb.add_encoding_offset(encoding_offset); - decb.add_decode_pieces(pieces_fbs); - decb.add_remove_dummy_prefix( - model_config.normalizer_spec().add_dummy_prefix()); - - FinishDecoderConfigBuffer(builder, decb.Finish()); - return std::string(reinterpret_cast<const char*>(builder.GetBufferPointer()), - builder.GetSize()); -} - -int GetVocabularySize(const std::string& model_string) { - const EncoderConfig* config = GetEncoderConfig(model_string.data()); - return config->pieces_scores()->size() + config->encoding_offset(); -} - -std::string ConvertSentencepieceModel(const std::string& model_string) { - const auto result = ConvertSentencepieceModelToFlatBuffer(model_string); - // TODO(mgubin): Propogate error to the Python code and throw correct - // exception. - assert(result.status().ok()); - return result.value(); -} - -std::string ConvertSentencepieceModelForDecoder( - const std::string& model_string) { - const auto result = - ConvertSentencepieceModelToFlatBufferForDecoder(model_string); - // TODO(mgubin): Propogate error to the Python code and throw correct - // exception. - assert(result.status().ok()); - return result.value(); -} - -} // namespace sentencepiece -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/sentencepiece/model_converter.h b/tensorflow_text/core/kernels/sentencepiece/model_converter.h index 716e989b4..faea9f55d 100644 --- a/tensorflow_text/core/kernels/sentencepiece/model_converter.h +++ b/tensorflow_text/core/kernels/sentencepiece/model_converter.h @@ -12,53 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +#ifndef TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_MODEL_CONVERTER_H_ +#define TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_MODEL_CONVERTER_H_ -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at +#include "tensorflow/core/kernels/text/sentencepiece/model_converter.h" - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_MODEL_CONVERTER_H_ -#define TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_MODEL_CONVERTER_H_ -#include <string> - -#include "absl/status/statusor.h" - -namespace tensorflow { -namespace text { -namespace sentencepiece { - -// Converts Sentencepiece configuration to flatbuffer format. -// encoding_offset is used by some encoders that combine different encodings. -absl::StatusOr<std::string> ConvertSentencepieceModelToFlatBuffer( - const std::string& model_config_str, int encoding_offset = 0); - -// Converts Sentencepiece configuration to flatbuffer format for encoder. -// encoding_offset is used by some encoders that combine different encodings. -absl::StatusOr<std::string> -ConvertSentencepieceModelToFlatBufferForDecoder( - const std::string& model_config_str, int encoding_offset = 0); - -// The functions that are provided for the Python wrapper. -std::string ConvertSentencepieceModel(const std::string& model_string); -std::string ConvertSentencepieceModelForDecoder( - const std::string& model_string); - -// Returns size of a vocabulary from Sentencepiece configuration in flatbuffer -// format. -int GetVocabularySize(const std::string& model_string); - -} // namespace sentencepiece -} // namespace text -} // namespace tensorflow - -#endif // TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_MODEL_CONVERTER_H_ +#endif // TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_MODEL_CONVERTER_H_ diff --git a/tensorflow_text/core/kernels/sentencepiece/native.bzl b/tensorflow_text/core/kernels/sentencepiece/native.bzl deleted file mode 100644 index 0d0d2184d..000000000 --- a/tensorflow_text/core/kernels/sentencepiece/native.bzl +++ /dev/null @@ -1,89 +0,0 @@ -"""Build definitions supporting platform-independent native build.""" - -load("//third_party/bazel_skylib/lib:selects.bzl", "selects") -load("//third_party/tensorflow:tensorflow.bzl", "tf_copts", "tf_opts_nortti_if_android") - -def micore_if(android, ios = [], default = []): - """Helper to create a select. - - Args: - android: what to return if compiling for Android. - ios: what to return if compiling for iOS. - default: what to return otherwise. - Returns: - the `android` list for Android compilation and the - `default` list otherwise. - """ - return select({ - "//tools/cc_target_os:android": android, - "//tools/cc_target_os:apple": ios, - "//conditions:default": default, - }) - -def micore_tf_copts(): - """C options for Tensorflow builds. - - Returns: - a list of copts which must be used by each cc_library which - refers to Tensorflow. Enables the library to compile both for - Android and for Google3. - """ - return tf_copts(android_optimization_level_override = None) + tf_opts_nortti_if_android() + [ - "-Wno-narrowing", - "-Wno-sign-compare", - "-Wno-overloaded-virtual", - ] + micore_if( - android = [ - # Set a define so Tensorflow's register_types.h - # adopts to support a rich set of types, to be pruned by - # selective registration. - "-DSUPPORT_SELECTIVE_REGISTRATION", - # Selective registration uses constexprs with recursive - # string comparisons; that can lead to compiler errors, so - # we increase the constexpr recursion depth. - "-fconstexpr-depth=1024", - ], - ) + selects.with_or({ - # If building for armeabi-v7a, and if compilation_mode is 'fastbuild' - # or 'dbg' then forcefully add -Oz to the list compiler options. - # Without it, some TF dependencies can't build (b/112286436). If - # compilation_mode is 'opt' then rely on the toolchain default. - ( - "//intelligence/micore/tools/build:armeabi_v7a_and_fastbuild", - "//intelligence/micore/tools/build:armeabi_v7a_and_dbg", - ): ["-Oz"], - "//conditions:default": [], - }) - -def micore_tf_deps(): - """Dependencies for Tensorflow builds. - - Returns: - list of dependencies which must be used by each cc_library - which refers to Tensorflow. Enables the library to compile both for - Android and for Google3. Use this macro instead of directly - declaring dependencies on Tensorflow. - """ - return micore_if( - android = [ - # Link to library which does not contain any ops. - # tf:portable_tensorflow_lib_lite tensorflow dep, - "//third_party/gemmlowp:eight_bit_int_gemm", - "//third_party/fft2d", - ], - ios = [ - # tf:portable_tensorflow_lib tensorflow dep, - "//third_party/gemmlowp:eight_bit_int_gemm", - "//third_party/fft2d", - ], - default = [ - # Standard references for Tensorflow when building for non-mobile, plain Google3. We use - # an indirection via the alias targets below, to facilitate whitelisting these deps in - # the mobile license presubmit checks. - "//intelligence/micore/tools/build:tensorflow_core_cpu", - "//intelligence/micore/tools/build:tensorflow_core_framework", - "//intelligence/micore/tools/build:tensorflow_core_lib", - "//intelligence/micore/tools/build:tensorflow_core_protos_all_cc", - "//intelligence/micore/tools/build:tensorflow_core_tensorflow", - ], - ) diff --git a/tensorflow_text/core/kernels/sentencepiece/native.bzl.oss b/tensorflow_text/core/kernels/sentencepiece/native.bzl.oss deleted file mode 100644 index c12530abf..000000000 --- a/tensorflow_text/core/kernels/sentencepiece/native.bzl.oss +++ /dev/null @@ -1,87 +0,0 @@ -"""Build definitions supporting platform-independent native build.""" - -load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_copts", "tf_opts_nortti_if_android") -load("@bazel_skylib//lib:selects.bzl", "selects") - -def micore_if(android, ios = [], default = []): - """Helper to create a select. - - Args: - android: what to return if compiling for Android. - ios: what to return if compiling for iOS. - default: what to return otherwise. - Returns: - the `android` list for Android compilation and the - `default` list otherwise. - """ - return select({ - ":android": android, - ":apple": ios, - "//conditions:default": default, - }) - -def micore_tf_copts(): - """C options for Tensorflow builds. - - Returns: - a list of copts which must be used by each cc_library which - refers to Tensorflow. Enables the library to compile both for - Android and for Linux. - """ - return tf_copts(android_optimization_level_override = None) + tf_opts_nortti_if_android() + [ - "-Wno-narrowing", - "-Wno-sign-compare", - "-Wno-overloaded-virtual", - ] + micore_if( - android = [ - # Set a define so Tensorflow's register_types.h - # adopts to support a rich set of types, to be pruned by - # selective registration. - "-DSUPPORT_SELECTIVE_REGISTRATION", - # Selective registration uses constexprs with recursive - # string comparisons; that can lead to compiler errors, so - # we increase the constexpr recursion depth. - "-fconstexpr-depth=1024", - ], - ) + selects.with_or({ - # If building for armeabi-v7a, and if compilation_mode is 'fastbuild' - # or 'dbg' then forcefully add -Oz to the list compiler options. - # Without it, some TF dependencies can't build (b/112286436). If - # compilation_mode is 'opt' then rely on the toolchain default. - ( - ":armeabi_v7a_and_fastbuild", - ":armeabi_v7a_and_dbg", - ): ["-Oz"], - "//conditions:default": [], - }) - -def micore_tf_deps(): - """Dependencies for Tensorflow builds. - - Returns: - list of dependencies which must be used by each cc_library - which refers to Tensorflow. Enables the library to compile both for - Android and for Linux. Use this macro instead of directly - declaring dependencies on Tensorflow. - """ - return micore_if( - android = [ - # Link to library which does not contain any ops. - "@org_tensorflow//tensorflow/core:portable_tensorflow_lib_lite", - "@gemmlowp//:eight_bit_int_gemm", - "@fft2d//:fft2d", - ], - ios = [ - "@org_tensorflow//tensorflow/core:portable_tensorflow_lib", - "@gemmlowp//:eight_bit_int_gemm", - "@fft2d//:fft2d", - ], - default = [ - # Standard references for Tensorflow when building for Linux. We use - # an indirection via the alias targets below, to facilitate whitelisting - # these deps in the mobile license presubmit checks. - "@release_or_nightly//:tensorflow_libtensorflow_framework", - "@release_or_nightly//:tensorflow_tf_header_lib", - ], - - ) diff --git a/tensorflow_text/core/kernels/sentencepiece/optimized_decoder.cc b/tensorflow_text/core/kernels/sentencepiece/optimized_decoder.cc deleted file mode 100644 index 397349c58..000000000 --- a/tensorflow_text/core/kernels/sentencepiece/optimized_decoder.cc +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow_text/core/kernels/sentencepiece/optimized_decoder.h" - -#include <string> -#include <tuple> - -#include "tensorflow_text/core/kernels/sentencepiece/decoder_config_generated.h" -#include "tensorflow_text/core/kernels/sentencepiece/double_array_trie.h" - -namespace tensorflow { -namespace text { -namespace sentencepiece { - -DecoderResult DecodeString(const std::vector<int>& encoded, - const void* config_buffer) { - DecoderResult result; - - // Get the config from the buffer. - const DecoderConfig* config = GetDecoderConfig(config_buffer); - if (config->version() != EncoderVersion::EncoderVersion_SENTENCE_PIECE) { - result.type = DecoderResultType::WRONG_CONFIG; - return result; - } - bool remove_dummy_prefix = config->remove_dummy_prefix(); - const auto config_pieces = config->decode_pieces(); - for (const auto code : encoded) { - const int real_code = code - config->encoding_offset(); - if (real_code >= config_pieces->size()) { - result.type = DecoderResultType::INVALID_INPUT; - return result; - } - const auto& piece_text = config_pieces->GetAsString(real_code); - const char* piece_str = piece_text->c_str(); - if (remove_dummy_prefix && *piece_str == ' ') { - ++piece_str; - } - result.decoded.append(piece_str); - remove_dummy_prefix = false; - } - // TODO(mgubin): Denormalize the string, haven't seen any Sentencepiece model - // with a denormalizer. - return result; -} - -} // namespace sentencepiece -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/sentencepiece/optimized_decoder.h b/tensorflow_text/core/kernels/sentencepiece/optimized_decoder.h index 8513d8e06..50d1fa4c5 100644 --- a/tensorflow_text/core/kernels/sentencepiece/optimized_decoder.h +++ b/tensorflow_text/core/kernels/sentencepiece/optimized_decoder.h @@ -12,51 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +#ifndef TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_OPTIMIZED_DECODER_H_ +#define TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_OPTIMIZED_DECODER_H_ -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at +#include "tensorflow/core/kernels/text/sentencepiece/optimized_decoder.h" - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_OPTIMIZED_DECODER_H_ -#define TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_OPTIMIZED_DECODER_H_ - -// Sentencepiece decoder optimized with memmapped model. - -#include <string> -#include <vector> - -namespace tensorflow { -namespace text { -namespace sentencepiece { - -enum class DecoderResultType { - SUCCESS = 0, - WRONG_CONFIG = 1, - INVALID_INPUT = 2 -}; - -struct DecoderResult { - DecoderResultType type = DecoderResultType::SUCCESS; - std::string decoded; -}; - -// Decodes one string from a vector of id. Takes the configuration as a -// type-erased buffer. -DecoderResult DecodeString(const std::vector<int>& encoded, - const void* config_buffer); - -} // namespace sentencepiece -} // namespace text -} // namespace tensorflow - -#endif // TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_OPTIMIZED_DECODER_H_ +#endif // TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_OPTIMIZED_DECODER_H_ diff --git a/tensorflow_text/core/kernels/sentencepiece/optimized_decoder_test.cc b/tensorflow_text/core/kernels/sentencepiece/optimized_decoder_test.cc deleted file mode 100644 index 600941e76..000000000 --- a/tensorflow_text/core/kernels/sentencepiece/optimized_decoder_test.cc +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow_text/core/kernels/sentencepiece/optimized_decoder.h" - -#include <fstream> - -#include "file/base/path.h" -#include <gmock/gmock.h> -#include <gtest/gtest.h> -#include "absl/strings/str_format.h" -#include "src/sentencepiece.proto.h" -#include "src/sentencepiece_processor.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow_text/core/kernels/sentencepiece/model_converter.h" - -namespace tensorflow { -namespace text { -namespace sentencepiece { - -namespace internal { - -absl::Status TFReadFileToString(const std::string& filepath, - std::string* data) { - return tensorflow::ReadFileToString(tensorflow::Env::Default(), filepath, - data); -} - -absl::Status StdReadFileToString(const std::string& filepath, - std::string* data) { - std::ifstream infile(filepath); - if (!infile.is_open()) { - return absl::NotFoundError( - absl::StrFormat("Error when opening %s", filepath)); - } - std::string contents((std::istreambuf_iterator<char>(infile)), - (std::istreambuf_iterator<char>())); - data->append(contents); - infile.close(); - return absl::OkStatus(); -} - -} // namespace internal - -namespace { - -static char kConfigFilePath[] = - "/tensorflow_text/python/ops/test_data/" - "fast_sentencepiece.model"; - -TEST(OptimizedEncoder, ConfigConverter) { - std::string config; - - auto status = internal::TFReadFileToString( - file::JoinPath(::testing::SrcDir(), kConfigFilePath), &config); - ASSERT_TRUE(status.ok()); - - ::sentencepiece::SentencePieceProcessor processor; - ASSERT_TRUE(processor.LoadFromSerializedProto(config).ok()); - const auto converted_model = ConvertSentencepieceModelForDecoder(config); - const std::string test_string("Hello world!\\xF0\\x9F\\x8D\\x95"); - ::sentencepiece::SentencePieceText reference_encoded; - ASSERT_TRUE(processor.Encode(test_string, &reference_encoded).ok()); - - std::vector<int> encoded_vector; - encoded_vector.reserve(reference_encoded.pieces_size()); - for (const auto& piece : reference_encoded.pieces()) { - encoded_vector.push_back(piece.id()); - } - std::string ref_decoded; - ASSERT_TRUE(processor.Decode(encoded_vector, &ref_decoded).ok()); - const auto decoded = DecodeString(encoded_vector, converted_model.data()); - ASSERT_EQ(decoded.type, DecoderResultType::SUCCESS); - ASSERT_EQ(ref_decoded, decoded.decoded); -} -} // namespace - -} // namespace sentencepiece -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/sentencepiece/optimized_encoder.cc b/tensorflow_text/core/kernels/sentencepiece/optimized_encoder.cc deleted file mode 100644 index fee0a6ef2..000000000 --- a/tensorflow_text/core/kernels/sentencepiece/optimized_encoder.cc +++ /dev/null @@ -1,259 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow_text/core/kernels/sentencepiece/optimized_encoder.h" - -#include <algorithm> -#include <tuple> - -#include "tensorflow_text/core/kernels/sentencepiece/double_array_trie.h" -#include "tensorflow_text/core/kernels/sentencepiece/encoder_config_generated.h" - -namespace tensorflow { -namespace text { -namespace sentencepiece { -namespace { - -const char kSpaceSymbol[] = "\xe2\x96\x81"; - -template <typename processing_callback> -std::tuple<std::string, std::vector<int>> process_string( - const std::string& input, const std::vector<int>& offsets, - const processing_callback& pc) { - std::string result_string; - result_string.reserve(input.size()); - std::vector<int> result_offsets; - result_offsets.reserve(offsets.size()); - for (int i = 0, j = 0; i < input.size();) { - auto result = pc(input.data() + i, input.size() - i); - auto consumed = std::get<0>(result); - auto new_string = std::get<1>(result); - if (consumed == 0) { - // Skip the current byte and move forward. - result_string.push_back(input[i]); - result_offsets.push_back(offsets[j]); - i++; - j++; - continue; - } - result_string.append(new_string.data(), new_string.length()); - for (int i = 0; i < new_string.length(); ++i) { - result_offsets.push_back(offsets[j]); - } - j += consumed; - i += consumed; - } - return std::make_tuple(result_string, result_offsets); -} - -inline char is_whitespace(char c) { - return c == ' ' || c == '\t' || c == '\r' || c == '\n'; -} - -std::tuple<int, utils::string_view> remove_extra_whitespaces(const char* data, - int len) { - if (len == 0 || !is_whitespace(*data)) { - return std::make_tuple(0, utils::string_view(nullptr, 0)); - } - int num_consumed = 1; - for (; num_consumed < len && is_whitespace(data[num_consumed]); - ++num_consumed) { - } - return num_consumed > 1 - ? std::make_tuple(num_consumed, utils::string_view(" ", 1)) - : std::make_tuple(0, utils::string_view(nullptr, 0)); -} - -std::tuple<int, utils::string_view> find_replacement( - const char* data, int len, const DoubleArrayTrie& dat, - const flatbuffers::Vector<int8_t>& replacements) { - const auto max_match = dat.LongestPrefixMatch(utils::string_view(data, len)); - if (!max_match.empty()) { - if (max_match.id < 0 || max_match.id >= replacements.size()) { - return std::make_tuple(0, utils::string_view(nullptr, 0)); - } - // Because flatbuffer byte is signed char which is not the same as char, - // there is the reinterpret_cast here. - const char* replaced_string_ptr = - reinterpret_cast<const char*>(replacements.data() + max_match.id); - return std::make_tuple(max_match.match_length, - utils::string_view(replaced_string_ptr)); - } - return std::make_tuple(0, utils::string_view(nullptr, 0)); -} -} // namespace - -std::tuple<std::string, std::vector<int>> NormalizeString( - const std::string& in_string, const EncoderConfig& config) { - std::vector<int> output_offsets; - std::string result = in_string; - output_offsets.reserve(in_string.length()); - for (int i = 0; i < in_string.length(); ++i) { - output_offsets.push_back(i); - } - if (in_string.empty()) { - return std::make_tuple(result, output_offsets); - } - if (config.add_dummy_prefix()) { - result.insert(result.begin(), ' '); - output_offsets.insert(output_offsets.begin(), 0); - } - // Greedely replace normalized_prefixes with normalized_replacements - if (config.normalized_prefixes() != nullptr && - config.normalized_replacements() != nullptr) { - const DoubleArrayTrie normalized_prefixes_matcher( - config.normalized_prefixes()->nodes()); - const auto norm_replace = [&config, &normalized_prefixes_matcher]( - const char* data, int len) { - return find_replacement(data, len, normalized_prefixes_matcher, - *config.normalized_replacements()); - }; - std::tie(result, output_offsets) = - process_string(result, output_offsets, norm_replace); - } - if (config.remove_extra_whitespaces()) { - std::tie(result, output_offsets) = - process_string(result, output_offsets, remove_extra_whitespaces); - if (!result.empty() && is_whitespace(result.back())) { - result.pop_back(); - output_offsets.pop_back(); - } - } - if (config.escape_whitespaces()) { - const auto replace_whitespaces = [](const char* data, int len) { - if (len > 0 && is_whitespace(*data)) { - return std::make_tuple(1, utils::string_view(kSpaceSymbol)); - } - return std::make_tuple(0, utils::string_view(nullptr, 0)); - }; - std::tie(result, output_offsets) = - process_string(result, output_offsets, replace_whitespaces); - } - - return std::make_tuple(result, output_offsets); -} - -EncoderResult EncodeNormalizedString(const std::string& str, - const std::vector<int>& offsets, - const EncoderConfig& config, bool add_bos, - bool add_eos, bool reverse) { - const DoubleArrayTrie piece_matcher(config.pieces()->nodes()); - const flatbuffers::Vector<float>* piece_scores = config.pieces_scores(); - const int unknown_code = config.unknown_code(); - const float unknown_penalty = config.unknown_penalty(); - struct LatticeElement { - float score = 0; - int code = -1; - int prev_position = -1; - LatticeElement(float score_, int code_, int prev_position_) - : score(score_), code(code_), prev_position(prev_position_) {} - LatticeElement() {} - }; - const int length = str.length(); - std::vector<LatticeElement> lattice(length + 1); - for (int i = 0; i < length; ++i) { - if (i > 0 && lattice[i].prev_position < 0) { - // This state is unreachable. - continue; - } - if (unknown_code >= 0) { - // Put unknown code. - const float penalized_score = lattice[i].score + unknown_penalty; - const int pos = i + 1; - LatticeElement& current_element = lattice[pos]; - if (current_element.prev_position < 0 || - current_element.score < penalized_score) { - current_element = LatticeElement( - penalized_score, unknown_code, - // If the current state is already reached by unknown code, merge - // states. - lattice[i].code == unknown_code ? lattice[i].prev_position : i); - } - } - auto lattice_update = [&lattice, i, - piece_scores](const DoubleArrayTrie::Match& m) { - if (m.id < 0 || m.id >= piece_scores->size()) { - return; - } - LatticeElement& target_element = lattice[i + m.match_length]; - const float score = lattice[i].score + (*piece_scores)[m.id]; - if (target_element.prev_position < 0 || target_element.score < score) { - target_element = LatticeElement(score, m.id, i); - } - }; - piece_matcher.IteratePrefixMatches( - utils::string_view(str.data() + i, length - i), lattice_update); - } - - EncoderResult result; - if (add_eos) { - result.codes.push_back(config.end_code()); - result.offsets.push_back(length); - } - if (lattice[length].prev_position >= 0) { - for (int pos = length; pos > 0;) { - auto code = lattice[pos].code; - if (code != config.unknown_code()) { - code += config.encoding_offset(); - } - result.codes.push_back(code); - pos = lattice[pos].prev_position; - result.offsets.push_back(offsets[pos]); - } - } - if (add_bos) { - result.codes.push_back(config.start_code()); - result.offsets.push_back(0); - } - if (!reverse) { - std::reverse(result.codes.begin(), result.codes.end()); - std::reverse(result.offsets.begin(), result.offsets.end()); - } - return result; -} - -EncoderResult EncodeString(const std::string& string, const void* config_buffer, - bool add_bos, bool add_eos, bool reverse) { - // Get the config from the buffer. - const EncoderConfig* config = GetEncoderConfig(config_buffer); - if (config->version() != EncoderVersion::EncoderVersion_SENTENCE_PIECE) { - EncoderResult result; - result.type = EncoderResultType::WRONG_CONFIG; - return result; - } - std::string normalized_string; - std::vector<int> offsets; - std::tie(normalized_string, offsets) = NormalizeString(string, *config); - return EncodeNormalizedString(normalized_string, offsets, *config, add_bos, - add_eos, reverse); -} - -} // namespace sentencepiece -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/sentencepiece/optimized_encoder.h b/tensorflow_text/core/kernels/sentencepiece/optimized_encoder.h index d1ca949a6..324219aa7 100644 --- a/tensorflow_text/core/kernels/sentencepiece/optimized_encoder.h +++ b/tensorflow_text/core/kernels/sentencepiece/optimized_encoder.h @@ -12,53 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +#ifndef TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_OPTIMIZED_ENCODER_H_ +#define TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_OPTIMIZED_ENCODER_H_ -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at +#include "tensorflow/core/kernels/text/sentencepiece/optimized_encoder.h" - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_OPTIMIZED_ENCODER_H_ -#define TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_OPTIMIZED_ENCODER_H_ - -// Sentencepiece encoder optimized with memmapped model. - -#include <string> -#include <tuple> -#include <vector> - -#include "tensorflow_text/core/kernels/sentencepiece/encoder_config_generated.h" - -namespace tensorflow { -namespace text { -namespace sentencepiece { - -enum class EncoderResultType { SUCCESS = 0, WRONG_CONFIG = 1 }; - -struct EncoderResult { - EncoderResultType type = EncoderResultType::SUCCESS; - std::vector<int> codes; - std::vector<int> offsets; -}; -std::tuple<std::string, std::vector<int>> NormalizeString( - const std::string& in_string, const EncoderConfig& config); - -// Encodes one string and returns ids and offsets. Takes the configuration as a -// type-erased buffer. -EncoderResult EncodeString(const std::string& string, const void* config_buffer, - bool add_bos, bool add_eos, bool reverse); - -} // namespace sentencepiece -} // namespace text -} // namespace tensorflow - -#endif // TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_OPTIMIZED_ENCODER_H_ +#endif // TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_OPTIMIZED_ENCODER_H_ diff --git a/tensorflow_text/core/kernels/sentencepiece/optimized_encoder_test.cc b/tensorflow_text/core/kernels/sentencepiece/optimized_encoder_test.cc deleted file mode 100644 index ecab756f4..000000000 --- a/tensorflow_text/core/kernels/sentencepiece/optimized_encoder_test.cc +++ /dev/null @@ -1,187 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow_text/core/kernels/sentencepiece/optimized_encoder.h" - -#include <fstream> - -#include "file/base/path.h" -#include <gmock/gmock.h> -#include <gtest/gtest.h> -#include "absl/status/status.h" -#include "absl/strings/str_format.h" -#include "src/sentencepiece.proto.h" -#include "src/sentencepiece_processor.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow_text/core/kernels/sentencepiece/double_array_trie_builder.h" -#include "tensorflow_text/core/kernels/sentencepiece/encoder_config_generated.h" -#include "tensorflow_text/core/kernels/sentencepiece/model_converter.h" - -namespace tensorflow { -namespace text { -namespace sentencepiece { - -namespace internal { - -absl::Status TFReadFileToString(const std::string& filepath, - std::string* data) { - return tensorflow::ReadFileToString(tensorflow::Env::Default(), filepath, - data); -} - -absl::Status StdReadFileToString(const std::string& filepath, - std::string* data) { - std::ifstream infile(filepath); - if (!infile.is_open()) { - return absl::NotFoundError( - absl::StrFormat("Error when opening %s", filepath)); - } - std::string contents((std::istreambuf_iterator<char>(infile)), - (std::istreambuf_iterator<char>())); - data->append(contents); - infile.close(); - return absl::OkStatus(); -} -} // namespace internal - -namespace { - -static char kConfigFilePath[] = - "/tensorflow_text/python/ops/test_data/" - "fast_sentencepiece.model"; - -TEST(OptimizedEncoder, NormalizeStringWhitestpaces) { - flatbuffers::FlatBufferBuilder builder(1024); - EncoderConfigBuilder ecb(builder); - ecb.add_remove_extra_whitespaces(true); - ecb.add_add_dummy_prefix(true); - ecb.add_escape_whitespaces(true); - FinishEncoderConfigBuffer(builder, ecb.Finish()); - const EncoderConfig* config = GetEncoderConfig(builder.GetBufferPointer()); - { - const auto result = NormalizeString("x y", *config); - const auto res_string = std::get<0>(result); - const auto offsets = std::get<1>(result); - EXPECT_EQ(res_string, "\xe2\x96\x81x\xe2\x96\x81y"); - EXPECT_THAT(offsets, ::testing::ElementsAre(0, 0, 0, 0, 1, 1, 1, 3)); - } - { - const auto result = NormalizeString("\tx y\n", *config); - const auto res_string = std::get<0>(result); - const auto offsets = std::get<1>(result); - EXPECT_EQ(res_string, "\xe2\x96\x81x\xe2\x96\x81y"); - EXPECT_THAT(offsets, ::testing::ElementsAre(0, 0, 0, 1, 2, 2, 2, 4)); - } -} - -TEST(OptimizedEncoder, NormalizeStringReplacement) { - flatbuffers::FlatBufferBuilder builder(1024); - const std::vector<std::string> norm_prefixes = {"A", "AA", "AAA", "AAAA"}; - const char norm_replacements[] = "A1\0A2\0A3\0A4"; - const auto trie_vector = - builder.CreateVector(BuildTrie(norm_prefixes, {0, 3, 6, 9})); - const auto norm_r = builder.CreateVector<int8_t>( - reinterpret_cast<const signed char*>(norm_replacements), - sizeof(norm_replacements)); - TrieBuilder trie_builder(builder); - trie_builder.add_nodes(trie_vector); - const auto norm_p = trie_builder.Finish(); - EncoderConfigBuilder ecb(builder); - ecb.add_remove_extra_whitespaces(false); - ecb.add_normalized_prefixes(norm_p); - ecb.add_normalized_replacements(norm_r); - FinishEncoderConfigBuffer(builder, ecb.Finish()); - const EncoderConfig* config = GetEncoderConfig(builder.GetBufferPointer()); - { - const auto result = NormalizeString("ABAABAAABAAAA", *config); - const auto res_string = std::get<0>(result); - const auto offsets = std::get<1>(result); - EXPECT_EQ(res_string, "A1BA2BA3BA4"); - EXPECT_THAT(offsets, - ::testing::ElementsAre(0, 0, 1, 2, 2, 4, 5, 5, 8, 9, 9)); - } -} - -TEST(OptimizedEncoder, NormalizeStringWhitespacesRemove) { - flatbuffers::FlatBufferBuilder builder(1024); - const std::vector<std::string> norm_prefixes = {"A", "AA", "AAA", "AAAA", - "X"}; - const char norm_replacements[] = "A1\0A2\0A3\0A4\0 "; - const auto trie_vector = - builder.CreateVector(BuildTrie(norm_prefixes, {0, 3, 6, 9, 12})); - const auto norm_r = builder.CreateVector<int8_t>( - reinterpret_cast<const signed char*>(norm_replacements), - sizeof(norm_replacements)); - TrieBuilder trie_builder(builder); - trie_builder.add_nodes(trie_vector); - const auto norm_p = trie_builder.Finish(); - EncoderConfigBuilder ecb(builder); - ecb.add_remove_extra_whitespaces(true); - ecb.add_normalized_prefixes(norm_p); - ecb.add_normalized_replacements(norm_r); - FinishEncoderConfigBuffer(builder, ecb.Finish()); - const EncoderConfig* config = GetEncoderConfig(builder.GetBufferPointer()); - { - const auto result = NormalizeString("XXABAABAAABAAAA", *config); - const auto res_string = std::get<0>(result); - const auto offsets = std::get<1>(result); - EXPECT_EQ(res_string, " A1BA2BA3BA4"); - EXPECT_THAT(offsets, - ::testing::ElementsAre(0, 2, 2, 3, 4, 4, 6, 7, 7, 10, 11, 11)); - } -} - -TEST(OptimizedEncoder, ConfigConverter) { - std::string config; - auto status = internal::TFReadFileToString( - file::JoinPath(::testing::SrcDir(), kConfigFilePath), &config); - ASSERT_TRUE(status.ok()); - - ::sentencepiece::SentencePieceProcessor processor; - ASSERT_TRUE(processor.LoadFromSerializedProto(config).ok()); - const auto converted_model = ConvertSentencepieceModel(config); - const std::string test_string("Hello world!\\xF0\\x9F\\x8D\\x95"); - const auto encoded = - EncodeString(test_string, converted_model.data(), false, false, false); - ASSERT_EQ(encoded.codes.size(), encoded.offsets.size()); - - ::sentencepiece::SentencePieceText reference_encoded; - ASSERT_TRUE(processor.Encode(test_string, &reference_encoded).ok()); - EXPECT_EQ(encoded.codes.size(), reference_encoded.pieces_size()); - for (int i = 0; i < encoded.codes.size(); ++i) { - EXPECT_EQ(encoded.codes[i], reference_encoded.pieces(i).id()); - EXPECT_EQ(encoded.offsets[i], reference_encoded.pieces(i).begin()); - } -} - -} // namespace -} // namespace sentencepiece -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/sentencepiece/py_tflite_registerer.cc b/tensorflow_text/core/kernels/sentencepiece/py_tflite_registerer.cc deleted file mode 100644 index e5ae73622..000000000 --- a/tensorflow_text/core/kernels/sentencepiece/py_tflite_registerer.cc +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow_text/core/kernels/sentencepiece/py_tflite_registerer.h" - -namespace tflite { -namespace ops { -namespace custom { -TfLiteRegistration* Register_FAST_SENTENCEPIECE_TOKENIZER(); -TfLiteRegistration* Register_FAST_SENTENCEPIECE_DETOKENIZER(); - -namespace text { - -extern "C" void AddFastSentencepieceTokenize( - tflite::MutableOpResolver* resolver) { - resolver->AddCustom( - "TFText>FastSentencepieceTokenize", - ::tflite::ops::custom::Register_FAST_SENTENCEPIECE_TOKENIZER()); -} - -extern "C" void AddFastSentencepieceDetokenize( - tflite::MutableOpResolver* resolver) { - resolver->AddCustom( - "TFText>FastSentencepieceDetokenize", - ::tflite::ops::custom::Register_FAST_SENTENCEPIECE_DETOKENIZER()); -} - -} // namespace text -} // namespace custom -} // namespace ops -} // namespace tflite diff --git a/tensorflow_text/core/kernels/sentencepiece/py_tflite_registerer.h b/tensorflow_text/core/kernels/sentencepiece/py_tflite_registerer.h index 6a64a6b7a..4a0eb6c7e 100644 --- a/tensorflow_text/core/kernels/sentencepiece/py_tflite_registerer.h +++ b/tensorflow_text/core/kernels/sentencepiece/py_tflite_registerer.h @@ -12,40 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +#ifndef TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_PY_TFLITE_REGISTERER_H_ +#define TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_PY_TFLITE_REGISTERER_H_ -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at +#include "tensorflow/core/kernels/text/sentencepiece/py_tflite_registerer.h" - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_PY_TFLITE_REGISTERER_H_ -#define TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_PY_TFLITE_REGISTERER_H_ -#include "tensorflow/lite/mutable_op_resolver.h" - -namespace tflite { -namespace ops { -namespace custom { -namespace text { - -// C-function that is called from the Python Wrapper. -extern "C" void AddFastSentencepieceTokenize( - tflite::MutableOpResolver *resolver); - -extern "C" void AddFastSentencepieceDetokenize( - tflite::MutableOpResolver *resolver); - -} // namespace text -} // namespace custom -} // namespace ops -} // namespace tflite - -#endif // TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_PY_TFLITE_REGISTERER_H_ +#endif // TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_PY_TFLITE_REGISTERER_H_ diff --git a/tensorflow_text/core/kernels/sentencepiece/sentencepiece_constants.h b/tensorflow_text/core/kernels/sentencepiece/sentencepiece_constants.h index f0d95d1c8..cf3917f2c 100644 --- a/tensorflow_text/core/kernels/sentencepiece/sentencepiece_constants.h +++ b/tensorflow_text/core/kernels/sentencepiece/sentencepiece_constants.h @@ -12,44 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +#ifndef TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_SENTENCEPIECE_CONSTANTS_H_ +#define TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_SENTENCEPIECE_CONSTANTS_H_ -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at +#include "tensorflow/core/kernels/text/sentencepiece/sentencepiece_constants.h" - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_SENTENCEPIECE_CONSTANTS_H_ -#define TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_SENTENCEPIECE_CONSTANTS_H_ - -namespace tensorflow { -namespace text { -namespace sentencepiece { - -// The constant is copied from -// https://github.com/google/sentencepiece/blob/master/src/unigram_model.cc -constexpr float kUnkPenalty = 10.0; - -// These constants are copied from -// https://github.com/google/sentencepiece/blob/master/src/sentencepiece_processor.cc -// -// Replaces white space with U+2581 (LOWER ONE EIGHT BLOCK). -constexpr char kSpaceSymbol[] = "\xe2\x96\x81"; - -// Encodes <unk> into U+2047 (DOUBLE QUESTION MARK), -// since this character can be useful both for user and -// developer. We can easily figure out that <unk> is emitted. -constexpr char kDefaultUnknownSymbol[] = " \xE2\x81\x87 "; - -} // namespace sentencepiece -} // namespace text -} // namespace tensorflow - -#endif // TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_SENTENCEPIECE_CONSTANTS_H_ +#endif // TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_SENTENCEPIECE_CONSTANTS_H_ diff --git a/tensorflow_text/core/kernels/sentencepiece/sentencepiece_detokenizer.h b/tensorflow_text/core/kernels/sentencepiece/sentencepiece_detokenizer.h index 24b41fc8c..898ba8b3e 100644 --- a/tensorflow_text/core/kernels/sentencepiece/sentencepiece_detokenizer.h +++ b/tensorflow_text/core/kernels/sentencepiece/sentencepiece_detokenizer.h @@ -12,34 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +#ifndef TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_SENTENCEPIECE_DETOKENIZER_H_ +#define TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_SENTENCEPIECE_DETOKENIZER_H_ -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at +#include "tensorflow/core/kernels/text/sentencepiece/sentencepiece_detokenizer.h" - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_SENTENCEPIECE_DETOKENIZER_H_ -#define TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_SENTENCEPIECE_DETOKENIZER_H_ - -// Constants are shared between TF and TFLite SentencepieceTokenizer kernels. -namespace tensorflow { -namespace text { -constexpr int kSPModelIndex = 0; -constexpr int kInputIndex = 1; -constexpr int kInputSplits = 2; -constexpr int kAddBOSInput = 4; -constexpr int kAddEOSInput = 5; -constexpr int kReverseInput = 6; -} // namespace text -} // namespace tensorflow - -#endif // TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_SENTENCEPIECE_DETOKENIZER_H_ +#endif // TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_SENTENCEPIECE_DETOKENIZER_H_ diff --git a/tensorflow_text/core/kernels/sentencepiece/sentencepiece_detokenizer_kernel.cc b/tensorflow_text/core/kernels/sentencepiece/sentencepiece_detokenizer_kernel.cc deleted file mode 100644 index dc100f7a5..000000000 --- a/tensorflow_text/core/kernels/sentencepiece/sentencepiece_detokenizer_kernel.cc +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/platform/errors.h" -#include "tensorflow/core/framework/op_requires.h" -#include "absl/status/status.h" -#include "tensorflow_text/core/kernels/sentencepiece/optimized_decoder.h" -#include "tensorflow_text/core/kernels/sentencepiece/sentencepiece_detokenizer.h" - -namespace tensorflow { -namespace text { - -template <typename Tsplits> -class TFSentencepieceDetokenizerOp : public tensorflow::OpKernel { - public: - explicit TFSentencepieceDetokenizerOp(tensorflow::OpKernelConstruction* ctx) - : OpKernel(ctx) {} - void Compute(tensorflow::OpKernelContext* ctx) override { - const auto& model_tensor = ctx->input(kSPModelIndex); - const auto& input_values_tensor = ctx->input(kInputIndex); - const auto input_values_flat = - input_values_tensor.flat<tensorflow::int32>(); - const auto& input_splits_tensor = ctx->input(kInputSplits); - const auto input_splits_flat = input_splits_tensor.flat<Tsplits>(); - OP_REQUIRES(ctx, input_splits_flat.size() > 0, - absl::InvalidArgumentError( - "input_splits must have at least 1 element.")); - const int num_of_sentences = input_splits_flat.size() - 1; - Tensor* output_tensor = nullptr; - OP_REQUIRES_OK(ctx, - ctx->allocate_output(0, {num_of_sentences}, &output_tensor)); - auto output_flat = output_tensor->flat<tensorflow::tstring>(); - std::vector<int> codes_for_split; - int input_offset = 0; - for (int i = 0; i < num_of_sentences; i++) { - // Create a vector of int32 from input according to spans. - const int split_size = input_splits_flat(i + 1) - input_splits_flat(i); - OP_REQUIRES( - ctx, - split_size >= 0 && - (input_offset + split_size) <= input_values_flat.size(), - absl::InvalidArgumentError("input_splits must be monotonically " - "non-decreasing and within bounds.")); - codes_for_split.clear(); - codes_for_split.reserve(split_size); - for (int j = 0; j < split_size; ++j) { - codes_for_split.push_back(input_values_flat(input_offset++)); - } - const auto res = sentencepiece::DecodeString( - codes_for_split, model_tensor.data()); - OP_REQUIRES(ctx, res.type == sentencepiece::DecoderResultType::SUCCESS, - absl::Status(static_cast<absl::StatusCode>( - absl::StatusCode::kInternal), - "Sentencepiece conversion failed")); - output_flat(i) = res.decoded; - } - } -}; -} // namespace text -} // namespace tensorflow - -REGISTER_KERNEL_BUILDER( - Name("TFText>FastSentencepieceDetokenize") - .Device(tensorflow::DEVICE_CPU) - .TypeConstraint<tensorflow::int32>("Tsplits"), - tensorflow::text::TFSentencepieceDetokenizerOp<tensorflow::int32>); -REGISTER_KERNEL_BUILDER( - Name("TFText>FastSentencepieceDetokenize") - .Device(tensorflow::DEVICE_CPU) - .TypeConstraint<tensorflow::int64>("Tsplits"), - tensorflow::text::TFSentencepieceDetokenizerOp<tensorflow::int64>); diff --git a/tensorflow_text/core/kernels/sentencepiece/sentencepiece_detokenizer_tflite.cc b/tensorflow_text/core/kernels/sentencepiece/sentencepiece_detokenizer_tflite.cc deleted file mode 100644 index 3f8f6df4d..000000000 --- a/tensorflow_text/core/kernels/sentencepiece/sentencepiece_detokenizer_tflite.cc +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Copyright 2020 The TensorFlow Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -/** - * Sentencepiece tflite detokenizer implementation. - */ -#include <algorithm> -#include <iterator> - -#include "flatbuffers/flexbuffers.h" -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/context.h" -#include "tensorflow/lite/kernels/internal/tensor.h" -#include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/string_util.h" -#include "tensorflow_text/core/kernels/sentencepiece/optimized_decoder.h" -#include "tensorflow_text/core/kernels/sentencepiece/sentencepiece_detokenizer.h" - -namespace tflite { -namespace ops { -namespace custom { -namespace text { -namespace sentencepiece { -namespace detokenizer { - -constexpr int kOutputValuesInd = 0; -// Initializes text encoder object from serialized parameters. -void* Initialize(TfLiteContext* /*context*/, const char* /*buffer*/, - size_t /*length*/) { - return nullptr; -} -void Free(TfLiteContext* /*context*/, void* /*buffer*/) {} - -TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - // TODO(mgubin): Add checks for input and output tensors. - TfLiteTensor& output_values = - context->tensors[node->outputs->data[kOutputValuesInd]]; - SetTensorToDynamic(&output_values); - // TODO(mgubin): Check input types. - - return kTfLiteOk; -} - -TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - const TfLiteTensor& model_tensor = - context->tensors[node->inputs->data[tensorflow::text::kSPModelIndex]]; - const auto model_buffer_data = model_tensor.data.data; - const TfLiteTensor& input_encoded = - context->tensors[node->inputs->data[tensorflow::text::kInputIndex]]; - const int32_t* input_encoded_data = input_encoded.data.i32; - const TfLiteTensor& input_splits = - context->tensors[node->inputs->data[tensorflow::text::kInputSplits]]; - const int num_of_sentences = NumElements(input_splits.dims) - 1; - const int32_t* input_splits_data = input_splits.data.i32; - - DynamicBuffer buf; - - std::vector<int> codes_for_split; - int input_offset = 0; - for (int i = 0; i < num_of_sentences; i++) { - // Create a vector of int32 from input according to spans. - const int split_size = input_splits_data[i + 1] - input_splits_data[i]; - TF_LITE_ENSURE_MSG( - context, - split_size >= 0 && - (input_offset + split_size) <= NumElements(input_encoded.dims), - "input_splits must be monotonically non-decreasing and " - "within bounds."); - codes_for_split.clear(); - std::copy(input_encoded_data + input_offset, - input_encoded_data + input_offset + split_size, - std::back_inserter(codes_for_split)); - const auto res = tensorflow::text::sentencepiece::DecodeString( - codes_for_split, model_buffer_data); - TF_LITE_ENSURE_MSG( - context, - res.type == tensorflow::text::sentencepiece::DecoderResultType::SUCCESS, - "Sentencepiece decoding failed"); - buf.AddString(res.decoded.data(), res.decoded.length()); - input_offset += split_size; - } - TfLiteTensor& output_values = - context->tensors[node->outputs->data[kOutputValuesInd]]; - buf.WriteToTensor(&output_values, nullptr); - return kTfLiteOk; -} -} // namespace detokenizer -} // namespace sentencepiece -} // namespace text - -TfLiteRegistration* Register_FAST_SENTENCEPIECE_DETOKENIZER() { - static TfLiteRegistration r = { - text::sentencepiece::detokenizer::Initialize, - text::sentencepiece::detokenizer::Free, - text::sentencepiece::detokenizer::Prepare, - text::sentencepiece::detokenizer::Eval}; - return &r; -} - -} // namespace custom -} // namespace ops -} // namespace tflite diff --git a/tensorflow_text/core/kernels/sentencepiece/sentencepiece_tokenizer.h b/tensorflow_text/core/kernels/sentencepiece/sentencepiece_tokenizer.h index 8ce03f69b..423fda05f 100644 --- a/tensorflow_text/core/kernels/sentencepiece/sentencepiece_tokenizer.h +++ b/tensorflow_text/core/kernels/sentencepiece/sentencepiece_tokenizer.h @@ -12,34 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +#ifndef TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_SENTENCEPIECE_TOKENIZER_H_ +#define TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_SENTENCEPIECE_TOKENIZER_H_ -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at +#include "tensorflow/core/kernels/text/sentencepiece/sentencepiece_tokenizer.h" - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_SENTENCEPIECE_TOKENIZER_H_ -#define TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_SENTENCEPIECE_TOKENIZER_H_ - -// Constants are shared between TF and TFLite SentencepieceTokenizer kernels. -namespace tensorflow { -namespace text { - -constexpr int kSPModelIndex = 0; -constexpr int kInputIndex = 1; -constexpr int kAddBOSInput = 4; -constexpr int kAddEOSInput = 5; -constexpr int kReverseInput = 6; -} // namespace text -} // namespace tensorflow - -#endif // TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_SENTENCEPIECE_TOKENIZER_H_ +#endif // TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_SENTENCEPIECE_TOKENIZER_H_ diff --git a/tensorflow_text/core/kernels/sentencepiece/sentencepiece_tokenizer_kernel.cc b/tensorflow_text/core/kernels/sentencepiece/sentencepiece_tokenizer_kernel.cc deleted file mode 100644 index 22a5beaf6..000000000 --- a/tensorflow_text/core/kernels/sentencepiece/sentencepiece_tokenizer_kernel.cc +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include <cstdint> -#include <iterator> -#include <limits> -#include <vector> - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/platform/errors.h" -#include "tensorflow_text/core/kernels/sentencepiece/optimized_encoder.h" -#include "tensorflow_text/core/kernels/sentencepiece/sentencepiece_tokenizer.h" - -namespace tensorflow { -namespace text{ - -class TFSentencepieceOp : public tensorflow::OpKernel { - public: - explicit TFSentencepieceOp(tensorflow::OpKernelConstruction* ctx) - : OpKernel(ctx) {} - void Compute(tensorflow::OpKernelContext* ctx) override { - const auto& model_tensor = ctx->input(kSPModelIndex); - const auto& input_values_tensor = ctx->input(kInputIndex); - const auto input_values_flat = - input_values_tensor.flat<tensorflow::tstring>(); - const int64_t num_of_input_values = input_values_flat.size(); - - const auto& add_bos_tensor = ctx->input(kAddBOSInput); - const bool add_bos = add_bos_tensor.scalar<bool>()(); - const auto& add_eos_tensor = ctx->input(kAddEOSInput); - const bool add_eos = add_eos_tensor.scalar<bool>()(); - const auto& reverse_tensor = ctx->input(kReverseInput); - const bool reverse = reverse_tensor.scalar<bool>()(); - - std::vector<int32> encoded; - std::vector<int32> splits; - for (int i = 0; i < num_of_input_values; ++i) { - const auto res = sentencepiece::EncodeString( - input_values_flat(i), model_tensor.data(), add_bos, add_eos, reverse); - OP_REQUIRES(ctx, res.type == sentencepiece::EncoderResultType::SUCCESS, - absl::Status(static_cast<absl::StatusCode>( - absl::StatusCode::kInternal), - "Sentencepiece conversion failed")); - std::copy(res.codes.begin(), res.codes.end(), - std::back_inserter(encoded)); - splits.emplace_back(encoded.size()); - } - tensorflow::Tensor* output_values_tensor = nullptr; - tensorflow::Tensor* output_splits_tensor = nullptr; - OP_REQUIRES(ctx, encoded.size() < std::numeric_limits<int32_t>::max(), - errors::InvalidArgument( - "Encoded input must contain less than 2^31 characters.")); - OP_REQUIRES( - ctx, splits.size() + 1 < std::numeric_limits<int32_t>::max(), - errors::InvalidArgument("Splits tensor is limited to 2^31-1 values.")); - OP_REQUIRES_OK( - ctx, ctx->allocate_output(0, {static_cast<int32_t>(encoded.size())}, - &output_values_tensor)); - OP_REQUIRES_OK( - ctx, ctx->allocate_output(1, {static_cast<int32_t>(splits.size()) + 1}, - &output_splits_tensor)); - - auto values_tensor_flat = output_values_tensor->vec<int32>(); - auto splits_tensor_flat = output_splits_tensor->vec<int32>(); - for (int32_t i = 0; i < encoded.size(); ++i) { - values_tensor_flat(i) = encoded[i]; - } - splits_tensor_flat(0) = 0; - for (int32_t i = 0; i < splits.size(); ++i) { - splits_tensor_flat(i + 1) = splits[i]; - } - } -}; - -} // namespace text -} // namespace tensorflow -REGISTER_KERNEL_BUILDER( - Name("TFText>FastSentencepieceTokenize").Device(tensorflow::DEVICE_CPU), - tensorflow::text::TFSentencepieceOp); diff --git a/tensorflow_text/core/kernels/sentencepiece/sentencepiece_tokenizer_tflite.cc b/tensorflow_text/core/kernels/sentencepiece/sentencepiece_tokenizer_tflite.cc deleted file mode 100644 index ddce5bd48..000000000 --- a/tensorflow_text/core/kernels/sentencepiece/sentencepiece_tokenizer_tflite.cc +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -/** - * Sentencepiece tflite tokenizer implementation. - */ -#include "tensorflow_text/core/kernels/sentencepiece/optimized_encoder.h" -#include "tensorflow_text/core/kernels/sentencepiece/sentencepiece_tokenizer.h" -#include "flatbuffers/flexbuffers.h" -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/context.h" -#include "tensorflow/lite/kernels/internal/tensor.h" -#include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/string_util.h" - -namespace tflite { -namespace ops { -namespace custom { -namespace text { -namespace sentencepiece { -namespace tokenizer { - -constexpr int kOutputValuesInd = 0; -constexpr int kOutputSplitsInd = 1; - -namespace { -TfLiteIntArray* CreateSizeArray(const std::initializer_list<int>& sizes) { - TfLiteIntArray* array_size = TfLiteIntArrayCreate(sizes.size()); - int index = 0; - for (const int size : sizes) { - array_size->data[index++] = size; - } - return array_size; -} -} // namespace - -// Initializes text encoder object from serialized parameters. -void* Initialize(TfLiteContext* /*context*/, const char* /*buffer*/, - size_t /*length*/) { - return nullptr; -} -void Free(TfLiteContext* /*context*/, void* /*buffer*/) {} - -TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - // TODO(mgubin): Add checks for input and output tensors. - TfLiteTensor& output_values = - context->tensors[node->outputs->data[kOutputValuesInd]]; - SetTensorToDynamic(&output_values); - - TfLiteTensor& output_splits = - context->tensors[node->outputs->data[kOutputSplitsInd]]; - SetTensorToDynamic(&output_splits); - return kTfLiteOk; -} - -TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - const TfLiteTensor& model_tensor = - context->tensors[node->inputs->data[tensorflow::text::kSPModelIndex]]; - const auto model_buffer_data = model_tensor.data.data; - const TfLiteTensor& input_text = - context->tensors[node->inputs->data[tensorflow::text::kInputIndex]]; - - const TfLiteTensor add_bos_tensor = - context->tensors[node->inputs->data[tensorflow::text::kAddBOSInput]]; - const bool add_bos = add_bos_tensor.data.b[0]; - const TfLiteTensor add_eos_tensor = - context->tensors[node->inputs->data[tensorflow::text::kAddEOSInput]]; - const bool add_eos = add_eos_tensor.data.b[0]; - const TfLiteTensor reverse_tensor = - context->tensors[node->inputs->data[tensorflow::text::kReverseInput]]; - const bool reverse = reverse_tensor.data.b[0]; - - std::vector<int32> encoded; - std::vector<int32> splits; - const int num_strings = tflite::GetStringCount(&input_text); - for (int i = 0; i < num_strings; ++i) { - const auto strref = tflite::GetString(&input_text, i); - const auto res = tensorflow::text::sentencepiece::EncodeString( - std::string(strref.str, strref.len), model_buffer_data, add_bos, - add_eos, reverse); - TF_LITE_ENSURE_MSG( - context, - res.type == tensorflow::text::sentencepiece::EncoderResultType::SUCCESS, - "Sentencepiece conversion failed"); - std::copy(res.codes.begin(), res.codes.end(), std::back_inserter(encoded)); - splits.emplace_back(encoded.size()); - } - - TfLiteTensor& output_values = - context->tensors[node->outputs->data[kOutputValuesInd]]; - TF_LITE_ENSURE_OK(context, - context->ResizeTensor( - context, &output_values, - CreateSizeArray({static_cast<int>(encoded.size())}))); - int32_t* output_values_flat = output_values.data.i32; - std::copy(encoded.begin(), encoded.end(), output_values_flat); - TfLiteTensor& output_splits = - context->tensors[node->outputs->data[kOutputSplitsInd]]; - TF_LITE_ENSURE_OK( - context, context->ResizeTensor( - context, &output_splits, - CreateSizeArray({static_cast<int>(splits.size() + 1)}))); - int32_t* output_splits_flat = output_splits.data.i32; - *output_splits_flat = 0; - std::copy(splits.begin(), splits.end(), output_splits_flat + 1); - return kTfLiteOk; -} -} // namespace tokenizer -} // namespace sentencepiece -} // namespace text - -TfLiteRegistration* Register_FAST_SENTENCEPIECE_TOKENIZER() { - static TfLiteRegistration r = { - text::sentencepiece::tokenizer::Initialize, - text::sentencepiece::tokenizer::Free, - text::sentencepiece::tokenizer::Prepare, - text::sentencepiece::tokenizer::Eval}; - return &r; -} - -} // namespace custom -} // namespace ops -} // namespace tflite diff --git a/tensorflow_text/core/kernels/sentencepiece/utils.h b/tensorflow_text/core/kernels/sentencepiece/utils.h index fb9d850e0..e6d84c924 100644 --- a/tensorflow_text/core/kernels/sentencepiece/utils.h +++ b/tensorflow_text/core/kernels/sentencepiece/utils.h @@ -12,67 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +#ifndef TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_UTILS_H_ +#define TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_UTILS_H_ -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at +#include "tensorflow/core/kernels/text/sentencepiece/utils.h" - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_UTILS_H_ -#define TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_UTILS_H_ - -#include <ostream> -#include <string> - -namespace tensorflow { -namespace text { -namespace sentencepiece { - -// AOSP and WASM doesn't support string_view, -// we put here a minimal re-implementation. -namespace utils { - -class string_view { - public: - explicit string_view(const std::string& s) - : str_(s.data()), len_(s.length()) {} - string_view(const char* str, int len) : str_(str), len_(len) {} - // A constructor from c string. - explicit string_view(const char* s) : str_(s), len_(strlen(s)) {} - - int length() const { return len_; } - const char* data() const { return str_; } - bool empty() const { return len_ == 0; } - unsigned char at(int i) const { return str_[i]; } - - private: - const char* str_ = nullptr; - const int len_ = 0; -}; - -inline std::ostream& operator<<(std::ostream& os, const string_view& sv) { - os << std::string(sv.data(), sv.length()); - return os; -} -inline bool operator==(const string_view& view1, const string_view& view2) { - if (view1.length() != view2.length()) { - return false; - } - return memcmp(view1.data(), view2.data(), view1.length()) == 0; -} - -} // namespace utils -} // namespace sentencepiece -} // namespace text -} // namespace tensorflow - -#endif // TENSORFLOW_LITE_SUPPORT_CUSTOM_OPS_KERNEL_SENTENCEPIECE_UTILS_H_ +#endif // TENSORFLOW_TEXT_CORE_KERNELS_SENTENCEPIECE_UTILS_H_ diff --git a/tensorflow_text/core/kernels/sentencepiece_kernels.cc b/tensorflow_text/core/kernels/sentencepiece_kernels.cc deleted file mode 100644 index a002f8fa4..000000000 --- a/tensorflow_text/core/kernels/sentencepiece_kernels.cc +++ /dev/null @@ -1,739 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "absl/base/attributes.h" -#include "absl/base/optimization.h" -#include "absl/base/thread_annotations.h" -#include "absl/container/flat_hash_map.h" -#include "absl/meta/type_traits.h" -#include "absl/status/status.h" -#include "absl/strings/string_view.h" -#include "absl/synchronization/mutex.h" -#include "absl/types/span.h" -#include "src/sentencepiece_model.pb.h" -#include "src/sentencepiece.pb.h" -#include "src/sentencepiece_processor.h" -#include "tensorflow/core/framework/bounds_check.h" -#include "tensorflow/core/framework/dataset_stateful_op_allowlist.h" -#include "tensorflow/core/framework/device_base.h" -#include "tensorflow/core/framework/node_def_util.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_types.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/graph/graph_def_builder.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/core/refcount.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/platform/errors.h" -#include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/thread_annotations.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/work_sharder.h" - -namespace tensorflow { -namespace text { - -namespace { - -// Our resource object that will hold the SentencePiece processor. -struct SentencepieceResource : public ResourceBase { - sentencepiece::SentencePieceProcessor processor; - int64 memory_used; - bool add_bos = false; - bool add_eos = false; - bool reverse = false; - mutable absl::Mutex mu; - - string DebugString() const override { return "Sentencepiece Resource"; } - - int64 MemoryUsed() const override { return memory_used; } - - bool SameOptions(bool add_bos, bool add_eos, bool reverse) const { - return (add_bos == this->add_bos) && (add_eos == this->add_eos) && - (reverse == this->reverse); - } - - Status AsGraphDef(GraphDefBuilder* builder, Node** out) const override { - absl::ReaderMutexLock l(&mu); - // We set use_node_name_sharing with a unique node name so that the resource - // can outlive the kernel. This means that the lifetime of the re-created - // resource will be tied to the lifetime of the resource manager it is - // created in. - static std::atomic<int64> counter(0); - std::string unique_node_name = strings::StrCat( - "SentencepieceResourceFromGraphDef", "/", counter.fetch_add(1)); - std::string model = processor.model_proto().SerializeAsString(); - *out = ops::SourceOp( - "SentencepieceOp", - builder->opts() - .WithName(unique_node_name) - .WithAttr("model", model) - .WithAttr("use_node_name_sharing", true)); - return absl::OkStatus(); - } -}; - -// According to .../tensorflow/core/util/work_sharder.cc, this values determines -// how much to shard. It assumes each cost unit is 1ns, and the minimum cost -// per shard is 10000 (10us). -// TODO(broken) Determine a medium cost of a call to the SentencePiece processor -constexpr int64 kCostPerUnit = 10000; - -::tensorflow::Status ToTFStatus(const sentencepiece::util::Status& s) { - if (s.ok()) return ::tensorflow::Status(); - return ::tensorflow::Status(static_cast<::absl::StatusCode>(s.code()), - ::tensorflow::string(s.message())); -} - -template <typename T> -T GetPieceOrId(const sentencepiece::SentencePieceText::SentencePiece& sp); - -template <> -tensorflow::tstring GetPieceOrId<tensorflow::tstring>( - const sentencepiece::SentencePieceText::SentencePiece& sp) { - return sp.piece(); -} - -template <> -int32 GetPieceOrId<int32>( - const sentencepiece::SentencePieceText::SentencePiece& sp) { - return sp.id(); -} - -tensorflow::Status HandleExtraOptions(OpKernelContext* ctx, - SentencepieceResource* sp) { - const Tensor* add_bos_tensor = nullptr; - TF_RETURN_IF_ERROR(ctx->input("add_bos", &add_bos_tensor)); - const bool add_bos = add_bos_tensor->scalar<bool>()(); - - const Tensor* add_eos_tensor = nullptr; - TF_RETURN_IF_ERROR(ctx->input("add_eos", &add_eos_tensor)); - const bool add_eos = add_eos_tensor->scalar<bool>()(); - - const Tensor* reverse_tensor = nullptr; - TF_RETURN_IF_ERROR(ctx->input("reverse", &reverse_tensor)); - const bool reverse = reverse_tensor->scalar<bool>()(); - - { - // Because we expect most of the time no change in these options, we grab - // the reader lock once and do a quick check first. - absl::ReaderMutexLock l(&sp->mu); - if (sp->SameOptions(add_bos, add_eos, reverse)) { - return absl::OkStatus(); - } - } - - absl::WriterMutexLock lock(&sp->mu); - if (sp->SameOptions(add_bos, add_eos, reverse)) { - return absl::OkStatus(); - } - string options; - sp->add_bos = add_bos; - if (sp->add_bos) { - absl::StrAppend(&options, "bos"); - } - sp->add_eos = add_eos; - if (sp->add_eos) { - if (!options.empty()) { - absl::StrAppend(&options, ":"); - } - absl::StrAppend(&options, "eos"); - } - sp->reverse = reverse; - if (sp->reverse) { - if (!options.empty()) { - absl::StrAppend(&options, ":"); - } - absl::StrAppend(&options, "reverse"); - } - - TF_RETURN_IF_ERROR(ToTFStatus(sp->processor.SetEncodeExtraOptions(options))); - TF_RETURN_IF_ERROR(ToTFStatus(sp->processor.SetDecodeExtraOptions(options))); - - return absl::OkStatus(); -} - -} // namespace - -class SentencepieceOp : public OpKernel { - public: - explicit SentencepieceOp(OpKernelConstruction* ctx) - : OpKernel(ctx), sp_set_(false) { - OP_REQUIRES_OK(ctx, ctx->allocate_temp(tensorflow::DT_STRING, - tensorflow::TensorShape({2}), &sp_)); - OP_REQUIRES_OK( - ctx, ctx->GetAttr("use_node_name_sharing", &use_node_name_sharing_)); - } - - ~SentencepieceOp() override { - // If the table object was not shared, delete it. - if (sp_set_ && cinfo_.resource_is_private_to_kernel()) { - if (!cinfo_.resource_manager() - ->template Delete<SentencepieceResource>(cinfo_.container(), - cinfo_.name()) - .ok()) { - // Do nothing; the resource may have been deleted by session resets. - } - } - } - - void Compute(OpKernelContext* ctx) override { - absl::MutexLock lock(&mu_); - - if (!sp_set_) { - OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def(), - use_node_name_sharing_)); - } - - auto creator = - [ctx, this](SentencepieceResource** resource) - ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_) { - SentencepieceResource* sp = new SentencepieceResource(); - - string model_proto_attr; - TF_RETURN_IF_ERROR( - GetNodeAttr(this->def(), "model", &model_proto_attr)); - - if (TF_PREDICT_FALSE(model_proto_attr.empty())) { - return Status(tensorflow::errors::InvalidArgument( - "Model argument must be specified.")); - } - // Loads serialized sentencepiece model proto to enable embedding - // the relatively small sentencepiece model proto into the - // tensorflow graph such that the tensorflow graph is - // self-contained. - TF_RETURN_IF_ERROR(ToTFStatus( - sp->processor.LoadFromSerializedProto(model_proto_attr))); - // TODO(broken): Determine a better computation of what the memory - // requirements for the processor are. - sp->memory_used = model_proto_attr.size(); - - if (ctx->track_allocations()) { - ctx->record_persistent_memory_allocation(sp->MemoryUsed()); - } - - *resource = sp; - return absl::OkStatus(); - }; - - // Register the ResourceType alias. - SentencepieceResource* resource = nullptr; - OP_REQUIRES_OK( - ctx, cinfo_.resource_manager() - ->template LookupOrCreate<SentencepieceResource>( - cinfo_.container(), cinfo_.name(), &resource, creator)); - core::ScopedUnref unref_me(resource); - - // Put a handle to resource in the output tensor (the other aliases will - // have the same handle). - Tensor* handle; - OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle)); - handle->scalar<ResourceHandle>()() = - MakeResourceHandle<SentencepieceResource>(ctx, cinfo_.container(), - cinfo_.name()); - sp_set_ = true; - } - - private: - absl::Mutex mu_; - Tensor sp_ ABSL_GUARDED_BY(mu_); - bool sp_set_ ABSL_GUARDED_BY(mu_); - ContainerInfo cinfo_; - bool use_node_name_sharing_; - TF_DISALLOW_COPY_AND_ASSIGN(SentencepieceOp); -}; - -REGISTER_KERNEL_BUILDER(Name("SentencepieceOp").Device(DEVICE_CPU), - tensorflow::text::SentencepieceOp); -ALLOW_STATEFUL_OP_FOR_DATASET_FUNCTIONS("SentencepieceOp"); - -template <typename T, typename Tsplits> -class SentencepieceTokenizeOp : public OpKernel { - public: - explicit SentencepieceTokenizeOp(OpKernelConstruction* ctx) : OpKernel(ctx) { - ctx->GetAttr("return_nbest", &return_nbest_).IgnoreError(); - } - - void Compute(OpKernelContext* ctx) override { - SentencepieceResource* sp; - const Tensor& resource_tensor = ctx->input(0); - ResourceHandle resource_handle(resource_tensor.scalar<ResourceHandle>()()); - OP_REQUIRES_OK( - ctx, ctx->resource_manager()->Lookup<SentencepieceResource>( - resource_handle.container(), resource_handle.name(), &sp)); - core::ScopedUnref unref_me(sp); - - const Tensor& input_values_tensor = ctx->input(1); - const auto input_values_flat = - input_values_tensor.flat<tensorflow::tstring>(); - const int64 num_of_input_values = input_values_flat.size(); - - const Tensor* nbest_size_tensor = nullptr; - OP_REQUIRES_OK(ctx, ctx->input("nbest_size", &nbest_size_tensor)); - const Tensor* alpha_tensor = nullptr; - OP_REQUIRES_OK(ctx, ctx->input("alpha", &alpha_tensor)); - - OP_REQUIRES_OK(ctx, HandleExtraOptions(ctx, sp)); - - if (return_nbest_) { - OP_REQUIRES(ctx, nbest_size_tensor->dims() == 0, - errors::InvalidArgument( - "When return_nbest is true nbest_size must " - "be a scalar; got", - nbest_size_tensor->shape().DebugString(), "instead")); - OP_REQUIRES(ctx, nbest_size_tensor->scalar<int32>()() >= 1, - errors::InvalidArgument( - "When return_nbest is true nbest_size must be >= 1; got ", - nbest_size_tensor->scalar<int32>()())); - } - - std::vector<std::vector<typename std::conditional< - std::is_same<T, tstring>::value, std::string, T>::type>> - tokens(return_nbest_ ? 0 : num_of_input_values); - std::vector<std::vector<std::vector<typename std::conditional< - std::is_same<T, tstring>::value, std::string, T>::type>>> - nbest_tokens(return_nbest_ ? num_of_input_values : 0); - if (num_of_input_values > 0) { - const bool return_nbest = return_nbest_; - const auto& worker_threads = - *(ctx->device()->tensorflow_cpu_worker_threads()); - ::tensorflow::Shard( - worker_threads.num_threads, // max parallelism - worker_threads.workers, // thread pool - num_of_input_values, // total number of data to process. - kCostPerUnit, // cost per unit - [ctx, sp, &input_values_flat, &tokens, &nbest_tokens, - &nbest_size_tensor, &alpha_tensor, - return_nbest](int64 start, int64 limit) { - absl::ReaderMutexLock lock(&sp->mu); - for (int i = start; i < limit; ++i) { - const int32 nbest_size = nbest_size_tensor->dims() == 1 - ? nbest_size_tensor->vec<int32>()(i) - : nbest_size_tensor->scalar<int32>()(); - if (return_nbest) { - OP_REQUIRES_OK(ctx, ToTFStatus(sp->processor.NBestEncode( - input_values_flat(i), nbest_size, - &nbest_tokens[i]))); - } else if (nbest_size == 0 || nbest_size == 1) { - OP_REQUIRES_OK(ctx, ToTFStatus(sp->processor.Encode( - input_values_flat(i), &tokens[i]))); - } else { - const float alpha = alpha_tensor->dims() == 1 - ? alpha_tensor->vec<float>()(i) - : alpha_tensor->scalar<float>()(); - OP_REQUIRES_OK(ctx, ToTFStatus(sp->processor.SampleEncode( - input_values_flat(i), nbest_size, alpha, - &tokens[i]))); - } - } - }); - } - - if (return_nbest_) { - for (auto& col : nbest_tokens) { - for (auto& row : col) { - tokens.push_back(std::move(row)); - } - } - nbest_tokens.clear(); - } - int64 total_tokens = 0; - for (auto& tokens_row : tokens) { - total_tokens += tokens_row.size(); - } - - Tensor* output_values_tensor = nullptr; - Tensor* output_splits_tensor = nullptr; - - OP_REQUIRES_OK( - ctx, ctx->allocate_output(0, {total_tokens}, &output_values_tensor)); - int64 splits_size = tokens.size() + 1; - OP_REQUIRES_OK( - ctx, ctx->allocate_output(1, {splits_size}, &output_splits_tensor)); - - auto values_tensor_flat = output_values_tensor->vec<T>(); - auto splits_tensor_flat = output_splits_tensor->vec<Tsplits>(); - - int i = 0; - splits_tensor_flat(0) = 0; - for (int row = 0; row < tokens.size(); ++row) { - for (int col = 0; col < tokens[row].size(); ++col, ++i) { - values_tensor_flat(i) = tokens[row][col]; - } - splits_tensor_flat(row + 1) = i; - } - } - - bool return_nbest_{false}; -}; - -REGISTER_KERNEL_BUILDER(Name("SentencepieceTokenizeOp") - .Device(DEVICE_CPU) - .TypeConstraint<int32>("out_type") - .TypeConstraint<int32>("Tsplits"), - SentencepieceTokenizeOp<int32, int32>); -REGISTER_KERNEL_BUILDER(Name("SentencepieceTokenizeOp") - .Device(DEVICE_CPU) - .TypeConstraint<tensorflow::tstring>("out_type") - .TypeConstraint<int32>("Tsplits"), - SentencepieceTokenizeOp<tensorflow::tstring, int32>); -REGISTER_KERNEL_BUILDER(Name("SentencepieceTokenizeOp") - .Device(DEVICE_CPU) - .TypeConstraint<int32>("out_type") - .TypeConstraint<int64>("Tsplits"), - SentencepieceTokenizeOp<int32, int64>); -REGISTER_KERNEL_BUILDER(Name("SentencepieceTokenizeOp") - .Device(DEVICE_CPU) - .TypeConstraint<tensorflow::tstring>("out_type") - .TypeConstraint<int64>("Tsplits"), - SentencepieceTokenizeOp<tensorflow::tstring, int64>); -ALLOW_STATEFUL_OP_FOR_DATASET_FUNCTIONS("SentencepieceTokenizeOp"); - -template <typename T, typename Tsplits> -class SentencepieceTokenizeWithOffsetsOp : public OpKernel { - public: - explicit SentencepieceTokenizeWithOffsetsOp(OpKernelConstruction* ctx) - : OpKernel(ctx) { - ctx->GetAttr("return_nbest", &return_nbest_).IgnoreError(); - } - - void Compute(OpKernelContext* ctx) override { - SentencepieceResource* sp; - const Tensor& resource_tensor = ctx->input(0); - ResourceHandle resource_handle(resource_tensor.scalar<ResourceHandle>()()); - OP_REQUIRES_OK( - ctx, ctx->resource_manager()->Lookup<SentencepieceResource>( - resource_handle.container(), resource_handle.name(), &sp)); - core::ScopedUnref unref_me(sp); - - const Tensor& input_values_tensor = ctx->input(1); - const auto input_values_flat = - input_values_tensor.flat<tensorflow::tstring>(); - const int64 num_of_input_values = input_values_flat.size(); - - const Tensor* nbest_size_tensor = nullptr; - OP_REQUIRES_OK(ctx, ctx->input("nbest_size", &nbest_size_tensor)); - const Tensor* alpha_tensor = nullptr; - OP_REQUIRES_OK(ctx, ctx->input("alpha", &alpha_tensor)); - - OP_REQUIRES_OK(ctx, HandleExtraOptions(ctx, sp)); - - if (return_nbest_) { - OP_REQUIRES(ctx, nbest_size_tensor->dims() == 0, - errors::InvalidArgument( - "When return_nbest is true nbest_size must " - "be a scalar; got", - nbest_size_tensor->shape().DebugString(), "instead")); - OP_REQUIRES(ctx, nbest_size_tensor->scalar<int32>()() >= 1, - errors::InvalidArgument( - "When return_nbest is true nbest_size must be >= 1; got ", - nbest_size_tensor->scalar<int32>()())); - } - - std::vector<sentencepiece::SentencePieceText> results( - return_nbest_ ? 0 : num_of_input_values); - std::vector<sentencepiece::NBestSentencePieceText> nbest_results( - return_nbest_ ? num_of_input_values : 0); - if (num_of_input_values > 0) { - const bool return_nbest = return_nbest_; - const auto& worker_threads = - *(ctx->device()->tensorflow_cpu_worker_threads()); - ::tensorflow::Shard( - worker_threads.num_threads, // max parallelism - worker_threads.workers, // thread pool - num_of_input_values, // total number of data to process. - kCostPerUnit, - [ctx, sp, &input_values_flat, &results, &nbest_results, - &nbest_size_tensor, &alpha_tensor, - return_nbest](int64 start, int64 limit) { - absl::ReaderMutexLock lock(&sp->mu); - for (int i = start; i < limit; ++i) { - const int32 nbest_size = nbest_size_tensor->dims() == 1 - ? nbest_size_tensor->vec<int32>()(i) - : nbest_size_tensor->scalar<int32>()(); - if (return_nbest) { - OP_REQUIRES_OK(ctx, ToTFStatus(sp->processor.NBestEncode( - input_values_flat(i), nbest_size, - &nbest_results[i]))); - } else if (nbest_size == 0 || nbest_size == 1) { - OP_REQUIRES_OK(ctx, ToTFStatus(sp->processor.Encode( - input_values_flat(i), &results[i]))); - } else { - const float alpha = alpha_tensor->dims() == 1 - ? alpha_tensor->vec<float>()(i) - : alpha_tensor->scalar<float>()(); - OP_REQUIRES_OK(ctx, ToTFStatus(sp->processor.SampleEncode( - input_values_flat(i), nbest_size, alpha, - &results[i]))); - } - } - }); - } - - if (return_nbest_) { - for (auto& nbest : nbest_results) { - for (auto& result : nbest.nbests()) { - results.push_back(std::move(result)); - } - } - } - int64 total_tokens = 0; - for (auto& sp_result : results) { - total_tokens += sp_result.pieces_size(); - } - - Tensor* output_values_tensor = nullptr; - Tensor* output_splits_tensor = nullptr; - Tensor* output_starts_tensor = nullptr; - Tensor* output_limits_tensor = nullptr; - - OP_REQUIRES_OK( - ctx, ctx->allocate_output(0, {total_tokens}, &output_values_tensor)); - int64 splits_size = results.size() + 1; - OP_REQUIRES_OK( - ctx, ctx->allocate_output(1, {splits_size}, &output_splits_tensor)); - OP_REQUIRES_OK( - ctx, ctx->allocate_output(2, {total_tokens}, &output_starts_tensor)); - OP_REQUIRES_OK( - ctx, ctx->allocate_output(3, {total_tokens}, &output_limits_tensor)); - - auto values_tensor_flat = output_values_tensor->vec<T>(); - auto splits_tensor_flat = output_splits_tensor->vec<Tsplits>(); - auto starts_tensor_flat = output_starts_tensor->vec<int64>(); - auto limits_tensor_flat = output_limits_tensor->vec<int64>(); - - int i = 0; - splits_tensor_flat(0) = 0; - for (int row = 0; row < results.size(); ++row) { - for (auto& sp : results[row].pieces()) { - values_tensor_flat(i) = GetPieceOrId<T>(sp); - starts_tensor_flat(i) = sp.begin(); - limits_tensor_flat(i) = sp.end(); - ++i; - } - splits_tensor_flat(row + 1) = i; - } - } - - bool return_nbest_{false}; -}; - -REGISTER_KERNEL_BUILDER(Name("SentencepieceTokenizeWithOffsetsOp") - .Device(DEVICE_CPU) - .TypeConstraint<int32>("out_type") - .TypeConstraint<int32>("Tsplits"), - SentencepieceTokenizeWithOffsetsOp<int32, int32>); -REGISTER_KERNEL_BUILDER( - Name("SentencepieceTokenizeWithOffsetsOp") - .Device(DEVICE_CPU) - .TypeConstraint<tensorflow::tstring>("out_type") - .TypeConstraint<int32>("Tsplits"), - SentencepieceTokenizeWithOffsetsOp<tensorflow::tstring, int32>); -REGISTER_KERNEL_BUILDER(Name("SentencepieceTokenizeWithOffsetsOp") - .Device(DEVICE_CPU) - .TypeConstraint<int32>("out_type") - .TypeConstraint<int64>("Tsplits"), - SentencepieceTokenizeWithOffsetsOp<int32, int64>); -REGISTER_KERNEL_BUILDER( - Name("SentencepieceTokenizeWithOffsetsOp") - .Device(DEVICE_CPU) - .TypeConstraint<tensorflow::tstring>("out_type") - .TypeConstraint<int64>("Tsplits"), - SentencepieceTokenizeWithOffsetsOp<tensorflow::tstring, int64>); -ALLOW_STATEFUL_OP_FOR_DATASET_FUNCTIONS("SentencepieceTokenizeWithOffsetsOp"); - -template <typename T, typename Tsplits> -class SentencepieceDetokenizeOp : public OpKernel { - public: - explicit SentencepieceDetokenizeOp(OpKernelConstruction* ctx) - : OpKernel(ctx) {} - - void Compute(OpKernelContext* ctx) override { - SentencepieceResource* sp; - const Tensor& resource_tensor = ctx->input(0); - ResourceHandle resource_handle(resource_tensor.scalar<ResourceHandle>()()); - OP_REQUIRES_OK( - ctx, ctx->resource_manager()->Lookup<SentencepieceResource>( - resource_handle.container(), resource_handle.name(), &sp)); - core::ScopedUnref unref_me(sp); - - const Tensor& input_values_tensor = ctx->input(1); - const auto input_values_flat = input_values_tensor.flat<T>(); - const Tensor& input_splits_tensor = ctx->input(2); - const auto input_splits_flat = input_splits_tensor.flat<Tsplits>(); - OP_REQUIRES(ctx, input_splits_flat.size() > 0, - absl::InvalidArgumentError( - "input_splits must have at least 1 element.")); - const int64 num_of_sentences = input_splits_flat.size() - 1; - - OP_REQUIRES_OK(ctx, HandleExtraOptions(ctx, sp)); - - Tensor* output_tensor; - OP_REQUIRES_OK(ctx, - ctx->allocate_output(0, {num_of_sentences}, &output_tensor)); - auto output_flat = output_tensor->flat<tensorflow::tstring>(); - - if (input_values_flat.size() > 0) { - const auto& worker_threads = - *(ctx->device()->tensorflow_cpu_worker_threads()); - ::tensorflow::Shard( - worker_threads.num_threads, // max parallelism - worker_threads.workers, // thread pool - num_of_sentences, // total number of data to process. - kCostPerUnit, - [ctx, sp, &input_values_flat, &input_splits_flat, &output_flat]( - int64 start, int64 limit) { - absl::ReaderMutexLock lock(&sp->mu); - for (int i = start; i < limit; ++i) { - if (i + 1 >= input_splits_flat.size()) { - ctx->CtxFailure(errors::OutOfRange("Invalid splits; ", i)); - return; - } - if (input_splits_flat(i) > input_values_flat.size()) { - ctx->CtxFailure(errors::OutOfRange( - "Splits and values do not match; split ", - input_splits_flat(i), "but values size is ", - input_values_flat.size())); - return; - } - const std::vector<typename std::conditional< - std::is_same<T, tstring>::value, std::string, T>::type> - pieces(&input_values_flat(input_splits_flat(i)), - &input_values_flat(input_splits_flat(i + 1))); - std::string output_flat_str; - OP_REQUIRES_OK(ctx, ToTFStatus(sp->processor.Decode( - pieces, &output_flat_str))); - output_flat(i) = output_flat_str; - } - }); - } - } -}; - -REGISTER_KERNEL_BUILDER(Name("SentencepieceDetokenizeOp") - .Device(DEVICE_CPU) - .TypeConstraint<int32>("T") - .TypeConstraint<int32>("Tsplits"), - SentencepieceDetokenizeOp<int32, int32>); -REGISTER_KERNEL_BUILDER(Name("SentencepieceDetokenizeOp") - .Device(DEVICE_CPU) - .TypeConstraint<tensorflow::tstring>("T") - .TypeConstraint<int32>("Tsplits"), - SentencepieceDetokenizeOp<tensorflow::tstring, int32>); -REGISTER_KERNEL_BUILDER(Name("SentencepieceDetokenizeOp") - .Device(DEVICE_CPU) - .TypeConstraint<int32>("T") - .TypeConstraint<int64>("Tsplits"), - SentencepieceDetokenizeOp<int32, int64>); -REGISTER_KERNEL_BUILDER(Name("SentencepieceDetokenizeOp") - .Device(DEVICE_CPU) - .TypeConstraint<tensorflow::tstring>("T") - .TypeConstraint<int64>("Tsplits"), - SentencepieceDetokenizeOp<tensorflow::tstring, int64>); -ALLOW_STATEFUL_OP_FOR_DATASET_FUNCTIONS("SentencepieceDetokenizeOp"); - -class SentencepieceVocabSizeOp : public OpKernel { - public: - explicit SentencepieceVocabSizeOp(OpKernelConstruction* ctx) - : OpKernel(ctx) {} - - void Compute(OpKernelContext* ctx) override { - SentencepieceResource* sp; - const Tensor& resource_tensor = ctx->input(0); - ResourceHandle resource_handle(resource_tensor.scalar<ResourceHandle>()()); - OP_REQUIRES_OK( - ctx, ctx->resource_manager()->Lookup<SentencepieceResource>( - resource_handle.container(), resource_handle.name(), &sp)); - core::ScopedUnref unref_me(sp); - - Tensor* output_tensor; - OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {}, &output_tensor)); - output_tensor->scalar<int32>()() = sp->processor.GetPieceSize(); - } -}; - -REGISTER_KERNEL_BUILDER(Name("SentencepieceVocabSizeOp").Device(DEVICE_CPU), - SentencepieceVocabSizeOp); -ALLOW_STATEFUL_OP_FOR_DATASET_FUNCTIONS("SentencepieceVocabSizeOp"); - -class SentencepieceIdToStringOp : public OpKernel { - public: - explicit SentencepieceIdToStringOp(OpKernelConstruction* ctx) - : OpKernel(ctx) {} - - void Compute(OpKernelContext* ctx) override { - SentencepieceResource* sp; - const Tensor& resource_tensor = ctx->input(0); - ResourceHandle resource_handle(resource_tensor.scalar<ResourceHandle>()()); - OP_REQUIRES_OK( - ctx, ctx->resource_manager()->Lookup<SentencepieceResource>( - resource_handle.container(), resource_handle.name(), &sp)); - core::ScopedUnref unref_me(sp); - - const Tensor& input_tensor = ctx->input(1); - const auto input_tensor_flat = input_tensor.flat<int32>(); - Tensor* output_tensor; - OP_REQUIRES_OK( - ctx, ctx->allocate_output(0, input_tensor.shape(), &output_tensor)); - auto output_tensor_flat = output_tensor->flat<tensorflow::tstring>(); - - absl::ReaderMutexLock lock(&sp->mu); - for (int i = 0; i < input_tensor_flat.size(); ++i) { - output_tensor_flat(i) = sp->processor.IdToPiece(input_tensor_flat(i)); - } - } -}; - -REGISTER_KERNEL_BUILDER(Name("SentencepieceIdToStringOp").Device(DEVICE_CPU), - SentencepieceIdToStringOp); -ALLOW_STATEFUL_OP_FOR_DATASET_FUNCTIONS("SentencepieceIdToStringOp"); - -class SentencepieceStringToIdOp : public OpKernel { - public: - explicit SentencepieceStringToIdOp(OpKernelConstruction* ctx) - : OpKernel(ctx) {} - - void Compute(OpKernelContext* ctx) override { - SentencepieceResource* sp; - const Tensor& resource_tensor = ctx->input(0); - ResourceHandle resource_handle(resource_tensor.scalar<ResourceHandle>()()); - OP_REQUIRES_OK( - ctx, ctx->resource_manager()->Lookup<SentencepieceResource>( - resource_handle.container(), resource_handle.name(), &sp)); - core::ScopedUnref unref_me(sp); - - const Tensor& input_tensor = ctx->input(1); - const auto input_tensor_flat = input_tensor.flat<tensorflow::tstring>(); - Tensor* output_tensor; - OP_REQUIRES_OK( - ctx, ctx->allocate_output(0, input_tensor.shape(), &output_tensor)); - auto output_tensor_flat = output_tensor->flat<int32>(); - - absl::ReaderMutexLock lock(&sp->mu); - for (int i = 0; i < input_tensor_flat.size(); ++i) { - output_tensor_flat(i) = sp->processor.PieceToId(input_tensor_flat(i)); - } - } -}; - -REGISTER_KERNEL_BUILDER(Name("SentencepieceStringToIdOp").Device(DEVICE_CPU), - SentencepieceStringToIdOp); -ALLOW_STATEFUL_OP_FOR_DATASET_FUNCTIONS("SentencepieceStringToIdOp"); - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/spanning_tree_iterator.cc b/tensorflow_text/core/kernels/spanning_tree_iterator.cc deleted file mode 100644 index 1c859a543..000000000 --- a/tensorflow_text/core/kernels/spanning_tree_iterator.cc +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/spanning_tree_iterator.h" - -namespace tensorflow { -namespace text { - -SpanningTreeIterator::SpanningTreeIterator(bool forest) : forest_(forest) {} - -bool SpanningTreeIterator::HasCycle(const SourceList &sources) { - // Flags for whether each node has already been searched. - searched_.assign(sources.size(), false); - - // Flags for whether the search is currently visiting each node. - visiting_.assign(sources.size(), false); - - // Search upwards from each node to find cycles. - for (uint32 initial_node = 0; initial_node < sources.size(); ++initial_node) { - // Search upwards to try to find a cycle. - uint32 current_node = initial_node; - while (true) { - if (searched_[current_node]) break; // already searched - if (visiting_[current_node]) return true; // revisiting implies cycle - visiting_[current_node] = true; // mark as being currently visited - const uint32 source_node = sources[current_node]; - if (source_node == current_node) break; // self-loops are roots - current_node = source_node; // advance upwards - } - - // No cycle; search upwards again to update flags. - current_node = initial_node; - while (true) { - if (searched_[current_node]) break; // already searched - searched_[current_node] = true; - visiting_[current_node] = false; - const uint32 source_node = sources[current_node]; - if (source_node == current_node) break; // self-loops are roots - current_node = source_node; // advance upwards - } - } - - return false; -} - -uint32 SpanningTreeIterator::NumRoots(const SourceList &sources) { - uint32 num_roots = 0; - for (uint32 node = 0; node < sources.size(); ++node) { - num_roots += (node == sources[node]); - } - return num_roots; -} - -bool SpanningTreeIterator::NextSourceList(SourceList *sources) { - const uint32 num_nodes = sources->size(); - for (uint32 i = 0; i < num_nodes; ++i) { - const uint32 new_source = ++(*sources)[i]; - if (new_source < num_nodes) return true; // absorbed in this digit - (*sources)[i] = 0; // overflowed this digit, carry to next digit - } - return false; // overflowed the last digit -} - -bool SpanningTreeIterator::NextTree(SourceList *sources) { - // Iterate source lists, skipping non-trees. - while (NextSourceList(sources)) { - // Check the number of roots. - const uint32 num_roots = NumRoots(*sources); - if (forest_) { - if (num_roots == 0) continue; - } else { - if (num_roots != 1) continue; - } - - // Check for cycles. - if (HasCycle(*sources)) continue; - - // Acyclic and rooted, therefore tree. - return true; - } - return false; -} - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/spanning_tree_iterator.h b/tensorflow_text/core/kernels/spanning_tree_iterator.h index 68bc6f14a..34041d157 100644 --- a/tensorflow_text/core/kernels/spanning_tree_iterator.h +++ b/tensorflow_text/core/kernels/spanning_tree_iterator.h @@ -12,67 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef TENSORFLOW_TEXT_CORE_KERNELS_SPANNING_TREE_ITERATOR_H_ -#define TENSORFLOW_TEXT_CORE_KERNELS_SPANNING_TREE_ITERATOR_H_ +#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_SPANNING_TREE_ITERATOR_H_ +#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_SPANNING_TREE_ITERATOR_H_ -#include <vector> +#include "tensorflow/core/kernels/text/spanning_tree_iterator.h" -#include "tensorflow/core/lib/core/status.h" - -namespace tensorflow { -namespace text { - -// A class that iterates over all possible spanning trees of a complete digraph. -// Thread-compatible. Useful for brute-force comparison tests. -// -// TODO(terrykoo): Try using Prufer sequences, which are more efficient to -// enumerate as there are no non-trees to filter out. -class SpanningTreeIterator { - public: - // An array that provides the source of the inbound arc for each node. Roots - // are represented as self-loops. - using SourceList = std::vector<uint32>; - - // Creates a spanning tree iterator. If |forest| is true, then this iterates - // over forests instead of trees (i.e., multiple roots are allowed). - explicit SpanningTreeIterator(bool forest); - - // Applies the |functor| to all spanning trees (or forests, if |forest_| is - // true) of a complete digraph containing |num_nodes| nodes. Each tree is - // passed to the |functor| as a SourceList. - template <class Functor> - void ForEachTree(uint32 num_nodes, Functor functor) { - // Conveniently, the all-zero vector represents a valid tree. - SourceList sources(num_nodes, 0); - do { - functor(sources); - } while (NextTree(&sources)); - } - - private: - // Returns true if the |sources| contains a cycle. - bool HasCycle(const SourceList &sources); - - // Returns the number of roots in the |sources|. - static uint32 NumRoots(const SourceList &sources); - - // Advances |sources| to the next source list, or returns false if there are - // no more source lists. - static bool NextSourceList(SourceList *sources); - - // Advances |sources| to the next tree (or forest, if |forest_| is true), or - // returns false if there are no more trees. - bool NextTree(SourceList *sources); - - // If true, iterate over spanning forests instead of spanning trees. - const bool forest_; - - // Workspaces used by the search in HasCycle(). - std::vector<bool> searched_; - std::vector<bool> visiting_; -}; - -} // namespace text -} // namespace tensorflow - -#endif // TENSORFLOW_TEXT_CORE_KERNELS_SPANNING_TREE_ITERATOR_H_ +#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_SPANNING_TREE_ITERATOR_H_ diff --git a/tensorflow_text/core/kernels/spanning_tree_iterator_test.cc b/tensorflow_text/core/kernels/spanning_tree_iterator_test.cc deleted file mode 100644 index db3e4439e..000000000 --- a/tensorflow_text/core/kernels/spanning_tree_iterator_test.cc +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/spanning_tree_iterator.h" - -#include <gmock/gmock.h> -#include <gtest/gtest.h> -#include "tensorflow/core/platform/logging.h" - -namespace tensorflow { -namespace text { - -// Testing rig. When the bool parameter is true, iterates over spanning forests -// instead of spanning trees. -class SpanningTreeIteratorTest : public ::testing::TestWithParam<bool> { - protected: - using SourceList = SpanningTreeIterator::SourceList; - - // Returns |base|^|exponent|. Computes the value as an integer to avoid - // rounding issues. - static int Pow(int base, int exponent) { - double real_product = 1.0; - int product = 1; - for (int i = 0; i < exponent; ++i) { - product *= base; - real_product *= base; - } - CHECK_EQ(product, real_product) << "Overflow detected."; - return product; - } - - // Expects that the number of possible spanning trees for a complete digraph - // of |num_nodes| nodes is |expected_num_trees|. - void ExpectNumTrees(int num_nodes, int expected_num_trees) { - int actual_num_trees = 0; - iterator_.ForEachTree( - num_nodes, [&](const SourceList &sources) { ++actual_num_trees; }); - LOG(INFO) << "num_nodes=" << num_nodes - << " expected_num_trees=" << expected_num_trees - << " actual_num_trees=" << actual_num_trees; - EXPECT_EQ(expected_num_trees, actual_num_trees); - } - - // Expects that the set of possible spanning trees for a complete digraph of - // |num_nodes| nodes is |expected_trees|. - void ExpectTrees(int num_nodes, const std::set<SourceList> &expected_trees) { - std::set<SourceList> actual_trees; - iterator_.ForEachTree(num_nodes, [&](const SourceList &sources) { - CHECK(actual_trees.insert(sources).second); - }); - EXPECT_EQ(expected_trees, actual_trees); - } - - // Instance for tests. Shared across assertions in a test to exercise reuse. - SpanningTreeIterator iterator_{GetParam()}; -}; - -INSTANTIATE_TEST_SUITE_P(AllowForest, SpanningTreeIteratorTest, - ::testing::Bool()); - -TEST_P(SpanningTreeIteratorTest, NumberOfTrees) { - // According to Cayley's formula, the number of undirected spanning trees on a - // complete graph of n nodes is n^{n-2}: - // https://en.wikipedia.org/wiki/Cayley%27s_formula - // - // To count the number of directed spanning trees, note that each undirected - // spanning tree gives rise to n directed spanning trees: choose one of the n - // nodes as the root, and then orient arcs outwards. Therefore, the number of - // directed spanning trees on a complete digraph of n nodes is n^{n-1}. - // - // To count the number of directed spanning forests, consider undirected - // spanning trees on a complete graph of n+1 nodes. Arbitrarily select one - // node as the artificial root, orient arcs outwards, and then delete the - // artificial root and its outbound arcs. The result is a directed spanning - // forest on n nodes. Therefore, the number of directed spanning forests on a - // complete digraph of n nodes is (n+1)^{n-1}. - for (int num_nodes = 1; num_nodes <= 7; ++num_nodes) { - if (GetParam()) { // forest - ExpectNumTrees(num_nodes, Pow(num_nodes + 1, num_nodes - 1)); - } else { // tree - ExpectNumTrees(num_nodes, Pow(num_nodes, num_nodes - 1)); - } - } -} - -TEST_P(SpanningTreeIteratorTest, OneNodeDigraph) { ExpectTrees(1, {{0}}); } - -TEST_P(SpanningTreeIteratorTest, TwoNodeDigraph) { - if (GetParam()) { // forest - ExpectTrees(2, {{0, 0}, {0, 1}, {1, 1}}); // {0, 1} is two-root structure - } else { // tree - ExpectTrees(2, {{0, 0}, {1, 1}}); - } -} - -TEST_P(SpanningTreeIteratorTest, ThreeNodeDigraph) { - if (GetParam()) { // forest - ExpectTrees(3, {{0, 0, 0}, - {0, 0, 1}, - {0, 0, 2}, // 2-root - {0, 1, 0}, // 2-root - {0, 1, 1}, // 2-root - {0, 1, 2}, // 3-root - {0, 2, 0}, - {0, 2, 2}, // 2-root - {1, 1, 0}, - {1, 1, 1}, - {1, 1, 2}, // 2-root - {1, 2, 2}, - {2, 0, 2}, - {2, 1, 1}, - {2, 1, 2}, // 2-root - {2, 2, 2}}); - } else { // tree - ExpectTrees(3, {{0, 0, 0}, - {0, 0, 1}, - {0, 2, 0}, - {1, 1, 0}, - {1, 1, 1}, - {1, 2, 2}, - {2, 0, 2}, - {2, 1, 1}, - {2, 2, 2}}); - } -} - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/split_merge_tokenize_kernel.cc b/tensorflow_text/core/kernels/split_merge_tokenize_kernel.cc deleted file mode 100644 index 5491fab4d..000000000 --- a/tensorflow_text/core/kernels/split_merge_tokenize_kernel.cc +++ /dev/null @@ -1,222 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include <limits> -#include <memory> -#include <string> -#include <vector> - -#include "absl/status/status.h" -#include "absl/strings/str_cat.h" -#include "icu4c/source/common/unicode/uchar.h" -#include "icu4c/source/common/unicode/umachine.h" -#include "icu4c/source/common/unicode/utf8.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/lib/core/status.h" - -namespace tensorflow { -namespace text { - -namespace { - -// Returns the length (number of bytes) of the UTF8 code point starting at src, -// by reading only the byte from address src. -// -// The result is a number from the set {1, 2, 3, 4}. -int OneCharLen(const char* src) { - // On most platforms, char is unsigned by default, but iOS is an exception. - // The cast below makes sure we always interpret *src as an unsigned char. - return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4" - [(*(reinterpret_cast<const unsigned char*>(src)) & 0xFF) >> 4]; -} - -bool GetUTF8Chars(absl::string_view text, - std::vector<absl::string_view>* chars) { - const char* start = text.data(); - const char* end = text.data() + text.size(); - while (start < end) { - const int char_length = OneCharLen(start); - if (char_length <= 0) { - return false; - } - chars->emplace_back(start, char_length); - start += char_length; - } - return true; -} - -bool IsBreakChar(absl::string_view text) { - UChar32 c; - int position = 0; - U8_NEXT_OR_FFFD(text.data(), position, text.length(), c); - return u_isUWhiteSpace(c); -} - -Status TokenizeByLabel(const absl::string_view& text, - const Tensor& labels_tensor, - bool force_split_at_break_character, - std::vector<std::string>* tokens, - std::vector<int>* begin_offset, - std::vector<int>* end_offset, int* num_tokens) { - std::vector<absl::string_view> chars; - if (!GetUTF8Chars(text, &chars)) { - return Status(static_cast<::absl::StatusCode>( - absl::StatusCode::kInvalidArgument), - absl::StrCat("Input string is not utf8 valid: ", text)); - } - - if (chars.size() > labels_tensor.dim_size(0)) { - return Status(static_cast<::absl::StatusCode>( - absl::StatusCode::kInvalidArgument), - absl::StrCat("Number of labels ", labels_tensor.dim_size(0), - " is insufficient for text ", text)); - } - - const int split_label = 0; - bool last_character_is_break_character = false; - int start = 0; - bool has_new_token_generated_for_text = false; - const auto& labels = labels_tensor.unaligned_flat<int32>(); - for (int i = 0; i < chars.size(); ++i) { - const bool is_break_character = IsBreakChar(chars[i]); - if (!is_break_character) { - if (labels(i) == split_label || !has_new_token_generated_for_text || - (last_character_is_break_character && - force_split_at_break_character)) { - tokens->emplace_back(chars[i].data(), chars[i].length()); - begin_offset->push_back(start); - end_offset->push_back(start + chars[i].length()); - *num_tokens += 1; - has_new_token_generated_for_text = true; - } else { - tokens->back().append(chars[i].data(), chars[i].length()); - end_offset->back() = start + chars[i].length(); - } - } - - start += chars[i].length(); - last_character_is_break_character = is_break_character; - } - - return absl::OkStatus(); -} - -} // namespace - -class SplitMergeTokenizeWithOffsetsOp : public OpKernel { - public: - explicit SplitMergeTokenizeWithOffsetsOp(OpKernelConstruction* ctx) - : OpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("force_split_at_break_character", - &force_split_at_break_character_)); - } - - void Compute(OpKernelContext* ctx) override { - const Tensor* input_values; - OP_REQUIRES_OK(ctx, ctx->input("input_values", &input_values)); - - const Tensor* labels; - OP_REQUIRES_OK(ctx, ctx->input("labels", &labels)); - const Tensor* row_splits; - OP_REQUIRES_OK(ctx, ctx->input("row_splits", &row_splits)); - OP_REQUIRES(ctx, input_values->dim_size(0) == row_splits->dim_size(0) - 1, - errors::InvalidArgument("Expecting row_splits have ", - input_values->dim_size(0) + 1, - " elements, got ", - row_splits->dim_size(0))); - - std::vector<string> tokens; - std::vector<int> begin_offset; - std::vector<int> end_offset; - std::vector<int> output_row_splits(1, 0); - - // Iterate through all the values and tokenize them. - const auto& values_vec = input_values->flat<tstring>(); - const auto& row_splits_vec = row_splits->flat<int32>(); - for (int i = 0; i < values_vec.size(); ++i) { - // Tokenize into tokens and record the offset locations. - int num_tokens = 0; - OP_REQUIRES_OK( - ctx, TokenizeByLabel( - values_vec(i), - labels->Slice(row_splits_vec(i), row_splits_vec(i + 1)), - force_split_at_break_character_, &tokens, &begin_offset, - &end_offset, &num_tokens)); - - // Record the row splits. - output_row_splits.push_back(num_tokens + output_row_splits.back()); - } - - std::vector<int64> output_tokens_shape; - output_tokens_shape.push_back(tokens.size()); - - std::vector<int64> output_row_splits_shape; - output_row_splits_shape.push_back(output_row_splits.size()); - - Tensor* output_values; - OP_REQUIRES_OK(ctx, ctx->allocate_output("output_values", - TensorShape(output_tokens_shape), - &output_values)); - auto output_values_vec = output_values->vec<tstring>(); - - Tensor* output_row_splits_tensor; - OP_REQUIRES_OK(ctx, - ctx->allocate_output("output_row_splits", - TensorShape(output_row_splits_shape), - &output_row_splits_tensor)); - auto output_row_splits_vec = output_row_splits_tensor->vec<int64>(); - - Tensor* start_values; - OP_REQUIRES_OK(ctx, ctx->allocate_output("start_values", - TensorShape(output_tokens_shape), - &start_values)); - auto start_values_vec = start_values->vec<int64>(); - - Tensor* limit_values; - OP_REQUIRES_OK(ctx, ctx->allocate_output("limit_values", - TensorShape(output_tokens_shape), - &limit_values)); - auto limit_values_vec = limit_values->vec<int64>(); - - for (int i = 0; i < tokens.size(); ++i) { - output_values_vec(i) = tokens[i]; - } - - for (int i = 0; i < output_row_splits.size(); ++i) { - output_row_splits_vec(i) = output_row_splits[i]; - } - - for (int i = 0; i < begin_offset.size(); ++i) { - start_values_vec(i) = begin_offset[i]; - } - - for (int i = 0; i < end_offset.size(); ++i) { - limit_values_vec(i) = end_offset[i]; - } - } - - private: - bool force_split_at_break_character_; - - TF_DISALLOW_COPY_AND_ASSIGN(SplitMergeTokenizeWithOffsetsOp); -}; - -REGISTER_KERNEL_BUILDER( - Name("SplitMergeTokenizeWithOffsets").Device(DEVICE_CPU), - SplitMergeTokenizeWithOffsetsOp); - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/string_vocab.cc b/tensorflow_text/core/kernels/string_vocab.cc deleted file mode 100644 index a2c239a93..000000000 --- a/tensorflow_text/core/kernels/string_vocab.cc +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/string_vocab.h" - -namespace tensorflow { -namespace text { - -StringVocab::StringVocab(const std::vector<std::string>& vocab) - : vocab_(vocab) { - index_map_.reserve(vocab.size()); - for (int i = 0; i < vocab.size(); ++i) { - index_map_[vocab_[i]] = i; - } -} - -LookupStatus StringVocab::Contains(absl::string_view key, bool* value) const { - *value = index_map_.contains(key); - return LookupStatus(); -} - -absl::optional<int> StringVocab::LookupId(absl::string_view key) const { - auto it = index_map_.find(key); - if (it == index_map_.end()) { - return absl::nullopt; - } else { - return it->second; - } -} - -// Returns the key of `vocab_id` or empty if `vocab_id` is not valid. -absl::optional<absl::string_view> StringVocab::LookupWord(int vocab_id) const { - if (vocab_id >= vocab_.size() || vocab_id < 0) { - return absl::nullopt; - } - return vocab_[vocab_id]; -} -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/string_vocab.h b/tensorflow_text/core/kernels/string_vocab.h index 4590f2775..d58daa772 100644 --- a/tensorflow_text/core/kernels/string_vocab.h +++ b/tensorflow_text/core/kernels/string_vocab.h @@ -15,34 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_STRING_VOCAB_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_STRING_VOCAB_H_ -#include <string> -#include <vector> - -#include "absl/container/flat_hash_map.h" -#include "tensorflow_text/core/kernels/wordpiece_tokenizer.h" - -namespace tensorflow { -namespace text { - -// An implementation of WordpieceVocab, used (1) to store the input vocabulary -// and (2) to call the original implementation of WordPiece tokenization to -// pre-compute the result for the suffix indicator string. -class StringVocab : public WordpieceVocab { - public: - explicit StringVocab(const std::vector<std::string>& vocab); - StringVocab(const StringVocab&) = delete; - StringVocab& operator=(const StringVocab&) = delete; - LookupStatus Contains(absl::string_view key, bool* value) const override; - absl::optional<int> LookupId(absl::string_view key) const; - // Returns the key of `vocab_id` or empty if `vocab_id` is not valid. - absl::optional<absl::string_view> LookupWord(int vocab_id) const; - int Size() const { return index_map_.size(); } - - private: - std::vector<std::string> vocab_; - absl::flat_hash_map<absl::string_view, int> index_map_; -}; -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/string_vocab.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_STRING_VOCAB_H_ diff --git a/tensorflow_text/core/kernels/text_kernels_test_util.cc b/tensorflow_text/core/kernels/text_kernels_test_util.cc deleted file mode 100644 index 15da35665..000000000 --- a/tensorflow_text/core/kernels/text_kernels_test_util.cc +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/text_kernels_test_util.h" - -using ::testing::MakeMatcher; -using ::testing::Matcher; -using ::testing::MatchResultListener; - -namespace tensorflow { -namespace text_kernels_test_util { - -bool TensorEqMatcher::MatchAndExplain( - Tensor actual, ::testing::MatchResultListener* listener) const { - std::string expect_values = expect_.SummarizeValue(expect_.NumElements()); - std::string actual_values = actual.SummarizeValue(actual.NumElements()); - if (expect_.dtype() != actual.dtype() || expect_.shape() != actual.shape() || - expect_values != actual_values) { - *listener << "\n dtype=" << DataTypeString(actual.dtype()); - *listener << "\n shape=" << actual.shape().DebugString(); - *listener << "\n values=" << actual_values; - return false; - } - return true; -} - -void TensorEqMatcher::DescribeTo(::std::ostream* gmock_os) const { - *gmock_os << "dtype=" << DataTypeString(expect_.dtype()) - << "\n shape=" << expect_.shape().DebugString() - << "\n values=" - << expect_.SummarizeValue(expect_.NumElements()); -} - -void TensorEqMatcher::DescribeNegationTo(::std::ostream* gmock_os) const { - *gmock_os << "is not equal to " << expect_.DebugString(); -} - -bool TensorHasShapeMatcher::MatchAndExplain( - Tensor actual, ::testing::MatchResultListener* listener) const { - if (expect_ != actual.shape()) { - *listener << "\n shape=" << actual.shape().DebugString(); - return false; - } - return true; -} - -void TensorHasShapeMatcher::DescribeTo(::std::ostream* gmock_os) const { - *gmock_os << "shape=" << expect_.DebugString(); -} - -void TensorHasShapeMatcher::DescribeNegationTo(::std::ostream* gmock_os) const { - *gmock_os << "shape!=" << expect_.DebugString(); -} - -Matcher<Tensor> TensorHasShape(const TensorShape& shape) { - // MakeMatcher takes ownership of the TensorHasShapeMatcher. - return MakeMatcher(new TensorHasShapeMatcher(shape)); -} - -} // namespace text_kernels_test_util -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/text_kernels_test_util.h b/tensorflow_text/core/kernels/text_kernels_test_util.h index 9762b385f..c992dc278 100644 --- a/tensorflow_text/core/kernels/text_kernels_test_util.h +++ b/tensorflow_text/core/kernels/text_kernels_test_util.h @@ -12,112 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -// GMock matchers for testing text kernels: -// TensorHasShapeAndValues<DTYPE>({dim1, ..., dimN}, {v1, v2, ..., vN}); -// VectorEq<DTYPE>({v1, v2, ..., vN}); -// MatrixEq<DTYPE>({{v1_1, ..., v1_M}, ..., {vN_1, ..., vN_M}}); -// TensorHasShape({dim1, ..., dimN}); +#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_TEXT_KERNELS_TEST_UTIL_H_ +#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_TEXT_KERNELS_TEST_UTIL_H_ -#ifndef TENSORFLOW_TEXT_CORE_KERNELS_TEXT_KERNELS_TEST_UTIL_H_ -#define TENSORFLOW_TEXT_CORE_KERNELS_TEXT_KERNELS_TEST_UTIL_H_ +#include "tensorflow/core/kernels/text/text_kernels_test_util.h" -#include <gmock/gmock.h> -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/framework/tensor_testutil.h" - -namespace tensorflow { -namespace text_kernels_test_util { - -// GMock MatcherInterface for testing tensor equality. -class TensorEqMatcher : public ::testing::MatcherInterface<Tensor> { - public: - explicit TensorEqMatcher(const Tensor& expect) : expect_(expect) {} - bool MatchAndExplain(Tensor actual, - ::testing::MatchResultListener* listener) const override; - void DescribeTo(::std::ostream* gmock_os) const override; - void DescribeNegationTo(::std::ostream* gmock_os) const override; - - private: - Tensor expect_; -}; - -// GMock MatcherInterface for testing tensor shapes. -class TensorHasShapeMatcher : public ::testing::MatcherInterface<Tensor> { - public: - explicit TensorHasShapeMatcher(const TensorShape& expect) : expect_(expect) {} - bool MatchAndExplain(Tensor actual, - ::testing::MatchResultListener* listener) const override; - void DescribeTo(::std::ostream* gmock_os) const override; - void DescribeNegationTo(::std::ostream* gmock_os) const override; - - private: - TensorShape expect_; -}; - -// Returns a gmock matcher that checks whether a given tensor has the specified -// dtype, values, and shape. dtype is specified using the template parameter. -// values are specified as a flattened vector. -// Example: -// EXPECT_THAT(*GetOutput(0), -// TensorHasShapeAndValues<int64>({3, 2}, {1, 2, 3, 4, 5, 6}); -template <typename DTYPE> -::testing::Matcher<Tensor> TensorHasShapeAndValues( - const TensorShape& shape, const std::vector<DTYPE>& values) { - Tensor expect = test::AsTensor<DTYPE>(values, shape); - // MakeMatcher takes ownership of the TensorEqMatcher. - return ::testing::MakeMatcher(new TensorEqMatcher(expect)); -} - -// Returns a gmock matcher that checks whether a given tensor is a 1-D tensor -// with the specified dtype and values. dtype is specified using the template -// parameter. -// Example: -// EXPECT_THAT(*GetOutput(0), -// VectorEq<int64>({1, 2, 3, 4, 5, 6}); -template <typename DTYPE> -::testing::Matcher<Tensor> VectorEq(const std::vector<DTYPE>& values) { - int64_t nvals = values.size(); - Tensor expect = test::AsTensor<DTYPE>(values, {nvals}); - // MakeMatcher takes ownership of the TensorEqMatcher. - return ::testing::MakeMatcher(new TensorEqMatcher(expect)); -} - -// Returns a gmock matcher that checks whether a given tensor is a 2-D tensor -// with the specified dtype and values. dtype is specified using the template -// parameter. values are specified as a nested vector. All rows of the values -// vector must have the same length. The values vector may not be empty, -// since we can't infer the number of columns for an empty matrix; to test -// empty matrices, use the more general TensorHasShapeAndValues() instead. -// Example: -// EXPECT_THAT(*GetOutput(0), -// MatrixEq<int64>({{1, 2, 3}, {4, 5, 6}}); -template <typename DTYPE> -::testing::Matcher<Tensor> MatrixEq( - const std::vector<std::vector<DTYPE>>& values) { - int64_t nrows = values.size(); - CHECK_GT(nrows, 0) // Crash OK - << "Invalid use of MatrixEq: to test empty matrices, use " - << "TensorHasShapeAndValues<dtype>{{0, ndims}, {}} instead."; - int64_t ncols = values[0].size(); - std::vector<DTYPE> flat; - for (const auto& row : values) { - CHECK_EQ(ncols, row.size()) // Crash OK - << "Invalid use of MatrixEq: all rows must have equal length"; - flat.insert(flat.end(), row.begin(), row.end()); - } - Tensor expect = test::AsTensor<DTYPE>(flat, TensorShape({nrows, ncols})); - // MakeMatcher takes ownership of the TensorEqMatcher. - return ::testing::MakeMatcher(new TensorEqMatcher(expect)); -} - -// Returns a gmock matcher that checks whether a given tensor has a specified -// shape. -// Example: -// EXPECT_THAT(*GetOutput(0), TensorHasShape({2, 8}); -::testing::Matcher<Tensor> TensorHasShape(const TensorShape& shape); - -} // namespace text_kernels_test_util -} // namespace tensorflow - -#endif // TENSORFLOW_TEXT_CORE_KERNELS_TEXT_KERNELS_TEST_UTIL_H_ +#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_TEXT_KERNELS_TEST_UTIL_H_ diff --git a/tensorflow_text/core/kernels/tokenizer_from_logits_kernel.cc b/tensorflow_text/core/kernels/tokenizer_from_logits_kernel.cc deleted file mode 100644 index 39262dd94..000000000 --- a/tensorflow_text/core/kernels/tokenizer_from_logits_kernel.cc +++ /dev/null @@ -1,235 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include <limits> -#include <memory> -#include <string> -#include <vector> - -#include "absl/strings/str_cat.h" -#include "icu4c/source/common/unicode/uchar.h" -#include "icu4c/source/common/unicode/umachine.h" -#include "icu4c/source/common/unicode/utf8.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/lib/core/status.h" - -namespace tensorflow { -namespace text { - -namespace { - -// Returns the length (number of bytes) of the UTF8 code point starting at src, -// by reading only the byte from address src. -// -// The result is a number from the set {1, 2, 3, 4}. -int OneCharLen(const char* src) { - // On most platforms, char is unsigned by default, but iOS is an exception. - // The cast below makes sure we always interpret *src as an unsigned char. - return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4" - [(*(reinterpret_cast<const unsigned char*>(src)) & 0xFF) >> 4]; -} - -bool GetUTF8Chars(absl::string_view text, - std::vector<absl::string_view>* chars) { - const char* start = text.data(); - const char* end = text.data() + text.size(); - while (start < end) { - const int char_length = OneCharLen(start); - if (char_length <= 0) { - return false; - } - chars->emplace_back(start, char_length); - start += char_length; - } - return true; -} - -bool IsBreakChar(absl::string_view text) { - UChar32 c; - int position = 0; - U8_NEXT_OR_FFFD(text.data(), position, text.length(), c); - return u_isUWhiteSpace(c); -} - -// Tokenizes text, the input string #(batch_index). Knowing the batch_index -// allows us to retrieve the corresponding data from logits. I.e., the logits -// for the i-th character from text are logits(batch_index, i, 0) (for the -// "split" action) and logits(batch_index, i, 1) (for the "merge" action). -Status TokenizeByLogits(const absl::string_view& text, - const TTypes<const float, 3>::Tensor& logits, - int batch_index, - bool force_split_at_break_character, - std::vector<std::string>* tokens, - std::vector<int>* begin_offset, - std::vector<int>* end_offset, int* num_tokens) { - std::vector<absl::string_view> chars; - if (!GetUTF8Chars(text, &chars)) { - return Status( - static_cast<absl::StatusCode>(absl::StatusCode::kInvalidArgument), - absl::StrCat("Input string is not utf8 valid: ", text)); - } - - if (chars.size() > logits.dimension(1)) { - return Status( - static_cast<absl::StatusCode>(absl::StatusCode::kInvalidArgument), - absl::StrCat("Number of logits, ", logits.dimension(1), - ", is insufficient for text \"", text, "\"")); - } - - bool last_character_is_break_character = false; - int start = 0; - bool has_new_token_generated_for_text = false; - for (int i = 0; i < chars.size(); ++i) { - const bool is_break_character = IsBreakChar(chars[i]); - if (!is_break_character) { - const float logit_split = logits(batch_index, i, 0); - const float logit_merge = logits(batch_index, i, 1); - if ((logit_split > logit_merge) || - !has_new_token_generated_for_text || - (last_character_is_break_character && - force_split_at_break_character)) { - tokens->emplace_back(chars[i].data(), chars[i].length()); - begin_offset->push_back(start); - end_offset->push_back(start + chars[i].length()); - *num_tokens += 1; - has_new_token_generated_for_text = true; - } else { - tokens->back().append(chars[i].data(), chars[i].length()); - end_offset->back() = start + chars[i].length(); - } - } - - start += chars[i].length(); - last_character_is_break_character = is_break_character; - } - - return absl::OkStatus(); -} - -} // namespace - -class TokenizerFromLogitsOp : public OpKernel { - public: - explicit TokenizerFromLogitsOp(OpKernelConstruction* ctx) - : OpKernel(ctx) {} - - void Compute(OpKernelContext* ctx) override { - const Tensor* strings; - OP_REQUIRES_OK(ctx, ctx->input("strings", &strings)); - const Tensor* logits; - OP_REQUIRES_OK(ctx, ctx->input("logits", &logits)); - OP_REQUIRES(ctx, strings->dim_size(0) == logits->dim_size(0), - errors::InvalidArgument("Expecting logits to have ", - strings->dim_size(0), - " rows, got ", - logits->dim_size(0))); - const Tensor* force_split_at_break_character; - OP_REQUIRES_OK(ctx, ctx->input("force_split_at_break_character", - &force_split_at_break_character)); - const bool force_split_at_break_character_bool = - force_split_at_break_character->scalar<bool>()(); - - std::vector<string> tokens; - std::vector<int> begin_offset; - std::vector<int> end_offset; - std::vector<int> output_row_splits(1, 0); - - // Tensor to access values from logits. - const TTypes<const float, 3>::Tensor logits_tensor = - logits->tensor<float, 3>(); - - // Iterate through all the values and tokenize them. - const auto& strings_vec = strings->flat<tstring>(); - OP_REQUIRES(ctx, logits_tensor.dimension(0) >= strings_vec.size(), - errors::Internal("Bad logits dimension #0: ", - logits_tensor.dimension(0), " < ", - strings_vec.size())); - // Dimension #1 of logits will be checked inside TokenizeByLogits. - OP_REQUIRES(ctx, logits_tensor.dimension(2) == 2, - errors::Internal("Bad logits dimension #2: ", - logits_tensor.dimension(2), " != 2")); - for (int i = 0; i < strings_vec.size(); ++i) { - // Tokenize into tokens and record the offset locations. - int num_tokens = 0; - OP_REQUIRES_OK( - ctx, TokenizeByLogits( - strings_vec(i), - logits_tensor, i, - force_split_at_break_character_bool, - &tokens, &begin_offset, &end_offset, &num_tokens)); - - // Record the row splits. - output_row_splits.push_back(num_tokens + output_row_splits.back()); - } - - std::vector<int64> output_tokens_shape; - output_tokens_shape.push_back(tokens.size()); - - std::vector<int64> output_row_splits_shape; - output_row_splits_shape.push_back(output_row_splits.size()); - - Tensor* output_values; - OP_REQUIRES_OK(ctx, ctx->allocate_output("output_values", - TensorShape(output_tokens_shape), - &output_values)); - auto output_values_vec = output_values->vec<tstring>(); - - Tensor* output_row_splits_tensor; - OP_REQUIRES_OK(ctx, - ctx->allocate_output("row_splits", - TensorShape(output_row_splits_shape), - &output_row_splits_tensor)); - auto output_row_splits_vec = output_row_splits_tensor->vec<int64>(); - - Tensor* start_values; - OP_REQUIRES_OK(ctx, ctx->allocate_output("start_values", - TensorShape(output_tokens_shape), - &start_values)); - auto start_values_vec = start_values->vec<int64>(); - - Tensor* limit_values; - OP_REQUIRES_OK(ctx, ctx->allocate_output("limit_values", - TensorShape(output_tokens_shape), - &limit_values)); - auto limit_values_vec = limit_values->vec<int64>(); - - for (int i = 0; i < tokens.size(); ++i) { - output_values_vec(i) = tokens[i]; - } - - for (int i = 0; i < output_row_splits.size(); ++i) { - output_row_splits_vec(i) = output_row_splits[i]; - } - - for (int i = 0; i < begin_offset.size(); ++i) { - start_values_vec(i) = begin_offset[i]; - } - - for (int i = 0; i < end_offset.size(); ++i) { - limit_values_vec(i) = end_offset[i]; - } - } - - private: - TF_DISALLOW_COPY_AND_ASSIGN(TokenizerFromLogitsOp); -}; - -REGISTER_KERNEL_BUILDER( - Name("TokenizerFromLogits").Device(DEVICE_CPU), - TokenizerFromLogitsOp); - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/trimmer.h b/tensorflow_text/core/kernels/trimmer.h index f2781fc93..7c4f463cb 100644 --- a/tensorflow_text/core/kernels/trimmer.h +++ b/tensorflow_text/core/kernels/trimmer.h @@ -15,78 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_TRIMMER_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_TRIMMER_H_ -#include <vector> - -#include "absl/types/span.h" - -namespace tensorflow { -namespace text { - -using Mask = std::vector<bool>; -template <typename T> -using Values = std::vector<T>; -template <typename T> -using ValuesSpan = absl::Span<T>; -template <typename Tsplits> -using RowSplits = std::vector<Tsplits>; -template <typename Tsplits> -using RowSplitsSpan = absl::Span<Tsplits>; - -template <typename T> -class Trimmer { - using ValuesT = Values<T>; - - public: - // Generates masks for a single batch of values. - virtual std::vector<Mask> GenerateMasks( - const std::vector<ValuesT>& values) const = 0; - - // Trims a single batch of values. - virtual void Trim(std::vector<ValuesT>* values) const = 0; - - virtual ~Trimmer() = default; -}; - -template <typename T, typename Tsplits> -class BatchTrimmer { - using Values_ = Values<T>; - using ValuesSpan_ = ValuesSpan<T>; - using RowSplits_ = RowSplits<Tsplits>; - using RowSplitsSpan_ = RowSplitsSpan<Tsplits>; - - public: - // Generates masks for a batch of value row splits. - // - // Args: - // row_splits: Row splits of the values in the shape [batch, (num values)] - // - // Returns: - // The returned value is a flattened list of mask values which can be split - // into batches using the same input row splits. - virtual std::vector<Mask> GenerateMasksBatch( - const std::vector<RowSplits_>& row_splits) const = 0; - virtual std::vector<Mask> GenerateMasksBatch( - const std::vector<RowSplitsSpan_>& row_splits) const = 0; - - // Trims a batch of values given their flattened values and row splits. - // - // Args: - // flat_values: Flattened values in shape [batch, (num values)] - // row_splits: Row splits of the values in the shape [batch, (num values)] - // - // Returns: - // The returned values are the flattened trimmed values and new row splits. - virtual std::pair<std::vector<Values_>, std::vector<RowSplits_>> TrimBatch( - const std::vector<Values_>& flat_values, - const std::vector<RowSplits_>& row_splits) const = 0; - virtual std::pair<std::vector<Values_>, std::vector<RowSplits_>> TrimBatch( - const std::vector<ValuesSpan_>& flat_values, - const std::vector<RowSplitsSpan_>& row_splits) const = 0; - - virtual ~BatchTrimmer() = default; -}; - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/trimmer.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_TRIMMER_H_ diff --git a/tensorflow_text/core/kernels/unicode_script_tokenize_kernel.cc b/tensorflow_text/core/kernels/unicode_script_tokenize_kernel.cc deleted file mode 100644 index 6217f1bd5..000000000 --- a/tensorflow_text/core/kernels/unicode_script_tokenize_kernel.cc +++ /dev/null @@ -1,194 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include <string.h> - -#include <vector> - -#include "icu4c/source/common/unicode/errorcode.h" -#include "icu4c/source/common/unicode/uchar.h" -#include "icu4c/source/common/unicode/uscript.h" -#include "tensorflow/core/framework/kernel_def_builder.h" -#include "tensorflow/core/framework/lookup_interface.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/framework/tensor_types.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/core/threadpool.h" -#include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/macros.h" - -namespace tensorflow { -namespace text { - -template <typename SPLITS_TYPE> -class UnicodeScriptTokenizeWithOffsetsOp : public OpKernel { - public: - explicit UnicodeScriptTokenizeWithOffsetsOp(OpKernelConstruction* ctx) - : OpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_whitespace", &keep_whitespace_)); - } - - /** - * Breaks a series of codepoints into individual groups based on the script - * code as defined by ICU. - * - * We gain a dimension while tokenizing since a series of integer codepoints - * is tokenized into different codepoint groups. - * - * This accepts two input tensors: a rank 1 tensor of codepoint values and - * a single rank 1 tensor of splits which determine where each string begins - * and ends from the provided codepoints. - */ - void Compute(OpKernelContext* context) override { - // Get inputs - const Tensor& input_values_tensor = context->input(0); - const auto input_values_flat = input_values_tensor.flat<int32>(); - const Tensor& input_splits_tensor = context->input(1); - const auto input_splits_flat = input_splits_tensor.flat<SPLITS_TYPE>(); - - // Since we limit to a 2-D input (flat_values of rank 1 and a single splits - // tensor), our output dimension will always be 3-D (flat_values of rank 1 - // with two splits - inner for the tokenized values and the outer for those - // grouped by the original strings). - // A few things to note: - // 1) The values and inner splits of the tokenized strings have an unknown - // length, as well as the offsets, so we allocate them at the end. - // 2) The outer splits of the tokenized strings matches that of the offset - // splits. Thus, we will only return one set and use it for all of them. - // 3) The outer splits shape will match the original input_splits. - Tensor* output_outer_splits_tensor; - OP_REQUIRES_OK(context, - context->allocate_output("output_outer_splits", - input_splits_tensor.shape(), - &output_outer_splits_tensor)); - auto output_outer_splits_flat = - output_outer_splits_tensor->flat<SPLITS_TYPE>(); - - std::vector<int32> output_values; - std::vector<SPLITS_TYPE> output_values_inner_splits; - std::vector<int64> output_offset_starts; - std::vector<int64> output_offset_limits; - - // Loop over the codepoints (a split at a time) and create splits of tokens. - icu::ErrorCode status; - for (int splits_idx = 0; splits_idx < input_splits_flat.size() - 1; - splits_idx++) { - output_outer_splits_flat(splits_idx) = output_offset_starts.size(); - UScriptCode prev_script = USCRIPT_INVALID_CODE; - bool token_has_start_set = false; - int32 curr_skipped_spaces = 0; // Used when computing the end of a token - const int curr_word_start_idx = input_splits_flat(splits_idx); - bool was_space = false; - for (int values_idx = curr_word_start_idx; - values_idx < input_splits_flat(splits_idx + 1); values_idx++) { - const int32 input_value = input_values_flat(values_idx); - const bool is_space = u_isUWhiteSpace(input_value); - UScriptCode script = uscript_getScript(input_value, status); - // Split these failures out as if they are a different code and ignore - // the error. - if (status.isFailure()) { - status.reset(); - script = USCRIPT_INVALID_CODE; - } - // Split out a new token if the unicode script changes from the - // previous token. - if (script != prev_script || - (keep_whitespace_ && is_space != was_space)) { - if (token_has_start_set) { - output_offset_limits.push_back(values_idx - curr_word_start_idx - - curr_skipped_spaces); - } - prev_script = script; - token_has_start_set = false; - } - // Only copy characters other than whitespace. Because of this, also do - // not start new tokens until a character other than a space is reached. - if (!is_space || keep_whitespace_) { - if (!token_has_start_set) { - // Set token start offset relative to current string. - output_offset_starts.push_back(values_idx - curr_word_start_idx); - // Set split to indicate start of a new token. - output_values_inner_splits.push_back(output_values.size()); - token_has_start_set = true; - } - output_values.push_back(input_value); - } - if (!keep_whitespace_) { - if (is_space) { - curr_skipped_spaces++; - } else { - curr_skipped_spaces = 0; - } - } - was_space = is_space; - } - // Looping through the codepoints for current tokens complete. Now set the - // last limit of out last token (if we found a start earlier). - if (token_has_start_set) { - output_offset_limits.push_back(input_splits_flat(splits_idx + 1) - - curr_word_start_idx - - curr_skipped_spaces); - } - } - // Now set the closing value of our splits. - output_outer_splits_flat(input_splits_flat.size() - 1) = - output_offset_starts.size(); - output_values_inner_splits.push_back(output_values.size()); - -// Allocate output & fill output tensors. -#define DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(name, dtype) \ - int64 name##_size = name.size(); \ - Tensor* name##_tensor = nullptr; \ - OP_REQUIRES_OK(context, \ - context->allocate_output(#name, TensorShape({name##_size}), \ - &name##_tensor)); \ - auto name##_data = name##_tensor->flat<dtype>().data(); \ - /* For empty outputs, the data pointer might be null. */ \ - if (name##_size > 0) { \ - memcpy(name##_data, name.data(), name##_size * sizeof(dtype)); \ - } \ - do { \ - } while (false) - - DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_values, int32); - DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_values_inner_splits, - SPLITS_TYPE); - DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_offset_starts, int64); - DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_offset_limits, int64); - -#undef DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR - } - - private: - bool keep_whitespace_; - - TF_DISALLOW_COPY_AND_ASSIGN(UnicodeScriptTokenizeWithOffsetsOp); -}; - -REGISTER_KERNEL_BUILDER(Name("UnicodeScriptTokenizeWithOffsets") - .Device(DEVICE_CPU) - .TypeConstraint<int32>("Tsplits"), - UnicodeScriptTokenizeWithOffsetsOp<int32>); -REGISTER_KERNEL_BUILDER(Name("UnicodeScriptTokenizeWithOffsets") - .Device(DEVICE_CPU) - .TypeConstraint<int64>("Tsplits"), - UnicodeScriptTokenizeWithOffsetsOp<int64>); - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/unicode_script_tokenize_kernel_test.cc b/tensorflow_text/core/kernels/unicode_script_tokenize_kernel_test.cc deleted file mode 100644 index ebd84b10f..000000000 --- a/tensorflow_text/core/kernels/unicode_script_tokenize_kernel_test.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include <vector> - -#include <gmock/gmock.h> -#include <gtest/gtest.h> -#include "tensorflow/core/framework/fake_input.h" -#include "tensorflow/core/framework/node_def_builder.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/kernels/ops_testutil.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow_text/core/kernels/text_kernels_test_util.h" - -namespace tensorflow { -namespace text { - -using tensorflow::FakeInput; -using tensorflow::NodeDefBuilder; -using tensorflow::Status; -using tensorflow::TensorShape; -using tensorflow::text_kernels_test_util::VectorEq; - -class UnicodeScriptTokenizeWithOffsetsKernelTest - : public tensorflow::OpsTestBase { - public: - void MakeOp() { - TF_ASSERT_OK(NodeDefBuilder("tested_op", "UnicodeScriptTokenizeWithOffsets") - .Input(FakeInput()) - .Input(FakeInput()) - .Finalize(node_def())); - TF_ASSERT_OK(InitOp()); - } -}; - -TEST_F(UnicodeScriptTokenizeWithOffsetsKernelTest, Test) { - MakeOp(); - AddInputFromArray<int32_t>(TensorShape({6}), {111, 112, 32, 116, 117, 118}); - AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6}); - TF_ASSERT_OK(RunOpKernel()); - - std::vector<int32_t> expected_values({111, 112, 116, 117, 118}); - std::vector<int64_t> expected_values_inner_splits({0, 2, 3, 5}); - std::vector<int64_t> expected_offset_starts({0, 3, 0}); - std::vector<int64_t> expected_offset_limits({2, 4, 2}); - std::vector<int64_t> output_outer_splits({0, 2, 3}); - EXPECT_THAT(*GetOutput(0), VectorEq(expected_values)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_values_inner_splits)); - EXPECT_THAT(*GetOutput(2), VectorEq(expected_offset_starts)); - EXPECT_THAT(*GetOutput(3), VectorEq(expected_offset_limits)); - EXPECT_THAT(*GetOutput(4), VectorEq(output_outer_splits)); -} - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/utf8_binarize.cc b/tensorflow_text/core/kernels/utf8_binarize.cc deleted file mode 100644 index 2bba0de80..000000000 --- a/tensorflow_text/core/kernels/utf8_binarize.cc +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/utf8_binarize.h" -#include <algorithm> -#include <cassert> - -#include "absl/strings/string_view.h" -#include "icu4c/source/common/unicode/utf8.h" - -namespace tensorflow { -namespace text { - -void Utf8Binarize( - absl::string_view input, int word_length, int bits_per_char, - int replacement, /* out */ absl::Span<float> result) { - assert(result.size() == word_length * bits_per_char); - - const int input_size = input.size(); - int string_pos = 0; - int chars = 0; - int result_pos = 0; - while (string_pos < input_size && chars < word_length) { - UChar32 chr; - U8_NEXT(input, string_pos, input_size, chr); - if (chr < 0) { - // Decoding failure. - chr = replacement; - } - int bits = bits_per_char; - while (bits-- != 0) { - result[result_pos++] = (chr & 1) == 1 ? 1.0f : 0.0f; - chr >>= 1; - } - ++chars; - } - - std::fill(result.begin() + result_pos, result.end(), 0.0f); -} - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/utf8_binarize.h b/tensorflow_text/core/kernels/utf8_binarize.h index 908cf006b..a6e630ded 100644 --- a/tensorflow_text/core/kernels/utf8_binarize.h +++ b/tensorflow_text/core/kernels/utf8_binarize.h @@ -15,21 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_UTF8_BINARIZE_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_UTF8_BINARIZE_H_ -#include "absl/strings/string_view.h" -#include "absl/types/span.h" - -namespace tensorflow { -namespace text { - -// Stores low-endian floating-point bitwise representations of Unicode code -// points of `input` in `result` (`result.size()` is required to be exactly -// `word_length * bits_per_char` - output is padded / truncated accordingly). -// Replacements (for invalid UTF sequences) are represented by the -// `bits_per_char` lowest bits of `replacement`. -void Utf8Binarize(absl::string_view input, int word_length, int bits_per_char, - int replacement, /* out */ absl::Span<float> result); - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/utf8_binarize.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_UTF8_BINARIZE_H_ diff --git a/tensorflow_text/core/kernels/utf8_binarize_kernel.h b/tensorflow_text/core/kernels/utf8_binarize_kernel.h index 3dfdcaad5..7d3e8847f 100644 --- a/tensorflow_text/core/kernels/utf8_binarize_kernel.h +++ b/tensorflow_text/core/kernels/utf8_binarize_kernel.h @@ -15,18 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_UTF8_BINARIZE_KERNEL_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_UTF8_BINARIZE_KERNEL_H_ -#include "tensorflow/lite/kernels/shim/tf_op_shim.h" -#include "tensorflow_text/core/kernels/utf8_binarize_kernel_template.h" - -namespace tensorflow { -namespace text { - -class Utf8BinarizeOpKernel : public tflite::shim::TfOpKernel<Utf8BinarizeOp> { - public: - using TfOpKernel::TfOpKernel; -}; - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/utf8_binarize_kernel.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_UTF8_BINARIZE_KERNEL_H_ diff --git a/tensorflow_text/core/kernels/utf8_binarize_kernel_template.h b/tensorflow_text/core/kernels/utf8_binarize_kernel_template.h index c8304ea6f..c4e921264 100644 --- a/tensorflow_text/core/kernels/utf8_binarize_kernel_template.h +++ b/tensorflow_text/core/kernels/utf8_binarize_kernel_template.h @@ -15,170 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_UTF8_BINARIZE_KERNEL_TEMPLATE_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_UTF8_BINARIZE_KERNEL_TEMPLATE_H_ -#include <cstdint> -#include <vector> - -#include "absl/status/status.h" -#include "absl/strings/str_cat.h" -#include "tensorflow/core/platform/tstring.h" -#include "tensorflow/lite/kernels/shim/op_kernel.h" -#include "tensorflow/lite/kernels/shim/shape.h" -#include "tensorflow/lite/kernels/shim/status_macros.h" -#include "tensorflow_text/core/kernels/utf8_binarize.h" - -namespace tensorflow { -namespace text { - -template <tflite::shim::Runtime Rt> -class Utf8BinarizeOp : public tflite::shim::OpKernelShim<Utf8BinarizeOp, Rt> { - private: - enum Inputs { kInputTokens = 0 }; - enum Outputs { kOutputBinarizations = 0 }; - - using typename tflite::shim::OpKernelShim<Utf8BinarizeOp, Rt>::InitContext; - using typename tflite::shim::OpKernelShim<Utf8BinarizeOp, Rt>::InvokeContext; - using typename tflite::shim::OpKernelShim<Utf8BinarizeOp, - Rt>::ShapeInferenceContext; - - public: - Utf8BinarizeOp() = default; - static constexpr char kOpName[] = "TFText>Utf8Binarize"; - static constexpr char kDoc[] = R"doc( - Decode a UTF-8 string into Unicode code points - and return their bitwise little-endian representations - (see the [RetVec paper](https://arxiv.org/abs/2302.09207)). - )doc"; - - static const char* OpName() { return kOpName; } - static const char* Doc() { return kDoc; } - - // Attributes declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector<std::string> Attrs(); - - // Inputs declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector<std::string> Inputs(); - - // Outputs declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector<std::string> Outputs(); - - // Initializes the op - absl::Status Init(InitContext* context); - - // Runs the operation - absl::Status Invoke(InvokeContext* context); - - // Shape inference - static absl::Status ShapeInference(ShapeInferenceContext* c); - - private: - inline static constexpr absl::string_view kMaxCharsAttr = "word_length"; - inline static constexpr absl::string_view kBitsPerCharAttr = "bits_per_char"; - inline static constexpr absl::string_view kReplacementCharAttr = - "replacement_char"; - - int64_t word_length_; - int64_t bits_per_char_; - int64_t replacement_char_; -}; - -template <tflite::shim::Runtime Rt> -std::vector<std::string> Utf8BinarizeOp<Rt>::Attrs() { - return {absl::StrCat(kMaxCharsAttr, ": int"), - absl::StrCat(kBitsPerCharAttr, ": int"), - absl::StrCat(kReplacementCharAttr, ": int")}; -} - -template <tflite::shim::Runtime Rt> -std::vector<std::string> Utf8BinarizeOp<Rt>::Inputs() { - return {"input_tokens: string"}; -} - -template <tflite::shim::Runtime Rt> -std::vector<std::string> Utf8BinarizeOp<Rt>::Outputs() { - return {"output_binarizations: float"}; -} - -template <tflite::shim::Runtime Rt> -absl::Status Utf8BinarizeOp<Rt>::Init(InitContext* context) { - // Attrs - SH_RETURN_IF_ERROR( - context->GetAttr(std::string(kMaxCharsAttr), &word_length_)); - SH_RETURN_IF_ERROR( - context->GetAttr(std::string(kBitsPerCharAttr), &bits_per_char_)); - SH_RETURN_IF_ERROR( - context->GetAttr(std::string(kReplacementCharAttr), &replacement_char_)); - - return absl::OkStatus(); -} - -template <tflite::shim::Runtime Rt> -absl::Status Utf8BinarizeOp<Rt>::ShapeInference(ShapeInferenceContext* c) { - using tflite::shim::Shape; - const auto input_tokens_shape_status = c->GetInputShape(kInputTokens); - if (!input_tokens_shape_status.ok()) { - return input_tokens_shape_status.status(); - } - const Shape& input_tokens_shape = *input_tokens_shape_status; - - const auto rank_1_shape = Shape({Shape::kUnknownDim}); - if (!input_tokens_shape.Compatible(rank_1_shape)) { - return absl::FailedPreconditionError( - absl::StrCat("Shape must be rank 1: ", input_tokens_shape.ToString())); - } - - int64_t word_length; - SH_RETURN_IF_ERROR( - c->GetAttr(std::string(kMaxCharsAttr), &word_length)); - int64_t bits_per_char; - SH_RETURN_IF_ERROR(c->GetAttr(std::string(kBitsPerCharAttr), &bits_per_char)); - - const int num_tokens = input_tokens_shape.Dim(0); - const int bits_per_token = word_length * bits_per_char; - const Shape output_shape{num_tokens, bits_per_token}; - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputBinarizations, output_shape)); - - return absl::OkStatus(); -} - -template <tflite::shim::Runtime Rt> -absl::Status Utf8BinarizeOp<Rt>::Invoke(InvokeContext* context) { - // Attrs - const int word_length = word_length_; - const int bits_per_char = bits_per_char_; - const int replacement_char = replacement_char_; - const int bits_per_token = word_length * bits_per_char; - - // Inputs - const auto tokens_statusor = context->GetInput(kInputTokens); - if (!tokens_statusor.ok()) { - return tokens_statusor.status(); - } - const auto tokens = (*tokens_statusor)->template As<tensorflow::tstring, 1>(); - const int num_tokens = tokens.Dim(0); - - // Outputs - auto binarizations_statusor = - context->GetOutput(kOutputBinarizations, {num_tokens, bits_per_token}); - if (!binarizations_statusor.ok()) { - return binarizations_statusor.status(); - } - auto binarizations = (*binarizations_statusor)->template As<float, 2>(); - - // Iterate through all the token strings and binarize them. - for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { - float* row_start = &binarizations(token_idx, 0); - absl::Span<float> output_binarization(row_start, bits_per_token); - Utf8Binarize(tokens(token_idx), - /*word_length=*/word_length, - /*bits_per_char=*/bits_per_char, - /*replacement=*/replacement_char, - /*result=*/output_binarization); - } - - return absl::OkStatus(); -} - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/utf8_binarize_kernel_template.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_UTF8_BINARIZE_KERNEL_TEMPLATE_H_ diff --git a/tensorflow_text/core/kernels/utf8_binarize_test.cc b/tensorflow_text/core/kernels/utf8_binarize_test.cc deleted file mode 100644 index 9f61896c9..000000000 --- a/tensorflow_text/core/kernels/utf8_binarize_test.cc +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/utf8_binarize.h" -#include <vector> - -#include <gmock/gmock.h> -#include "absl/types/span.h" - -namespace tensorflow { -namespace text { -namespace { - -using ::testing::ElementsAre; - -TEST(UnicodeTest, Utf8Binarize) { - std::vector<float> out1(3 * 4); - Utf8Binarize("hello", /*word_length=*/3, /*bits_per_char=*/4, - /*replacement=*/3, /*result=*/absl::MakeSpan(out1)); - // L-endian 4 lowest bits of: - EXPECT_THAT(out1, ElementsAre(0, 0, 0, 1, // "h" - 1, 0, 1, 0, // "e" - 0, 0, 1, 1)); // "l" - - std::vector<float> out2(4 * 5); - Utf8Binarize("爱上一个不回", /*word_length=*/4, /*bits_per_char=*/5, - /*replacement=*/7, /*result=*/absl::MakeSpan(out2)); - // L-endian 5 lowest bits of: - EXPECT_THAT(out2, ElementsAre(1, 0, 0, 0, 1, // "爱" - 0, 1, 0, 1, 0, // "上" - 0, 0, 0, 0, 0, // "一" - 0, 1, 0, 1, 0)); // "个" - - // Notable example: - // - (Unicode) characters are padded, not truncated as above (zero-padding); - // - the UTF-8 sequence is invalid, so we get a replacement bit pattern. - std::vector<float> out3(3 * 6); - Utf8Binarize("\xc3(", /*word_length=*/3, /*bits_per_char=*/6, - /*replacement=*/35, /*result=*/absl::MakeSpan(out3)); - // LE 6 lowest bits of: - EXPECT_THAT(out3, ElementsAre(1, 1, 0, 0, 0, 1, // Replacement. - 0, 0, 0, 1, 0, 1, // "(". - 0, 0, 0, 0, 0, 0)); // Padding. -} - -} // namespace -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/utf8_binarize_tflite.cc b/tensorflow_text/core/kernels/utf8_binarize_tflite.cc deleted file mode 100644 index 9bb6ac3c5..000000000 --- a/tensorflow_text/core/kernels/utf8_binarize_tflite.cc +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/utf8_binarize_tflite.h" - -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/kernels/shim/tflite_op_shim.h" -#include "tensorflow_text/core/kernels/utf8_binarize_kernel_template.h" - -namespace tflite { -namespace ops { -namespace custom { -namespace text { - -extern "C" void AddUtf8Binarize(tflite::MutableOpResolver* resolver) { - tflite::shim::TfLiteOpKernel<tensorflow::text::Utf8BinarizeOp>::Add(resolver); -} - -} // namespace text -} // namespace custom -} // namespace ops -} // namespace tflite diff --git a/tensorflow_text/core/kernels/utf8_binarize_tflite.h b/tensorflow_text/core/kernels/utf8_binarize_tflite.h index b4e145d1e..c34028803 100644 --- a/tensorflow_text/core/kernels/utf8_binarize_tflite.h +++ b/tensorflow_text/core/kernels/utf8_binarize_tflite.h @@ -12,22 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef THIRD_PARTY_TENSORFLOW_TEXT_GOOGLE_KERNELS_UTF8_BINARIZE_TFLITE_H_ -#define THIRD_PARTY_TENSORFLOW_TEXT_GOOGLE_KERNELS_UTF8_BINARIZE_TFLITE_H_ +#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_UTF8_BINARIZE_TFLITE_H_ +#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_UTF8_BINARIZE_TFLITE_H_ -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/mutable_op_resolver.h" +#include "tensorflow/core/kernels/text/utf8_binarize_tflite.h" -namespace tflite { -namespace ops { -namespace custom { -namespace text { - -extern "C" void AddUtf8Binarize(::tflite::MutableOpResolver* resolver); - -} // namespace text -} // namespace custom -} // namespace ops -} // namespace tflite - -#endif // THIRD_PARTY_TENSORFLOW_TEXT_GOOGLE_KERNELS_UTF8_BINARIZE_TFLITE_H_ +#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_UTF8_BINARIZE_TFLITE_H_ diff --git a/tensorflow_text/core/kernels/whitespace_tokenize_kernel.cc b/tensorflow_text/core/kernels/whitespace_tokenize_kernel.cc deleted file mode 100644 index dcdac0c5f..000000000 --- a/tensorflow_text/core/kernels/whitespace_tokenize_kernel.cc +++ /dev/null @@ -1,161 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include <string.h> - -#include <vector> - -#include "icu4c/source/common/unicode/uchar.h" -#include "tensorflow/core/framework/kernel_def_builder.h" -#include "tensorflow/core/framework/lookup_interface.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/framework/tensor_types.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/core/threadpool.h" -#include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/macros.h" - -namespace tensorflow { -namespace text { - -template <typename SPLITS_TYPE> -class WhitespaceTokenizeWithOffsetsOp : public OpKernel { - public: - explicit WhitespaceTokenizeWithOffsetsOp(OpKernelConstruction* ctx) - : OpKernel(ctx) {} - - /** - * Breaks a series of codepoints into individual groups based on the script - * code. - * - * We gain a dimension while tokenizing since a series of integer codepoints - * is tokenized into different codepoint groups. - * - * This accepts two input tensors: a rank 1 tensor of codepoint values and - * a single rank 1 tensor of splits which determine where each string begins - * and ends from the provided codepoints. - */ - void Compute(OpKernelContext* context) override { - // Get inputs - const Tensor& input_values_tensor = context->input(0); - const auto input_values_flat = input_values_tensor.flat<int32>(); - const Tensor& input_splits_tensor = context->input(1); - const auto input_splits_flat = input_splits_tensor.flat<SPLITS_TYPE>(); - - // Since we limit to a 2-D input (flat_values of rank 1 and a single splits - // tensor), our output dimension will always be 3-D (flat_values of rank 1 - // with two splits - inner for the tokenized values and the outer for those - // grouped by the original strings). - // A few things to note: - // 1) The values and inner splits of the tokenized strings have an unknown - // length, as well as the offsets, so we allocate them at the end. - // 2) The outer splits of the tokenized strings matches that of the offset - // splits. Thus, we will only return one set and use it for all of them. - // 3) The outer splits shape will match the original input_splits. - Tensor* output_outer_splits_tensor; - OP_REQUIRES_OK(context, - context->allocate_output("output_outer_splits", - input_splits_tensor.shape(), - &output_outer_splits_tensor)); - auto output_outer_splits_flat = - output_outer_splits_tensor->flat<SPLITS_TYPE>(); - - std::vector<int32> output_values; - std::vector<SPLITS_TYPE> output_values_inner_splits; - std::vector<int64> output_offset_starts; - std::vector<int64> output_offset_limits; - - // Loop over the codepoints (a split at a time) and create splits of tokens. - for (int splits_idx = 0; splits_idx < input_splits_flat.size() - 1; - splits_idx++) { - output_outer_splits_flat(splits_idx) = output_offset_starts.size(); - bool token_has_start_set = false; - int32 curr_skipped_spaces = 0; // Used when computing the end of a token - const int curr_word_start_idx = input_splits_flat(splits_idx); - for (int values_idx = curr_word_start_idx; - values_idx < input_splits_flat(splits_idx + 1); values_idx++) { - // End current token if we find whitespace - if (u_isUWhiteSpace(input_values_flat(values_idx))) { - if (token_has_start_set) { - output_offset_limits.push_back(values_idx - curr_word_start_idx - - curr_skipped_spaces); - } - token_has_start_set = false; - ++curr_skipped_spaces; - } else { - // Non whitespace. Start a new token if needed, and append the - // codepoint to our current token. - if (!token_has_start_set) { - // Set token start offset relative to current string. - output_offset_starts.push_back(values_idx - curr_word_start_idx); - // Set split to indicate start of a new token. - output_values_inner_splits.push_back(output_values.size()); - token_has_start_set = true; - } - output_values.push_back(input_values_flat(values_idx)); - curr_skipped_spaces = 0; - } - } - // Looping through the codepoints for current tokens complete. Now set the - // last limit of out last token (if we found a start earlier). - if (token_has_start_set) { - output_offset_limits.push_back(input_splits_flat(splits_idx + 1) - - curr_word_start_idx - - curr_skipped_spaces); - } - } - // Now set the closing value of our splits. - output_outer_splits_flat(input_splits_flat.size() - 1) = - output_offset_starts.size(); - output_values_inner_splits.push_back(output_values.size()); - -// Allocate output & fill output tensors. -#define DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(name, dtype) \ - int64 name##_size = name.size(); \ - Tensor* name##_tensor = nullptr; \ - OP_REQUIRES_OK(context, \ - context->allocate_output(#name, TensorShape({name##_size}), \ - &name##_tensor)); \ - auto name##_data = name##_tensor->flat<dtype>().data(); \ - memcpy(name##_data, name.data(), name##_size * sizeof(dtype)); - - DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_values, int32); - DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_values_inner_splits, - SPLITS_TYPE); - DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_offset_starts, int64); - DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_offset_limits, int64); - -#undef DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR - } - - private: - TF_DISALLOW_COPY_AND_ASSIGN(WhitespaceTokenizeWithOffsetsOp); -}; - -REGISTER_KERNEL_BUILDER(Name("WhitespaceTokenizeWithOffsets") - .Device(DEVICE_CPU) - .TypeConstraint<int32>("Tsplits"), - WhitespaceTokenizeWithOffsetsOp<int32>); -REGISTER_KERNEL_BUILDER(Name("WhitespaceTokenizeWithOffsets") - .Device(DEVICE_CPU) - .TypeConstraint<int64>("Tsplits"), - WhitespaceTokenizeWithOffsetsOp<int64>); - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/whitespace_tokenize_kernel_test.cc b/tensorflow_text/core/kernels/whitespace_tokenize_kernel_test.cc deleted file mode 100644 index d9792be45..000000000 --- a/tensorflow_text/core/kernels/whitespace_tokenize_kernel_test.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include <vector> - -#include <gmock/gmock.h> -#include <gtest/gtest.h> -#include "tensorflow/core/framework/fake_input.h" -#include "tensorflow/core/framework/node_def_builder.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/kernels/ops_testutil.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow_text/core/kernels/text_kernels_test_util.h" - -namespace tensorflow { -namespace text { - -using tensorflow::FakeInput; -using tensorflow::NodeDefBuilder; -using tensorflow::Status; -using tensorflow::TensorShape; -using tensorflow::text_kernels_test_util::VectorEq; - -class WhitespaceTokenizeWithOffsetsKernelTest - : public tensorflow::OpsTestBase { - public: - void MakeOp() { - TF_ASSERT_OK(NodeDefBuilder("tested_op", "WhitespaceTokenizeWithOffsets") - .Input(FakeInput()) - .Input(FakeInput()) - .Finalize(node_def())); - TF_ASSERT_OK(InitOp()); - } -}; - -TEST_F(WhitespaceTokenizeWithOffsetsKernelTest, Test) { - MakeOp(); - AddInputFromArray<int32_t>(TensorShape({6}), {111, 112, 32, 116, 117, 118}); - AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6}); - TF_ASSERT_OK(RunOpKernel()); - - std::vector<int32_t> expected_values({111, 112, 116, 117, 118}); - std::vector<int64_t> expected_values_inner_splits({0, 2, 3, 5}); - std::vector<int64_t> expected_offset_starts({0, 3, 0}); - std::vector<int64_t> expected_offset_limits({2, 4, 2}); - std::vector<int64_t> output_outer_splits({0, 2, 3}); - EXPECT_THAT(*GetOutput(0), VectorEq(expected_values)); - EXPECT_THAT(*GetOutput(1), VectorEq(expected_values_inner_splits)); - EXPECT_THAT(*GetOutput(2), VectorEq(expected_offset_starts)); - EXPECT_THAT(*GetOutput(3), VectorEq(expected_offset_limits)); - EXPECT_THAT(*GetOutput(4), VectorEq(output_outer_splits)); -} - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/whitespace_tokenizer.cc b/tensorflow_text/core/kernels/whitespace_tokenizer.cc deleted file mode 100644 index dfe7107fc..000000000 --- a/tensorflow_text/core/kernels/whitespace_tokenizer.cc +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/whitespace_tokenizer.h" - -#include <string> -#include <vector> - -#include "absl/strings/string_view.h" -#include "icu4c/source/common/unicode/appendable.h" -#include "icu4c/source/common/unicode/schriter.h" -#include "icu4c/source/common/unicode/uchar.h" -#include "icu4c/source/common/unicode/ucnv.h" -#include "icu4c/source/common/unicode/ucnv_err.h" -#include "icu4c/source/common/unicode/umachine.h" -#include "icu4c/source/common/unicode/uniset.h" -#include "icu4c/source/common/unicode/unistr.h" -#include "icu4c/source/common/unicode/uset.h" -#include "icu4c/source/common/unicode/utypes.h" -#include "icu4c/source/common/unicode/bytestream.h" -#include "icu4c/source/common/unicode/edits.h" -#include "icu4c/source/common/unicode/normalizer2.h" -#include "icu4c/source/common/unicode/stringoptions.h" -#include "icu4c/source/common/unicode/stringpiece.h" -#include "icu4c/source/common/unicode/utf.h" -#include "icu4c/source/common/unicode/utf8.h" - - -namespace tensorflow { -namespace text { - -void WhitespaceTokenizer::Tokenize(const absl::string_view input, - std::vector<std::string>* tokens) { - std::vector<int> start_offsets, end_offsets; - Tokenize(input, tokens, &start_offsets, &end_offsets); -} - -void WhitespaceTokenizer::Tokenize(const absl::string_view input, - std::vector<std::string>* tokens, - std::vector<int>* start_offsets, - std::vector<int>* end_offsets) { - const int input_size = input.size(); - int position = 0, prev_position = 0; - UChar32 codepoint; - bool inside_token = false; - while (position < input_size) { - prev_position = position; - U8_NEXT(input, position, input_size, codepoint); - if (config_.IsWhitespace(codepoint)) { - if (inside_token) { - int end_pos = position - 1; - end_offsets->push_back(end_pos); - int start_pos = start_offsets->back(); - std::string token(input.substr(start_pos, end_pos - start_pos)); - tokens->push_back(token); - inside_token = false; - } - } else { - if (!inside_token) { - start_offsets->push_back(prev_position); - inside_token = true; - } - } - } - // save final word - if (inside_token) { - int end_pos = position; - end_offsets->push_back(end_pos); - int start_pos = start_offsets->back(); - std::string token(input.substr(start_pos, end_pos - start_pos)); - tokens->push_back(token); - } -} - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/whitespace_tokenizer.h b/tensorflow_text/core/kernels/whitespace_tokenizer.h index b2b357500..21e776938 100644 --- a/tensorflow_text/core/kernels/whitespace_tokenizer.h +++ b/tensorflow_text/core/kernels/whitespace_tokenizer.h @@ -15,100 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_WHITESPACE_TOKENIZER_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_WHITESPACE_TOKENIZER_H_ -#include <string> -#include <vector> - -#include "absl/strings/string_view.h" -#include "icu4c/source/common/unicode/umachine.h" - -namespace tensorflow { -namespace text { - -// Helper class for working with the WhitespaceaTokenizer config. The -// config is essentially a bit array stored in characters, where each bit in -// the char represents a Unicode character and whether or not it is considered -// as whitespace. -// -// This bit array contains all codepoints up to the largest whitespace -// character. So any codepoint larger than the array is not whitespace, and -// a lookup is simply using the codepoint value as the index. The first 3 bits -// of the codepoint indicate which bit in a character is the value located, and -// using the rest of the bits of the codepoint we can determine which -// character the particular codepoint is located at. -class WhitespaceTokenizerConfig { - public: - // This object does not own the config, so make certain it exists for the - // lifetime of the class. - WhitespaceTokenizerConfig(const absl::string_view config) - : config_(config), max_codepoint_(config.length() * 8) {} - WhitespaceTokenizerConfig(const std::string* config) - : config_(*config), max_codepoint_(config->length() * 8) {} - - inline bool IsWhitespace(const UChar32 codepoint) const { - return codepoint != U_SENTINEL && - codepoint < max_codepoint_ && - config_[codepoint >> 3] & (1 << (char)(codepoint & 0x7)); - } - - private: - const absl::string_view config_; - const int max_codepoint_; -}; - -class WhitespaceTokenizer { - public: - // Creates an instance. - // - // Args: - // * config: A WhitespaceTokenizerConfig which should be created using the - // WhitespaceTokenizerConfigBuilder - WhitespaceTokenizer(const WhitespaceTokenizerConfig& cfg) - : config_(cfg) { } - - // Tokenizes a string (or series of character codepoints) by whitespace. - // - // Example: - // input = "Show me the way." - // tokens = ["Show", "me", "the", "way."] - // start_offsets = [0, 5, 8, 12] - // end_offsets = [4, 7, 11, 16] - // - // The input should be UTF-8 but the tokenization is performed on Unicode - // codepoints. - // - // Args: - // * input: The UTF-8 string of an input. - // * tokens: The output tokens. - // * start_offsets: The start offsets of output tokens in the input - // text, in utf-8 bytes. - // * end_offsets: The end offsets of output tokens in the input - // text, in utf-8 bytes. - // Note: the start offsets are inclusive and the end offsets are exclusive. - void Tokenize(const absl::string_view input, - std::vector<std::string>* tokens, - std::vector<int>* start_offsets, - std::vector<int>* end_offsets); - - // Tokenizes a string (or series of character codepoints) by whitespace. - // - // Example: - // input = "Show me the way." - // output = ["Show", "me", "the", "way."] - // - // The input should be UTF-8 but the tokenization is performed on Unicode - // codepoints. - // - // Args: - // * input: The UTF-8 string of an input. - // * tokens: The output tokens. - void Tokenize(const absl::string_view input, - std::vector<std::string>* tokens); - - private: - const WhitespaceTokenizerConfig config_; -}; - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/whitespace_tokenizer.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_WHITESPACE_TOKENIZER_H_ diff --git a/tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.cc b/tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.cc deleted file mode 100644 index db4a17063..000000000 --- a/tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.cc +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.h" - -#include <string> - -#include "icu4c/source/common/unicode/uchar.h" -#include "icu4c/source/common/unicode/umachine.h" -#include "icu4c/source/common/unicode/uniset.h" -#include "icu4c/source/common/unicode/uset.h" -#include "icu4c/source/common/unicode/utf8.h" -#include "icu4c/source/common/unicode/utypes.h" - -namespace tensorflow { -namespace text { - -namespace { - -const icu::UnicodeSet& WhiteSpaceSet() { - // Will not fail because the data is hardcoded in the ICU library. - UErrorCode error_code = U_ZERO_ERROR; - const USet* c_set = u_getBinaryPropertySet(UCHAR_WHITE_SPACE, &error_code); - // assert(U_SUCCESS(error_code)); - const icu::UnicodeSet* set = icu::UnicodeSet::fromUSet(c_set); - return *set; -} - -} // namespace - -std::string BuildWhitespaceString() { - std::string str; - char buf[U8_MAX_LENGTH]; - for (auto cp : WhiteSpaceSet().codePoints()) { - int len = 0; - U8_APPEND_UNSAFE(buf, len, cp); - str.append(buf, len); - } - return str; -} - -std::string BuildWhitespaceTokenizerConfig() { - const icu::UnicodeSet& set = WhiteSpaceSet(); - int range_count = set.getRangeCount(); - UChar32 largest_whitespace = set.getRangeEnd(range_count - 1); - // The string will hold our bit array - std::string bitset((largest_whitespace >> 3) + 1, 0); - for (auto cp : set.codePoints()) { - int index = cp >> 3; - bitset[index] |= 1 << (cp & 7); - } - return bitset; -} - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.h b/tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.h index 60a3ca092..e11425fff 100644 --- a/tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.h +++ b/tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.h @@ -15,30 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_WHITESPACE_TOKENIZER_CONFIG_BUILDER_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_WHITESPACE_TOKENIZER_CONFIG_BUILDER_H_ -#include <string> - - -namespace tensorflow { -namespace text { - -// Builds a WhitespaceTokenizer config object. This contains the Unicode -// codepoints which are considered whitespaces. -// -// The config object is a series of bytes, where each bit represents a Unicode -// character and is 1 if it is a whitespace character, and 0 otherwise. -// -// Returns: -// The bytes of the config as a string. -std::string BuildWhitespaceTokenizerConfig(); - -// Builds a string full of all the whitespace characters. It is mainly used -// for testing and validation. -// -// Returns: -// A string of Unicode whitespace characters. -std::string BuildWhitespaceString(); - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/whitespace_tokenizer_config_builder.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_WHITESPACE_TOKENIZER_CONFIG_BUILDER_H_ diff --git a/tensorflow_text/core/kernels/whitespace_tokenizer_config_builder_test.cc b/tensorflow_text/core/kernels/whitespace_tokenizer_config_builder_test.cc deleted file mode 100644 index 9c8a2724b..000000000 --- a/tensorflow_text/core/kernels/whitespace_tokenizer_config_builder_test.cc +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.h" - -#include <string> - -#include <gmock/gmock.h> -#include <gtest/gtest.h> -#include "icu4c/source/common/unicode/appendable.h" -#include "icu4c/source/common/unicode/bytestream.h" -#include "icu4c/source/common/unicode/edits.h" -#include "icu4c/source/common/unicode/normalizer2.h" -#include "icu4c/source/common/unicode/schriter.h" -#include "icu4c/source/common/unicode/stringoptions.h" -#include "icu4c/source/common/unicode/stringpiece.h" -#include "icu4c/source/common/unicode/uchar.h" -#include "icu4c/source/common/unicode/ucnv.h" -#include "icu4c/source/common/unicode/ucnv_err.h" -#include "icu4c/source/common/unicode/umachine.h" -#include "icu4c/source/common/unicode/uniset.h" -#include "icu4c/source/common/unicode/unistr.h" -#include "icu4c/source/common/unicode/uset.h" -#include "icu4c/source/common/unicode/utf.h" -#include "icu4c/source/common/unicode/utf8.h" -#include "icu4c/source/common/unicode/utypes.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow_text/core/kernels/whitespace_tokenizer.h" - -namespace tensorflow { -namespace text { -namespace { - -TEST(WhitespaceTokenizerConfigBuilderTest, BuildWhitespaceString) { - std::string result = BuildWhitespaceString(); - EXPECT_THAT(result, ::testing::HasSubstr(" ")); - EXPECT_THAT(result, ::testing::HasSubstr("\n")); -} - -TEST(WhitespaceTokenizerConfigBuilderTest, - BuildWhitespaceTokenizerConfig_AllWhitespacePresent) { - std::string whitespaces = BuildWhitespaceString(); - icu::UnicodeString codepoints = icu::UnicodeString::fromUTF8(whitespaces); - std::string config = BuildWhitespaceTokenizerConfig(); - // verify all whitepaces are present - WhitespaceTokenizerConfig cfg(config); - for (int i = 0; i < codepoints.length(); ++i) { - EXPECT_TRUE(cfg.IsWhitespace(codepoints[i])); - } -} - -TEST(WhitespaceTokenizerConfigBuilderTest, - BuildWhitespaceTokenizerConfig_MinSize) { - std::string whitespaces = BuildWhitespaceString(); - icu::UnicodeString codepoints = icu::UnicodeString::fromUTF8(whitespaces); - std::string config = BuildWhitespaceTokenizerConfig(); - // verify we are the minimum perfect hash - auto largest_cp = codepoints[codepoints.length() - 1]; - EXPECT_EQ(config.length(), (largest_cp / 8) + 1); -} - -TEST(WhitespaceTokenizerConfigBuilderTest, - BuildWhitespaceTokenizerConfig_VerifyCount) { - std::string whitespaces = BuildWhitespaceString(); - icu::UnicodeString codepoints = icu::UnicodeString::fromUTF8(whitespaces); - std::string config = BuildWhitespaceTokenizerConfig(); - // verify we have the correct number of true values (rest will be false) - int count = 0; - WhitespaceTokenizerConfig cfg(config); - for (int i = 0; i < config.length() * 8; ++i) { - count += cfg.IsWhitespace(i) ? 1 : 0; - } - EXPECT_EQ(count, codepoints.length()); -} - -} // namespace -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/whitespace_tokenizer_kernel.cc b/tensorflow_text/core/kernels/whitespace_tokenizer_kernel.cc deleted file mode 100644 index 78a90c02b..000000000 --- a/tensorflow_text/core/kernels/whitespace_tokenizer_kernel.cc +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/whitespace_tokenizer_kernel.h" - -#include "tensorflow/core/framework/op_kernel.h" - -namespace tensorflow { -namespace text { - -REGISTER_KERNEL_BUILDER(Name(WhitespaceTokenizeWithOffsetsV2OpKernel::OpName()) - .Device(tensorflow::DEVICE_CPU), - WhitespaceTokenizeWithOffsetsV2OpKernel); - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/whitespace_tokenizer_kernel.h b/tensorflow_text/core/kernels/whitespace_tokenizer_kernel.h index 00eae4b3f..97ce10b9c 100644 --- a/tensorflow_text/core/kernels/whitespace_tokenizer_kernel.h +++ b/tensorflow_text/core/kernels/whitespace_tokenizer_kernel.h @@ -12,22 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_WHITESPACE_TOKENIZE_KERNEL_H_ -#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_WHITESPACE_TOKENIZE_KERNEL_H_ +#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_WHITESPACE_TOKENIZER_KERNEL_H_ +#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_WHITESPACE_TOKENIZER_KERNEL_H_ -#include "tensorflow/lite/kernels/shim/tf_op_shim.h" -#include "tensorflow_text/core/kernels/whitespace_tokenizer_kernel_template.h" +#include "tensorflow/core/kernels/text/whitespace_tokenizer_kernel.h" -namespace tensorflow { -namespace text { - -class WhitespaceTokenizeWithOffsetsV2OpKernel - : public tflite::shim::TfOpKernel<WhitespaceTokenizeWithOffsetsV2Op> { - public: - using TfOpKernel::TfOpKernel; -}; - -} // namespace text -} // namespace tensorflow - -#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_WHITESPACE_TOKENIZE_KERNEL_H_ +#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_WHITESPACE_TOKENIZER_KERNEL_H_ diff --git a/tensorflow_text/core/kernels/whitespace_tokenizer_kernel_template.h b/tensorflow_text/core/kernels/whitespace_tokenizer_kernel_template.h index e05a914bb..1682fee2e 100644 --- a/tensorflow_text/core/kernels/whitespace_tokenizer_kernel_template.h +++ b/tensorflow_text/core/kernels/whitespace_tokenizer_kernel_template.h @@ -15,180 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_WHITESPACE_TOKENIZER_KERNEL_TEMPLATE_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_WHITESPACE_TOKENIZER_KERNEL_TEMPLATE_H_ -#include <iostream> -#include <vector> - -#include "absl/status/status.h" -#include "absl/status/statusor.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/lite/kernels/shim/op_kernel.h" -#include "tensorflow/lite/kernels/shim/shape.h" -#include "tensorflow/lite/kernels/shim/status_macros.h" -#include "tensorflow/lite/kernels/shim/tensor_view.h" -#include "tensorflow_text/core/kernels/whitespace_tokenizer.h" - -namespace tensorflow { -namespace text { - -template <tflite::shim::Runtime Rt> -class WhitespaceTokenizeWithOffsetsV2Op - : public tflite::shim::OpKernelShim<WhitespaceTokenizeWithOffsetsV2Op, Rt> { - private: - enum Inputs { - kInputValues = 0, - kInputConfig - }; - enum Outputs { - kOutputTokens = 0, - kOutputRowSplits, - kOutputStartOffsets, - kOutputEndOffsets - }; - - using typename tflite::shim::OpKernelShim<WhitespaceTokenizeWithOffsetsV2Op, - Rt>::InitContext; - using typename tflite::shim::OpKernelShim<WhitespaceTokenizeWithOffsetsV2Op, - Rt>::InvokeContext; - using typename tflite::shim::OpKernelShim<WhitespaceTokenizeWithOffsetsV2Op, - Rt>::ShapeInferenceContext; - - public: - WhitespaceTokenizeWithOffsetsV2Op() = default; - static constexpr char kOpName[] = "TFText>WhitespaceTokenizeWithOffsetsV2"; - static constexpr char kDoc[] = R"doc( - Splits a string into tokens based off of Unicode whitespaces. It also returns - the relative byte offsets for each token. - - ### Example: - - ```python - >>> splitter = WhitespaceTokenizer() - >>> tokens, starts, ends = splitter.tokenize_with_offsets("a bb ccc") - >>> print(tokens.numpy(), starts.numpy(), ends.numpy()) - [b'a' b'bb' b'ccc'] [0 2 5] [1 4 8] - ``` - - Args: - input_values: 1D Tensor of strings to tokenize. - input_config: A string representing a WhitespaceTokenizerConfig. - - Returns: - * output_tokens: 1D tensor containing the tokens for all input strings. - A 2D RaggedTensor can be constructed from this and output_row_splits. - * output_row_splits: 1D int tensor with the row splits that allow us to - build RaggedTensors from output_tokens, output_start_offsets, and - output_end_offsets. - * output_start_offsets: 1D tensor containing the inclusive start byte offset - for each token in all input strings. Corresponds 1:1 with output_tokens. - A 2D RaggedTensor can be constructed from this and output_row_splits. - * output_end_offsets: 1D tensor containing the exclusive end byte offset for - each token in all input strings. Corresponds 1:1 with output_tokens. - A 2D RaggedTensor can be constructed from this and output_row_splits. - )doc"; - - static const char* OpName() { return kOpName; } - static const char* Doc() { return kDoc; } - - // Attributes declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector<std::string> Attrs() { return {}; } - - // Inputs declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector<std::string> Inputs(); - - // Outputs declaration (syntax: https://www.tensorflow.org/guide/create_op) - static std::vector<std::string> Outputs(); - - // Initializes the op - absl::Status Init(InitContext* context) { return absl::OkStatus(); } - - // Runs the operation - absl::Status Invoke(InvokeContext* context); - - // Shape inference - static absl::Status ShapeInference(ShapeInferenceContext* c); -}; - -template <tflite::shim::Runtime Rt> -std::vector<std::string> WhitespaceTokenizeWithOffsetsV2Op<Rt>::Inputs() { - return {"input_values: string", "input_config: string"}; -} - -template <tflite::shim::Runtime Rt> -std::vector<std::string> WhitespaceTokenizeWithOffsetsV2Op<Rt>::Outputs() { - return {"output_tokens: string", "output_row_splits: int64", - "output_start_offsets: int32", "output_end_offsets: int32"}; -} - -template <tflite::shim::Runtime Rt> -absl::Status WhitespaceTokenizeWithOffsetsV2Op<Rt>::ShapeInference( - ShapeInferenceContext* c) { - using tflite::shim::Shape; - const auto input_values_shape_status = c->GetInputShape(kInputValues); - if (!input_values_shape_status.ok()) { - return input_values_shape_status.status(); - } - const Shape& input_values_shape = *input_values_shape_status; - - const auto rank_1_shape = Shape({Shape::kUnknownDim}); - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputTokens, rank_1_shape)); - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputStartOffsets, rank_1_shape)); - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputEndOffsets, rank_1_shape)); - const int num_splits = Shape::AddDims(1, input_values_shape.Dim(0)); - SH_RETURN_IF_ERROR(c->SetOutputShape(kOutputRowSplits, Shape({num_splits}))); - - return absl::OkStatus(); -} - -template <tflite::shim::Runtime Rt> - absl::Status WhitespaceTokenizeWithOffsetsV2Op<Rt> - ::Invoke(InvokeContext* context) { - // Inputs - const auto values_statusor = context->GetInput(kInputValues); - if (!values_statusor.ok()) { - return values_statusor.status(); - } - const auto values = (*values_statusor)->template As<tensorflow::tstring, 1>(); - - const auto cfg_statusor = context->GetInput(kInputConfig); - if (!cfg_statusor.ok()) { - return cfg_statusor.status(); - } - const absl::string_view config = - (*cfg_statusor)->template AsScalar<tensorflow::tstring>(); - WhitespaceTokenizer tokenizer(config); - - // Outputs - std::vector<std::string> tokens; - std::vector<int64_t> row_splits; - std::vector<int32_t> start_offsets; - std::vector<int32_t> end_offsets; - - // Iterate through all the values and wordpiece tokenize them. - row_splits.push_back(0); - for (int i = 0; i < values.Dim(0); ++i) { - // Tokenize into subwords and record the offset locations. - const int orig_num_tokens = tokens.size(); - tokenizer.Tokenize(values(i), &tokens, &start_offsets, &end_offsets); - const int delta_num_tokens = tokens.size() - orig_num_tokens; - // Record the row splits. - row_splits.push_back(delta_num_tokens + row_splits.back()); - } - - // Allocate output & fill output tensors. - SH_RETURN_IF_ERROR(this->template FillOutputTensor<std::string, - tensorflow::tstring>( - tokens, kOutputTokens, context)); - SH_RETURN_IF_ERROR(this->template FillOutputTensor<int64_t, int64_t>( - row_splits, kOutputRowSplits, context)); - SH_RETURN_IF_ERROR(this->template FillOutputTensor<int32_t, int32_t>( - start_offsets, kOutputStartOffsets, context)); - SH_RETURN_IF_ERROR(this->template FillOutputTensor<int32_t, int32_t>( - end_offsets, kOutputEndOffsets, context)); - - return absl::OkStatus(); -} - -} // namespace text -} // namespace tensorflow +#include "tensorflow/core/kernels/text/whitespace_tokenizer_kernel_template.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_WHITESPACE_TOKENIZER_KERNEL_TEMPLATE_H_ diff --git a/tensorflow_text/core/kernels/whitespace_tokenizer_test.cc b/tensorflow_text/core/kernels/whitespace_tokenizer_test.cc deleted file mode 100644 index aa94839f7..000000000 --- a/tensorflow_text/core/kernels/whitespace_tokenizer_test.cc +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/whitespace_tokenizer.h" - -#include <gmock/gmock.h> -#include <gtest/gtest.h> -#include "absl/flags/flag.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.h" -#include "absl/status/status.h" -#include "absl/status/statusor.h" - -namespace tensorflow { -namespace text { -namespace { - -using ::testing::ElementsAre; - -TEST(WhitespaceTokenizerTest, TokenizeWithOffsets) { - absl::string_view input("I heard the news today"); - std::vector<std::string> output_tokens; - std::vector<int> output_start_offsets; - std::vector<int> output_end_offsets; - std::string config(BuildWhitespaceTokenizerConfig()); - WhitespaceTokenizer t(&config); - t.Tokenize(input, &output_tokens, &output_start_offsets, &output_end_offsets); - EXPECT_THAT(output_tokens, ElementsAre("I", "heard", "the", "news", "today")); - EXPECT_THAT(output_start_offsets, ElementsAre(0, 2, 8, 12, 17)); - EXPECT_THAT(output_end_offsets, ElementsAre(1, 7, 11, 16, 22)); -} - -TEST(WhitespaceTokenizerTest, Tokenize) { - absl::string_view input("I heard the news today"); - std::vector<std::string> output_tokens; - std::string config = BuildWhitespaceTokenizerConfig(); - WhitespaceTokenizer t(&config); - t.Tokenize(input, &output_tokens); - EXPECT_THAT(output_tokens, ElementsAre("I", "heard", "the", "news", "today")); -} - -TEST(WhitespaceTokenizerTest, Internationalization) { - absl::string_view input("la灯 灯a 瀮b"); - std::vector<std::string> output_tokens; - std::vector<int> output_start_offsets; - std::vector<int> output_end_offsets; - std::string config = BuildWhitespaceTokenizerConfig(); - WhitespaceTokenizer t(&config); - t.Tokenize(input, &output_tokens, &output_start_offsets, &output_end_offsets); - EXPECT_THAT(output_start_offsets, ElementsAre(0, 6, 11)); - EXPECT_THAT(output_end_offsets, ElementsAre(5, 10, 15)); -} - -TEST(WhitespaceTokenizerTest, InvalidCodepoint) { - absl::string_view input("\xE3"); - std::vector<std::string> output_tokens; - std::vector<int> output_start_offsets; - std::vector<int> output_end_offsets; - std::string config = BuildWhitespaceTokenizerConfig(); - WhitespaceTokenizer t(&config); - t.Tokenize(input, &output_tokens, &output_start_offsets, &output_end_offsets); - EXPECT_THAT(output_start_offsets, ElementsAre(0)); - EXPECT_THAT(output_end_offsets, ElementsAre(1)); -} - -TEST(WhitespaceTokenizerTest, MaxCodepoint) { - // Create an artificially-small config so that we can test behavior with - // codepoints at the upper edge of its range. This bitmap marks 0x00-0x3f as - // whitespace. - std::string config(8, '\xff'); - // Verify that reading one bit off the end of the bitmap returns - // not-whitespace. - WhitespaceTokenizerConfig cfg(config); - EXPECT_FALSE(cfg.IsWhitespace(0x40)); -} - -} // namespace -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/whitespace_tokenizer_tflite.cc b/tensorflow_text/core/kernels/whitespace_tokenizer_tflite.cc deleted file mode 100644 index 58c09915a..000000000 --- a/tensorflow_text/core/kernels/whitespace_tokenizer_tflite.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/whitespace_tokenizer_tflite.h" - -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/kernels/shim/tflite_op_shim.h" -#include "tensorflow_text/core/kernels/whitespace_tokenizer_kernel_template.h" - -namespace tflite { -namespace ops { -namespace custom { -namespace text { - -extern "C" void AddWhitespaceTokenize(tflite::MutableOpResolver* resolver) { - tflite::shim::TfLiteOpKernel< - tensorflow::text::WhitespaceTokenizeWithOffsetsV2Op>::Add(resolver); -} - -} // namespace text -} // namespace custom -} // namespace ops -} // namespace tflite diff --git a/tensorflow_text/core/kernels/whitespace_tokenizer_tflite.h b/tensorflow_text/core/kernels/whitespace_tokenizer_tflite.h index 85b003e00..e3b5aeae9 100644 --- a/tensorflow_text/core/kernels/whitespace_tokenizer_tflite.h +++ b/tensorflow_text/core/kernels/whitespace_tokenizer_tflite.h @@ -15,19 +15,6 @@ #ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_WHITESPACE_TOKENIZER_TFLITE_H_ #define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_WHITESPACE_TOKENIZER_TFLITE_H_ -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/mutable_op_resolver.h" - -namespace tflite { -namespace ops { -namespace custom { -namespace text { - -extern "C" void AddWhitespaceTokenize(::tflite::MutableOpResolver* resolver); - -} // namespace text -} // namespace custom -} // namespace ops -} // namespace tflite +#include "tensorflow/core/kernels/text/whitespace_tokenizer_tflite.h" #endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_WHITESPACE_TOKENIZER_TFLITE_H_ diff --git a/tensorflow_text/core/kernels/wordpiece_kernel.cc b/tensorflow_text/core/kernels/wordpiece_kernel.cc deleted file mode 100644 index 8863d80ab..000000000 --- a/tensorflow_text/core/kernels/wordpiece_kernel.cc +++ /dev/null @@ -1,317 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include <limits> -#include <memory> -#include <string> -#include <vector> - -#include "tensorflow/core/framework/dataset_stateful_op_allowlist.h" -#include "tensorflow/core/framework/lookup_interface.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/core/threadpool.h" -#include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/public/version.h" -#include "tensorflow_text/core/kernels/wordpiece_tokenizer.h" - -namespace tensorflow { -namespace text { - -namespace { -string GetWordSplitChar(OpKernelConstruction* ctx) { - string suffix_indicator; - ([=](string* c) -> void { - OP_REQUIRES_OK(ctx, ctx->GetAttr("suffix_indicator", c)); - })(&suffix_indicator); - return suffix_indicator; -} - -int32 GetMaxCharsPerWord(OpKernelConstruction* ctx) { - int32 max_chars_per_word; - ([=](int32* c) -> void { - OP_REQUIRES_OK(ctx, ctx->GetAttr("max_bytes_per_word", c)); - })(&max_chars_per_word); - return max_chars_per_word; -} - -int32 GetMaxCharsPerToken(OpKernelConstruction* ctx) { - int32 max_chars_per_token; - ([=](int32* c) -> void { - OP_REQUIRES_OK(ctx, ctx->GetAttr("max_chars_per_token", c)); - })(&max_chars_per_token); - return max_chars_per_token; -} - -bool GetShouldUseUnknownToken(OpKernelConstruction* ctx) { - bool use_unknown_token; - ([=](bool* c) -> void { - OP_REQUIRES_OK(ctx, ctx->GetAttr("use_unknown_token", c)); - })(&use_unknown_token); - return use_unknown_token; -} - -string GetUnknownToken(OpKernelConstruction* ctx) { - string unknown_token; - ([=](string* c) -> void { - OP_REQUIRES_OK(ctx, ctx->GetAttr("unknown_token", c)); - })(&unknown_token); - return unknown_token; -} - -bool GetSplitUnknownCharacters(OpKernelConstruction* ctx) { - bool split_unknown_characters; - ([=](bool* c) -> void { - OP_REQUIRES_OK(ctx, ctx->GetAttr("split_unknown_characters", c)); - })(&split_unknown_characters); - return split_unknown_characters; -} - -Status GetTableHandle(const string& input_name, OpKernelContext* ctx, - string* container, string* table_handle) { - { - mutex* mu; - TF_RETURN_IF_ERROR(ctx->input_ref_mutex(input_name, &mu)); - mutex_lock l(*mu); - Tensor tensor; - TF_RETURN_IF_ERROR(ctx->mutable_input(input_name, &tensor, true)); - if (tensor.NumElements() != 2) { - return errors::InvalidArgument( - "Lookup table handle must be scalar, but had shape: ", - tensor.shape().DebugString()); - } - auto h = tensor.flat<tstring>(); - *container = h(0); - *table_handle = h(1); - } - return absl::OkStatus(); -} - -// Gets the LookupTable stored in the ctx->resource_manager() with key -// passed by attribute with name input_name, returns null if the table -// doesn't exist. -Status GetLookupTable(const string& input_name, OpKernelContext* ctx, - lookup::LookupInterface** table) { - string container; - string table_handle; - DataType handle_dtype; - TF_RETURN_IF_ERROR(ctx->input_dtype(input_name, &handle_dtype)); - if (handle_dtype == DT_RESOURCE) { - ResourceHandle handle; - TF_RETURN_IF_ERROR(HandleFromInput(ctx, input_name, &handle)); - return LookupResource(ctx, handle, table); - } else { - TF_RETURN_IF_ERROR( - GetTableHandle(input_name, ctx, &container, &table_handle)); - return ctx->resource_manager()->Lookup(container, table_handle, table); - } -} - -class LookupTableVocab : public WordpieceVocab { - public: - LookupTableVocab(lookup::LookupInterface* table, OpKernelContext* ctx); - - virtual LookupStatus Contains(const absl::string_view key, bool* value) const; - - private: - // not owned - mutable lookup::LookupInterface* table_; - OpKernelContext* ctx_; - Tensor default_value_; -}; - -Status ToStatus(const LookupStatus& status) { - if (status.success) { - return absl::OkStatus(); - } - - return errors::InvalidArgument(status.error_msg); -} - -constexpr int64 kOutOfVocabValue = -1; - -LookupTableVocab::LookupTableVocab(lookup::LookupInterface* table, - OpKernelContext* ctx) - : table_(table), ctx_(ctx), default_value_(DT_INT64, TensorShape({1})) { - default_value_.flat<int64>()(0) = kOutOfVocabValue; -} - -LookupStatus LookupTableVocab::Contains(const absl::string_view key, - bool* value) const { - if (value == nullptr) { - return LookupStatus("Bad 'value' param."); - } - Tensor keys(DT_STRING, TensorShape({1})); - keys.flat<tstring>()(0) = tstring(key.data(), key.size()); - Tensor values(DT_INT64, TensorShape({1})); - auto status = table_->Find(ctx_, keys, &values, default_value_); - if (!status.ok()) { -// On April 2023, there is not yet an official release of Tensorflow which -// includes `message().` One will need to wait for the release following 2.12.0. -// The code can be updated to just be the else branch after such release exists. -#if TF_GRAPH_DEF_VERSION < 1467 - return LookupStatus(std::string(status.error_message())); -#else - return LookupStatus(std::string(status.message())); -#endif - } - - if (static_cast<int64>(values.flat<int64>()(0)) != kOutOfVocabValue) { - *value = true; - return LookupStatus::OK(); - } - *value = false; - return LookupStatus::OK(); -} - -} // namespace - -class WordpieceTokenizeWithOffsetsOp : public OpKernel { - public: - explicit WordpieceTokenizeWithOffsetsOp(OpKernelConstruction* ctx) - : OpKernel(ctx), - suffix_indicator_(GetWordSplitChar(ctx)), - max_bytes_per_word_(GetMaxCharsPerWord(ctx)), - max_chars_per_token_(GetMaxCharsPerToken(ctx)), - use_unknown_token_(GetShouldUseUnknownToken(ctx)), - unknown_token_(GetUnknownToken(ctx)), - split_unknown_characters_(GetSplitUnknownCharacters(ctx)) { - string output_row_partition_type; - OP_REQUIRES_OK(ctx, ctx->GetAttr("output_row_partition_type", - &output_row_partition_type)); - if (output_row_partition_type == "row_lengths") { - row_partition_type_ = ROW_LENGTHS; - } else if (output_row_partition_type == "row_splits") { - row_partition_type_ = ROW_SPLITS; - } else { - OP_REQUIRES( - ctx, false, - errors::Internal("Unexpected value for output_row_partition_type")); - } - } - - void Compute(OpKernelContext* ctx) override { - const Tensor* input_values; - OP_REQUIRES_OK(ctx, ctx->input("input_values", &input_values)); - const auto& values_vec = input_values->flat<tstring>(); - - lookup::LookupInterface* lookup_table; - OP_REQUIRES_OK(ctx, - GetLookupTable("vocab_lookup_table", ctx, &lookup_table)); - core::ScopedUnref unref_me(lookup_table); - LookupTableVocab vocab_map(lookup_table, ctx); - - std::vector<string> subwords; - std::vector<int> begin_offset; - std::vector<int> end_offset; - std::vector<int> row_partition; - - if (row_partition_type_ == ROW_SPLITS) { - row_partition.push_back(0); - } - - // Iterate through all the values and wordpiece tokenize them. - for (int i = 0; i < values_vec.size(); ++i) { - // Tokenize into subwords and record the offset locations. - int num_wordpieces = 0; - OP_REQUIRES_OK( - ctx, ToStatus(WordpieceTokenize( - values_vec(i), max_bytes_per_word_, max_chars_per_token_, - suffix_indicator_, use_unknown_token_, unknown_token_, - split_unknown_characters_, &vocab_map, &subwords, - &begin_offset, &end_offset, &num_wordpieces))); - - // Record the row splits. - switch (row_partition_type_) { - case ROW_LENGTHS: - row_partition.push_back(num_wordpieces); - break; - case ROW_SPLITS: - row_partition.push_back(num_wordpieces + row_partition.back()); - break; - } - } - - std::vector<int64> output_subwords_shape; - output_subwords_shape.push_back(subwords.size()); - - std::vector<int64> output_row_partition_shape; - output_row_partition_shape.push_back(row_partition.size()); - - Tensor* output_values; - OP_REQUIRES_OK(ctx, ctx->allocate_output("output_values", - TensorShape(output_subwords_shape), - &output_values)); - auto output_values_vec = output_values->vec<tstring>(); - - Tensor* output_row_partition; - OP_REQUIRES_OK(ctx, - ctx->allocate_output("output_row_lengths", - TensorShape(output_row_partition_shape), - &output_row_partition)); - auto output_row_partition_vec = output_row_partition->vec<int64>(); - - Tensor* start_values; - OP_REQUIRES_OK(ctx, ctx->allocate_output("start_values", - TensorShape(output_subwords_shape), - &start_values)); - auto start_values_vec = start_values->vec<int64>(); - - Tensor* limit_values; - OP_REQUIRES_OK(ctx, ctx->allocate_output("limit_values", - TensorShape(output_subwords_shape), - &limit_values)); - auto limit_values_vec = limit_values->vec<int64>(); - - for (int i = 0; i < subwords.size(); ++i) { - output_values_vec(i) = subwords[i]; - } - - for (int i = 0; i < row_partition.size(); ++i) { - output_row_partition_vec(i) = row_partition[i]; - } - - for (int i = 0; i < begin_offset.size(); ++i) { - start_values_vec(i) = begin_offset[i]; - } - - for (int i = 0; i < end_offset.size(); ++i) { - limit_values_vec(i) = end_offset[i]; - } - } - - private: - enum RowPartitionType { ROW_LENGTHS, ROW_SPLITS }; - - const string suffix_indicator_; - const int max_bytes_per_word_; - const int max_chars_per_token_; - const bool use_unknown_token_; - const string unknown_token_; - const bool split_unknown_characters_; - RowPartitionType row_partition_type_; - - TF_DISALLOW_COPY_AND_ASSIGN(WordpieceTokenizeWithOffsetsOp); -}; - -REGISTER_KERNEL_BUILDER(Name("WordpieceTokenizeWithOffsets").Device(DEVICE_CPU), - WordpieceTokenizeWithOffsetsOp); -ALLOW_STATEFUL_OP_FOR_DATASET_FUNCTIONS("WordpieceTokenizeWithOffsets"); - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/wordpiece_kernel_test.cc b/tensorflow_text/core/kernels/wordpiece_kernel_test.cc deleted file mode 100644 index d9b81677a..000000000 --- a/tensorflow_text/core/kernels/wordpiece_kernel_test.cc +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow/core/framework/fake_input.h" -#include "tensorflow/core/framework/node_def_builder.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/shape_inference_testutil.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/framework/tensor_testutil.h" -#include "tensorflow/core/kernels/ops_testutil.h" -#include "tensorflow/core/platform/test.h" - -namespace tensorflow { -namespace { - -TEST(WordpieceTokenizeWithOffsetsOpTest, ShapeFn) { - // WordpieceTokenizeWithOffsets(input_values, vocab_lookup_table) -> - // [output_values, output_row_lengths, start_values, limit_values] - ShapeInferenceTestOp op("WordpieceTokenizeWithOffsets"); - auto &attr = *op.node_def.mutable_attr(); - - attr["output_row_partition_type"].set_s("row_lengths"); - INFER_OK(op, "?;?", "[?];[?];[?];[?]"); - INFER_OK(op, "[?];?", "[?];[d0_0];[?];[?]"); - INFER_OK(op, "[?];[]", "[?];[d0_0];[?];[?]"); - INFER_OK(op, "[5];?", "[?];[d0_0];[?];[?]"); - INFER_OK(op, "[5];[]", "[?];[d0_0];[?];[?]"); - INFER_ERROR("Shape must be rank 1 but is rank 0", op, "[];?"); - INFER_ERROR("Shape must be rank 1 but is rank 2", op, "[1,2];?"); - INFER_ERROR("Shape must be rank 0 but is rank 1", op, "?;[1]"); - - attr["output_row_partition_type"].set_s("row_splits"); - INFER_OK(op, "?;?", "[?];[?];[?];[?]"); - INFER_OK(op, "[?];?", "[?];[?];[?];[?]"); - INFER_OK(op, "[?];[]", "[?];[?];[?];[?]"); - INFER_OK(op, "[5];?", "[?];[6];[?];[?]"); - INFER_OK(op, "[5];[]", "[?];[6];[?];[?]"); -} - -} // namespace -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/wordpiece_tokenizer.cc b/tensorflow_text/core/kernels/wordpiece_tokenizer.cc deleted file mode 100644 index fd9adad5a..000000000 --- a/tensorflow_text/core/kernels/wordpiece_tokenizer.cc +++ /dev/null @@ -1,246 +0,0 @@ -// Copyright 2026 TF.Text Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "tensorflow_text/core/kernels/wordpiece_tokenizer.h" - -#include "absl/strings/str_cat.h" -#include "absl/strings/str_join.h" -#include "absl/strings/string_view.h" -#include "icu4c/source/common/unicode/utf8.h" - -namespace tensorflow { -namespace text { - -namespace { - -LookupStatus Lookup(int byte_start, int byte_end, - const absl::string_view& token, - const std::string& suffix_indicator, - const WordpieceVocab* vocab_map, bool* in_vocab) { - int byte_len = byte_end - byte_start; - absl::string_view substr(token.data() + byte_start, byte_len); - return vocab_map->Contains( - byte_start > 0 ? absl::StrCat(suffix_indicator, substr) : substr, - in_vocab); -} - -// Sets byte_end to the longest byte sequence which: -// 1) is a proper UTF8 sequence -// 2) is in the vocab OR if split_unknown_characters is true, is a single -// UTF8 character. -// If no match is found, found_match is set to false. -LookupStatus LongestMatchStartingAt( - int byte_start, const absl::string_view& token, - const std::string& suffix_indicator, const int max_chars_per_subtoken, - bool split_unknown_characters, const WordpieceVocab* vocab_map, - int* byte_end, bool* found_match, bool* match_is_unknown_character) { - *match_is_unknown_character = false; - *found_match = false; - const char* token_bytes = token.data(); - std::vector<int32_t> byte_ends; - int upper_limit = token.length(); - - for (int32_t i = byte_start; i < token.length();) { - UChar32 c; - U8_NEXT(token_bytes, i, upper_limit, c); - byte_ends.push_back(i); - if (max_chars_per_subtoken > 0 && - byte_ends.size() == max_chars_per_subtoken) { - // If the max bytes of a subtoken is known, do not search beyond that - // length. - break; - } - } - int n = byte_ends.size(); - for (int i = n - 1; i >= 0; i--) { - bool in_vocab; - auto status = Lookup(byte_start, byte_ends[i], token, suffix_indicator, - vocab_map, &in_vocab); - if (!status.success) return status; - if (in_vocab) { - *byte_end = byte_ends[i]; - *found_match = true; - return LookupStatus::OK(); - } - if (i == 0 && split_unknown_characters) { - *byte_end = byte_ends[0]; - *found_match = true; - *match_is_unknown_character = true; - return LookupStatus::OK(); - } - } - return LookupStatus::OK(); -} - -// Sets the outputs 'begin_offset', 'end_offset' and 'num_word_pieces' when no -// token is found. -LookupStatus NoTokenFound(const absl::string_view& token, - bool use_unknown_token, - const std::string& unknown_token, - std::vector<std::string>* subwords, - std::vector<int>* begin_offset, - std::vector<int>* end_offset, int* num_word_pieces) { - begin_offset->push_back(0); - if (use_unknown_token) { - subwords->push_back(unknown_token); - end_offset->push_back(token.length()); - } else { - subwords->emplace_back(token.data(), token.length()); - end_offset->push_back(token.length()); - } - ++(*num_word_pieces); - - return LookupStatus::OK(); -} - -// When a subword is found, this helper function will add the outputs to -// 'subwords', 'begin_offset' and 'end_offset'. -void AddWord(const absl::string_view& token, int byte_start, int byte_end, - const std::string& suffix_indicator, - std::vector<std::string>* subwords, std::vector<int>* begin_offset, - std::vector<int>* end_offset) { - begin_offset->push_back(byte_start); - int len = byte_end - byte_start; - - if (byte_start > 0) { - // Prepend suffix_indicator if the token is within a word. - subwords->push_back(::absl::StrCat( - suffix_indicator, absl::string_view(token.data() + byte_start, len))); - } else { - subwords->emplace_back(token.data(), len); - } - end_offset->push_back(byte_end); -} - -// Adds a single unknown character subword, found when split_unknown_characters -// is true. -void AddUnknownCharacter(const absl::string_view& token, int byte_start, - int byte_end, const std::string& suffix_indicator, - bool use_unknown_token, - const std::string& unknown_token, - std::vector<std::string>* subwords, - std::vector<int>* begin_offset, - std::vector<int>* end_offset) { - begin_offset->push_back(byte_start); - end_offset->push_back(byte_end); - int len = byte_end - byte_start; - if (use_unknown_token) { - if (byte_start > 0) { - // Prepend suffix_indicator if the character is within a word. - subwords->push_back(::absl::StrCat(suffix_indicator, unknown_token)); - } else { - subwords->push_back(unknown_token); - } - } else { - if (byte_start > 0) { - // Prepend suffix_indicator if the character is within a word. - subwords->push_back(::absl::StrCat( - suffix_indicator, absl::string_view(token.data() + byte_start, len))); - } else { - subwords->emplace_back(token.data(), len); - } - } -} - -LookupStatus TokenizeL2RGreedy( - const absl::string_view& token, const int max_bytes_per_token, - const int max_chars_per_subtoken, const std::string& suffix_indicator, - bool use_unknown_token, const std::string& unknown_token, - bool split_unknown_characters, const WordpieceVocab* vocab_map, - std::vector<std::string>* subwords, std::vector<int>* begin_offset, - std::vector<int>* end_offset, int* num_word_pieces) { - std::vector<std::string> candidate_subwords; - std::vector<int> candidate_begin_offsets; - std::vector<int> candidate_end_offsets; - const int token_len = token.length(); - for (int byte_start = 0; byte_start < token_len;) { - int byte_end; - bool found_subword; - bool match_is_unknown_character; - auto status = LongestMatchStartingAt( - byte_start, token, suffix_indicator, max_chars_per_subtoken, - split_unknown_characters, vocab_map, &byte_end, &found_subword, - &match_is_unknown_character); - if (!status.success) return status; - if (found_subword) { - if (match_is_unknown_character) { - AddUnknownCharacter(token, byte_start, byte_end, suffix_indicator, - use_unknown_token, unknown_token, - &candidate_subwords, &candidate_begin_offsets, - &candidate_end_offsets); - } else { - AddWord(token, byte_start, byte_end, suffix_indicator, - &candidate_subwords, &candidate_begin_offsets, - &candidate_end_offsets); - } - byte_start = byte_end; - } else { - return NoTokenFound(token, use_unknown_token, unknown_token, subwords, - begin_offset, end_offset, num_word_pieces); - } - } - - subwords->insert(subwords->end(), candidate_subwords.begin(), - candidate_subwords.end()); - begin_offset->insert(begin_offset->end(), candidate_begin_offsets.begin(), - candidate_begin_offsets.end()); - end_offset->insert(end_offset->end(), candidate_end_offsets.begin(), - candidate_end_offsets.end()); - *num_word_pieces += candidate_subwords.size(); - return LookupStatus::OK(); -} - -} // namespace - -LookupStatus WordpieceTokenize( - const absl::string_view& token, const int max_bytes_per_token, - const int max_chars_per_subtoken, const std::string& suffix_indicator, - bool use_unknown_token, const std::string& unknown_token, - bool split_unknown_characters, const WordpieceVocab* vocab_map, - std::vector<std::string>* subwords, std::vector<int>* begin_offset, - std::vector<int>* end_offset, int* num_word_pieces) { - int token_len = token.size(); - if (token_len > max_bytes_per_token) { - begin_offset->push_back(0); - *num_word_pieces = 1; - if (use_unknown_token) { - end_offset->push_back(unknown_token.size()); - subwords->emplace_back(unknown_token); - } else { - subwords->emplace_back(token); - end_offset->push_back(token.size()); - } - return LookupStatus::OK(); - } - return TokenizeL2RGreedy(token, max_bytes_per_token, max_chars_per_subtoken, - suffix_indicator, use_unknown_token, unknown_token, - split_unknown_characters, vocab_map, subwords, - begin_offset, end_offset, num_word_pieces); -} - -LookupStatus WordpieceTokenize( - const absl::string_view& token, const int max_bytes_per_token, - const std::string& suffix_indicator, bool use_unknown_token, - const std::string& unknown_token, const WordpieceVocab* vocab_map, - std::vector<std::string>* subwords, std::vector<int>* begin_offset, - std::vector<int>* end_offset, int* num_word_pieces) { - return WordpieceTokenize(token, max_bytes_per_token, - /* max_chars_per_subtoken= */ 0, suffix_indicator, - use_unknown_token, unknown_token, - /* split_unknown_characters= */ false, vocab_map, - subwords, begin_offset, end_offset, num_word_pieces); -} - -} // namespace text -} // namespace tensorflow diff --git a/tensorflow_text/core/kernels/wordpiece_tokenizer.h b/tensorflow_text/core/kernels/wordpiece_tokenizer.h index c173497ee..69913fce0 100644 --- a/tensorflow_text/core/kernels/wordpiece_tokenizer.h +++ b/tensorflow_text/core/kernels/wordpiece_tokenizer.h @@ -12,52 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef TENSORFLOW_TEXT_CORE_KERNELS_WORDPIECE_TOKENIZER_H_ -#define TENSORFLOW_TEXT_CORE_KERNELS_WORDPIECE_TOKENIZER_H_ +#ifndef THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_WORDPIECE_TOKENIZER_H_ +#define THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_WORDPIECE_TOKENIZER_H_ -#include <string> -#include <utility> -#include <vector> +#include "tensorflow/core/kernels/text/wordpiece_tokenizer.h" -#include "absl/strings/string_view.h" - -namespace tensorflow { -namespace text { - -struct LookupStatus { - LookupStatus() : error_msg(""), success(true) {} - LookupStatus(std::string msg) : error_msg(std::move(msg)), success(false) {} - std::string error_msg; - bool success; - - static LookupStatus OK() { return LookupStatus(); } -}; - -class WordpieceVocab { - public: - virtual ~WordpieceVocab() {} - virtual LookupStatus Contains(const absl::string_view key, - bool* value) const = 0; -}; - -LookupStatus WordpieceTokenize( - const absl::string_view& token, const int max_bytes_per_token, - const int max_chars_per_subtoken, const std::string& suffix_indicator, - bool use_unknown_token, const std::string& unknown_token, - bool split_unknown_characters, const WordpieceVocab* vocab_map, - std::vector<std::string>* subwords, std::vector<int>* begin_offset, - std::vector<int>* end_offset, int* num_word_pieces); - -// As above but with `max_bytes_per_subtoken` unknown, -// and split_unknown_characters=false. (For backwards compatability.) -LookupStatus WordpieceTokenize( - const absl::string_view& token, const int max_bytes_per_token, - const std::string& suffix_indicator, bool use_unknown_token, - const std::string& unknown_token, const WordpieceVocab* vocab_map, - std::vector<std::string>* subwords, std::vector<int>* begin_offset, - std::vector<int>* end_offset, int* num_word_pieces); - -} // namespace text -} // namespace tensorflow - -#endif // TENSORFLOW_TEXT_CORE_KERNELS_WORDPIECE_TOKENIZER_H_ +#endif // THIRD_PARTY_TENSORFLOW_TEXT_CORE_KERNELS_WORDPIECE_TOKENIZER_H_ diff --git a/tensorflow_text/core/pybinds/BUILD b/tensorflow_text/core/pybinds/BUILD index 94f6ff07b..2e8e19881 100644 --- a/tensorflow_text/core/pybinds/BUILD +++ b/tensorflow_text/core/pybinds/BUILD @@ -20,10 +20,19 @@ pybind_extension( "tflite_registrar.cc", ], deps = [ + "@org_tensorflow//tensorflow/core/kernels/text:byte_splitter_tflite", + "@org_tensorflow//tensorflow/core/kernels/text:fast_bert_normalizer_tflite", + "@org_tensorflow//tensorflow/core/kernels/text:fast_wordpiece_tokenizer_tflite", + "@org_tensorflow//tensorflow/core/kernels/text:ngrams_tflite", + "@org_tensorflow//tensorflow/core/kernels/text:ragged_tensor_to_tensor_tflite", + "@org_tensorflow//tensorflow/core/kernels/text:round_robin_trimmer_tflite", + "@org_tensorflow//tensorflow/core/kernels/text:sentence_fragmenter_v2_tflite", + "@org_tensorflow//tensorflow/core/kernels/text:utf8_binarize_tflite", + "@org_tensorflow//tensorflow/core/kernels/text:whitespace_tokenizer_tflite", + "@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:py_tflite_registerer", # lite:framework tensorflow dep, # lite/c:common tensorflow dep, # lite/kernels:builtin_ops tensorflow dep, - "//tensorflow_text/core/kernels:tflite_ops", ], ) @@ -33,7 +42,7 @@ pybind_extension( copts = ["-fexceptions"], features = ["-use_header_modules"], deps = [ - "//tensorflow_text/core/kernels:fast_bert_normalizer_model_builder", + "@org_tensorflow//tensorflow/core/kernels/text:fast_bert_normalizer_model_builder", ], ) @@ -64,7 +73,7 @@ pybind_extension( ], features = ["-use_header_modules"], deps = [ - "//tensorflow_text/core/kernels:fast_wordpiece_tokenizer_model_builder", + "@org_tensorflow//tensorflow/core/kernels/text:fast_wordpiece_tokenizer_model_builder", ], ) @@ -91,7 +100,7 @@ pybind_extension( "//tensorflow_text:__subpackages__", ], deps = [ - "//tensorflow_text/core/kernels:phrase_tokenizer_model_builder", + "@org_tensorflow//tensorflow/core/kernels/text:phrase_tokenizer_model_builder", ], ) @@ -114,7 +123,7 @@ pybind_extension( copts = ["-fexceptions"], features = ["-use_header_modules"], deps = [ - "//tensorflow_text/core/kernels/sentencepiece:model_converter", + "@org_tensorflow//tensorflow/core/kernels/text/sentencepiece:model_converter", ], ) @@ -124,7 +133,7 @@ pybind_extension( copts = ["-fexceptions"], features = ["-use_header_modules"], deps = [ - "//tensorflow_text/core/kernels:whitespace_tokenizer_config_builder", + "@org_tensorflow//tensorflow/core/kernels/text:whitespace_tokenizer_config_builder", ], ) diff --git a/tensorflow_text/core/pybinds/pywrap_fast_bert_normalizer_model_builder.cc b/tensorflow_text/core/pybinds/pywrap_fast_bert_normalizer_model_builder.cc index d339ec6d6..9d0fe72bd 100644 --- a/tensorflow_text/core/pybinds/pywrap_fast_bert_normalizer_model_builder.cc +++ b/tensorflow_text/core/pybinds/pywrap_fast_bert_normalizer_model_builder.cc @@ -16,7 +16,7 @@ #include "include/pybind11/pybind11.h" #include "include/pybind11/stl.h" -#include "tensorflow_text/core/kernels/fast_bert_normalizer_model_builder.h" +#include "tensorflow/core/kernels/text/fast_bert_normalizer_model_builder.h" namespace tensorflow { namespace text { diff --git a/tensorflow_text/core/pybinds/pywrap_fast_wordpiece_tokenizer_model_builder.cc b/tensorflow_text/core/pybinds/pywrap_fast_wordpiece_tokenizer_model_builder.cc index 573250db0..4e4ddda10 100644 --- a/tensorflow_text/core/pybinds/pywrap_fast_wordpiece_tokenizer_model_builder.cc +++ b/tensorflow_text/core/pybinds/pywrap_fast_wordpiece_tokenizer_model_builder.cc @@ -16,7 +16,7 @@ #include "include/pybind11/pybind11.h" #include "include/pybind11/stl.h" -#include "tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.h" +#include "tensorflow/core/kernels/text/fast_wordpiece_tokenizer_model_builder.h" namespace tensorflow { namespace text { diff --git a/tensorflow_text/core/pybinds/pywrap_model_converter.cc b/tensorflow_text/core/pybinds/pywrap_model_converter.cc index 73a932805..70900269d 100644 --- a/tensorflow_text/core/pybinds/pywrap_model_converter.cc +++ b/tensorflow_text/core/pybinds/pywrap_model_converter.cc @@ -16,7 +16,7 @@ #include "include/pybind11/pybind11.h" #include "include/pybind11/stl.h" -#include "tensorflow_text/core/kernels/sentencepiece/model_converter.h" +#include "tensorflow/core/kernels/text/sentencepiece/model_converter.h" namespace tensorflow { namespace text { diff --git a/tensorflow_text/core/pybinds/pywrap_phrase_tokenizer_model_builder.cc b/tensorflow_text/core/pybinds/pywrap_phrase_tokenizer_model_builder.cc index 1221f92a9..225f2df4f 100644 --- a/tensorflow_text/core/pybinds/pywrap_phrase_tokenizer_model_builder.cc +++ b/tensorflow_text/core/pybinds/pywrap_phrase_tokenizer_model_builder.cc @@ -17,7 +17,7 @@ #include <stdexcept> #include "include/pybind11/pybind11.h" -#include "tensorflow_text/core/kernels/phrase_tokenizer_model_builder.h" +#include "tensorflow/core/kernels/text/phrase_tokenizer_model_builder.h" namespace tensorflow { namespace text { diff --git a/tensorflow_text/core/pybinds/pywrap_whitespace_tokenizer_config_builder.cc b/tensorflow_text/core/pybinds/pywrap_whitespace_tokenizer_config_builder.cc index 3266e2f77..f4b6c5f44 100644 --- a/tensorflow_text/core/pybinds/pywrap_whitespace_tokenizer_config_builder.cc +++ b/tensorflow_text/core/pybinds/pywrap_whitespace_tokenizer_config_builder.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include <stdexcept> #include <iostream> +#include <stdexcept> + #include "include/pybind11/pybind11.h" #include "include/pybind11/stl.h" -#include "tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.h" +#include "tensorflow/core/kernels/text/whitespace_tokenizer_config_builder.h" namespace tensorflow { namespace text { diff --git a/tensorflow_text/core/pybinds/tflite_registrar.cc b/tensorflow_text/core/pybinds/tflite_registrar.cc index 08a7fbfae..7619e2dff 100644 --- a/tensorflow_text/core/pybinds/tflite_registrar.cc +++ b/tensorflow_text/core/pybinds/tflite_registrar.cc @@ -14,16 +14,16 @@ #include "include/pybind11/pybind11.h" #include "include/pybind11/pytypes.h" -#include "tensorflow_text/core/kernels/byte_splitter_tflite.h" -#include "tensorflow_text/core/kernels/fast_bert_normalizer_tflite.h" -#include "tensorflow_text/core/kernels/fast_wordpiece_tokenizer_tflite.h" -#include "tensorflow_text/core/kernels/ngrams_tflite.h" -#include "tensorflow_text/core/kernels/ragged_tensor_to_tensor_tflite.h" -#include "tensorflow_text/core/kernels/round_robin_trimmer_tflite.h" -#include "tensorflow_text/core/kernels/sentence_fragmenter_v2_tflite.h" -#include "tensorflow_text/core/kernels/sentencepiece/py_tflite_registerer.h" -#include "tensorflow_text/core/kernels/utf8_binarize_tflite.h" -#include "tensorflow_text/core/kernels/whitespace_tokenizer_tflite.h" +#include "tensorflow/core/kernels/text/byte_splitter_tflite.h" +#include "tensorflow/core/kernels/text/fast_bert_normalizer_tflite.h" +#include "tensorflow/core/kernels/text/fast_wordpiece_tokenizer_tflite.h" +#include "tensorflow/core/kernels/text/ngrams_tflite.h" +#include "tensorflow/core/kernels/text/ragged_tensor_to_tensor_tflite.h" +#include "tensorflow/core/kernels/text/round_robin_trimmer_tflite.h" +#include "tensorflow/core/kernels/text/sentence_fragmenter_v2_tflite.h" +#include "tensorflow/core/kernels/text/sentencepiece/py_tflite_registerer.h" +#include "tensorflow/core/kernels/text/utf8_binarize_tflite.h" +#include "tensorflow/core/kernels/text/whitespace_tokenizer_tflite.h" PYBIND11_MODULE(tflite_registrar, m) { m.doc() = R"pbdoc( diff --git a/tensorflow_text/python/ops/test_data/fast_bert_normalizer_model_lower_case_nfd_strip_accents.fb b/tensorflow_text/python/ops/test_data/fast_bert_normalizer_model_lower_case_nfd_strip_accents.fb index 04186b144ce3bbc1d28b4f414a916cf35719802e..38e7a183fea125aecd935767acebef259387a5df 100644 GIT binary patch delta 87430 zcmce;d3;q@x%mC;y>m`DkO2yr$iS3PLJKLi3}H$k1$%1I#!78$BeWK;w8fujDWrv- zBqs?unG=%i%oCaCIXM{@P3bjSRBBP9qSh-~YEkiuiW=|loA%P*d++^x-ap>={k(sC z^E}Ub*4}5IwVpNZbux0n_niyA`GMXe1Da?e4|<45b8p1yfQ5PQ*A%>8bMF0`VrPB8 zaD9Mw<ea&vUHiVdzC){Ul#Gu3p=5L%CSVe#{!lWO1j&#BPDq6`NQbkK0WQddEXal& z$b~$}hXN>sbAPBxGocYm|C{U@{zZrOj_2^;fVRO{l_c*!94|K{Rr}xpL;HR0>)Pvg zi`u(R)F0$M?cY0Y3>zcbn+vs>;cvaJd0Mp_=G))Yeto#+;z~(5?kCv?t&)@ICz%OW z$=YKT<Fcq}NQ&^2<W+u>vcXTBk$#f8*-z3o`AO0N?6*agVwYi;@A8w_?~AG=|2X-l z$X^s>6*KT#+P%w-cSPl1@{od?9#VMAL(X0IkmM_VGX0ME#cyf;y7P+p^0&1855)dJ zl<`A88S;+!OYSrNl5@;ovXA<U^GBk_<NReR!Cxlh{Uv3ezr@apGXL-`?XkOzA9EEr ze<CXOr`+YL`Ppx4Yu8R+6_wiSFXI#bGGg<T(MEqss`HngPVV?K^8TE>zc8!6t%ZHA zyv##Vu2`l2;%b@xg{W$`hg3~?NG0;z!2l_Y3y{1G0pfg@>wYP!WKV#6hfpc92Z-@+ zqB1uH$oQ@RIoD~Gf`|YaTNNPY;<vPw4^QK*u?0x&tpKUJ86fp@0doF!fHYj=ZolGg z*F-g650I8C0dk!?wp|U7_SpdGxELUv7Xl>dQh+4SaJN%$X?Fdax;g!pwsGSFzS8aC zD?LSC(!0(oeINKrzlT)@-t(5h5KkF;Csc;-^Olk4ba6Kr=88*NkRv}WPzsU*#rQSl z1<v2_>F-en*D1;Cfl_)UP|7X`O8M15shACv%8P+gbs<oyF9k}?OrZ1*2Ff&f`_C*A z=lh}t8v|vaE>MO$17)Z+P~63VGExvIvA-2HTE^#mz~_MT2A}gg*`$o$i%K~jB*|xj zWFj$0rm>xWz<HC0_#>IY`R~|&!oI~ry?04lxqKR}eN~W@YzPuFGDu4Of~0IokU0Nr zuKtd;))D&`QH}2gNz;uWX}%sL#$QFXUJjxlgDA)#3Nna-<kLuZ-sbT)ERw3;AgP`R zk{WlA?Ejmnp;e3I&A*E}+ZrSp#X;gK2$IaIAjv8VlI*k~$w>~9+?*g8caeT#xnvz* zF8k+1z4=eRR_>AIlK0GV$v(DRa*i&Sk>iHA4;V7)He@Wv;37jNV83YUO-)ngU%#Wh zp~o8L<af1=cUe6&)q5aVdhEecusK-LE#^PJt8Lc}PjmbCw6HJdT?v-l%fXU!HCVD| zgC*-?uuOYus^C(vI2UL%agkV?rgA3myYV*$i&+;e#zIYHv<A!B;$W#L2$s?+t|QI4 zNK+}v!IGR4EU{jiN<I-HhPOHSJ#Bed&A|}KhzpTvZ}RzY&wU|MvL{4}4}?gpuleWi zX?t~JvAO-Sc7NpbVomj1e8ss$Q}u%(a=w>5cWKI4s;OykF4I)3pQdWf;>+46_8b11 zN_{3&&K(Pt!lR)wjhzwgBU#=);tZJos8Goc@{#OS3nVs>N8B7L&P}0W1o4Q`p;8>d zH9@A&+uC}6=W=cxtf}*Jp;CXFJRy8!i23Qai3Vrr{Nsf3IH3|7#^bEO2g7yTZ>6RN zTS;F<`f5$RLeaU)LZ$MWO{!XLGEJIu4fhV`Gw~S_Jab~0IFE!$$}?e-d@M|+Npr3x z&)qz8LYQRRy(H_1ml*3bRd^uGA?NI2lC;NDk`H=HN`j|2k*NnfCGCW#Oq1zsqNmhM zc+sP3s_K%LR9^Rzz3b<1<|(fuWAD*aQ`!R2@8yFab^~AZTA0+o8z#m^z6^9-36t*2 zJXDmXI%mUVyvkGb`!v;e$^82FwbhOk^vU8dnJfqseUqjp%EDwcjr{2P{hD&S_)?(@ z<pN)_`pgQcNn9bNM^;GLGb_Z2fA1$W^*Y2ppsAjO71A5OLiEk^mBMpc7K*+F9SUs= zrDfL&X^mbX=GBGLy^4HWwa=^0$Q6>Zl@HrSMlhoJQkE6+I&$xJO;ye=lwa`S)9B6} ze9%tv?9$X5^X!HgO$}UGAr~IhRIA@YG4^Qc!XESaceLfYzSm5DN820k+^eayhj+{V zSWV45q$%qsHRXDk2Y-am_!JUyQg_JSM>UoAk8Lu(d8JHjS}AW~U-*osMj}><d(}!A z-LO)|B3H_Q-%1%=vQjj2)fFvpUr4m<d5l=<iI!w!!(vS?AhkG6?SYnc(b6g&()OMv z?HyL>=<$@!w>+imADVQ3ZkP1DVU^@K^R+A5-n*vBpZr-(r5{-(7m(WH=G#|@)t=3( zq<7OQ+50%v3g;qLN#UwhQm|o_<VUU&=jSMygM>-wDlwiQ4>&*1w|kN_+bXHA+e(~o zmD=R3QU^`iR%wQYC0nHt+Rkj1_I+EWWz$w^J+@W4s<udX+7{_Z*eac!Tb`1>gIlCO zVT<&*w$Qt8k)b`&G7JN8(J~058=_^*JDMIXTHKM*GHHvJDHw+ds6Me;YL2g#$}_8_ zDsi=xA6YFG&#ac$pETe8fwpmfY&=yEy;}4y@ZhlbDXIn1{Z`A_C95ShbhYe#nh>$9 z7N@palDt<-vW@FWi%rnf_}pqSzG&|Mq4t{N;uke#IjpJu&uHr6vzlr?x<*<Kt`W;~ zn%Z}SQ1~+G|3aRl*w16XK%sA5BT1XqNZ+nCa`8ocU&Z&o@O_OCJI=Fwowl%Kjf{q_ zk+GmP68jB8_9dPNj1$EEtu@knlk4Z!n6W?7HaOlOacX9by#6v@<|GYsVvW?h*GOaI z8fmCoBh#cgPZ6`lJY2yVX|Gx%`x0rSX=|h-d5t(<;fuYhsiYI(l6*W|QqF{nGcjCJ zkAzFwGvShcEL_eW4VR3A;o^!5XG#_>S@GeLy)RsH_Jlhm_dvMh*~2A&bGQ_23YWrN z;c_lIT#6#XrFd01gJd{^WH^IlID=%kl!t~(MNqg@TEeAD3m4-wVFb0e!kLYPGaCtK zHWJQkBwQNb4QCJyXAli%HWDtam%|xE!x==wrSoFAbX;(ROZTO4>6!^AT*9TNH(dHB z!llp6w>ZN$dyRg!Gn~l)Pg_O*>7eE963K;WbY~KIk}2Ah`4Whr1R^Mb2udJA-XzVQ zN@&dN6xU6wWZd2<>FJuvToog>3{7S4iV^4MuUjQoi;>t&Zj!|}fx;6wvNcuSYLyCz z&B2#T69+Ai4+JBBKF0W<<GY!??v~`}-O_bilWEeO1&qi)n$=c1VoT{kFGfiJg$U`n z6d}Dc5z;jnA>F+Z(lHSso$d%3XpE5dx(FHSjF92h2pK7k5O+a@j8;X+*!A5qo)#e! zH+PFs#?wLTiM7&le66&fSu1UcYsFcvsg7sXN^AvBRY@MGjU&BETcqma*Gl6)oYm&< ze#~%e)X)T0tz|;JR;JONwS1jAhTYJ$V$_>!f1-URre0HJgKMR<cdb-Rtd(;2TB&MW zE0uL?W$$@SRkyB{j_Yfs^XA>sb#1M@PTJlEO|_MYv{FUpz)!RYM;&Eeuv1>gvA2<C z0ehP?H9x_99hpD3lZoR_DJ0K1@)kjH>rN?2*dgYL9a4H=hm^(dkn-z0m^ki`%7oog zHM2vidv{0;)W+?QI;e;9RokVZce^yYw@Xvwc4@w{T^ud9wo7Zoc4^zNUD|hTmk#t! z+ji+XzFoSZXUTTy4c#t%k=vyo24E0|X6di4&_B)5KV781x)d#YoB0ATae@A-j{d4F zT9OW~ljMYTl5%vNI4$cW^_eY_c5I8JAK5}bwC-`q*t||$yVgl&d?f8HQnKj`bNtpx zYzq<Wy-xBI*GYjTQVMPB<eXom6wPjtVkik+Cnjm7(d(paZi|!`M@q$}by9i5Tz^&D z?l4-ZVsN%mbL~`l2h~06BXv-J-AB$}^O0%NoSpc)H09CFxD>xyJbI|2t3J{`;UgWL zKGIq2BObl@`!rS2x>Y>-(Fb5qQ#Hj~rJ84~>f9=o-jU)lMEWp{Xy(bQn)hOTjOYh_ z90_LSRV~Efyue4k&Ld6ph~Rt!`y%$6ni{(iDWlidi~Hz3GV<;gnYy`ACKK<G3H0&n zj2RcVNFVfWzDIg)ZIR()_sG!I^)h&QiwvB(N7`?0kv8(SlDCDl?t(4S6@QO(USsef z?Y!SshtzvBYHeopx*18AvqhS)8!5{MXgslArg;$Ozw%Ju<e_GGNN|3OdBM^368mk^ z-lFqPST7^-<VAO0BJX!d|1SOV=Jis)iFEYZ==IVZv0j>1t(V3P>!l%by*R(OlFXOM z{5C^*5E+rq@6%CxbMtpdzhZ9xxwhLJ|CtuJ_R$}c`6{1GKQ?F+84427f5rt6`*Sk? zf~Wh%@X*h-7c{>cpOA`O_p(sL_e&aJ!Uh(I!&}~+_dRpJO#d5g`po?-{g|R1y<cLl z4G;ZNdrFJ=HTQthxkxFy9Vz9vh$Kp;lB%e>9x2Y>aKHD=Ex*#7j^ZduE{Kwpswi=m zMM-K}l%yp`NqSC{oOMM>#$c4VdZQ$BB1*E{QIg#lB{_9blG_<2d96{BKN}?lSEHox za+I9A5+y}5QBr&<N=hz7iFq+fN^eII%u!N)GfFCMMKME-lByd~QvGg})Le@aM{Q`7 z)CEOJy<e1^UlJt^-ci(Clr&kQn14huC5&Q97$t3+qojRPlyq!}lFrB|>57PAiWnt5 zHwYbSye}b2`s1TyU|*CN*EKaXA4>Kp8M(AY+%sEbln@)cK*$}5qEo$3CW9W3soVE` zLXs?-Bw5=eDFqwFS-eqFT^m`MGI)g2p3^o;#>7@}xwlH@hAom6xka*D&G)ZqD=hC* zhQBp^eys&-ADHWZt$ivkhzmcUCU4O7AKWPYaT{gez($Gv9Zy4h9^SK2MtD*;Pdv&~ zjqz0D$2Rgbn`ClslNi6Jqlwro<Eu8an&m!!FyH*Oc5?Ymnq}E$G5$#JoQw-u;o2<D zf7i^<|3+KC_D=-OEvBopn<eAwX34s|Su(F|mbB}eCG`gR@j3sjna#h^?r|)Wv)}ZS zjQbXeYuzHr{K_K9dXZJ>ivbe*7jE-c;s%_zSrWdnPzuFM&OPcav46w&cYNUd2j6=R z`=8k0{10r=m7(cs8l1YWVhwB$UA+#m7G3F9(jnI2iDQATT9RU<wTLw&dixuUr8Zsd zUC0FxyGU0(%2#^d@|3=B`bfWgO$;wx4Svp3rjf%xT_{d(^7-g$v?yH0?p-V6e>g4^ zfBJ?@{`KoJB`?X@4a>yv)s@%$vH<B^O#UUhGIzeGg**})pevWQOftNe(czLakVgk+ z5V@A~gu&S04AE81kq0C;l(aC?z`24vE3sE$uQIn>*J5`aTqZ;IWiohR8AJLqajquI z8eR3?d_Z~%mdVuB2dG~^nSk+k{bcO6pNwu=ChjB4WaNlBbY1&oF#V$u&K<!S!R2fD zgu8j(_{~ysT?;fr_~YL&-|5gnHM!XFx{sWj$a%l{#rL)44&xJC4bBI2)z6ODz$Nz4 zHtTAL4;uz!3x05J<(h4}8lNRCT33^EL<lnJfIqEZD|>ibnAvQR)I@(tEAW@}Vt@Jp zf5~wB(+d10^M=1<u_u+y{!|Y8Qn}DQXJ}&n^?mIDhjTl(-=V81+ZL&}S>!y_dRxTU z$;U&pW}#2Eh;tV~z(!Sjv_)cf>#B9LMY_@=rTgMKF=BMpiR?RSk!kdvXBcQ7<i2|- zl)c;s29I%%SkfLMZ69ecxzA$A)B&^Ww_40o`|*7e-^2L8X~%v9`%~E9d=&ct_NTGI z`5ElTu;Z}7`C078u|LP>-`XnENasP)o}g0x`djS*pE}k`(}ziYhSX=x%n!6EN6vCF zzQhf{AoAw7f77&WlJ31t&RVufhHaa;!1)}P93jt_$qUYZ!TyS_^4Ym5I2bI23Bl|W z220V2U@1NmEG0<u7`rZ{mEkK#R;&t^N~nTrr~yZ9>o%#o7R)|IFca}$X}Gja8gB$k z6KTy6!P4?fu(V#-CT-lTott)~ZIjN%ZPL}bjkVl1>AAj5dK-h8g9l4L4CHK+!OmbA zB5n9cu#E5_?&NJO<$`5wbFhqG+$Iy(w#np;Z8CLnn?oL)|HelHB|R}v&L(b?q=Z08 zJ`pG>tnZwpWm}>pM~jwR)`5AHjB%8fxF=9DgQ6u1rm>3?qorh1pg5nWM9>S)L`&hZ zXgPN@S}He0OI2jF#J)i3y-4Xn*`{bIhl)*(Xlb~?2OZN@Q|E4p{i?2NUBOa!l@gg@ z6YGEJs-rVnItTe6^loI!Rq}sLS8c9nX~!N&X0ckuH!kKI*Rhc{6V22;TKaM*k!vhg z_wAO6%e!Ur>Ta2`?`APeiExkD<GLDiBnC?=yDP@mbykZjWEy$)AiE&npnhM{RVuq5 z+3dC$C-~OvkW3?;FQZ=!lh~7dqf@j)HY$>@;zM^Pa?Pa`l6Gr_#J+<6Rb6FVTp=#f zrqP|Jx$ha$U!x?z;dJP#fjy>rZqRfhjI})5a!F)JCIdKAD1qJ;(h0FnO2D;3y7|B^ z=)AT<j8yW0GmYM>afS5EGe_QZT3p-;8DQ(iI7=RIv&}QY7EcFXs?(c$SXMZs`$mY= zTDD8w87AmV=Fbx}4HRW#89_)9Hm4DU69gd}Wo<V@rTs!EE5T6d3=NSkzYyv64v`-A z5PK6tq^~iA1!0H`99=4dx0lM$^`$ZlBeP4z4WsU*G6v(VOJxEkt1L@pDsAcT)<0<L zHRBq)jlb8nIgE2WFF1=RtYXTd1RI>Dt_tIq$vMb}0w_sjK!74BhVuAjQURq<2Gw+W zHFSQJ$g0f&l0vWVgrq@UZXnYW8DAk4(jYT_*$$Qfj5Oea+~{SJ2icGVef9w9haTvK zp|V{v3<KzccJqlpYHJ+tvc(UzY~|Ont>1J#fEg}Z{?G_*Y4oBwZ28v(NGq}nx=*m> z?_$dzl5Q`PWJpQ8izaxNq#nCV(rz!6^czd%?7K@PBXNniATw@>WI;Boh@4fvl6%xw z^1zY5$yW-}e5EkQSI(9BN>Qh;6eoB|$w4nMXT7BKhL@Dx^kS0fB^8bAL9!oNRplks z#a>cl_m*0>x70!XjJKS>>Macyy`}Mzw=^M}=e(r_*^1qUY{%b$?8NRucJK0DC9$Q{ zS{bzl&T?w80=p6$oK^J4&)sCfoy5n+Hs~saM=}~|I5S={jZDAI_Gr_5!@=LIt4wz6 zjTW6ffJHKm%#Gv)L@Qm*!9^0=Moh%vZ`W1P0sP34MEo7PD&2>_Q&;7C@OSB|G66rb z+K#_lcc|JyGW6)G{t_8_X^69AKsH?=Lm$K4P5k|IHFNj}Xo%PGBfGBQA7q$m#6P5~ zzB>HFx*C|kj~r^nKccIVV*GCAJO%hi8NqY#BPZv-z!)t*>B<R7zIH-VZk!M&GIj2R zq`iAW($UWzeVHJ9SzN6zOQ!o}$?APsvddnUoSc^>ckX4$yZN%@7rZP5PzdKbUzVc5 zm!){(Whp_LM^DlUPSOfaN_pf-sesDplTroM=rs=h)Xtt{F?y24=t(&b4YyB9BQ&8m zLkqrEXxng7+9OU%$F7soiR{AeM)o9}l-?sJrSI5D=|>KnIVpq4A?#t~2!40ONf}*r zl6PV!nJAxR@_b4r!7+8^l<Z;n>8pE5de7{Up2R&8J5Hob5GmlCByy&(6*f38V84z% zjSbE>urE@5GcSqpCL_nimsmBD{;#BelYBEwRp&@UI=@BQx9LAEd&GE)*~;uo4wj8L zE)hW2Uy|7G5Kz~+(RT@;cew~zbmJv)evbf}#ebQAx`O|00%#L{WaTdW-zR``@V`TK z2AvS&3IUvkADLQ!{|5x{1pXfq&~C2(5dlp4^pB|j?5dX}X~Qd$y!jPL+4YJzk*U~e z5wA*m{Ht;{?p4XKzbY=s#LjAcRk9~ul^kSl!K;$zdR6k>uS&t5Q&PC^6g}f9UP+#k z;^U{7f}COsa*7x@MGTyh@>Qppf}E1d$Wsi+r=;4k<doDvEp<@`^>7{<v{TXuP0$Q2 zwo}p?bV}Neo|1OxIC4rl51x{)gj3SJ<TdFz{+jecAN0e(lGoTse@%v97)Ca|CT<vo zF&JO+noNLWl0Q?BbkiZpkOEFfy?jQ}ARW#^#?3SA37(Nm$b#(4XCw!5ArJC<&qx6j z!Z|29cv_0%PfN*x(_-#9Ev5TTOPT$&lqZ~)in!BKx%o7$;PjJHz2P*ou+vg|?X=Wg zKh19FX-eR<H29sC#-P(|Nt~txPD_jBw6xkzOIzq^X<u?$I!>IH&cxHwb?mfsKXY1o zB2G(h<Z1SqPfP!%)8^SkW0N`ZS8ct6#hNg7+{e4@N<Ycnwp8*YM4Yn(<Bti@pAg94 z{3-TT?4MzS^XJ&Vz<w7SoWI2WH|$?wgYz2pud#oF4e9TpzmE>i-{SiK`vyTCcJ(3Y zn0-jvFFqt~7ao#n5}m&z!|&;&LM`TB{>rBIXsAWpK^7UA`G$<oXcGHZdZ>$<7`N%c zI@u*fj?QZ0{2M)78veg?aRL5+&;!o<k$q*Fq|ebqUeP3uCf#)z|9|jA;1ol}YKAg& zLrsIz!1lnlU|S3a>r4_nNmyX0%CvnfF%4C1(WEAn549Pp?A?7*eq*0hl<kwENFHFJ z!IaZOijMGd{<wz}dUL$M<{|lh9=wS4klX|h$%*%1*W82g(L*v@J;c@NAsM|MEQ~xP zJ=sIj20bJdoQ)0-Non+uWSmJ8Ty)b@j75gZMNT7~UOb#P59ed3vwX8@q|=wQ#iT9a z8-VjJ?4{Vt_yR$?Oe3rPbaDC_s#Mb@*56R&-ntk8hAN80k1UD6A84rjMVrVFWT?Vj zx|l2frUg1$ui*;gmMggBN<%eX;u2)@1zl<{<6C8@`m0>E+EC;BbeY)0t=I4<;0!mE z`=BnP@%SSQHBg3ct)YgBbr~+ur6*07-em4aT0eTyWkZ~I8!F|hA+hWDyhth}?UEtW zk$m3^IM*A@c5&W=^Ikp#oEvz!jo48<f1@D<b>v4shhD^6Z|8k{zd=LHUas3j1r{4p zUS^1KKjm{mm#X7D0J`%NhU(?Z#Xi8p@P+zd@RCDkDa_N(=`sXv^br`tISLc#<1j@* zPDb<XHuF&6+`>a`<r|a7*hU%*&v5PhJt(=+i<Dfnq5eT7#%?#%<orEIPrBht@8c^e zn?;-lL{c3GSrHYBq_aPM7Mam0;u;k8(?qh6+1NRx=i)bZ@Im0*Nd;E@XZLekq!>y# z&S667tVkJ@r=5`suC3%ARotr@Sz~!kVs}xJySWcIV+`dH`=FuBU;Ld&aPH$f?Z^E| z+~9l|+m8JRHaI_JsC|zbD(!%wE`HijmdCK;us@6aIQBv8C$K+{{Ur7m43$LFOokM2 zLh4OT(jXnqLdIoHT#$KLbFed|Nj8O&Ly+b|9^}XAQUHZ;4vLQJ3^=-!fC;5>x|BgV zR6yk>Y8<Lz-&2OFrQqwX=q!13Igf0>ZoH*S6BX5rY{70Nz3nQI(2mkULg!{(x_0T( zjqEu<3?C$15_IWD4qy+SAX?54HOLX{58D9#G5q7$6Pt+^uAf4t1$jy85-+x#yd)ix z{k$Zpm!k|eFG(RiC)7)_A<w~!{oJEo;=;&)EN?H#gyLOZQnZP=^d;uhP(WI~#>U%b zFDZ;>E`5PHHIvwytIVmH#Fou5=Z1=RnRDOplEwrtX}I}_G#~epChlJs?<KX7UUJ^< z#Y++|76Htu)4b*B?lLCT$xOB<m~{6t*{);K-N_jr<UoP943jp5-h16!dfxSxe$x7o z6C1o`9HusT(?-0-eS)(<+;nW0w~T_K4VVwlX<zfO<P&S-|Dknw*hwq?ivE~6qUjm2 zNzBcXZ=aHsI*T|V^%<fB(&20}vyud&WXUU%x$h;(vYnLd$di(Tp4<6#)*!D)zV}P! z-;?$A4(%1GWbmkhY6h1YsEyX74(jb1Us;oeRTgQ4CTKpWNy{-!T2E-azvkUKvID!5 z^e#TL8`-l#<JE)4aH~l_asYeqioq)ggV#ZZjNIS=&Mgk$%o(z53}u{zi46u@dIZ2N zqVz#sU+GA`X_2Jsd_82!Z3|Nlz9#nBs}@POy)0#fO(|3mM&(e2UI{hFiRCl2fgFqE z!?`Mp6hbk25t#T&ptW~_v~(_zc4V7-fplJ3ARVm>q`Q?+?$wV;-R(tE%aY>!#YHT& z7D?m80^W@+kmjldyc=7<*%V^AfEeb2QRIZ1Snkzj@1N<tkiE0Ch0C-7{C&tl`~##9 zV-F#dsL|KSyZ0~D@z-?yVRKy9Ll#VXdvLnPLzyEUdWIvZmE8+SakJ|Ispx5tKD$EB zLdJD=FCg>w3dw?O^c={=mk0T+E2IDl-7DlA6rmSGNjw{jP<m98GAK{fqyj3@tDqWR zO*~ypZ7b0L^=_g88qgb|X;zbFXt_=_K-+Di0Xon-p$lI(^t2KU(B~!^U;up(hT=6D zhLNKhFS0ZlP1Ixz#?dEW65mw3&Qm^U(NCEpR{cJY=jYqSdsh8(jx5d`Wpe(=h0ItL zDtYmtk{cJwyn=H|*iO<@@Xw!3O2R%H6e{UULg`yVB^A43&R@!*jLp|ls7dyh>N0<+ zVuQA_%Ac-c{v1?*6mWe$`AzbaKrs|WggT`Cm_M6-{v6!!mzH-!rTbbaZ*bXnsbZsm zwDWuXrJfz%y5s&*i)@|^m8Q#~(uiyz{}>w&qenvN2t#EArcQ*)B<Bt%<~XT%fjta- zeb8GRDm^faJ_Lj08#u9t&7L?<X@{0Qp3(|k$31x&>nR=Rolw8Tlea&fQmc7N9W>!< zh6eOT&C@*j6@7IW{(k5otrv!Fdde^ipbwH~_+RwBF_%510!lA<N*Pq+uYpSRDk$`0 zm<eUrvC!m@CHPDz!Y<Z4s6J2qlwmBlnA4tmnmKZitpsn)JoIT}ckmH9foC{{2jegS zlQ4CZxyi!|^skvCHvL}nJ)8cV$9!WkA6=-w=wW%yLfi1t!_B{X=?^-L5(~Re7IvR3 zDz?<Zz6Lfp%dso4D=o_UCl6I7OoGrWGG3K(FM?b=q_WhInn1o&h?Q6T<gX@w4cCLS z7P}6+-omRBCYDI&dD0q4Ya|ain=B3$+e|_W3E*rcLmPHGHaI)5JF&Yg$}RWGNa{T@ zCLC(Y=9QU*%XrZmnUWvLWaZt`D?6p9A)2={_eozzj10;HGBEH78J2Z2G|mwr-qy4n z<_-J@p3;`h8~9&rXS>8<QQZ%+)Bn>Q?3q|p?FYOeXtA(uVo{A9ycr;K^NU}TCfO!Q zWxUn4Gqd1rwb5%)sl7hZw1+W+x7ZB_m|3`dI0V7WqIIvdoM2*6<s)5jn>+LEoN# z%SZa?%L{MvR-EgLZurQMBSMoBZalDwzNFDdMnmaK>U?A*h`xjyPu9_wwED=n#=vrr z!-RbnRUhZWzKumS9N<9)EUGDy8zNiw;UBc9wmtZVEUF^`|1c#9-81p**-)BpN^_Jn z7-)<aGuuZGcR0uJj9XOR6=nnxOc7?8A~4ElAu}&CLqKNKF{LPDN>Sh?X~@)Mwy}^Y z*vZHw{51!?r5dXCc}pc!pqIBYL%>&xG|5}yVur!BMZEYw$BtZKv6n*%8ku51_Z4OY z(0TErZMSyxHfU{Rh5*gQY|}ww8rybo9=#sw@YTZ9L1qLnv5y%7jG>Q$8{Y^F7c--P z!8B$V(2w3%>|lz4vj>uHvROA_Q7O!4rje=F`IeIw-n|+UJ7rNBnjuUyRi?Kg)5z=~ zL!1{ZD%TJH>lT%N9RIXM6`sM5ETSzs->|3>^2J`XsL}}hZ(3COD*VXGvN)-7q{VT@ z;VG%HJtfY6r7FHjCCymWzs<KBr1M*(eVeqms5o$5!u}5Scd^0wJ&W3V*`i*Dy>F8S z&hL}w9r9ek2Imj3e~A4fY;ewE{}}rx7Uh`cPs&d*u3A*mK~{#y6r}TKl)wp3iTybx zfHZ!=#o&CGi+^cRS)HDu|C>eST-Ym4WbCgffs5q3MhVcly^f6iHTV4uB{0K%k)^k| z?|YO0om}j7O5ld4LyY$=s(O|T$l3@p{FX3ZMTQRu^IhEV24NnFAKBuE|96D>hC|XG zxmP;k_p;wl558xw#Qxr*dd|=rTX<D_C06=v^u|G4{|Ad2;<{<%$R@78X;GuxH};Pf zH6H08!@pb9Bp*DDOnR5@>`#=y4bEHLq6BVRCGGfLNxx;4v!o>-OkfFN730s80H^A+ zFIi<8J(mfa^DmSDok8qhDFGV2ahn*xo<^3?IeeZfE#1d3y(B@(={%f&qXZl%v45wS z{z36V)2#$)zL_A-Ig0O}7S(wzK|0>WzLCJT19>3(GHF*6q;obw+Ab#W;v#{iMS`@> zkWZ|tw>Uxi@b!~#pe#WKZ}1tKm2C>E8oBhexa*#lX>_MyRgPE>92P746;?ImN|02F z(`mJ;^nDbOr&T3!WX)J$RVkFfG%|Y-zs;(0k+BP{Dg$XOvZ_qvG_vRxelM$<AIxIC zttube@Uf~w<TSFf6~C`lRX5^aY;~y8UNS7Ps`3dkARE${xZY(|O~}}#R#kfx|1ztp zKY}0GG4J=Ys;+szzg4v~TG)QDsy5^_(ivz~u|a&P<$NjVnInHNX(6PAk_HoX7MUc! zGt7Zwg;lvPaFdl*HHJ(%afnR`ZnVm(8WRt3mg<n$S6fxVu|tx7^pLp1ttu<-kj$*L zs?PXBVvDpY`#n6!0el;*>g;BGQC4NUpY#W;DmCJeq^-iY#j0kuSyi6jA<12Gh{f0; z$#w)Ck}S(1$<)Z;eTdhxhd6k2h{KuOcn3GwMM+#cB-aQYR}2qyoty9B<`3c9Z{=Jt zH+=;A03~zjkW9@SlG?#TQrCM(>L(5{?>r<8jfbSM?vONf9+KwPL()=wNLmXHNn6z+ zakQ5mVnudHI+G7cSI!~ncHum6SnQv%s`fL78P*O<=aIwG@yuZkZc^+=56jHseBcv2 zz>_?{7kGfDt;%``{V@8o)Un?-PR;I=>I;w4vF#Mw5%T^E{-gL`AVB>Ni|wnVeU19> z3py-=7H<A^oG%fCw!@SF8BSVN-`rv8zkOH+t{rBxey7+It!nuCVHvqX{#VI=h6hRF zMk(Ap)ymGtVM^ezjGx)bM(j?R^xMg9%1$=xcZz$@P8r>|lbP~P-X-spLC*XQ6+3py z@KI(D9Q^BrzQ&!>U$&F2lU<U0bC;yt+C{YQ<UH?AN!!K2%?&%{?5drTafE|~T$jlQ zWpU$dZk%&qr{p5@26sySB@Pzyz=hX2SjdMJxj0w|CA~Y@JK8Cwe0Uj@JNQ#^G)5}V z#7NaMF;aabhQ&jS)ON;DhcQw=7$fIrVx-|}j5J<|k*13=(tJCH_RgV|%Q1|QG4zBn zyuOH$PEM3|MZ`#VP>l2(i(%_JM*28J+V34B1Aa`4LStkIIc$rOk?0t4!?U9pW1Ku4 zhY2!H&haw&)^3?er)XU?l{3%Ec;d5S&!UKPXeq~@W#RFx3?F<}hT@);nLP3pT2=o( z@)VJ$glZ|J-4n333ai>*Wo8EG8$DdZR_308`nu(<c<kRJ=iB6bhpT@`1<umr{KWio zpuWYClKiYB=R7M(F5rMw@{*mB61h{H5!3@^p9bmZXXB}dqyJeC$EgR%c2f_KTSq-q zQ4iOsha1!b^4uKtfGmzP`0WStWX9>zeFm#x_VADu2?jG4!@+XckZKZYHW*TS#o(R2 zA@#`f*9~4H85}1zqzRgX3~7lrq*XJd4cegtI-$$W8V-8cX6l8$cUi;1z%|xz^v6TU z;WE~6t*qf-l)Pgw-s;05j)X~^Q()d3q(5MB1$e4A%_Bkj<E!ltdn(r>o@(Y%Pi6az zr?SU+s;^L_uE$9`=&7tvnm<^sf7X$d$Zi;<xEN|6m7AqOI{I12aI?<=nK_zdrOnS- zHOWEdV&@_A=QJrm7Gj@67U3^OmSCI6(g~Ko$a3roWF`J8WHok8+WhkO49j0OWb5HP zG(aOXK{K>KE3`p7bU-I`K{xck{Dy8H^uquQ!VnC@2)JPs#$X&KU=pU1*<ou5;sc;9 zk`I86Xg&bC(7U1bk_X#$9#W5f9vbj9LKAv3j0EukFp53~<M<|E5`BJ;wkMtkgg*3s z7{E6OL+HZ}{$#Lq?}E%Aw(B7qJqL25+0Nh0uKWzU^N=#jc0Ht`r$IXYvrx+Rf0>^L zXWc!d0$GV&#dX!YJfsGhf53w?I=m)G;B^7A2)mf{68vjTl%)B%NGC2bh?Oj2C5PC_ z!!E!s@>Kgv%>BXo^8xm^JyrTUr2T+AKO)bM$@5cB^Si-%yv4P60kO9sM1R3L^XUc3 z8fR9A=*!KKP(9?MZH0G2^#`oBoCRusp6L^&FW;S%Oz1%hI3X3%ARW#^2Dl)TaLFQU zvXME2O)fHT&fM}F{Syw<Dw3*^HHp-fi>+0tFQcxY0lg8L+|(7cptnLBzINz9?}V<~ z)HU>=_d*}Oei%R>gdsO|1taKg7{xaRj&Y0$m`tRy;Z6{>MDjs!Cx|-GJK;_c)uPwI zogivJZ-hHRG=lDiJ3%y#J^^=vs0Y0_Qg?h5MHmAx2$_-Ik_8#yf?UXhY{-EWa6%F! zLpq#=R7itzsDM%^gKDUON~nTDI0yMq5b5tNB`78oK{3n+lesQTf6QwC_5ziD$$TwL zU%%LOZGoEk%>re6e}S@puz)^(rGB5qWwQ}8H&)Ka%qMNi`iS|(Rr(iJ+RoUN-C<L% zWSg3C+LSHbruJu`yKL&k`P;v<T7T4$MDLUgDfCuONJUSBbb70^kb&-kOhPBCnD8M^ za&8embA%5vAG?6`Lj32DMcBoI^js74T*y-FGSbWOS0F2~=U?1Z16!c<Tm{r2)Wdmb z=%o&!37Vm$fEa)_XorrQ!~k?bH}n)xhtLQ8Fu!#<2=iN)!!W<~>4y2O&oLN3P7J^# zOdY4^`nadEb3<r}<^$nQ5PsZK*?1tR#a9P+g7D*>YK9xa2tGI53Br$iDy|)d9(=v< zVGz#uT-nhU&IDN`*KUzK$Ut{NW~7B6WH%W-4bpAwDnk-_GNj;hLM3_?RP*6AP>Nm# z<@hR~2)!6eZ0ssSK6(KZ;#+WzKOY9$$~AhR)&7P}rN3$JUZX$1*rhL2Z+a|LcI!gr zTCkA){Wbbw>&$}-l{MB}9j-rbwK*25{mEuwxW3vPiO|D6zB(UPKSgucCl;#osfEgR zfg4P7gNqB*%)erP3;V5wDye|5gB0MmGm_d%*g-m+g^U8i4l)UutQ&;gEy9kF%SGm4 z=aXJQNEIT_brN=igdHJQf;2lB5KD2Ck)a$}ag;6hM7G|c8odT;%cw=DZzb%Y!A;mf z6M8eW;A@4p>x3P2+$QXx3%wh9@byApD`5u%Zo&?R(1%+Is}Y=T7(Ghd!FVEZ2b1Vi z&}IK9+&Yk*a3|dE^m$FTkHW11*$8*S?M|OJIrCAtjUy-EPPl#8=go)JknN**8$b@i zf9v!JH}<0;8*(5sGL&#*pBh|{Zeu?hQXvgeB0~wcQ2IRftKm+#RnAbdP)=F}l)}0) z{*>6*kcJ{ChQdgGxC{AE03U_jTKzGL>+gh`d19@;W5G<=B4t~(h^}p&zRo%my+~Pi zn!jE*A8n@=sr|2*PekhbJr2(YT4kgj;xp4r{((ixKD0=skC<<+*FS5H>@bd)p?i!9 zM{N+b40j6e<F*`0;ljtWmZQ`%bWlN^(1qR&JzU!hcgpVLwj4>>!pF0g8&vW&DtV5| zKu%&$U88qP3ZkvTowoXMTc+8ag$#Ty$UI70hwMc90?0+rgM54iFyB_^o9f4H8Qp|Z zd}UC6oi|KSd7H|BYV?}xYd(5=WBU!|bA6F={dSR>xj}jT0sD{Ge_BMbZ@x!=A#mn? zFJ;~ArCeLQST}nq+YT?5!(KG{d-Nx*w&PxE|2NF#_v+8Dw0C={^j>oGlVgx8hPlFx zJ?^ERov)VKd-Y}J$OipEi|d9L_n+IK2b&`Yj0DrqZZy&|=BIymqWHg$+-*lcs)!b3 zE8MAw4)jjAQxX3;a{rHkdvv}ccsDn4oz@2*R>T!5hV=Q0NpUc^{@;z<nMWD<Ap=~H z3we+YIq+Xc?*BP(->HaFKD-P*st76uy$U>%=F2E>BP09FJ>JT?(fsO0{Sk8{N?&QU zJ@2jdzi9p}ifFgDc&qd_^V9d~PlP_lCH9|qE7w(THS=?CW&5SKvi}PE*WQ%x#3uc5 zbL4U3pd*gg8el>xqgxr2Lj_dE@$pa%HBj42!+?4?4-J>;WuOU~q2(s!4{gv69hWJ8 z=z?zO>7`*nAN0dO9MiZ#{tUq|j2x!|U=+q+JdO&0Ntl9<TcVv0g#SLOe%uoAeK;y7 z@<4DWKt67XGkhK}syhD9up0TFP4OZhc!>{0j?gOH@L_}y2_LsaJ1r6Z`>6VHOT_o# zsGLYkg*z?v<Ce&Wn~)!U|Nj|Q=bP%sO%Z1el=9(aa3?}O41~q^GwRNS`zY((=EnQy zd$}+9sQpLG>ig$s{hyeh|JC`Z^z)|A1N!q8*Ux>Z@G}qScL)9pG2!})kD9sdqip~1 zQTBiOD3|!sV7Kb`Ic7qAm35`Ba;@@JGi!X6ZLP1euftxCeXp;Y|LkNBtE9*RHV_v{ z>SCLuE%TA|KwdtC+$9+tH*<vtNai{}$-0++N^xJH<U9~2xx5t0+qp*aALRGt53QBL zhxy^~qwA$e?v>)t@#C*2H;DQ4{S3c{?~}6U_$ebVbShrt$DCi|{QgV3rTWxPsX4t@ zYULrROZ}wOXY7;n*^fv={=?E(^l4rIJtobSk4sB!oU}GPA#E+6<A?EIkPdlTItSP_ z8%dDv@n@w+9hTlVzASz673qKL1sV9>Q5k&at1|SX7iIXTj<3rIKL&FDiX)@%osh8) zUg3vAr}&}JYceTG+SEUsT9f^wxNAT*!krP~PRH~=hUcUIwPPBZ`6%uNkc04FI;Q#I z_rtiG?|F#73F7a=xWm5FGo{<-h`TvSg)~TsoTFo!V|bloc%5T-o+Iw&7@p@Cp67_W zIXb2}I;J@~ra39$L+9hJ2#TRFa*h#Vj?r~a90mOOALH}&kA{dlJyX)4A;}Ylq#&J_ z3`u3zC5_#ebmUp=j9Z4d<_yV1W?^TOo`XLZnTMS}$dN6sFGQZ>4Qf%GC)+)qbV0mm zFi}d6^C}j~p#m!7cohrPPy@BSyz+&5I1de%dF2aD&<rg%d1VZ3&<-7!d1VY;&<#Dk zynui{=!bzgUI)Wa+*;lbql_H)qz~d>_P`j7$9d8RdC~`Y(g*SWckwp;QLF7~U$y^> z=Cj-MCzjjtXeR}<lXJ9_V%kk9cDb)guQb0Ct?zCgNia0iYrB50#q}d!n&PhQ`U_q& z{)?40Xt8oFUrh7cu0QE9-@MFaJM@PQTa>T))DC@<b*6w!MP}g+{rTr?-&?Hqzr9$^ ze19>!0E<=9AZN!WI6H=PUh<_Te5navYQmSA@TDevsR>_d!k3!xr6zo-2~O%G^RV*= zIjGO|g~)RUm<%6e68x6Kh;Z0h&_gW?xjLwy;cx^r%yKvan$Vk}<u->SpbfnpI`DNu z7kW4JxY<~PK06z0Fc8ni8VsQi!$=|<YcPsF2IKf9VA&+b6xe@6@%(tPiv6%)b~2@~ z`z%rEi_Nd^WHx8twnUj~m%h^CdUOe;dTp0}w<F0-ctXky;R&g;geRn<pM{LugePR8 zXF)c;9LPn_gM2sP35A5_xdT=T#mcW2to(X`|0#uCdcrDYXRK0=tiY}$y$XLdvgUx5 zh3TMG>L#r03R~qoubvvNSf%l*Rhp2^*e$oL(mH3AHe@??2kD*oyO7=3J%d*1<@!Ek z{{jB1(m|^XCRk+%IgCAW!Yb}FRvAT(VULqOfqxP?b-+RA>}IM0NstWbvurXzDx^W? zZKf)a0WQd$WvT+%kOPHoHW?rv3ZR7ZV&*>H2qTLRa)vCCvt&>j|F2Sptd8f+GE_p< zD*n{FIZOt%PzO!3UebJ#LubgwS@Q?4>38qoemxWX`=~)L8REXf*aOHx?mv3QOUAHA zknR&+GD-Rr_Be6^zqxp~{-AXxbBVI%m{081pY?IQu|&<hxkTA!mZ<&THeY{mer9yU zD0Wb<P1I>KwA^MB7~0U=p#xthbfI@ckJ}(t4Pw<GRt;j+AXYing+2@;i5%;KQS>nw z$2S3!=u=?7wM4o8!gJi_IsUPPftBD~xkvZ1+CA@5={EEJJ^E_H_1qHk@jbfT;@Wo? z5nH!M4=_je>W@8_<Yr+DDKji=A$698Eu^ELg^b%QY#|dp3$pR$KrVV7<hxneLZKEy z*m4l+AP1rNrBw;i#4bI740+C|w9Yv1Qr2|yuGslm!|Qjc{co5lvHBKsJXU{Tg<Y1a zbbYC^c`RiCzmx_1QZ=&>+Z)?=>G1FoL)XkD59yz=xOOk)8yrDhZ99hdf06q*xxYc~ zm$6S_zk+>wsj@oEpY7APf7HJP?$_@Nu#GKM`zP>DlKTR=r^%gsm&wmC6Rh^zJjCD4 z(TC^PWM47%t>4G@qJKJvEJ|R(jx518k)>x?ukT^K4i$Se^KMOF?YKZU1x?TlEzk;W z&<-8Y30=?)J$vY;pbz?C00v<QhG7KUFbZQZ4ihj5Q(#-aOxf>Urd%7BshRthDKo^b zFYt10U8dfQ#xvh{-D}ssVx2j$Oj%Ev<#s*H9C?IIG~2*3m2`pr3R1ucsgMTgFn<C% z16+^^S&$7mkPCS*fAX_{lO=_mHaUkZ;#5g7vIN^imY(sEGGsY+1+o%<6|(w(k3(uu zYA5)Q5EnT+bBX^5fo!<yBaO%=>}F)koR733+pyb_9r!zuUD(~o9<J}*$Jv<!{PY?* zkl-VO$RX@u<j5HxaU(~u$B^UrCy<i|96q#bAIZ4F44G4pnaHdgtYvSpmgR+TE;0{0 zshG7eCmd6dPV7|D)9|Mw&tjLxdq^3SA7uv~Dihg(hidd1$mhBOt}jHM!!E)%e}KQl zfnh?+RS#*s<{@o2Jft1jG3Ozj$S&+|WNjJmAfW!bhn$B7{EehH;crHcBzTDXh=+_G z^N=y*_!$qGKu%&$A$zJk_?eOiKU4DH;F||OQ}W<vN*=Och=gGj)9X`&hy9PsRQjLH zS3aflpC3LoKUcWVj}~_0Q~DZn<d~7>Lv7c?d1z?7Vt(qY##hZHkLoeT{-I^&XCBp$ z__^x+)J%h)vNiiDd#j&vwa?$-@&P^89Ql&5pC20(+CAkQ6vZ>^f)aESN)wrMK{<K_ zRN||GYV;bYbu;OL`WYr&&@juS3!2cIq2)G{E@(q<hYoz5(1qR&J#HplJNVOQXVwJ+ z@yxnl2z?kv5}9<tDEb(T<C}m<^eHd{KdpybXRiAx>j&oEPqWczTj#I#ulH9o_xLN@ zMt`<`9@C#@{Qv)`Bc)G22SxPp#ZZE7LTMszBcU9<0xI!UK{a}foi~%UZr(;h{R~wF z4YRzBg#UjAA}>R1&-p9+m;F`xSIn@u`PE;8KS7War{80ae0F~1{H4FLUNissS>6Am z9(md0`h#Ki?-M$H0m>E_pzO;7lq)1a&4gjE3Q+skm|uOIF?IZLb{p)60#y1l<}V-D zH#(9o#L#)g(0RqsdBxCq#n5@h(0RqsdBxCq#n5@h(0RqsdBxCq#n5@h(0RqsdBsRR zy=MUw!Z|2{VkiL<O5y*~yGtc@6;#vn*X+{x!SY3BsF#?bBF|rChKg*&ZbCMjxBjND zjNL^CzmE={0izGu&j2!j9K;?%4l{s^Al=xb$T9rm$O-I8<P_JNi$AAt^ZpDSxxIpt zsS2Px<36W<DbW5y%vlQRCjrX(GxVRMzZ;;;cMfuPGwC8-J1d?Pq>~j;DsukEE$PU! zb9C)U7j`Bx3x75;2Rj#;$MiFwe}`HCg>VjvpcqQPgi<Jja;ShxsDf&!*)u;Cy}(oy z>ft;zKqE9kGqgY}v_U&`KqquTH}vdbDhhqj4+Ag=Lof^@;D%8cgK?OENtoKhR5X|4 z8F`TX?k>rJO!O?sxUh@3*u}3tcJZr^U6P6~4N}mZkd(~P4yZoCvY+D{l}C0;6|x+= z0$I9`qZ!B&Y!g|8zZhAFeGZw=wFM6Tblu*`tHqsc!|#+%XoGfWfmUdOW@vy$sE6}V zo3>Nx<~W>^$l(+ihY1*k`NJtA;D#X>h5;CaKIn%Y@aW}_8Tf=AVzn&~RQp5B?N8{> z2il(uROw#`RJNxBmHmr>9LNdeKu#dt*OU5_hN~pd9Dc#r>)69G1Pb9C6hSe3cqjr{ z3T03Z6;KIPPz^P^I1@n()e%ee$n(Tj1F{jj3E52SwIExu+mP+}JCL2&UC3^(-_x^? z`S1be!^r*w=EKNA>>=dv8Ro;t5C6|7at!}CasvM(a*7+gY5K+MJ9OIu^V9MA{nnXx z0+scLW;(h#@&$d{3fs~kwcjsD&G-i?TTqa)2L~xvDE12M)j{U)31hc|xlR5a7O7AO z=b#9Rp#)4Qg)%6I3aErCsD>J-z2Hk5@TCp-(gu8K1HQBYU)q2#ZNQf{;7c3ur49Jf z27GA)zO(^f+JLV-)XkrsJ^X7K=!1S3fI%37VHg27jKUa<!vsvi6qw7NVz<B^7o^f3 zH^2N8bK3Ez^gxR%FNj`b_9;Ea9C^Xm^Z%;8sq@+k)Hi%QK|pVWCTNBhXoWUthYsk3 zF6f2_d-hP@&<Fi60D~|D!!QDF7=<wyhY6U3DKG<{*6+2>TnbXw@0pK0J-@i{TCVo{ zn3D-?N&YlJf5<<H4%8L9T+QrTu51r4SN2DitMo_B`@g8)t%nDhpZ}u1&*I8hPO;Q| zQ4jENnagPuhxNxCGjA?e)|utX^{wS9X~t8MA!U{`ijaz)2I;pw<t${NyC4%^7G$I6 zKyEY(CCIlk<UnCOLk<+77sLGNPZLVf%b*-z1ytI3D_BKBbu?oR)Xp&EK>byQoGT1D zml$$xG33lK<RDwHTS;%j-;V6S?%d33gMp+Q*~7rndyv5>fx!qlfIWDE!RQQw5po3E zP5LPQG35Ax`O`+58I8EX6w+MwjQ&{IK?-x`HU;?)3i6+njRdP1Jy_W+!D_#!`Pwu3 zc5~!e{XVNbHkgB==I@?mVU$D*Pi7x({`ZznWGZ&rEk>j{MkHhgwu|&k{8`9s>>O5< zxzS-1UKoWJM&X4~cwrP?7=;%`;e}CnVH92%g%?KQg;9866kZsG7e?8IQFdXJT^J+M zOc;g7KOC72lk?Dk-Uv;%!{~>@qy@bd+VHhQ2YM%TMe{#rp~oI3z0emQCjBsgJ_tjJ zVf4dcGJ@`gQG8=Cjy?gC(P1(b?cj<1nEZz_`A3=?7-NwI*oDY*-Yg!FRoHhH52e^; z*yYFy{HX;jJ|G>=LJ}lH3OFGfav&G-AOl>G30V&Q3<NRxhoMO3{?LQo3w<^g2QZF4 z0h32rJirLL8%FVsK|>jf4`^y-@c|z%PFyTbpaWkgbRA{!0xjsR(1y>Woj>NXFX<0i zT*<+-vxG0{Px;Rb5jE~$WgiPxt_h-ODp*-xH{bq}zTX`Ah7s+EW;z1-%y0{!FrFa^ ziqMOpB$3t+rRZf)j<4b%L(&0;q|FRTj0QExT7s#LpsFXh&LbPJ8*kA?5q!<a7VK8i z+wiv|JFq)98}e`$H|R#`IY1Y6kS;2LE($q-J$QmH>I_{Jl9z?F9J(m{W5{vriOqCT zTt9_01E15wm)ibG1CbClqlK`|0rA1+r=HWFSzzB8qSAMV(B~Y{*O|XMqHm+Uu$Z{Q zuFO?-Wss@ZX}8#wnPXQ5nSt#hJrjQxG8;RGnOknOCq?E-k$F;N{E8@^RTmVY7eh%R z3mPazFN1P?6;O#@1=Z24x}w<+ubpAp1@*HmyPyHR5t?qZ?1C2bR%pZ54jt&7&=t+H z3wrD<yPz+gRTm7P55iC)t1cKpcf%;YF&IamfXQf9UD1m?#eOP;ZJZEdCGyMj2ZcsL zly%H3|FRzDVSj7BtiGZ@9Blhbh}wUfBK;f1ITxbr|G`CCsG2cC&Eant+y5U$5K&Y~ zG*v-$G!X=~Gei*7&k{k<fZhmAw}~KVL2rdNeC^PI-U$zPMH4~LV<&>3FP;d30rWu_ zN+g0{1l<jz_{LxyeF7$<i6AgTj_RMY+Bb))^sVO0NA<nt$n*L-i|ZSq6wtNj^{pQB zW1JcCf_|Tm?PUs~n?mRfRn`IXi5KQKDu0W8Ba}Dkp=$ph&2PP+f7+3BnO%HH0Vky1 zWEUUO;VfiaW)~kaAq%oM5o3@Gd5|B+fChzd4vLO5pg{?kP#VX82IWuzm77>YK{eED znm+@2nE?&z;XE|lWI%%^0=)SK0~&$dN`SW^+cz+vZDv5*#ejzF#_l=DfR@03hU`Da zfOdib?F<7Nau|DrbT|G{<QVzKH#44b{UmZ~!~9Pm17Bp0Y~L8B((g03zsS!1Q}d;m z^r9ZV*!5zVdh@Gc977LNuCIp?aL4rf%n$hENV>vcc9p^Gf3x@Y@l98E|9_g$XdED@ zplEBkD^jEgND+`0<t7jn1Qi9fA_^+HI&s#G&63_FZQ3SH+9W6G{r!E@+q6lWw#Q9z z*~Tu}WRp!c*~T`m*<E8>uWL9uf6q_7y6>FM?YrORbv?e@A5UKAecqo>NKf9MbI$v7 zb3SO+D1+G;gV{KP8JdHgOT7hu9-7ZVS+IgPO@Ivw!M=<13W}f@oQFxTzzqUCyGXB~ z1WKW71?d%3K;;T0y*fd91vO9$b>~U1paB}8=>+K&v_LDgtsuPuA9O(HF48OLh92lW zOnL=<&<}xKq*pKigAiIldIiHUvSKdZY%OQv!4u0fT#0{jCVsD&eK$jX9uH~fLvA%^ z`@zN;^g_<07f`lBCcZ34r<?(12UiOqo3aT8`20Z#KoExaayA}9Fa+Ier>c6O6S|;p z4rlYB7yNUkZc?=br>a_^37WyTm*>c!4cg&DR|~vcEr2?x2W`8U|Ew~dJa@g+|6CM3 zt6nrk7ZF1wJjcll!woUg94CJNocel>BzkMege>@&wFNkm=&e^Uy8#;%f_)bQ927w@ zI1e+xfg3~$1Ke%~xMT)6v^1XXDS_@Ok?slQK6C=&B>fW+H*q665m+}y|1?hjgf?O~ z;cv#@g0}JjZ3*;Ge1H$_!0y~l_moWcg!ZJ+J*Cn;rO`d1{n$KR%8^9(gbuQOD14rP zpnHOm6>6n_I;>u*3AbD?wjNf`i)76>`lXb^>N`c?dG(|J_YK7AGu8yw{Z2G;FSL<+ znRyg&w~>39dD8&u#CD|`dET5Wx2Ol(OMMCcQnU=aJi*8efK$y0MjGh2ktqPH@18a? z1u!xNFfs)&G6gU)1u!xNFfs)&G6mofWV8*tJ;BJ_nWf^;&h6Z+vD?TDz{m{1$PB>9 z41oJ^(~QgjjLZOx%m9qc0F2B4jLZOx%m9qc0F2B4xYI*KeOBqO-&Wei@L6R*yMC_J z&J)9*RX_H(jrIw|4?kfcYTr)$@Y59W6BVOQY*#Ar!%rws54M;368s-AK~kFMZ<`_g zeI`gV=q~rnmEi~HN?r0?soy7-y})s2D4i?K<)Y#R^=uJHRj(4iOjXZ~vYeVLk9}{h zG@PExSugfF!(Kn0J8_4ISp1?Ag|EekdtX$?Yr|L1liF*A`9<~0DVEjqIB-9&+>3ty zJgMI!e)V})X-GdwPIsD|4$T@Rr{g&^(>OUDnuDE7y#;?Bn$HMQz{p``1hJuo*!JBF zfs8OkXfY#>GnFAQjUf;f*dFS=_)E}Iwl7Oy2xR{Xv@%{90#7moo@NL{YeyLZ#~1>~ z|I82=#}G&!*bY9*9niU#ArQI`F$6;I5r#nMqudVxd_fq9V+e%MT82OvraTg-41rNc zl+MgKPnzB0u_Ih7+pi3PWk=Lea}2M|la}y28UETlsr$w}X?Pv`oAZd^6JJnoHKw0p z7=+Ao41<t;fngBLlye{#p9S(bnUWvRJ5CZ<uP<>rjW?afn@;0Rr}4Oegchf80V$OW zNL;gVqXOGQy%&E8TFUli30y#8{|dA+o{LCTC%J%hnhQv1?I;(J#<+kq&IKg25xa@{ zX8bK^D<9C7zy%~ez=w8Vckbo_QZg5i(4G`7Af<8vDUAzAXg_v<`XK%Rbdc>s30y#8 z|6z0_o{LCTyQiqCles>F)~1jmrII40ks_gu*iF<o<8MJ*+ewl9q)0=gNN5Lk=Sfnq z)1+W%&nVYt#z@h|Nzu@L>;Uya`~&D9+lTz5XzV|Xju@#(-%b*^n<OxqBoNKUHl>mz zrjaC~x!4x!^YG`R1=v<re6m3y*iVpzLJ<^$^E^o^xIuu2|JchHC_zj4|H}A}mG?tc zpi?TT5K&*`%zSvpe5t)sZ2Y47>8ZNA=S%&r`7(UZd~PoJqWS;_8KWaHo{44=<JrWx zi5NGdIoP?>Tkz+h`SIlb35<^PY&NtI+fL8rpyw(=i&N;`Qt91}(7PR?cLUE}dN(ME zqj!U{wTzBXLAjE3om5q)=-r^^9J6<rcuea8qa)=;Xu{VFEpd#F(6*M*5qy+8pmQ&y zBXl2PbcEg`jE>MpxgP@ff-n%r=m?>;Ox|Ib@<`mJT%xzmm*zsT=q2Ss{Ka`<>r2Xo z_^-|9NOHZTjuC-0_4?Ui)dH#0ERcrC1=4cK0vXmVklLwY`cd_44mFw(GSAV3kbQwB z1T*Cv$i-)YJZ78u@tk`~;M`Lp=bq3)Z2NA~!DP}wv^a%yFqL#LjdTzd*dFS=_)E}I zwl7N{9c2Fsv@%{v2Tzg?o+cecYez{3$4CdqNe9tJ>?Z1)@wcF@d_Y?Q=^!8ALp!iL zcash#lMbRiDWrp`q=RXsgJ?f?fchZ*0d$b<LkXmV>_3b$Rp(nSKFYm?x@`-j{tj{U zDE*Y-nFaLd>74sBm={QMuK4$K^}2sdurZ9f3A+ECQ?UesjiY54rKR|m3p12wY+hMF zbnVSh@4G^Gk)ZlLL8V&AiMoYS8!5h;sebTAC1cdBStt$b7D~%{d`SzX?)HUJzlri@ z%3JXnC||QcY|T<9Uu7^Zl;#W^SvXAWl(UdSJWG9GWBO@U(>cp(I-{(ngJzGjnht8l z&Ovjd%ZN`_(~08{hk~_?qhO<42==|=-9pX2eS~l`v{2Ow9~$kF8SM@-+NChsp}lF0 zc4!}VKN`RvjAgWoXS72@iHvsWF!l&4=4Pu8Xbtc3XIvC{+3F>iTCR$f%AKhqWKyq> zw5*Ax(IU+11;+H#3~px`+(sGP(Cl#rH`I)sgXZG5aJ1#c@%BI{SWAuvHp+!y-%E}M zMTf}oz<Gom58RXmc<_0lB#s;p%GQ$OK?UW?IIYrYk>f$lIdVLxyFiWy4U`+9312g` z#F67c+gfrw@KNr7&b{P#(0zy;4|<P~<3S(gehA<T!ay839)#AC<H0cHkvJ})HL$vI zV_KA|?qC#yT9m2=yB4k55~Zp}eSTI!9*R=6)kiU?MX6e_ThS)`&1m0Pl&XK6Rgup| zF{rUhGIkHz`6{a>qr+@JG8Cl>vHuV{fIWx?@CS_)%)B!#hh=%PQ4>q_WNznq4(xO^ zV+-pN@#?igD5#Dip^Q@HQJ;^Ui(2rz`47c-6p!^rsa$B$dG1gG`^!-(2UPGMR`UOp z@de7!66{jc!xxz1r63lY)tj~97h|RNh<M)2>49)nEPYNcL!QnbEA{>2`?+fUU*9`3 z9^^v-V}liJlncSWmjMon4l%$TWPnR!fJ5C3b^`SzGr+|%z{NAbp=F5-aA*Z~WvoWr zGF!dqA?z0HR<sR&JL<#kKs(vL>mcJ-3gZ{ro5uKs_F;2h6^9MuS1jWf+Yh24_8&rr zu}4s`(4t;=nPEq)H2gYNn#aTri*j{;#Uc)tp*;2D94rh^XBnPG8J^JWafT<<jGcq# z;<qph%!}hp5)`cEeo(McE(H5tF110?AuhFn^9YyPz)e|z2cH*8;<(fXWox<A1{IVm z<1SYY7Ou5H%{i{MLEQzewLt^rMrgv<3@ve7YlF76Tx$a#<qqiF%cVBxKE$Oq=sm)v zHt3_=4*`5Z7>MIi8-&(!sSSoHkHk&oJXy*j8UD;7seN87%U8}B_!dchr>My15>zOk zD@lfT7D@9j7I6Y@5s$Jg;!zfM_)qK$i{xKDx)UpZF@A_4=aCOhkVuaSus<_H`UECO z*MHm$i3nR<puR8Cvg8ILDx*Msvj|w#Yi5L>x<P86xj|Y!eS-`?dxI2HZ0e;`lwnIO zx2fl949_eQciYsnw7SL{q`p}^ZBy@<Z#l`v-?>5RzI%f-yv5Gn$9^08N7!d?kmjF? zZ`+l#pJzz@AQQ468%$t^9LSwV%t9XILjhR928CeXPwEFnPz+9Rfg1#P=8^hA36w$^ zltTqn&QnsqGo*e{1GP{G_0Rx~&@_+15L%!W+MpeL&;gzM84RHtdZ70pgJBATA=)3! zU>M6_7|&pc4kj`fqC?oj=t%T}2vzvX#ZotWu{2z>m|MXY%WxES4E8+i1&jG9wm8(! zTpHfLSZWU}rr#^#neG&&_1`E`9>hJ%2sz3KiDr#6LZT*YGnx~@>I?IdXuu>IFo_0C zq5+d=z$6+li3Uuf0h4IJBpNV@227#>lW4#s8Ze0lOrimkXuu>IFo_0Cq5*lm7F0mx zyd<RolW4#s8Ze0lOrimkXuu>IFo_0Cq5+d=z$6+li3Uuf0h4GzUV#Q3(78W}227#> zlW4#sRWJCV5Bec6FG&@I0T_f348bsr%ro+Vjwz~k@IePOK{K>KE7a{G9fSsGges_p z8mNVkgLDptVFUsYgaH_Y-V>yQ&<Fj{30=?)Jx2a1uuf4~U8Hm6q;qKQp-7bl^04!< zvkjz!VBSkQ2<iATAd_+ylsQP}paLqv170YBQgEIi9RxQBu#b{1o_s%D)GGO+p4YJz zE2nC{zgU{z7VcsmtPK^bw?$f_;)oxeQys&B#^3;%kOkRb0yE@5?mT8vkO%ot09LR; zVKk%TK}N?EMn|+bjouA)VY^X*Us*ZJ8&BtkmL@VfqUG2XXl1mL{+%WL8zuciYsX3d z(0c3!w2=>JiY5JHhZeMz4`@T%v3+O<{?3C8pD7HVXipl$C+f%UL;LXuV(Hx2egGY0 z{}4K{>?S%At<1<5JJnGdUDaZ-$*DY9@nD?PCW|jPl?&xAE|lxMaZ+C@e&J%UnCfPU z&h)c1*(gnhW{uNis0rJQ<}m5ZoyTb@$b)<+04vy_5bXO&_n-)h!8x;o2?gmMnU+93 zWLn->(mgV*QnZXrs~oMsu8dXEz0;(7XG!-)N%zpYane1s0lN`x!rvTAy2lQ!Xd55U zj{2}W(9UGiy@RBCDWrR7ZyM<y+K1hb2Ji=CN%z=(5Dl^a5IT%K604+p%iZeS*}6|H zk%nnYq~-D@GCXsM)Lpei>Sv3udX#gfr~isM2fsibzf3+_z%OG(ZP<mVJ(;jNNLcYp z{;iWPBcJn+nskM?^Y`xL@4sh>G~Y-0e#(1jiw9|ohq3o9kz%J;ecx3U`x1Gq2(J^b z8?OgXphW#}RQMZ9r1k{0-=y{p_W2H5om%q#xzeSj>T5KHP@Lc`6}M}3({7ad%SB$P zdYK57s@H1`cit$?+r{romD8kWxl%aFl|nR&=X$bH6Sf)6VW`QCWvpaq$wTuQY6?&* zwhb-BZ$HRbnZj6!7N;>*qAqMVD)4(^87tYo1TAI%GPE4K0%bLH<q&499A&ITYsVQY z(R%C#w2=>Jie;>1hZeMz4`@T%v3+O<{?3Dpl_`vsXiwUo87oOR2gpeWAw+oyhUYL= z&QVU2E-q7^54PM$ua;8A4XtHm>L?K?S1()@e(gr7{pyX<^0gaf`1p-d_xg>}@XZ_L zXBjCiZ&3Gb>P}JjJ+?WG{ev5&m{FmQGNzwn1cl5CjG&NB*#u?|@tkO01{$lSS+z7P zIc6d`CThjDp@sPE2gxy0$T894G;&PTh3!TKeorhpCfk>wrR-mZmSb0-mC;I$d6pb= zlpGVS9Vf>`>#-ZqMn0e^mK>8ETF_QLpbc%u_Msj4I}eg$rjTQzJ!#~as2{rz?Z+R8 zCC6m@0d$c4L+B9pFgg-Vj@jMLH9SAp@X)S$uHm8mC%J})`mqN_xdu4KH9$0YmTQ3M z2>uW{gk2LqRaKk713zeWED!vkjri-(dhC`_E&;NC6WYx7KC}aW8`^H9qNttatNq;U zhbn7Lx>3OfPSl01EL&a52TZ&=hV3h`EAf}1<=C0AT*PF>>GWA##6->5IcRnQ7ctR1 zmWj-VT>KW)2FhZZ1?Utj|A^%kN;auoD)pC&`zyFqAKtK3>NYNwhRsW*Sueg+rG9$l z#|{V#LAi897CN>(G@p*HAXfbQ9Q8LMEtyLR*cH|4D>dQsH;PTw>b+56xm0Suyi{6V zS<16Z{9j@IudiVrUrM)8!y>@C_xLsu@tiD*=dP}JX}Ba_T6EZ-ikIo<=yoCV0z(sI zQ#OG)B8r&hOiyeK--I(gXnrE=CZkqt8(N6pevoyOQ&=|{El%TcA=HKKMg@LPYz(;y zJCwx65VSD_Z45yhL(uZVZZ@bo8$-;-5VJAFYz#3QL(IkyvoXYM3^5x+%*GJ2F~lry zeL~x@eP{>%&Vw<;Yz#3QL(KA~DAbSLhxX$S#9kjm(8dt7F$8T4K^sHR#t^hI1TEik zX|3{<b7H(SZxETa$^%Tb$|77}jHju`YSp)X{P6vezZOFizn+a>@Bfh*&+<CvHQ`{q z)P}?pb?O}=P_NvL{M&e`e^-3Bo|(@z4eGhqE4iE@ewnn~v`mIqER(ub%cNm7I-S;C zBX%?>=NVpDM#Bc1)Xx~h-OHrbzf7i&lDmzOyN#2(q1o7`Sp*?neGZyS*KbK+JYp!w zM+>m6)Z6eEqIQM?M=IkHLqRd>#CB2d#xGD0+j|ojkJ!HyEyFIK#W+;K29<13HOhE2 z#&|T&c!btr*HhnszY%R>hvo#vBR-%NZNqM--iN;f?Mz`jN@YAsV?088vHjHd;qONS zY#&TuJYxSrG=x1g^D+jcnG8rEmNxU?j_&+2ssFVoX;z-=YEj=c)o{~I(!Am(c|^(5 zrnV|$%z4Hb$b>A&<}{3H77xxxPp7GP28!!i7OrRIaa}7PEx@)?Z^K`R+PMzqNaZ43 z8W-tMC$@`vH-3S7*xsAKMLPB`Ma!_usjt9aIcqv&%qSP>#<)l~&P6)34!fQW8rYx_ zZDNP!1TNC?0j+2oc02Vx{2gd#3K!{8xk#7BMLM(>+fRKT{(dyT_Q3=$(y{*_8p0l; zei;AAEH2Uoj+0f9v<#dkt3pHALt|uJ<78cEr<JVAMOH=9)Pwe7`>F54-;Xx2ee+4O zI`(fx+pycI_u=nAtEQ3F%_6IdCaXj1u<NOBz~5-3;9&>vNwO+FpcE~`E~mbN4JuK) zm8{A|R#nbLIMj*lqTY>Ppt<2GDhuSDBCCS}_P0`R!(WJ|N0QY|BdbHRqPYl&ny}62 zl$;BE$kJ9NB}=)9<LF2$_kD+2l{=<eZ<1P{_(iLF<NWkddi61S^>KQ2G#lGAi<t<$ zRSudPFTOiZ{rxK}-?~YLPu?VTZ{8#g-=(}!B(<rp)$0EJCaHf%Jkq8-k{-QW8e*19 z%e>_*<*=Nk9IzKJm*yqno9#TyGuNlSim>EAgiOeSY^FS>S)6@|=IjfaOCn%NAj2Us z$VUsXt<>A_7ozqQGMrR0oHQ~V)QRn)-i=?N9=7);km0a@DO!eIPJIRb%2`T=GfIXt zMusy^hJ)5&*Rw$b8#JO#?9iM*hQkN6qHWmi)cf#vpq(jXIH_bfX=FHPFSeihKK%V? zfbD|`WH{_Uh=#C-s2|2ZGVAh7RJzA#mIE}?6U+How)m9i^6QpMZ6j4p%NdC~xpb=g z(Q>IjOFRCAc09)pKgT<|oDT3W{a(xmxFD3rTp%AUV05waJKFd?3sE~sk0X`jCynF> zbz-}ycjFhRhwZ%y8s2Tr2b3m|R8vuoqk;`8(W+5$>M?TaadK+34!fTE2K<d^Q@nU^ zw)zpVs!Mrve%=acULcCP)Qd!*Te$~#-wOWbh#vLLS5AxoM^{Ko`U)A&gni4!h92eS zjq(*zTP2?F;b!1ak9wh2_tpxj|GxN1kNOT{`q+CB8RPFoWTIKq2%lNRD}ARK&B4x1 zU;v=+%tQ0B3#hl^x1ojD_EZLdGzI{)7~4s`3%?r`*q#Ii0QN6IOR>wSFUMbjR!&nA zk}(E=aRvaimJRA~)U!bY+KAnhzyQDpw4klnZPd5p_n{rwov91}X$$~p4|Xs0e*ArC zKXxF20f7An&_V1F^+WiF(UECNLbAM<d86UlmC_s~?(bDUGBdnurPSTGQX1}GDJ^?e z(!gQA`XOW3O0|8ZG%7=F(Moc^DSxz*%4Mo--kdzKl8S1^AFZUq>vL7tyuXr)imNMs zSV?8R%C1`Y{z@t=pZ|YWQdy>Qv%Eelsi-_Gum67*Qd!5nSSlW&QdUw?-S$T-spwTT zs%`JDq@ud(k5*E-N7bad@BNijRD1qtC6z~1KGmb|ucV?n@JB1DJgw?gefs^CRCxPx z|67v_sl2EfP<@_-RE(^oqIzj^B^A}k#Y~cjixDGZ7b8OG(AkR-1II5$3}Oe+;HwuS z`l#>6?nV9hyR8=^dayguF4M&bAN3vBZD>3Gmg5&ATCtnZX7+EOz7e|)tv9kk&9sXV zwNzB0)sYt?DsWU{m!akOOO9WRD8=@mUOvE0y})*&F8oE-ixI`xcGO|I7-6Hn5W4`i z;?FyNF(My37qzgz`P{{b9LSzE9&w$CLT2PRhsk(EIu#j;jxl0fh!i6W)fX<aI9JNB zd!^KRMN*&gbnD5LQvV(5-Xw6ox00SHpni0L;mVt(dDhKRclFKEFz04zx$b5ej>evQ zvrH#<&A6bM_&Bj9sGP~)bu;nj2&!)ofkEXOTH(!7TO`yW?hCclvu7iFw(v=9e3I{G znHj+_0O{vB(Ew)3IgmY*;~MfP=R@uaj%%<{E`$PnRw$xe4EDnu*Wjisz=_WVC6r6S zquh4vUBQ8jQ3+)@%b|vHEmWQ7xP}JGjZlZL9$F~3LemP4YnYhGMRHt^Z;ntVb`ujh z>`snpWnwomk;Crgm{lfr6B9Y?K;rE8XLiFlMyLp(Lu??{hm?EMzHzhEzb;-3sr!t& zTUJTK+EvnW>na&eTqV=#W;5t+Gtp0=&*Q-5YyDBD_X+oTi~qFKW4VLdXpxPy$YxqZ zze?&1tEB!;asQB#HD_X*@aL?OW{WsBqP}0umCQ6Oudb5EUc(V)$FHrD*)+O|9@UI0 z=dRMvl1q$|OQ2cU1=L&d+fe1))d$XPVLPdJ;di6TxvLMH+rloRz8rrAI&tpm11GoG zpbkep8#JKGxvLMH+rn<6z8$|0RnA>~;M^8=FZF)>eW-Gs=L6@qutU@j;U7jvqObUq z3qfokHcMq)OihB+YDJ4wu2ybLkowKy+LzVW8V&a*NXza78Gax^>K;t^XA3*p>w{x0 z?X~S?<rc~G1ZmC`wwKjQMBvNH$-v<R`nd5gt8Xz{en5Mjp}l^bAPqmE{eH%_`~*ra znvI(HDJE`>!A>82|5h0r^-3;UfLihA#nZ)MPvow##258SF6u;G_={5L;;<)j*Mr0t z^-3yQrc6i5*`OqzK8}is<dqL-U;`x=twZba*NoD|VOOEmXNfQBm0Yw9ZO7jdPZx(h zk-M^gAN5Ku+Kc+}Puz5aJ(0T}B)+I0!46HMOB-T?iMwv7m`Gmv0CCqVOi{uYiL>{_ z^RFmzcI#@XPZU3VMg5@B@Z4&it60sd{;Q?#bF2T^;_R9a$C=^CYH9wWxc^o4W7Bj! zt68j)7V2M3kiGhv`pL^IzoxOqR!i+~#k#L>pytxl77ph;^rP-g{=mJ>{I=N%1Y#l` zADV;hVLvbXm!KbUZ?ftG_cpV=ll@%m??#1k?{X76HuC{3=tn)CQ+1L+J5AuBwR}Kb z0)fZ}G@y;xfmsZN>_32h`0*S9Z8u|SGJ%Kouzha=fyn-SX#d3HJNJD>xhHVdEmFT) zw0uRm4*C2o()@zBHq1o}OTjHNY`aD39JfeA@h$&s0rpWH*`Ba+XJF?ogp@U`tjh5< z^=(tbzqmze&)*^~zq&<=OTW(f-#mVld=B7T)G~{qkVY@WE<mj`ekMz~WnrhI87%2$ zralKd8#UoCi6)}3J*b!cE2yu;E<?-l7bVk`VcSv1ZX$|$f$c<H2}D*)G!aEb6WYuN z_;7S!x1sHPK+S0)3cCueK1oDT--um@*5e<DCZezdXpsFU9$>}};UB_3@dPw>XEG7C zn~0*mAG;UzCva%$7Os(oMQfyG@fsOkvPSCS*GT<M;`y&DG4{+FX-*M8q-?Y_uaU=E z*|wc+JJ$Rw##G|b<LW)r4L_ni&eC>2T_d&sE{^@1dbRs?Wq!M8t<*0T>t5&h{J=#* z8a_9YXq!fVhUQ~u^MjZ;?9FHncKWMK)Q&SzL$k0yaFGz(!+u`&FF}=yh)(u%vA-J? z*!C?%+jgQ4ElwoLK5&tc4Vof}Ha?&QZN;vOC7Sqv2DA~o>Q$ocIMIjJVuz^z!wogq zfk>i_{RhxN>|XqS_U}Xcu|KdFOrny*A9!7vSR7a@%}<Dy*OhbKHETIG=bYdoLip9S zQumd$((u)_((?7S{|YUYc<O{QpLlz%)c#1=PAK!RZ_$3bm~~P=PyGH{JSKF_8+2bb z+PsjXeB!J>ZJ)<)qFe>YMJ@a`X8bwW*{BJB=5f{#!cIpsUga%2)K_9pob|_F!hXtC z01xVAe>Z-C?L=MpCsv5Vwr?lEwh&l89350lob_jemT3eOb`#pn2Q=Vs#I8f@@z)$D zps=gZ>R0Kms8`lom^kZ?e_$E`g&jbH?B9pKAG;Uz<L}<i0Eyj+c5S(o)hm<NN%KDO z{2NM0m98VC&b*;qJ~_2cYTsHXEvMIU6?&c2{cs)2GGqS)`)BL^(ILiTdVI-`Ij!~& z&M@cGf_#BLe_D;7^|_PkZPP7R-pYzpx3VJDtx|ujIQAV*LC$!SQ;>%HZ<XdfV%?kS zb$8nNC<jM%5jwessVkZ2*v-fALwHj8ThoXhv=qCH`f~ggXeD;KpYCjk?#zFQ`0^2T z!eyNa{Ca#~cY?@_P_MqM%1<;55gmT9@e5pKeBn)H@OIwHui5;j@&MsCZ<X4U;@VT3 z57oVQt29Kcmln->8ID~4kNze9^oRW4c&^~m_0n(|Ei#oBnZ91?X0Dg|D~0Wp(rtWC zz4Th$W;W2X!8SIygAKN0-@RU%cM0`d%Fu3IFOL;c=U|)S^|FYcMp*~ajym|M+{ygT z*iO{NZ(5SZ?~Lt1y{Y`p)K_Acq2>59hnO^Br=uCHduygX2Rj=z;m=E>m%`3PEvfV( zHXMai6rff%=uYG)$L>VC66gh}@5k;%{rCsc=q0cNXfTytg!&Qe5ITgvW{6$}y9%xL z(+g4Gh+T))<8Mi$m%?sBn^T!S`fzkm(T29Ofq3aH<-Eby)=TX-#1EA=(p{D)^-~jN zcv_;=%}A7nD-xyUD(u<VbFi<&j>e9`o}c*l4ojW`o<-wF^JAcY#d-U`a|g4M#^*o& z>(ATsCDTu<*DMTgqCNDq&$dKqzJu~zly}fpJ87$Xuy-d)@%9gt6RGAzj`y{1tM4>A z`6*m<9Bx$blh|Vk#CSR&v>3aL`f~ggXeGAiAc2@dV4|hi+0>ixo6#KX^m;m=b~+$5 z3%h`ND}EbVh@E?oKxF?sG(VYu?8V_{gFdvM4LV~9#CQS|?ZFOFKZJi69l;J9BoO(4 z0dx?%j{1814QL~FRXu^&PGF+7*lpCe<M*K**i8osMD}k%Ta%UfQuf=*1mKlKseMhf zysey2&?HHHq`3Bs`oV>UHA&L4E=h*hCrRCgqz?z%-p3-u*Q3<$h}<)*<Pi7~H@WGK zvVXex{g0G+eJDwqhsCo$=7D~<f&UFA*Jvt=qm9814yZ8D#=sg)tP?^TAEJ$6@~z)y z>T)0(OpwW&!LwlUt>ZT83ZVe3kavhShFq{5qOB(vs_w<<hweDq7&@T~CfBJB8EA7D zI7A!6<XgWRsB45esE3+n+8C;!8hi%Y7}}s6S`N|1&;-23g1^4t$LhNxE$=1~ey4w| z3^!{wNbRj+$657~1-g4TNW<<8((=Ft8Q!~r73DU_Ke^FBycVTCFYIU4i!Temz!sm| zAoWMYmwrn3`^%pyvtIuOY3>)Rf2MxK_@RYD4%4bo0hM5%NvlH<6vLk`G?KfER)ajq zhl0bjD%hY9(j7!PWI`5XQ)dD*<Q%581G|V+7=S?t9i~-b7)GFTCan(L&;z}P`Bk6~ z`k`qTtp+X73T=mJRq#OvR5@sMsDWCjJIt>F4bXV_N^Zq}hbH<D8udJl`s)p}dHv6n zHeYd@)UFb< kYpnL8%sek@9mN~wSWsYxSnd93&y3JWGf9_OE0bAPG(s7#<U-`MR z8kg@jY3>xi|G6?JeCIa)i<EcNHyI~a8}sa?w}6Txv^kVPITX#I&A|>1m|U&Qd4%2w z^7hi^kP8;D9ih#k0IZPdW!Qss$N)2SIgkyeBeeYhBa6~I1UTx0=x{1+p7!6p3EE8$ z+Y`-i@ahW@U1%Tve(HN8`3=yPy|g(rK{NP{(B{wv?NH;T&7lgaq2UN^4s}p}M2YmW zcjyViOKz9ic=7#r)D6aP-tAIny<Hj#Z<iLw?J``9?Y>>6bCElPt1p>cg~>v*u}yqE zGhZtQoj7%yK$|4eCg?;0LcI;Ya;kUY)NLwl%GWMNopf$4>fQJS>WSw!P|kmp@BvE3 zF_D4b_+)Q8x|RchFQ8-`6B!6!VB%^ODyl}6*MHR0R!!nd|Dp82KfYb+e<D`?lIeh9 z{zhqDxKU0_dFZq&`1>kRPcnZOKX1wQ_oqls{$3aLZu|nZV>=>gmua*MIx*v+-iE&r z&1HYf7TT5ll_^qoIk$UJZ^oa4rekNAxTz!4$^mZTW(#&0!ZFMSBWM6SxP?E94^XB^ zz5IuM>ih8bqn+4Yk;K|GVhx>`@lfx>-+?x<fAbb%js2A=Qe8Q*Lwy7OMzjjM+QjWy zHCBSn#9vkSOZ8KcmPa>on9V$|er86vVx!bnZ<LmrjjWTrQ40Hc^)<#%D4_maJSlg< z@WsdY-s604v<|zT-?<^0KMb9?7LdRnp2#1Lw#9Q$QSZaATsxS!7LZEArO|L`Zwd`d zeINdQG{E-3gg+Ng%rPy!v`61YmVw*IGH|p@;Md$&{U-ltb*DE<{SU;uzgEWP>o!Sq zw0QbA>W#*V{X`X%?V^W*|6YLo56ewWK7ciO?e`DYhEu0TD5HptAFPn?S+JWP3eDR> zRBb29uq~(=e-8E8E~CnXW}1wuEUQtKj-6p;Etz5d^AQN~pAW(0>g<!FNFTm_=;c52 zLw5v?1CyhO4_^ngv2Q!H+)m@d<S5dBuMz4T{1{M^!H)y)k0MHs)%F{;Q)_V3CLW<5 zBmc8}X_E}UvPtS*+r)*=O<d^2K92qRChpj!*Mbk-Wj=?$6LRDDyCH8a?EnRotzg4f z2=+r9oltayqZ6EzUEsziz!OK0<&EQqSj*uJWt7XI0%s*uog>Jg<^q2Q)KRX727Has z6vxp8Eo*59XrtT?K71X}d5EJEx{uI~&`a45efatz5Vwvfi{pn_%i#?lynCP1A>X7$ z-q<9MeVf*Klk#_IowsP6A8eB1_5b1yrSKJ-rFNEB_FJV#+qGHh?-dolW!;4;{+nv3 zIl@<mI?DCX;N`1BQ!`&3T7rCaXrtT?K71X}d5*6R-52=k&`a45efU251;2z;GaVQH z5ijkUeu%COGFdV&3$iJjz|00Ykjusv$P4o0K>=ke*zgsC{Tx3Y6kXuQ11Dt{xbX?_ zG}ARhNs!+T$|#pZ1-?qJ5^KG`RTiEr+$_xwG5vxvc{skAUM=kcFG=H#Q6KXZ!<@fc ze0^Ma(TSb=N5|L4g_r32oqDx4{HZNcJ6-(hcS-==u|?|d5|3S^vkD*HB6Tlpk%kwy zNXzH9$nY1o$eP0h6I5}GS3?8Gcq7zNu7{Q`f(e?c2`=#A>wq@O?a+OmV1mvw1Q+z- z>xW*-ei-QDFn~Zc0TiqznsAOli1H9*9%fDk>HC?Jff-*8WK%XlUKhaxxom3z8@@s) zplpSr^8^#v&k$VT#wWl@*##wC1QU3wmlIvpL=(<RD5G2sx-?qENQ-39B3ZPM89R52 zH0OzLUsOWu*cL);%eZp;+V61x9=A#_Eo!|CPti;5C6q7K|1<P#*MD5-(dzO&^#QGJ zlV0k#xRz+1zSNMRm*y<JbX92<8sDGd5?~er462S0a8PrI0ED(60R=vM9neI%8CseN zK<K?dKtUhAe(0p!1>NTeKnMkiCm6;z0s+cF7-%K{n+d*bdMp!|appie<qXKA2g`y2 zHnxHdUm@gDwm@Dp0SL|u1QfXO39wUkK+!n@5XyqY6I9@<1P^5|lr$57&GcTbS2Wyh zI-)0<rbTF;xg<PJ^r*H9V}vHosJmt>FC5s)3kSCH!hx;4Z~%J&cI?)Fg_cTuMWb1$ z)vezu^-1Da8V!@MFKm_Om&9XJH1`;Ft+Z7;ZPh{Bb<uXcTV+EAVFr`gZOwkd45|(j zcHqO;0c~A`9kf&vX3#{r8T#<`L+^RQ4!X|}X3$Bw3x@HHK&Xq*8tNj>s);iQP!56_ zXAWc^X7w4!+|OATNT-|uHhhIpz_wP%t0v4Km$C)i_yjo56FE?HhA;y=Wd~H?tAw(y z7~-yrIIAYkz(d&!hCW&(u$5>#IYmSI{-0Z=_JYtyYB*JzuubZ3*~Ys{xACshZM>^= zo3w1ezJ1%jf=wk}zeK|<<FnhOIaMsVRP*#Tmac8`SP$*wr=9v~ry%VX+9pNkrSxn~ z9PG`|;w6NjjdDBqIQToD%0S3L&0azfCQpMk;A?~c+Xi94O9VlP@(>K;8-dO+AqU;3 z2tnwj?1w&l{XxPn*FwlZo|h1U0?Jmfu|XlE8|djEb1yv|WK%YQ8D9=~*wzarULpv} zD3?P8zDlr%2{|Y_MF@hEvJ2e!L{OQRy>Y2#l{Wkz+obkaV!DpQQ@6|@^~=RkorcHC zFVp<fPt2m2XY4mf!{Y{NIcSjKCk;~fH1?+r()_IW#bp}i87&5m)#OiUm}mR|_YZOZ zn9YB}=07t??awK{V~{Nlq70fi_M5@Sq22*)l-r?Z7f}XPw-a^HfUgniDA&V)gD8VQ z22lsY_(mW^c?h~s5K%oR2)maFJLtpN551KAkmn%EAUA`k0~@|VD4=YG%w0_AApLeG zbYRAp1KE^KP~srUz{9p)sK8eVWt7XI=mhhO;uD13%Y+@caSCu!c7b?fs)l*S4R=U= zoJg9cd3uK7;2qNZ<Q-CX=niRk<_;3~>HiA3iQ{s*GUcefgTRWuTp0w;+#$6;7PiYZ zTzH&88?||f8t^p}MbJdK8Conv8Px41YM=pMBUDkYh8hD=1|cs|1H<@6AV4_?0~VqT zdQTBF(1))dIw^NScbL8`Or#Zfi5jrsEQDOj7Ra*@WspsOWdbw497v~}0h#n)Sy0Bd z<xqjI5<HZ>P+}p<z<G+G0XIGYcFGPY3KL}Ei8h*{OfBZ#DfRQk?`J4ei%0I1=10Y| zGc|XJz!m=rq)#!ID7jNw%I+kLPG6yf(Rc5Z+S6jkl}b-P=Ps$gP7GhAnQin{6GhPG zAgZ7xgD8R~%FWP#uMz5Y5miugJ5dBxl&fJF-w1>pL=_BV5JeE69E3i6{m^@YsDkd7 zi6ZEv+@<teHk^e}z{Xa{%OHv%m$C)S_;Mh77jNi-%-gx!2GS{KKn1=^C}Z1lD9Ip- zz(d&!ZhQiqCx|L2dYLE!J7vd-iD<e?nL9jimo)Dc%VsIvy6Y~QjmOn^jr;6>1=B<u z%~oa$-?~d`zb&R;t;En3+ok?0arA0swr~xN)LG9D+RhJ*_6+fZqkil@wEr{#ah8A> zB|y-@aRLS%!X8FP@K?nVP*Ag$0E0Tp_0YhP-v~`B7DqI*eGA*SvVR-ej_pG`PDfm= zvN!XCLQ#+(7@U+{&0_k~>LpjF$I%5r=32TS$fj(HyF~okotn67bz8Pe!`AK6V%RRj zcW#%u9oRd!OY=SA7uRTx84X{;_X@sa_`>+UhVMA#f1`Y2yZqRNtfGnJSRy%|$VQbL z6AMr)_FuWW3Aa75gO?pj&{9624E>|!O;lB9X*!ys2Cb!O{=Tc5%;&e`KMe8(2GBvi zKnVTAor}>NUvFNgc{I|Z+reRz7^S&EJQbzcJR`hyht%G&Lt5_KA;a5uFvLe|<{G=( zi4Q*=0NPbge4za&i5Jw5Juu4uF~<Li2G0^7=m`E0I)q&l&mWmUyr9*w#0T1lzYeX( zZW$#$*uM#FW_usnfxiuHKTC&G)J}Z(=>SkiJskup*uaUpuuDd1Wj?@zdij6~v=V<A zT8^C=&txfq$r73o%VY`7!Jmzqu=Ch1pZ#-D3)|b!Li`1&^{jF*y&0|HwnOs{sm~SH zU#|>buk4WKW5RYlW4yecM*0Vb)L)7y&WZBH|8XJpmm*4Oq_=5@NQ?SzLMk#w8NhD4 zTWU9n7h_0<d;P@95V3>yA19X3F6?f!r=3_CBUZ+V9drbL01aXfpo9DjbqT~uBC&%u z;;%xhv1?G}NrSdAVukH}Xb1i#v>CevZ5<_soPJ_uh}b~|8`x0?b`e_KPOOX(E9_8# zR^s=dUhERIbd*@(iEdLOQzSG8e>$3horz|}b7G6_tZZ*X3-RZo7VJDUf0R2kb;i4; zK12M3*3cS;?v~~easONm&wc#Y-O_O3ZfW_Q(s1GVn(O`*ODgf>`I?>D@Y<bHyI!nY zz}W%a=XXl|7k0|<7k5hC(VfzezEfH<chW^I)GRjk4G~ta(j}pOKVgOT)DvcC7xwTt zVTC`04)Kc&po1JR0W^p`@i+ngx<tY(fv`d+9ydU%v3+dUfxiuHXZsekmHnI0<}t#{ zJw#ZsffIH42`jX?o-jil*cIc175*}`oDV2LOZfl~>cuv*>`V^+Y}Ay%i7YgW<!92- z3~U?Q72+>It!$r%=Cgk;Y8jh2`W9-q<)d$>GzY}&SY=N1o1FxdDOSVUnu}<ofASuR zXFS4jyQFUEE@@b{OInuilHrxRq&7kP=OSfxba<E4e^xwkgNBzT-k`Z=hM{AZG<UH> z4?FmG5oqtKe|S9y{WK2xNDlgFCMBo|+l=P$ONbxepp2@cyJ&dBV$I{z4L95)Ephko z-r#$rcA5Cn5{<I78dqnzkK`j)(Tx88J^A<_7%5EaBK-U5+o2x<^Ejwr00tohLof^@ z^ORV9XNfYLK7EhWe_E`(QDdH>3*18kFV!qJ4ushW0uY2@7=aKBLAQk?8#<v2`k)_r z!4EBAj!bBRX7E7=v_U)6=s2>W3aX(28levAbsWhhVUA4jfEOyD63U<)iYy%2U<U`d zL4XrnkQe62gj}$I4GN(EtdOZ=G6Lz40cOa7Y%uB6EPwp7r5f&qI7{pNR9qIX3>ORT zmF8ISg?LSp2rSbqHCp!HE7Q*qMvw_vkPRj<Lk{H9+7`%zd?)}b*q{*X`}zLxH>_sM z8+Q1Bsx!18)IcrNK|TD<tJ&`77lgkYMwcjY6rnmM&Mi}>8I|`+ZH;*FCe0&P7`}6_ zw4Az^mn7dSb>F{N>faXMxJh%pF+}P(1j8@_0SLkX3_`Di{~P+CA3C86x}gW!P7pod zgAQndW@v#{sGG^(4h_%<RZtBzP&<>TDmy{+Km}BS2fR=MrQmcBJ>Uibc5pxu6hpxY zem$^3A>@Ju@*p3wXVMda8FC;UG9VMOW-7~TzH^i2F0CQzK54#QBrVrGJ=OByee&4D z_wi!j`&iR|rG{4!5;p%sA*DnVb6CRXe<q|Rd#2y4)UZyH=RT<|6^m|G#)a?PC-tX< z`)1AjDdBf$slX~)YMAtP1VS(b126~y2tuEiID%gALpSt5Cv<`D6mbM?&<-uo3Qf=q zlQ+uL%^|L!25O-Ss^`#B6{m<JD1&k+fl}~*7u;Ur2%O-8A}9tsIKXy_ID!JOLLTHp zE?B@k=RIx}dQX)NCdh;=NQaC$SG}jQOy4cTGj>bemAj>3)^2IOTBsA0S;xJ*8T<|> zXts*LYOdRa9lNF0xm#LXY%kcpWcMf8b@6gAdWteAhYI*)yDlD-|L_pVPxH&wnuoQz z*LO?(H^n`-Xr8!CH}!sLn0~)B&k)CM(JY()7Z)o}5#McAXBn&2Tvb(bRaMR1B5Ljy zQLB8=0i9>m+%2M3bwdyIf*<;z9|F~CZWd8<QB}=FRW&z@sJU50%?Uy^CkWN50@bxD z>y&F$wo4bO3RQDexihX%Sy&q=?-~|*jN&Z~b3dhe*tAfmGB2L3%2|4qDt$Tq$10U7 zbIo*Bmg-Vf*=@I~$~Ui7RczC%!b|R#o}DXI-usrTO7^h6w`z^b`REdri^YT7Pu-#t zpH5KOpIe}Eyf9By^!a6~;+K}H#BXoWthv;3*ZuO?j{E7O)@g2=lCEN%yNl~Ii?!jr z`=!<@mfWfwrQ!Rf{%fM-R?YQOEN`)GV7+F&F?~O4W<e%oK{lAc3^_1)x69;J;mNyQ zCQooq-tAI!hJy#3-~u-Y@KkfKK?#(?<jpSseS(wzY(Ix9OkNkSqg)RS&<IT|&DUJd zOA6X~Ndel%lS1vN54!{HJk3iA&hnCiQC?Dj_Kx$C0<;gi9}VCS*7K4AwjV@8>_3DK zV~^C&xLg%hKfrpP53ru+1JbM$+C=3D-SPlOXlkNnlL#bf7GE9CdO&K;5Abf62V~g7 z_62Njd!WC+S*?x`7m_q1msu|2`aPRPh+`WxtN;5)*r1s!=H8~cN~=rUBlR1^y4y65 z&9$i5^u;|=cVv$=d~uI7r%^Ui&e$U@S$p^uH)?J(rYAEz9b|Y)VR%Bb(-^)`Gj<M| zi{DbuVsPy&28R|5u^1d`!!AVar&$c{42!`*F*v~mZV=$9W-&M@fl?@ga;SjHYUU4O z%_+^iwbk^t&;qT{2JJ9;hQ0F)y)|@05A=c``p$@=c2+jKxKR@oX$kJ3IiojeuA8z$ ziOn^el+5+N_DJo!!nR4XWQy*Jy~N{YB_0oQe55dCL$lJDvY{qyGn!M*l&zjM+}c^g z4b2~74L8(^Z9@z3+t0Fw+bC<ep~d5@;fA`f-KfCtsb>v0wl6_T*}n`e$F4vt%awJ} z53+_^3TwEbwP~#3hSp;@pp7i&+EmRev7rT8p$*!>2OZFPhF4-kH}pU+_@NK_AyCaL zv0(rPAp}D(3?tRN61&jF>#D(O;>~1`PdN`P2YGWD<m_e<LojV&QA1GPaFz)f_|gx4 zIik`Psj7f-6HD1cDdiIIo{dy_K%9(Jxxw{nq{<1!l#9TD&u-+O4)6Pm_bachZp&bK zM^st7zxiM!Paj398h1yk8lZklBu_6zs%k0MKsCN97-4&5@%|w+#QuY*vUq<G4V+>5 z$W!kx-tR|yUuJnpR9U>g3)ObAfqsp6ce665e_^jQzbKyGqG6>}y%I;=dx@h{dJXI6 z@BiQ4Z&kB?KD0n9v_U)gpaVM3uzo&tLl5+VANrsl0@bXa4+Ag=AsB*TVB{g1f2(J< z8vf;8sr|LMW2@#)_3FK%c&lcfR(I2bQolkBZ`G_@_qVr`_*Iws@y8=r>XQzk8Cswf z+MpeL&;gxiSo9IPp$B@w4}H)Nfoc|ggaH_Y5DdXEjDWbfRTDMaaNt2!eSA=cpL$U0 zo_SCjQXZ7%&xn=Vl<v6oL4Nn@ZJMh^=nl=cfA$~$Dve6_BU<PzE%sAd?A(J=`;M4> zr!vSden{$<2=kqql~cm{hv-!A(yY41@WeyX{M19->G2SEdORd8&ptG<+P!<b=HW<7 z-9t3){_UDuG=_T~6k@w(-nEwRJS4-X*x@a9_&z(ljs3%ir1q@%<_^swdTA2#6qde6 zv(iY+Q4_Wq%_-+4+4a2qhXFPZ&1b+ZK&{v|o^>h&`#D~11VtBkwGlWeyTFZ4fTx*P z8$n5sR~tbY<#MRNSLwa{Mpe}zUTp+5M=s|UbY5*lxgHw4%o?DnnPCfBg5oj1`hlz8 z<!>41Z+VZuCE{Tj);ugXD<jd=yE!AMoA<EPFAy8=CgU^gdsv$HKP+{RKg>9Mw=yEp z`DdguA~6_bq1o6b7b6m*Ne-IZ&WPk^L}E0_M+>m6)Z6eEqV`cnq%lUMaYiK6iS44^ zjbET1w)grOk=VZ!EyFISz5;)xOBs<;7?DyLk<u8E&^qjTj>`rw>(N6~GwabqOOW;G zp^b7o`0#Z==Q-A+hwclkM-RP}{m_T69|Fy+M-Kx*)}x0I<slfxH{xYIdQ&P(w?ht? zAp_DO3o^m#pQb8+La>1aav>k`z%@2a<pcq4aDW|(p$N+T(^O?p2^HW450pZQk$>t_ zr>W|o5gMQxs-PBXpgqXq^<6AtZ(%WeRC%v;6WZF$V)v;30*mLLXAyn;eP|bUC)#t4 z7iObFY#-`k5q<o_Xb?Mq4zhpz00o2PVQI-HZmdLx{b6Y+!ggZ2vAx)(*yRs%9L4R_ z6pGM2nr*An4>3YP<`HJrkWJYHX2LTEa+^sbAumW;2?Z%9BE&mg>aRyyu1V&TboVMJ zZthQ(+P&fn_iC=!=yDzw;$F?XDTbGlIk4~3EWS?XN|t(&%*!p3rLHtt8p@NUr4qY3 zS@d_QR~q?zE7<t|3c-Gk)E0^^Fk1&FWfx4Q!k%Vk>rfJ8whm>K%b~(cQV3Otn5;w1 z5pr$#!;>Gp*{HF6gQ}^1gQ~fGgQ^8>9onGU*+!wA3Lo0R2AyX&sJccssJhXf@eQh8 z)Q{bV_TvxKZ%_r<egGZh148H!_Aokv-@Tieqv2c0(tJ|fzgxL9H0=>-xcm`mnfVB9 z_UiqbTN?vi>InA(9M|EyACcO7#Oeo>Ya_Ntq~89B3_Bi?Iw$+M*~jw;$M^%9ZP%vL z7Z|=w&8uwj8e4poEx!JUG=D=}zen@*)t1Zl$zxOZk$vuymKpnG_=<f}SC=eG_h_O; z=s{-d;cfe*_Rf9s&p(>Ntli73y_s1%<OP|vLjkjOtBYBCIkR@uKFX|pj9L3Qvv$;p z?V{d|U!Weg_xhQ&4>4;;%dpE`mvFX&4Jz57D&-Q650-FFyM*I|C7iMAUF1{c^o(dz z7d<1iKr2k%BI9##x&=DVGogoW=z%{y?HB0c^bHKaAiRHz%(a|1ba1{0EO*g9J7}L> z`{YKYkDc+5lE>%llX{DI{vnNiimq!P@%yl5uGaACK52eUeE(t2Kc*k1pM^}wf@}x< zESMn&a=YkfArJDQz(GF?HYf!9dHPu>f?{ww=x4zV0z6&xvrq!1Q0AbYg$k&2DE;hV z`dO%fTBsxQtaq^{dpT>eqfPCs$?j)O_950}N87O5srTXUKs!fSlYNXe*~eLv9qq;T zQ{RWb9}Td5(9fFe>_3Qxu!pE0#y{d>P4=wA%;c4qQSE>KrBpc%X8PDB>})i@i%g)I zOrV-f04>CCL#@~aXz_Vw^5>YzpJ67C3jA)=h3!O3*}tTjOn~h@XeIs%v>dz4NTK#H zGx<Zz<o7d^M;l#?GH5+^9op(A6M*J=W`)p!-v{lK+n{HRnLKozWdV8U$JYmb%Dph? zCli2RJ(&QE;2VY^%2Pu8<4)GFB)jHO4(091O80mBqf)zB3_qgT$T&=Yno56~Mt_QC zW19$QGl89h=C%`ie)`iP`ct$3+e*C+e<5lgrSBV~?;EG@L!H<z>fQJS>S24YpT3X% zOVKjya_TGaSGtrluPJnXsdRp6bbe?Zc0B`i0|Rv<+Qh)!>}ST$K;DYBVYgH7!{32+ zjxysPW5z$uj34dA_EX=7zaI^-ebCR0pZy2X5cUxD!}v#B%=q2=IHjXYe^lx-#j<^x z$7jCDkrsTEJHQ{6hT%u0S&E8%8eXlqPqX>2Yc7u8JjjR1JJoC)!i8WzPjf*L6vLmM zefM<HTu=g~FnOn1g@fjDD9v@4=7PxwyXu&-)YCyV&`~v_P4ryNe$onhvR1SWyPbL; z{tmQrl(b@uv|^mJ0`0~2Q{RWb9}Td5&`(;y{)1=;dx-jB{39-9E&1QlEbl%lEf<w| zn)Rp>PwVzeZK5zgs#z&Qk8zS(cVxfRe{sJIzqFrIF#9<Lv!7Ei`{gGTY+OqF=xF<d zg6-Nr7j1NzQy<st(Hi>qOLI^>{kUewl<?X8blV3sJEsh#c0`Cv4|4OIVdZ1eyh?05 zsJUlxc#Ked{4r@b_?Waj`IrnJdQ9p*jr}b4bC1bm&r^QkF+Sr7C5KDls(mU~@zc19 zk7i?=T)Z5)oU8n3Zac5c>Ee|+kPih8UYP?nC<Oa?UYP?$Pz+87ugn2A2=H{p5TCp; z2TGyL!5ed+0xBIbN__Il9H@a>n0&CffdFoVf5<>dUouReGGgTtulq@^VTOH=No|*y z|D<N)l&YCs5#qp;%0WK*nAHD=D0)(}NQ9nbdZCMZT&AZoK&CN3qFGi3Ncsj7eS;az z!OrzFK+<>Qq50Sa)LZe}&_Znc7z5-u10-6E?WEp?-;D}vkDmdO{Y%hN>@w=h@mDBG zNF^0jsSJ>543KCoM{}Kn7h^#KG(uAsi3qenE3`RCM8F3f(0QIj1iGOIdL1Mp&<Fhx z=pqq;0T_gkgG2;|9V8(m6uifyS-|JLh|<&1yf-9TRgPT-mE+OeJrd2;z-X>HM{_kW znyWd{D%bgFt~p1m1nR!P`;SjVbIqCeA)|SxqE%Mv3$P1O8~%)$(Oh%peadL&9Nw=? zeKvNEk%E~Gf+u(rbC@?WqXVaSLo+W<4q=a=!xwmivx7G|TX>^0+U?~H&(!x~_oIFI zn@?~8k?mX27WQwaz74wr_2I9c7R{SeqE)rg(W)A#r(6e@HL^hi8<^w8Wlt$*L=HVp zpLXOa4X-VETDct8__)-zh+jNCF_B`Rq~pzmEXXF9O<;x`$n9dFgio&LcaX3`12jTY z7Xu}<Kr6I47%0I99ng86ffBl*2YMY0l+XwL5a?o{gaH_Ykb{8|hGE2^oEn?@45!9) zKY3j0e<mJzMsutB#>a(tMl)}=;kpCT5`BQRw+^uO)&Xf)ctDyLiLZWIvyKVq$F!T3 zcJtG2bk})kK6U{IvXukch8AMm#~3lk88OjfY$x?D{BBfWd;E-;>|cVGVwX{0j=ut} zv??QJDkEkZBPLqQ<g?DfNeyU#Mri8dea+AUt<dJ+QV95<13J%hDFnKq2YMY`3V}Z8 zhd>vXLSO&}A>`mv2n+-5#;KP}Q#4B>Esq@_w$`R-p1CSqbwFxsX|g(+tbrzLK0x>V ztY+uOH`f3ALB=UK!<)3rcWIZmXqO)xkoq&?+RrFwt)mZ0bBxIQjD~kcenzwY8p}fm z<+0?0(y;HKwCq19!;c@7J*k5cV#aeMMq%$ksVx)FKc`uwv1~mc#B-W?BJ{lSD9D=! zrT)9(C(kP<+)}w1l*Yv%G|S4xASOB{COT#dX(i;6N?IU~ay}GzNh`rdxe)9ZNGqX; zaxpmZxxh_XfG0>=2_=+Ep$uO+R8X$8kX}|DA+3ZO3$Y1xoDHal2Fi`l6eO*L$>(+2 z@J&3=gRcWRFOXKk<h@CgSN$jNO$xAY&_r6vfiZ}NI539LVeAnTqotVoS<U7s!|W%d z`I;xB?%F4$A?gWfx&8?mp8JGMj~j{*PkmOibDC}^o_p}z_XJV=@(Y?x#&pK43=;z< zV^|iN&7fvN&Dc3;?hpefnunc_7T~v{Hta&wKF+|27GW2oPW&#^jV(|Q`+Lz6>{7H0 ze>qx#U1?G>%QObgh?i)*qcq-88t*8Lca+9EO5+`+@s84XM`^sHJnVE-)rPMfe3Uz& z^TJUY?<kFTlpDK_a?8X~ZkagBja^4Mr*)J^%#U);>L?F89pz!CqdW|Cl&ddCc?A6^ z=d_GRxto_v5M1Dd;&w7caDW{ukCO>PIg~-^D48O7!2>zR$wa{f*^m`arV1I54u!|b zM8OILkUvVM3KqyU@{jVO=04um<$sw>5beS4ZYEQ#CR0R5u!pH1!XH8hu?NnPDV`xy zL>sXisISLght|fEsX{eWK?i;x_3ilE&{pi0b7ZRQKc$I+xHOeRDqQr0)VjpVRL#>- zx)V=G{kI68HwdTi5HzO<n(twsenN_EFDiZ95e5XvB!06X+si--X39B`8)QI$$w$Nr z@J&2IhOZFp7Z?y=@^-k%8%-u}hx4$n7bYJO`@<t->|1GxU@CBg0Rd{}s8qF}oAx9p zyPlMFqc2B@rWZ9o(->ZQLKJ>Z6DLBS)2uRX+VP~+-2J5dGGy(1Qf}IXeGfJ?s7SlG z;h;`MD^BGnychp{_;=$6-`p!y9jdETZM^ZQef^cHmgg8`R&mzjH=j~9^ERZu+nB~| z=6Yu5bt?Z|*Q&aC6H?E8b5xysW~sWOW~+uDy&+mP^65pqSZJYY=+m*Pf#(*e249%3 z3UFIQ@TGYwG57P_GOBrq{|>Sq=D$yVlJjq0(Cpz_w<#jSqE%&b$~g<NDVx9?)T(kI zm$C)&@a01RWh>Y&@XlATQ+7ZRzG84vc7Z#{J72*=*$XB3N};TsShw=dR}=4iO;Nfx z`qOIq(;BpP8fVRBan>xFWEE}1Zb~3oO(a=GTd~`yZ^!RLJFq)bNmi?W7twu~WEFb3 zr_2w1&<}wXB&#q0gAkfYvI@g6GSf)MzWd#XtZH6c$LWxa_9;9d#@lXBXGdgrO;MR3 z&;M>jemSqbJChw@@$#CxF<x`WtL_R+Q&d*8i2aK3+p!&}+fU37@!C9W7g~b9lzI=g z7p>q6SEkX?W0xB#O#J2LoIf<Z-x3Xe{_-JO0=ph<aq*YqZ^CXyePjIP<Fo{JJKBxE zhx$(JF0`+UzZ`nkUWo8#@RvLI%OOBH2*Zc@%hPBH%Elr7Nk312a_D?S<~YGQMn5x3 ze>01tEt;ckCP&)}jyA|U|I3K{1g2l{9Bue*_zO{c3P)QiM_U@xFVu<cqW(|Tk^87+ zzesP`aAMi71p1W=3~^9*jv)>vZ!~L)V~B&6AZNl7IddAz5XYDJzj~cA+kg0ZBbNLc zna9BXEXTxWI5yH=ix5#?)RaxxWPMUL242!ch`>?J+$r5}Ci0xkQEmawI6GaH4q1>1 zrZdx3*(Y-&a!~U*US2tm20O*eE73f@vvOB!!CD#$wP8CFOcD0IG!%_g1TGrQ34*e+ z0G&6P`6`qizD!jT&pefV%V_*c_N`bEAwKsF_58^A|BPrlAs+dK`nu@djPTF{-QZ8< z#4+^G8H?yk74BcC7Z?+8K^Lvo3H`gdNd)|ql{Mi9Q~$r#-aRa?q)z;<suMCaga|UM zLo{((MP=9Qnq=b249si@(TN&0Dk>^!)HnfxipnZ#Y@nNKXrO`SeruqC2AXhdpn-<N z1SVlMI?;(^bkN`oMl_f!0mgy6AI9~0-{;+DpWi>fKR*4f+c~FBovQDt%lA}MGT=Rl zi6BfJ=jES7C*TB~f&LAtU3x}uIU>b*WV2TY^Uu&}+`-o8n}>h?jNZ+$U7$UWf`N1K zJQ;9vt{!;GeEAu@?2-_NfANf7imX}}qH<0MJ7~XrMz4zz+^P^YkwR2Y3vr&X#-#hQ z;iwSxEMAR#P`&M4VM-M4z9$R9mB^R(r1=G&*p+8=s~@Idp#AC@y(H}?vMSo7{pdZ~ zU&#uXZ<6X3s`fU1FyFy{7ymu{U|vBFj%I3cL7H|-KM&?@r}VfgLzZ?R`0yJ}=~^(s zkh%N>{~wd3o%TN;L85n_;xIFFDJ2c4La;(nPL@2I^><ZSz?eB7b(iNrl_sE4kE2?V z=0m7jWETERWE1HcQI8F%!+NN*qbi|h5!DG*5dos|GAi{ds&yI_i!8%$TgZ~7^V?10 z{9|P2;3?g0rLdtpm019AK__&$m?b#MYylT;L$-3^mZa@WG;e33nZf-A4;y?iVMS3a zP#M;#47aHa+|<^0E5qPfp|Zf>B9#gHardpn()iCPVLOHzjN+fGt%|Xi;svlOE8RS~ zaPGpTSlCjpPk-0>6w8fU)7}bWCxx+-5>=xIRfNfpUW-)Xtbtz4ZY8#!YH^SfCCI*6 zgXlkwq2Mt;YB`32$1oI3$54<9k03`g{Dk)?hJr3XF`kH_V5%S6{<54%x~W7uDd=3s zNhMlRm1sq_ahwy`j&AQrQlc|K36qPTu1G8up(hGU#Y29g_aK&vdzE<o@8p)ZiP!(= z%&HgxD3NnUh}?7`@(9moGQS{Rh{AdP4>dtBufdDqm=MLci)G0nAxf#zHYkH~r~o@u zLKRTu*{}}B;rCJD{+$X3=J%h{4@aHSw>=I|Ks0eN#DV!q>gZGOA8?3tvBVudr9TU? zaY#6Pgs>0bhPV&H({L0%1Ro~dM~H*iW5{DB93RC2u^&T1?8lK1dmIU|pFl$FCr{}g zVR7kMWbCI1gV=Z^#D4meJ_FBw2KQ%?5SxI6*v}y$_VY-H{Q?qV6Okr<V!wz3V!woh z*e@d?_5>1Qzk-C=uOcD#Ye<MaiG<j%BO&$~B*dOVLR=CgLkgURRQLvb6TU^dv{Sli zhM$>l!*}4jkPiO|--GX;(*3#PnGD=NfFHt-;0*j2D7|dk9$|*etUV$Na;<yBOy()w zf;$IhvPhQ=X0V*n8zU~urmZwNr}Tz`Jti@edrB{N5|BrN{8M_RZ;z<LUxBoHFnY_t z=<UoNc0<Pq?&xjVJ9k%}yL4CfoWKa~<Q~ytyenIe>|rKjk8rHqmF<apL<jyhq%$6) zx4A7_UyR@`(YC`edYhMGrr?x5v5#vOat8y}yK?a4UD?-vSN6j&3Sj2!DSe2<9=S(Y zj7rQDozin@5qS}`3O~iyMub_ZkTl(*Gq<F5n%O)9%@%(??t)9q_Q6@)6*N_Qni7>z zg}d|wdV^+Ima0U#iFQ~|+ijp3Hqr!}xJdO;6vr_X#}Os!4$*FBX}8m~Tj;>ud|3&{ z725SY?fNRsc9~|ogn{uo&31!!jdUTqR%q6@(HnO3h7G*|Be?srFhI^huUKjSg9RuK z9~Q=5ER1n8O^kI07RYWCi}UXm$eAY?%${U0o62A|iNS0JgIUIxxlGFDr7@U2&R{m4 z!E7Rf*#ri&M;Xi>V=#M!!R#RhvoQ>2qZrK2F_^u?VD>VD*((fY=NYJ0U1c=8$Y^$f z(Ja*Au3u*~yTNF7h0*M7W`RQ(%!V-w9L_B8Zk9<Co|VqTXJz}5XJyBsXJzM^XQeCS zS=q&fy17tK1hacaX7@;Uj&yy-XJvoHvvQ`GI#vRuV1qKA|4cblfE_BK3aX(7YN3v| zAY-~L$OiM3ZNjoa7kQB`Gg>eYnU4l6Frv!gEEItiilGEb(;4`p>dS%{2qP<YGcd+o z`RORuOz8kt<EiOq1k6jKu7(kDI<H3FVvTfXo1pnPU1Vs1R%lD3iwy110iDO`B10E+ zAAhLo89RF(+IYLc>$CsViJ5wCrU4qE37WwHEzk;W;DmPQfKG5h7j&P}YZl1o)&F;- z-mp9)>*+x^LF0;_nCT&%o7RkfrWf5}U*b-exxq#56kObFXAC`AZn$fopKu@d6Fu>a zq<LBOCNQ$55Sn=rwuaLqri7f7RGVZ?nDBPOy@Zd3@UnxkFkW_qPY^yux=GUYNBW7r zy}T?V{KSCK&m=q){SclJ0*8sr+)`%e$Yme&!vGAz5O`o1Mqm`YFb3l=0X~?7DbQgW zo`v%;1G8w|=g_JbaBp#Ot8UG~f5A`Sr*H{=20w@A;WGRJehDwY75Ejr2)~A_@EiCo zyu=0OQQg0T-@`Rn;DUc3Zi|=U6<CBn!Y8lu^Cwt>Kf_;6>7_^Mh(p;{M;t0}+Zk3> zGMuV9L<fA1NjcoLvrNhzqXQmKN1WbKW6X=PDe6TA1vV9BilSE!eUk1F4A7U`+O2uW z%HoQH#g#Dn+Cg-q=*M~Y(XCowaRnyOCqDGbWCMBzMbtd|qI68ZC|fSQC|l=Vlx<%6 z*3iC>O6``SGm1)Wpi(!`y|dHJW7O*|kiyiv!RpQm9myap!@{r#1M^-i#*9HC2XY|~ z@(*DdX52}$+!~hbq*?A1R_jht9I;cB?A|F#Pwo`9)SZmqb~1k3$@pz2<F}nu)}5j% zWT&X!w^P(OcQRYz&lDwJPVC*u^tnHlFPC?+K=0398#|dT@fR(eu+?r5ZCt>4*&x~z zvEMizC_0m{-+-<R>^E#dqUSP}W1O&e9s3Q5{YIp}=;xvXWM(iN`wc(rH-fO=K#s&< zzk&4PA486lW&-I8GGV}hV~RFik%krHyn!4WMAfo^y*muThIM3FI#!M=20m~zhz4kc zCaAt*5H%Pr*5a<SV@0`_Kb?-piZT%^N@V7l9gK)~h-^BwW+*aZMLCTX<zB2PBe1eu z!phP_;=)DDEa#c<PY4hm7)}ZhBhZ%^Ao`C4hye~5JQ~0b-~rfL&?$9e>xLX9t(W+* zV*z3u*>=@mILU0g)t@DLN|eky2y2=17p>3}<}cg}fui?zpg6~vpws40r?tVK$pL>l zt^T4n{Wo&v2Jds*Po&TO6FIhF;2UxS(;vL=VR*?PMyRAid;Nvypg$(V7~x$t(1*t; z4|<cBEry<S25!i%lUrj6DxV#rK4kk*W}Bf6oWP!)Cb6|?=RXY78Vg>ZIurlcIj%N! zhp4mByG7QV+#xED?GRP*J46N2eq;yleZ~v;OLJ(7#J|2Z0fUetza31gFm_&Ntq1a< z0CFJ@ERX})>n178I5Hs%W|nE&e}$WH3vR<5xC{4S1@6Nttid|G3V(yw&<T?Hn<Grr zA7!E*DiWBehf3U4P|X~C4bvvI>03I1`5RL_Gxdom1!NNe&HcQbpaohzyqw&;oLsz| zuF{U7bCGtuOglz)FED=tZrr_uokRB3Fn<HD&*yOL5a;mtn9o6O&F75Tn7@I4%;%UU z(wNFhXF`WuO(8$<;FSKHNYX!`lJw^^q#trS(jRF+1|oMLcOu__3_-pT`7kmJ`6lEe z$X&=cBX=YBARk2<k#9%71NlzmyOK<7zJ}wU33v~3FY@1z??vuM9zY&Mz7P3+WEAoV zWHj<g<WtB)$XH|?@(A*S$fuDXLVg%|4Ea&y$C1a8pG1BN`Dx^5OgIv7d=B{qWMYz@ zX{9cpBC}DIW~7BmnFG00vOFJ^DT~Sk|JW3&2=~^8b;VEuANUgI`Eru}zs?(b{yz(e zCNB2+N+O+$|F*pm&1wHX)0@izO(;Zr4GXZ)>0$vEy1aDJpa-`bdWky+eM>CB!oWHU zurL(Q&^ux4qn`iyZG!3bj}yGCz<OAL<s?(^fv<1}U!_D&BEOD2g-k-GAWtK|f&3;i z4f$>4caiDH?;*dB`~mWZ$TP?vBQud%NHfxc%thuQ3zDAEpDV;sl%!v@LJ8QQ9PCg9 zH6*S}(oZxb=_Sr2{b^T{eyBT1H^bqcBwf1kLl&F|b1&iNl60*vNuLJUPkevXW;tu? z=ZA?OA%2u_FOQm1$%1TjkogdgI)mPLJiYNVJnD2Q4v{7cra_Kz9>{`ekmKZQ<CJ~~ znGI?J3Cf2AHHic@g#@Kj*|T*V(*%He776M+64VS5)GQLzb4XAZl5`7XUF5<b=g0_T z!Rs3qQJ-Gl4Foyl=27#whZFxkMK_Ap^>QO)+|YPB%5H*yX%5u>i_(J3)aPW@(vxVy zCsBk?3JY%YPbi_ECh2(#PclvR6tDejvT*M;c{cKzEIN2iS~s3#UgAknvhpO`Wk1P0 z(vz4ZKVXh8Nw3IRly=Tjx%?zA(I@##?MYFSHZ5zR&V;8P8lVxH5}p#xuBU_pT8`<m z6_eq%+q2RM?e=MTo)dQ3u}Q{sxC`06#5QX`BXd9J2A<~zpl|k?><@XL7`SvznlC5m zLszcJ!@o$<hv%=!5ySiFI9-$8eeV-v%h#l7JnDU7V&$6j8Q;f*)ipU4`94t^@)jl# zIT>WXKyhC|LNzwaH9^y|k_%Unfr^b+q#apS^Aw8|Pw}1cQ^E$t2Nz_?z6DtnvmmWe z3$id`L7p`($o#zvvLJFn=7ufEypRQH*}Y(rIpGU3+iyXd<$}x%T98?W1v#07arD9k zIf3-S*inq9VHCVD?A7H+62{giFt)}$kdE>78H};hH)TJvmk;*N@eyB-Z$Y}f3$n|( zAiEt4veUgFU9JU6azS?37Gzt)BMZ`5vmje?7G!JIf^4=fNJjw~Ia)PVsXbvzixH(k zF~Vku5oLZcqI`FZsE{$j4wZyg5ng>ThCe%u5w-hbL>=z>$QaQ8jS(@T$rvM==VOFp zF-Ej3#faAB7@lT~a6<b=jObX85uLL!!gVP|bWO*I?n{r+iF-`Ap?Cf<VLEp?M)bL3 zM879S40vPMu`NanIb!Gs$B1FVM_e&t)E*<e4KcjGV#GM^iGmp6gUOs2F_jfVrz@Jp z;%FBAqFHH=#(E|i>zQcm(xOFPO|-~&M)PS)v?z2$^VwyzDDp-ND-?U8O`@bfT9jUj z7Pc$VqU>t4D4&mJqAi+<wrEjFc-36As9uStS062EZ%2zd-1SS*q5&Egqeas~v}oQP z%@$zMq9rn#WzJ~Pwl7*Zp*;p0`>1Hq=@-qXcn3vS(n0!02SrcXLE#R&BYVT2z9Y~1 z9T0sP2SopgJ8~fPjvP$7BZt!ONY9yQF`O37RAn^PB3gKlMT@aR(e(PF#Y951P`{$i z!c@V_GSgW#R;{I&9FMaW%-6}z$63jKf|cwi=ny`^lI!Dq9QimOM?TJ$Q;!R07E7}p zhwNY_sgohND~hGpaF$@9$H0;-^rmAhoWxS^tRFj)bjyL7ZaFyZkV84$(qrwG!^fKC z$dP6_dZt-=kz@Ex<MGXM0*CKlvz*-5ET^E>{e-Byd`(u{W@QaDrt7lF@r0<)&}Bo& zj4Vr7l;u!*a#q?RUy+r*Sy|<nl@-TlrQJI#3sWB#XR~Hye&UQQh@X+g3$MzOg~vtF z`V+!RT=u2M(c_N`?L~BL*8CGpn?Ek{W*-+80&`$&c~OpMJRwGr-o-^Z8TmN#yH8+! z`nd4<9v1_I53VfAK4d@f!^n{<i*g9*!QX}KCcYEt!rzNLXJ3>(NH_i#WGnH_Zqws5 zghkm-KnEviLpt$iUT%?DkPT+A+-{LMM_n>E!zJ?|A9n#1UTTqNXIo^^e2cUqi(^`4 z$uXBKJ?N6Q7?&(VmS1woiYqQ@pLfYhWEKAE6_>2paLHN|jyeMBPqxYi`gV;$f0Rvz zKg#BnKT5}3t8BU2DqEoqw-egeTV)4y;&wq7aox~^+YP;B;N0F;*~j_%r<>)#Y_lAk zZ<a$y&!uKLyiOf}(K+e>n8vO)%W;^%<AX_GvpoMI_4JrSnvcV;QRG*X^xQKJnRmh= zEk_(OC&M8N6CLub(;>Crpp&84=a41%i;z|-YS~qXEQgVp>vA~ix~yDu$f_lWtXOwQ z`-(%>^1XAN%VCngWro7?+a%rVcU_Lk>$0ARF~fD)!qP}<)^*v;Qi+4riFU?*mY0(B zwp5nZPg46`LUbq0$WGp)E-G1X1Lv7f(tA8Yxa~p=c&^LA{_C>OcU|^-$%ON|9C2Ki zL+<O+<GL=#7O%_k1rrx-xGud**QMomyug1C*I<G7>b)bv@(0{6!z-`|e}wDsCs=|% zC+XG+hT(rn($Bu|d11Lh*fRVzNw0d8RlXS}8@^1&ZYJsFFEAenmRm`B=@$sQouoIK zFch%d;efkv4_4qltil?s!>jN&cnvmS6CQv_)-4hgXvuo_5DQP+aQh|eouim9ZO8pk zvhKl%VCbj#{gd^97g+N(5FP-5$+~H>XP5A;?Glz9$@=&jvrM1G9Yo~LWZfoqiPA@y z!F+$Xu)Kk=U<iRXChL{{A;J=h`{880?mcf8mat^KA@fb5{=Ha=zW-s-*cO6uIkP3V z>Gr^&m2QunZV%n_VY=TVFq*}{(o3IxjBfQfOt94AJ1N9ust{AiET<5eU}mW$+aW{_ ztAv(AEX8E7B!eu#pO4&HiYY{HEyWZeOPDb&1{>42rBHr`q6f=Ak>NMNBM=U|V)=P9 zyajf{9(ZfAe(w2A&GKk6{Vt|kn8nEUG2ODkb6#Y+Wtr&~WL`Kk-;vC4zx2EoZo*TV z$~1Qx)7(%F6}y=R4`LcTibMyQ<;GnT#w<7Cb;x@B4J*uoZ!imvY{u^(yoLByWE+0x zL1w{Ym<2~#jAZU@@OF3yM8G@YT|7t2yOZ^?dHTkY$@(bLdxdFGhRRbg0X`Ur5U(%l z^&J;1vk5W0!E;#TxgfU|{Ve~?$^Hf21AF0L;osoBun+db0eB1!!u#Og;r+=b-4cc4 z@npTAfM@gs?r4aCC*djh4>$x{MSd9K;0SyGJ_t`!-2FUr$Obc5F7wRidDd5W=9hTZ zD?IZJo;A{PG+8eq-U`K+O+0iQrTwhGK^a&+l&n{{S%0IlSU#MrXMZGFSI3Z`KANnX zp)FE~*pDT%3YM%N{y5>si3jxwB&bg!L467dDjo^jKAo&<pW!%&`)slvYf4DgXW-e- z;rTog;=X`85jVtsF<Czk*<VW5o4@v)rhS=kh&_RX*smZV_Nz#U`x=}~)}4ztWZc(r zKa;GtFX29gI|-5@1x`aM&p+-PIKBzrf;9Lxd<VV@>F}TMJ@`Ikzz^Vu@FO?_KL%5> zo|V2V<1#6oGs`kNV_D{$#Ggf6cCwzEgxieU0y&VItXq#P%c4WeCSH5XviK+o@`%WX z0w{#DPy|*eh7u?R8<asgRDc~Sp$e*@25Naki_6lvu*@rZS#~UO{yM_zp#d7937W}y zoC8{*723cF?a%?8;DWAX-D_W#qc%>|O^c~P_7Lt))~Bqvdz1AVBTbd&T*rQmwY2XB znqNI^EmSpv?W#wXBFo)3Wf^JhyKl-0Qv_`sPbCKwc>ez7*xAUNvJkh`&GN{;o3bR4 zm2SxL`H8r5TSa_RW?QkpiMc5)Hthc(u8#~FSdfxR#PyRw<kJJm`qM)&Oh!kM-ekS+ z5?kq9WkqY6m99BfwT5tvplF7XE|$GI<L6~}1509OZb-M4B{Mh|bVK&~P%`r<8)$W* zY;NC_PVZl3TR%$1gR+54JF8y}tbQ@cU<L~l4>^#FI}h?}Sp9;+C04(nXkGi5u&$$X z@svO*kv1rEp?jgi%kmdg;;w>f;%cBam154Ic#|mRc#1cXVn#N*s~$h0eaQ6mICtx# zl3l(jpPs@!O^MiV%5fVd*KkwDoln*$b8gD1tebMedQ<ueZp!ZTTQV!{mds_5GB4wn ze0qlCo`Z|YdQZ|VSrpIW<?&mxIPsP&Nw_8RkKB?4hi=KjV^};Ly(O~`-je2h9QR+z zfqL9ell9VwTQcra)lR4OPE*{^P;e{wf1a$L-Qb{g+|Q$k7C71REjhJxOZHtM>@tt) z7s>kI)mwDEZpqAPPI`%x&)t%vo?Ft}e@l+}Zpm@)Eji)5C4G)ta?;K5F5>OP+lX%< zzJ~Z5;<Jdi-ZIIl0!}7)JYVDSuu^Tl@fKkzMN9trX_3p$l6j*%o}V)F)p<|G{gO=5 zzbMLoMOq)jOgS1e<xgRz{AHfbyHS!Bj7-)v5%&U_y}~Q~d5ln=r}954M9up@Dr)1l zi_g(JtKUYw{1s_l+-haNMuDwu7jajU^%g80GuLm+wy!b!S^AiW`%SXm@ieoaulNbq z5M8v)KoR#_j(I6rci-cfdECFF@&<1calfZCk{KZ4t|jY3kNFAD!`QBVM2fft()<Bl zhF9p5yhn(MwLsycL(^cg-j?+Rw`C@St*o>=GH#JXe@xbM$mJx1vAFB_|HLE0AGegO z_b1+weF=9M_1uxehlu|(7dv`Kw(x(*t)91vW*c88!MO<H{(`RIb10|dt)h#OUU$eH z`Sgus-4%XEmcG}Y{h$JbsVC@;bQ|u_?)e@HN+sV%K_%`gsMyE%QBZq=@1dYNgzuxE z5qA^Rr}2Fh*gc&5ugQ9q4|$88mXnJ($fTRhy6#X1?obErPzUZ%2kuY@?obErRGsV4 z{7fIY!<#bwE+ueR#;s7vGVaPDmh&4=;J?o!NV-d}_^xyvzbm^E@5=53!dH0&hwjSW zW7rZLW$FJQOaJ@s%7GY60irlQg0wDr(sa=}YnWyWyDPIp?#f!ei-}uH*6WeElJl%@ zz3L5jW#$SQxJ{xBPO#1i7Vk>W0=5Lpcjd^^T{(K?uJm5ME63)^Km;dwHCgvfbArF2 zY3Goy@u)UnlN<1Iyp!V{9PcLX0pTJ=_u4qV0l!Sq<5Y_7<9czL3D33^O5mQHNW(hJ zwo6QA+><%hU1IbM%WkREe7_XEEdHLXJ$_G?Cf<{0eY@BlW0#2A&an@r=*11Ygw6V9 zk-hJp%(uT;WJcYS#S!;pvyo%{Nn?Nj2;@3gqQ&jN9h9Qyx!)|}OgmHb931wQd$Qv8 zJz2GJPgbsz=nW(aPSM+!@A0C#Cp)j)ldjA6WY_#XhG_R>&n#&|QuMk@WNhx9?DO2i z`r@7(@ZFPx-g}e)@o(fDq402u-eSKeqo~N^4fmv@=AQKBaKl;mq-n}}PfiwaQ2L6D z3nTGA!JAU_k(1Z~o>-Bd)D=0Dv?9kZ?Gj_hN%KgGKANy1r;e=1N!Ews!c+7K)`tfU zuE@cC#P8zxH^W<CcZxo=mvcm}$gvC-o=F$Chw!(;qbd57TrtUspcUyetT4ppU?UN4 zgSV&XV~hCTk)peoR~S)|nJX)D<T9C==Y&^R7>shk=@r>`X+`$Wt;lB23R7Y$vc<O| zTfHl?&B^f*l%N~=&J=yn&M`L9HgHT0=gJ}cT^~=;JyuRyup%4M@5?NTsOijoa(iF; zPTrRjC+^Fs)cZ2--6?v1JTj6KCEk|<3HPPv$bC6<h_HW7(T9(6>_OcBQnlHoz0EYV z`@Sp;C!ODYIVkVTzM%WE-$49(xVgQYW8=Pj`d_*BBIj7RFGrT|%i$%$-b?9TzAwk; z@5_(ur;^T+hw1yW<kEdPId@;$Joja3Kj$nph%)be>2hMw>9{Yu-S=s6_odT*U$)uq zn`B1=CpgFno@bwodjX==dSA8_aIy4N=}B9a!)I1`p{+8sUzPnQR^>qIDl<>3aw2|J z`i`&4kG!A9ldvjBkE}}Xp;e}LR^|B7RatRxRoeGqr58h*CsOpLh*fDbuFA5#tFk<D zRW^hX{v@sH=|d@cWB96k`f!R~BUfqrtFq2O<dGD;>-MU2ZLG@9^;PLuT$RlWtFm=@ zRkkc4uaM^H6#XL~hL5J`9kZ*leR@?oFRe0fu_`k?t1_#9Rc8BErP;eGEzVV$<5-ok z$7xY6Zph9JnQYuq19w!j%7hhnoW&hmSLL%LDokIKXVcbX=9x8_m9Zv^Pp-+56Kk?4 zbxm562#+WH_?pa4T$AR6HRdSRWX_>AnR{$a9{xm%o_&z;eQUBhW=+;atx4^ZDcB06 zFveMvGoM0Lgsn+adB~b9-MuDl;cK$ekBImby&`B$+6`;6FK12mXRXNr>zW)aSd&Bc zHR-Xf$>D}IIa0GGN1bcZ>sXUx?ln2?T9Xr=HR<bLV=`q;PI=d44}LfPUi|0q50id` z^u44%NBS}Qnn})lnu^2;y_|67Gvx5IkN}?}hw1CGJZ)Xhd>;Q7QuNA`>$2*^x~xcD zm-eJ}SsTAD>yEF>>cn+fldvuukF3k4L+i5s*gB@Q>$3gey6o7uF56<(r4w86mWXxP zYFwAid)K8Sa$WX@nbzgG5H65N^9@ISkqdn(Meht+moCG)99UVGgSXdZ-v)7Cru8f$ zPmty-w1_1xc!djIChV((eGN{+*WnpBg);MS!G12tIMm@KLryZ}AVY34<XV?qCi}YV zwsC?a4oFVXdvnN8*1GJmu1j~pI<Nd!rR60SrTv#R=XWu;NlDRLU)m;SPE(1JUX}Tn z_7@y~m3b*l_!C~0MMqwh)<dt#;$zG`9(|P=xmTra->b4L=2cl9^{T9hcvaesugaNJ zuJ?@;-Bg1fnfWH3Z$TQ$OkC4uzRe@~{$DlkcW-Jl-y!_FRGQ8=h-cHORGkMz?0@op zsQ!zVC2nZX;_k7$LBxJ9MSm6w{of|e&M-$Mc8b{VQ%cc+{5Lnol(=IvNCV~gV}F4A z2Pr1q_iIl0L)!4QC^2~oD~NAl8uora@ho9ubmLmi1TrBOD4LH13P(bqX!iz+ju`9- z3Ieeq2^3xN0iqk*gU*Zq;euXQ0PBnaq6eD;w-@__jQ}y2g}uT$_6jT5FZ5%-07G`{ z8L~_mD5PRXa0WX95_wM=#N<%}7V`!%f!mj05Sa!33=8~4w%1>n7yMa-@)tP~{vy}t zFY*%o**DK$6r}o#!gPOe_L#pYBFuWkUlbqm7bQ?S&wf<5vDAQaJNr>tvE=ZuAC-@f zB2E2lNM&O~ss>CoGT4v`>W{D?m5~jppb56|h+=<~qF;dQA5%Gl|EjqjzNvLR{hBt) zosKxTs}tN!2zO=RZufH6)0h-v9MOkS#5qhsdgly$zhn?S$Zl?~E6N~T`wYU=X)uTm z9PN<?;UuCh-XL1J>y`|IaHJVTGqP;uHCcZ9HCejxnzXII#-!$Jvfw4mwU=L$_NCXP zLW|~5Npr<(THcNQA{)P!$@7KStQ5UozoRwG+|?Q{-qxC^Gu6-E*J@r^)oOpeV$$lw zEv@3(tD60lziE|!TGy(?J+1WCre?eMKr36@(8|S{R%zIfRq!k)i_PZ5@BuTGD|JIY zXF+`v*97&@0JV`DvJR@D23ikp$ium0DlbJp4}}pMvfa4BDwt`5Wv~sI8@9nR*apjB z8?1tDu+*_3d*z1Q8V8v3Y4<_%awKG)H{iVV80O``(uN#_zAN*xzveZ0xPX%va&j=A zP0<e*;kVL?jPue9?FAdM1KPj|ElXqynoayTptpdFLJzp1%e5i9p%Yw?$#Pm&+NR7t zvnkCPo6>S}Q|6r5l)0&!GB0TpJ+O%$*hCL(q6aq71Doi9P4vJfdSDYhu!$bnL=SAD z2R6|Io9KZ}X{y}2DXStkWp&u5tO?nawYxWEUHGP~_uG^Wa#J=2ZBn$G%*<~}$L&qo zvau;!*EeO`;wJMGo6Jva((T)%+qX%#Z<B7{Cf&YGx_z6{J-sQ<Qwrwd6ur;GdSL%1 z-M&q_eVcUqHtG0on&|m%()Zt_@4rdkf0Lf?CVl@+`u>~r{Wlr^Z8H2LBUTKXoCa~Y zgrY4)AqBsp<$PC(+|Ry6oX4O4=yk2&r5l>rM)8-CrkuwC<_i3F;v*mMs2|AJNAD~> z6k%^*O*2tM%Z%Q!Ma?8K3$mjgNHbV)=KxzVOSWQ`lPeG8)a?gyV&j4Itv_HU{Q)!S z59H|b1G;?===MFJ+xLKO-vhdREI3!vdzyYA&m+%8FKOl~GEz;SA|1nsS*9kKiqK3o zv}gANo-qk=ci0}tHY}vfwY)sAj%r2eG}9y<At)k)P+5>2E|eK8xN{&kN~k=@$6Wx0 z#GQpA+!UHB&Jd~u=!q#Clo3}B6}U}yek!Acs)B0VHBfs(s5+>J255vPXm$wYfEH+l zHgG~abU^2VP%h|#Zs-9w^ujsla|qQB1270fSA|J=t_n3gC)5axt_$UbG2G)YL7WdJ zaZf>JlvG)eeN-wlSQ4blfn3~qke?w{0Tkjs3q{0Pp?IHEB~W@?DjSqRIaH*Xq&i$j z$0ALtDySx~25O-W_$)~^0G}nPCTPAal>=I!720k~<%D+V0Ja8HF6i<~)eSwk-OyVj z)j8-}lBypD)}<PRArl@C3=_$Q<!aO=l^4dmQjNm|ZXZk%Hw9hkN_D3xrPZhCb6{vn z(T_TK1*a<IPEx8qUa5xTN-?FRni7;MXeFJKbe)XAjw)q2sMPr0742wOieAve`+ss} zMVs1K(Iz5SG~fJ+HrGqj>8I&{d4L)_h<}Ja0G#)rI#-lBdt0ge4NktUlzEsmBcvH6 z9<r|x=B0lyuT<7mrE+JL%9~c|@EGriIi=b>N;&(LYWFGC;Z-KJZ9GMHIFxE}E5*#5 z>alYtHl=zS6h)>~XO2>?ETy`v+<AdgnMs<;f^4ZNGgu%8av=}$p#TcuEEItiilGEb z!3JeeE=`)Mz+s0<sDf&!fm*18`V38J6TD=&XcL^rZT8VTU7CuU<nE_Zbf;HS?f5mF zk^)zarn;6i)eSxC95YSV4);0eBd#9?T$&m*@iXMrlm~_{Yia~WZ)?g6=4Vs%apd9i zjE)GOgek~8u}x(`HkiQ@zD?zTHbbA#uubJhZBqqMh<k>x^Rp?smH1-(B~Y5NP1&Ff zx2c?;3L?$V(F$-^L3PwNRRguS>!6<a!xt!lv~8*hnr+*Z16rUJ+WNQgOt&e%H&PwY zY1^h;z{b9+8+vYUQ*P*mbI@norut#Pw#}plaSZiuQyv(G5#XzA<pu2`C9u9tP2l#y z<kfBJDPGB$k$x%*vZMTz87#QZlQuU2{~V9xzo?Kup|WQ9sk2a&<fp7qOneEHZlx!_ z?2yS%l^^m`6<~)-sDf&!fm*18dT4+~XoBWxKjnZHXoWU#LOXOoC%B*sx}nGGr`*u% z@>A!a@2a2bhXK=)pBjXrbwA~SVca7yN~9O~<V%gigx62`U=sJ#7Kz){B=}$g#$gP+ zFbX5Px2s|Bzz__=0Q5s2oJ-uUdcox8rw6*B3tR`ct4`?f+pgNdiMtJ2Bets+aNur+ zrrGVP5gI((RXx->x2syH!CehiIonkw*l|}tIdNrR!(9p`?(M4Bz1^g&IEvt`b-OBr z0?3Cv_jZ*FIbi-NP43cml?_>txkbW5ivLJe{Ik8{pY4?;;~|v;`~!#LA2?M0zK2u+ z6vA033VBFbp%_Y_H0>c}OEW#B%5aoJ1%Y;`ges`s_mHZATBw8ikcU(QG(r<J7d)gK z&{Fe|YK1o3PH6W&gl2n4b>enG7jfOtgWC<g>kp}O(1*L<#LobcgD`~K1H;~j)Ci2? z_QKf0Luwo*zz35s1)0bFRTgA}87yi3DhF~Q5Aq}ZRRI)6`Kz-~gxd<m;r^-wN|XGR z$;MAvhQBI@3fy+6B(e&sBmGqk)JFNMI;h9p0FA^oLGzNoazIO#ziNdxa6&tDKqt7M z3%a2P+|Uc>pfAhcr226Tz#t5P2Zmt;M!^eXFb)&z{>lfFTyzRD!wo76vLg-343;Q^ z%7I+md5}+B0Td<~)LAIXFeob&<1T^HaFapVNKh7OP~}h&Wl(mgJZ?}`P@QH_HBbw6 zP=DOO>(!tdp$VG%4axy6&<bsr4T{eKR6BG)=WT;>K^JsG&t-#hLob|zKGU2*^~1nb zgTh!{4dM2{@VY^bz$k7nj1e~u6S#da={2Y+7>@~1W8j5Rm`V&lF9e_$0?-QqYA_~1 z4M0Ei!N~3a^g;l7AppG)piJEn0jdjJVF9WW&Lsw*7y^|0V1Vj@R^nQ~5g(wMp#yh2 zIEib6I%fdwK0wvj15`CM;ckQmPXKx$K$Tw#P-S3)Qm7)X670lPcmhnc`vBT~0PQ}2 zb|0Whpct%B1m;TtwEF<seE{t~K;=O$<bVY-j|QqN$W91UX0YJSf!vHhl?VB_3!u;# zsLnzWSfLn7jHW<UdLmHSpbW~P0_;!;RZwjVR5efwbx;ot&<IV?yb!1y&;qT{cJwvP zne>`=_!2LnpV7Uy1}T?)hw5tBL9cj+>e(Bl+>1L@uWyGsXPV!k`iyT-{q7xVz_~*W zaz{hBJx6xXAK0NrGIprZqdSy0XNMXiY&>xXdSHk08F#43h#hKb_YRfWK<UI&0zs5O zDkX3mJ#dB+NT&pL2dVr-O2G5qB~U;K)KCIYY!6hX5-TNeoD#560tA%1DFF_&``yzj zXL(*Rlz@Q}=%)mxDS<gkfcnvJgc3ML3GAZ;c2hsDPy(E*b(s=C`8YQMRr^Yy>Szd3 zo%4abR{~X+BT#j7uAcBf<tBY^SfELr3kp<ymjhLQNT3=B3si%91Jw`*dY0~K!z<KS zPB=<puPacEr3a{SU!a;G-e(U~lPIAnWM(QQ;I~6%6K|$8EaB9Te(DM2!B)L0uv5>X z0@T@c>eVv!>k9R3w}}dux}uemLt6wj)kux4p=OfEew&(_P95On)$!EuNNPG4u8W~& zbD@SXYI+TN0NKn59Xz6z6<Q3X(Z)HP`)Dc9u|dlU`g;rO@&>5x3|bcFbjQ=uOyu@l z){550T{1hRn5{w+QU{Jv0y(tRc}l=X+m`47D{Y;y@nz}&Zr@4TI+>hGAk7tYMi{yS zoiOA;m!zRneCQVFBYY|qoe_fW2uFuVv}w?NZ4AH3dle<KfVRTZewF9uhZf5~o1H+b zWuV_T&~htitJ`QT4t6KKLG_TCZrojV?h4sS`l@)erv2~Qv%-irjX|q|4N3`bN=Iv6 zLVKdT>i6+JS-7v&+6*RDlYpA#VtF?7b^<k*KVh@1qt6%6>+8JX7Vm4B_)F5z+coI# zr8lUeEc7|<!Z4mIGN1TN7il5e$=lG!+mO4=ndWUc%iC~+x1k?z!}L|H(8ICEcpGvp z>w<~bp^evJ4X;Cr$d=CA@C<K54z3`#b`Gjc<ZT$k+c1k(5X9S%u)5TpsvdX4B5y-# zd(&0khLMyO{+0;dhNOS}ZFq>c;St`3oU=2Bw_(ky*2N8WH}E#Z?>2FVy%7}eGOxo~ zf7S2gZ5T-_Ko0qM8(!I|hH;OO<5AvtUQRd$<1mrP+wjPLzYV!~W_qy7N()xmXM&YE zBUo8Z2CJMC!74X3Smh-JtNi$2Rd7656($C&vkAef=t!_KSq}xP;$y+8<Y=%eJs7NP z`+|9H!KyqeSXD#>E4wjRRqhQ|RguA}IxJY#gaoVF-NC9ZJXqEH1*--btQv!YRg)o@ zdJ?Q0w}Vy7MzCsK4_0l9!OFQ1tlF1@RmW1W>bw%HT$h7Y*L<+*z8b80W`jSh+|$9T z_foJrHy5n>Ji)5JKbV>utOmWoYRDO^JdR*B><(7vQKgoj(+_-}{{3af1vSBHA}3h+ zvVzs5HCRm*R1MGa_d|I}A*$-vvs&C^H6bBnHH55&kkt^f8bVe>$Z7~#4I!%`WHp4W zhLF_|vdWPke@=VY)O|IC%!MfJ7b$w_d?+T@p<)L2`CoGV7r6cvRQs=(e{_VXd~c{Y zk2}{Dij_zx#+acZhxFNtp~5^HDl%6>`Mb$bF=*ef1`778e(QeKcXGcP2@6%j3Hxc~ z`_&MPJNK)xWBZji{*5Lz8vjN$<=(F*!3Pul`&H|aP}O4CkEY$PnxO;QWA-aZx` zbZ_ieUEqSwh5abjH>zH6p9xhx`}V7Hzx}FgdB3u)yit`xRnUG_33jLm*{|y6->7P# z#%&5!)tsOyV!vvP+OHb8NIe%b+y1O&FN87j3R9UOVJgorOy&B*RE{@HSq?p{&W4An z!rfu2;8>W-KN6-&Rzg*=m*ZZf#D2|`Kzb<kGnD!nO8pF_euh#%L#dykO1sJoU3@6@ zGgK9t5<{t=p;XXNWj(}6j)hV`L#dyk)Xz}rXDIbEl=>M;{S2jkhEhMtbXcf5&xy>x zVX`AUl!qS5Ll5Plhw{)vdFY`$^iXy9w=ANphw{+5@Jm}u*P+zUQ0iwW^)pnNy5>0% z5$1Vv{yU!a?|JsncO+c(9}QOnN#SZRewP|b59dw2OAV)ms}UHz8m_!o!qr$pxEhCv zW8unoFkDT>gsUlJ$I2tBeQ6IhZI5bO-lJMC?NKe@fMzgX<9Z7$in#afQav??Rd>`b z)#cfvTvv9h&Zs@AFX%DVe<(~1?0Z-Z!cf#>%5(WKHGJhUH8S@Y6KIbqFKB-t!(e`y z6TZR?EaHc})IBQq<Q|n1zeicXd}NQxhOCV}3@7%e63;GGoNn5stj;~E$hAkE4ces& z?H^GEhxX8u+M}wH_NdDE-OA31Dv;%AdsJD%9%W11qe>mSR8!6#)wp*L!-PF_miI7t z-lJ+~_o$lbJxs1~k@#gTD`8p7KC`TumzFil!m^eFxsV6>kvFsg6F-HAZfIu{Z)inF zZfI8T4XxOHLo0FK&`OtXXf`NYzoC^w1#UZ3_TSK|pgQfQR&(a2R=e+}RtNQAH?@Y{ zH?>A&li{Y;EN^NK=oEZ{@kdJMI;Ha`W^Lg760<tZfx^_<79g5CcL)cTKJDVYqGM>k zXycF4oD=U8=09`LUpVLn2Y_W6{z_5bB>WcPw+RRH9sGCk-@^~)6&BR*(+U{rWgVT< zvg7A8^T|2w@G28y9>%Tdb6Vb+IW6BYXVMC?7`XP&X=kCxHm6ykn1f0@b6V-;In5R` zr<KjmY2}f+RuM6$+2x#82~|;ZTD8=*n(#TTc6m;#TbR@8H*RPR>(6P8%Qv*9`5T&p z2^7cNoYrz>PSe&>^tR)=b{=V2XZm@AOucGi0!uP*-8ZLoLr=(@=7wGlI(KkR>pMQD z_50?ufkbluHw-77bDAe>P8)_1FDHEs8<v_m?L2b)1Si~}BsM9D2NW}8CtlRfi_;i! zpVrOlw0>ASt>>j*)U<7<_48mh`JL9!dM|1}A*bxjriu?;)Jl98wZq#_>(-i!TG<s6 zKSW}%9lWTS{ZH#v`z~q~dr4zBt!n|N^}3~t+6=NLgo_29)|;+S;dY$X8*X3FTINX? zM7o`)b;mU6kR2-*wYG$5ZRQQ9byHUq2L+$jJ0mY@E(0?qBsv#!QR@l6sJUSvo{NT@ z*837JYW<#z+Hk=|ZKVIAHsrpjd2o-Nyr_*QagjHk#+v7%HhFSc+ZK9SpGZeOd|KaH z(bK}X0Wkm5X}vn_XIf3t1+BK>f>vj{VA3k$FKG6(3tA;q9lfBHIxc9oV;8hC_XVvy z;b&UW!UfF=#ZeculGzJd{^bi=0X?+CZ=&R;FK8CW&$OIbI#hlaw7j4TTILc1WzuFx zT+qy67c}h=E)M2!ihtK>ef0LvH1E;{ZEW`iZ9K$uK^wYqLGw^l!!R;;LF+SK(E2Z3 z&<6Hi&<3ymOzSyuL32Z|_kwor$OWzQ_yx^H4^4aXX}$Z<1?@c2d4w)y&IPT*N`~G- zQIodCc|mKnlOfm3S{7t`Ue?UBFKd?hm&x$UTJH4ATAqoYd^`mN7Vcft%)80u9-h}* z$u*cCJ*}4>IiPIE4yZC@dEx<8fwbeV%s8N`p!)a$X7Ud(lYfAj`~#{!>Ht0V1FCWN z0eb8QRP*fvybBJ{V?Ur;gAS-R(m0_VOdb4mrX5f&=yEJ--M(<uvk|V`ejii4o<;4j zk!L^0y8s5z`GYWo+XKVvybE9yw-?5U8;1$pKA80KE`ZFi>snU$buGL9i%NSNHwNan zbEEG#ZPK*}0>J!E0^UW1c{g_-Nf?;_nXrGM1d`sSZ1TER2IaOdstWhNDLW^ujJU2< z**~GGr>|=@Pz!ba^IHA<b**9cy4HB<y4E!Niq;(SisrB`YAq{|GJCVAwc&R97PWSd zX;JHNFKV6kMV{%usV?~m)s4F+<I~EGyEpyQj9C`7zKsagpT4LKq%CTLL5tc@*rMi< zi`sDFr<uilJ7bnb&1+cH#(Zz)L;XjD_MX%F44mJ~P5kR=zA^|GS?S^I(jG3%NDKa) znCG=zQ$e`MgM4qeD1bu0=e4u=i@>@VE{YA$Yb8e?5v9|Qu<g_%qHOjNQ9l2OsJQfq zurK^kt6YCXR4qLsstX<wHTFkDEvAQcCtlR*6EQ<{?p2LPF-wfZ4sr!MNVbM=Ir)m# zTEk94Y**y;yr`Ml-Op<s&gZqxoEJ4$&K}W4MEBlbYdtG$MO5&-)*HVY<H=z5KnNE7 z;emp2s~8Lk6hr$0g$IUVYmbCcBPInf2IDXRzQ_RfFu}wCGC4L2vLge9`Cy>1#0QF; z#6XkCJsK$T4h8a+N1!M;7AOi60>xP(i#&nC>I)RbVZYN#+<~HW<zJNTYM>~)5-7@- z0!4*AP}psOqOu>W3s<11ZU_`LC)lA8>YzSy8{b~ALnGVTG$EUVwh2cfMixnIdwS(X zt&KEJuZdk)T<pdIozu^2E)2B0p!?E`S`UWXZUeixc-WZ3&9>{#7qx!Bd2L{xok6a! z?fRwdd{e>BAZ*+*f*i%~C47wdapVMkpC8+<2eC05GIN8yhb2b6z-o?QHAm$93qLI9 z2$pk1{?T7)1%4P^2Vr!LEaLkXE3z1W$+2H)Rcn#T_|Wm+Yc-3PwW>$ori^de_?6bW z!rnow>pH{O8Jr_J!hWGueRh{RXgcvbZTKW#niDu0!B*dU+3FiP9>Z4ONFV-5<P`Cl zXz9cMM#FnAbr#I~Xn6baAHWah$4={U2MK#0VgF7TnBR{-3jgEy!TbdNX#6qw!DN1t zfTsxf4+6k)h(xi39VRS}Ft8jU>;r^-kg%r-1M^Y*AHx4(8WEiT2w}$v`zYGsV}yU4 zFfbn{>=XDuNxCgQ1@Xjvn)7}J)nfiE2@*)~IRe4_dHi3%pNJnUUnJ~Hgnb!Jz8)aD zu%B^3C!CWnX}zovyD=K+SqKoVgtdSJnxSJpK(vDs+F;yg;4d@`!V9C*?^aWXO#Wgr z!C&}}`}1k2zZi5I#K0`3Dz`CJS!e$a*0+Zn48pVVZZ(8F8?!?cLILDM$vSnPx^4B+ z7OuXanIm`bStUD}KxY3Ak=O7`EjOH<PQrHZNf6s}NVb?v2oN>L14K2l$+SB_G(v;@ z4_bYA00ucU9<b%mm||Yks!p&QVMc(k<F5ei1f>GzukftDN~wH}k~&F|Zt->Eo+0iO z;Yoxi;|FsJ{?qtVNdxD<L8*R|@Ne<_<I;%yHj&>U63%~@h;&Z=pYT26zE4~R;b8s& z{vYE15q>bA!T)3YCj4N@BrJ>LvcXK8g}RUfxsXS?eA0oX01Dyk>4UnZh=Z&oE+#<< z3BX*+32cOw5mrvdwx|F*TH3p;RZXO5;eoU4F43^86?p!tRbBhM78Z5%Wvw)4S+jZA zX2{1jL(6~GDln$DL*?vWw5mitc{s@@4|9LkYENL*1@+KyqUv8ipnb=*_s?{aXm%Mi zJD9{h1?MZtP!*Y}CNni;VvAZbT1R+28Eqi0k+>$pz|u^(gRmCDS_uPl8|j>cwG-As z7??ZpyYP2$J}}L66Vby7+(h<rz&R51k)WRhTMQ6CNc<4-9^!|IA0d8}a4>uEkKrFD z?G_W@gGu72h}VgqCjMFcU_Ot32LCMn*{!Dd903=2Ru>7JBM{90bs8f;w#u-wbw&YO zXCS*Mm2T(+7qlNmqd^-up(Tmk(%|(!^K;(RCO0Ta&M|@Xp$W&3<BKS~BiA%9{^4m9 z;IxUoH`sE`1OND+(7vlTwXHu#_{ZN~6=k7pq0o-<4Zo@7pNtU&4JhaM7);+{gyl#K z@2eP*?fHvlMrK0R?4PvPHz((qdNRMYS$XX;Pal6ZvWARQUgCoo(-ppu$+)T6XZd30 zGGEL<=>p$qz~P@zB0r@>z<h~P`WgP8;|I(0a2b9<+%JiHfw(Kgf#p|(y~v~bHQ`qY z{|%30i{BFe5^?i{|BmqA^Zd{AW4=bf0s(&@0hnLL{|f#^{9yhg{_FVv#7W@%5@CNP z>@S4fzz>#X_$%Qz3BN`7ZPLQ|JA~aO>>gn&gl%!3@KwUsc>agidG@ao`8OiAc#X&n zA~$JvIc)yH$D`Rua|0hKF1%T>`>M!=JorED^HS<r(rmsZt&C}ka?bNV|1g66HvZRt zjL2Hj{>MHqw*Sk2wDI2l|K*=Htbf*~3jVAeen9DnR9%y)`gt%{E&N7%lj-n71o;yL z76ZWnsrunS!gde_mLS+k_!|fhCOjlnpMmplBs`Sxhg0>qFyj7+xHl0F=11^{<KKlJ zEN>?4ErjhRY!6{zek;d4ni{DeHm2&^-Ue@ncaR{01YmwA2fPdaySX4#<-f#c$~`f( z;UjE+bBs2P?8NVipiS?kO(WazJN;<WL9}UPGkyo*EyTAXCl1nv37bStU7`&~QED(2 zNt^clMH`}%?ZH26piM&`Y5O^Dfa3=v*mkk#2%q8|V~0XyNg|)<Mf!;XC`@1n!>IQP z3%$7<$YsyHJixX>U|S)wahsujnl=oL(6pNgmXKGp8a}6}g*vG4`|lkNkyX3d0Wrc) z*!X;+3|YRLu4JY#s;q5BmF;g-<{d_52{x*nhm9)t5u?g`i&5o2YE%X9Fsj0L8`atO z7*)}Gjmr9%Q5C=6s7j)Zs`NjM$`)r-WltMb`A3YZ;^RhT|CCWxe%7d}zF<_<UpA_m zuNhVCDWj@8ZB+H&GOC8}8dc->jjGA?Bcp20G%AP1s9N%ks<p_d+DeVeSz%P|)kf7( zZ&aPlM&)WVs;*9>>h3YBo<5^;4;fYOs8OApFseS?sQPD&YT%+#4gS=qhMqSn&kII1 z{A;5cdC90ouNjs16{8yalTnS|Fsg}LM&-L_RFi8)Wtw`;sQ!bFB%f!Ct$!>$3~_J- zJ^&wtr-4tp#fRX-u(kOtpLC0@|1<v>d>oF$C*YItDTs$p!)M^LkN{iOs-FkGvKNW) zMZvbaU&8TaI00XQufo^hBzzs7fm4tK$&do4Ar-y>--K^L8hjhR1K)*o_)qvAd>=C4 z2k=Aq5%5jB__5f&l?5S2Mt`SeN>ldjAYq2g1->tb+>juV2Nv8pP<Ztf?d(Ce^D}O3 z=f@wu?&XhP!}tSOq%|T)l%?{=uIc>2E7Eq7KYUGM2j=)>Re|5`NmjL_sUvN5Jb(0x Ue2q)<-L$Abo~#=71xfM$05<-XkpKVy delta 88171 zcmc$``+rndz5oB-doq)TB&5&;5=g=|l+Z$&+5iC(E-Bd5qNOdh*hWfE(L-B2MNLa6 zagt1Klly(=e!oqU$^Cv^G{VPd(Ndw7DpovFMT?4xN7Q(}&!Gn&$L}BT`TX$AYrWTd z?LB+1S^K@;d+jy3c*bY+j8B2TclDqqn#e9U5y|QcIlWXXv~?~W>0GKEE!HLs?@sOg z$5U&6U-nNN|5L6*U1gWL{Pq8%uJYZw|Lwoi#<<$4)r|bSOZ%Ps$h(8uCex~9`Soju z<VJIf_Q`X)_Ph0OYj58#YR`I6zfb+8_Q$oq9$7J}om-%Vk9_>L=5EmvjNg7o`}oD$ zct5E-;V0FJeo~X*CzVJ2r0R&Dlppt#ier9~6yhhztNbKoqo3G9{UkNQPtwBuq#(*q z3L|NUU4dPRZMq<;n)Z{lpP~IMw(-b!wFe)zy(g;Ryqgr>bd#c6Zc=>RO_DG9Nz4yL zHHN!MQ-qr|FLINXjc(E!<R+$zM$vb*eUHpu6lJ?4D&~Dr`A3&X-jOAedwhxH99tr$ zABmcUiMS<FcVvyE>|G+0yO)UJ^*!y$#kLPbrQBX3F_%S|KBVP`#@D{5ty`Bov_z8n zXgRe+Mm^kRyk&`$6q=>ixkPL~=97Lx`zyxy_q5<=X0Nc1OBP8=SSq%k;`<2S&)64a z$Gb_^F*nKIi0>-CpX2*aQMvn<O3o2CnWb#|1@+fN6+10b7_wAMAJZQXzN4*l8wg!3 z#*^=8{+3zlY`-+V_Kp^OPt32_$93A2yraeHulpDuy`yc~H0dQh4|z-PlP2j~Z;}2_ zd}P4QB7-GfG8E`8!xw{O<WIVcuGD4hbuqS|*8;3rxBR8=roYUxw}D7c8NB8%Ls$J} z_=dlXT=$pJOa3x;!C%I&_>1k=9OZBL<WD#=u-y<<8Rsc6zoqVX)In{cr`Ueaw~N*! zcuG;JzZ4hxOUzB${6SP%8lQ!h<g&j%O1Mgm50I1-0g{{;AX5nel5{jcV*W(qTkH_( z4hBfgfdHwE3y``)0b;m)Ut77%_GeL@s{*8KBQ>D`QtlTZ6^q#Nzl`YbYwN70zldtN z8X&DV0;KJFfXre$E(A!&l>q6y93Wk11H^eIKy0@~?fWaIzR^>Rz5qG*H|%*)=l&sT zc$KH@6HV1S0%WojpQfqlngE%p;B!oxs!HZ_bMe^&BrATIWS>|j8Hvlp;--bGyrat` z|Hv}QIlfGCkHJC82bM{?SyOq3mWh4uGRfS{M*EjZR_rn{e1D*QTQ|8IAO1kww0PEC zQ+@lFOK<FQDU4VywtI|a?`qrh7!TvwceUWJj85?t-qL0Kj4l&Vx=b$8Wy)KZ!t=|; zv_Mm{VDr>e?$mP08Cx#dEz2dwOH(FqO=UWkOGfE(nWb!7sHqqq>hIN5O7e0^&ZVAm za(tlJ78xJDt1SzTS<E&E0>yM6`-R-Sfl{_RP)hgH#@ASULEEFp_!-Y$&>jvoEz#6~ z*+*v4#-TuI>I;<YDcUV%yOu!7X$X{jXQ1R6eHXOP?2GZ&RO*o+DLx(~MaP1~6u?JC zE|hHVg)&PycW;p71T2)C_<O{*j6;kF5?gqX#4P6!BZH(gglz(i4ex0imdplnIzgIh znh%o3+q4Phv{o3eyvJymUEw;MAP$G^S8~d$@U7O=NJEg!qC*bq*HFJ!Q>Tc^V--PC zea%B^93El{;k3cFj?cswb3ezN5G=N%!IE+$Sdxzii)lS=z!u6e#|2AHte0dT^^!ds zG*z@e*eb=b!IEURNHV0<Sj5(1k<=-Rq%~ONZ5m~ubyHpv^MIym&U;DqbuZB$WM@a* z#fHXg)KqKkJ!0C#2SG|0CweVd8m|V6{tz~FUka9<3)tbBa$XLWNr$_nQ0_l(<o!@v zZPg!U_?8CCbRh>%d8#5<#?yjj&u29CHjLS+k6R#TILX??6;hY5LduV>kcuNKB<2xK z=@G6k0Nt#q-nbRgcW8yAP}a9_oI?vF1=Y85!dDkad(;YXM6M9y#scYCMZ2w<RXIaf zh$)f}gFV|g5ok595ZiW6aEGR<6FlT+e7Gq}Q{Q$G4Ldb;W*6VqZWIPDtq^^WrW_3( zVng@EaG{)8A^qo9$iN&7QSRg5rdG(@qkO|HD`dD~g$y~3?u**yLzd5KD($iDVt<U| zc%0+-9144$(9}C{<_nz6%iBeJQd9Y-BV{sTrA&pdl-=0LFdDK_##XJA@r^5GB6Ot; z`mL0qMJwfu5p_xP-}`d3Xa^W;y^*pTZN68NWONpipVCzO*6rdDH|hA5CY`To($#Ab z=Q|eZ9(9+VXQHI{MRz&#lu>y}+p}1EnxlgBqpM^VwLN3}_7Y>cC1RDt9Hd8{Wxr4y zvPz0pt&+lxtE3=wmDs+>cF!@4f>z1fXv`tLmM^hA)Of6tMvpDh<hMoY9b2RUTK8{} zw)id5?7c-=%v+?RZ?kklJM|97X6dfkEInzPrK@4HI2$%UFZ~z;xtpbTezWwU!?BxX z<oIS8q&!4<e1D`&9E_CFgPUdSXrxSI&xF&J=+xdwsf}Mv9<iD{Vl{V&)#MSY$s<<F zp689<zOQZC_xAIA1(Db<@YO&}94Gr_I>K+YWGq@OsX?pdEOv@{wb-=PlH|Qwl08<7 z`9+4q5lv0b<A2F`=||eP)|y||)ZU|f<zLa%*{^D<?bsS=Ke$HB$EbgW`d2k&to%Up z-`pRyMq<7}SACN`gDIZD2XDW@m9c1zj0dfeiGVe-=S{jDY$w@6g7MS`+O|h+r!+M) zw?>kN)<|+6`Nh;4v5l>f)Rr||;cFz_xkfS^Yb3LDjo1s<NLI}n$*x!<IcaMoH+hZZ z<*pIq-48ThtLZd*hxC>xv2*QbUXK!6qNcL0MM=!JG?jA|d%wHnp(ZPB!Is1|=ft0^ zsmkl_QpI&|O3~Cg$c-0?O=T$W70HVgNinY#TUwMP#Yao>!D#6&6iH9llt(80oW(V2 z%4Vlv%b_BdZphPADA(i}RLduroLwseXVyyZ`L)tFw^q7`)=E#`TIrfvE6%aCGT5?K zIvdu?uyd`9IM&K&=~@{pTr1->Yh@xRN+#3R%9NRdWnZQOO(n&LNb-pgnWbzi<WAFd zG(?<7LZth6i1Zu_5#z-VwUyS?6C!C4Q_f};Y?dIB46~GNmBb^z5b0YKBK<)jG7u0V znVhQ~hP^{%#3Mvvs@SfY3v)h1nr??k^R*CZxyr{<XRG1Dyc8mi3$&}%ROjUo(gTj- zOo*83=#D!2r7Kay=`CWbr(YV_StCOOIyk7#qas}n{LtMe^oK}2^$pN?M5GBE@$01h z#5(CrTqhj~>%?)zBwa_=$(|<mao!|vqxA={n>ols>!fAxI(eIN+x~UZ8fz^6v9>sP zPYZ*4)jASpe3bXJattsK#CC1Qo*!#p*x9D3ilKE<-nUMwrq)U2*gC0cStr#E>m;UK zQ?-tD(sg~EIB(uB-PhKM$-()7t%K9*<XU+5$6AOrWQW)n?GRHJPOv#~c5BM52YNMC zVBR5xXLpdE?~vl$9a2)dLrRl(NEsMoJEYvcLn@%MVuw@}?vUzZ(Nc3_yVM@qE_DaD zOTBZuG>mO05^a~J$nDa+al5qOYqf8awyAAmZO`2%j*4y4adn$?-rgo%;oHOs-J#p0 z2YM5?NuT#N={IkafuL<N=&?<PBDTqJ)HWFj*(RgZja`eB@vD*I*2h8jbGN%rB)Uu_ zf~14%B{^=rq#Ro>HuHK(4cRPd`!`E^;$|YudgG5j(H48`-YmI(>&0!rSa?NSx+-+D zltNk1dNDlKOL^paskpjXDoaD9Dtx_E-!}GL(Y9@M8|34LU>HVV6vhZE<AfFX4A+uA zK^gQ(6h;q`;s^dz3$#|{ZV}rVdi!lg*sP}Nj%<-y@AXoXyhW<LLuHma+qdzb#ea@( z2W<Zb`#ack*kJoE_V=*g!3NuT?C)d$02^%YVqd_14;ySh#J-4qiD5o*BUHw(Z;-KL z56I|^%`$UylT0T(AX7It%j9k1KBwGYLEMjcKzh-U;}6L2l?^;G5dRY&kj^WcNwGJJ zBYcarM{bcG>bnm;AWrTTUC?xJi!^d$Y2b!ae=}6t0=NN%a^s;(n?0UbFG=*z>?Mxs zeU9o!9MuOLCD<-=??1Ldp7@aZD}>~@4KjLY1NUE%ArCHet~LFqBt;P$q%nL04`D{| zN7|CrF(0vypRo^U51{@k+y9&d$$NvC{*(IuG@kuPi;lMag8FN;{g^ti{Sy0E*w?Ya z_G|3lV1I%Qwj0>L#r_@k??&GHNPAWDGk->^q8{Y&2j5LDwz!RA`oqXmS6#lEhk5Mb z`n5mIV-JZ~;=?@UjJ)@A?RhQa7W;tm`B1629V(T#7*ljkHD5-}^-!_>nf?8j@zgIg zo7Emhjvgi{$zjCtFiEWlleC&JNiPhOjM6a4bcBiB875f`VdVT_<o;ojI~68*ePNP6 z6eb1d!=!L7Op4BiN%5I5DY+6RrI*8`>{6H*7s90cMwnDw50lDkVN!K9OseO@q~>;* z)ZPk{x|?BQtzQ%-4Sr$L7!W2+L1EIYg^@LcNvlVgw0Va~dw7^QBEqC2GE6$7!lY|e zm^eeixOBs$XJeT3GN}5lGl+JF$-v$)89WpwroU)v_!fidV3>@aCux`?i8#AiCeASE z62cgK56N`EBQkUQq0dN?Ib4#pa7jtqB(~&Dl3KHghdb_=L0r3qn<R5;i`d7uNY=*9 zlFbF1Gi6vl)>fEr(}#aGHhiot*ZyYg`&j$jflxO58{g#LxvL%8Bm;4qWN`N;iJ9kU z7^)-tH_0eRI>r%?b5s)?)#Q;)98I`P&xecY9|V<<2$@_JA+wZi|84yIW9`jlqN}=! z2r+58sz}C#R@o!Ort8KxeyMF(XVR6)O;`5I5t4Z&Lb5MJNY<qYNxL2)sW&3zZG16i z-I)EQ_Q03@B;zqZ$$ZFD?CU)x>!hz_zu+r5sY^w-=*k8$?(FU!T@`+NffR|C6hGlD zv)DEdUF}(*tG6M>lQv$ungyFT_CjnQT^T?7mA3x1_N1NSc$CM(rMl|;Hn%)~T7oTr zmdkY2t9+#I9e3&f&O#ZG*TuA4S3|#I+5#Q<$pWzj(k@6><D>V>#B=Lp^7kiX>QCR0 z>A!qkX5^cavC3CW!MgI|$~Jag*L)wft!9fg9K%{Z32Y(Q>*$h2zLFW>E5>oQxE~+b z*5eDM?FMYHJwW#!eMF2Wey#cIvkw_hUDslw9KJGa_m!b!Um3jR%OfHiJ*=y~n~yNy zd}Zd!BQkx%Po_5d%H#=OnK<Sv<41gDEZ$c}6O8w+Yo9$r5Ho#-&j8yaY#za9Zl;e8 z@fD)BE!1tLE|NB2+lIXzdxx%i=7?FSEsDBmT{RP1T8Kfd#3Iv9+JkMEaq8FFGV9Dx zgv9J-yFI!ZNF;s*ZID^Y!*hNza+Zm+820ff{=IAmlN`2fpRT6o+3vHtN(x`X@ZZ9W z$!4)ZYUmP4V_GZy;1bD*Um}@IQrR6#B#TL_Y$mXBn5@dZ$#hsalS+^2+~IwHqdj7s zeT+kj<)a?gRU>m#O;EqrO-!HTW1-DymS(U$p{sW1Qt7;AmYDszaxnkZT^K4o$?L`R zd0jcLt(Si4W-0gX<!<@~_W2}-e}MhKkez)zMcvcXJwqK#JIp4TaTwqIjkeSCAiigH z<$C5cUj9UT#DDfhj_e4vFR{m4Tg3ElYz4;OKhgX*nT}zB?G;+Q%06Gi2HSD$uj#6x zk1hU#I<UQNw0@$6dBiP~m=iSpy7ALbv|#J(*KvJ==HH|)o~B@XLstbGmyy*hlcMv> z2#w36ByPEs?p-crOv4yxIaE-tM62*sqczyIXkFBDo~O1-!<psMcy_roU0E*8=eJ4= zb}O_&J2=j4wMxh3<xD{>m#(y};%wO}-OjB%U~QG&>szHSc{$Hl%VnTuInPsD8PUsS z7)D?e#*(+nc*Sy=fXTC4W$M~iM)p>jp}dPeOxy1->3jVpBVj9%(qEFj{UyaCQf!C( zCC3~oxtcXn^1LG@U)w4%Z*t8Y^p~uFNXZ7%NnMpDL`vC4rsyaaP%cc2l%nI2QhY2@ zsy9YTO(;7{p!ZJEdv6iqc(AL4DrmkDDYMkII-|vQT37Y+{?fp7V&e^_{}Of8<%|^P z<{>`lTU<0~`xQ2z++mNDPVB+tNExE*?wm0_&^s5&Lq??ZH~1Spztxs}v4tLB8=F;E z6O>b<f+Z$NSINv`n38psfzG0~6kVm}G7WPSzm1dZ!=K7m4fc3uInp?RbWR|HIxv!M zYFn+%=b29l2$ojnQ0f;g6H_r>u$9odl*6A7=9z#!m2vc;%(VmsOE);#W|lgeLA!FI zPDHTux;RAJ3a$aBEL_tTvy^R>x*B7uViaPkID%?E18g<ARn?QuHYAc4$CDQ`>DPQ{ zIe9T7=maB(FTCB!2)Y?89n64rdIw2YK#(}KAn9i2tcO{&-c>=;2mQ>74crWpL3@x4 zjRnatvs)uD%B<EHlXl}w;7!B@$|O34J&n#3YJnq9|6bdmnJNOs82lp-T`|oZU<(Hb zwpLEA4Z9s1Y!2)W>`rX3bzwVoRdmZwisza1zs96LTIP!9P=Z~GR(kkK6_i5-)IuFp zLk*<3;vM@)8e8_XB**(nIu#kEyiGvU*hyBOpJYR((@*Rzev*eTzs65;8vG;|`g51c zK<QHHMf=j0%5X)LjKCoFP@3`eKWJ;Lc3)|NdT4-FXoF^Gfez?|c5py9^gtIlA?c_u zvwj?F0=F7SrJPpkE9sYgB?B_A`-&a1Zgane9Ll+nhc6$j1rA>+grYHDDTWftrBD{S zn91zLQXaIJ2dMj`vgSS>pzf3E&_!H$i=<XtBy~`KaG^95`bgttA8CT-t3J|l&_`P1 ze55VbN7_+`(@Q#9yreVKOS+=G#JS2#x<hzD5btFbQ#aol*n0TZdifUnu))@kJ%Bxk z4YncdVeAoXu#IAmaThw`C8lv*r5yK?Su{1lOKcOmN{?p>b5duz82^;6vf}WguK7^g zw65~@;-BH_EjU91#c*M+a~3VROoKDJD!W93w{=y16aTEPD(CT|uK7^gw{=x}4gXm} z3uk3I$Gs~OKibTh+5QiqhqH|N4)?4e{BuMW4r~_fI)?wdy6QfJ-}*gnEo@+VM_2vl zJ!KXhyuxdX^RD|auQR@{tI=DYV)_AD%}xC1<PH4qa`U>3{{lU5FrG)qcpf3+#RjPf z@sbAVlrtdnP`ub7%Nb819#0w?FS(9*$*YJLYd(Jp(&D9%>6xOMcqyh)$xytM(x?nI zXjo2t#i2K(GUyGd@_U2Fzc<Ll-;mntZ%EzMH>5uF4QYsagQuxCq-pmX(hM!o3T=sR zNc-M5#DT8^?W}l{!TY8-Z@nqq^KV+EheB`eo6=YMru5gm$p!N!7s#73H1wtnPrWH4 z=qUCW_2c*_&`Ip66K~2i%)oALH~kmlr7v-}^d{^U(|e2&u>FuRa*;7}2^(zhWB&;I z18lHe=KGKNkcuDk{YAygELtB)H~xhBE7bp#hR_*A-AB~@jJm7DT=Q<3MV<7H?dR10 zC*y|$F#Uq@bcOw(WpnthF`mxGOU%cNpY!;C$#|lE7OkZ|-TEsAlxxFZGlFm6K$EVs z;cpnh*VynAM)XbmHyFVU_|eSE_<zd?9>f1TMl@ejQXyjiQoshOeT)G}hYZLpWDGzS zWJAtP#=tz^f8MQl$>)O$_MVc${imep$SEm?5-5c-FrfUvTT&77mQ+HO`7NmqdP{21 z+WEJ58hcCXu^TqNC5=&UNmKk=(p-8{S}IOTYwk&DOFJp;;IN;R4(dAbby4rUb<)Zp zILR0|$rw1v7&yroILR0|$rw1v7&yroILR0|$rw1v7&yroILR0|$rv~(Q%6tA^zoB2 zgC^A^NU}3QQic+^dJ-h{Qi7zNNs#p036cSs7ZSt{S&$7mHCD+@vs#()v`T)ORSM>< zQiv8|7o#OKC`HS#4YYj9DivrYb`@HUzXq+vu0!h+t<tbJk*gq4n$YIBM5fLXnbb~{ zHncr4Q5<Lob|>0}-}yL&ZYp}v-l#<BJ9Aq4&z_cn%co`V{An4wd0K{VotBa7r)BiY zX&Gxc&CB1@GBI{qCY`5cs_?W-m!6gx`)T8cf6*TBTOirx9+LCWQpuHjC2#9u$-j4z zQFU9}xH9H14Bp!eXt4d2LHjrCzhguCJgLXZzcTORyz;1YU4B$L&ps+0XC9RpH<L1% zO==c~uaZJrOg!mPcMo-7^T1w!?THOGFKln@g(fwA*~|-DGq0>q@RD1TS?Z?uYhv>; zsnM%$5_2zYP}3rl8a(4Bvx`i-)i4FyVw381HMq~D`dtltO{y!|O%8COccZh|Ha|XS z2_LkS4+fh*c7RD4PyS8ww^pa^6<e@L)keBWT?BhtVNw-W_e$lBy;4=NS4uqC>q?U< zwYy14AMf@ZyyI`+T|Zi|`;_EIoRYlIQ<A&!l;kWrCE4avB&Vmu9&}1FxgIhOo|5#~ zQ)IcPB=y88UcsH>CVYxl$EPGI`IJ?1!rf(-56Fvh7uzZhZ8e9whC_u6PQ$jAx)AEt zQ3tmBvDagVVncNR+il<k{M>mJXHw;wyO<s{sY-A5i<YdyztN=1(3njoRS=6m%%qC; z;t#X3!-v>mI6Hip9fIvMCgr$J<435ApblE**vV#-YP;+%^_TE%F{#E2>}ab=O&)RQ z0%K>CZISG2n@NrBXYIjm{M-4$j^RUx6WnDa(Or5^xJzF=``cks16B%2u_lR$GO3ik zCNV|xfsjVowv!%k`F5F<JrUn-d|=zdhs1Djk77eXkVy&yXusE_iZ`01B%E#bamG<5 zF(TRSvwWrACaKg+GE2Fd_BF2dkC~J;Cf20-aGD<HXmIqy5cVL9u!CV3qiz%?j+kWp zxJjna$pjt<(dlFClx=OFGpW5#aNzqn;|7zQ{k%z;pERiv2isP#4;ZKI-UBA}ch^(T zQw;G$lT~KoO_FqeAqz$pN($JnFO*bBTjV3@em;`n?IW3yK4Rw;b(ZEM*^mRdkOya< z=7SFM;m`7&1|>={(V^tHkCX=aNEzE2Q9e?RR$y0ZiBiRWt8e*8&5)1OPA&0~I&-3= zebJ=OKF5x}#Ezcl+W6qVwbxfC&2d<ea)ra<&{F3)tg8zpJw%g?I88E-8h_qu`jXXL zU{dvmG--gwV;WE9nlvY9(gLlN+n^nv13E78^1-1?*O<=B7hSq3_dxGuUHYK^x-J7S zcw3ht7^XY|qxi;P+`;Mrm>gsE08Be{`jCpWo1gYu2K|)o;Ume|t`o4KDYq9&F6W=a zx#uU)Z%63Qjr8j(`jbA+3ZTD2eWV22ps&l&LiSOh`A9K#5n6Nh(|)Tv>Ls<LW)=9$ zsjr&1x;mC@w#7@DZ$2(<Xe%_Nc}YDqjd@8U>byZ#dzEbV7Fq3WFL6-U4xP{eLwmhs z5Jq4a`lh_3x6exkTD+v+;U!ZCy<~E~m&}kaPLr&TaS)?Ny=20Z1abU`mneo+=@WWa z>EHDytw{&D&Bb$@iy#Ty>&cLEXTHu|(gXPJO1aJP9p1Uk)x5=n^fwt&-;~_kw<Pbz z*CoH<loZ@}i~GDLMSC?V=DRLIOR>vPBT-{&L1St`V`@Q@s<Bg2P1vr1TBzHt8BfRR zizCl^N()J8E3}<w4BTQ2%rgeiPVBBrnmDg$(v9|D_YP^CwI=;T#+NVan~f(lJ=3_T z>nrcw^JkNK8}|H{Ngd}jjh~tH$J~-!rS)!lpfx$sEJ<M7Z<Z8Di!)0qWW<^!y(K{^ zxWSZjgQ>c1mP)kdwpprgnxziA7A*`gOTi+u6k`{mrT9yzH?Yf4M~kPl7kWzPl&5qw zcyiR9ygu`ko>j~(g_^mcM=l_}UBFxF1-v6#z@yCqX@<5TPJjOb8KdDSA2NZChni&y zos2ZIpv6s1WA|bA9x}@S+K)Yi4&om{hq0Z<`9SE=0`GK}V?PJvK{v8wsUy)!KZkS8 zg>tQ-t&v=dM?9sTwM<QD<1tT}it?06m|=Cq^m$L_5qYAAiH)8zjt;>fjKDDTK`#tI zKja?QSr?>BexfdUVBfDxCS+62`ZxZh;Y@`L%IT2ot@D~&=T)#SDNvK_Db-O6r4FsN zd$O!<A#rJ;RD~{-%7}$hx@w`6++c9AmZmIZp%k)h0TeS}igH=$^n{yhG??|1)+Ej` z8CWU8ys0Ls2Q*28bjX0r6O3)hf^5h+ph+&|K|U0OYf=bBPz)s(7~4<=29)1qY(phf zLG=a3Hq=7h1&tIgoDPOYXoBVg1UqPjHfTS=K!*<KgsuY&bm)d2=nZF}Lq80_;010v zFbpFwdXpOljKc&>Uf`w!(=c;E=eG8QMSsXhvgpsaS^ngvjEnc^Cp}%4gK3$W1yXLt z7d-Sd<KhDSb!$?jyCg$OEVo=pJwzaYbjlf!ncyyV$fBGLIrwrRk8(Z~M7pyS(p`$? z+@%;wF1z#k#a+rM8&H1Roi{-4Qc1ZAs`1r8E#<m7cRDN5oy)^r8e`q137QYNGezVs zt(4oKJ;7ZZHSW?uD$q&Z&~?yVoN?~bjrL&oMvyO%9QC6EM2SJ7#Sl?r7#+bLy`{0P zOrx_jI!mLoG&)P8vozkE>U^b-dg>3lea;muUiZ{rHa_swU$f>Obz`oL6^@Vx_PuVB z3E7YZX-C~86*3?llFe?+PLgp#3e;S4lWM4gTBtbdCgo5Cl~8((d=U&NgF-X;A{0ZB z+081>qi)g#J<tt~y>2`JyOHv{@c`^5El1s?8QP!~8q97|4^7YrQ#XhpFf(tKX&5_C zgn<dl<1mD8@G236@-XzVO)m^UzuC%}#&P9>9Wo(5fh)VkO>z^t(!mzQg$v1$1a~j& zv_vj+sJf&}B~+Z%Idfg=Sld?%HI%ENI6x<svc3;3&~!2Il|iXlmlEqn&h(%z9pKoj zbLON0@uUObq}&Bfe!4V5gIVXybyoc7(h4n<n_=RDkBrax$k-Vl8AWGq`p7gog*}N5 zux&r|(Y6;x@DFR;Xeql5^2d10OJ5sUv6j@3H$7GBO;YVIQnQ0AUHE2qSj2eEOW$Sf zP1-Jfsm$qS?&NjSLo!(KunfH%F2m(pWJK1>XgzPAU*IitdzehfN}23kEmN{qrbk)$ zARkEmcgTdx%xV^za<~G%W096ju7D5B(ss{V(*81cm{f~&w40^#buLrGtenCu`(Ic) zu#&q=5;xVDa<fX=uQ5GqR;eUXvuOH30$`<CxdL8Hm04wN+#{xHv&uP&AI&?0zs5|4 zjK9{binu|T>ddMn0zX<7j=$cl%DwS7m{ldofvM4~s?GS(S`B~GlV+yHY0zv|jjjV| zF{@_R0idqz&em#H?JcZxX)~)1)YNWPUFa-obC`M6bV&M!4w1_ol9&!N)2~M3LVbgE zwiADsnJHLKAd(Ecn-hp7n?`dEk)HQ(0@yLVW|idf_t60!_|en={QYzQb`=+PIsTXd zPGFAIzTru!o_bR1HhxKJL%$@ZK{^2cELuc++YlWPK;|7YOb<}ev5}pRa1!V&+8s)x zQBDFoW{i_|`Nz#_7Hkuow5xuSPH@#vnbkONf@aat2)3Ixt7+_*8MB(aKxZhk>Suqm zsO^lIRT5@3%x9Wrt!6dI2D7uA)hbQC%UL8XqBp<ISz*VV<t+JnOy}qodU+Pjbou`e zy+ZHDe8<&G_~$sQNc?C4YWprHMRbh$9%qS~-l13c_GZy?)OMb)^)mi1exL8z)!+x5 zp=&&#^&vEPmosEs?YY1iV!w@c#ACll54pw_+Ku1#LwX21=AxN-GkOPlHxg~T?$!OR z6CUjO$7VHVCJ-~a%-^B~_7wF=m$)1Kgwu9CKwjZA&vQpQ7{d$<ccfc9EK+B`!e#JN zv&s$?$%&#rKQgO@uKU~1IL#v>Z=*3+`Q)F|4IwYcELz5Tij@E4G!F{X#lPUZ5AygN z@PbrF@C9MpuCdX_oZhV$r0wPl%(8J%FnR3-vHc3)b$0k`>cI9JPVdSK;=KHVbew%b zI?ucy?dM+*$J`4{w$bheUo5_X3opoE#S1c2!)H)u`z-_Eyfuy)m^fzLX!v_}2)3Ji zz#p*x$eG8-$t-I76Lq)v!jHy@>Ca|Wcsx!Dj^U%6eISn8V4QRviWA#^(e5v7dz*bh zMns$phg1Jo>i<R$hQvwQD)#Yr{E%<;i<7)XaXj?INlrkVWSir7c#IRfcbsH;#7Xb1 zI9AZJgL!uF4|?Pp8~?XiCEtjXmg^h<Wt&)3jAl_L-J)i}X0oWR^KmjW7bo>YanjHi zCyi5aJm17ga|_#X!mZ9YX>-J}9I7-<9EEYxQ4_~hE@zR(*(AqFcWxZ-CF1U^nTc^@ z=Vpt_9or?dXg2w7I^VI)Vo~X4v&6VtRBD7-O!rt+N(i|!nuKlhu&7$>m<1M9?T6pf zqAF=Wi<V>Cyew8#=4#;02Q1|1p=du2AB!qLWA3%6?zmk{CvirrcJVgMEFJV%=e1qZ z&JJd&Z)JPiVvB0NMEm<Js_`rz<ZDs&*t6&~PYE_Zi<;zvW0qLdINO<)T9hlvpIvIT zsNpyo_*>K<8WUhq{pMXVaEL7bDEDGCDgLk|pEyize3-0_g9O`hx;&7B4zei6!NW3( zcE%l+jzfpVx%aSi?LN$86$c-CSo$IkGg)O(Em4Q1Kk~2)g&Y=-l@=Adn$Na|9v1sr zi<(<!QO+P7p*S9}DEmh0!Yr!Y<1k}@`iD8N`NJ}B`>+gNJ1nt}&}Oqmja)y>uP6@7 z+*XV7*v__5)bC{5-JJRP!;As8eU$or)IVlX17oy(oVNS<aOYvMjyTvs>0ucwWT#)C z(E++D?XXNG)9`5;KFfD`V}~TO9xr8n2k*~z@VaaVvmQ~B4jGUc!GcoOU1kMD@rW8F zIUZ4x3wbA^Bp(W>D}<smQBn*gXQQO_R+N;@M==viq5OQ5R9uRZ$}3S)#e%--1Qzuj zjh0%dI}|PTy!378i<ZWrXla^@mgXzby#9`s*0a&lb~{?ysdGRF^_>@@r7I#@oT02h z4T_eYMbSJTMN1$1?DvkA0WDev&Cx9Eie}Ep8Z9H?(JV*XNou;2d~Byopp)2B7k0|@ znVm8-wo{UZHc0Y;Xh{Lv@n}h9hiU9E{aCbQKqd!iZ-{1PL^LZSq9vy^T5@Zmd4U)$ z`MJ?ja4VXr=xCmgqM3@0mXhRXDTT72zGyL^ydqjEpfZ&EBvf-Btcf%p&(gz;gns>b z>(u3FnLHCMGgqTYn4@KkjYjR!GC}?L7^mapv|xn#VeG!Wob~}u7ww1G=lE{FWKri{ zU@&}{QR~eSzsNv&iSPI*bzjAQ4Et5=uVKHA{dMebV!vs5nqLm^y#%nxl0L3I`lQ4r z(BLf^B+?*>4N|aEvD2~b*xA^**!kE+*d^Fy*yR>wsiItM1TWP$S(9$?jChM@MAtnJ zO~p>r$mIjL$v|c(k3EnT$zu=XP|k%seECptjK>}*O5m{vN+_2?89oEbLwW3h%19o2 zpqg?G)Z(iPa6R^1=dlMGZ}Zp#&6Hc9l?H9l9?D}6bVTyl16`Dz(2cJLdXMqg1N{j+ z_JC^|bqI#>jlgKAyNtnjq`OSOB;_fX#y1n7bCgdm)z>eHeV0CYkN&wxfBnd!_Fbm@ zp;5Gy`IU?Q`XlD_rS8hZ@D9+QemLnmeFiDF=`%>BoCfIuCi=`opPA@06Mbf)&rI~0 zVQJLAZ@d_w`!29Q?XJ!pbXT!oG*SZer-JS2?rJWReP>gb>#pqi*oE%OQbM`Z`15l8 zl@<Hmqwa_9DoI2<-lyFM_&%iWC+LX9cr#ExWKO^L9uDwQp#JJ3=RSXrihc4PWq;}( zHTMkFU&KCy{XBNuJt~O|KN(WM2B{=W#;<;^e`35Cqz4*2#XMrM<kCLR7!T5y86T|B zpS0}z9vi=7Y+6Z9dYS{VU&H<-JOA}PswvT18leH|p{>SSTA>A+p>xVxI=}(#&{N|r z-Qa{S7?}5#e&~Z<7^(4=VHkozn3(dGaTtTqDQ{*nshox>m;@Vdr&A!AiM}MrsPLBb zLT?^Vy?8w37dunlL<euNk9kXGgEz|xyd@8EAqR@*y`=~Wp#Y2;Zz+RPD1oXeZ>fX| z;Kh}<)KOUrHBb$cR~c3?4Kpx$hG7TeFad*C8I~{%BhZ`7u!VjYfUc_yYv_g^Xg|ZS zhYsk3=Br#5&<br<{?uz+H_!-8P<@pP3u>Vb%Fl4&K_yf{$yF{)C<6lulDTl92#O)+ zDi=27K|W-j;lhV3$cEIbWS@`@z8U;U;>pl8d6k0Nh#1Dduhbthr+auX3TsyCuUgK1 z)<ea{8VgtHUs}y797Y_M4A>xbk%@bliF+8A3}l8Ie|G9W4fIH48>@%1Cwr(ln}@Qb zQ_i4l_fSUWD*cJzxnVwFj8+qPrtr*QzwM#+eVg)G<IijKSA5g;1<J#1fr_;(Q1*Kk z+_`Ry*VpPVTF&jFc8~G?TKyG^#fmq{*cYO&_Kf|6tK!xQNuFO}{CHHqZTx$PzSbz< zk7eJ~0yQ^njEA@`g!}a;&FOz*@5aXU`gZHLxDMumJyly_khHQSrzJUvr6&9s4ZHDJ z5VMOxQjhjvcT?}g--UK!cQ}H?(GVo<=m_>O^+WhwOM3>e`>(U)=PFBo&>8G$>ZkBe zq7&HTjvy=9Gi!ewL6Sj5I*v3Nq@uRtfsz8rkOcYo^Qh0opMz#&XE}nnYX(UsYG9X9 zUy8p3Eygao9wdcVgQNhh!>*;i27fhLg<a{e21x}wD0gslHyQVE_c|D-JOOvFgJH@e zaQ8asr#t|6uY+#NJ#hCr=%CyQcdvt1%58A>I%uTaWaZD@>!6l$9o)SRDk)dN-Rq!? zvH^FmgCfesaQ8aMqnr<SuY)Yg*>LwdNT-|ucdvsK%3e19+_?^1<Nx*b`g+T`NKX~J z-Kbsf8ek_ql|8{&7^?3xo(t6jcO`MNNCwyAIq!MJ^?05JuE+BXa6R7JA&YW0<lxJN zJj(fC_`YXaWu-v{R8mm|)iiLurmLk~2lcm2(g2N=o1htA3$#*hgZ4<1IG}@aCv@R+ zLO10e=uO}!#?ViB00!|5f$MR61U8LEGV2QC7!xpwa|)&@&%j)tr}7x|RQrZK)!c}Y z5~@EH<nc?o<vQK*8|rT0|2_5}JXQLiC>ysP&|eKa_pq0WeZ-5r+DpxC^&+3fj`I5S z;ywAG{+z{f0`J$2_aD?>3EbC1t6o|S&}xYK5$rMSNiSu*xKa1DdaUwPNleTnLkid+ z71AIbG9VM|kOkS01G$g~`A`6bPz1$L0;Ny}29!euR6-S0Lk%z?#3BX?_0Rx~&;-rU z0<F*n?cjh8=!7nCLO1k4FZ4k_48R}^!7z-#D2%~4Ou!^e!L(*!R+obD&5imlbNUT> z#u(eEFE>62<6eI50dEz%(b)8m{<yW%!M&MVq62N`mf3@LV>{8VK5uD48?hVE`UBkC z(N^phv>E>dI*vVtj<Wp>I*mPrPU0Ux`?34b-V5B@(P8W%bg++mc{YVCD(q+`JIF`# zuyfHIc3?wOu#<VQmjoHu>DXy#D*h_861xH|XZt#|7P|(m#$SvUVHctW7r3{hW!R-? zNuN%|$%pin7RxK%%KoZR`w&;|zIJal=P(wA>(5$ae@y)q?2o*a$It1vUwHrD99Jxk z;cNU~1MC0txLW@IA6TD`tMrfQ4I?>Re>ymJ=|W`>Sg7WfFH|1E3svk&?9~g^xwVwn z8Grta{)*M|Z`8d^o3GI381=7WAIJU=?5{6W&LOjOK@W6;1KObzI-mubp$%H00qUU% z8es}1VFspQ3`SuB#)p_19-=S^BQOko&<g|554n&7`H%;8$b@Xjf;32l3`mD$NCIX_ zB?W4r8tR}HDxe&ypmJz6uSzMDfB|Jt2nA3KMPMX7;!3g`=#xg{{YUgy%;_J|H^$xw z{eI*12m)G?m?YVaADIiknD<&BslFbP=F3}r7A9o`E|$y(7crT!RI<chvLEDM608i6 z+(%YOo~)Go9jm3_Ietm}=>7bbWIewn*&rpahf3)){6L7QdgFzMrTnpQsd#ydR5GPr z_1ZS6-o={mZ|;;@*(G(S_wcATxLX=hKP!#$m^9@)F3nxB(o*sTX_Y6Xt@<fx|Jea? zG=EV#<T>f=`jT}0<&ZcB4@<YaC_R%eNiQ>!ec%47^vl1?z&o$VVCpd$y7(V5%ui27 ze)4r0ojf69zhGISuq5r1Q!-hSz|T#rGA&8k%-{7C&G5CELagMRS>&AAylcrpbFuSK z*BjRY)b&EN2rb56f|g>Jp+*$%U(gEdO0)`pHClsRi`LD5`ud~+yAityzw7l$3wA5o z#`f)~)j>rEg-#lDp-${>v?uY?_a}YW{n!KeU9V4uu!qqRb})*LVUME|_$N`<KX;f$ zXGkJ?rsky^oX`cG&;br;hqkGCtF%&Rfo5oeMreR~n1N}Sf=QTwaTtS97=d9Jf<YL7 ze&~Z<$cH@0g&fF+EU-f+WK6v@FX<H0AQfzo0?CjBbx;d6Pz_a32^CNd29!Z5lt3{Q zK_L`?BBeG`BlL$X=Yo7x><VK%LSN?Q5$40&+pYQ&7Rw7h${uH|izGDvKfJxWqA*tk zq51#K?VTum+h3R1LVBavN6nS`D35X<wXc$Ll~ER{M;jNn=?|LIKfqy_x9hL^o?CLS ziuJ!&*_Yj`Jo0@$O{`9D*Pk;!*secjtli=2^FrDb8(-R?zv6Fsm$nz~W#)nQ?^FK) z_J{ZK+p;K<v3&~{sW~4bI$GanO=2xkGHa(&P#f!qQqeT*bTlK1^+Tu~I}6RmpM&ON z=b`zDtRF%Pv5U}R{3U29b{TqST^3q_U5QrVuSRRIYtg!SAHD|)4OBFuO*CjmTd-Tv zwkTH3qYms2v=e_9>cs9wdlG$k?a2?}u=~*g{DbHa_Aola_M_++_Bc9$e-fR-o<?Wp z7ibc@b&=Zb8o8O#`bu;96N~6{XS9C5oBicQ4BTD%F3Y*BMJhJOShib#$(pi=HS3VX zYu02)4=1gGRLW_Pg)bX2Dcd3Mgcnn}UgW1<<fmS|FXrcfP(Zm5%EG<GfD+23Q0eW( z%5pE(@|=+hGYhe>>u&MCN6;EGzY=Jq&_siJw1Eb#Xj=pev(Ogo4zv@0JL<shM)`Sx zbfHe{eslnTFWQGajE=DVAUcFSj!xhoMaQtG(V1H$gy@vn%2^Zi?ksyD2)f>$r(vf; z20=0%?36RX73H(Q73Fi_?z)<PE_=aO1g<Dw0<I`u2CgVy4z4I)39cw#4X!9(YvqqC z%-4e}%r}B7%r}E8%(sFo%D00n%6EV(%6EY)%6Ee+%J+gR%J+jS$`67o$`6Ap%8!C8 z$}@>}C(2J^&)jyGh0}!kxpRw@Wp0tOf7i&`tv_yD+(Qz&&wa6)yT|zJ9(}d3GDd&Q zoW6H4L+N;ozLcwshZ)`xrl9;c9Ii4RX0Z9ULv$MtGpHRq3(dx#gXUuAq4|kC%%FwX zMQAbp60{V%3^mxk9Ie2vM62+--fq`m*P?avn(>)XQ?QkG?br_LJJ3$rb)in|ZnP(n zXBM;%yB{6EKZp)t52GV&KZ=fFkE0X#C($YFX>?|uC*Wu3gJ(JILyJ}H^NWf0*oU!S zTC9xv82yn2_T<ItoDFZ9@!_Mc2kp0MHETS+SKsWG<hlxz_A-OHPwrE5nlZkYiJOc2 zToX6Z_tBT3pVhx$wSRk&O7dX72T}s~KQ)j_IStbJ=QbITN!bor__85~axUa0tdM*t zpj-$=_==%~aw(KW^3QRgoN@(J;;Vvc$~91Xdj$`Nw^v9#MguhBY=UOWEzladLfW96 zvI9EsbwU?qCv+#QkRIrz+z0*m24Il#5DZ7MJ`F}GkHI*;37Dij#T@_i{Dbr~1&`zR zso4LxPuWl0r()>0xo^-xZ%}uVx>NV5bEheP%c%UUzRmdHv-(5#Stf8y(yh}pJcD!g zK4ZmWdYm;qC{Qv&0wonq^JDG_wV9cFLX)xc(EOwPlPEM7I}6RmpNZPB%TR;uOVCp6 zBD5HP0a}P%i`HEWlxnmFyArL!UyfGz1(Fj{Xk!P>XbTM*(Iy(yqYc>IXwT6==|Y{@ z9cU;1cGQ7AjE=DVAUcHIj}GAPMf<R)(HSPgCebPEadZOzC_3h+1scUx(`qZ%xhq)N zArtNftGi2uTq&t5SQSD6+znQxQ3Nlz8>}kumBXE2HBaz@YPcJ#8t~P_-B{HUMeG9W z-C*Uw*$#Jul{1mx1$Tp0AHH6=8?1(+2wreESdHNu1y`_g&FoE~ldhQcSHAaozW4uT z9B963&g82+EWRrC9$#f&;A_-9rbn#w*g(aDRBWQbLp1mdc7(4=-$FUkR~dha)&Jdc zE}q&qjjuee2V0Z4`jh>LY(xVanu?tkL1ZIZWT2VYcIvb6XQMgTx$#7{L?RnnfL%y^ z5&mMd1iLhX$j0^t>RPu`L476uDzqBA<`$7{p2&8~mB&%hfTNKHO=vTAO9az2?4S*8 z$97QPfxi>&!gj_J@)8MoXfJjj_5Jt<&_V2>2tpp)kD#O2j1oo-ArGCzp1Q@98R2f` zmd?yx>f`zo_gDt$wIO=w;uG}HzCX}^e>C>&*Q5U@s_l0rbPxFv#QOH@YmAGZ*LNBI zU(jE+?z?@TN?Jq`1}R{J)Nqn8NQVr_TtpHES&$7mCrH8|5Ava45lI*nK{1qslY~JT z7*M{5Bn&Fy&f@r+Bw<i{lLV}eLOqTKXoMzc-uOd0ktB=+sts)?VREpFyn_U)6Yauw z#*>64l7ykX*nQOZ;~zi=v4<i^!q|QU9mO7_ejNV<I>~#4DVV;wf^bH`-sq?1n(3Z4 zKNainQ;!nT(mP$<Rrm!x#x3@Te)Q~<OzEGyPJ`bV|9H|hGqHY&vWG5Fa}O+0S!6KT zQKVfVq+Mv<LDH@`(k|>=GzGtn`XuaRG@T45BZ#yOI}NSGUqyX6b_H5{iL~nqX%}`4 zT7<ut`U31iw9L=TaDQF*wszp?q@o>%1MNOYx)n#dh3!Ne@i$RlkKKT_`jM^$k*;C4 zpyT)_s2{~1L#Hn@4+oRjQ)oZ_0qR{pjqF2**?uI5bPanbh<q?LgcNKesTb->!T#mt z4tCB_Qn2GZXrr#EmTCSpMZ5DLj$OibrEFh@x}w^@JcwggUn7d$AoW6BQSDzI#OJMq zFE(sp2d$_ptks)8P0{W=iDP#iB?aREx=~kDb3KXw^FbVYFoYD0?T1lURC6t4{O5x> z_TBdq|9n4(|DO-yCfD@+$piWe3p`$4qV|1-(ePE{r%$=&V49aOeD*)BKM>_wb(Rb% zV1v|ff)u1f2Dnz8*&z$EA?E}^3i2S|xcMvH_qA}s61bKV8c@E7umqJ*1=TkROHd1S zP``+%1dY%H&EZ5PXoWUtUqn=b4(Nog6GSEGh92X@b=@~CoHzu-Fao2Ch(j<A6EJy` zse71)88A*h?RrxA`4VP1jp?WLCB_F-ESvLNsyzIcs@P>ql|68&nhRd4EUPH5Ha0(_ zKmI?_=^0o4dKmAEM(Z>BCgb8keXB=o)l!vS!{$c7v--bTJvJ>-Nq#$7EW1-uP}|0x zk{Yp7(xP@sI+}r<NxdC^7Mgv0C;uUGCyQ-&N*<bzT|j*y{vx!P_9YQJrIhW<Py@T1 z`U?D&Xw{9KQhjSDcTncBZtWycQBjYhfd-9e6B{%~?39+MozjZ7VYgH7z~6y(9^WZl z@jJztxRXV<JEaG^m-;^Z{pbMg2P1aM5Ze!<BiN(VkKrF@O~3?9^2%olrmwFiP*D;4 zBl_<0Qg!af^ywAKAJJ=9m#TdPETiOEeb+jVd;C@G0)J)q@>g>U{guZe?EA3&u$THP z%QDK#jXys}1l#u<b%$v40&Tub{fqu8FLsCILk{FZR_G4NhD@+Sy2lRQII)%p(jX<2 zwMCEw$xu7DgH`GLe=n$k%258_7nDN<lzH&~z@Wrq2fym1P()=h6hI+#$L^3G=mIBn zgzk_|Xa@(hdhFnB)DCjq9pt<_q%m}dG(kNyz%*-;W>}LniB7R#X`H$V7=<wy_SnJO zs2$|I)*U3hJNPxo4jF)6=!5i#Pb1l#$VLE2!REiXd>Yx%y!cNe*`3IiNZy9+`jN(+ z$cC2D-WAF2L^iYtyBNI_*-+OKZdW9$wqE-*L93*qii$g-4Q*uuS0uX=+0aJpCiG5Z zLtRU_U6JfgWJ5c!JJCCl4IQSvE0W!bY-m6B0D32~p?BZ7`ri$0|9s`@3T<~H+9CZp zb9xyg!?^S%eY@fNylaN%BBSJe<MHS9hpkBggfvJAC8R-WBq0sbDQ7??K0CM;)@DOa z0wE3ZDCa`~zCv)VmoA2qNCF&`Q8u6)Uj<ZNC%{4VZ2}zBQm(u1%DZqjKqHNtpgEEV z2d$LbpdFtBI*t+Hpeun02i=r=pch{s^oNpn!C)lu4u&a@z$m^k7{AV|N|?M&yn|`V zGuK^t*Xif=trp9D0m|-cj6cszW9-@hvV<3yY20@pK+Qd6tc%m1T9w3=>5&qkVpD0E zPJLzoD|WDR15|oGWur7s-)UU@vi_ju+<EGMU|0@wZO*;quad}Dl0!($Hj<d3spK<h zXgYQVnu*_jl*H^fi5Z%cNMeTOVdtX-_zOcw%xGVXmau&(T83?)<@hVEk(k{eF+;28 zNzBmN8$QH43iUK-2=S3dHfTbd*+C21irt2`<98e-F*{CThIS>An4#U+J!mifz7P^K z+7F<EY(In!V~?PtSLtOKhY6U3DVV;xfp|y3;<ZfKy_YdxxJ-F0UZ!Gwv6o=`V=rH( z9&^RLg2Vcwi|sMg?xpszWoqB!ls{+u`Y<m*{9e>!%;}jpjJOw>3n@B6CItlt$fTf* zvH>M0m;i&y1~M@y$5#Qhl<T0nk4y|wj*!Vg624?er<?()C&=U=YZaLwxc=LP9r7sW zLryrEU^od={}D1O=*8Iw!<0v0@B|qZjJNP+l>aI@if;_2v1hO+hj=rJHqy2U>hU!| zE9Ew5K0!tY9jnL)p&g$Cx+(WSS2!7AI1^}@d&$7|6Sq*;PgHWyT<kp5m7)Dh1{cKS zn)%a7wsh3>6BXA_Q7X<eIfhnUA&#Ln*tMuDuKh~}$M*m7D#c2J_Pu0ad|(IK8AlvL zo!H%|E3W-Z2FDKm<y{K?yDLNPu8^8M!*eD~!wmdO2FLdQ`&A16e$kb&y{JFwb?zMF z={t<4?=IuIJ)%FkZ0<J0<nPq}gIZax=Je&tVqUK77Guv#`ZjCQDxw~^eokP6R7eBY z&j~Uh^9WH7vLG9BAQxP}63K^xRYW}~f?_CvQYZrh%C8djpc1N}8fu^x>aMz?-YTLV zG(r<JLkqM*8?+xG>Olu|LKirp8+xF36;TiRVE_hU2!>$<Mz69!1IA$jCSeMuVdm<c zsP__gf_*Xc&7(&DOZsXzk3-Apy_fYz&FRU@xyXW!>d!B81--dp%(3Mxx1s~5maB6! zl+PGE2kbL0ennsFWBCV;|7K5GAZxAy4bQLYN38pH1gg2HKxH?UsiY95*)}rGhT8bw z(Wz(}b~>8D|C-J`$~4<?rrFT!M5fu$T<knFAAdmz(`>XaLW|kH1TDobLyap;vt47F z?FQ3qXw^K^Y}Z!sk0hw5rJ{}o^)zf?gGRK89W<jY*sW+=9Mfz^nPxlAG#lEP$TS=3 z#O_9W@b`u=%|`ovbb###(IM>N%XBx4UMKv)_-(=;OkQ6|{G(zT=L{I%{Hng*?QvI- ztNyCK)!XBZK&IFNRr)Eq%((LJuC;QbRE-;-KgMiGSpsumPy(e;1jSGQg-|=jTo_bC z4OBuEltTriComTWsgMRKV1pz`hP+7T;vffdAq%o06YP=9r41)A7Y2hc1pP1oz0e2K zQ>;i!V=fGx;$N?iqZ2JGL_^20TWQk<&Cmjk&;<3+0Ns(y#X%Q1p#wUh9UPI&r9FL* zzQ=m$)3|gePF?w5`PSJ_)2$-EPhaugiBp6J*H7k*`TvPisOuMVuCR0`Orifww_IT= zk#vmhUGr&o!W8QI#atWpcfu5E_}-(hT0tY%oY|dNg#L3Co0+FgR|tCjnEsf>@?VVf zzZlcUcwV~rivEmcUqq0a+iYxlRp0Y$5@|>>q(l;JAeC|&q#q*MK;|)`4P+${Z6Jqo zF1UWYln(`=L>qAZc&QjlD3^vBuLPJjSnDY_KqK`{&`h}nS`QIOp#2zu1UeE3B+y0K z3ElX5pf{920{xK$5*VaB1jCnE>II{uNn<ch$}|C!l&4@C-^^tbBm4DN_4VfTZv-)p zYhKm27$3Z<KeWK!6Qs`d1*zBpW65j!ON(7`#Qqz)<VKLP{60w8ZyM>Z>7QS14-Qsy zD}$BCnqU<h60Gd^V~1it7_9bfq8w)Y`8aWA*Vpv5*7VnKe2s>$)9~x)JAo>R1SFZ6 zp%fAl8=8upc96^}j?4<pJWgg6PiB=!W`*Wp=Te`CKOZfiePINd726l1CD^5x$gHlA zS)t`O$gFOWS-IXZR#B*?q6SAT4eHQ(HfV?-vtkELXfyUd|6A8VGOOccR`Fz3iDXu2 z7q*l7?l>|lw3qgM5oA_uKY$Ki;0u6Z7=h87gccaTxrEsA|B-k8@lDnD{y#}dG;W|M zIDha{kzWdef&x<cp;V)yqN0GJpt21eUUBO-yp;4;(xz?NrcL@u+N8hJHf_@;ZPPYw z(;oJfZn+!xjtw^1#7!q|Y_g4QY^$T=`~0M&6TEl#-pBVo?)Q)V@#OV+e?I4g=A8HE zectDj&-ok*15`#~5Qac}<u&b+q(@|Esi&kLZsy3eiXXf#e-$~!k@}@b*{hvvDz}pl zfh&W22r5J5L*Sw8g_2F=Q&7B)d<x2F6X2ligirk-xs-ee5<f_0ZX%z8bhgca@gFoP zo7a&q^*PChm?zz9W~LZANFFp4B_G0$q8;qq2|;!UK{xF^)OS%1L-S<vA!yQ&4?){f z@*!xY9Dv$Q<Wo?+j(iFlXlsOe%D#2vOY!@>J)aoCA`Xn_PxSZfU1QJYTTq!FU6emz ze`*Bl&?oYzQR2=?M!-%Jhfo=ZF3O)|z7)UD;|~}g!P@YN{E7ddeitKPv^StK5M7i% zVJB|$#z(Nok^b%s^9g?&0ZZzP2dljd?xf6{smA7u?!DR>;^-UN+eFfr<uBH|X3~Wh zd|CVOKj5kXOaG`uj}g^Q5Y>*!^yn*J*6v7(eRC!$(N|=A`>8xQnpEwpR}9aX6`wa@ zoF(2!)xJ2%x@DFc+crzpZ=a=FpBL|cRgTA0Q0JMY8meZg#v1DDuzlD~pqZ(rFEuF{ zkO^6ky~(7Q!2&suyVs;xArJDQV3SF)K_L`D@lqDnf)XeN#}S5LpbW$j6Dw<-VP!2S zhYIi<At8cFsDkRHEW`n|PzUv!Oo|U0pb?t(npl>>#Ig(~mSr%pEQ3i2KpV6}aH&ZN zK?i(#r2)pc;!6#T61L6~wGI=tjuEv^5Vg=DZ1KgfX_qB!j3-22liqinBO?xeO?!=a z`)j!5<zJV#(Vv^G8m||tzRo~JB};Twu{@WDrMbLlxp}`*VcV~`u-#}G?Sgv8=KYEj zE!nVNDc!hVDaN*=g}3ZiiWcox3a0L71*-l0M803ipSYi)#{K+czMoulKSPiEm29>* zqnYfVwQ)bwP4_Dq8}~B>&7a1^)-+`RqR?keQ~IG7BGBzmV`6KX(!~y8w1W@m<O71( zA+(MEpq&qHow{EMpv@EZD=icEGt{`BVeI|nqWc+o+^^KLy$`Kr|GJI)m1^vojXZDj zch?mmI{c^0iV&JSAaCs|LEiegK#o|IgA82`kfEy##^+a1E|K9&4!*a7ytR|_ALOlH z6UZT}W{{z)9%Sfhfbsbilxrb+YV6^F_)m)O5xQby4+r$~)1tfu4AU+j4(P#_56O(r zufS%Jf0=82T|Sod^la7ojA;J4cA+@BPsTIHY-T@+8{XtM>Md_-r+p@UISCiPyYW67 zG4r@~`9sB2*r_N%OYuGjUg*Rd%TR&sI?MgqDEDh<1-6HJFYT3R6?XM<vMlzmT`nqF zZf9!{JA~bVb{=MeJh3B8?C3^&uzUGoFTxLdeP}=S0QFJY2hkyH<2U&VG5+dr&sGip zij#kLwrcqvWnp_$`#_TQ+-%~`eg@NruewP!UM)V}uYKhz{bM(&*2iyRK<cKkiCeFy zX<yc-*r+KKKRTejsmvs2i+uYgK4zmyJJV#nYc|UPkRL&&k^Bg<w~`-$g>nw$?jt{f zJj(e{K${H;DHlQUO7bHpiIE?H<2d;dlu;Jo8X-S|a>^Cpq0I}GF*))?MfFPZBd9f! zA3^<A@*`-V9AC|JANdh9W4AynZ2@Sb+z!E&<VS}%)SQM+w2RXcM!T_l(B4t<BeV~@ zA041QiVhx@)88qGFPXHfZ+MJI6H{(h4Z53E<Aj@4>t#2q!v@Nq5rZb}wMnt-Zsw4u zGyhiqAgxBRFhjd)Ms1Xo;t(kXS`#LvI6+E*U61;Xi8t12=b6?qr=8{y&E1rHplcIz z+o6wgKlJV;<AFiSLoh&F6f)T_i;qi3GuYq4$L3&Xqvm6zT%!y@VdtXOvkXB|UxZzN z+GsC{epWG+ZXjo)VCMtMwv)4AJ5c9l#=z3Z`LJE6JC&S|`YLP>>ZM(d63Hcs60;NJ z=yx7iAs1v8Bv&XhgIvfY3o;=C(t%LIHDz?{c1%uBNW>C1Wfvr3iIcJeN}&YoPz;Gs zl9-*42qpeBLJK59iH~wU)PW2oHBb#hBlmt{dIB*c9!r?65FuvB;iY&i!46}~*wTs0 z2aiH%P==H*W^kcNvD~c2azr`jTpZ1m&#=9Ps|Z7u_J#Y>>E{_62qqi3dqcCgbN7Z? zuyfGdH16KeJnVe5fOZ>Nh+TvhCu_v#v^+6+h`4fsxPn$5Ca$2>$;1`5k&A`Yq4n(R zLmRLg(WW%+-q2?37POW20NRG#jt1F3bchFah$)?D7cnM`c4POTy~Lmh+J`;2pTYne zMA1PihET&fy1-v?cx8?{tevA8lf;`@+HE%*R#09wN42h=qsH!?qw3dDUQhY{IjZ3S z+Kh9=`7G`B>rzZqrc;?krJ0R#=3EF)*{mOQ)GQ;^$87Q!aj%7AV0`svHGLhUL68Yq zkPT+AKn~=dtl)q$8nltoAhdu17#muMU4#}h0AoMIXwYFsgHXo_MuX5YY=OE)84W_q zu`5sy?OwF<Frz|M6sq}vnhlHw@qu+{{dPu!&<5;Av?-0zAha2~1#P80fVN?`qrrn5 zb?AUj=z=hGLl5*q1p1&K1|SN92bm^1M8P`c7FE1%(XKW;PFS+6xCLJeS+qAN86TzL zXs-5t{qR?BVSq@?urjx{;TVG{Cm2jQ#9#_qi(Q9S`xs0?+pycwR@wt-Gj<Ewbdtdo zw0rAxr3bno3?0x3LG}-!gD?aG5QRSIhhB(m<q<;*ADF`jW%D1H(M;?tG@TE~Knt;p z&;r_RXdZSxntPJrB(!WZ!&%!I&f36m7FvQ`iWalK9j(N!LOry5(Q@nx)U}J@tnF*r z&`ij%5jxOPFQEi=Vwa&JN~l5I*yU&i?H<&NU5QqWimS79^ZvtU21Xg%$+-AQ+%D`e zbVCpHLInDt9|j-_gD?aa-KR~zFn~0XBxnK=ESX4#P8*-9zy_^Uj6X9Fk3v@Y_)+*4 z)!HozIp^YNo_y}~_qV9AKL~9;w-o8?2t$wwS&$88us{yvf)(;09}2(*g-`^=2T6gT z1WLgHPACHbE^tFRRDcJ(PzhBB<&gC{VinXv9n^yl8lVxHzz@yP0<92$HfV?7K^`iE z4(Nm~2tzmYKrcj~5BgyMqA&>j7{DWvOXt#0_U3C}nq+)+F3%3lRfqp+u4>#X-Yt-0 z9c|RLi$`tzf|pLW5?{2*r)fT#%b=4Ow8??FLhVx%EDPuHDT|7<cba0=x2WkG$d5LX zAE8+c$z-EuYzvyhkWDUX#m+<XX)i!+*oA0Ol>F!r`O#tWBee7c`4Q^GE<?pA`4Q^I zE=Mb9_n=<v%0pMKSE~4c>J8*a8_AE*y6xmgs1LgVZA@eQ2=!w(qb;<zq5<r-2!~ZH zHRz_z8zJmKZ;Y@5ea8tqFhDs9gAt+*h#f`R6;p=4I8QacGEcRBX`UK;ZJzpsJXB3Z z+;Y8Lq@8s`tb)T+$tKloQp+aw^HhB!<tA}$vG!3@^C<>lpb7lY2JO%a0jPyKsD>J7 zfJUeXAN1`ZB}yeFLL<9KiO`{`q(o>Gy94c{J&1;|yV0J_q*Q2l7mo_&@gez;3sxwE zA}9bGWI`6CLk3tN2eQEo<-19Vzy)rQ3zSxY2fR=MrBDoZC<6fwaPDRlsl+)JgW@sC zOAHE+#h`dhiV}lpmoX?FlhE;50>mOYi!+k}<*ex!Mwig>Sqj7ib29lC?fKN_Vq4L| zv;3Mjdf`E*nG`zgq%n|)THPU{6v(J0f^X`k%4eeE4`Ndx!_=q7xtEWHrT92^GWTx$ zk8;|m3{!q|{6TE&0MRM-Lww>#_{2N-!MpgxPqBYCPfa;V`9sR5X!{jyr?G!CPZh3W z?dsXFYv!wlYv=R4$$XwSnXksCVb8>#JzurlOj(?@YhS&5cqdga&F9TE1-g~IOO1(A zA)I>72o4J_9M}!z#|cZ|q3ngq2w_Q_xm-8Rw2$}$O-AAmG;byTKr7_{v_*KR8G<q5 z4|Fhc-3eWk!_YlK{DEG|5$L0>9|kB#VK74c0qZWl!x#9wU*Qk_65r`H?AQ6j(@V5T zBDX|7m{d)5jrf5>K0o>{%|8(-PJUV&3r9|QcQ$PSUp(y8&b-mEWPxhDV}WX2wt!y> zs9uSE4|ehb@%u9EQvL8YDxMPy1wU9?kIhrlH!^8zGn1yaGieIV#x|#}Awg%-6q<`| zr9O}Le6#@D7G}~^lu1))@nI%S9b?ke2_{XU4s7RHCQXg5Q3UG3c2i$Ydj;yj_J-Fe zmC-fJlqw3<8`mf`o0&AVeT`Cw)?@opnKYHgq$#us+fRKn?Jb+vh%Bpa{gokt5DdZ) zh?fOp)Ot7Hvs{!5V)aup0CfrNtR&;W0<s^MHrLd~T1M>?la<yptCavWzsJ-eXrk<g zhJ#EUf_ln6sHLqAswvmNAo~xo|3E5JhtR%V$x8p8WTh87f_866R(dujD_z)Ow1f6e z>Vw#!J*<gTsA2LH6!2j-$m7HFA(yfhEC-o91=*C%kV#t>q*Km-N)3~zz_XXhQ&7H> z$y4B>?1r*+s}%tb%1$VutrUtW+xIe`$q{Q(oHaxj5O7fj6eYuXk7xpwPzA0GG9cg< zMyY`KoTo*9XEx_a?)pv>T~HZa;?ahbB|e)|4)%0;$$@<2Kmno)8pIBnxp}e?eb52y z=t***(*zr|7dt|IAMO2U{cHw=paB}8TEkr%)IuG!5?BK4*T(+sXcKjQXohQB_{TVL zp=!Nyp&FaKP}NVNJXO5mX1q1FaiMB>e4(211Z_`>_sdC+EXCB7EM$ozwkxCFwUF*Y z!nB!$iF>9jG#lHTO2U*z!i45xTdB{ZJs&N=wuMQUqQoGy_%I36F%qT|BuuCS+j*9R zX%sI;UD$5w%TJITRZ#Fy;SCd-_<$<3dLs$bW)i0DBur>Mwl9^0DUE=IHevgzZ>GHk zZN(0RS%o1=z(RxEqlFkS>foNO6YatdpCw>%Q`dv`Vn?X&J8>-$i^2dE(J&E<4;Vtl zjtcE_w-5g-$K|^mm+x_0j&huSh<zOU$Ji&Ze~SG+_6OJ>E)>77(5}<#C*G=BCy7-a z?cxj3%cEVbPkH!OHTH;T_Q;VpuUu5XdMn}V`bzDsCc|ggy$-YCR@K<V-py>=irtPK zx>ZfzM4|<mkOkR$NwmNMIgq=FL<{mD9|}SwT2Kf@P<(_$3re6A9A`+hpbUhZf9|5v z4dqY)o)DQ9R6-S0Zz9oxTBw8iR1z%`tOm4k4~f=h5-k#}X0#>1=?HT=IF)T^J9h9G zcYP<g>q9$FbJus4yS`EG`p}-!a>$#82pjaF{p>IhCiL+EgXj>p*iosSbKUSaIUX^N z&$l@~hdDmq#s1!{Jl0gHy)KDm7!}c5shyS-J44;kYI$3rzm4H4@o_az4R%~ldIUiT zLHA11Bj|!K^j%MS1-%e~!F{AxFaS}gb&{TWNzcrrXJ|v1^ejqxhV4U}X>Xyv3EPjh zu}?d+Vh8q-ew9omJ(^B>G=cO8En7}{luUYr?L^CIub|$A?M5q8Nw3mKuduynrjzu{ zOL}G|Jwq*F(z7V(8MYbCqdlMcTx=^^$o@rXq<7f1G}13;fZIQiw|!+GZwrf|lyV8k zDdrcAa?|F5#K@GK;+_pT$pl9je+rRuCdi;z05T{R!uV5&l=Ea@4Ess7Y6yxEf+HHT zj-_Iq*m5bSPmOZZE^i>?BU7lH;$9E(#!x<GI6g{FxfTM9g3Bdv+8LpeOF@l4g@`?= znW$-4f18?e|7}c?6R*^0U-_IR^ETCxeVb~u;3v7%=V9A!6Thj|Zq*N~G<-umTBp61 zh3<)n+lh#1)*d2aDiJY_h=}H3=TdK_JrB*tVGF`UL>#yfEy6B7MnpV8L_|wZ6A{l6 z5l4xLsK9nn@1~s>oGTS<?+FtS*}oF43dmc?%|yiQL`1Z14-qkyh?qu1L>sZ2sQ1&} zjJ5<=mxdq~AegnG?byL%L`0%t2ikd>h<KKWNHpw5d$4<{kI>$S_Oty!n1IOsgXmB| z{;c<M9dk4E=Q&V{8045F873{Fo7KxZ#pFc{b%}+(#1n)&d~(gVe_lj{bNiS(d$d9R zx*l1?L@04ZBlkkFHMgnhTN%uP%-sxTLH0fdv%o?*2Xbk%LY|+&EGURDm<5HDi=g;8 zgIQ2=ioq;!j4+r5Wt0WDXmdlkpTR8fL>SD1O8-m-v#6-v%3v1M?q)Cx>i0321r3xN zp@}xXk7LRSX+c{#Apx`vyB!UlAgrAvteqyTp<SbdHMASM2koUj;v=fDeLp(D{!w%= zAjj7!h%JrUX?p#i7pc|}@n)lTRnjx@jIL2T>ssTY#j5r8#cFKnV%4y0v6^xh<-5i2 z8o6e7FJ)8;x@I#f1s#+-A*f-|D;V6%s1ywBWK;_JDEC7zZ4qb)Fe(;iR1EbokW_19 zR1CWstuZqy#<uOy%Dw?;roIKc3H6(q<5aeYQK{vON}<kM7?qmHs1$ZFYM;QU)E-8q zQW=#(y}KBdqP_y#g}Q0C1Q-<yGb)CfeT<6P7!|`#M>EWfitS-kjO`0h8~f)`pO2l3 zTFoZ@^XOOzbD|DNgfKhhVqkRr?+&gfLYTa3%mz8PU52m>NQXoSvyBqcARqF;3b`_r z#X}f(j$uNV3~BKYh8@C|cb6AE`$s6cXmFkOL<sXG6VgD=ZI>ae2C5+*!nhd?5V~YY zYeD6zUVd~!lMH2IXOp}K>{zTCyB5<C{n{_ytUrU({0>L?15WcNT1K${f<20@+^!n6 zxA%NMS*O*Aoz2>ZFHgDscAhc3T@}5}^0K*}($Q9VtF`BLdhYCicFkA@vm+pXQEp-X zfG7!Q*DtnypB;~}(c9Gh=yuim4)(j)?_vK8`y}?yZ&&rdr2H$PZ)dp)%U2ew={w0} zAQQ468_ZyV9LTL9lYu<QhXSxcArwLJdt@?D0;S*pCzOEzR}F~_ltTr0zzdZ%SH8du zW-^(bWHL|-bx;pJXn;m9VU%9!r@J<zEp*pbG=SZPw$mOwNtdU4cc7i4ba^z4-F;HD zt<%lD!7zOZt8Oe&t+SV?v748uhPg}Bl=+kwh_ANutj|{2ztwi_l*zFdXnm2bUtXeG zK2KS^7u3FL8m_0RVTsyjBpZQVh`=BW!2m>|!$39zK?p%N^gtJcq1ni~k<dhv;78k_ z9a<p(@?9}?HnJJoYtRPlMjKO&>d7K}RFrQZ8`(%UvYc!Lt;DWEJ)6l!(2@ycGZV>X zXt$$f*aCG-CDB0h*gk(F*+`gd1TDlaLJQd6hGv?{W^80Lv}d3e>>L|eO16`9hKlju z*iUe426B3}2NGR-JbUqyy?`9lc0l5HGb`KXLO%QEfsJwjB)ax^_A;HW59W!40y)&3 zGfH{`<G-;VAN!3x8spaO;DxS@j%P3KvFs&Gwju|T;=h}rezunj@U^hNT!=D&-HOJ$ zHaecY@SoR$kN^9#_59OFA8(+Gi!CAT%h&0DhC6+L3w?-_oWeVPg?)MnUA03lBr<!c z8oNo{(W#x=cyzN)BerzPX~-`u#i43CwRe8Tcwi~(G_q0VQt|Un?IQiKcd2Tu5;t^d zQ!e!GF2>9C@37ekk=G^HQV7du(QdedHL1kMVeRuKYts@nefN1y24wC#ugQXJ%4V>* z&uelZ*MDAPg}lglO+FM*wm~6nMNoX|yv7bCBj+`x;Gpb;GTH>V{O2`pD36@iRDg%F z7b@N7Z`V|@LG|wQni{CxcV1Hm^^|>1G9bETBicka^@r)YQE_0S?wPAL-Ju$ux<fTS zeTQn<LRswYW>)sIbikMmvzVrmJ5;OV4t2PUvP--h;nJ!<M%@qi>Tk2nkMB?icaCa? zU|`dzCJKG)Mm7D=OF07FwDmyO<WWr+Iy9r2P6$#CK^txD(7I_<6M*J*qnZ|IqU?tT z?>S8)?e(b7dCsJ%rBG)+r>RD3oaZ!^+s|pLcAe9BQ19k*nsT&a!#Rx$bz_%JKc^A2 zJ5cA;bD9#gbiz4JF>1#y+<s0|M0){hWBWWbpZ#-DE4IaZPLo4>Hfna7&S^3!WSP%t z($Nej|Jir$i7#s9R_|6^3d337x-Y({mG?W0b196yIBtzEie=B9rxYu56!F_T<BMWB zvu{@%%56&i(HrB7YFXd<oRX{XqSrs%9A8w+_7*9f!gYVjXX1-;Wn9N<kS2wfq5R>R z_^MpyBrXuVxxHWEErZ#-Z!!0)N^e1`GDPj5BVCEQ4k!bjG+vhRC8eW!w-S2l3rbLV znU`gJMd|K(T?zNTrgRPLQCie(@kO=#-+EeUQl3)Uzx!f*QLVt+&nvCUb4uOi4=Zv} zuA0jqP^MHX_w%-lC*!Mf`7VD<saGDMANFZ?%u2cDPBk{=PSx<aJDFK^r}`Y8WWABP z>D0}-lm6H*e<Rt1B^C`ZyP@I)FKGccxS(p3m$-lzJW%T8B`siwVi2Re#08w-fcz7@ zBnGUI3q_;6L<Ve709n&{NepB_I^>Kd5x0^QGh~k@nUww$<enh!tYt1bG)nFXQ5b+u zFBdWhK@fUIxsZXpHJ7<;%L#fD_@N2fN4XF}09v7r|Fjlrpc)#fYXBeAkD9oPVT4TH zRE|HleCR)!^=baZZ<R#g0`|XXq>cS$SkFUcMwU&xna{VNGHPEm>r`fG16-b?T%HF< z`1a6yh@)|sqk{JHSN`qtJ@)N`W{!r9qk_uId+SNQE&I2j^4m8Z;%KmcGuk4*CG|eq z8&LV}Kly8BQXLiIsR8+mbP+CN7nK7HXjjBDfC23kaWpDtWc}-%s`cN*q(N=>!tFE} z?p~%ES1wbn_blTXkY%c2E%tis`>|7&sd^*jhr}<37}0Q_;N@0OahjJ~fp>Hgj>XHb zC|82rI|;{{#0}>pr39R`IY3Y@1M7)NIMyU3pK>19Xe)ps%7u_IotN7|7UfJZ(<a}} zmqXcddJ^mB9_NrA;ZUC9kfQQ2-GMV4YV1KY<i@e=97^mCG)#LJ^*z|#Xu`2tIE2j* zpuP3_^_q6t+i0)hKd**5+G=6Uu{hL}8&2b3pBk~iIsWdvJllm44}3$;rTNr|1^dTy z2HIr0FP$qJ5-GuW&OlqHbg(FY<p6)}5GtKaI#(~q+2(zmf0-M0L85C(=L&*!t`3mS z)e4EOC7r7Yq;oZcbgp_xbgl86fwrq_`6rf%kC>OKDLME@uGpq(pH6a17YnM|DOXtz z(fUn1D2CU3dl@n6pj^gTx15PwV%0a~PrvJztJeEO^Eb3NOnHI*j89^3UarQTS*{wk zEoYDaBc}<bQ<EWXc#C-`RU>+yvDfn`s-B^Ay;6aeW4lq;LEdJBmSH<l2koV33AP<A z&d@7GXd$)@EucLg&BL~$x$K{VTCmM%HtktxCUypzuF<o4ImcOgSKQm)<5*+M*K$Qr zIe$d{pewyA?rpSpVs~JN&|n6~8kOD^_cq#Fu$!^{XcPN4q7B$Sw4U}lv=+Mtt=5?I z;~8jTVVT7E*nf5ARw4uacW*9ns)V<hDQ80#WI_g{ORpQ!a0EE{i9jwxne?^@$Vq(( zZ%YL7gtvug3qr!%5`jG7ZGPICAmMF^{w5=tbha8ul5c4e_rE2Fum|xGVSP(`cT!b6 zs=Xzn+95gj@*E!aSMl*7>0$TW#at-y%CL6Z4LhZWy^j6HUA&y(E?&-XmzuJV@|)sE z-z1`WM`kFVGc#C;nV~P#jqO4!PR+o_X5dpZ7`>js^vN0c*bIDXhEj}{(q4kvunW*4 z>kNEs20k@|-<@XgtM?3iYz96xgWsKIC^_tJK{K$^(Jb0C&&)9WErRg}NFR&)6dI3k z=y(K+jrmyIr>KudI5h54XuEaH$KpPP-EwNo$KpPP-N^pZ$KpPP*3n*jhKN=(5hsJX z=~RG^ay>LAvjfPb!{mB8Ewst?bmY=ut<avve+Y8vFu9(NTso`+!j!w9Cz(GO<kDes zJ)Qo^xD4&_0A{!lz)X1fc!-<+iQA%yIA!wKpzrvgF70{nKex1;Mwu*^6f$()#hTQ2 zsg@r6Lm0oM{lX+KZe#uZUFz^zn*JpEz9rWri^&P6%kEaK%f-Tfk-r+hakm=#vMB!- z?ZPQ9$sG!?3-4A9_Pbe>@opm7x8>}_AT`499e#ExJx*XcLSRCR2|O1CF6;{G%V~F` zGO$&x#3`3@(Lp`fS=49Jo`I%gUlh2o^Qq6H-HOV<Rum!dg$R6T!9fC9=W))_5zZMZ zC&pbAxUl6+6nR;VUuLm~_>Twq1EQ#$7*|JoE$uaEHTFe;3%iB-X4?Iz3~cQYd?bX= zpsfdSwuF<_;0V=_a57&qP6r7m^V8M@^6^>eWC3Ue`S@(d^?ZTJd;wHGKHI$#r(25C zq4II|KI$X1_oDJ?c6pVIJDqk!6JPf0g$s@(Cv4<GQQE~5YfGss0Xr0fNM`5{{^Mo8 zI8CFRCh&k*_8slotA>A#7yK4)`Q6=WsSH->-_bsGtzpgz7Fk}QTIa7&V+&WP`bCr% zi(ej={`4w!UlQB?l~*F{Vh@Xdg=%PF`vBXtufU)FO}oWpKTeKygdFP>ITq?1A;&sH zj)h%@x@mV&U$KC}eKhVo9G)s{=}Z~4r&BLq$ef9q*+2UXITp4BwbGtTeZHR@4Yftc z(L&^C*oC1<yesE8$NtEL2!xJDC?bly3XbP!{D<)zZ6(gbe>{XeC^j9?%`x!_P3+m? zA9I(udxSXK*wS5mi|__Wc*l5%O2+*_E}A2sjciBT_I)BmVK=8;@CN=jxrC1N4*5>f zpn<;_mCsI^K9xkB!ClxOo<xoCL(3wrgOCV8;~@$!kb}0Ldb)#f6<_@_jSvNLX(b0F zJjYIZ{H3q#Tav83oiq|x0oG}ZyUzIXm)P>PcE+D^%1^FPhkwrLKP6Uum)|+;(urRG zu1w+QuH@lM(fnQc)9w+OExo)_jqMUQd`~WcRJ&4MBfqD;XUZ4YBX$V$TPs!Lzp&4D zR^mk8mzT?*QuDs}`1_<+7e%4>E=iw=A{Rv=`dbA@u$&|5MrD+`C<?LvMd=gm7e%2O zq7>06kEmotiBhJEqYxeTqV(zDB~ge!pr8M^k3S%S%Jl1^D8&93rBAe96oqPVy9jO) z!fntd8I=-#mWV=i$zy(2gUVUoaz^AQqfkrQn4kI4CXn-pKN*GMQHnn<?sur1Mm!#c zMlM7tR6Z)&D=&cwKl=}V9Kfgk-qinupCuwx>6o7>cC;80ZkC8p6{BNrCLg0oa`BIt zb5y%_mVV|c)jDex!+Wb#!!4^+<2>vI*tf1yMLdKZl}}4=r{;O_qhr$9D%esOf1rJJ z&hT+|_%S=2p!p}O)ZOf0c%QltsQZw*U+^JEf2e)X<Q(CZUT1id7d@^FwPP2frKfo1 z7djrXXqOQy?r3N{qM>HC&t|(EUg%{(GqBUqEcVYtZ4q867UET6v=^dQ>|8Y8&nw45 z_nQ6|vG@;U#JV)6ieK*dkA`TMm%q59q5TWS9B(|Qvi^HH)rHGl2*06S#<RGiq49VZ z!r}ZIaU5vbgd=Ug5YUW&3uxm3Z~7%USsMQ#M2_G%G5#R5@A!qw8+PKRFn)P7(32*Q zT`t3CxIAL_pxx5p<dd|ykiVUP1vbhBkhm0%2fXQ*<YZ~&WZ*tLHm9cI1daq=Y!4(H zZ9L$yzr2(>ClU}rjB*z#C*@eLxkrsny+_qwcaLhlUMzf@v0-OCz`ZT!Qa?@WGot)$ z`H07HnM676kxA5#<kAb@yGJ!16>t7X`_RHyrH`G*)~r^oy47lI!fMrU`D&Kp!u~Aw zq}8f^GUcnqFMmv^`&5E7@`*b`LRFTV1j|l>g{Eg*656mUw(@m1;gP6|pis3B58cbx zM?Ki3>{r76c2wSZ3hK*fccP9ALfxklq!B!XlM>J3i4t{gPPm-}3$4z$B(!1oZ{>t< z!U51;{==btcqsot6dl0sbaTSlKRyn^_C3^h(;h~<GVnaPK8c)k>E}b5(0GDHWBihe z*3n*jJ%7Mt{s6QQyMcNiA5xFT6RhJGJP^Go!5ZNYc<+(~4ZEBAxCctlOL&}hJ01uL zk4rq&l!dp*A4f7E9dgpfJkE^Dr<xKTC+9A>fE8LUO1?goyy0xp;pE(hBnSV9Dev&w z?0o5KYv0jM(Wg9ypFA$My~9&`790Lkg#Xy_nbOt7s}pkc;@i|57OUQs_m)>AtJcqm z=64yr*u!TSZ^WLStj1<0tA?AB+2beL6{hhVikvE*Ly=<@kXsdD7or(%axnXa91OiQ z2aEd`XQ*QLn7_sSi!)G#E&a{T{>AK9id}-n{fl~mjlY@367VCO?o*s@H11z#INjLs zaV|Hf*Ust2mi{&#(Xi!RV%)zrak_Vp`CHt-_Hw$hrN51jvth@_+4kXI)c0U_@5RqP zm7>uoC;iIT#>?d`;{L?Ojo8w^YS3x}e-K)W{qJUIG5(-4{6QlZ{OQz%h=v`aKAz&C z9n#M}m7)>d;t>xYk}2AM`T#B;<l#T4LM!>a@rZ{lGc@OEx-`g$CjDzX;!(b>gnwf9 z;SnjxYRZFnhmrCl$*S>DyybE1C$KkDmR`8zr*ahELXEKgRQs?v`kqYC-eT)v@$q}o z=XCd~)+F)D``T?&{z>}W3hY()s<GAgs)jZ9;&VUKE;0Q-jdBsIE{bLJ`igyg(Y<_e zRK~iCVj1nF)R)k1M~QVjYCiIbSVph>Z;f(gToTLp5BmAwKK_FUDr4P6v5fXk>N{u; zp+Onj5+f3co5f4hw8pfttEUgGhdQW*8mNX#f35g$4RV=AnRg{$HNTguDO#|Tt1Fs^ zZAEj}5z9~uwi(S%+oj3E4>KVH(xGYuUpvF)1eL2$RiNeAZq#*<Q-GFXJ5dMirDzGZ z9ZibAi$3WC?dmIv@Dcq1e8ePDK9Kj71Ncf*bblb1kT|J*WP;^m97o*pp}d-|Si`e} zqU=NM!lc)wH>w}XA+?`t=bLP28QOv(C<H6yLO$ey8L}Y<ERX@|kOi6GP32oa6;y&7 zTu=e!-~<N<C<8kbLn)M`62t;$xy^%iXaheqK?^j459*;28lVQMp$=*xn#x5HhF}mP z&<p+02Vv-f9_WS;1fdf;QfFdVse}&q*p;=5au~WJ>?9`h<iiGz{1C{{RRi+USqJr$ zebBIyKM$HH`=Ob(7HFj$fVL67-iWw*hW5@$R!%&~V1pxwYqE56Zy{xhPx79QLroYP z`ij3h2WN|YWsR!;+8WjRb)o+SXU`UoCBM*eHyB={#=6B<f1!PTa_plus^K@B<1=ei z%kL?Rlc(fqU%FN`-YK5`CGQP+L%PfBYgOyswG3yiWkq?~zP^^Q@+<8uQ`Rw3BFH#I zN(4D)87v1g<!s2OEf1`ebD?NCa}dEsxd2Kxb2h=gfwK#Vv+bnpfC}2m!A;o(Rm(X9 z;HB)@fEUyq<FG=_Ar3Pn&bE(oJ+#o)41UV;*RuBI999TWZiUXxoInU|;DkcrY=<d# zK|gJM5TV=)L(B0Nh*BQdKv+qf{h*r<f%w^vurq|AkGg*7g$N9ejQPX>8ij;U$aG3B zo|*88`jz+yBz&TYHb0E{1b_SpU&6#cC2pBna`?awWf2Y|IKT<zZVn^3zzr8XNb!t} zg_8gNLrjD+JWP7r6dMI=0iIEaXV};B`gBDp`$#TF*G6T#xa%W%8~728!uJucUp{)8 zx#z<R*RiC$xc1lDhNLe`za03r_BxZ%wT=f#*0I>iIu=`DiyG{@b!x^jvLVQVOfa8i zL<(}i0@h=UNI^d2fsM0M00~#KZ|2xQ!qqMfEN<Fdn>9QrLqYnQcR9WV316!@hHF8> z)h-Pze%hLji8l}EZk)aeFIxJBrW5VhiUaJ$5sY{h+PzP79MIitsu}Z{key9A9X;66 zXTqT|pNXf?)JJHSK67abEteRSJ~N(zV@sd$Eydx{miNYd#y^?AitXC5rOyP;jQPx( z(wZ@!vGX5DpAp#7XPluipQ&I6=`(KHrO(7u=n)PJAArxq$4D0tapYjT72;#0>{G}# zHnu6iC5C;RH_oiFQT*;VGDrD<b9qv{_*?BOm#5sco~dZ-2_R>*Uz}jsu^!)8_dEGU zqssL>tGJ$L71^MEy_y<#jo;xK)-Gzg*)YNu{p;17GentFM49)9GH4cdW{fCwkSK%} zE#=Qyz@IalKL^dn&ZFK+doC*W@|Ab<m3Q)$(NgRZBi&~mhXk!+`^p7;W%l<hAj(uw zQBH%K4P0pB84mg>4*Gi>bhHk;HpWRgh^wIOY~QwkgU$!EqAl3X)ca{~LgW6ho0Gni zla6-o#VL#gkaai(I>h#a3kVSGKd^u((@#Yo4G}izMdRM$=Yv9H-V;LQ^{5l=h_K@s zc09!gp)%sc$C7AIB*<}ZS-=-rI_5n!XgtA2<@$4RZ(;v-_HUC>#!o?BrdrTu?6|l1 z@t)9__t?>58;24tiCpj&K43gSW<QseNQ9QN|9?7$O7dI)GSVz7lu?F@U3?Iga-OuK z_{0!C@eO?9ExcwJ`&-!G#y?~f`h%Rxe1e8|#Z7;ZdCraZ@!+B;`Ga=7IC@s=(5IA9 zA;fEcWFF*KrLXn=Q9I3K{N8=4<>-B?`OE=)>wuDjT1F1=DA)mf>j1uXK*_|mEgkc% zxUa3`@4?QcJ??AR&OP`Rh%^om*f(;3GOo~+QZ51SB76&~lJPZg)9%_ReGQ`=KJ`=k zVNMYwe*6rub1Ss7Z5xdLR8P4X!h7&7=t;vxAhZz|flkUD5M6|ez)&(S0ukDJH%ec_ z=z~kIWYVuXAqXM3^hzdgy!xRRB5>)IEU~U(^-4MlTy!O4d+Am!($|_m{!n@8l}!3o zDHMYpF1?aTzp4NixMBQp+*?ShlS!s|ST+ldPs>4N#43V<lZ1*<-0G}GnUhZ;{sbLr z_`!X=C+$AolZL0fbDwJX34S3x<+p!gs^jq0_p|n~c=XRQz-_vp_p^(^Kg(y+N8~VQ z(fw+ySS<cnUjGi?PmJ?^ti3Vm>v2E(7(bIMX~~6d5;HX9N?L;ug#qZaahf18L!*as zH-w=JS`Kr>AaUB;DYroYTA^+tJsA>%g^iRO;KC2M_}O9n3=+4la&aWNm`P%=uGEI3 zf*oYf7ElIGa6tZHju==W7mA<|Y)}AM6A8+Y0qKwf7BEBhL@iPH=zQJ#)40E(XJ?X3 z$g!PlR8D-2uWLLizV{dHUHakAY*398#iVocI6t&OwLUBkoYSsP>X*k-J*S;=gCUJA zOtfTd;639TRD)%MnvzS|Dt<jGPrxvBV*6jU&zjz3r`TUG|H>{(ipl~F>Tptunm7&p zCpayTI1P!rzQk!r%u!36h92Cc8xp6Xg?*bLaT?lblXKKAoCXrs>6{lB{~?`5IY}*X z8WJN|iPJDXW@@_-V7;6cNSp@2zH)>iaT@YZa9SX78e{;K@iW6f8l^FjM=^mZ6U-CI zqacSy3-po3$ZsE?lZK9W??W67+B=*$rI+q+#wpQmdW-z_UD$GBhs?6&w~x<BL*+f0 z{Pr@-4xqJDaZ1y4g4hHO2P$vM8fmD<_Mzo`fc*9@c97$Dm9)!m@4=QoO_Vt4{$7rP z8K*?cXqVsKfi1`H^4MQ~`&_oSqJ^}}Z(o2dySGeq=SU}u_kYyF2J+j-oenkX@rTP( z$V|>_U)JmI!@A#jukMx4revfrUl}LCd7MGr9C1{mV^Z@_3SKo`t9$jb;n5V{qLd<v zw7LZ>@5Zq6br0}<jR$zY#slimX$~etVE``e`90KiLm0Z?(w^T=T^l6kW?kCz$7Y|6 zUodd7i5GYMDt4{}FL>b6o-e2?11C7((w<*LT_M<@050wMIn-Ie4B2uT+@)QAeA?ql zE;k(LxQiXe<v25P80CL-+M|gNjJw!Gj>3eo>!pYPSbqCTRJv5$#rOlt`2#Aj9n{Nj zUxrGTio2K%mosxTu;W(<+6!ryE){n%_Rl-S(ZJ58UVeKEI!C(HMKeFKV>2F54YToy zoAHTT9>B$PIwrZV#!41{)#*r2-h4of{j+#2N%ut35FRAdB;6E)vHJnl(#yW$!~`8T z=6~cP{wyBV>s~P(U?ctV2i1gI$pb4VVU0&J%6*V<E7@0Nq#BQ8lsh2dR<f_kNHreG zC^y5W@{oxf--+_pn?^YcE8$l1z{-gZ<B^PVIV9Xl9#|Qv5*HMCU`rt3R`S5cGd&Kh zyr9Sfn+IcVHP%&Spvo{XlB6*)l!TE9<B^J^FC$eS{aU(J+|AJONJYDhR2{T;qVY(E zjz=omWu$7Ry#<X&GITsp(Jlj(96hQVcQY!+BNZFSNL9`T(yiidhK@%n+GV6Fp<RCa zxSOHlk&5<2A|k(i+|AJOKt;P5JCpq<$!{NbGt_#Q7}V=XJucIcdVKjoCbNmfm+M}c zW(hs08af|j0@H)4wTJo$_5e;JJuG;+3}pu0GX3yx*yN0ubOny6Uu@(-KXKp+-6~Tj zr=bHv5QHA+hA?!&rJ?N7B&LpSYM};X?%D_q;DdU&G?ZPE#IPbN8<l_^ia|gbIKcsz zhO$eOm@Kx*gbYZB9I${Hvf<KDc5xD;<v;AbLO1OO!%ibBn{Zrq8Cf{Q$ig8;HRVf` z_lRGAMn{6tOr6;NS>3b$;BPV3%plhNgoE)u2kZk5*oQ`S_!pFaN%<oqac`oIOY6Lc znBpYvokSQ*+4T_bk$;Hy$Unq;<R4Py0+B(G3q*E67`i|%5ZMh8kR4kt5Gk)8Q5XQZ zKI9-I1_J9S*Fs_-u#s{DBnARoC^K!IabyZIM`?#PutPDFLJ2s*0Rqav4YFfbfV_Tq zL8d5j5x`2wfON=$OfW+><bVaNAUk$G<Q=-29#28$C`C{RqIZ&RnmBr;jufMg10$wi zB}4yv9G{<w!mC&*K3&6q1DTKo*<c0><UlT1ArJDQ0BleQMNkZOD1lONfD_6<fD7DE z4i(@5FH}O+|Mxd#YvSGVDtU}1%Zuo&hk3kF{B^Rth(7-?FP9asU9Ee_)N=xdfiQGI zCv-pvf-nSw5QPDdxo;mtpcfjU0enyobx;d6Pz~+S1_5XVnfo?_ADZMsB7#C0IKcs> zPy%)+hAOB8FL*%azU3fuU-{d04p_hp*^mX9kOApX1chLO0+6|H9?0A`_eA_4U9FpD zFqS{8T09TS6yh4`M?ZR))t|(pQ|NpD;C^(uM$upY2rF8N!71|MIaOXfpLs-$Z4ryF zl~LXK2uIC(t?ous_!v$EJ<tsy2tp@xKokaG2nHbnz0eQx#>xlv&<G81>6mhWx>jh1 zHW(XI#;GWCW5tr-6dWL+4CHb)#ZU?*-~|s<K_$4s1r;D~tjv%NIbeYdNQW%Q1RE4U z5fp+Iav>k`j>VnnTG``1C$oj`KcX6siAmSVY~kvSs`VOi;5xE}e{gq-AEfJKwy<HN zYJ7m>V%*3puQsZNM>nb|k5k?xeto@e*kmI4icj{c;Y2fr+H`EOCO)|pJAmC9BH9Fq zHc$iA&<G9SgL>#^yFRv$puKh+4;{pgq66&TiFROz(BKgq7wyIl`*Ef&Hjt}=<bf4( zp$H1W1_h7>nUDeLkOLMlLpI1&LCV1mE~tV^@PY?Qp#<zu3<Ao)8Q=!dL19S4d$bsR ziw{+!{qOOO(D+a_8lN_a#^(m1@wr22=nUToZO@>qpz)z<H2%ZKC`S~Upb;8CX6*G) z2l95MCZ4%dr~-MvA~SXm$c$ayueiYliTf3qvCFAJGGmwbD|RS`#QloQ*yWENGGmwb zD^|z_c^@e6S7gSX4Ot*>SLSE%PdXUCfsed}Uku|TF?{6Pl)ppy-|(L!8%dUK)a{sH zF+55XS~*QOFR4Xl?K`IF?l%oT@F)xSKFV0wqiV(kJ`S=V6U>keIUwIAZ-rdQhdi)B z0Te+Y*r6Cop#+@Z00Cv-1{YL7Id~`FLLLfLPzg04@3`wg-f{b&9vYzm{Llm~&<p`+ zg?4Cz5CowUIv@;P&;#8NfnMl`K8Q}>l}rN^hF}mx*EHQMadf(FtzK{8pyY@fX2@*d ztw+_^A(1yjxA5}VMh=ubd|fl-rZaW3CRntOabQPg%8TOK$9SqyyfRBhG}B`|pe}wi zTYf_HG2V_Y9=b{QqAA^Q8MmI7DVf(_#tY^zQ?e<W!Lsr)B?odTTOp6Od?=u7gTj56 zG4<mzrI@lEN@yzu2W2Oetz@l7a8Y(cIc*i-q3ngqk<0kS|1y@`!Ki^+8tb5*vJV<o zvU~?LQT9VKZ7tABIRI_@c*Y-slta)#TPJi;4ny}!)(C)J$`R<Jtse#`M`3W3C%n$` zR^KK+{{b{d_z$3!asb+n^B+Jp<r=7^tq$rb`=G(i9|*maBhW`%KMYWg!r)fDG6X@& zA?Tn@u3yqcSuWAk?dA^@XSV33n6PY6c${Mj>69}dldZELo3a@!Zq5m~D7&GYwhHi2 z_Cn=WjwuvVwnGVRrQo3Kgfchh1k&*s!=Lbtk;hcaUmjC61`)YQccVCZv+i;I@XE(o z<xwo0LnfvF`s1o~-{XwnJgyoJP@n#|nm&=}Ih8&+oxX`?W1E-LCzI)$XfC#u`aIh6 z(E@B+Dt$7IzKIrN+o><1y%cp|JD1ZZ*<YY8Y&Z4g)9H^D6g*UT&yEGPDzqBAhWc7Q zpbo9a_ARGR@&S!#6SklFX4+fOR_s73eKL){i3YJl)OXO{iFRR!m(wTNzX$Ebj!@q> zUB1tV!T=Rf4X+d%naR8tRy}H+qnj^|-lAhZKsN`cM_e&i=2d^-Ac=i*b$8OKI7X0p zJ;w;LDVxEvl4At9l&z3QTRs#}wn5=Oju8}7wnGVRrQo3KgtC<!BXCi6Lpf~~;Gyh= z3qKvnPQ@{T#59cm^3<Gt93u#_ZwNa0@J{HW9ER?d93$wZ9DzRC`eA@_6b46F$7)2b zW7RrWH~-q$f=#?AYm;hRyh*h#*`yBNL3x?@<vbaX_EPtT*fyV$LFtg9b-wPpt5VuG zsj(n?b+To66BmmG@?vqC8nJSr?j@5o6E~dqglb*z1g=OwnMyyIPCr4jvCYfrC&}~^ zG#A@SeID)kXaTk@m422+KSPVL?bMghUWz)foy+Mb>@QFkwwwBL+AC1cMB<hAEFEo> zj)qpVK@APHY*2^RWBZm98u)-lv<cf!eKYMXXe)LgmC%w#XhDP6A?iD5??k(>!^;T` z?B9d-Vn?X&n@nhd0U58N1g^oe1h3PqQAzI(KwFFtgeJ;<Xr{NfKs{w2G+6nApqg?G z)Y4W51C*mMxRF@3L9g^ujzAx6{m?~OuD#T4<qv|O2}3Tv)Inn>6i~K7VT?lxxs<Jt zM_WE*Q#OOe$|-?#${CPJTNZdId!f?E5d{}zH<Z&>0S?MeD6?`(;D%xhJCx8^3et6k zZ+${FE)vZPbvzn$tGrykf`hy&Zn#Zn`Aj-JID@X9IaMRhe@i=cVJ;O`D)P|$J#@ZQ zI$s){4=utjrru6_$y9M~OndD@7q%O_9IZG_XFEHVTvej-H;8KLYiO^XDh{yyjwWnB zb~D<-=e4F1f6|CQXghY0`Vj3MXy+mse^zV4OX*wC1HF^!ThIsc`OyI~%IMiExnNz% z1&bA!>TlC6xyg8pulU0!)Y$PSRKq)b|97!}iv9i*s{SP955*rB$!q=1PpYvw;`57j z_nQ6}L-)TIxd}@X|F=T-CnEP*me*qmzM-=+EY&a8-8R9p^GU+VvD;;eRP`iNvP8-f z-SgK<S2F(aN!5CsL-yk*RsFk^e<He<=$JyfRQHHJW!h7$Ln7wh!KKmA`-E!Q_7pEP ze~K5HKc%MAJu@H^vLJgYjt>^df!xV-49J6gDA-HKfkG&P;>q|mlt3vsmg4wO1_E4@ z=@?KB6_bfWo-_0usDvu0o=n$<TBw8irF4C0fJSJVOlW{+Xo1$fgbrwfb_h<UXF~^c zLf2BdK6FD5^b&_6#Hl{C{}gd(V1!r%gQti^P;KK6aPkLu`2)~;Y+sl^Aj%(rHevgz zZ>GHkZN(0p<qsI;4?u(1A?iD5??k(>!(sja_U}P^u_M&?(cW*OFhE6gub#R21TRR( z&Y(V%4YJT|Y;%}Xzz5`@x!6|f^JvdU3$SfxIR&Gf0<;+0PJIdOrKkhj8RitQzd&8s zZtBZvzpjFUhYD{hryz|}fJz4vo;!3a#L+u-Eam3HHOhs4nY`Nl@F~^$w)pBY-A(^M zZ#37OQ~FPz=dyL4tJZn0TIc0e>pTZ(nQq0^)=8Vy*j1ZV!!?`Ll&O@j6(25_iPV$S ziOqNE9{67m-V6Wh!CMRx>}T~7#3KarKD3_&J_m?4QKHQtI)p7ecj@L#8Mbfc;rPv} z)ycOMe7o|^WHxu}p402UOYISH*9zU|CrS4)eD-NIWzy5Ug<LeO&@E1iEqq!PPp#6e zFj*dXnjx2`RdYHM9I_`WIV|#IDY#0>D8}<06O=60aLV*tuGp%X!CpUIDN^Prg{`v` zYw%hnw`;1B&mvHH1J@}|b+O`jYmp*YWUA~tOBDNe=PJcVZ&6BFM5^SS`HJ_a_bQ$b zl9kv;PpOq3tySE=y-RWZVL7=tt5B7nTcM<1Pd9_i1#~mWUP(6t3*{WhrOgU?yXj_7 zu#avAg_Mh+m^M3<ETEf#V<p`T$|wtP(dLHoJv{idi`D4TSl(zC0jH9RDk`eITBXLv zYHk4*bVTc;tm=j~U^k*owEIu8Q1@xx8i2NrvSuyXhTV<^X%G2$O#$0?qFwACM!T_l z&|cajyR^IsLaX$n18D?06NN!4hR}K+ku5-ELmLSW)n+0ev7i>M!)`rEWIIh{L)%A* zeAN4CZ$?|NyL?18_U}e}*gi;oi1rS&6MJB{UWvlsJ^~x`Qg4c|K_7*FDzbeXaz4O< z=CDIL^%=BhqFLAlCpqM&Ipk>3D5r#aEA4q`KDNWhA!q+GRIq(9^>*4z&{AyAE&|&g z0vlSD#wjshHd%4AK{*>#P!W%Gj#av~;_X$s`TF6!r&VKt(BC5?UF>Pzb1uGmkGyIw z(C#vkzLf7~p-AxX8Z|FeQm%q(_mxTw)cRQ`66zzY6A2BJ8=;9dKQy0W#cOCC;ngD0 zM!6k=w1uF<&#P&nD{>|2>Xk}2<sRszZCZqX`gUKb^uxfuD|u{?T#E7#2-iKj)mK}U zXLteoGpb?2Giu6ZlrI<Wua<#m19f6kvhF@px|=K#GX31qL3V^YI<QdAfn3_Gkavot z5(-90Dxr{a5y&aYavFPypF288Ok*#jEI>|4c0>7Y9Y;#Xk<xLbbQ~!iM@qMcL#5+T z={Qt64wa5WrQ=ZPI8-_gm5xKD<51~1R5}h7D-(~fGBLDKZigUkA?Wb4GBI>TSeY2Q zDfd7xZ4v0(P0&(*pqnBb$-4R14QD;WTeF@~tvMVPD~F>1yYLyJ*S$J59<i+Fo>7PY zO06O)*XZsNZ?DnKnqhs15Vmv+OX_Y>jdyKPtt+;uu~l1CeKO^H#no#Wd5V3N_I+E_ zlz*meKV=i!X3(CsMNRkO&^{a*C(S~$aY{34!OlT*X}6xlp-<z`Xu&8Bj}~GVp~bY@ zeK<7Rm!b~#ccNw30(E&W6L&AvdQA;W^<+nSrGB=amjUQ`-KbuvgKCYQmjUR>j`U<l zde-RGD*<Sx+yYIs`JsEco<(2uO4luVC5(0~)hnH7khTyE^3=%CsGcR#=+bB(c0bxX zLC-ouJV0hy#5Y>bH$=_1@Qo+(jj_}B4Qnzc@QwEHjmRwuP}?rPG41)-@q1g^%NFsC z*xrFU*}sJLQtV>XPP<$~stV*9QeG&hz5+XbZ#zMs=*nw#YgfnooP<_RLK`O`$VutI z4rBLhQ7sY5{gk7$4bgTG`z`EY>~C%1<tXdr*wLS;84<6nm%rvRES~|HkOkRb1`Fgs z?tAnY$b)<+02>rS5fs<ZW1s}&<bDS@p$z2YeitJ$?p>@qy=Nwu3D%wFooHUP61!>_ zYfo2uXL6Zf-RZzgE)%Rf9cA5Vv;n&jZKB<Ol9#BR<|S%q>nJZ#L))<1(ID+1-%KtO zGr3I6<T5dn%fw7B6EnF?%;YjLlgq?RE)z4kOw8mmF;j`6gV<ap7$<VA*R9s;@7k(b z?-oC}U*3#;Wvd!X6)78ZkH%x;<Xt-cetJCWNu$T_5)b`A_vIB-^;0!KRTLe>9ztWa zTUCQ^t7>fIBmI0-3w9fJaI0$Rpe({0bTh@#6z)}qe@p%E#M}pT4~x_XbThBe-@Z+i zVcWVyls}+bwaWO!Hr4vnHZ}J2Hr23on`(R(dpq_E*e}A>e89`peV)26V!yggP1!^F zHK8|h17vBTF2H{6Y!jls6T2I`m;c8{_YL68sJtFsbiF!(^PZ&pp2nF`c|E%5dPRFN z``OvQ1eMpLi>_B&S$uA{R@}W<Tg572bqnaMPy-Fn2=(BD=G}BwXaYa9K|8cU06G@Y zS?Rz*G(-pPMthFYnb9zI-)TB49kv&Z(18chA=(GfD0Zgl7QWvi`p{IqFKWTgL9^+A zW;AaX-)|4!mv$>!h+Tvhq|%wu61Fd0#P_A$j+S8y)WQBvwEPs0--C;GH(Ci*-~lfr z$rCCHjJgM}HvEi3f09%2^KI(zDayYRA3h{6$Mc_MtV}%pu<i~fMQOLn%dy}k60TjW z;mk`V%F&86F3qSHyArLUz1qj6Il!eEt&4JLMt#@~XyXYk%_q4upXSnxwv2LVMg!Pw zXglpeAD3pf??5}*zY7gx&+n$tvz=@6t}B(u9xlyje;Su&G>Scl4$&?O9@Z_JlJfay znVR%0Q<I+Moko0>*Pd0y+YjrWn_#J@R;+wPcenV;Bl5?OMcVBTZP)R|bbK-T6<R?D z@}OSqO0<ghY9ANU02flUF3N=z^<g)ljVHK}p5#J$S||Sb3hlki*{^RGosrKOK%>}$ z=n$V{{ULwZasGyP_{-kqFZ=1U>hSxNKM-w?=;ls|eN5XZ_IcVgupbE^1gBZJ2J z+0Uugo5W3z>R!1j<po-HJf|97dM@5Oh3iq>>ZOL9=TxJWt@3HL(OQgMLOF%ca-e0@ zxv8soPPKR`i_yn*MW*xsZWyN9(PgsGY;5x}+>$PpgXW&b4bS41qqrqnaO{T~+bP`g zIBp65;qM09S;S&9-H!a%oyt1VB*Yb{2ir@1CGAz4wS?RNnR%GZJW6Jc)?@pQk(r+$ zGe?_FlbN3-Gan^0M_aK2)VI;zjt1F26ecrg|4y_EJG`09d^?#r+PjC$JeABmjm#V! zz>ZQsNc+%cZV}pc(Zl!9!#C5z(dKnT0BG7o1b~K}L<p##OoV`14G{#YDc8Uty?Th= zJwPvyqJ4*m0EdYH*b%gw_8#iHrV=60jtNANi9`_WkcmR!b`B-Iya2Ur=1`;g8#wf+ z727hMQ$c$+YM#oeLbE0iK~OoAUHKk896U$p;ZS~%2mmh1ZYX0rLAwKWvVDB$n*EEh zFPCH4V-aDKj^*XA+m2%#e?pFOJ-uBuZV?YXNlyNyCuL_@s@-hT@Rp~oYZL*l-Mm=~ z%J;2ND!@b83zf80LA8Gkzk%^$t;iaF16#vyV7yoh8ji0~8lmab8pRLIBfMA(S}6yh zjkb0O`q%Iq*cyHVTf=W)YxoUpjdEL<f4aA>QF@?v_ZlSvef!pMhqH!Tpf%hAtx*PP z8v;@MB=;G5=XTznBnF?<U7KX6-%b|#6j?{g_qX%jAhB?>?nN2me5O;R!6T%>&`LP~ zZO2K2A?PO!hK>knFmzE4L-$tFVCdaV8Vr5===Cr_ISPZc4S`|e^Q!U6=T*z(=NY%4 zJoS0i`nl)T*!9@cuxC84ir{8B1NCXT`tV~^Z4y^J!=zPxA)6Mlshv$7)R$qqo*(P> z1<&XnyutW?s5wN-e!kE*+3H`OS5v;j9*4!R`GjjMSG=GaKKp`dob-Zfz3K%ucJ&M5 zpSI|xo6<vg4`f0XWFNu1zydjtdj{`=JjjQFBX}niLJ<^)xc)#1l!Ajk<)lxQp<*K) zpT6d%&y}MU*dFS=v{#~48)e52)A6Hpe6;Q`9sd{|{{$T$ZNzRmOUECj<D)Iut<(o- zZ$sO$gJC*8`*)z78|nC)>G<2}_-GGyZz>%>jgF7@V-HXtrF{?`;#J~GGe5MpK+^^~ z`gSh$+$Fa|D+HjH4q7*zj!uWIK^syD1Ze~UY#-WpoQ@8?5P`uF0s#y_6gsY_qeBov z&^<yRfG!A|_$O}@hZb_zai}3Nu3JFa28nUubPcBpEPFXMkWJYP<;UpgC+O&h=;&zW zSpvZ*fdJc!CdP$}+13tav<vDTlrMMk&sfLbqFXy<_(5FcAzb7UT;x$)X44C*_+YCX zZOWlGSEOvy@&49zS{{qzO#-}W$_Zs4z_pje0Lq~PJex=ipc1N}Iz(aswNMB3M@S5y z0UDv{42c0WLkqMXAu)h9Xop~k!~i;ASttK=Z6Y&(Zs>vDy(9+E2mLUxiNpW~VF*O= zHr*_J$|1hou;|{Vn~@ZIn=g3uIT`bGJBWF+x9eVQJi1b=(GRcNp&IWKCEIm89r?VB zAn7|)YlhIipnK`o2-^<qP*VnX@W>_QZ?NrKG!J8c3;WyHhj)zi&*B$!_fL(Tr8dL? z{WC2e?@%r0u*G{jWQ1L{Q#Gy@Pwdp)VH*F9D@JdGY%Z;4uu#r{+!0&^@+jv+0c|!& z{DwCE6BmwNx`8#MaC9eHhAmJR9l?#3V^^Ra+P!EccGU)c<*1H|r|X$3+DT;>jbXH# z#vZhnog-);c0W2mdlVhS9zwhAI5u=;;MmYXxf6mraeNr0JOl%@MWK&!KlIknaiC!` z9SiCw>P<|Uq37+P=S`*Op*6ecnKZUTtA?Hl&6Hc9X*N9<%4|5c6URoKW*mPvF$s#n z4wckb(e6RLw3i3yIM^=K?c-rQ14kLM4XZfp6f)T<3(_G23JnB3C;%JeE#NdjE?6O& z+NKfNJ9R6r)9>4<THoBs=)_J&Cpf_w*jYRAwHI}~6QYNfUUA1u@@DQaZORN@&|~Bs zN#Nj>MouWBEWl;`KqFS%ul0!RmvoPdw_jqqVagpZs<Au8)i3LwxY_X1i>h(gi>mbt z)a`yzHGB#CwHH;(>y+Q1{FN7bzP~}M)d<_mx(9ADR8dvK=5=iDW0NNAX6)7%)!{bE zLGj03c>L<mOOHQI!>`3FpV!Szs!r|Hh~m%d)=r9@<SQ+BNlm%+B{g=N_`w(DaM$ja zaOVxXb!)`Y2elhaEZ$NArQiT3lz{*jxS<>>zyn^Wges^$$n^kfp$_W72My2&P2h)S zXn|G;KpV6}aGh2$h4`lfI-v`~&<#D%3lZppei(o#48jm};2E8GNH7(*MZ2+k&@MbB zjP_yoqrG9=79GSMLI-G%q9leUrEVf_yBD{Gh6(sJcKuW?7-%zg3)-|Cw?*5q+tF6q z187MaZkxf1XsA7vcuiS=!;V`+Ipqp)g>YM_q+A6a+PsjNw*Eq9$OmNb0rAX{OfQi+ z<|X5{*tw{6Ic_~2x5h3&Z6t!7sol8oZuvV@<4da1FS>Wj5!qK58_<9F68DK>?yI_c z#I9H6h{0CvI(aRyZ{Vdr8+qvu>fj#CiI!mt)WtoT8!gAKKs~g3(Ms$p^#4-!?s0Km zb>jay=NS@)Ye3_cw5aSF*SNCV*k+YlT64jb)u^bbXj#qLv;~41mDQ|ehhc`xa35~N z{XW2OzYQ?Jz>}u9A#1eQ(wc73u$Cwq+K_}0noRn8(|-5+`@LSj*M48W-yiS!oX<Ju zIrBW{IrrPs;-y}gr(U>7y@2dkrd~jHqIV&?ebfuc9`s&hAMX9g0rWxSmxl;oIF5?p zI6X$>STgkjavXgEIhjelfSf{~M$X{wMb4tnA#IUQaw6H2oJjVh{+SP9-tVYO(;hL{ z{^3b-?5Ul3-+#u{;g_zZz31mGPwMe)PwKV~Ja%I3deZUYag`e~PtWioJws&2vgUaD z3+j80oOJDr;qkx0_pA7R?MZ#t?=W9>T>0M4m`>zLec}C%uYI4{nyidz;{5yCqxZ$% z|D>M#t&_Sj`J^8I?UTCg1irp|lD_NrsfT}Fi*3E*ksoLuc0Bn5?SrPQd0z5G$||H~ znU{>rLC;0z`FP1l8+rk<5ceWvF?tEIG=}po<2kt(${8G~fJ)3&P<@8{3N@H(p$@lt zXu#YEO)=!xn4JdP0vQ(CE|FoO1G5u4x5=>3joAe~xb;FG=6)E6A;ZE@JPkM)Ng}_( z7-lz&pCP}(BxVmx;WiC3n7uF?Lw=2+0XOR<1J08HkzO|$6gllAgCafX<H(5|GBDDO zK7t&@eF!;>J`hX(<N7{i{~_`xvUflE6X`<lM0WX+f00h~He@^QEyz|A7EK;9AT&TD z)ImMeKrK{*11g~k%Ao?v;>h6067pbaEU#uiuLfCwUWm;1<JBQ^(esd2+;foG=oVxa z*ZZ3>STbfwJU`gk`|%qR%q7RbPV#u^w07*i&%Ey`ec>;j(vAC`(&Ha^O1Hfq{ew^G zj)#9pb@I|l6=Ram^6w_@#w6^=`3feoBH1J=kyYr`NJl2$AVt=q*CFe1Z$LJpHzAw3 zzGdDdS}&SJ8?t?w?^Ge3=$*(epGkBhUFbc?UflbT{pbk;7zPPoD9$8?kDJ5@ax|Gm z*+@6~IC3J>Bqos_^eN;t?wmFxyy&yYIj)cY@>6>5S9w&2{fF97L;SNm<x4*%Qhr{} zvsXIq{di|kYmqqkwc~pkJNp~HrDDI*FGSgr5cXxkwgDk3)`h6tWLL9~6Elzw^cw7I zaj!$x<G;ZzM59-TCS)^uOQI01$wIUt+tazmnxTDZk1_aZ{b=aZy0+(O{Tw)c@GsgQ z9Z&p3`-q`5;~eknC)&fB@k>uRCVoOc%<>g{e-+<}_y)_@(7%rU4Ro+1p?~vfy+as; z)9NQW^GTFN{zCf}?qGQW_vEMbKCwsiUwe-j=nfZy%=30%3lJ{mcY7YC?0hstjOqJ? zoB7=F-!K&Yi}#D67xs$bm)=LLi~GdrYY}3G$zSj5`^D@ZKP=|NN5rJ~pzz#`5>vPT zN=)DVOOfTgBh5?HGRqt@gS1}c1cOZ)Pri2#yUItmVPAmz&S6(Y=*0&qZ=e*)5=it= z4i!*&n&F7kCaO6M4yeJh7U~k2Zir?^F_Bz#kk%VCL(3vlrqBlM(6PakDs(~@bT2Yx z3O&#ZeNjxQ!T=1y&^+Z$%xyVxmhuM1!0m9|mVaYv;V)a02^!g+PS6<yok`HhF7$5f zUAXrkd;1C6P0(I~Mh>D6EioIr%xo-jbe-AS=(nYNgV|b`fJtu015+>!GsCnQVHW0w zS(%e{(jd&~d^IR@TUt)=l_1E0T*xz9Wxm%cZApA}DD#di<PVGZ<6>^01X+q+#vj|c zfpTO8H&D4r<ob5He8eq!4feIT*CFfi-{5Apmg}34&HYU2bY%QoI-dQh_6>i__efCR zC)r;9nbu&+I?R;J;)cvV&Xi0XQ!>d+$slvl^N{(hQnMioeAi`RJX11BOvykA=29q& zVM+$d7nqWP%1cbiKs9Cu{PzhZb(rg+0k=kI!rZ(d1I0UMq(tyJqeG6SOX>q(Uo|k9 z#tOLvR>;8s48qVsR>;8!jKUaA6ZaA=fn{0($jNnD0<46a^3f7N&Y*j-pT&LdlAq|x zdRCLBtOGnPWXJHdU^V8;oM@hQKTpe(=bz+h(|KBCAx~a}EJiOumf~KvpQrU?%kn6m z7Am0%s=)y@D@@r!9n{AJytCP&ah0cC=V@12yxbDWYBvID<4@a>9kDD_Ms}iiA-i#R zWo*fw6c!3X-x;C-1~3o8Pz2EcBL|2E7>i*ME{tQIFyD~>=d~w0tKG6PwGTQvGPMUE z)-F=QfaRAYh+k16y?_puMf6{zzlaW&m(Z8cUnbGKX40O0Z&{oa+8a;n<%gv>hqSzj z%SU3>e$(L%dGy-TdR8LwOnhY{E$PH_2B|8Oct+--=U>{Awglq2e@hnPr|3BGe273d z_)GXdJ|>SkNfdZsDupFBFk`zby@dbSS^i*-KWG`|kFP(iw|S*FdV~8fep+|-6VFcm z<R*Utmd&U2|0$jw)-3H<ut9r9*9_0-=fL7v%F-U%GxFpo#L-Z!_h1c<`^?%YQ-3_K zY9})G<MG4K=q~QDJBnAB#F|f}Gx*&#>0JqwIguoRXl@bOG4HHd_wov7BxS_{>!lC! zY_UA!DrE_Nrob)Vebp1laU)59Fo(Ei6iHGriQ-}k<f<%L>ZaAvNI{)KE8_&M4a^0| zoOoIphiPpj(dt0%bb7O}x1eVu8yCr%{P}<H)T~9XIY?^<S$By<9!o3d0Ii)ffucH? zgPj+%WEuZifql7|Ik(nTdU%Nj(T;2bCv-s93Ju11gK!;S8s!oVMPjBWiUHU~KhejN z4)WFqU^sy%TJooTz!Sw$a3AN1VIq?!22VUOa%N|If*7H=@E+oeoI4|Zj8)8<@c2KT z)E%o>{H=uTG~aRy<}A`oN?FMfl78<VC0UBxIZD!Q<6CgZ3jCDwWz4FGZCM#fQ@M(E z-X(ugdyppc0h-E$RmUgS8ZkFWzAYD_grO-4k40wfv!-(olL8JsqaXcy(#R*C(VySp zlMq9Cc?7pl;r8e=x*--GgU9jr>1XsTnjB`A#&E4*xK=P+D;Ta74A%;ghkgEnpRlb7 zQDE0ZAx)T~VvVVBAxfMjvUHWEP9senr=XQbyQRWOTZeW`6|x%LfvmwpZLkpM__OT8 z&*<t1611b-7-S#gX5!#8@DK1=h{xYQ;s)w-NKl_gf;x@_^#vrTFCxEa!tf;wpb~hh zLw_x@k?PCXf%*y()K{O;t&p9F`D;i}Uq^!a1`<>f64W=5puU9!^#l@BG7{83Awhi` z`7}T3I~YKnK!W;bB&hEqK|P5CbrK2cDI|20p<MP2*+X*NImWJ!jMq<c9gsKVpx+JI zLgs6Yy&>B&Z%D^?JnFrsceXr`V0X4^O`jwSlTZy$KcmkYk<UD%51qLohof)Ek(e8D z6vh(&?`<W96yi4((%=-N!}qwG?-MmU{D9~?&S|oVoFbdd5g<q_dXAf7)I*Va`WcgM z{UIKI1V4rh_?Kt&#*5T9{fn~sES`RX-B01nGkV1u_0=|68LBZmpo$z-nMvAZ4858I zdF?0ZH=ZC9Ksn~dRuZ|B{$M}-LKpo-Mg({IjrATfftN0t^<N2`#GkjHTn7I;7slZB zvuE^TJQqb@lcnU65+k{g0Cr9QF2ubcj%<$o7`bd>Fa5$W+HZdJ8zt>G7{TmC&LHQ= zm9t#$iKgFpfMS5_rc7Mu<PSOsw2Qzx3AmLqp(TM1E3{$m=c)R5x<Q_D;0zsBWH+)m zg^p`FCnPyFmZXsHz=GMz^ly$wV-c4o^7_elRZQ!W=ibS8nBU2FHcgalYr<6O(?pqB z6ZRZUl#?MVaHzyn6&F;419J@l)e=x0f!FgV4g5hPf6#<|GxjZs%snSC!od7-dlDlI zNzX}V@^i8??m5|Y_&M2~`J8l}c~176X7(zB*{c)^&va(5lAbfkK|BscJtv3vKPRov z@+van-++8*%>pxIg9WUR1G$g~`CucF)%vlF%}->zSuon@N6F(y$>T@K<Htmi9}`8C zJoA2{;G&->L>8T<-Je1=vq9Uwbc433owg~ILj_b0(>8@_a6pZnwkgy>Jv4041caRv zo_7vSYzZS@K^wF~$3ZGg=!Bh98@t2Em(T;f&_^!nC)W*_FbsM~FbgCc7`a5kficW( z7~dvg!6ar6OyM>SGnl<F>mgx5FZEVW0(I1;A5#^6%oZDHa2Z5@oPk=MI&I01PKlpr zr5)L_Ok0w&xRcm0InVmh8}_5gZxB;U24;y2e2B%spoKxqBE1(4{Fy<F(x@F-F^J(c zgBV&jaG02Zn=`QC*C56==?xU{Dhi<pilGEbp$zO$4i!)dRZtC%o%~;eq4pWQ{(zsT zi}Dk-(SD)^nq&PK$MIwF(DSn4kRR1F@3juU_0T}8CCrbtwSKG)^P>&x$3B#orL_^a zCTNBhXoWUthYoN;Cv-tKxONg*5A;GG^uquQ!VnC@2#mrQxM3V7U=lnq1=BDCUYLbB z(BUjR2j?iCpQmJ;$GpSO;osqN=lNNH3-BNC3%CfsgkQl6un50~7vUvXf|ubH_zhfw zSBa7L7HKgC%GFZGuu;3uMCZ1ep(6){Xw5RCJ)4aFxM&Yixb7T2&}gG?mHCQnKJ$uf z$$3S#nqQG^#jnVA+bgmohV%=a3?+3@hji19a-C%ODU$TNm-Ksr;V5c|fh2~fV2FBU z7)B&1g_>lH)`iLKM-svFL@@217?RdOs+mIuF&#{j^3h^krKJdS%Ti?d@g;@e5Ec%E zu;(pASYtTz4ss80<{jimaCYzs&J2b^D1zdPe1YL4Ut&n+3kv)BVrevA(Te3u3WxZL zR&>bDF${c0%NZgZi+o0FgU@N5pvd1aih5{R=W|*MM$u&Flc#5mq9rASg4Zb82(Uda zkZ-$(2<P!Y(P^h(4&zhk{S>@z3S2Lltu{pT*+NABY6v5RA!6`wh#1N=ieYYGWRVQK zw3C6UTK@m!;PKWVF#%(CdWA5u%6PFkNDT2s^I^VdKD!V^+d7D$eD+Ntr}*UgH1?DD z^&mTI14LJAfaov>2&b0;Yg1UDXgAV3bu)tP_GbxGkQiJK5`9$xENNoY1&_VZZD$k) zst@}MN2I@~jA4WvSsu%X`2l}XcG4*9hZqr0_ZQ6>{w$C2CsX>1x?q3C+89CKPs=hY zROD8Lio8a?x@-#-IY?9XTBxu<mM>J8ktHWXStb@Lic&(Ep$ZiR(V?PnZ>Y%UVjCCl zO!a2}CryagU>SZ3za!IJCVlU)0{;cShu2{huD~DQkFW-Rf<MF6ouXwO!yE7>T!Rg` z4maQ?Y{D(rg4?hSZ#~22;b-(+KIA>5NYQg;ioQ!j8j!n?yOBm@05S*}j0{EYL53sW zgWUUZ4DZ8mA2I@YKk@<O`;i|&Mj`(a`61+f<X@+}lY<Y2MbVKwPKP~?ob08;KJR#6 zj{3G~m@0&v(nHqmB}?`lq6)c07F?hPfuTjRAadu_oKX_*SV9Q%&mqh|hlrhnhj-4M z*g2+pDlUW($`F=-T#(-M3o`d3{P#z>hrdBaBmWlpV2U2c+sHkL`628cPSH)f{vN|8 zu!uoEf_xMii+mjUY2*>)QDhwQGsw>(<B^|3ejfP+<QI_%$S)(micCa)9r+F9H<8~$ zCL{j|`5oj5<ad!zrkM0yPhof(ixgxk@)R;1`F-RMkUvEJ2$_NW7vxWoXOKTbK8yS} zqzP$8W+Sc09Aq9cKSdv2W0Y)#(J^|hBjAQHn80p)m6pQJ3W`Z)w5Jacjf)KE^9p7T zQh*_Mj+t*YGXl8H@LVy&bd;{nS_Z8e8J@%3NvpxdNLDxW4l_)L>}Qm#&&BW@eg?oE zOiKiI4jd?#v_OosM36fN4pf^k)cVmBLDnPdpb;9N8Jb{ML5iMV2*pqecBp_VaPXA1 zDSA|WivGp+6y3ioMVGD={U~TXDf(I1X$I#&E=|;(=J5aXU_{f-VZ22V^g5v=g|GcU zK4u#f7-<hfWf&7QPz?@{y(#)(D92m@4Z$?X@z=yPJH5^t{-}2EPH&V3IkE#cCv@W0 z1ubhdhoLQk$((j9`f%)r0nCHYeSl&adNB9GJME@L+D(#n6AY!$u7VNVMqws~cGIDE z8{%_D+EqNQ2c}>evesxzLpE(=%OY**Ra%m?nR3@ZMq_#vOP=OvA5qqyqMrlv08s~W z5PQ&uupe@CR%q`xogK%1f*1fZoCP_F4rVwD(vzYef#-JQ6cUb1BjLyl5{`J0aAXz< zN9I!Wp;%UY(W@9CMn{o5JrFl?9DM>g8B2_LFic@Fjhw;5b6j{tClJV5oR{W=$LZTX zE-YX@{kS-C7B|Rasak&g<H8noSr$ZJmW2l{%c4V<WwGDmbb%fhrNNJjvf)@^hw`Pj z<&o!7^vax7S>=jlXoauAK~21g#doc`tUGa5*0a{FfdCt!DTc*&VULTJ4Hn-)Tcj@A zH(7ia^SpFIr=KpLBf#!u7T@h>@m(~F?~q5%adU9wc_a+YUzWqmkBE_rm*uhf6n$*z zvON6r6n%X8vYeQIgs<_fUX~u;BVua(vYg(0giUOhrFZ=iG3&c5=T_0<A7iraF)^F; zn3y~JyqrAnnDD?9`ZSD%Jto{MXXSVlZ!zLAYT|oxIPx)eI6ubf$j2ywS7iUe6*&;Q zA_rquWcU6R>55vBJ<%(&_rQuYb?jY{&af5P8Mz|6B35LJ-->LNE3z$kMYbDPq?x-t z3-aGd6Hw!ROw`8QlV;54pfcliSw+lL`&cPh%!l<3KPv1{4i!-3UXjJaE3(A9B1=6h zGT*r(ZLKS^pnpXcx>o*5TB}xMj(tVuHm=A#26eM?_*3(W%w|V|g<T3+#g3P&wNIJ) zV@25_uXBmlc_~&@K;?X_sJh7OJR2(<m}|CUMJ?2A#?pR@rTr8u8u8zR-{$35(GnFa zTA?jEmc{F_qT^7kaK^^c1B&I$fLPIOj1?~E@rz}F!Na02>|xOl1Cb94(_nC{7&;v* zhG8T#R*asB6=Nr3g*!b~jGu@V6PPEHVuc5$5@W@5LadlM9!u9cR?Kqk906x-J}S(< zM@9C=qr!sOy7s8Zf!x(cMc&GzB7gZ&VLM1Kn2b{xLoawgqfgQFf}=E(C`Bow=V~|7 z3-+TIEa?SD(hFXo7d%fd_$<9(AHCoudcl|I1uxSJ#;y^XF47CeU&|qS!35sMpR{A{ zIKT>Q=p^i}D25IR&(#_udirBTuPcTLni$c)_qrU2`0RB#c=#bP6#I}EK5<=+q+ge# zDc9v##&zkAU=inD7IB8Lh?BeZ5a1L6P7~k^e`YyPv9pk(&)I%2v#J~~*Jx>`q)#zu zPsi!_r&xDOM+#apm^{ji5pBz#qAU0*(Qz_HIMb<>B3fiuChdDC#b`gpD6-c}F^lX+ zA3zQ^QjD%rjK;d;$RU>;^|r{dr5@>C?vdjWPC2pHDJKs)r6<-Yr;JW%n)Y+b8R?YX zV5gk*w8%M_Jh>`8>8o-IIh{h@OMgUoGs%l~x|-3Ch#~BUaUVgBX0FOHq#J!4Igz|7 zyOA#R9%L`>eaL?F0pwuhs%%+%M6{x}om@4^c06?8;KZU6*_FI1Yy9SAt#Mw~WqgvQ z0+(e&%vsrp-gM}!Y+m>jBLMT#-uk*MUtE<HP-&Z&Rla#yy*V!(OY<_H>undGmj&KO zL}AjrEQ<Q1DDHoRR@z&#G|3>e3zX^ZPl@bU6JrqOM<`Dp5jl`+dqm`AGBCTbPn^S( z<v)01zo4SHNG1o%FVTO6{sKB!7AfL>O%Vr2UPQu?mv95i5`JIC?<?qF`3=Qg^c6XA z;0n_wSEMKGieyOXiad5HML+f`{3pDYq7Od0rIj_X62`h$99vG&E3A9OvESnMyA-`T z|NY|FWy~w^U-0`Bz5en?M8oIbBN|)o6HS-*iDtun;@IoBt-=-f1N;%z;7{;pxC-ku z6py`u;Z3*(8*m+NP~*h6OEY9AwMz?FG3P|MWUkRA^B_OQC2dfU(Jl**x67i$c3GU< zE=!Q5t1ej<KPK&Q6SDmHgsebTCQryJWHq`2S(7;-YvU$NvJQ)SEE@dUWn(g<G{Jw6 z&Bi~-md!uNR%5$ti)fea(1F<roiXjQ3%ap$K~GA%>^<2o`!d^QKXM?wT@E6LBHJ0J zYL_F??Q#@37TYe}$noViIkDI#CpX)qXWP^!r!Kb1>9cKeX1+~&7uw{kyG<UuNxgdW zm^ijcY}|q^xDDIz7Tkfm;7ieqyMx5BdntP1P?9+IHg+OaKPFRk$CGsFnPjixFvMdj zRj>FGvzDA-&L%9q><OeHRc|tVXSX=!#|67!H~51Q0w55AAQ(a*6!t(Egu{E_y|5SF zm#X)6>=MWR0`q;TdiPHa;#dUceW|*e3Hh-iKXL5-RDI+n<OA48!uwNo(`?^9;U&== z`#`EbE%u42Z(#l)j#21-YjR4iNl);aoIJB8kNqX~e+3_c{i*tB;+h;uSd+uaYjP-Q zO-{tE$??N$(tUhQj>WIZUhf6j6TGt&_k!#;hKge!PSpn_ORg~Yg|S5I<eEvgCD7wd zV(r#pJp6U4-g$^MTXC%2g8JG)S!W-V$3DUz9)OR+-@wOG^#<>ttT@=m)M%eP7LDEC z66LLfvf48!t8lA~=woKPkNOq&zr+2(RJ{~)2^2#SSlp&QneE2IK|FjsRnM#Hlev)N z?2}f=N)Hm|3^uiA1_=w&S{o#C=*#C82QiHvB=W%q1yHyiB#JhJM6oYOlpssd%c{u6 zjX|OuS<%lzXEzf)-XKwpbfDLmaHz#Y9kSjYEE+Osb7#`#MmDF@rmj+=l`_2z*-jbX z;ik>)q0NoCi}L>1L)`eoa0vb$J^`PE7<eRA|L?VllfyKf;lI}=&Lq&Z2Ga<Ejpk5k zEKzB+(j+z0B!w|MO;pILrHu{QF6u9^dj8jVbw5pPC*##Cj8{X!HZ}DoHT60*^%6BT zlu{X&d8w!q8Qo8&Q-M78DQ@ghh=s>eP5QCNF?<>h!x1<N#~=<qld2E#^uyfT$oBux zcBW(CfvFP&{tv18J3Hn1vv-!|N0RkEOE~eV`jLN3)t`eSpF_gY&!;k{WJ+aHDOK0L zfF-mDT|4qc%y8sONU)vVEsi8$h9h4_!jZ3}>d(WuuX252s&4oifqgwyw?g(eFef3w z@J-C$O4Y-5MoW<DiBuNYaD6h@LbmCju>3X#P~Sm<I)MbkKf`xZb?4?)X?POzNpi<F z=BF?}4bMOdq(T~;f^_&^s@`#Ww=jGk^AF&3s(z3gF#HhnkKo6U0slh&H~a*{PvH#w zEBp+eg-rN2Aco8fJKOxY^T?|*`z)(;lCMfDgOzy<S{ls6kdKc)L1Fw=S%6*f5|hIT zS7j;Jm0-5V4a>4&CT}lZmE~@hu4P`8l?-k=5|~M@PGSkeX+}L$Si;~Nmvv_t?M!FX z6PkDG18D7+?CWB*-ovQ9{i<|cT9+NIjMh8fond#a&?>V3R=#skW#pTZCbii4irZlN zn{r@{xv}h>bi`1Liy;?=RosNLb5iGoixV@)F?ed<9dj{Q$TD~^Sb6ds$c4OA-Janm z%D|AHs+S|Hyo_U~pxbaSNYy=wZ^+4nH>9DE>ywa0*cYejbH_245Q7ZFXE7jeVGuqW znS*Xc=9!FiS25TaJkLK!fTgK=0kU{66CY3tC15BcKs%H}1yn*6RD%O*pcd+&9vYw# znxL5*aC0Lqyh86AGTZY81sr<o&iijm)pK1a?Re|}Cv>Lj#W`=tBJ&%vwD=8KVta$$ z2<1RA<p5=Y1*|s80m!91$b)>$HYmWYu<C!!TTvdAY)}qBnVoU~%Ao=(hiR}6)0(fw z;D8z|YoQM6;hmC@!GI<TsOBn4!qb$5&<5?$!7FnX^Zs*42kxEszcZW4kXsc)Zu7kV zgABRB0PK7;Y$)N~N&gX~p^F^id{a7GO>fH1{x@Zp>rH9s=E+^q1HI4({V)K7RIeG= zr1$hSIh%P+&Yig?b5CBAc_*$(bNV&emvT)OBwmw+3D;zP@-=Box+Y8Fn1(!jO%@%$ zCX3^*$?QYdq~+i>Sr&Uu`VXb*{v#mA93ORRA2#`qljk>an8b35_jHNr_!wXcJ}+8` zxwUI@Xo<ixgz%rC>b-PLj?LqLHdQZ)6{6U~pSrI}`*9%(z1$4Czn-cWwQ>{v*QDV& z5@QuNZ|5NzugRI(YjQg0nw&CUlb+&hCOK)lCbJ&k9Y4uC_Ge9im3RCat7v``Cvrc_ zP2A=kpJ%mF_f6@4E>$lqV@<%#Ras06r(}|M3}v5YwbFwuf3z|z_DLmJ#VGveQ}yZ> z7#4eh0i=&HD)!`OMcwneg#W)&g$ovi$PH-;HnAYgxFK79io-&x-o~25tnCfi@gNIj zYmN)Uf28VNzhJWLbwA;H*iZCK8ie5&yq1gbOZZi){yqL|NcaoG3$${w0)=5QRUi9m zpm4vJiH-Xx{eF$V7vUvXqH&`OF;l~w*^{inG!<{i2HS=lr587H`nnv>ye@~%T$i&a zuS>(rT<{8k(aSUZ2J<BflVtoPU6<o=*X6`vdVj}pdzBk^Mp8{jik6&63av<%SkYUI zx-L81k-~}IOBb>y?7DPCUZ(+zn;%_Dd0m?Ng9+e2DO~nHz;5gZ*p2-FQzQ?t8~Xv_ zKvo`lKvW^?G9D21nGc8>+-p-F5KT!BusQMp(EyELc#U*4d|jHo*JYN6FqgUE-@@<U za;iRAbzOSw*X2~>bva#oUC!iOmtOOAlbkKSF6V4qlyO6vPv4M+6<Vh(Coue1s@`+r z2K~7kvODF5>`J^LI}>h5XYvi%k#s}0$KmJq1pYd#!WH-f{1Mht^{oAb7lofck+A;^ z>(r8AH)LJo10px#2C@Etc#UNw!V-LgG;l+fZQhXfjT>^@cSBBW-;i0WH)PccZtJXq zSi2!-mTpMz;te^wd_(poJRm0LZ%EJC8#Lo@$mxX}ve|t@`oGB^ZIDYmtbuUeknUE( z>&N~Ec2)Sd<G&GoGgS}TO4Yq4{xIk*-m#5^6B##U<LR5SDf1@NZ8zn_$(wTg#7%m* zH|1E$O*zeE>Qur_8RR26lWxknxSMkJgda&SO`oBt9yxSVjvl-z%ko2o{THDuoh28j zG<__ZFb>?5Q^8E&(#;Ptr0Is;;Gd?O=HyK|6MR#8ja<BWQ}(P2CJ+Q8&_YbCasw+| zyv_}*-INAnnm)C7Q+k$f%E?RIz&v&VX}b3!GLQg+AQ(a*G)?bz-juFZ!tB2(dtEnW zN7YT~wBvVAn%-553`;ZVhVV4Kt@x&Fx8W&cQySjG#qWi^{MkwL_oeCm>6?szZp!h* zO*xUUDaVpGr8{X;j>K)s(Zic^==i1_j^C7%kxVom<eI-o)2CxL<;;G}_oeBx(a4B2 zeRMC^gyA=mYa$5C&xFIiG~FFcpvFzv5@Fyl1=eUX5!ntMn4Jj*(V1uvU55>#8(h!> z|NFSys&~ib2hB_juo88cA@={*dA!U6Owd4iXDc<ctWz{jru$j&pGd33>tzcLa?Z)p zvy5A^>hvvHoq0<-&fJoQ`-zDMAQIl6rVl0FlEVqN<Vf-@Ihu4!j>X~U18I8Q@msP! z{+2W!LjRy+UoS_k8KTnkf{0tPfplQ_ORoPb_z>)e4-*@{TQYC^mUOQY&dM!0v3^UA zuicXVf1RdJF5Z&U%a{-FdgpJ+nX|WK(BBY~3%6vYn|mDQ9=+THVYfJMF(`6NmiOP1 z6)yb!ZJHi*kXIY@FosX0>Hd$T>9yutvaXn$wB3>vfTAdog~Knfe3j(u|0w<*OVbOn z3p$*p7bR}V;)E?(lDx&J$(9_|nf5!pB@2$Th%$alRvy}tRR_0Zb?lZ5I+muF?cb92 zs4ZC@y(KFSVD}GctmZPM=~<Cm(j2kHu+EmOm0L3VAJcU0bEK=yE!nrRB`1Abbj7!% zYjsO@uWZTQbrz(q;csb6S{Aot*YcL^WCiE={Fa<JyCtm`x8yml&vFy)F#f#w^I-4X zk~yu&&!_2mF641?oym^j3u(Ibi)s3B&Xyc9^CZPva>T}+X55w)r*BK^mvB!=)2mP3 zmW~s*Wo7zpS(S2I)+OGS^$E9SP4aD7n{-<?#od<8hi}V<<F{pF{B1dWh-<&hD~x3o zI4i;3`)|v!DD1wHrjH*$ewD0kO(gKI@w5@QrPuE^qiwh4TriGbPt!*>Z_Ck*+j7Wv zTXs8xn7P9K8~9HmGpyZ~?J0qx?L?qxT?iD`Z<1Z(ndD}|+nKmC>5ZEkKf||Wi}$u{ z_1u=7&fBu9^|tKjzb&1v+p?$Xwn_Hd3G`b8`b3&OkaJrOns3X#;@h&{#vl7_%VD`K zM}oKIsBv44?cJ8{ux&XWxy>;8ww&C*Ej>}&aw>XTP9NBoGl#aN_u#gijop@WF<gs2 zgg%Nsg6_uu82+uvJ29{=TlVtV$yNevBfxe7?B@^r_`^Z|aDYF|+T50<8(i>D+~Bw2 zJ8633%C>A;-<El6+cJM?TiO=4Wx?{cEWETWi{`gw@!4%za*^v6wq==nTiS=WWx01- zR(M!K>D-o8t%P@iu>KiL-{t2?I0;Xs>E$^%K22&Z=0Y3hkFogi8QfAJl`L2Fs`S*p zDy?bgr$|5%uTu2C%HZa!3~;<Ehvchr_~0Ema`-I@y|-jv)LXJ224E1n<y+DPJ<toL zj;Ob!6FQ*_S`NP@TcHiwp`rCH*$7S03^nGrWG&P|J@kEub#>_^&epf21G2wIw)%dW zo(;WEu<FA7mURD-u?g>6GRK3v^DSw?UsnHH((HQ6B-IZnJbv>jG0Ond+_R5~Y;@0$ zHK9%u2n=Q1k;A8XBK87|p1H$t?j7kqaYv4)-;wHv#7N>D=}EXFr;_i;>7+YyChm^( z9=;>9ahr?3Bg+omk#=IFJob*Ph`A#x_ur9KQFml@G}r%#{Ga_J?)b+@P#NS+IP#M; z{W&=DQ}Xe8s4#DaitGcS!V<%xglHBeB!-IIWELqP^V3;#aGW&=3=<VXk(>1-Ue=Q! zOI@rdVR?t$&U%t6){}TxPjWU`R6#X3O#IX=1v9%9%!bEc#*`Rku?4fDCRj9`2^P(e z!^cAhi>_*UPyDUOeURDFlXqpoiM!I4eplwF+?9EWcV%wEU73@7SDxdBZ8b)befX|4 zAHOTJ;_u2Taa*fA=F=RXy`xos{w>We?rLRUWy$_G-quXzPu$ZQ#Wk(r$xW^KnHyTu zsq0#;xT)3rcw4Lg>6TXa>@6+(3^DVs)Udw0vTysY>|MPpdsg`Knk$;E?+q>Y@_iy3 zcdK|^%lZ5pnoC^Oy2my&^|Lg+2lu(+KrwqMP<WRE#SBbG1&D%H9-#lON#?tFf-0WC z&J#4=mDbw3GMfu`J)5TIW>OUMiYysEnSI(P&6z%#b;c+2PWoi-37^bK_epDtk45u7 zS&-n9wq&2oPx8sqIG-#z?32aEeX=OtCo2y5Wcfj#w8#2nS&UCQ_WNXYluuSg^G%4# z13p>5*C*@3e6lvuCu<^nvf0lko1{-R2K!`#(I?wCeX?!CCtH0!*|P1EU8_FXx#E+~ zb)W25^U2;NpX^!mN!PMZc3<+zb2~SkMT}ka$-V`j9CiET$gocidwp`q<C7CkpB!)X zNw=xrC&yeqIbG$GQ+A*9H2UOZtxwM7_~fkFC%wf!Ib-8yT3IpU@)t+V#7K6Uehw;L zV0yrkrXRIpmxCQxa?$h9^U=X#OVdlKNsks_SI8@(pK=bFeTu2tBFg`)Z>($PC*IVK z7UQ9Wi@;J!o8bIMMd6U2I9i6iojQH<o}AmbCue;3q<8zCoLapnr&sRD$@P2EvvyC8 zFWqC{`kowHz9-$6a3A`!HhLEK$u-S0_9yKeVOYx3^bYYy%}FopXhoXd{t|nEy8G`* zm+PL?Dycy1_hg5kzc`1vZLdEY0{j_y_2)baf01Q;Tbdy|;%#XGE9M-?jd@$<K|W?1 z6yR0}MVO1B<jmW$6v{B$p&YjgsKi`lG`%gWW8RhysKKHZ>Q1~Z>!AS}p$VFyrS)yu z3T@C19pHpc=z{K*x1|eupcnd}9|m9$hFag2!!QD)Fm{PuQzm}K7v7c=FuDCU4UM<u z6y|A|!OaV^nCBoXMJO|5OQ9@a1%?+?F62Qz*q{Ikp$Hg)P$f_bWnhPLsDMf^F&?3+ z!2vbExP_{NdT2O9fK_RF6Mxza3yw4nX64nmgo<yVplM3eogSe&(HB~9YfsZ%wL<l* z3DpaI+xT-*tz#a9A>4*x#3j@yjCq7fxjjOSFA6mQlN&;Lz}7|69C^+~-GKcp%t6*c zsmzcKmK}Q1^qd5#av=}$PfKNk0w{!{1gVOl1WKVSUMf43CrMQSmGRU+RajJ?k;(xz zm}{X9$9ibM+z3rEQZ+-%f>f>0c1fys=)mlR&TXl>fN#7j7xc7B)eC*l4+Ag=Ln~4- z%dSR%@7ky_6F*G7t8tis$yTX6z$bpxG|Ye(_{5KzgUTqSj`mXBA{|gB74KB>P8IJ| zsfH6uH9`|KLyc6tQpGD(yi%pCMx}BfH$tgAFlD7E5|vWfC>AKfu^38DC{+si2b8iw z0d9pb=25&drN$SOnt&n9!!Y7fyi=uSww2<{YsJ~WY7Qnbdtho!scGnFRmusS(ABC; zs>Q5SE3`p7^x?T52Do?-x-q+;hl_h5>x8DvkPQ~FMrbMrv_7hCqo(p>G-ZPV%;&JP z_NVE^xR;=pLfIKj*`XYBh0&y`N*pZ%L@5>ysEN^3E!1JIhX&k_4w4H_YpNMq?3!wY zHfV>AVNE%q6S|<=t|=GvKri%dXsRCuU=W7vni_@?7&Y-THmoT(jKc&_Vk!@4L*#;O zP0e8T!t5oU2)nE#gEB*Qia}Yxdd8q~AUDjQ@*w}9LD`@H3ZW>>po*acN}=quLD``k zOcnf8h8a{9RD%O*4jNP~)ImKogc(#LG(j`8*bJ%_+G-7|9Xc>Oq0?hfUC@o$1wFX+ zLLcUS7}z$bK^Ve33?sD$lN!at7>XOlJq9%albAg)wPH}yFautgg*nKI@>6EW1`Ajr z2XY|~@=y3F8x%kx6hSeRKq-_(`6)Y;N16Oo1%}EbKUD?Q@qWqyHD~-(E!1JIhX&jl zp$T&{w0QhfE3_^6sdnhN<foj_iJA5ABw9Zbt)J?_+zWlU^}_(`tOj9d(N7J-h>4$3 z7~AkuZWxCNm|XNz9+-k@m>Kp{UYLbBC`;L;EW`8{<Sz34F3R#<stT&X0X0wybx;ot z&<IV?3|U-n22=JK0s<?_(Gi+l5xbN&nx;Pw=f-FU#OzXqP;_9IDuxo=OJS&X7iIY_ z^8GII{VvLdU6c#E)Hvn|n8eKkQ)|1(_q!+;c2O?uQnQ%npvAn4Y`@DyzTZW@-=#Xh z37yad-Qa>A=!HHm?uP*`9)zrf-O3EvU;*pt-6{t-HcD}96b09A3a;I%AZ9lYu$u?i z%>(S_0e15MyNSWw%AR7{t;)~rRuxc*MHN)z=ztnL*Fs&)ZdDHrm>Z!9w`OQr+pSum z&Agij*v$j%<^gu|0K0jB-8{f<9$+^Qu$u?it@@xJ27qB8H3Y*Zenwyv#=s5ZFaeX` zfhm}VneE*~;clXEH&M7-WySj|Gh`?ED+^dL=Rocmf0YOMm~BuH<*y2%2#TQuN}=q8 zzcShRDTfNEges^82h>FQt6HdodT4+~Xo6;FS@Bn`&<5?$vH7;<Tz^}$xM_rrGqhJ6 zqFhy>s;4nj^{#}fzUUCuzZ$9ryrGH{Fx600kV$a_rs4=pHG0vg#xQfB3GIPUHE||X zO~!{RPfn<s!frY_lyV?cd80zr?EX+S7a6Lu_J=6*nR{9`x+OM5S<!QnLR2o-<Q)%D z`RKN|5LK`ctO}7u$swxv5HWzcv_FLQfGI@TS3*=dR6wOKL{+VZP!5DBhb=_actcce zPKc@_u=>S&TElv%YTO$_Nf@G<T_LLFK!|FM3Q=v&P}Pp#4rd7EK#1z}3sqegLsU1q zYd%Ev#0RThWZyC|;PzJom<OXn)R1W*L=B$^Q6n(Q9gSs%DEG+_s@D)TVGL1|u_4Oi z7ow&JY}$TLn<3h~Ts(_DHypw%3E-6k@Ja%BB?0970P=kR`96SrA3zHyfCf&0s*Vbv zy%0dE4<OYC&;kme1r(r6MaBT?`v9^<0NEly<>P3B0?dVAjSnDE1*qJl0F?(>xS1gv zvjt{60m=)r3jx&k0csMn2c}#BY8u8cyJ6fGpeA4l^DvCyHtGs6(L@TMi4;H+DS#$Y z0NEyhY!g7X2_V}9&|nCl!4N=$A%F%$fND7#pjx2~+99hkSeX-pRCaKXvZMzo>qe-` z$qZ7t89^#9k_So-QZ{!Gxgbat+JaP3ZICL4lByt8T5JkZWeGvbZVytdf>5l2P?cO* zwRTslo)1=z*dSG73{thjL8|U-kg8t@QVq$$sxdA|H60I9%?E>2OJuNWT?$fd#X+il zJxFz&4pmNHkm}qFQeBNa&~lJ+9SWjN7^Hd$t1lu*_2YkFZ;%=^1qV?X1gYV$AT@%; zXmpSo+YC}}*IjLVGgwU!;3S?st{^p)5v-=YL23qfZ&i?*jUX2wvs{774B5^=W$^~m zF9=jQX9HF4e4xtn1*&|%KxM1FqZPP$rN;wRQBt5PUI|ns_CRGStqoLV=_HDbKvkX@ zNM#bJDld{~xTrcgkQfM5HD=0z;6PP}U4446YQWsM8mO912dd^vf#iZf)r#J>KTx&f zzr!4;oQDGyV}YuRFuQXCmFv<SttW?9+8C(%(EGOo)xdt<|KNI{8k!GO!_Gi8a=>4W zBFDUe%Doh<#xYND$CGP;%0qxtFby-wfyx^fNJ$u|=J@lhWfJ8oi87i*IsBGpEhYvs zNVKO(v<svG5}wUWqFy8hmI9Pvf==yZnqFcwMXS;n(m+hGvP1dlXjS1Cttum<RaI=X zs$Py(j>TwIlMt<HPml(}f>k{<#7C<}+?x(XtLDAYs)cJ>jnS%&9hvPm(!g4<azbZo zwCY-lR^98-%5^bX^+ZRj-l%Bmf@sxGni^pIM-9TzcC;EUj#eW%(ZoQs8cU2;?xbin zz80+}E|CUmgQ-q~X%9rJY20VL(aPHxt!6W$)f_inbI3@?#i;74D5{f<s$s8DHD(x9 z(>CP-vdrtR>~Tg_e%PogwvDQ?)~KrNMrEqrOIaUmRQc;hW!p5Wf+V9V#IdN@sEXZ2 zRg!5`r7PQ77JkepLsj-(qq6vnG&+qcXVIu~ArB@`knag&YSpNwiz%uxd;N@RmOq(` zF{+_re>GfXR3nW>dOj5O(f-Q4YBZ^FuTf1b8&!9@QModVs^^kX^+p&~AAi_?$fySR zv%%JFt)<vVA`4Y*1k#>qR2|zKWSU@9ozT^2R9P9J%1qYF&ZI7&?yz#RIVVC@ZaQ^A zN~p?D3{|#-P*p%Z!Z4jGiVIc6hfSfX<anqmrJ^Z26sqh8Llv8qR7Fgvs@xx{s;F$L zqeGSBK&YzO8>(u<LRDR4sH!LHHTY2%$WYZ39IBd)p{ivwRJCq!gFfnl?NHURN;$9+ zsyf%H3)Vta_fn{GEm9jUQyX5QHk=Pt{id^2hZm_17pM;1p^9@K)rgnc&=aaS5L&rg zsSW!>)j3kD)x!|V6jkdq{eoJeIET8xOl??9Ibh?7GxjKLhGrXBy=i*>_8wuG#s3_s zUB_;CkI3=t5xKTKBCmE2=PK<HCbN5w$oAq9$8&hHoMp`J;#IA8?W)!XN1x;3a|8+v zd-u=}*rS>v_s{~`qsq*Cg#Bcgusolp*Bkdx|L;-GjXkQ<w};;F9(8m+O>bS<quSQ@ zsP?rzs&8qJ>R;TW2A21z!ApBo_xv7{a-H3ydM@svnY>5kyY?s>6b%1aD_o9KMg4!) ziZ{Yk$>vq9H2JEQg_{|&J$sY|teA5kw|0->iAV!sq=7KfKp1Htj5H8N8VDl|gemRk z#7JToX&{U=5JnmZBMmrq{apLF|2f<&|DL8hKK*m;0n@Sb3{7o^@dCrt(S<a<b3Kd~ z7{&_>;{}HC0txK`Q3BTg;L2YR#}^r?j=NtC$KS6;QtnryiTA0ojQeTL+^5D*->)WM z^3wgvvvj|jO1fW7!_4vfmG{v7YBu(MHHYlW2v^!KnG!qqO9K5B?WdFB%5@@K^`wWZ z-js0Fkr+;r4_BSZ;i@YsT(!i7tJcHes_l5VYL5?BBQfD>6o$@xoEv$M8jt+Anuz_l z8awfE<=*>ob@T=1ZNkFU<od^zXXWEcTjU|Y@@pR8MW)m?!&UKyDO{EK!d2;ZxXND* zSGJXKRj?ke3fIDwbt#-oAFgtj!&Tm;aFsP5uFPk{RrbYjWmyPU4eoH&I2^8;yy2?Z z6Rv8U;i|SZT-Ei5t9n<ss;mm9)CpJBjp52s8?MT7!j;_|uF8wURfUc4E=DM`=|qIe zz7(M>U|on%Igq;@q4G8&RQ^(gvMoobg1!H!3hj}q=um_zj)_nuYmut-K%^>*ja2rt z5vqJXLRG9ssLF+Xs*3BXk&ev>Rl^_DCPy&76QSw}rvbgu9if_rBS_H^swIKlL9P6> zK|6Guj!@3b2-TSpp}JB&K!Ywqx#IV!o&yo8*BPn$)<38E`y-Y15>W{)I-`}(2Z%Ni zB-(!wEIM9b@8?SoicY>I)b-jS(fyrI2+I=D{W96=6|MoxZ_qEHzl#1U@89yDY5Ja> zLEG1`T*eYCzs1w<&@ZEdbp^Zs!tVFjy^bANR?}Eg4S(Q965r5T*WS>M{t@>z+`;lE z{QnvKDmqx!(cfU6_f71sVF%?#)4En+T-VA@Ue)YN>ss}3j(X2n*DA5Aid)y}(CZV{ zwHjnCdXx7Jt-1dVtpVBSenZPX@O#YySz*7|%+bHsa;uoRD88y$7q4nLP++^N71mzW z@{u+uAzzn9Ue$`yuWH5oXO@Wp0dv&_E!%ZLv$VnngM4m=H_-Du7qt9K7c`rh;f>l& z0@~DyY@1p!loVgkN)K#mWx<=8efxq|zO<=Ttew*;*Dh#P7cbDByP!EXFK9Jq&uO*x z3tAojQJ=A?HJp7@Yjn<QO&8zPn&;otOvkQMZ0)_EwH>&i9lgPjb=9vl?IzI!T`}Cy zCc~VP;f;t*trz-?n_52%1YghwU7Ola@uoI>=7KhoxTzhz#T;GA1<if@f;Jv^L7T|9 zplMsoaD`pa&LO8aH#O^RUd1-A?JZu%9o~O->w>1;<qyH)!}1<OTf^ABjU8CTDZS8s zOFJr0={6f(+METgH0qY7sZ)Bf`Ic6`d`mlr*{Yq=?Hdc4#c)b@_!hLvl?Cmn-zi<& zbxN<##1FEzc0sf5KBYHP=3D(w>5cuSpKEPVTx2|@9}PIAw?^{Efv5DYg9}<m<1e)5 zG53Vs(w+}GrFSPRXf7jNTE7Ktu=SSKCvR!}z`_IVT<|G<DCd?oe0o6}Kfa(%WG-l9 zCl)k!+=4bmnA5&nnkD3v?ul5?X5*-6LQm;4@mDp?wC9wbRm<>1)p^pudD6gn&ARuZ zmgC+<8aPiHI8PclPa4>z&f&MPc8fHyMH<*54V)(put6bsi!^YaG_XY)I7=EhPZ~H+ z8aPiHI8PclOBy&&8aPiH*wQRvJZv~I^PW?>=^Q_n_wwX>&yxnuYwc0zwPSlv=^Tov zI?=mg&THLC=QX}=qmI7sl-?V$MH<*54Q!DH&XWeVNCR7>f%Bw+Ep0UAtTvW%o-}Zt zG;p3Ya9*1{d{*;>o!6%LkLk@VP5X;e`Z=)Ncgmz6jUWa#U)RjO*R|}GB&F@c5iIv} zmk)54k?3G~Kl%sIKX^)i9?nH!_m|lH)hWH~;@>H|=XI@odsVB5ctTau>a1E>)v6P& zXpY8jtD5vHT5ZM^t?u*{tv>z*tpU9;@ru@zY`UT~V`%BWqP4oOXl<vXRJ-Sj*5SIM zIhU?zoy%9WuEi@__vRJNwLz1%>f5UK;uWoLC6R8;x7EOUB9;3UZRo&<=-Xe>Mx0l) zQNkI+?B07t8(&IP6DfbECih>_Jcq7mQ^qUWw7jBeA3CL<1FLC2Pw`=5!0&!xj@&1* z_wVBv$bG^Zw@>60zo_M=?-O~O``CrGk6ncy5C!fRwZbzW5Jh}dra18fqNMgkt(2pM z%PxfpJCvV!uc&~^;`fQF(|^#aJ?|3^sIk3I)SfVkI;f9b(i+%y-#Gj+Wok+{@_iik z@i!Vp>%394CH+BbXMcYOX6MF|)*1Vv)^+Gbt$SffbNP&-XVWNpH(u5H@Y5ghqBd~A zC<d9Y8*=)K;eLNHf_W74SdzbRGn+Ne&i4s+zE39l3s15?+tB>Q^cqv6+f0#e2XfM5 zZJ^khGwnIOqV?|Gr@A8_RIa@bs)3U$+R?v0rS~Q7Q~gO_RBcHQsdi|AR_Kg<P<8Ep zP<0@khaOZ@r}wF8m^`shd0;kdpPGXi%w8D5Z4`zu55suEJ~feG+DC(7pK?RN`iiD~ z<dmL|w5_ga=P+9ikib4lqWhauEF!<GSs*L^vSv=ctmWb_Z)rudI#;xuoE5FE_90ad zHBbvpo`+}`Jfs?+5y~$;q$;2c>`?7_NI6XWR6-SGWf+B-_mYjYxc!Bd=*-#h7r8!v zPELD8%U|#pHYngoszNBrS=5Rz`iqize^DA6D#{WUH9K23%Gs__!F+usvWmI-YNR8R z0|$_`=yk}Q0|*+BrpB{m11@TYmb2{tfVN9)2!W0T_J2U<HfQ!iH)a>~;MTj$#*uk8 zkX+=<UgY4$Z?vJs5HZX)`4JdRVk><-Tj^uy*PUT2eOL&4z(2M#z9Xh2Tj?X&O7F*E z6W-tuF^lEgcCg6WyGxik^*Fnhb11=TV}(29MzXJZFWaWKU)GqP5d|?U41glc#ZVH# zw(8}Vsm3|Sh^-ps$cjw1RU@m=tC0@EsEPQkR@?f5Ru|2->HTc0j^(h4Mhs0@G$UJ< zS=qeG%I5T!we~euHgB`X13EExZIcVw{ouk+53-kQ`VO(*FP8m&$U*d>M0Q9dvqJ(o zn$FgZ47P48zN(G0N_qk&6WAdEQ!ouPrh{z$fLWM>kqAHbTl<Nj<yW*}zu#%&EJmMz zF>u3F1S=L{5<D=w;3wu_2E4#P5!>ir(OM1$h*n@A>D|*U7C7BvGeGok+C{JX6|MW? zqUOSE8eqBPAoS&My2WWfQBJ;OSVWYqe^l8wIq?m%!x$(kITfR7Z=k4S*;9Q&pr{EB zWYue+XyOl>p@F4Pjhv8?y%;DgEPKd08z{_>OL%$fESZ`Q6gj>?Q7{}R3O#`$zdume zoaFzKoZo4sIf0^xa-g`13uz0UBfD8XM%Ifyr5pa1?DuyRmtc7iyMyQ-M+eJ8=ntbG zLI=y=qkjVZljvZHImOl+>^_Cvqu7BZ7Q4r=dmOt@o3Q*emWQ!8g2hpEupYxM4!h4_ z_YYJ!aPG6%#bfu6*nJKite=PD@CDqyNWfphJpuPG;|`XupnnxT5gjaF!_U{T`v!JN z*nNN>%Qvz37Ek&FmdVrwiyUdUzNBTxy`niTpHxvM--}ui>*$Lwa=HbwR6fd{=EsE{ z%Ao=(b6%vy%AV)r?0H7kWU`AKS%+SaY)EDoIkE}88QJ1x7kSc)THC_oq8&Oe@d+2` z#M}kl+w3BT9?Yg*e)@3iU*;PwOOK1ei+san|BKr2FyC;2Q5Z{joO3hygbPf-q~|rw zlffrkxRGh(Ogf)%N#qkQ+{hf#`cFjLw|V;S@bo8m@_!~e!17)6C(%!$gY_vI36`gc zw`XujA(~QgOv4eZr??;;`|n}@efR-xr*Q+z53&Cd`j64Ul7ap&=s!UR%TEdO40iv@ zbw7h=am(ag@9=N1lQy#OXU1?W8!TXj94^Yma~>YRl8<g9fCB6au>)%n6hjH_rMQ>j zX2%Vz<xs(U_OTj;Z7|tvfw6!Ua}MNQ;-t)FPRcw$ry>3gt-!R-k)*2}mFXi5AWP6o z&A->m4!@z<*-KN-x`v8Ff6^+4Io1uTWB;r<*oRTWf{0qzpQxGG&jSrCm}rEi9QN~M zvh$jaB&}>DX+yR*vhy10MDIj)v01De=|VU4VCcm|AF>~d0py^M&*vbA(MOP@xQ`*- z=;O!<0+>X4(5F_`wP`auufYqmFb9@OVy23it0s0F#12?$h~Zl7>#(cGu72nJH(=3- z$0jVBu>?yCcCF}b=wNM!4(y!-(1}|Y{=0E^;SSaw=*7Jc_kP?5a392d2zRgyV>g06 ziVoH>aKkw7-#USZNj!M)Fog%OOry`Bd(pu%i#~_0lX&3VS?r#}?i_Z{ql0B0{paZa zjt)Deo#i|?v;Y_2KXCsA?iX<f%P+Ar{R+z$xZ6b>evJcIUqmJ&U&7N8?l0r^3U*-m z4f-YYSJA=ppXjflFQbFyx9Gn^zl;vn73}^CyWeB?I(A@L<&UqxALt|;{UZ)*IDqv} zIQ|*WSFu~i4y<p$o7i8&egpgK!~mSTf&ER`#O)StTe#iE4bE+2_ZBh07JG9hN0A~e z#T-@If1k?X(3f1ugZ$y$$-*|w3F=S?MOYR?36w(F=|5|BD7W9IDtw&$vdRfC>zw?8 zbZ|OMjgze|{T%#)tY<ZP1F{jl3E3P+PWkF#Ez0El|FDsNo%)%^!Kjal9qC3NM@}?S zv2(qrpNbth?WJNzdeLWHf7a$0*c(q^xjT%t{y}pe=Dhku&Z|%60I;ZCG#GXX)9i7U zyJI<n^v1Cs9?#PU{-jNw*d;vBiQWYrT<nB4o}eAQ1=-4z3}7CFKHU1D2XilUpV%c_ zP|ZqmM>4S!N9-WW(JPQ;`#C=+dACV4B@i9MYg)r$q6g~G>!HT=d#&~a?<48mBV?_} z9LR>9BV@C{3?&3uN`OWDK{2ubyTa2CiF~9Dc2fG$JG}q9ynC?tNDuc&)Nf-aOzHYL zk*-^0x_(Th>qoV8PT$22EPmMSg5B8rV{gPBtO2+MVjqNka5{S|a0|r^EPJr~ziPRc z-x#AfjN@;gsY_|yTfKYgT6dvn++xv%g$0R)1&M`)g(emj7LrbNRI4qe)hR8Cx)()J z6tn1p#Db&?36X}PE}_N3LKENlBP>37&u``|&im%gnaRm{&Uoqx)F_)+91D}-DE&bA zlRQTm*T@i=F0SR&2TEvE(4lfG=d2EW4pnVrLq1b!4G!TjYG0D6!co+revs*w^}7b9 z<!BP?ZO@o)5kEAy3C$_2xABu*G>O5aGMF?5gR;rg=?pl7&oYZ+-EemnjchvP&>@!& zQ~1)<h~yQ=qT_l)MH_l~h@AWyyu*9+fsqFv@d^DPrWSm`S9}Av7ktML3}6sH@e9$` z*P%rueu5wYLI{Z%hT#|iGH5{(MqxCPk%BQ8i}><gJXSja6A_Quq#_NILF6mQKqj(~ zjU41+N<iYsH1aSN(=Z(~FcY(okJ*@mxnQ4oFdqxB5Nxpw;<4JrSb_p9#WE~MAxQ8A zE3gu)uo`Q?-<u$K7+4Pyt@|Ub&SdM!IP38w>ruMZm2KU&b!)nHGv8{TXWd(5-7T>0 z6j~jttgCCSD@E4jb=IZz){PC;^-b2b%~snM>)bZ$>~`zS4y$FSb#a$<VYhXDkJVbT z*J=t|CnDDI3ahc&I$dj>s<%!yTFuQ?U5j<J)jD#?s%^8P?N&pFb*$5>@3N|5*1>M8 z@~Ktv+&cWiI@Dv;yt1m_SY^G|zCNq8-wJ=T4h&f3zped$ti;G)?x^Q&D4g(rUa)^i Cj{+e8