diff --git a/NuGet.config b/NuGet.config index 5fc2d2fc94..106603cce0 100644 --- a/NuGet.config +++ b/NuGet.config @@ -5,7 +5,6 @@ - diff --git a/src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj b/src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj index ffc5946c78..271eb1b5aa 100644 --- a/src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj +++ b/src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj @@ -6,6 +6,7 @@ true Microsoft.ML.Tokenizers contains the implmentation of the tokenization used in the NLP transforms. true + $(NoWarn);MSML_NoInstanceInitializers @@ -16,10 +17,6 @@ - - - - diff --git a/src/Microsoft.ML.Tokenizers/Model/BertOptions.cs b/src/Microsoft.ML.Tokenizers/Model/BertOptions.cs index 7771757d5d..14d985e3cd 100644 --- a/src/Microsoft.ML.Tokenizers/Model/BertOptions.cs +++ b/src/Microsoft.ML.Tokenizers/Model/BertOptions.cs @@ -1,4 +1,4 @@ -// Licensed to the .NET Foundation under one or more agreements. +// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. @@ -9,7 +9,6 @@ namespace Microsoft.ML.Tokenizers /// public sealed class BertOptions : WordPieceOptions { -#pragma warning disable MSML_NoInstanceInitializers /// /// Gets or sets a value indicating whether to lower case the input before tokenization. /// @@ -66,7 +65,5 @@ public sealed class BertOptions : WordPieceOptions /// Gets or sets a value indicating whether to remove non-spacing marks. 
/// public bool RemoveNonSpacingMarks { get; set; } - -#pragma warning restore MSML_NoInstanceInitializers } -} \ No newline at end of file +} diff --git a/src/Microsoft.ML.Tokenizers/Model/WordPieceOptions.cs b/src/Microsoft.ML.Tokenizers/Model/WordPieceOptions.cs index ac6f05c612..2bfd5d4b6a 100644 --- a/src/Microsoft.ML.Tokenizers/Model/WordPieceOptions.cs +++ b/src/Microsoft.ML.Tokenizers/Model/WordPieceOptions.cs @@ -1,4 +1,4 @@ -// Licensed to the .NET Foundation under one or more agreements. +// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. @@ -11,7 +11,6 @@ namespace Microsoft.ML.Tokenizers /// public class WordPieceOptions { -#pragma warning disable MSML_NoInstanceInitializers internal const int DefaultMaxInputCharsPerWord = 100; internal const string DefaultContinuingSubwordPrefix = "##"; @@ -44,6 +43,5 @@ public class WordPieceOptions /// Gets or set the maximum number of characters to consider for a single word. /// public int MaxInputCharsPerWord { get; set; } = DefaultMaxInputCharsPerWord; -#pragma warning restore MSML_NoInstanceInitializers } -} \ No newline at end of file +} diff --git a/src/Microsoft.ML.Tokenizers/SentencepieceModel.cs b/src/Microsoft.ML.Tokenizers/SentencepieceModel.cs index 64f258f454..6bffb8fdb4 100644 --- a/src/Microsoft.ML.Tokenizers/SentencepieceModel.cs +++ b/src/Microsoft.ML.Tokenizers/SentencepieceModel.cs @@ -1,4701 +1,466 @@ -// Licensed to the .NET Foundation under one or more agreements. +// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -// This file is generated from the SentencePiece_https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto using the protocol buffer compiler. 
-// Look at https://protobuf.dev/reference/csharp/csharp-generated/ for more information regarding protocol buffer generated code. -// To generate this file, run the following command: -// protoc.exe --proto_path .\sentencepiece\src --csharp_out=. sentencepiece.proto -// Generated file is used to define the SentencePiece model and its related classes. - -// SentencePiece is under the Apache License 2.0 https://github.com/google/sentencepiece/blob/master/LICENSE - -// The generated code is edited to mark the generated types as internal and to remove the formatting warnings. -// Otherwise, the generated code is preserved as it is. - -// -// Generated by the protocol buffer compiler. DO NOT EDIT! -// source: sentencepiece_model.proto -// - -#pragma warning disable 1591, 0612, 3021, 8981, CS1570, CS1587 -#region Designer generated code - -using pb = global::Google.Protobuf; -using pbc = global::Google.Protobuf.Collections; -using pbr = global::Google.Protobuf.Reflection; -using scg = global::System.Collections.Generic; -namespace Sentencepiece { - - /// Holder for reflection information generated from sentencepiece_model.proto - internal static partial class SentencepieceModelReflection { - - #region Descriptor - /// File descriptor for sentencepiece_model.proto - public static pbr::FileDescriptor Descriptor { - get { return descriptor; } - } - private static pbr::FileDescriptor descriptor; - - static SentencepieceModelReflection() { - byte[] descriptorData = global::System.Convert.FromBase64String( - string.Concat( - "ChlzZW50ZW5jZXBpZWNlX21vZGVsLnByb3RvEg1zZW50ZW5jZXBpZWNlIqQM", - "CgtUcmFpbmVyU3BlYxINCgVpbnB1dBgBIAMoCRIUCgxpbnB1dF9mb3JtYXQY", - "ByABKAkSFAoMbW9kZWxfcHJlZml4GAIgASgJEkEKCm1vZGVsX3R5cGUYAyAB", - "KA4yJC5zZW50ZW5jZXBpZWNlLlRyYWluZXJTcGVjLk1vZGVsVHlwZToHVU5J", - "R1JBTRIYCgp2b2NhYl9zaXplGAQgASgFOgQ4MDAwEhcKD2FjY2VwdF9sYW5n", - "dWFnZRgFIAMoCRIgChVzZWxmX3Rlc3Rfc2FtcGxlX3NpemUYBiABKAU6ATAS", - "KgobZW5hYmxlX2RpZmZlcmVudGlhbF9wcml2YWN5GDIgASgIOgVmYWxzZRIr", 
- "CiBkaWZmZXJlbnRpYWxfcHJpdmFjeV9ub2lzZV9sZXZlbBgzIAEoAjoBMBIy", - "CidkaWZmZXJlbnRpYWxfcHJpdmFjeV9jbGlwcGluZ190aHJlc2hvbGQYNCAB", - "KAQ6ATASIgoSY2hhcmFjdGVyX2NvdmVyYWdlGAogASgCOgYwLjk5OTUSHgoT", - "aW5wdXRfc2VudGVuY2Vfc2l6ZRgLIAEoBDoBMBIkChZzaHVmZmxlX2lucHV0", - "X3NlbnRlbmNlGBMgASgIOgR0cnVlEiAKFG1pbmluZ19zZW50ZW5jZV9zaXpl", - "GAwgASgFQgIYARIiChZ0cmFpbmluZ19zZW50ZW5jZV9zaXplGA0gASgFQgIY", - "ARIoChdzZWVkX3NlbnRlbmNlcGllY2Vfc2l6ZRgOIAEoBToHMTAwMDAwMBIe", - "ChBzaHJpbmtpbmdfZmFjdG9yGA8gASgCOgQwLjc1EiEKE21heF9zZW50ZW5j", - "ZV9sZW5ndGgYEiABKAU6BDQxOTISFwoLbnVtX3RocmVhZHMYECABKAU6AjE2", - "Eh0KEm51bV9zdWJfaXRlcmF0aW9ucxgRIAEoBToBMhIkChhtYXhfc2VudGVu", - "Y2VwaWVjZV9sZW5ndGgYFCABKAU6AjE2EiUKF3NwbGl0X2J5X3VuaWNvZGVf", - "c2NyaXB0GBUgASgIOgR0cnVlEh0KD3NwbGl0X2J5X251bWJlchgXIAEoCDoE", - "dHJ1ZRIhChNzcGxpdF9ieV93aGl0ZXNwYWNlGBYgASgIOgR0cnVlEikKGnRy", - "ZWF0X3doaXRlc3BhY2VfYXNfc3VmZml4GBggASgIOgVmYWxzZRIrChxhbGxv", - "d193aGl0ZXNwYWNlX29ubHlfcGllY2VzGBogASgIOgVmYWxzZRIbCgxzcGxp", - "dF9kaWdpdHMYGSABKAg6BWZhbHNlEiMKGXByZXRva2VuaXphdGlvbl9kZWxp", - "bWl0ZXIYNSABKAk6ABIXCg9jb250cm9sX3N5bWJvbHMYHiADKAkSHAoUdXNl", - "cl9kZWZpbmVkX3N5bWJvbHMYHyADKAkSFgoOcmVxdWlyZWRfY2hhcnMYJCAB", - "KAkSHAoNYnl0ZV9mYWxsYmFjaxgjIAEoCDoFZmFsc2USKwoddm9jYWJ1bGFy", - "eV9vdXRwdXRfcGllY2Vfc2NvcmUYICABKAg6BHRydWUSHgoQaGFyZF92b2Nh", - "Yl9saW1pdBghIAEoCDoEdHJ1ZRIcCg11c2VfYWxsX3ZvY2FiGCIgASgIOgVm", - "YWxzZRIRCgZ1bmtfaWQYKCABKAU6ATASEQoGYm9zX2lkGCkgASgFOgExEhEK", - "BmVvc19pZBgqIAEoBToBMhISCgZwYWRfaWQYKyABKAU6Ai0xEhgKCXVua19w", - "aWVjZRgtIAEoCToFPHVuaz4SFgoJYm9zX3BpZWNlGC4gASgJOgM8cz4SFwoJ", - "ZW9zX3BpZWNlGC8gASgJOgQ8L3M+EhgKCXBhZF9waWVjZRgwIAEoCToFPHBh", - "ZD4SGgoLdW5rX3N1cmZhY2UYLCABKAk6BSDigYcgEisKHHRyYWluX2V4dHJl", - "bWVseV9sYXJnZV9jb3JwdXMYMSABKAg6BWZhbHNlEiIKGHNlZWRfc2VudGVu", - "Y2VwaWVjZXNfZmlsZRg2IAEoCToAIjUKCU1vZGVsVHlwZRILCgdVTklHUkFN", - "EAESBwoDQlBFEAISCAoEV09SRBADEggKBENIQVIQBCoJCMgBEICAgIACItEB", - "Cg5Ob3JtYWxpemVyU3BlYxIMCgRuYW1lGAEgASgJEhwKFHByZWNvbXBpbGVk", - 
"X2NoYXJzbWFwGAIgASgMEh4KEGFkZF9kdW1teV9wcmVmaXgYAyABKAg6BHRy", - "dWUSJgoYcmVtb3ZlX2V4dHJhX3doaXRlc3BhY2VzGAQgASgIOgR0cnVlEiAK", - "EmVzY2FwZV93aGl0ZXNwYWNlcxgFIAEoCDoEdHJ1ZRIeChZub3JtYWxpemF0", - "aW9uX3J1bGVfdHN2GAYgASgJKgkIyAEQgICAgAIieQoMU2VsZlRlc3REYXRh", - "EjMKB3NhbXBsZXMYASADKAsyIi5zZW50ZW5jZXBpZWNlLlNlbGZUZXN0RGF0", - "YS5TYW1wbGUaKQoGU2FtcGxlEg0KBWlucHV0GAEgASgJEhAKCGV4cGVjdGVk", - "GAIgASgJKgkIyAEQgICAgAIi/gMKCk1vZGVsUHJvdG8SNwoGcGllY2VzGAEg", - "AygLMicuc2VudGVuY2VwaWVjZS5Nb2RlbFByb3RvLlNlbnRlbmNlUGllY2US", - "MAoMdHJhaW5lcl9zcGVjGAIgASgLMhouc2VudGVuY2VwaWVjZS5UcmFpbmVy", - "U3BlYxI2Cg9ub3JtYWxpemVyX3NwZWMYAyABKAsyHS5zZW50ZW5jZXBpZWNl", - "Lk5vcm1hbGl6ZXJTcGVjEjMKDnNlbGZfdGVzdF9kYXRhGAQgASgLMhsuc2Vu", - "dGVuY2VwaWVjZS5TZWxmVGVzdERhdGESOAoRZGVub3JtYWxpemVyX3NwZWMY", - "BSABKAsyHS5zZW50ZW5jZXBpZWNlLk5vcm1hbGl6ZXJTcGVjGtIBCg1TZW50", - "ZW5jZVBpZWNlEg0KBXBpZWNlGAEgASgJEg0KBXNjb3JlGAIgASgCEkIKBHR5", - "cGUYAyABKA4yLC5zZW50ZW5jZXBpZWNlLk1vZGVsUHJvdG8uU2VudGVuY2VQ", - "aWVjZS5UeXBlOgZOT1JNQUwiVAoEVHlwZRIKCgZOT1JNQUwQARILCgdVTktO", - "T1dOEAISCwoHQ09OVFJPTBADEhAKDFVTRVJfREVGSU5FRBAEEggKBEJZVEUQ", - "BhIKCgZVTlVTRUQQBSoJCMgBEICAgIACKgkIyAEQgICAgAJCAkgD")); - descriptor = pbr::FileDescriptor.FromGeneratedCode(descriptorData, - new pbr::FileDescriptor[] { }, - new pbr::GeneratedClrTypeInfo(null, null, new pbr::GeneratedClrTypeInfo[] { - new pbr::GeneratedClrTypeInfo(typeof(global::Sentencepiece.TrainerSpec), global::Sentencepiece.TrainerSpec.Parser, new[]{ "Input", "InputFormat", "ModelPrefix", "ModelType", "VocabSize", "AcceptLanguage", "SelfTestSampleSize", "EnableDifferentialPrivacy", "DifferentialPrivacyNoiseLevel", "DifferentialPrivacyClippingThreshold", "CharacterCoverage", "InputSentenceSize", "ShuffleInputSentence", "MiningSentenceSize", "TrainingSentenceSize", "SeedSentencepieceSize", "ShrinkingFactor", "MaxSentenceLength", "NumThreads", "NumSubIterations", "MaxSentencepieceLength", "SplitByUnicodeScript", "SplitByNumber", "SplitByWhitespace", 
"TreatWhitespaceAsSuffix", "AllowWhitespaceOnlyPieces", "SplitDigits", "PretokenizationDelimiter", "ControlSymbols", "UserDefinedSymbols", "RequiredChars", "ByteFallback", "VocabularyOutputPieceScore", "HardVocabLimit", "UseAllVocab", "UnkId", "BosId", "EosId", "PadId", "UnkPiece", "BosPiece", "EosPiece", "PadPiece", "UnkSurface", "TrainExtremelyLargeCorpus", "SeedSentencepiecesFile" }, null, new[]{ typeof(global::Sentencepiece.TrainerSpec.Types.ModelType) }, null, null), - new pbr::GeneratedClrTypeInfo(typeof(global::Sentencepiece.NormalizerSpec), global::Sentencepiece.NormalizerSpec.Parser, new[]{ "Name", "PrecompiledCharsmap", "AddDummyPrefix", "RemoveExtraWhitespaces", "EscapeWhitespaces", "NormalizationRuleTsv" }, null, null, null, null), - new pbr::GeneratedClrTypeInfo(typeof(global::Sentencepiece.SelfTestData), global::Sentencepiece.SelfTestData.Parser, new[]{ "Samples" }, null, null, null, new pbr::GeneratedClrTypeInfo[] { new pbr::GeneratedClrTypeInfo(typeof(global::Sentencepiece.SelfTestData.Types.Sample), global::Sentencepiece.SelfTestData.Types.Sample.Parser, new[]{ "Input", "Expected" }, null, null, null, null)}), - new pbr::GeneratedClrTypeInfo(typeof(global::Sentencepiece.ModelProto), global::Sentencepiece.ModelProto.Parser, new[]{ "Pieces", "TrainerSpec", "NormalizerSpec", "SelfTestData", "DenormalizerSpec" }, null, null, null, new pbr::GeneratedClrTypeInfo[] { new pbr::GeneratedClrTypeInfo(typeof(global::Sentencepiece.ModelProto.Types.SentencePiece), global::Sentencepiece.ModelProto.Types.SentencePiece.Parser, new[]{ "Piece", "Score", "Type" }, null, new[]{ typeof(global::Sentencepiece.ModelProto.Types.SentencePiece.Types.Type) }, null, null)}) - })); - } - #endregion - - } - #region Messages - /// - /// TrainerSpec encodes a various parameters for SentencePiece training. 
- /// Next id: 55 - /// - [global::System.Diagnostics.DebuggerDisplayAttribute("{ToString(),nq}")] - internal sealed partial class TrainerSpec : pb::IExtendableMessage - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - , pb::IBufferMessage - #endif - { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new TrainerSpec()); - private pb::UnknownFieldSet _unknownFields; - private pb::ExtensionSet _extensions; - private pb::ExtensionSet _Extensions { get { return _extensions; } } - private int _hasBits0; - private int _hasBits1; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public static pbr::MessageDescriptor Descriptor { - get { return global::Sentencepiece.SentencepieceModelReflection.Descriptor.MessageTypes[0]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public TrainerSpec() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public TrainerSpec(TrainerSpec other) : this() { - _hasBits0 = other._hasBits0; - _hasBits1 = other._hasBits1; - input_ = other.input_.Clone(); - inputFormat_ = other.inputFormat_; - modelPrefix_ = other.modelPrefix_; - modelType_ = other.modelType_; - vocabSize_ = other.vocabSize_; - acceptLanguage_ = other.acceptLanguage_.Clone(); - selfTestSampleSize_ = other.selfTestSampleSize_; - 
enableDifferentialPrivacy_ = other.enableDifferentialPrivacy_; - differentialPrivacyNoiseLevel_ = other.differentialPrivacyNoiseLevel_; - differentialPrivacyClippingThreshold_ = other.differentialPrivacyClippingThreshold_; - characterCoverage_ = other.characterCoverage_; - inputSentenceSize_ = other.inputSentenceSize_; - shuffleInputSentence_ = other.shuffleInputSentence_; - miningSentenceSize_ = other.miningSentenceSize_; - trainingSentenceSize_ = other.trainingSentenceSize_; - seedSentencepieceSize_ = other.seedSentencepieceSize_; - shrinkingFactor_ = other.shrinkingFactor_; - maxSentenceLength_ = other.maxSentenceLength_; - numThreads_ = other.numThreads_; - numSubIterations_ = other.numSubIterations_; - maxSentencepieceLength_ = other.maxSentencepieceLength_; - splitByUnicodeScript_ = other.splitByUnicodeScript_; - splitByNumber_ = other.splitByNumber_; - splitByWhitespace_ = other.splitByWhitespace_; - treatWhitespaceAsSuffix_ = other.treatWhitespaceAsSuffix_; - allowWhitespaceOnlyPieces_ = other.allowWhitespaceOnlyPieces_; - splitDigits_ = other.splitDigits_; - pretokenizationDelimiter_ = other.pretokenizationDelimiter_; - controlSymbols_ = other.controlSymbols_.Clone(); - userDefinedSymbols_ = other.userDefinedSymbols_.Clone(); - requiredChars_ = other.requiredChars_; - byteFallback_ = other.byteFallback_; - vocabularyOutputPieceScore_ = other.vocabularyOutputPieceScore_; - hardVocabLimit_ = other.hardVocabLimit_; - useAllVocab_ = other.useAllVocab_; - unkId_ = other.unkId_; - bosId_ = other.bosId_; - eosId_ = other.eosId_; - padId_ = other.padId_; - unkPiece_ = other.unkPiece_; - bosPiece_ = other.bosPiece_; - eosPiece_ = other.eosPiece_; - padPiece_ = other.padPiece_; - unkSurface_ = other.unkSurface_; - trainExtremelyLargeCorpus_ = other.trainExtremelyLargeCorpus_; - seedSentencepiecesFile_ = other.seedSentencepiecesFile_; - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - _extensions = pb::ExtensionSet.Clone(other._extensions); - } - - 
[global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public TrainerSpec Clone() { - return new TrainerSpec(this); - } - - /// Field number for the "input" field. - public const int InputFieldNumber = 1; - private static readonly pb::FieldCodec _repeated_input_codec - = pb::FieldCodec.ForString(10); - private readonly pbc::RepeatedField input_ = new pbc::RepeatedField(); - /// - //////////////////////////////////////////////////////////////////// - /// General parameters - /// - /// Input corpus files. - /// Trainer accepts the following two formats: - /// A) Monolingual: plain text, one sentence per line. - /// B) Bilingual: TSV, source sentence <tab> target sentence - /// When bilingual data is passed, shared vocabulary model is built. - /// Note that the input file must be raw corpus, not a preprocessed corpus. - /// Trainer only loads the first `input_sentence_size` sentences specified - /// with this parameter. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public pbc::RepeatedField Input { - get { return input_; } - } - - /// Field number for the "input_format" field. - public const int InputFormatFieldNumber = 7; - private readonly static string InputFormatDefaultValue = ""; - - private string inputFormat_; - /// - /// Input corpus format: - /// "text": one-sentence-per-line text format (default) - /// "tsv": sentence <tab> freq - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public string InputFormat { - get { return inputFormat_ ?? 
InputFormatDefaultValue; } - set { - inputFormat_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - /// Gets whether the "input_format" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasInputFormat { - get { return inputFormat_ != null; } - } - /// Clears the value of the "input_format" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearInputFormat() { - inputFormat_ = null; - } - - /// Field number for the "model_prefix" field. - public const int ModelPrefixFieldNumber = 2; - private readonly static string ModelPrefixDefaultValue = ""; - - private string modelPrefix_; - /// - /// Output model file prefix. - /// <model_prefix>.model and <model_prefix>.vocab are generated. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public string ModelPrefix { - get { return modelPrefix_ ?? ModelPrefixDefaultValue; } - set { - modelPrefix_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - /// Gets whether the "model_prefix" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasModelPrefix { - get { return modelPrefix_ != null; } - } - /// Clears the value of the "model_prefix" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearModelPrefix() { - modelPrefix_ = null; - } - - /// Field number for the "model_type" field. 
- public const int ModelTypeFieldNumber = 3; - private readonly static global::Sentencepiece.TrainerSpec.Types.ModelType ModelTypeDefaultValue = global::Sentencepiece.TrainerSpec.Types.ModelType.Unigram; - - private global::Sentencepiece.TrainerSpec.Types.ModelType modelType_; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public global::Sentencepiece.TrainerSpec.Types.ModelType ModelType { - get { if ((_hasBits0 & 1) != 0) { return modelType_; } else { return ModelTypeDefaultValue; } } - set { - _hasBits0 |= 1; - modelType_ = value; - } - } - /// Gets whether the "model_type" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasModelType { - get { return (_hasBits0 & 1) != 0; } - } - /// Clears the value of the "model_type" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearModelType() { - _hasBits0 &= ~1; - } - - /// Field number for the "vocab_size" field. - public const int VocabSizeFieldNumber = 4; - private readonly static int VocabSizeDefaultValue = 8000; - - private int vocabSize_; - /// - /// Vocabulary size. 8k is the default size. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public int VocabSize { - get { if ((_hasBits0 & 2) != 0) { return vocabSize_; } else { return VocabSizeDefaultValue; } } - set { - _hasBits0 |= 2; - vocabSize_ = value; - } - } - /// Gets whether the "vocab_size" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasVocabSize { - get { return (_hasBits0 & 2) != 0; } - } - /// Clears the value of the "vocab_size" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearVocabSize() { - _hasBits0 &= ~2; - } - - /// Field number for the "accept_language" field. - public const int AcceptLanguageFieldNumber = 5; - private static readonly pb::FieldCodec _repeated_acceptLanguage_codec - = pb::FieldCodec.ForString(42); - private readonly pbc::RepeatedField acceptLanguage_ = new pbc::RepeatedField(); - /// - /// List of the languages this model can accept. - /// Since the model is language-agnostic, this field is used as a reference. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public pbc::RepeatedField AcceptLanguage { - get { return acceptLanguage_; } - } - - /// Field number for the "self_test_sample_size" field. - public const int SelfTestSampleSizeFieldNumber = 6; - private readonly static int SelfTestSampleSizeDefaultValue = 0; - - private int selfTestSampleSize_; - /// - /// Size of self-test samples, which are encoded in the model file. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public int SelfTestSampleSize { - get { if ((_hasBits0 & 4) != 0) { return selfTestSampleSize_; } else { return SelfTestSampleSizeDefaultValue; } } - set { - _hasBits0 |= 4; - selfTestSampleSize_ = value; - } - } - /// Gets whether the "self_test_sample_size" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasSelfTestSampleSize { - get { return (_hasBits0 & 4) != 0; } - } - /// Clears the value of the "self_test_sample_size" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearSelfTestSampleSize() { - _hasBits0 &= ~4; - } - - /// Field number for the "enable_differential_privacy" field. - public const int EnableDifferentialPrivacyFieldNumber = 50; - private readonly static bool EnableDifferentialPrivacyDefaultValue = false; - - private bool enableDifferentialPrivacy_; - /// - /// Whether to use DP version of sentencepiece. Use it with TSV input format - /// (requires precomputed word tab counts to work). 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool EnableDifferentialPrivacy { - get { if ((_hasBits0 & 536870912) != 0) { return enableDifferentialPrivacy_; } else { return EnableDifferentialPrivacyDefaultValue; } } - set { - _hasBits0 |= 536870912; - enableDifferentialPrivacy_ = value; - } - } - /// Gets whether the "enable_differential_privacy" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasEnableDifferentialPrivacy { - get { return (_hasBits0 & 536870912) != 0; } - } - /// Clears the value of the "enable_differential_privacy" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearEnableDifferentialPrivacy() { - _hasBits0 &= ~536870912; - } - - /// Field number for the "differential_privacy_noise_level" field. - public const int DifferentialPrivacyNoiseLevelFieldNumber = 51; - private readonly static float DifferentialPrivacyNoiseLevelDefaultValue = 0F; - - private float differentialPrivacyNoiseLevel_; - /// - /// Set these parameters if you need DP version of sentencepiece. - /// std of noise to add. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public float DifferentialPrivacyNoiseLevel { - get { if ((_hasBits0 & 1073741824) != 0) { return differentialPrivacyNoiseLevel_; } else { return DifferentialPrivacyNoiseLevelDefaultValue; } } - set { - _hasBits0 |= 1073741824; - differentialPrivacyNoiseLevel_ = value; - } - } - /// Gets whether the "differential_privacy_noise_level" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasDifferentialPrivacyNoiseLevel { - get { return (_hasBits0 & 1073741824) != 0; } - } - /// Clears the value of the "differential_privacy_noise_level" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearDifferentialPrivacyNoiseLevel() { - _hasBits0 &= ~1073741824; - } - - /// Field number for the "differential_privacy_clipping_threshold" field. - public const int DifferentialPrivacyClippingThresholdFieldNumber = 52; - private readonly static ulong DifferentialPrivacyClippingThresholdDefaultValue = 0UL; - - private ulong differentialPrivacyClippingThreshold_; - /// - /// Clipping threshold to apply after adding noise. All the words with - /// frequency less than this value are dropped. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public ulong DifferentialPrivacyClippingThreshold { - get { if ((_hasBits0 & -2147483648) != 0) { return differentialPrivacyClippingThreshold_; } else { return DifferentialPrivacyClippingThresholdDefaultValue; } } - set { - _hasBits0 |= -2147483648; - differentialPrivacyClippingThreshold_ = value; - } - } - /// Gets whether the "differential_privacy_clipping_threshold" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasDifferentialPrivacyClippingThreshold { - get { return (_hasBits0 & -2147483648) != 0; } - } - /// Clears the value of the "differential_privacy_clipping_threshold" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearDifferentialPrivacyClippingThreshold() { - _hasBits0 &= ~-2147483648; - } - - /// Field number for the "character_coverage" field. - public const int CharacterCoverageFieldNumber = 10; - private readonly static float CharacterCoverageDefaultValue = 0.9995F; - - private float characterCoverage_; - /// - //////////////////////////////////////////////////////////////////// - /// Training parameters. - /// - /// Uses characters which cover the corpus with the ratio of `chars_coverage`. - /// This parameter determines the set of basic Alphabet of sentence piece. - /// 1.0 - `chars_coverage` characters are treated as UNK. - /// See also required_chars field. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public float CharacterCoverage { - get { if ((_hasBits0 & 8) != 0) { return characterCoverage_; } else { return CharacterCoverageDefaultValue; } } - set { - _hasBits0 |= 8; - characterCoverage_ = value; - } - } - /// Gets whether the "character_coverage" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasCharacterCoverage { - get { return (_hasBits0 & 8) != 0; } - } - /// Clears the value of the "character_coverage" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearCharacterCoverage() { - _hasBits0 &= ~8; - } - - /// Field number for the "input_sentence_size" field. - public const int InputSentenceSizeFieldNumber = 11; - private readonly static ulong InputSentenceSizeDefaultValue = 0UL; - - private ulong inputSentenceSize_; - /// - /// Maximum size of sentences the trainer loads from `input` parameter. - /// Trainer simply loads the `input` files in sequence. - /// It is better to shuffle the input corpus randomly. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public ulong InputSentenceSize { - get { if ((_hasBits0 & 16) != 0) { return inputSentenceSize_; } else { return InputSentenceSizeDefaultValue; } } - set { - _hasBits0 |= 16; - inputSentenceSize_ = value; - } - } - /// Gets whether the "input_sentence_size" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasInputSentenceSize { - get { return (_hasBits0 & 16) != 0; } - } - /// Clears the value of the "input_sentence_size" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearInputSentenceSize() { - _hasBits0 &= ~16; - } - - /// Field number for the "shuffle_input_sentence" field. - public const int ShuffleInputSentenceFieldNumber = 19; - private readonly static bool ShuffleInputSentenceDefaultValue = true; - - private bool shuffleInputSentence_; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool ShuffleInputSentence { - get { if ((_hasBits0 & 4096) != 0) { return shuffleInputSentence_; } else { return ShuffleInputSentenceDefaultValue; } } - set { - _hasBits0 |= 4096; - shuffleInputSentence_ = value; - } - } - /// Gets whether the "shuffle_input_sentence" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasShuffleInputSentence { - get { return (_hasBits0 & 4096) != 0; } - } - /// Clears the value of the "shuffle_input_sentence" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearShuffleInputSentence() { - _hasBits0 &= ~4096; - } - - /// Field number 
for the "mining_sentence_size" field. - public const int MiningSentenceSizeFieldNumber = 12; - private readonly static int MiningSentenceSizeDefaultValue = 0; - - private int miningSentenceSize_; - /// - /// Maximum size of sentences to make seed sentence pieces. - /// Extended suffix array is constructed to extract frequent - /// sub-strings from the corpus. This uses 20N working space, - /// where N is the size of corpus. - /// - [global::System.ObsoleteAttribute] - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public int MiningSentenceSize { - get { if ((_hasBits0 & 32) != 0) { return miningSentenceSize_; } else { return MiningSentenceSizeDefaultValue; } } - set { - _hasBits0 |= 32; - miningSentenceSize_ = value; - } - } - /// Gets whether the "mining_sentence_size" field is set - [global::System.ObsoleteAttribute] - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasMiningSentenceSize { - get { return (_hasBits0 & 32) != 0; } - } - /// Clears the value of the "mining_sentence_size" field - [global::System.ObsoleteAttribute] - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearMiningSentenceSize() { - _hasBits0 &= ~32; - } - - /// Field number for the "training_sentence_size" field. - public const int TrainingSentenceSizeFieldNumber = 13; - private readonly static int TrainingSentenceSizeDefaultValue = 0; - - private int trainingSentenceSize_; - /// - /// Maximum size of sentences to train sentence pieces. 
- /// - [global::System.ObsoleteAttribute] - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public int TrainingSentenceSize { - get { if ((_hasBits0 & 64) != 0) { return trainingSentenceSize_; } else { return TrainingSentenceSizeDefaultValue; } } - set { - _hasBits0 |= 64; - trainingSentenceSize_ = value; - } - } - /// Gets whether the "training_sentence_size" field is set - [global::System.ObsoleteAttribute] - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasTrainingSentenceSize { - get { return (_hasBits0 & 64) != 0; } - } - /// Clears the value of the "training_sentence_size" field - [global::System.ObsoleteAttribute] - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearTrainingSentenceSize() { - _hasBits0 &= ~64; - } - - /// Field number for the "seed_sentencepiece_size" field. - public const int SeedSentencepieceSizeFieldNumber = 14; - private readonly static int SeedSentencepieceSizeDefaultValue = 1000000; - - private int seedSentencepieceSize_; - /// - /// The size of seed sentencepieces. - /// `seed_sentencepiece_size` must be larger than `vocab_size`. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public int SeedSentencepieceSize { - get { if ((_hasBits0 & 128) != 0) { return seedSentencepieceSize_; } else { return SeedSentencepieceSizeDefaultValue; } } - set { - _hasBits0 |= 128; - seedSentencepieceSize_ = value; - } - } - /// Gets whether the "seed_sentencepiece_size" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasSeedSentencepieceSize { - get { return (_hasBits0 & 128) != 0; } - } - /// Clears the value of the "seed_sentencepiece_size" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearSeedSentencepieceSize() { - _hasBits0 &= ~128; - } - - /// Field number for the "shrinking_factor" field. - public const int ShrinkingFactorFieldNumber = 15; - private readonly static float ShrinkingFactorDefaultValue = 0.75F; - - private float shrinkingFactor_; - /// - /// In every EM sub-iterations, keeps top - /// `shrinking_factor` * `current sentencepieces size` with respect to - /// the loss of the sentence piece. This value should be smaller than 1.0. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public float ShrinkingFactor { - get { if ((_hasBits0 & 256) != 0) { return shrinkingFactor_; } else { return ShrinkingFactorDefaultValue; } } - set { - _hasBits0 |= 256; - shrinkingFactor_ = value; - } - } - /// Gets whether the "shrinking_factor" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasShrinkingFactor { - get { return (_hasBits0 & 256) != 0; } - } - /// Clears the value of the "shrinking_factor" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearShrinkingFactor() { - _hasBits0 &= ~256; - } - - /// Field number for the "max_sentence_length" field. - public const int MaxSentenceLengthFieldNumber = 18; - private readonly static int MaxSentenceLengthDefaultValue = 4192; - - private int maxSentenceLength_; - /// - /// The maximum sentence length in byte. The sentences with the length - /// larger than `max_sentence_length` is simply ignored. - /// Longer input tends to bring the following risks: - /// * Overflow during EM training (unigram language model only) - /// * Performance drop because of O(n log n) cost in BPE. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public int MaxSentenceLength { - get { if ((_hasBits0 & 2048) != 0) { return maxSentenceLength_; } else { return MaxSentenceLengthDefaultValue; } } - set { - _hasBits0 |= 2048; - maxSentenceLength_ = value; - } - } - /// Gets whether the "max_sentence_length" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasMaxSentenceLength { - get { return (_hasBits0 & 2048) != 0; } - } - /// Clears the value of the "max_sentence_length" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearMaxSentenceLength() { - _hasBits0 &= ~2048; - } - - /// Field number for the "num_threads" field. - public const int NumThreadsFieldNumber = 16; - private readonly static int NumThreadsDefaultValue = 16; - - private int numThreads_; - /// - /// Number of threads in the training. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public int NumThreads { - get { if ((_hasBits0 & 512) != 0) { return numThreads_; } else { return NumThreadsDefaultValue; } } - set { - _hasBits0 |= 512; - numThreads_ = value; - } - } - /// Gets whether the "num_threads" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasNumThreads { - get { return (_hasBits0 & 512) != 0; } - } - /// Clears the value of the "num_threads" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearNumThreads() { - _hasBits0 &= ~512; - } - - /// Field number for the "num_sub_iterations" field. 
- public const int NumSubIterationsFieldNumber = 17; - private readonly static int NumSubIterationsDefaultValue = 2; - - private int numSubIterations_; - /// - /// Number of EM sub iterations. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public int NumSubIterations { - get { if ((_hasBits0 & 1024) != 0) { return numSubIterations_; } else { return NumSubIterationsDefaultValue; } } - set { - _hasBits0 |= 1024; - numSubIterations_ = value; - } - } - /// Gets whether the "num_sub_iterations" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasNumSubIterations { - get { return (_hasBits0 & 1024) != 0; } - } - /// Clears the value of the "num_sub_iterations" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearNumSubIterations() { - _hasBits0 &= ~1024; - } - - /// Field number for the "max_sentencepiece_length" field. - public const int MaxSentencepieceLengthFieldNumber = 20; - private readonly static int MaxSentencepieceLengthDefaultValue = 16; - - private int maxSentencepieceLength_; - /// - //////////////////////////////////////////////////////////////////// - /// SentencePiece parameters which control the shapes of sentence piece. - /// - /// Maximum length of sentencepiece. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public int MaxSentencepieceLength { - get { if ((_hasBits0 & 8192) != 0) { return maxSentencepieceLength_; } else { return MaxSentencepieceLengthDefaultValue; } } - set { - _hasBits0 |= 8192; - maxSentencepieceLength_ = value; - } - } - /// Gets whether the "max_sentencepiece_length" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasMaxSentencepieceLength { - get { return (_hasBits0 & 8192) != 0; } - } - /// Clears the value of the "max_sentencepiece_length" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearMaxSentencepieceLength() { - _hasBits0 &= ~8192; - } - - /// Field number for the "split_by_unicode_script" field. - public const int SplitByUnicodeScriptFieldNumber = 21; - private readonly static bool SplitByUnicodeScriptDefaultValue = true; - - private bool splitByUnicodeScript_; - /// - /// Uses Unicode script to split sentence pieces. - /// When `split_by_unicode_script` is true, we do not allow sentence piece to - /// include multiple Unicode scripts, e.g. "F1" is not a valid piece. - /// Exception: CJ characters (Hiragana/Katakana/Han) are all handled - /// as one script type, since Japanese word can consist of multiple scripts. - /// This exception is always applied regardless of the accept-language - /// parameter. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool SplitByUnicodeScript { - get { if ((_hasBits0 & 16384) != 0) { return splitByUnicodeScript_; } else { return SplitByUnicodeScriptDefaultValue; } } - set { - _hasBits0 |= 16384; - splitByUnicodeScript_ = value; - } - } - /// Gets whether the "split_by_unicode_script" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasSplitByUnicodeScript { - get { return (_hasBits0 & 16384) != 0; } - } - /// Clears the value of the "split_by_unicode_script" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearSplitByUnicodeScript() { - _hasBits0 &= ~16384; - } - - /// Field number for the "split_by_number" field. - public const int SplitByNumberFieldNumber = 23; - private readonly static bool SplitByNumberDefaultValue = true; - - private bool splitByNumber_; - /// - /// When `split_by_number` is true, put a boundary between number and - /// non-number transition. If we want to treat "F1" is one token, set this flag - /// to be false. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool SplitByNumber { - get { if ((_hasBits0 & 65536) != 0) { return splitByNumber_; } else { return SplitByNumberDefaultValue; } } - set { - _hasBits0 |= 65536; - splitByNumber_ = value; - } - } - /// Gets whether the "split_by_number" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasSplitByNumber { - get { return (_hasBits0 & 65536) != 0; } - } - /// Clears the value of the "split_by_number" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearSplitByNumber() { - _hasBits0 &= ~65536; - } - - /// Field number for the "split_by_whitespace" field. - public const int SplitByWhitespaceFieldNumber = 22; - private readonly static bool SplitByWhitespaceDefaultValue = true; - - private bool splitByWhitespace_; - /// - /// Use a white space to split sentence pieces. - /// When `split_by_whitespace` is false, we may have the piece containing - /// a white space in the middle. e.g., "in_the". 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool SplitByWhitespace { - get { if ((_hasBits0 & 32768) != 0) { return splitByWhitespace_; } else { return SplitByWhitespaceDefaultValue; } } - set { - _hasBits0 |= 32768; - splitByWhitespace_ = value; - } - } - /// Gets whether the "split_by_whitespace" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasSplitByWhitespace { - get { return (_hasBits0 & 32768) != 0; } - } - /// Clears the value of the "split_by_whitespace" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearSplitByWhitespace() { - _hasBits0 &= ~32768; - } - - /// Field number for the "treat_whitespace_as_suffix" field. - public const int TreatWhitespaceAsSuffixFieldNumber = 24; - private readonly static bool TreatWhitespaceAsSuffixDefaultValue = false; - - private bool treatWhitespaceAsSuffix_; - /// - /// Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello => - /// hello_. When `treat_whitespace_as_suffix` is true, - /// NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end - /// of sentence. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool TreatWhitespaceAsSuffix { - get { if ((_hasBits0 & 131072) != 0) { return treatWhitespaceAsSuffix_; } else { return TreatWhitespaceAsSuffixDefaultValue; } } - set { - _hasBits0 |= 131072; - treatWhitespaceAsSuffix_ = value; - } - } - /// Gets whether the "treat_whitespace_as_suffix" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasTreatWhitespaceAsSuffix { - get { return (_hasBits0 & 131072) != 0; } - } - /// Clears the value of the "treat_whitespace_as_suffix" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearTreatWhitespaceAsSuffix() { - _hasBits0 &= ~131072; - } - - /// Field number for the "allow_whitespace_only_pieces" field. - public const int AllowWhitespaceOnlyPiecesFieldNumber = 26; - private readonly static bool AllowWhitespaceOnlyPiecesDefaultValue = false; - - private bool allowWhitespaceOnlyPieces_; - /// - /// Allows pieces that only contain whitespaces instead of appearing only as - /// prefix or suffix of other pieces. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool AllowWhitespaceOnlyPieces { - get { if ((_hasBits0 & 524288) != 0) { return allowWhitespaceOnlyPieces_; } else { return AllowWhitespaceOnlyPiecesDefaultValue; } } - set { - _hasBits0 |= 524288; - allowWhitespaceOnlyPieces_ = value; - } - } - /// Gets whether the "allow_whitespace_only_pieces" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasAllowWhitespaceOnlyPieces { - get { return (_hasBits0 & 524288) != 0; } - } - /// Clears the value of the "allow_whitespace_only_pieces" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearAllowWhitespaceOnlyPieces() { - _hasBits0 &= ~524288; - } - - /// Field number for the "split_digits" field. - public const int SplitDigitsFieldNumber = 25; - private readonly static bool SplitDigitsDefaultValue = false; - - private bool splitDigits_; - /// - /// Split all digits (0-9) into separate pieces. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool SplitDigits { - get { if ((_hasBits0 & 262144) != 0) { return splitDigits_; } else { return SplitDigitsDefaultValue; } } - set { - _hasBits0 |= 262144; - splitDigits_ = value; - } - } - /// Gets whether the "split_digits" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasSplitDigits { - get { return (_hasBits0 & 262144) != 0; } - } - /// Clears the value of the "split_digits" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearSplitDigits() { - _hasBits0 &= ~262144; - } - - /// Field number for the "pretokenization_delimiter" field. - public const int PretokenizationDelimiterFieldNumber = 53; - private readonly static string PretokenizationDelimiterDefaultValue = ""; - - private string pretokenizationDelimiter_; - /// - /// Defines the pre-tokenization delimiter. - /// When specified, no pieces crossing this delimiter is not included - /// in the vocab. Then the delimiter string is virtually ignored - /// during the training. This field can allows constraints on the vocabulary - /// selection. Note that this field is available on unigram mode. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public string PretokenizationDelimiter { - get { return pretokenizationDelimiter_ ?? 
PretokenizationDelimiterDefaultValue; } - set { - pretokenizationDelimiter_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - /// Gets whether the "pretokenization_delimiter" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasPretokenizationDelimiter { - get { return pretokenizationDelimiter_ != null; } - } - /// Clears the value of the "pretokenization_delimiter" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearPretokenizationDelimiter() { - pretokenizationDelimiter_ = null; - } - - /// Field number for the "control_symbols" field. - public const int ControlSymbolsFieldNumber = 30; - private static readonly pb::FieldCodec _repeated_controlSymbols_codec - = pb::FieldCodec.ForString(242); - private readonly pbc::RepeatedField controlSymbols_ = new pbc::RepeatedField(); - /// - //////////////////////////////////////////////////////////////////// - /// Vocabulary management - /// - /// Defines control symbols used as an indicator to - /// change the behavior of the decoder. <s> and </s> are pre-defined. - /// We can use this field to encode various meta information, - /// including language indicator in multilingual model. - /// These symbols are not visible to users, but visible to - /// the decoder. Note that when the input sentence contains control symbols, - /// they are not treated as one token, but segmented into normal pieces. - /// Control symbols must be inserted independently from the segmentation. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public pbc::RepeatedField ControlSymbols { - get { return controlSymbols_; } - } - - /// Field number for the "user_defined_symbols" field. 
- public const int UserDefinedSymbolsFieldNumber = 31; - private static readonly pb::FieldCodec _repeated_userDefinedSymbols_codec - = pb::FieldCodec.ForString(250); - private readonly pbc::RepeatedField userDefinedSymbols_ = new pbc::RepeatedField(); - /// - /// Defines user defined symbols. - /// These symbols are added with extremely high score - /// so they are always treated as one unique symbol in any context. - /// Typical usage of user_defined_symbols is placeholder for named entities. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public pbc::RepeatedField UserDefinedSymbols { - get { return userDefinedSymbols_; } - } - - /// Field number for the "required_chars" field. - public const int RequiredCharsFieldNumber = 36; - private readonly static string RequiredCharsDefaultValue = ""; - - private string requiredChars_; - /// - /// Defines required characters. Each UTF8 character in this string is included - /// in the character set regardless of character_coverage value. Unlike - /// user_defined_symbols, these characters have scores based on the frequency - /// on input sentences, and the model can form subwords using characters - /// in this field. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public string RequiredChars { - get { return requiredChars_ ?? 
RequiredCharsDefaultValue; } - set { - requiredChars_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - /// Gets whether the "required_chars" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasRequiredChars { - get { return requiredChars_ != null; } - } - /// Clears the value of the "required_chars" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearRequiredChars() { - requiredChars_ = null; - } - - /// Field number for the "byte_fallback" field. - public const int ByteFallbackFieldNumber = 35; - private readonly static bool ByteFallbackDefaultValue = false; - - private bool byteFallback_; - /// - /// Decomposes unknown pieces into UTF-8 bytes. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool ByteFallback { - get { if ((_hasBits0 & 8388608) != 0) { return byteFallback_; } else { return ByteFallbackDefaultValue; } } - set { - _hasBits0 |= 8388608; - byteFallback_ = value; - } - } - /// Gets whether the "byte_fallback" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasByteFallback { - get { return (_hasBits0 & 8388608) != 0; } - } - /// Clears the value of the "byte_fallback" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearByteFallback() { - _hasBits0 &= ~8388608; - } - - /// Field number for the "vocabulary_output_piece_score" field. 
- public const int VocabularyOutputPieceScoreFieldNumber = 32; - private readonly static bool VocabularyOutputPieceScoreDefaultValue = true; - - private bool vocabularyOutputPieceScore_; - /// - /// When creating the vocabulary file, defines whether or not to additionally - /// output the score for each piece. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool VocabularyOutputPieceScore { - get { if ((_hasBits0 & 1048576) != 0) { return vocabularyOutputPieceScore_; } else { return VocabularyOutputPieceScoreDefaultValue; } } - set { - _hasBits0 |= 1048576; - vocabularyOutputPieceScore_ = value; - } - } - /// Gets whether the "vocabulary_output_piece_score" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasVocabularyOutputPieceScore { - get { return (_hasBits0 & 1048576) != 0; } - } - /// Clears the value of the "vocabulary_output_piece_score" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearVocabularyOutputPieceScore() { - _hasBits0 &= ~1048576; - } - - /// Field number for the "hard_vocab_limit" field. - public const int HardVocabLimitFieldNumber = 33; - private readonly static bool HardVocabLimitDefaultValue = true; - - private bool hardVocabLimit_; - /// - /// `vocab_size` is treated as hard limit. Crash if - /// the model can not produce the vocab of size `vocab_size`, - /// When `hard_vocab_limit` is false, vocab_size is treated - /// as soft limit. Note that when model_type=char, - /// always assumes hard_vocab_limit = false. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HardVocabLimit { - get { if ((_hasBits0 & 2097152) != 0) { return hardVocabLimit_; } else { return HardVocabLimitDefaultValue; } } - set { - _hasBits0 |= 2097152; - hardVocabLimit_ = value; - } - } - /// Gets whether the "hard_vocab_limit" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasHardVocabLimit { - get { return (_hasBits0 & 2097152) != 0; } - } - /// Clears the value of the "hard_vocab_limit" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearHardVocabLimit() { - _hasBits0 &= ~2097152; - } - - /// Field number for the "use_all_vocab" field. - public const int UseAllVocabFieldNumber = 34; - private readonly static bool UseAllVocabDefaultValue = false; - - private bool useAllVocab_; - /// - /// use all symbols for vocab extraction. 
This flag is valid - /// if model type is either CHAR or WORD - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool UseAllVocab { - get { if ((_hasBits0 & 4194304) != 0) { return useAllVocab_; } else { return UseAllVocabDefaultValue; } } - set { - _hasBits0 |= 4194304; - useAllVocab_ = value; - } - } - /// Gets whether the "use_all_vocab" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasUseAllVocab { - get { return (_hasBits0 & 4194304) != 0; } - } - /// Clears the value of the "use_all_vocab" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearUseAllVocab() { - _hasBits0 &= ~4194304; - } - - /// Field number for the "unk_id" field. - public const int UnkIdFieldNumber = 40; - private readonly static int UnkIdDefaultValue = 0; - - private int unkId_; - /// - //////////////////////////////////////////////////////////////////// - /// Reserved special meta tokens. - /// * -1 is not used. - /// * unk_id must not be -1. - /// Id must starts with 0 and be contigous. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public int UnkId { - get { if ((_hasBits0 & 16777216) != 0) { return unkId_; } else { return UnkIdDefaultValue; } } - set { - _hasBits0 |= 16777216; - unkId_ = value; - } - } - /// Gets whether the "unk_id" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasUnkId { - get { return (_hasBits0 & 16777216) != 0; } - } - /// Clears the value of the "unk_id" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearUnkId() { - _hasBits0 &= ~16777216; - } - - /// Field number for the "bos_id" field. - public const int BosIdFieldNumber = 41; - private readonly static int BosIdDefaultValue = 1; - - private int bosId_; - /// - /// <s> - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public int BosId { - get { if ((_hasBits0 & 33554432) != 0) { return bosId_; } else { return BosIdDefaultValue; } } - set { - _hasBits0 |= 33554432; - bosId_ = value; - } - } - /// Gets whether the "bos_id" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasBosId { - get { return (_hasBits0 & 33554432) != 0; } - } - /// Clears the value of the "bos_id" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearBosId() { - _hasBits0 &= ~33554432; - } - - /// Field number for the "eos_id" field. 
- public const int EosIdFieldNumber = 42; - private readonly static int EosIdDefaultValue = 2; - - private int eosId_; - /// - /// </s> - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public int EosId { - get { if ((_hasBits0 & 67108864) != 0) { return eosId_; } else { return EosIdDefaultValue; } } - set { - _hasBits0 |= 67108864; - eosId_ = value; - } - } - /// Gets whether the "eos_id" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasEosId { - get { return (_hasBits0 & 67108864) != 0; } - } - /// Clears the value of the "eos_id" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearEosId() { - _hasBits0 &= ~67108864; - } - - /// Field number for the "pad_id" field. - public const int PadIdFieldNumber = 43; - private readonly static int PadIdDefaultValue = -1; - - private int padId_; - /// - /// <pad> (padding) - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public int PadId { - get { if ((_hasBits0 & 134217728) != 0) { return padId_; } else { return PadIdDefaultValue; } } - set { - _hasBits0 |= 134217728; - padId_ = value; - } - } - /// Gets whether the "pad_id" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasPadId { - get { return (_hasBits0 & 134217728) != 0; } - } - /// Clears the value of the "pad_id" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearPadId() { - _hasBits0 &= ~134217728; - } - - /// Field number for the "unk_piece" field. 
- public const int UnkPieceFieldNumber = 45; - private readonly static string UnkPieceDefaultValue = global::System.Text.Encoding.UTF8.GetString(global::System.Convert.FromBase64String("PHVuaz4="), 0, 5); - - private string unkPiece_; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public string UnkPiece { - get { return unkPiece_ ?? UnkPieceDefaultValue; } - set { - unkPiece_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - /// Gets whether the "unk_piece" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasUnkPiece { - get { return unkPiece_ != null; } - } - /// Clears the value of the "unk_piece" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearUnkPiece() { - unkPiece_ = null; - } - - /// Field number for the "bos_piece" field. - public const int BosPieceFieldNumber = 46; - private readonly static string BosPieceDefaultValue = global::System.Text.Encoding.UTF8.GetString(global::System.Convert.FromBase64String("PHM+"), 0, 3); - - private string bosPiece_; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public string BosPiece { - get { return bosPiece_ ?? 
BosPieceDefaultValue; } - set { - bosPiece_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - /// Gets whether the "bos_piece" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasBosPiece { - get { return bosPiece_ != null; } - } - /// Clears the value of the "bos_piece" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearBosPiece() { - bosPiece_ = null; - } - - /// Field number for the "eos_piece" field. - public const int EosPieceFieldNumber = 47; - private readonly static string EosPieceDefaultValue = global::System.Text.Encoding.UTF8.GetString(global::System.Convert.FromBase64String("PC9zPg=="), 0, 4); - - private string eosPiece_; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public string EosPiece { - get { return eosPiece_ ?? EosPieceDefaultValue; } - set { - eosPiece_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - /// Gets whether the "eos_piece" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasEosPiece { - get { return eosPiece_ != null; } - } - /// Clears the value of the "eos_piece" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearEosPiece() { - eosPiece_ = null; - } - - /// Field number for the "pad_piece" field. 
- public const int PadPieceFieldNumber = 48; - private readonly static string PadPieceDefaultValue = global::System.Text.Encoding.UTF8.GetString(global::System.Convert.FromBase64String("PHBhZD4="), 0, 5); - - private string padPiece_; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public string PadPiece { - get { return padPiece_ ?? PadPieceDefaultValue; } - set { - padPiece_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - /// Gets whether the "pad_piece" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasPadPiece { - get { return padPiece_ != null; } - } - /// Clears the value of the "pad_piece" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearPadPiece() { - padPiece_ = null; - } - - /// Field number for the "unk_surface" field. - public const int UnkSurfaceFieldNumber = 44; - private readonly static string UnkSurfaceDefaultValue = global::System.Text.Encoding.UTF8.GetString(global::System.Convert.FromBase64String("IOKBhyA="), 0, 5); - - private string unkSurface_; - /// - /// Encodes <unk> into U+2047 (DOUBLE QUESTION MARK), - /// since this character can be useful both for user and - /// developer. We can easily figure out that <unk> is emitted. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public string UnkSurface { - get { return unkSurface_ ?? 
UnkSurfaceDefaultValue; } - set { - unkSurface_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - /// Gets whether the "unk_surface" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasUnkSurface { - get { return unkSurface_ != null; } - } - /// Clears the value of the "unk_surface" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearUnkSurface() { - unkSurface_ = null; - } - - /// Field number for the "train_extremely_large_corpus" field. - public const int TrainExtremelyLargeCorpusFieldNumber = 49; - private readonly static bool TrainExtremelyLargeCorpusDefaultValue = false; - - private bool trainExtremelyLargeCorpus_; - /// - /// Increase bit depth to allow unigram model training on large - /// (>10M sentences) corpora. A Side-effect of enabling this flag - /// is increased memory usage. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool TrainExtremelyLargeCorpus { - get { if ((_hasBits0 & 268435456) != 0) { return trainExtremelyLargeCorpus_; } else { return TrainExtremelyLargeCorpusDefaultValue; } } - set { - _hasBits0 |= 268435456; - trainExtremelyLargeCorpus_ = value; - } - } - /// Gets whether the "train_extremely_large_corpus" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasTrainExtremelyLargeCorpus { - get { return (_hasBits0 & 268435456) != 0; } - } - /// Clears the value of the "train_extremely_large_corpus" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearTrainExtremelyLargeCorpus() { - _hasBits0 &= ~268435456; - } - - /// Field number for the "seed_sentencepieces_file" field. - public const int SeedSentencepiecesFileFieldNumber = 54; - private readonly static string SeedSentencepiecesFileDefaultValue = ""; - - private string seedSentencepiecesFile_; - /// - /// Path to a seed sentencepieces file, with one tab-separated - /// seed sentencepiece <tab> frequency per line. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public string SeedSentencepiecesFile { - get { return seedSentencepiecesFile_ ?? 
SeedSentencepiecesFileDefaultValue; } - set { - seedSentencepiecesFile_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - /// Gets whether the "seed_sentencepieces_file" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasSeedSentencepiecesFile { - get { return seedSentencepiecesFile_ != null; } - } - /// Clears the value of the "seed_sentencepieces_file" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearSeedSentencepiecesFile() { - seedSentencepiecesFile_ = null; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public override bool Equals(object other) { - return Equals(other as TrainerSpec); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool Equals(TrainerSpec other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if(!input_.Equals(other.input_)) return false; - if (InputFormat != other.InputFormat) return false; - if (ModelPrefix != other.ModelPrefix) return false; - if (ModelType != other.ModelType) return false; - if (VocabSize != other.VocabSize) return false; - if(!acceptLanguage_.Equals(other.acceptLanguage_)) return false; - if (SelfTestSampleSize != other.SelfTestSampleSize) return false; - if (EnableDifferentialPrivacy != other.EnableDifferentialPrivacy) return false; - if (!pbc::ProtobufEqualityComparers.BitwiseSingleEqualityComparer.Equals(DifferentialPrivacyNoiseLevel, other.DifferentialPrivacyNoiseLevel)) return false; - if (DifferentialPrivacyClippingThreshold != other.DifferentialPrivacyClippingThreshold) return false; - if 
(!pbc::ProtobufEqualityComparers.BitwiseSingleEqualityComparer.Equals(CharacterCoverage, other.CharacterCoverage)) return false; - if (InputSentenceSize != other.InputSentenceSize) return false; - if (ShuffleInputSentence != other.ShuffleInputSentence) return false; - if (MiningSentenceSize != other.MiningSentenceSize) return false; - if (TrainingSentenceSize != other.TrainingSentenceSize) return false; - if (SeedSentencepieceSize != other.SeedSentencepieceSize) return false; - if (!pbc::ProtobufEqualityComparers.BitwiseSingleEqualityComparer.Equals(ShrinkingFactor, other.ShrinkingFactor)) return false; - if (MaxSentenceLength != other.MaxSentenceLength) return false; - if (NumThreads != other.NumThreads) return false; - if (NumSubIterations != other.NumSubIterations) return false; - if (MaxSentencepieceLength != other.MaxSentencepieceLength) return false; - if (SplitByUnicodeScript != other.SplitByUnicodeScript) return false; - if (SplitByNumber != other.SplitByNumber) return false; - if (SplitByWhitespace != other.SplitByWhitespace) return false; - if (TreatWhitespaceAsSuffix != other.TreatWhitespaceAsSuffix) return false; - if (AllowWhitespaceOnlyPieces != other.AllowWhitespaceOnlyPieces) return false; - if (SplitDigits != other.SplitDigits) return false; - if (PretokenizationDelimiter != other.PretokenizationDelimiter) return false; - if(!controlSymbols_.Equals(other.controlSymbols_)) return false; - if(!userDefinedSymbols_.Equals(other.userDefinedSymbols_)) return false; - if (RequiredChars != other.RequiredChars) return false; - if (ByteFallback != other.ByteFallback) return false; - if (VocabularyOutputPieceScore != other.VocabularyOutputPieceScore) return false; - if (HardVocabLimit != other.HardVocabLimit) return false; - if (UseAllVocab != other.UseAllVocab) return false; - if (UnkId != other.UnkId) return false; - if (BosId != other.BosId) return false; - if (EosId != other.EosId) return false; - if (PadId != other.PadId) return false; - if (UnkPiece != 
other.UnkPiece) return false; - if (BosPiece != other.BosPiece) return false; - if (EosPiece != other.EosPiece) return false; - if (PadPiece != other.PadPiece) return false; - if (UnkSurface != other.UnkSurface) return false; - if (TrainExtremelyLargeCorpus != other.TrainExtremelyLargeCorpus) return false; - if (SeedSentencepiecesFile != other.SeedSentencepiecesFile) return false; - if (!Equals(_extensions, other._extensions)) { - return false; - } - return Equals(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public override int GetHashCode() { - int hash = 1; - hash ^= input_.GetHashCode(); - if (HasInputFormat) hash ^= InputFormat.GetHashCode(); - if (HasModelPrefix) hash ^= ModelPrefix.GetHashCode(); - if (HasModelType) hash ^= ModelType.GetHashCode(); - if (HasVocabSize) hash ^= VocabSize.GetHashCode(); - hash ^= acceptLanguage_.GetHashCode(); - if (HasSelfTestSampleSize) hash ^= SelfTestSampleSize.GetHashCode(); - if (HasEnableDifferentialPrivacy) hash ^= EnableDifferentialPrivacy.GetHashCode(); - if (HasDifferentialPrivacyNoiseLevel) hash ^= pbc::ProtobufEqualityComparers.BitwiseSingleEqualityComparer.GetHashCode(DifferentialPrivacyNoiseLevel); - if (HasDifferentialPrivacyClippingThreshold) hash ^= DifferentialPrivacyClippingThreshold.GetHashCode(); - if (HasCharacterCoverage) hash ^= pbc::ProtobufEqualityComparers.BitwiseSingleEqualityComparer.GetHashCode(CharacterCoverage); - if (HasInputSentenceSize) hash ^= InputSentenceSize.GetHashCode(); - if (HasShuffleInputSentence) hash ^= ShuffleInputSentence.GetHashCode(); - if (HasMiningSentenceSize) hash ^= MiningSentenceSize.GetHashCode(); - if (HasTrainingSentenceSize) hash ^= TrainingSentenceSize.GetHashCode(); - if (HasSeedSentencepieceSize) hash ^= SeedSentencepieceSize.GetHashCode(); - if (HasShrinkingFactor) hash ^= 
pbc::ProtobufEqualityComparers.BitwiseSingleEqualityComparer.GetHashCode(ShrinkingFactor); - if (HasMaxSentenceLength) hash ^= MaxSentenceLength.GetHashCode(); - if (HasNumThreads) hash ^= NumThreads.GetHashCode(); - if (HasNumSubIterations) hash ^= NumSubIterations.GetHashCode(); - if (HasMaxSentencepieceLength) hash ^= MaxSentencepieceLength.GetHashCode(); - if (HasSplitByUnicodeScript) hash ^= SplitByUnicodeScript.GetHashCode(); - if (HasSplitByNumber) hash ^= SplitByNumber.GetHashCode(); - if (HasSplitByWhitespace) hash ^= SplitByWhitespace.GetHashCode(); - if (HasTreatWhitespaceAsSuffix) hash ^= TreatWhitespaceAsSuffix.GetHashCode(); - if (HasAllowWhitespaceOnlyPieces) hash ^= AllowWhitespaceOnlyPieces.GetHashCode(); - if (HasSplitDigits) hash ^= SplitDigits.GetHashCode(); - if (HasPretokenizationDelimiter) hash ^= PretokenizationDelimiter.GetHashCode(); - hash ^= controlSymbols_.GetHashCode(); - hash ^= userDefinedSymbols_.GetHashCode(); - if (HasRequiredChars) hash ^= RequiredChars.GetHashCode(); - if (HasByteFallback) hash ^= ByteFallback.GetHashCode(); - if (HasVocabularyOutputPieceScore) hash ^= VocabularyOutputPieceScore.GetHashCode(); - if (HasHardVocabLimit) hash ^= HardVocabLimit.GetHashCode(); - if (HasUseAllVocab) hash ^= UseAllVocab.GetHashCode(); - if (HasUnkId) hash ^= UnkId.GetHashCode(); - if (HasBosId) hash ^= BosId.GetHashCode(); - if (HasEosId) hash ^= EosId.GetHashCode(); - if (HasPadId) hash ^= PadId.GetHashCode(); - if (HasUnkPiece) hash ^= UnkPiece.GetHashCode(); - if (HasBosPiece) hash ^= BosPiece.GetHashCode(); - if (HasEosPiece) hash ^= EosPiece.GetHashCode(); - if (HasPadPiece) hash ^= PadPiece.GetHashCode(); - if (HasUnkSurface) hash ^= UnkSurface.GetHashCode(); - if (HasTrainExtremelyLargeCorpus) hash ^= TrainExtremelyLargeCorpus.GetHashCode(); - if (HasSeedSentencepiecesFile) hash ^= SeedSentencepiecesFile.GetHashCode(); - if (_extensions != null) { - hash ^= _extensions.GetHashCode(); - } - if (_unknownFields != null) { - hash ^= 
_unknownFields.GetHashCode(); - } - return hash; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void WriteTo(pb::CodedOutputStream output) { - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - output.WriteRawMessage(this); - #else - input_.WriteTo(output, _repeated_input_codec); - if (HasModelPrefix) { - output.WriteRawTag(18); - output.WriteString(ModelPrefix); - } - if (HasModelType) { - output.WriteRawTag(24); - output.WriteEnum((int) ModelType); - } - if (HasVocabSize) { - output.WriteRawTag(32); - output.WriteInt32(VocabSize); - } - acceptLanguage_.WriteTo(output, _repeated_acceptLanguage_codec); - if (HasSelfTestSampleSize) { - output.WriteRawTag(48); - output.WriteInt32(SelfTestSampleSize); - } - if (HasInputFormat) { - output.WriteRawTag(58); - output.WriteString(InputFormat); - } - if (HasCharacterCoverage) { - output.WriteRawTag(85); - output.WriteFloat(CharacterCoverage); - } - if (HasInputSentenceSize) { - output.WriteRawTag(88); - output.WriteUInt64(InputSentenceSize); - } - if (HasMiningSentenceSize) { - output.WriteRawTag(96); - output.WriteInt32(MiningSentenceSize); - } - if (HasTrainingSentenceSize) { - output.WriteRawTag(104); - output.WriteInt32(TrainingSentenceSize); - } - if (HasSeedSentencepieceSize) { - output.WriteRawTag(112); - output.WriteInt32(SeedSentencepieceSize); - } - if (HasShrinkingFactor) { - output.WriteRawTag(125); - output.WriteFloat(ShrinkingFactor); - } - if (HasNumThreads) { - output.WriteRawTag(128, 1); - output.WriteInt32(NumThreads); - } - if (HasNumSubIterations) { - output.WriteRawTag(136, 1); - output.WriteInt32(NumSubIterations); - } - if (HasMaxSentenceLength) { - output.WriteRawTag(144, 1); - 
output.WriteInt32(MaxSentenceLength); - } - if (HasShuffleInputSentence) { - output.WriteRawTag(152, 1); - output.WriteBool(ShuffleInputSentence); - } - if (HasMaxSentencepieceLength) { - output.WriteRawTag(160, 1); - output.WriteInt32(MaxSentencepieceLength); - } - if (HasSplitByUnicodeScript) { - output.WriteRawTag(168, 1); - output.WriteBool(SplitByUnicodeScript); - } - if (HasSplitByWhitespace) { - output.WriteRawTag(176, 1); - output.WriteBool(SplitByWhitespace); - } - if (HasSplitByNumber) { - output.WriteRawTag(184, 1); - output.WriteBool(SplitByNumber); - } - if (HasTreatWhitespaceAsSuffix) { - output.WriteRawTag(192, 1); - output.WriteBool(TreatWhitespaceAsSuffix); - } - if (HasSplitDigits) { - output.WriteRawTag(200, 1); - output.WriteBool(SplitDigits); - } - if (HasAllowWhitespaceOnlyPieces) { - output.WriteRawTag(208, 1); - output.WriteBool(AllowWhitespaceOnlyPieces); - } - controlSymbols_.WriteTo(output, _repeated_controlSymbols_codec); - userDefinedSymbols_.WriteTo(output, _repeated_userDefinedSymbols_codec); - if (HasVocabularyOutputPieceScore) { - output.WriteRawTag(128, 2); - output.WriteBool(VocabularyOutputPieceScore); - } - if (HasHardVocabLimit) { - output.WriteRawTag(136, 2); - output.WriteBool(HardVocabLimit); - } - if (HasUseAllVocab) { - output.WriteRawTag(144, 2); - output.WriteBool(UseAllVocab); - } - if (HasByteFallback) { - output.WriteRawTag(152, 2); - output.WriteBool(ByteFallback); - } - if (HasRequiredChars) { - output.WriteRawTag(162, 2); - output.WriteString(RequiredChars); - } - if (HasUnkId) { - output.WriteRawTag(192, 2); - output.WriteInt32(UnkId); - } - if (HasBosId) { - output.WriteRawTag(200, 2); - output.WriteInt32(BosId); - } - if (HasEosId) { - output.WriteRawTag(208, 2); - output.WriteInt32(EosId); - } - if (HasPadId) { - output.WriteRawTag(216, 2); - output.WriteInt32(PadId); - } - if (HasUnkSurface) { - output.WriteRawTag(226, 2); - output.WriteString(UnkSurface); - } - if (HasUnkPiece) { - output.WriteRawTag(234, 2); 
- output.WriteString(UnkPiece); - } - if (HasBosPiece) { - output.WriteRawTag(242, 2); - output.WriteString(BosPiece); - } - if (HasEosPiece) { - output.WriteRawTag(250, 2); - output.WriteString(EosPiece); - } - if (HasPadPiece) { - output.WriteRawTag(130, 3); - output.WriteString(PadPiece); - } - if (HasTrainExtremelyLargeCorpus) { - output.WriteRawTag(136, 3); - output.WriteBool(TrainExtremelyLargeCorpus); - } - if (HasEnableDifferentialPrivacy) { - output.WriteRawTag(144, 3); - output.WriteBool(EnableDifferentialPrivacy); - } - if (HasDifferentialPrivacyNoiseLevel) { - output.WriteRawTag(157, 3); - output.WriteFloat(DifferentialPrivacyNoiseLevel); - } - if (HasDifferentialPrivacyClippingThreshold) { - output.WriteRawTag(160, 3); - output.WriteUInt64(DifferentialPrivacyClippingThreshold); - } - if (HasPretokenizationDelimiter) { - output.WriteRawTag(170, 3); - output.WriteString(PretokenizationDelimiter); - } - if (HasSeedSentencepiecesFile) { - output.WriteRawTag(178, 3); - output.WriteString(SeedSentencepiecesFile); - } - if (_extensions != null) { - _extensions.WriteTo(output); - } - if (_unknownFields != null) { - _unknownFields.WriteTo(output); - } - #endif - } - - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - void pb::IBufferMessage.InternalWriteTo(ref pb::WriteContext output) { - input_.WriteTo(ref output, _repeated_input_codec); - if (HasModelPrefix) { - output.WriteRawTag(18); - output.WriteString(ModelPrefix); - } - if (HasModelType) { - output.WriteRawTag(24); - output.WriteEnum((int) ModelType); - } - if (HasVocabSize) { - output.WriteRawTag(32); - output.WriteInt32(VocabSize); - } - acceptLanguage_.WriteTo(ref output, _repeated_acceptLanguage_codec); - if (HasSelfTestSampleSize) { - output.WriteRawTag(48); - output.WriteInt32(SelfTestSampleSize); - } - if (HasInputFormat) { - output.WriteRawTag(58); - 
output.WriteString(InputFormat); - } - if (HasCharacterCoverage) { - output.WriteRawTag(85); - output.WriteFloat(CharacterCoverage); - } - if (HasInputSentenceSize) { - output.WriteRawTag(88); - output.WriteUInt64(InputSentenceSize); - } - if (HasMiningSentenceSize) { - output.WriteRawTag(96); - output.WriteInt32(MiningSentenceSize); - } - if (HasTrainingSentenceSize) { - output.WriteRawTag(104); - output.WriteInt32(TrainingSentenceSize); - } - if (HasSeedSentencepieceSize) { - output.WriteRawTag(112); - output.WriteInt32(SeedSentencepieceSize); - } - if (HasShrinkingFactor) { - output.WriteRawTag(125); - output.WriteFloat(ShrinkingFactor); - } - if (HasNumThreads) { - output.WriteRawTag(128, 1); - output.WriteInt32(NumThreads); - } - if (HasNumSubIterations) { - output.WriteRawTag(136, 1); - output.WriteInt32(NumSubIterations); - } - if (HasMaxSentenceLength) { - output.WriteRawTag(144, 1); - output.WriteInt32(MaxSentenceLength); - } - if (HasShuffleInputSentence) { - output.WriteRawTag(152, 1); - output.WriteBool(ShuffleInputSentence); - } - if (HasMaxSentencepieceLength) { - output.WriteRawTag(160, 1); - output.WriteInt32(MaxSentencepieceLength); - } - if (HasSplitByUnicodeScript) { - output.WriteRawTag(168, 1); - output.WriteBool(SplitByUnicodeScript); - } - if (HasSplitByWhitespace) { - output.WriteRawTag(176, 1); - output.WriteBool(SplitByWhitespace); - } - if (HasSplitByNumber) { - output.WriteRawTag(184, 1); - output.WriteBool(SplitByNumber); - } - if (HasTreatWhitespaceAsSuffix) { - output.WriteRawTag(192, 1); - output.WriteBool(TreatWhitespaceAsSuffix); - } - if (HasSplitDigits) { - output.WriteRawTag(200, 1); - output.WriteBool(SplitDigits); - } - if (HasAllowWhitespaceOnlyPieces) { - output.WriteRawTag(208, 1); - output.WriteBool(AllowWhitespaceOnlyPieces); - } - controlSymbols_.WriteTo(ref output, _repeated_controlSymbols_codec); - userDefinedSymbols_.WriteTo(ref output, _repeated_userDefinedSymbols_codec); - if (HasVocabularyOutputPieceScore) { - 
output.WriteRawTag(128, 2); - output.WriteBool(VocabularyOutputPieceScore); - } - if (HasHardVocabLimit) { - output.WriteRawTag(136, 2); - output.WriteBool(HardVocabLimit); - } - if (HasUseAllVocab) { - output.WriteRawTag(144, 2); - output.WriteBool(UseAllVocab); - } - if (HasByteFallback) { - output.WriteRawTag(152, 2); - output.WriteBool(ByteFallback); - } - if (HasRequiredChars) { - output.WriteRawTag(162, 2); - output.WriteString(RequiredChars); - } - if (HasUnkId) { - output.WriteRawTag(192, 2); - output.WriteInt32(UnkId); - } - if (HasBosId) { - output.WriteRawTag(200, 2); - output.WriteInt32(BosId); - } - if (HasEosId) { - output.WriteRawTag(208, 2); - output.WriteInt32(EosId); - } - if (HasPadId) { - output.WriteRawTag(216, 2); - output.WriteInt32(PadId); - } - if (HasUnkSurface) { - output.WriteRawTag(226, 2); - output.WriteString(UnkSurface); - } - if (HasUnkPiece) { - output.WriteRawTag(234, 2); - output.WriteString(UnkPiece); - } - if (HasBosPiece) { - output.WriteRawTag(242, 2); - output.WriteString(BosPiece); - } - if (HasEosPiece) { - output.WriteRawTag(250, 2); - output.WriteString(EosPiece); - } - if (HasPadPiece) { - output.WriteRawTag(130, 3); - output.WriteString(PadPiece); - } - if (HasTrainExtremelyLargeCorpus) { - output.WriteRawTag(136, 3); - output.WriteBool(TrainExtremelyLargeCorpus); - } - if (HasEnableDifferentialPrivacy) { - output.WriteRawTag(144, 3); - output.WriteBool(EnableDifferentialPrivacy); - } - if (HasDifferentialPrivacyNoiseLevel) { - output.WriteRawTag(157, 3); - output.WriteFloat(DifferentialPrivacyNoiseLevel); - } - if (HasDifferentialPrivacyClippingThreshold) { - output.WriteRawTag(160, 3); - output.WriteUInt64(DifferentialPrivacyClippingThreshold); - } - if (HasPretokenizationDelimiter) { - output.WriteRawTag(170, 3); - output.WriteString(PretokenizationDelimiter); - } - if (HasSeedSentencepiecesFile) { - output.WriteRawTag(178, 3); - output.WriteString(SeedSentencepiecesFile); - } - if (_extensions != null) { - 
_extensions.WriteTo(ref output); - } - if (_unknownFields != null) { - _unknownFields.WriteTo(ref output); - } - } - #endif - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public int CalculateSize() { - int size = 0; - size += input_.CalculateSize(_repeated_input_codec); - if (HasInputFormat) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(InputFormat); - } - if (HasModelPrefix) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(ModelPrefix); - } - if (HasModelType) { - size += 1 + pb::CodedOutputStream.ComputeEnumSize((int) ModelType); - } - if (HasVocabSize) { - size += 1 + pb::CodedOutputStream.ComputeInt32Size(VocabSize); - } - size += acceptLanguage_.CalculateSize(_repeated_acceptLanguage_codec); - if (HasSelfTestSampleSize) { - size += 1 + pb::CodedOutputStream.ComputeInt32Size(SelfTestSampleSize); - } - if (HasEnableDifferentialPrivacy) { - size += 2 + 1; - } - if (HasDifferentialPrivacyNoiseLevel) { - size += 2 + 4; - } - if (HasDifferentialPrivacyClippingThreshold) { - size += 2 + pb::CodedOutputStream.ComputeUInt64Size(DifferentialPrivacyClippingThreshold); - } - if (HasCharacterCoverage) { - size += 1 + 4; - } - if (HasInputSentenceSize) { - size += 1 + pb::CodedOutputStream.ComputeUInt64Size(InputSentenceSize); - } - if (HasShuffleInputSentence) { - size += 2 + 1; - } - if (HasMiningSentenceSize) { - size += 1 + pb::CodedOutputStream.ComputeInt32Size(MiningSentenceSize); - } - if (HasTrainingSentenceSize) { - size += 1 + pb::CodedOutputStream.ComputeInt32Size(TrainingSentenceSize); - } - if (HasSeedSentencepieceSize) { - size += 1 + pb::CodedOutputStream.ComputeInt32Size(SeedSentencepieceSize); - } - if (HasShrinkingFactor) { - size += 1 + 4; - } - if (HasMaxSentenceLength) { - size += 2 + pb::CodedOutputStream.ComputeInt32Size(MaxSentenceLength); - } - if (HasNumThreads) { - size += 2 + pb::CodedOutputStream.ComputeInt32Size(NumThreads); - } - if 
(HasNumSubIterations) { - size += 2 + pb::CodedOutputStream.ComputeInt32Size(NumSubIterations); - } - if (HasMaxSentencepieceLength) { - size += 2 + pb::CodedOutputStream.ComputeInt32Size(MaxSentencepieceLength); - } - if (HasSplitByUnicodeScript) { - size += 2 + 1; - } - if (HasSplitByNumber) { - size += 2 + 1; - } - if (HasSplitByWhitespace) { - size += 2 + 1; - } - if (HasTreatWhitespaceAsSuffix) { - size += 2 + 1; - } - if (HasAllowWhitespaceOnlyPieces) { - size += 2 + 1; - } - if (HasSplitDigits) { - size += 2 + 1; - } - if (HasPretokenizationDelimiter) { - size += 2 + pb::CodedOutputStream.ComputeStringSize(PretokenizationDelimiter); - } - size += controlSymbols_.CalculateSize(_repeated_controlSymbols_codec); - size += userDefinedSymbols_.CalculateSize(_repeated_userDefinedSymbols_codec); - if (HasRequiredChars) { - size += 2 + pb::CodedOutputStream.ComputeStringSize(RequiredChars); - } - if (HasByteFallback) { - size += 2 + 1; - } - if (HasVocabularyOutputPieceScore) { - size += 2 + 1; - } - if (HasHardVocabLimit) { - size += 2 + 1; - } - if (HasUseAllVocab) { - size += 2 + 1; - } - if (HasUnkId) { - size += 2 + pb::CodedOutputStream.ComputeInt32Size(UnkId); - } - if (HasBosId) { - size += 2 + pb::CodedOutputStream.ComputeInt32Size(BosId); - } - if (HasEosId) { - size += 2 + pb::CodedOutputStream.ComputeInt32Size(EosId); - } - if (HasPadId) { - size += 2 + pb::CodedOutputStream.ComputeInt32Size(PadId); - } - if (HasUnkPiece) { - size += 2 + pb::CodedOutputStream.ComputeStringSize(UnkPiece); - } - if (HasBosPiece) { - size += 2 + pb::CodedOutputStream.ComputeStringSize(BosPiece); - } - if (HasEosPiece) { - size += 2 + pb::CodedOutputStream.ComputeStringSize(EosPiece); - } - if (HasPadPiece) { - size += 2 + pb::CodedOutputStream.ComputeStringSize(PadPiece); - } - if (HasUnkSurface) { - size += 2 + pb::CodedOutputStream.ComputeStringSize(UnkSurface); - } - if (HasTrainExtremelyLargeCorpus) { - size += 2 + 1; - } - if (HasSeedSentencepiecesFile) { - size += 2 + 
pb::CodedOutputStream.ComputeStringSize(SeedSentencepiecesFile); - } - if (_extensions != null) { - size += _extensions.CalculateSize(); - } - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void MergeFrom(TrainerSpec other) { - if (other == null) { - return; - } - input_.Add(other.input_); - if (other.HasInputFormat) { - InputFormat = other.InputFormat; - } - if (other.HasModelPrefix) { - ModelPrefix = other.ModelPrefix; - } - if (other.HasModelType) { - ModelType = other.ModelType; - } - if (other.HasVocabSize) { - VocabSize = other.VocabSize; - } - acceptLanguage_.Add(other.acceptLanguage_); - if (other.HasSelfTestSampleSize) { - SelfTestSampleSize = other.SelfTestSampleSize; - } - if (other.HasEnableDifferentialPrivacy) { - EnableDifferentialPrivacy = other.EnableDifferentialPrivacy; - } - if (other.HasDifferentialPrivacyNoiseLevel) { - DifferentialPrivacyNoiseLevel = other.DifferentialPrivacyNoiseLevel; - } - if (other.HasDifferentialPrivacyClippingThreshold) { - DifferentialPrivacyClippingThreshold = other.DifferentialPrivacyClippingThreshold; - } - if (other.HasCharacterCoverage) { - CharacterCoverage = other.CharacterCoverage; - } - if (other.HasInputSentenceSize) { - InputSentenceSize = other.InputSentenceSize; - } - if (other.HasShuffleInputSentence) { - ShuffleInputSentence = other.ShuffleInputSentence; - } - if (other.HasMiningSentenceSize) { - MiningSentenceSize = other.MiningSentenceSize; - } - if (other.HasTrainingSentenceSize) { - TrainingSentenceSize = other.TrainingSentenceSize; - } - if (other.HasSeedSentencepieceSize) { - SeedSentencepieceSize = other.SeedSentencepieceSize; - } - if (other.HasShrinkingFactor) { - ShrinkingFactor = other.ShrinkingFactor; - } - if (other.HasMaxSentenceLength) { - MaxSentenceLength = other.MaxSentenceLength; - } - if 
(other.HasNumThreads) { - NumThreads = other.NumThreads; - } - if (other.HasNumSubIterations) { - NumSubIterations = other.NumSubIterations; - } - if (other.HasMaxSentencepieceLength) { - MaxSentencepieceLength = other.MaxSentencepieceLength; - } - if (other.HasSplitByUnicodeScript) { - SplitByUnicodeScript = other.SplitByUnicodeScript; - } - if (other.HasSplitByNumber) { - SplitByNumber = other.SplitByNumber; - } - if (other.HasSplitByWhitespace) { - SplitByWhitespace = other.SplitByWhitespace; - } - if (other.HasTreatWhitespaceAsSuffix) { - TreatWhitespaceAsSuffix = other.TreatWhitespaceAsSuffix; - } - if (other.HasAllowWhitespaceOnlyPieces) { - AllowWhitespaceOnlyPieces = other.AllowWhitespaceOnlyPieces; - } - if (other.HasSplitDigits) { - SplitDigits = other.SplitDigits; - } - if (other.HasPretokenizationDelimiter) { - PretokenizationDelimiter = other.PretokenizationDelimiter; - } - controlSymbols_.Add(other.controlSymbols_); - userDefinedSymbols_.Add(other.userDefinedSymbols_); - if (other.HasRequiredChars) { - RequiredChars = other.RequiredChars; - } - if (other.HasByteFallback) { - ByteFallback = other.ByteFallback; - } - if (other.HasVocabularyOutputPieceScore) { - VocabularyOutputPieceScore = other.VocabularyOutputPieceScore; - } - if (other.HasHardVocabLimit) { - HardVocabLimit = other.HardVocabLimit; - } - if (other.HasUseAllVocab) { - UseAllVocab = other.UseAllVocab; - } - if (other.HasUnkId) { - UnkId = other.UnkId; - } - if (other.HasBosId) { - BosId = other.BosId; - } - if (other.HasEosId) { - EosId = other.EosId; - } - if (other.HasPadId) { - PadId = other.PadId; - } - if (other.HasUnkPiece) { - UnkPiece = other.UnkPiece; - } - if (other.HasBosPiece) { - BosPiece = other.BosPiece; - } - if (other.HasEosPiece) { - EosPiece = other.EosPiece; - } - if (other.HasPadPiece) { - PadPiece = other.PadPiece; - } - if (other.HasUnkSurface) { - UnkSurface = other.UnkSurface; - } - if (other.HasTrainExtremelyLargeCorpus) { - TrainExtremelyLargeCorpus = 
other.TrainExtremelyLargeCorpus; - } - if (other.HasSeedSentencepiecesFile) { - SeedSentencepiecesFile = other.SeedSentencepiecesFile; - } - pb::ExtensionSet.MergeFrom(ref _extensions, other._extensions); - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void MergeFrom(pb::CodedInputStream input) { - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - input.ReadRawMessage(this); - #else - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - if (!pb::ExtensionSet.TryMergeFieldFrom(ref _extensions, input)) { - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); - } - break; - case 10: { - input_.AddEntriesFrom(input, _repeated_input_codec); - break; - } - case 18: { - ModelPrefix = input.ReadString(); - break; - } - case 24: { - ModelType = (global::Sentencepiece.TrainerSpec.Types.ModelType) input.ReadEnum(); - break; - } - case 32: { - VocabSize = input.ReadInt32(); - break; - } - case 42: { - acceptLanguage_.AddEntriesFrom(input, _repeated_acceptLanguage_codec); - break; - } - case 48: { - SelfTestSampleSize = input.ReadInt32(); - break; - } - case 58: { - InputFormat = input.ReadString(); - break; - } - case 85: { - CharacterCoverage = input.ReadFloat(); - break; - } - case 88: { - InputSentenceSize = input.ReadUInt64(); - break; - } - case 96: { - MiningSentenceSize = input.ReadInt32(); - break; - } - case 104: { - TrainingSentenceSize = input.ReadInt32(); - break; - } - case 112: { - SeedSentencepieceSize = input.ReadInt32(); - break; - } - case 125: { - ShrinkingFactor = input.ReadFloat(); - break; - } - case 128: { - NumThreads = input.ReadInt32(); - break; - } - case 136: { - NumSubIterations = input.ReadInt32(); - break; - } - case 144: { - MaxSentenceLength = input.ReadInt32(); - break; - } - case 152: { - ShuffleInputSentence = 
input.ReadBool(); - break; - } - case 160: { - MaxSentencepieceLength = input.ReadInt32(); - break; - } - case 168: { - SplitByUnicodeScript = input.ReadBool(); - break; - } - case 176: { - SplitByWhitespace = input.ReadBool(); - break; - } - case 184: { - SplitByNumber = input.ReadBool(); - break; - } - case 192: { - TreatWhitespaceAsSuffix = input.ReadBool(); - break; - } - case 200: { - SplitDigits = input.ReadBool(); - break; - } - case 208: { - AllowWhitespaceOnlyPieces = input.ReadBool(); - break; - } - case 242: { - controlSymbols_.AddEntriesFrom(input, _repeated_controlSymbols_codec); - break; - } - case 250: { - userDefinedSymbols_.AddEntriesFrom(input, _repeated_userDefinedSymbols_codec); - break; - } - case 256: { - VocabularyOutputPieceScore = input.ReadBool(); - break; - } - case 264: { - HardVocabLimit = input.ReadBool(); - break; - } - case 272: { - UseAllVocab = input.ReadBool(); - break; - } - case 280: { - ByteFallback = input.ReadBool(); - break; - } - case 290: { - RequiredChars = input.ReadString(); - break; - } - case 320: { - UnkId = input.ReadInt32(); - break; - } - case 328: { - BosId = input.ReadInt32(); - break; - } - case 336: { - EosId = input.ReadInt32(); - break; - } - case 344: { - PadId = input.ReadInt32(); - break; - } - case 354: { - UnkSurface = input.ReadString(); - break; - } - case 362: { - UnkPiece = input.ReadString(); - break; - } - case 370: { - BosPiece = input.ReadString(); - break; - } - case 378: { - EosPiece = input.ReadString(); - break; - } - case 386: { - PadPiece = input.ReadString(); - break; - } - case 392: { - TrainExtremelyLargeCorpus = input.ReadBool(); - break; - } - case 400: { - EnableDifferentialPrivacy = input.ReadBool(); - break; - } - case 413: { - DifferentialPrivacyNoiseLevel = input.ReadFloat(); - break; - } - case 416: { - DifferentialPrivacyClippingThreshold = input.ReadUInt64(); - break; - } - case 426: { - PretokenizationDelimiter = input.ReadString(); - break; - } - case 434: { - 
SeedSentencepiecesFile = input.ReadString(); - break; - } - } - } - #endif - } - - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - void pb::IBufferMessage.InternalMergeFrom(ref pb::ParseContext input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - if (!pb::ExtensionSet.TryMergeFieldFrom(ref _extensions, ref input)) { - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, ref input); - } - break; - case 10: { - input_.AddEntriesFrom(ref input, _repeated_input_codec); - break; - } - case 18: { - ModelPrefix = input.ReadString(); - break; - } - case 24: { - ModelType = (global::Sentencepiece.TrainerSpec.Types.ModelType) input.ReadEnum(); - break; - } - case 32: { - VocabSize = input.ReadInt32(); - break; - } - case 42: { - acceptLanguage_.AddEntriesFrom(ref input, _repeated_acceptLanguage_codec); - break; - } - case 48: { - SelfTestSampleSize = input.ReadInt32(); - break; - } - case 58: { - InputFormat = input.ReadString(); - break; - } - case 85: { - CharacterCoverage = input.ReadFloat(); - break; - } - case 88: { - InputSentenceSize = input.ReadUInt64(); - break; - } - case 96: { - MiningSentenceSize = input.ReadInt32(); - break; - } - case 104: { - TrainingSentenceSize = input.ReadInt32(); - break; - } - case 112: { - SeedSentencepieceSize = input.ReadInt32(); - break; - } - case 125: { - ShrinkingFactor = input.ReadFloat(); - break; - } - case 128: { - NumThreads = input.ReadInt32(); - break; - } - case 136: { - NumSubIterations = input.ReadInt32(); - break; - } - case 144: { - MaxSentenceLength = input.ReadInt32(); - break; - } - case 152: { - ShuffleInputSentence = input.ReadBool(); - break; - } - case 160: { - MaxSentencepieceLength = input.ReadInt32(); - break; - } - case 168: { - SplitByUnicodeScript = input.ReadBool(); - break; - } - case 176: { - SplitByWhitespace = 
input.ReadBool(); - break; - } - case 184: { - SplitByNumber = input.ReadBool(); - break; - } - case 192: { - TreatWhitespaceAsSuffix = input.ReadBool(); - break; - } - case 200: { - SplitDigits = input.ReadBool(); - break; - } - case 208: { - AllowWhitespaceOnlyPieces = input.ReadBool(); - break; - } - case 242: { - controlSymbols_.AddEntriesFrom(ref input, _repeated_controlSymbols_codec); - break; - } - case 250: { - userDefinedSymbols_.AddEntriesFrom(ref input, _repeated_userDefinedSymbols_codec); - break; - } - case 256: { - VocabularyOutputPieceScore = input.ReadBool(); - break; - } - case 264: { - HardVocabLimit = input.ReadBool(); - break; - } - case 272: { - UseAllVocab = input.ReadBool(); - break; - } - case 280: { - ByteFallback = input.ReadBool(); - break; - } - case 290: { - RequiredChars = input.ReadString(); - break; - } - case 320: { - UnkId = input.ReadInt32(); - break; - } - case 328: { - BosId = input.ReadInt32(); - break; - } - case 336: { - EosId = input.ReadInt32(); - break; - } - case 344: { - PadId = input.ReadInt32(); - break; - } - case 354: { - UnkSurface = input.ReadString(); - break; - } - case 362: { - UnkPiece = input.ReadString(); - break; - } - case 370: { - BosPiece = input.ReadString(); - break; - } - case 378: { - EosPiece = input.ReadString(); - break; - } - case 386: { - PadPiece = input.ReadString(); - break; - } - case 392: { - TrainExtremelyLargeCorpus = input.ReadBool(); - break; - } - case 400: { - EnableDifferentialPrivacy = input.ReadBool(); - break; - } - case 413: { - DifferentialPrivacyNoiseLevel = input.ReadFloat(); - break; - } - case 416: { - DifferentialPrivacyClippingThreshold = input.ReadUInt64(); - break; - } - case 426: { - PretokenizationDelimiter = input.ReadString(); - break; - } - case 434: { - SeedSentencepiecesFile = input.ReadString(); - break; - } - } - } - } - #endif - - public TValue GetExtension(pb::Extension extension) { - return pb::ExtensionSet.Get(ref _extensions, extension); - } - public 
pbc::RepeatedField GetExtension(pb::RepeatedExtension extension) { - return pb::ExtensionSet.Get(ref _extensions, extension); - } - public pbc::RepeatedField GetOrInitializeExtension(pb::RepeatedExtension extension) { - return pb::ExtensionSet.GetOrInitialize(ref _extensions, extension); - } - public void SetExtension(pb::Extension extension, TValue value) { - pb::ExtensionSet.Set(ref _extensions, extension, value); - } - public bool HasExtension(pb::Extension extension) { - return pb::ExtensionSet.Has(ref _extensions, extension); - } - public void ClearExtension(pb::Extension extension) { - pb::ExtensionSet.Clear(ref _extensions, extension); - } - public void ClearExtension(pb::RepeatedExtension extension) { - pb::ExtensionSet.Clear(ref _extensions, extension); - } - - #region Nested types - /// Container for nested types declared in the TrainerSpec message type. - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public static partial class Types { - /// - /// Model type. only have UNIGRAM now. - /// - public enum ModelType { - /// - /// Unigram language model with dynamic algorithm - /// - [pbr::OriginalName("UNIGRAM")] Unigram = 1, - /// - /// Byte Pair Encoding - /// - [pbr::OriginalName("BPE")] Bpe = 2, - /// - /// Delimitered by whitespace. 
- /// - [pbr::OriginalName("WORD")] Word = 3, - /// - /// tokenizes into character sequence - /// - [pbr::OriginalName("CHAR")] Char = 4, - } - - } - #endregion - - } - - /// - /// NormalizerSpec encodes a various parameters for string normalization - /// - [global::System.Diagnostics.DebuggerDisplayAttribute("{ToString(),nq}")] - internal sealed partial class NormalizerSpec : pb::IExtendableMessage - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - , pb::IBufferMessage - #endif - { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new NormalizerSpec()); - private pb::UnknownFieldSet _unknownFields; - private pb::ExtensionSet _extensions; - private pb::ExtensionSet _Extensions { get { return _extensions; } } - private int _hasBits0; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public static pbr::MessageDescriptor Descriptor { - get { return global::Sentencepiece.SentencepieceModelReflection.Descriptor.MessageTypes[1]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public NormalizerSpec() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public NormalizerSpec(NormalizerSpec other) : this() { - _hasBits0 = other._hasBits0; - name_ = other.name_; - precompiledCharsmap_ = other.precompiledCharsmap_; - addDummyPrefix_ = 
other.addDummyPrefix_; - removeExtraWhitespaces_ = other.removeExtraWhitespaces_; - escapeWhitespaces_ = other.escapeWhitespaces_; - normalizationRuleTsv_ = other.normalizationRuleTsv_; - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - _extensions = pb::ExtensionSet.Clone(other._extensions); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public NormalizerSpec Clone() { - return new NormalizerSpec(this); - } - - /// Field number for the "name" field. - public const int NameFieldNumber = 1; - private readonly static string NameDefaultValue = ""; - - private string name_; - /// - /// name of normalization rule. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public string Name { - get { return name_ ?? NameDefaultValue; } - set { - name_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - /// Gets whether the "name" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasName { - get { return name_ != null; } - } - /// Clears the value of the "name" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearName() { - name_ = null; - } - - /// Field number for the "precompiled_charsmap" field. - public const int PrecompiledCharsmapFieldNumber = 2; - private readonly static pb::ByteString PrecompiledCharsmapDefaultValue = pb::ByteString.Empty; - - private pb::ByteString precompiledCharsmap_; - /// - /// Pre-compiled normalization rule created by - /// Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method. - /// Usually this field is set by Builder::GetNormalizerSpec() method. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public pb::ByteString PrecompiledCharsmap { - get { return precompiledCharsmap_ ?? PrecompiledCharsmapDefaultValue; } - set { - precompiledCharsmap_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - /// Gets whether the "precompiled_charsmap" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasPrecompiledCharsmap { - get { return precompiledCharsmap_ != null; } - } - /// Clears the value of the "precompiled_charsmap" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearPrecompiledCharsmap() { - precompiledCharsmap_ = null; - } - - /// Field number for the "add_dummy_prefix" field. - public const int AddDummyPrefixFieldNumber = 3; - private readonly static bool AddDummyPrefixDefaultValue = true; - - private bool addDummyPrefix_; - /// - /// Adds dummy whitespace at the beginning of text in order to - /// treat "world" in "world" and "hello world" in the same way. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool AddDummyPrefix { - get { if ((_hasBits0 & 1) != 0) { return addDummyPrefix_; } else { return AddDummyPrefixDefaultValue; } } - set { - _hasBits0 |= 1; - addDummyPrefix_ = value; - } - } - /// Gets whether the "add_dummy_prefix" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasAddDummyPrefix { - get { return (_hasBits0 & 1) != 0; } - } - /// Clears the value of the "add_dummy_prefix" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearAddDummyPrefix() { - _hasBits0 &= ~1; - } - - /// Field number for the "remove_extra_whitespaces" field. - public const int RemoveExtraWhitespacesFieldNumber = 4; - private readonly static bool RemoveExtraWhitespacesDefaultValue = true; - - private bool removeExtraWhitespaces_; - /// - /// Removes leading, trailing, and duplicate internal whitespace. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool RemoveExtraWhitespaces { - get { if ((_hasBits0 & 2) != 0) { return removeExtraWhitespaces_; } else { return RemoveExtraWhitespacesDefaultValue; } } - set { - _hasBits0 |= 2; - removeExtraWhitespaces_ = value; - } - } - /// Gets whether the "remove_extra_whitespaces" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasRemoveExtraWhitespaces { - get { return (_hasBits0 & 2) != 0; } - } - /// Clears the value of the "remove_extra_whitespaces" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearRemoveExtraWhitespaces() { - _hasBits0 &= ~2; - } - - /// Field number for the "escape_whitespaces" field. - public const int EscapeWhitespacesFieldNumber = 5; - private readonly static bool EscapeWhitespacesDefaultValue = true; - - private bool escapeWhitespaces_; - /// - /// Replaces whitespace with meta symbol. - /// This field must be true to train sentence piece model. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool EscapeWhitespaces { - get { if ((_hasBits0 & 4) != 0) { return escapeWhitespaces_; } else { return EscapeWhitespacesDefaultValue; } } - set { - _hasBits0 |= 4; - escapeWhitespaces_ = value; - } - } - /// Gets whether the "escape_whitespaces" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasEscapeWhitespaces { - get { return (_hasBits0 & 4) != 0; } - } - /// Clears the value of the "escape_whitespaces" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearEscapeWhitespaces() { - _hasBits0 &= ~4; - } - - /// Field number for the "normalization_rule_tsv" field. - public const int NormalizationRuleTsvFieldNumber = 6; - private readonly static string NormalizationRuleTsvDefaultValue = ""; - - private string normalizationRuleTsv_; - /// - /// Custom normalization rule file in TSV format. - /// https://github.com/google/sentencepiece/blob/master/doc/normalization.md - /// This field is only used in SentencePieceTrainer::Train() method, which - /// compiles the rule into the binary rule stored in `precompiled_charsmap`. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public string NormalizationRuleTsv { - get { return normalizationRuleTsv_ ?? 
NormalizationRuleTsvDefaultValue; } - set { - normalizationRuleTsv_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - /// Gets whether the "normalization_rule_tsv" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasNormalizationRuleTsv { - get { return normalizationRuleTsv_ != null; } - } - /// Clears the value of the "normalization_rule_tsv" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearNormalizationRuleTsv() { - normalizationRuleTsv_ = null; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public override bool Equals(object other) { - return Equals(other as NormalizerSpec); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool Equals(NormalizerSpec other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if (Name != other.Name) return false; - if (PrecompiledCharsmap != other.PrecompiledCharsmap) return false; - if (AddDummyPrefix != other.AddDummyPrefix) return false; - if (RemoveExtraWhitespaces != other.RemoveExtraWhitespaces) return false; - if (EscapeWhitespaces != other.EscapeWhitespaces) return false; - if (NormalizationRuleTsv != other.NormalizationRuleTsv) return false; - if (!Equals(_extensions, other._extensions)) { - return false; - } - return Equals(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public override int GetHashCode() { - int hash = 1; - if (HasName) hash ^= Name.GetHashCode(); - if (HasPrecompiledCharsmap) hash ^= PrecompiledCharsmap.GetHashCode(); - if 
(HasAddDummyPrefix) hash ^= AddDummyPrefix.GetHashCode(); - if (HasRemoveExtraWhitespaces) hash ^= RemoveExtraWhitespaces.GetHashCode(); - if (HasEscapeWhitespaces) hash ^= EscapeWhitespaces.GetHashCode(); - if (HasNormalizationRuleTsv) hash ^= NormalizationRuleTsv.GetHashCode(); - if (_extensions != null) { - hash ^= _extensions.GetHashCode(); - } - if (_unknownFields != null) { - hash ^= _unknownFields.GetHashCode(); - } - return hash; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void WriteTo(pb::CodedOutputStream output) { - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - output.WriteRawMessage(this); - #else - if (HasName) { - output.WriteRawTag(10); - output.WriteString(Name); - } - if (HasPrecompiledCharsmap) { - output.WriteRawTag(18); - output.WriteBytes(PrecompiledCharsmap); - } - if (HasAddDummyPrefix) { - output.WriteRawTag(24); - output.WriteBool(AddDummyPrefix); - } - if (HasRemoveExtraWhitespaces) { - output.WriteRawTag(32); - output.WriteBool(RemoveExtraWhitespaces); - } - if (HasEscapeWhitespaces) { - output.WriteRawTag(40); - output.WriteBool(EscapeWhitespaces); - } - if (HasNormalizationRuleTsv) { - output.WriteRawTag(50); - output.WriteString(NormalizationRuleTsv); - } - if (_extensions != null) { - _extensions.WriteTo(output); - } - if (_unknownFields != null) { - _unknownFields.WriteTo(output); - } - #endif - } - - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - void pb::IBufferMessage.InternalWriteTo(ref pb::WriteContext output) { - if (HasName) { - output.WriteRawTag(10); - 
output.WriteString(Name); - } - if (HasPrecompiledCharsmap) { - output.WriteRawTag(18); - output.WriteBytes(PrecompiledCharsmap); - } - if (HasAddDummyPrefix) { - output.WriteRawTag(24); - output.WriteBool(AddDummyPrefix); - } - if (HasRemoveExtraWhitespaces) { - output.WriteRawTag(32); - output.WriteBool(RemoveExtraWhitespaces); - } - if (HasEscapeWhitespaces) { - output.WriteRawTag(40); - output.WriteBool(EscapeWhitespaces); - } - if (HasNormalizationRuleTsv) { - output.WriteRawTag(50); - output.WriteString(NormalizationRuleTsv); - } - if (_extensions != null) { - _extensions.WriteTo(ref output); - } - if (_unknownFields != null) { - _unknownFields.WriteTo(ref output); - } - } - #endif - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public int CalculateSize() { - int size = 0; - if (HasName) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(Name); - } - if (HasPrecompiledCharsmap) { - size += 1 + pb::CodedOutputStream.ComputeBytesSize(PrecompiledCharsmap); - } - if (HasAddDummyPrefix) { - size += 1 + 1; - } - if (HasRemoveExtraWhitespaces) { - size += 1 + 1; - } - if (HasEscapeWhitespaces) { - size += 1 + 1; - } - if (HasNormalizationRuleTsv) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(NormalizationRuleTsv); - } - if (_extensions != null) { - size += _extensions.CalculateSize(); - } - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void MergeFrom(NormalizerSpec other) { - if (other == null) { - return; - } - if (other.HasName) { - Name = other.Name; - } - if (other.HasPrecompiledCharsmap) { - PrecompiledCharsmap = other.PrecompiledCharsmap; - } - if (other.HasAddDummyPrefix) { - AddDummyPrefix = other.AddDummyPrefix; - } - if (other.HasRemoveExtraWhitespaces) { - 
RemoveExtraWhitespaces = other.RemoveExtraWhitespaces; - } - if (other.HasEscapeWhitespaces) { - EscapeWhitespaces = other.EscapeWhitespaces; - } - if (other.HasNormalizationRuleTsv) { - NormalizationRuleTsv = other.NormalizationRuleTsv; - } - pb::ExtensionSet.MergeFrom(ref _extensions, other._extensions); - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); - } +// Minimal protobuf parser for SentencePiece model files (sentencepiece_model.proto). +// Replaces a full Google.Protobuf dependency with just enough wire-format reading +// to parse the fields the tokenizer implementation actually consumes. +// SentencePiece is under the Apache License 2.0 https://github.com/google/sentencepiece/blob/master/LICENSE - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void MergeFrom(pb::CodedInputStream input) { - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - input.ReadRawMessage(this); - #else - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - if (!pb::ExtensionSet.TryMergeFieldFrom(ref _extensions, input)) { - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); +using System; +using System.Collections.Generic; +using System.IO; +using System.Text; + +namespace Sentencepiece +{ + /// Low-level protobuf wire-format primitives (read-only, forward-only). 
+ internal static class SentencePieceProtobufReader + { + internal static int ReadRawVarint32(byte[] data, int end, ref int pos) + { + if (pos >= end) + { + throw new InvalidDataException("Unexpected end of data while reading varint."); } - break; - case 10: { - Name = input.ReadString(); - break; - } - case 18: { - PrecompiledCharsmap = input.ReadBytes(); - break; - } - case 24: { - AddDummyPrefix = input.ReadBool(); - break; - } - case 32: { - RemoveExtraWhitespaces = input.ReadBool(); - break; - } - case 40: { - EscapeWhitespaces = input.ReadBool(); - break; - } - case 50: { - NormalizationRuleTsv = input.ReadString(); - break; - } - } - } - #endif - } - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - void pb::IBufferMessage.InternalMergeFrom(ref pb::ParseContext input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - if (!pb::ExtensionSet.TryMergeFieldFrom(ref _extensions, ref input)) { - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, ref input); + byte b = data[pos++]; + int result = b & 0x7F; + if ((b & 0x80) == 0) + { + return result; } - break; - case 10: { - Name = input.ReadString(); - break; - } - case 18: { - PrecompiledCharsmap = input.ReadBytes(); - break; - } - case 24: { - AddDummyPrefix = input.ReadBool(); - break; - } - case 32: { - RemoveExtraWhitespaces = input.ReadBool(); - break; - } - case 40: { - EscapeWhitespaces = input.ReadBool(); - break; - } - case 50: { - NormalizationRuleTsv = input.ReadString(); - break; - } - } - } - } - #endif - - public TValue GetExtension(pb::Extension extension) { - return pb::ExtensionSet.Get(ref _extensions, extension); - } - public pbc::RepeatedField GetExtension(pb::RepeatedExtension extension) { - return pb::ExtensionSet.Get(ref _extensions, extension); - } - public pbc::RepeatedField 
GetOrInitializeExtension(pb::RepeatedExtension extension) { - return pb::ExtensionSet.GetOrInitialize(ref _extensions, extension); - } - public void SetExtension(pb::Extension extension, TValue value) { - pb::ExtensionSet.Set(ref _extensions, extension, value); - } - public bool HasExtension(pb::Extension extension) { - return pb::ExtensionSet.Has(ref _extensions, extension); - } - public void ClearExtension(pb::Extension extension) { - pb::ExtensionSet.Clear(ref _extensions, extension); - } - public void ClearExtension(pb::RepeatedExtension extension) { - pb::ExtensionSet.Clear(ref _extensions, extension); - } - - } - - /// - /// Proto to store samples for self-testing. - /// - [global::System.Diagnostics.DebuggerDisplayAttribute("{ToString(),nq}")] - internal sealed partial class SelfTestData : pb::IExtendableMessage - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - , pb::IBufferMessage - #endif - { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new SelfTestData()); - private pb::UnknownFieldSet _unknownFields; - private pb::ExtensionSet _extensions; - private pb::ExtensionSet _Extensions { get { return _extensions; } } - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public static pbr::MessageDescriptor Descriptor { - get { return global::Sentencepiece.SentencepieceModelReflection.Descriptor.MessageTypes[2]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - 
public SelfTestData() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public SelfTestData(SelfTestData other) : this() { - samples_ = other.samples_.Clone(); - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - _extensions = pb::ExtensionSet.Clone(other._extensions); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public SelfTestData Clone() { - return new SelfTestData(this); - } - /// Field number for the "samples" field. - public const int SamplesFieldNumber = 1; - private static readonly pb::FieldCodec _repeated_samples_codec - = pb::FieldCodec.ForMessage(10, global::Sentencepiece.SelfTestData.Types.Sample.Parser); - private readonly pbc::RepeatedField samples_ = new pbc::RepeatedField(); - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public pbc::RepeatedField Samples { - get { return samples_; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public override bool Equals(object other) { - return Equals(other as SelfTestData); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool Equals(SelfTestData other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if(!samples_.Equals(other.samples_)) return false; - if (!Equals(_extensions, other._extensions)) { - return false; - } - return Equals(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public override int GetHashCode() { - int 
hash = 1; - hash ^= samples_.GetHashCode(); - if (_extensions != null) { - hash ^= _extensions.GetHashCode(); - } - if (_unknownFields != null) { - hash ^= _unknownFields.GetHashCode(); - } - return hash; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void WriteTo(pb::CodedOutputStream output) { - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - output.WriteRawMessage(this); - #else - samples_.WriteTo(output, _repeated_samples_codec); - if (_extensions != null) { - _extensions.WriteTo(output); - } - if (_unknownFields != null) { - _unknownFields.WriteTo(output); - } - #endif - } - - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - void pb::IBufferMessage.InternalWriteTo(ref pb::WriteContext output) { - samples_.WriteTo(ref output, _repeated_samples_codec); - if (_extensions != null) { - _extensions.WriteTo(ref output); - } - if (_unknownFields != null) { - _unknownFields.WriteTo(ref output); - } - } - #endif - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public int CalculateSize() { - int size = 0; - size += samples_.CalculateSize(_repeated_samples_codec); - if (_extensions != null) { - size += _extensions.CalculateSize(); - } - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void MergeFrom(SelfTestData other) { - if (other == null) { - return; - } - 
samples_.Add(other.samples_); - pb::ExtensionSet.MergeFrom(ref _extensions, other._extensions); - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void MergeFrom(pb::CodedInputStream input) { - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - input.ReadRawMessage(this); - #else - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - if (!pb::ExtensionSet.TryMergeFieldFrom(ref _extensions, input)) { - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); - } - break; - case 10: { - samples_.AddEntriesFrom(input, _repeated_samples_codec); - break; - } - } - } - #endif - } + for (int shift = 7; shift < 32; shift += 7) + { + if (pos >= end) + { + throw new InvalidDataException("Unexpected end of data while reading varint."); + } - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - void pb::IBufferMessage.InternalMergeFrom(ref pb::ParseContext input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - if (!pb::ExtensionSet.TryMergeFieldFrom(ref _extensions, ref input)) { - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, ref input); + b = data[pos++]; + result |= (b & 0x7F) << shift; + if ((b & 0x80) == 0) + { + return result; + } } - break; - case 10: { - samples_.AddEntriesFrom(ref input, _repeated_samples_codec); - break; - } - } - } - } - #endif - - public TValue GetExtension(pb::Extension extension) { - return pb::ExtensionSet.Get(ref _extensions, extension); - } - public pbc::RepeatedField GetExtension(pb::RepeatedExtension extension) { - return pb::ExtensionSet.Get(ref _extensions, extension); - } - public pbc::RepeatedField 
GetOrInitializeExtension(pb::RepeatedExtension extension) { - return pb::ExtensionSet.GetOrInitialize(ref _extensions, extension); - } - public void SetExtension(pb::Extension extension, TValue value) { - pb::ExtensionSet.Set(ref _extensions, extension, value); - } - public bool HasExtension(pb::Extension extension) { - return pb::ExtensionSet.Has(ref _extensions, extension); - } - public void ClearExtension(pb::Extension extension) { - pb::ExtensionSet.Clear(ref _extensions, extension); - } - public void ClearExtension(pb::RepeatedExtension extension) { - pb::ExtensionSet.Clear(ref _extensions, extension); - } - - #region Nested types - /// Container for nested types declared in the SelfTestData message type. - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public static partial class Types { - [global::System.Diagnostics.DebuggerDisplayAttribute("{ToString(),nq}")] - public sealed partial class Sample : pb::IMessage - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - , pb::IBufferMessage - #endif - { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new Sample()); - private pb::UnknownFieldSet _unknownFields; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public static pbr::MessageDescriptor Descriptor { - get { return global::Sentencepiece.SelfTestData.Descriptor.NestedTypes[0]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - 
[global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public Sample() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public Sample(Sample other) : this() { - input_ = other.input_; - expected_ = other.expected_; - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public Sample Clone() { - return new Sample(this); - } - - /// Field number for the "input" field. - public const int InputFieldNumber = 1; - private readonly static string InputDefaultValue = ""; - - private string input_; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public string Input { - get { return input_ ?? InputDefaultValue; } - set { - input_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - /// Gets whether the "input" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasInput { - get { return input_ != null; } - } - /// Clears the value of the "input" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearInput() { - input_ = null; - } - - /// Field number for the "expected" field. - public const int ExpectedFieldNumber = 2; - private readonly static string ExpectedDefaultValue = ""; - - private string expected_; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public string Expected { - get { return expected_ ?? 
ExpectedDefaultValue; } - set { - expected_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - /// Gets whether the "expected" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasExpected { - get { return expected_ != null; } - } - /// Clears the value of the "expected" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearExpected() { - expected_ = null; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public override bool Equals(object other) { - return Equals(other as Sample); - } - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool Equals(Sample other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if (Input != other.Input) return false; - if (Expected != other.Expected) return false; - return Equals(_unknownFields, other._unknownFields); - } + // Negative int32 values are sign-extended to 10-byte varints; consume remaining bytes. 
+ for (int i = 0; i < 5; i++) + { + if (pos >= end) + { + throw new InvalidDataException("Unexpected end of data while reading varint."); + } - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public override int GetHashCode() { - int hash = 1; - if (HasInput) hash ^= Input.GetHashCode(); - if (HasExpected) hash ^= Expected.GetHashCode(); - if (_unknownFields != null) { - hash ^= _unknownFields.GetHashCode(); - } - return hash; - } + if ((data[pos++] & 0x80) == 0) + { + return result; + } + } - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); + throw new InvalidDataException("Malformed varint."); } - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void WriteTo(pb::CodedOutputStream output) { - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - output.WriteRawMessage(this); - #else - if (HasInput) { - output.WriteRawTag(10); - output.WriteString(Input); - } - if (HasExpected) { - output.WriteRawTag(18); - output.WriteString(Expected); - } - if (_unknownFields != null) { - _unknownFields.WriteTo(output); - } - #endif - } + internal static int ReadLengthPrefix(byte[] data, int end, ref int pos) + { + int length = ReadRawVarint32(data, end, ref pos); + if ((uint)length > (uint)(end - pos)) + { + throw new InvalidDataException("Invalid length-delimited field size."); + } - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - void pb::IBufferMessage.InternalWriteTo(ref pb::WriteContext output) { - if (HasInput) { - output.WriteRawTag(10); - output.WriteString(Input); - } - if (HasExpected) { - output.WriteRawTag(18); - 
output.WriteString(Expected); - } - if (_unknownFields != null) { - _unknownFields.WriteTo(ref output); - } + return length; } - #endif - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public int CalculateSize() { - int size = 0; - if (HasInput) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(Input); - } - if (HasExpected) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(Expected); - } - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; + internal static string ReadString(byte[] data, int end, ref int pos) + { + int length = ReadLengthPrefix(data, end, ref pos); + string result = Encoding.UTF8.GetString(data, pos, length); + pos += length; + return result; } - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void MergeFrom(Sample other) { - if (other == null) { - return; - } - if (other.HasInput) { - Input = other.Input; - } - if (other.HasExpected) { - Expected = other.Expected; - } - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); - } + internal static float ReadFloat(byte[] data, int end, ref int pos) + { + if (pos > end - 4) + { + throw new InvalidDataException("Unexpected end of data while reading float."); + } - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void MergeFrom(pb::CodedInputStream input) { - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - input.ReadRawMessage(this); - #else - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); - break; - case 10: { - Input = input.ReadString(); - break; - } - case 18: { - Expected = input.ReadString(); - break; - } + float value; + if (BitConverter.IsLittleEndian) + { + 
value = BitConverter.ToSingle(data, pos); + } + else + { + // Protobuf fixed32 is always little-endian; reverse bytes on big-endian platforms. + byte[] buffer = new byte[4]; + buffer[0] = data[pos + 3]; + buffer[1] = data[pos + 2]; + buffer[2] = data[pos + 1]; + buffer[3] = data[pos]; + value = BitConverter.ToSingle(buffer, 0); } - } - #endif - } - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - void pb::IBufferMessage.InternalMergeFrom(ref pb::ParseContext input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, ref input); - break; - case 10: { - Input = input.ReadString(); - break; - } - case 18: { - Expected = input.ReadString(); - break; - } + pos += 4; + return value; + } + + internal static void SkipField(byte[] data, int end, int wireType, ref int pos) + { + switch (wireType) + { + case 0: // varint (max 10 bytes per protobuf spec) + for (int i = 0; i < 10; i++) + { + if (pos >= end) + { + throw new InvalidDataException("Unexpected end of data while skipping varint."); + } + + if ((data[pos++] & 0x80) == 0) + { + break; + } + + if (i == 9) + { + throw new InvalidDataException("Malformed varint."); + } + } + break; + + case 1: // 64-bit fixed + if (pos > end - 8) + { + throw new InvalidDataException("Unexpected end of data while skipping fixed64."); + } + pos += 8; + break; + + case 2: // length-delimited + int skipLength = ReadLengthPrefix(data, end, ref pos); + pos += skipLength; + break; + + case 5: // 32-bit fixed + if (pos > end - 4) + { + throw new InvalidDataException("Unexpected end of data while skipping fixed32."); + } + pos += 4; + break; + + default: + throw new InvalidDataException($"Unknown or unsupported protobuf wire type {wireType}."); } - } } - #endif - - } - - } - #endregion - - } - - /// - /// ModelProto stores 
model parameters. - /// SentencePieceProcessor is supposed to be self-contained. - /// All settings/parameters which may change the behavior must be encoded - /// in ModelProto. - /// - [global::System.Diagnostics.DebuggerDisplayAttribute("{ToString(),nq}")] - internal sealed partial class ModelProto : pb::IExtendableMessage - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - , pb::IBufferMessage - #endif - { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new ModelProto()); - private pb::UnknownFieldSet _unknownFields; - private pb::ExtensionSet _extensions; - private pb::ExtensionSet _Extensions { get { return _extensions; } } - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public static pbr::MessageDescriptor Descriptor { - get { return global::Sentencepiece.SentencepieceModelReflection.Descriptor.MessageTypes[3]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public ModelProto() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public ModelProto(ModelProto other) : this() { - pieces_ = other.pieces_.Clone(); - trainerSpec_ = other.trainerSpec_ != null ? other.trainerSpec_.Clone() : null; - normalizerSpec_ = other.normalizerSpec_ != null ? 
other.normalizerSpec_.Clone() : null; - selfTestData_ = other.selfTestData_ != null ? other.selfTestData_.Clone() : null; - denormalizerSpec_ = other.denormalizerSpec_ != null ? other.denormalizerSpec_.Clone() : null; - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - _extensions = pb::ExtensionSet.Clone(other._extensions); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public ModelProto Clone() { - return new ModelProto(this); - } - - /// Field number for the "pieces" field. - public const int PiecesFieldNumber = 1; - private static readonly pb::FieldCodec _repeated_pieces_codec - = pb::FieldCodec.ForMessage(10, global::Sentencepiece.ModelProto.Types.SentencePiece.Parser); - private readonly pbc::RepeatedField pieces_ = new pbc::RepeatedField(); - /// - /// Sentence pieces with scores. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public pbc::RepeatedField Pieces { - get { return pieces_; } - } - - /// Field number for the "trainer_spec" field. - public const int TrainerSpecFieldNumber = 2; - private global::Sentencepiece.TrainerSpec trainerSpec_; - /// - /// Spec used to generate this model file. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public global::Sentencepiece.TrainerSpec TrainerSpec { - get { return trainerSpec_; } - set { - trainerSpec_ = value; - } - } - - /// Field number for the "normalizer_spec" field. - public const int NormalizerSpecFieldNumber = 3; - private global::Sentencepiece.NormalizerSpec normalizerSpec_; - /// - /// Spec for text normalization. 
- /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public global::Sentencepiece.NormalizerSpec NormalizerSpec { - get { return normalizerSpec_; } - set { - normalizerSpec_ = value; - } - } - - /// Field number for the "self_test_data" field. - public const int SelfTestDataFieldNumber = 4; - private global::Sentencepiece.SelfTestData selfTestData_; - /// - /// Stores sample input and its expected segmentation to verify the model. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public global::Sentencepiece.SelfTestData SelfTestData { - get { return selfTestData_; } - set { - selfTestData_ = value; - } - } - - /// Field number for the "denormalizer_spec" field. - public const int DenormalizerSpecFieldNumber = 5; - private global::Sentencepiece.NormalizerSpec denormalizerSpec_; - /// - /// Spec for text de-normalization. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public global::Sentencepiece.NormalizerSpec DenormalizerSpec { - get { return denormalizerSpec_; } - set { - denormalizerSpec_ = value; - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public override bool Equals(object other) { - return Equals(other as ModelProto); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool Equals(ModelProto other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if(!pieces_.Equals(other.pieces_)) return false; - if (!object.Equals(TrainerSpec, other.TrainerSpec)) return false; - if (!object.Equals(NormalizerSpec, other.NormalizerSpec)) return false; - if 
(!object.Equals(SelfTestData, other.SelfTestData)) return false; - if (!object.Equals(DenormalizerSpec, other.DenormalizerSpec)) return false; - if (!Equals(_extensions, other._extensions)) { - return false; - } - return Equals(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public override int GetHashCode() { - int hash = 1; - hash ^= pieces_.GetHashCode(); - if (trainerSpec_ != null) hash ^= TrainerSpec.GetHashCode(); - if (normalizerSpec_ != null) hash ^= NormalizerSpec.GetHashCode(); - if (selfTestData_ != null) hash ^= SelfTestData.GetHashCode(); - if (denormalizerSpec_ != null) hash ^= DenormalizerSpec.GetHashCode(); - if (_extensions != null) { - hash ^= _extensions.GetHashCode(); - } - if (_unknownFields != null) { - hash ^= _unknownFields.GetHashCode(); - } - return hash; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void WriteTo(pb::CodedOutputStream output) { - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - output.WriteRawMessage(this); - #else - pieces_.WriteTo(output, _repeated_pieces_codec); - if (trainerSpec_ != null) { - output.WriteRawTag(18); - output.WriteMessage(TrainerSpec); - } - if (normalizerSpec_ != null) { - output.WriteRawTag(26); - output.WriteMessage(NormalizerSpec); - } - if (selfTestData_ != null) { - output.WriteRawTag(34); - output.WriteMessage(SelfTestData); - } - if (denormalizerSpec_ != null) { - output.WriteRawTag(42); - output.WriteMessage(DenormalizerSpec); - } - if (_extensions != null) { - _extensions.WriteTo(output); - } - if (_unknownFields != null) { - 
_unknownFields.WriteTo(output); - } - #endif - } - - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - void pb::IBufferMessage.InternalWriteTo(ref pb::WriteContext output) { - pieces_.WriteTo(ref output, _repeated_pieces_codec); - if (trainerSpec_ != null) { - output.WriteRawTag(18); - output.WriteMessage(TrainerSpec); - } - if (normalizerSpec_ != null) { - output.WriteRawTag(26); - output.WriteMessage(NormalizerSpec); - } - if (selfTestData_ != null) { - output.WriteRawTag(34); - output.WriteMessage(SelfTestData); - } - if (denormalizerSpec_ != null) { - output.WriteRawTag(42); - output.WriteMessage(DenormalizerSpec); - } - if (_extensions != null) { - _extensions.WriteTo(ref output); - } - if (_unknownFields != null) { - _unknownFields.WriteTo(ref output); - } } - #endif - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public int CalculateSize() { - int size = 0; - size += pieces_.CalculateSize(_repeated_pieces_codec); - if (trainerSpec_ != null) { - size += 1 + pb::CodedOutputStream.ComputeMessageSize(TrainerSpec); - } - if (normalizerSpec_ != null) { - size += 1 + pb::CodedOutputStream.ComputeMessageSize(NormalizerSpec); - } - if (selfTestData_ != null) { - size += 1 + pb::CodedOutputStream.ComputeMessageSize(SelfTestData); - } - if (denormalizerSpec_ != null) { - size += 1 + pb::CodedOutputStream.ComputeMessageSize(DenormalizerSpec); - } - if (_extensions != null) { - size += _extensions.CalculateSize(); - } - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; - } + /// Lightweight replacement for Google.Protobuf.ByteString with a Span property. + internal readonly struct SentencePieceByteString(byte[] data, int offset, int length) + { + internal ReadOnlySpan Span => data is null ? 
ReadOnlySpan.Empty : data.AsSpan(offset, length); + } + + /// ModelProto (top-level message; field numbers match sentencepiece_model.proto) + internal sealed class ModelProto + { + internal static readonly ModelProtoParser Parser = new(); + + internal List Pieces { get; } = new(); + internal TrainerSpec TrainerSpec { get; private set; } = new(); + internal NormalizerSpec NormalizerSpec { get; private set; } = new(); + + internal static ModelProto Parse(byte[] data, int start, int end) + { + ModelProto result = new(); + int pos = start; + int length; + while (pos < end) + { + int tag = SentencePieceProtobufReader.ReadRawVarint32(data, end, ref pos); + int fieldNumber = tag >> 3; + int wireType = tag & 7; + + // The 'when wireType == 2' guards serve double duty: they match the expected wire + // type for these message fields AND provide forward-compatibility — if a future proto + // version changes a field's type, or if extension fields reuse these numbers with a + // different wire type, the mismatch falls through to the default skip case. + // + // For non-repeated message fields (TrainerSpec, NormalizerSpec), seeing the same field + // number twice replaces the prior value (last wins). This differs from the standard + // protobuf library which merges repeated occurrences of non-repeated message fields. + // SentencePiece model files contain each field at most once, so the difference is moot + // in practice. 
+ switch (fieldNumber) + { + case 1 when wireType == 2: // repeated SentencePiece pieces = 1 + length = SentencePieceProtobufReader.ReadLengthPrefix(data, end, ref pos); + result.Pieces.Add(Types.SentencePiece.Parse(data, pos, pos + length)); + pos += length; + break; + + case 2 when wireType == 2: // TrainerSpec trainer_spec = 2 + length = SentencePieceProtobufReader.ReadLengthPrefix(data, end, ref pos); + result.TrainerSpec = TrainerSpec.Parse(data, pos, pos + length); + pos += length; + break; + + case 3 when wireType == 2: // NormalizerSpec normalizer_spec = 3 + length = SentencePieceProtobufReader.ReadLengthPrefix(data, end, ref pos); + result.NormalizerSpec = NormalizerSpec.Parse(data, pos, pos + length); + pos += length; + break; + + default: + SentencePieceProtobufReader.SkipField(data, end, wireType, ref pos); + break; + } + } - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void MergeFrom(ModelProto other) { - if (other == null) { - return; - } - pieces_.Add(other.pieces_); - if (other.trainerSpec_ != null) { - if (trainerSpec_ == null) { - TrainerSpec = new global::Sentencepiece.TrainerSpec(); - } - TrainerSpec.MergeFrom(other.TrainerSpec); - } - if (other.normalizerSpec_ != null) { - if (normalizerSpec_ == null) { - NormalizerSpec = new global::Sentencepiece.NormalizerSpec(); - } - NormalizerSpec.MergeFrom(other.NormalizerSpec); - } - if (other.selfTestData_ != null) { - if (selfTestData_ == null) { - SelfTestData = new global::Sentencepiece.SelfTestData(); - } - SelfTestData.MergeFrom(other.SelfTestData); - } - if (other.denormalizerSpec_ != null) { - if (denormalizerSpec_ == null) { - DenormalizerSpec = new global::Sentencepiece.NormalizerSpec(); - } - DenormalizerSpec.MergeFrom(other.DenormalizerSpec); - } - pb::ExtensionSet.MergeFrom(ref _extensions, other._extensions); - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); - } + 
return result; + } + + internal static class Types + { + internal sealed class SentencePiece + { + internal string Piece { get; set; } = ""; + internal float Score { get; set; } + internal Types.Type Type { get; set; } = Types.Type.Normal; + + internal static SentencePiece Parse(byte[] data, int start, int end) + { + SentencePiece result = new(); + int pos = start; + while (pos < end) + { + int tag = SentencePieceProtobufReader.ReadRawVarint32(data, end, ref pos); + int fieldNumber = tag >> 3; + int wireType = tag & 7; + + switch (fieldNumber) + { + case 1 when wireType == 2: // string piece = 1 + result.Piece = SentencePieceProtobufReader.ReadString(data, end, ref pos); + break; + + case 2 when wireType == 5: // float score = 2 + result.Score = SentencePieceProtobufReader.ReadFloat(data, end, ref pos); + break; + + case 3 when wireType == 0: // Type type = 3 + result.Type = (Types.Type)SentencePieceProtobufReader.ReadRawVarint32(data, end, ref pos); + break; + + default: + SentencePieceProtobufReader.SkipField(data, end, wireType, ref pos); + break; + } + } + + return result; + } - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void MergeFrom(pb::CodedInputStream input) { - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - input.ReadRawMessage(this); - #else - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - if (!pb::ExtensionSet.TryMergeFieldFrom(ref _extensions, input)) { - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); - } - break; - case 10: { - pieces_.AddEntriesFrom(input, _repeated_pieces_codec); - break; - } - case 18: { - if (trainerSpec_ == null) { - TrainerSpec = new global::Sentencepiece.TrainerSpec(); - } - input.ReadMessage(TrainerSpec); - break; - } - case 26: { - if (normalizerSpec_ == null) { - NormalizerSpec = new global::Sentencepiece.NormalizerSpec(); - } - input.ReadMessage(NormalizerSpec); - 
break; - } - case 34: { - if (selfTestData_ == null) { - SelfTestData = new global::Sentencepiece.SelfTestData(); - } - input.ReadMessage(SelfTestData); - break; - } - case 42: { - if (denormalizerSpec_ == null) { - DenormalizerSpec = new global::Sentencepiece.NormalizerSpec(); + internal static class Types + { + internal enum Type + { + Normal = 1, + Unknown = 2, + Control = 3, + UserDefined = 4, + Unused = 5, + Byte = 6, + } + } } - input.ReadMessage(DenormalizerSpec); - break; - } } - } - #endif } - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - void pb::IBufferMessage.InternalMergeFrom(ref pb::ParseContext input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - if (!pb::ExtensionSet.TryMergeFieldFrom(ref _extensions, ref input)) { - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, ref input); - } - break; - case 10: { - pieces_.AddEntriesFrom(ref input, _repeated_pieces_codec); - break; - } - case 18: { - if (trainerSpec_ == null) { - TrainerSpec = new global::Sentencepiece.TrainerSpec(); + /// ModelProtoParser (entry point: ModelProto.Parser.ParseFrom(Stream)) + internal sealed class ModelProtoParser + { + internal ModelProto ParseFrom(Stream stream) + { + if (stream is null) + { + throw new ArgumentNullException(nameof(stream)); } - input.ReadMessage(TrainerSpec); - break; - } - case 26: { - if (normalizerSpec_ == null) { - NormalizerSpec = new global::Sentencepiece.NormalizerSpec(); + + // Fast-path: if the input is already a MemoryStream with an accessible buffer, + // parse directly from its underlying array without copying. 
+ if (stream is MemoryStream memoryStream && + memoryStream.TryGetBuffer(out ArraySegment segment)) + { + int start = segment.Offset + (int)memoryStream.Position; + int end = segment.Offset + (int)memoryStream.Length; + return ModelProto.Parse(segment.Array!, start, end); } - input.ReadMessage(NormalizerSpec); - break; - } - case 34: { - if (selfTestData_ == null) { - SelfTestData = new global::Sentencepiece.SelfTestData(); + + // Fallback: copy remaining data into a new MemoryStream, pre-sizing when possible. + MemoryStream ms; + if (stream.CanSeek) + { + long remaining = stream.Length - stream.Position; + ms = remaining > 0 && remaining <= int.MaxValue ? new MemoryStream((int)remaining) : new MemoryStream(); } - input.ReadMessage(SelfTestData); - break; - } - case 42: { - if (denormalizerSpec_ == null) { - DenormalizerSpec = new global::Sentencepiece.NormalizerSpec(); + else + { + ms = new MemoryStream(); } - input.ReadMessage(DenormalizerSpec); - break; - } - } - } - } - #endif - - public TValue GetExtension(pb::Extension extension) { - return pb::ExtensionSet.Get(ref _extensions, extension); - } - public pbc::RepeatedField GetExtension(pb::RepeatedExtension extension) { - return pb::ExtensionSet.Get(ref _extensions, extension); - } - public pbc::RepeatedField GetOrInitializeExtension(pb::RepeatedExtension extension) { - return pb::ExtensionSet.GetOrInitialize(ref _extensions, extension); - } - public void SetExtension(pb::Extension extension, TValue value) { - pb::ExtensionSet.Set(ref _extensions, extension, value); - } - public bool HasExtension(pb::Extension extension) { - return pb::ExtensionSet.Has(ref _extensions, extension); - } - public void ClearExtension(pb::Extension extension) { - pb::ExtensionSet.Clear(ref _extensions, extension); - } - public void ClearExtension(pb::RepeatedExtension extension) { - pb::ExtensionSet.Clear(ref _extensions, extension); - } - - #region Nested types - /// Container for nested types declared in the ModelProto message 
type. - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public static partial class Types { - [global::System.Diagnostics.DebuggerDisplayAttribute("{ToString(),nq}")] - public sealed partial class SentencePiece : pb::IExtendableMessage - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - , pb::IBufferMessage - #endif - { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new SentencePiece()); - private pb::UnknownFieldSet _unknownFields; - private pb::ExtensionSet _extensions; - private pb::ExtensionSet _Extensions { get { return _extensions; } } - private int _hasBits0; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public static pbr::MessageDescriptor Descriptor { - get { return global::Sentencepiece.ModelProto.Descriptor.NestedTypes[0]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public SentencePiece() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public SentencePiece(SentencePiece other) : this() { - _hasBits0 = other._hasBits0; - piece_ = other.piece_; - score_ = other.score_; - type_ = other.type_; - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - _extensions = pb::ExtensionSet.Clone(other._extensions); - } - - 
[global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public SentencePiece Clone() { - return new SentencePiece(this); - } - - /// Field number for the "piece" field. - public const int PieceFieldNumber = 1; - private readonly static string PieceDefaultValue = ""; - - private string piece_; - /// - /// piece must not be empty. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public string Piece { - get { return piece_ ?? PieceDefaultValue; } - set { - piece_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - /// Gets whether the "piece" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasPiece { - get { return piece_ != null; } - } - /// Clears the value of the "piece" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearPiece() { - piece_ = null; - } - - /// Field number for the "score" field. 
- public const int ScoreFieldNumber = 2; - private readonly static float ScoreDefaultValue = 0F; - - private float score_; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public float Score { - get { if ((_hasBits0 & 1) != 0) { return score_; } else { return ScoreDefaultValue; } } - set { - _hasBits0 |= 1; - score_ = value; - } - } - /// Gets whether the "score" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasScore { - get { return (_hasBits0 & 1) != 0; } - } - /// Clears the value of the "score" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearScore() { - _hasBits0 &= ~1; - } - - /// Field number for the "type" field. - public const int TypeFieldNumber = 3; - private readonly static global::Sentencepiece.ModelProto.Types.SentencePiece.Types.Type TypeDefaultValue = global::Sentencepiece.ModelProto.Types.SentencePiece.Types.Type.Normal; - - private global::Sentencepiece.ModelProto.Types.SentencePiece.Types.Type type_; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public global::Sentencepiece.ModelProto.Types.SentencePiece.Types.Type Type { - get { if ((_hasBits0 & 2) != 0) { return type_; } else { return TypeDefaultValue; } } - set { - _hasBits0 |= 2; - type_ = value; - } - } - /// Gets whether the "type" field is set - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool HasType { - get { return (_hasBits0 & 2) != 0; } - } - /// Clears the value of the "type" field - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void ClearType() { 
- _hasBits0 &= ~2; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public override bool Equals(object other) { - return Equals(other as SentencePiece); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public bool Equals(SentencePiece other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if (Piece != other.Piece) return false; - if (!pbc::ProtobufEqualityComparers.BitwiseSingleEqualityComparer.Equals(Score, other.Score)) return false; - if (Type != other.Type) return false; - if (!Equals(_extensions, other._extensions)) { - return false; - } - return Equals(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public override int GetHashCode() { - int hash = 1; - if (HasPiece) hash ^= Piece.GetHashCode(); - if (HasScore) hash ^= pbc::ProtobufEqualityComparers.BitwiseSingleEqualityComparer.GetHashCode(Score); - if (HasType) hash ^= Type.GetHashCode(); - if (_extensions != null) { - hash ^= _extensions.GetHashCode(); - } - if (_unknownFields != null) { - hash ^= _unknownFields.GetHashCode(); - } - return hash; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void WriteTo(pb::CodedOutputStream output) { - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - output.WriteRawMessage(this); - #else - if (HasPiece) { - output.WriteRawTag(10); - output.WriteString(Piece); - } - if (HasScore) { - 
output.WriteRawTag(21); - output.WriteFloat(Score); - } - if (HasType) { - output.WriteRawTag(24); - output.WriteEnum((int) Type); - } - if (_extensions != null) { - _extensions.WriteTo(output); - } - if (_unknownFields != null) { - _unknownFields.WriteTo(output); - } - #endif - } - - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - void pb::IBufferMessage.InternalWriteTo(ref pb::WriteContext output) { - if (HasPiece) { - output.WriteRawTag(10); - output.WriteString(Piece); - } - if (HasScore) { - output.WriteRawTag(21); - output.WriteFloat(Score); - } - if (HasType) { - output.WriteRawTag(24); - output.WriteEnum((int) Type); - } - if (_extensions != null) { - _extensions.WriteTo(ref output); - } - if (_unknownFields != null) { - _unknownFields.WriteTo(ref output); - } - } - #endif - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public int CalculateSize() { - int size = 0; - if (HasPiece) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(Piece); - } - if (HasScore) { - size += 1 + 4; - } - if (HasType) { - size += 1 + pb::CodedOutputStream.ComputeEnumSize((int) Type); - } - if (_extensions != null) { - size += _extensions.CalculateSize(); - } - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; - } + stream.CopyTo(ms); + return ModelProto.Parse(ms.GetBuffer(), 0, (int)ms.Length); + } + } + + /// TrainerSpec (defaults match sentencepiece_model.proto) + internal sealed class TrainerSpec + { + internal Types.ModelType ModelType { get; private set; } = Types.ModelType.Unigram; + internal bool TreatWhitespaceAsSuffix { get; private set; } + internal bool ByteFallback { get; private set; } + internal int UnkId { get; private set; } + internal int BosId { get; private set; } = 1; + internal int EosId { get; private set; } 
= 2; + internal int PadId { get; private set; } = -1; + internal string UnkPiece { get; private set; } = ""; + internal string BosPiece { get; private set; } = ""; + internal string EosPiece { get; private set; } = ""; + internal string PadPiece { get; private set; } = ""; + + internal static TrainerSpec Parse(byte[] data, int start, int end) + { + TrainerSpec result = new(); + int pos = start; + while (pos < end) + { + int tag = SentencePieceProtobufReader.ReadRawVarint32(data, end, ref pos); + int fieldNumber = tag >> 3; + int wireType = tag & 7; + + switch (fieldNumber) + { + case 3 when wireType == 0: // ModelType model_type = 3 + result.ModelType = (Types.ModelType)SentencePieceProtobufReader.ReadRawVarint32(data, end, ref pos); + break; + + case 24 when wireType == 0: // bool treat_whitespace_as_suffix = 24 + result.TreatWhitespaceAsSuffix = SentencePieceProtobufReader.ReadRawVarint32(data, end, ref pos) != 0; + break; + + case 35 when wireType == 0: // bool byte_fallback = 35 + result.ByteFallback = SentencePieceProtobufReader.ReadRawVarint32(data, end, ref pos) != 0; + break; + + case 40 when wireType == 0: // int32 unk_id = 40 + result.UnkId = SentencePieceProtobufReader.ReadRawVarint32(data, end, ref pos); + break; + + case 41 when wireType == 0: // int32 bos_id = 41 + result.BosId = SentencePieceProtobufReader.ReadRawVarint32(data, end, ref pos); + break; + + case 42 when wireType == 0: // int32 eos_id = 42 + result.EosId = SentencePieceProtobufReader.ReadRawVarint32(data, end, ref pos); + break; + + case 43 when wireType == 0: // int32 pad_id = 43 + result.PadId = SentencePieceProtobufReader.ReadRawVarint32(data, end, ref pos); + break; + + case 45 when wireType == 2: // string unk_piece = 45 + result.UnkPiece = SentencePieceProtobufReader.ReadString(data, end, ref pos); + break; + + case 46 when wireType == 2: // string bos_piece = 46 + result.BosPiece = SentencePieceProtobufReader.ReadString(data, end, ref pos); + break; + + case 47 when wireType == 
2: // string eos_piece = 47 + result.EosPiece = SentencePieceProtobufReader.ReadString(data, end, ref pos); + break; + + case 48 when wireType == 2: // string pad_piece = 48 + result.PadPiece = SentencePieceProtobufReader.ReadString(data, end, ref pos); + break; + + default: + SentencePieceProtobufReader.SkipField(data, end, wireType, ref pos); + break; + } + } - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void MergeFrom(SentencePiece other) { - if (other == null) { - return; - } - if (other.HasPiece) { - Piece = other.Piece; - } - if (other.HasScore) { - Score = other.Score; - } - if (other.HasType) { - Type = other.Type; - } - pb::ExtensionSet.MergeFrom(ref _extensions, other._extensions); - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); + return result; } - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public void MergeFrom(pb::CodedInputStream input) { - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - input.ReadRawMessage(this); - #else - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - if (!pb::ExtensionSet.TryMergeFieldFrom(ref _extensions, input)) { - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); - } - break; - case 10: { - Piece = input.ReadString(); - break; - } - case 21: { - Score = input.ReadFloat(); - break; - } - case 24: { - Type = (global::Sentencepiece.ModelProto.Types.SentencePiece.Types.Type) input.ReadEnum(); - break; - } + internal static class Types + { + internal enum ModelType + { + Unigram = 1, + Bpe = 2, + Word = 3, + Char = 4, } - } - #endif } + } - #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - void pb::IBufferMessage.InternalMergeFrom(ref 
pb::ParseContext input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - if (!pb::ExtensionSet.TryMergeFieldFrom(ref _extensions, ref input)) { - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, ref input); + /// NormalizerSpec (defaults match sentencepiece_model.proto) + internal sealed class NormalizerSpec + { + internal string Name { get; private set; } = ""; + internal SentencePieceByteString PrecompiledCharsmap { get; private set; } + internal bool AddDummyPrefix { get; private set; } = true; + internal bool RemoveExtraWhitespaces { get; private set; } = true; + internal bool EscapeWhitespaces { get; private set; } = true; + + internal static NormalizerSpec Parse(byte[] data, int start, int end) + { + NormalizerSpec result = new(); + int pos = start; + while (pos < end) + { + int tag = SentencePieceProtobufReader.ReadRawVarint32(data, end, ref pos); + int fieldNumber = tag >> 3; + int wireType = tag & 7; + + switch (fieldNumber) + { + case 1 when wireType == 2: // string name = 1 + result.Name = SentencePieceProtobufReader.ReadString(data, end, ref pos); + break; + + case 2 when wireType == 2: // bytes precompiled_charsmap = 2 + int length = SentencePieceProtobufReader.ReadLengthPrefix(data, end, ref pos); + result.PrecompiledCharsmap = new SentencePieceByteString(data, pos, length); + pos += length; + break; + + case 3 when wireType == 0: // bool add_dummy_prefix = 3 + result.AddDummyPrefix = SentencePieceProtobufReader.ReadRawVarint32(data, end, ref pos) != 0; + break; + + case 4 when wireType == 0: // bool remove_extra_whitespaces = 4 + result.RemoveExtraWhitespaces = SentencePieceProtobufReader.ReadRawVarint32(data, end, ref pos) != 0; + break; + + case 5 when wireType == 0: // bool escape_whitespaces = 5 + result.EscapeWhitespaces = SentencePieceProtobufReader.ReadRawVarint32(data, end, ref pos) != 0; + break; + + default: + SentencePieceProtobufReader.SkipField(data, end, wireType, ref pos); + break; 
} - break; - case 10: { - Piece = input.ReadString(); - break; - } - case 21: { - Score = input.ReadFloat(); - break; - } - case 24: { - Type = (global::Sentencepiece.ModelProto.Types.SentencePiece.Types.Type) input.ReadEnum(); - break; - } } - } - } - #endif - - public TValue GetExtension(pb::Extension extension) { - return pb::ExtensionSet.Get(ref _extensions, extension); - } - public pbc::RepeatedField GetExtension(pb::RepeatedExtension extension) { - return pb::ExtensionSet.Get(ref _extensions, extension); - } - public pbc::RepeatedField GetOrInitializeExtension(pb::RepeatedExtension extension) { - return pb::ExtensionSet.GetOrInitialize(ref _extensions, extension); - } - public void SetExtension(pb::Extension extension, TValue value) { - pb::ExtensionSet.Set(ref _extensions, extension, value); - } - public bool HasExtension(pb::Extension extension) { - return pb::ExtensionSet.Has(ref _extensions, extension); - } - public void ClearExtension(pb::Extension extension) { - pb::ExtensionSet.Clear(ref _extensions, extension); - } - public void ClearExtension(pb::RepeatedExtension extension) { - pb::ExtensionSet.Clear(ref _extensions, extension); - } - - #region Nested types - /// Container for nested types declared in the SentencePiece message type. - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] - public static partial class Types { - public enum Type { - /// - /// normal symbol - /// - [pbr::OriginalName("NORMAL")] Normal = 1, - /// - /// unknown symbol. only <unk> for now. - /// - [pbr::OriginalName("UNKNOWN")] Unknown = 2, - /// - /// control symbols. </s>, <s>, <2ja> etc. - /// - [pbr::OriginalName("CONTROL")] Control = 3, - /// - /// user defined symbols. - /// - [pbr::OriginalName("USER_DEFINED")] UserDefined = 4, - /// - /// Typical usage of USER_DEFINED symbol - /// is placeholder. - /// - [pbr::OriginalName("BYTE")] Byte = 6, - /// - /// this piece is not used. 
- /// - [pbr::OriginalName("UNUSED")] Unused = 5, - } + return result; } - #endregion - - } - } - #endregion - - } - - #endregion - } - -#endregion Designer generated code diff --git a/test/Microsoft.ML.Tokenizers.Tests/LlamaTests.cs b/test/Microsoft.ML.Tokenizers.Tests/LlamaTests.cs index 472e344acd..11340188f2 100644 --- a/test/Microsoft.ML.Tokenizers.Tests/LlamaTests.cs +++ b/test/Microsoft.ML.Tokenizers.Tests/LlamaTests.cs @@ -913,5 +913,11 @@ public void TestPhi3TokenizerIdEncoding(string text, string decodedWithNoSpecial Assert.Equal(textWithSpecialTokens.Length, charactersWritten); Assert.Equal(textWithSpecialTokens, destinationBuffer.AsSpan(0, charactersWritten).ToString()); } + + [Fact] + public void CreateWithNullStreamThrows() + { + Assert.ThrowsAny(() => LlamaTokenizer.Create(null!)); + } } } diff --git a/test/Microsoft.ML.Tokenizers.Tests/SentencePieceTests.cs b/test/Microsoft.ML.Tokenizers.Tests/SentencePieceTests.cs new file mode 100644 index 0000000000..8dbde47178 --- /dev/null +++ b/test/Microsoft.ML.Tokenizers.Tests/SentencePieceTests.cs @@ -0,0 +1,957 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Tokenizers; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using Xunit; + +namespace Microsoft.ML.Tokenizers.Tests +{ + public class SentencePieceTests + { + [Fact] + public void CreateWithNullStreamThrows() + { + Assert.ThrowsAny(() => SentencePieceTokenizer.Create(null!)); + } + + [Fact] + public void CreateWithEmptyStreamThrows() + { + using MemoryStream empty = new MemoryStream(Array.Empty()); + Assert.ThrowsAny(() => SentencePieceTokenizer.Create(empty)); + } + + [Fact] + public void CreateWithTruncatedStreamThrows() + { + // A protobuf tag claiming a length-delimited field longer than remaining bytes. 
+ byte[] truncated = new byte[] { 0x0A, 0xFF, 0x01 }; // field 1, length 255 – but only 0 data bytes follow + using MemoryStream ms = new MemoryStream(truncated); + Assert.ThrowsAny(() => SentencePieceTokenizer.Create(ms)); + } + + [Fact] + public void CreateBpeViaSentencePieceTokenizer() + { + // Verify that the generic SentencePieceTokenizer.Create() factory method + // works for BPE models (not just LlamaTokenizer.Create()). + using Stream stream = File.OpenRead(Path.Combine(@"Llama", "tokenizer.model")); + SentencePieceTokenizer tokenizer = SentencePieceTokenizer.Create(stream); + + IReadOnlyList tokens = tokenizer.EncodeToTokens("Hello", out _); + Assert.True(tokens.Count > 0); + Assert.Equal("Hello", tokenizer.Decode(tokens.Select(t => t.Id))); + } + + [Fact] + public void CreateFromMemoryStreamUsesFastPath() + { + // Verify that loading from a MemoryStream works (exercises + // the TryGetBuffer fast-path in ModelProtoParser.ParseFrom). + byte[] modelBytes = File.ReadAllBytes(Path.Combine(@"Llama", "tokenizer.model")); + using MemoryStream ms = new MemoryStream(modelBytes); + SentencePieceTokenizer tokenizer = SentencePieceTokenizer.Create(ms); + + IReadOnlyList tokens = tokenizer.EncodeToTokens("Hello", out _); + Assert.True(tokens.Count > 0); + Assert.Equal("Hello", tokenizer.Decode(tokens.Select(t => t.Id))); + } + + [Fact] + public void BpeModelPropertiesParsedCorrectly() + { + // Verify that TrainerSpec and NormalizerSpec fields are correctly parsed + // from a known BPE model (Llama). 
+ using Stream stream = File.OpenRead(Path.Combine(@"Llama", "tokenizer.model")); + SentencePieceTokenizer tokenizer = SentencePieceTokenizer.Create(stream); + + Assert.True(tokenizer.ByteFallback); + Assert.True(tokenizer.AddDummyPrefix); + Assert.True(tokenizer.EscapeWhiteSpaces); + Assert.False(tokenizer.TreatWhitespaceAsSuffix); + + Assert.Equal("", tokenizer.UnknownToken); + Assert.Equal("", tokenizer.BeginningOfSentenceToken); + Assert.Equal("", tokenizer.EndOfSentenceToken); + Assert.Equal(0, tokenizer.UnknownId); + Assert.Equal(1, tokenizer.BeginningOfSentenceId); + Assert.Equal(2, tokenizer.EndOfSentenceId); + } + + [Fact] + public void UnigramModelPropertiesParsedCorrectly() + { + // Verify that TrainerSpec and NormalizerSpec fields are correctly parsed + // from a known Unigram model (Paraphrase-multilingual-MiniLM-L12-v2). + using Stream stream = File.OpenRead(Path.Combine( + @"Paraphrase-multilingual-MiniLM-L12-v2", "sentencepiece.bpe.model")); + SentencePieceTokenizer tokenizer = SentencePieceTokenizer.Create(stream); + + // Unigram model should not have byte fallback + Assert.False(tokenizer.ByteFallback); + Assert.True(tokenizer.AddDummyPrefix); + Assert.True(tokenizer.EscapeWhiteSpaces); + Assert.False(tokenizer.TreatWhitespaceAsSuffix); + } + + [Fact] + public void ByteFallbackEncodesRareCharacterAsBytes() + { + // Llama has byte_fallback=true. Encoding a character that is not in + // the vocabulary should produce byte-level tokens (<0xNN>) rather than . + using Stream stream = File.OpenRead(Path.Combine(@"Llama", "tokenizer.model")); + SentencePieceTokenizer tokenizer = SentencePieceTokenizer.Create(stream, + addBeginningOfSentence: false, addEndOfSentence: false); + + // U+10342 (Old Italic Letter Re) — a 4-byte UTF-8 character (F0 90 8D 82) + // that is extremely unlikely to be in the Llama vocabulary. 
+ string rareChar = "\U00010342"; + IReadOnlyList tokens = tokenizer.EncodeToTokens(rareChar, out _); + + // With byte fallback, the character should be encoded as individual byte tokens + // rather than a single token. + Assert.True(tokens.Count > 1, "Byte fallback should produce multiple byte tokens."); + Assert.DoesNotContain(tokens, t => t.Value == ""); + + // Each byte token should have a name like <0xNN>. + foreach (EncodedToken token in tokens) + { + // The first token is the dummy prefix "▁", the rest should be byte tokens. + if (token.Value != "\u2581") + { + Assert.StartsWith("<0x", token.Value); + } + } + + // Round-trip: decoding should recover the original character. + string decoded = tokenizer.Decode(tokens.Select(t => t.Id)); + Assert.Equal(rareChar, decoded); + } + + [Fact] + public void BpeAndUnigramProduceDifferentTokenizations() + { + // Sanity check: BPE and Unigram models produce different tokenizations + // for the same input, confirming model_type is parsed and routed correctly. + using Stream bpeStream = File.OpenRead(Path.Combine(@"Llama", "tokenizer.model")); + SentencePieceTokenizer bpe = SentencePieceTokenizer.Create(bpeStream, + addBeginningOfSentence: false, addEndOfSentence: false); + + using Stream unigramStream = File.OpenRead(Path.Combine( + @"Paraphrase-multilingual-MiniLM-L12-v2", "sentencepiece.bpe.model")); + SentencePieceTokenizer unigram = SentencePieceTokenizer.Create(unigramStream, + addBeginningOfSentence: false, addEndOfSentence: false); + + string input = "The quick brown fox jumps over the lazy dog."; + IReadOnlyList bpeTokens = bpe.EncodeToTokens(input, out _); + IReadOnlyList unigramTokens = unigram.EncodeToTokens(input, out _); + + // Both should successfully tokenize the input. + Assert.True(bpeTokens.Count > 0); + Assert.True(unigramTokens.Count > 0); + + // But they should produce different token sequences (different vocabs and algorithms). 
+ Assert.NotEqual( + string.Join(",", bpeTokens.Select(t => t.Value)), + string.Join(",", unigramTokens.Select(t => t.Value))); + + // Both should round-trip decode correctly. + Assert.Equal(input, bpe.Decode(bpeTokens.Select(t => t.Id))); + Assert.Equal(input, unigram.Decode(unigramTokens.Select(t => t.Id))); + } + + [Fact] + public void MalformedVarintInModelThrows() + { + // A varint that never terminates (all continuation bits set, exceeding 10 bytes). + byte[] malformed = new byte[] { 0x0A, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; + using MemoryStream ms = new MemoryStream(malformed); + Assert.ThrowsAny(() => SentencePieceTokenizer.Create(ms)); + } + + [Fact] + public void UnknownWireTypeInModelThrows() + { + // Wire type 7 is invalid in the protobuf spec. + // Tag byte 0x0F = field 1, wire type 7. + byte[] badWireType = new byte[] { 0x0F, 0x00 }; + using MemoryStream ms = new MemoryStream(badWireType); + Assert.ThrowsAny(() => SentencePieceTokenizer.Create(ms)); + } + + // ================================================================= + // Synthetic protobuf tests — construct models from raw wire format + // ================================================================= + + [Fact] + public void Synthetic_BpeModel_DefaultProperties() + { + // Minimal BPE model: only ModelType set, everything else uses proto defaults. 
// NOTE(review): the statements up to the first closing brace are the tail of a test whose declaration precedes this chunk; they are reproduced unchanged.
            byte[] model = MakeModelProto(MakeBpeTrainerSpec());
            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model);

            // TrainerSpec defaults
            Assert.False(tokenizer.ByteFallback);
            Assert.False(tokenizer.TreatWhitespaceAsSuffix);
            Assert.Equal("", tokenizer.UnknownToken);
            Assert.Equal("", tokenizer.BeginningOfSentenceToken);
            Assert.Equal("", tokenizer.EndOfSentenceToken);
            Assert.Equal(0, tokenizer.UnknownId);
            Assert.Equal(1, tokenizer.BeginningOfSentenceId);
            Assert.Equal(2, tokenizer.EndOfSentenceId);

            // NormalizerSpec defaults (proto specifies [default = true] for these)
            Assert.True(tokenizer.AddDummyPrefix);
            Assert.True(tokenizer.EscapeWhiteSpaces);
        }

        [Fact]
        public void Synthetic_BpeModel_AllTrainerSpecFieldsParsed()
        {
            // Every TrainerSpec field the tokenizer surfaces is set to a non-default value.
            ProtobufWriter ts = new();
            ts.WriteInt32Field(3, 2);              // model_type = BPE
            ts.WriteBoolField(24, true);           // treat_whitespace_as_suffix
            ts.WriteBoolField(35, true);           // byte_fallback
            ts.WriteInt32Field(40, 5);             // unk_id
            ts.WriteInt32Field(41, 6);             // bos_id
            ts.WriteInt32Field(42, 7);             // eos_id
            ts.WriteInt32Field(43, 8);             // pad_id
            ts.WriteStringField(45, "[UNK]");      // unk_piece
            ts.WriteStringField(46, "[BOS]");      // bos_piece
            ts.WriteStringField(47, "[EOS]");      // eos_piece
            ts.WriteStringField(48, "[PAD]");      // pad_piece

            byte[] model = MakeModelProto(ts.ToArray());
            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model);

            Assert.True(tokenizer.ByteFallback);
            Assert.True(tokenizer.TreatWhitespaceAsSuffix);
            Assert.Equal("[UNK]", tokenizer.UnknownToken);
            Assert.Equal("[BOS]", tokenizer.BeginningOfSentenceToken);
            Assert.Equal("[EOS]", tokenizer.EndOfSentenceToken);
            Assert.Equal(5, tokenizer.UnknownId);
            Assert.Equal(6, tokenizer.BeginningOfSentenceId);
            Assert.Equal(7, tokenizer.EndOfSentenceId);
        }

        [Fact]
        public void Synthetic_BpeModel_AllNormalizerSpecFieldsParsed()
        {
            ProtobufWriter ns = new();
            ns.WriteStringField(1, "identity");    // name
            ns.WriteBoolField(3, false);           // add_dummy_prefix (default true)
            ns.WriteBoolField(4, false);           // remove_extra_whitespaces (default true)
            ns.WriteBoolField(5, false);           // escape_whitespaces (default true)

            byte[] model = MakeModelProto(MakeBpeTrainerSpec(), ns.ToArray());
            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model);

            Assert.False(tokenizer.AddDummyPrefix);
            Assert.False(tokenizer.EscapeWhiteSpaces);
        }

        [Fact]
        public void Synthetic_BpeModel_NegativePadIdParsesSuccessfully()
        {
            // PadId = -1 is encoded as a 10-byte varint (sign-extended to uint64).
            ProtobufWriter ts = new();
            ts.WriteInt32Field(3, 2);              // model_type = BPE
            ts.WriteInt32Field(43, -1);            // pad_id = -1

            byte[] model = MakeModelProto(ts.ToArray());
            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model);
            Assert.NotNull(tokenizer);
        }

        [Fact]
        public void Synthetic_BpeModel_AllPieceTypesParseSuccessfully()
        {
            // NOTE(review): the empty piece names below look like they lost angle-bracket
            // tokens during extraction; confirm against the real file.
            byte[] model = MakeModelProto(
                MakeBpeTrainerSpec(),
                null,
                MakePiece("", 0f, 2),              // Unknown
                MakePiece("", 0f, 3),              // Control
                MakePiece("", 0f, 3),              // Control
                MakePiece("hello", -1.5f, 1),      // Normal
                MakePiece("world", -2.0f, 4),      // UserDefined
                MakePiece("", 0f, 5),              // Unused
                MakePiece("<0x00>", 0f, 6));       // Byte

            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model);
            Assert.NotNull(tokenizer);
        }

        [Fact]
        public void Synthetic_BpeModel_FieldsInReverseOrderParseCorrectly()
        {
            // Protobuf allows fields in any order; verify our parser handles it.
            ProtobufWriter ts = new();
            ts.WriteStringField(47, "[EOS]");      // eos_piece (field 47 before field 3)
            ts.WriteStringField(46, "[BOS]");      // bos_piece
            ts.WriteStringField(45, "[UNK]");      // unk_piece
            ts.WriteBoolField(35, true);           // byte_fallback
            ts.WriteInt32Field(3, 2);              // model_type = BPE (written last)

            // ModelProto fields also reversed: normalizer before trainer, trainer before pieces
            ProtobufWriter model = new();
            model.WriteMessageField(3, Array.Empty<byte>());        // empty NormalizerSpec
            model.WriteMessageField(2, ts.ToArray());               // TrainerSpec
            model.WriteMessageField(1, MakePiece("test", -1f));     // piece

            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model.ToArray());
            Assert.True(tokenizer.ByteFallback);
            Assert.Equal("[UNK]", tokenizer.UnknownToken);
            Assert.Equal("[BOS]", tokenizer.BeginningOfSentenceToken);
            Assert.Equal("[EOS]", tokenizer.EndOfSentenceToken);
        }

        [Fact]
        public void Synthetic_BpeModel_DuplicateScalarFieldLastValueWins()
        {
            // When a scalar field appears multiple times, protobuf spec says last value wins.
            ProtobufWriter ts = new();
            ts.WriteInt32Field(3, 2);              // model_type = BPE
            ts.WriteBoolField(35, false);          // byte_fallback = false
            ts.WriteBoolField(35, true);           // byte_fallback = true (should win)
            ts.WriteStringField(45, "first");      // unk_piece = "first"
            ts.WriteStringField(45, "second");     // unk_piece = "second" (should win)

            byte[] model = MakeModelProto(ts.ToArray());
            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model);

            Assert.True(tokenizer.ByteFallback);
            Assert.Equal("second", tokenizer.UnknownToken);
        }

        [Fact]
        public void Synthetic_BpeModel_UnknownFieldsAtAllLevelsAreSkipped()
        {
            // Unknown field numbers should be silently skipped at every message level.

            // SentencePiece with unknown field
            ProtobufWriter piece = new();
            piece.WriteStringField(1, "test");
            piece.WriteFloatField(2, -1f);
            piece.WriteInt32Field(3, 1);           // type = Normal
            piece.WriteInt32Field(99, 42);         // unknown field

            // TrainerSpec with unknown field
            ProtobufWriter ts = new();
            ts.WriteInt32Field(3, 2);              // model_type = BPE
            ts.WriteBoolField(35, true);           // byte_fallback
            ts.WriteInt32Field(99, 42);            // unknown field

            // NormalizerSpec with unknown field
            ProtobufWriter ns = new();
            ns.WriteStringField(1, "identity");
            ns.WriteInt32Field(99, 42);            // unknown field

            // ModelProto with unknown field
            ProtobufWriter model = new();
            model.WriteMessageField(1, piece.ToArray());
            model.WriteMessageField(2, ts.ToArray());
            model.WriteMessageField(3, ns.ToArray());
            model.WriteInt32Field(99, 42);         // unknown field at top level

            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model.ToArray());
            Assert.True(tokenizer.ByteFallback);
        }

        [Fact]
        public void Synthetic_BpeModel_AllWireTypesSkippedForUnknownFields()
        {
            // Unknown fields using all four valid wire types should be skipped.
            ProtobufWriter ts = new();
            ts.WriteInt32Field(3, 2);                                   // model_type = BPE
            ts.WriteInt32Field(90, 42);                                 // unknown varint (wire type 0)
            ts.WriteFixed64Field(91, 0xDEADBEEF);                       // unknown fixed64 (wire type 1)
            ts.WriteBytesField(92, new byte[] { 0x01, 0x02, 0x03 });    // unknown length-delimited (wire type 2)
            ts.WriteFixed32Field(93, 0xCAFE);                           // unknown fixed32 (wire type 5)
            ts.WriteBoolField(35, true);                                // byte_fallback (after unknown fields)

            byte[] model = MakeModelProto(ts.ToArray());
            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model);

            // Verify that fields after unknown fields are still parsed correctly.
            Assert.True(tokenizer.ByteFallback);
        }

        [Fact]
        public void Synthetic_BpeModel_EmptyNormalizerSpecUsesDefaults()
        {
            // Zero-length NormalizerSpec submessage → all C# defaults.
            ProtobufWriter model = new();
            model.WriteMessageField(2, MakeBpeTrainerSpec());
            model.WriteMessageField(3, Array.Empty<byte>());

            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model.ToArray());
            Assert.True(tokenizer.AddDummyPrefix);
            Assert.True(tokenizer.EscapeWhiteSpaces);
        }

        [Fact]
        public void Synthetic_BpeModel_MultiplePiecesAccumulate()
        {
            // Repeated field: each piece message is independently appended.
            byte[] model = MakeModelProto(
                MakeBpeTrainerSpec(),
                null,
                MakePiece("a", -1f),
                MakePiece("b", -2f),
                MakePiece("c", -3f),
                MakePiece("ab", -0.5f),
                MakePiece("bc", -0.5f));

            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model);
            Assert.NotNull(tokenizer);
        }

        [Fact]
        public void Synthetic_BpeModel_UnicodeStringsParsedCorrectly()
        {
            // Multi-byte UTF-8 strings in piece names.
            byte[] model = MakeModelProto(
                MakeBpeTrainerSpec(),
                null,
                MakePiece("\u2581", -1f),                  // ▁ (3-byte UTF-8)
                MakePiece("\u65E5\u672C\u8A9E", -2f),      // 日本語 (CJK)
                MakePiece("\U0001F389", -3f),              // 🎉 (4-byte UTF-8)
                MakePiece("caf\u00E9", -4f));              // café (Latin with diacritics)

            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model);
            Assert.NotNull(tokenizer);
        }

        [Fact]
        public void Synthetic_BpeModel_LargeFieldNumbersSkipped()
        {
            // Field number 1000 requires a multi-byte varint tag.
            ProtobufWriter ts = new();
            ts.WriteInt32Field(3, 2);              // model_type = BPE
            ts.WriteInt32Field(1000, 42);          // unknown field with large number
            ts.WriteBoolField(35, true);           // byte_fallback (after large field number)

            byte[] model = MakeModelProto(ts.ToArray());
            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model);
            Assert.True(tokenizer.ByteFallback);
        }

        [Fact]
        public void Synthetic_BpeModel_ZeroLengthStringField()
        {
            ProtobufWriter ts = new();
            ts.WriteInt32Field(3, 2);              // model_type = BPE
            ts.WriteStringField(45, "");           // unk_piece = "" (zero-length)

            byte[] model = MakeModelProto(ts.ToArray());
            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model);

            // Empty string is not null, so ?? "" does not apply.
            Assert.Equal("", tokenizer.UnknownToken);
        }

        [Fact]
        public void Synthetic_UnigramModel_PropertiesParsedCorrectly()
        {
            ProtobufWriter ts = new();
            ts.WriteInt32Field(3, 1);              // model_type = Unigram
            ts.WriteBoolField(35, true);           // byte_fallback
            ts.WriteInt32Field(40, 0);             // unk_id = 0
            ts.WriteInt32Field(41, 1);             // bos_id = 1
            ts.WriteInt32Field(42, 2);             // eos_id = 2

            // NOTE(review): the empty piece names below look like they lost angle-bracket
            // tokens during extraction; confirm against the real file.
            byte[] model = MakeModelProto(
                ts.ToArray(),
                null,
                MakePiece("", 0f, 2),                      // Unknown (index 0)
                MakePiece("", 0f, 3),                      // Control (index 1)
                MakePiece("", 0f, 3),                      // Control (index 2)
                MakePiece("\u2581hello", -1.5f, 1),        // Normal
                MakePiece("\u2581world", -2.0f, 1));       // Normal

            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model);
            Assert.True(tokenizer.ByteFallback);
            Assert.Equal(0, tokenizer.UnknownId);
            Assert.Equal(1, tokenizer.BeginningOfSentenceId);
            Assert.Equal(2, tokenizer.EndOfSentenceId);
        }

        [Fact]
        public void Synthetic_BpeModel_ParsedFromNonSeekableStream()
        {
            // Exercises the fallback path in ParseFrom (not MemoryStream.TryGetBuffer).
            byte[] modelBytes = MakeModelProto(
                MakeBpeTrainerSpec(),
                null,
                MakePiece("test", -1f));

            using NonSeekableStream stream = new(modelBytes);
            SentencePieceTokenizer tokenizer = SentencePieceTokenizer.Create(stream);
            Assert.NotNull(tokenizer);
        }

        [Fact]
        public void Synthetic_TruncatedVarintThrows()
        {
            // Tag byte with continuation bit set but no more data follows.
            byte[] data = new byte[] { 0x80 };
            using MemoryStream ms = new MemoryStream(data);
            Assert.ThrowsAny<Exception>(() => SentencePieceTokenizer.Create(ms));
        }

        [Fact]
        public void Synthetic_TruncatedStringFieldThrows()
        {
            // Length-delimited field claiming 100 bytes but only 2 follow.
            ProtobufWriter w = new();
            w.WriteTag(1, 2);                      // field 1, length-delimited
            w.WriteVarint(100);                    // length = 100
            w.WriteRaw(0x41, 0x42);                // only 2 bytes

            using MemoryStream ms = new MemoryStream(w.ToArray());
            Assert.ThrowsAny<Exception>(() => SentencePieceTokenizer.Create(ms));
        }

        [Fact]
        public void Synthetic_TruncatedFloatInPieceThrows()
        {
            // Float field with only 2 of 4 bytes inside a piece submessage.
            ProtobufWriter piece = new();
            piece.WriteStringField(1, "test");
            piece.WriteTag(2, 5);                  // score field tag (float = wire type 5)
            piece.WriteRaw(0x00, 0x00);            // only 2 of 4 bytes

            ProtobufWriter model = new();
            model.WriteMessageField(1, piece.ToArray());
            model.WriteMessageField(2, MakeBpeTrainerSpec());

            using MemoryStream ms = new MemoryStream(model.ToArray());
            Assert.ThrowsAny<Exception>(() => SentencePieceTokenizer.Create(ms));
        }

        [Fact]
        public void Synthetic_TruncatedFixed64DuringSkipThrows()
        {
            // Unknown field with wire type 1 (fixed64) but only 4 of 8 bytes available.
            ProtobufWriter ts = new();
            ts.WriteInt32Field(3, 2);              // model_type = BPE
            ts.WriteTag(90, 1);                    // unknown field 90, wire type 1 (fixed64)
            ts.WriteRaw(0, 0, 0, 0);               // only 4 of 8 bytes

            ProtobufWriter model = new();
            model.WriteMessageField(2, ts.ToArray());

            using MemoryStream ms = new MemoryStream(model.ToArray());
            Assert.ThrowsAny<Exception>(() => SentencePieceTokenizer.Create(ms));
        }

        [Fact]
        public void Synthetic_InvalidLengthPrefixThrows()
        {
            // Length-delimited field with length far exceeding available data.
            ProtobufWriter w = new();
            w.WriteTag(1, 2);                              // field 1, length-delimited
            w.WriteRaw(0xFF, 0xFF, 0xFF, 0xFF, 0x07);      // varint = 0x7FFFFFFF

            using MemoryStream ms = new MemoryStream(w.ToArray());
            Assert.ThrowsAny<Exception>(() => SentencePieceTokenizer.Create(ms));
        }

        [Theory]
        [InlineData(3)] // Group start (deprecated)
        [InlineData(4)] // Group end (deprecated)
        [InlineData(6)] // Invalid
        public void Synthetic_UnsupportedWireTypeThrows(int wireType)
        {
            // Single tag for field 1 carrying the unsupported wire type, followed by one payload byte.
            byte[] data = new byte[] { (byte)((1 << 3) | wireType), 0x00 };
            using MemoryStream ms = new MemoryStream(data);
            Assert.ThrowsAny<Exception>(() => SentencePieceTokenizer.Create(ms));
        }

        [Fact]
        public void Synthetic_BpeModel_WireTypeMismatchSkippedGracefully()
        {
            // If a known field number arrives with an unexpected wire type,
            // the parser should treat it as unknown and skip it (forward-compat).
            ProtobufWriter model = new();
            model.WriteInt32Field(1, 42);                          // field 1 (pieces) as varint instead of message — skipped
            model.WriteFixed32Field(2, 0);                         // field 2 (TrainerSpec) as fixed32 instead of message — skipped
            model.WriteMessageField(2, MakeBpeTrainerSpec());      // real TrainerSpec with correct wire type

            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model.ToArray());
            Assert.NotNull(tokenizer);
        }

        [Fact]
        public void Synthetic_BpeModel_NonCanonicalVarintParsedCorrectly()
        {
            // Value 2 (BPE) encoded as a maximally-padded 5-byte varint
            // instead of the minimal single byte 0x02.
            ProtobufWriter ts = new();
            ts.WriteTag(3, 0);                             // model_type field tag
            ts.WriteRaw(0x82, 0x80, 0x80, 0x80, 0x00);     // value 2 in 5-byte non-canonical encoding
            ts.WriteBoolField(35, true);                   // byte_fallback (verify parsing continues)

            byte[] model = MakeModelProto(ts.ToArray());
            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model);
            Assert.True(tokenizer.ByteFallback);
        }

        [Fact]
        public void Synthetic_BpeModel_NegativeBosEosIdClampedToZero()
        {
            // BosId and EosId set to -1 (disabled in SentencePiece).
            // The base constructor applies Math.Max(0, value), clamping to 0.
            ProtobufWriter ts = new();
            ts.WriteInt32Field(3, 2);              // model_type = BPE
            ts.WriteInt32Field(41, -1);            // bos_id = -1
            ts.WriteInt32Field(42, -1);            // eos_id = -1

            byte[] model = MakeModelProto(ts.ToArray());
            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model);
            Assert.Equal(0, tokenizer.BeginningOfSentenceId);
            Assert.Equal(0, tokenizer.EndOfSentenceId);
        }

        [Fact]
        public void Synthetic_BpeModel_ExtensionRangeFieldsSkipped()
        {
            // sentencepiece_model.proto defines 'extensions 200 to max;' on several messages.
            // Real files may contain extension fields that must be silently skipped.
            ProtobufWriter ts = new();
            ts.WriteInt32Field(3, 2);                          // model_type = BPE
            ts.WriteStringField(200, "ext_value");             // extension field
            ts.WriteInt32Field(300, 99);                       // extension field
            ts.WriteBytesField(500, new byte[] { 0xFF });      // extension field
            ts.WriteBoolField(35, true);                       // byte_fallback

            ProtobufWriter ns = new();
            ns.WriteInt32Field(200, 77);                       // extension in NormalizerSpec
            ns.WriteBoolField(3, false);                       // add_dummy_prefix = false

            byte[] model = MakeModelProto(ts.ToArray(), ns.ToArray());
            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model);
            Assert.True(tokenizer.ByteFallback);
            Assert.False(tokenizer.AddDummyPrefix);
        }

        [Fact]
        public void Synthetic_BpeModel_FloatSpecialValuesParsed()
        {
            // IEEE 754 special values should not crash the parser.
            byte[] model = MakeModelProto(
                MakeBpeTrainerSpec(),
                null,
                MakePiece("nan_score", float.NaN, 1),
                MakePiece("inf_score", float.PositiveInfinity, 1),
                MakePiece("neg_inf", float.NegativeInfinity, 1),
                MakePiece("subnormal", float.Epsilon, 1));

            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model);
            Assert.NotNull(tokenizer);
        }

        [Fact]
        public void Synthetic_BpeModel_SkipUnknownFieldWithLongVarint()
        {
            // Unknown varint field with value -1 (10-byte varint in the skip path).
            // Exercises the SkipField varint loop through all 10 bytes.
            ProtobufWriter ts = new();
            ts.WriteInt32Field(3, 2);              // model_type = BPE
            ts.WriteInt32Field(99, -1);            // unknown field, value -1 (10-byte varint)
            ts.WriteBoolField(35, true);           // byte_fallback (verify skip was correct)

            byte[] model = MakeModelProto(ts.ToArray());
            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model);
            Assert.True(tokenizer.ByteFallback);
        }

        [Fact]
        public void Synthetic_BpeModel_ZeroLengthPieceSubmessage()
        {
            // A piece submessage with zero length creates a default SentencePiece
            // (piece="", score=0, type=Normal).
            ProtobufWriter model = new();
            model.WriteMessageField(1, Array.Empty<byte>());       // zero-length piece
            model.WriteMessageField(2, MakeBpeTrainerSpec());

            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model.ToArray());
            Assert.NotNull(tokenizer);
        }

        [Fact]
        public void Synthetic_BpeModel_MultipleTrainerSpecLastWins()
        {
            // When a non-repeated message field appears multiple times,
            // our parser uses the last occurrence (not protobuf merge semantics).
            ProtobufWriter ts1 = new();
            ts1.WriteInt32Field(3, 2);             // model_type = BPE
            ts1.WriteBoolField(35, false);         // byte_fallback = false
            ts1.WriteStringField(45, "first");     // unk_piece = "first"

            ProtobufWriter ts2 = new();
            ts2.WriteInt32Field(3, 2);             // model_type = BPE
            ts2.WriteBoolField(35, true);          // byte_fallback = true
            ts2.WriteStringField(45, "second");    // unk_piece = "second"

            ProtobufWriter model = new();
            model.WriteMessageField(2, ts1.ToArray());
            model.WriteMessageField(2, ts2.ToArray());

            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model.ToArray());
            Assert.True(tokenizer.ByteFallback);
            Assert.Equal("second", tokenizer.UnknownToken);
        }

        [Fact]
        public void Synthetic_BpeModel_MultipleTrainerSpecFieldsNotMerged()
        {
            // Unlike the standard protobuf library which merges repeated occurrences of
            // non-repeated message fields, our parser replaces entirely (last wins).
            // Fields from the first occurrence that aren't in the second are lost.
            ProtobufWriter ts1 = new();
            ts1.WriteInt32Field(3, 2);             // model_type = BPE
            ts1.WriteBoolField(35, true);          // byte_fallback = true (ONLY in first)

            ProtobufWriter ts2 = new();
            ts2.WriteInt32Field(3, 2);             // model_type = BPE
            ts2.WriteStringField(45, "[UNK]");     // unk_piece (ONLY in second; no byte_fallback)

            ProtobufWriter model = new();
            model.WriteMessageField(2, ts1.ToArray());
            model.WriteMessageField(2, ts2.ToArray());

            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model.ToArray());
            // With protobuf merge: ByteFallback=true (from first), UnknownToken="[UNK]" (from second).
            // With our last-wins: ByteFallback=false (default, since second has no byte_fallback field).
            Assert.False(tokenizer.ByteFallback);
            Assert.Equal("[UNK]", tokenizer.UnknownToken);
        }

        [Fact]
        public void Synthetic_BpeModel_PieceWithPartialFieldsUsesDefaults()
        {
            // A piece with only the string field; score and type use defaults (0f, Normal).
            ProtobufWriter piece = new();
            piece.WriteStringField(1, "hello");    // only piece name, no score or type

            byte[] model = MakeModelProto(
                MakeBpeTrainerSpec(),
                null,
                piece.ToArray());

            SentencePieceTokenizer tokenizer = CreateFromSyntheticModel(model);
            Assert.NotNull(tokenizer);
        }

        [Theory]
        [InlineData(3)]  // Word
        [InlineData(4)]  // Char
        [InlineData(99)] // completely unknown
        public void Synthetic_UnsupportedModelTypeThrows(int modelType)
        {
            // Only BPE (2) and Unigram (1) are supported. Other model types
            // should throw ArgumentException from the SentencePieceTokenizer constructor.
            ProtobufWriter ts = new();
            ts.WriteInt32Field(3, modelType);

            byte[] model = MakeModelProto(ts.ToArray());
            using MemoryStream ms = new MemoryStream();
            ms.Write(model, 0, model.Length);
            ms.Position = 0;
            Assert.Throws<ArgumentException>(() => SentencePieceTokenizer.Create(ms));
        }

        [Fact]
        public void Synthetic_BpeModel_MemoryStreamAtNonZeroPosition()
        {
            // When a MemoryStream has been partially consumed (Position > 0),
            // the parser's TryGetBuffer fast path must start at the current position,
            // not at the beginning of the buffer.
            byte[] modelBytes = MakeModelProto(
                MakeBpeTrainerSpec(),
                null,
                MakePiece("test", -1f));
            byte[] prefix = new byte[137]; // arbitrary prefix junk

            using MemoryStream ms = new MemoryStream();
            ms.Write(prefix, 0, prefix.Length);
            ms.Write(modelBytes, 0, modelBytes.Length);
            ms.Position = prefix.Length; // skip past prefix

            SentencePieceTokenizer tokenizer = SentencePieceTokenizer.Create(ms);
            Assert.NotNull(tokenizer);
        }

        // =================================================================
        // Helper infrastructure
        // =================================================================

        /// <summary>
        /// Copies <paramref name="modelProtoBytes"/> into a fresh <see cref="MemoryStream"/>,
        /// rewinds it, and passes it to <c>SentencePieceTokenizer.Create</c>.
        /// </summary>
        private static SentencePieceTokenizer CreateFromSyntheticModel(
            byte[] modelProtoBytes, bool addBos = true, bool addEos = false)
        {
            // Default MemoryStream constructor + Write so TryGetBuffer returns true,
            // exercising the fast path in ModelProtoParser.ParseFrom.
            using MemoryStream ms = new MemoryStream();
            ms.Write(modelProtoBytes, 0, modelProtoBytes.Length);
            ms.Position = 0;
            return SentencePieceTokenizer.Create(ms, addBos, addEos);
        }

        /// <summary>Serializes one piece submessage: field 1 = text, field 2 = score, field 3 = type.</summary>
        private static byte[] MakePiece(string text, float score = 0f, int type = 1)
        {
            ProtobufWriter w = new();
            w.WriteStringField(1, text);           // piece
            w.WriteFloatField(2, score);           // score
            w.WriteInt32Field(3, type);            // type enum
            return w.ToArray();
        }

        /// <summary>Serializes a TrainerSpec that contains only field 3 (model_type) set to 2 (BPE).</summary>
        private static byte[] MakeBpeTrainerSpec()
        {
            ProtobufWriter w = new();
            w.WriteInt32Field(3, 2);               // model_type = BPE
            return w.ToArray();
        }

        /// <summary>
        /// Serializes a ModelProto: every piece as field 1, then the trainer spec as field 2,
        /// then (only when non-null) the normalizer spec as field 3.
        /// </summary>
        private static byte[] MakeModelProto(
            byte[] trainerSpec, byte[]? normalizerSpec = null, params byte[][] pieces)
        {
            ProtobufWriter w = new();
            foreach (byte[] piece in pieces)
            {
                w.WriteMessageField(1, piece);             // repeated SentencePiece
            }
            w.WriteMessageField(2, trainerSpec);           // TrainerSpec
            if (normalizerSpec != null)
            {
                w.WriteMessageField(3, normalizerSpec);    // NormalizerSpec
            }
            return w.ToArray();
        }

        /// <summary>Minimal protobuf writer for constructing synthetic test data.</summary>
        private sealed class ProtobufWriter
        {
            private readonly MemoryStream _ms = new MemoryStream();

            /// <summary>Returns a copy of everything written so far.</summary>
            public byte[] ToArray() => _ms.ToArray();

            /// <summary>Writes <paramref name="value"/> as a base-128 varint, low 7 bits first.</summary>
            public void WriteVarint(ulong value)
            {
                while (value > 0x7F)
                {
                    // Mask to the low 7 bits before setting the continuation bit so the
                    // narrowing cast never depends on implicit (unchecked) truncation.
                    _ms.WriteByte((byte)((value & 0x7F) | 0x80));
                    value >>= 7;
                }
                _ms.WriteByte((byte)value);
            }

            /// <summary>Writes a field tag: (fieldNumber &lt;&lt; 3) | wireType, as a varint.</summary>
            public void WriteTag(int fieldNumber, int wireType) =>
                WriteVarint((ulong)((fieldNumber << 3) | wireType));

            /// <summary>Writes a varint field (wire type 0) holding a 32-bit signed value.</summary>
            public void WriteInt32Field(int fieldNumber, int value)
            {
                WriteTag(fieldNumber, 0);
                // Sign-extend to 64 bits; 'unchecked' keeps the reinterpretation valid for
                // negative values even if the project enables overflow checking.
                WriteVarint(unchecked((ulong)(long)value));
            }

            /// <summary>Writes a varint field (wire type 0) holding 1 for true and 0 for false.</summary>
            public void WriteBoolField(int fieldNumber, bool value)
            {
                WriteTag(fieldNumber, 0);
                WriteVarint(value ? 1UL : 0UL);
            }

            /// <summary>Writes a length-delimited field (wire type 2) holding the UTF-8 bytes of <paramref name="value"/>.</summary>
            public void WriteStringField(int fieldNumber, string value) =>
                WriteLengthDelimited(fieldNumber, Encoding.UTF8.GetBytes(value));

            /// <summary>Writes a length-delimited field (wire type 2) holding raw bytes.</summary>
            public void WriteBytesField(int fieldNumber, byte[] value) =>
                WriteLengthDelimited(fieldNumber, value);

            /// <summary>Writes a 4-byte little-endian float field (wire type 5).</summary>
            public void WriteFloatField(int fieldNumber, float value)
            {
                WriteTag(fieldNumber, 5);
                WriteLittleEndian(BitConverter.GetBytes(value));
            }

            /// <summary>Writes a length-delimited field (wire type 2) holding an already-serialized submessage.</summary>
            public void WriteMessageField(int fieldNumber, byte[] submessage) =>
                WriteLengthDelimited(fieldNumber, submessage);

            /// <summary>Writes an 8-byte little-endian fixed64 field (wire type 1).</summary>
            public void WriteFixed64Field(int fieldNumber, ulong value)
            {
                WriteTag(fieldNumber, 1);
                WriteLittleEndian(BitConverter.GetBytes(value));
            }

            /// <summary>Writes a 4-byte little-endian fixed32 field (wire type 5).</summary>
            public void WriteFixed32Field(int fieldNumber, uint value)
            {
                WriteTag(fieldNumber, 5);
                WriteLittleEndian(BitConverter.GetBytes(value));
            }

            /// <summary>Appends bytes verbatim, with no tag or length prefix.</summary>
            public void WriteRaw(params byte[] bytes) =>
                _ms.Write(bytes, 0, bytes.Length);

            /// <summary>Shared body of the string/bytes/message writers: tag, varint length, payload.</summary>
            private void WriteLengthDelimited(int fieldNumber, byte[] payload)
            {
                WriteTag(fieldNumber, 2);
                WriteVarint((ulong)payload.Length);
                _ms.Write(payload, 0, payload.Length);
            }

            /// <summary>Appends the output of BitConverter.GetBytes, reversing it first on big-endian hosts.</summary>
            private void WriteLittleEndian(byte[] bytes)
            {
                if (!BitConverter.IsLittleEndian)
                {
                    Array.Reverse(bytes);
                }
                _ms.Write(bytes, 0, bytes.Length);
            }
        }

        /// <summary>Stream wrapper that hides seekability, forcing the fallback parse path.</summary>
// Read-only, forward-only view over an in-memory buffer; every seek-related member throws NotSupportedException.
        private sealed class NonSeekableStream : Stream
        {
            // Backing buffer that actually serves the reads.
            private readonly MemoryStream _source;

            public NonSeekableStream(byte[] data)
            {
                _source = new MemoryStream(data);
            }

            // Capability flags: readable only.
            public override bool CanRead
            {
                get { return true; }
            }

            public override bool CanSeek
            {
                get { return false; }
            }

            public override bool CanWrite
            {
                get { return false; }
            }

            // Length and Position are unavailable, as on a genuinely non-seekable stream.
            public override long Length
            {
                get { throw new NotSupportedException(); }
            }

            public override long Position
            {
                get { throw new NotSupportedException(); }
                set { throw new NotSupportedException(); }
            }

            // Nothing is buffered for writing, so flushing is a no-op.
            public override void Flush()
            {
            }

            // Reads are forwarded unchanged to the backing buffer.
            public override int Read(byte[] buffer, int offset, int count)
            {
                return _source.Read(buffer, offset, count);
            }

            public override long Seek(long offset, SeekOrigin origin)
            {
                throw new NotSupportedException();
            }

            public override void SetLength(long value)
            {
                throw new NotSupportedException();
            }

            public override void Write(byte[] buffer, int offset, int count)
            {
                throw new NotSupportedException();
            }

            // Release the backing buffer only on an explicit Dispose, then let the base class finish.
            protected override void Dispose(bool disposing)
            {
                if (disposing)
                {
                    _source.Dispose();
                }

                base.Dispose(disposing);
            }
        }
    }
}