Skip to content

Commit 8938520

Browse files
author
Atif Ahmed
committed
Adding tokenizer part
1 parent 014c558 commit 8938520

File tree

1 file changed

+8
-0
lines changed

1 file changed

+8
-0
lines changed

texar/torch/data/tokenizers/bert_tokenizer.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,10 @@ class BERTTokenizer(PretrainedBERTMixin, TokenizerBase):
74 74
'scibert-scivocab-cased': 512,
75 75
'scibert-basevocab-uncased': 512,
76 76
'scibert-basevocab-cased': 512,
77+
78+
# BERT for MS-MARCO
79+
'bert-msmarco-base': 512,
80+
'bert-msmarco-large': 512,
77 81
}
78 82
_VOCAB_FILE_NAMES = {'vocab_file': 'vocab.txt'}
79 83
_VOCAB_FILE_MAP = {
@@ -98,6 +102,10 @@ class BERTTokenizer(PretrainedBERTMixin, TokenizerBase):
98 102
'scibert-scivocab-cased': 'vocab.txt',
99 103
'scibert-basevocab-uncased': 'vocab.txt',
100 104
'scibert-basevocab-cased': 'vocab.txt',
105+
106+
# BERT for MS-MARCO
107+
'bert-msmarco-base': 'vocab.txt',
108+
'bert-msmarco-large': 'vocab.txt',
101 109
}
102 110
}
103 111

0 commit comments

Comments
 (0)