Skip to content

Commit 9e725ab

Browse files
committed
template for users to create their own vocab file.
1 parent b0a5a01 commit 9e725ab

File tree

1 file changed

+14
-0
lines changed

1 file changed

+14
-0
lines changed
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
from medcat.vocab import Vocab
2+
import os
3+
4+
# Vocab instance that the steps below fill from a text file and then save.
vocab = Vocab()
5+
6+
# Directory (relative to the current working directory) from which the
# vocab_data.txt input file is read.
model_dir = "data/medcat_models/vocab"
7+
8+
# The vocab_data.txt file needs to be in tab-separated format: <token>\t<word_count>\t<vector_embedding_separated_by_spaces>
9+
# The current vocab uses pre-calculated vector embeddings from Word2Vec; in future, use embeddings calculated from the BERT tokeniser.
10+
# Embeddings of 300 dimensions are standard.
11+
12+
# Load the words (with their counts and embedding vectors) from
# <model_dir>/vocab_data.txt into the vocab.
# NOTE(review): replace=True presumably overwrites entries for words that are
# already present -- confirm against MedCAT's Vocab.add_words.
vocab.add_words(os.path.join(model_dir, 'vocab_data.txt'), replace=True)
13+
# Build the vocab's unigram table; this runs after the words have been loaded
# (presumably it is derived from their counts -- confirm in MedCAT's Vocab).
vocab.make_unigram_table()
14+
# Save the vocab to <model_dir>/vocab.dat. The directory and the file name are
# passed to os.path.join as separate arguments so that a path separator is
# inserted between them; concatenating them with "+" would instead produce
# "data/medcat_models/vocabvocab.dat", outside the model directory.
vocab.save(os.path.join(model_dir, "vocab.dat"))

0 commit comments

Comments
 (0)