We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent b0a5a01 commit 9e725abCopy full SHA for 9e725ab
data/medcat_models/vocab/create_vocab.py
@@ -0,0 +1,14 @@
1
# Build the MedCAT vocabulary: read the word/embedding table from
# vocab_data.txt, build the unigram table, and save the result as vocab.dat
# in the same model directory.
from medcat.vocab import Vocab
import os

vocab = Vocab()

# NOTE: relative path, so the script is expected to be run from the
# repository root.
model_dir = "data/medcat_models/vocab"

# The vocab_data.txt file needs to be in tab-separated format:
#   <token>\t<word_count>\t<vector_embedding_separated_by_spaces>
# The current vocab uses pre-calculated vector embeddings from Word2Vec;
# in future, use embeddings calculated from the BERT tokeniser.
# Embeddings of 300 dimensions are standard.

vocab.add_words(os.path.join(model_dir, "vocab_data.txt"), replace=True)
vocab.make_unigram_table()

# Pass the directory and file name as separate components. Concatenating them
# with "+" dropped the path separator and wrote the output to
# "data/medcat_models/vocabvocab.dat" instead of inside model_dir.
vocab.save(os.path.join(model_dir, "vocab.dat"))
0 commit comments