CU-8699049kf: Fix MetaCAT related notebooks

mart-r · mart-r · commit cabcfee53151 · 2025-07-16T15:07:38.000+01:00
diff --git a/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb b/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb
@@ -144,13 +144,9 @@
     "# NOTE: we need to provide a BaseTokenizer to add the relevant additional data paths\n",
     "#       to the relevant Entity/Span and Document implementation\n",
     "#       we'll use the regex tokenizer here for example since it's easier to initialise\n",
-    "#       but you can use a spacy-based one, you just need to also pass:\n",
-    "#       - the model name (e.g 'en_core_web_md')\n",
-    "#       - the names of the disabled components (e.g ['ner', 'parser', 'vectors', 'textcat',\n",
-    "#           'entity_linker', 'sentencizer', 'entity_ruler', 'merge_noun_chunks', 'merge_entities', 'merge_subtokens'])\n",
-    "#       - whether diacritics should be used\n",
-    "#       - max document length (e.g 1_000_000)\n",
-    "base_tokenizer = create_tokenizer(\"regex\")\n",
+    "#       but you can use a spacy-based one, you just need to also pass the appropriate config\n",
+    "from medcat.config import Config\n",
+    "base_tokenizer = create_tokenizer(\"regex\", Config())\n",
     "for meta_model in meta_model_names:\n",
     "    meta_cat_path = os.path.join(base_dir_meta_models, exp_start + meta_model)\n",
     "    if model_is_legacy:\n",
@@ -228,7 +224,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "venv_v2",
+   "display_name": "venv_v2_311",
    "language": "python",
    "name": "python3"
   },
@@ -242,7 +238,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.11.12"
   }
  },
  "nbformat": 4,
diff --git a/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb b/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb
@@ -117,13 +117,9 @@
     "# NOTE: we need to provide a BaseTokenizer to add the relevant additional data paths\n",
     "#       to the relevant Entity/Span and Document implementation\n",
     "#       we'll use the regex tokenizer here for example since it's easier to initialise\n",
-    "#       but you can use a spacy-based one, you just need to also pass:\n",
-    "#       - the model name (e.g 'en_core_web_md')\n",
-    "#       - the names of the disabled components (e.g ['ner', 'parser', 'vectors', 'textcat',\n",
-    "#           'entity_linker', 'sentencizer', 'entity_ruler', 'merge_noun_chunks', 'merge_entities', 'merge_subtokens'])\n",
-    "#       - whether diacritics should be used\n",
-    "#       - max document length (e.g 1_000_000)\n",
-    "base_tokenizer = create_tokenizer(\"regex\")"
+    "#       but you can use a spacy-based one, you just need to also pass the appropriate config\n",
+    "from medcat.config import Config\n",
+    "base_tokenizer = create_tokenizer(\"regex\", Config())"
    ]
   },
   {
@@ -339,7 +335,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "venv_v2_311",
    "language": "python",
    "name": "python3"
   },
@@ -353,7 +349,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.8"
+   "version": "3.11.12"
   }
  },
  "nbformat": 4,