|
62 | 62 | "yo": "Yoruba", |
63 | 63 | "zh": "Chinese (simplified)", |
64 | 64 | } |
65 | | -"""dict: Languages supported by spaCy: https://spacy.io/usage/models#languages |
66 | 65 |
|
| 66 | +"""dict: Languages supported by spaCy: https://spacy.io/usage/models#languages |
67 | 67 | Dictionary with ISO 639-1 language code (key) and language name (value) |
68 | 68 | Korean is excluded for now because of system installation issues |
69 | 69 | """ |
70 | 70 |
|
71 | 71 | SPACY_LANGUAGE_MODELS = { |
| 72 | + "de": "de_core_news_sm", # OntoNotes |
72 | 73 | "en": "en_core_web_sm", # OntoNotes |
73 | 74 | "es": "es_core_news_sm", # Wikipedia |
74 | | - "zh": "zh_core_web_sm", # OntoNotes |
75 | | - "xx": "xx_ent_wiki_sm", # Wikipedia |
76 | | - "pl": "nb_core_news_sm", # NorNE |
77 | 75 | "fr": "fr_core_news_sm", # Wikipedia |
78 | | - "de": "de_core_news_sm", # OntoNotes |
| 76 | + "nb": "nb_core_news_sm", # NorNE |
| 77 | + "pl": "pl_core_news_sm", # NKJP |
| 78 | + "ru": "ru_core_news_sm", # Nerus |
| 79 | + "zh": "zh_core_web_sm", # OntoNotes |
79 | 80 | } |
80 | | -"""dict: Mapping between ISO 639-1 language code and spaCy model identifiers |
81 | 81 |
|
| 82 | +"""dict: Mapping between ISO 639-1 language code and spaCy model identifiers |
82 | 83 | Models with Creative Commons licenses are not included because this plugin is licensed under Apache-2 |
83 | 84 | """ |
| 85 | + |
| 86 | +SPACY_LANGUAGE_MODELS_LEMMATIZATION = ["de", "en", "es", "fr", "nb", "pl", "ru"] |
| 87 | +"""list: Languages that have a spaCy pre-trained model with a Lemmatizer component. |
| 88 | +To lemmatize with a pre-trained pipeline, your spaCy Language pipeline must contain: |
| 89 | +- either 'morphologizer' + 'lemmatizer' |
| 90 | +- or 'tagger' + 'attribute_ruler' + 'lemmatizer' |
| 91 | +depending on the built-in components of the pre-trained pipeline.""" |
| 92 | + |
| 93 | +SPACY_LANGUAGE_MODELS_MORPHOLOGIZER = ["de", "es", "fr", "nb", "pl", "ru"] |
| 94 | +"""list: Languages that have a spaCy pre-trained model with a Morphologizer component.""" |
| 95 | + |
| 96 | +SPACY_LANGUAGE_LOOKUP = [ |
| 97 | + "ca", |
| 98 | + "cs", |
| 99 | + "da", |
| 100 | + "de", |
| 101 | + "en", |
| 102 | + "es", |
| 103 | + "fr", |
| 104 | + "hr", |
| 105 | + "hu", |
| 106 | + "id", |
| 107 | + "it", |
| 108 | + "lb", |
| 109 | + "lt", |
| 110 | + "mk", |
| 111 | + "nb", |
| 112 | + "nl", |
| 113 | + "pt", |
| 114 | + "ro", |
| 115 | + "sr", |
| 116 | + "sv", |
| 117 | + "tl", |
| 118 | + "tr", |
| 119 | + "ur", |
| 120 | +] |
| 121 | +"""list: Languages for which spaCy lookup tables for lemmatization are available. |
| 122 | +The lookup tables are available at https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data |
| 123 | +""" |
| 124 | + |
| 125 | +SPACY_LANGUAGE_RULES = ["bn", "el", "fa"] |
| 126 | +"""list: Languages for which spaCy rule tables for lemmatization are available. |
| 127 | +The rule tables are available at https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data |
| 128 | +""" |
0 commit comments