|
5 | 5 | use TextAnalysis\Corpus\WordnetCorpus; |
6 | 6 | use TextAnalysis\Models\Wordnet\Synset; |
7 | 7 | use TextAnalysis\Models\Wordnet\Lemma; |
| 8 | +use TextAnalysis\Models\Wordnet\ExceptionMap; |
| 9 | +use TextAnalysis\Utilities\Text; |
8 | 10 |
|
9 | 11 | /* |
10 | 12 | * WordnetIndex is a facade for accessing the wordnet data |
@@ -88,6 +90,107 @@ public function getLemma($word, $pos = '') |
88 | 90 | return $found; |
89 | 91 | } |
90 | 92 |
|
/**
 * Concept taken from nltk (morphy).
 *
 * Find a possible base form for the given word in two steps:
 *  1. look the word up in WordNet's lists of exceptional (irregular) forms,
 *     limited to $pos when one is given;
 *  2. otherwise apply the suffix rules one at a time, in order, and return
 *     the first rewritten word for which getLemma() finds a lemma.
 *
 * NOTE(review): step 2 tries the rules of every part of speech even when
 * $pos is given (this matches the previous behaviour) - confirm whether the
 * rules should be limited to $pos as well.
 * NOTE(review): a word that is already a base form is not returned as-is;
 * when no exception and no rule applies the result is an empty string.
 *
 * @todo improve the algorithm, it is really slow
 * @param string $word the inflected word to reduce
 * @param string|null $pos optional part of speech used to narrow the exception lookup
 * @return string the base word, or an empty string when none was found
 */
public function getMorph($word, $pos = '')
{
    // very short words are never reduced
    if (mb_strlen($word) < 3) {
        return "";
    }

    // normalise once so the strict comparisons below behave predictably
    $word = (string) $word;
    $restrictToPos = !empty($pos);

    // Step 1: irregular forms. Stop at the first match instead of
    // filtering the complete exception map.
    foreach ($this->getWordnetCorpus()->getExceptionsMap() as $exceptionMap) {
        /** @var ExceptionMap $exceptionMap */
        if ($restrictToPos && $exceptionMap->getPos() !== $pos) {
            continue;
        }
        if (in_array($word, $exceptionMap->getExceptionList(), true)) {
            return $exceptionMap->getTarget();
        }
    }

    // Step 2: suffix rules, tried in declaration order.
    foreach ($this->getOrderedMorphologicalRules() as $rulePos => $rules) {
        foreach ($rules as $rule) {
            list($suffix, $replacement) = $rule;

            if (!Text::endsWith($word, $suffix)) {
                continue;
            }

            // the suffixes are plain ASCII, so removing strlen($suffix)
            // bytes from the end removes exactly the suffix
            $candidate = substr($word, 0, -strlen($suffix)) . $replacement;
            if ($candidate === '') {
                // the word consisted of nothing but the suffix
                continue;
            }

            $lemmas = $this->getLemma($candidate, $rulePos);
            if (!empty($lemmas)) {
                // reset() yields the first lemma whatever the array keys are
                $firstLemma = reset($lemmas);
                return $firstLemma->getWord();
            }
        }
    }

    return "";
}

/**
 * Ordered suffix rules applied by getMorph(), grouped by part of speech.
 *
 * The rules are kept as a list of [suffix, replacement] pairs because the
 * same suffix may have more than one replacement (e.g. "es" => "e" and
 * "es" => ""). A suffix => replacement map, such as the one returned by
 * getMorphilogicalSubstitutions(), can only keep one replacement per
 * suffix since PHP array keys are unique.
 *
 * @return array part of speech => list of [suffix, replacement] pairs
 */
private function getOrderedMorphologicalRules()
{
    return [
        WordnetCorpus::NOUN => [
            ['s', ''],
            ['ses', 's'],
            ['ves', 'f'],
            ['xes', 'x'],
            ['zes', 'z'],
            ['ches', 'ch'],
            ['shes', 'sh'],
            ['men', 'man'],
            ['ies', 'y'],
        ],
        WordnetCorpus::VERB => [
            ['s', ''],
            ['ies', 'y'],
            ['es', 'e'],
            ['es', ''],
            ['ed', 'e'],
            ['ed', ''],
            ['ing', 'e'],
            ['ing', ''],
        ],
        WordnetCorpus::ADJECTIVE => [
            ['er', ''],
            ['est', ''],
            ['er', 'e'],
            ['est', 'e'],
        ],
    ];
}
| 154 | + |
/**
 * Suffix substitution table, grouped by part of speech.
 *
 * Every inner array maps a word ending to the text that replaces it.
 * PHP array keys are unique, so each ending carries exactly one
 * replacement: the verb endings "es", "ed" and "ing" map to an empty
 * string, and the adjective endings "er" and "est" map to "e".
 *
 * @return array part of speech => [ending => replacement]
 */
public function getMorphilogicalSubstitutions()
{
    $nounEndings = [
        's' => '',
        'ses' => 's',
        'ves' => 'f',
        'xes' => 'x',
        'zes' => 'z',
        'ches' => 'ch',
        'shes' => 'sh',
        'men' => 'man',
        'ies' => 'y',
    ];

    $verbEndings = [
        's' => '',
        'ies' => 'y',
        'es' => '',
        'ed' => '',
        'ing' => '',
    ];

    $adjectiveEndings = [
        'er' => 'e',
        'est' => 'e',
    ];

    // assemble in the fixed order noun, verb, adjective
    $substitutions = [];
    $substitutions[WordnetCorpus::NOUN] = $nounEndings;
    $substitutions[WordnetCorpus::VERB] = $verbEndings;
    $substitutions[WordnetCorpus::ADJECTIVE] = $adjectiveEndings;

    return $substitutions;
}
| 192 | + |
| 193 | + |
91 | 194 | public function __destruct() |
92 | 195 | { |
93 | 196 | unset($this->wordnetCorpus); |
|
0 commit comments