|
| 1 | +<?php |
| 2 | + |
| 3 | +namespace TextAnalysis\Corpus; |
| 4 | + |
| 5 | +use RuntimeException; |
| 6 | +use TextAnalysis\Models\Wordnet\Lemma; |
| 7 | +use TextAnalysis\Models\Wordnet\Synset; |
| 8 | +use TextAnalysis\Models\Wordnet\ExceptionMap; |
| 9 | + |
| 10 | +/** |
| 11 | + * Loads the wordnet corpus for use. Borrowed heavily from nltk |
| 12 | + * @author yooper |
| 13 | + */ |
| 14 | +class WordnetCorpus extends ReadCorpusAbstract |
| 15 | +{ |
// WordNet part-of-speech tags. The values must agree with the keys of
// $posFileMaps, where 'a' selects the adjective files and 'r' the adverb files.
const VERB = 'v';
const ADJECTIVE = 'a';
const ADVERB = 'r';
const NOUN = 'n';
| 20 | + |
/**
 * Lex names read from the "lexnames" file; filled on first use by getLexNames().
 *
 * @var array
 */
protected $lexNames = [];

/**
 * Lemmas parsed from the index files; filled on first use by getLemmas().
 *
 * @var Lemma[]
 */
protected $lemmas = [];

/**
 * Synset cache consulted by getSynsetByOffsetAndPos(). Each key is the
 * part of speech concatenated with the synset offset.
 *
 * @var Synset[]
 */
protected $synsets = [];

/**
 * Exception entries parsed from the *.exc files; filled on first use by
 * getExceptionsMap().
 *
 * @var ExceptionMap[]
 */
protected $exceptionsMap = [];

/**
 * Maps a part-of-speech character to the name fragment used in the corpus
 * file names (for example "data.noun" and "noun.exc").
 *
 * @var array
 */
protected $posFileMaps = [
    'a' => 'adj',
    'r' => 'adv',
    'n' => 'noun',
    'v' => 'verb',
];
| 54 | + |
/**
 * Lists every corpus file this reader expects, as shipped by the nltk project.
 *
 * @return string[] file names, relative to the corpus directory
 */
public function getFileNames()
{
    $fileNames = [
        // counts, lexicographer names and the sense index
        'cntlist.rev',
        'lexnames',
        'index.sense',
        // per part-of-speech index files
        'index.adj',
        'index.adv',
        'index.noun',
        'index.verb',
        // per part-of-speech data files
        'data.adj',
        'data.adv',
        'data.noun',
        'data.verb',
        // per part-of-speech exception files
        'adj.exc',
        'adv.exc',
        'noun.exc',
        'verb.exc',
    ];

    return $fileNames;
}
| 67 | + |
/**
 * Lists the exception (*.exc) files, one per part of speech.
 *
 * @return array file names in the order adj, adv, noun, verb
 */
public function getExceptionFileNames()
{
    $fileNames = [];
    foreach (['adj', 'adv', 'noun', 'verb'] as $partOfSpeech) {
        $fileNames[] = $partOfSpeech . '.exc';
    }

    return $fileNames;
}
| 76 | + |
/**
 * Builds an ExceptionMap from one space-separated line of an exception file.
 *
 * The last token on the line is handed to the ExceptionMap as its second
 * constructor argument; all preceding tokens are handed over as the third.
 *
 * @param string $line already trimmed line from a *.exc file
 * @param string $pos part-of-speech character the file belongs to
 * @return ExceptionMap
 */
public function getExceptionMapFromString($line, $pos)
{
    $forms = explode(" ", $line);
    // array_pop() removes the final token and leaves the leading ones in $forms
    $lastForm = array_pop($forms);

    return new ExceptionMap($pos, $lastForm, $forms);
}
| 88 | + |
/**
 * Returns the exception spellings from all exception files.
 *
 * The files are parsed on the first call and the result is cached. The cache
 * is only written once every file has been read, so a failure part-way
 * through never leaves a partial list behind for later calls.
 *
 * @return ExceptionMap[]
 * @throws RuntimeException when an exception file cannot be opened
 */
public function getExceptionsMap()
{
    if (!empty($this->exceptionsMap)) {
        return $this->exceptionsMap;
    }

    // e.g. ['adj' => 'a', 'adv' => 'r', ...]
    $fileExtToPos = array_flip($this->getPosFileMaps());
    $loaded = [];

    foreach ($this->getExceptionFileNames() as $fileName) {
        // strip the ".exc" suffix to get the part-of-speech name
        $pos = $fileExtToPos[substr($fileName, 0, -4)];
        $fh = fopen($this->getDir().$fileName, 'r');
        if (!$fh) {
            throw new RuntimeException("wordnet file missing {$fileName}");
        }

        // compare against false explicitly so a line such as "0" does not end the loop
        while (($line = fgets($fh)) !== false) {
            $entry = trim($line);
            // lines starting with a space are skipped, as are blank lines
            if ($line[0] === ' ' || $entry === '') {
                continue;
            }
            $loaded[] = $this->getExceptionMapFromString($entry, $pos);
        }
        fclose($fh);
    }

    $this->exceptionsMap = $loaded;
    return $this->exceptionsMap;
}
| 120 | + |
| 121 | + |
/**
 * Returns the second tab-separated column of every row in the "lexnames" file.
 *
 * The file is read on the first call and the result is cached. Array
 * positions follow the row order of the file; a row without a second column
 * yields null at its position.
 *
 * @return string[]
 * @throws RuntimeException when the lexnames file cannot be read
 */
public function getLexNames()
{
    if (empty($this->lexNames)) {
        $rows = file($this->getDir().'lexnames');
        if ($rows === false) {
            throw new RuntimeException("wordnet file missing lexnames");
        }

        $this->lexNames = array_map(
            function ($row) {
                $columns = explode("\t", trim($row));
                return isset($columns[1]) ? $columns[1] : null;
            },
            $rows
        );
    }
    return $this->lexNames;
}
| 137 | + |
/**
 * Returns the map from part-of-speech character to file name fragment.
 *
 * @return array
 */
public function getPosFileMaps()
{
    $posToFileName = $this->posFileMaps;

    return $posToFileName;
}
| 146 | + |
/**
 * Lists the synset data files, one per part of speech.
 *
 * @return string[] file names in the order adj, adv, noun, verb
 */
public function getDataFileNames()
{
    return array_map(
        function ($partOfSpeech) {
            return 'data.' . $partOfSpeech;
        },
        ['adj', 'adv', 'noun', 'verb']
    );
}
| 155 | + |
/**
 * Lists the lemma index files, one per part of speech.
 *
 * @return string[] file names in the order adj, adv, noun, verb
 */
public function getIndexFileNames()
{
    $indexFiles = [
        'index.adj',
        'index.adv',
        'index.noun',
        'index.verb',
    ];

    return $indexFiles;
}
| 164 | + |
/**
 * Reads one synset from the data file of the given part of speech.
 *
 * The data file is opened, the file pointer is moved to $synsetOffset and the
 * line found there is parsed with getSynsetFromString(). Parsed synsets are
 * cached under the key "{$pos}{$synsetOffset}", so repeated lookups do not
 * touch the file again.
 *
 * @param int $synsetOffset byte offset of the synset line inside the data file
 * @param string $pos Part of speech, one of the keys of $posFileMaps
 * @return Synset
 * @throws RuntimeException when the part of speech is unknown, or the data
 *                          file is missing, unreadable, or has no line at the offset
 */
public function getSynsetByOffsetAndPos($synsetOffset, $pos)
{
    // check if the synset has already been cached
    $cacheKey = $pos.$synsetOffset;
    if (isset($this->synsets[$cacheKey])) {
        return $this->synsets[$cacheKey];
    }

    if (!isset($this->posFileMaps[$pos])) {
        throw new RuntimeException("That is not a correct wordnet part of speech {$pos}");
    }

    $fileName = "data.{$this->posFileMaps[$pos]}";
    if (!in_array($fileName, $this->getDataFileNames(), true)) {
        throw new RuntimeException("That is not a correct wordnet file {$fileName}");
    } elseif (!file_exists($this->getDir().$fileName)) {
        throw new RuntimeException("Wordnet file missing {$fileName}");
    }

    $fh = fopen($this->getDir().$fileName, 'r');
    if (!$fh) {
        throw new RuntimeException("Could not open wordnet file for reading {$fileName}");
    }
    if (fseek($fh, $synsetOffset) === -1) {
        fclose($fh);
        throw new RuntimeException("Could not seek to {$synsetOffset} in {$fileName}");
    }

    $line = fgets($fh);
    fclose($fh);
    // fgets() returns false when nothing can be read at the offset, e.g. past end of file
    if ($line === false) {
        throw new RuntimeException("Could not read synset at {$synsetOffset} in {$fileName}");
    }

    $synset = $this->getSynsetFromString(trim($line));
    // remember the result so the cache check above can succeed next time
    $this->synsets[$cacheKey] = $synset;
    return $synset;
}
| 199 | + |
/**
 * Parses one line of a WordNet data file into a Synset.
 *
 * Expected layout (WordNet data file format):
 *   synset_offset lex_filenum ss_type w_cnt word lex_id [word lex_id...]
 *   p_cnt [ptr_symbol synset_offset pos source/target...] ... | gloss
 *
 * w_cnt is a two-digit hexadecimal number. Each source/target field holds two
 * two-digit hexadecimal numbers: the word number in the source synset first,
 * the word number in the target synset second; "0000" yields 0 for both.
 *
 * @param string $line
 * @return Synset
 * @throws RuntimeException when the line has fewer fields than its counts announce
 */
public function getSynsetFromString($line)
{
    // everything after the first "|" is the gloss; the fields are in front of it
    $glossAt = strpos($line, '|');
    if ($glossAt === false) {
        $fieldPart = $line;
        $definition = '';
    } else {
        $fieldPart = substr($line, 0, $glossAt);
        $definition = trim(substr($line, $glossAt + 1));
    }

    $row = explode(' ', trim($fieldPart));
    if (count($row) < 4) {
        throw new RuntimeException("Malformed wordnet synset line: {$line}");
    }

    $synset = new Synset((int)$row[0], $row[2]);
    $synset->setDefinition($definition);

    // w_cnt is hexadecimal, so "0a" means ten words
    $wordCount = (int)hexdec($row[3]);
    // p_cnt follows directly after the word/lex_id pairs
    $pointerCountIdx = 4 + $wordCount * 2;
    if (!isset($row[$pointerCountIdx])) {
        throw new RuntimeException("Malformed wordnet synset line: {$line}");
    }

    for ($index = 0; $index < $wordCount; $index++) {
        $synset->addWord($row[4 + $index * 2], $row[5 + $index * 2]);
    }

    // each pointer occupies four fields
    $pointerCount = (int)$row[$pointerCountIdx];
    $pointerFields = array_slice($row, $pointerCountIdx + 1, $pointerCount * 4);
    if (count($pointerFields) < $pointerCount * 4) {
        throw new RuntimeException("Malformed wordnet synset line: {$line}");
    }

    foreach (array_chunk($pointerFields, 4) as $pointer) {
        list($symbol, $offset, $pos, $sourceTarget) = $pointer;

        $linkedSynset = new Synset($offset, $pos);
        $linkedSynset->setPtrSymbols([$symbol]);

        // set src and target word indexes: first two hex digits are the
        // source word number, last two the target word number
        $sourceTarget = str_pad($sourceTarget, 4, '0', STR_PAD_LEFT);
        $linkedSynset->setSrcWordIdx((int)hexdec(substr($sourceTarget, 0, 2)));
        $linkedSynset->setTargetWordIdx((int)hexdec(substr($sourceTarget, 2, 2)));

        $synset->addLinkedSynset($linkedSynset);
    }
    return $synset;
}
| 236 | + |
/**
 * Parses one space-separated line of an index file into a Lemma.
 *
 * Fields 0 and 1 are passed through unchanged, fields 2 and 3 are passed as
 * integers. Field 3 also says how many fields, starting at position 4, are
 * handed over as a list; field 2 says how many trailing fields are handed
 * over as a list of integers. In the WordNet index layout these appear to be
 * lemma, pos, synset_cnt and p_cnt, followed by the pointer symbols and
 * ending in the synset offsets.
 *
 * @param string $line
 * @return Lemma
 */
public function getLemmaFromString($line)
{
    $fields = str_getcsv(trim($line), " ");

    $trailingCount = (int)$fields[2];
    $leadingCount = (int)$fields[3];

    $leadingList = array_slice($fields, 4, $leadingCount);

    $trailingInts = [];
    foreach (array_slice($fields, count($fields) - $trailingCount) as $value) {
        $trailingInts[] = intval($value);
    }

    return new Lemma(
        $fields[0],
        $fields[1],
        $trailingCount,
        $leadingCount,
        $leadingList,
        $trailingInts
    );
}
| 247 | + |
/**
 * Returns the lemmas from all index files.
 *
 * The files are parsed on the first call and the result is cached. Lines
 * that repeat within one file are only parsed once. The cache is only written
 * once every file has been read, so a failure part-way through never leaves a
 * partial list behind for later calls.
 *
 * @return Lemma[] Returns an array of lemmas
 * @throws RuntimeException when an index file cannot be opened
 */
public function getLemmas()
{
    if (!empty($this->lemmas)) {
        return $this->lemmas;
    }

    $loaded = [];
    foreach ($this->getIndexFileNames() as $fileName) {
        // duplicate detection is per file
        $seenBefore = [];
        $fh = fopen($this->getDir().$fileName, 'r');
        if (!$fh) {
            throw new RuntimeException("wordnet file missing {$fileName}");
        }

        // compare against false explicitly so a line such as "0" does not end the loop
        while (($line = fgets($fh)) !== false) {
            $entry = trim($line);
            // lines starting with a space are skipped, as are blank lines
            if ($line[0] === ' ' || $entry === '') {
                continue;
            }

            // hash once and reuse it for both the lookup and the insert
            $hash = md5($entry);
            if (isset($seenBefore[$hash])) {
                continue;
            }
            $seenBefore[$hash] = 1;
            $loaded[] = $this->getLemmaFromString($entry);
        }
        fclose($fh);
    }

    $this->lemmas = $loaded;
    return $this->lemmas;
}
| 276 | +} |
0 commit comments