Skip to content

Commit f885272

Browse files
authored
Merge pull request #9 from yooper/wordnet
Wordnet
2 parents fb773ac + 097c922 commit f885272

File tree

11 files changed

+1293
-1
lines changed

11 files changed

+1293
-1
lines changed

src/Console/Commands/NltkPackageListCommand.php

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,14 @@ protected function execute(InputInterface $input, OutputInterface $output)
3535
$downloader = new NltkCorporaIndexDownloader();
3636
}
3737

38+
$packages = $downloader->getPackages();
39+
40+
usort($packages, function($package1, $package2){
41+
return strnatcasecmp($package1->getId(), $package2->getId());
42+
});
3843
/** @var $package \TextAnalysis\Utilities\Nltk\Download\Package */
3944
$output->writeln("Packages available for installation:");
40-
foreach($downloader->getPackages() as $package)
45+
foreach($packages as $package)
4146
{
4247
$output->writeln(" * {$package->getId()} - {$package->getName()}");
4348
}

src/Corpus/ReadCorpusAbstract.php

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
<?php
2+
3+
namespace TextAnalysis\Corpus;
4+
5+
6+
/**
7+
* Abstract class for making corpus readers
8+
*/
9+
abstract class ReadCorpusAbstract
{
    /**
     * Directory that holds the corpus files.
     *
     * WordnetCorpus appends file names to this value without inserting a
     * separator, so the value is expected to end with a directory separator.
     *
     * @var string
     */
    protected $dir;

    /**
     * Language the corpus is read in; 'eng' unless the constructor is told otherwise.
     *
     * @var string
     */
    protected $lang = 'eng';

    /**
     * Stores the corpus location and language for later use by the reader.
     *
     * @param string $dir the directory the corpus files are located
     * @param string $lang language to use, default is eng
     */
    public function __construct($dir, $lang = 'eng')
    {
        $this->lang = $lang;
        $this->dir = $dir;
    }

    /**
     * Lists the files a concrete reader needs in order to use its corpus.
     * Implementations should return paths relative to the corpus directory.
     *
     * @return string[]
     */
    abstract public function getFileNames();

    /**
     * @return string the directory the corpus files are located
     */
    public function getDir()
    {
        return $this->dir;
    }

    /**
     * @return string language given to the constructor, 'eng' by default
     */
    public function getLanguage()
    {
        return $this->lang;
    }
}
62+
63+

src/Corpus/WordnetCorpus.php

Lines changed: 276 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
1+
<?php
2+
3+
namespace TextAnalysis\Corpus;
4+
5+
use RuntimeException;
6+
use TextAnalysis\Models\Wordnet\Lemma;
7+
use TextAnalysis\Models\Wordnet\Synset;
8+
use TextAnalysis\Models\Wordnet\ExceptionMap;
9+
10+
/**
11+
* Loads the WordNet corpus for use. Borrowed heavily from NLTK.
12+
* @author yooper
13+
*/
14+
class WordnetCorpus extends ReadCorpusAbstract
{
    // Part-of-speech codes. Each value is a key of $posFileMaps:
    // 'a' selects the adjective files and 'r' selects the adverb files.
    const VERB = 'v';
    const ADJECTIVE = 'a';
    const ADVERB = 'r';
    const NOUN = 'n';

    /**
     * Lex names, filled lazily by getLexNames()
     * @var string[]
     */
    protected $lexNames = [];

    /**
     * Lemmas, filled lazily by getLemmas()
     * @var Lemma[]
     */
    protected $lemmas = [];

    /**
     * Synsets already read by getSynsetByOffsetAndPos().
     * Indexing is pos with offset concatenated
     * @var Synset[]
     */
    protected $synsets = [];

    /**
     * Exception maps, filled lazily by getExceptionsMap()
     * @var ExceptionMap[]
     */
    protected $exceptionsMap = [];

    /**
     * @var array map part of speech character to the name used in the corpus file names
     */
    protected $posFileMaps = [
        'a' => 'adj',
        'r' => 'adv',
        'n' => 'noun',
        'v' => 'verb'
    ];

    /**
     * Returns the list of required files that is provided by the nltk project
     * @return string[]
     */
    public function getFileNames()
    {
        return [
            'cntlist.rev', 'lexnames', 'index.sense','index.adj', 'index.adv',
            'index.noun', 'index.verb','data.adj', 'data.adv', 'data.noun',
            'data.verb', 'adj.exc', 'adv.exc', 'noun.exc', 'verb.exc'
        ];
    }

    /**
     * Returns array of file names with the exceptions
     * @return array
     */
    public function getExceptionFileNames()
    {
        return ['adj.exc', 'adv.exc', 'noun.exc', 'verb.exc'];
    }

    /**
     * Builds an ExceptionMap from one space-delimited line of an exception file.
     * The last token is passed as the second constructor argument and all
     * preceding tokens as the third.
     *
     * @param string $line a line with surrounding whitespace already removed
     * @param string $pos part of speech character the line belongs to
     * @return ExceptionMap
     */
    public function getExceptionMapFromString($line, $pos)
    {
        $tokens = explode(" ", $line);
        $lastToken = array_pop($tokens);
        return new ExceptionMap($pos, $lastToken, $tokens);
    }

    /**
     * Returns the list of exception spellings, reading the exception files
     * on the first call and reusing the result afterwards.
     *
     * @return ExceptionMap[]
     * @throws RuntimeException when an exception file cannot be opened
     */
    public function getExceptionsMap()
    {
        if(empty($this->exceptionsMap)) {
            $fileExtToPos = array_flip($this->getPosFileMaps());

            // Collect into a local array so that a failure part-way through
            // never leaves a partially filled cache behind.
            $loaded = [];
            foreach($this->getExceptionFileNames() as $fileName)
            {
                // strip the ".exc" suffix: "noun.exc" -> "noun" -> "n"
                $pos = $fileExtToPos[substr($fileName, 0, -4)];
                $fh = $this->openCorpusFile($fileName);

                // compare against false explicitly so a falsy line such as "0" does not end the loop
                while(($line = fgets($fh)) !== false)
                {
                    $record = trim($line);
                    // skip blank lines and lines that start with a space
                    // (the license header of the WordNet files is indented this way)
                    if($record === '' || $line[0] === ' ') {
                        continue;
                    }
                    $loaded[] = $this->getExceptionMapFromString($record, $pos);
                }
                fclose($fh);
            }
            $this->exceptionsMap = $loaded;
        }
        return $this->exceptionsMap;
    }

    /**
     * Returns the second tab separated column of every line of the lexnames
     * file. The file is read on the first call only.
     *
     * @return string[]
     * @throws RuntimeException when the lexnames file cannot be read
     */
    public function getLexNames()
    {
        if(empty($this->lexNames)) {
            $rows = file($this->getDir().'lexnames');
            if($rows === false) {
                throw new RuntimeException("wordnet file missing lexnames");
            }

            $names = [];
            foreach($rows as $row)
            {
                $columns = explode("\t", trim($row));
                // rows without a second column (e.g. blank lines) carry no name
                if(isset($columns[1])) {
                    $names[] = $columns[1];
                }
            }
            $this->lexNames = $names;
        }
        return $this->lexNames;
    }

    /**
     * @return array part of speech character => name used in the corpus file names
     */
    public function getPosFileMaps()
    {
        return $this->posFileMaps;
    }

    /**
     * @return string[] names of the synset data files
     */
    public function getDataFileNames()
    {
        return ['data.adj', 'data.adv','data.noun', 'data.verb'];
    }

    /**
     * @return string[] names of the lemma index files
     */
    public function getIndexFileNames()
    {
        return ['index.adj', 'index.adv','index.noun', 'index.verb'];
    }

    /**
     * Opens the raw data file for the part of speech, seeks to the byte
     * offset and parses the line found there. Parsed synsets are kept in
     * $synsets so each one is read from disk only once.
     *
     * @param int $synsetOffset byte offset of the synset line in the data file
     * @param string $pos Part of speech
     * @return Synset
     * @throws RuntimeException when the file is unknown, missing, unreadable,
     * or no line can be read at the offset
     */
    public function getSynsetByOffsetAndPos($synsetOffset, $pos)
    {
        // check if the synset has already been cached
        $cacheKey = $pos.$synsetOffset;
        if(isset($this->synsets[$cacheKey])) {
            return $this->synsets[$cacheKey];
        }

        // an unknown part of speech yields "data.", which the check below rejects
        $fileName = 'data.'.(isset($this->posFileMaps[$pos]) ? $this->posFileMaps[$pos] : '');
        if(!in_array($fileName, $this->getDataFileNames(), true)) {
            throw new RuntimeException("That is not a correct wordnet file {$fileName}");
        } elseif(!file_exists($this->getDir().$fileName)) {
            throw new RuntimeException("Wordnet file missing {$fileName}");
        }

        $fh = fopen($this->getDir().$fileName,'r');
        if(!$fh) {
            throw new RuntimeException("Could not open wordnet file for reading {$fileName}");
        }

        // read first, close the handle, then report errors so the handle never leaks
        $seekFailed = fseek($fh, $synsetOffset) === -1;
        $line = $seekFailed ? false : fgets($fh);
        fclose($fh);

        if($seekFailed) {
            throw new RuntimeException("Could not seek to {$synsetOffset} in {$fileName}");
        }
        // seeking past the end of the file succeeds, the read is what fails
        if($line === false) {
            throw new RuntimeException("Could not read a synset at {$synsetOffset} in {$fileName}");
        }

        $synset = $this->getSynsetFromString(trim($line));
        $this->synsets[$cacheKey] = $synset;
        return $synset;
    }

    /**
     * Parse the line from the synset file and turn it into a synset object.
     *
     * Expected layout of the space delimited fields in front of the "|":
     * offset, lex file number, part of speech, word count (hexadecimal),
     * word/lex id pairs, pointer count (decimal), then four fields per
     * pointer: symbol, offset, part of speech, source/target. Everything
     * after the "|" is stored as the definition.
     *
     * @param string $line
     * @return Synset
     * @throws RuntimeException when the line does not follow that layout
     */
    public function getSynsetFromString($line)
    {
        // Split off the gloss first; only the part in front of it is field data.
        $glossPos = strpos($line, '|');
        if($glossPos === false) {
            $fields = $line;
            $definition = '';
        } else {
            $fields = substr($line, 0, $glossPos);
            $definition = trim(substr($line, $glossPos + 1));
        }

        $row = explode(' ', trim($fields));
        if(count($row) < 4 || !ctype_xdigit($row[3])) {
            throw new RuntimeException("Malformed wordnet synset line: {$line}");
        }

        // the word count is stored as a hexadecimal number
        $wordCount = (int)hexdec($row[3]);
        $pointerCountIdx = 4 + $wordCount * 2;
        if(!isset($row[$pointerCountIdx])) {
            throw new RuntimeException("Malformed wordnet synset line: {$line}");
        }

        $pointerCount = (int)$row[$pointerCountIdx];
        $pointerFields = array_slice($row, $pointerCountIdx + 1, $pointerCount * 4);
        if(count($pointerFields) !== $pointerCount * 4) {
            throw new RuntimeException("Malformed wordnet synset line: {$line}");
        }

        $synset = new Synset((int)$row[0], $row[2]);
        $synset->setDefinition($definition);

        for($index = 0; $index < $wordCount; $index++)
        {
            $synset->addWord($row[4 + $index*2], $row[5 + $index*2]);
        }

        for($index = 0; $index < $pointerCount * 4; $index+=4)
        {
            $linkedSynset = new Synset((int)$pointerFields[$index+1], $pointerFields[$index+2]);
            $linkedSynset->setPtrSymbols([$pointerFields[$index]]);

            // source/target holds two two-digit hexadecimal word numbers:
            // the first pair is the source word, the second pair the target
            // word. "0000" yields 0 for both.
            $sourceTarget = $pointerFields[$index+3];
            $linkedSynset->setSrcWordIdx((int)hexdec(substr($sourceTarget, 0, 2)));
            $linkedSynset->setTargetWordIdx((int)hexdec(substr($sourceTarget, 2, 2)));

            $synset->addLinkedSynset($linkedSynset);
        }
        return $synset;
    }

    /**
     * Parses one space delimited line of an index file into a Lemma.
     * Field 0 and 1 are passed through, field 2 is the synset count, field 3
     * the pointer count, followed by that many pointer symbols; the last
     * "synset count" fields of the line are the synset offsets.
     *
     * @param string $line
     * @return Lemma
     * @throws RuntimeException when the line has fewer than four fields
     */
    public function getLemmaFromString($line)
    {
        $row = explode(' ', trim($line));
        if(count($row) < 4) {
            throw new RuntimeException("Malformed wordnet index line: {$line}");
        }

        $synsetCount = (int)$row[2];
        $pointerCount = (int)$row[3];
        $pointerSymbols = array_slice($row, 4, $pointerCount);
        $synsetOffsets = array_map('intval', array_slice($row, count($row) - $synsetCount));

        return new Lemma($row[0], $row[1], $synsetCount, $pointerCount, $pointerSymbols, $synsetOffsets);
    }

    /**
     * Reads every index file on the first call and returns one Lemma per
     * distinct line; later calls reuse the result.
     *
     * @return Lemma[] Returns an array of lemmas
     * @throws RuntimeException when an index file cannot be opened
     */
    public function getLemmas()
    {
        if(empty($this->lemmas)) {
            // Collect into a local array so that a failure part-way through
            // never leaves a partially filled cache behind.
            $loaded = [];
            foreach($this->getIndexFileNames() as $fileName)
            {
                // duplicates are tracked per file, keyed by a hash of the trimmed line
                $seenBefore = [];
                $fh = $this->openCorpusFile($fileName);

                // compare against false explicitly so a falsy line such as "0" does not end the loop
                while(($line = fgets($fh)) !== false)
                {
                    $record = trim($line);
                    // skip blank lines and lines that start with a space
                    // (the license header of the WordNet files is indented this way)
                    if($record === '' || $line[0] === ' ') {
                        continue;
                    }

                    $hash = md5($record);
                    if(isset($seenBefore[$hash])) {
                        continue;
                    }
                    $seenBefore[$hash] = 1;
                    $loaded[] = $this->getLemmaFromString($record);
                }
                fclose($fh);
            }
            $this->lemmas = $loaded;
        }
        return $this->lemmas;
    }

    /**
     * Opens a file from the corpus directory for reading.
     *
     * @param string $fileName file name relative to the corpus directory
     * @return resource
     * @throws RuntimeException when the file cannot be opened
     */
    private function openCorpusFile($fileName)
    {
        $fh = fopen($this->getDir().$fileName,'r');
        if(!$fh) {
            throw new RuntimeException("wordnet file missing {$fileName}");
        }
        return $fh;
    }
}

0 commit comments

Comments
 (0)