Skip to content

Commit 1450466

Browse files
committed
implementing support for wordnet
1 parent cc1e840 commit 1450466

File tree

10 files changed

+1014
-1
lines changed

10 files changed

+1014
-1
lines changed

src/Console/Commands/NltkPackageListCommand.php

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,14 @@ protected function execute(InputInterface $input, OutputInterface $output)
3535
$downloader = new NltkCorporaIndexDownloader();
3636
}
3737

38+
$packages = $downloader->getPackages();
39+
40+
usort($packages, function($package1, $package2){
41+
return strnatcasecmp($package1->getId(), $package2->getId());
42+
});
3843
/** @var $package \TextAnalysis\Utilities\Nltk\Download\Package */
3944
$output->writeln("Packages available for installation:");
40-
foreach($downloader->getPackages() as $package)
45+
foreach($packages as $package)
4146
{
4247
$output->writeln(" * {$package->getId()} - {$package->getName()}");
4348
}

src/Corpus/ReadCorpusAbstract.php

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
<?php
2+
3+
namespace TextAnalysis\Corpus;
4+
5+
6+
/**
 * Base class for corpus readers.
 *
 * Holds the two pieces of state every reader shares (where the corpus files
 * live and which language is wanted) and leaves the list of required files
 * to the concrete reader.
 */
abstract class ReadCorpusAbstract
{
    /**
     * @var string language to use, 'eng' unless another one is supplied
     */
    protected $lang = 'eng';

    /**
     * @var string directory that holds the corpus files; stored exactly as
     * given, no directory separator is appended
     */
    protected $dir;

    /**
     * @param string $dir directory that holds the corpus files
     * @param string $lang language to use, default is eng
     */
    public function __construct($dir, $lang = 'eng')
    {
        $this->lang = $lang;
        $this->dir = $dir;
    }

    /**
     * @return string the directory that was passed to the constructor
     */
    public function getDir()
    {
        return $this->dir;
    }

    /**
     * @return string the language that was passed to the constructor, or eng
     */
    public function getLanguage()
    {
        return $this->lang;
    }

    /**
     * List the files that must be present before the corpus can be used.
     *
     * @return string[] file names, expressed as relative paths
     */
    abstract public function getFileNames();
}
62+
63+

src/Corpus/WordnetCorpus.php

Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
<?php
2+
3+
namespace TextAnalysis\Corpus;
4+
5+
use RuntimeException;
6+
use TextAnalysis\Models\Wordnet\Lemma;
7+
use TextAnalysis\Models\Wordnet\Synset;
8+
9+
/**
 * Loads the wordnet corpus for use. Borrowed heavily from nltk.
 *
 * File names are appended directly to getDir(), so the directory handed to
 * the constructor must already end with a directory separator.
 *
 * @author yooper
 */
class WordnetCorpus extends ReadCorpusAbstract
{
    // Part of speech codes; the values match the keys of $posFileMaps
    const VERB = 'v';
    const ADJECTIVE = 'a';
    const ADVERB = 'r';
    const NOUN = 'n';

    /**
     *
     * @var array stores all the lex names
     */
    protected $lexNames = [];

    /**
     *
     * @var Lemma[]
     */
    protected $lemmas = [];

    /**
     * An array with indexing. Indexing is pos with offset concatenated
     * @var Synset[]
     */
    protected $synsets = [];

    /**
     * @var array map part of speech character to the suffix of its data/index file
     */
    protected $posFileMaps = [
        'a' => 'adj',
        'r' => 'adv',
        'n' => 'noun',
        'v' => 'verb'
    ];

    /**
     * Returns the list of required files that is provided by the nltk project
     * @return string[]
     */
    public function getFileNames()
    {
        return [
            'cntlist.rev', 'lexnames', 'index.sense','index.adj', 'index.adv',
            'index.noun', 'index.verb','data.adj', 'data.adv', 'data.noun',
            'data.verb', 'adj.exc', 'adv.exc', 'noun.exc', 'verb.exc'
        ];
    }

    /**
     * Read the second tab separated column of every row of the lexnames file.
     * The result is cached after the first successful read.
     *
     * @return string[]
     * @throws RuntimeException when the lexnames file cannot be read
     */
    public function getLexNames()
    {
        if(empty($this->lexNames)) {
            $path = $this->getDir().'lexnames';
            $rows = is_file($path) ? file($path) : false;
            if($rows === false) {
                throw new RuntimeException("Wordnet file missing lexnames");
            }

            $this->lexNames = array_map(
                function($row) {
                    $columns = explode("\t", trim($row));
                    // a row without a second column yields null instead of a notice
                    return isset($columns[1]) ? $columns[1] : null;
                },
                $rows
            );
        }
        return $this->lexNames;
    }

    /**
     *
     * @return array part of speech character => file suffix
     */
    public function getPosFileMaps()
    {
        return $this->posFileMaps;
    }

    /**
     *
     * @return string[]
     */
    public function getDataFileNames()
    {
        return ['data.adj', 'data.adv','data.noun', 'data.verb'];
    }

    /**
     *
     * @return string[]
     */
    public function getIndexFileNames()
    {
        return ['index.adj', 'index.adv','index.noun', 'index.verb'];
    }

    /**
     * Opens the raw data file, seeks to the given byte offset and parses the
     * line found there into a synset. Parsed synsets are cached per pos and
     * offset, so the same Synset instance is returned on repeated calls.
     *
     * @param int $synsetOffset byte offset of the synset line inside the data file
     * @param string $pos Part of speech, one of the keys of getPosFileMaps()
     * @return Synset
     * @throws RuntimeException when the pos is unknown, the file is missing or
     * unreadable, or no line can be read at the offset
     */
    public function getSynsetByOffsetAndPos($synsetOffset, $pos)
    {
        // check if the synset has already been cached
        $cacheKey = $pos.$synsetOffset;
        if(isset($this->synsets[$cacheKey])) {
            return $this->synsets[$cacheKey];
        }

        // reject unknown parts of speech before they are used as an array key
        if(!isset($this->posFileMaps[$pos])) {
            throw new RuntimeException("That is not a correct wordnet part of speech {$pos}");
        }

        $fileName = "data.{$this->posFileMaps[$pos]}";
        $path = $this->getDir().$fileName;
        if(!in_array($fileName, $this->getDataFileNames(), true)) {
            throw new RuntimeException("That is not a correct wordnet file {$fileName}");
        } elseif(!file_exists($path)) {
            throw new RuntimeException("Wordnet file missing {$fileName}");
        }

        $fh = fopen($path, 'r');
        if(!$fh) {
            throw new RuntimeException("Could not open wordnet file for reading {$fileName}");
        }
        if(fseek($fh, $synsetOffset) === -1) {
            fclose($fh);
            throw new RuntimeException("Could not seek to {$synsetOffset} in {$fileName}");
        }

        $line = fgets($fh);
        fclose($fh);

        // seeking past the end of the file succeeds, the read is what fails
        if($line === false || trim($line) === '') {
            throw new RuntimeException("Could not read a synset at {$synsetOffset} in {$fileName}");
        }

        $synset = $this->getSynsetFromString(trim($line));
        $this->synsets[$cacheKey] = $synset;
        return $synset;
    }

    /**
     * Parse the line from the synset file and turn it into a synset object.
     *
     * Expected layout (wndb data file format):
     * synset_offset lex_filenum ss_type w_cnt word lex_id [word lex_id...]
     * p_cnt [ptr_symbol synset_offset pos source/target...] [frames...] | gloss
     *
     * @param string $line
     * @return Synset
     * @throws RuntimeException when the line has fewer than four fields
     */
    public function getSynsetFromString($line)
    {
        $row = str_getcsv($line," ");
        if(count($row) < 4) {
            throw new RuntimeException("Could not parse wordnet synset line");
        }

        $synset = new Synset((int)$row[0], $row[2]);

        // the gloss is everything after the first pipe
        $glossIdx = strpos($line, "|");
        $synset->setDefinition($glossIdx === false ? '' : trim(substr($line, $glossIdx + 1)));

        // w_cnt is a two digit hexadecimal number
        $wordCount = hexdec($row[3]);
        for($index = 0; $index < $wordCount; $index++)
        {
            $synset->addWord($row[4 + $index*2], $row[5 + $index*2]);
        }

        // p_cnt is a three digit decimal number, it follows the word/lex_id pairs
        $ptrCountIdx = 4 + $wordCount * 2;
        $ptrCount = isset($row[$ptrCountIdx]) ? (int)$row[$ptrCountIdx] : 0;

        // every pointer takes four fields: symbol, offset, pos, source/target
        $pointers = array_slice($row, $ptrCountIdx + 1, $ptrCount * 4);
        $pointerFields = count($pointers);
        for($index = 0; $index + 3 < $pointerFields; $index += 4)
        {
            $linkedSynset = new Synset($pointers[$index+1], $pointers[$index+2]);
            $linkedSynset->setPtrSymbols([$pointers[$index]]);

            // source/target is four hexadecimal digits, the first two are the
            // word number in this synset, the last two the word number in the
            // target synset; 0000 (both zero) marks a semantic pointer
            $sourceTarget = str_pad($pointers[$index+3], 4, '0', STR_PAD_LEFT);
            $linkedSynset->setSrcWordIdx(hexdec(substr($sourceTarget, 0, 2)));
            $linkedSynset->setTargetWordIdx(hexdec(substr($sourceTarget, 2, 2)));

            $synset->addLinkedSynset($linkedSynset);
        }
        return $synset;
    }

    /**
     * Parse a line of an index file into a lemma.
     *
     * Field 0 is the word, field 1 the pos, field 2 the number of synsets and
     * field 3 the number of pointer symbols. The pointer symbols start at
     * field 4 and the synset offsets are the last "number of synsets" fields.
     *
     * @param string $line
     * @return Lemma
     */
    public function getLemmaFromString($line)
    {
        $row = str_getcsv(trim($line)," ");
        $synsetCount = (int)$row[2];
        $ptrSymbolCount = (int)$row[3];

        $ptrSymbols = array_slice($row, 4, $ptrSymbolCount);
        $synsetOffsets = array_map('intval', array_slice($row, count($row) - $synsetCount));

        return new Lemma($row[0], $row[1], $synsetCount, $ptrSymbolCount, $ptrSymbols, $synsetOffsets);
    }

    /**
     * Read every index file and parse each entry into a lemma. Lines starting
     * with a space, blank lines and lines repeated inside the same file are
     * skipped. The result is cached once all files were read successfully.
     *
     * @return Lemma[] Returns an array of lemmas
     * @throws RuntimeException when an index file is missing or unreadable
     */
    public function getLemmas()
    {
        if(empty($this->lemmas)) {
            // collect locally so a failing file does not leave a partial cache
            $lemmas = [];
            foreach($this->getIndexFileNames() as $fileName )
            {
                $path = $this->getDir().$fileName;
                $fh = is_file($path) ? fopen($path, 'r') : false;
                if(!$fh) {
                    throw new RuntimeException("wordnet file missing {$fileName}");
                }

                $seenBefore = [];
                // compare against false, a line holding only "0" is falsy
                while(($line = fgets($fh)) !== false)
                {
                    $entry = trim($line);
                    if($line[0] === ' ' || $entry === '') {
                        continue;
                    }

                    $hash = md5($entry);
                    if(isset($seenBefore[$hash])) {
                        continue;
                    }
                    $seenBefore[$hash] = 1;
                    $lemmas[] = $this->getLemmaFromString($line);
                }
                fclose($fh);
            }
            $this->lemmas = $lemmas;
        }
        return $this->lemmas;
    }
}

src/Indexes/WordnetIndex.php

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
<?php
2+
3+
namespace TextAnalysis\Indexes;
4+
5+
use TextAnalysis\Corpus\WordnetCorpus;
6+
use TextAnalysis\Models\Wordnet\Synset;
7+
use TextAnalysis\Models\Wordnet\Lemma;
8+
9+
/**
 * WordnetIndex is a facade for accessing the wordnet data
 * @author yooper
 */
class WordnetIndex
{
    /**
     *
     * @var WordnetCorpus
     */
    protected $wordnetCorpus = null;

    /**
     * Lemmas keyed by "word.pos"
     * @var array
     */
    protected $lemmasIdx = [];


    /**
     *
     * @param WordnetCorpus $wordnetCorpus
     */
    public function __construct(WordnetCorpus $wordnetCorpus)
    {
        $this->wordnetCorpus = $wordnetCorpus;
    }

    /**
     * @return WordnetCorpus
     */
    public function getWordnetCorpus()
    {
        return $this->wordnetCorpus;
    }

    /**
     * Return the lemmas that are linked to the given word, provide a pos to
     * filter down the results. The synsets of every returned lemma are loaded
     * from the corpus the first time the lemma is returned.
     *
     * NOTE: when no lemma exists for the exact word/pos pair, every part of
     * speech known to the corpus is tried, so the result may hold lemmas
     * whose pos differs from the requested one.
     *
     * @param string $word
     * @param string $pos
     * @return \TextAnalysis\Models\Wordnet\Lemma[]
     */
    public function getLemma($word, $pos = '')
    {
        if(empty($this->lemmasIdx)) {
            // iterate by value: the lemmas are objects, and a by-reference
            // loop variable would stay bound after the loop and be reused below.
            // The keys are not sorted, array lookups are hash based.
            foreach($this->getWordnetCorpus()->getLemmas() as $lemma) {
                $this->lemmasIdx["{$lemma->getWord()}.{$lemma->getPos()}"] = $lemma;
            }
        }

        $found = [];

        // found 1
        if(isset($this->lemmasIdx["{$word}.{$pos}"])) {
            $found[] = $this->lemmasIdx["{$word}.{$pos}"];
        } else {
            foreach($this->getWordnetCorpus()->getPosFileMaps() as $key => $value)
            {
                if(isset($this->lemmasIdx["{$word}.{$key}"])) {
                    $found[] = $this->lemmasIdx["{$word}.{$key}"];
                }
            }
        }

        //attach the synsets for the lemmas
        foreach($found as $foundLemma)
        {
            if(empty($foundLemma->getSynsets())) {
                $synsets = [];
                foreach($foundLemma->getSynsetOffsets() as $fileOffset)
                {
                    $synsets[] = $this->getWordnetCorpus()->getSynsetByOffsetAndPos((int)$fileOffset, $foundLemma->getPos());
                }
                $foundLemma->setSynsets($synsets);
            }
        }
        return $found;
    }

    /**
     * Drop the corpus and the lemma index when the index is destroyed
     */
    public function __destruct()
    {
        unset($this->wordnetCorpus);
        unset($this->lemmasIdx);
    }
}

0 commit comments

Comments
 (0)