Skip to content

Commit 655acaf

Browse files
committed
adds in basic support for wordnet and getting base words using morph
1 parent 1450466 commit 655acaf

File tree

5 files changed

+281
-2
lines changed

5 files changed

+281
-2
lines changed

src/Corpus/WordnetCorpus.php

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
use RuntimeException;
66
use TextAnalysis\Models\Wordnet\Lemma;
77
use TextAnalysis\Models\Wordnet\Synset;
8+
use TextAnalysis\Models\Wordnet\ExceptionMap;
89

910
/**
1011
* Loads the wordnet corpus for use. Borrowed heavily from nltk
@@ -35,6 +36,12 @@ class WordnetCorpus extends ReadCorpusAbstract
3536
*/
3637
protected $synsets = [];
3738

39+
/**
40+
* @var ExceptionMap[]
41+
*/
42+
protected $exceptionsMap = [];
43+
44+
3845
/**
3946
* @var array map part of speech character to its definition
4047
*/
@@ -58,6 +65,59 @@ public function getFileNames()
5865
];
5966
}
6067

68+
/**
 * Lists the WordNet exception ("*.exc") files known to this corpus.
 *
 * @return string[] bare file names; getExceptionsMap() prefixes them with getDir()
 */
public function getExceptionFileNames()
{
    $exceptionFiles = [
        'adj.exc',
        'adv.exc',
        'noun.exc',
        'verb.exc',
    ];

    return $exceptionFiles;
}
76+
77+
/**
 * Builds an ExceptionMap from a single line of a WordNet exception file.
 *
 * The line is split on whitespace. The last token becomes the target and
 * every token before it is placed in the exception list. Runs of spaces or
 * tabs count as one separator, so they never produce empty tokens.
 *
 * NOTE(review): WordNet's exception files appear to list the inflected form
 * first, followed by one or more base forms. For lines with several base
 * forms this puts the earlier base forms into the exception list - confirm
 * that this mapping is intended.
 *
 * @param string $line one line of an exception file
 * @param string $pos part of speech the line belongs to
 * @return ExceptionMap
 */
public function getExceptionMapFromString($line, $pos)
{
    $tokens = preg_split('/\s+/', $line, -1, PREG_SPLIT_NO_EMPTY);
    if ($tokens === false) {
        $tokens = [];
    }

    // array_pop() yields null for an empty line; keep the target a string.
    $target = array_pop($tokens);

    return new ExceptionMap($pos, $target === null ? '' : $target, $tokens);
}
88+
89+
/**
 * Returns the exception mappings read from every exception file.
 *
 * The files are read on the first call and the result is cached in
 * $this->exceptionsMap; later calls return the cached list.
 *
 * @return ExceptionMap[]
 * @throws RuntimeException if an exception file cannot be opened
 */
public function getExceptionsMap()
{
    if (!empty($this->exceptionsMap)) {
        return $this->exceptionsMap;
    }

    $fileExtToPos = array_flip($this->getPosFileMaps());

    // Collect into a local list first so that a failure part-way through
    // never leaves a partially filled cache behind.
    $loaded = [];

    foreach ($this->getExceptionFileNames() as $fileName) {
        // strip the ".exc" extension to get the key used by getPosFileMaps()
        $pos = $fileExtToPos[substr($fileName, 0, -4)];
        $fh = fopen($this->getDir().$fileName, 'r');
        if (!$fh) {
            throw new RuntimeException("wordnet file missing {$fileName}");
        }

        // Compare against false explicitly: a final line of "0" without a
        // newline is falsy and would otherwise end the loop early.
        while (($line = fgets($fh)) !== false) {
            // lines that start with a space are skipped (presumably header lines)
            if ($line[0] === ' ') {
                continue;
            }

            $line = trim($line);
            // a blank line carries no mapping
            if ($line === '') {
                continue;
            }

            $loaded[] = $this->getExceptionMapFromString($line, $pos);
        }
        fclose($fh);
    }

    $this->exceptionsMap = $loaded;

    return $this->exceptionsMap;
}
120+
61121

62122
/**
63123
* @return string[]

src/Indexes/WordnetIndex.php

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
use TextAnalysis\Corpus\WordnetCorpus;
66
use TextAnalysis\Models\Wordnet\Synset;
77
use TextAnalysis\Models\Wordnet\Lemma;
8+
use TextAnalysis\Models\Wordnet\ExceptionMap;
9+
use TextAnalysis\Utilities\Text;
810

911
/*
1012
* WordnetIndex is a facade for accessing the wordnet data
@@ -88,6 +90,107 @@ public function getLemma($word, $pos = '')
8890
return $found;
8991
}
9092

93+
/**
 * Concept taken from nltk.
 * Finds a possible base form for the given word by first checking the
 * WordNet exception lists and then stripping known suffixes until a
 * form with a lemma in WordNet is found.
 *
 * When $pos is given, both the exception lookup and the suffix rules are
 * restricted to that part of speech.
 *
 * @todo improve the algorithm, it is really slow
 * @param string $word
 * @param string|null $pos part of speech; empty means "any"
 * @return string the base word, or "" when the word is shorter than three
 *                characters or no base form is found
 */
public function getMorph($word, $pos = '')
{
    if (mb_strlen($word) < 3) {
        return "";
    }

    $restrictToPos = !empty($pos);

    // 1. Irregular forms: return the target of the first matching exception entry.
    foreach ($this->getWordnetCorpus()->getExceptionsMap() as $exceptionMap) {
        if ($restrictToPos && $exceptionMap->getPos() !== $pos) {
            continue;
        }
        if (in_array($word, $exceptionMap->getExceptionList(), true)) {
            return $exceptionMap->getTarget();
        }
    }

    // 2. Regular forms: try each suffix rule in order and keep the first
    //    candidate that has a lemma.
    foreach ($this->getMorphologicalRules() as $rulePos => $rules) {
        if ($restrictToPos && (string) $rulePos !== (string) $pos) {
            continue;
        }

        foreach ($rules as $rule) {
            list($suffix, $replacement) = $rule;
            if (!Text::endsWith($word, $suffix)) {
                continue;
            }

            $candidate = substr($word, 0, -strlen($suffix)).$replacement;
            $lemmas = $this->getLemma($candidate, $rulePos);
            if (!empty($lemmas)) {
                // reset() picks the first lemma whatever the array keys are
                return reset($lemmas)->getWord();
            }
        }
    }

    return "";
}

/**
 * Returns the suffix substitutions as a map, keyed by part of speech.
 *
 * Kept for backward compatibility. A map can hold only one replacement per
 * suffix, so for suffixes that have several rules only the last rule is
 * present here. getMorph() uses the complete ordered list from
 * getMorphologicalRules() instead.
 *
 * @return array part of speech => [suffix => replacement]
 */
public function getMorphilogicalSubstitutions()
{
    $map = [];
    foreach ($this->getMorphologicalRules() as $rulePos => $rules) {
        $map[$rulePos] = [];
        foreach ($rules as $rule) {
            $map[$rulePos][$rule[0]] = $rule[1];
        }
    }

    return $map;
}

/**
 * Returns the ordered suffix rules for each part of speech.
 *
 * Each rule is a [suffix, replacement] pair. A list is used instead of a
 * map because the same suffix can have more than one replacement
 * (e.g. "ing" => "e" and "ing" => ""), which duplicate array keys cannot express.
 *
 * @return array part of speech => list of [suffix, replacement] pairs
 */
protected function getMorphologicalRules()
{
    return [
        WordnetCorpus::NOUN => [
            ['s', ''],
            ['ses', 's'],
            ['ves', 'f'],
            ['xes', 'x'],
            ['zes', 'z'],
            ['ches', 'ch'],
            ['shes', 'sh'],
            ['men', 'man'],
            ['ies', 'y'],
        ],
        WordnetCorpus::VERB => [
            ['s', ''],
            ['ies', 'y'],
            ['es', 'e'],
            ['es', ''],
            ['ed', 'e'],
            ['ed', ''],
            ['ing', 'e'],
            ['ing', ''],
        ],
        WordnetCorpus::ADJECTIVE => [
            ['er', ''],
            ['est', ''],
            ['er', 'e'],
            ['est', 'e'],
        ],
    ];
}
192+
193+
91194
public function __destruct()
92195
{
93196
unset($this->wordnetCorpus);
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
<?php
2+
3+
namespace TextAnalysis\Models\Wordnet;
4+
5+
/**
 * Value object for one entry of a WordNet exception file: a part of speech,
 * a list of exception words and the target word they map to.
 * @author dcardin
 */
class ExceptionMap
{
    /**
     * @var string part of speech this entry belongs to
     */
    protected $pos;

    /**
     * @var string[] exception words to look for
     */
    protected $exceptionList = [];

    /**
     * @var string the word every entry of the exception list maps to
     */
    protected $target;

    /**
     * @param string $pos part of speech
     * @param string $target word the exception words map to
     * @param array $exceptionList exception words
     */
    public function __construct($pos, $target, array $exceptionList)
    {
        $this->exceptionList = $exceptionList;
        $this->target = $target;
        $this->pos = $pos;
    }

    /**
     * @return string the part of speech given to the constructor
     */
    public function getPos()
    {
        return $this->pos;
    }

    /**
     * @return string the target word given to the constructor
     */
    public function getTarget()
    {
        return $this->target;
    }

    /**
     * @return string[] the exception words given to the constructor
     */
    public function getExceptionList()
    {
        return $this->exceptionList;
    }
}

tests/TextAnalysis/Corpus/WordnetCorpusTest.php

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,4 +83,20 @@ public function testGetSynsets()
8383
$this->assertCount(4, $synset->getLinkedSynsets());
8484
}
8585

86+
public function testGetExceptionMapFromString()
{
    $corpus = new WordnetCorpus(get_storage_path('corpora/wordnet'));

    // Two tokens: one exception word followed by its target.
    $single = $corpus->getExceptionMapFromString('thieves thief', 'n');
    $singleList = $single->getExceptionList();
    $this->assertCount(1, $singleList);
    $this->assertEquals('thief', $single->getTarget());
    $this->assertEquals('thieves', $singleList[0]);

    // Three tokens: the last one is the target, the rest are exception words.
    $multi = $corpus->getExceptionMapFromString('ploughmen ploughman plowman', 'n');
    $multiList = $multi->getExceptionList();
    $this->assertCount(2, $multiList);
    $this->assertEquals('plowman', $multi->getTarget());
    $this->assertEquals(['ploughmen', 'ploughman'], $multiList);
}
101+
86102
}

tests/TextAnalysis/Indexes/WordnetIndexTest.php

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,46 @@
1212
*/
1313
class WordnetIndexTest extends \PHPUnit_Framework_TestCase
{
    /**
     * @var WordnetIndex|null created on first use by getWordnetIndex()
     */
    protected $wordnetIdx = null;

    /**
     * Returns the index under test, creating it on first use so that every
     * assertion in a test shares one instance.
     *
     * @return WordnetIndex
     */
    public function getWordnetIndex()
    {
        if (!$this->wordnetIdx) {
            $this->wordnetIdx = new WordnetIndex(new WordnetCorpus(get_storage_path('corpora/wordnet')));
        }
        return $this->wordnetIdx;
    }

    /**
     * Marks the current test as skipped when SKIP_TEST is set or the wordnet
     * corpus directory is missing, so the run reports a skip instead of a
     * pass with no assertions.
     */
    protected function skipIfCorpusUnavailable()
    {
        if (getenv('SKIP_TEST') || !is_dir(get_storage_path('corpora/wordnet'))) {
            $this->markTestSkipped('wordnet corpus is not available or SKIP_TEST is set');
        }
    }

    public function testGetLemma()
    {
        $this->skipIfCorpusUnavailable();

        $lemmas = $this->getWordnetIndex()->getLemma('programmer');
        $this->assertCount(8, $lemmas[0]->getSynsets()[0]->getLinkedSynsets());
    }

    public function testGetMorph()
    {
        $this->skipIfCorpusUnavailable();

        $index = $this->getWordnetIndex();
        $this->assertEquals('play', $index->getMorph('playing'));
        $this->assertEquals('dog', $index->getMorph('dogs'));
        $this->assertEquals('church', $index->getMorph('churches'));
        $this->assertEquals('aardwolf', $index->getMorph('aardwolves'));
        $this->assertEquals('abacus', $index->getMorph('abaci'));
        $this->assertEquals('book', $index->getMorph('books'));
    }
}

0 commit comments

Comments
 (0)