8 changes: 8 additions & 0 deletions .travis.yml
@@ -1,7 +1,15 @@
 language: python
 
 python:
  - "2.7"
 
+virtualenv:
+  system_site_packages: true
+
+install: "pip install -r requirements.txt"
+
 script:
+ - sudo apt-get update
+ - sudo apt-get install python-numpy python-scipy
  - nosetests -v
+ - nosetests --with-doctest --doctest-extension doc/*.rst
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,4 +1,5 @@
 cytoolz
 docopt
-six>=1.4.1
 nose>=1.3.3
+scipy
+six>=1.4.1
23 changes: 17 additions & 6 deletions semanticizest/_semanticizer.py
@@ -1,10 +1,14 @@
 from __future__ import division
 
 from collections import defaultdict
 import sqlite3
 from os.path import join, dirname, abspath
 
+import numpy as np
 import six
+from scipy.stats import norm  # needed for norm.ppf in __init__ below
 
-from semanticizest._util import ngrams_with_pos, tosequence
+from semanticizest._util import (ngrams_with_pos, tosequence,
+                                 wilson_ci_lower_bound)
 from semanticizest.parse_wikidump import parse_dump


@@ -23,7 +27,7 @@ class Semanticizer(object):
     create the stored model.
     """

-    def __init__(self, fname, N=7):
+    def __init__(self, fname, N=7, score='wilson', wilson_confidence=0.95):
         commonness = defaultdict(list)
 
         self.db = sqlite3.connect(fname)
@@ -34,13 +38,20 @@ def __init__(self, fname, N=7):
                  'where ngram_id = ngrams.id;'):
             commonness[anchor].append((target, count))
 
-        for anchor, targets in six.iteritems(commonness):
-            # targets.sort(key=operator.itemgetter(1), reverse=True)
+        if score == 'wilson':
+            z = norm.ppf(wilson_confidence)
+
+        for anchor, targets in six.iteritems(commonness):
             # Turn counts into probabilities.
             # XXX should we preserve the counts as well?
-            total = float(sum(count for _, count in targets))
-            commonness[anchor] = [(t, count / total) for t, count in targets]
+            counts = np.asarray([count for _, count in targets], dtype=float)
+            total = counts.sum()
+            if score == 'wilson':
+                # Use `scores`, not `score`, so the parameter keeps its value
+                # for the next anchor in the loop.
+                scores = wilson_ci_lower_bound(counts, total, z)
+            else:
+                scores = counts / total
+
+            commonness[anchor] = [(t, scores[i])
+                                  for i, (t, _) in enumerate(targets)]
 
         self.commonness = commonness
         self.N = N
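For context, a minimal usage sketch of the new constructor arguments. This is not taken from the PR: 'model.db' is a placeholder path, and any `score` value other than 'wilson' falls through to plain relative frequency.

from semanticizest._semanticizer import Semanticizer

# Wilson lower-bound scoring, the new default (z derived from norm.ppf(0.95)).
sem = Semanticizer('model.db', score='wilson', wilson_confidence=0.95)

# Any other value for `score` selects raw count / total per anchor target.
sem_freq = Semanticizer('model.db', score='freq')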
17 changes: 17 additions & 0 deletions semanticizest/_util.py
@@ -1,7 +1,10 @@
 from collections import Sequence
 
 from six.moves import xrange
 from six.moves.urllib.parse import quote
 
+import numpy as np
+
+
 def ngrams_with_pos(lst, N):
     """Generate n-grams for 1 <= n <= N from lst."""
@@ -33,3 +36,17 @@ def url_from_title(title, wiki):
     title = title[0].upper() + title[1:]  # Wikipedia-specific
     title = quote(title.replace(' ', '_'), safe=',()/:')
     return "https://{}.wikipedia.org/wiki/{}".format(wiki, title)
+
+
+def wilson_ci_lower_bound(pos, n, z):
+    """Lower bound of the Wilson score confidence interval for a
+    Bernoulli parameter, as described at
+    http://www.evanmiller.org/how-not-to-sort-by-average-rating.html
+    """
+    if n == 0:
+        return np.zeros_like(pos)
+    phat = pos / n
+    z2n = z ** 2 / n
+    return ((phat + .5 * z2n
+             - z * np.sqrt((phat * (1 - phat) + .25 * z2n) / n))
+            / (1 + z2n))
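As a rough sanity check (not part of the diff, and assuming the same z as the Semanticizer default, norm.ppf(0.95) ≈ 1.64): the lower bound deliberately penalizes small samples, so a target linked 1 time out of 1 scores below one linked 90 times out of 100, even though its raw frequency is higher.

import numpy as np
from scipy.stats import norm

from semanticizest._util import wilson_ci_lower_bound

z = norm.ppf(0.95)

# 1 link out of 1: raw frequency 1.0, but very little evidence.
print(wilson_ci_lower_bound(np.array([1.0]), 1.0, z))     # ~[0.27]

# 90 links out of 100: raw frequency 0.9, much stronger evidence.
print(wilson_ci_lower_bound(np.array([90.0]), 100.0, z))  # ~[0.84]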