From 3889633399404d7e02bf28acbdf27fc4d8a7bc7b Mon Sep 17 00:00:00 2001
From: Carlos Martinez
Date: Mon, 1 Dec 2014 13:34:13 +0100
Subject: [PATCH 1/4] Add lower-bound Wilson score for Bernoulli parameter

---
 semanticizest/_semanticizer.py | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/semanticizest/_semanticizer.py b/semanticizest/_semanticizer.py
index 5454ccf..48f27db 100644
--- a/semanticizest/_semanticizer.py
+++ b/semanticizest/_semanticizer.py
@@ -4,6 +4,9 @@
 
 import six
 
+from math import sqrt
+from scipy.stats import norm
+
 from semanticizest._util import ngrams_with_pos, tosequence
 from semanticizest.parse_wikidump import parse_dump
 
@@ -23,7 +26,7 @@ class Semanticizer(object):
         create the stored model.
     """
 
-    def __init__(self, fname, N=7):
+    def __init__(self, fname, N=7, score='wilson', wilson_confidence=0.95):
         commonness = defaultdict(list)
 
         self.db = sqlite3.connect(fname)
@@ -34,17 +37,36 @@ def __init__(self, fname, N=7):
                                        'where ngram_id = ngrams.id;'):
             commonness[anchor].append((target, count))
 
+        if score=='wilson':
+            # Better but slower
+            z = norm.ppf(wilson_confidence)
+            makeProb = lambda count, total: self._ci_lower_bound(count, total, z)
+        else:
+            makeProb = lambda count, total: count / total
+
         for anchor, targets in six.iteritems(commonness):
             # targets.sort(key=operator.itemgetter(1), reverse=True)
 
             # Turn counts into probabilities.
             # XXX should we preserve the counts as well?
             total = float(sum(count for _, count in targets))
-            commonness[anchor] = [(t, count / total) for t, count in targets]
+            commonness[anchor] = [(t, makeProb(count, total)) for t, count in targets]
 
         self.commonness = commonness
         self.N = N
 
+    def _ci_lower_bound(self, pos, n, z):
+        """
+        Calculate the lower bound of the Wilson score confidence interval
+        for a Bernoulli parameter, as described here:
+        http://www.evanmiller.org/how-not-to-sort-by-average-rating.html
+        """
+        if n == 0:
+            return 0
+        phat = 1.0*pos/n
+        score = (phat + z*z/(2*n) - z * sqrt((phat*(1-phat)+z*z/(4*n))/n))/(1+z*z/n)
+        return score
+
     def all_candidates(self, s):
         """Retrieve all candidate entities.
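For context on the patch above: the raw ratio count / total gives an anchor→target pair that was seen once out of one link a commonness of 1.0, while the Wilson lower bound discounts such low-evidence pairs. Below is a minimal standalone sketch of the same computation, mirroring `_ci_lower_bound` from the patch; the name `wilson_lower_bound` and the example counts are only for illustration, and SciPy is assumed to be installed.

    from math import sqrt

    from scipy.stats import norm


    def wilson_lower_bound(pos, n, confidence=0.95):
        """Lower bound of the Wilson score interval for pos successes out of n."""
        if n == 0:
            return 0.0
        z = norm.ppf(confidence)
        phat = float(pos) / n
        return ((phat + z * z / (2 * n)
                 - z * sqrt((phat * (1 - phat) + z * z / (4 * n)) / n))
                / (1 + z * z / n))


    # A target backed by a single link has a raw ratio of 1/1 = 1.0 but a
    # much lower Wilson bound; a target backed by 90 of 100 links keeps a
    # high score.
    print(wilson_lower_bound(1, 1))     # ~0.27
    print(wilson_lower_bound(90, 100))  # ~0.84

The effect is that score='wilson' ranks well-attested link targets above targets whose high ratio rests on only a handful of links.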
From 76bd6fd7bda05f1803b06c4ea1ca905e186f5cd8 Mon Sep 17 00:00:00 2001
From: Lars Buitinck
Date: Mon, 1 Dec 2014 16:50:21 +0100
Subject: [PATCH 2/4] require SciPy for Wilson ranking

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 60f8434..8328883 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 cytoolz
 docopt
-six>=1.4.1
 nose>=1.3.3
+scipy
+six>=1.4.1

From 02e92ff8f0c9fd1ef2c9c9fd25d578b7f82533d1 Mon Sep 17 00:00:00 2001
From: Lars Buitinck
Date: Fri, 30 Jan 2015 14:58:18 +0100
Subject: [PATCH 3/4] install SciPy from apt-get on Travis

---
 .travis.yml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index f6262c7..92630d3 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,17 @@
 language: python
+
 python:
   - "2.7"
+
+virtualenv:
+  system_site_packages: true
+
+before_install:
+  - sudo apt-get update
+  - sudo apt-get install -y python-numpy python-scipy
+
 install: "pip install -r requirements.txt"
+
 script:
   - nosetests -v
   - nosetests --with-doctest --doctest-extension doc/*.rst

From ed03d2a22c5e968d39b7991fe08315d75679ac9b Mon Sep 17 00:00:00 2001
From: Lars Buitinck
Date: Fri, 30 Jan 2015 15:43:40 +0100
Subject: [PATCH 4/4] refactor/vectorize Wilson confidence interval code

---
 semanticizest/_semanticizer.py | 39 ++++++++++++----------------------
 semanticizest/_util.py         | 17 +++++++++++++++
 2 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/semanticizest/_semanticizer.py b/semanticizest/_semanticizer.py
index 48f27db..5e79a20 100644
--- a/semanticizest/_semanticizer.py
+++ b/semanticizest/_semanticizer.py
@@ -1,13 +1,16 @@
+from __future__ import division
+
 from collections import defaultdict
 import sqlite3
 from os.path import join, dirname, abspath
 
+import numpy as np
 import six
 
-from math import sqrt
 from scipy.stats import norm
 
-from semanticizest._util import ngrams_with_pos, tosequence
+from semanticizest._util import (ngrams_with_pos, tosequence,
+                                 wilson_ci_lower_bound)
 from semanticizest.parse_wikidump import parse_dump
 
 
@@ -37,36 +38,24 @@ def __init__(self, fname, N=7, score='wilson', wilson_confidence=0.95):
                                        'where ngram_id = ngrams.id;'):
             commonness[anchor].append((target, count))
 
-        if score=='wilson':
-            # Better but slower
+        if score == 'wilson':
             z = norm.ppf(wilson_confidence)
-            makeProb = lambda count, total: self._ci_lower_bound(count, total, z)
-        else:
-            makeProb = lambda count, total: count / total
 
         for anchor, targets in six.iteritems(commonness):
-            # targets.sort(key=operator.itemgetter(1), reverse=True)
-
-            # Turn counts into probabilities.
             # XXX should we preserve the counts as well?
-            total = float(sum(count for _, count in targets))
-            commonness[anchor] = [(t, makeProb(count, total)) for t, count in targets]
+            counts = np.asarray([count for _, count in targets], dtype=float)
+            total = counts.sum()
+            if score == 'wilson':
+                probs = wilson_ci_lower_bound(counts, total, z)
+            else:
+                probs = counts / total
+
+            commonness[anchor] = [(t, p) for (t, _), p in zip(targets, probs)]
 
         self.commonness = commonness
         self.N = N
 
-    def _ci_lower_bound(self, pos, n, z):
-        """
-        Calculate the lower bound of the Wilson score confidence interval
-        for a Bernoulli parameter, as described here:
-        http://www.evanmiller.org/how-not-to-sort-by-average-rating.html
-        """
-        if n == 0:
-            return 0
-        phat = 1.0*pos/n
-        score = (phat + z*z/(2*n) - z * sqrt((phat*(1-phat)+z*z/(4*n))/n))/(1+z*z/n)
-        return score
-
     def all_candidates(self, s):
         """Retrieve all candidate entities.
diff --git a/semanticizest/_util.py b/semanticizest/_util.py
index 515ff76..57580d7 100644
--- a/semanticizest/_util.py
+++ b/semanticizest/_util.py
@@ -1,7 +1,10 @@
 from collections import Sequence
+
 from six.moves import xrange
 from six.moves.urllib.parse import quote
 
+import numpy as np
+
 
 def ngrams_with_pos(lst, N):
     """Generate n-grams for 1 <= n <= N from lst."""
@@ -33,3 +36,17 @@ def url_from_title(title, wiki):
     title = title[0].upper() + title[1:]  # Wikipedia-specific
     title = quote(title.replace(' ', '_'), safe=',()/:')
     return "https://{}.wikipedia.org/wiki/{}".format(wiki, title)
+
+
+def wilson_ci_lower_bound(pos, n, z):
+    """
+    Lower bound of the Wilson score confidence interval for a Bernoulli
+    parameter, as described here:
+    http://www.evanmiller.org/how-not-to-sort-by-average-rating.html
+    """
+    if n == 0:
+        return np.zeros_like(pos)
+    phat = pos / n
+    z2n = z ** 2 / n
+    return ((phat + .5 * z2n - z * np.sqrt((phat * (1 - phat) + .25 * z2n) / n))
+            / (1 + z2n))
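A quick usage sketch of the vectorized helper introduced in patch 4: one call scores all candidate targets of an anchor at once. The counts below are made up, and the snippet assumes the patched package (with wilson_ci_lower_bound in semanticizest/_util.py) is importable.

    import numpy as np
    from scipy.stats import norm

    from semanticizest._util import wilson_ci_lower_bound

    # Hypothetical link counts for one anchor's candidate targets.
    counts = np.array([90., 3., 1.])
    total = counts.sum()

    z = norm.ppf(0.95)
    print(wilson_ci_lower_bound(counts, total, z))  # conservative per-target scores
    print(counts / total)                           # raw commonness, for comparison

Scoring the whole per-anchor count array in one NumPy expression avoids a Python-level call per target, which is what the "vectorize" in the commit message refers to.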