From 3889633399404d7e02bf28acbdf27fc4d8a7bc7b Mon Sep 17 00:00:00 2001
From: Carlos Martinez
Date: Mon, 1 Dec 2014 13:34:13 +0100
Subject: [PATCH 1/4] Add lower-bound Wilson score for Bernoulli parameter

---
 semanticizest/_semanticizer.py | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/semanticizest/_semanticizer.py b/semanticizest/_semanticizer.py
index 5454ccf..48f27db 100644
--- a/semanticizest/_semanticizer.py
+++ b/semanticizest/_semanticizer.py
@@ -4,6 +4,9 @@
 
 import six
 
+from math import sqrt
+from scipy.stats import norm
+
 from semanticizest._util import ngrams_with_pos, tosequence
 from semanticizest.parse_wikidump import parse_dump
 
@@ -23,7 +26,7 @@ class Semanticizer(object):
         create the stored model.
     """
 
-    def __init__(self, fname, N=7):
+    def __init__(self, fname, N=7, score='wilson', wilson_confidence=0.95):
         commonness = defaultdict(list)
 
         self.db = sqlite3.connect(fname)
@@ -34,17 +37,36 @@ def __init__(self, fname, N=7):
                                        'where ngram_id = ngrams.id;'):
             commonness[anchor].append((target, count))
 
+        if score=='wilson':
+            # Better but slower
+            z = norm.ppf(wilson_confidence)
+            makeProb = lambda count, total: self._ci_lower_bound(count, total, z)
+        else:
+            makeProb = lambda count, total: count / total
+
         for anchor, targets in six.iteritems(commonness):
             # targets.sort(key=operator.itemgetter(1), reverse=True)
 
             # Turn counts into probabilities.
             # XXX should we preserve the counts as well?
             total = float(sum(count for _, count in targets))
-            commonness[anchor] = [(t, count / total) for t, count in targets]
+            commonness[anchor] = [(t, makeProb(count, total)) for t, count in targets]
 
         self.commonness = commonness
         self.N = N
 
+    def _ci_lower_bound(self, pos, n, z):
+        """
+        Calculate the lower bound of the Wilson score confidence interval
+        for a Bernoulli parameter, as described here:
+        http://www.evanmiller.org/how-not-to-sort-by-average-rating.html
+        """
+        if n == 0:
+            return 0
+        phat = 1.0*pos/n
+        score = (phat + z*z/(2*n) - z * sqrt((phat*(1-phat)+z*z/(4*n))/n))/(1+z*z/n)
+        return score
+
     def all_candidates(self, s):
         """Retrieve all candidate entities.
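For context on the patch above: the raw ratio count / total gives an anchor→target pair that was seen once out of one link a commonness of 1.0, while the Wilson lower bound discounts such low-evidence pairs. Below is a minimal standalone sketch of the same computation, mirroring `_ci_lower_bound` from the patch; the name `wilson_lower_bound` and the example counts are only for illustration, and SciPy is assumed to be installed.

    from math import sqrt

    from scipy.stats import norm


    def wilson_lower_bound(pos, n, confidence=0.95):
        """Lower bound of the Wilson score interval for pos successes out of n."""
        if n == 0:
            return 0.0
        z = norm.ppf(confidence)
        phat = float(pos) / n
        return ((phat + z * z / (2 * n)
                 - z * sqrt((phat * (1 - phat) + z * z / (4 * n)) / n))
                / (1 + z * z / n))


    # A target backed by a single link has a raw ratio of 1/1 = 1.0 but a
    # much lower Wilson bound; a target backed by 90 of 100 links keeps a
    # high score.
    print(wilson_lower_bound(1, 1))     # ~0.27
    print(wilson_lower_bound(90, 100))  # ~0.84

The effect is that score='wilson' ranks well-attested link targets above targets whose high ratio rests on only a handful of links.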
From 76bd6fd7bda05f1803b06c4ea1ca905e186f5cd8 Mon Sep 17 00:00:00 2001
From: Lars Buitinck
Date: Mon, 1 Dec 2014 16:50:21 +0100
Subject: [PATCH 2/4] require SciPy for Wilson ranking

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 60f8434..8328883 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 cytoolz
 docopt
-six>=1.4.1
 nose>=1.3.3
+scipy
+six>=1.4.1

From 02e92ff8f0c9fd1ef2c9c9fd25d578b7f82533d1 Mon Sep 17 00:00:00 2001
From: Lars Buitinck
Date: Fri, 30 Jan 2015 14:58:18 +0100
Subject: [PATCH 3/4] install SciPy from apt-get on Travis

---
 .travis.yml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index f6262c7..92630d3 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,17 @@
 language: python
+
 python:
   - "2.7"
+
+virtualenv:
+  system_site_packages: true
+
+before_install:
+  - sudo apt-get update
+  - sudo apt-get install -y python-numpy python-scipy
+
 install: "pip install -r requirements.txt"
+
 script:
   - nosetests -v
   - nosetests --with-doctest --doctest-extension doc/*.rst

From ed03d2a22c5e968d39b7991fe08315d75679ac9b Mon Sep 17 00:00:00 2001
From: Lars Buitinck
Date: Fri, 30 Jan 2015 15:43:40 +0100
Subject: [PATCH 4/4] refactor/vectorize Wilson confidence interval code

---
 semanticizest/_semanticizer.py | 39 ++++++++++++----------------------
 semanticizest/_util.py         | 17 +++++++++++++++
 2 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/semanticizest/_semanticizer.py b/semanticizest/_semanticizer.py
index 48f27db..5e79a20 100644
--- a/semanticizest/_semanticizer.py
+++ b/semanticizest/_semanticizer.py
@@ -1,13 +1,16 @@
+from __future__ import division
+
 from collections import defaultdict
 import sqlite3
 from os.path import join, dirname, abspath
 
+import numpy as np
 import six
 
-from math import sqrt
 from scipy.stats import norm
 
-from semanticizest._util import ngrams_with_pos, tosequence
+from semanticizest._util import (ngrams_with_pos, tosequence,
+                                 wilson_ci_lower_bound)
 from semanticizest.parse_wikidump import parse_dump
 
 
@@ -37,36 +38,24 @@ def __init__(self, fname, N=7, score='wilson', wilson_confidence=0.95):
                                        'where ngram_id = ngrams.id;'):
             commonness[anchor].append((target, count))
 
-        if score=='wilson':
-            # Better but slower
+        if score == 'wilson':
             z = norm.ppf(wilson_confidence)
-            makeProb = lambda count, total: self._ci_lower_bound(count, total, z)
-        else:
-            makeProb = lambda count, total: count / total
 
         for anchor, targets in six.iteritems(commonness):
-            # targets.sort(key=operator.itemgetter(1), reverse=True)
-
-            # Turn counts into probabilities.
             # XXX should we preserve the counts as well?
-            total = float(sum(count for _, count in targets))
-            commonness[anchor] = [(t, makeProb(count, total)) for t, count in targets]
+            counts = np.asarray([count for _, count in targets], dtype=float)
+            total = counts.sum()
+            if score == 'wilson':
+                probs = wilson_ci_lower_bound(counts, total, z)
+            else:
+                probs = counts / total
+
+            commonness[anchor] = [(t, p) for (t, _), p in zip(targets, probs)]
 
         self.commonness = commonness
         self.N = N
 
-    def _ci_lower_bound(self, pos, n, z):
-        """
-        Calculate the lower bound of the Wilson score confidence interval
-        for a Bernoulli parameter, as described here:
-        http://www.evanmiller.org/how-not-to-sort-by-average-rating.html
-        """
-        if n == 0:
-            return 0
-        phat = 1.0*pos/n
-        score = (phat + z*z/(2*n) - z * sqrt((phat*(1-phat)+z*z/(4*n))/n))/(1+z*z/n)
-        return score
-
     def all_candidates(self, s):
         """Retrieve all candidate entities.
diff --git a/semanticizest/_util.py b/semanticizest/_util.py
index 515ff76..57580d7 100644
--- a/semanticizest/_util.py
+++ b/semanticizest/_util.py
@@ -1,7 +1,10 @@
 from collections import Sequence
+
 from six.moves import xrange
 from six.moves.urllib.parse import quote
 
+import numpy as np
+
 
 def ngrams_with_pos(lst, N):
     """Generate n-grams for 1 <= n <= N from lst."""
@@ -33,3 +36,17 @@ def url_from_title(title, wiki):
     title = title[0].upper() + title[1:]  # Wikipedia-specific
     title = quote(title.replace(' ', '_'), safe=',()/:')
     return "https://{}.wikipedia.org/wiki/{}".format(wiki, title)
+
+
+def wilson_ci_lower_bound(pos, n, z):
+    """
+    Lower bound of the Wilson score confidence interval for a Bernoulli
+    parameter, as described here:
+    http://www.evanmiller.org/how-not-to-sort-by-average-rating.html
+    """
+    if n == 0:
+        return np.zeros_like(pos)
+    phat = pos / n
+    z2n = z ** 2 / n
+    return ((phat + .5 * z2n - z * np.sqrt((phat * (1 - phat) + .25 * z2n) / n))
+            / (1 + z2n))
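A quick usage sketch of the vectorized helper introduced in patch 4: one call scores all candidate targets of an anchor at once. The counts below are made up, and the snippet assumes the patched package (with wilson_ci_lower_bound in semanticizest/_util.py) is importable.

    import numpy as np
    from scipy.stats import norm

    from semanticizest._util import wilson_ci_lower_bound

    # Hypothetical link counts for one anchor's candidate targets.
    counts = np.array([90., 3., 1.])
    total = counts.sum()

    z = norm.ppf(0.95)
    print(wilson_ci_lower_bound(counts, total, z))  # conservative per-target scores
    print(counts / total)                           # raw commonness, for comparison

Scoring the whole per-anchor count array in one NumPy expression avoids a Python-level call per target, which is what the "vectorize" in the commit message refers to.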