diff --git a/.travis.yml b/.travis.yml
index f6262c7..92630d3 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,17 @@
 language: python
+
 python:
   - "2.7"
+
+virtualenv:
+  system_site_packages: true
+
+before_install:
+  - sudo apt-get update -qq
+  - sudo apt-get install -qq python-numpy python-scipy
+
 install: "pip install -r requirements.txt"
+
 script:
   - nosetests -v
   - nosetests --with-doctest --doctest-extension doc/*.rst
diff --git a/requirements.txt b/requirements.txt
index 60f8434..8328883 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,6 @@
 cytoolz
 docopt
-six>=1.4.1
 nose>=1.3.3
+numpy
+scipy
+six>=1.4.1
diff --git a/semanticizest/_semanticizer.py b/semanticizest/_semanticizer.py
index 5454ccf..5e79a20 100644
--- a/semanticizest/_semanticizer.py
+++ b/semanticizest/_semanticizer.py
@@ -1,10 +1,15 @@
+from __future__ import division
+
 from collections import defaultdict
 import sqlite3
 from os.path import join, dirname, abspath
 
+import numpy as np
+from scipy.stats import norm
 import six
 
-from semanticizest._util import ngrams_with_pos, tosequence
+from semanticizest._util import (ngrams_with_pos, tosequence,
+                                 wilson_ci_lower_bound)
 from semanticizest.parse_wikidump import parse_dump
 
 
@@ -23,7 +28,7 @@ class Semanticizer(object):
     create the stored model.
     """
 
-    def __init__(self, fname, N=7):
+    def __init__(self, fname, N=7, score='wilson', wilson_confidence=0.95):
         commonness = defaultdict(list)
 
         self.db = sqlite3.connect(fname)
@@ -34,13 +39,21 @@ def __init__(self, fname, N=7):
                 'where ngram_id = ngrams.id;'):
             commonness[anchor].append((target, count))
 
-        for anchor, targets in six.iteritems(commonness):
-            # targets.sort(key=operator.itemgetter(1), reverse=True)
+        if score == 'wilson':
+            z = norm.ppf(wilson_confidence)
 
+        for anchor, targets in six.iteritems(commonness):
             # Turn counts into probabilities.
             # XXX should we preserve the counts as well?
-            total = float(sum(count for _, count in targets))
-            commonness[anchor] = [(t, count / total) for t, count in targets]
+            counts = np.asarray([count for _, count in targets], dtype=float)
+            total = counts.sum()
+            if score == 'wilson':
+                scores = wilson_ci_lower_bound(counts, total, z)
+            else:
+                scores = counts / total
+
+            commonness[anchor] = [(t, s)
+                                  for (t, _), s in zip(targets, scores)]
 
         self.commonness = commonness
         self.N = N
diff --git a/semanticizest/_util.py b/semanticizest/_util.py
index 515ff76..57580d7 100644
--- a/semanticizest/_util.py
+++ b/semanticizest/_util.py
@@ -1,7 +1,10 @@
 from collections import Sequence
+
 from six.moves import xrange
 from six.moves.urllib.parse import quote
 
+import numpy as np
+
 
 def ngrams_with_pos(lst, N):
     """Generate n-grams for 1 <= n <= N from lst."""
@@ -33,3 +36,17 @@ def url_from_title(title, wiki):
     title = title[0].upper() + title[1:]  # Wikipedia-specific
     title = quote(title.replace(' ', '_'), safe=',()/:')
     return "https://{}.wikipedia.org/wiki/{}".format(wiki, title)
+
+
+def wilson_ci_lower_bound(pos, n, z):
+    """
+    Calculate the lower bound of the Wilson score confidence interval
+    for a Bernoulli parameter, as described here:
+    http://www.evanmiller.org/how-not-to-sort-by-average-rating.html
+    """
+    if n == 0:
+        return np.zeros_like(pos)
+    phat = pos / n
+    z2n = z ** 2 / n
+    return ((phat + .5 * z2n - z * np.sqrt((phat * (1 - phat) + .25 * z2n) / n))
+            / (1 + z2n))
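For reference, here is a standalone sketch (not part of the patch) of what the new score='wilson' behaviour does to the commonness scores: the Wilson lower bound shrinks the plain count/total ratio towards zero when an anchor has only been seen a few times, so rarely linked anchors stop looking as confident as well-attested ones. The helper is duplicated from _util.py to keep the snippet self-contained, the example counts are invented, and z = norm.ppf(0.95) mirrors the default wilson_confidence=0.95.

# Standalone illustration; requires numpy and scipy.
from __future__ import division

import numpy as np
from scipy.stats import norm


def wilson_ci_lower_bound(pos, n, z):
    """Lower bound of the Wilson score interval, vectorised over pos."""
    if n == 0:
        return np.zeros_like(pos)
    phat = pos / n
    z2n = z ** 2 / n
    return ((phat + .5 * z2n - z * np.sqrt((phat * (1 - phat) + .25 * z2n) / n))
            / (1 + z2n))


z = norm.ppf(0.95)            # matches the default wilson_confidence=0.95

# An anchor observed only twice: the plain ratio says 0.5 / 0.5, but the
# Wilson lower bound is far more cautious (about 0.12 each).
counts = np.array([1.0, 1.0])
print(counts / counts.sum())
print(wilson_ci_lower_bound(counts, counts.sum(), z))

# The same 50/50 split backed by 1000 observations stays close to 0.5 (about 0.47).
counts = np.array([500.0, 500.0])
print(wilson_ci_lower_bound(counts, counts.sum(), z))

One caveat on conventions: norm.ppf(0.95) gives a one-sided 95% quantile (z of roughly 1.64), whereas Evan Miller's article uses the two-sided quantile (roughly 1.96 for 95%); which of the two the patch intends is an assumption here.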