8 changes: 8 additions & 0 deletions .travis.yml
@@ -1,7 +1,15 @@
 language: python
 
 python:
  - "2.7"
 
+virtualenv:
+  system_site_packages: true
+
+install: "pip install -r requirements.txt"
+
 script:
+ - sudo apt-get update
+ - sudo apt-get install python-numpy python-scipy
  - nosetests -v
+ - nosetests --with-doctest --doctest-extension doc/*.rst
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,4 +1,5 @@
 cytoolz
 docopt
-six>=1.4.1
 nose>=1.3.3
+scipy
+six>=1.4.1
23 changes: 17 additions & 6 deletions semanticizest/_semanticizer.py
@@ -1,10 +1,14 @@
 from __future__ import division
 
 from collections import defaultdict
 import sqlite3
 from os.path import join, dirname, abspath
 
+import numpy as np
 import six
+from scipy.stats import norm  # needed for norm.ppf in __init__ below
 
-from semanticizest._util import ngrams_with_pos, tosequence
+from semanticizest._util import (ngrams_with_pos, tosequence,
+                                 wilson_ci_lower_bound)
 from semanticizest.parse_wikidump import parse_dump


@@ -23,7 +27,7 @@ class Semanticizer(object):
     create the stored model.
     """

-    def __init__(self, fname, N=7):
+    def __init__(self, fname, N=7, score='wilson', wilson_confidence=0.95):
         commonness = defaultdict(list)
 
         self.db = sqlite3.connect(fname)
@@ -34,13 +38,20 @@ def __init__(self, fname, N=7):
                  'where ngram_id = ngrams.id;'):
             commonness[anchor].append((target, count))
 
-        for anchor, targets in six.iteritems(commonness):
-            # targets.sort(key=operator.itemgetter(1), reverse=True)
+        if score == 'wilson':
+            z = norm.ppf(wilson_confidence)
+
+        for anchor, targets in six.iteritems(commonness):
             # Turn counts into probabilities.
             # XXX should we preserve the counts as well?
-            total = float(sum(count for _, count in targets))
-            commonness[anchor] = [(t, count / total) for t, count in targets]
+            counts = np.asarray([count for _, count in targets], dtype=float)
+            total = counts.sum()
+            if score == 'wilson':
+                # Use `scores`, not `score`, so the parameter keeps its value
+                # for the next anchor in the loop.
+                scores = wilson_ci_lower_bound(counts, total, z)
+            else:
+                scores = counts / total
+
+            commonness[anchor] = [(t, scores[i])
+                                  for i, (t, _) in enumerate(targets)]
 
         self.commonness = commonness
         self.N = N
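For context, a minimal usage sketch of the new constructor arguments. This is not taken from the PR: 'model.db' is a placeholder path, and any `score` value other than 'wilson' falls through to plain relative frequency.

from semanticizest._semanticizer import Semanticizer

# Wilson lower-bound scoring, the new default (z derived from norm.ppf(0.95)).
sem = Semanticizer('model.db', score='wilson', wilson_confidence=0.95)

# Any other value for `score` selects raw count / total per anchor target.
sem_freq = Semanticizer('model.db', score='freq')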
17 changes: 17 additions & 0 deletions semanticizest/_util.py
@@ -1,7 +1,10 @@
 from collections import Sequence
 
 from six.moves import xrange
 from six.moves.urllib.parse import quote
 
+import numpy as np
+
+
 def ngrams_with_pos(lst, N):
     """Generate n-grams for 1 <= n <= N from lst."""
@@ -33,3 +36,17 @@ def url_from_title(title, wiki):
     title = title[0].upper() + title[1:]  # Wikipedia-specific
     title = quote(title.replace(' ', '_'), safe=',()/:')
     return "https://{}.wikipedia.org/wiki/{}".format(wiki, title)
+
+
+def wilson_ci_lower_bound(pos, n, z):
+    """Lower bound of the Wilson score confidence interval for a
+    Bernoulli parameter, as described at
+    http://www.evanmiller.org/how-not-to-sort-by-average-rating.html
+    """
+    if n == 0:
+        return np.zeros_like(pos)
+    phat = pos / n
+    z2n = z ** 2 / n
+    return ((phat + .5 * z2n
+             - z * np.sqrt((phat * (1 - phat) + .25 * z2n) / n))
+            / (1 + z2n))
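As a rough sanity check (not part of the diff, and assuming the same z as the Semanticizer default, norm.ppf(0.95) ≈ 1.64): the lower bound deliberately penalizes small samples, so a target linked 1 time out of 1 scores below one linked 90 times out of 100, even though its raw frequency is higher.

import numpy as np
from scipy.stats import norm

from semanticizest._util import wilson_ci_lower_bound

z = norm.ppf(0.95)

# 1 link out of 1: raw frequency 1.0, but very little evidence.
print(wilson_ci_lower_bound(np.array([1.0]), 1.0, z))     # ~[0.27]

# 90 links out of 100: raw frequency 0.9, much stronger evidence.
print(wilson_ci_lower_bound(np.array([90.0]), 100.0, z))  # ~[0.84]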