
Commit 8fb60d3

A complete example (contact extraction). See GH-24.
1 parent 5a3f39e

9 files changed: 424 additions & 1 deletion


.gitignore

Lines changed: 4 additions & 1 deletion
@@ -51,4 +51,7 @@ notebooks/*.marisa
 notebooks/*.wapiti
 notebooks/*.crfsuite
 webstruct_data/corpus/random_pages/wa/*.html
-webstruct_data/corpus/us_contact_pages/cleaned
+webstruct_data/corpus/us_contact_pages/cleaned
+example/_data/*
+example/*.joblib
+example/*.html

example/README.rst

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
Contact extraction using Webstruct
==================================

This repository contains code to train a model for contact and address
extraction. The result is a .joblib file with a pickled webstruct.NER object.

Currently the example requires Python 3.5+.

Training
--------

To train a model, first build the gazetteers using the build_gazetteers
script::

    python3 -m ner.build_gazetteers

It will create a "_data" folder with city/state GeoNames data. The script
uses several GBs of RAM.

To train a CRF model, run::

    python3 -m ner.train

The model uses training data from the open-source webstruct package
(mostly contact pages of US, CA and GB small business websites)
and provides 'ORG', 'TEL', 'FAX', 'HOURS', 'STREET', 'CITY', 'STATE',
'ZIPCODE', 'COUNTRY', and 'SUBJ' entities.

The script should produce a "contact-extractor.joblib" file with a saved
webstruct.NER object and a "crf-features.html" file with debugging
information about the model.

Usage
-----

To use the saved model, the code in this repository is not needed.
Make sure joblib, sklearn-crfsuite and webstruct are installed,
then load the model::

    import joblib
    ner = joblib.load('contact-extractor.joblib')
    print(ner.extract_groups_from_url('<some URL>'))

See https://webstruct.readthedocs.io/en/latest/ref/model.html#webstruct.model.NER
for the API.
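
The model can also be applied to HTML that is already saved locally.
A minimal sketch (an illustration, not part of this example's code;
per the NER API linked above, ``extract_groups`` accepts raw HTML)::

    import joblib

    ner = joblib.load('contact-extractor.joblib')
    html = open('saved-page.html', 'rb').read()  # hypothetical local file
    for group in ner.extract_groups(html):
        print(group)  # a group is a list of (entity text, label) pairs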

example/ner/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
# -*- coding: utf-8 -*-

example/ner/build_gazetteers.py

Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
#!/usr/bin/env python3
import argparse
from pathlib import Path

import requests

from webstruct.gazetteers.geonames import read_geonames_zipped, to_dawg


FILES = [
    ('http://download.geonames.org/export/dump/allCountries.zip', 'allCountries.zip'),
    ('http://download.geonames.org/export/dump/cities1000.zip', 'cities1000.zip'),
    ('http://download.geonames.org/export/dump/cities5000.zip', 'cities5000.zip'),
    ('http://download.geonames.org/export/dump/cities15000.zip', 'cities15000.zip'),
]

DATA_ROOT = Path('_data')


def download_geonames():
    """ Download geonames files if they don't exist in ./_data folder. """
    DATA_ROOT.mkdir(exist_ok=True)
    for url, name in FILES:
        path = (DATA_ROOT / name)
        if path.exists():
            continue
        print("downloading {}".format(url))
        path.write_bytes(requests.get(url).content)


def _compile_cities(path: Path, lowercase: bool=False):
    out_path = path.with_suffix('.dafsa')
    # if out_path.exists():
    #     return
    print("reading {}".format(path))
    df = read_geonames_zipped(str(path))
    if lowercase:
        df = _to_lower(df)
    print("compiling {}".format(out_path))
    dawg = to_dawg(df)
    dawg.save(str(out_path))


def _to_lower(df):
    return df.assign(
        main_name=df.main_name.str.lower(),
        asciiname=df.asciiname.str.lower(),
        alternatenames=df.alternatenames.str.lower(),
    )


def _read_full():
    path = DATA_ROOT / 'allCountries.zip'
    print("reading {}".format(path))
    return read_geonames_zipped(str(path))


def _compile_adm(df):
    codes = ['ADM1', 'ADM2', 'ADM3', 'ADM4']
    out_paths = [DATA_ROOT / "{}.dafsa".format(code.lower()) for code in codes]
    # if all(p.exists() for p in out_paths):
    #     return
    for code, out_path in zip(codes, out_paths):
        # if out_path.exists():
        #     continue
        print("compiling {}".format(out_path))
        df_adm = df[df.feature_code == code]
        dawg = to_dawg(df_adm)
        dawg.save(str(out_path))


def compile_gazetteers_contacts(lowercase=False):
    """ Compile geonames data downloaded by ``download_geonames``. """
    for name in ['cities1000.zip', 'cities5000.zip', 'cities15000.zip']:
        _compile_cities(DATA_ROOT / name, lowercase=lowercase)
    df = _read_full()
    if lowercase:
        df = _to_lower(df)
    _compile_adm(df)


if __name__ == '__main__':
    p = argparse.ArgumentParser()
    p.add_argument('--lower', action="store_true")
    args = p.parse_args()

    download_geonames()
    compile_gazetteers_contacts(args.lower)
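
For reference, a hypothetical programmatic equivalent of
``python3 -m ner.build_gazetteers --lower`` (an illustration of the
functions above, not part of this commit):

    from ner.build_gazetteers import download_geonames, compile_gazetteers_contacts

    download_geonames()                          # no-op for files already in _data/
    compile_gazetteers_contacts(lowercase=True)  # writes lowercased _data/*.dafsa files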

example/ner/cv.py

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
import numpy as np


def crf_cross_val_predict(pipe, X, y, cv, groups=None, n_folds=None):
    """
    Split data into folds according to cv iterator, do train/test prediction
    on first n_folds (or on all folds if n_folds is None).
    """
    X, y = np.array(X), np.array(y)
    y_pred = []
    y_true = []

    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y, groups)):
        if n_folds and idx >= n_folds:
            break

        X_train, X_dev = X[train_idx], X[test_idx]
        y_train, y_dev = y[train_idx], y[test_idx]
        pipe.fit(X_train, y_train, X_dev=X_dev, y_dev=y_dev)
        y_true.append(y_dev)
        y_pred.append(pipe.predict(X_dev))

    y_pred = np.hstack(y_pred)
    y_true = np.hstack(y_true)
    return y_pred, y_true
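
A hypothetical usage sketch (``pipe``, ``X``, ``y`` and ``site_ids`` are
assumptions standing in for objects built by the training code): grouping
pages by site keeps all pages of one site in a single fold, so no site
leaks between train and test.

    from sklearn.model_selection import GroupKFold
    from sklearn_crfsuite import metrics

    cv = GroupKFold(n_splits=5)  # all pages of a site end up in one fold
    y_pred, y_true = crf_cross_val_predict(pipe, X, y, cv=cv, groups=site_ids)
    # sequences are flattened to per-token labels for the report
    print(metrics.flat_classification_report(y_true, y_pred, digits=3))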

example/ner/data.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from itertools import chain
from pathlib import Path
from typing import List, Tuple, Any, Set

import webstruct

from .utils import pages_progress


WEBSTRUCT_DATA = Path(__name__).parent / ".." / "webstruct_data"
GAZETTEER_DATA = Path(__name__).parent / "_data"


KNOWN_ENTITIES = [
    'ORG', 'TEL', 'FAX', 'HOURS',
    'STREET', 'CITY', 'STATE', 'ZIPCODE', 'COUNTRY',
    'EMAIL', 'PER', 'FUNC', 'SUBJ'
]
CONTACT_ENTITIES = [
    'ORG', 'TEL', 'FAX', 'HOURS',
    'STREET', 'CITY', 'STATE', 'ZIPCODE', 'COUNTRY',
    'SUBJ',
]
ADDRESS_ENTITIES = [
    'STREET', 'CITY', 'STATE', 'ZIPCODE', 'COUNTRY',
]


def load_webstruct_data() -> List:
    """
    Load training data from webstruct repository.

    It is a mess: there are two folders which have OK data, one
    is stored in WebAnnotator format, another is stored in GATE format.
    """
    wa_loader = webstruct.WebAnnotatorLoader(known_entities=KNOWN_ENTITIES)
    gate_loader = webstruct.GateLoader(known_entities=KNOWN_ENTITIES)

    trees1 = webstruct.load_trees(
        str(WEBSTRUCT_DATA / "corpus/business_pages/wa/*.html"),
        loader=wa_loader,
    )

    trees2 = webstruct.load_trees(
        str(WEBSTRUCT_DATA / "corpus/us_contact_pages/annotated/*.xml"),
        loader=gate_loader
    )
    trees = chain(trees1, trees2)
    return list(pages_progress(trees, desc="Loading webstruct default annotated data"))


def load_countries() -> Set[str]:
    countries_path = WEBSTRUCT_DATA / 'gazetteers/countries/countries.txt'
    return set(countries_path.read_text().splitlines())
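
A sketch of how this loader might feed a tokenizer (assuming webstruct's
standard ``HtmlTokenizer`` API; note the relative paths above, so this is
expected to run from the ``example/`` directory):

    import webstruct
    from ner.data import load_webstruct_data

    trees = load_webstruct_data()
    html_tokenizer = webstruct.HtmlTokenizer()
    # X: sequences of HTML tokens per page, y: matching IOB-style tag sequences
    X, y = html_tokenizer.tokenize(trees)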
