1+ #!/usr/bin/env python3
2+ import argparse
3+ from pathlib import Path
4+
5+ import requests
6+
7+ from webstruct .gazetteers .geonames import read_geonames_zipped , to_dawg
8+
9+
# Base URL of the geonames dump directory; every archive below lives under it.
_GEONAMES_DUMP_URL = 'http://download.geonames.org/export/dump/'

# (download URL, local file name) pairs for each geonames archive.
FILES = [
    (_GEONAMES_DUMP_URL + archive, archive)
    for archive in (
        'allCountries.zip',
        'cities1000.zip',
        'cities5000.zip',
        'cities15000.zip',
    )
]

# Directory (relative to the current working directory) that holds the
# downloaded archives and the compiled output files.
DATA_ROOT = Path('_data')
18+
19+
def download_geonames():
    """ Download geonames files if they don't exist in ./_data folder.

    Each archive listed in ``FILES`` is streamed to a temporary ``.part``
    file and renamed to its final name only after the download succeeded,
    so an interrupted or failed download is never mistaken for a complete
    archive on the next run.

    Raises ``requests.HTTPError`` when the server answers with an error
    status, and other ``requests`` exceptions on connection problems or
    timeouts.
    """
    DATA_ROOT.mkdir(exist_ok=True)
    for url, name in FILES:
        path = DATA_ROOT / name
        if path.exists():
            continue
        print("downloading {}".format(url))
        tmp_path = path.with_name(name + '.part')
        # stream=True avoids holding a whole (potentially very large)
        # archive in memory; the timeout bounds connect time and each
        # individual read, not the total download time.
        with requests.get(url, stream=True, timeout=60) as response:
            # Fail loudly instead of saving an HTTP error page as a .zip.
            response.raise_for_status()
            with tmp_path.open('wb') as out:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    out.write(chunk)
        # Publish the file under its final name only once it is complete.
        tmp_path.replace(path)
29+
30+
def _compile_cities(path: Path, lowercase: bool = False):
    """ Compile one zipped geonames cities file into a ``.dafsa`` file.

    The output is written next to ``path``, with the suffix replaced by
    ``.dafsa``. When ``lowercase`` is true the name columns are lowercased
    before compiling.
    """
    # NOTE: the output is always rebuilt, even if it already exists.
    target = path.with_suffix('.dafsa')
    print("reading {}".format(path))
    cities = read_geonames_zipped(str(path))
    cities = _to_lower(cities) if lowercase else cities
    print("compiling {}".format(target))
    to_dawg(cities).save(str(target))
42+
43+
44+ def _to_lower (df ):
45+ return df .assign (
46+ main_name = df .main_name .str .lower (),
47+ asciiname = df .asciiname .str .lower (),
48+ alternatenames = df .alternatenames .str .lower (),
49+ )
50+
51+
def _read_full():
    """ Read ``allCountries.zip`` from ``DATA_ROOT`` and return its data. """
    full_dump = DATA_ROOT / 'allCountries.zip'
    print("reading {}".format(full_dump))
    df = read_geonames_zipped(str(full_dump))
    return df
56+
57+
def _compile_adm(df):
    """ Compile one ``.dafsa`` file per ADM1..ADM4 feature code.

    For every code, the rows of ``df`` whose ``feature_code`` equals that
    code are compiled and saved as ``DATA_ROOT/<code in lowercase>.dafsa``.
    """
    # NOTE: outputs are always rebuilt, even if they already exist.
    for code in ('ADM1', 'ADM2', 'ADM3', 'ADM4'):
        out_path = DATA_ROOT / "{}.dafsa".format(code.lower())
        print("compiling {}".format(out_path))
        matching_rows = df[df.feature_code == code]
        to_dawg(matching_rows).save(str(out_path))
70+
71+
def compile_gazetteers_contacts(lowercase=False):
    """ Compile geonames data downloaded by ``download_geonames``.

    Compiles the three cities archives first, then the ADM1..ADM4 subsets
    of the full ``allCountries.zip`` dump. With ``lowercase=True`` the name
    columns are lowercased before compiling.
    """
    city_archives = ('cities1000.zip', 'cities5000.zip', 'cities15000.zip')
    for archive in city_archives:
        _compile_cities(DATA_ROOT / archive, lowercase=lowercase)
    full_df = _read_full()
    _compile_adm(_to_lower(full_df) if lowercase else full_df)
80+
81+
if __name__ == '__main__':
    # Command-line entry point: fetch any missing archives, then compile
    # them; --lower lowercases the name columns before compiling.
    parser = argparse.ArgumentParser()
    parser.add_argument('--lower', action="store_true")
    options = parser.parse_args()

    download_geonames()
    compile_gazetteers_contacts(lowercase=options.lower)
0 commit comments