diff --git a/setup.cfg b/setup.cfg
index 74ca7f4..c44e36c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -9,6 +9,7 @@ install_requires =
lark
neuroquery
gender-guesser
+ Unidecode
python_requires = >=3.7
[options.packages.find]
@@ -18,7 +19,10 @@ where = src
pubextract.participants._data = *
pubextract.methods_finder._data = *.csv
pubextract.authors._data = *
+pubextract.author_locations._data = *
[options.entry_points]
pubget.plugin_actions =
- get_pubget_actions = pubextract.participants._pubget:get_pubget_actions
\ No newline at end of file
+ get_pubget_actions-author_locations = pubextract.author_locations._pubget:get_pubget_actions
+ get_pubget_actions-demographics = pubextract.participants._pubget:get_pubget_actions
+
diff --git a/src/pubextract/author_locations/_data/__init__.py b/src/pubextract/author_locations/_data/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/pubextract/author_locations/_guessing_locations.py b/src/pubextract/author_locations/_guessing_locations.py
index 8694aad..5eebfab 100644
--- a/src/pubextract/author_locations/_guessing_locations.py
+++ b/src/pubextract/author_locations/_guessing_locations.py
@@ -4,22 +4,29 @@
from unidecode import unidecode
import pandas as pd
import numpy as np
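+import functools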
-import en_core_web_sm
from pubextract.author_locations import _reading_xml
-cities_path = Path(__file__).parent / "_data" / "worldcities.csv"
-WC = pd.read_csv(cities_path)
-WC = WC.dropna()
-COUNTRIES = set(WC["country"])
-CITIES = set(WC["city_ascii"])
-LOCATIONS = COUNTRIES.union(CITIES)
-COUNTRY_MAPPING = {
- "UK": "United Kingdom",
- "USA": "United States",
- "South Korea": "Korea, South",
-}
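+# Assumption: the gazetteer CSV never changes at runtime, so cache the result
+# rather than re-reading the file for every article.
+@functools.lru_cache(maxsize=1)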
+def _define_locations():
+ cities_path = Path(__file__).parent / "_data" / "worldcities.csv"
+ WORLD_CITIES = pd.read_csv(cities_path)
+ COUNTRIES = set(WORLD_CITIES["country"])
+ CITIES = set(WORLD_CITIES["city_ascii"])
+ LOCATIONS = COUNTRIES.union(CITIES)
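+    # Map common abbreviations onto the names used in worldcities.csv.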
+ COUNTRY_MAPPING = {
+ "UK": "United Kingdom",
+ "USA": "United States",
+ "South Korea": "Korea, South",
+ }
+ return {
+        "WORLD_CITIES": WORLD_CITIES,
+ "COUNTRIES": COUNTRIES,
+ "CITIES": CITIES,
+ "LOCATIONS": LOCATIONS,
+ "COUNTRY_MAPPING": COUNTRY_MAPPING,
+ }
def _preprocess_text(text):
@@ -41,27 +48,31 @@ def _preprocess_text(text):
return text
-def _get_entities(article_path):
- aff = _reading_xml._get_first_affiliation(article_path)
- aff = _preprocess_text(aff)
- nlp = en_core_web_sm.load()
- doc = nlp(aff)
- items = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
+def _get_entities(affiliation):
+    aff = _preprocess_text(affiliation)
+    items = []
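+    # Collect unigrams and adjacent-word bigrams so multi-word places such
+    # as "New York" can be matched.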
unigrams = aff.split(" ")
items = items + unigrams
for i, unigram in enumerate(unigrams[:-1]):
bigram = " ".join([unigram, unigrams[i+1]])
items.append(bigram)
- entities = [x for x in items if x in LOCATIONS]
+ locs = _define_locations()
+ CM = locs["COUNTRY_MAPPING"]
+    items = [CM.get(x, x) for x in items]
+ entities = [x for x in items if x in locs["LOCATIONS"]]
entities = [x.strip() for x in entities]
entities = list(set(entities))
return entities
def _get_location(ents):
- ents = [COUNTRY_MAPPING[x] if x in COUNTRY_MAPPING else x for x in ents]
- cities = CITIES.intersection(set(ents))
- countries = COUNTRIES.intersection(set(ents))
+ locs = _define_locations()
+ cities = locs["CITIES"].intersection(set(ents))
+ countries = locs["COUNTRIES"].intersection(set(ents))
+ WC = locs["WORLD_CITIES"]
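+    # Keep only gazetteer rows where both the city and its country were
+    # mentioned; this disambiguates city names shared across countries.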
i_ci = WC[WC["city_ascii"].isin(cities)].index
i_co = WC[WC["country"].isin(countries)].index
i = i_ci.intersection(i_co)
@@ -74,11 +85,15 @@ def _get_location(ents):
location = np.nan
return location
-# class Locations:
-# def __init__(self, article_path):
-# self.article_path = article_path
-# self.id = _reading_xml._get_id(article_path)
-# self.affiliation = _reading_xml._get_first_affiliation(article_path)
-# # self.tree = _reading._get_tree(article_path)
-# self.entities = self._get_entities()
-# self.locations = self._get_locations()
+
+class LocationGuesser:
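+    # Ties XML parsing to location guessing for a single article.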
+ def __init__(self, article_path):
+ self.article_path = article_path
+ self.tree = _reading_xml._get_tree(article_path)
+ self.id = _reading_xml._get_id(self.tree)
+
+ def get_location(self):
+ self.affiliation = _reading_xml._get_first_affiliation(self.tree)
+ self.entities = _get_entities(self.affiliation)
+ self.location = _get_location(self.entities)
diff --git a/src/pubextract/author_locations/_pubget.py b/src/pubextract/author_locations/_pubget.py
index 699aead..79704a4 100644
--- a/src/pubextract/author_locations/_pubget.py
+++ b/src/pubextract/author_locations/_pubget.py
@@ -2,8 +2,10 @@
from pathlib import Path
import pandas as pd
+import numpy as np
+import folium
-from pubextract.author_locations import _guessing_locations, _reading_xml
+from pubextract.author_locations import _guessing_locations
_STEP_NAME = "extract_author_locations"
@@ -11,27 +13,130 @@
_LOG = logging.getLogger(_STEP_NAME)
+def _create_map(df, output_dir):
+ df = df.dropna(subset="lng")
+ counts = df["country"].value_counts()
+    data = pd.DataFrame(
+        {"country": counts.index, "count": counts.values}
+    )
+
+ m = folium.Map(tiles="cartodb positron", zoom_start=2)
+ political_countries_url = (
+ "http://geojson.xyz/naturalearth-3.3.0/ne_50m_admin_0_countries.geojson"
+ )
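+    # Shade countries by the number of articles whose first affiliation they
+    # host, matched on the GeoJSON "name" property.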
+ folium.Choropleth(
+ geo_data=political_countries_url,
+ data=data,
+ columns=["country", "count"],
+ key_on="feature.properties.name",
+ ).add_to(m)
+
+    # Group markers that share coordinates; fixed-precision strings avoid
+    # float noise splitting identical locations.
+    df["str_coords"] = (
+        df["lat"].map("{:f}".format) + ", " + df["lng"].map("{:f}".format)
+    )
+    RADIUS = 0.01
+    for _, group in df.groupby("str_coords"):
+        base = group.iloc[0]
+        n = len(group)
+        # Spread co-located markers on a small circle so each stays clickable.
+        angles = np.linspace(0, 2 * np.pi, n, endpoint=False)
+        for (_, row), t in zip(group.iterrows(), angles):
+            if n == 1:
+                lng, lat = base["lng"], base["lat"]
+            else:
+                radius = RADIUS * n
+                lng = base["lng"] + radius * np.sin(t)
+                lat = base["lat"] + radius * np.cos(t)
+            folium.CircleMarker(
+                [lat, lng],
+                popup=f"{row.city}, {row.entities}",
+                radius=0.1,
+                color="#f9190076",
+                fill_color="#f9190076",
+            ).add_to(m)
+ m.save(output_dir / "author_locations_map.html")
+
+
def _extract_from_articles_dir(articles_dir, output_dir=None):
if output_dir is None:
output_dir = articles_dir.parent / "subset_allArticles_authorLocations"
else:
output_dir = Path(output_dir)
output_dir.mkdir(exist_ok=True)
+
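+    # pubget may hand us its extractedData directory; the article XML lives
+    # in the sibling "articles" directory.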
+ if articles_dir.name == "subset_allArticles_extractedData":
+ articles_dir = articles_dir.parent / "articles"
+ _LOG.info(f"Extracting author locations to {output_dir}")
ids = []
locations = []
entss = []
article_paths = list(articles_dir.glob("**/article.xml"))
for i_article, article_path in enumerate(article_paths):
- print("Processing article %d/%d" % (i_article, len(article_paths)), end="\r")
- ents = _guessing_locations._get_entities(article_path)
- location = _guessing_locations._get_location(ents)
-
- if not pd.isna(location):
- ids.append(_reading_xml._get_id(article_path))
- entss.append("; ".join(ents))
- locations.append(location)
- d = 1
+ print("Processing article %d/%d" % (i_article, len(article_paths)),
+ end="\r")
+ loc = _guessing_locations.LocationGuesser(article_path)
+ loc.get_location()
+ if not pd.isna(loc.location):
+ ids.append(loc.id)
+ entss.append("; ".join(loc.entities))
+ locations.append(loc.location)
+
df = pd.DataFrame.from_records(locations)
- df["entities"] = entss
- df["id"] = ids
- df.to_csv(output_dir / "author_locations.csv")
+ if not df.empty:
+ df["entities"] = entss
+ df["id"] = ids
+ df.to_csv(output_dir / "author_locations.csv")
+ _LOG.info(f"Done extracting author locations to {output_dir}")
+ _LOG.info("Creating map of author locations")
+ _create_map(df, output_dir)
+ _LOG.info(f"Done creating map of author locations in {output_dir}")
+ return output_dir, 0
+
+
+class AuthorLocationsStep:
+ name = _STEP_NAME
+ short_description = _STEP_DESCRIPTION
+
+ def edit_argument_parser(self, argument_parser) -> None:
+ argument_parser.add_argument(
+ "--author_locations",
+ action="store_true",
+ help="Extract the location from the first affiliation",
+ )
+
+ def run(self, args, previous_steps_output):
+ if not args.author_locations:
+ return None, 0
+        author_locations_dir = previous_steps_output.get(
+            "extract_author_locations_data"
+        )
+        if author_locations_dir is not None:
+            return _extract_from_articles_dir(author_locations_dir)
+        # No earlier step provided the data; extract from the articles.
+        return _extract_from_articles_dir(
+            previous_steps_output["extract_data"]
+        )
+
+
+class AuthorLocationsCommand:
+ name = _STEP_NAME
+ short_description = _STEP_DESCRIPTION
+
+ def edit_argument_parser(self, argument_parser) -> None:
+ argument_parser.add_argument(
+ "author_locations_dir",
+            help="Directory containing the articles downloaded by pubget "
+            "(or the 'subset_allArticles_extractedData' directory created "
+            "next to it); author locations are written to a sibling "
+            "directory whose name ends with 'authorLocations'.",
+ )
+
+ def run(self, args):
+ return _extract_from_articles_dir(Path(args.author_locations_dir))[1]
+
+
+def get_pubget_actions():
+ return {
+ "pipeline_steps": [AuthorLocationsStep()],
+ "commands": [AuthorLocationsCommand()]
+ }
diff --git a/src/pubextract/author_locations/_reading_xml.py b/src/pubextract/author_locations/_reading_xml.py
index a548390..90b468e 100644
--- a/src/pubextract/author_locations/_reading_xml.py
+++ b/src/pubextract/author_locations/_reading_xml.py
@@ -1,35 +1,33 @@
-from pathlib import Path
-import re
-from typing import List, Optional, Union, Tuple, Any, NewType
-import dataclasses
-
-from unidecode import unidecode
from lxml import etree
-import pandas as pd
-import en_core_web_sm
def _get_tree(article_path):
parser = etree.XMLParser(remove_blank_text=True)
- return etree.parse(article_path, parser)
+ tree = etree.parse(article_path, parser)
+ return tree
-def _get_id(article_path):
- tree = _get_tree(article_path)
+def _get_id(tree):
try:
- pmcid = tree.find("front/article-meta/article-id[@pub-id-type='pmc']").text
- id = "PMC%s" % pmcid
- except:
+ pmcid = (
+ tree.find("front/article-meta/article-id[@pub-id-type='pmc']").text
+ )
+ id_ = f"PMC{pmcid}"
+    except AttributeError:  # no PMC id element; fall back to the PMID
pmid = tree.xpath("//PMID/text()")[0]
- id = "Pubmed%s" % pmid
- return id
+ id_ = f"Pubmed{pmid}"
+ return id_
-def _get_first_affiliation(article_path):
- aff = ""
- for event, element in etree.iterparse(article_path):
- if element.tag == "aff" or element.tag == "Affiliation":
- aff = etree.tostring(element, with_tail=False, encoding="unicode")
- if aff:
- break
- return aff
+def _get_first_affiliation(tree):
+    # find() returns None when nothing matches (it never returns -1).
+    element = tree.find("//aff")
+    if element is None:
+        element = tree.find("//Affiliation")
+    if element is None:
+        return ""
+    return etree.tostring(element, with_tail=False, encoding="unicode")
diff --git a/src/pubextract/author_locations/test.py b/src/pubextract/author_locations/test.py
deleted file mode 100644
index 056452b..0000000
--- a/src/pubextract/author_locations/test.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from pathlib import Path
-
-from pubextract.author_locations import _pubget
-
-
-articles_dir = (
- Path(__file__).resolve().parents[5]
- / "data"
- / "pubget_data"
- / "review-neuro-meta-analyses_2023-06-29"
- / "query_a84b639ed7c2cc2d04c773db7c22905d"
- / "articles"
-)
-
-_pubget._extract_from_articles_dir(articles_dir)
\ No newline at end of file
diff --git a/tests/test_author_locations/data/example_xml_0.xml b/tests/test_author_locations/data/example_xml_0.xml
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_author_locations/data/example_xml_1.xml b/tests/test_author_locations/data/example_xml_1.xml
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_author_locations/test_guessing_locations.py b/tests/test_author_locations/test_guessing_locations.py
new file mode 100644
index 0000000..425c59a
--- /dev/null
+++ b/tests/test_author_locations/test_guessing_locations.py
@@ -0,0 +1,54 @@
+from pathlib import Path
+
+from pubextract.author_locations import _reading_xml, _guessing_locations
+
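+# NOTE: these tests assume each data/*.xml fixture contains an affiliation
+# element such as "University of Pittsburgh, New York, PA, USA".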
+
+def example_xml():
+    folder = Path(__file__).parent.joinpath("data")
+    # Yield paths, sorted for determinism; _get_tree expects a path, not the
+    # file's text.
+    for path in sorted(folder.glob("*.xml")):
+        yield path
+
+
+def test_define_locations():
+ locs = _guessing_locations._define_locations()
+ assert len(locs["LOCATIONS"]) > 0
+ assert "United Kingdom" in locs["LOCATIONS"]
+ assert "Pittsburgh" in locs["CITIES"]
+
+
+def test_preprocess_text():
+    text = "University of Pittsburgh, New York, PA, USA"
+ processed = _guessing_locations._preprocess_text(text)
+ assert processed == "Affiliation of Pittsburgh New York PA USA"
+
+
+def test_get_entities():
+    affiliation = "University of Pittsburgh, New York, PA, USA"
+    ents = _guessing_locations._get_entities(affiliation)
+    # _get_entities deduplicates through a set, so order is unstable, and
+    # "USA" is mapped to "United States" before filtering; compare as sets.
+    assert {"Pittsburgh", "New York", "United States"} <= set(ents)
+
+
+def test_get_location():
+    tree = _reading_xml._get_tree(next(example_xml()))
+    aff = _reading_xml._get_first_affiliation(tree)
+    # _get_location expects the entity list, not the raw affiliation string.
+    ents = _guessing_locations._get_entities(aff)
+    loc = _guessing_locations._get_location(ents)
+    assert loc["city"] == "Pittsburgh"
+
+
+def test_LocationGuesser():
+    article_path = next(example_xml())
+    guesser = _guessing_locations.LocationGuesser(article_path)
+    guesser.get_location()
+    assert guesser.location["city"] == "Pittsburgh"
+    assert guesser.location["country"] == "United States"
+    # Entity order is unstable (set-based), so compare as sets.
+    assert {"Pittsburgh", "New York", "United States"} <= set(guesser.entities)