6 changes: 5 additions & 1 deletion setup.cfg
@@ -9,6 +9,7 @@ install_requires =
lark
neuroquery
gender-guesser
Unidecode
python_requires = >=3.7

[options.packages.find]
@@ -18,7 +19,10 @@ where = src
pubextract.participants._data = *
pubextract.methods_finder._data = *.csv
pubextract.authors._data = *
pubextract.author_locations._data = *

[options.entry_points]
pubget.plugin_actions =
get_pubget_actions = pubextract.participants._pubget:get_pubget_actions
get_pubget_actions-author_locations = pubextract.author_locations._pubget:get_pubget_actions
get_pubget_actions-demographics = pubextract.participants._pubget:get_pubget_actions

Empty file.
75 changes: 45 additions & 30 deletions src/pubextract/author_locations/_guessing_locations.py
@@ -4,22 +4,29 @@
from unidecode import unidecode
import pandas as pd
import numpy as np
import en_core_web_sm
# import en_core_web_sm

from pubextract.author_locations import _reading_xml


cities_path = Path(__file__).parent / "_data" / "worldcities.csv"
WC = pd.read_csv(cities_path)
WC = WC.dropna()
COUNTRIES = set(WC["country"])
CITIES = set(WC["city_ascii"])
LOCATIONS = COUNTRIES.union(CITIES)
COUNTRY_MAPPING = {
"UK": "United Kingdom",
"USA": "United States",
"South Korea": "Korea, South",
}
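# Build the location lookup tables from the bundled worldcities CSV;
# COUNTRY_MAPPING normalizes common country-name variants (e.g. "USA")
# to the spelling used in that CSV.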
def _define_locations():
cities_path = Path(__file__).parent / "_data" / "worldcities.csv"
WORLD_CITIES = pd.read_csv(cities_path)
COUNTRIES = set(WORLD_CITIES["country"])
CITIES = set(WORLD_CITIES["city_ascii"])
LOCATIONS = COUNTRIES.union(CITIES)
COUNTRY_MAPPING = {
"UK": "United Kingdom",
"USA": "United States",
"South Korea": "Korea, South",
}
return {
        "WORLD_CITIES": WORLD_CITIES,
"COUNTRIES": COUNTRIES,
"CITIES": CITIES,
"LOCATIONS": LOCATIONS,
"COUNTRY_MAPPING": COUNTRY_MAPPING,
}


def _preprocess_text(text):
@@ -41,27 +48,31 @@ def _preprocess_text(text):
return text


def _get_entities(article_path):
aff = _reading_xml._get_first_affiliation(article_path)
aff = _preprocess_text(aff)
nlp = en_core_web_sm.load()
doc = nlp(aff)
items = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
def _get_entities(affiliation):
aff = _preprocess_text(affiliation)
# nlp = en_core_web_sm.load()
# doc = nlp(aff)
# items = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
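    # Fall back to plain string matching: collect all unigrams and bigrams
    # from the affiliation and keep those that name a known city or country.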
items = []
unigrams = aff.split(" ")
items = items + unigrams
for i, unigram in enumerate(unigrams[:-1]):
bigram = " ".join([unigram, unigrams[i+1]])
items.append(bigram)
entities = [x for x in items if x in LOCATIONS]
locs = _define_locations()
CM = locs["COUNTRY_MAPPING"]
items = [CM[x] if x in CM else x for x in items]
entities = [x for x in items if x in locs["LOCATIONS"]]
entities = [x.strip() for x in entities]
entities = list(set(entities))
return entities


def _get_location(ents):
ents = [COUNTRY_MAPPING[x] if x in COUNTRY_MAPPING else x for x in ents]
cities = CITIES.intersection(set(ents))
countries = COUNTRIES.intersection(set(ents))
locs = _define_locations()
cities = locs["CITIES"].intersection(set(ents))
countries = locs["COUNTRIES"].intersection(set(ents))
WC = locs["WORLD_CITIES"]
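    # Intersect the row indices so a city only counts when its country was
    # also mentioned, disambiguating city names shared across countries.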
i_ci = WC[WC["city_ascii"].isin(cities)].index
i_co = WC[WC["country"].isin(countries)].index
i = i_ci.intersection(i_co)
@@ -74,11 +85,15 @@ def _get_location(ents):
location = np.nan
return location

# class Locations:
# def __init__(self, article_path):
# self.article_path = article_path
# self.id = _reading_xml._get_id(article_path)
# self.affiliation = _reading_xml._get_first_affiliation(article_path)
# # self.tree = _reading._get_tree(article_path)
# self.entities = self._get_entities()
# self.locations = self._get_locations()

class LocationGuesser:
def __init__(self, article_path):
self.article_path = article_path
self.tree = _reading_xml._get_tree(article_path)
self.id = _reading_xml._get_id(self.tree)
# self.metadata =

def get_location(self):
self.affiliation = _reading_xml._get_first_affiliation(self.tree)
self.entities = _get_entities(self.affiliation)
self.location = _get_location(self.entities)
131 changes: 118 additions & 13 deletions src/pubextract/author_locations/_pubget.py
@@ -2,36 +2,141 @@
from pathlib import Path

import pandas as pd
import numpy as np
import folium

from pubextract.author_locations import _guessing_locations, _reading_xml
from pubextract.author_locations import _guessing_locations


_STEP_NAME = "extract_author_locations"
_STEP_DESCRIPTION = "Extract author locations from studies' text."
_LOG = logging.getLogger(_STEP_NAME)


def _create_map(df, output_dir):
df = df.dropna(subset="lng")
counts = df["country"].value_counts()
data = pd.DataFrame(columns=["country", "count"])
data["country"] = list(counts.index)
data["count"] = list(counts.values)

m = folium.Map(tiles="cartodb positron", zoom_start=2)
political_countries_url = (
"http://geojson.xyz/naturalearth-3.3.0/ne_50m_admin_0_countries.geojson"
)
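    # The choropleth joins counts to the Natural Earth polygons by country
    # name ("feature.properties.name"), so values in the "country" column
    # must match the geojson spellings.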
folium.Choropleth(
geo_data=political_countries_url,
data=data,
columns=["country", "count"],
popup=data["country"],
key_on="feature.properties.name",
).add_to(m)

for ind, row in df.iterrows():
df.loc[ind, "str_coords"] = "%f, %f" % (row["lat"], row["lng"])
RADIUS = .01
for str_coords, group in df.groupby("str_coords"):
row = group.iloc[0]
LNG = row["lng"]
LAT = row["lat"]
n = len(group)
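        # Markers sharing the same coordinates are spread evenly around a
        # small circle (radius grows with the group size) so each one stays
        # individually clickable.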
T = np.linspace(0, 2*np.pi, n, endpoint=False)
popup = f"{row.city}, {row.entities}"
for (i, row), t in zip(group.iterrows(), T):
radius = RADIUS * n
lng = LNG + radius * np.sin(t)
lat = LAT + radius * np.cos(t)
if n == 1:
lng = LNG
lat = LAT
folium.CircleMarker(
[lat, lng],
popup=popup,
radius=.1,
color="#f9190076",
fill_color="#f9190076",
).add_to(m)
m.save(output_dir / "author_locations_map.html")


def _extract_from_articles_dir(articles_dir, output_dir=None):
if output_dir is None:
output_dir = articles_dir.parent / "subset_allArticles_authorLocations"
else:
output_dir = Path(output_dir)
output_dir.mkdir(exist_ok=True)

if articles_dir.name == "subset_allArticles_extractedData":
articles_dir = articles_dir.parent / "articles"
_LOG.info(f"Extracting author locations to {output_dir}")
ids = []
locations = []
entss = []
article_paths = list(articles_dir.glob("**/article.xml"))
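    # Guess a location for every downloaded article; articles whose first
    # affiliation yields no recognizable location are skipped.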
for i_article, article_path in enumerate(article_paths):
print("Processing article %d/%d" % (i_article, len(article_paths)), end="\r")
ents = _guessing_locations._get_entities(article_path)
location = _guessing_locations._get_location(ents)

if not pd.isna(location):
ids.append(_reading_xml._get_id(article_path))
entss.append("; ".join(ents))
locations.append(location)
print("Processing article %d/%d" % (i_article, len(article_paths)),
end="\r")
loc = _guessing_locations.LocationGuesser(article_path)
loc.get_location()
if not pd.isna(loc.location):
ids.append(loc.id)
entss.append("; ".join(loc.entities))
locations.append(loc.location)

df = pd.DataFrame.from_records(locations)
df["entities"] = entss
df["id"] = ids
df.to_csv(output_dir / "author_locations.csv")
if not df.empty:
df["entities"] = entss
df["id"] = ids
df.to_csv(output_dir / "author_locations.csv")
_LOG.info(f"Done extracting author locations to {output_dir}")
_LOG.info("Creating map of author locations")
_create_map(df, output_dir)
_LOG.info(f"Done creating map of author locations in {output_dir}")
return output_dir, 0


class AuthorLocationsStep:
name = _STEP_NAME
short_description = _STEP_DESCRIPTION

def edit_argument_parser(self, argument_parser) -> None:
argument_parser.add_argument(
"--author_locations",
action="store_true",
help="Extract the location from the first affiliation",
)

def run(self, args, previous_steps_output):
if not args.author_locations:
return None, 0
author_locations_dir = (
previous_steps_output.get("extract_author_locations_data")
)
if author_locations_dir is None:
author_locations_dir, _ = _extract_from_articles_dir(
previous_steps_output["extract_data"]
)
return _extract_from_articles_dir(author_locations_dir)


class AuthorLocationsCommand:
name = _STEP_NAME
short_description = _STEP_DESCRIPTION

def edit_argument_parser(self, argument_parser) -> None:
argument_parser.add_argument(
"author_locations_dir",
help="Directory containing author locations. "
"It is a directory created by pubget with the '--author_locations'"
" option, whose name ends with 'authorLocations'."
)

def run(self, args):
return _extract_from_articles_dir(Path(args.author_locations_dir))[1]


def get_pubget_actions():
return {
"pipeline_steps": [AuthorLocationsStep()],
"commands": [AuthorLocationsCommand()]
}
46 changes: 22 additions & 24 deletions src/pubextract/author_locations/_reading_xml.py
@@ -1,35 +1,33 @@
from pathlib import Path
import re
from typing import List, Optional, Union, Tuple, Any, NewType
import dataclasses

from unidecode import unidecode
from lxml import etree
import pandas as pd
import en_core_web_sm


def _get_tree(article_path):
parser = etree.XMLParser(remove_blank_text=True)
return etree.parse(article_path, parser)
tree = etree.parse(article_path, parser)
return tree


def _get_id(article_path):
tree = _get_tree(article_path)
def _get_id(tree):
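    # Prefer the PMC id (JATS XML); fall back to the PMID for
    # PubMed-format XML.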
try:
pmcid = tree.find("front/article-meta/article-id[@pub-id-type='pmc']").text
id = "PMC%s" % pmcid
except:
pmcid = (
tree.find("front/article-meta/article-id[@pub-id-type='pmc']").text
)
id_ = f"PMC{pmcid}"
    except AttributeError:  # find() returned None: no PMC id element
pmid = tree.xpath("//PMID/text()")[0]
id = "Pubmed%s" % pmid
return id
id_ = f"Pubmed{pmid}"
return id_


def _get_first_affiliation(article_path):
aff = ""
for event, element in etree.iterparse(article_path):
if element.tag == "aff" or element.tag == "Affiliation":
aff = etree.tostring(element, with_tail=False, encoding="unicode")
if aff:
break
return aff
def _get_first_affiliation(tree):
affiliation = ""
element = tree.find("//aff")
    if element is None:
element = tree.find("//Affiliation")
if element is not None:
affiliation = etree.tostring(
element, with_tail=False, encoding="unicode"
)
else:
affiliation = ""
return affiliation
15 changes: 0 additions & 15 deletions src/pubextract/author_locations/test.py

This file was deleted.

Empty file.
Empty file.