From 8b60379b5ead85ad7373c8fd2dca64d30d8f2494 Mon Sep 17 00:00:00 2001
From: koudyk
Date: Mon, 4 Sep 2023 13:34:04 -0400
Subject: [PATCH] adding plugin for author locations - broken atm

---
 setup.cfg                                        |   6 +-
 .../author_locations/_data/__init__.py           |   0
 .../author_locations/_guessing_locations.py      |  75 ++++++----
 src/pubextract/author_locations/_pubget.py       | 131 ++++++++++++++++--
 .../author_locations/_reading_xml.py             |  46 +++---
 src/pubextract/author_locations/test.py          |  15 --
 .../data/example_xml_0.xml                       |   0
 .../data/example_xml_1.xml                       |   0
 .../test_guessing_locations.py                   |  54 ++++++++
 9 files changed, 244 insertions(+), 83 deletions(-)
 create mode 100644 src/pubextract/author_locations/_data/__init__.py
 delete mode 100644 src/pubextract/author_locations/test.py
 create mode 100644 tests/test_author_locations/data/example_xml_0.xml
 create mode 100644 tests/test_author_locations/data/example_xml_1.xml
 create mode 100644 tests/test_author_locations/test_guessing_locations.py

diff --git a/setup.cfg b/setup.cfg
index 74ca7f4..c44e36c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -9,6 +9,7 @@ install_requires =
     lark
     neuroquery
     gender-guesser
+    Unidecode
 python_requires = >=3.7
 
 [options.packages.find]
@@ -18,7 +19,10 @@ where = src
 pubextract.participants._data = *
 pubextract.methods_finder._data = *.csv
 pubextract.authors._data = *
+pubextract.author_locations._data = *
 
 [options.entry_points]
 pubget.plugin_actions =
-    get_pubget_actions = pubextract.participants._pubget:get_pubget_actions
\ No newline at end of file
+    get_pubget_actions-author_locations = pubextract.author_locations._pubget:get_pubget_actions
+    get_pubget_actions-demographics = pubextract.participants._pubget:get_pubget_actions
+
diff --git a/src/pubextract/author_locations/_data/__init__.py b/src/pubextract/author_locations/_data/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/pubextract/author_locations/_guessing_locations.py b/src/pubextract/author_locations/_guessing_locations.py
index 8694aad..5eebfab 100644
--- a/src/pubextract/author_locations/_guessing_locations.py
+++ b/src/pubextract/author_locations/_guessing_locations.py
@@ -4,22 +4,29 @@
 from unidecode import unidecode
 import pandas as pd
 import numpy as np
-import en_core_web_sm
+# import en_core_web_sm
 
 from pubextract.author_locations import _reading_xml
 
 
-cities_path = Path(__file__).parent / "_data" / "worldcities.csv"
-WC = pd.read_csv(cities_path)
-WC = WC.dropna()
-COUNTRIES = set(WC["country"])
-CITIES = set(WC["city_ascii"])
-LOCATIONS = COUNTRIES.union(CITIES)
-COUNTRY_MAPPING = {
-    "UK": "United Kingdom",
-    "USA": "United States",
-    "South Korea": "Korea, South",
-}
+def _define_locations():
+    cities_path = Path(__file__).parent / "_data" / "worldcities.csv"
+    WORLD_CITIES = pd.read_csv(cities_path)
+    COUNTRIES = set(WORLD_CITIES["country"])
+    CITIES = set(WORLD_CITIES["city_ascii"])
+    LOCATIONS = COUNTRIES.union(CITIES)
+    COUNTRY_MAPPING = {
+        "UK": "United Kingdom",
+        "USA": "United States",
+        "South Korea": "Korea, South",
+    }
+    return {
+        "WORLD_CITIES": WORLD_CITIES,
+        "COUNTRIES": COUNTRIES,
+        "CITIES": CITIES,
+        "LOCATIONS": LOCATIONS,
+        "COUNTRY_MAPPING": COUNTRY_MAPPING,
+    }
 
 
 def _preprocess_text(text):
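
Note: `_define_locations` rebuilds the gazetteer from `_data/worldcities.csv` on
every call. A minimal sketch of the mapping it returns, assuming the SimpleMaps
world-cities schema implied by the column names above (the two sample rows are
invented):

    import pandas as pd

    # Hypothetical two-row stand-in for _data/worldcities.csv.
    WORLD_CITIES = pd.DataFrame({
        "city_ascii": ["Pittsburgh", "Montreal"],
        "country": ["United States", "Canada"],
        "lat": [40.4397, 45.5089],
        "lng": [-79.9763, -73.5617],
    })
    LOCATIONS = set(WORLD_CITIES["country"]) | set(WORLD_CITIES["city_ascii"])
    assert "Pittsburgh" in LOCATIONS and "Canada" in LOCATIONS

Since the lookup tables are static, callers that guess locations for many
articles may want to cache the result of `_define_locations` instead of
re-reading the CSV per article.
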
@@ -41,27 +48,31 @@ def _preprocess_text(text):
     return text
 
 
-def _get_entities(article_path):
-    aff = _reading_xml._get_first_affiliation(article_path)
-    aff = _preprocess_text(aff)
-    nlp = en_core_web_sm.load()
-    doc = nlp(aff)
-    items = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
+def _get_entities(affiliation):
+    aff = _preprocess_text(affiliation)
+    # nlp = en_core_web_sm.load()
+    # doc = nlp(aff)
+    # items = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
+    items = []
     unigrams = aff.split(" ")
     items = items + unigrams
     for i, unigram in enumerate(unigrams[:-1]):
         bigram = " ".join([unigram, unigrams[i+1]])
         items.append(bigram)
-    entities = [x for x in items if x in LOCATIONS]
+    locs = _define_locations()
+    CM = locs["COUNTRY_MAPPING"]
+    items = [CM.get(x, x) for x in items]
+    entities = [x for x in items if x in locs["LOCATIONS"]]
     entities = [x.strip() for x in entities]
     entities = list(set(entities))
     return entities
 
 
 def _get_location(ents):
-    ents = [COUNTRY_MAPPING[x] if x in COUNTRY_MAPPING else x for x in ents]
-    cities = CITIES.intersection(set(ents))
-    countries = COUNTRIES.intersection(set(ents))
+    locs = _define_locations()
+    cities = locs["CITIES"].intersection(set(ents))
+    countries = locs["COUNTRIES"].intersection(set(ents))
+    WC = locs["WORLD_CITIES"]
     i_ci = WC[WC["city_ascii"].isin(cities)].index
     i_co = WC[WC["country"].isin(countries)].index
     i = i_ci.intersection(i_co)
@@ -74,11 +85,15 @@ def _get_location(ents):
         location = np.nan
     return location
 
-# class Locations:
-#     def __init__(self, article_path):
-#         self.article_path = article_path
-#         self.id = _reading_xml._get_id(article_path)
-#         self.affiliation = _reading_xml._get_first_affiliation(article_path)
-#         # self.tree = _reading._get_tree(article_path)
-#         self.entities = self._get_entities()
-#         self.locations = self._get_locations()
+
+class LocationGuesser:
+    def __init__(self, article_path):
+        self.article_path = article_path
+        self.tree = _reading_xml._get_tree(article_path)
+        self.id = _reading_xml._get_id(self.tree)
+        # self.metadata =
+
+    def get_location(self):
+        self.affiliation = _reading_xml._get_first_affiliation(self.tree)
+        self.entities = _get_entities(self.affiliation)
+        self.location = _get_location(self.entities)
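
Note: the pieces above chain together as follows; a rough usage sketch,
assuming an affiliation string is already in hand (the exact output depends on
what worldcities.csv contains):

    from pubextract.author_locations import _guessing_locations

    # Candidates are every unigram and adjacent bigram of the preprocessed
    # affiliation; COUNTRY_MAPPING expands abbreviations such as "USA".
    ents = _guessing_locations._get_entities(
        "University of Pittsburgh, Pittsburgh, PA, USA"
    )  # e.g. ["Pittsburgh", "United States"] (order varies: it is set-based)
    location = _guessing_locations._get_location(ents)
    # a matching worldcities.csv record (city, country, lat, lng, ...) or NaN
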
diff --git a/src/pubextract/author_locations/_pubget.py b/src/pubextract/author_locations/_pubget.py
index 699aead..79704a4 100644
--- a/src/pubextract/author_locations/_pubget.py
+++ b/src/pubextract/author_locations/_pubget.py
@@ -2,8 +2,10 @@
 from pathlib import Path
 
 import pandas as pd
+import numpy as np
+import folium
 
-from pubextract.author_locations import _guessing_locations, _reading_xml
+from pubextract.author_locations import _guessing_locations
 
 
 _STEP_NAME = "extract_author_locations"
@@ -11,27 +13,130 @@
 _LOG = logging.getLogger(_STEP_NAME)
 
 
+def _create_map(df, output_dir):
+    df = df.dropna(subset="lng")
+    counts = df["country"].value_counts()
+    data = pd.DataFrame(columns=["country", "count"])
+    data["country"] = list(counts.index)
+    data["count"] = list(counts.values)
+
+    m = folium.Map(tiles="cartodb positron", zoom_start=2)
+    political_countries_url = (
+        "http://geojson.xyz/naturalearth-3.3.0/ne_50m_admin_0_countries.geojson"
+    )
+    folium.Choropleth(
+        geo_data=political_countries_url,
+        data=data,
+        columns=["country", "count"],
+        popup=data["country"],
+        key_on="feature.properties.name",
+    ).add_to(m)
+
+    for ind, row in df.iterrows():
+        df.loc[ind, "str_coords"] = "%f, %f" % (row["lat"], row["lng"])
+    RADIUS = 0.01
+    for str_coords, group in df.groupby("str_coords"):
+        row = group.iloc[0]
+        LNG = row["lng"]
+        LAT = row["lat"]
+        n = len(group)
+        T = np.linspace(0, 2 * np.pi, n, endpoint=False)
+        popup = f"{row.city}, {row.entities}"
+        for (i, row), t in zip(group.iterrows(), T):
+            radius = RADIUS * n
+            lng = LNG + radius * np.sin(t)
+            lat = LAT + radius * np.cos(t)
+            if n == 1:
+                lng = LNG
+                lat = LAT
+            folium.CircleMarker(
+                [lat, lng],
+                popup=popup,
+                radius=0.1,
+                color="#f9190076",
+                fill_color="#f9190076",
+            ).add_to(m)
+    m.save(output_dir / "author_locations_map.html")
+
+
 def _extract_from_articles_dir(articles_dir, output_dir=None):
     if output_dir is None:
         output_dir = articles_dir.parent / "subset_allArticles_authorLocations"
     else:
         output_dir = Path(output_dir)
     output_dir.mkdir(exist_ok=True)
+
+    if articles_dir.name == "subset_allArticles_extractedData":
+        articles_dir = articles_dir.parent / "articles"
+
     _LOG.info(f"Extracting author locations to {output_dir}")
     ids = []
     locations = []
     entss = []
     article_paths = list(articles_dir.glob("**/article.xml"))
     for i_article, article_path in enumerate(article_paths):
-        print("Processing article %d/%d" % (i_article, len(article_paths)), end="\r")
-        ents = _guessing_locations._get_entities(article_path)
-        location = _guessing_locations._get_location(ents)
-
-        if not pd.isna(location):
-            ids.append(_reading_xml._get_id(article_path))
-            entss.append("; ".join(ents))
-            locations.append(location)
-    d = 1
+        print("Processing article %d/%d" % (i_article + 1, len(article_paths)),
+              end="\r")
+        guesser = _guessing_locations.LocationGuesser(article_path)
+        guesser.get_location()
+        if not pd.isna(guesser.location):
+            ids.append(guesser.id)
+            entss.append("; ".join(guesser.entities))
+            locations.append(guesser.location)
+
     df = pd.DataFrame.from_records(locations)
-    df["entities"] = entss
-    df["id"] = ids
-    df.to_csv(output_dir / "author_locations.csv")
+    if not df.empty:
+        df["entities"] = entss
+        df["id"] = ids
+        df.to_csv(output_dir / "author_locations.csv")
+        _LOG.info(f"Done extracting author locations to {output_dir}")
+        _LOG.info("Creating map of author locations")
+        _create_map(df, output_dir)
+        _LOG.info(f"Done creating map of author locations in {output_dir}")
+    return output_dir, 0
+
+
+class AuthorLocationsStep:
+    name = _STEP_NAME
+    short_description = _STEP_DESCRIPTION
+
+    def edit_argument_parser(self, argument_parser) -> None:
+        argument_parser.add_argument(
+            "--author_locations",
+            action="store_true",
+            help="Extract the location from the first affiliation",
+        )
+
+    def run(self, args, previous_steps_output):
+        if not args.author_locations:
+            return None, 0
+        author_locations_dir = (
+            previous_steps_output.get("extract_author_locations_data")
+        )
+        if author_locations_dir is None:
+            author_locations_dir, _ = _extract_from_articles_dir(
+                previous_steps_output["extract_data"]
+            )
+        return _extract_from_articles_dir(author_locations_dir)
+
+
+class AuthorLocationsCommand:
+    name = _STEP_NAME
+    short_description = _STEP_DESCRIPTION
+
+    def edit_argument_parser(self, argument_parser) -> None:
+        argument_parser.add_argument(
+            "author_locations_dir",
+            help="Directory containing author locations. "
+            "It is a directory created by pubget with the '--author_locations'"
+            " option, whose name ends with 'authorLocations'.",
+        )
+
+    def run(self, args):
+        return _extract_from_articles_dir(Path(args.author_locations_dir))[1]
+
+
+def get_pubget_actions():
+    return {
+        "pipeline_steps": [AuthorLocationsStep()],
+        "commands": [AuthorLocationsCommand()],
+    }
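
Note: the `CircleMarker` loop in `_create_map` spreads the n markers that share
identical coordinates evenly around a small circle so they stay individually
clickable. Stripped of the folium calls, the geometry is just:

    import numpy as np

    LAT, LNG, RADIUS, n = 40.44, -79.98, 0.01, 4
    for t in np.linspace(0, 2 * np.pi, n, endpoint=False):
        # one marker per angle; the circle grows with the number of markers
        lat = LAT + n * RADIUS * np.cos(t)
        lng = LNG + n * RADIUS * np.sin(t)
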
diff --git a/src/pubextract/author_locations/_reading_xml.py b/src/pubextract/author_locations/_reading_xml.py
index a548390..90b468e 100644
--- a/src/pubextract/author_locations/_reading_xml.py
+++ b/src/pubextract/author_locations/_reading_xml.py
@@ -1,35 +1,33 @@
-from pathlib import Path
-import re
-from typing import List, Optional, Union, Tuple, Any, NewType
-import dataclasses
-
-from unidecode import unidecode
 from lxml import etree
-import pandas as pd
-import en_core_web_sm
 
 
 def _get_tree(article_path):
     parser = etree.XMLParser(remove_blank_text=True)
-    return etree.parse(article_path, parser)
+    tree = etree.parse(article_path, parser)
+    return tree
 
 
-def _get_id(article_path):
-    tree = _get_tree(article_path)
+def _get_id(tree):
     try:
-        pmcid = tree.find("front/article-meta/article-id[@pub-id-type='pmc']").text
-        id = "PMC%s" % pmcid
-    except:
+        pmcid = (
+            tree.find("front/article-meta/article-id[@pub-id-type='pmc']").text
+        )
+        id_ = f"PMC{pmcid}"
+    except AttributeError:  # no PMC ID element; fall back to the PMID
+        pmid = tree.xpath("//PMID/text()")[0]
-        id = "Pubmed%s" % pmid
-    return id
+        id_ = f"Pubmed{pmid}"
+    return id_
 
 
-def _get_first_affiliation(article_path):
-    aff = ""
-    for event, element in etree.iterparse(article_path):
-        if element.tag == "aff" or element.tag == "Affiliation":
-            aff = etree.tostring(element, with_tail=False, encoding="unicode")
-            if aff:
-                break
-    return aff
+def _get_first_affiliation(tree):
+    affiliation = ""
+    # lxml's find returns None when nothing matches, never -1
+    element = tree.find(".//aff")
+    if element is None:
+        element = tree.find(".//Affiliation")
+    if element is not None:
+        affiliation = etree.tostring(
+            element, with_tail=False, encoding="unicode"
+        )
+    return affiliation
diff --git a/src/pubextract/author_locations/test.py b/src/pubextract/author_locations/test.py
deleted file mode 100644
index 056452b..0000000
--- a/src/pubextract/author_locations/test.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from pathlib import Path
-
-from pubextract.author_locations import _pubget
-
-
-articles_dir = (
-    Path(__file__).resolve().parents[5]
-    / "data"
-    / "pubget_data"
-    / "review-neuro-meta-analyses_2023-06-29"
-    / "query_a84b639ed7c2cc2d04c773db7c22905d"
-    / "articles"
-)
-
-_pubget._extract_from_articles_dir(articles_dir)
\ No newline at end of file
diff --git a/tests/test_author_locations/data/example_xml_0.xml b/tests/test_author_locations/data/example_xml_0.xml
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_author_locations/data/example_xml_1.xml b/tests/test_author_locations/data/example_xml_1.xml
new file mode 100644
index 0000000..e69de29
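
Note: both committed fixtures are empty blobs (index e69de29), so the tests
below cannot pass yet. A fixture along these lines (hypothetical content,
JATS-like, matching what the assertions expect) would exercise `_reading_xml`
end to end:

    from lxml import etree

    xml = (
        b"<article><front><article-meta>"
        b"<article-id pub-id-type='pmc'>1234567</article-id>"
        b"<aff>University of Pittsburgh, New York, PA, USA</aff>"
        b"</article-meta></front></article>"
    )
    tree = etree.ElementTree(etree.fromstring(xml))
    assert tree.find(".//Affiliation") is None  # no match gives None, not -1
    assert "Pittsburgh" in etree.tostring(
        tree.find(".//aff"), encoding="unicode"
    )
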
diff --git a/tests/test_author_locations/test_guessing_locations.py b/tests/test_author_locations/test_guessing_locations.py
new file mode 100644
index 0000000..425c59a
--- /dev/null
+++ b/tests/test_author_locations/test_guessing_locations.py
@@ -0,0 +1,54 @@
+from pathlib import Path
+
+from pubextract.author_locations import _reading_xml, _guessing_locations
+
+
+def example_xml():
+    folder = Path(__file__).parent.joinpath("data")
+    for path in folder.glob("*.xml"):
+        yield str(path)
+
+
+def test_define_locations():
+    locs = _guessing_locations._define_locations()
+    assert len(locs["LOCATIONS"]) > 0
+    assert "United Kingdom" in locs["LOCATIONS"]
+    assert "Pittsburgh" in locs["CITIES"]
+
+
+def test_preprocess_text():
+    text = "University of Pittsburgh, New York, PA, USA"
+    processed = _guessing_locations._preprocess_text(text)
+    assert processed == "Affiliation of Pittsburgh New York PA USA"
+
+
+def test_get_entities():
+    affiliation = "University of Pittsburgh, New York, PA, USA"
+    ents = _guessing_locations._get_entities(affiliation)
+    # _get_entities deduplicates via a set, so compare as sets; COUNTRY_MAPPING
+    # expands "USA" to "United States" before the gazetteer lookup
+    assert set(ents) == {
+        "Affiliation", "Pittsburgh", "New York", "PA", "United States"
+    }
+
+
+def test_get_location():
+    tree = _reading_xml._get_tree(next(example_xml()))
+    aff = _reading_xml._get_first_affiliation(tree)
+    ents = _guessing_locations._get_entities(aff)
+    loc = _guessing_locations._get_location(ents)
+    assert loc["city"] == "Pittsburgh"
+
+
+def test_LocationGuesser():
+    article_path = next(example_xml())
+    guesser = _guessing_locations.LocationGuesser(article_path)
+    guesser.get_location()
+    assert guesser.location["city"] == "Pittsburgh"
+    assert guesser.location["country"] == "United States"
+    assert set(guesser.entities) == {
+        "Affiliation", "Pittsburgh", "New York", "PA", "United States"
+    }
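
Note: with the entry point registered in setup.cfg, pubget should discover the
plugin and expose the new `--author_locations` flag on its `run` command (per
pubget's plugin convention; untested here, as the commit message says). The
deleted ad-hoc test.py amounted to the direct call, which remains the quickest
smoke test (the path below is a placeholder for a real pubget articles
directory):

    from pathlib import Path

    from pubextract.author_locations import _pubget

    # articles_dir is a pubget "articles" directory,
    # e.g. <data_dir>/query_<hash>/articles
    _pubget._extract_from_articles_dir(Path("pubget_data") / "articles")
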