From 8b60379b5ead85ad7373c8fd2dca64d30d8f2494 Mon Sep 17 00:00:00 2001
From: koudyk
Date: Mon, 4 Sep 2023 13:34:04 -0400
Subject: [PATCH] adding plugin for author locations - broken atm

---
 setup.cfg                                        |   6 +-
 .../author_locations/_data/__init__.py           |   0
 .../author_locations/_guessing_locations.py      |  75 ++++++----
 src/pubextract/author_locations/_pubget.py       | 131 ++++++++++++++++--
 .../author_locations/_reading_xml.py             |  46 +++---
 src/pubextract/author_locations/test.py          |  15 --
 .../data/example_xml_0.xml                       |   0
 .../data/example_xml_1.xml                       |   0
 .../test_guessing_locations.py                   |  54 ++++++++
 9 files changed, 244 insertions(+), 83 deletions(-)
 create mode 100644 src/pubextract/author_locations/_data/__init__.py
 delete mode 100644 src/pubextract/author_locations/test.py
 create mode 100644 tests/test_author_locations/data/example_xml_0.xml
 create mode 100644 tests/test_author_locations/data/example_xml_1.xml
 create mode 100644 tests/test_author_locations/test_guessing_locations.py

diff --git a/setup.cfg b/setup.cfg
index 74ca7f4..c44e36c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -9,6 +9,7 @@ install_requires =
     lark
     neuroquery
     gender-guesser
+    Unidecode
 python_requires = >=3.7
 
 [options.packages.find]
@@ -18,7 +19,10 @@ where = src
 pubextract.participants._data = *
 pubextract.methods_finder._data = *.csv
 pubextract.authors._data = *
+pubextract.author_locations._data = *
 
 [options.entry_points]
 pubget.plugin_actions =
-    get_pubget_actions = pubextract.participants._pubget:get_pubget_actions
\ No newline at end of file
+    get_pubget_actions-author_locations = pubextract.author_locations._pubget:get_pubget_actions
+    get_pubget_actions-demographics = pubextract.participants._pubget:get_pubget_actions
+
diff --git a/src/pubextract/author_locations/_data/__init__.py b/src/pubextract/author_locations/_data/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/pubextract/author_locations/_guessing_locations.py b/src/pubextract/author_locations/_guessing_locations.py
index 8694aad..5eebfab 100644
--- a/src/pubextract/author_locations/_guessing_locations.py
+++ b/src/pubextract/author_locations/_guessing_locations.py
@@ -4,22 +4,29 @@
 from unidecode import unidecode
 import pandas as pd
 import numpy as np
-import en_core_web_sm
+# import en_core_web_sm
 
 from pubextract.author_locations import _reading_xml
 
 
-cities_path = Path(__file__).parent / "_data" / "worldcities.csv"
-WC = pd.read_csv(cities_path)
-WC = WC.dropna()
-COUNTRIES = set(WC["country"])
-CITIES = set(WC["city_ascii"])
-LOCATIONS = COUNTRIES.union(CITIES)
-COUNTRY_MAPPING = {
-    "UK": "United Kingdom",
-    "USA": "United States",
-    "South Korea": "Korea, South",
-}
+def _define_locations():
+    cities_path = Path(__file__).parent / "_data" / "worldcities.csv"
+    WORLD_CITIES = pd.read_csv(cities_path)
+    COUNTRIES = set(WORLD_CITIES["country"])
+    CITIES = set(WORLD_CITIES["city_ascii"])
+    LOCATIONS = COUNTRIES.union(CITIES)
+    COUNTRY_MAPPING = {
+        "UK": "United Kingdom",
+        "USA": "United States",
+        "South Korea": "Korea, South",
+    }
+    return {
+        "WORLD_CITIES": WORLD_CITIES,
+        "COUNTRIES": COUNTRIES,
+        "CITIES": CITIES,
+        "LOCATIONS": LOCATIONS,
+        "COUNTRY_MAPPING": COUNTRY_MAPPING,
+    }
 
 
 def _preprocess_text(text):
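
Note: `_define_locations` rebuilds the gazetteer from `_data/worldcities.csv` on
every call. A minimal sketch of the mapping it returns, assuming the SimpleMaps
world-cities schema implied by the column names above (the two sample rows are
invented):

    import pandas as pd

    # Hypothetical two-row stand-in for _data/worldcities.csv.
    WORLD_CITIES = pd.DataFrame({
        "city_ascii": ["Pittsburgh", "Montreal"],
        "country": ["United States", "Canada"],
        "lat": [40.4397, 45.5089],
        "lng": [-79.9763, -73.5617],
    })
    LOCATIONS = set(WORLD_CITIES["country"]) | set(WORLD_CITIES["city_ascii"])
    assert "Pittsburgh" in LOCATIONS and "Canada" in LOCATIONS

Since the lookup tables are static, callers that guess locations for many
articles may want to cache the result of `_define_locations` instead of
re-reading the CSV per article.
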
@@ -41,27 +48,31 @@ def _preprocess_text(text):
     return text
 
 
-def _get_entities(article_path):
-    aff = _reading_xml._get_first_affiliation(article_path)
-    aff = _preprocess_text(aff)
-    nlp = en_core_web_sm.load()
-    doc = nlp(aff)
-    items = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
+def _get_entities(affiliation):
+    aff = _preprocess_text(affiliation)
+    # nlp = en_core_web_sm.load()
+    # doc = nlp(aff)
+    # items = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
+    items = []
     unigrams = aff.split(" ")
     items = items + unigrams
     for i, unigram in enumerate(unigrams[:-1]):
         bigram = " ".join([unigram, unigrams[i+1]])
         items.append(bigram)
-    entities = [x for x in items if x in LOCATIONS]
+    locs = _define_locations()
+    CM = locs["COUNTRY_MAPPING"]
+    items = [CM.get(x, x) for x in items]
+    entities = [x for x in items if x in locs["LOCATIONS"]]
     entities = [x.strip() for x in entities]
     entities = list(set(entities))
     return entities
 
 
 def _get_location(ents):
-    ents = [COUNTRY_MAPPING[x] if x in COUNTRY_MAPPING else x for x in ents]
-    cities = CITIES.intersection(set(ents))
-    countries = COUNTRIES.intersection(set(ents))
+    locs = _define_locations()
+    cities = locs["CITIES"].intersection(set(ents))
+    countries = locs["COUNTRIES"].intersection(set(ents))
+    WC = locs["WORLD_CITIES"]
     i_ci = WC[WC["city_ascii"].isin(cities)].index
     i_co = WC[WC["country"].isin(countries)].index
     i = i_ci.intersection(i_co)
@@ -74,11 +85,15 @@ def _get_location(ents):
         location = np.nan
     return location
 
-# class Locations:
-#     def __init__(self, article_path):
-#         self.article_path = article_path
-#         self.id = _reading_xml._get_id(article_path)
-#         self.affiliation = _reading_xml._get_first_affiliation(article_path)
-#         # self.tree = _reading._get_tree(article_path)
-#         self.entities = self._get_entities()
-#         self.locations = self._get_locations()
+
+class LocationGuesser:
+    def __init__(self, article_path):
+        self.article_path = article_path
+        self.tree = _reading_xml._get_tree(article_path)
+        self.id = _reading_xml._get_id(self.tree)
+        # self.metadata =
+
+    def get_location(self):
+        self.affiliation = _reading_xml._get_first_affiliation(self.tree)
+        self.entities = _get_entities(self.affiliation)
+        self.location = _get_location(self.entities)
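
Note: the pieces above chain together as follows; a rough usage sketch,
assuming an affiliation string is already in hand (the exact output depends on
what worldcities.csv contains):

    from pubextract.author_locations import _guessing_locations

    # Candidates are every unigram and adjacent bigram of the preprocessed
    # affiliation; COUNTRY_MAPPING expands abbreviations such as "USA".
    ents = _guessing_locations._get_entities(
        "University of Pittsburgh, Pittsburgh, PA, USA"
    )  # e.g. ["Pittsburgh", "United States"] (order varies: it is set-based)
    location = _guessing_locations._get_location(ents)
    # a matching worldcities.csv record (city, country, lat, lng, ...) or NaN
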
diff --git a/src/pubextract/author_locations/_pubget.py b/src/pubextract/author_locations/_pubget.py
index 699aead..79704a4 100644
--- a/src/pubextract/author_locations/_pubget.py
+++ b/src/pubextract/author_locations/_pubget.py
@@ -2,8 +2,10 @@
 from pathlib import Path
 
 import pandas as pd
+import numpy as np
+import folium
 
-from pubextract.author_locations import _guessing_locations, _reading_xml
+from pubextract.author_locations import _guessing_locations
 
 
 _STEP_NAME = "extract_author_locations"
@@ -11,27 +13,130 @@
 _LOG = logging.getLogger(_STEP_NAME)
 
 
+def _create_map(df, output_dir):
+    df = df.dropna(subset="lng")
+    counts = df["country"].value_counts()
+    data = pd.DataFrame(columns=["country", "count"])
+    data["country"] = list(counts.index)
+    data["count"] = list(counts.values)
+
+    m = folium.Map(tiles="cartodb positron", zoom_start=2)
+    political_countries_url = (
+        "http://geojson.xyz/naturalearth-3.3.0/ne_50m_admin_0_countries.geojson"
+    )
+    folium.Choropleth(
+        geo_data=political_countries_url,
+        data=data,
+        columns=["country", "count"],
+        popup=data["country"],
+        key_on="feature.properties.name",
+    ).add_to(m)
+
+    for ind, row in df.iterrows():
+        df.loc[ind, "str_coords"] = "%f, %f" % (row["lat"], row["lng"])
+    RADIUS = 0.01
+    for str_coords, group in df.groupby("str_coords"):
+        row = group.iloc[0]
+        LNG = row["lng"]
+        LAT = row["lat"]
+        n = len(group)
+        T = np.linspace(0, 2 * np.pi, n, endpoint=False)
+        popup = f"{row.city}, {row.entities}"
+        for (i, row), t in zip(group.iterrows(), T):
+            radius = RADIUS * n
+            lng = LNG + radius * np.sin(t)
+            lat = LAT + radius * np.cos(t)
+            if n == 1:
+                lng = LNG
+                lat = LAT
+            folium.CircleMarker(
+                [lat, lng],
+                popup=popup,
+                radius=0.1,
+                color="#f9190076",
+                fill_color="#f9190076",
+            ).add_to(m)
+    m.save(output_dir / "author_locations_map.html")
+
+
 def _extract_from_articles_dir(articles_dir, output_dir=None):
     if output_dir is None:
         output_dir = articles_dir.parent / "subset_allArticles_authorLocations"
     else:
         output_dir = Path(output_dir)
     output_dir.mkdir(exist_ok=True)
+
+    if articles_dir.name == "subset_allArticles_extractedData":
+        articles_dir = articles_dir.parent / "articles"
+
     _LOG.info(f"Extracting author locations to {output_dir}")
     ids = []
     locations = []
     entss = []
     article_paths = list(articles_dir.glob("**/article.xml"))
     for i_article, article_path in enumerate(article_paths):
-        print("Processing article %d/%d" % (i_article, len(article_paths)), end="\r")
-        ents = _guessing_locations._get_entities(article_path)
-        location = _guessing_locations._get_location(ents)
-
-        if not pd.isna(location):
-            ids.append(_reading_xml._get_id(article_path))
-            entss.append("; ".join(ents))
-            locations.append(location)
-    d = 1
+        print("Processing article %d/%d" % (i_article + 1, len(article_paths)),
+              end="\r")
+        guesser = _guessing_locations.LocationGuesser(article_path)
+        guesser.get_location()
+        if not pd.isna(guesser.location):
+            ids.append(guesser.id)
+            entss.append("; ".join(guesser.entities))
+            locations.append(guesser.location)
+
     df = pd.DataFrame.from_records(locations)
-    df["entities"] = entss
-    df["id"] = ids
-    df.to_csv(output_dir / "author_locations.csv")
+    if not df.empty:
+        df["entities"] = entss
+        df["id"] = ids
+        df.to_csv(output_dir / "author_locations.csv")
+        _LOG.info(f"Done extracting author locations to {output_dir}")
+        _LOG.info("Creating map of author locations")
+        _create_map(df, output_dir)
+        _LOG.info(f"Done creating map of author locations in {output_dir}")
+    return output_dir, 0
+
+
+class AuthorLocationsStep:
+    name = _STEP_NAME
+    short_description = _STEP_DESCRIPTION
+
+    def edit_argument_parser(self, argument_parser) -> None:
+        argument_parser.add_argument(
+            "--author_locations",
+            action="store_true",
+            help="Extract the location from the first affiliation",
+        )
+
+    def run(self, args, previous_steps_output):
+        if not args.author_locations:
+            return None, 0
+        author_locations_dir = (
+            previous_steps_output.get("extract_author_locations_data")
+        )
+        if author_locations_dir is None:
+            author_locations_dir, _ = _extract_from_articles_dir(
+                previous_steps_output["extract_data"]
+            )
+        return _extract_from_articles_dir(author_locations_dir)
+
+
+class AuthorLocationsCommand:
+    name = _STEP_NAME
+    short_description = _STEP_DESCRIPTION
+
+    def edit_argument_parser(self, argument_parser) -> None:
+        argument_parser.add_argument(
+            "author_locations_dir",
+            help="Directory containing author locations. "
+            "It is a directory created by pubget with the '--author_locations'"
+            " option, whose name ends with 'authorLocations'.",
+        )
+
+    def run(self, args):
+        return _extract_from_articles_dir(Path(args.author_locations_dir))[1]
+
+
+def get_pubget_actions():
+    return {
+        "pipeline_steps": [AuthorLocationsStep()],
+        "commands": [AuthorLocationsCommand()],
+    }
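
Note: the `CircleMarker` loop in `_create_map` spreads the n markers that share
identical coordinates evenly around a small circle so they stay individually
clickable. Stripped of the folium calls, the geometry is just:

    import numpy as np

    LAT, LNG, RADIUS, n = 40.44, -79.98, 0.01, 4
    for t in np.linspace(0, 2 * np.pi, n, endpoint=False):
        # one marker per angle; the circle grows with the number of markers
        lat = LAT + n * RADIUS * np.cos(t)
        lng = LNG + n * RADIUS * np.sin(t)
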
diff --git a/src/pubextract/author_locations/_reading_xml.py b/src/pubextract/author_locations/_reading_xml.py
index a548390..90b468e 100644
--- a/src/pubextract/author_locations/_reading_xml.py
+++ b/src/pubextract/author_locations/_reading_xml.py
@@ -1,35 +1,33 @@
-from pathlib import Path
-import re
-from typing import List, Optional, Union, Tuple, Any, NewType
-import dataclasses
-
-from unidecode import unidecode
 from lxml import etree
-import pandas as pd
-import en_core_web_sm
 
 
 def _get_tree(article_path):
     parser = etree.XMLParser(remove_blank_text=True)
-    return etree.parse(article_path, parser)
+    tree = etree.parse(article_path, parser)
+    return tree
 
 
-def _get_id(article_path):
-    tree = _get_tree(article_path)
+def _get_id(tree):
     try:
-        pmcid = tree.find("front/article-meta/article-id[@pub-id-type='pmc']").text
-        id = "PMC%s" % pmcid
-    except:
+        pmcid = (
+            tree.find("front/article-meta/article-id[@pub-id-type='pmc']").text
+        )
+        id_ = f"PMC{pmcid}"
+    except AttributeError:  # no PMC ID element; fall back to the PMID
+        pmid = tree.xpath("//PMID/text()")[0]
-        id = "Pubmed%s" % pmid
-    return id
+        id_ = f"Pubmed{pmid}"
+    return id_
 
 
-def _get_first_affiliation(article_path):
-    aff = ""
-    for event, element in etree.iterparse(article_path):
-        if element.tag == "aff" or element.tag == "Affiliation":
-            aff = etree.tostring(element, with_tail=False, encoding="unicode")
-            if aff:
-                break
-    return aff
+def _get_first_affiliation(tree):
+    affiliation = ""
+    # lxml's find returns None when nothing matches, never -1
+    element = tree.find(".//aff")
+    if element is None:
+        element = tree.find(".//Affiliation")
+    if element is not None:
+        affiliation = etree.tostring(
+            element, with_tail=False, encoding="unicode"
+        )
+    return affiliation
diff --git a/src/pubextract/author_locations/test.py b/src/pubextract/author_locations/test.py
deleted file mode 100644
index 056452b..0000000
--- a/src/pubextract/author_locations/test.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from pathlib import Path
-
-from pubextract.author_locations import _pubget
-
-
-articles_dir = (
-    Path(__file__).resolve().parents[5]
-    / "data"
-    / "pubget_data"
-    / "review-neuro-meta-analyses_2023-06-29"
-    / "query_a84b639ed7c2cc2d04c773db7c22905d"
-    / "articles"
-)
-
-_pubget._extract_from_articles_dir(articles_dir)
\ No newline at end of file
diff --git a/tests/test_author_locations/data/example_xml_0.xml b/tests/test_author_locations/data/example_xml_0.xml
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_author_locations/data/example_xml_1.xml b/tests/test_author_locations/data/example_xml_1.xml
new file mode 100644
index 0000000..e69de29
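
Note: both committed fixtures are empty blobs (index e69de29), so the tests
below cannot pass yet. A fixture along these lines (hypothetical content,
JATS-like, matching what the assertions expect) would exercise `_reading_xml`
end to end:

    from lxml import etree

    xml = (
        b"<article><front><article-meta>"
        b"<article-id pub-id-type='pmc'>1234567</article-id>"
        b"<aff>University of Pittsburgh, New York, PA, USA</aff>"
        b"</article-meta></front></article>"
    )
    tree = etree.ElementTree(etree.fromstring(xml))
    assert tree.find(".//Affiliation") is None  # no match gives None, not -1
    assert "Pittsburgh" in etree.tostring(
        tree.find(".//aff"), encoding="unicode"
    )
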
diff --git a/tests/test_author_locations/test_guessing_locations.py b/tests/test_author_locations/test_guessing_locations.py
new file mode 100644
index 0000000..425c59a
--- /dev/null
+++ b/tests/test_author_locations/test_guessing_locations.py
@@ -0,0 +1,54 @@
+from pathlib import Path
+
+from pubextract.author_locations import _reading_xml, _guessing_locations
+
+
+def example_xml():
+    folder = Path(__file__).parent.joinpath("data")
+    for path in folder.glob("*.xml"):
+        yield str(path)
+
+
+def test_define_locations():
+    locs = _guessing_locations._define_locations()
+    assert len(locs["LOCATIONS"]) > 0
+    assert "United Kingdom" in locs["LOCATIONS"]
+    assert "Pittsburgh" in locs["CITIES"]
+
+
+def test_preprocess_text():
+    text = "University of Pittsburgh, New York, PA, USA"
+    processed = _guessing_locations._preprocess_text(text)
+    assert processed == "Affiliation of Pittsburgh New York PA USA"
+
+
+def test_get_entities():
+    affiliation = "University of Pittsburgh, New York, PA, USA"
+    ents = _guessing_locations._get_entities(affiliation)
+    # _get_entities deduplicates via a set, so compare as sets; COUNTRY_MAPPING
+    # expands "USA" to "United States" before the gazetteer lookup
+    assert set(ents) == {
+        "Affiliation", "Pittsburgh", "New York", "PA", "United States"
+    }
+
+
+def test_get_location():
+    tree = _reading_xml._get_tree(next(example_xml()))
+    aff = _reading_xml._get_first_affiliation(tree)
+    ents = _guessing_locations._get_entities(aff)
+    loc = _guessing_locations._get_location(ents)
+    assert loc["city"] == "Pittsburgh"
+
+
+def test_LocationGuesser():
+    article_path = next(example_xml())
+    guesser = _guessing_locations.LocationGuesser(article_path)
+    guesser.get_location()
+    assert guesser.location["city"] == "Pittsburgh"
+    assert guesser.location["country"] == "United States"
+    assert set(guesser.entities) == {
+        "Affiliation", "Pittsburgh", "New York", "PA", "United States"
+    }
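
Note: with the entry point registered in setup.cfg, pubget should discover the
plugin and expose the new `--author_locations` flag on its `run` command (per
pubget's plugin convention; untested here, as the commit message says). The
deleted ad-hoc test.py amounted to the direct call, which remains the quickest
smoke test (the path below is a placeholder for a real pubget articles
directory):

    from pathlib import Path

    from pubextract.author_locations import _pubget

    # articles_dir is a pubget "articles" directory,
    # e.g. <data_dir>/query_<hash>/articles
    _pubget._extract_from_articles_dir(Path("pubget_data") / "articles")
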