diff --git a/pittapi/news.py b/pittapi/news.py
index ae1462c..76fbdea 100644
--- a/pittapi/news.py
+++ b/pittapi/news.py
@@ -19,50 +19,20 @@
from __future__ import annotations
+from functools import cache
import math
from requests_html import Element, HTMLResponse, HTMLSession
-from typing import Literal, NamedTuple
+from typing import NamedTuple
NUM_ARTICLES_PER_PAGE = 20
-NEWS_BY_CATEGORY_URL = (
- "https://www.pitt.edu/pittwire/news/{category}?field_topics_target_id={topic_id}&field_article_date_value={year}"
+PITT_BASE_URL = "https://www.pitt.edu"
+PITTWIRE_URL = PITT_BASE_URL + "/pittwire"
+FEATURES_ARTICLES_URL = PITTWIRE_URL + "/news/features-articles"
+NEWS_BY_CATEGORY_URL = PITTWIRE_URL + (
+ "/news/{category}?field_topics_target_id={topic_id}&field_article_date_value={year}"
"&title={query}&field_category_target_id=All&page={page_num}"
)
-PITT_BASE_URL = "https://www.pitt.edu"
-
-Category = Literal["features-articles", "accolades-honors", "ones-to-watch", "announcements-and-updates"]
-Topic = Literal[
- "university-news",
- "health-and-wellness",
- "technology-and-science",
- "arts-and-humanities",
- "community-impact",
- "innovation-and-research",
- "global",
- "diversity-equity-and-inclusion",
- "our-city-our-campus",
- "teaching-and-learning",
- "space",
- "ukraine",
- "sustainability",
-]
-
-TOPIC_ID_MAP: dict[Topic, int] = {
- "university-news": 432,
- "health-and-wellness": 2,
- "technology-and-science": 391,
- "arts-and-humanities": 4,
- "community-impact": 6,
- "innovation-and-research": 1,
- "global": 9,
- "diversity-equity-and-inclusion": 8,
- "our-city-our-campus": 12,
- "teaching-and-learning": 7,
- "space": 440,
- "ukraine": 441,
- "sustainability": 470,
-}
sess = HTMLSession()
@@ -87,18 +57,51 @@ def from_html(cls, article_html: Element) -> Article:
return cls(title=article_title, description=article_description, url=article_url, tags=article_tags)
-def _get_page_articles(
- topic: Topic,
- category: Category,
- query: str,
- year: int | None,
- page_num: int,
-) -> list[Article]:
+@cache
+def _scrape_categories() -> dict[str, str]:
+    """Scrape the Pittwire landing page for the available news categories.
+
+    The result is memoized with ``functools.cache``, so the page is fetched at most once
+    until ``_scrape_categories.cache_clear()`` is called.
+
+    Returns:
+        A mapping from each category's display name (the menu item's text) to the last
+        path segment of its link, which fills ``{category}`` in ``NEWS_BY_CATEGORY_URL``.
+
+    Raises:
+        RuntimeError: If the category menu is missing or yields no usable categories.
+    """
+    response: HTMLResponse = sess.get(PITTWIRE_URL)
+    category_menu: Element | None = response.html.find(
+        "div#block-views-block-category-menu-category-menu", first=True
+    )
+    # find(..., first=True) yields None when nothing matches; treat a missing menu like an
+    # empty one so the caller gets the RuntimeError below rather than an AttributeError
+    category_list: list[Element] = (
+        category_menu.find("ul.hamburger-menu-list li") if category_menu is not None else []
+    )
+    category_map: dict[str, str] = {}
+    for category in category_list:
+        category_link: Element | None = category.find("a", first=True)
+        if category_link is None or "href" not in category_link.attrs:
+            continue  # A menu item without a link cannot be mapped to a URL name
+        # Strip a trailing slash first so ".../features-articles/" does not map to ""
+        category_url_name = category_link.attrs["href"].rstrip("/").split("/")[-1]
+        category_map[category.text.strip()] = category_url_name
+    if not category_map:
+        raise RuntimeError("No categories found, please open a GitHub issue")
+    return category_map
+
+
+@cache
+def _scrape_topics() -> dict[str, int]:
+    """Scrape the "Features & Articles" page for the available news topics.
+
+    The result is memoized with ``functools.cache``, so the page is fetched at most once
+    until ``_scrape_topics.cache_clear()`` is called.
+
+    Returns:
+        A mapping from each topic's display name (the ``<option>`` text) to the integer in
+        its ``value`` attribute, which fills ``{topic_id}`` in ``NEWS_BY_CATEGORY_URL``.
+
+    Raises:
+        RuntimeError: If the topic filter is missing or yields no topics.
+    """
+    response: HTMLResponse = sess.get(FEATURES_ARTICLES_URL)
+    main_content: Element | None = response.html.xpath("/html/body/div/main/div/section", first=True)
+    # xpath/find with first=True yield None when nothing matches; fall through to the
+    # RuntimeError below instead of failing with an AttributeError on None
+    topic_fieldset: Element | None = (
+        main_content.find("fieldset.form-item-field-topics-target-id", first=True)
+        if main_content is not None
+        else None
+    )
+    topic_options: list[Element] = topic_fieldset.find("option") if topic_fieldset is not None else []
+    topic_map: dict[str, int] = {}
+    for topic_option in topic_options:
+        if (topic_id := topic_option.attrs["value"].strip()) == "All":  # Skip placeholder "Topics" option
+            continue
+        topic_name = topic_option.text.strip()
+        topic_map[topic_name] = int(topic_id)
+    if not topic_map:
+        raise RuntimeError("No topics found, please open a GitHub issue")
+    return topic_map
+
+
+def _get_page_articles(topic: str, category: str, query: str, year: int | None, page_num: int) -> list[Article]:
+ topic_id_map = _scrape_topics()
+ category_url_name_map = _scrape_categories()
year_str = str(year) if year else ""
page_num_str = str(page_num) if page_num else ""
+
response: HTMLResponse = sess.get(
NEWS_BY_CATEGORY_URL.format(
- category=category, topic_id=TOPIC_ID_MAP[topic], year=year_str, query=query, page_num=page_num_str
+ category=category_url_name_map[category],
+ topic_id=topic_id_map[topic],
+ year=year_str,
+ query=query,
+ page_num=page_num_str,
)
)
main_content: Element = response.html.xpath("/html/body/div/main/div/section", first=True)
@@ -107,13 +110,33 @@ def _get_page_articles(
return page_articles
+@cache
+def get_categories() -> list[str]:
+    """Return the display names of the scraped Pittwire news categories."""
+    # Unpacking a dict yields its keys, i.e. the category display names
+    return [*_scrape_categories()]
+
+
+@cache
+def get_topics() -> list[str]:
+    """Return the display names of the scraped Pittwire news topics."""
+    # Unpacking a dict yields its keys, i.e. the topic display names
+    return [*_scrape_topics()]
+
+
def get_articles_by_topic(
- topic: Topic,
- category: Category = "features-articles",
+ topic: str,
+ category: str = "Features & Articles",
query: str = "",
year: int | None = None,
max_num_results: int = NUM_ARTICLES_PER_PAGE,
) -> list[Article]:
+ topic_id_map = _scrape_topics()
+ if topic not in topic_id_map:
+ raise ValueError(f"'{topic}' is not a valid topic, must be one of the following: {get_topics()}")
+
+ category_url_name_map = _scrape_categories()
+ if category not in category_url_name_map:
+ raise ValueError(f"'{category}' is not a valid category, must be one of the following: {get_categories()}")
+
num_pages = math.ceil(max_num_results / NUM_ARTICLES_PER_PAGE)
# Get articles sequentially and synchronously (i.e., not using grequests) because the news pages must stay in order
diff --git a/tests/news_test.py b/tests/news_test.py
index 203b684..44de365 100644
--- a/tests/news_test.py
+++ b/tests/news_test.py
@@ -30,6 +30,14 @@
class NewsTest(unittest.TestCase):
def __init__(self, *args, **kwargs):
unittest.TestCase.__init__(self, *args, **kwargs)
+ with (SAMPLE_PATH / "news_pittwire.html").open() as f:
+ self.pittwire = f.read()
+ with (SAMPLE_PATH / "news_pittwire_no_categories.html").open() as f:
+ self.pittwire_no_categories = f.read()
+ with (SAMPLE_PATH / "news_features_articles.html").open() as f:
+ self.features_articles = f.read()
+ with (SAMPLE_PATH / "news_features_articles_no_topics.html").open() as f:
+ self.features_articles_no_topics = f.read()
with (SAMPLE_PATH / "news_university_news_features_articles_page_0.html").open() as f:
self.university_news_features_articles_page_0 = f.read()
with (SAMPLE_PATH / "news_university_news_features_articles_page_1.html").open() as f:
@@ -39,8 +47,69 @@ def __init__(self, *args, **kwargs):
with (SAMPLE_PATH / "news_university_news_features_articles_2020.html").open() as f:
self.university_news_features_articles_2020 = f.read()
+    @responses.activate
+    def test_get_categories(self):
+        """get_categories returns the category names found on the mocked Pittwire page."""
+        # Drop memoized results so this test scrapes the mocked response
+        for cached_func in (news.get_categories, news._scrape_categories):
+            cached_func.cache_clear()
+        responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
+        expected_categories = [
+            "Features & Articles",
+            "Accolades & Honors",
+            "Ones to Watch",
+            "Announcements and Updates",
+        ]
+
+        self.assertCountEqual(news.get_categories(), expected_categories)
+
+    @responses.activate
+    def test_get_categories_missing(self):
+        """get_categories raises RuntimeError when the mocked page has no categories."""
+        # Drop memoized results so this test scrapes the mocked response
+        for cached_func in (news.get_categories, news._scrape_categories):
+            cached_func.cache_clear()
+        responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire_no_categories)
+
+        with self.assertRaises(RuntimeError):
+            news.get_categories()
+
+    @responses.activate
+    def test_get_topics(self):
+        """get_topics returns the topic names found on the mocked features-articles page."""
+        # Drop memoized results so this test scrapes the mocked response
+        for cached_func in (news.get_topics, news._scrape_topics):
+            cached_func.cache_clear()
+        responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
+        expected_topics = [
+            "University News",
+            "Health and Wellness",
+            "Technology & Science",
+            "Arts and Humanities",
+            "Community Impact",
+            "Innovation and Research",
+            "Global",
+            "Diversity, Equity, and Inclusion",
+            "Our City/Our Campus",
+            "Teaching & Learning",
+            "Space",
+            "Ukraine",
+            "Sustainability",
+        ]
+
+        self.assertCountEqual(news.get_topics(), expected_topics)
+
+    @responses.activate
+    def test_get_topics_missing(self):
+        """get_topics raises RuntimeError when the mocked page has no topics."""
+        # Drop memoized results so this test scrapes the mocked response
+        for cached_func in (news.get_topics, news._scrape_topics):
+            cached_func.cache_clear()
+        responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles_no_topics)
+
+        with self.assertRaises(RuntimeError):
+            news.get_topics()
+
@responses.activate
def test_get_articles_by_topic(self):
+ news.get_categories.cache_clear()
+ news.get_topics.cache_clear()
+ news._scrape_categories.cache_clear()
+ news._scrape_topics.cache_clear()
+ responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
+ responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
responses.add(
responses.GET,
"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value=&title="
@@ -48,7 +117,7 @@ def test_get_articles_by_topic(self):
body=self.university_news_features_articles_page_0,
)
- university_news_articles = news.get_articles_by_topic("university-news")
+ university_news_articles = news.get_articles_by_topic("University News")
self.assertEqual(len(university_news_articles), news.NUM_ARTICLES_PER_PAGE)
self.assertEqual(
@@ -75,6 +144,12 @@ def test_get_articles_by_topic(self):
@responses.activate
def test_get_articles_by_topic_query(self):
query = "fulbright"
+ news.get_categories.cache_clear()
+ news.get_topics.cache_clear()
+ news._scrape_categories.cache_clear()
+ news._scrape_topics.cache_clear()
+ responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
+ responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
responses.add(
responses.GET,
"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value="
@@ -82,7 +157,7 @@ def test_get_articles_by_topic_query(self):
body=self.university_news_features_articles_fulbright,
)
- university_news_articles = news.get_articles_by_topic("university-news", query=query)
+ university_news_articles = news.get_articles_by_topic("University News", query=query)
self.assertEqual(len(university_news_articles), 3)
self.assertEqual(
@@ -115,6 +190,12 @@ def test_get_articles_by_topic_query(self):
@responses.activate
def test_get_articles_by_topic_year(self):
year = 2020
+ news.get_categories.cache_clear()
+ news.get_topics.cache_clear()
+ news._scrape_categories.cache_clear()
+ news._scrape_topics.cache_clear()
+ responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
+ responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
responses.add(
responses.GET,
f"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value={year}"
@@ -122,7 +203,7 @@ def test_get_articles_by_topic_year(self):
body=self.university_news_features_articles_2020,
)
- university_news_articles = news.get_articles_by_topic("university-news", year=year)
+ university_news_articles = news.get_articles_by_topic("University News", year=year)
self.assertEqual(len(university_news_articles), 5)
self.assertEqual(
@@ -152,6 +233,12 @@ def test_get_articles_by_topic_year(self):
@responses.activate
def test_get_articles_by_topic_less_than_one_page(self):
num_results = 5
+ news.get_categories.cache_clear()
+ news.get_topics.cache_clear()
+ news._scrape_categories.cache_clear()
+ news._scrape_topics.cache_clear()
+ responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
+ responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
responses.add(
responses.GET,
"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value=&title="
@@ -159,7 +246,7 @@ def test_get_articles_by_topic_less_than_one_page(self):
body=self.university_news_features_articles_page_0,
)
- university_news_articles = news.get_articles_by_topic("university-news", max_num_results=num_results)
+ university_news_articles = news.get_articles_by_topic("University News", max_num_results=num_results)
self.assertEqual(len(university_news_articles), num_results)
self.assertEqual(
@@ -186,6 +273,12 @@ def test_get_articles_by_topic_less_than_one_page(self):
@responses.activate
def test_get_articles_by_topic_multiple_pages(self):
num_results = news.NUM_ARTICLES_PER_PAGE + 5
+ news.get_categories.cache_clear()
+ news.get_topics.cache_clear()
+ news._scrape_categories.cache_clear()
+ news._scrape_topics.cache_clear()
+ responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
+ responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
responses.add(
responses.GET,
"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value=&title="
@@ -199,7 +292,7 @@ def test_get_articles_by_topic_multiple_pages(self):
body=self.university_news_features_articles_page_1,
)
- university_news_articles = news.get_articles_by_topic("university-news", max_num_results=num_results)
+ university_news_articles = news.get_articles_by_topic("University News", max_num_results=num_results)
self.assertEqual(len(university_news_articles), num_results)
self.assertEqual(
@@ -227,3 +320,25 @@ def test_get_articles_by_topic_multiple_pages(self):
],
),
)
+
+    @responses.activate
+    def test_get_articles_by_topic_invalid_category(self):
+        """get_articles_by_topic raises ValueError for a category that was not scraped."""
+        # Drop memoized results so this test scrapes the mocked responses
+        for cached_func in (
+            news.get_categories,
+            news.get_topics,
+            news._scrape_categories,
+            news._scrape_topics,
+        ):
+            cached_func.cache_clear()
+        responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
+        responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
+
+        with self.assertRaises(ValueError):
+            news.get_articles_by_topic("University News", "Invalid Category")
+
+    @responses.activate
+    def test_get_articles_by_topic_invalid_topic(self):
+        """get_articles_by_topic raises ValueError for a topic that was not scraped."""
+        # Drop memoized results so this test scrapes the mocked responses
+        for cached_func in (
+            news.get_categories,
+            news.get_topics,
+            news._scrape_categories,
+            news._scrape_topics,
+        ):
+            cached_func.cache_clear()
+        responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
+        responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
+
+        with self.assertRaises(ValueError):
+            news.get_articles_by_topic("Invalid Topic")
diff --git a/tests/samples/news_features_articles.html b/tests/samples/news_features_articles.html
new file mode 100644
index 0000000..7dce0c4
--- /dev/null
+++ b/tests/samples/news_features_articles.html
@@ -0,0 +1,1839 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Pittwire News | University of Pittsburgh
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Skip to main content
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A family trauma almost toppled Pitt alum Isaac Mattson’s dream. A career roadblock may have saved it.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ President Robert Gregerson and other campus leaders recognized the 39 students who completed their degree requirements in the summer and fall semesters.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ See a gallery of the ceremony, which included speeches by Holden Thorp and Mihika Shah.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ We spoke with principal investigator Charles Rinaldo and longtime participant and volunteer Marc Wagner about how the study has impacted their lives over the decades.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Mothers Leading Science is helping health sciences faculty find a supportive community, strategies for work-life integration and renewed passion for their research.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Jennifer Hirsch’s unique course lets students get in the heads of reality stars — and learn some lessons while they’re in there.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The undergraduates will travel to South America, Taiwan and more through the program, which supports Federal Pell Grant recipients with up to $5,000 during their study abroad experience.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Everything you need to know about parking, pictures, accessibility and more for your time on the Pittsburgh campus.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ After his career on the field, Bonasorte became a fixture on the Pittsburgh campus with his clothing kiosk at Forbes and Bigelow.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Our photographers shared their 10 favorite images of innovative researchers, major developments and more from the year.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This Pitt alum’s childhood aspirations of working in the entertainment industry are coming true on the set of ‘Saturday Night Live.’
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Supported by NIH funding, the Vanish Therapeutics team is working to bring a bioabsorbable nerve stimulator to market.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The latest Economic Impact Report showed the University supported nearly 49,000 jobs and contributed $356.2 million in state and local taxes.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Here’s what’s next for Richard Su Fang, a Goldwater scholar who has already received interview invitations from 17 MD/PhD programs.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 156 people will serve at the University’s 19th annual celebration on Dec. 25.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The Nonprofit Capacity Building Program connects local organizations working to improve economic stability with University training and resources.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The Vijayalakshmi Innovation Center is funded by a gift from siblings and health care entrepreneurs Vishnu Vardhan and Harsha Vardhini, along with a significant investment from the School of Medicine.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The University’s Property and Facilities Committee and Board of Trustees approved interior fit out projects for the Department of Computational and Systems Biology and Pitt EDGE on Dec. 5.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A platform developed by Jason Lohmueller and Alex Dieters could allow immunotherapies to be delivered to tumors with more flexibility and precision.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Kate Flickinger’s research on lower metabolic rates could help astronauts safely undergo long-duration spaceflights one day. It could also help ICU patients here on Earth.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/tests/samples/news_features_articles_no_topics.html b/tests/samples/news_features_articles_no_topics.html
new file mode 100644
index 0000000..bb7410e
--- /dev/null
+++ b/tests/samples/news_features_articles_no_topics.html
@@ -0,0 +1,1838 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Pittwire News | University of Pittsburgh
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Skip to main content
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A family trauma almost toppled Pitt alum Isaac Mattson’s dream. A career roadblock may have saved it.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ President Robert Gregerson and other campus leaders recognized the 39 students who completed their degree requirements in the summer and fall semesters.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ See a gallery of the ceremony, which included speeches by Holden Thorp and Mihika Shah.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ We spoke with principal investigator Charles Rinaldo and longtime participant and volunteer Marc Wagner about how the study has impacted their lives over the decades.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Mothers Leading Science is helping health sciences faculty find a supportive community, strategies for work-life integration and renewed passion for their research.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Jennifer Hirsch’s unique course lets students get in the heads of reality stars — and learn some lessons while they’re in there.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The undergraduates will travel to South America, Taiwan and more through the program, which supports Federal Pell Grant recipients with up to $5,000 during their study abroad experience.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Everything you need to know about parking, pictures, accessibility and more for your time on the Pittsburgh campus.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ After his career on the field, Bonasorte became a fixture on the Pittsburgh campus with his clothing kiosk at Forbes and Bigelow.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Our photographers shared their 10 favorite images of innovative researchers, major developments and more from the year.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This Pitt alum’s childhood aspirations of working in the entertainment industry are coming true on the set of ‘Saturday Night Live.’
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Supported by NIH funding, the Vanish Therapeutics team is working to bring a bioabsorbable nerve stimulator to market.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The latest Economic Impact Report showed the University supported nearly 49,000 jobs and contributed $356.2 million in state and local taxes.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Here’s what’s next for Richard Su Fang, a Goldwater scholar who has already received interview invitations from 17 MD/PhD programs.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 156 people will serve at the University’s 19th annual celebration on Dec. 25.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The Nonprofit Capacity Building Program connects local organizations working to improve economic stability with University training and resources.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The Vijayalakshmi Innovation Center is funded by a gift from siblings and health care entrepreneurs Vishnu Vardhan and Harsha Vardhini, along with a significant investment from the School of Medicine.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The University’s Property and Facilities Committee and Board of Trustees approved interior fit out projects for the Department of Computational and Systems Biology and Pitt EDGE on Dec. 5.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A platform developed by Jason Lohmueller and Alex Dieters could allow immunotherapies to be delivered to tumors with more flexibility and precision.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Kate Flickinger’s research on lower metabolic rates could help astronauts safely undergo long-duration spaceflights one day. It could also help ICU patients here on Earth.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/tests/samples/news_pittwire.html b/tests/samples/news_pittwire.html
new file mode 100644
index 0000000..74dc749
--- /dev/null
+++ b/tests/samples/news_pittwire.html
@@ -0,0 +1,1942 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Pittwire | University of Pittsburgh
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Skip to main content
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Subscribe to Pittwire Today
+
+
+ Get the most interesting and important stories from the University of Pittsburgh.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Features & Articles
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A family trauma almost toppled Pitt alum Isaac Mattson’s dream. A career roadblock may have saved it.
+
+
+
+
+
+
+
+
+
+ See a gallery of the ceremony, which included speeches by Holden Thorp and Mihika Shah.
+
+
+
+
+
+
+
+
+
+ We spoke with principal investigator Charles Rinaldo and longtime participant and volunteer Marc Wagner about how the study has impacted their lives over the decades.
+
+
+
+
+
+
+
+
+
+ President Robert Gregerson and other campus leaders recognized the 39 students who completed their degree requirements in the summer and fall semesters.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Announcements and Updates
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Accolades & Honors
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Trending
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/tests/samples/news_pittwire_no_categories.html b/tests/samples/news_pittwire_no_categories.html
new file mode 100644
index 0000000..aed5fa4
--- /dev/null
+++ b/tests/samples/news_pittwire_no_categories.html
@@ -0,0 +1,1935 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Pittwire | University of Pittsburgh
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Skip to main content
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Subscribe to Pittwire Today
+
+
+ Get the most interesting and important stories from the University of Pittsburgh.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Features & Articles
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A family trauma almost toppled Pitt alum Isaac Mattson’s dream. A career roadblock may have saved it.
+
+
+
+
+
+
+
+
+
+ See a gallery of the ceremony, which included speeches by Holden Thorp and Mihika Shah.
+
+
+
+
+
+
+
+
+
+ We spoke with principal investigator Charles Rinaldo and longtime participant and volunteer Marc Wagner about how the study has impacted their lives over the decades.
+
+
+
+
+
+
+
+
+
+ President Robert Gregerson and other campus leaders recognized the 39 students who completed their degree requirements in the summer and fall semesters.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Announcements and Updates
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Accolades & Honors
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Trending
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+