From bc67cd22304fcce4f8990954b8bf7b76d928274b Mon Sep 17 00:00:00 2001
From: saraskardelly <85453371+saraskardelly@users.noreply.github.com>
Date: Thu, 11 May 2023 18:17:08 +0200
Subject: [PATCH] Update crawldata.py

Hi,
I have now added Bloomberg and Washington Post. I will send you a detailed message on Discord. Thank you very much.
---
 src/articlecrawler/crawldata.py | 47 +++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/src/articlecrawler/crawldata.py b/src/articlecrawler/crawldata.py
index bef53ff..403a59e 100644
--- a/src/articlecrawler/crawldata.py
+++ b/src/articlecrawler/crawldata.py
@@ -121,5 +121,52 @@
             ],
             'regex-filter': ['.*[a-z]+.*$']
         }
+    },
+    'bloomberg': {
+        'url-prefix': 'https://www.bloomberg.com',
+        'article-links': {
+            'overview-urls': ['/markets', '/technology', '/politics', '/world'],
+            'find-tags': [ # Hierarchy
+                {'type': 'include', 'name': 'header', 'attrs': {'class': 'story-package-module__stories'}},
+                {'type': 'include', 'name': 'a', 'attrs': {'class': 'story-package-module__story__headline-link'}}
+            ],
+            'link-prefix': ('/news/articles/')
+        },
+        'heading': {
+            'find-tags': [ # Hierarchy
+                {'type': 'include', 'name': 'h1', 'attrs': {'class': 'lede-text-v2__hed'}}
+            ]
+        },
+        'article': {
+            'find-tags': [ # Hierarchy
+                {'type': 'include', 'name': 'div', 'attrs': {'class': 'body-copy-v2 fence-body'}},
+                {'type': 'include', 'name': 'p', 'attrs': {}},
+                {'type': 'excludeParent', 'name': 'div', 'attrs': {'class': 'bb-unsupported-inset'}},
+            ],
+            'regex-filter': ['.*[a-z]+.*$']
+        }
+    },
+    'washingtonpost': {
+        'url-prefix': 'https://www.washingtonpost.com',
+        'article-links': {
+            'overview-urls': ['/politics', '/world', '/business', '/technology', '/sports'],
+            'find-tags': [ # Hierarchy
+                {'type': 'include', 'name': 'header', 'attrs': {'class': 'headline'}},
+                {'type': 'include', 'name': 'a', 'attrs': {}}
+            ],
+            'link-prefix': ('/2023/05/')
+        },
+        'heading': {
+            'find-tags': [ # Hierarchy
+                {'type': 'include', 'name': 'h1', 'attrs': {'data-qa': 'headline'}}
+            ]
+        },
+        'article': {
+            'find-tags': [ # Hierarchy
+                {'type': 'include', 'name': 'article', 'attrs': {'itemprop': 'articleBody'}},
+                {'type': 'include', 'name': 'p', 'attrs': {}},
+            ],
+            'regex-filter': ['.*[a-z]+.*$']
+        }
     }
 }