diff --git a/README.md b/README.md
index a08c646..2353638 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,10 @@ Read a config file to set parameters:
 ***You can overide (or add for list) any parameters define in the config.json***
 
 >>> python main.py --config config/config.json
 
+More configuration options can be found in config.py:
+ - Set custom XML tags for the sitemap
+ - Set a user agent
+ - Configure the crawling rate
 
 #### Enable debug:
diff --git a/config.py b/config.py
index 13c3f40..8b1e803 100644
--- a/config.py
+++ b/config.py
@@ -9,3 +9,6 @@
 xml_footer = ""
 
 crawler_user_agent = 'Sitemap crawler'
+
+number_calls = 1  # number of requests allowed per call period
+call_period = 15  # length of the call period, in seconds
diff --git a/crawler.py b/crawler.py
index 248a41f..38384d0 100644
--- a/crawler.py
+++ b/crawler.py
@@ -13,6 +13,7 @@ import mimetypes
 import os
 
+from ratelimit import limits, sleep_and_retry
 
 class IllegalArgumentError(ValueError):
 	pass
 
@@ -24,7 +25,6 @@ class Crawler:
 
 	output = None
 	report = False
 
-	config = None
 	domain = ""
 	exclude = []
@@ -144,8 +144,8 @@ async def crawl_all_pending_urls(self, executor):
 		logging.debug('all crawl tasks have completed nicely')
 		return
 
-
-
+	@sleep_and_retry
+	@limits(calls=config.number_calls, period=config.call_period)
	def __crawl(self, current_url):
 		url = urlparse(current_url)
 		logging.info("Crawling #{}: {}".format(self.num_crawled, url.geturl()))
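For reference, the two decorators added to `__crawl()` come from the `ratelimit` package: `@limits(calls=..., period=...)` raises a `RateLimitException` once the allowed number of calls in a period has been used, and `@sleep_and_retry` catches it and sleeps until the period resets, so a decorated call blocks instead of failing. Below is a minimal standalone sketch of the same throttling, assuming `ratelimit` is installed and using the defaults added to config.py; the `fetch()` function is hypothetical and stands in for `__crawl()`.

```python
import time

from ratelimit import limits, sleep_and_retry

NUMBER_CALLS = 1   # mirrors config.number_calls: requests allowed per period
CALL_PERIOD = 15   # mirrors config.call_period: period length in seconds


@sleep_and_retry                                  # block and retry instead of raising when over the limit
@limits(calls=NUMBER_CALLS, period=CALL_PERIOD)   # at most NUMBER_CALLS calls every CALL_PERIOD seconds
def fetch(url):
    print(time.strftime("%H:%M:%S"), "fetching", url)


if __name__ == "__main__":
    for u in ("https://example.com/a", "https://example.com/b"):
        fetch(u)  # the second call waits roughly 15 seconds before executing
```

With number_calls = 1 and call_period = 15, the crawler issues at most one request every 15 seconds; raising number_calls or lowering call_period in config.py speeds the crawl up accordingly.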