From c992abcb773e026f79272f12cdfd363eb5ea8351 Mon Sep 17 00:00:00 2001 From: vsoch Date: Fri, 25 Mar 2022 21:57:23 -0600 Subject: [PATCH 1/2] testing multiprocessing for faster finds! Signed-off-by: vsoch --- CHANGELOG.md | 1 + LICENSE | 2 +- README.md | 4 +- docs/source/fileproc.rst | 2 +- urlchecker/__init__.py | 11 +--- urlchecker/client/__init__.py | 2 +- urlchecker/client/check.py | 6 +- urlchecker/core/check.py | 84 +++++++++++++++++++------- urlchecker/core/exclude.py | 2 +- urlchecker/core/fileproc.py | 2 +- urlchecker/core/urlmarker.py | 2 +- urlchecker/core/urlproc.py | 7 +-- urlchecker/core/worker.py | 110 ++++++++++++++++++++++++++++++++++ urlchecker/logger.py | 2 +- urlchecker/main/github.py | 2 +- urlchecker/main/utils.py | 2 +- urlchecker/version.py | 4 +- 17 files changed, 192 insertions(+), 53 deletions(-) create mode 100644 urlchecker/core/worker.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d205e5b..0334610 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and **Merged pull requests**. Critical items to know are: Referenced versions in headers are tagged on Github, in parentheses are for pypi. ## [vxx](https://github.com/urlstechie/urlschecker-python/tree/master) (master) + - multiprocessing to speed up checks (0.0.26) - bug fix for verbose option to only print file names that have failures (0.0.25) - adding option to print a summary that contains file names and urls (0.0.24) - updating container base to use debian buster and adding certifi (0.0.23) diff --git a/LICENSE b/LICENSE index 4530312..509d137 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat +Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index c04731a..ed533af 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,9 @@ This is a python module to collect urls over static files (code and documentation) and then test for and report broken links. If you are interesting in using this as a GitHub action, see [urlchecker-action](https://github.com/urlstechie/urlchecker-action). There are also container -bases available on [quay.io/urlstechie/urlchecker](https://quay.io/repository/urlstechie/urlchecker?tab=tags). +bases available on [quay.io/urlstechie/urlchecker](https://quay.io/repository/urlstechie/urlchecker?tab=tags). As of version +0.0.26, we use multiprocessing so the checks run a lot faster, and you can set `URLCHECKER_WORKERS` to change the number of workers +(defaults to 9). If you don't want multiprocessing, use version 0.0.25 or earlier. ## Module Documentation diff --git a/docs/source/fileproc.rst b/docs/source/fileproc.rst index 5b46de8..0dfb754 100644 --- a/docs/source/fileproc.rst +++ b/docs/source/fileproc.rst @@ -1,5 +1,5 @@ urlchecker.core.fileproc -========================== +======================== .. automodule:: urlchecker.core.fileproc diff --git a/urlchecker/__init__.py b/urlchecker/__init__.py index d3e6f3e..8b77261 100644 --- a/urlchecker/__init__.py +++ b/urlchecker/__init__.py @@ -1,10 +1,3 @@ -""" - -Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat - -This source code is licensed under the terms of the MIT license. -For a copy, see . - -""" - from urlchecker.version import __version__ + +assert __version__ diff --git a/urlchecker/client/__init__.py b/urlchecker/client/__init__.py index f07b256..69ad436 100755 --- a/urlchecker/client/__init__.py +++ b/urlchecker/client/__init__.py @@ -2,7 +2,7 @@ """ -Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat +Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat This source code is licensed under the terms of the MIT license. For a copy, see . diff --git a/urlchecker/client/check.py b/urlchecker/client/check.py index 2450df5..fd8ce1a 100644 --- a/urlchecker/client/check.py +++ b/urlchecker/client/check.py @@ -1,6 +1,6 @@ """ client/github.py: entrypoint for interaction with a GitHub repostiory. -Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat +Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat """ import re @@ -106,9 +106,9 @@ def main(args, extra): if args.verbose: print("\n\U0001F914 Uh oh... The following urls did not pass:") for file_name, result in checker.checks.items(): - if result.failed: + if result["failed"]: print_failure(file_name + ":") - for url in result.failed: + for url in result["failed"]: print_failure(" " + url) else: print("\n\U0001F914 Uh oh... The following urls did not pass:") diff --git a/urlchecker/core/check.py b/urlchecker/core/check.py index 0604233..471a058 100644 --- a/urlchecker/core/check.py +++ b/urlchecker/core/check.py @@ -1,6 +1,6 @@ """ -Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat +Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat This source code is licensed under the terms of the MIT license. For a copy, see . @@ -12,6 +12,7 @@ import re import sys from urlchecker.core import fileproc +from urlchecker.core.worker import Workers from urlchecker.core.urlproc import UrlCheckResult @@ -41,6 +42,8 @@ def __init__( """ # Initiate results object, and checks lookup (holds UrlCheck) for each file self.results = {"passed": set(), "failed": set(), "excluded": set()} + + # Results organized by filename self.checks = {} # Save run parameters @@ -123,12 +126,18 @@ def save_results(self, file_path, sep=",", header=None, relative_paths=True): else: file_name = os.path.relpath(file_name) - [writer.writerow([url, "failed", file_name]) for url in result.failed] + [ + writer.writerow([url, "failed", file_name]) + for url in result["failed"] + ] [ writer.writerow([url, "excluded", file_name]) - for url in result.excluded + for url in result["excluded"] + ] + [ + writer.writerow([url, "passed", file_name]) + for url in result["passed"] ] - [writer.writerow([url, "passed", file_name]) for url in result.passed] return file_path @@ -161,27 +170,56 @@ def run( exclude_urls = exclude_urls or [] exclude_patterns = exclude_patterns or [] - # loop through files files - for file_name in file_paths: - - # Instantiate a checker to extract urls - checker = UrlCheckResult( - file_name=file_name, - exclude_patterns=exclude_patterns, - exclude_urls=exclude_urls, - print_all=self.print_all, - ) - - # Check the urls - checker.check_urls(retry_count=retry_count, timeout=timeout) + # Run with multiprocessing + tasks = {} + funcs = {} + workers = Workers() - # Update flattened results - self.results["failed"].update(checker.failed) - self.results["passed"].update(checker.passed) - self.results["excluded"].update(checker.excluded) + # loop through files + for file_name in file_paths: - # Save the checker in the lookup - self.checks[file_name] = checker + # Export parameters and functions, use the same check task for all + tasks[file_name] = { + "file_name": file_name, + "exclude_patterns": exclude_patterns, + "exclude_urls": exclude_urls, + "print_all": self.print_all, + "retry_count": retry_count, + "timeout": timeout, + } + funcs[file_name] = check_task + + results = workers.run(funcs, tasks) + for file_name, result in results.items(): + self.checks[file_name] = result + self.results["failed"].update(result["failed"]) + self.results["passed"].update(result["passed"]) + self.results["excluded"].update(result["excluded"]) # A flattened dict of passed and failed return self.results + + +def check_task(*args, **kwargs): + """ + A checking task, the default we use + """ + # Instantiate a checker to extract urls + checker = UrlCheckResult( + file_name=kwargs["file_name"], + exclude_patterns=kwargs.get("exclude_patterns", []), + exclude_urls=kwargs.get("exclude_urls", []), + print_all=kwargs.get("print_all", True), + ) + + # Check the urls + checker.check_urls( + retry_count=kwargs.get("retry_count", 2), timeout=kwargs.get("timeout", 5) + ) + + # Update flattened results + return { + "failed": checker.failed, + "passed": checker.passed, + "excluded": checker.excluded, + } diff --git a/urlchecker/core/exclude.py b/urlchecker/core/exclude.py index 4d49d17..255ee00 100644 --- a/urlchecker/core/exclude.py +++ b/urlchecker/core/exclude.py @@ -1,6 +1,6 @@ """ -Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat +Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat This source code is licensed under the terms of the MIT license. For a copy, see . diff --git a/urlchecker/core/fileproc.py b/urlchecker/core/fileproc.py index 3cf8258..8cd04a3 100644 --- a/urlchecker/core/fileproc.py +++ b/urlchecker/core/fileproc.py @@ -1,6 +1,6 @@ """ -Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat +Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat This source code is licensed under the terms of the MIT license. For a copy, see . diff --git a/urlchecker/core/urlmarker.py b/urlchecker/core/urlmarker.py index e5a4acc..0098974 100644 --- a/urlchecker/core/urlmarker.py +++ b/urlchecker/core/urlmarker.py @@ -4,7 +4,7 @@ http://daringfireball.net/2010/07/improved_regex_for_matching_urls https://gist.github.com/gruber/8891611 -Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat +Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat This source code is licensed under the terms of the MIT license. For a copy, see . diff --git a/urlchecker/core/urlproc.py b/urlchecker/core/urlproc.py index beb2dae..8404e8d 100644 --- a/urlchecker/core/urlproc.py +++ b/urlchecker/core/urlproc.py @@ -1,6 +1,6 @@ """ -Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat +Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat This source code is licensed under the terms of the MIT license. For a copy, see . @@ -168,14 +168,9 @@ def check_urls(self, urls=None, retry_count=1, timeout=5): # if no urls are found, mention it if required if not urls: if self.print_all: - if self.file_name: - print("\n", self.file_name, "\n", "-" * len(self.file_name)) print("No urls found.") return - if self.file_name: - print("\n", self.file_name, "\n", "-" * len(self.file_name)) - # init seen urls list seen = set() diff --git a/urlchecker/core/worker.py b/urlchecker/core/worker.py new file mode 100644 index 0000000..7b06f73 --- /dev/null +++ b/urlchecker/core/worker.py @@ -0,0 +1,110 @@ +""" + +Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat + +This source code is licensed under the terms of the MIT license. +For a copy, see . + +""" + +import itertools +import multiprocessing +import os +import time +import signal +import sys + +from urlchecker.logger import get_logger + +logger = get_logger() + + +class Workers: + def __init__(self, workers=None): + + if workers is None: + workers = int(os.environ.get("URLCHECKER_WORKERS", 9)) + self.workers = workers + logger.debug(f"Using {self.workers} workers for multiprocess.") + + def start(self): + logger.debug("Starting multiprocess") + self.start_time = time.time() + + def end(self): + self.end_time = time.time() + self.runtime = self.runtime = self.end_time - self.start_time + logger.debug(f"Ending multiprocess, runtime: {self.runtime} sec") + + def run(self, funcs, tasks): + """run will send a list of tasks, a tuple with arguments, through a function. + the arguments should be ordered correctly. + + Parameters + ========== + funcs: the functions to run with multiprocessing.pool, a dictionary + with lookup by the task name + tasks: a dict of tasks, each task name (key) with a + tuple of arguments to process + """ + # Number of tasks must == number of functions + assert len(funcs) == len(tasks) + + # Keep track of some progress for the user + progress = 1 + + # if we don't have tasks, don't run + if not tasks: + return + + # results will also have the same key to look up + finished = dict() + results = [] + + try: + pool = multiprocessing.Pool(self.workers, init_worker) + + self.start() + for key, params in tasks.items(): + func = funcs[key] + logger.info("Processing task %s:%s" % (key, params)) + result = pool.apply_async(multi_wrapper, multi_package(func, [params])) + + # Store the key with the result + results.append((key, result)) + + while len(results) > 0: + pair = results.pop() + key, result = pair + result.wait() + progress += 1 + finished[key] = result.get() + + self.end() + pool.close() + pool.join() + + except (KeyboardInterrupt, SystemExit): + logger.error("Keyboard interrupt detected, terminating workers!") + pool.terminate() + sys.exit(1) + + except: + logger.exit("Error running task.") + + return finished + + +# Supporting functions for MultiProcess Worker +def init_worker(): + signal.signal(signal.SIGINT, signal.SIG_IGN) + + +def multi_wrapper(func_args): + function, kwargs = func_args + return function(**kwargs) + + +def multi_package(func, kwargs): + zipped = zip(itertools.repeat(func), kwargs) + return zipped diff --git a/urlchecker/logger.py b/urlchecker/logger.py index 9fe968d..c26fae1 100644 --- a/urlchecker/logger.py +++ b/urlchecker/logger.py @@ -1,6 +1,6 @@ """ -Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat +Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat This source code is licensed under the terms of the MIT license. For a copy, see . diff --git a/urlchecker/main/github.py b/urlchecker/main/github.py index ab8ab52..f5a1feb 100644 --- a/urlchecker/main/github.py +++ b/urlchecker/main/github.py @@ -1,6 +1,6 @@ """ -Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat +Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat This source code is licensed under the terms of the MIT license. For a copy, see . diff --git a/urlchecker/main/utils.py b/urlchecker/main/utils.py index 8fc00cc..1ad1707 100644 --- a/urlchecker/main/utils.py +++ b/urlchecker/main/utils.py @@ -1,6 +1,6 @@ """ -Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat +Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat This source code is licensed under the terms of the MIT license. For a copy, see . diff --git a/urlchecker/version.py b/urlchecker/version.py index 127dce8..df63da4 100644 --- a/urlchecker/version.py +++ b/urlchecker/version.py @@ -1,13 +1,13 @@ """ -Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat +Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat This source code is licensed under the terms of the MIT license. For a copy, see . """ -__version__ = "0.0.25" +__version__ = "0.0.26" AUTHOR = "Ayoub Malek, Vanessa Sochat" AUTHOR_EMAIL = "superkogito@gmail.com, vsochat@stanford.edu" NAME = "urlchecker" From 0821d5d8414ff40fd859d5362e99465079c617b7 Mon Sep 17 00:00:00 2001 From: vsoch Date: Fri, 25 Mar 2022 22:21:03 -0600 Subject: [PATCH 2/2] remove extra verbose logging of task info Signed-off-by: vsoch --- urlchecker/core/worker.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/urlchecker/core/worker.py b/urlchecker/core/worker.py index 7b06f73..66efa05 100644 --- a/urlchecker/core/worker.py +++ b/urlchecker/core/worker.py @@ -67,7 +67,6 @@ def run(self, funcs, tasks): self.start() for key, params in tasks.items(): func = funcs[key] - logger.info("Processing task %s:%s" % (key, params)) result = pool.apply_async(multi_wrapper, multi_package(func, [params])) # Store the key with the result @@ -90,7 +89,7 @@ def run(self, funcs, tasks): sys.exit(1) except: - logger.exit("Error running task.") + logger.exit("Error running task") return finished