67 commits
2d41003
Adding a 10 second break to ensure the retry is not done before the A…
joaopamaral Oct 21, 2022
a256a89
Revert "Adding a 10 second break to ensure the retry is not done befo…
joaopamaral Oct 21, 2022
e2ebb1b
Adding configurable min rate limit and change round to ceil in calcul…
joaopamaral Oct 21, 2022
f01ceb6
Fix comment in test_rate_limit_wait_with_min_rate_limit_defined test
joaopamaral Oct 21, 2022
4ecaaae
Adding `remain` to variable names
joaopamaral Oct 21, 2022
6be3a2a
Stopping requests on last page and increasing max per page
joaopamaral Jan 12, 2023
ac14f78
Extracting prepare_url to method
joaopamaral Jan 12, 2023
7c9fdae
nit
joaopamaral Jan 12, 2023
2d361d3
nit
joaopamaral Jan 12, 2023
8c2d281
Returning when seconds to sleep is 0
joaopamaral Jan 19, 2023
a67f788
Refactoring tests
joaopamaral Jan 19, 2023
077f8f4
Fixing TestAuthedGetAllPages tests to include per_page argument
joaopamaral Jan 19, 2023
0ad5a28
Fixing pagination exceed limit issue
joaopamaral Jan 19, 2023
7864389
Simplifying the pagination loop
joaopamaral Jan 19, 2023
d780571
Logging url instead of source
joaopamaral Jan 19, 2023
3d46ebc
Fixing next url
joaopamaral Jan 19, 2023
5b53c97
Fixing next url
joaopamaral Jan 19, 2023
1cddabe
Merge pull request #1 from Automattic/add_trim_results_config
joaopamaral Jan 20, 2023
35431de
Removing _sdc_repository field
joaopamaral Jun 27, 2023
5a19813
fixing discussion_url schema
joaopamaral Jun 27, 2023
510a5a7
Bump version
joaopamaral Jun 27, 2023
452f364
Test reverting _sdc_repository
joaopamaral Jun 27, 2023
e6aa4b7
Merge pull request #3 from Automattic/fix_releases_schema
joaopamaral Jun 27, 2023
f310388
Add retry attempt to BadCredentialsException
joaopamaral Sep 26, 2023
9ef11c4
Add TestBadCredentialsBackoff
joaopamaral Sep 26, 2023
f7d4aa0
Add TestBadCredentialsBackoff
joaopamaral Sep 26, 2023
8623de7
Setting BadCredentialsException max_tries to 3
joaopamaral Sep 26, 2023
40e33e8
Merge pull request #4 from Automattic/add_retry_attempt_to_bad_creden…
joaopamaral Sep 26, 2023
dd73f3a
Not skip 404 on verify repo access
joaopamaral Nov 23, 2023
c264905
Add test
joaopamaral Nov 23, 2023
cf95950
Bump version
joaopamaral Nov 23, 2023
b771ee9
Merge pull request #5 from Automattic/fix/not_skip_on_404
joaopamaral Nov 23, 2023
bf330e3
Add open issues stream
joaopamaral Aug 23, 2024
c40e793
bump version
joaopamaral Aug 23, 2024
d3d12ff
Merge pull request #6 from Automattic/add/open_issues_stream
joaopamaral Aug 27, 2024
8ebcb49
Remove exception if no `X-RateLimit-Remaining` is found in the he…
joaopamaral Sep 28, 2024
eedad3d
Remove exception if no `X-RateLimit-Remaining` is found in the he…
joaopamaral Sep 28, 2024
5a75eb9
Merge pull request #8 from Automattic/do-not-raise-exception-if-no-Ra…
joaopamaral Sep 28, 2024
e5933ec
Revert "Remove the exception if no X-RateLimit-Remaining is found in …
joaopamaral Oct 7, 2024
cadb365
Bump correct version
joaopamaral Oct 7, 2024
e1d3cf7
Merge pull request #9 from Automattic/revert-8-do-not-raise-exception…
joaopamaral Oct 7, 2024
c91b035
Add Forked repos stream
joaopamaral Jan 13, 2025
55bbe38
Fix
joaopamaral Jan 14, 2025
5464ab1
Fix tests
joaopamaral Jan 14, 2025
0d4347c
Bump version
joaopamaral Jan 14, 2025
3a94da0
Update tap_github/streams.py
joaopamaral Jan 16, 2025
1183f66
Refactor auth get pages
joaopamaral Jan 16, 2025
abae8ec
Merge remote-tracking branch 'origin/repos-forked-streams' into repos…
joaopamaral Jan 16, 2025
9db65db
Add error_message
joaopamaral Jan 17, 2025
823d4b7
Merge pull request #10 from Automattic/repos-forked-streams
joaopamaral Jan 17, 2025
c85e523
Python 3.12 support: update requests dependency
Khrol Aug 26, 2025
4f22d47
Merge pull request #12 from Automattic/Khrol-patch-1
Khrol Aug 26, 2025
281bb76
Freeze constraints
Khrol Sep 2, 2025
777debc
Merge pull request #13 from Automattic/freeze_install
Khrol Sep 2, 2025
195d731
Rate limit fixes
jcellary Oct 30, 2025
c557810
style fix
jcellary Oct 30, 2025
5fd9177
fixed limit
jcellary Oct 30, 2025
46fa9aa
added comments to exceptions
jcellary Oct 30, 2025
9caf600
Merge pull request #14 from Automattic/rate-limit-fixes
jcellary Oct 30, 2025
e1577f4
Added _sdc_repository to commit comments
jcellary Nov 12, 2025
c0afa77
Merge pull request #15 from Automattic/commit-comments-repository
jcellary Nov 12, 2025
fb1b137
Add fail-fast check for archived repositories
claude Nov 25, 2025
3dde1aa
bumped version
jcellary Dec 4, 2025
39ccb79
Merge pull request #16 from Automattic/claude/github-tap-archived-che…
jcellary Dec 4, 2025
8e069e9
all issue milestones
jcellary Dec 18, 2025
759c1e5
bumped version
jcellary Dec 18, 2025
e7d3ecc
Merge pull request #17 from Automattic/issue-milestones-all
jcellary Dec 19, 2025
4 changes: 4 additions & 0 deletions README.md
@@ -66,6 +66,10 @@ This tap:
"base_url": "https://api.github.com"
}
```
> Note: The maximum number of results per page is configurable via the `max_per_page` parameter;
> it defaults to 100, which is the maximum for most endpoints.
4. Run the tap in discovery mode to get properties.json file
```bash
3 changes: 2 additions & 1 deletion config.sample.json
@@ -3,5 +3,6 @@
"repository": "singer-io/target-stitch",
"start_date": "2021-01-01T00:00:00Z",
"request_timeout": 300,
"base_url": "https://api.github.com"
"base_url": "https://api.github.com",
"extract_archived": "false"
}
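
The sample sets `extract_archived` as the JSON string `"false"` rather than a boolean. A minimal sketch of the coercion the tap applies, mirroring the parsing added in `tap_github/client.py` below (the inputs here are illustrative):

```python
# Mirrors the string-to-bool coercion in GithubClient.__init__;
# any casing of "true" enables archived extraction, everything else disables it.
for raw in ('false', 'False', 'true', 'TRUE', True):
    extract_archived = str(raw).lower() == 'true'
    print(repr(raw), '->', extract_archived)
# 'false' -> False, 'False' -> False, 'true' -> True, 'TRUE' -> True, True -> True
```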
36 changes: 36 additions & 0 deletions constraints.txt
@@ -0,0 +1,36 @@
#
# This file is autogenerated by pip-compile with Python 3.12
# by the following command:
#
# pip-compile --output-file=constraints.txt setup.py
#
--index-url https://nexus.a8c.com/repository/pypi/simple

backoff==1.8.0
# via
# singer-python
# tap-github (setup.py)
certifi==2025.8.3
# via requests
charset-normalizer==3.4.3
# via requests
ciso8601==2.3.3
# via singer-python
idna==3.10
# via requests
jsonschema==2.6.0
# via singer-python
python-dateutil==2.9.0.post0
# via singer-python
pytz==2018.4
# via singer-python
requests==2.32.5
# via tap-github (setup.py)
simplejson==3.11.1
# via singer-python
singer-python==5.12.1
# via tap-github (setup.py)
six==1.17.0
# via python-dateutil
urllib3==2.5.0
# via requests
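
Presumably this file is consumed at install time via pip's constraints flag, e.g. `pip install -e . -c constraints.txt`, so resolved versions stay frozen to the pins above (an assumption; the install instructions are not part of this diff).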
4 changes: 2 additions & 2 deletions setup.py
@@ -3,15 +3,15 @@
from setuptools import setup, find_packages

setup(name='tap-github',
version='2.0.0',
version='2.0.15',
description='Singer.io tap for extracting data from the GitHub API',
author='Stitch',
url='http://singer.io',
classifiers=['Programming Language :: Python :: 3 :: Only'],
py_modules=['tap_github'],
install_requires=[
'singer-python==5.12.1',
'requests==2.20.0',
'requests==2.32.5',
'backoff==1.8.0'
],
extras_require={
134 changes: 113 additions & 21 deletions tap_github/client.py
@@ -1,17 +1,27 @@
import time
import requests
from requests.models import PreparedRequest
import backoff
from simplejson import JSONDecodeError
import singer
from singer import metrics
from math import ceil

LOGGER = singer.get_logger()
DEFAULT_SLEEP_SECONDS = 600
DEFAULT_MIN_REMAIN_RATE_LIMIT = 0
DEFAULT_MAX_PER_PAGE = 100
DEFAULT_DOMAIN = "https://api.github.com"

# Set default timeout of 300 seconds
REQUEST_TIMEOUT = 300

# How many total seconds to keep retrying when the API returns a rate-limit error. The limit resets every hour.
RATE_LIMIT_RETRY_MAX_TIME = 3600

PAGINATION_EXCEED_MSG = 'In order to keep the API fast for everyone, pagination is limited for this resource.'
RATE_LIMIT_EXCEED_MSG = 'API rate limit exceeded'

class GithubException(Exception):
pass

@@ -45,12 +55,22 @@ class MovedPermanentlyError(GithubException):
class ConflictError(GithubException):
pass

# Thrown when we receive 403 Rate Limit Exceeded from Github API
class RateLimitExceeded(GithubException):
pass

# Thrown when we're expected to sleep for longer than the max_sleep_seconds limit
class RateLimitSleepExceeded(GithubException):
pass

# Thrown when 429 is received from Github API
class TooManyRequests(GithubException):
pass

# Thrown when repository is archived and extract_archived is not enabled
class ArchivedRepositoryError(GithubException):
pass


ERROR_CODE_EXCEPTION_MAPPING = {
301: {
Expand Down Expand Up @@ -105,6 +125,13 @@ def raise_for_error(resp, source, stream, client, should_skip_404):
except JSONDecodeError:
response_json = {}

response_message = response_json.get('message', '')

if error_code == 403 and RATE_LIMIT_EXCEED_MSG in response_message:
message = f"HTTP-error-code: 403, Error: {response_message}"
LOGGER.warning(message)
raise RateLimitExceeded() from None

if error_code == 404 and should_skip_404:
# Add not accessible stream into list.
client.not_accessible_repos.add(stream)
@@ -116,6 +143,14 @@ def raise_for_error(resp, source, stream, client, should_skip_404):
# Don't raise a NotFoundException
return None

if error_code == 422 and PAGINATION_EXCEED_MSG in response_message:
message = f"HTTP-error-code: 422, Error: {response_message}. " \
f"Please refer '{response_json.get('documentation_url')}' for more details." \
"This is a known issue when the results exceed 40k and the last page is not full" \
" (it will trim the results to get only the available by the API)."
LOGGER.warning(message)
return None

message = "HTTP-error-code: {}, Error: {}".format(
error_code, ERROR_CODE_EXCEPTION_MAPPING.get(error_code, {}).get("message", "Unknown Error") if response_json == {} else response_json)

@@ -130,19 +165,24 @@ def calculate_seconds(epoch):
Calculate the seconds to sleep before making a new request.
"""
current = time.time()
return int(round((epoch - current), 0))
return max(0, int(ceil(epoch - current)))
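
A quick worked example of why the `round` → `ceil` change matters: rounding a sub-second remainder down to 0 let the tap retry before the rate-limit window actually reset, while `ceil` plus the `max(0, …)` clamp always sleeps through the reset and never goes negative.

```python
import time
from math import ceil

now = time.time()
reset_epoch = now + 0.3  # reset 0.3 s in the future

print(int(round(reset_epoch - now, 0)))      # 0 -> old code retried immediately
print(max(0, int(ceil(reset_epoch - now))))  # 1 -> new code sleeps past the reset
print(max(0, int(ceil((now - 5) - now))))    # 0 -> a reset in the past never
                                             #      yields a negative sleep
```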

def rate_throttling(response, max_sleep_seconds):
def rate_throttling(response, max_sleep_seconds, min_remain_rate_limit):
"""
For rate limit errors, get the remaining time before retrying and calculate the time to sleep before making a new request.
"""
if "Retry-After" in response.headers:
# handles the secondary rate limit
seconds_to_sleep = int(response.headers['Retry-After'])
LOGGER.info("Retry-After header found in response. Tap will retry the data collection after %s seconds.", seconds_to_sleep)
time.sleep(seconds_to_sleep)
if 'X-RateLimit-Remaining' in response.headers:
if int(response.headers['X-RateLimit-Remaining']) == 0:
seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset']))
if int(response.headers['X-RateLimit-Remaining']) <= min_remain_rate_limit:
seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset']) + 15)

if seconds_to_sleep > max_sleep_seconds:
message = "API rate limit exceeded, please try after {} seconds.".format(seconds_to_sleep)
raise RateLimitExceeded(message) from None
raise RateLimitSleepExceeded(message) from None

LOGGER.info("API rate limit exceeded. Tap will retry the data collection after %s seconds.", seconds_to_sleep)
time.sleep(seconds_to_sleep)
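
A sketch of the new threshold behaviour, assuming `rate_throttling` from this diff is in scope; the response below is a hypothetical stand-in, since only its headers are read:

```python
import time
from types import SimpleNamespace

# Hypothetical headers: 40 requests left, window resets in 120 s.
fake_resp = SimpleNamespace(headers={
    'X-RateLimit-Remaining': '40',
    'X-RateLimit-Reset': str(int(time.time()) + 120),
})

# With min_remain_rate_limit=50 the remaining budget (40) is at or below the
# threshold, so this sleeps until ~15 s past the reset (~135 s in total)
# instead of spending the last 40 requests and hitting a hard 403.
rate_throttling(fake_resp, max_sleep_seconds=600, min_remain_rate_limit=50)
```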
@@ -160,8 +200,13 @@ def __init__(self, config):
self.session = requests.Session()
self.base_url = config['base_url'] if config.get('base_url') else DEFAULT_DOMAIN
self.max_sleep_seconds = self.config.get('max_sleep_seconds', DEFAULT_SLEEP_SECONDS)
self.min_remain_rate_limit = self.config.get('min_remain_rate_limit', DEFAULT_MIN_REMAIN_RATE_LIMIT)
self.set_auth_in_session()
self.not_accessible_repos = set()
self.max_per_page = self.config.get('max_per_page', DEFAULT_MAX_PER_PAGE)
# Convert string 'true'/'false' to boolean, default to False
extract_archived_value = str(self.config.get('extract_archived', 'false')).lower()
self.extract_archived = extract_archived_value == 'true'

def get_request_timeout(self):
"""
@@ -187,52 +232,88 @@ def set_auth_in_session(self):
# pylint: disable=dangerous-default-value
# During 'Timeout' error there is also possibility of 'ConnectionError',
# hence added backoff for 'ConnectionError' too.
@backoff.on_exception(backoff.expo, (requests.Timeout, requests.ConnectionError, Server5xxError, TooManyRequests), max_tries=5, factor=2)
def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True):
@backoff.on_exception(backoff.expo, (requests.Timeout, requests.ConnectionError, Server5xxError, TooManyRequests),
max_tries=5, factor=2)
@backoff.on_exception(backoff.expo, (BadCredentialsException, ), max_tries=3, factor=2)
@backoff.on_exception(backoff.constant, (RateLimitExceeded, ), interval=60, jitter=None, max_time=RATE_LIMIT_RETRY_MAX_TIME)
def authed_get_single_page(self, source, url, headers={}, stream="", should_skip_404 = True):
"""
Call the REST API and return the response if the status code is 200.
"""
with metrics.http_request_timer(source) as timer:
with metrics.http_request_timer(url) as timer:
self.session.headers.update(headers)
resp = self.session.request(method='get', url=url, timeout=self.get_request_timeout())
if resp.status_code != 200:
raise_for_error(resp, source, stream, self, should_skip_404)
timer.tags[metrics.Tag.http_status_code] = resp.status_code
rate_throttling(resp, self.max_sleep_seconds)
if resp.status_code == 404:
rate_throttling(resp, self.max_sleep_seconds, self.min_remain_rate_limit)
if resp.status_code == 404 or resp.status_code == 422:
# Return an empty response body since we're not raising a NotFoundException
resp._content = b'{}' # pylint: disable=protected-access
resp._content = b'{}' # pylint: disable=protected-access
return resp
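
The three stacked decorators give each failure mode its own retry policy. A self-contained sketch of how `backoff` composes them (the exception and function names here are illustrative):

```python
import backoff

class RateLimitExceeded(Exception):
    """Illustrative stand-in for the tap's RateLimitExceeded."""

# Outer: exponential backoff (2 s, 4 s, 8 s, ...) for transient network-style
# failures, giving up after 5 tries.
@backoff.on_exception(backoff.expo, (ConnectionError,), max_tries=5, factor=2)
# Inner: a flat 60 s between retries on rate-limit errors, for at most an
# hour (the window GitHub resets on).
@backoff.on_exception(backoff.constant, (RateLimitExceeded,),
                      interval=60, jitter=None, max_time=3600)
def flaky_call():
    ...

# Each decorator intercepts only its own exception types, so the policies
# compose independently.
```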

def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_404 = True):
"""
Fetch all pages of records and yield them.
"""
while True:
r = self.authed_get(source, url, headers, stream, should_skip_404)
yield r
next_url = self.prepare_url(url)
while next_url:
response = self.authed_get_single_page(source, next_url, headers, stream, should_skip_404)
yield response

# Fetch the next page if a 'next' link is found in the response.
if 'next' in r.links:
url = r.links['next']['url']
else:
# Break the loop if all pages are fetched.
break
next_url = response.links.get('next', {}).get('url', None)

def authed_get(self, source, url, headers={}, stream="", should_skip_404=True, single_page=False):
if single_page:
yield self.authed_get_single_page(source, url, headers, stream, should_skip_404)
else:
yield from self.authed_get_all_pages(source, url, headers, stream, should_skip_404)
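
A usage sketch of the new dispatch, assuming `client` is a configured `GithubClient` and the endpoint is illustrative; `authed_get` always yields responses, so callers iterate whether they asked for one page or all of them:

```python
url = "https://api.github.com/repos/org/repo/issues"

# Single request, no pagination (e.g. a point lookup):
for resp in client.authed_get("issues", url, single_page=True):
    records = resp.json()

# Paginated: follows the Link: rel="next" headers until the last page.
for resp in client.authed_get("issues", url):
    for record in resp.json():
        ...
```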

def prepare_url(self, url):
"""
Prepare the URL with additional query parameters.
"""
prepared_request = PreparedRequest()
# Include the max-per-page param
prepared_request.prepare_url(url, {'per_page': self.max_per_page})
return prepared_request.url
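
`PreparedRequest.prepare_url` merges the extra params into whatever query string the URL already carries, so callers can pass URLs that already have filters. A minimal demonstration:

```python
from requests.models import PreparedRequest

req = PreparedRequest()
req.prepare_url("https://api.github.com/repos/org/repo/issues?state=all",
                {'per_page': 100})
print(req.url)
# https://api.github.com/repos/org/repo/issues?state=all&per_page=100
```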

def verify_repo_access(self, url_for_repo, repo):
"""
Call the REST API to verify that the user has sufficient permissions to access this repository.
"""
try:
self.authed_get("verifying repository access", url_for_repo)
self.authed_get_single_page("verifying repository access", url_for_repo, should_skip_404=False)
except NotFoundException:
# Throw a user-friendly error message, as this call checks token access
message = "HTTP-error-code: 404, Error: Please check the repository name \'{}\' or you do not have sufficient permissions to access this repository.".format(repo)
raise NotFoundException(message) from None

def check_repo_archived(self, repo):
"""
Check if a repository is archived and raise an error if extract_archived is not enabled.

Args:
repo: Repository in 'org/repo' format

Raises:
ArchivedRepositoryError: If repo is archived and extract_archived config is not true
"""
url = "{}/repos/{}".format(self.base_url, repo)
response = self.authed_get_single_page("checking repository archived status", url, should_skip_404=False)
repo_info = response.json()

if repo_info.get('archived', False):
if not self.extract_archived:
message = "Repository '{}' is archived. To extract data from archived repositories, " \
"set 'extract_archived' to 'true' in the config.".format(repo)
raise ArchivedRepositoryError(message)
LOGGER.warning("Repository '%s' is archived. Proceeding with extraction as 'extract_archived' is enabled.", repo)

def verify_access_for_repo(self):
"""
For all the repositories mentioned in the config, check access for each repo.
Also check whether repositories are archived and fail if extract_archived is not enabled.
"""
repositories, org = self.extract_repos_from_config() # pylint: disable=unused-variable

@@ -244,6 +325,9 @@ def verify_access_for_repo(self):
# Verifying for Repo access
self.verify_repo_access(url_for_repo, repo)

# Check if repository is archived
self.check_repo_archived(repo)

def extract_orgs_from_config(self):
"""
Extracts all organizations from the config
@@ -331,6 +415,14 @@ def get_all_repos(self, organizations: list):
repo
)

# Check if repository is archived (info already available in response)
if repo.get('archived', False):
if not self.extract_archived:
message = "Repository '{}' is archived. To extract data from archived repositories, " \
"set 'extract_archived' to 'true' in the config.".format(repo_full_name)
raise ArchivedRepositoryError(message)
LOGGER.warning("Repository '%s' is archived. Proceeding with extraction as 'extract_archived' is enabled.", repo_full_name)

repos.append(repo_full_name)
except NotFoundException:
# Throw a user-friendly error message, as this call checks token access