Skip to content

Commit 8a187c4

Browse files
committed
Add ReplitScrapper and GithubArchiver classes
1 parent 4a54b4d commit 8a187c4

File tree

9 files changed

+473
-0
lines changed

9 files changed

+473
-0
lines changed

.DS_Store

6 KB
Binary file not shown.

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# Act bin file for local testing
2+
bin/
3+
14
# Byte-compiled / optimized / DLL files
25
__pycache__/
36
*.py[cod]
@@ -158,3 +161,6 @@ cython_debug/
158161
# and can be added to the global gitignore or merged into this file. For a more nuclear
159162
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
160163
#.idea/
164+
165+
playwright/.auth
166+
screen-shots

Makefile

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
test_all:
2+
python -m unittest discover -v
3+
4+
test_replit_scrapper:
5+
python -m unittest ./tests/test_replit_scrapper.py
6+
7+
test_github_archiver:
8+
python -m unittest ./tests/test_github_archiver.py
9+
10+
lint:
11+
flake8

funcs/github_archiver.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
from abc import ABC, abstractmethod
2+
import os
3+
from github import Github, Auth, InputGitTreeElement
4+
import base64
5+
6+
7+
class GithubArchiverInterface(ABC):
    """Contract for archiving a downloaded Replit project to GitHub."""

    @abstractmethod
    def identify_target_files(self) -> None:
        """Collect the files to be pushed to GitHub, excluding Replit's
        system files.

        Raises an error if the target folder does not exist or is empty.
        (Fixed: the original abstract methods were declared without ``self``.)
        """

    @abstractmethod
    def commit_to_github(self) -> None:
        """Commit the previously identified target files to GitHub."""


class GithubArchiver(GithubArchiverInterface):
    """Pushes a locally extracted Replit project to a GitHub repository.

    Workflow: call identify_target_files() to build the file list, then
    commit_to_github() to create a single commit on the repo's main branch.
    """

    # Files/dirs Replit adds to a project that should never be archived.
    _REPLIT_JUNK = (
        '.cache',
        '.upm',
        '.replit',
        'poetry.lock',
        'pyproject.toml',
        'replit_zip_error_log.txt',
        'replit.nix',
    )

    def __init__(self, project_name, github_access_token,
                 commit_message="Auto-archive", repo_name="The-Archive",
                 download_folder="./screen-shots") -> None:
        """
        :param project_name: name of the extracted project folder.
        :param github_access_token: GitHub personal access token.
        :param commit_message: message for the archive commit.
        :param repo_name: target repository name (previously hard-coded
            to "The-Archive"; default preserves old behavior).
        :param download_folder: folder containing the extracted project
            (previously hard-coded to "./screen-shots").
        """
        self._project_name = project_name
        self._file_paths = dict()   # relative path -> absolute path on disk
        self._file_list = list()    # relative paths, in discovery order
        self._commit_sha = ""
        self.__github_access_token = github_access_token
        self._commit_message = commit_message
        self._repo_name = repo_name
        self._download_folder = download_folder

    def get_project_name(self) -> str:
        return self._project_name

    def identify_target_files(self) -> None:
        """Walk the extracted project folder and record every file to archive.

        Raises ValueError when the folder is missing or empty (the original
        used ``assert``, which is silently stripped under ``python -O``).
        """
        print("GithubArchiver: Begin to parse target files...")
        extracted_folder_path = os.path.join(self._download_folder, self.get_project_name())
        if not os.path.isdir(extracted_folder_path):
            raise ValueError("Target folder does not exist")
        if not os.listdir(extracted_folder_path):
            raise ValueError("Target folder is empty")

        # Walk through the directory and its subdirectories
        for root, dirs, files in os.walk(extracted_folder_path):
            for file in files:
                file_full_path = os.path.join(root, file)
                # Rebase the path so it starts with the project name,
                # matching the layout wanted inside the archive repo.
                file_relative_path = file_full_path.replace(
                    extracted_folder_path, self.get_project_name()
                )
                if not any(excluded in file_relative_path for excluded in self._REPLIT_JUNK):
                    self._file_paths[file_relative_path] = file_full_path
                    self._file_list.append(file_relative_path)

        print("GithubArchiver: Target files are parsed")

    def get_target_files(self) -> list:
        return self._file_list

    def commit_to_github(self) -> None:
        """Create one commit on the repo's main branch containing all
        identified target files, then fast-forward ``heads/main`` to it.

        Requires identify_target_files() to have been called first;
        raises ValueError otherwise (was an ``assert`` — see above).
        """
        print("GithubArchiver: Begin to upload files to Github...")
        if not self._file_list:
            raise ValueError("Target files are not identified")
        auth = Auth.Token(self.__github_access_token)
        g = Github(auth=auth)
        try:
            repo = g.get_user().get_repo(self._repo_name)
            main_branch = repo.get_branch("main")
            main_tree = repo.get_git_tree(sha=main_branch.commit.sha)

            tree = list()
            for file_relative_path, file_full_path in self._file_paths.items():
                with open(file_full_path, "rb") as file:
                    file_content = file.read()

                # GitHub's blob API expects base64 text for binary-safe upload.
                file_content_based64 = base64.b64encode(file_content)

                blob = repo.create_git_blob(
                    content=file_content_based64.decode('utf-8'),
                    encoding="base64"
                )

                tree.append(
                    InputGitTreeElement(
                        path=file_relative_path,
                        mode="100644",
                        type="blob",
                        sha=blob.sha,
                    )
                )

            new_tree = repo.create_git_tree(
                tree=tree,
                base_tree=main_tree
            )

            commit = repo.create_git_commit(
                message=self._commit_message,
                # create_git_tree already returns a GitTree; the original
                # re-fetched it with an extra API call.
                tree=new_tree,
                parents=[repo.get_git_commit(main_branch.commit.sha)],
            )

            archive_ref = repo.get_git_ref(ref='heads/main')
            print(f"GithubArchiver: Archive_ref is {archive_ref}")
            self._commit_sha = commit.sha

            # Commit to Github: fast-forward main to the new commit.
            archive_ref.edit(sha=commit.sha)
            print("GithubArchiver: Upload complete")
        finally:
            # Always release the underlying HTTP session, even on failure
            # (original leaked it when any API call raised).
            g.close()

    def get_commit_sha(self) -> str:
        return self._commit_sha

funcs/replit_scrapper.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
from playwright.sync_api import sync_playwright
2+
from playwright_stealth import stealth_sync
3+
4+
5+
class ReplitScrapper():
    """Logs into Replit with Playwright and downloads a repl's files as a zip.

    Usage: construct with credentials, set_replit_url(), then run().
    The downloaded zip is saved under ./screen-shots and its name is
    available via get_downloaded_filename().
    """

    # Fixed desktop Edge user agent, used to look like a regular browser.
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/116.0.0.0 "
        "Safari/537.36 "
        "Edg/116.0.1938.81"
    )

    def __init__(self, login_name, login_password):
        self.__login_name = login_name
        self.__login_password = login_password
        self._replit_url = None
        self._downloaded_filename = None

    def set_replit_url(self, replit_url) -> None:
        if replit_url is None:
            # Fixed: original raised a bare ValueError with no message.
            raise ValueError("replit_url must not be None")
        self._replit_url = replit_url

    def get_replit_url(self) -> str:
        if self._replit_url is None:
            raise ValueError("Missing replit_url")
        return self._replit_url

    def _set_downloaded_filename(self, filename) -> None:
        if filename is None:
            raise ValueError("ReplitScrapper._set_downloaded_filename() argument is None")
        self._downloaded_filename = filename

    def get_downloaded_filename(self) -> str:
        if self._downloaded_filename is None:
            raise ValueError("Missing downloaded_filename")
        return self._downloaded_filename

    def _visit_replit_repo(self, page) -> None:
        """Open the target repl page; raise ValueError on a non-200 response."""
        response = page.goto(self.get_replit_url(), wait_until="domcontentloaded")
        if response.status != 200:
            print(f"response.status = {response.status}")
            if response.status == 404:
                raise ValueError("Invalid replit_url")
            raise ValueError("ReplitScrapper._visit_replit_repo() something other than 404 happened")

    def _login_replit(self, page) -> None:
        # Login via the form; the credential check is an XHR to Google's
        # identitytoolkit endpoint, so we watch for that response.
        page.goto('https://replit.com/login', wait_until="domcontentloaded")
        page.screenshot(path="./screen-shots/replit.png")
        url_init = "https://identitytoolkit.googleapis.com/v1/accounts"
        with page.expect_response(lambda response: url_init in response.url) as response_info:
            page.locator(
                "xpath=/html/body/div[1]/div/div[2]/div/main/div[2]/div/form/div[1]/input"
            ).fill(self.__login_name)
            page.locator(
                "xpath=/html/body/div[1]/div/div[2]/div/main/div[2]/div/form/div[2]/div/input"
            ).fill(self.__login_password)
            page.locator(
                "xpath=/html/body/div[1]/div/div[2]/div/main/div[2]/div/form/div[3]/button"
            ).click()
        response = response_info.value
        if response.status != 200:
            print(response)
            print(f"response.status = {response.status}")
            if response.status == 400:
                raise ValueError("Invalid login credentials")
            # Fixed: message previously said "401" although 400 is checked.
            raise ValueError("ReplitScrapper._login_replit() something other than 400 happened")
        page.wait_for_url("https://replit.com/~")
        page.screenshot(path="./screen-shots/replit_after_login.png")

    def _download_as_zip(self, page) -> None:
        # Wait for the workspace UI to finish loading.
        page.locator(
            "xpath=/html/body/div[1]/div[1]/div[1]/div[2]/div/div[1]/div/div[3]/div/div[1]/button/div/span"
        ).wait_for()
        run_button = page.locator(
            "xpath=/html/body/div[1]/div[1]/div[1]/div[2]/header/div[2]/button"
        )
        # Poll until the Run button settles; bounded so a stuck page cannot
        # hang the scraper forever (original looped without limit).
        for _ in range(60):  # ~2 minutes at 2 s per poll
            if run_button.text_content() == "Run":
                break
            print(run_button.text_content())
            page.wait_for_timeout(2000)
        else:
            raise TimeoutError("Workspace never became ready (Run button not shown)")
        page.screenshot(path="./screen-shots/target_page.png")

        # Begin downloading: open the menu, then click "Download as zip".
        page.locator(
            "xpath=/html/body/div[1]/div[1]/div[1]/div[2]/div/div[1]/div/div[2]/div[1]/div[1]/div/button[3]"
        ).click()
        with page.expect_download() as download_info:
            page.locator(
                "xpath=/html/body/div[@class='css-1o92kwk']//div[@id='item-4']//div[@class='css-1l2rn59']"
            ).click()
        download = download_info.value
        self._set_downloaded_filename(download.suggested_filename)
        download.save_as(f"./screen-shots/{download.suggested_filename}")

    def run(self):
        """Full flow: login, open the repl, download its files as a zip."""
        print("ReplitScrapper: Begin downloading repo files...")
        with sync_playwright() as p:
            # Context setup; slow_mo makes the automation less bot-like.
            browser = p.chromium.launch(slow_mo=50)
            context = browser.new_context(user_agent=ReplitScrapper.user_agent)
            page = context.new_page()
            stealth_sync(page)

            # Login replit
            self._login_replit(page)

            # Download repo files as zip
            self._visit_replit_repo(page)
            self._download_as_zip(page)

            # Clean-up
            context.close()
            browser.close()
        print("ReplitScrapper: Download complete")

main.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from funcs.replit_scrapper import ReplitScrapper
from funcs.github_archiver import GithubArchiver
import os
import zipfile
from dotenv import load_dotenv
load_dotenv()

# Fixed: original used os.path.dirname(__name__); __name__ is "__main__",
# so dirname() returned "" and only resolved to the CWD by accident.
# __file__ is the script's actual location.
WDIR = os.path.abspath(os.path.dirname(__file__))

if __name__ == "__main__":
    test_url = "https://replit.com/@pythondojoarchi/SlipperyGargantuanDebuggers"
    project_name = "SlipperyGargantuanDebuggers"

    # Download repo files as zip
    scrapper = ReplitScrapper(login_name=os.environ['EMAIL'], login_password=os.environ['PASSWORD'])
    scrapper.set_replit_url(test_url)
    scrapper.run()

    # Unzip downloaded zip file
    download_folder_path = os.path.join(WDIR, "screen-shots")
    full_file_path = os.path.join(download_folder_path, project_name + ".zip")
    extracted_folder_path = os.path.join(download_folder_path, project_name)
    # Context manager releases the zip's file handle (original left it open).
    with zipfile.ZipFile(full_file_path) as archive:
        archive.extractall(extracted_folder_path)

    # Commit target files to Github
    archiver = GithubArchiver(
        project_name=project_name,
        github_access_token=os.environ['GITHUB_ACCESS_TOKEN']
    )
    archiver.identify_target_files()
    archiver.commit_to_github()

requirements.txt

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,59 @@
1+
appnope==0.1.3
2+
asttokens==2.4.1
3+
certifi==2023.11.17
4+
cffi==1.16.0
5+
charset-normalizer==3.3.2
6+
comm==0.2.0
7+
cryptography==41.0.7
8+
debugpy==1.8.0
9+
decorator==5.1.1
10+
Deprecated==1.2.14
11+
executing==2.0.1
112
flake8==6.1.0
13+
greenlet==3.0.1
14+
idna==3.6
15+
iniconfig==2.0.0
16+
ipykernel==6.27.1
17+
ipython==8.19.0
18+
jedi==0.19.1
19+
jupyter_client==8.6.0
20+
jupyter_core==5.5.1
21+
matplotlib-inline==0.1.6
222
mccabe==0.7.0
23+
nest-asyncio==1.5.8
24+
packaging==23.2
25+
parso==0.8.3
26+
pexpect==4.9.0
27+
platformdirs==4.1.0
28+
playwright==1.40.0
29+
playwright-stealth==1.0.6
30+
pluggy==1.3.0
31+
prompt-toolkit==3.0.43
32+
psutil==5.9.7
33+
ptyprocess==0.7.0
34+
pure-eval==0.2.2
335
pycodestyle==2.11.0
36+
pycparser==2.21
37+
pyee==11.0.1
438
pyflakes==3.1.0
39+
PyGithub==2.1.1
40+
Pygments==2.17.2
41+
PyJWT==2.8.0
42+
PyNaCl==1.5.0
43+
pytest==7.4.3
44+
pytest-base-url==2.0.0
45+
pytest-playwright==0.4.3
46+
python-dateutil==2.8.2
47+
python-dotenv==1.0.0
48+
python-slugify==8.0.1
49+
pyzmq==25.1.2
50+
requests==2.31.0
51+
six==1.16.0
52+
stack-data==0.6.3
53+
text-unidecode==1.3
54+
tornado==6.4
55+
traitlets==5.14.0
56+
typing_extensions==4.9.0
57+
urllib3==2.1.0
58+
wcwidth==0.2.12
59+
wrapt==1.16.0

0 commit comments

Comments
 (0)