Skip to content

Commit 8a187c4

Browse files
committed
Add ReplitScrapper and GithubArchiver classes
1 parent 4a54b4d commit 8a187c4

File tree

9 files changed

+473
-0
lines changed

9 files changed

+473
-0
lines changed

.DS_Store

6 KB
Binary file not shown.

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# Act bin file for local testing
2+
bin/
3+
14
# Byte-compiled / optimized / DLL files
25
__pycache__/
36
*.py[cod]
@@ -158,3 +161,6 @@ cython_debug/
158161
# and can be added to the global gitignore or merged into this file. For a more nuclear
159162
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
160163
#.idea/
164+
165+
playwright/.auth
166+
screen-shots

Makefile

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
test_all:
2+
python -m unittest discover -v
3+
4+
test_replit_scrapper:
5+
python -m unittest ./tests/test_replit_scrapper.py
6+
7+
test_github_archiver:
8+
python -m unittest ./tests/test_github_archiver.py
9+
10+
lint:
11+
flake8

funcs/github_archiver.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
from abc import ABC, abstractmethod
2+
import os
3+
from github import Github, Auth, InputGitTreeElement
4+
import base64
5+
6+
7+
class GithubArchiverInterface(ABC):
    """Contract for archiving a downloaded Replit project to GitHub."""

    @abstractmethod
    def identify_target_files(self) -> None:
        """Collect the files to be pushed to GitHub, excluding Replit's
        system files.

        Raises an error if the target folder does not exist or is empty.
        (Fixed: the original abstract methods were declared without ``self``.)
        """

    @abstractmethod
    def commit_to_github(self) -> None:
        """Commit the previously identified target files to GitHub."""


class GithubArchiver(GithubArchiverInterface):
    """Pushes a locally extracted Replit project to a GitHub repository.

    Workflow: call identify_target_files() to build the file list, then
    commit_to_github() to create a single commit on the repo's main branch.
    """

    # Files/dirs Replit adds to a project that should never be archived.
    _REPLIT_JUNK = (
        '.cache',
        '.upm',
        '.replit',
        'poetry.lock',
        'pyproject.toml',
        'replit_zip_error_log.txt',
        'replit.nix',
    )

    def __init__(self, project_name, github_access_token,
                 commit_message="Auto-archive", repo_name="The-Archive",
                 download_folder="./screen-shots") -> None:
        """
        :param project_name: name of the extracted project folder.
        :param github_access_token: GitHub personal access token.
        :param commit_message: message for the archive commit.
        :param repo_name: target repository name (previously hard-coded
            to "The-Archive"; default preserves old behavior).
        :param download_folder: folder containing the extracted project
            (previously hard-coded to "./screen-shots").
        """
        self._project_name = project_name
        self._file_paths = dict()   # relative path -> absolute path on disk
        self._file_list = list()    # relative paths, in discovery order
        self._commit_sha = ""
        self.__github_access_token = github_access_token
        self._commit_message = commit_message
        self._repo_name = repo_name
        self._download_folder = download_folder

    def get_project_name(self) -> str:
        return self._project_name

    def identify_target_files(self) -> None:
        """Walk the extracted project folder and record every file to archive.

        Raises ValueError when the folder is missing or empty (the original
        used ``assert``, which is silently stripped under ``python -O``).
        """
        print("GithubArchiver: Begin to parse target files...")
        extracted_folder_path = os.path.join(self._download_folder, self.get_project_name())
        if not os.path.isdir(extracted_folder_path):
            raise ValueError("Target folder does not exist")
        if not os.listdir(extracted_folder_path):
            raise ValueError("Target folder is empty")

        # Walk through the directory and its subdirectories
        for root, dirs, files in os.walk(extracted_folder_path):
            for file in files:
                file_full_path = os.path.join(root, file)
                # Rebase the path so it starts with the project name,
                # matching the layout wanted inside the archive repo.
                file_relative_path = file_full_path.replace(
                    extracted_folder_path, self.get_project_name()
                )
                if not any(excluded in file_relative_path for excluded in self._REPLIT_JUNK):
                    self._file_paths[file_relative_path] = file_full_path
                    self._file_list.append(file_relative_path)

        print("GithubArchiver: Target files are parsed")

    def get_target_files(self) -> list:
        return self._file_list

    def commit_to_github(self) -> None:
        """Create one commit on the repo's main branch containing all
        identified target files, then fast-forward ``heads/main`` to it.

        Requires identify_target_files() to have been called first;
        raises ValueError otherwise (was an ``assert`` — see above).
        """
        print("GithubArchiver: Begin to upload files to Github...")
        if not self._file_list:
            raise ValueError("Target files are not identified")
        auth = Auth.Token(self.__github_access_token)
        g = Github(auth=auth)
        try:
            repo = g.get_user().get_repo(self._repo_name)
            main_branch = repo.get_branch("main")
            main_tree = repo.get_git_tree(sha=main_branch.commit.sha)

            tree = list()
            for file_relative_path, file_full_path in self._file_paths.items():
                with open(file_full_path, "rb") as file:
                    file_content = file.read()

                # GitHub's blob API expects base64 text for binary-safe upload.
                file_content_based64 = base64.b64encode(file_content)

                blob = repo.create_git_blob(
                    content=file_content_based64.decode('utf-8'),
                    encoding="base64"
                )

                tree.append(
                    InputGitTreeElement(
                        path=file_relative_path,
                        mode="100644",
                        type="blob",
                        sha=blob.sha,
                    )
                )

            new_tree = repo.create_git_tree(
                tree=tree,
                base_tree=main_tree
            )

            commit = repo.create_git_commit(
                message=self._commit_message,
                # create_git_tree already returns a GitTree; the original
                # re-fetched it with an extra API call.
                tree=new_tree,
                parents=[repo.get_git_commit(main_branch.commit.sha)],
            )

            archive_ref = repo.get_git_ref(ref='heads/main')
            print(f"GithubArchiver: Archive_ref is {archive_ref}")
            self._commit_sha = commit.sha

            # Commit to Github: fast-forward main to the new commit.
            archive_ref.edit(sha=commit.sha)
            print("GithubArchiver: Upload complete")
        finally:
            # Always release the underlying HTTP session, even on failure
            # (original leaked it when any API call raised).
            g.close()

    def get_commit_sha(self) -> str:
        return self._commit_sha

funcs/replit_scrapper.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
from playwright.sync_api import sync_playwright
2+
from playwright_stealth import stealth_sync
3+
4+
5+
class ReplitScrapper():
    """Logs into Replit with Playwright and downloads a repl's files as a zip.

    Usage: construct with credentials, set_replit_url(), then run().
    The downloaded zip is saved under ./screen-shots and its name is
    available via get_downloaded_filename().
    """

    # Fixed desktop Edge user agent, used to look like a regular browser.
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/116.0.0.0 "
        "Safari/537.36 "
        "Edg/116.0.1938.81"
    )

    def __init__(self, login_name, login_password):
        self.__login_name = login_name
        self.__login_password = login_password
        self._replit_url = None
        self._downloaded_filename = None

    def set_replit_url(self, replit_url) -> None:
        if replit_url is None:
            # Fixed: original raised a bare ValueError with no message.
            raise ValueError("replit_url must not be None")
        self._replit_url = replit_url

    def get_replit_url(self) -> str:
        if self._replit_url is None:
            raise ValueError("Missing replit_url")
        return self._replit_url

    def _set_downloaded_filename(self, filename) -> None:
        if filename is None:
            raise ValueError("ReplitScrapper._set_downloaded_filename() argument is None")
        self._downloaded_filename = filename

    def get_downloaded_filename(self) -> str:
        if self._downloaded_filename is None:
            raise ValueError("Missing downloaded_filename")
        return self._downloaded_filename

    def _visit_replit_repo(self, page) -> None:
        """Open the target repl page; raise ValueError on a non-200 response."""
        response = page.goto(self.get_replit_url(), wait_until="domcontentloaded")
        if response.status != 200:
            print(f"response.status = {response.status}")
            if response.status == 404:
                raise ValueError("Invalid replit_url")
            raise ValueError("ReplitScrapper._visit_replit_repo() something other than 404 happened")

    def _login_replit(self, page) -> None:
        # Login via the form; the credential check is an XHR to Google's
        # identitytoolkit endpoint, so we watch for that response.
        page.goto('https://replit.com/login', wait_until="domcontentloaded")
        page.screenshot(path="./screen-shots/replit.png")
        url_init = "https://identitytoolkit.googleapis.com/v1/accounts"
        with page.expect_response(lambda response: url_init in response.url) as response_info:
            page.locator(
                "xpath=/html/body/div[1]/div/div[2]/div/main/div[2]/div/form/div[1]/input"
            ).fill(self.__login_name)
            page.locator(
                "xpath=/html/body/div[1]/div/div[2]/div/main/div[2]/div/form/div[2]/div/input"
            ).fill(self.__login_password)
            page.locator(
                "xpath=/html/body/div[1]/div/div[2]/div/main/div[2]/div/form/div[3]/button"
            ).click()
        response = response_info.value
        if response.status != 200:
            print(response)
            print(f"response.status = {response.status}")
            if response.status == 400:
                raise ValueError("Invalid login credentials")
            # Fixed: message previously said "401" although 400 is checked.
            raise ValueError("ReplitScrapper._login_replit() something other than 400 happened")
        page.wait_for_url("https://replit.com/~")
        page.screenshot(path="./screen-shots/replit_after_login.png")

    def _download_as_zip(self, page) -> None:
        # Wait for the workspace UI to finish loading.
        page.locator(
            "xpath=/html/body/div[1]/div[1]/div[1]/div[2]/div/div[1]/div/div[3]/div/div[1]/button/div/span"
        ).wait_for()
        run_button = page.locator(
            "xpath=/html/body/div[1]/div[1]/div[1]/div[2]/header/div[2]/button"
        )
        # Poll until the Run button settles; bounded so a stuck page cannot
        # hang the scraper forever (original looped without limit).
        for _ in range(60):  # ~2 minutes at 2 s per poll
            if run_button.text_content() == "Run":
                break
            print(run_button.text_content())
            page.wait_for_timeout(2000)
        else:
            raise TimeoutError("Workspace never became ready (Run button not shown)")
        page.screenshot(path="./screen-shots/target_page.png")

        # Begin downloading: open the menu, then click "Download as zip".
        page.locator(
            "xpath=/html/body/div[1]/div[1]/div[1]/div[2]/div/div[1]/div/div[2]/div[1]/div[1]/div/button[3]"
        ).click()
        with page.expect_download() as download_info:
            page.locator(
                "xpath=/html/body/div[@class='css-1o92kwk']//div[@id='item-4']//div[@class='css-1l2rn59']"
            ).click()
        download = download_info.value
        self._set_downloaded_filename(download.suggested_filename)
        download.save_as(f"./screen-shots/{download.suggested_filename}")

    def run(self):
        """Full flow: login, open the repl, download its files as a zip."""
        print("ReplitScrapper: Begin downloading repo files...")
        with sync_playwright() as p:
            # Context setup; slow_mo makes the automation less bot-like.
            browser = p.chromium.launch(slow_mo=50)
            context = browser.new_context(user_agent=ReplitScrapper.user_agent)
            page = context.new_page()
            stealth_sync(page)

            # Login replit
            self._login_replit(page)

            # Download repo files as zip
            self._visit_replit_repo(page)
            self._download_as_zip(page)

            # Clean-up
            context.close()
            browser.close()
        print("ReplitScrapper: Download complete")

main.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from funcs.replit_scrapper import ReplitScrapper
from funcs.github_archiver import GithubArchiver
import os
import zipfile
from dotenv import load_dotenv
load_dotenv()

# Fixed: original used os.path.dirname(__name__); __name__ is "__main__",
# so dirname() returned "" and only resolved to the CWD by accident.
# __file__ is the script's actual location.
WDIR = os.path.abspath(os.path.dirname(__file__))

if __name__ == "__main__":
    test_url = "https://replit.com/@pythondojoarchi/SlipperyGargantuanDebuggers"
    project_name = "SlipperyGargantuanDebuggers"

    # Download repo files as zip
    scrapper = ReplitScrapper(login_name=os.environ['EMAIL'], login_password=os.environ['PASSWORD'])
    scrapper.set_replit_url(test_url)
    scrapper.run()

    # Unzip downloaded zip file
    download_folder_path = os.path.join(WDIR, "screen-shots")
    full_file_path = os.path.join(download_folder_path, project_name + ".zip")
    extracted_folder_path = os.path.join(download_folder_path, project_name)
    # Context manager releases the zip's file handle (original left it open).
    with zipfile.ZipFile(full_file_path) as archive:
        archive.extractall(extracted_folder_path)

    # Commit target files to Github
    archiver = GithubArchiver(
        project_name=project_name,
        github_access_token=os.environ['GITHUB_ACCESS_TOKEN']
    )
    archiver.identify_target_files()
    archiver.commit_to_github()

requirements.txt

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,59 @@
1+
appnope==0.1.3
2+
asttokens==2.4.1
3+
certifi==2023.11.17
4+
cffi==1.16.0
5+
charset-normalizer==3.3.2
6+
comm==0.2.0
7+
cryptography==41.0.7
8+
debugpy==1.8.0
9+
decorator==5.1.1
10+
Deprecated==1.2.14
11+
executing==2.0.1
112
flake8==6.1.0
13+
greenlet==3.0.1
14+
idna==3.6
15+
iniconfig==2.0.0
16+
ipykernel==6.27.1
17+
ipython==8.19.0
18+
jedi==0.19.1
19+
jupyter_client==8.6.0
20+
jupyter_core==5.5.1
21+
matplotlib-inline==0.1.6
222
mccabe==0.7.0
23+
nest-asyncio==1.5.8
24+
packaging==23.2
25+
parso==0.8.3
26+
pexpect==4.9.0
27+
platformdirs==4.1.0
28+
playwright==1.40.0
29+
playwright-stealth==1.0.6
30+
pluggy==1.3.0
31+
prompt-toolkit==3.0.43
32+
psutil==5.9.7
33+
ptyprocess==0.7.0
34+
pure-eval==0.2.2
335
pycodestyle==2.11.0
36+
pycparser==2.21
37+
pyee==11.0.1
438
pyflakes==3.1.0
39+
PyGithub==2.1.1
40+
Pygments==2.17.2
41+
PyJWT==2.8.0
42+
PyNaCl==1.5.0
43+
pytest==7.4.3
44+
pytest-base-url==2.0.0
45+
pytest-playwright==0.4.3
46+
python-dateutil==2.8.2
47+
python-dotenv==1.0.0
48+
python-slugify==8.0.1
49+
pyzmq==25.1.2
50+
requests==2.31.0
51+
six==1.16.0
52+
stack-data==0.6.3
53+
text-unidecode==1.3
54+
tornado==6.4
55+
traitlets==5.14.0
56+
typing_extensions==4.9.0
57+
urllib3==2.1.0
58+
wcwidth==0.2.12
59+
wrapt==1.16.0

0 commit comments

Comments
 (0)