
Commit 3278bfb

Add method for extracting data from about page
1 parent ec5bf12 commit 3278bfb

3 files changed: 152 additions, 0 deletions

loading_sdk/api.py

Lines changed: 74 additions & 0 deletions
@@ -1,16 +1,81 @@
+import json
 import math
+import re
 
 import requests
+from bs4 import BeautifulSoup
 
 from loading_sdk.settings import (
     API_URL,
     API_VERSION,
+    BASE_URL,
     EDITORIAL_POST_TYPES,
     EDITORIAL_SORT,
     USER_AGENT,
 )
 
 
+class AboutPageExtractor:
+    def __init__(self):
+        about_page_source = self._get_source(f"{BASE_URL}/om")
+        main_script_url = self._extract_main_script_url(about_page_source)
+        main_script_source = self._get_source(f"{BASE_URL}/{main_script_url}")
+        about_script_url = self._get_about_script_url(main_script_source)
+        about_script_source = self._get_source(about_script_url)
+
+        self.data = self._get_about_data(about_script_source)
+
+    def _get_source(self, url):
+        headers = {"User-Agent": USER_AGENT}
+        response = requests.get(url, headers=headers)
+
+        return response.text
+
+    def _get_about_script_url(self, source_code):
+        chunk_urls = []
+
+        # Extracts the code with the javascript chunks.
+        p = re.compile("(static/js/).+?(?=\{)(.+?(?=\[)).+(.chunk.js)")
+        m = p.search(source_code)
+
+        if m:
+            # Transform the code into valid JSON so the chunk ids can be stored in a python dict.
+            s = re.sub(r"([0-9]+?(?=:))", r'"\1"', m.group(2))
+            chunk_ids = json.loads(s)
+
+            for k, v in chunk_ids.items():
+                chunk_url = f"{BASE_URL}/{m.group(1)}{k}.{v}{m.group(3)}"
+                chunk_urls.append(chunk_url)
+
+        return chunk_urls[-1]
+
+    def _get_about_data(self, source_code):
+        m = re.search("var.e=(.+?)(?=\.map).+a=(.+?)(?=\.map)", source_code)
+
+        if m:
+            people = re.sub(r"(\{|\,)([a-z]+)(\:)", r'\1"\2"\3', m.group(1))
+            people = re.sub(r"(.+)(')(.+)(')(.+)", r'\1"\3"\5', people)
+            people = people.replace('slags "vuxen p', "slags 'vuxen p")
+            people = people.replace('riktigt"-framtid', "riktigt'-framtid")
+            people = people.replace("\\n", "")
+            people = people.encode("utf-8").decode("unicode_escape")
+
+            moderators = re.sub(r"(\{|\,)([a-z]+)(\:)", r'\1"\2"\3', m.group(2))
+            moderators = re.sub(r"(.+)(')(.+)(')(.+)", r'\1"\3"\5', moderators)
+            moderators = moderators.replace("\\n", "")
+            moderators = moderators.encode("utf-8").decode("unicode_escape")
+
+            about = {"people": json.loads(people), "moderators": json.loads(moderators)}
+
+            return about
+
+    def _extract_main_script_url(self, html):
+        soup = BeautifulSoup(html, "html.parser")
+        main_script = soup.find(src=re.compile("/static/js/main\.[0-9a-zA-Z]+\.js"))
+
+        return main_script["src"][1:]
+
+
 class LoadingApiClient:
     """A client that allows python apps to easily communicate with the loading forums web api.
@@ -456,3 +521,12 @@ def edit_thread(self, thread_id, message):
         thread_data["message"] = "Thread updated"
 
         return thread_data
+
+    def get_about(self):
+        """Get about page data
+
+        :rtype dict
+        """
+        about_page = AboutPageExtractor()
+
+        return about_page.data
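
A short usage sketch for the new synchronous method (not part of the commit). Constructing LoadingApiClient without arguments is an assumption here, since the constructor signature is not shown in this diff; the about data itself is scraped from public pages and should need no authentication.

# Hypothetical usage of the new LoadingApiClient.get_about() method.
from loading_sdk.api import LoadingApiClient

client = LoadingApiClient()  # assumption: no credentials needed for this read-only call
about = client.get_about()

# get_about() returns the dict built by AboutPageExtractor.
print(about["people"])      # entries parsed from the about-page script
print(about["moderators"])  # forum moderators parsed from the same script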

loading_sdk/async_api.py

Lines changed: 77 additions & 0 deletions
@@ -1,9 +1,14 @@
+import json
 import math
+import re
+
 import aiohttp
+from bs4 import BeautifulSoup
 
 from loading_sdk.settings import (
     API_URL,
     API_VERSION,
+    BASE_URL,
     EDITORIAL_POST_TYPES,
     EDITORIAL_SORT,
     USER_AGENT,
@@ -17,6 +22,68 @@ async def async_loading_api_client(email=None, password=None):
     return client
 
 
+class AboutPageExtractor:
+    async def extract_about_data(self):
+        about_page_source = await self._get_source(f"{BASE_URL}/om")
+        main_script_url = self._extract_main_script_url(about_page_source)
+        main_script_source = await self._get_source(f"{BASE_URL}/{main_script_url}")
+        about_script_url = self._get_about_script_url(main_script_source)
+        about_script_source = await self._get_source(about_script_url)
+
+        return self._get_about_data(about_script_source)
+
+    async def _get_source(self, url):
+        headers = {"User-Agent": USER_AGENT}
+
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url, headers=headers) as response:
+                return await response.text()
+
+    def _get_about_script_url(self, source_code):
+        chunk_urls = []
+
+        # Extracts the code with the javascript chunks.
+        p = re.compile("(static/js/).+?(?=\{)(.+?(?=\[)).+(.chunk.js)")
+        m = p.search(source_code)
+
+        if m:
+            # Transform the code into valid JSON so the chunk ids can be stored in a python dict.
+            s = re.sub(r"([0-9]+?(?=:))", r'"\1"', m.group(2))
+            chunk_ids = json.loads(s)
+
+            for k, v in chunk_ids.items():
+                chunk_url = f"{BASE_URL}/{m.group(1)}{k}.{v}{m.group(3)}"
+                chunk_urls.append(chunk_url)
+
+        return chunk_urls[-1]
+
+    def _get_about_data(self, source_code):
+        m = re.search("var.e=(.+?)(?=\.map).+a=(.+?)(?=\.map)", source_code)
+
+        if m:
+            people = re.sub(r"(\{|\,)([a-z]+)(\:)", r'\1"\2"\3', m.group(1))
+            people = re.sub(r"(.+)(')(.+)(')(.+)", r'\1"\3"\5', people)
+            people = people.replace('slags "vuxen p', "slags 'vuxen p")
+            people = people.replace('riktigt"-framtid', "riktigt'-framtid")
+            people = people.replace("\\n", "")
+            people = people.encode("utf-8").decode("unicode_escape")
+
+            moderators = re.sub(r"(\{|\,)([a-z]+)(\:)", r'\1"\2"\3', m.group(2))
+            moderators = re.sub(r"(.+)(')(.+)(')(.+)", r'\1"\3"\5', moderators)
+            moderators = moderators.replace("\\n", "")
+            moderators = moderators.encode("utf-8").decode("unicode_escape")
+
+            about = {"people": json.loads(people), "moderators": json.loads(moderators)}
+
+            return about
+
+    def _extract_main_script_url(self, html):
+        soup = BeautifulSoup(html, "html.parser")
+        main_script = soup.find(src=re.compile("/static/js/main\.[0-9a-zA-Z]+\.js"))
+
+        return main_script["src"][1:]
+
+
 class AsyncLoadingApiClient:
     """
     An async client that allows python apps to easily communicate with the loading forums web api.
@@ -490,3 +557,13 @@ async def edit_thread(self, thread_id, message):
         thread_data["message"] = "Thread updated"
 
         return thread_data
+
+    async def get_about(self):
+        """Get about page data
+
+        :rtype dict
+        """
+        about_page = AboutPageExtractor()
+        about_data = await about_page.extract_about_data()
+
+        return about_data
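
A hedged sketch of calling the async variant. The async_loading_api_client factory (email and password optional) is the one shown in the diff context above; everything else here is illustrative rather than part of the commit.

import asyncio

from loading_sdk.async_api import async_loading_api_client


async def main():
    # The factory accepts optional email/password; the about data does not need them.
    client = await async_loading_api_client()
    about = await client.get_about()
    print(sorted(about))  # expected keys: ['moderators', 'people']


asyncio.run(main())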

loading_sdk/settings.py

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+BASE_URL = "https://loading.se"
 API_URL = "https://api.loading.se"
 API_VERSION = "v1"
 USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0"
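
To make the _get_about_script_url step in both files easier to follow, here is a standalone sketch (not part of the commit) of how the webpack chunk map is turned into chunk URLs. The sample JavaScript fragment is invented for illustration; real main.*.js bundles embed something of this shape.

import json
import re

BASE_URL = "https://loading.se"  # same constant the commit adds to settings.py

# Invented example of the object literal captured by group(2) of the commit's
# regex: webpack chunk ids mapped to their content hashes.
js_chunk_map = '{3:"a1b2c3d4",7:"e5f6a7b8"}'

# Quote the integer keys so the literal becomes valid JSON (same re.sub as the commit).
as_json = re.sub(r"([0-9]+?(?=:))", r'"\1"', js_chunk_map)
chunk_ids = json.loads(as_json)  # {'3': 'a1b2c3d4', '7': 'e5f6a7b8'}

# Rebuild the chunk URLs the same way the commit does; the extractor then
# downloads the last one, which holds the about-page data.
chunk_urls = [f"{BASE_URL}/static/js/{k}.{v}.chunk.js" for k, v in chunk_ids.items()]
print(chunk_urls[-1])  # https://loading.se/static/js/7.e5f6a7b8.chunk.js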

0 commit comments
