
Commit 3278bfb

Add method for extracting data from about page
1 parent ec5bf12 commit 3278bfb

3 files changed: 152 additions, 0 deletions

loading_sdk/api.py

Lines changed: 74 additions & 0 deletions
@@ -1,16 +1,81 @@
+import json
 import math
+import re
 
 import requests
+from bs4 import BeautifulSoup
 
 from loading_sdk.settings import (
     API_URL,
     API_VERSION,
+    BASE_URL,
     EDITORIAL_POST_TYPES,
     EDITORIAL_SORT,
     USER_AGENT,
 )
 
 
+class AboutPageExtractor:
+    def __init__(self):
+        about_page_source = self._get_source(f"{BASE_URL}/om")
+        main_script_url = self._extract_main_script_url(about_page_source)
+        main_script_source = self._get_source(f"{BASE_URL}/{main_script_url}")
+        about_script_url = self._get_about_script_url(main_script_source)
+        about_script_source = self._get_source(about_script_url)
+
+        self.data = self._get_about_data(about_script_source)
+
+    def _get_source(self, url):
+        headers = {"User-Agent": USER_AGENT}
+        response = requests.get(url, headers=headers)
+
+        return response.text
+
+    def _get_about_script_url(self, source_code):
+        chunk_urls = []
+
+        # Extracts the code with the javascript chunks.
+        p = re.compile("(static/js/).+?(?=\{)(.+?(?=\[)).+(.chunk.js)")
+        m = p.search(source_code)
+
+        if m:
+            # Transform the code into valid JSON so the chunk ids can be stored in a python dict.
+            s = re.sub(r"([0-9]+?(?=:))", r'"\1"', m.group(2))
+            chunk_ids = json.loads(s)
+
+            for k, v in chunk_ids.items():
+                chunk_url = f"{BASE_URL}/{m.group(1)}{k}.{v}{m.group(3)}"
+                chunk_urls.append(chunk_url)
+
+        return chunk_urls[-1]
+
+    def _get_about_data(self, source_code):
+        m = re.search("var.e=(.+?)(?=\.map).+a=(.+?)(?=\.map)", source_code)
+
+        if m:
+            people = re.sub(r"(\{|\,)([a-z]+)(\:)", r'\1"\2"\3', m.group(1))
+            people = re.sub(r"(.+)(')(.+)(')(.+)", r'\1"\3"\5', people)
+            people = people.replace('slags "vuxen p', "slags 'vuxen p")
+            people = people.replace('riktigt"-framtid', "riktigt'-framtid")
+            people = people.replace("\\n", "")
+            people = people.encode("utf-8").decode("unicode_escape")
+
+            moderators = re.sub(r"(\{|\,)([a-z]+)(\:)", r'\1"\2"\3', m.group(2))
+            moderators = re.sub(r"(.+)(')(.+)(')(.+)", r'\1"\3"\5', moderators)
+            moderators = moderators.replace("\\n", "")
+            moderators = moderators.encode("utf-8").decode("unicode_escape")
+
+            about = {"people": json.loads(people), "moderators": json.loads(moderators)}
+
+            return about
+
+    def _extract_main_script_url(self, html):
+        soup = BeautifulSoup(html, "html.parser")
+        main_script = soup.find(src=re.compile("/static/js/main\.[0-9a-zA-Z]+\.js"))
+
+        return main_script["src"][1:]
+
+
 class LoadingApiClient:
     """A client that allows python apps to easily communicate with the loading forums web api.
@@ -456,3 +521,12 @@ def edit_thread(self, thread_id, message):
         thread_data["message"] = "Thread updated"
 
         return thread_data
+
+    def get_about(self):
+        """Get about page data
+
+        :rtype dict
+        """
+        about_page = AboutPageExtractor()
+
+        return about_page.data
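
A short usage sketch for the new synchronous method (not part of the commit). Constructing LoadingApiClient without arguments is an assumption here, since the constructor signature is not shown in this diff; the about data itself is scraped from public pages and should need no authentication.

# Hypothetical usage of the new LoadingApiClient.get_about() method.
from loading_sdk.api import LoadingApiClient

client = LoadingApiClient()  # assumption: no credentials needed for this read-only call
about = client.get_about()

# get_about() returns the dict built by AboutPageExtractor.
print(about["people"])      # entries parsed from the about-page script
print(about["moderators"])  # forum moderators parsed from the same script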

loading_sdk/async_api.py

Lines changed: 77 additions & 0 deletions
@@ -1,9 +1,14 @@
+import json
 import math
+import re
+
 import aiohttp
+from bs4 import BeautifulSoup
 
 from loading_sdk.settings import (
     API_URL,
     API_VERSION,
+    BASE_URL,
     EDITORIAL_POST_TYPES,
     EDITORIAL_SORT,
     USER_AGENT,
@@ -17,6 +22,68 @@ async def async_loading_api_client(email=None, password=None):
     return client
 
 
+class AboutPageExtractor:
+    async def extract_about_data(self):
+        about_page_source = await self._get_source(f"{BASE_URL}/om")
+        main_script_url = self._extract_main_script_url(about_page_source)
+        main_script_source = await self._get_source(f"{BASE_URL}/{main_script_url}")
+        about_script_url = self._get_about_script_url(main_script_source)
+        about_script_source = await self._get_source(about_script_url)
+
+        return self._get_about_data(about_script_source)
+
+    async def _get_source(self, url):
+        headers = {"User-Agent": USER_AGENT}
+
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url, headers=headers) as response:
+                return await response.text()
+
+    def _get_about_script_url(self, source_code):
+        chunk_urls = []
+
+        # Extracts the code with the javascript chunks.
+        p = re.compile("(static/js/).+?(?=\{)(.+?(?=\[)).+(.chunk.js)")
+        m = p.search(source_code)
+
+        if m:
+            # Transform the code into valid JSON so the chunk ids can be stored in a python dict.
+            s = re.sub(r"([0-9]+?(?=:))", r'"\1"', m.group(2))
+            chunk_ids = json.loads(s)
+
+            for k, v in chunk_ids.items():
+                chunk_url = f"{BASE_URL}/{m.group(1)}{k}.{v}{m.group(3)}"
+                chunk_urls.append(chunk_url)
+
+        return chunk_urls[-1]
+
+    def _get_about_data(self, source_code):
+        m = re.search("var.e=(.+?)(?=\.map).+a=(.+?)(?=\.map)", source_code)
+
+        if m:
+            people = re.sub(r"(\{|\,)([a-z]+)(\:)", r'\1"\2"\3', m.group(1))
+            people = re.sub(r"(.+)(')(.+)(')(.+)", r'\1"\3"\5', people)
+            people = people.replace('slags "vuxen p', "slags 'vuxen p")
+            people = people.replace('riktigt"-framtid', "riktigt'-framtid")
+            people = people.replace("\\n", "")
+            people = people.encode("utf-8").decode("unicode_escape")
+
+            moderators = re.sub(r"(\{|\,)([a-z]+)(\:)", r'\1"\2"\3', m.group(2))
+            moderators = re.sub(r"(.+)(')(.+)(')(.+)", r'\1"\3"\5', moderators)
+            moderators = moderators.replace("\\n", "")
+            moderators = moderators.encode("utf-8").decode("unicode_escape")
+
+            about = {"people": json.loads(people), "moderators": json.loads(moderators)}
+
+            return about
+
+    def _extract_main_script_url(self, html):
+        soup = BeautifulSoup(html, "html.parser")
+        main_script = soup.find(src=re.compile("/static/js/main\.[0-9a-zA-Z]+\.js"))
+
+        return main_script["src"][1:]
+
+
 class AsyncLoadingApiClient:
     """
     An async client that allows python apps to easily communicate with the loading forums web api.
@@ -490,3 +557,13 @@ async def edit_thread(self, thread_id, message):
         thread_data["message"] = "Thread updated"
 
         return thread_data
+
+    async def get_about(self):
+        """Get about page data
+
+        :rtype dict
+        """
+        about_page = AboutPageExtractor()
+        about_data = await about_page.extract_about_data()
+
+        return about_data
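
A hedged sketch of calling the async variant. The async_loading_api_client factory (email and password optional) is the one shown in the diff context above; everything else here is illustrative rather than part of the commit.

import asyncio

from loading_sdk.async_api import async_loading_api_client


async def main():
    # The factory accepts optional email/password; the about data does not need them.
    client = await async_loading_api_client()
    about = await client.get_about()
    print(sorted(about))  # expected keys: ['moderators', 'people']


asyncio.run(main())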

loading_sdk/settings.py

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+BASE_URL = "https://loading.se"
 API_URL = "https://api.loading.se"
 API_VERSION = "v1"
 USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0"
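
To make the _get_about_script_url step in both files easier to follow, here is a standalone sketch (not part of the commit) of how the webpack chunk map is turned into chunk URLs. The sample JavaScript fragment is invented for illustration; real main.*.js bundles embed something of this shape.

import json
import re

BASE_URL = "https://loading.se"  # same constant the commit adds to settings.py

# Invented example of the object literal captured by group(2) of the commit's
# regex: webpack chunk ids mapped to their content hashes.
js_chunk_map = '{3:"a1b2c3d4",7:"e5f6a7b8"}'

# Quote the integer keys so the literal becomes valid JSON (same re.sub as the commit).
as_json = re.sub(r"([0-9]+?(?=:))", r'"\1"', js_chunk_map)
chunk_ids = json.loads(as_json)  # {'3': 'a1b2c3d4', '7': 'e5f6a7b8'}

# Rebuild the chunk URLs the same way the commit does; the extractor then
# downloads the last one, which holds the about-page data.
chunk_urls = [f"{BASE_URL}/static/js/{k}.{v}.chunk.js" for k, v in chunk_ids.items()]
print(chunk_urls[-1])  # https://loading.se/static/js/7.e5f6a7b8.chunk.js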

0 commit comments
