
Commit c520a5b

Scraper: add logging to file
1 parent 079965c commit c520a5b

File tree (3 files changed, +49 −24 lines):
  migrate/oldwiki/.gitignore
  migrate/oldwiki/copy_files.py
  migrate/oldwiki/scrape.py

migrate/oldwiki/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 output/
 cache/
 images/
+logs/
 __pycache__/

migrate/oldwiki/copy_files.py

Lines changed: 12 additions & 12 deletions
@@ -6,24 +6,23 @@
 OUTPUT_FUNCTIONS = "../../functions"
 OUTPUT_EVENTS = "../../events"
 
-# Set to false so that the script doesn't override .yaml
-# files that have the 'incomplete: true'
-OVERRIDE_INCOMPLETE_PAGES = True
-
-def copy_files(source_dir, target_dir):
+def copy_files(page_type, source_dir, target_dir):
     for root, dirs, files in os.walk(source_dir):
         for file in files:
             if file.endswith(".yaml"):
                 src_path = os.path.join(root, file)
                 rel_path = os.path.relpath(src_path, source_dir)
                 dest_path = os.path.join(target_dir, rel_path)
 
-                # Check if destination .yaml has 'incomplete' attribute
-                if (not OVERRIDE_INCOMPLETE_PAGES) and os.path.exists(dest_path):
+                if not os.path.exists(dest_path):
+                    # Don't copy because it doesn't exist in the output
+                    print(f"(YAML) Skipping {dest_path} because it doesn't exist in the output")
+                    continue
+                else:
                     with open(dest_path, 'r', encoding='utf-8') as dest_file:
                         content = dest_file.read()
                         if 'incomplete: true' not in content:
-                            print(f"Skipping {dest_path} due to 'incomplete: true'")
+                            print(f"(YAML) Skipping {dest_path} because it's not marked as incomplete")
                             continue
 
                 os.makedirs(os.path.dirname(dest_path), exist_ok=True)
@@ -49,10 +48,11 @@ def copy_files(source_dir, target_dir):
 
 if __name__ == "__main__":
     # Copy all generated YAML files from the old wiki migration to the definitive folders
-    # print("Copying functions...")
-    # copy_files(MIGRATE_FUNCTIONS, OUTPUT_FUNCTIONS)
+
+    print("Copying functions...")
+    copy_files('functions', MIGRATE_FUNCTIONS, OUTPUT_FUNCTIONS)
 
-    print("Copying events...")
-    copy_files(MIGRATE_EVENTS, OUTPUT_EVENTS)
+    # print("Copying events...")
+    # copy_files('events', MIGRATE_EVENTS, OUTPUT_EVENTS)
 
     print("Copy completed.")

migrate/oldwiki/scrape.py

Lines changed: 36 additions & 12 deletions
@@ -40,6 +40,16 @@
     'Vector'
 ]
 
+log_filename = f"./logs/scrape_log_{time.strftime('%Y%m%d_%H%M%S', time.localtime())}.log"
+log_file = open(log_filename, "a", encoding="utf-8")
+
+def log(message: str):
+    global log_file
+    timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    log_entry = f"[{timestamp}] {message}"
+    print(log_entry)
+    log_file.write(log_entry + "\n")
+
 def clean_category(category_name: str) -> str:
     if category_name.endswith("events"):
         return category_name[:-7]
@@ -54,7 +64,7 @@ def fix_category(category_name: str) -> str:
     return category_name
 
 def parse_links(source_label: str, url: str) -> dict:
-    print(f"Parsing list of {source_label} ...")
+    log(f"Parsing list of {source_label} ...")
 
     response = requests.get(url)
     soup = BeautifulSoup(response.text, "html.parser")
@@ -83,8 +93,16 @@ def parse_links(source_label: str, url: str) -> dict:
            continue
        page_url = a.get("href")
        page_url = f"https://wiki.multitheftauto.com{page_url}"
-       if name not in result[current_category]:
+       # Check if name not in any result category
+       foundInCat = False
+       for cat, entries in result.items():
+           if any(entry[1] == name for entry in entries):
+               foundInCat = cat
+               break
+       if not foundInCat:
            result[current_category].append((page_url, name))
+       else:
+           log(f"!!! Duplicate found in {foundInCat} when parsing {current_category}: {page_url}")
 
     return result
 
@@ -274,7 +292,7 @@ def parse_description(content_div):
        if text and not text.isspace():
            the_description = convert_to_markdown(str(element))
            the_description = the_description.strip()
-           # print(f"Found description for {name}: {the_description}")
+           # log(f"Found description for {name}: {the_description}")
            break
        elif element.name in ["h2", "h3"]:
            # Stop at the first header
@@ -288,7 +306,7 @@ def parse_description(content_div):
        if text and not text.isspace():
            the_description = convert_to_markdown(str(div))
            the_description = the_description.strip()
-           # print(f"Found description in div for {name}: {the_description}")
+           # log(f"Found description in div for {name}: {the_description}")
            break
 
     return the_description
@@ -351,8 +369,8 @@ def print_additional_headers_found_in_page(content_div, handled_header_names, page_url):
            additional_headers.append(header_text)
 
     if additional_headers:
-        print(f"Other headers found in {page_url}:")
-        print(f" {', '.join(additional_headers)}")
+        log(f"Other headers found in {page_url}:")
+        log(f" {', '.join(additional_headers)}")
 
 def parse_event_page(page_url: str, category: str, name: str, source: str) -> dict:
     response_text = get_page_from_cache_or_fetch(page_url, name)
@@ -474,7 +492,7 @@ def parse_event_page(page_url: str, category: str, name: str, source: str) -> dict:
     handled_header_names.append("Examples")
     handled_header_names.append("Example")
     if len(examples) == 0:
-        print(f"Event is missing code examples: {page_url}")
+        log(f"Event is missing code examples: {page_url}")
 
     example_index = 1
     added_examples = []
@@ -603,7 +621,7 @@ def parse_function_page(page_url: str, category: str, name: str, source: str) ->
     handled_header_names.append("Examples")
     handled_header_names.append("Example")
     # if len(examples) == 0:
-    #     print(f"Function is missing code examples: {page_url}")
+    #     log(f"Function is missing code examples: {page_url}")
 
     example_index = 1
     added_examples = []
@@ -670,7 +688,7 @@ def convert_page_to_yaml(page_url: str, category: str, name: str, source: str) -
 def parse_items_by_source(base_dir, data_by_source):
     for source, categories in data_by_source.items():
         started_at = time.time()
-        print(f"Parsing individual pages of {source}...")
+        log(f"»»» Parsing individual pages of {source}...")
         for category, entries in categories.items():
             dir_path = os.path.join(base_dir, category)
             os.makedirs(dir_path, exist_ok=True)
@@ -684,18 +702,18 @@ def parse_items_by_source(base_dir, data_by_source):
                    file_content += convert_page_to_yaml(page_url, category, name, source)
                    f.write(file_content)
                except Exception as e:
-                   print(e)
+                   log(e)
                    # Cancel and continue to next entry, closing/deleting file if needed
                    if os.path.exists(filename):
                        os.remove(filename)
 
-        print(f">> Parsed individual pages of {source} in {time.time() - started_at:.2f} seconds.")
+        log(f">> Parsed individual pages of {source} in {time.time() - started_at:.2f} seconds.\n")
 
 def main():
     # Create cache directory if it doesn't exist
     if not os.path.exists(PAGES_CACHE_DIR):
         os.makedirs(PAGES_CACHE_DIR)
-    print("SKIP_CACHE is set to", SKIP_CACHE)
+    log(f"SKIP_CACHE is set to {SKIP_CACHE}")
 
     functions_by_source = {}
     events_by_source = {}
@@ -720,8 +738,14 @@ def main():
     if os.path.exists("./output"):
         shutil.rmtree("./output")
 
+    log(" ")
+
     parse_items_by_source(FUNCTIONS_DIR, functions_by_source)
     parse_items_by_source(EVENTS_DIR, events_by_source)
 
+    # Close log file
+    log_file.close()
+
 if __name__ == "__main__":
     main()
+
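To summarize the change in scrape.py, here is a minimal, self-contained sketch (not part of the commit) of the logging pattern it introduces: every message is timestamped, echoed to stdout, and appended to a per-run file under ./logs/. The makedirs call and the sample message are assumptions added for the sketch; the real script relies on logs/ existing and uses its own messages.

import os
import time

os.makedirs("./logs", exist_ok=True)  # assumption for this sketch; the commit only gitignores logs/
log_filename = f"./logs/scrape_log_{time.strftime('%Y%m%d_%H%M%S', time.localtime())}.log"
log_file = open(log_filename, "a", encoding="utf-8")

def log(message: str):
    # Timestamp each entry, print it, and append it to the per-run log file.
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    log_entry = f"[{timestamp}] {message}"
    print(log_entry)
    log_file.write(log_entry + "\n")

log("Parsing list of functions ...")  # example message, mirroring the diff
log_file.close()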
