     'Vector'
 ]
 
+os.makedirs("./logs", exist_ok=True)  # ensure the log directory exists before opening the log file
+log_filename = f"./logs/scrape_log_{time.strftime('%Y%m%d_%H%M%S', time.localtime())}.log"
+log_file = open(log_filename, "a", encoding="utf-8")
+
+def log(message: str):
+    global log_file
+    timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    log_entry = f"[{timestamp}] {message}"
+    print(log_entry)
+    log_file.write(log_entry + "\n")
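+    # Note: writes are buffered, so recent entries can be lost on a crash;
+    # a log_file.flush() here would make each entry durable immediately.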
+
 def clean_category(category_name: str) -> str:
     if category_name.endswith("events"):
         return category_name[:-7]
@@ -54,7 +64,7 @@ def fix_category(category_name: str) -> str:
     return category_name
 
 def parse_links(source_label: str, url: str) -> dict:
-    print(f"Parsing list of {source_label}...")
+    log(f"Parsing list of {source_label}...")
 
     response = requests.get(url)
     soup = BeautifulSoup(response.text, "html.parser")
@@ -83,8 +93,18 @@ def parse_links(source_label: str, url: str) -> dict:
             continue
         page_url = a.get("href")
         page_url = f"https://wiki.multitheftauto.com{page_url}"
-        if name not in result[current_category]:
+        # Skip names that already appear in any category of the result
+        found_in_cat = None
+        for cat, entries in result.items():
+            if any(entry[1] == name for entry in entries):
+                found_in_cat = cat
+                break
+        if not found_in_cat:
             result[current_category].append((page_url, name))
+        else:
+            log(f"!!! Duplicate found in {found_in_cat} when parsing {current_category}: {page_url}")
 
     return result
 
@@ -274,7 +292,7 @@ def parse_description(content_div):
         if text and not text.isspace():
             the_description = convert_to_markdown(str(element))
             the_description = the_description.strip()
-            # print(f"Found description for {name}: {the_description}")
+            # log(f"Found description for {name}: {the_description}")
             break
         elif element.name in ["h2", "h3"]:
             # Stop at the first header
@@ -288,7 +306,7 @@ def parse_description(content_div):
             if text and not text.isspace():
                 the_description = convert_to_markdown(str(div))
                 the_description = the_description.strip()
-                # print(f"Found description in div for {name}: {the_description}")
+                # log(f"Found description in div for {name}: {the_description}")
                 break
 
     return the_description
@@ -351,8 +369,8 @@ def print_additional_headers_found_in_page(content_div, handled_header_names, pa
             additional_headers.append(header_text)
 
     if additional_headers:
-        print(f"Other headers found in {page_url}:")
-        print(f"  {', '.join(additional_headers)}")
+        log(f"Other headers found in {page_url}:")
+        log(f"  {', '.join(additional_headers)}")
 
 def parse_event_page(page_url: str, category: str, name: str, source: str) -> dict:
     response_text = get_page_from_cache_or_fetch(page_url, name)
@@ -474,7 +492,7 @@ def parse_event_page(page_url: str, category: str, name: str, source: str) -> di
     handled_header_names.append("Examples")
     handled_header_names.append("Example")
     if len(examples) == 0:
-        print(f"Event is missing code examples: {page_url}")
+        log(f"Event is missing code examples: {page_url}")
 
     example_index = 1
     added_examples = []
@@ -603,7 +621,7 @@ def parse_function_page(page_url: str, category: str, name: str, source: str) ->
     handled_header_names.append("Examples")
     handled_header_names.append("Example")
     # if len(examples) == 0:
-    #     print(f"Function is missing code examples: {page_url}")
+    #     log(f"Function is missing code examples: {page_url}")
 
     example_index = 1
     added_examples = []
@@ -670,7 +688,7 @@ def convert_page_to_yaml(page_url: str, category: str, name: str, source: str) -
 def parse_items_by_source(base_dir, data_by_source):
     for source, categories in data_by_source.items():
         started_at = time.time()
-        print(f"Parsing individual pages of {source}...")
+        log(f"»»» Parsing individual pages of {source}...")
         for category, entries in categories.items():
             dir_path = os.path.join(base_dir, category)
             os.makedirs(dir_path, exist_ok=True)
@@ -684,18 +702,18 @@ def parse_items_by_source(base_dir, data_by_source):
                     file_content += convert_page_to_yaml(page_url, category, name, source)
                     f.write(file_content)
             except Exception as e:
-                print(e)
+                log(str(e))
                 # Cancel and continue to next entry, closing/deleting file if needed
                 if os.path.exists(filename):
                     os.remove(filename)
 
-        print(f">> Parsed individual pages of {source} in {time.time() - started_at:.2f} seconds.")
+        log(f">> Parsed individual pages of {source} in {time.time() - started_at:.2f} seconds.\n")
 
 def main():
     # Create cache directory if it doesn't exist
     if not os.path.exists(PAGES_CACHE_DIR):
         os.makedirs(PAGES_CACHE_DIR)
-    print("SKIP_CACHE is set to", SKIP_CACHE)
+    log(f"SKIP_CACHE is set to {SKIP_CACHE}")
 
     functions_by_source = {}
     events_by_source = {}
@@ -720,8 +738,14 @@ def main():
     if os.path.exists("./output"):
         shutil.rmtree("./output")
 
+    log(" ")
+
     parse_items_by_source(FUNCTIONS_DIR, functions_by_source)
     parse_items_by_source(EVENTS_DIR, events_by_source)
 
+    # Close log file
+    log_file.close()
+
 if __name__ == "__main__":
     main()
+