diff --git a/Modules/csv_management.py b/Modules/csv_management.py index e1015df..a4733ce 100644 --- a/Modules/csv_management.py +++ b/Modules/csv_management.py @@ -166,8 +166,8 @@ def order_by_time(filename: str = 'Out/output.csv', outfile: str = None, asc: bo data, key=lambda x: x['TimeStampStart'], reverse=False if asc else True) with open(output_path, 'w', newline='', encoding='utf-8') as file: - fieldnames = sorted_data[0].keys() - writer = csv.DictWriter(file, fieldnames=fieldnames) + field_names = sorted_data[0].keys() + writer = csv.DictWriter(file, fieldnames=field_names) writer.writeheader() for row in sorted_data: diff --git a/Modules/disease_header_parser.py b/Modules/disease_header_parser.py index c458ab5..c5a3c6c 100644 --- a/Modules/disease_header_parser.py +++ b/Modules/disease_header_parser.py @@ -36,16 +36,16 @@ def compare_two_word(str1: str, str2: str) -> int: # Python compare words with case sensitive measure. # First, need to check whether the string is two-length or one. - str1_list = str1.split() + str_1_list = str1.split() is_two_word1 = False - if len(str1_list) > 1: + if len(str_1_list) > 1: is_two_word1 = True - str2_list = str2.split() + str_2_list = str2.split() is_two_word2 = False - if len(str2_list) > 1: + if len(str_2_list) > 1: is_two_word2 = True if is_two_word1 and not is_two_word2: @@ -59,20 +59,20 @@ def compare_two_word(str1: str, str2: str) -> int: # normalize so that maximum is 1. 
elif not is_two_word1 and is_two_word2: # since we don't hope the word to match second word, abort the second similarity - lc1 = longest_common_subsequence(str1, str2_list[0]) - if lc1 <= 1: + lc_1 = longest_common_subsequence(str1, str_2_list[0]) + if lc_1 <= 1: temp_score = 0 else: - lc2 = longest_common_subsequence(str1, str2_list[1]) - length1 = (len(str1) + len(str2_list[0])) / 2 - length2 = (len(str1) + len(str2_list[1])) /2 - temp_score = ((lc1 / length1) + (lc2 / length2)) / 2 + lc_2 = longest_common_subsequence(str1, str_2_list[1]) + length_1 = (len(str1) + len(str_2_list[0])) / 2 + length_2 = (len(str1) + len(str_2_list[1])) /2 + temp_score = ((lc_1 / length_1) + (lc_2 / length_2)) / 2 elif is_two_word1 and is_two_word2: - lc1 = longest_common_subsequence(str1_list[0], str2_list[0]) - lc2 = longest_common_subsequence(str1_list[1], str2_list[1]) - length1 = (len(str1_list[0]) + len(str2_list[0])) / 2 - length2 = (len(str1_list[1]) + len(str2_list[1])) / 2 - temp_score = ((lc1 / length1) + (lc2 / length2)) / 2 + lc_1 = longest_common_subsequence(str_1_list[0], str_2_list[0]) + lc_2 = longest_common_subsequence(str_1_list[1], str_2_list[1]) + length_1 = (len(str_1_list[0]) + len(str_2_list[0])) / 2 + length_2 = (len(str_1_list[1]) + len(str_2_list[1])) / 2 + temp_score = ((lc_1 / length_1) + (lc_2 / length_2)) / 2 return temp_score diff --git a/ParsingModels/BrazilModules/brazil_convert_to_table.py b/ParsingModels/BrazilModules/brazil_convert_to_table.py index cfc1fa8..d842f6b 100644 --- a/ParsingModels/BrazilModules/brazil_convert_to_table.py +++ b/ParsingModels/BrazilModules/brazil_convert_to_table.py @@ -20,7 +20,7 @@ from disease_header_parser import detect_diseases from table_conversion_functions import time_to_excel_time, remove_quotes, remove_numbers, month_to_timestamps, week_number_to_datetime -tableHeading = ['Disease Name', +table_heading = ['Disease Name', 'Cases', 'Location Name', 'Country Code', @@ -102,7 +102,7 @@ def 
convert_to_table(important_text: List[str], table_data = [] location_type = remove_quotes(header[0]) - tableHeading[2] = location_type + table_heading[2] = location_type for row in rows: cells = row.split(';') @@ -133,4 +133,4 @@ def convert_to_table(important_text: List[str], time_to_excel_time(timestamps[0]), time_to_excel_time(timestamps[1])]) - return table_data, tableHeading + return table_data, table_heading diff --git a/ParsingModels/SriLankaModules/convert_to_table.py b/ParsingModels/SriLankaModules/convert_to_table.py index 42586a3..8883210 100644 --- a/ParsingModels/SriLankaModules/convert_to_table.py +++ b/ParsingModels/SriLankaModules/convert_to_table.py @@ -30,7 +30,7 @@ # Gets the directory of LLaMaInterface module for import # from LLaMaInterface import separateCellHeaders -tableHeading = ['Disease Name', +table_heading = ['Disease Name', 'Cases', 'Location Name', 'Country Code', @@ -319,6 +319,6 @@ def convert_to_table(important_text: List[str], table = convert_to_table(TEST_DATA, [datetime(2023, 6, 9) +timedelta(days=-7), datetime(2023, 6, 9)]) - print_table(table, tableHeading) + print_table(table, table_heading) from table_to_csv import print_to_csv - print_to_csv(table,tableHeading) + print_to_csv(table,table_heading) diff --git a/ParsingModels/SriLankaModules/rtf_extractor.py b/ParsingModels/SriLankaModules/rtf_extractor.py index 6eae74a..759185b 100644 --- a/ParsingModels/SriLankaModules/rtf_extractor.py +++ b/ParsingModels/SriLankaModules/rtf_extractor.py @@ -80,7 +80,7 @@ def extract_table(first_word: str, end_words: List[str], text: str) -> Optional[ #print("starting word found: ",subset) try: #print("DEBUG: checking", subset.split()[1]) - temp_diseases = detect_diseases((subset)) + temp_diseases = detect_diseases((subset),True) except ValueError: subset = "" found_starting_word = False diff --git a/ParsingModels/brazil_parser.py b/ParsingModels/brazil_parser.py index 7658267..b2b805e 100644 --- a/ParsingModels/brazil_parser.py +++ 
b/ParsingModels/brazil_parser.py @@ -13,7 +13,7 @@ moudlues_directory = os.path.join(current_directory, '../Modules') sys.path.append(moudlues_directory) -from ParsingModels.BrazilModules.brazil_convert_to_table import convert_to_table, tableHeading +from ParsingModels.BrazilModules.brazil_convert_to_table import convert_to_table, table_heading from table_conversion_functions import print_table diff --git a/ParsingModels/parser_template.py b/ParsingModels/parser_template.py index d0e722a..fccc9f5 100644 --- a/ParsingModels/parser_template.py +++ b/ParsingModels/parser_template.py @@ -18,7 +18,7 @@ from typing import List, Tuple from ParsingModels.. import # Refer rtf_extractor at SriLankaModules. It should give the important text (what is relevant) then timestamp. -from ParsingModels.. import convert_to_table, tableHeading +from ParsingModels.. import convert_to_table, table_heading # Refer brazil, then srilanka. These are two common structure of convert_to_table. from table_conversion_functions import print_table @@ -83,7 +83,7 @@ def extract_to_table(rtf_data: List[str], # or somewhat similar. # you can see the exact format as an output using print_table() function. - heading = tableHeading + heading = table_heading # header of your desire. 
diff --git a/ParsingModels/sriLankaParser.py b/ParsingModels/sriLankaParser.py index 30fc242..ab42ab6 100644 --- a/ParsingModels/sriLankaParser.py +++ b/ParsingModels/sriLankaParser.py @@ -12,7 +12,7 @@ sys.path.append(moudlues_directory) from typing import List, Tuple -from ParsingModels.SriLankaModules.convert_to_table import convert_to_table, tableHeading, print_table +from ParsingModels.SriLankaModules.convert_to_table import convert_to_table, table_heading, print_table from ParsingModels.SriLankaModules.rtf_extractor import extract_data_from_rtf from table_conversion_functions import print_table @@ -53,7 +53,7 @@ def extract_to_table(rtf_data: List[str], print(important_text) print(timestamps) table = convert_to_table(important_text, timestamps, flags = flags) - heading = tableHeading #tableHeading imported from SriLankaModules.convert_to_table + heading = table_heading #table_heading imported from SriLankaModules.convert_to_table if debug_mode: print("DEBUG - Output Table:") diff --git a/Testing/testParse.py b/Testing/testParse.py index eae5099..57b60f5 100644 --- a/Testing/testParse.py +++ b/Testing/testParse.py @@ -7,5 +7,5 @@ out += page.extract_text() print(out) -outFile = open("Output.txt", "w", encoding="utf-8") -print(out, file=outFile) +out_file = open("Output.txt", "w", encoding="utf-8") +print(out, file=out_file) diff --git a/conversion.py b/conversion.py index 3a2556f..4bad374 100644 --- a/conversion.py +++ b/conversion.py @@ -36,17 +36,17 @@ def split_file(in_csv_path:str, out_csv_path=None): def split_all_data(): if len(sys.argv) < 2: print(f'please include an argument of where the folder is located') - inFolder = sys.argv[1] - filesToParse = [] - if os.path.exists(inFolder): + in_folder = sys.argv[1] + files_to_parse = [] + if os.path.exists(in_folder): print("Locating files...") - for root, dirs, files in os.walk(inFolder): + for root, dirs, files in os.walk(in_folder): for name in files: if name[-4:] == '.csv': # Only parse csv files - 
filesToParse.append(f'{root}/{name}'.replace('\\', '/')) - print(f'files to parse: {filesToParse[:5]}') + files_to_parse.append(f'{root}/{name}'.replace('\\', '/')) + print(f'files to parse: {files_to_parse[:5]}') - for file in filesToParse: + for file in files_to_parse: split_file(file) diff --git a/dataParser.py b/dataParser.py index c8fa2c1..5b23dbd 100644 --- a/dataParser.py +++ b/dataParser.py @@ -53,11 +53,11 @@ sys.exit() # Import Arguments - inFolder = sys.argv[1] + in_folder = sys.argv[1] # Arg 1: folder of PDFs to parse. They should all be compatible with the same parsing model - outFile = sys.argv[2] + out_file = sys.argv[2] # Arg 2: output file, in csv format (only the name of the file) - modelFile = sys.argv[3] + model_file = sys.argv[3] # Arg 3: parsing model. PDF will be converted to text, but model will convert text to array data flags = sys.argv[4:] @@ -107,23 +107,23 @@ if not os.path.exists(ERROR_DIR): # If there is no directory, make it os.makedirs(ERROR_DIR) - model = importlib.import_module(modelFile) + model = importlib.import_module(model_file) # process each file in input folder - filesToParse = [] - if os.path.exists(inFolder): + files_to_parse = [] + if os.path.exists(in_folder): print("Locating files...") - for root, dirs, files in os.walk(inFolder): + for root, dirs, files in os.walk(in_folder): for name in files: # print(f'root: {root} dirs: {dirs} files: {files}') if name[-4:] == '.pdf' or name[-4:] == '.txt': # Only parse pdf or txt files - filesToParse.append(f'{root}/{name}'.replace('\\', '/')) + files_to_parse.append(f'{root}/{name}'.replace('\\', '/')) else: - print(f"ERROR: folder '{inFolder}' not found!") + print(f"ERROR: folder '{in_folder}' not found!") quit() print("Will parse the following files: ", end="") - for f in filesToParse: + for f in files_to_parse: print(f, end=", ") print() RESPONSE = '' @@ -134,27 +134,27 @@ i = 1 NUM_ERRORS = 0 - for currentFile in filesToParse: - print(f"Parsing file {i}/{len(filesToParse)}:", 
currentFile) + for current_file in files_to_parse: + print(f"Parsing file {i}/{len(files_to_parse)}:", current_file) STEP = 0 try: - rtfData = [] - if currentFile[-4:] == '.pdf': # if file is PDF - rtfData = pdf_to_rtf(currentFile) - elif currentFile[-4:] == '.txt': #if file is txt - with open(currentFile, encoding="utf8") as txt_data: - rtfData = [txt_data.read()] + rtf_data = [] + if current_file[-4:] == '.pdf': # if file is PDF + rtf_data = pdf_to_rtf(current_file) + elif current_file[-4:] == '.txt': #if file is txt + with open(current_file, encoding="utf8") as txt_data: + rtf_data = [txt_data.read()] STEP += 1 - table, heading = model.extract_to_table(rtfData, flags=flags) + table, heading = model.extract_to_table(rtf_data, flags=flags) for n in range(len(table)): # Added file source to show here the data came from - table[n].append(currentFile) + table[n].append(current_file) heading.append("Source File") STEP += 1 - print_to_csv(table, heading, file_name=outFile) + print_to_csv(table, heading, file_name=out_file) except Exception as error: NUM_ERRORS += 1 - error_message = f"Error for file {currentFile} " + error_message = f"Error for file {current_file} " if STEP == 0: error_message += "at pdf_to_rtf(). 
Perhaps the file is not a proper PDF?\n" @@ -178,8 +178,8 @@ if start_of_folder_name == -1: # can't find line, can't categorize error (shouldn't happen) print("can't find line in:", error_folder) - shutil.copy(currentFile, os.path.join( - ERROR_DIR, ntpath.basename(currentFile))) + shutil.copy(current_file, os.path.join( + ERROR_DIR, ntpath.basename(current_file))) else: error_folder = error_folder[start_of_folder_name:] error_folder = make_valid_filename(error_folder) @@ -187,24 +187,24 @@ # If there is no directory, make it if not os.path.exists(output_dir): os.makedirs(output_dir) - shutil.copy(currentFile, os.path.join( - output_dir, ntpath.basename(currentFile))) + shutil.copy(current_file, os.path.join( + output_dir, ntpath.basename(current_file))) i += 1 - print("Done! Output in", outFile) - print(f"There were errors in {NUM_ERRORS}/{len(filesToParse)} files") + print("Done! Output in", out_file) + print(f"There were errors in {NUM_ERRORS}/{len(files_to_parse)} files") if log_mode: with open(LOG_FILE_PATH, 'a', encoding='utf-8') as log_file: - log_file.write(f"There were errors in {NUM_ERRORS}/{len(filesToParse)} files") + log_file.write(f"There were errors in {NUM_ERRORS}/{len(files_to_parse)} files") if sort_mode: from Modules.csv_management import order_by_time if '-asc' in flags: - order_by_time(outFile) + order_by_time(out_file) elif '-desc' in flags: - order_by_time(outFile, asc=False) + order_by_time(out_file, asc=False) if extract_mode: try: @@ -226,8 +226,8 @@ OUTPUT_PATH = temp_path if OUTPUT_PATH is None: - OUTPUT_PATH = outFile.split('.')[0] + '_' + target_keyword + '.csv' + OUTPUT_PATH = out_file.split('.')[0] + '_' + target_keyword + '.csv' from Modules.csv_management import extract_data - extract_data(target_keyword, outFile, OUTPUT_PATH) + extract_data(target_keyword, out_file, OUTPUT_PATH)