MLOpenSourceOpenScience · Y01N · Apr 24, 2024 · May 13, 2024 · May 13, 2024
diff --git a/Modules/csv_management.py b/Modules/csv_management.py
@@ -166,8 +166,8 @@ def order_by_time(filename: str = 'Out/output.csv', outfile: str = None, asc: bo
         data, key=lambda x: x['TimeStampStart'], reverse=False if asc else True)
 
     with open(output_path, 'w', newline='', encoding='utf-8') as file:
-        fieldnames = sorted_data[0].keys()
-        writer = csv.DictWriter(file, fieldnames=fieldnames)
+        field_names = sorted_data[0].keys()
+        writer = csv.DictWriter(file, fieldnames=field_names)
         writer.writeheader()
 
         for row in sorted_data:

diff --git a/Modules/disease_header_parser.py b/Modules/disease_header_parser.py
@@ -36,16 +36,16 @@ def compare_two_word(str1: str, str2: str) -> int:
     # Python compare words with case sensitive measure.
 
     # First, need to check whether the string is two-length or one.
-    str1_list = str1.split()
+    str_1_list = str1.split()
     is_two_word1 = False
 
-    if len(str1_list) > 1:
+    if len(str_1_list) > 1:
         is_two_word1 = True
 
-    str2_list = str2.split()
+    str_2_list = str2.split()
     is_two_word2 = False
 
-    if len(str2_list) > 1:
+    if len(str_2_list) > 1:
         is_two_word2 = True
 
     if is_two_word1 and not is_two_word2:
@@ -59,20 +59,20 @@ def compare_two_word(str1: str, str2: str) -> int:
         # normalize so that maximum is 1.
     elif not is_two_word1 and is_two_word2:
         # since we don't hope the word to match second word, abort the second similarity
-        lc1 = longest_common_subsequence(str1, str2_list[0])
-        if lc1 <= 1:
+        lc_1 = longest_common_subsequence(str1, str_2_list[0])
+        if lc_1 <= 1:
             temp_score = 0
         else:
-            lc2 = longest_common_subsequence(str1, str2_list[1])
-            length1 = (len(str1) + len(str2_list[0])) / 2
-            length2 = (len(str1) + len(str2_list[1])) /2
-            temp_score = ((lc1 / length1) + (lc2 / length2)) / 2
+            lc_2 = longest_common_subsequence(str1, str_2_list[1])
+            length_1 = (len(str1) + len(str_2_list[0])) / 2
+            length_2 = (len(str1) + len(str_2_list[1])) /2
+            temp_score = ((lc_1 / length_1) + (lc_2 / length_2)) / 2
     elif is_two_word1 and is_two_word2:
-        lc1 = longest_common_subsequence(str1_list[0], str2_list[0])
-        lc2 = longest_common_subsequence(str1_list[1], str2_list[1])
-        length1 = (len(str1_list[0]) + len(str2_list[0])) / 2
-        length2 = (len(str1_list[1]) + len(str2_list[1])) / 2
-        temp_score = ((lc1 / length1) + (lc2 / length2)) / 2
+        lc_1 = longest_common_subsequence(str_1_list[0], str_2_list[0])
+        lc_2 = longest_common_subsequence(str_1_list[1], str_2_list[1])
+        length_1 = (len(str_1_list[0]) + len(str_2_list[0])) / 2
+        length_2 = (len(str_1_list[1]) + len(str_2_list[1])) / 2
+        temp_score = ((lc_1 / length_1) + (lc_2 / length_2)) / 2
 
     return temp_score
 

diff --git a/ParsingModels/BrazilModules/brazil_convert_to_table.py b/ParsingModels/BrazilModules/brazil_convert_to_table.py
@@ -20,7 +20,7 @@
 from disease_header_parser import detect_diseases
 from table_conversion_functions import time_to_excel_time, remove_quotes, remove_numbers, month_to_timestamps, week_number_to_datetime
 
-tableHeading = ['Disease Name',
+table_heading = ['Disease Name',
                 'Cases',
                 'Location Name',
                 'Country Code',
@@ -102,7 +102,7 @@ def convert_to_table(important_text: List[str],
 
     table_data = []
     location_type = remove_quotes(header[0])
-    tableHeading[2] = location_type
+    table_heading[2] = location_type
 
     for row in rows:
         cells = row.split(';')
@@ -133,4 +133,4 @@ def convert_to_table(important_text: List[str],
                     time_to_excel_time(timestamps[0]),
                     time_to_excel_time(timestamps[1])])
 
-    return table_data, tableHeading
+    return table_data, table_heading
diff --git a/ParsingModels/SriLankaModules/convert_to_table.py b/ParsingModels/SriLankaModules/convert_to_table.py
@@ -30,7 +30,7 @@
 # Gets the directory of LLaMaInterface module for import
 # from LLaMaInterface import separateCellHeaders
 
-tableHeading = ['Disease Name',
+table_heading = ['Disease Name',
                 'Cases',
                 'Location Name',
                 'Country Code',
@@ -319,6 +319,6 @@ def convert_to_table(important_text: List[str],
     table = convert_to_table(TEST_DATA,
                            [datetime(2023, 6, 9) +timedelta(days=-7),
                             datetime(2023, 6, 9)])
-    print_table(table, tableHeading)
+    print_table(table, table_heading)
     from table_to_csv import print_to_csv
-    print_to_csv(table,tableHeading)
+    print_to_csv(table,table_heading)
diff --git a/ParsingModels/SriLankaModules/rtf_extractor.py b/ParsingModels/SriLankaModules/rtf_extractor.py
@@ -80,7 +80,7 @@ def extract_table(first_word: str, end_words: List[str], text: str) -> Optional[
             #print("starting word found: ",subset)
             try:
                 #print("DEBUG: checking", subset.split()[1])
-                temp_diseases = detect_diseases((subset))
+                temp_diseases = detect_diseases((subset),True)
             except ValueError:
                 subset = ""
                 found_starting_word = False

diff --git a/ParsingModels/brazil_parser.py b/ParsingModels/brazil_parser.py
@@ -13,7 +13,7 @@
 moudlues_directory = os.path.join(current_directory, '../Modules')
 sys.path.append(moudlues_directory)
 
-from ParsingModels.BrazilModules.brazil_convert_to_table import convert_to_table, tableHeading
+from ParsingModels.BrazilModules.brazil_convert_to_table import convert_to_table, table_heading
 from table_conversion_functions import print_table
 
 

diff --git a/ParsingModels/parser_template.py b/ParsingModels/parser_template.py
@@ -18,7 +18,7 @@
 from typing import List, Tuple
 from ParsingModels.<Target Module>.<Target Extracter> import <extracter function name>
 # Refer rtf_extractor at SriLankaModules. It should give the important text (what is relevant) then timestamp.
-from ParsingModels.<Target Module>.<parsing module python file> import convert_to_table, tableHeading
+from ParsingModels.<Target Module>.<parsing module python file> import convert_to_table, table_heading
 # Refer brazil, then srilanka. These are two common structure of convert_to_table.
 from table_conversion_functions import print_table
 
@@ -83,7 +83,7 @@ def extract_to_table(rtf_data: List[str],
     # or somewhat similar.
     # you can see the exact format as an output using print_table() function.
 
-    heading = tableHeading
+    heading = table_heading
 
     # header of your desire.
 

diff --git a/ParsingModels/sriLankaParser.py b/ParsingModels/sriLankaParser.py
@@ -12,7 +12,7 @@
 sys.path.append(moudlues_directory)
 
 from typing import List, Tuple
-from ParsingModels.SriLankaModules.convert_to_table import convert_to_table, tableHeading, print_table
+from ParsingModels.SriLankaModules.convert_to_table import convert_to_table, table_heading, print_table
 from ParsingModels.SriLankaModules.rtf_extractor import extract_data_from_rtf
 from table_conversion_functions import print_table
 
@@ -53,7 +53,7 @@ def extract_to_table(rtf_data: List[str],
         print(important_text)
         print(timestamps)
     table = convert_to_table(important_text, timestamps, flags = flags)
-    heading = tableHeading #tableHeading imported from SriLankaModules.convert_to_table
+    heading = table_heading #table_heading imported from SriLankaModules.convert_to_table
 
     if debug_mode:
         print("DEBUG - Output Table:")

diff --git a/Testing/testParse.py b/Testing/testParse.py
@@ -7,5 +7,5 @@
     out += page.extract_text()
 
 print(out)
-outFile = open("Output.txt", "w", encoding="utf-8")
-print(out, file=outFile)
+out_file = open("Output.txt", "w", encoding="utf-8")
+print(out, file=out_file)
diff --git a/conversion.py b/conversion.py
@@ -36,17 +36,17 @@ def split_file(in_csv_path:str, out_csv_path=None):
 def split_all_data():
     if len(sys.argv) < 2:
         print(f'please include an argument of where the folder is located')
-    inFolder = sys.argv[1]
-    filesToParse = []
-    if os.path.exists(inFolder):
+    in_folder = sys.argv[1]
+    files_to_parse = []
+    if os.path.exists(in_folder):
         print("Locating files...")
-        for root, dirs, files in os.walk(inFolder):
+        for root, dirs, files in os.walk(in_folder):
             for name in files:
                 if name[-4:] == '.csv': # Only parse csv files
-                    filesToParse.append(f'{root}/{name}'.replace('\\', '/'))
-    print(f'files to parse: {filesToParse[:5]}')
+                    files_to_parse.append(f'{root}/{name}'.replace('\\', '/'))
+    print(f'files to parse: {files_to_parse[:5]}')
 
-    for file in filesToParse:
+    for file in files_to_parse:
         split_file(file)
 
 

diff --git a/dataParser.py b/dataParser.py
@@ -53,11 +53,11 @@
         sys.exit()
 
     # Import Arguments
-    inFolder = sys.argv[1]
+    in_folder = sys.argv[1]
     # Arg 1: folder of PDFs to parse. They should all be compatible with the same parsing model
-    outFile = sys.argv[2]
+    out_file = sys.argv[2]
     # Arg 2: output file, in csv format (only the name of the file)
-    modelFile = sys.argv[3]
+    model_file = sys.argv[3]
     # Arg 3: parsing model. PDF will be converted to text, but model will convert text to array data
     flags = sys.argv[4:]
 
@@ -107,23 +107,23 @@
         if not os.path.exists(ERROR_DIR):  # If there is no directory, make it
             os.makedirs(ERROR_DIR)
 
-    model = importlib.import_module(modelFile)
+    model = importlib.import_module(model_file)
 
     # process each file in input folder
-    filesToParse = []
-    if os.path.exists(inFolder):
+    files_to_parse = []
+    if os.path.exists(in_folder):
         print("Locating files...")
-        for root, dirs, files in os.walk(inFolder):
+        for root, dirs, files in os.walk(in_folder):
             for name in files:
                 # print(f'root: {root} dirs: {dirs} files: {files}')
                 if name[-4:] == '.pdf' or name[-4:] == '.txt':  # Only parse pdf or txt files
-                    filesToParse.append(f'{root}/{name}'.replace('\\', '/'))
+                    files_to_parse.append(f'{root}/{name}'.replace('\\', '/'))
     else:
-        print(f"ERROR: folder '{inFolder}' not found!")
+        print(f"ERROR: folder '{in_folder}' not found!")
         quit()
 
     print("Will parse the following files: ", end="")
-    for f in filesToParse:
+    for f in files_to_parse:
         print(f, end=", ")
     print()
     RESPONSE = ''
@@ -134,27 +134,27 @@
 
     i = 1
     NUM_ERRORS = 0
-    for currentFile in filesToParse:
-        print(f"Parsing file {i}/{len(filesToParse)}:", currentFile)
+    for current_file in files_to_parse:
+        print(f"Parsing file {i}/{len(files_to_parse)}:", current_file)
         STEP = 0
         try:
-            rtfData = []
-            if currentFile[-4:] == '.pdf':  # if file is PDF
-                rtfData = pdf_to_rtf(currentFile)
-            elif currentFile[-4:] == '.txt': #if file is txt
-                with open(currentFile, encoding="utf8") as txt_data:
-                    rtfData = [txt_data.read()]
+            rtf_data = []
+            if current_file[-4:] == '.pdf':  # if file is PDF
+                rtf_data = pdf_to_rtf(current_file)
+            elif current_file[-4:] == '.txt': #if file is txt
+                with open(current_file, encoding="utf8") as txt_data:
+                    rtf_data = [txt_data.read()]
             STEP += 1
-            table, heading = model.extract_to_table(rtfData, flags=flags)
+            table, heading = model.extract_to_table(rtf_data, flags=flags)
             for n in range(len(table)):
                 # Added file source to show here the data came from
-                table[n].append(currentFile)
+                table[n].append(current_file)
             heading.append("Source File")
             STEP += 1
-            print_to_csv(table, heading, file_name=outFile)
+            print_to_csv(table, heading, file_name=out_file)
         except Exception as error:
             NUM_ERRORS += 1
-            error_message = f"Error for file {currentFile} "
+            error_message = f"Error for file {current_file} "
 
             if STEP == 0:
                     error_message += "at pdf_to_rtf(). Perhaps the file is not a proper PDF?\n"
@@ -178,33 +178,33 @@
                 if start_of_folder_name == -1:
                     # can't find line, can't categorize error (shouldn't happen)
                     print("can't find line in:", error_folder)
-                    shutil.copy(currentFile, os.path.join(
-                        ERROR_DIR, ntpath.basename(currentFile)))
+                    shutil.copy(current_file, os.path.join(
+                        ERROR_DIR, ntpath.basename(current_file)))
                 else:
                     error_folder = error_folder[start_of_folder_name:]
                     error_folder = make_valid_filename(error_folder)
                     output_dir = os.path.join(ERROR_DIR, error_folder)
                     # If there is no directory, make it
                     if not os.path.exists(output_dir):
                         os.makedirs(output_dir)
-                    shutil.copy(currentFile, os.path.join(
-                        output_dir, ntpath.basename(currentFile)))
+                    shutil.copy(current_file, os.path.join(
+                        output_dir, ntpath.basename(current_file)))
 
         i += 1
 
-    print("Done! Output in", outFile)
-    print(f"There were errors in {NUM_ERRORS}/{len(filesToParse)} files")
+    print("Done! Output in", out_file)
+    print(f"There were errors in {NUM_ERRORS}/{len(files_to_parse)} files")
     if log_mode:
         with open(LOG_FILE_PATH, 'a', encoding='utf-8') as log_file:
-            log_file.write(f"There were errors in {NUM_ERRORS}/{len(filesToParse)} files")
+            log_file.write(f"There were errors in {NUM_ERRORS}/{len(files_to_parse)} files")
 
     if sort_mode:
         from Modules.csv_management import order_by_time
 
         if '-asc' in flags:
-            order_by_time(outFile)
+            order_by_time(out_file)
         elif '-desc' in flags:
-            order_by_time(outFile, asc=False)
+            order_by_time(out_file, asc=False)
 
     if extract_mode:
         try:
@@ -226,8 +226,8 @@
                 OUTPUT_PATH = temp_path
 
         if OUTPUT_PATH is None:
-            OUTPUT_PATH = outFile.split('.')[0] + '_' + target_keyword + '.csv'
+            OUTPUT_PATH = out_file.split('.')[0] + '_' + target_keyword + '.csv'
 
         from Modules.csv_management import extract_data
 
-        extract_data(target_keyword, outFile, OUTPUT_PATH)
+        extract_data(target_keyword, out_file, OUTPUT_PATH)