Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Modules/csv_management.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,8 @@ def order_by_time(filename: str = 'Out/output.csv', outfile: str = None, asc: bo
data, key=lambda x: x['TimeStampStart'], reverse=False if asc else True)

with open(output_path, 'w', newline='', encoding='utf-8') as file:
fieldnames = sorted_data[0].keys()
writer = csv.DictWriter(file, fieldnames=fieldnames)
field_names = sorted_data[0].keys()
writer = csv.DictWriter(file, fieldnames=field_names)
writer.writeheader()

for row in sorted_data:
Expand Down
30 changes: 15 additions & 15 deletions Modules/disease_header_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,16 @@ def compare_two_word(str1: str, str2: str) -> int:
# Python compares strings in a case-sensitive manner.

# First, check whether each string consists of two words or just one.
str1_list = str1.split()
str_1_list = str1.split()
is_two_word1 = False

if len(str1_list) > 1:
if len(str_1_list) > 1:
is_two_word1 = True

str2_list = str2.split()
str_2_list = str2.split()
is_two_word2 = False

if len(str2_list) > 1:
if len(str_2_list) > 1:
is_two_word2 = True

if is_two_word1 and not is_two_word2:
Expand All @@ -59,20 +59,20 @@ def compare_two_word(str1: str, str2: str) -> int:
# normalize so that maximum is 1.
elif not is_two_word1 and is_two_word2:
# We don't want a match on the second word alone, so skip the second similarity when the first word barely matches.
lc1 = longest_common_subsequence(str1, str2_list[0])
if lc1 <= 1:
lc_1 = longest_common_subsequence(str1, str_2_list[0])
if lc_1 <= 1:
temp_score = 0
else:
lc2 = longest_common_subsequence(str1, str2_list[1])
length1 = (len(str1) + len(str2_list[0])) / 2
length2 = (len(str1) + len(str2_list[1])) /2
temp_score = ((lc1 / length1) + (lc2 / length2)) / 2
lc_2 = longest_common_subsequence(str1, str_2_list[1])
length_1 = (len(str1) + len(str_2_list[0])) / 2
length_2 = (len(str1) + len(str_2_list[1])) /2
temp_score = ((lc_1 / length_1) + (lc_2 / length_2)) / 2
elif is_two_word1 and is_two_word2:
lc1 = longest_common_subsequence(str1_list[0], str2_list[0])
lc2 = longest_common_subsequence(str1_list[1], str2_list[1])
length1 = (len(str1_list[0]) + len(str2_list[0])) / 2
length2 = (len(str1_list[1]) + len(str2_list[1])) / 2
temp_score = ((lc1 / length1) + (lc2 / length2)) / 2
lc_1 = longest_common_subsequence(str_1_list[0], str_2_list[0])
lc_2 = longest_common_subsequence(str_1_list[1], str_2_list[1])
length_1 = (len(str_1_list[0]) + len(str_2_list[0])) / 2
length_2 = (len(str_1_list[1]) + len(str_2_list[1])) / 2
temp_score = ((lc_1 / length_1) + (lc_2 / length_2)) / 2

return temp_score

Expand Down
6 changes: 3 additions & 3 deletions ParsingModels/BrazilModules/brazil_convert_to_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from disease_header_parser import detect_diseases
from table_conversion_functions import time_to_excel_time, remove_quotes, remove_numbers, month_to_timestamps, week_number_to_datetime

tableHeading = ['Disease Name',
table_heading = ['Disease Name',
'Cases',
'Location Name',
'Country Code',
Expand Down Expand Up @@ -102,7 +102,7 @@ def convert_to_table(important_text: List[str],

table_data = []
location_type = remove_quotes(header[0])
tableHeading[2] = location_type
table_heading[2] = location_type

for row in rows:
cells = row.split(';')
Expand Down Expand Up @@ -133,4 +133,4 @@ def convert_to_table(important_text: List[str],
time_to_excel_time(timestamps[0]),
time_to_excel_time(timestamps[1])])

return table_data, tableHeading
return table_data, table_heading
6 changes: 3 additions & 3 deletions ParsingModels/SriLankaModules/convert_to_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
# Gets the directory of LLaMaInterface module for import
# from LLaMaInterface import separateCellHeaders

tableHeading = ['Disease Name',
table_heading = ['Disease Name',
'Cases',
'Location Name',
'Country Code',
Expand Down Expand Up @@ -319,6 +319,6 @@ def convert_to_table(important_text: List[str],
table = convert_to_table(TEST_DATA,
[datetime(2023, 6, 9) +timedelta(days=-7),
datetime(2023, 6, 9)])
print_table(table, tableHeading)
print_table(table, table_heading)
from table_to_csv import print_to_csv
print_to_csv(table,tableHeading)
print_to_csv(table,table_heading)
2 changes: 1 addition & 1 deletion ParsingModels/SriLankaModules/rtf_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def extract_table(first_word: str, end_words: List[str], text: str) -> Optional[
#print("starting word found: ",subset)
try:
#print("DEBUG: checking", subset.split()[1])
temp_diseases = detect_diseases((subset))
temp_diseases = detect_diseases((subset),True)
except ValueError:
subset = ""
found_starting_word = False
Expand Down
2 changes: 1 addition & 1 deletion ParsingModels/brazil_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
moudlues_directory = os.path.join(current_directory, '../Modules')
sys.path.append(moudlues_directory)

from ParsingModels.BrazilModules.brazil_convert_to_table import convert_to_table, tableHeading
from ParsingModels.BrazilModules.brazil_convert_to_table import convert_to_table, table_heading
from table_conversion_functions import print_table


Expand Down
4 changes: 2 additions & 2 deletions ParsingModels/parser_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from typing import List, Tuple
from ParsingModels.<Target Module>.<Target Extracter> import <extracter function name>
# Refer to rtf_extractor in SriLankaModules. It should give the important text (what is relevant), then the timestamp.
from ParsingModels.<Target Module>.<parsing module python file> import convert_to_table, tableHeading
from ParsingModels.<Target Module>.<parsing module python file> import convert_to_table, table_heading
# Refer to the Brazil module first, then the Sri Lanka one. These are the two common structures of convert_to_table.
from table_conversion_functions import print_table

Expand Down Expand Up @@ -83,7 +83,7 @@ def extract_to_table(rtf_data: List[str],
# or somewhat similar.
# you can see the exact format as an output using print_table() function.

heading = tableHeading
heading = table_heading

# header of your desire.

Expand Down
4 changes: 2 additions & 2 deletions ParsingModels/sriLankaParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
sys.path.append(moudlues_directory)

from typing import List, Tuple
from ParsingModels.SriLankaModules.convert_to_table import convert_to_table, tableHeading, print_table
from ParsingModels.SriLankaModules.convert_to_table import convert_to_table, table_heading, print_table
from ParsingModels.SriLankaModules.rtf_extractor import extract_data_from_rtf
from table_conversion_functions import print_table

Expand Down Expand Up @@ -53,7 +53,7 @@ def extract_to_table(rtf_data: List[str],
print(important_text)
print(timestamps)
table = convert_to_table(important_text, timestamps, flags = flags)
heading = tableHeading #tableHeading imported from SriLankaModules.convert_to_table
heading = table_heading #table_heading imported from SriLankaModules.convert_to_table

if debug_mode:
print("DEBUG - Output Table:")
Expand Down
4 changes: 2 additions & 2 deletions Testing/testParse.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@
out += page.extract_text()

print(out)
outFile = open("Output.txt", "w", encoding="utf-8")
print(out, file=outFile)
out_file = open("Output.txt", "w", encoding="utf-8")
print(out, file=out_file)
14 changes: 7 additions & 7 deletions conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,17 +36,17 @@ def split_file(in_csv_path:str, out_csv_path=None):
def split_all_data():
if len(sys.argv) < 2:
print(f'please include an argument of where the folder is located')
inFolder = sys.argv[1]
filesToParse = []
if os.path.exists(inFolder):
in_folder = sys.argv[1]
files_to_parse = []
if os.path.exists(in_folder):
print("Locating files...")
for root, dirs, files in os.walk(inFolder):
for root, dirs, files in os.walk(in_folder):
for name in files:
if name[-4:] == '.csv': # Only parse csv files
filesToParse.append(f'{root}/{name}'.replace('\\', '/'))
print(f'files to parse: {filesToParse[:5]}')
files_to_parse.append(f'{root}/{name}'.replace('\\', '/'))
print(f'files to parse: {files_to_parse[:5]}')

for file in filesToParse:
for file in files_to_parse:
split_file(file)


Expand Down
66 changes: 33 additions & 33 deletions dataParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,11 @@
sys.exit()

# Import Arguments
inFolder = sys.argv[1]
in_folder = sys.argv[1]
# Arg 1: folder of PDFs to parse. They should all be compatible with the same parsing model
outFile = sys.argv[2]
out_file = sys.argv[2]
# Arg 2: output file, in csv format (only the name of the file)
modelFile = sys.argv[3]
model_file = sys.argv[3]
# Arg 3: parsing model. PDF will be converted to text, but model will convert text to array data
flags = sys.argv[4:]

Expand Down Expand Up @@ -107,23 +107,23 @@
if not os.path.exists(ERROR_DIR): # If there is no directory, make it
os.makedirs(ERROR_DIR)

model = importlib.import_module(modelFile)
model = importlib.import_module(model_file)

# process each file in input folder
filesToParse = []
if os.path.exists(inFolder):
files_to_parse = []
if os.path.exists(in_folder):
print("Locating files...")
for root, dirs, files in os.walk(inFolder):
for root, dirs, files in os.walk(in_folder):
for name in files:
# print(f'root: {root} dirs: {dirs} files: {files}')
if name[-4:] == '.pdf' or name[-4:] == '.txt': # Only parse pdf or txt files
filesToParse.append(f'{root}/{name}'.replace('\\', '/'))
files_to_parse.append(f'{root}/{name}'.replace('\\', '/'))
else:
print(f"ERROR: folder '{inFolder}' not found!")
print(f"ERROR: folder '{in_folder}' not found!")
quit()

print("Will parse the following files: ", end="")
for f in filesToParse:
for f in files_to_parse:
print(f, end=", ")
print()
RESPONSE = ''
Expand All @@ -134,27 +134,27 @@

i = 1
NUM_ERRORS = 0
for currentFile in filesToParse:
print(f"Parsing file {i}/{len(filesToParse)}:", currentFile)
for current_file in files_to_parse:
print(f"Parsing file {i}/{len(files_to_parse)}:", current_file)
STEP = 0
try:
rtfData = []
if currentFile[-4:] == '.pdf': # if file is PDF
rtfData = pdf_to_rtf(currentFile)
elif currentFile[-4:] == '.txt': #if file is txt
with open(currentFile, encoding="utf8") as txt_data:
rtfData = [txt_data.read()]
rtf_data = []
if current_file[-4:] == '.pdf': # if file is PDF
rtf_data = pdf_to_rtf(current_file)
elif current_file[-4:] == '.txt': #if file is txt
with open(current_file, encoding="utf8") as txt_data:
rtf_data = [txt_data.read()]
STEP += 1
table, heading = model.extract_to_table(rtfData, flags=flags)
table, heading = model.extract_to_table(rtf_data, flags=flags)
for n in range(len(table)):
# Added file source to show where the data came from
table[n].append(currentFile)
table[n].append(current_file)
heading.append("Source File")
STEP += 1
print_to_csv(table, heading, file_name=outFile)
print_to_csv(table, heading, file_name=out_file)
except Exception as error:
NUM_ERRORS += 1
error_message = f"Error for file {currentFile} "
error_message = f"Error for file {current_file} "

if STEP == 0:
error_message += "at pdf_to_rtf(). Perhaps the file is not a proper PDF?\n"
Expand All @@ -178,33 +178,33 @@
if start_of_folder_name == -1:
# can't find line, can't categorize error (shouldn't happen)
print("can't find line in:", error_folder)
shutil.copy(currentFile, os.path.join(
ERROR_DIR, ntpath.basename(currentFile)))
shutil.copy(current_file, os.path.join(
ERROR_DIR, ntpath.basename(current_file)))
else:
error_folder = error_folder[start_of_folder_name:]
error_folder = make_valid_filename(error_folder)
output_dir = os.path.join(ERROR_DIR, error_folder)
# If there is no directory, make it
if not os.path.exists(output_dir):
os.makedirs(output_dir)
shutil.copy(currentFile, os.path.join(
output_dir, ntpath.basename(currentFile)))
shutil.copy(current_file, os.path.join(
output_dir, ntpath.basename(current_file)))

i += 1

print("Done! Output in", outFile)
print(f"There were errors in {NUM_ERRORS}/{len(filesToParse)} files")
print("Done! Output in", out_file)
print(f"There were errors in {NUM_ERRORS}/{len(files_to_parse)} files")
if log_mode:
with open(LOG_FILE_PATH, 'a', encoding='utf-8') as log_file:
log_file.write(f"There were errors in {NUM_ERRORS}/{len(filesToParse)} files")
log_file.write(f"There were errors in {NUM_ERRORS}/{len(files_to_parse)} files")

if sort_mode:
from Modules.csv_management import order_by_time

if '-asc' in flags:
order_by_time(outFile)
order_by_time(out_file)
elif '-desc' in flags:
order_by_time(outFile, asc=False)
order_by_time(out_file, asc=False)

if extract_mode:
try:
Expand All @@ -226,8 +226,8 @@
OUTPUT_PATH = temp_path

if OUTPUT_PATH is None:
OUTPUT_PATH = outFile.split('.')[0] + '_' + target_keyword + '.csv'
OUTPUT_PATH = out_file.split('.')[0] + '_' + target_keyword + '.csv'

from Modules.csv_management import extract_data

extract_data(target_keyword, outFile, OUTPUT_PATH)
extract_data(target_keyword, out_file, OUTPUT_PATH)