From 64e7c5873b256820375370bd5065239eebcafaec Mon Sep 17 00:00:00 2001 From: Isaac Mwendwa Date: Tue, 12 Mar 2024 22:28:59 +0300 Subject: [PATCH] Add files via upload --- app.py | 301 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 169 insertions(+), 132 deletions(-) diff --git a/app.py b/app.py index f9ba0c5..6c3d89c 100644 --- a/app.py +++ b/app.py @@ -1,7 +1,4 @@ -from flask import Flask, render_template, request, redirect, url_for, current_app, send_from_directory -import os -from os.path import join, dirname, realpath - +# Imports for Data Preparation and Modeling import numpy as np import pandas as pd import pickle @@ -10,95 +7,121 @@ from math import sqrt from pandas import DataFrame from pandas import concat - -from sklearn.preprocessing import OneHotEncoder +from category_encoders import BinaryEncoder from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler from sklearn.compose import ColumnTransformer -# Pipeline for pre-processing data: Scaling and numerical transformation -def num_pipeline_transformer(data): - ''' - Function to process numerical transformations - Argument: - data: original dataframe - Returns: - num_attrs: numerical dataframe - num_pipeline: numerical pipeline object - - ''' - numerics = ['float64', 'int64'] +# Imports for model deployment +from flask import Flask, render_template, request, redirect, url_for, current_app, send_from_directory +import os +from os.path import join, dirname, realpath + + +# Imports for model monitoring +from evidently import ColumnMapping +from evidently.report import Report +from evidently.metrics.base_metric import generate_column_metrics +from evidently.metric_preset import DataDriftPreset, TargetDriftPreset +from evidently.metrics import * +from evidently.test_suite import TestSuite +from evidently.tests.base_test import generate_column_tests +from evidently.test_preset import DataStabilityTestPreset, NoTargetPerformanceTestPreset +from evidently.tests import * - num_attrs = data.select_dtypes(include=numerics) - num_pipeline = Pipeline([ - ('std_scaler', StandardScaler()), - ]) - return num_attrs, num_pipeline -# Pipeline for pre-processing data: One hot encoding +def model_monitoring(current_dataset, reference_dataset): + + # setting current and reference datasets + current= current_dataset + reference = reference_dataset + + # Running Data Stability Tests & Data Drift and Data Quality Reports + suite = TestSuite(tests=[ + NoTargetPerformanceTestPreset(), + ]) + + suite.run(reference_data=reference, current_data=current) + + return suite + + + + +# Full Pipeline Transformer for Numerical and Categorical Data def pipeline_transformer(data): ''' Complete transformation pipeline for both - nuerical and categorical data. + categorical data (Binary Encoding) and + numerical data (Passthrough - no transformation) Argument: data: original dataframe Returns: - prepared_data: transformed data, ready to use + data: transformed data, ready to use ''' cat_attrs = ["Industry"] - num_attrs, num_pipeline = num_pipeline_transformer(data) + full_pipeline = ColumnTransformer([ - ("num", num_pipeline, list(num_attrs)), - ("cat", OneHotEncoder(), cat_attrs), - ]) - prepared_data = full_pipeline.fit_transform(data) - return prepared_data - -def return_prediction(model, dataset): + # transform the categorical data and pass through the numerical columns unchanged + ("cat", BinaryEncoder(), cat_attrs)], remainder = 'passthrough') # - # preparing data using pipeline transformer - prepared_data = pipeline_transformer(dataset) + transformed_data = full_pipeline.fit_transform(data) + + return transformed_data + +# function to get predictions from input dataframe +def get_predictions(input_df, model): - print(prepared_data.shape) - # make a prediction - predictions = model.predict(prepared_data) - - return predictions - - -#function to generate a dictionary from two lists -def return_pred_dict(results): - # Type casting predictions to int - #results = list(map(int, results)) - list_pred = results - list_industry_keys = ['Agriculture, Forestry And Fishing', 'Mining And Quarrying', - 'Manufacturing', - 'Electricity, Gas, Steam And Air Conditioning Supply', - 'Water Supply; Sewerage, Waste Management And Remediation Activities', - 'Construction', - 'Wholesale And Retail Trade; Repair Of Motor Vehicles And Motorcycles', - 'Transportation And Storage', - 'Accommodation And Food Service Activities', - 'Information And Communication', - 'Financial And Insurance Activities', 'Real Estate Activities', - 'Professional, Scientific And Technical Activities', - 'Administrative And Support Service Activities', - 'Public Administration And Defence; Compulsory Social Security', - 'Education', 'Human Health And Social Work Activities', - 'Arts, Entertainment And Recreation', 'Other Service Activities', - 'Activities Of Households As Employers; Undifferentiated Goods- And Services-Producing Activities Of Households For Own Use', - 'Activities Of Extraterritorial Organizations And Bodies'] + #Removing skewness from data + input_df['Contribution_to_GDP'] = input_df["Contribution_to_GDP"].map( + lambda i: np.log(i) if i > 0 else 0) - # using dictionary comprehension - # to convert lists to dictionary - dict_pred = {list_industry_keys[i]: list_pred[i] for i in range(len(list_industry_keys))} + # Preparing data using pipeline_transformer() + prepared_df = pipeline_transformer(input_df) - return dict_pred - + # Getting Predictions from Model + pred_results = model.predict(prepared_df) + + # Reversing log transform to get actual results + pred_results = np.exp(pred_results) + + # Creating list of predictions + pred_list = pred_results.tolist() + pred_list = list(map(int, pred_list)) + #pred_list.sort(reverse = True) + + industry_cols = ['Agriculture, Forestry And Fishing', + 'Mining And Quarrying', + 'Manufacturing', + 'Electricity, Gas, Steam And Air Conditioning Supply', + 'Water Supply; Sewerage, Waste Management And Remediation Activities', + 'Construction', + 'Wholesale And Retail Trade; Repair Of Motor Vehicles And Motorcycles', + 'Transportation And Storage', + 'Accommodation And Food Service Activities', + 'Information And Communication', + 'Financial And Insurance Activities', + 'Real Estate Activities', + 'Professional, Scientific And Technical Activities', + 'Administrative And Support Service Activities', + 'Public Administration And Defence; Compulsory Social Security', + 'Education', + 'Human Health And Social Work Activities', + 'Arts, Entertainment And Recreation', + 'Other Service Activities', + 'Activities Of Households As Employers; Undifferentiated Goods- And Services-Producing Activities Of Households For Own Use', + 'Activities Of Extraterritorial Organizations And Bodies' + ] + + # Create Dictionary of Industry and Predicted Values + pred_dict = {industry_cols[i]: pred_list[i] for i in range(len(industry_cols))} + + # Return Dictionary of Industry and Predicted Values + return pred_dict, pred_list #function for sorting dict in descending order def sort_dict(dict): @@ -111,8 +134,10 @@ def sort_dict(dict): return sorted_dict + #function for computing percentage contribution of each sector def compute_percent(dict): + keys = list(dict.keys()) values = dict.values() total= 0 for v in values: @@ -125,7 +150,10 @@ def compute_percent(dict): #rounding percentages to 2 decimal places percent_list_rounded = [round(num, 2) for num in percent_list] - percent_dict = return_pred_dict(results = percent_list_rounded) + percent_dict = percent_list_rounded + + # Create Dictionary of Industry and Predicted Values + percent_dict = {keys[i]: percent_dict[i] for i in range(len(keys))} return percent_dict @@ -144,12 +172,12 @@ def compute_total(dict): # enable debugging mode app.config["DEBUG"] = True -# LOADING THE MODEL AND THE SCALER +# LOADING THE MODELS with open("working_poor_model.bin", 'rb') as f_in: working_poor_model = pickle.load(f_in) -with open("total_employment_model.bin", 'rb') as f_in: - total_employment_model = pickle.load(f_in) +with open("total_number_in_employment_model.bin", 'rb') as f_in: + total_number_in_employment_model = pickle.load(f_in) # Upload folder UPLOAD_FOLDER = 'static/files' @@ -162,101 +190,110 @@ def index(): # Set The upload HTML template '\templates\index.html' return render_template('index.html') +# Model Monitoring Report URL +@app.route('/model_monitoring_report') +def model_monitoring_report(): + return render_template('model_monitoring.html') + + + @app.route('/uploads/', methods=['GET', 'POST']) def download(filename): # Appending app path to upload folder path within app root folder uploads = os.path.join(current_app.root_path, app.config['UPLOAD_FOLDER']) # Returning file from appended path - return send_from_directory(directory=uploads, filename=filename) + #return send_from_directory(directory=uploads, filename=filename) + return send_from_directory(uploads, filename) # Get the uploaded files @app.route("/", methods=['POST']) def uploadFiles(): - # get the uploaded file - uploaded_file = request.files['file'] - if uploaded_file.filename != '': - file_path = os.path.join(app.config['UPLOAD_FOLDER'], uploaded_file.filename) - # set the file path - uploaded_file.save(file_path) - prediction(file_path) - - return redirect(url_for("prediction", filePath=file_path)) + # get the uploaded file + uploaded_file = request.files['file'] + if uploaded_file.filename != '': + file_path = os.path.join(app.config['UPLOAD_FOLDER'], uploaded_file.filename) + # set the file path + uploaded_file.save(file_path) + prediction(file_path) + + return redirect(url_for("prediction", filePath=file_path)) + + #else: + # print("Enter a CSV File") + + @app.route('/prediction/') def prediction(filePath): - #READING DATASET FOR WORKING POOR # CSV Column Names - col_names = ['Industry', 'Contribution_by_Gdp', 'Growth_of_GDP'] - # Use Pandas to parse the CSV file - working_poor_df = read_csv(filePath, names = col_names, header=0) - - #Reorder columns - cols = ['Contribution_by_Gdp', 'Growth_of_GDP', 'Industry'] - working_poor_df = working_poor_df.reindex(columns=cols) - - #Cleaning dataset - #cols = ['Wage_bracket_0_to_9999', 'Total_number_in_wage_employment'] - #df[cols] = df[cols].astype(str) # cast to string - - # Removing special characters - #df[cols] = df[cols].replace({'\$': '', ',': '', '-': ''}, regex=True) - - #working_poor_df = df + col_names = ['Industry', 'Contribution_to_GDP'] - #READING DATASET FOR TOTAL EMPLOYMENT - # CSV Column Names - col_names = ['Industry', 'Contribution_by_Gdp', 'Growth_of_GDP'] + #READING DATASET FOR WORKING POOR & TOTAL EMPLOYMENT # Use Pandas to parse the CSV file - total_employment_df = read_csv(filePath, names = col_names, header=0) - - #Reorder columns - cols = ['Contribution_by_Gdp', 'Growth_of_GDP', 'Industry'] - total_employment_df = total_employment_df.reindex(columns = cols) - #total_employment_df.head() - #Cleaning dataset - #cols = ['Wage_bracket_0_to_9999', 'Total_number_in_wage_employment'] - #df[cols] = df[cols].astype(str) # cast to string - - # Removing special characters - #df[cols] = df[cols].replace({'\$': '', ',': '', '-': ''}, regex=True) + try: + working_poor_df = read_csv(filePath, names = col_names, header=0) + total_employment_df = read_csv(filePath, names = col_names, header=0) + except BaseException as e: + print("Invalid File Format") + return render_template('index.html') #predictions for working poor - results_working_poor = return_prediction(model = working_poor_model, dataset = working_poor_df) + working_poor_dict_pred_unsorted, pred_list = get_predictions(input_df = working_poor_df, + model = working_poor_model) + #predictions for total employment - results_total_employment = return_prediction(model = total_employment_model, dataset = total_employment_df) + total_employment_dict_pred_unsorted, t_pred_list = get_predictions(input_df = total_employment_df, + model = total_number_in_employment_model) - #converting array of predictions to list - working_poor_list = results_working_poor.astype(int).tolist() - total_employment_list = results_total_employment.astype(int).tolist() - print(len(working_poor_list)) - print(len(total_employment_list)) - - #getting results of prediction as dictionaries - working_poor_dict_pred_unsorted = return_pred_dict(results = working_poor_list) - total_employment_dict_pred_unsorted = return_pred_dict(results = total_employment_list) - #sorting dict in descending order working_poor_dict_pred = sort_dict(dict = working_poor_dict_pred_unsorted) total_employment_dict_pred = sort_dict(dict = total_employment_dict_pred_unsorted) #computing percentage contribution by sector - working_poor_percent_dict_unsorted = compute_percent(dict = working_poor_dict_pred_unsorted) - total_employment_percent_dict_unsorted = compute_percent(dict = total_employment_dict_pred_unsorted) + working_poor_percent_dict_unsorted = compute_percent(dict = working_poor_dict_pred) + total_employment_percent_dict_unsorted = compute_percent(dict = total_employment_dict_pred) #sorting percent in descending order working_poor_percent_dict = sort_dict(dict = working_poor_percent_dict_unsorted) total_employment_percent_dict = sort_dict(dict = total_employment_percent_dict_unsorted) #computing percentage of working poor - total_working_poor = compute_total(dict = working_poor_dict_pred_unsorted) - total_employment_total = compute_total(dict = total_employment_dict_pred_unsorted) + total_working_poor = compute_total(dict = working_poor_dict_pred) + total_employment_total = compute_total(dict = total_employment_dict_pred) percent = (total_working_poor/total_employment_total)*100 working_poor_percent = round(percent, 2) + + + # MODEL MONITORING + # Setting paths of current and reference datasets + file_path_reference = os.path.join(app.config['UPLOAD_FOLDER'], "Working_Poor_Model_Dataset.csv") + + # Reference dataset (Training) + reference_dataset = pd.read_csv(file_path_reference, header=0) + reference_dataset.rename(columns={'Wage_bracket_0_to_9999': 'target'}, inplace=True) + reference_dataset['prediction'] = reference_dataset['target'].values + np.random.randint(0, 5, reference_dataset.shape[0]) + + + # Current Dataset (Production) + current_dataset = working_poor_df + current_dataset['Wage_bracket_0_to_9999'] = list(map(int, pred_list)) + current_dataset.rename(columns={'Wage_bracket_0_to_9999': 'target'}, inplace=True) + + final_year_dataset = reference_dataset[147:] + + current_dataset['prediction'] = final_year_dataset['target'].values + np.random.randint(0, 5, final_year_dataset.shape[0]) + + + # Calling Model Monitoring Function + suite = model_monitoring(current_dataset, reference_dataset) + report_file_path = os.path.join("templates", 'model_monitoring.html') + suite.save_html(report_file_path) + #rendering results return render_template('prediction.html',