From 64e7c5873b256820375370bd5065239eebcafaec Mon Sep 17 00:00:00 2001
From: Isaac Mwendwa <isaacmwendwa107@gmail.com>
Date: Tue, 12 Mar 2024 22:28:59 +0300
Subject: [PATCH] Add files via upload

---
 app.py | 301 ++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 169 insertions(+), 132 deletions(-)

diff --git a/app.py b/app.py
index f9ba0c5..6c3d89c 100644
--- a/app.py
+++ b/app.py
@@ -1,7 +1,4 @@
-from flask import Flask, render_template, request, redirect, url_for, current_app, send_from_directory
-import os
-from os.path import join, dirname, realpath
-
+# Imports for Data Preparation and Modeling
 import numpy as np  
 import pandas as pd
 import pickle
@@ -10,95 +7,121 @@
 from math import sqrt
 from pandas import DataFrame
 from pandas import concat
-
-from sklearn.preprocessing import OneHotEncoder
+from category_encoders import BinaryEncoder
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
 from sklearn.compose import ColumnTransformer
 
 
-# Pipeline for pre-processing data: Scaling and numerical transformation
-def num_pipeline_transformer(data):
-    '''
-    Function to process numerical transformations
-    Argument:
-        data: original dataframe 
-    Returns:
-        num_attrs: numerical dataframe
-        num_pipeline: numerical pipeline object
-        
-    '''
-    numerics = ['float64', 'int64']
+# Imports for model deployment
+from flask import Flask, render_template, request, redirect, url_for, current_app, send_from_directory
+import os
+from os.path import join, dirname, realpath
+
+
+# Imports for model monitoring
+from evidently import ColumnMapping
+from evidently.report import Report
+from evidently.metrics.base_metric import generate_column_metrics
+from evidently.metric_preset import DataDriftPreset, TargetDriftPreset
+from evidently.metrics import *
+from evidently.test_suite import TestSuite
+from evidently.tests.base_test import generate_column_tests
+from evidently.test_preset import DataStabilityTestPreset, NoTargetPerformanceTestPreset
+from evidently.tests import *
 
-    num_attrs = data.select_dtypes(include=numerics)
 
-    num_pipeline = Pipeline([
-        ('std_scaler', StandardScaler()),
-        ])
-    return num_attrs, num_pipeline
 
-# Pipeline for pre-processing data: One hot encoding
+def model_monitoring(current_dataset, reference_dataset):
+
+    # setting current and reference datasets
+    current= current_dataset
+    reference = reference_dataset
+
+    # Build a test suite containing only the NoTargetPerformanceTestPreset
+    suite = TestSuite(tests=[
+        NoTargetPerformanceTestPreset(),
+    ])
+
+    suite.run(reference_data=reference, current_data=current)
+
+    return suite
+
+
+
+
+# Full Pipeline Transformer for Numerical and Categorical Data
 def pipeline_transformer(data):
     '''
     Complete transformation pipeline for both
-    nuerical and categorical data.
+    categorical data (Binary Encoding) and 
+    numerical data (Passthrough - no transformation)
     
     Argument:
         data: original dataframe 
     Returns:
-        prepared_data: transformed data, ready to use
+        transformed_data: transformed data, ready to use
     '''
     cat_attrs = ["Industry"]
-    num_attrs, num_pipeline = num_pipeline_transformer(data)
+    
     full_pipeline = ColumnTransformer([
-        ("num", num_pipeline, list(num_attrs)),
-        ("cat", OneHotEncoder(), cat_attrs),
-        ])
-    prepared_data = full_pipeline.fit_transform(data)
-    return prepared_data
-
-def return_prediction(model, dataset):
+        # transform the categorical data and pass through the numerical columns unchanged
+        ("cat", BinaryEncoder(), cat_attrs)], remainder = 'passthrough')    #
     
-    # preparing data using pipeline transformer
-    prepared_data = pipeline_transformer(dataset)
+    transformed_data = full_pipeline.fit_transform(data)
+    
+    return transformed_data
+
+# function to get predictions from input dataframe
+def get_predictions(input_df, model):
 
-    print(prepared_data.shape)
     
-    # make a prediction
-    predictions = model.predict(prepared_data)
-
-    return predictions
-
-
-#function to generate a dictionary from two lists
-def return_pred_dict(results):
-    # Type casting predictions to int
-    #results = list(map(int, results))
-    list_pred = results
-    list_industry_keys = ['Agriculture, Forestry And Fishing', 'Mining And Quarrying',
-       'Manufacturing',
-       'Electricity, Gas, Steam And Air Conditioning Supply',
-       'Water Supply; Sewerage, Waste Management And Remediation Activities',
-       'Construction',
-       'Wholesale And Retail Trade; Repair Of Motor Vehicles And Motorcycles',
-       'Transportation And Storage',
-       'Accommodation And Food Service Activities',
-       'Information And Communication',
-       'Financial And Insurance Activities', 'Real Estate Activities',
-       'Professional, Scientific And Technical Activities',
-       'Administrative And Support Service Activities',
-       'Public Administration And Defence; Compulsory Social Security',
-       'Education', 'Human Health And Social Work Activities',
-       'Arts, Entertainment And Recreation', 'Other Service Activities',
-       'Activities Of Households As Employers; Undifferentiated Goods- And Services-Producing Activities Of Households For Own Use',
-       'Activities Of Extraterritorial Organizations And Bodies']
+    # Reduce skew: natural-log transform positive values; non-positive values become 0
+    input_df['Contribution_to_GDP'] = input_df["Contribution_to_GDP"].map(
+                                        lambda i: np.log(i) if i > 0 else 0)
     
-    # using dictionary comprehension 
-    # to convert lists to dictionary 
-    dict_pred = {list_industry_keys[i]: list_pred[i] for i in range(len(list_industry_keys))} 
+    # Preparing data using pipeline_transformer()
+    prepared_df = pipeline_transformer(input_df)
     
-    return dict_pred
-
+    # Getting Predictions from Model
+    pred_results = model.predict(prepared_df)
+    
+    # Reversing log transform to get actual results
+    pred_results = np.exp(pred_results)
+    
+    # Creating list of predictions
+    pred_list = pred_results.tolist()
+    pred_list = list(map(int, pred_list))
+    #pred_list.sort(reverse = True)
+
+    industry_cols = ['Agriculture, Forestry And Fishing',
+                        'Mining And Quarrying',
+                        'Manufacturing',
+                        'Electricity, Gas, Steam And Air Conditioning Supply',
+                        'Water Supply; Sewerage, Waste Management And Remediation Activities',
+                        'Construction',
+                        'Wholesale And Retail Trade; Repair Of Motor Vehicles And Motorcycles',
+                        'Transportation And Storage',
+                        'Accommodation And Food Service Activities',
+                        'Information And Communication',
+                        'Financial And Insurance Activities',
+                        'Real Estate Activities',
+                        'Professional, Scientific And Technical Activities',
+                        'Administrative And Support Service Activities',
+                        'Public Administration And Defence; Compulsory Social Security',
+                        'Education',
+                        'Human Health And Social Work Activities',
+                        'Arts, Entertainment And Recreation',
+                        'Other Service Activities',
+                        'Activities Of Households As Employers; Undifferentiated Goods- And Services-Producing Activities Of Households For Own Use',
+                        'Activities Of Extraterritorial Organizations And Bodies'
+                    ]
+    
+    # Create Dictionary of Industry and Predicted Values
+    pred_dict = {industry_cols[i]: pred_list[i] for i in range(len(industry_cols))}
+    
+    # Return Dictionary of Industry and Predicted Values
+    return pred_dict, pred_list
 
 #function for sorting dict in descending order
 def sort_dict(dict):
@@ -111,8 +134,10 @@ def sort_dict(dict):
 
     return sorted_dict
 
+
 #function for computing percentage contribution of each sector
 def compute_percent(dict):
+    keys = list(dict.keys())
     values = dict.values()
     total= 0
     for v in values:
@@ -125,7 +150,10 @@ def compute_percent(dict):
 
     #rounding percentages to 2 decimal places
     percent_list_rounded = [round(num, 2) for num in percent_list]
-    percent_dict = return_pred_dict(results = percent_list_rounded)
+    percent_dict = percent_list_rounded
+    
+    # Map each key of the input dict to its rounded percentage
+    percent_dict = {keys[i]: percent_dict[i] for i in range(len(keys))}
 
     return percent_dict
 
@@ -144,12 +172,12 @@ def compute_total(dict):
 # enable debugging mode
 app.config["DEBUG"] = True
 
-# LOADING THE MODEL AND THE SCALER
+# LOADING THE MODELS
 with open("working_poor_model.bin", 'rb') as f_in:
     working_poor_model = pickle.load(f_in)
 
-with open("total_employment_model.bin", 'rb') as f_in:
-    total_employment_model = pickle.load(f_in)
+with open("total_number_in_employment_model.bin", 'rb') as f_in:
+    total_number_in_employment_model = pickle.load(f_in)
 
 # Upload folder
 UPLOAD_FOLDER = 'static/files'
@@ -162,101 +190,110 @@ def index():
      # Set The upload HTML template '\templates\index.html'
     return render_template('index.html')
 
+# Model Monitoring Report URL
+@app.route('/model_monitoring_report')
+def model_monitoring_report():
+    return render_template('model_monitoring.html')
+
+
+
 @app.route('/uploads/<path:filename>', methods=['GET', 'POST'])
 def download(filename):
     # Appending app path to upload folder path within app root folder
     uploads = os.path.join(current_app.root_path, app.config['UPLOAD_FOLDER'])
     # Returning file from appended path
-    return send_from_directory(directory=uploads, filename=filename)    
+    #return send_from_directory(directory=uploads, filename=filename)  
+    return send_from_directory(uploads, filename)  
 
 
 # Get the uploaded files
 @app.route("/", methods=['POST'])
 def uploadFiles():
-      # get the uploaded file
-      uploaded_file = request.files['file']
-      if uploaded_file.filename != '':
-           file_path = os.path.join(app.config['UPLOAD_FOLDER'], uploaded_file.filename)
-          # set the file path
-           uploaded_file.save(file_path)
-           prediction(file_path)
-           
-      return redirect(url_for("prediction", filePath=file_path))
+    # get the uploaded file
+    uploaded_file = request.files['file']
+    if uploaded_file.filename != '':
+        file_path = os.path.join(app.config['UPLOAD_FOLDER'], uploaded_file.filename)
+        # save the uploaded file at file_path
+        uploaded_file.save(file_path)
+        prediction(file_path)
+
+        return redirect(url_for("prediction", filePath=file_path))
+    
+    #else: 
+    #    print("Enter a CSV File")    
+
+    
 
 
 @app.route('/prediction/<path:filePath>')
 def prediction(filePath):
 
-    #READING DATASET FOR WORKING POOR
     # CSV Column Names
-    col_names = ['Industry', 'Contribution_by_Gdp', 'Growth_of_GDP']
-    # Use Pandas to parse the CSV file
-    working_poor_df = read_csv(filePath, names = col_names, header=0)
-
-    #Reorder columns
-    cols = ['Contribution_by_Gdp', 'Growth_of_GDP', 'Industry']
-    working_poor_df = working_poor_df.reindex(columns=cols)
-
-    #Cleaning dataset
-    #cols = ['Wage_bracket_0_to_9999', 'Total_number_in_wage_employment']
-    #df[cols] = df[cols].astype(str)  # cast to string
-
-    # Removing special characters
-    #df[cols] = df[cols].replace({'\$': '', ',': '', '-': ''}, regex=True)
-
-    #working_poor_df = df
+    col_names = ['Industry', 'Contribution_to_GDP']
 
-    #READING DATASET FOR TOTAL EMPLOYMENT
-    # CSV Column Names
-    col_names = ['Industry', 'Contribution_by_Gdp', 'Growth_of_GDP']
+    #READING DATASET FOR WORKING POOR & TOTAL EMPLOYMENT
     # Use Pandas to parse the CSV file
-    total_employment_df = read_csv(filePath, names = col_names, header=0)
-
-    #Reorder columns
-    cols = ['Contribution_by_Gdp', 'Growth_of_GDP', 'Industry']
-    total_employment_df = total_employment_df.reindex(columns = cols)
-    #total_employment_df.head()
 
-    #Cleaning dataset
-    #cols = ['Wage_bracket_0_to_9999', 'Total_number_in_wage_employment']
-    #df[cols] = df[cols].astype(str)  # cast to string
-
-    # Removing special characters
-    #df[cols] = df[cols].replace({'\$': '', ',': '', '-': ''}, regex=True)
+    try: 
+        working_poor_df = read_csv(filePath, names = col_names, header=0)
+        total_employment_df = read_csv(filePath, names = col_names, header=0)
+    except BaseException as e:
+        print("Invalid File Format") 
+        return render_template('index.html')
 
     
     #predictions for working poor
-    results_working_poor = return_prediction(model = working_poor_model, dataset = working_poor_df)
+    working_poor_dict_pred_unsorted, pred_list = get_predictions(input_df = working_poor_df, 
+                                           model = working_poor_model)
+    
     #predictions for total employment
-    results_total_employment = return_prediction(model = total_employment_model, dataset = total_employment_df)
+    total_employment_dict_pred_unsorted, t_pred_list = get_predictions(input_df = total_employment_df,
+                                               model = total_number_in_employment_model)
 
-    #converting array of predictions to list
-    working_poor_list = results_working_poor.astype(int).tolist()
-    total_employment_list = results_total_employment.astype(int).tolist()
-    print(len(working_poor_list))
-    print(len(total_employment_list))
-
-    #getting results of prediction as dictionaries
-    working_poor_dict_pred_unsorted = return_pred_dict(results = working_poor_list)
-    total_employment_dict_pred_unsorted = return_pred_dict(results = total_employment_list)
-    
     #sorting dict in descending order
     working_poor_dict_pred = sort_dict(dict = working_poor_dict_pred_unsorted)
     total_employment_dict_pred = sort_dict(dict = total_employment_dict_pred_unsorted)
 
     #computing percentage contribution by sector
-    working_poor_percent_dict_unsorted = compute_percent(dict = working_poor_dict_pred_unsorted)
-    total_employment_percent_dict_unsorted = compute_percent(dict = total_employment_dict_pred_unsorted)
+    working_poor_percent_dict_unsorted = compute_percent(dict = working_poor_dict_pred)
+    total_employment_percent_dict_unsorted = compute_percent(dict = total_employment_dict_pred)
     
     #sorting percent in descending order
     working_poor_percent_dict = sort_dict(dict = working_poor_percent_dict_unsorted)
     total_employment_percent_dict = sort_dict(dict = total_employment_percent_dict_unsorted)
 
     #computing percentage of working poor
-    total_working_poor = compute_total(dict = working_poor_dict_pred_unsorted)
-    total_employment_total = compute_total(dict = total_employment_dict_pred_unsorted)
+    total_working_poor = compute_total(dict = working_poor_dict_pred)
+    total_employment_total = compute_total(dict = total_employment_dict_pred)
     percent = (total_working_poor/total_employment_total)*100
     working_poor_percent = round(percent, 2)
+
+    
+    # MODEL MONITORING
+    # Setting path of the reference dataset
+    file_path_reference = os.path.join(app.config['UPLOAD_FOLDER'], "Working_Poor_Model_Dataset.csv")
+    
+    # Reference dataset (Training)
+    reference_dataset = pd.read_csv(file_path_reference, header=0) 
+    reference_dataset.rename(columns={'Wage_bracket_0_to_9999': 'target'}, inplace=True)
+    reference_dataset['prediction'] = reference_dataset['target'].values + np.random.randint(0, 5, reference_dataset.shape[0])
+   
+
+    # Current Dataset (Production)
+    current_dataset = working_poor_df   
+    current_dataset['Wage_bracket_0_to_9999'] = list(map(int, pred_list))
+    current_dataset.rename(columns={'Wage_bracket_0_to_9999': 'target'}, inplace=True)
+
+    final_year_dataset = reference_dataset[147:]
+
+    current_dataset['prediction'] = final_year_dataset['target'].values + np.random.randint(0, 5, final_year_dataset.shape[0])
+   
+
+    # Calling Model Monitoring Function
+    suite = model_monitoring(current_dataset, reference_dataset)
+    report_file_path = os.path.join("templates", 'model_monitoring.html')
+    suite.save_html(report_file_path)
+
     
     #rendering results
     return render_template('prediction.html',