diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_Item_Designations.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_Item_Designations.csv index 5e2b85db4b..ada24c7a54 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_Item_Designations.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_Item_Designations.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c03fd1b3ad2a7352dcbc41ab891744fb8b82f37c131798596e46fe3c00d0a15 -size 72294 +oid sha256:e47906311ed0df8d399dcd3e98208fba47ecd6f1f04154ff74bd8d3d52d8cccc +size 74310 diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv index ed46fac7cb..4301af678d 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f945b15a98e571464b6931f0a3a071c1c90be93d8ba0bd9d1eca751caab34793 -size 55657974 +oid sha256:b0922f6c12c1303bdd1ce5f1a3c1e212b1635069664113a18a80ce77742ac031 +size 71215257 diff --git a/src/scripts/consumables_analyses/descriptive_analysis.py b/src/scripts/consumables_analyses/descriptive_analysis.py new file mode 100644 index 0000000000..61780bf128 --- /dev/null +++ b/src/scripts/consumables_analyses/descriptive_analysis.py @@ -0,0 +1,446 @@ +''' +Produce plots and estimates for the manuscript "Estimating the health gains and value for money of reducing drug stock-outs in Malawi: an individual-based modelling study" +''' + +import datetime +import os +import textwrap +from pathlib import Path + +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns +from typing import Literal, Sequence, Optional, Union, List + +from tlo import Date +from tlo.analysis.utils import ( + extract_params, + get_scenario_info, + get_scenario_outputs, + load_pickled_dataframes, +) + +# Define a timestamp for script outputs +timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M") + +# Print the start time of the script +print('Script Start', datetime.datetime.now().strftime('%H:%M')) + +# Create folders to store results +resourcefilepath = Path("./resources") +consumable_resourcefilepath = resourcefilepath / "healthsystem/consumables" +simulationfilepath = Path('./outputs/sakshi.mohan@york.ac.uk') +outputfilepath = Path('./outputs/consumables_impact_analysis') +if not os.path.exists(outputfilepath): + os.makedirs(outputfilepath) + +# Utility functions +# Assign item names to item codes +def assign_consumable_names_to_item_codes(df): + # Create dictionary mapping item_codes to consumables names + consumables_df = pd.read_csv(consumable_resourcefilepath / "ResourceFile_Consumables_Items_and_Packages.csv")[['Item_Code', 'Items']] + consumables_df = consumables_df[consumables_df['Item_Code'].notna()] + consumables_dict = dict(zip(consumables_df['Item_Code'], consumables_df['Items'])) + + # Add consumable_name to df + df = df.copy() + df['item_name'] = df['item_code'].map(consumables_dict) + + return df + +# Prepare availability data +def prepare_availability_dataset_for_plots( + _df: pd.DataFrame, + scenario_list: Optional[list[int]] = None, + scenario_names_dict: Optional[dict[str, str]] = None, + consumable_resourcefilepath: Path = None, + resourcefilepath: Path = None +) -> pd.DataFrame: + """ + Prepares a 
consumable availability dataset by merging facility and item category data, + renaming columns for scenarios, and cleaning category names for plotting. + """ + if scenario_list is None: + scenario_list = [] + if scenario_names_dict is None: + scenario_names_dict = {} + + # Load item category mapping + program_item_mapping = pd.read_csv( + consumable_resourcefilepath / 'ResourceFile_Consumables_Item_Designations.csv', + usecols=['Item_Code', 'item_category'] + ) + program_item_mapping = program_item_mapping.rename(columns={'Item_Code': 'item_code'}) + program_item_mapping = program_item_mapping[program_item_mapping['item_category'].notna()] + + # Load facility list + mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") + + # Merge facility and program info + _df = _df.merge( + mfl[['District', 'Facility_Level', 'Facility_ID']], + on='Facility_ID', how='left' + ) + _df = _df.merge(program_item_mapping, on='item_code', how='left') + + # Rename scenario columns + _df = _df.rename(columns=scenario_names_dict) + + # Clean item category names + clean_category_names = { + 'cancer': 'Cancer', + 'cardiometabolicdisorders': 'Cardiometabolic Disorders', + 'contraception': 'Contraception', + 'general': 'General', + 'hiv': 'HIV', + 'malaria': 'Malaria', + 'ncds': 'Non-communicable Diseases', + 'neonatal_health': 'Neonatal Health', + 'other_childhood_illnesses': 'Other Childhood Illnesses', + 'reproductive_health': 'Reproductive Health', + 'road_traffic_injuries': 'Road Traffic Injuries', + 'tb': 'Tuberculosis', + 'undernutrition': 'Undernutrition', + 'epi': 'Expanded programme on immunization' + } + _df['item_category'] = _df['item_category'].map(clean_category_names) + + return _df + +# Wrap Labels +def wrap_labels(labels, width=15): + """Wrap each label to the given character width.""" + return [textwrap.fill(str(lab), width) if lab is not None else "" for lab in labels] + +# Generate heatmaps of average availability +def generate_heatmap( + df: pd.DataFrame, + include_levels: Optional[List[str]] = None, + value_col: str = "Actual", + row: str = "item_category", + col: str = "Facility_Level", + row_order: Optional[Sequence[str]] = None, + col_order: Optional[Sequence[str]] = None, + figurespath: Optional[Path] = None, + filename: str = "heatmap_consumable_availability.png", + figsize: tuple[int, int] = (10, 8), + cmap: str = "RdYlGn", + annot: bool = True, + fmt: Optional[str] = None, # None -> auto choose + font_scale: float = 0.9, + cbar_label: str = "Proportion of days on which consumable is available", + xlabel: Optional[str] = None, + ylabel: Optional[str] = None, + as_pct: bool = True, # format annotations as percentages + round_decimals: int = 2, + # option to plot scenarios on the x_axis + scenario_axis: bool = False, # if True, columns become scenarios + scenario_cols: Optional[Sequence[str]] = None, +): + """ + Build a heatmap either by a single column (e.g., Facility_Level) or across multiple + scenario columns placed on the x-axis. 
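+
+    A hypothetical call, using only names defined in this script: generate_heatmap(df=tlo_availability_df,
+    scenario_axis=True, scenario_cols=['Actual', 'Best facility']) places those two scenarios on the
+    x-axis and appends a constant 'Perfect' column equal to 1.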
+ """ + if include_levels is not None: + df = df[df.Facility_Level.isin(include_levels)] + if scenario_axis: + aggregated = ( + df.groupby([row], dropna=True)[scenario_cols] + .mean() + .reset_index() + ) + # Add perfect scenario + aggregated['Perfect'] = 1 # Add a column representing the perfect scenario + heatmap_df = aggregated.set_index('item_category') + else: + # Standard mode: columns = `col`, values = mean(value_col) + aggregated = ( + df.groupby([row, col], dropna=True)[value_col] + .mean() + .reset_index() + ) + heatmap_df = aggregated.pivot(index=row, columns=col, values=value_col) + + # Optional ordering + if row_order is not None: + heatmap_df = heatmap_df.reindex(row_order) + if col_order is not None: + heatmap_df = heatmap_df.reindex(columns=col_order) + + # 2) Totals (means across the raw data, not the pivot means) + if scenario_axis: + # Means by row across all programs for each scenario + row_means = heatmap_df.mean(axis=0) # average per scenario across programs + avg_row = row_means.copy() + heatmap_df.loc["Average"] = avg_row + else: + # Compute from raw df to avoid double-averaging + col_means = df.groupby(row, dropna=False)[value_col].mean() + row_means = df.groupby(col, dropna=False)[value_col].mean() + overall_mean = df[value_col].mean() + + heatmap_df["Average"] = col_means.reindex(heatmap_df.index) + avg_row = row_means.reindex(heatmap_df.columns).copy() + avg_row.loc["Average"] = overall_mean + heatmap_df.loc["Average"] = avg_row + + # 3) Annotation formatting + if as_pct: + # If values are 0–1 proportions, annotate as percentages + display_df = (heatmap_df * 100).round(round_decimals) + if fmt is None: + fmt = ".0f" if round_decimals == 0 else f".{round_decimals}f" + annot_kws = {"fmt": fmt} + # Build string labels with % sign + annot_data = display_df.astype(float) + else: + display_df = heatmap_df.round(round_decimals) + if fmt is None: + fmt = f".{round_decimals}f" + annot_kws = {"fmt": fmt} + annot_data = display_df.astype(float) + + # 4) Plot + sns.set(font_scale=font_scale) + fig, ax = plt.subplots(figsize=figsize) + hm = sns.heatmap( + annot_data, + annot=annot, + cmap=cmap, + cbar_kws={"label": cbar_label}, + ax=ax, + fmt=".2f" + ) + + # If percentage labels requested, overwrite text with % suffix + if annot and as_pct: + for t in ax.texts: + t.set_text(f"{t.get_text()}%") + + # 5) Labels & ticks + xlab = (xlabel or ("Scenario" if scenario_axis else col.replace("_", " ").title())) + ylab = (ylabel or row.replace("_", " ").title()) + ax.set_xlabel(xlab) + ax.set_ylabel(ylab) + ax.set_xticklabels(ax.get_xticklabels(), rotation=90) + ax.set_yticklabels(ax.get_yticklabels(), rotation=0) + + # 6) Save (optional) + if figurespath is not None: + figurespath = Path(figurespath) + figurespath.mkdir(parents=True, exist_ok=True) + outpath = figurespath / filename + plt.savefig(outpath, dpi=300, bbox_inches="tight") + + plt.show() + plt.close(fig) + return fig, ax, heatmap_df + +# Generate LaTex-compatible detailed table of availability +def generate_detail_availability_table_by_scenario( + df: pd.DataFrame, + groupby_var: str, + scenario_cols: Sequence[str], + include_levels: Union[str, Sequence[str]], + longtable: bool = False, + outputpath: Path = None, + decimals: int = 2, + caption: Optional[str] = None, + label_prefix: str = "tab:availability_by_", + col_width_groupbyvar: str = "4cm", + col_width_scenario: str = "1.8cm", +) -> str: + """ + Create a LaTeX table of average availability (as percentages) for each consumable across scenarios, + filtered to selected 
facility levels. + + Returns the LaTeX string and writes it to figurespath / f"availability_by_{groupby_var}.tex" + """ + + # --- Setup --- + if outputpath is None: + outputpath = outputfilepath / "appendix" # falls back to your existing default + outputpath.mkdir(parents=True, exist_ok=True) + + # Accept str OR list for include_levels + if isinstance(include_levels, str): + include_levels = [include_levels] + + table_df = df.copy() + + # Filter by facility level if the column is present + if "Facility_Level" in table_df.columns: + table_df = table_df[table_df["Facility_Level"].isin(include_levels)] + + # Aggregate means per item + grouped = ( + table_df + .groupby([groupby_var], dropna=False)[list(scenario_cols)] + .mean() + .reset_index() + ) + + # Rename first column to "Consumable" + grouped.rename(columns={groupby_var: "Consumable"}, inplace=True) + + # Escape LaTeX in Consumable names + def _latex_escape(s): + if pd.isna(s): + return "" + s = str(s) + # Order matters for backslashes; escape backslash first + s = s.replace("\\", r"\\") + s = s.replace("&", r"\&").replace("%", r"\%").replace("_", r"\_") + s = s.replace("#", r"\#").replace("{", r"\{").replace("}", r"\}") + s = s.replace("$", r"\$").replace("^", r"\^{}").replace("~", r"\~{}") + return s + + grouped["Consumable"] = grouped["Consumable"].map(_latex_escape) + + # Convert proportions -> percentage strings with escaped % + def pct_format(x: float) -> str: + if pd.isna(x): + return "" + return f"{x * 100:.{decimals}f}\\%" + + for c in scenario_cols: + grouped[c] = grouped[c].map(pct_format) + + # Build column format dynamically + # First col wider for names, then one col per scenario + column_format = ( + f"|R{{{col_width_groupbyvar}}}|" + + "|".join([f"R{{{col_width_scenario}}}"] * len(scenario_cols)) + + "|" + ) + + # Caption/label + if caption is None: + caption = "Summarized availability by consumable" + label = f"{label_prefix}{groupby_var}_{include_levels}" + + # Export to LaTeX (escape=False since we already escaped) + latex_table = grouped.to_latex( + longtable=longtable, + column_format=column_format, + caption=caption, + label=label, + position="h", + index=False, + escape=False, + header=True, + ) + + # Add \hline after each row + latex_table = latex_table.replace("\\\\\n", "\\\\ \\hline\n") + + # Save + outpath = outputpath / f"availability_by_{groupby_var}_{include_levels}.tex" + with open(outpath, "w", encoding="utf-8") as f: + f.write(latex_table) + + return latex_table + + +# Import and clean data files +#********************************** +# Import TLO model availability data +tlo_availability_df = pd.read_csv(consumable_resourcefilepath / "ResourceFile_Consumables_availability_small.csv") +scenario_names_dict={ + 'available_prop': 'Actual', + 'available_prop_scenario1': 'Non-therapeutic consumables', + 'available_prop_scenario2': 'Vital medicines', + 'available_prop_scenario3': 'Pharmacist- managed', + 'available_prop_scenario4': 'Level 1b', + 'available_prop_scenario5': 'CHAM', + 'available_prop_scenario6': '75th percentile facility', + 'available_prop_scenario7': '90th percentile facility', + 'available_prop_scenario8': 'Best facility', + 'available_prop_scenario9': 'Best facility (including DHO)', + 'available_prop_scenario10': 'HIV supply chain', + 'available_prop_scenario11': 'EPI supply chain', + 'available_prop_scenario12': 'HIV moved to Govt supply chain (Avg by Level)', + 'available_prop_scenario13': 'HIV moved to Govt supply chain (Avg by Facility_ID)', + 'available_prop_scenario14': 'HIV moved to 
Govt supply chain (Avg by Facility_ID times 1.25)',
+    'available_prop_scenario15': 'HIV moved to Govt supply chain (Avg by Facility_ID times 0.75)'
+    }
+
+tlo_availability_df = prepare_availability_dataset_for_plots(
+    _df=tlo_availability_df,
+    scenario_list=[1, 2, 3, 6, 7, 8, 10, 11, 12, 13, 14, 15],
+    scenario_names_dict=scenario_names_dict,
+    consumable_resourcefilepath=consumable_resourcefilepath,
+    resourcefilepath=resourcefilepath
+)
+
+# Generate figures for manuscript
+#**********************************
+# Figure 1: Average probability of consumable availability in public and CHAM health facilities in Malawi
+_ = generate_heatmap(
+    df=tlo_availability_df,
+    value_col="Actual",
+    row="item_category",
+    col="Facility_Level",
+    figurespath = outputfilepath / 'manuscript',
+    filename="heatmap_program_and_level_actual.png",
+    figsize=(10, 8),
+    cmap="RdYlGn",
+    round_decimals=4,
+    cbar_label="Proportion of days on which consumable is available",
+    xlabel="Facility Level",
+    ylabel="Program",
+)
+
+# Figure 3: Comparison of consumable availability across modelled scenarios
+scenario_cols = ['Actual', 'Non-therapeutic consumables', 'Vital medicines', 'Pharmacist- managed', '75th percentile facility', '90th percentile facility', 'Best facility']
+for level in ['1a', '1b']:
+    _ = generate_heatmap(
+        df=tlo_availability_df,
+        include_levels = [level],
+        value_col="Actual",
+        row="item_category",
+        col="Facility_Level",
+        figurespath=outputfilepath / 'manuscript',
+        filename=f"scenarios_heatmap_{level}.png",
+        figsize=(10, 8),
+        cmap="RdYlGn",
+        round_decimals=4,
+        cbar_label="Proportion of days on which consumable is available",
+        xlabel="Scenario",
+        ylabel="Program",
+        scenario_axis = True,  # if True, columns become scenarios
+        scenario_cols = scenario_cols,
+    )
+
+
+# Figure A.1: Trend in average consumable availability by facility level
+
+# Figure A.2: Comparison of consumable availability as per Open Logistics Management Information System (OpenLMIS), 2018 and Harmonised
+# Health Facility Assessment, 2018-19
+
+# Assign item names to item codes, as needed for the detailed appendix tables below
+tlo_availability_df = assign_consumable_names_to_item_codes(tlo_availability_df)
+
+# Table B.1: Average probability of availability for each consumable under all scenarios (Level 1a)
+availability_by_item_1a = generate_detail_availability_table_by_scenario(
+    df=tlo_availability_df,
+    groupby_var="item_name",
+    scenario_cols=scenario_cols,
+    include_levels="1a",
+    longtable=True,
+    outputpath=outputfilepath / "appendix",
+    decimals=2,
+    caption="Average probability of availability for each consumable under all scenarios (Level 1a)",
+)
+
+# Table B.2: Average probability of availability for each consumable under all scenarios (Level 1b)
+availability_by_item_1b = generate_detail_availability_table_by_scenario(
+    df=tlo_availability_df,
+    groupby_var="item_name",
+    scenario_cols=scenario_cols,
+    include_levels="1b",
+    longtable=True,
+    outputpath=outputfilepath / "appendix",
+    decimals=2,
+    caption="Average probability of availability for each consumable under all scenarios (Level 1b)",
+)
diff --git a/src/scripts/consumables_analyses/manuscript/scenario_improved_consumable_availability.py b/src/scripts/consumables_analyses/manuscript/scenario_improved_consumable_availability.py
new file mode 100644
index 0000000000..a76a97078b
--- /dev/null
+++ b/src/scripts/consumables_analyses/manuscript/scenario_improved_consumable_availability.py
@@ -0,0 +1,123 @@
+
+
+"""
+This scenario file runs the model under different assumptions about consumable availability, in order to estimate the
+cost under each scenario over the HSSP-III period.
+
+Run on the batch system using:
+```
+tlo batch-submit src/scripts/consumables_analyses/manuscript/scenario_improved_consumable_availability.py
+```
+
+or locally using:
+```
+tlo scenario-run src/scripts/consumables_analyses/manuscript/scenario_improved_consumable_availability.py
+```
+
+# TODO Pending actions
+# check if 7 days of persistence
+# Scale-up in 2026
+# Relaxing health worker capacity constraint
+# Reduced persistence of care-seeking
+# Private market substitution - derive percentage from TLM data
+# Don't run sensitivity analyses yet (can be added later) - only run the HR one --> 20 scenarios
+"""
+from tlo import Date, logging
+from tlo.methods.fullmodel import fullmodel
+from tlo.methods.scenario_switcher import ImprovedHealthSystemAndCareSeekingScenarioSwitcher
+from tlo.scenario import BaseScenario
+
+class ConsumablesCosting(BaseScenario):
+    # -----------------------------
+    # 1) DEFINE SCENARIOS EXPLICITLY
+    # -----------------------------
+    CONSUMABLE_SCENARIOS = [
+        'default',
+        'scenario1', 'scenario2', 'scenario3',  # Predictive factors
+        'scenario6', 'scenario7', 'scenario8',  # Benchmark facilities
+        'scenario16', 'scenario17', 'scenario18', 'scenario19',  # Redistribution
+        'all'  # Perfect
+    ]
+
+    SYSTEM_MODES = [
+        {
+            "mode_appt_constraints": 2,
+            "max_healthsystem_function": [False, False],
+            "max_healthcare_seeking": [False, False],
+        },
+        {
+            "mode_appt_constraints": 1,
+            "max_healthsystem_function": [False, True],
+            "max_healthcare_seeking": [False, True],
+        },
+    ]
+
+    def __init__(self):
+        super().__init__()
+        self.seed = 0
+        self.start_date = Date(2010, 1, 1)
+        self.end_date = Date(2031, 1, 1)  # TODO change to 2041
+        # Run until 2040 even though the analysis may be focused on years until 2030
+        self.pop_size = 5_000  # TODO change to 100_000
+
+        # Build cartesian product of scenarios
+        self.SCENARIOS = [
+            (cons, sys)
+            for cons in self.CONSUMABLE_SCENARIOS
+            for sys in self.SYSTEM_MODES
+        ]
+
+        self.number_of_draws = len(self.SCENARIOS)
+        self.scenarios = list(range(self.number_of_draws))
+
+        self.runs_per_draw = 1  # TODO change to 5
+
+    def log_configuration(self):
+        return {
+            'filename': 'consumables_costing',
+            'directory': './outputs',
+            'custom_levels': {
+                '*': logging.WARNING,
+                "tlo.methods.demography": logging.INFO,
+                "tlo.methods.healthburden": logging.INFO,
+                "tlo.methods.healthsystem.summary": logging.INFO,
+                "tlo.methods.healthsystem": logging.INFO,  # TODO Confirm whether this needs to be logged
+            }
+        }
+
+    def modules(self):
+        return (
+            fullmodel()
+            + [ImprovedHealthSystemAndCareSeekingScenarioSwitcher()]
+        )
+
+    def draw_parameters(self, draw_number, rng):
+        cons_scenario, sys = self.SCENARIOS[draw_number]
+
+        return {
+            'HealthSystem': {
+                'cons_availability': 'default',
+                'year_cons_availability_switch': 2026,
+                'cons_availability_postSwitch': cons_scenario,
+                'mode_appt_constraints': 1,
+                'mode_appt_constraints_postSwitch': sys["mode_appt_constraints"],  # once without HR constraints and once with HR constraints
+                'year_mode_switch': 2026,
+                'policy_name': 'EHP_III',
+                'use_funded_or_actual_staffing': 'actual',
+                'scale_to_effective_capabilities': True,  # <-- transition into Mode 2 with the effective capabilities in HRH 'revealed' in Mode 1
+                'yearly_HR_scaling_mode': 'historical_scaling',  # allow historical HRH scaling to occur 2018-2024
+                'equip_availability': 'all',
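+                # Note: the pre-switch settings ('default' availability, mode 1) apply until the
+                # 2026 switch years above; cons_availability_postSwitch and
+                # mode_appt_constraints_postSwitch take effect from then on.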
+                'beds_availability': 'all',
+            },
+            "ImprovedHealthSystemAndCareSeekingScenarioSwitcher": {
+                "max_healthsystem_function": sys["max_healthsystem_function"],
+                "max_healthcare_seeking": sys["max_healthcare_seeking"],
+                "year_of_switch": 2026,
+            }
+        }
+
+if __name__ == '__main__':
+    from tlo.cli import scenario_run
+
+    scenario_run([__file__])
diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py
index c19114402b..5ea502ff28 100644
--- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py
+++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py
@@ -1,13 +1,21 @@
 """
 This script generates estimates of availability of consumables used by disease modules:
+OUTPUTS:
 * ResourceFile_Consumables_availability_small.csv (estimate of consumable available - file for use in the simulation).
 * ResourceFile_Consumables_Inflow_Outflow_Ratio.csv (a file that gives the ratio of inflow of consumables to outflow to
 * capture the extent of wastage as a proportion of use for each consumable by month, district and level.
-
-N.B. The file uses `ResourceFile_Consumables_matched.csv` as an input.
+INPUTS:
+* `ResourceFile_Consumables_matched.csv` - matches consumable names in OpenLMIS to those in the TLO model
+* `ResourceFile_LMIS_2018.csv` - consumable availability in OpenLMIS 2018. Data from OpenLMIS includes closing balance,
+quantity received, quantity dispensed, and average monthly consumption for each month by facility.
+* `ResourceFile_hhfa_consumables.xlsx` - provides consumable availability from other sources, mainly the Harmonised Health
+ Facility Assessment 2018-19 (to fill gaps in OpenLMIS data)
+* `ResourceFile_Consumables_Item_Designations.csv` - to categorise consumables into disease/public health programs
+* `ResourceFile_Master_Facilities_List.csv` - to obtain the Facility_Level associated with each Facility_ID
+* `ResourceFile_Population_2010.csv` - to get the list of districts
 
 It creates one row for each consumable for availability at a specific facility and month when the data is extracted
 from the OpenLMIS dataset and one row for each consumable for availability aggregated across all facilities when the data is
@@ -15,9 +23,15 @@
 
 Consumable availability is measured as probability of stockout at any point in time.
 
-Data from OpenLMIS includes closing balance, quantity received, quantity dispensed, and average monthly consumption
-for each month by facility.
-
+Steps:
+1. Prepare OpenLMIS data (A. Import, B. Reshape, C. Interpolate, D. Summarise by month and facility)
+2. Match with TLO model consumable names
+3. Add data from other sources where OpenLMIS data is missing
+4. Interpolate missing data
+5. Add alternative availability scenarios
+6. Check format and save as resourcefile
+7. Produce validation plots
+8. 
Plot summary of availability across scenarios """ import calendar @@ -29,8 +43,12 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd +from typing import Optional, List +import re from tlo.methods.consumables import check_format_of_consumables_file +from scripts.data_file_processing.healthsystem.consumables.generating_consumable_scenarios.generate_consumable_availability_scenarios_for_impact_analysis import generate_alternative_availability_scenarios, generate_descriptive_consumable_availability_plots +from scripts.data_file_processing.healthsystem.consumables.generating_consumable_scenarios.create_consumable_redistribution_scenarios import generate_redistribution_scenarios # Set local shared folder source path_to_share = Path( # <-- point to the shared folder @@ -51,8 +69,8 @@ resourcefilepath = Path("./resources") path_for_new_resourcefiles = resourcefilepath / "healthsystem/consumables" - # Define necessary functions +# Functions to clean LMIS data def change_colnames(df, NameChangeList): # Change column names ColNames = df.columns ColNames2 = ColNames @@ -62,12 +80,426 @@ def change_colnames(df, NameChangeList): # Change column names df.columns = ColNames2 return df +def rename_items_to_address_inconsistentencies(_df, item_dict): + """Return a dataframe with rows for the same item with inconsistent names collapsed into one""" + # Recode item names appearing from Jan to Aug to the new names adopted from September onwards + old_unique_item_count = _df.item.nunique() + for item in item_dict: + print(len(_df[_df.item == item_dict[item]]), ''' instances of "''', item_dict[item], '''"''' + ''' changed to "''', item, + '''"''') + # row_newname = _df.item == item + row_oldname = _df.item == item_dict[item] + _df.loc[row_oldname, 'item'] = item -# %% -# 1. DATA IMPORT AND CLEANING ## -######################################################################################### + # Make a list of column names to be collapsed using different methods + columns_to_sum = [col for col in _df.columns if + col[0].startswith(('amc', 'closing_bal', 'dispensed', 'received', 'stkout_days'))] + columns_to_preserve = [col for col in _df.columns if + col[0].startswith(('data_source'))] -# Import 2018 data + # Define aggregation function to be applied to collapse data by item + def custom_agg(x): + if x.name in columns_to_sum: + return x.sum(skipna=True) if np.any( + x.notnull() & (x >= 0)) else np.nan # this ensures that the NaNs are retained + # , i.e. not changed to 0, when the corresponding data for both item name variations are NaN, and when there + # is a 0 or positive value for one or both item name variation, the sum is taken. 
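+            # Illustrative example: stkout_days values of NaN and 4 across the two name variants
+            # collapse to 4, while NaN and NaN stay NaN rather than becoming 0.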
+ elif x.name in columns_to_preserve: + return x.str.cat( + sep='') # for the data_source column, this function concatenates the string values + + # Collapse dataframe + _collapsed_df = _df.groupby(['program', 'item', 'district', 'fac_type_tlo', 'fac_name']).agg( + {col: custom_agg for col in columns_to_preserve + columns_to_sum} + ).reset_index() + + # Test that all items in the dictionary have been found in the dataframe + new_unique_item_count = _collapsed_df.item.nunique() + assert len(item_dict) == old_unique_item_count - new_unique_item_count + return _collapsed_df + +def replace_old_item_names_in_lmis_data(_df, item_dict): + """Return a dataframe with old LMIS consumable names replaced with the new name""" + for item in item_dict: + cond_oldname = _df.item == item_dict[item] + _df.loc[cond_oldname, 'item'] = item + return _df + +def recategorize_modules_into_consumable_categories(_df): + _df['item_category'] = _df['module_name'].str.lower() + cond_RH = (_df['item_category'].str.contains('care_of_women_during_pregnancy')) | \ + (_df['item_category'].str.contains('labour')) + cond_newborn = (_df['item_category'].str.contains('newborn')) + cond_newborn[cond_newborn.isna()] = False + cond_childhood = (_df['item_category'] == 'acute lower respiratory infections') | \ + (_df['item_category'] == 'measles') | \ + (_df['item_category'] == 'diarrhoea') + cond_rti = _df['item_category'] == 'road traffic injuries' + cond_cancer = _df['item_category'].str.contains('cancer') + cond_cancer[cond_cancer.isna()] = False + cond_ncds = (_df['item_category'] == 'epilepsy') | \ + (_df['item_category'] == 'depression') + _df.loc[cond_RH, 'item_category'] = 'reproductive_health' + _df.loc[cond_cancer, 'item_category'] = 'cancer' + _df.loc[cond_newborn, 'item_category'] = 'neonatal_health' + _df.loc[cond_childhood, 'item_category'] = 'other_childhood_illnesses' + _df.loc[cond_rti, 'item_category'] = 'road_traffic_injuries' + _df.loc[cond_ncds, 'item_category'] = 'ncds' + cond_condom = _df['item_code'] == 2 + _df.loc[cond_condom, 'item_category'] = 'contraception' + + # Create a general consumables category + general_cons_list = [300, 33, 57, 58, 141, 5, 6, 10, 21, 23, 127, 24, 80, 93, 144, 149, 154, 40, 67, 73, 76, + 82, 101, 103, 88, 126, 135, 71, 98, 171, 133, 134, 244, 247, 49, 112, 1933, 1960] + cond_general = _df['item_code'].isin(general_cons_list) + _df.loc[cond_general, 'item_category'] = 'general' + + # Fill gaps in categories + dict_for_missing_categories = {292: 'acute lower respiratory infections', 293: 'acute lower respiratory infections', + 307: 'reproductive_health', 2019: 'reproductive_health', + 2678: 'tb', 1171: 'other_childhood_illnesses', 1237: 'cancer', 1239: 'cancer'} + # Use map to create a new series from item_code to fill missing values in category + mapped_categories = _df['item_code'].map(dict_for_missing_categories) + # Use fillna on the 'item_category' column to fill missing values using the mapped_categories + _df['item_category'] = _df['item_category'].fillna(mapped_categories) + + return _df + +# Function to extract inflow to outflow ratio for costing +def get_inflow_to_outflow_ratio_by_item_and_facilitylevel(_df): + df_by_item_level_month = \ + _df.groupby(['item_category', 'item_code', 'district', 'fac_type_tlo', 'month'])[ + ['closing_bal', 'dispensed', 'received']].sum() + df_by_item_level_month = df_by_item_level_month.loc[df_by_item_level_month.index.get_level_values('month') != "Aggregate"] + # Opening balance in January is the closing balance for the month minus what 
was received during the month plus what was dispensed + opening_bal_january = df_by_item_level_month.loc[df_by_item_level_month.index.get_level_values('month') == 'January', 'closing_bal'] + \ + df_by_item_level_month.loc[df_by_item_level_month.index.get_level_values('month') == 'January', 'dispensed'] - \ + df_by_item_level_month.loc[df_by_item_level_month.index.get_level_values('month') == 'January', 'received'] + closing_bal_december = df_by_item_level_month.loc[df_by_item_level_month.index.get_level_values('month') == 'December', 'closing_bal'] + # the consumable inflow during the year is the opening balance in January + what was received throughout the year - what was transferred to the next year (i.e. closing bal of December) + total_consumables_inflow_during_the_year = df_by_item_level_month['received'].groupby(level=['item_category', 'item_code', 'district', 'fac_type_tlo']).sum() +\ + opening_bal_january.reset_index(level='month', drop=True) -\ + closing_bal_december.reset_index(level='month', drop=True) + total_consumables_outflow_during_the_year = df_by_item_level_month['dispensed'].groupby(level=['item_category', 'item_code', 'district', 'fac_type_tlo']).sum() + inflow_to_outflow_ratio = total_consumables_inflow_during_the_year.div(total_consumables_outflow_during_the_year, fill_value=1) + inflow_to_outflow_ratio.loc[inflow_to_outflow_ratio < 1] = 1 # Ratio can't be less than 1 + + return inflow_to_outflow_ratio + +def update_level1b_availability( + availability_df: pd.DataFrame, + facilities_by_level: dict, + resourcefilepath: Path, + district_to_city_dict: dict, + weighting: str = "district_1b_to_2_ratio" +) -> pd.DataFrame: + """ + Updates the availability of Level 1b facilities to be the weighted average + of availability at Level 1b and 2 facilities, since these levels are merged + together in simulations. + + weighting : {'level2', 'national_1b_to_2_ratio', 'district_1b_to_2_ratio'}, default 'district_1b_to_2_ratio' + Weighting strategy: + - 'level2': Replace 1b availability entirely with level 2 values. + - 'national_1b_to_2_ratio': Apply a single national 1b:2 ratio to all districts. + - 'district_1b_to_2_ratio': (default) Use district-specific 1b:2 ratios. 
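+
+        Worked example (hypothetical counts): under 'district_1b_to_2_ratio', a district with three
+        1b facilities and one level-2 facility weights 1b availability by 0.75 and level-2
+        availability by 0.25 before the two levels are combined.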
+ """ + # Load and prepare base weights (facility counts) + # --------------------------------------------------------------------- + weight = ( + pd.read_csv(resourcefilepath / 'healthsystem' / 'organisation' / 'ResourceFile_Master_Facilities_List.csv') + [["District", "Facility_Level", "Facility_ID", "Facility_Count"]] + ) + + # Keep only Level 1b and 2 facilities + lvl1b2_weights = weight[weight["Facility_Level"].isin(["1b", "2"])].copy() + + # Compute weights depending on strategy + # --------------------------------------------------------------------- + if weighting == "level2": + # Force all weight on level 2 + lvl1b2_weights = lvl1b2_weights[~lvl1b2_weights.District.str.contains("City")] + lvl1b2_weights["weight"] = (lvl1b2_weights["Facility_Level"] == "2").astype(float) + lvl1b2_weights = lvl1b2_weights.drop(columns = 'Facility_ID') + + elif weighting == "national_1b_to_2_ratio": + lvl1b2_weights = lvl1b2_weights[~lvl1b2_weights.District.str.contains("City")] + # National total counts + national_counts = ( + lvl1b2_weights.groupby("Facility_Level")["Facility_Count"].sum().to_dict() + ) + total_fac = national_counts.get("1b", 0) + national_counts.get("2", 0) + if total_fac == 0: + raise ValueError("No facilities found at levels 1b or 2.") + lvl1b2_weights["weight"] = lvl1b2_weights["Facility_Level"].map( + {lvl: cnt / total_fac for lvl, cnt in national_counts.items()} + ) + lvl1b2_weights = lvl1b2_weights.drop(columns='Facility_ID') + + elif weighting == "district_1b_to_2_ratio": + # Replace city names with their parent districts (temporarily for grouping) + city_to_district_dict = {v: k for k, v in district_to_city_dict.items()} + lvl1b2_weights["District"] = lvl1b2_weights["District"].replace(city_to_district_dict) + + # District-level weighting (default) + lvl1b2_weights = ( + lvl1b2_weights + .groupby(["District", "Facility_Level"], as_index=False)["Facility_Count"] + .sum() + ) + + lvl1b2_weights["total_facilities"] = lvl1b2_weights.groupby("District")["Facility_Count"].transform("sum") + lvl1b2_weights["weight"] = lvl1b2_weights["Facility_Count"] / lvl1b2_weights["total_facilities"] + + else: + raise ValueError( + f"Invalid weighting '{weighting}'. Choose from " + "'level2', 'national_1b_to_2_ratio', or 'district_1b_to_2_ratio'." 
+ ) + + # Add back city districts (reverse mapping) + for source, destination in copy_source_to_destination.items(): + new_rows = lvl1b2_weights.loc[lvl1b2_weights.District == source].copy() + new_rows.District = destination + lvl1b2_weights = pd.concat([lvl1b2_weights, new_rows], axis=0, ignore_index=True) + + # Merge Facility_ID back + lvl1b2_weights = lvl1b2_weights.merge( + weight.loc[weight["Facility_Level"].isin(["1b", "2"]), ["District", "Facility_Level", "Facility_ID"]], + on=["District", "Facility_Level"], + how="left", + validate="1:1" + ) + + # Subset Level 1b and 2 facilities and apply weights + # --------------------------------------------------------------------- + lvl1b2_ids = list(facilities_by_level.get("1b", [])) + list(facilities_by_level.get("2", [])) + availability_levels1b2 = availability_df[ + availability_df["Facility_ID"].isin(lvl1b2_ids) + ].copy() + + availability_levels1b2 = availability_levels1b2.merge( + lvl1b2_weights[["District", "Facility_Level", "Facility_ID", "weight"]], + on="Facility_ID", + how="left", + validate="m:1" + ) + + # Apply weighting + available_cols = [c for c in availability_levels1b2.columns if c.startswith("available_prop")] + availability_levels1b2[available_cols] *= availability_levels1b2["weight"].values[:, None] + + # Aggregate to district-month-item level + availability_levels1b2 = ( + availability_levels1b2 + .groupby(["District", "month", "item_code"], as_index=False)[available_cols] + .sum() + ) + + # Add facility level + availability_levels1b2["Facility_Level"] = "1b" + + # Reattach Facility_IDs and weights for level 1b + full_set_interpolated_levels1b2 = availability_levels1b2.merge( + lvl1b2_weights.query("Facility_Level == '1b'")[["District", "Facility_Level", "Facility_ID", "weight"]], + on=["District", "Facility_Level"], + how="left", + validate="m:1" + ) + + # Replace old level 1b facilities and recompute weighted availability + # --------------------------------------------------------------------- + # Drop old Level 1b facilities + availability_df = availability_df[ + ~availability_df["Facility_ID"].isin(facilities_by_level.get("1b", [])) + ] + + # Append new 1b facility data + availability_df = pd.concat( + [ + availability_df, + full_set_interpolated_levels1b2[["Facility_ID", "month", "item_code", *available_cols]] + ], + axis=0, + ignore_index=True + ) + + return availability_df + +# Function to count scenarios at any stage of generating the cons availability RF +def get_max_scenario_number(df: pd.DataFrame) -> int: + scenario_nums = [ + int(m.group(1)) + for c in df.columns + if (m := re.match(r"available_prop_scenario(\d+)$", c)) + ] + return max(scenario_nums) if scenario_nums else 0 + +# Function to compute average availability by facility level +def compute_avg_availability_by_var(df: pd.DataFrame = None, # TLO availability dataframe with each row representing one Facility_ID, item, month, + mfl: Optional[pd.DataFrame] = None, # Master Facility list mapping Facility_Level to Faciility_ID + program_item_mapping: Optional[pd.DataFrame] = None, + groupby_var:str = 'month', + available_cols: List[str] = ['available_prop'], # List of availability columns to summarise + label:str = "Average"): + if groupby_var == 'Facility_Level': + # Merge Facility_Level + df = (df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']],on=['Facility_ID'], how='left')) + if groupby_var == 'item_category': + # Merge Program + program_item_mapping = program_item_mapping.rename(columns ={'Item_Code': 
'item_code'})[program_item_mapping.item_category.notna()] + df = df.merge(program_item_mapping, on = ['item_code'], how='left') + + out = ( + df + .groupby(groupby_var)[available_cols] + .mean() + .reset_index() + .melt( + id_vars=groupby_var, + value_vars=available_cols, + var_name="Scenario", + value_name="Average_Availability" + ) + ) + out["Dataset"] = label + return out + +def plot_availability_before_and_after_level1b_fix(old_df: pd.DataFrame = None, + new_df: pd.DataFrame = None, + mfl: pd.DataFrame = None, + available_cols: List[str] = ['available_prop'], # List of availability columns to summarise + save_figure_as:Path = None): + avg_old = compute_avg_availability_by_var(df=old_df, + mfl=mfl, + groupby_var='Facility_Level', + available_cols=available_cols, + label="Original") + + avg_new = compute_avg_availability_by_var(df=new_df, + mfl=mfl, + groupby_var='Facility_Level', + available_cols=available_cols, + label="Updated") + + plot_df = pd.concat([avg_old, avg_new], ignore_index=True) + facility_levels = plot_df["Facility_Level"].unique() + scenarios = plot_df["Scenario"].unique() + + x = np.arange(len(scenarios)) + width = 0.35 + + fig, axes = plt.subplots( + nrows=len(facility_levels), + figsize=(14, 5 * len(facility_levels)), + sharey=True + ) + + if len(facility_levels) == 1: + axes = [axes] + + for ax, fl in zip(axes, facility_levels): + sub = plot_df[plot_df["Facility_Level"] == fl] + + orig = sub[sub["Dataset"] == "Original"].set_index("Scenario").loc[scenarios] + new = sub[sub["Dataset"] == "Updated"].set_index("Scenario").loc[scenarios] + + ax.bar(x - width / 2, orig["Average_Availability"], width, label="Original") + ax.bar(x + width / 2, new["Average_Availability"], width, label="Updated") + + ax.set_title(f"Average Availability by Scenario – Facility Level {fl}") + ax.set_xticks(x) + ax.set_xticklabels(scenarios, rotation=45, ha="right") + ax.set_ylabel("Average Availability") + ax.legend() + + plt.tight_layout() + plt.savefig(save_figure_as) + +def collapse_stockout_data(_df, groupby_list, var): + """Return a dataframe with rows for the same TLO model item code collapsed into 1""" + # Define column lists based on the aggregation function to be applied + columns_to_multiply = [var] + columns_to_sum = ['closing_bal', 'amc', 'dispensed', 'received'] + columns_to_preserve = ['data_source', 'consumable_reporting_freq', 'consumables_reported_in_mth'] + + # Define aggregation function to be applied to collapse data by item + def custom_agg_stkout(x): + if x.name in columns_to_multiply: + return x.prod(skipna=True) if np.any( + x.notnull() & (x >= 0)) else np.nan # this ensures that the NaNs are retained + elif x.name in columns_to_sum: + return x.sum(skipna=True) if np.any( + x.notnull() & (x >= 0)) else np.nan # this ensures that the NaNs are retained + # , i.e. not changed to 1, when the corresponding data for both item name variations are NaN, and when there + # is a 0 or positive value for one or both item name variation, the sum is taken. 
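+        # Illustrative example: closing_bal values of 10 and NaN across the collapsed rows sum
+        # to 10, while all-NaN groups stay NaN rather than becoming 0.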
+ elif x.name in columns_to_preserve: + return x.iloc[0] # this function extracts the first value + + # Collapse dataframe + _collapsed_df = _df.groupby(groupby_list).agg( + {col: custom_agg_stkout for col in columns_to_multiply + columns_to_sum + columns_to_preserve} + ).reset_index() + + return _collapsed_df + +# Functions for interpolation +def get_other_facilities_of_same_level(_fac_id): + """Return a set of facility_id for other facilities that are of the same level as that provided.""" + for v in facilities_by_level.values(): + if _fac_id in v: + return v - {_fac_id} + +def interpolate_missing_with_mean(_ser): + """Return a series in which any values that are null are replaced with the mean of the non-missing.""" + if pd.isnull(_ser).all(): + raise ValueError + return _ser.fillna(_ser.mean()) + +# Function to draw calibration plots at different levels of disaggregation (comparing final TLO data to HHFA) +def comparison_plot(level_of_disaggregation, group_by_var, colour): + comparison_df_agg = comparison_df.groupby([group_by_var], + as_index=False).agg({'available_prop': 'mean', + 'available_prop_hhfa': 'mean', + 'Facility_Level': 'first', + 'consumable_labels': 'first'}) + comparison_df_agg['labels'] = comparison_df_agg[level_of_disaggregation] + + ax = comparison_df_agg.plot.scatter('available_prop', 'available_prop_hhfa', c=colour) + ax.axline([0, 0], [1, 1]) + for i, label in enumerate(comparison_df_agg['labels']): + plt.annotate(label, + (comparison_df_agg['available_prop'][i] + 0.005, + comparison_df_agg['available_prop_hhfa'][i] + 0.005), + fontsize=6, rotation=38) + if level_of_disaggregation != 'aggregate': + plt.title('Disaggregated by ' + level_of_disaggregation, fontsize=size, weight="bold") + else: + plt.title('Aggregate', fontsize=size, weight="bold") + plt.xlabel('Pr(drug available) as per TLO model') + plt.ylabel('Pr(drug available) as per HHFA') + save_name = 'comparison_plots/calibration_to_hhfa_' + level_of_disaggregation + '.png' + plt.savefig(outputfilepath / save_name) + +def comparison_plot_by_level(fac_type): + cond_fac_type = comparison_df['Facility_Level'] == fac_type + comparison_df_by_level = comparison_df[cond_fac_type].reset_index() + plt.scatter(comparison_df_by_level['available_prop'], + comparison_df_by_level['available_prop_hhfa']) + plt.axline([0, 0], [1, 1]) + for i, label in enumerate(comparison_df_by_level['consumable_labels']): + plt.annotate(label, (comparison_df_by_level['available_prop'][i] + 0.005, + comparison_df_by_level['available_prop_hhfa'][i] + 0.005), + fontsize=6, rotation=27) + plt.title(fac_type, fontsize=size, weight="bold") + plt.xlabel('Pr(drug available) as per TLO model') + plt.ylabel('Pr(drug available) as per HHFA') + +# %% +# 1. PREPARE OPENLMIS DATA +######################################################################################################################## +# 1A. Import 2018 data lmis_df = pd.read_csv(path_to_files_in_the_tlo_shared_drive / 'OpenLMIS/2018/ResourceFile_LMIS_2018.csv', low_memory=False) # 1. BASIC CLEANING ## @@ -143,9 +575,7 @@ def change_colnames(df, NameChangeList): # Change column names months_withdata = ['January', 'February', 'April', 'October', 'November'] months_interpolated = ['March', 'May', 'June', 'July', 'August', 'September', 'December'] -# 2. RESHAPE AND REORDER ## -######################################################################################### - +# 1B. 
RESHAPE AND REORDER # Reshape dataframe so that each row represent a unique consumable and facility lmis_df_wide = lmis_df.pivot_table(index=['district', 'fac_type_tlo', 'fac_name', 'program', 'item'], columns='month', values=['closing_bal', 'dispensed', 'received', 'stkout_days', 'amc'], @@ -160,8 +590,7 @@ def change_colnames(df, NameChangeList): # Change column names num = lmis_df_wide._get_numeric_data() lmis_df_wide[num < 0] = np.nan -# 3. INTERPOLATE MISSING VALUES ## -######################################################################################### +# 1C. INTERPOLATE MISSING VALUES ## # When stkout_days is empty but closing balance, dispensed and received entries are available lmis_df_wide_flat = lmis_df_wide.reset_index() count_stkout_entries = lmis_df_wide_flat['stkout_days'].count(axis=1).sum() @@ -237,45 +666,6 @@ def change_colnames(df, NameChangeList): # Change column names # TODO check whether there is any issue with the above items_introduced_in_september which only show up from September # onwards -def rename_items_to_address_inconsistentencies(_df, item_dict): - """Return a dataframe with rows for the same item with inconsistent names collapsed into one""" - # Recode item names appearing from Jan to Aug to the new names adopted from September onwards - old_unique_item_count = _df.item.nunique() - for item in item_dict: - print(len(_df[_df.item == item_dict[item]]), ''' instances of "''', item_dict[item], '''"''' - ''' changed to "''', item, - '''"''') - # row_newname = _df.item == item - row_oldname = _df.item == item_dict[item] - _df.loc[row_oldname, 'item'] = item - - # Make a list of column names to be collapsed using different methods - columns_to_sum = [col for col in _df.columns if - col[0].startswith(('amc', 'closing_bal', 'dispensed', 'received', 'stkout_days'))] - columns_to_preserve = [col for col in _df.columns if - col[0].startswith(('data_source'))] - - # Define aggregation function to be applied to collapse data by item - def custom_agg(x): - if x.name in columns_to_sum: - return x.sum(skipna=True) if np.any( - x.notnull() & (x >= 0)) else np.nan # this ensures that the NaNs are retained - # , i.e. not changed to 0, when the corresponding data for both item name variations are NaN, and when there - # is a 0 or positive value for one or both item name variation, the sum is taken. - elif x.name in columns_to_preserve: - return x.str.cat( - sep='') # for the data_source column, this function concatenates the string values - - # Collapse dataframe - _collapsed_df = _df.groupby(['program', 'item', 'district', 'fac_type_tlo', 'fac_name']).agg( - {col: custom_agg for col in columns_to_preserve + columns_to_sum} - ).reset_index() - - # Test that all items in the dictionary have been found in the dataframe - new_unique_item_count = _collapsed_df.item.nunique() - assert len(item_dict) == old_unique_item_count - new_unique_item_count - return _collapsed_df - # Hold out the dataframe with no naming inconsistencies list_of_items_with_inconsistent_names_zipped = set(zip(inconsistent_item_names_mapping.keys(), inconsistent_item_names_mapping.values())) list_of_items_with_inconsistent_names = [item for sublist in list_of_items_with_inconsistent_names_zipped for item in sublist] @@ -288,7 +678,7 @@ def custom_agg(x): lmis_df_wide_flat = pd.concat([df_without_consistent_item_names_corrected, df_with_consistent_item_names], ignore_index=True) -# --- 3.1 RULE: 1.If i) stockout is missing, ii) closing_bal, amc and received are not missing , and iii) amc !=0 and, +# 1. 
--- RULE: 1. If i) stockout is missing, ii) closing_bal, amc and received are not missing, and iii) amc != 0,
 # then stkout_days[m] = (amc[m] - closing_bal[m-1] - received)/amc * number of days in the month ---
 # (Note that the number of entries for closing balance, dispensed and received is always the same)
 for m in range(2, 13):
@@ -324,7 +714,7 @@ def custom_agg(x):
 count_stkout_entries = lmis_df_wide_flat['stkout_days'].count(axis=1).sum()
 print(count_stkout_entries, "stockout entries after first interpolation")
-# 3.2 --- If any stockout_days < 0 after the above interpolation, update to stockout_days = 0 ---
+# 2. --- If any stockout_days < 0 after the above interpolation, update to stockout_days = 0 ---
 # RULE: If closing balance[previous month] - dispensed[this month] + received[this month] > 0, stockout == 0
 for m in range(1, 13):
     cond1 = lmis_df_wide_flat['stkout_days', months_dict[m]] < 0
@@ -342,7 +732,7 @@ def custom_agg(x):
 # Flatten multilevel columns
 lmis_df_wide_flat.columns = [' '.join(col).strip() for col in lmis_df_wide_flat.columns.values]
-# 3.3 --- If the consumable was previously reported and during a given month, if any consumable was reported, assume
+# 3. --- If the consumable was previously reported and during a given month, if any consumable was reported, assume
 # 100% days of stockout ---
 # RULE: If the balance on a consumable is ever reported and if any consumables are reported during the month, stkout_
 # days = number of days of the month
@@ -370,9 +760,7 @@ def custom_agg(x):
 count_stkout_entries = count_stkout_entries + lmis_df_wide_flat['stkout_days ' + months_dict[m]].count().sum()
 print(count_stkout_entries, "stockout entries after third interpolation")
-# 4. CALCULATE STOCK OUT RATES BY MONTH and FACILITY ##
-#########################################################################################
-
+# 1D. CALCULATE STOCK OUT RATES BY MONTH and FACILITY
 lmis = lmis_df_wide_flat  # choose dataframe
 
 # Generate variables denoting the stockout proportion in each month
@@ -391,10 +779,9 @@ def custom_agg(x):
 sep=' ', suffix=r'\w+')
 lmis = lmis.reset_index()
-# 5. LOAD CLEANED MATCHED CONSUMABLE LIST FROM TLO MODEL AND MERGE WITH LMIS DATA ##
-#########################################################################################
-
-# 5.1 --- Load and clean data ---
+# 2. LOAD CLEANED MATCHED CONSUMABLE LIST FROM TLO MODEL AND MERGE WITH LMIS DATA
+########################################################################################################################
+# 1. --- Load and clean data ---
 # Import matched list of consumables
 consumables_df = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_consumables_matched.csv', low_memory=False,
                              encoding="ISO-8859-1")
@@ -413,49 +800,12 @@ def custom_agg(x):
 # Update matched consumable name where the name in the OpenLMIS data was updated in September
-def replace_old_item_names_in_lmis_data(_df, item_dict):
-    """Return a dataframe with old LMIS consumable names replaced with the new name"""
-    for item in item_dict:
-        cond_oldname = _df.item == item_dict[item]
-        _df.loc[cond_oldname, 'item'] = item
-    return _df
-
-
 matched_consumables = replace_old_item_names_in_lmis_data(matched_consumables, inconsistent_item_names_mapping)
-# 5.2 --- Merge data with LMIS data ---
+# 2. 
--- Merge data with LMIS data --- #
 lmis_matched_df = pd.merge(lmis, matched_consumables, how='inner', on=['item'])
 lmis_matched_df = lmis_matched_df.sort_values('data_source')
-
-def collapse_stockout_data(_df, groupby_list, var):
-    """Return a dataframe with rows for the same TLO model item code collapsed into 1"""
-    # Define column lists based on the aggregation function to be applied
-    columns_to_multiply = [var]
-    columns_to_sum = ['closing_bal', 'amc', 'dispensed', 'received']
-    columns_to_preserve = ['data_source', 'consumable_reporting_freq', 'consumables_reported_in_mth']
-
-    # Define aggregation function to be applied to collapse data by item
-    def custom_agg_stkout(x):
-        if x.name in columns_to_multiply:
-            return x.prod(skipna=True) if np.any(
-                x.notnull() & (x >= 0)) else np.nan  # this ensures that the NaNs are retained
-        elif x.name in columns_to_sum:
-            return x.sum(skipna=True) if np.any(
-                x.notnull() & (x >= 0)) else np.nan  # this ensures that the NaNs are retained
-        # , i.e. not changed to 1, when the corresponding data for both item name variations are NaN, and when there
-        # is a 0 or positive value for one or both item name variation, the sum is taken.
-        elif x.name in columns_to_preserve:
-            return x.iloc[0]  # this function extracts the first value
-
-    # Collapse dataframe
-    _collapsed_df = _df.groupby(groupby_list).agg(
-        {col: custom_agg_stkout for col in columns_to_multiply + columns_to_sum + columns_to_preserve}
-    ).reset_index()
-
-    return _collapsed_df
-
-
 # 2.i. For substitutable drugs (within drug category), collapse by taking the product of stkout_prop (OR condition)
 # This represents Pr(all substitutes with the item code are stocked out)
 groupby_list1 = ['module_name', 'district', 'fac_type_tlo', 'fac_name', 'month', 'item_code', 'consumable_name_tlo',
@@ -503,10 +853,9 @@ def custom_agg_stkout(x):
 'available_prop', 'closing_bal', 'amc', 'dispensed', 'received', 'data_source',
 'consumable_reporting_freq', 'consumables_reported_in_mth']]
-# 6. ADD STOCKOUT DATA FROM OTHER SOURCES TO COMPLETE STOCKOUT DATAFRAME ##
-#########################################################################################
-
-# --- 6.1. Generate a dataframe of stock availability for consumables which were not found in the OpenLMIS data but
+# 3. ADD STOCKOUT DATA FROM OTHER SOURCES TO COMPLETE STOCKOUT DATAFRAME
+########################################################################################################################
+# --- 1. Generate a dataframe of stock availability for consumables which were not found in the OpenLMIS data but
 # available in the HHFA 2018/19 --- #
 # Save the list of items for which a match was not found in the OpenLMIS data
 unmatched_consumables = consumables_df.drop_duplicates(['item_code'])
@@ -582,13 +931,13 @@ def custom_agg_stkout(x):
 ('available_prop_hhfa', 'available_prop')]
 change_colnames(unmatched_consumables_df, NameChangeList)
-# --- 6.2 Append OpenLMIS stockout dataframe with HHFA stockout dataframe and Extract in .csv format --- #
+# --- 2. Append OpenLMIS stockout dataframe with HHFA stockout dataframe and Extract in .csv format --- #
 # Append common consumables stockout dataframe with the main dataframe
 cond = unmatched_consumables_df['available_prop'].notna()
 unmatched_consumables_df.loc[~cond, 'data_source'] = 'Not available'
 stkout_df = pd.concat([stkout_df, unmatched_consumables_df], axis=0, ignore_index=True)
-# --- 6.3 Append stockout rate for facility level 0 from HHFA --- #
+# --- 3. 
Append stockout rate for facility level 0 from HHFA --- # cond = hhfa_df['item_code'].notna() hhfa_fac0 = hhfa_df[cond][ ['item_code', 'consumable_name_tlo', 'fac_count_Facility_level_0', 'available_prop_hhfa_Facility_level_0']] @@ -605,47 +954,7 @@ def custom_agg_stkout(x): stkout_df = stkout_df[~cond] stkout_df = pd.concat([stkout_df, hhfa_fac0], axis=0, ignore_index=True) -# --- 6.4 Generate new category variable for analysis --- # -def recategorize_modules_into_consumable_categories(_df): - _df['item_category'] = _df['module_name'].str.lower() - cond_RH = (_df['item_category'].str.contains('care_of_women_during_pregnancy')) | \ - (_df['item_category'].str.contains('labour')) - cond_newborn = (_df['item_category'].str.contains('newborn')) - cond_newborn[cond_newborn.isna()] = False - cond_childhood = (_df['item_category'] == 'acute lower respiratory infections') | \ - (_df['item_category'] == 'measles') | \ - (_df['item_category'] == 'diarrhoea') - cond_rti = _df['item_category'] == 'road traffic injuries' - cond_cancer = _df['item_category'].str.contains('cancer') - cond_cancer[cond_cancer.isna()] = False - cond_ncds = (_df['item_category'] == 'epilepsy') | \ - (_df['item_category'] == 'depression') - _df.loc[cond_RH, 'item_category'] = 'reproductive_health' - _df.loc[cond_cancer, 'item_category'] = 'cancer' - _df.loc[cond_newborn, 'item_category'] = 'neonatal_health' - _df.loc[cond_childhood, 'item_category'] = 'other_childhood_illnesses' - _df.loc[cond_rti, 'item_category'] = 'road_traffic_injuries' - _df.loc[cond_ncds, 'item_category'] = 'ncds' - cond_condom = _df['item_code'] == 2 - _df.loc[cond_condom, 'item_category'] = 'contraception' - - # Create a general consumables category - general_cons_list = [300, 33, 57, 58, 141, 5, 6, 10, 21, 23, 127, 24, 80, 93, 144, 149, 154, 40, 67, 73, 76, - 82, 101, 103, 88, 126, 135, 71, 98, 171, 133, 134, 244, 247, 49, 112, 1933, 1960] - cond_general = _df['item_code'].isin(general_cons_list) - _df.loc[cond_general, 'item_category'] = 'general' - - # Fill gaps in categories - dict_for_missing_categories = {292: 'acute lower respiratory infections', 293: 'acute lower respiratory infections', - 307: 'reproductive_health', 2019: 'reproductive_health', - 2678: 'tb', 1171: 'other_childhood_illnesses', 1237: 'cancer', 1239: 'cancer'} - # Use map to create a new series from item_code to fill missing values in category - mapped_categories = _df['item_code'].map(dict_for_missing_categories) - # Use fillna on the 'item_category' column to fill missing values using the mapped_categories - _df['item_category'] = _df['item_category'].fillna(mapped_categories) - - return _df - +# --- 4. Generate new category variable for analysis --- # stkout_df = recategorize_modules_into_consumable_categories(stkout_df) item_code_category_mapping = stkout_df[['item_category', 'item_code']].drop_duplicates() @@ -655,12 +964,12 @@ def recategorize_modules_into_consumable_categories(_df): item_designations = item_designations.merge(item_code_category_mapping, left_on = 'Item_Code', right_on = 'item_code', how = 'left', validate = '1:1') item_designations.drop(columns = 'item_code').to_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv', index = False) -# --- 6.5 Replace district/fac_name/month entries where missing --- # +# --- 5. 
Replace district/fac_name/month entries where missing --- # for var in ['district', 'fac_name', 'month']: cond = stkout_df[var].isna() stkout_df.loc[cond, var] = 'Aggregate' -# --- 6.6 Export final stockout dataframe --- # +# --- 6. Export final stockout dataframe --- # # stkout_df.to_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_and_usage.csv") # <-- this line commented out as the file is very large. @@ -670,26 +979,6 @@ def recategorize_modules_into_consumable_categories(_df): lmis_consumable_usage = stkout_df.copy() # TODO Generate a smaller version of this file # Collapse individual facilities -def get_inflow_to_outflow_ratio_by_item_and_facilitylevel(_df): - df_by_item_level_month = \ - _df.groupby(['item_category', 'item_code', 'district', 'fac_type_tlo', 'month'])[ - ['closing_bal', 'dispensed', 'received']].sum() - df_by_item_level_month = df_by_item_level_month.loc[df_by_item_level_month.index.get_level_values('month') != "Aggregate"] - # Opening balance in January is the closing balance for the month minus what was received during the month plus what was dispensed - opening_bal_january = df_by_item_level_month.loc[df_by_item_level_month.index.get_level_values('month') == 'January', 'closing_bal'] + \ - df_by_item_level_month.loc[df_by_item_level_month.index.get_level_values('month') == 'January', 'dispensed'] - \ - df_by_item_level_month.loc[df_by_item_level_month.index.get_level_values('month') == 'January', 'received'] - closing_bal_december = df_by_item_level_month.loc[df_by_item_level_month.index.get_level_values('month') == 'December', 'closing_bal'] - # the consumable inflow during the year is the opening balance in January + what was received throughout the year - what was transferred to the next year (i.e. closing bal of December) - total_consumables_inflow_during_the_year = df_by_item_level_month['received'].groupby(level=['item_category', 'item_code', 'district', 'fac_type_tlo']).sum() +\ - opening_bal_january.reset_index(level='month', drop=True) -\ - closing_bal_december.reset_index(level='month', drop=True) - total_consumables_outflow_during_the_year = df_by_item_level_month['dispensed'].groupby(level=['item_category', 'item_code', 'district', 'fac_type_tlo']).sum() - inflow_to_outflow_ratio = total_consumables_inflow_during_the_year.div(total_consumables_outflow_during_the_year, fill_value=1) - inflow_to_outflow_ratio.loc[inflow_to_outflow_ratio < 1] = 1 # Ratio can't be less than 1 - - return inflow_to_outflow_ratio - inflow_to_outflow_ratio = get_inflow_to_outflow_ratio_by_item_and_facilitylevel(lmis_consumable_usage) # Clean values for analysis inflow_to_outflow_ratio.loc[inflow_to_outflow_ratio < 1] = 1 # Ratio can't be less than 1 @@ -711,7 +1000,6 @@ def get_inflow_to_outflow_ratio_by_item_and_facilitylevel(_df): # the Master Facilities List. # unify the set within each facility_id - mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") districts = set(pd.read_csv(resourcefilepath / 'demography' / 'ResourceFile_Population_2010.csv')['District']) fac_levels = {'0', '1a', '1b', '2', '3', '4'} @@ -737,7 +1025,8 @@ def get_inflow_to_outflow_ratio_by_item_and_facilitylevel(_df): # Take averages (now that 'Mzimba' is mapped-to by both 'Mzimba South' and 'Mzimba North'.) sf = sf.groupby(by=['district_std', 'fac_type_tlo', 'month', 'item_code'])['available_prop'].mean().reset_index() -# Fill in missing data: +# 4. 
INTERPOLATE MISSING DATA TO ENSURE DATA IS AVAILABLE FOR ALL ITEMS, MONTHS, LEVELS, DISTRICTS +######################################################################################################################## # 1) Cities to get same results as their respective regions copy_source_to_destination = { 'Mzimba': 'Mzuzu City', @@ -765,7 +1054,33 @@ def get_inflow_to_outflow_ratio_by_item_and_facilitylevel(_df): mwanza_1b = sf.loc[(sf.district_std == 'Mwanza') & (sf.fac_type_tlo == '1a')].copy().assign(fac_type_tlo='1b') sf = pd.concat([sf, mwanza_1b], axis=0, ignore_index=True) -# 4) Copy all the results to create a level 0 with an availability equal to half that in the respective 1a +# 4) Update the availability Xpert (item_code = 187) +# First add rows for Xpert at level 1b by cloning rows for level 2 -> only if not already present +xpert_item = sf['item_code'].eq(187) +level_2 = sf['fac_type_tlo'].eq('2') +level_1b = sf['fac_type_tlo'].eq('1b') + +# Clone rows from level 2 +base = sf.loc[level_2 & xpert_item].copy() +new_rows = base.copy() +new_rows['fac_type_tlo'] = '1b' + +# Add rows to main availability dataframe and drop duplicates, if any +sf = pd.concat([sf, new_rows], ignore_index=True) +id_cols = [c for c in sf.columns if c != 'available_prop'] +dupe_mask = sf.duplicated(subset=id_cols, keep=False) +dupes = sf.loc[dupe_mask].sort_values(id_cols) +sf = sf.drop_duplicates(subset=id_cols, keep='first').reset_index(drop=True) + +# Compute the average availability Sep–Dec (months >= 9) for level 2, item 187 +sep_to_dec = sf['month'].ge(9) +new_xpert_availability = sf.loc[level_2 & xpert_item & sep_to_dec, 'available_prop'].mean() +# Assign new availability to relevant facility levels +levels_1b_2_or_3 = sf['fac_type_tlo'].isin(['1b', '2', '3']) +xpert_item = sf['item_code'].eq(187) +sf.loc[levels_1b_2_or_3 & xpert_item, 'available_prop'] = new_xpert_availability + +# 5) Copy all the results to create a level 0 with an availability equal to half that in the respective 1a all_1a = sf.loc[sf.fac_type_tlo == '1a'] all_0 = sf.loc[sf.fac_type_tlo == '1a'].copy().assign(fac_type_tlo='0') all_0.available_prop *= 0.5 @@ -811,26 +1126,10 @@ def get_inflow_to_outflow_ratio_by_item_and_facilitylevel(_df): full_set = full_set.combine_first(sf_final.set_index(['Facility_ID', 'month', 'item_code'])['available_prop']) # Fill in the blanks with rules for interpolation. - facilities_by_level = defaultdict(set) for ix, row in mfl.iterrows(): facilities_by_level[row['Facility_Level']].add(row['Facility_ID']) - -def get_other_facilities_of_same_level(_fac_id): - """Return a set of facility_id for other facilities that are of the same level as that provided.""" - for v in facilities_by_level.values(): - if _fac_id in v: - return v - {_fac_id} - - -def interpolate_missing_with_mean(_ser): - """Return a series in which any values that are null are replaced with the mean of the non-missing.""" - if pd.isnull(_ser).all(): - raise ValueError - return _ser.fillna(_ser.mean()) - - # Create new dataset that include the interpolations (The operation is not done "in place", because the logic is based # on what results are missing before the interpolations in other facilities). 
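# Multiplying by np.nan below yields an all-NaN series that preserves the full
# (Facility_ID, month, item_code) MultiIndex of `full_set`, i.e. a blank
# template into which the interpolated values are then written.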
full_set_interpolated = full_set_interpolated.reset_index() @@ -877,20 +1176,63 @@ def interpolate_missing_with_mean(_ser): full_set_interpolated = full_set_interpolated.reset_index() #full_set_interpolated = full_set_interpolated.reset_index().merge(item_code_category_mapping, on = 'item_code', how = 'left', validate = 'm:1') +full_set_interpolated.to_csv( + path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv", + index=False +) # Save as .csv - this file is then read back in to apply generate_alternative_availability_scenarios +# and generate_redistribution_scenarios + +# 5. ADD ALTERNATIVE AVAILABILITY SCENARIOS +######################################################################################################################## +# Add alternative availability scenarios to represent improved or reduced consumable availability +full_set_interpolated_with_scenarios = generate_alternative_availability_scenarios(full_set_interpolated) +max_scenario = get_max_scenario_number(full_set_interpolated_with_scenarios) # Get current scenario count +full_set_interpolated_with_scenarios = generate_redistribution_scenarios(full_set_interpolated_with_scenarios, + scenario_count = max_scenario, + outputfilepath = Path("./outputs/consumables_impact_analysis")) +available_cols = [c for c in full_set_interpolated_with_scenarios.columns if c.startswith("available_prop")] +full_set_interpolated_with_scenarios = full_set_interpolated_with_scenarios[['Facility_ID', 'month', 'item_code'] + available_cols] + +full_set_interpolated_with_scenarios_level1b_fixed = update_level1b_availability( + availability_df=full_set_interpolated_with_scenarios, + facilities_by_level=facilities_by_level, + resourcefilepath=resourcefilepath, + district_to_city_dict=copy_source_to_destination, + weighting = 'district_1b_to_2_ratio', +) +# Verify that this dataframe covers the same facilities, months and items, and has the same number of rows, as the original availability dataframe +assert sorted(set(full_set_interpolated_with_scenarios_level1b_fixed.Facility_ID)) == sorted(set(pd.unique(full_set_interpolated.Facility_ID))) +assert sorted(set(full_set_interpolated_with_scenarios_level1b_fixed.month)) == sorted(set(pd.unique(full_set_interpolated.month))) +assert sorted(set(full_set_interpolated_with_scenarios_level1b_fixed.item_code)) == sorted(set(pd.unique(full_set_interpolated.item_code))) +assert len(full_set_interpolated_with_scenarios_level1b_fixed) == len(full_set_interpolated) # same number of rows + +# Compare availability averages by Facility_Level before and after the 1b fix +level1b_fix_plots_path = outputfilepath / 'comparison_plots' +figurespath_scenarios = outputfilepath / 'consumable_scenarios' +if not os.path.exists(level1b_fix_plots_path): + os.makedirs(level1b_fix_plots_path) +plot_availability_before_and_after_level1b_fix(old_df = full_set_interpolated_with_scenarios, + new_df = full_set_interpolated_with_scenarios_level1b_fixed, + mfl = mfl, + available_cols = available_cols, # List of availability columns to summarise + save_figure_as = level1b_fix_plots_path / 'availability_before_and_after_level1b_fix.png') + +# 6. CHECK FORMAT AND SAVE AS RESOURCEFILE +######################################################################################################################## + # --- Check that the exported file has the properties required of it by the model code. 
--- # -check_format_of_consumables_file(df=full_set_interpolated, fac_ids=fac_ids) +check_format_of_consumables_file(df=full_set_interpolated_with_scenarios_level1b_fixed, fac_ids=fac_ids) # %% # Save -full_set_interpolated.to_csv( +full_set_interpolated_with_scenarios_level1b_fixed.to_csv( path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv", index=False ) # %% -# 7. COMPARISON WITH HHFA DATA, 2018/19 ## -######################################################################################### +# 7. COMPARISON WITH HHFA DATA, 2018/19 +######################################################################################################################## # --- 7.1 Prepare comparison dataframe --- ## # Note that this only plot consumables for which data is available in the HHFA # i. Prepare data from HHFA @@ -905,7 +1247,7 @@ def interpolate_missing_with_mean(_ser): hhfa_comparison_df = hhfa_comparison_df.rename({'fac_type_tlo': 'Facility_Level'}, axis=1) # ii. Collapse final model availability data by facility level -final_availability_df = full_set_interpolated +final_availability_df = full_set_interpolated_with_scenarios_level1b_fixed mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") final_availability_df = pd.merge(final_availability_df, mfl[['District', 'Facility_Level', 'Facility_ID']], how="left", on=['Facility_ID'], @@ -927,39 +1269,7 @@ def interpolate_missing_with_mean(_ser): size = 10 comparison_df['consumable_labels'] = comparison_df['consumable_name_tlo'].str[:10] -# Define function to draw calibration plots at different levels of disaggregation -def comparison_plot(level_of_disaggregation, group_by_var, colour): - comparison_df_agg = comparison_df.groupby([group_by_var], - as_index=False).agg({'available_prop': 'mean', - 'available_prop_hhfa': 'mean', - 'Facility_Level': 'first', - 'consumable_labels': 'first'}) - comparison_df_agg['labels'] = comparison_df_agg[level_of_disaggregation] - - ax = comparison_df_agg.plot.scatter('available_prop', 'available_prop_hhfa', c=colour) - ax.axline([0, 0], [1, 1]) - for i, label in enumerate(comparison_df_agg['labels']): - plt.annotate(label, - (comparison_df_agg['available_prop'][i] + 0.005, - comparison_df_agg['available_prop_hhfa'][i] + 0.005), - fontsize=6, rotation=38) - if level_of_disaggregation != 'aggregate': - plt.title('Disaggregated by ' + level_of_disaggregation, fontsize=size, weight="bold") - else: - plt.title('Aggregate', fontsize=size, weight="bold") - plt.xlabel('Pr(drug available) as per TLO model') - plt.ylabel('Pr(drug available) as per HHFA') - save_name = 'comparison_plots/calibration_to_hhfa_' + level_of_disaggregation + '.png' - plt.savefig(outputfilepath / save_name) - - # 7.2.1 Aggregate plot -# First create folder in which to store the plots - -if not os.path.exists(outputfilepath / 'comparison_plots'): - os.makedirs(outputfilepath / 'comparison_plots') - print("folder to store Model-HHFA comparison plots created") - comparison_df['aggregate'] = 'aggregate' level_of_disaggregation = 'aggregate' colour = 'red' @@ -978,23 +1288,7 @@ def comparison_plot(level_of_disaggregation, group_by_var, colour): colour = 'yellow' comparison_plot(level_of_disaggregation, group_by_var, colour) - # 7.2.4 Plot by item and facility level -def comparison_plot_by_level(fac_type): - cond_fac_type = comparison_df['Facility_Level'] == fac_type - comparison_df_by_level = comparison_df[cond_fac_type].reset_index() - 
plt.scatter(comparison_df_by_level['available_prop'], - comparison_df_by_level['available_prop_hhfa']) - plt.axline([0, 0], [1, 1]) - for i, label in enumerate(comparison_df_by_level['consumable_labels']): - plt.annotate(label, (comparison_df_by_level['available_prop'][i] + 0.005, - comparison_df_by_level['available_prop_hhfa'][i] + 0.005), - fontsize=6, rotation=27) - plt.title(fac_type, fontsize=size, weight="bold") - plt.xlabel('Pr(drug available) as per TLO model') - plt.ylabel('Pr(drug available) as per HHFA') - - fig = plt.figure(figsize=(22, 22)) plt.subplot(421) comparison_plot_by_level(comparison_df['Facility_Level'].unique()[1]) @@ -1005,3 +1299,36 @@ def comparison_plot_by_level(fac_type): plt.subplot(424) comparison_plot_by_level(comparison_df['Facility_Level'].unique()[4]) plt.savefig(outputfilepath / 'comparison_plots/calibration_to_hhfa_fac_type_and_consumable.png') + +# %% +# 8. PLOT SCENARIO SUMMARIES +######################################################################################################################## +# Create the directory if it doesn't exist +figurespath_scenarios = outputfilepath / 'consumable_scenarios' +if not os.path.exists(figurespath_scenarios): + os.makedirs(figurespath_scenarios) + +chosen_availability_columns = [c for c in full_set_interpolated_with_scenarios_level1b_fixed.columns if c.startswith("available_prop")] +scenario_names_dict = {'available_prop': 'Actual', 'available_prop_scenario1': 'Non-therapeutic \n consumables', 'available_prop_scenario2': 'Vital medicines', + 'available_prop_scenario3': 'Pharmacist-\n managed', 'available_prop_scenario4': 'Level 1b', 'available_prop_scenario5': 'CHAM', + 'available_prop_scenario6': '75th percentile\n facility', 'available_prop_scenario7': '90th percentile \n facility', 'available_prop_scenario8': 'Best \n facility', + 'available_prop_scenario9': 'Best facility \n (including DHO)','available_prop_scenario10': 'HIV supply \n chain', 'available_prop_scenario11': 'EPI supply \n chain', + 'available_prop_scenario12': 'HIV moved to \n Govt supply chain \n (Avg by Level)', 'available_prop_scenario13': 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID)', + 'available_prop_scenario14': 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID times 1.25)', + 'available_prop_scenario15': 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID times 0.75)', + 'available_prop_scenario16': 'Redistribution: District pooling', 'available_prop_scenario17': 'Redistribution: Cluster pooling', + 'available_prop_scenario18': 'Redistribution: Pairwise (large radius)', 'available_prop_scenario19': 'Redistribution: Pairwise (small radius)', +} + +# Generate descriptive plots of consumable availability +program_item_mapping = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv')[['Item_Code', 'item_category']] +program_item_mapping = program_item_mapping.rename(columns ={'Item_Code': 'item_code'})[program_item_mapping.item_category.notna()] +generate_descriptive_consumable_availability_plots(tlo_availability_df = full_set_interpolated_with_scenarios_level1b_fixed, + figurespath = figurespath_scenarios, + mfl = mfl, + program_item_mapping = program_item_mapping, + chosen_availability_columns = chosen_availability_columns, + scenario_names_dict = scenario_names_dict,) + + + diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py 
b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py new file mode 100644 index 0000000000..634451295d --- /dev/null +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py @@ -0,0 +1,2153 @@ +import datetime +from pathlib import Path +import pickle +import calendar + +import matplotlib.pyplot as plt +import seaborn as sns +import numpy as np +import pandas as pd +import time +import matplotlib.patches as mpatches +import matplotlib.lines as mlines + +from typing import Literal, Optional, Dict, Tuple, Iterable +import textwrap +from functools import reduce +import requests +from collections import defaultdict + +from pulp import LpProblem, LpMaximize, LpVariable, LpBinary, LpStatus, value, lpSum, LpContinuous, PULP_CBC_CMD +from math import ceil + +# define a timestamp for script outputs +timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M") + +# print the start time of the script +print('Script Start', datetime.datetime.now().strftime('%H:%M')) + +# define folder pathways +outputfilepath = Path("./outputs/consumables_impact_analysis") +resourcefilepath = Path("./resources") +path_for_new_resourcefiles = resourcefilepath / "healthsystem/consumables" +# Set local shared drive source +path_to_share = Path( # <-- point to the shared folder + '/Users/sm2511/CloudStorage/OneDrive-SharedLibraries-ImperialCollegeLondon/TLOModel - WP - Documents/' +) + +def generate_redistribution_scenarios(tlo_availability_df: pd.DataFrame, + scenario_count: int, + outputfilepath: Path = Path("./outputs/consumables_impact_analysis")) -> pd.DataFrame: + # 1. Import and clean data files + #********************************** + # Import cleaned OpenLMIS data from 2018 + lmis = (pd.read_csv(outputfilepath / "ResourceFile_Consumables_availability_and_usage.csv") + [['district', 'fac_type_tlo', 'fac_name', 'month', 'item_code', 'available_prop', + 'closing_bal', 'amc', 'dispensed', 'received']]) + + # Drop duplicated facility, item, month combinations + print(lmis.shape, "rows before collapsing duplicates") + key_cols = ["district", "item_code", "fac_name", "month"] # keys that define a unique record + + # helper to keep one facility level per group (mode → most common; fallback to first non-null) + def _mode_or_first(s: pd.Series): + s = s.dropna() + if s.empty: + return np.nan + m = s.mode() + return m.iloc[0] if not m.empty else s.iloc[0] + + lmis = ( + lmis + .groupby(key_cols, as_index=False) + .agg( + closing_bal=("closing_bal", "sum"), + dispensed=("dispensed", "sum"), + received=("received", "sum"), + amc=("amc", "sum"), + available_prop=("available_prop", "mean"), + fac_type_tlo=("fac_type_tlo", _mode_or_first), # optional; remove if not needed + ) + ) + + print(lmis.shape, "rows after collapsing duplicates") + + # Import data on facility location + location = (pd.read_excel(path_to_share / "07 - Data/Facility_GPS_Coordinates/gis_data_for_openlmis/LMISFacilityLocations_raw.xlsx") + [['LMIS Facility List', 'LATITUDE', 'LONGITUDE']]) + # Find duplicates in facility names in the location dataset + duplicates = location[location['LMIS Facility List'].duplicated(keep=False)] + location = location.drop(duplicates[duplicates['LATITUDE'].isna()].index).reset_index(drop=True) # Drop those duplicates where location is missing + # Import ownership data + ownership = (pd.read_csv(path_to_share / "07 - Data/Consumables data/OpenLMIS/lmis_facility_ownership.csv"))[['fac_name', 'fac_owner']] + ownership = ownership.drop_duplicates(subset=['fac_name']) + + # Merge OpenLMIS data with location and ownership data + lmis = lmis.merge(location, left_on='fac_name', right_on = 'LMIS Facility List', how = 'left', validate='m:1') + lmis = lmis.merge(ownership, on='fac_name', how = 'left', validate='m:1') + lmis.rename(columns = {'LATITUDE':'lat', 'LONGITUDE':'long', 'fac_type_tlo': 'Facility_Level'}, inplace = True) + + # Clean dates to match the format used in the consumable availability ResourceFile in the TLO model + month_map = { + "January": 1, "February": 2, "March": 3, "April": 4, + "May": 5, "June": 6, "July": 7, "August": 8, + "September": 9, "October": 10, "November": 11, "December": 12 + } + lmis["month"] = lmis["month"].map(month_map) + lmis["Facility_Level"] = lmis["Facility_Level"].str.replace("Facility_level_", "", regex=False) + + # Clean data types before analysis + # 1) Normalize fac_name + lmis["fac_name"] = ( + lmis["fac_name"] + .astype("string") # Pandas string dtype (not just object) + .str.normalize("NFKC") # unify unicode forms + .str.strip() # trim leading/trailing spaces + .str.replace(r"\s+", "_", regex=True) # collapse internal whitespace + ) + + # 2) Normalize other key columns used in grouping/joins + lmis["item_code"] = lmis["item_code"].astype("string").str.strip() + lmis["district"] = lmis["district"].astype("string").str.strip().str.replace(r"\s+", "_", regex=True) + lmis["Facility_Level"] = lmis["Facility_Level"].astype("string").str.strip() + + # 3) Ensure numeric types (quietly coerce bad strings to NaN) + lmis["amc"] = pd.to_numeric(lmis["amc"], errors="coerce") + lmis["closing_bal"] = pd.to_numeric(lmis["closing_bal"], errors="coerce") + + # Keep only those facilities whose location is available + old_facility_count = lmis.fac_name.nunique() + lmis = lmis[lmis.lat.notna()] + new_facility_count = lmis.fac_name.nunique() + print(f"{old_facility_count - new_facility_count} facilities out of {old_facility_count} in the lmis data dropped due to " + f"missing location information") + + # Derive opening balances and reconcile them with reported availability + def compute_opening_balance(df: pd.DataFrame) -> pd.Series: + """ + Compute opening balance from same-month records. + + Formula: + OB = closing_bal - received + dispensed + Any negative OB values are replaced with 0. + Equivalent to: OB_m = CB_(m-1) + """ + ob = df["closing_bal"] - df["received"] + df["dispensed"] + return ob.clip(lower=0) + + # 1. 
Compute opening balance + lmis["opening_bal"] = compute_opening_balance(lmis).replace([np.inf, -np.inf], np.nan).fillna(0.0) + # Mechanistic probability (p_mech = OB / AMC) + amc_safe = np.maximum(1e-6, lmis["amc"].astype(float)) + lmis["p_mech"] = np.clip(lmis["opening_bal"] / amc_safe, 0.0, 1.0) + # Identify inconsistent rows (where reported p > mechanistic p) + mask_inconsistent = lmis["p_mech"] < lmis["available_prop"] + # Adjust opening balance upward to match reported availability + lmis.loc[mask_inconsistent, "opening_bal"] = ( + lmis.loc[mask_inconsistent, "available_prop"] * lmis.loc[mask_inconsistent, "amc"] + ) + print(f"Adjusted {mask_inconsistent.sum():,} rows " + f"({mask_inconsistent.mean()*100:.2f}%) where recorded availability " + f"exceeded mechanistic availability.") + + lmis.reset_index(inplace=True, drop = True) + + # ---------------------------------------------------------------------------------------------------------------------- + # 1) Data exploration + # ---------------------------------------------------------------------------------------------------------------------- + def generate_stock_adequacy_heatmap( + df: pd.DataFrame, + figures_path: Path = Path("figures"), + filename: str = "heatmap_adequacy_opening_vs_3xamc.png", + y_var: str = "district", # the variable on the y-axis of the heatmap + value_var: str = "item_code", # the count variable on the basis of which the values are calculated + value_label: str = "", # label describing the values in the heatmap + include_missing_as_fail: bool = False, # if True, items with NaN opening/amc count as NOT adequate + amc_threshold: float = 3.0, + compare: str = "ge" , # "ge" for >= threshold*AMC, "le" for <= threshold*AMC + decimals: int = 0, + cmap: str = "RdYlGn", + figsize= None, + xtick_rotation: int = 45, + ytick_rotation: int = 0, + annotation: bool = True, + ): + """ + Heatmap values: for each (month, district), the % of item_code groups where + sum(opening_balance over Facility_ID) >= 3 * sum(amc over Facility_ID). + """ + df = df.copy() + + # --- 1. Ensure month is int and build label --- + df["month"] = pd.to_numeric(df["month"], errors="coerce").astype("Int64") + df = df.dropna(subset=["month"]) + df["month"] = df["month"].astype(int) + + df["_month_label"] = df["month"].map(lambda m: calendar.month_abbr[m]) + + # ---- 2) Aggregate to (month, district, item_code) over facilities ---- + agg = ( + df.groupby(["month", "_month_label", y_var, value_var], dropna=False) + .agg(opening_bal=("opening_bal", "sum"), + amc=("amc", "sum")) + .reset_index() + ) + + # Keep: + # - 1. all rows where amc != 0 + # - 2. rows where the (fac_name, item_code) pair never had any non-zero amc + # (because this would indicate that their AMC may in fact be zero) + # - 3. 
rows where both Opening balance and AMC are not zero + agg = agg[(agg["amc"] != 0)] + agg = agg[~((agg["amc"] == 0) & (agg["opening_bal"] == 0))] + + # ---- 3) Adequacy indicator per (month, district, item_code) ---- + if include_missing_as_fail: + # NaNs treated as fail -> fill with NaN-safe compare: set False when either missing + ok = agg[["opening_bal", "amc"]].notna().all(axis=1) + left = agg["opening_bal"] + right = amc_threshold * agg["amc"] + if compare == "le": + cond = (left <= right) & ok + else: # default to ">=" + cond = (left >= right) & ok + else: + valid = agg.dropna(subset=["opening_bal", "amc"]) + cond = pd.Series(False, index=agg.index) + left = valid["opening_bal"] + right = amc_threshold * valid["amc"] + if compare == "le": + cond.loc[valid.index] = left <= right + else: + cond.loc[valid.index] = left >= right + + agg["condition_met"] = cond.astype(int) + + # --- % meeting condition per (month, district) across item_code --- + if include_missing_as_fail: + denom = agg.groupby(["month", "_month_label", y_var])[value_var].nunique() + numer = agg.groupby(["month", "_month_label", y_var])["condition_met"].sum() + else: + valid_mask = agg[["opening_bal", "amc"]].notna().all(axis=1) + denom = agg[valid_mask].groupby(["month", "_month_label", y_var])[value_var].nunique() + numer = agg[valid_mask].groupby(["month", "_month_label", y_var])["condition_met"].sum() + + pct = (numer / denom * 100).replace([np.inf, -np.inf], np.nan).reset_index(name="pct_meeting") + + # ---- 5) Pivot: districts (rows) x months (columns) ---- + # Sort months by _month_sort and use _month_label as the displayed column name + month_order = ( + pct[["month", "_month_label"]] + .drop_duplicates() + .sort_values("month") + ["_month_label"] + .tolist() + ) + heatmap_df = pct.pivot(index=y_var, columns="_month_label", values="pct_meeting") + heatmap_df = heatmap_df.reindex(columns=month_order) + + # --- Add average row and column --- + # Column average (mean of each month) + heatmap_df.loc["Average"] = heatmap_df.mean(axis=0) + # Row average (mean of each y_var) + heatmap_df["Average"] = heatmap_df.mean(axis=1) + + # Fix overall average (bottom-right) + overall_avg = heatmap_df.loc["Average", "Average"] + heatmap_df.loc["Average", "Average"] = overall_avg + + # Optional rounding for nicer colorbar ticks (doesn't affect color) + if decimals is not None: + heatmap_df = heatmap_df.round(decimals) + + # --- Dynamic figure size --- + if figsize is None: + n_rows = len(heatmap_df) + n_cols = len(heatmap_df.columns) + height = max(6, n_rows * 0.2) # taller if many rows + width = max(8, n_cols * 0.6) + figsize = (width, height) + + # ---- 6) Plot heatmap ---- + sns.set(font_scale=1.0) + fig, ax = plt.subplots(figsize=figsize) + + cbar_kws = value_label + hm = sns.heatmap( + heatmap_df, + cmap=cmap, + cbar_kws={"label": value_label}, + ax=ax, + annot=annotation, annot_kws={"size": 10}, + vmin = 0, vmax = 100) + + ax.set_xlabel("Month") + ax.set_ylabel(f"{y_var}") + ax.set_xticklabels(ax.get_xticklabels(), rotation=xtick_rotation) + ax.set_yticklabels(ax.get_yticklabels(), rotation=ytick_rotation) + + # Keep colorbar ticks plain (no scientific notation) + try: + cbar_ax = ax.figure.axes[-1] + cbar_ax.ticklabel_format(style="plain") + except Exception: + pass + + # ---- 7) Save & return ---- + figures_path.mkdir(parents=True, exist_ok=True) + outpath = figures_path / filename + plt.savefig(outpath, dpi=300, bbox_inches="tight") + plt.close(fig) + + return fig, ax, heatmap_df + + # 
---------------------------------------------------------------------------------------------------------------------- + # 2) Estimate travel time matrix + # ---------------------------------------------------------------------------------------------------------------------- + def _chunk_indices(n: int, chunk: int): + """Yield (start, end) index pairs for chunking 0..n-1.""" + for s in range(0, n, chunk): + e = min(n, s + chunk) + yield s, e + + def build_travel_time_matrix( + fac_df: pd.DataFrame, + *, + id_col: str = "fac_name", + lat_col: str = "lat", + lon_col: str = "long", + mode: Literal["car", "bicycle"] = "car", + backend: Literal["ors", "osrm"] = "ors", + # ORS options + ors_api_key: Optional[str] = None, + ors_base_url: str = "https://api.openrouteservice.org/v2/matrix", + # OSRM options (self-hosted or public; note: public often has only 'car') + osrm_base_url: str = "https://router.project-osrm.org", + osrm_profile_map: dict = None, + # matrix request chunking (keeps requests within API limits) + max_chunk: int = 40, + timeout: int = 60, + ) -> pd.DataFrame: + """ + Build an NxN *road* travel-time matrix (minutes) for facilities, by CAR or BICYCLE + + backends: + - 'ors' -> uses OpenRouteService Matrix API (profiles: driving-car, cycling-regular). + Requires ors_api_key. Has rate/size limits; we auto-chunk. + - 'osrm' -> uses OSRM 'table' service (profiles typically 'car' or 'bike'). + For bicycle, you'll likely need a self-hosted OSRM with the bicycle profile. + + Parameters + ---------- + mode : 'car' | 'bicycle' + Travel mode on roads. + max_chunk : int + Max #origins (and #destinations) per sub-matrix request to keep within API limits. + + Returns + ------- + pd.DataFrame + Square DataFrame (minutes), index/columns = facility names. 
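Example
-------
A hypothetical call against a self-hosted OSRM server (the URL, the
`facs_df` frame and the facility names are illustrative, not part of
this module):

>>> T = build_travel_time_matrix(facs_df, mode="car", backend="osrm",
...                              osrm_base_url="http://localhost:5000")
>>> T.loc["Fac_A", "Fac_B"]  # road travel time in minutes; inf if unroutable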
+ """ + facs = fac_df[[id_col, lat_col, lon_col]].dropna().drop_duplicates().reset_index(drop=True) + ids = facs[id_col].tolist() + lats = facs[lat_col].to_numpy() + lons = facs[lon_col].to_numpy() + n = len(ids) + + T = pd.DataFrame(np.full((n, n), np.nan, dtype=float), index=ids, columns=ids) + np.fill_diagonal(T.values, 0.0) + + if n == 0: + return T + + if backend == "ors": + if ors_api_key is None: + raise ValueError("OpenRouteService requires ors_api_key.") + profile = "driving-car" if mode == "car" else "cycling-regular" + + # ORS expects [lon, lat] + coords = [[float(lons[i]), float(lats[i])] for i in range(n)] + + headers = {"Authorization": ors_api_key, "Content-Type": "application/json"} + + # Chunk both sources and destinations to stay under limits + for si, sj in _chunk_indices(n, max_chunk): + for di, dj in _chunk_indices(n, max_chunk): + sources = list(range(si, sj)) + destinations = list(range(di, dj)) + + body = { + "locations": coords, + "sources": sources, + "destinations": destinations, + "metrics": ["duration"], + } + url = f"{ors_base_url}/{profile}" + r = requests.post(url, json=body, headers=headers, timeout=timeout) + r.raise_for_status() + data = r.json() + + # durations in seconds; fill submatrix + durs = data.get("durations") + if durs is None: + raise RuntimeError(f"ORS returned no durations for block {si}:{sj} x {di}:{dj}") + sub = (np.array(durs, dtype=float) / 60.0) # minutes + T.iloc[si:sj, di:dj] = sub + + elif backend == "osrm": + # Map desired mode to OSRM profile names + if osrm_profile_map is None: + osrm_profile_map = {"car": "car", "bicycle": "bike"} + profile = osrm_profile_map.get(mode) + if profile is None: + raise ValueError(f"No OSRM profile mapped for mode='{mode}'.") + + # NOTE: The public OSRM demo often supports *car only*. + # For bicycle, run your own OSRM with the bicycle profile. + # We use the OSRM 'table' service; POST with 'sources' and 'destinations' indices. + + coords = ";".join([f"{lons[i]},{lats[i]}" for i in range(n)]) # lon,lat + + for si, sj in _chunk_indices(n, max_chunk): + for di, dj in _chunk_indices(n, max_chunk): + sources = ";".join(map(str, range(si, sj))) + destinations = ";".join(map(str, range(di, dj))) + + url = ( + f"{osrm_base_url}/table/v1/{profile}/{coords}" + f"?sources={sources}&destinations={destinations}&annotations=duration" + ) + r = requests.get(url, timeout=timeout) + r.raise_for_status() + data = r.json() + + durs = data.get("durations") + if durs is None: + raise RuntimeError(f"OSRM returned no durations for block {si}:{sj} x {di}:{dj}") + sub = (np.array(durs, dtype=float) / 60.0) # minutes + T.iloc[si:sj, di:dj] = sub + + else: + raise ValueError("backend must be 'ors' or 'osrm'.") + + # Safety: replace any residual NaNs (unroutable pairs) with inf or a large number + T = T.fillna(np.inf) + return T + + # Because ORS and ORSM can only handle a limited number of elements at a time, it's better to run this by district + # each of which already has under 50 facilities + def build_time_matrices_by_district( + df: pd.DataFrame, + *, + district_col: str = "district", + id_col: str = "fac_name", + lat_col: str = "lat", + lon_col: str = "long", + mode: str = "car", + backend: str = "osrm", + osrm_base_url: str = "https://router.project-osrm.org", + ors_api_key: str | None = None, + max_chunk: int = 50, # safe for both OSRM/ORS + ) -> dict[str, pd.DataFrame]: + """ + Returns a dict: {district -> square minutes matrix DataFrame}, computed within each district only. 
+ """ + matrices = {} + # unique facilities per district (drop duplicates in case of repeated rows) + fac_cols = [district_col, id_col, lat_col, lon_col] + fac_coords = df[fac_cols].dropna().drop_duplicates() + + for d, facs_d in fac_coords.groupby(district_col): + # Skip tiny groups + if len(facs_d) < 2: + continue + + T = build_travel_time_matrix( + fac_df=facs_d[[id_col, lat_col, lon_col]], + id_col=id_col, lat_col=lat_col, lon_col=lon_col, + mode=mode, backend=backend, + osrm_base_url=osrm_base_url, + ors_api_key=ors_api_key, + max_chunk=max_chunk, + ) + matrices[d] = T + + return matrices + + # ----------------------------------------------- + # 3) Data prep for redistribution linear program + # ----------------------------------------------- + def presumed_availability(ob, amc, eps=1e-9) -> float: + """ + Presumed likelihood of cons availability = p = min(1, OB/AMC) at month start (no additional receipts considered + at this point in time). + """ + return float(min(1.0, max(0.0, (ob / max(eps, amc))))) + + + def build_edges_within_radius( + time_matrix: pd.DataFrame, + max_minutes: float + ) -> Dict[str, set]: + """ + For each facility g, return set of receivers f such that T[g,f] <= max_minutes, f != g. + """ + edges = {} + for g in time_matrix.index: + feasible = set(time_matrix.columns[(time_matrix.loc[g] <= max_minutes) & (time_matrix.columns != g)]) + edges[g] = feasible + return edges + + def build_edges_within_radius_flat(T_by_dist: dict, max_minutes: float) -> dict[str, set[str]]: + """ + Takes the district-wise dictionary of time travel matrices and converts it into a flat dictionary of facilities + and their edge neighbours depending on the maximum allowable travel distance. + T_by_dist: {district -> square DataFrame of minutes (index/cols = facility IDs)} + Returns: {facility_id -> set(of facility_ids)} for all districts combined. + """ + edges: dict[str, set[str]] = {} + for _, T in T_by_dist.items(): + for g in T.index: + row = T.loc[g].to_numpy() + feasible_mask = (row <= max_minutes) & np.isfinite(row) + # Exclude self + feasible = [f for f in T.columns[feasible_mask] if f != g] + if g not in edges: + edges[g] = set() + edges[g].update(feasible) + return edges + + # a = build_edges_within_radius(T_car, max_minutes = 18) + + # Defining clusters of health facilities within district + # This function helps find the facilities which would be appropriate cluster centers + def _farthest_first_seeds(T: pd.DataFrame, k: int, big: float = 1e9) -> list: + """ + Pick k seed medoids via farthest-first traversal on a travel-time matrix. + Treat inf/NaN distances as 'big' so disconnected components get separate seeds. 
+ """ + n = T.shape[0] + facs = T.index.tolist() + D = T.to_numpy().astype(float) + D[~np.isfinite(D)] = big + + # Start at the row with largest average distance (covers sparse areas first) + start = int(np.nanargmax(np.nanmean(D, axis=1))) # the remotest facility + seeds_idx = [start] + + # Iteratively add the point with max distance to its nearest seed + for _ in range(1, k): + # min distance to any existing seed for every point + min_to_seed = np.min(D[:, seeds_idx], axis=1) # this has a dimension of [facs, number of seeds] + next_idx = int(np.argmax(min_to_seed)) # for each facility find the distance to its nearest seed + # and the facility farthest from the nearest seed becomes the next seed + if next_idx in seeds_idx: + # Fallback: pick any non-seed with highest min_to_seed + candidates = [i for i in range(n) if i not in seeds_idx] + if not candidates: + break + next_idx = int(candidates[np.argmax(min_to_seed[candidates])]) + seeds_idx.append(next_idx) + + return [facs[i] for i in seeds_idx] # list of length k representing the clustering points + + # Assign each facility to its nearest seed subject to a hard cluster capacity (≤ X members) + def _assign_to_cluster_with_fixed_capacity(T: pd.DataFrame, seeds: list, capacity: int, big: float = 1e9) -> Dict[str, int]: + """ + Greedy assignment of facilities to nearest seed that still has capacity (based on maximum cluster size). + Returns: mapping facility -> seed_index (position in seeds list). + """ + facs = T.index.tolist() + D = T.loc[facs, seeds].to_numpy().astype(float) # Distance of all facilities from the k seeds + D[~np.isfinite(D)] = big + + # each facility: nearest distance to any seed (for stable ordering) + nearest = D.min(axis=1) # find the shortest distance to a cluster for each facility + order = np.argsort(nearest) # sort all facilities in ascending order of their distance from the nearest facility + + cap_left = {j: capacity for j in range(len(seeds))} # the capacity left for each seed + assign = {} + + for idx in order: + f = facs[idx] + # try seeds in ascending distance + seq = np.argsort(D[idx, :]) # the sequence of seeds most suitable for idx + placed = False + for j in seq: + if cap_left[j] > 0: + assign[f] = j + cap_left[j] -= 1 + placed = True + break + if not placed: + # total capacity >= n, so this should be rare; if it happens, put in least-loaded seed + j = min(cap_left, key=lambda jj: cap_left[jj]) + assign[f] = j + cap_left[j] -= 1 + + return assign + + def capacity_clusters_for_district( + T_d: pd.DataFrame, cluster_size: int = 5, big: float = 1e9, refine_swaps: int = 0 + ) -> Dict[str, str]: + """ + Build ~equal-size clusters (size<=cluster_size) from a district's travel-time matrix via + capacity-constrained k-medoids (farthest-first seeds + greedy capacity assignment). + + Returns: {facility_id -> cluster_id} (cluster ids like 'C00','C01',...) + """ + facs = T_d.index.tolist() + n = len(facs) + if n == 0: + return {} + if n <= cluster_size: + return {f: "C00" for f in facs} + + k = ceil(n / cluster_size) + seeds = _farthest_first_seeds(T_d, k=k, big=big) + assign_seed_idx = _assign_to_cluster_with_fixed_capacity(T_d, seeds=seeds, capacity=cluster_size, big=big) + + # Optional tiny refinement: (off by default) + # You could add 1–2 passes of medoid swap within clusters to reduce intra-cluster travel. 
+ + # Build cluster ids in seed order + seed_to_cid = {j: f"C{j:02d}" for j in range(len(seeds))} + return {f: seed_to_cid[assign_seed_idx[f]] for f in facs} + + def build_capacity_clusters_all( + T_by_dist: Dict[str, pd.DataFrame], cluster_size: int = 5 + ) -> pd.Series: + """ + Apply capacity clustering to all districts. + Args: + T_by_dist: {'DistrictName': time_matrix (minutes, square DF with facility ids)} + cluster_size: desired max cluster size (e.g., 5) + + Returns: + pd.Series mapping facility_id -> cluster_id names scoped by district, e.g. 'Nkhotakota#C03' + """ + mappings = [] + for d, T_d in T_by_dist.items(): + if T_d is None or T_d.empty: + continue + local_map = capacity_clusters_for_district(T_d, cluster_size=cluster_size) + if not local_map: + continue + s = pd.Series(local_map, name="cluster_id") + s = s.map(lambda cid: f"{d}#{cid}") # scope cluster name by district + mappings.append(s) + if not mappings: + return pd.Series(dtype=object) + return pd.concat(mappings) + + # ----------------------------------------------- + # 3) LP/MILP Redistribution + # ----------------------------------------------- + def redistribute_radius_lp( + df: pd.DataFrame, + time_matrix: Dict[str, pd.DataFrame] | pd.DataFrame, + radius_minutes: float, + # policy knobs + tau_keep: float = 1.0, # donors must keep ≥ tau_keep * AMC + tau_tar: float = 1.0, # receivers target OB = AMC + K_in: int = 1, # per-item: max donors per receiver + K_out: int = 10, # per-item: max receivers per donor + Qmin_proportion: float = 0.25, # min lot as a fraction of receiver AMC (e.g., 0.25 ≈ 7–8 days) + eligible_levels: Iterable[str] = ("1a","1b"), + # schema + id_cols=("district","month","item_code"), + facility_col="fac_name", + level_col="Facility_Level", + amc_col="amc", + # outputs/behaviour + return_edge_log: bool = True, + floor_to_baseline: bool = True, # if True, never let reported availability drop below baseline + # numerics + amc_eps: float = 1e-6, + eps: float = 1e-9, + ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]: + """ + Pairwise (radius) redistribution with per-item degree caps. + + MILP per (district, month, item): + variables: t[g,f] ≥ 0 (transfer), y[g,f] ∈ {0,1} (edge activation) + objective: maximize Σ_p + key constraints: + - donors keep ≥ tau_keep * AMC + - receivers (only eligible levels) limited to deficit: tau_tar*AMC - OB + - travel-time ≤ radius + - min lot: t[g,f] ≥ Qmin * y[g,f], with Qmin = Qmin_proportion * AMC_receiver + - edge capacity: t[g,f] ≤ min(surplus_g, deficit_f) * y[g,f] + - degree caps per item: inbound ≤ K_in, outbound ≤ K_out + Availability is recomputed mechanistically and written back **only** where a transfer occurred. 
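Worked example (illustrative numbers, with tau_tar = 1 and Qmin_proportion = 0.25):
a receiver holding OB = 10 against AMC = 40 starts at p_mech = min(1, 10/40) = 0.25
and has deficit 1*40 - 10 = 30; its minimum lot is 0.25*40 = 10 units, so any
inbound transfer of 10-30 units is feasible, and receiving the full 30 lifts
p_mech to min(1, 40/40) = 1.0.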
+ """ + out = df.copy() + + # Opening balance + out["OB"] = out["opening_bal"] + # Preserve only necessary columns + selected_cols = list(id_cols) + [level_col, facility_col, 'OB', amc_col, 'available_prop'] + out = out[selected_cols] + + # clean AMC & level + out[amc_col] = pd.to_numeric(out[amc_col], errors="coerce").fillna(0.0) + out[level_col] = out[level_col].astype(str) + + # Container for updated OB and edge log + out["OB_prime"] = out["OB"] + edge_rows = [] if return_edge_log else None + + # Group at (district, month, item_code) + group_cols = list(id_cols) + + skipped_nodes = [] + for (d, m, i), g in out.groupby(group_cols, sort=False): + g = g.copy() + + # --- Pick the travel-time matrix slice --- + if isinstance(time_matrix, dict): + T_d = time_matrix.get(d) + if T_d is None or T_d.empty: + continue + else: + T_d = time_matrix # single matrix for all, if you pass one + + facs_slice = g[facility_col].dropna().unique().tolist() + facs = [f for f in facs_slice if f in T_d.index and f in T_d.columns] + if len(facs) < 2: + continue + + T_sub = T_d.loc[facs, facs].replace(np.nan, np.inf) + + # --- Pull per-fac data for this item --- + AMC = (g.set_index(facility_col)[amc_col].astype(float) + .replace([np.inf, -np.inf], np.nan).fillna(0.0)) + OB0 = (g.set_index(facility_col)["OB"].astype(float) + .replace([np.inf, -np.inf], np.nan).fillna(0.0)) + LVL = (g.set_index(facility_col)[level_col].astype(str)) + + + # Align to facs and guard AMC + AMC = AMC.reindex(facs).fillna(0.0) + AMC_guard = AMC.copy() + AMC_guard[AMC_guard <= 0.0] = amc_eps + OB0 = OB0.reindex(facs).fillna(0.0) + LVL = LVL.reindex(facs) + + # --- Surplus / deficit --- + surplus = np.maximum(0.0, OB0.values - tau_keep * AMC_guard.values) # donors + deficit = np.maximum(0.0, tau_tar * AMC_guard.values - OB0.values) # receivers + + # Leave AMC == 0 untouched + #recv_pos_mask = AMC.values > amc_eps # forbid AMC≈0 from receiving + #deficit = np.where(recv_pos_mask, deficit0, 0.0) + + # Only eligible levels can receive + elig_recv = LVL.isin(eligible_levels).values + deficit = np.where(elig_recv, deficit, 0.0) + + donors = [f for f, s in zip(facs, surplus) if s > eps] + recvs = [f for f, h in zip(facs, deficit) if h > eps] + if not donors or not recvs: + continue + + s_map = dict(zip(facs, surplus)) + h_map = dict(zip(facs, deficit)) + qmin_map = dict(zip(facs, Qmin_proportion * AMC_guard.values)) + + # --- Feasible edges (within radius), HARD PRUNE if capacity < qmin --- + M_edge = {} # capacity per edge + Qmin = {} # min lot per edge + for g_fac in donors: + row = T_sub.loc[g_fac].to_numpy() + feas_idx = np.where((row <= radius_minutes) & np.isfinite(row))[0] + for idx in feas_idx: + f_fac = T_sub.columns[idx] + if f_fac == g_fac or f_fac not in recvs: + continue + M = min(s_map[g_fac], h_map[f_fac]) + if not np.isfinite(M) or M <= eps: + continue + qmin = float(qmin_map[f_fac]) + if not np.isfinite(qmin) or qmin <= eps or M < qmin: + # cannot move at least qmin -> drop the edge + continue + M_edge[(g_fac, f_fac)] = float(M) + Qmin[(g_fac, f_fac)] = float(qmin) + + if not M_edge: + continue + + # --- MILP (per item) --- + prob = LpProblem(f"Radius_{d}_{m}_{i}", LpMaximize) + + # decision vars + t = {e: LpVariable(f"t_{e[0]}->{e[1]}", lowBound=0, upBound=M_edge[e], cat=LpContinuous) + for e in M_edge.keys()} + y = {e: LpVariable(f"y_{e[0]}->{e[1]}", lowBound=0, upBound=1, cat=LpBinary) + for e in M_edge.keys()} + p = {f: LpVariable(f"p_{f}", lowBound=0, upBound=1) for f in + facs} # or only for eligible receivers + + # 
objective: maximize total shipped + prob += lpSum(p[f] for f in recvs) # or for level-eligible facilities + + AMC_guard = AMC.reindex(facs).fillna(0.0) + AMC_guard[AMC_guard <= 0.0] = amc_eps + + # donor outflow caps (per item) + for g_fac in donors: + vars_out = [t[(g_fac, f_fac)] for (gg, f_fac) in t.keys() if gg == g_fac] + if vars_out: + s_cap = float(max(0.0, OB0[g_fac] - tau_keep * AMC_guard[g_fac])) + prob += lpSum(vars_out) <= s_cap + + # receiver inflow caps (per item; eligibility already enforced) + for f_fac in recvs: + vars_in = [t[(g_fac, f_fac)] for (g_fac, ff) in t.keys() if ff == f_fac] + if vars_in: + h_cap = float(max(0.0, tau_tar * AMC_guard[f_fac] - OB0[f_fac])) + prob += lpSum(vars_in) <= h_cap + + # link t & y + min-lot + for e, M in M_edge.items(): + prob += t[e] <= M * y[e] + qmin = min(Qmin[e], M_edge[e]) # TODO should this be qmin = Qmin[e]? + if qmin > eps: + prob += t[e] >= qmin * y[e] + + # per-item degree caps + for f_fac in recvs: + inbound_y = [y[(g_fac, f_fac)] for (g_fac, ff) in y.keys() if ff == f_fac] + if inbound_y: + prob += lpSum(inbound_y) <= K_in + for g_fac in donors: + outbound_y = [y[(g_fac, f_fac)] for (gg, f_fac) in y.keys() if gg == g_fac] + if outbound_y: + prob += lpSum(outbound_y) <= K_out + + # 3) Availability linearization per facility + # Need inflow and outflow expressions from your t[(g,f)] + for f_fac in facs: + inflow = lpSum(t[(g_fac, f_fac)] for (g_fac, ff) in t.keys() if ff == f_fac) + outflow = lpSum(t[(f_fac, h_fac)] for (gg, h_fac) in t.keys() if gg == f_fac) + prob += float(AMC_guard[f_fac]) * p[f_fac] <= float(OB0.get(f_fac, 0.0)) + inflow - outflow + + # solve + status = prob.solve(PULP_CBC_CMD(msg=False, cuts=0, presolve=True, threads=1)) + if LpStatus[prob.status] != "Optimal": + skipped_nodes.append((d, m, i)) + continue + + # --- Apply transfers & log --- + delta = {f: 0.0 for f in facs} # net change in OB by facility (this item) + any_transfer = False + for (g_fac, f_fac), var in t.items(): + moved = float(value(var) or 0.0) + if moved > eps: + any_transfer = True + delta[g_fac] -= moved + delta[f_fac] += moved + if return_edge_log: + tm = float(T_sub.loc[g_fac, f_fac]) if np.isfinite(T_sub.loc[g_fac, f_fac]) else np.nan + edge_rows.append({ + "district": d, "month": m, "item_code": i, + "donor_fac": g_fac, "receiver_fac": f_fac, + "units_moved": moved, "travel_minutes": tm + }) + + if not any_transfer: + continue + + sel = (out["district"].eq(d) & out["month"].eq(m) & out["item_code"].eq(i)) + out.loc[sel, "OB_prime"] = out.loc[sel].apply( + lambda r: r["OB"] + delta.get(r[facility_col], 0.0), + axis=1 + ) + print("Skipped ", len(skipped_nodes), "district-month-item combinations - no optimal solution") + + # ---------- Availability: update ONLY where positive transfers happened ---------- + changed_mask = (out["OB_prime"] - out["OB"]) > 1e-6 + denom = np.maximum(amc_eps, out[amc_col].astype(float).values) + p_mech = np.minimum(1.0, np.maximum(0.0, out["OB_prime"].values / denom)) + + # start from baseline + new_p = out["available_prop"].astype(float).values if floor_to_baseline else out["available_prop"].astype(float).values.copy() + # update only changed rows; optionally floor to baseline + if floor_to_baseline: + new_p[changed_mask] = np.maximum(p_mech[changed_mask], out["available_prop"].astype(float).values[changed_mask]) + else: + new_p[changed_mask] = p_mech[changed_mask] + + # force non-eligible levels to keep baseline (mirrors pooling) + non_elig = ~out[level_col].isin(eligible_levels) + new_p[non_elig] = 
out.loc[non_elig, "available_prop"].astype(float).values + + out["available_prop_redis"] = new_p + + edge_df = pd.DataFrame(edge_rows) if return_edge_log else None + return out, edge_df + + def redistribute_pooling_lp( + df: pd.DataFrame, + tau_min: float = 0.25, # lower floor in "months of demand" (≈ 7–8 days of stock that each facility should end with) + tau_max: float = 3.0, # upper ceiling (storage/policy max) + tau_donor_keep: float = 3.0, # minimum the donor keeps before donating + id_cols=("district","month","item_code"), + facility_col="fac_name", + level_col="Facility_Level", + amc_col="amc", + close_cols=("closing_bal","received","dispensed"), + amc_eps: float = 1e-6, # threshold to treat AMC as "zero" + return_move_log: bool = True, # return a detailed df showing net movement of consumables after redistribution + pooling_level: str = "district", # "district" or "cluster" + cluster_map: pd.Series | None = None, # required if pooling_level=="cluster"; this specifies which cluster each facility belongs to + floor_to_baseline: bool = True # if True, report availability floored to baseline (no decrease in outputs) + ) -> pd.DataFrame: + """ + Scenario 3: district-level pooling/push. + Maximizes total availability with: + - NaN/inf guards on AMC/OB + - duplicate facility IDs collapsed within group + - floors scaled if total stock < sum floors + - optional 'excess' sink if total stock > sum ceilings + - availability computed safely; AMC≈0 rows keep baseline (optional) + + Pooling redistribution that can operate at the district level (default) + or within fixed-size clusters inside districts. + + If pooling_level == "cluster", you must pass cluster_map: a Series mapping facility_id -> cluster_id + (cluster ids should already be district-scoped, e.g., "Dedza#C01"). 
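A suitable mapping can be produced with, e.g.,
cluster_map = build_capacity_clusters_all(T_by_dist, cluster_size=5),
which returns facility_id -> district-scoped cluster ids of exactly this form.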
+ + Returns: + - out: original df plus columns: + OB, OB_prime, available_prop_redis, received_from_pool + where received_from_pool = OB_prime - OB (pos=received, neg=donated) + - (optional) move_log: per (district, month, item, facility) net movement summary + """ + if pooling_level not in ("district", "cluster"): + raise ValueError("pooling_level must be 'district' or 'cluster'.") + + closing_bal, received, dispensed = close_cols + out = df.copy() + + # Opening balance + # Ensure OB is consistent with observed availability + amc_safe = np.maximum(1e-6, lmis["amc"].astype(float)) + lmis["p_mech"] = np.clip(lmis["opening_bal"] / amc_safe, 0.0, 1.0) + + mask_inconsistent = lmis["p_mech"] < lmis["available_prop"] + lmis.loc[mask_inconsistent, "opening_bal"] = ( + lmis.loc[mask_inconsistent, "available_prop"] * lmis.loc[mask_inconsistent, "amc"] + ) + out["OB"] = out["opening_bal"] + + # Default (will overwrite per group) + out["OB_prime"] = out["OB"] + + # Attach cluster if needed + if pooling_level == "cluster": + if cluster_map is None: + raise ValueError("cluster_map is required when pooling_level='cluster'.") + # cluster_map: index = facility_id (facility_col), value = cluster_id (already district-scoped) + out = out.merge( + cluster_map.rename("cluster_id"), + how="left", + left_on=facility_col, + right_index=True, + ) + if out["cluster_id"].isna().any(): + # facilities missing a cluster—assign singleton clusters to keep them + out["cluster_id"] = out["cluster_id"].fillna( + out["district"].astype(str) + "#CXX_" + out[facility_col].astype(str)) + + group_cols = list(id_cols) + node_label = "district" + if pooling_level == "cluster": + group_cols = ["cluster_id", "month", "item_code"] + node_label = "cluster_id" + move_rows = [] # optional movement log + # TODO could remove the movement log + + skipped_nodes = [] # collect nodes that did NOT solve optimally + for keys, g in out.groupby(group_cols, sort=False): + g = g.copy() + # Resolve node ID for logging and selection masks + if pooling_level == "district": + node_val, m, i = g["district"].iloc[0], keys[1], keys[2] + else: + node_val, m, i = keys + + # Build per-facility Series (unique index) for AMC, OB, Level, Baseline p + AMC = (g.set_index(facility_col)[amc_col] + .astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0)) + OB0 = (g.set_index(facility_col)["OB"] + .astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0)) + LVL = (g.set_index(facility_col)[level_col].astype(str) + .replace({np.nan: ""})) + + # collapse duplicates if any + if AMC.index.duplicated().any(): + AMC = AMC[~AMC.index.duplicated(keep="first")] + if LVL.index.duplicated().any(): + LVL = LVL[~LVL.index.duplicated(keep="first")] + if OB0.index.duplicated().any(): + OB0 = OB0.groupby(level=0).sum() + + total_stock = float(OB0.sum()) + if total_stock <= 1e-9: + continue + + # Participants (positive demand) + mask_pos = AMC >= amc_eps + facs_pos = AMC.index[mask_pos].tolist() + if not facs_pos: + # nothing to reallocate to; they will be donors only (handled by OB' defaults) + continue + + AMC_pos = AMC.loc[facs_pos] + OB0_pos = OB0.loc[facs_pos] + LVL_pos = LVL.reindex(facs_pos) + + # policy floors/ceilings + tau_min_floor = (tau_min * AMC_pos).astype(float) + donor_protect = np.minimum(OB0_pos, tau_donor_keep * AMC_pos) # retain min(OB, tau_donor_keep*AMC) + LB0 = np.maximum(tau_min_floor, donor_protect) + + UB = (tau_max * AMC_pos).astype(float) + UB.loc[~LVL_pos.isin(["1a", "1b"])] = np.minimum( + OB0_pos.loc[~LVL_pos.isin(["1a", "1b"])], + 
UB.loc[~LVL_pos.isin(["1a", "1b"])] + ) + + # Feasibility: scale only the tau_min component if sum LB > total_stock + sum_LB0 = float(LB0.sum()) + if total_stock + 1e-9 < sum_LB0: + # Scale down the tau_min part (not the donor protection) + base_guard = donor_protect + extra = np.maximum(0.0, tau_min_floor - np.minimum(tau_min_floor, base_guard)) + need = float(extra.sum()) + budget = total_stock - float(base_guard.sum()) + scale = 0.0 if need <= 1e-12 else max(0.0, min(1.0, budget / max(1e-9, need))) + tau_min_scaled = np.minimum(base_guard, tau_min_floor) + extra * scale + LB = np.maximum(base_guard, tau_min_scaled) + else: + LB = LB0 + + # ---- Excess sink if ceilings bind + sum_UB = float(UB.sum()) + allow_excess_sink = total_stock > sum_UB + 1e-9 + + # 1) Per-facility feasibility guard + bad = LB > UB + 1e-12 + if bad.any(): + # clip LB to UB; if that still leaves negative room, the facility is degenerate + LB = np.minimum(LB, UB - 1e-9) + + # ---------- LP ---------- + prob = LpProblem(f"Pooling_{node_val}_{m}_{i}", LpMaximize) + x = {f: LpVariable(f"x_{f}", lowBound=0) for f in facs_pos} + p = {f: LpVariable(f"p_{f}", lowBound=0, upBound=1) for f in facs_pos} + excess = LpVariable("excess", lowBound=0) if allow_excess_sink else None + # note that even though facilities with AMC == 0 are not considered for optimisation, their postive OB is + # included in the total stock + + # Objective: maximize total availability + prob += lpSum(p.values()) + + # Conservation + if excess is None: + prob += lpSum(x.values()) == total_stock + else: + prob += lpSum(x.values()) + excess == total_stock + + # Bounds + linearization + for f in facs_pos: + prob += x[f] >= float(LB.loc[f]) # donor protection + tau_min (scaled) + (optional) no-harm + prob += x[f] <= float(UB.loc[f]) # eligibility-aware ceiling + prob += float(max(AMC_pos.loc[f], amc_eps)) * p[f] <= x[f] # TODO CHECK max(AMC_pos.loc[f], amc_eps) or just AMC_pos + + # Solve + prob.solve(PULP_CBC_CMD(msg=False, cuts=0, presolve=True, threads=1)) + if LpStatus[prob.status] != "Optimal": + skipped_nodes.append((node_val, m, i)) + #print("NO Optimal solution found", node_val, m, i) + continue + #else: + #print("Optimal solution found", node_val, m, i) + + # Apply solution to OB' + x_sol = {f: float(value(var) or 0.0) for f, var in x.items()} + + # Selection mask for writing back + if pooling_level == "district": + sel = (out["district"].eq(node_val) & out["month"].eq(m) & out["item_code"].eq(i)) + else: + sel = (out["cluster_id"].eq(node_val) & out["month"].eq(m) & out["item_code"].eq(i)) + + # Facilities with AMC>=eps get x_f + mask_rows_pos = sel & out[facility_col].isin(facs_pos) + out.loc[mask_rows_pos, "OB_prime"] = out.loc[mask_rows_pos, facility_col].map(x_sol).values + + # Facilities with AMC OB' = 0 + # (this matches the "donate to pool" assumption) + mask_rows_zero = sel & ~out[facility_col].isin(facs_pos) + out.loc[mask_rows_zero, "OB_prime"] = 0.0 + + if return_move_log: + for f in AMC.index: # include amc≈0 facilities (x_f=0) + x_f = x_sol.get(f, 0.0) if f in facs_pos else 0.0 + net = x_f - float(OB0.get(f, 0.0)) + move_rows.append({ + node_label: node_val, + "month": m, + "item_code": i, + "facility": f, + "received_from_pool": net, + "x_allocated": x_f, + "OB0_agg": float(OB0.get(f, 0.0)), + "eligible_receiver": bool(LVL.get(f, "") in {"1a", "1b"}), + }) + + # --- Availability after redistribution: update ONLY where OB' changed --- + amc_safe_all = out[amc_col].astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0) + denom = 
+
+    # Start from the baseline everywhere
+    out["available_prop_redis"] = out["available_prop"].astype(float).values
+
+    # Update availability only for rows where OB has increased
+    changed = (out["OB_prime"] - out["OB"]) > 1e-6
+    p_new = np.minimum(1.0, np.maximum(0.0, out.loc[changed, "OB_prime"].values / denom[changed]))
+    if floor_to_baseline:
+        p_new = np.maximum(p_new, out.loc[changed, "available_prop"].astype(float).values)
+
+    out.loc[changed, "available_prop_redis"] = p_new
+
+    # Per-row movement
+    out["received_from_pool"] = out["OB_prime"] - out["OB"]
+
+    move_log = pd.DataFrame(move_rows) if return_move_log else None
+
+    # Force non-eligible levels back to baseline (mirrors the analysis scope).
+    # This should ideally happen automatically; however, there are facilities at levels 2-4
+    # where some stock-out was experienced even though OB > AMC. We want to retain the
+    # original probability in these cases because our overall analysis is restricted to
+    # levels 1a and 1b.
+    non_elig = ~out[level_col].isin(["1a", "1b"])
+    out.loc[non_elig, "available_prop_redis"] = out.loc[non_elig, "available_prop"].values
+
+    # Check that the rules were correctly applied
+    # (this section, up to `if return_move_log:`, is not required for the solution)
+    #---------------------------------------------------------------------------------
+    # --- Build masks for skipping ---
+    # 1. Nodes that failed to solve optimally
+    # Exclude any node/month/item combinations that didn't solve optimally or which were skipped due to AMC == 0
+    if skipped_nodes:
+        skipped_df = pd.DataFrame(skipped_nodes, columns=[node_label, "month", "item_code"])
+
+        # Merge to flag rows belonging to skipped groups
+        out = out.merge(
+            skipped_df.assign(skip_flag=True),
+            on=[node_label, "month", "item_code"],
+            how="left",
+        )
+        mask_skip_solution = out["skip_flag"].fillna(False)
+    else:
+        mask_skip_solution = pd.Series(False, index=out.index)
+
+    #out[mask_skip_solution].to_csv(outputfilepath / 'skipped_nodes_no_optimal_soln.csv', index = False)
+
+    # 2. 
Facilities with AMC effectively zero + mask_skip_amc = out["amc"].astype(float) <= 1e-9 + + # Combined skip mask + mask_skip = mask_skip_solution | mask_skip_amc + mask_solved = ~mask_skip + print(f"Skipping {mask_skip.sum()} rows due to non-optimal LPs or AMC≈0") + + # No facility should end below min(OB, tau_donor_keep*AMC) (# Lower bound check) + tol = 1e-6 #tolerance + viol_lb = ( + (out.loc[mask_solved, "OB_prime"] < + (np.minimum(out.loc[mask_solved, "OB"], tau_donor_keep * out.loc[mask_solved, "amc"]) - tol)) + ) + + # No facility ends up above upper bounds (# Upper bound check) + elig = out.loc[mask_solved, "Facility_Level"].isin(["1a", "1b"]).values + ub = np.where( + elig, + tau_max * out.loc[mask_solved, "amc"], + np.minimum(out.loc[mask_solved, "OB"], tau_max * out.loc[mask_solved, "amc"]) + ) + viol_ub = out.loc[mask_solved, "OB_prime"].values > (ub + tol) + + temp = out[mask_solved] + if viol_lb.any(): + print("For the following rows (facility, item and month combinations), unclear why OB_prime < tau_donor_keep * AMC " + "which violates a constraint in the LPP") + print(temp[viol_lb][['Facility_Level', 'amc', 'OB', 'OB_prime']]) + temp[viol_lb][['Facility_Level', 'fac_name', 'amc', 'OB', 'OB_prime']].to_csv('violates_lb.csv') + if viol_ub.any(): + print("For the following rows (facility, item and month combinations), unclear why OB_prime > tau_max * AMC " + "which violates a constraint in the LPP") + print(temp[viol_ub][['Facility_Level', 'amc', 'OB', 'OB_prime']]) + temp[viol_ub][['Facility_Level', 'fac_name', 'amc', 'OB', 'OB_prime']].to_csv('violates_ub.csv') + + if return_move_log: + move_log = pd.DataFrame(move_rows) + return out, move_log + + return out + # pooled_df, pool_moves = redistribute_pooling_lp(lmis, tau_min=0.25, tau_max=3.0, return_move_log=True) + + # Functions to generate summary plots of the outcomes of redistribution + def prep_violin_df(df, scenario_name, keep_facs_with_no_change = True): + out = df.copy() + out["delta_p"] = out["available_prop_redis"] - out["available_prop"] + + if (keep_facs_with_no_change == True): + mask = ( + (out["Facility_Level"].isin(["1a", "1b"])) & + (out["amc"] > 0) + ) + else: + mask = ( + (out["OB_prime"] > out["OB"]) & + (out["Facility_Level"].isin(["1a", "1b"])) & + (out["amc"] > 0) + ) + + return ( + out.loc[mask, ["district", "delta_p"]] + .assign(scenario=scenario_name) + ) + + def _add_custom_legend(fig=None, legend_location="upper right"): + iqr_patch = mpatches.Rectangle( + (0, 0), 1, 1, + facecolor="grey", + edgecolor="black", + linewidth=1, + label="Interquartile range (IQR)" + ) + median_patch = mlines.Line2D( + [], [], color="#b2182b", marker="o", linestyle="None", + markersize=5, label="Median" + ) + mean_patch = mlines.Line2D( + [], [], color="#b2182b", marker="D", linestyle="None", + markersize=6, label="Mean" + ) + + if fig is None: + plt.legend(handles=[iqr_patch, median_patch, mean_patch], + loc=legend_location, fontsize=8, frameon=True) + else: + fig.legend( + handles=[iqr_patch, median_patch, mean_patch], + loc=legend_location, + ncol=3, + fontsize=8, + frameon=True + ) + + def do_violin_plot_change_in_p( + violin_df: pd.DataFrame, + figname: str, + by_district: bool = False, + district_col: str = "district", + ncol: int = 4, + legend_location = "upper right", + ): + """ + Create violin + box + mean/median overlay plots of change in availability. + + If by_district=False: + Single national-level plot. + + If by_district=True: + One combined faceted figure with one panel per district. 
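+
+        A minimal usage sketch (assuming `violin_df` was built with prep_violin_df above):
+
+            do_violin_plot_change_in_p(violin_df, figname="violin_national.png")
+            do_violin_plot_change_in_p(violin_df, figname="violin_by_district.png",
+                                       by_district=True, ncol=4)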
+ """ + + # ---------- National-level plot ---------- + if not by_district: + mean_df = violin_df.groupby("scenario", as_index=False)["delta_p"].mean() + median_df = violin_df.groupby("scenario", as_index=False)["delta_p"].median() + + plt.figure(figsize=(10, 5)) + + sns.violinplot( + data=violin_df, + x="scenario", + y="delta_p", + cut=0, + scale="width", + inner=None, + linewidth=0.8, + color="#4C72B0", + alpha=0.6 + ) + + sns.boxplot( + data=violin_df, + x="scenario", + y="delta_p", + width=0.03, + showcaps=True, + showfliers=False, + boxprops={"facecolor": "grey", "edgecolor": "black", "linewidth": 1}, + whiskerprops={"linewidth": 1}, + medianprops={"linewidth": 0} + ) + + sns.scatterplot( + data=mean_df, + x="scenario", + y="delta_p", + color="#b2182b", + marker="D", + s=60, + zorder=10, + label="Mean" + ) + + sns.scatterplot( + data=median_df, + x="scenario", + y="delta_p", + color="#b2182b", + marker="o", + s=45, + zorder=11, + label="Median" + ) + + plt.axhline(0, color="black", linewidth=0.8, linestyle="--") + plt.ylabel("Change in probability of availability (Δp)") + plt.xlabel("") + + _add_custom_legend(legend_location=legend_location) + plt.tight_layout() + plt.savefig(outputfilepath / figname, dpi=600) + plt.close() + return + + # ---------- District-faceted plot ---------- + g = sns.catplot( + data=violin_df, + x="scenario", + y="delta_p", + col=district_col, + col_wrap=ncol, + kind="violin", + cut=0, + scale="width", + inner=None, + linewidth=0.6, + color="#4C72B0", + alpha=0.6, + height=3, + aspect=1 + ) + + # Overlay boxplots, means, medians per facet + for ax, (district, df_d) in zip(g.axes.flat, violin_df.groupby(district_col)): + mean_df = df_d.groupby("scenario", as_index=False)["delta_p"].mean() + median_df = df_d.groupby("scenario", as_index=False)["delta_p"].median() + + sns.boxplot( + data=df_d, + x="scenario", + y="delta_p", + width=0.03, + showcaps=True, + showfliers=False, + boxprops={"facecolor": "grey", "edgecolor": "black", "linewidth": 0.8}, + whiskerprops={"linewidth": 0.8}, + medianprops={"linewidth": 0}, + ax=ax + ) + + sns.scatterplot( + data=mean_df, + x="scenario", + y="delta_p", + color="#b2182b", + marker="D", + s=35, + zorder=10, + ax=ax + ) + + sns.scatterplot( + data=median_df, + x="scenario", + y="delta_p", + color="#b2182b", + marker="o", + s=30, + zorder=11, + ax=ax + ) + + ax.axhline(0, color="black", linewidth=0.6, linestyle="--") + ax.set_xlabel("") + ax.set_ylabel("Δp") + ax.tick_params(axis="x", labelrotation=45, labelsize=8) + ax.tick_params(axis="y", labelsize=8) + ax.set_title(district, fontsize=9) + + _add_custom_legend(fig=g.fig, legend_location = legend_location) + g.fig.tight_layout() + g.fig.savefig(outputfilepath / figname, dpi=600) + plt.close() + + + # IMPLEMENT + # 1) Build a time matrix + fac_coords = lmis[['fac_name', 'district', 'lat','long']] + #T_car = build_time_matrices_by_district( + # fac_coords, + # mode="car", + # backend="osrm", + # osrm_base_url="https://router.project-osrm.org", + # max_chunk=50) + + # Store dictionary in pickle format + #with open(outputfilepath / "T_car2.pkl", "wb") as f: + # pickle.dump(T_car, f) + # -> Commented out because it takes long to run. 
The result has been stored in pickle format.
+
+    # Load pre-generated dictionary
+    with open(outputfilepath / "T_car2.pkl", "rb") as f:
+        T_car = pickle.load(f)
+    # T_car2 was created after cleaning facility names and removing spaces in the text
+
+    #edges_flat = build_edges_within_radius_flat(T_car, max_minutes= 60)
+
+    # 2) Explore the availability and distances to make decisions about optimisation rules
+    # Plot stock adequacy by district and month to assess what bounds to set when pooling
+    fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath,
+                              y_var = 'district', value_var = 'item_code',
+                              value_label = "% of consumables with Opening Balance ≥ 3 × AMC",
+                              amc_threshold = 3, compare = "ge",
+                              filename = "mth_district_stock_adequacy_3amc.png", figsize = (12,10))
+    fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath,
+                              y_var = 'district', value_var = 'item_code',
+                              value_label = "% of consumables with Opening Balance ≥ 1.5 × AMC",
+                              amc_threshold = 1.5, compare = "ge",
+                              filename = "mth_district_stock_adequacy_1.5amc.png", figsize = (12,10))
+    fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath,
+                              y_var = 'district', value_var = 'item_code',
+                              value_label = "% of consumables with Opening Balance ≤ 1 × AMC",
+                              amc_threshold = 1, compare = "le",
+                              filename = "mth_district_stock_inadequacy_1amc.png", figsize = (12,10))
+    fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath,
+                              y_var = 'item_code', value_var = 'fac_name',
+                              value_label = "% of facilities with Opening Balance ≥ 3 × AMC",
+                              amc_threshold = 3, compare = "ge",
+                              filename = "mth_item_stock_adequacy_3amc.png")
+    fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath,
+                              y_var = 'item_code', value_var = 'fac_name',
+                              value_label = "% of facilities with Opening Balance ≥ 1.5 × AMC",
+                              amc_threshold = 1.5, compare = "ge",
+                              filename = "mth_item_stock_adequacy_1.5amc.png")
+    fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath,
+                              y_var = 'item_code', value_var = 'fac_name',
+                              value_label = "% of facilities with Opening Balance ≤ 1 × AMC",
+                              amc_threshold = 1, compare = "le",
+                              filename = "mth_item_stock_inadequacy_1amc.png")
+
+    # Browse the number of eligible neighbours depending on allowable travel time
+    results = []
+    for mins in [30, 60, 90, 120]:
+        edges_flat = build_edges_within_radius_flat(T_car, max_minutes=mins)
+        neighbors_count = pd.Series({fac: len(neigh) for fac, neigh in edges_flat.items()})
+        mean = neighbors_count.mean()
+        sem = neighbors_count.sem()  # standard error of the mean
+        ci95 = 1.96 * sem
+        results.append({"radius": mins, "mean": mean, "ci95": ci95})
+
+    results_df = pd.DataFrame(results)
+
+    # Plot
+    plt.figure(figsize=(6,4))
+    plt.bar(results_df["radius"], results_df["mean"], yerr=results_df["ci95"], capsize=5, color="skyblue")
+    plt.xlabel("Travel time radius (minutes)")
+    plt.ylabel("Average number of facilities within radius")
+    plt.title("Average connectivity of facilities with 95% CI")
+    plt.xticks(results_df["radius"])
+    plt.savefig(outputfilepath / "neighbour_count_by_max_travel_time.png")
+    plt.close()
+
+    # A manual check shows that for distances greater than 60 minutes OSRM underestimates the travel time somewhat
+    # TODO consider using the Google Maps API
+
+    # Drop NAs
+    # TODO find a more generalisable solution for this issue (within the optimisation functions)
+    #lmis = lmis[(lmis.amc != 0) & 
(lmis.amc.notna())] + + # ---------------------------------------------------------------------------------------------------------------------- + # 3) Implement pooled redistribution + # ---------------------------------------------------------------------------------------------------------------------- + # Build clusters from per-district travel-time matrices + # T_car_by_dist: {"District A": DF(index=fac_ids, cols=fac_ids), ...} + cluster_size = 3 + cluster_series = build_capacity_clusters_all(T_car, cluster_size=cluster_size) + # cluster_series is a pd.Series: index=facility_id, value like "District A#C00", "District A#C01", ... + + # a) Run optimisation at district level + ''' + # Commented out for quicker runs + print("Now running Pooled Redistribution at District level") + start = time.time() + pooled_district_df, cluster_district_moves = redistribute_pooling_lp( + df=lmis, # the LMIS dataframe + tau_min=0.25, tau_max=3.0, + tau_donor_keep = 1.5, + pooling_level="district", + cluster_map=None, + return_move_log=True, + floor_to_baseline=True + ) + print(pooled_district_df.drop(columns = ['LMIS Facility List', 'lat', 'long', 'fac_owner']).groupby('Facility_Level')[['available_prop_redis', 'available_prop']].mean()) + pooled_district_df[['district', 'item_code', 'fac_name', 'month', 'amc', 'available_prop', 'Facility_Level', + 'OB', 'OB_prime', 'available_prop_redis', 'received_from_pool']].to_csv( + outputfilepath/ 'clustering_district_df.csv', index=False) + end = time.time() + print(f"District redistribution completed in {end - start:.3f} seconds") # 1.1 hour + ''' + pooled_district_df = pd.read_csv(outputfilepath / 'clustering_district_df.csv') + tlo_pooled_district = ( + pooled_district_df + .groupby(["item_code", "district", "Facility_Level", "month"], as_index=False) + .agg(available_prop_scenario16=("available_prop_redis", "mean")) + .sort_values(["item_code","district","Facility_Level","month"]) + ) + + + # b) Run optimisation at cluster (size = 3) level + ''' + # Commented out for quicker runs + print("Now running pooled redistribution at Cluster (Size = 3) level") + start = time.time() + pooled_cluster_df, cluster_moves = redistribute_pooling_lp( + df=lmis, # the LMIS dataframe + tau_min=0, tau_max=3.0, + tau_donor_keep = 1.5, + pooling_level="cluster", + cluster_map=cluster_series, + return_move_log=True, + floor_to_baseline=True + ) + print(pooled_cluster_df.drop(columns = ['LMIS Facility List', 'lat', 'long', 'fac_owner']).groupby('Facility_Level')[['available_prop_redis', 'available_prop']].mean()) + pooled_cluster_df[['district', 'item_code', 'fac_name', 'month', 'amc', 'available_prop', 'Facility_Level', + 'OB', 'OB_prime', 'available_prop_redis', 'received_from_pool']].to_csv( + outputfilepath/ 'clustering_n3_df.csv', index=False) + + end = time.time() + print(f"Cluster redistribution completed in {end - start:.3f} seconds") # 18 hours + ''' + pooled_cluster_df = pd.read_csv(outputfilepath / 'clustering_n3_df.csv') + + tlo_pooled_cluster = ( + pooled_cluster_df + .groupby(["item_code", "district", "Facility_Level", "month"], as_index=False) + .agg(available_prop_scenario17=("available_prop_redis", "mean")) + .sort_values(["item_code","district","Facility_Level","month"]) + ) + + + + # c) Implement pairwise redistribution + ''' + # Commented out for quicker runs + print("Now running pairwise redistribution with maximum radius 60 minutes") + start = time.time() + # c.i) 1-hour radius + large_radius_df, large_radius_moves = redistribute_radius_lp( + df=lmis, + 
time_matrix=T_car,
+        radius_minutes=60,             # facilities within 1 hour by car
+        tau_keep=1.5,                  # donor must keep 1.5 × AMC
+        tau_tar=1.0,                   # receivers target 1 × AMC
+        K_in=2,                        # at most 2 inbound transfers per item
+        K_out=10,                      # at most 10 outbound transfers  # TODO could increase this
+        Qmin_proportion=0.25,          # min lot = one week of demand (0.25 × monthly AMC)
+        eligible_levels=("1a", "1b"),  # only 1a/1b can receive
+    )
+    print(large_radius_df.groupby('Facility_Level')[['available_prop_redis', 'available_prop']].mean())
+    large_radius_df.to_csv(outputfilepath/ 'large_radius_df.csv', index=False)
+    end = time.time()
+    print(f"Large radius exchange redistribution completed in {end - start:.3f} seconds")
+    '''
+    large_radius_df = pd.read_csv(outputfilepath / 'large_radius_df.csv')
+    tlo_large_radius = (
+        large_radius_df
+        .groupby(["item_code", "district", "Facility_Level", "month"], as_index=False)
+        .agg(available_prop_scenario18=("available_prop_redis", "mean"))
+        .sort_values(["item_code","district","Facility_Level","month"])
+    )
+
+    # c.ii) 30-minute radius
+    '''
+    print("Now running pairwise redistribution with maximum radius 30 minutes")
+    start = time.time()
+    small_radius_df, small_radius_moves = redistribute_radius_lp(
+        df=lmis,
+        time_matrix=T_car,
+        radius_minutes=30,             # facilities within 30 minutes by car
+        tau_keep=1.5,                  # donor must keep 1.5 × AMC
+        tau_tar=1.0,                   # receivers target 1 × AMC
+        K_in=2,                        # at most 2 inbound transfers per item
+        K_out=10,                      # at most 10 outbound transfers
+        Qmin_proportion=0.25,          # min lot = one week of demand (0.25 × monthly AMC)
+        eligible_levels=("1a", "1b"),  # only 1a/1b can receive
+    )
+    print(small_radius_df.groupby('Facility_Level')[['available_prop_redis', 'available_prop']].mean())
+    small_radius_df.to_csv(outputfilepath/ 'small_radius_df.csv', index=False)
+    end = time.time()
+    print(f"Small radius exchange redistribution completed in {end - start:.3f} seconds")
+    '''
+    small_radius_df = pd.read_csv(outputfilepath / 'small_radius_df.csv')
+    tlo_small_radius = (
+        small_radius_df
+        .groupby(["item_code", "district", "Facility_Level", "month"], as_index=False)
+        .agg(available_prop_scenario19=("available_prop_redis", "mean"))
+        .sort_values(["item_code","district","Facility_Level","month"])
+    )
+
+    # Summarise the outcome of the redistribution in violin plots
+    violin_df_all_facs = pd.concat([
+        prep_violin_df(pooled_district_df, "District pooling", keep_facs_with_no_change = True),
+        prep_violin_df(pooled_cluster_df, "Cluster pooling", keep_facs_with_no_change = True),
+        prep_violin_df(large_radius_df, "Pairwise (large radius)", keep_facs_with_no_change = True),
+        prep_violin_df(small_radius_df, "Pairwise (small radius)", keep_facs_with_no_change = True)
+    ], ignore_index=True)
+    violin_df_only_facs_with_change = pd.concat([
+        prep_violin_df(pooled_district_df, "District pooling", keep_facs_with_no_change = False),
+        prep_violin_df(pooled_cluster_df, "Cluster pooling", keep_facs_with_no_change = False),
+        prep_violin_df(large_radius_df, "Pairwise (large radius)", keep_facs_with_no_change = False),
+        prep_violin_df(small_radius_df, "Pairwise (small radius)", keep_facs_with_no_change = False)
+    ], ignore_index=True)
+
+    do_violin_plot_change_in_p(
+        violin_df = violin_df_all_facs,
+        figname="violin_redistribution_national_all_facs.png",
+        legend_location= "upper right"
+    )
+    do_violin_plot_change_in_p(
+        violin_df = violin_df_only_facs_with_change,
+        figname="violin_redistribution_national_only_facs_with_change.png",
+        legend_location = "lower right"
+    )
+
+    do_violin_plot_change_in_p(
+        violin_df = violin_df_all_facs,
+        figname="violin_by_district_all_facs.png",
+        by_district=True,
+        ncol=4
+    )
+
+    do_violin_plot_change_in_p(
+        violin_df = violin_df_only_facs_with_change,
+        figname="violin_by_district_only_facs_with_change.png",
+        by_district=True,
+        ncol=4
+    )
+
+    # ----------------------------------------------------------------------------------------------------------------------
+    # 4) Compile updated probabilities and merge with the ResourceFile
+    # ----------------------------------------------------------------------------------------------------------------------
+    # Merge the new dataframes together
+    # ----------------------------------------------------------------------------------------------------------------------
+    tlo_redis = reduce(
+        lambda left, right: pd.merge(
+            left, right,
+            on=["item_code", "district", "Facility_Level", "month"],
+            how="outer"
+        ),
+        [tlo_pooled_district, tlo_pooled_cluster, tlo_large_radius, tlo_small_radius]
+    )
+
+    tlo_redis.to_csv(outputfilepath/ 'tlo_redis.csv', index=False)
+
+    # Edit the new dataframe to match the mfl formatting
+    list_of_new_scenario_variables = ['available_prop_scenario16', 'available_prop_scenario17',
+                                      'available_prop_scenario18', 'available_prop_scenario19']
+    tlo_redis = tlo_redis[['item_code', 'month', 'district', 'Facility_Level'] + list_of_new_scenario_variables].dropna()
+    tlo_redis["item_code"] = tlo_redis["item_code"].astype(float).astype(int)
+
+    # Load the master facility list
+    mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv")
+    mfl["District"] = mfl["District"].astype("string").str.strip().str.replace(r"\s+", "_", regex=True)
+    districts = set(mfl[mfl.District.notna()]["District"].unique())
+    kch = (mfl.Region == 'Central') & (mfl.Facility_Level == '3')
+    qech = (mfl.Region == 'Southern') & (mfl.Facility_Level == '3')
+    mch = (mfl.Region == 'Northern') & (mfl.Facility_Level == '3')
+    zmh = mfl.Facility_Level == '4'
+    mfl.loc[kch, "District"] = "Lilongwe"
+    mfl.loc[qech, "District"] = "Blantyre"
+    mfl.loc[mch, "District"] = "Mzimba"
+    mfl.loc[zmh, "District"] = "Zomba"
+
+    # Do some mapping to make the Districts line up with the definition of Districts in the model
+    rename_and_collapse_to_model_districts = {
+        'Nkhota_Kota': 'Nkhotakota',
+        'Mzimba_South': 'Mzimba',
+        'Mzimba_North': 'Mzimba',
+        'Nkhata_bay': 'Nkhata_Bay',
+    }
+
+    tlo_redis['district_std'] = tlo_redis['district'].replace(rename_and_collapse_to_model_districts)
+    # Take averages (now that 'Mzimba' is mapped to by both 'Mzimba_South' and 'Mzimba_North')
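+    # For example (hypothetical values): two source rows
+    #   Mzimba_South / 1a / month 1 / item 10 -> available_prop_scenario16 = 0.60
+    #   Mzimba_North / 1a / month 1 / item 10 -> available_prop_scenario16 = 0.80
+    # both carry district_std = 'Mzimba', so the groupby-mean below stores a single row with 0.70.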
+    tlo_redis = tlo_redis.groupby(by=['district_std', 'Facility_Level', 'month', 'item_code'])[list_of_new_scenario_variables].mean().reset_index()
+
+    # Fill in missing data:
+    # 1) Cities get the same results as their respective regions
+    copy_source_to_destination = {
+        'Mzimba': 'Mzuzu_City',
+        'Lilongwe': 'Lilongwe_City',
+        'Zomba': 'Zomba_City',
+        'Blantyre': 'Blantyre_City'
+    }
+
+    for source, destination in copy_source_to_destination.items():
+        new_rows = tlo_redis.loc[(tlo_redis.district_std == source) & (tlo_redis.Facility_Level.isin(['1a', '1b', '2']))].copy()
+        new_rows.district_std = destination
+        tlo_redis = pd.concat([tlo_redis, new_rows], axis=0, ignore_index=True)
+
+    # 2) Fill in Likoma (for which there are no data) with the means
+    means = tlo_redis.loc[tlo_redis.Facility_Level.isin(['1a', '1b', '2'])].groupby(by=['Facility_Level', 'month', 'item_code'])[
+        list_of_new_scenario_variables].mean().reset_index()
+    new_rows = means.copy()
+    new_rows['district_std'] = 'Likoma'
+    tlo_redis = pd.concat([tlo_redis, new_rows], axis=0, ignore_index=True)
+    assert sorted(set(districts)) == sorted(set(pd.unique(tlo_redis.district_std)))
+
+    # 3) Copy the results for 'Mwanza/1a' to 'Mwanza/1b'
+    mwanza_1b = tlo_redis.loc[(tlo_redis.district_std == 'Mwanza') & (tlo_redis.Facility_Level == '1a')].copy().assign(Facility_Level='1b')
+    tlo_redis = pd.concat([tlo_redis, mwanza_1b], axis=0, ignore_index=True)
+
+    # 4) Copy all the results to create a level 0 with an availability equal to half that in the respective 1a
+    all_0 = tlo_redis.loc[tlo_redis.Facility_Level == '1a'].copy().assign(Facility_Level='0')
+    all_0[list_of_new_scenario_variables] *= 0.5
+    tlo_redis = pd.concat([tlo_redis, all_0], axis=0, ignore_index=True)
+
+    # Now, merge in Facility_ID
+    tlo_redis = tlo_redis.merge(mfl[['District', 'Facility_Level', 'Facility_ID']],
+                                left_on=['district_std', 'Facility_Level'],
+                                right_on=['District', 'Facility_Level'], how='left', indicator=True, validate = 'm:1')
+    tlo_redis = tlo_redis[tlo_redis.Facility_ID.notna()].rename(columns = {'district_std': 'district'})
+    assert sorted(set(mfl.loc[mfl.Facility_Level != '5','Facility_ID'].unique())) == sorted(set(pd.unique(tlo_redis.Facility_ID)))
+
+    # Prepare the original availability dataframe (passed in as `tlo_availability_df`)
+    # ----------------------------------------------------------------------------------------------------------------------
+    list_of_old_scenario_variables = [f"available_prop_scenario{i}" for i in range(1, scenario_count + 1)]
+    tlo_availability_df = tlo_availability_df[['Facility_ID', 'month', 'item_code', 'available_prop'] + list_of_old_scenario_variables]
+
+    # Attach district, facility level and item_category to this dataset
+    program_item_mapping = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv')[['Item_Code', 'item_category']]  # Import item_category
+    program_item_mapping = program_item_mapping.rename(columns = {'Item_Code': 'item_code'})[program_item_mapping.item_category.notna()]
+    fac_levels = {'0', '1a', '1b', '2', '3', '4'}
+    tlo_availability_df = tlo_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']],
+                                                    on = ['Facility_ID'], how='left').rename(columns = {'District': 'district'})
+    tlo_availability_df = tlo_availability_df.merge(program_item_mapping,
+                                                    on = ['item_code'], how='left')
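+    # Optional sanity check (a minimal sketch): after the two merges above, every row should
+    # carry a facility level and most rows an item_category; a large share of missing values
+    # here would point to a mismatch with the MFL or the item-designation file.
+    if tlo_availability_df['Facility_Level'].isna().any():
+        print("Warning: some rows did not match the Master Facilities List")
+    if tlo_availability_df['item_category'].isna().mean() > 0.05:
+        print("Warning: more than 5% of rows lack an item_category mapping")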
+
+    #tlo_availability_df = tlo_availability_df[~tlo_availability_df[['District', 'Facility_Level', 'month', 'item_code']].duplicated()]
+
+    # Because some of the availability data in the original availability dataframe comes from data sources other than
+    # OpenLMIS, there are more unique item codes in tlo_availability_df than in tlo_redis. For these items, assume that
+    # the proportion of 'uplift' is the same as the average 'uplift' experienced across the consumables in tlo_redis,
+    # disaggregated by district, facility level, and month.
+
+    # First fix any unexpected changes in availability probability
+    # Merge the old and new dataframes
+    redis_levels = ['1a','1b']
+    tlo_redis = tlo_redis[tlo_redis.Facility_Level.isin(redis_levels)]
+
+    tlo_redis = tlo_redis.merge(
+        tlo_availability_df[["district", "Facility_Level", "item_code", "month", "available_prop"]],
+        on=["district", "Facility_Level", "item_code", "month"],
+        how="left",
+        validate="one_to_one"
+    )
+
+    for redis_scenario_col in list_of_new_scenario_variables:
+        pre = (
+            tlo_redis[redis_scenario_col] < tlo_redis["available_prop"]
+        ).mean()
+        print(f"Pre-fix {redis_scenario_col}: {pre:.3%}")
+
+        # Enforce no-harm
+        tlo_redis[redis_scenario_col] = np.maximum(
+            tlo_redis[redis_scenario_col],
+            tlo_redis["available_prop"]
+        )
+
+        post = (
+            tlo_redis[redis_scenario_col] < tlo_redis["available_prop"]
+        ).mean()
+        print(f"Post-fix {redis_scenario_col}: {post:.3%}")
+
+    # Next create an uplift dataframe
+    modelled_items = tlo_redis["item_code"].unique()
+    # Compute uplift once per scenario, store in a dict
+    uplift_maps = {}
+
+    for scenario_col in list_of_new_scenario_variables:
+        uplift_maps[scenario_col] = (
+            tlo_redis.assign(
+                uplift=lambda x: np.where(
+                    x["available_prop"] > 0,
+                    x[scenario_col] / x["available_prop"],
+                    np.nan
+                )
+            )
+            .groupby(["district", "Facility_Level", "month"], as_index=False)["uplift"]
+            .mean()
+            .rename(columns={"uplift": f"uplift_{scenario_col}"})
+        )
+
+    # Get baseline rows for missing items
+    missing_mask = ~tlo_availability_df["item_code"].isin(modelled_items)
+
+    df_missing = (
+        tlo_availability_df[
+            (tlo_availability_df["Facility_Level"].isin(redis_levels)) &
+            missing_mask
+        ]
+        .copy()
+    )
+
+    # Merge all uplifts horizontally
+    for scenario_col, uplift_df in uplift_maps.items():
+        df_missing = df_missing.merge(
+            uplift_df,
+            on=["district", "Facility_Level", "month"],
+            how="left"
+        )
+        df_missing[scenario_col] = np.minimum(
+            1.0,
+            df_missing["available_prop"] * df_missing[f"uplift_{scenario_col}"]
+        )
+        df_missing.drop(columns=[f"uplift_{scenario_col}"], inplace=True)
+
+    # Concatenate
+    tlo_redis = pd.concat(
+        [tlo_redis, df_missing],
+        ignore_index=True
+    )
+
+    dupes = tlo_redis.duplicated(
+        ["district", "Facility_Level", "item_code", "month"]
+    )
+    assert (dupes.sum() == 0)
+
+    for scenario_col in list_of_new_scenario_variables:
+        assert ((tlo_redis[scenario_col] <
+                 tlo_redis["available_prop"]).sum()) == 0
+
+    tlo_redis = tlo_redis[['Facility_ID', 'month', 'item_code'] + list_of_new_scenario_variables]
+
+    # Interpolate missing values in tlo_redis for all levels except 0
+    # ----------------------------------------------------------------------------------------------------------------------
+    # Generate the dataframe that has the desired size and shape
+    fac_ids = set(mfl.loc[mfl.Facility_Level.isin(redis_levels)].Facility_ID)
+    item_codes = set(tlo_availability_df.item_code.unique())
+    months = range(1, 13)
+
+    # Create a MultiIndex from the product of fac_ids, months, and item_codes
+    index = pd.MultiIndex.from_product([fac_ids, months, item_codes], names=['Facility_ID', 'month', 'item_code'])
+
+    # Initialize a DataFrame with the MultiIndex and columns, filled with NaN
+    full_set = pd.DataFrame(index=index, columns=list_of_new_scenario_variables)
+    full_set = full_set.astype(float)  # Ensure all columns are float type and filled with NaN
+
+    # Insert the data, where it is available.
+    full_set = full_set.combine_first(tlo_redis.set_index(['Facility_ID', 'month', 'item_code'])[list_of_new_scenario_variables])
+
+    # Fill in the blanks with rules for interpolation.
+    facilities_by_level = defaultdict(set)
+    for ix, row in mfl.iterrows():
+        facilities_by_level[row['Facility_Level']].add(row['Facility_ID'])
+
+    items_by_category = defaultdict(set)
+    for ix, row in program_item_mapping.iterrows():
+        items_by_category[row['item_category']].add(row['item_code'])
+
+    def get_other_facilities_of_same_level(_fac_id):
+        """Return a set of facility_id for other facilities that are of the same level as that provided."""
+        for v in facilities_by_level.values():
+            if _fac_id in v:
+                return v - {_fac_id}
+
+    def get_other_items_of_same_category(_item_code):
+        """Return a set of item_codes for other items that are in the same category/program as that provided."""
+        for v in items_by_category.values():
+            if _item_code in v:
+                return v - {_item_code}
+
+    def interpolate_missing_with_mean(_ser):
+        """Return a series in which any values that are null are replaced with the mean of the non-missing."""
+        if pd.isnull(_ser).all():
+            raise ValueError
+        return _ser.fillna(_ser.mean())
+
+    # Create a new dataset that includes the interpolations. (The operation is not done "in place", because the logic
+    # is based on what results are missing before the interpolations in other facilities.)
+    full_set_interpolated = full_set * np.nan
+    full_set_interpolated[list_of_new_scenario_variables] = full_set[list_of_new_scenario_variables]
+
+    for fac in fac_ids:
+        for item in item_codes:
+            for col in list_of_new_scenario_variables:
+                print(f"Now doing: fac={fac}, item={item}, column={col}")
+
+                # Get records of the availability of this item in this facility.
+                _monthly_records = full_set.loc[(fac, slice(None), item), col].copy()
+
+                if pd.notnull(_monthly_records).any():
+                    # If there is at least one record of this item at this facility, then interpolate the missing
+                    # months from the months for which there are data on this item in this facility.
+                    # (If none are missing, this has no effect.)
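+                    # e.g. (hypothetical values) monthly records [0.2, NaN, 0.4] become
+                    # [0.2, 0.3, 0.4]: the NaN is filled with the mean of the observed months.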
+                    _monthly_records = interpolate_missing_with_mean(_monthly_records)
+
+                else:
+                    # If there is no record of this item at this facility, check to see if it's available at other
+                    # facilities of the same level; failing that, check whether other items of the same category
+                    # are available at this facility.
+                    facilities = list(get_other_facilities_of_same_level(fac))
+
+                    other_items = get_other_items_of_same_category(item)
+                    items = list(other_items) if other_items else other_items
+
+                    recorded_at_other_facilities_of_same_level = pd.notnull(
+                        full_set.loc[(facilities, slice(None), item), col]
+                    ).any()
+
+                    if not items:
+                        category_recorded_at_other_facilities_of_same_level = False
+                    else:
+                        # Filter only items that exist in the MultiIndex at this facility
+                        valid_items = [
+                            itm for itm in items
+                            if any((fac, m, itm) in full_set.index for m in months)
+                        ]
+
+                        category_recorded_at_other_facilities_of_same_level = pd.notnull(
+                            full_set.loc[(fac, slice(None), valid_items), col]
+                        ).any()
+
+                    if recorded_at_other_facilities_of_same_level:
+                        # If the item is recorded at other facilities of the same level, use the average
+                        # availability of the item at those facilities.
+                        print("Data for facility ", fac, " extrapolated from other facilities within level - ", facilities)
+                        _monthly_records = interpolate_missing_with_mean(
+                            full_set.loc[(facilities, slice(None), item), col].groupby(level=1).mean()
+                        )
+
+                    elif category_recorded_at_other_facilities_of_same_level and valid_items:
+                        # Otherwise, if other items of the same category are recorded at this facility,
+                        # use their average availability.
+                        print("Data for item ", item, " extrapolated from other items within category - ", valid_items)
+
+                        _monthly_records = interpolate_missing_with_mean(
+                            full_set.loc[(fac, slice(None), valid_items), col].groupby(level=1).mean()
+                        )
+
+                    else:
+                        # If neither rule applies, fall back to 1.0 (note: these columns are availability
+                        # proportions, so this assumes the consumable is fully available).
+                        print("No interpolation rule applied; filling with 1.0")
+                        _monthly_records = _monthly_records.fillna(1.0)
+
+                # Insert values (including corrections) into the resulting dataset.
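+                # (.values is used on the right-hand side so that the twelve interpolated monthly values
+                # are written positionally into the (fac, month, item) slice, regardless of the index
+                # carried by the interpolated Series.)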
+                full_set_interpolated.loc[(fac, slice(None), item), col] = _monthly_records.values
+                # temporary check
+                assert full_set_interpolated.loc[(fac, slice(None), item), col].mean() >= 0
+
+    # Check that there are no missing values
+    assert not pd.isnull(full_set_interpolated).any().any()
+
+    full_set_interpolated = full_set_interpolated.reset_index()
+
+    # Add to this dataset the original availability for all the other levels of care
+    base_other_levels = tlo_availability_df[
+        ~tlo_availability_df["Facility_Level"].isin(redis_levels)
+    ].copy()
+    for col in list_of_new_scenario_variables:
+        base_other_levels[col] = base_other_levels["available_prop"]
+    base_other_levels = base_other_levels[['Facility_ID', 'month', 'item_code'] + list_of_new_scenario_variables]
+    tlo_redis_final = pd.concat(
+        [full_set_interpolated, base_other_levels],
+        ignore_index=True,
+    )
+    #tlo_redis_final.to_csv(outputfilepath / 'ResourceFile_consumable_availability_after_redistribution.csv', index = False)
+
+    # Verify that the shape of this dataframe is identical to the original availability dataframe
+    assert sorted(set(tlo_redis_final.Facility_ID)) == sorted(set(pd.unique(tlo_availability_df.Facility_ID)))
+    assert sorted(set(tlo_redis_final.month)) == sorted(set(pd.unique(tlo_availability_df.month)))
+    assert sorted(set(tlo_redis_final.item_code)) == sorted(set(pd.unique(tlo_availability_df.item_code)))
+    assert len(tlo_redis_final) == len(tlo_availability_df)
+
+    tlo_redis_final = tlo_availability_df.merge(tlo_redis_final, on = ['Facility_ID', 'item_code', 'month'],
+                                                how = 'left', validate = "1:1")
+
+    return tlo_redis_final
+
+# Plot final availability
+def plot_availability_heatmap(
+    df: pd.DataFrame,
+    y_var: str = None,
+    scenario_cols: list[str] = None,
+    filter_dict: dict = None,
+    cmap: str = "RdYlGn",
+    vmin: float = 0,
+    vmax: float = 1,
+    figsize: tuple = (10, 8),
+    annot: bool = True,
+    rename_scenarios_dict: dict = None,
+    title: str = 'Availability across scenarios',
+    figname: Path = None,
+):
+    """
+    Flexible heatmap generator that supports:
+    1. Filters to subset data
+    2. Multiple scenario columns (wide format, like available_prop_scenario1-16)
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Input dataframe.
+    y_var : str, optional
+        Column name for y-axis.
+    scenario_cols : list of str, optional
+        List of scenario columns (e.g., [f"available_prop_scenario{i}" for i in range(1,17)]).
+    filter_dict : dict, optional
+        Filters to apply before plotting, e.g. {"Facility_Level": "1a"}.
+    cmap : str
+        Colormap.
+    vmin, vmax : float
+        Color scale range.
+    figsize : tuple
+        Figure size.
+    annot : bool
+        Annotate cells with values.
+    rename_scenarios_dict : dict, optional
+        Rename columns (for pretty scenario names, etc.)
+    title : str, optional
+        Title for the plot.
+    figname : str or Path, optional
+        Save name for PNG; if None, displays interactively.
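+
+    Example (illustrative names, mirroring the commented-out usage at the end of this file):
+        plot_availability_heatmap(
+            df=df_for_plots,
+            y_var="item_category_clean",
+            scenario_cols=["available_prop", "available_prop_scenario16"],
+            filter_dict={"Facility_Level": ["1a"]},
+            rename_scenarios_dict={"available_prop": "Actual"},
+            figname="availability_1a.png",
+        )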
+    """
+    if filter_dict:
+        for k, v in filter_dict.items():
+            if isinstance(v, (list, tuple, set)):
+                df = df[df[k].isin(v)]
+            else:
+                df = df[df[k] == v]
+
+    aggregated_df = df.groupby([y_var])[scenario_cols].mean().reset_index()
+    heatmap_data = aggregated_df.set_index(y_var)
+
+    # Calculate the aggregate row (true overall mean)
+    aggregate_col = df[scenario_cols].mean()
+    if rename_scenarios_dict:
+        aggregate_col = aggregate_col.rename(index=rename_scenarios_dict)
+
+    # Apply column renames
+    if rename_scenarios_dict:
+        heatmap_data = heatmap_data.rename(columns=rename_scenarios_dict)
+    heatmap_data.loc['Average'] = aggregate_col
+
+    # Generate the heatmap
+    sns.set(font_scale=1)
+    plt.figure(figsize=figsize)
+    ax = sns.heatmap(
+        heatmap_data,
+        annot=annot,
+        cmap=cmap,
+        vmin=vmin,
+        vmax=vmax,
+        cbar_kws={'label': 'Proportion of days on which consumable is available'}
+    )
+
+    # Customize the plot
+    plt.title(title)
+    plt.xlabel('Scenarios')
+    plt.ylabel(y_var)
+    plt.xticks(rotation=90, fontsize=12)
+    plt.yticks(rotation=0, fontsize=11)
+    ax.set_xticklabels(
+        [textwrap.fill(label.get_text(), 20) for label in ax.get_xticklabels()],
+        rotation=90, ha='center'
+    )
+    ax.set_yticklabels(
+        [textwrap.fill(label.get_text(), 25) for label in ax.get_yticklabels()],
+        rotation=0, va='center'
+    )
+
+    if figname:
+        plt.savefig(outputfilepath / figname, dpi=500, bbox_inches='tight')
+        plt.close()
+    else:
+        plt.show()
+        plt.close()
+
+'''
+# Clean item category
+clean_category_names = {'cancer': 'Cancer', 'cardiometabolicdisorders': 'Cardiometabolic Disorders',
+                        'contraception': 'Contraception', 'general': 'General', 'hiv': 'HIV', 'malaria': 'Malaria',
+                        'ncds': 'Non-communicable Diseases', 'neonatal_health': 'Neonatal Health',
+                        'other_childhood_illnesses': 'Other Childhood Illnesses', 'reproductive_health': 'Reproductive Health',
+                        'road_traffic_injuries': 'Road Traffic Injuries', 'tb': 'Tuberculosis',
+                        'undernutrition': 'Undernutrition', 'epi': 'Expanded programme on immunization'}
+df_for_plots['item_category_clean'] = df_for_plots['item_category'].map(clean_category_names)
+
+scenario_cols = ['available_prop', 'available_prop_scenario1', 'available_prop_scenario2', 'available_prop_scenario3',
+                 'available_prop_scenario6', 'available_prop_scenario7', 'available_prop_scenario8',
+                 'available_prop_scenario16', 'available_prop_scenario17', 'available_prop_scenario18', 'available_prop_scenario19']
+rename_dict = {'available_prop': 'Actual',
+               'available_prop_scenario1': 'Non-therapeutic consumables',
+               'available_prop_scenario2': 'Vital medicines',
+               'available_prop_scenario3': 'Pharmacist-managed',
+               'available_prop_scenario6': '75th percentile facility',
+               'available_prop_scenario7': '90th percentile facility',
+               'available_prop_scenario8': 'Best facility',
+               'available_prop_scenario16': 'District Pooling',
+               'available_prop_scenario17': 'Cluster Pooling',
+               'available_prop_scenario18': 'Pairwise exchange (60-min radius)',
+               'available_prop_scenario19': 'Pairwise exchange (30-min radius)'}
+scenario_names = list(rename_dict.values())
+
+# Plot heatmap for level 1a
+plot_availability_heatmap(
+    df=df_for_plots,
+    scenario_cols=scenario_cols,
+    y_var="item_category_clean",
+    filter_dict={"Facility_Level": ["1a"]},
+    title="Availability across Scenarios - Level 1a",
+    rename_scenarios_dict = rename_dict,
+    cmap = "RdYlGn",
+    figname = 'availability_1a.png'
+)
+
+# Plot heatmap for level 1b
+plot_availability_heatmap(
+    df=df_for_plots,
+    scenario_cols=scenario_cols,
+    
y_var="item_category_clean", + filter_dict={"Facility_Level": ["1b"]}, + title="Availability across Scenarios — Level 1b", + rename_scenarios_dict = rename_dict, + cmap = "RdYlGn", + figname = 'availability_1b.png' +) +''' diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index 9ca0554650..32281bbe96 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -45,7 +45,7 @@ import numpy as np import pandas as pd import seaborn as sns -from plotnine import aes, element_text, geom_bar, ggplot, labs, theme, ylim # for ggplots from R +#from plotnine import aes, element_text, geom_bar, ggplot, labs, theme, ylim # for ggplots from R from tlo.methods.consumables import check_format_of_consumables_file @@ -61,637 +61,659 @@ resourcefilepath = Path("./resources") path_for_new_resourcefiles = resourcefilepath / "healthsystem/consumables" -# 1. Import and clean data files -#********************************** -# 1.1 Import TLO model availability data -#------------------------------------------------------ -tlo_availability_df = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv") -# Drop any scenario data previously included in the resourcefile -tlo_availability_df = tlo_availability_df[['Facility_ID', 'month','item_code', 'available_prop']] - -# Import item_category -program_item_mapping = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv')[['Item_Code', 'item_category']] -program_item_mapping = program_item_mapping.rename(columns ={'Item_Code': 'item_code'})[program_item_mapping.item_category.notna()] - -# 1.1.1 Attach district, facility level and item_category to this dataset -#---------------------------------------------------------------- -# Get TLO Facility_ID for each district and facility level -mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") -districts = set(pd.read_csv(resourcefilepath / 'demography' / 'ResourceFile_Population_2010.csv')['District']) -fac_levels = {'0', '1a', '1b', '2', '3', '4'} -tlo_availability_df = tlo_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], - on = ['Facility_ID'], how='left') - -tlo_availability_df = tlo_availability_df.merge(program_item_mapping, - on = ['item_code'], how='left') - -# 1.2 Import scenario data -#------------------------------------------------------ -scenario_availability_df = pd.read_csv(outputfilepath / "regression_analysis/predictions/predicted_consumable_availability_regression_scenarios.csv") -scenario_availability_df = scenario_availability_df.drop(['Unnamed: 0'], axis=1) -scenario_availability_df = scenario_availability_df.rename({'item': 'item_hhfa'}, axis=1) - -# Prepare scenario data to be merged to TLO model availability based on TLO model features -# 1.2.1 Level of care -#------------------------------------------------------ -scenario_availability_df['fac_type'] = 
scenario_availability_df['fac_type_original'].str.replace("Facility_level_", "") - -# 1.2.2 District -#------------------------------------------------------ -# Do some mapping to make the Districts in the scenario file line-up with the definition of Districts in the model -rename_and_collapse_to_model_districts = { - 'Mzimba South': 'Mzimba', - 'Mzimba North': 'Mzimba', -} -scenario_availability_df = scenario_availability_df.rename({'district': 'district_original'}, axis=1) -scenario_availability_df['District'] = scenario_availability_df['district_original'].replace(rename_and_collapse_to_model_districts) - -# Cities to get same results as their respective regions -copy_source_to_destination = { - 'Mzimba': 'Mzuzu City', - 'Lilongwe': 'Lilongwe City', - 'Zomba': 'Zomba City', - 'Blantyre': 'Blantyre City', - 'Nkhata Bay': 'Likoma' # based on anecdotal evidence, assume that they experience the same change in avaiability as a result of interventions based on regression results -} -for source, destination in copy_source_to_destination.items(): - new_rows = scenario_availability_df.loc[scenario_availability_df.District == source].copy() # standardised district names - new_rows.District = destination - scenario_availability_df = pd.concat([scenario_availability_df, new_rows], ignore_index = True) - -assert sorted(set(districts)) == sorted(set(pd.unique(scenario_availability_df.District))) - -# 1.2.3 Facility_ID -# #------------------------------------------------------ -# Merge-in facility_id -scenario_availability_df = scenario_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], - left_on=['District', 'fac_type'], - right_on=['District', 'Facility_Level'], how='left', indicator=True) -scenario_availability_df = scenario_availability_df.rename({'_merge': 'merge_facid'}, axis=1) - -# Extract list of District X Facility Level combinations for which there is no HHFA data -df_to_check_prediction_completeness = scenario_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], - left_on=['District', 'Facility_Level'], - right_on=['District', 'Facility_Level'], how='right', indicator=True) -cond_no_1b = (df_to_check_prediction_completeness['Facility_Level'].isin(['1b'])) & (df_to_check_prediction_completeness['_merge'] == 'right_only') -cond_no_1a = (df_to_check_prediction_completeness['Facility_Level'].isin(['1a'])) & (df_to_check_prediction_completeness['_merge'] == 'right_only') -districts_with_no_scenario_data_for_1b = df_to_check_prediction_completeness[cond_no_1b]['District'].unique() -districts_with_no_scenario_data_for_1a = df_to_check_prediction_completeness[cond_no_1a]['District'].unique() -districts_with_no_scenario_data_for_1b_only = np.setdiff1d(districts_with_no_scenario_data_for_1b, districts_with_no_scenario_data_for_1a) - -# According to HHFA data, Balaka, Machinga, Mwanza, Ntchisi and Salima do not have level 1b facilities -# Likoma was not included in the regression because of the limited variation within the district - only 4 facilities - we have assumed that the change of consumable -# availability in Likoma is equal to that predicted for Nkhata Bay - -# 1.2.4 Program -#------------------------------------------------------ -scenario_availability_df.loc[scenario_availability_df.program_plot == 'infection_prev', 'program_plot'] = 'general' # there is no separate infection_prevention category in the TLO availability data -map_model_programs_to_hhfa = { - 'contraception': 'contraception', - 'general': 'general', - 'reproductive_health': 
'obs&newb', - 'road_traffic_injuries': 'surgical', - 'epi': 'epi', - 'neonatal_health': 'obs&newb', - 'other_childhood_illnesses': 'alri', - 'malaria': 'malaria', - 'tb': 'tb', - 'hiv': 'hiv', - 'undernutrition': 'child', - 'ncds': 'ncds', - 'cardiometabolicdisorders': 'ncds', - 'cancer': 'ncds', -} -# Reverse the map_model_programs_to_hhfa dictionary -hhfa_to_model_programs = {v: k for k, v in map_model_programs_to_hhfa.items()} - -scenario_availability_df['category_tlo'] = scenario_availability_df['program_plot'].replace(hhfa_to_model_programs) # TODO this might not be relevant - -# 1.2.5 Consumable/Item code and Category -#------------------------------------------------------ -# Load TLO - HHFA consumable name crosswalk -consumable_crosswalk_df = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_consumables_matched.csv', encoding='ISO-8859-1')[['item_code', 'consumable_name_tlo', -'item_hhfa_for_scenario_generation', 'hhfa_mapping_rationale_for_scenario_generation']] - -# Keep only item_codes in the availability dataframe -consumable_crosswalk_df = consumable_crosswalk_df.merge(tlo_availability_df[['item_code']], how = 'right', on = 'item_code') -# TODO is module_name used? -# TODO add new consumables Rifapentine to this? - -# Now merge in TLO item codes -scenario_availability_df = scenario_availability_df.reset_index(drop = True) -scenario_availability_df = scenario_availability_df.merge(consumable_crosswalk_df[['item_code', 'item_hhfa_for_scenario_generation', 'hhfa_mapping_rationale_for_scenario_generation', 'consumable_name_tlo']], - left_on = ['item_hhfa'], right_on = ['item_hhfa_for_scenario_generation'], how='right', indicator=True, validate = "m:m") -scenario_availability_df = scenario_availability_df.drop_duplicates(['Facility_ID', 'item_code']) -scenario_availability_df = scenario_availability_df.rename({'_merge': 'merge_itemcode'}, axis=1) -print("Number of item codes from the TLO model for which no match was found in the regression-based scenario data = ", scenario_availability_df.merge_itemcode.value_counts()[1]) - -# Before merging the above dataframe with tlo_availability_df, and apply a general interpolation rule to fill any gaps, -# we need to make sure that any specific interpolation rules are applied to the scenario dataframe - -# Further a row needs to be added for 1b level under Balaka, Machinga, Mwanza, Ntchisi and Salima -print("Number of unique facility IDs: \n - TLO consumable data = ", tlo_availability_df.Facility_ID.nunique(), - "\n - Scenario based on regression = ", scenario_availability_df.Facility_ID.nunique(), - "\nNumber of unique item codes: \n - TLO consumable availability data = ", tlo_availability_df.item_code.nunique(), - "\n - TLO consumable availability repository = ", consumable_crosswalk_df.item_code.nunique(), - "\n - Scenario based on regression = ", scenario_availability_df.item_code.nunique()) - -# Extract list of TLO consumables which weren't matched with the availability prediction dataframe -items_not_matched = scenario_availability_df['merge_itemcode'] == 'right_only' - -# Get average availability_change_prop value by facility_ID and category_tlo -scenario_availability_df = scenario_availability_df.merge(program_item_mapping, - on = ['item_code'], validate = "m:1", - how = "left") - -# 1.3 Initial interpolation -#------------------------------------------------------ -# 1.3.1 Items not relevant to the regression analysis -items_not_relevant_to_regression = (items_not_matched) & 
(scenario_availability_df['hhfa_mapping_rationale_for_scenario_generation'] == 'not relevant to logistic regression analysis') -# For category 3, replace availability_change_prop with 1, since we assume that the system-level intervention does not change availability -list_of_scenario_variables = ['change_proportion_scenario1', 'change_proportion_scenario2', - 'change_proportion_scenario3', 'change_proportion_scenario4', 'change_proportion_scenario5'] -for var in list_of_scenario_variables: - scenario_availability_df.loc[items_not_relevant_to_regression,var] = 1 - -# 1.3.2 For level 1b for the districts where this level was not present in the regression analysis/HHFA dataset, assume -# that the change is equal to the product of the (ratio of average change across districts for level 1b to -# average change across districts for level 1a) and change for each item_code for level 1a for that district -#------------------------------------------------------------------------------------------------------------ -average_change_across_districts = scenario_availability_df.groupby(['Facility_Level','item_code'])[list_of_scenario_variables].mean().reset_index() - -# Generate the ratio of the proportional changes to availability of level 1b to 1a in the districts for which level 1b data is available -new_colnames_1a = {col: col + '_1a' if col in list_of_scenario_variables else col for col in average_change_across_districts.columns} -new_colnames_1b = {col: col + '_1b' if col in list_of_scenario_variables else col for col in average_change_across_districts.columns} -average_change_across_districts_for_1a = average_change_across_districts[average_change_across_districts.Facility_Level == "1a"].rename(new_colnames_1a, axis = 1).drop('Facility_Level', axis = 1) -average_change_across_districts_for_1b = average_change_across_districts[average_change_across_districts.Facility_Level == "1b"].rename(new_colnames_1b, axis = 1).drop('Facility_Level', axis = 1) -ratio_of_change_across_districts_1b_to_1a = average_change_across_districts_for_1a.merge(average_change_across_districts_for_1b, - how = "left", on = ['item_code']) -for var in list_of_scenario_variables: - var_ratio = 'ratio_' + var - var_1a = var + '_1a' - var_1b = var + '_1b' - ratio_of_change_across_districts_1b_to_1a[var_ratio] = (ratio_of_change_across_districts_1b_to_1a[var_1b])/(ratio_of_change_across_districts_1b_to_1a[var_1a]) -ratio_of_change_across_districts_1b_to_1a.reset_index(drop = True) -# TODO check if this ratio should be of the proportions minus 1 - -# For districts with no level 1b data in the HHFA, use the ratio of change in level 1b facilities to level 1a facilities to generate the expected proportional change in availability -# for level 1b facilities in those districts -scenario_availability_df = scenario_availability_df.reset_index(drop = True) -cond_districts_with_1b_missing = scenario_availability_df.District.isin(districts_with_no_scenario_data_for_1b_only) -cond_1a = scenario_availability_df.Facility_Level == '1a' -cond_1b = scenario_availability_df.Facility_Level == '1b' -df_1a = scenario_availability_df[cond_districts_with_1b_missing & cond_1a] - -ratio_vars = ['ratio_' + item for item in list_of_scenario_variables] # create columns to represent the ratio of change in 1b facilities to level 1a facilities - -item_var = ['item_code'] - -# First merge the dataframe with changes at level 1a with the ratio of 1b to 1a -df_missing_1b_imputed = df_1a.merge(ratio_of_change_across_districts_1b_to_1a[item_var + ratio_vars], - on = 
item_var, - how = 'left', validate = "m:1") - -# Then multiply the ratio of 1b to 1a with the change at level 1a to get the expected change at level 1b -for var in list_of_scenario_variables: - df_missing_1b_imputed[var] = df_missing_1b_imputed[var] * df_missing_1b_imputed['ratio_' + var] -# Update columns so the dataframe in fact refers to level 1b facilities -df_missing_1b_imputed.Facility_Level = '1b' # Update facility level to 1 -# Replace Facility_IDs -df_missing_1b_imputed = df_missing_1b_imputed.drop('Facility_ID', axis = 1).merge(mfl[['District', 'Facility_Level', 'Facility_ID']], - on =['District', 'Facility_Level'], - how = 'left') -# Append the new imputed level 1b dataframe to the original dataframe -df_without_districts_with_no_1b_facilities = scenario_availability_df[~(cond_districts_with_1b_missing & cond_1b)] -scenario_availability_df = pd.concat([df_without_districts_with_no_1b_facilities, df_missing_1b_imputed], ignore_index = True) - -# 2. Merge TLO model availability data with scenario data using crosswalk -#************************************************************************* -# 2.1 Merge the two datasets -#------------------------------------------------------ -id_variables = ['item_code','Facility_ID'] - -full_scenario_df = tlo_availability_df.merge(scenario_availability_df[id_variables + list_of_scenario_variables], - how='left', on=['Facility_ID', 'item_code'], indicator = True) -full_scenario_df = full_scenario_df.rename({'_merge': 'merge_scenario'}, axis=1) -full_scenario_df = full_scenario_df.drop_duplicates(['Facility_ID', 'item_code', 'month']) - -# Check that level 1b values are currently imputed -#full_scenario_df[full_scenario_df.District == 'Balaka'].groupby(['District', 'Facility_Level'])['change_proportion_scenario1'].mean() - -# 2.2 Further imputation -#------------------------------------------------------ -# 2.2.1 For all levels other than 1a and 1b, there will be no change in consumable availability -#------------------------------------------------------------------------------------------------------------ -fac_levels_not_relevant_to_regression = full_scenario_df.Facility_Level.isin(['0', '2', '3', '4']) - -for var in list_of_scenario_variables: - full_scenario_df.loc[fac_levels_not_relevant_to_regression, var] = 1 - -# 2.3 Final checks -#------------------------------------------------------ -# 2.3.1 Check that the merged dataframe has the same number of unique items, facility IDs, and total -# number of rows as the original small availability resource file -#--------------------------------------------------------------------------------------------------------- -assert(full_scenario_df.item_code.nunique() == tlo_availability_df.item_code.nunique()) # the number of items in the new dataframe is the same at the original availability dataframe -assert(full_scenario_df.Facility_ID.nunique() == tlo_availability_df.Facility_ID.nunique()) # the number of Facility IDs in the new dataframe is the same at the original availability dataframe -assert(len(full_scenario_df) == len(tlo_availability_df)) # the number of rows in the new dataframe is the same at the original availability dataframe - -# 2.3.2 Construct dataset that conforms to the principles expected by the simulation: i.e. that there is an entry for every -# facility_id and for every month for every item_code. 
-#----------------------------------------------------------------------------------------------------------------------- -# Generate the dataframe that has the desired size and shape -fac_ids = set(mfl.loc[mfl.Facility_Level != '5'].Facility_ID) -item_codes = set(tlo_availability_df.item_code.unique()) -months = range(1, 13) -all_availability_columns = ['available_prop'] + list_of_scenario_variables - -# Create a MultiIndex from the product of fac_ids, months, and item_codes -index = pd.MultiIndex.from_product([fac_ids, months, item_codes], names=['Facility_ID', 'month', 'item_code']) - -# Initialize a DataFrame with the MultiIndex and columns, filled with NaN -full_set = pd.DataFrame(index=index, columns=all_availability_columns) -full_set = full_set.astype(float) # Ensure all columns are float type and filled with NaN - -# Insert the data, where it is available. -full_set = full_set.combine_first(full_scenario_df.set_index(['Facility_ID', 'month', 'item_code'])[all_availability_columns]) - -# Fill in the blanks with rules for interpolation. -facilities_by_level = defaultdict(set) -for ix, row in mfl.iterrows(): - facilities_by_level[row['Facility_Level']].add(row['Facility_ID']) - -items_by_category = defaultdict(set) -for ix, row in program_item_mapping.iterrows(): - items_by_category[row['item_category']].add(row['item_code']) - -def get_other_facilities_of_same_level(_fac_id): - """Return a set of facility_id for other facilities that are of the same level as that provided.""" - for v in facilities_by_level.values(): - if _fac_id in v: - return v - {_fac_id} - -def get_other_items_of_same_category(_item_code): - """Return a set of item_codes for other items that are in the same category/program as that provided.""" - for v in items_by_category.values(): - if _item_code in v: - return v - {_item_code} -def interpolate_missing_with_mean(_ser): - """Return a series in which any values that are null are replaced with the mean of the non-missing.""" - if pd.isnull(_ser).all(): - raise ValueError - return _ser.fillna(_ser.mean()) - -# Create new dataset that include the interpolations (The operation is not done "in place", because the logic is based -# on what results are missing before the interpolations in other facilities). -full_set_interpolated = full_set * np.nan -full_set_interpolated.available_prop = full_set.available_prop - -for fac in fac_ids: - for item in item_codes: - for col in list_of_scenario_variables: - print(f"Now doing: fac={fac}, item={item}, column={col}") - - # Get records of the availability of this item in this facility. - _monthly_records = full_set.loc[(fac, slice(None), item), col].copy() - - if pd.notnull(_monthly_records).any(): - # If there is at least one record of this item at this facility, then interpolate the missing months from - # the months for there are data on this item in this facility. (If none are missing, this has no effect). 
- _monthly_records = interpolate_missing_with_mean(_monthly_records) - - else: - # If there is no record of this item at this facility, check to see if it's available at other facilities - # of the same level - # Or if there is no record of item at other facilities at this level, check to see if other items of this category - # are available at this facility level - facilities = list(get_other_facilities_of_same_level(fac)) - - other_items = get_other_items_of_same_category(item) - items = list(other_items) if other_items else other_items - - recorded_at_other_facilities_of_same_level = pd.notnull( - full_set.loc[(facilities, slice(None), item), col] - ).any() - - if not items: - category_recorded_at_other_facilities_of_same_level = False - else: - # Filter only items that exist in the MultiIndex at this facility - valid_items = [ - itm for itm in items - if any((fac, m, itm) in full_set.index for m in months) - ] - - category_recorded_at_other_facilities_of_same_level = pd.notnull( - full_set.loc[(fac, slice(None), valid_items), col] - ).any() +def generate_alternative_availability_scenarios(tlo_availability_df: pd.DataFrame = None) -> pd.DataFrame: + # 1. Import and clean data files + #********************************** + # 1.1 Import TLO model availability data + # ------------------------------------------------------ + # Drop any scenario data previously included in the resourcefile + tlo_availability_df = tlo_availability_df[['Facility_ID', 'month','item_code', 'available_prop']] + + # Import item_category + program_item_mapping = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv')[['Item_Code', 'item_category']] + program_item_mapping = program_item_mapping.rename(columns ={'Item_Code': 'item_code'})[program_item_mapping.item_category.notna()] + + # 1.1.1 Attach district, facility level and item_category to this dataset + #---------------------------------------------------------------- + # Get TLO Facility_ID for each district and facility level + mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") + districts = set(pd.read_csv(resourcefilepath / 'demography' / 'ResourceFile_Population_2010.csv')['District']) + fac_levels = {'0', '1a', '1b', '2', '3', '4'} + tlo_availability_df = tlo_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], + on = ['Facility_ID'], how='left') + + tlo_availability_df = tlo_availability_df.merge(program_item_mapping, + on = ['item_code'], how='left') + + # 1.2 Import scenario data + #------------------------------------------------------ + scenario_availability_df = pd.read_csv(outputfilepath / "regression_analysis/predictions/predicted_consumable_availability_regression_scenarios.csv") + scenario_availability_df = scenario_availability_df.drop(['Unnamed: 0'], axis=1) + scenario_availability_df = scenario_availability_df.rename({'item': 'item_hhfa'}, axis=1) + + # Prepare scenario data to be merged to TLO model availability based on TLO model features + # 1.2.1 Level of care + #------------------------------------------------------ + scenario_availability_df['fac_type'] = scenario_availability_df['fac_type_original'].str.replace("Facility_level_", "") + + # 1.2.2 District + #------------------------------------------------------ + # Do some mapping to make the Districts in the scenario file line-up with the definition of Districts in the model + rename_and_collapse_to_model_districts = { + 'Mzimba South': 'Mzimba', + 'Mzimba North': 
'Mzimba',
+    }
+    scenario_availability_df = scenario_availability_df.rename({'district': 'district_original'}, axis=1)
+    scenario_availability_df['District'] = scenario_availability_df['district_original'].replace(rename_and_collapse_to_model_districts)
+
+    # Copy rows so that cities get the same results as their respective districts
+    copy_source_to_destination = {
+        'Mzimba': 'Mzuzu City',
+        'Lilongwe': 'Lilongwe City',
+        'Zomba': 'Zomba City',
+        'Blantyre': 'Blantyre City',
+        'Nkhata Bay': 'Likoma' # based on anecdotal evidence, assume that Likoma experiences the same change in availability as Nkhata Bay as a result of the interventions in the regression results
+    }
+    for source, destination in copy_source_to_destination.items():
+        new_rows = scenario_availability_df.loc[scenario_availability_df.District == source].copy() # district names have been standardised above
+        new_rows.District = destination
+        scenario_availability_df = pd.concat([scenario_availability_df, new_rows], ignore_index = True)
+
+    assert sorted(set(districts)) == sorted(set(pd.unique(scenario_availability_df.District)))
+
+    # 1.2.3 Facility_ID
+    #------------------------------------------------------
+    # Merge-in facility_id
+    scenario_availability_df = scenario_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']],
+                                                              left_on=['District', 'fac_type'],
+                                                              right_on=['District', 'Facility_Level'], how='left', indicator=True)
+    scenario_availability_df = scenario_availability_df.rename({'_merge': 'merge_facid'}, axis=1)
+
+    # Extract list of District X Facility Level combinations for which there is no HHFA data
+    df_to_check_prediction_completeness = scenario_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']],
+                                                                         left_on=['District', 'Facility_Level'],
+                                                                         right_on=['District', 'Facility_Level'], how='right', indicator=True)
+    cond_no_1b = (df_to_check_prediction_completeness['Facility_Level'].isin(['1b'])) & (df_to_check_prediction_completeness['_merge'] == 'right_only')
+    cond_no_1a = (df_to_check_prediction_completeness['Facility_Level'].isin(['1a'])) & (df_to_check_prediction_completeness['_merge'] == 'right_only')
+    districts_with_no_scenario_data_for_1b = df_to_check_prediction_completeness[cond_no_1b]['District'].unique()
+    districts_with_no_scenario_data_for_1a = df_to_check_prediction_completeness[cond_no_1a]['District'].unique()
+    districts_with_no_scenario_data_for_1b_only = np.setdiff1d(districts_with_no_scenario_data_for_1b, districts_with_no_scenario_data_for_1a)
+
+    # According to HHFA data, Balaka, Machinga, Mwanza, Ntchisi and Salima do not have level 1b facilities.
+    # Likoma was not included in the regression because of the limited variation within the district (only 4
+    # facilities); we have assumed that the change in consumable availability in Likoma is equal to that
+    # predicted for Nkhata Bay.
+
+    # 1.2.4 Program
+    #------------------------------------------------------
+    scenario_availability_df.loc[scenario_availability_df.program_plot == 'infection_prev', 'program_plot'] = 'general' # there is no separate infection_prevention category in the TLO availability data
+    map_model_programs_to_hhfa = {
+        'contraception': 'contraception',
+        'general': 'general',
+        'reproductive_health': 'obs&newb',
+        'road_traffic_injuries': 'surgical',
+        'epi': 'epi',
+        'neonatal_health': 'obs&newb',
+        'other_childhood_illnesses': 'alri',
+        'malaria': 'malaria',
+        'tb': 'tb',
+        'hiv': 'hiv',
+        'undernutrition': 'child',
+        'ncds': 'ncds',
+        'cardiometabolicdisorders': 'ncds',
+        'cancer': 'ncds',
+    }
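+    # Note on the reverse mapping below (illustrative, not new logic): map_model_programs_to_hhfa
+    # is many-to-one, so the dict comprehension keeps only the last model program listed for each
+    # HHFA category. A minimal sketch of that behaviour on a toy dict:
+    #   >>> {v: k for k, v in {'reproductive_health': 'obs&newb', 'neonatal_health': 'obs&newb'}.items()}
+    #   {'obs&newb': 'neonatal_health'}
+    # so the reverse map is a broad relabelling aid rather than an exact inverse.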
+    # Reverse the map_model_programs_to_hhfa dictionary
+    hhfa_to_model_programs = {v: k for k, v in map_model_programs_to_hhfa.items()}
+
+    scenario_availability_df['category_tlo'] = scenario_availability_df['program_plot'].replace(hhfa_to_model_programs) # TODO this might not be relevant
+
+    # 1.2.5 Consumable/Item code and Category
+    #------------------------------------------------------
+    # Load TLO - HHFA consumable name crosswalk
+    consumable_crosswalk_df = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_consumables_matched.csv', encoding='ISO-8859-1')[['item_code', 'consumable_name_tlo',
+                                          'item_hhfa_for_scenario_generation', 'hhfa_mapping_rationale_for_scenario_generation']]
+
+    # Keep only item_codes in the availability dataframe
+    consumable_crosswalk_df = consumable_crosswalk_df.merge(tlo_availability_df[['item_code']], how = 'right', on = 'item_code')
+    # TODO is module_name used?
+    # TODO add new consumables Rifapentine to this?
+
+    # Now merge in TLO item codes
+    scenario_availability_df = scenario_availability_df.reset_index(drop = True)
+    scenario_availability_df = scenario_availability_df.merge(consumable_crosswalk_df[['item_code', 'item_hhfa_for_scenario_generation', 'hhfa_mapping_rationale_for_scenario_generation', 'consumable_name_tlo']],
+                                                              left_on = ['item_hhfa'], right_on = ['item_hhfa_for_scenario_generation'], how='right', indicator=True, validate = "m:m")
+    scenario_availability_df = scenario_availability_df.drop_duplicates(['Facility_ID', 'item_code'])
+    scenario_availability_df = scenario_availability_df.rename({'_merge': 'merge_itemcode'}, axis=1)
+    print("Number of item codes from the TLO model for which no match was found in the regression-based scenario data = ",
+          scenario_availability_df.merge_itemcode.value_counts()['right_only'])
+
+    # Before merging the above dataframe with tlo_availability_df and applying a general interpolation rule to
+    # fill any gaps, we need to make sure that any item-specific interpolation rules are applied to the scenario dataframe.
+
+    # Further, rows need to be added for level 1b under Balaka, Machinga, Mwanza, Ntchisi and Salima
+    print("Number of unique facility IDs: \n - TLO consumable data = ", tlo_availability_df.Facility_ID.nunique(),
+          "\n - Scenario based on regression = ", scenario_availability_df.Facility_ID.nunique(),
+          "\nNumber of unique item codes: \n - TLO consumable availability data = ", tlo_availability_df.item_code.nunique(),
+          "\n - TLO consumable availability repository = ", consumable_crosswalk_df.item_code.nunique(),
+          "\n - Scenario based on regression = ", scenario_availability_df.item_code.nunique())
+
+    # Extract list of TLO consumables which weren't matched with the availability prediction dataframe
+    items_not_matched = scenario_availability_df['merge_itemcode'] == 'right_only'
+
+    # Attach item_category to the scenario dataframe
+    scenario_availability_df = scenario_availability_df.merge(program_item_mapping,
+                                                              on = ['item_code'], validate = "m:1",
+                                                              how = "left")
+
+    # 1.3 Initial interpolation
+    #------------------------------------------------------
+    # 1.3.1 Items not relevant to the regression analysis
+    items_not_relevant_to_regression = (items_not_matched) & (scenario_availability_df['hhfa_mapping_rationale_for_scenario_generation'] == 'not relevant to logistic regression analysis')
+    # For these items, replace the change proportion with 1, since we assume that the system-level intervention does not change their availability
+    list_of_scenario_variables = ['change_proportion_scenario1', 'change_proportion_scenario2',
+                                  'change_proportion_scenario3', 'change_proportion_scenario4', 'change_proportion_scenario5']
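+    # Illustrative note (hypothetical numbers): each change_proportion_scenario* value is a
+    # multiplicative factor that is later applied to baseline availability and capped at 1
+    # (see section 3 below), e.g. available_prop = 0.60 with a factor of 1.25 gives
+    # min(0.60 * 1.25, 1) = 0.75; a factor of 1, as assigned below, leaves availability unchanged.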
+    for var in list_of_scenario_variables:
+        scenario_availability_df.loc[items_not_relevant_to_regression, var] = 1
+
+    # 1.3.2 For level 1b in districts where this level was not present in the regression analysis/HHFA dataset,
+    # assume that the change is equal to the product of (i) the ratio of the average change across districts for
+    # level 1b to the average change across districts for level 1a, and (ii) the change for each item_code at
+    # level 1a in that district
+    #------------------------------------------------------------------------------------------------------------
+    average_change_across_districts = scenario_availability_df.groupby(['Facility_Level','item_code'])[list_of_scenario_variables].mean().reset_index()
+
+    # Generate the ratio of the proportional changes to availability of level 1b to 1a in the districts for which level 1b data is available
+    new_colnames_1a = {col: col + '_1a' if col in list_of_scenario_variables else col for col in average_change_across_districts.columns}
+    new_colnames_1b = {col: col + '_1b' if col in list_of_scenario_variables else col for col in average_change_across_districts.columns}
+    average_change_across_districts_for_1a = average_change_across_districts[average_change_across_districts.Facility_Level == "1a"].rename(new_colnames_1a, axis = 1).drop('Facility_Level', axis = 1)
+    average_change_across_districts_for_1b = average_change_across_districts[average_change_across_districts.Facility_Level == "1b"].rename(new_colnames_1b, axis = 1).drop('Facility_Level', axis = 1)
+    ratio_of_change_across_districts_1b_to_1a = average_change_across_districts_for_1a.merge(average_change_across_districts_for_1b,
+                                                                                             how = "left", on = ['item_code'])
+    for var in list_of_scenario_variables:
+        var_ratio = 'ratio_' + var
+        var_1a = var + '_1a'
+        var_1b = var + '_1b'
+        ratio_of_change_across_districts_1b_to_1a[var_ratio] = (ratio_of_change_across_districts_1b_to_1a[var_1b])/(ratio_of_change_across_districts_1b_to_1a[var_1a])
+    ratio_of_change_across_districts_1b_to_1a = ratio_of_change_across_districts_1b_to_1a.reset_index(drop = True) # assign the result so the reset is not a no-op
+    # TODO check if this ratio should be of the proportions minus 1
+
+    # For districts with no level 1b data in the HHFA, use the ratio of change in level 1b facilities to level 1a
+    # facilities to generate the expected proportional change in availability for level 1b facilities in those districts
+    scenario_availability_df = scenario_availability_df.reset_index(drop = True)
+    cond_districts_with_1b_missing = scenario_availability_df.District.isin(districts_with_no_scenario_data_for_1b_only)
+    cond_1a = scenario_availability_df.Facility_Level == '1a'
+    cond_1b = scenario_availability_df.Facility_Level == '1b'
+    df_1a = scenario_availability_df[cond_districts_with_1b_missing & cond_1a]
+
+    ratio_vars = ['ratio_' + item for item in list_of_scenario_variables] # column names holding the ratio of change in level 1b facilities to level 1a facilities
+
+    item_var = ['item_code']
+
+    # First merge the dataframe with changes at level 1a with the ratio of 1b to 1a
+    df_missing_1b_imputed = df_1a.merge(ratio_of_change_across_districts_1b_to_1a[item_var + ratio_vars],
+                                        on = item_var,
+                                        how = 'left', validate = "m:1")
+
+    # Then multiply the ratio of 1b to 1a with the change at level 1a to get the expected change at level 1b
+    for var in list_of_scenario_variables:
+        df_missing_1b_imputed[var] = df_missing_1b_imputed[var] * df_missing_1b_imputed['ratio_' + var]
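+    # Worked example of the imputation rule above (hypothetical numbers): if an item's average
+    # change factor across districts is 1.10 at level 1a and 1.21 at level 1b, the 1b:1a ratio
+    # is 1.21 / 1.10 = 1.10; a district with a level 1a change of 1.05 for that item is then
+    # assigned 1.05 * 1.10 = 1.155 as its imputed level 1b change.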
+    # Update columns so the dataframe in fact refers to level 1b facilities
+    df_missing_1b_imputed.Facility_Level = '1b' # Update facility level to 1b
+    # Replace Facility_IDs
+    df_missing_1b_imputed = df_missing_1b_imputed.drop('Facility_ID', axis = 1).merge(mfl[['District', 'Facility_Level', 'Facility_ID']],
+                                                                                      on =['District', 'Facility_Level'],
+                                                                                      how = 'left')
+    # Append the new imputed level 1b dataframe to the original dataframe
+    df_without_districts_with_no_1b_facilities = scenario_availability_df[~(cond_districts_with_1b_missing & cond_1b)]
+    scenario_availability_df = pd.concat([df_without_districts_with_no_1b_facilities, df_missing_1b_imputed], ignore_index = True)
+
+    # 2. Merge TLO model availability data with scenario data using crosswalk
+    #*************************************************************************
+    # 2.1 Merge the two datasets
+    #------------------------------------------------------
+    id_variables = ['item_code','Facility_ID']
+
+    full_scenario_df = tlo_availability_df.merge(scenario_availability_df[id_variables + list_of_scenario_variables],
+                                                 how='left', on=['Facility_ID', 'item_code'], indicator = True)
+    full_scenario_df = full_scenario_df.rename({'_merge': 'merge_scenario'}, axis=1)
+    full_scenario_df = full_scenario_df.drop_duplicates(['Facility_ID', 'item_code', 'month'])
+
+    # Check that level 1b values have now been imputed
+    #full_scenario_df[full_scenario_df.District == 'Balaka'].groupby(['District', 'Facility_Level'])['change_proportion_scenario1'].mean()
+
+    # 2.2 Further imputation
+    #------------------------------------------------------
+    # 2.2.1 For all levels other than 1a and 1b, there will be no change in consumable availability
+    #------------------------------------------------------------------------------------------------------------
+    fac_levels_not_relevant_to_regression = full_scenario_df.Facility_Level.isin(['0', '2', '3', '4'])
+
+    for var in list_of_scenario_variables:
+        full_scenario_df.loc[fac_levels_not_relevant_to_regression, var] = 1
+
+    # 2.3 Final checks
+    #------------------------------------------------------
+    # 2.3.1 Check that the merged dataframe has the same number of unique items, facility IDs, and total
+    # number of rows as the original small availability resource file
+    #---------------------------------------------------------------------------------------------------------
+    assert(full_scenario_df.item_code.nunique() == tlo_availability_df.item_code.nunique()) # the number of items in the new dataframe is the same as in the original availability dataframe
+    assert(full_scenario_df.Facility_ID.nunique() == tlo_availability_df.Facility_ID.nunique()) # the number of Facility IDs in the new dataframe is the same as in the original availability dataframe
+    assert(len(full_scenario_df) == len(tlo_availability_df)) # the number of rows in the new dataframe is the same as in the original availability dataframe
+
+    # 2.3.2 Construct dataset that conforms to the principles expected by the simulation: i.e. that there is an entry
+    # for every facility_id and for every month for every item_code.
+    #-----------------------------------------------------------------------------------------------------------------------
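+    # Sizing note (toy example, hypothetical IDs): the dataframe built below has exactly
+    # len(fac_ids) * 12 * len(item_codes) rows, e.g.
+    #   >>> pd.MultiIndex.from_product([[1, 2], range(1, 3), [101]],
+    #   ...                            names=['Facility_ID', 'month', 'item_code']).shape
+    #   (4,)
+    # i.e. 2 facilities x 2 months x 1 item = 4 rows before any data are inserted.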
+    # Generate the dataframe that has the desired size and shape
+    fac_ids = set(mfl.loc[mfl.Facility_Level != '5'].Facility_ID)
+    item_codes = set(tlo_availability_df.item_code.unique())
+    months = range(1, 13)
+    all_availability_columns = ['available_prop'] + list_of_scenario_variables
+
+    # Create a MultiIndex from the product of fac_ids, months, and item_codes
+    index = pd.MultiIndex.from_product([fac_ids, months, item_codes], names=['Facility_ID', 'month', 'item_code'])
+
+    # Initialize a DataFrame with the MultiIndex and columns, filled with NaN
+    full_set = pd.DataFrame(index=index, columns=all_availability_columns)
+    full_set = full_set.astype(float) # Ensure all columns are float type and filled with NaN
+
+    # Insert the data, where it is available.
+    full_set = full_set.combine_first(full_scenario_df.set_index(['Facility_ID', 'month', 'item_code'])[all_availability_columns])
+
+    # Fill in the blanks with rules for interpolation.
+    facilities_by_level = defaultdict(set)
+    for ix, row in mfl.iterrows():
+        facilities_by_level[row['Facility_Level']].add(row['Facility_ID'])
+
+    items_by_category = defaultdict(set)
+    for ix, row in program_item_mapping.iterrows():
+        items_by_category[row['item_category']].add(row['item_code'])
+
+    def get_other_facilities_of_same_level(_fac_id):
+        """Return a set of facility_id for other facilities that are of the same level as that provided."""
+        for v in facilities_by_level.values():
+            if _fac_id in v:
+                return v - {_fac_id}
+
+    def get_other_items_of_same_category(_item_code):
+        """Return a set of item_codes for other items that are in the same category/program as that provided."""
+        for v in items_by_category.values():
+            if _item_code in v:
+                return v - {_item_code}
+
+    def interpolate_missing_with_mean(_ser):
+        """Return a series in which any values that are null are replaced with the mean of the non-missing."""
+        if pd.isnull(_ser).all():
+            raise ValueError("Series contains no non-missing values to interpolate from.")
+        return _ser.fillna(_ser.mean())
+
+    # Create a new dataset that includes the interpolations. (The operation is not done "in place", because the logic
+    # depends on which results are missing before the interpolations in other facilities.)
+    full_set_interpolated = full_set * np.nan
+    full_set_interpolated.available_prop = full_set.available_prop
+
+    for fac in fac_ids:
+        for item in item_codes:
+            for col in list_of_scenario_variables:
+                print(f"Now doing: fac={fac}, item={item}, column={col}")
+
+                # Get records of the availability of this item in this facility.
+                _monthly_records = full_set.loc[(fac, slice(None), item), col].copy()
-                if recorded_at_other_facilities_of_same_level:
-                    # If it recorded at other facilities of same level, find the average availability of the item at other
-                    # facilities of the same level.
-                    print("Data for facility ", fac, " extrapolated from other facilities within level - ", facilities)
+                if pd.notnull(_monthly_records).any():
+                    # If there is at least one record of this item at this facility, then interpolate the missing months
+                    # from the months for which there are data on this item in this facility. (If none are missing, this has no effect.)
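+                    # A minimal sketch of interpolate_missing_with_mean on a toy series
+                    # (hypothetical values, shown only to document the rule):
+                    #   pd.Series([0.5, np.nan, 0.7])  ->  [0.5, 0.6, 0.7]
+                    # i.e. the missing month is filled with mean(0.5, 0.7) = 0.6.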
+                    _monthly_records = interpolate_missing_with_mean(_monthly_records)
+
+                else:
+                    # If there is no record of this item at this facility, check to see if it is available at other
+                    # facilities of the same level; or, if there is no record of the item at other facilities at this
+                    # level, check to see if other items of this category are available at this facility level
                     facilities = list(get_other_facilities_of_same_level(fac))
-                    _monthly_records = interpolate_missing_with_mean(
-                        full_set.loc[(facilities, slice(None), item), col].groupby(level=1).mean()
-                    )
-                elif category_recorded_at_other_facilities_of_same_level and valid_items:
-                    # If it recorded at other facilities of same level, find the average availability of the item at other
-                    # facilities of the same level.
-                    print("Data for item ", item, " extrapolated from other items within category - ", valid_items)
+                    other_items = get_other_items_of_same_category(item)
+                    items = list(other_items) if other_items else other_items
-                    _monthly_records = interpolate_missing_with_mean(
-                        full_set.loc[(fac, slice(None), valid_items), col].groupby(level=1).mean()
-                    )
+                    recorded_at_other_facilities_of_same_level = pd.notnull(
+                        full_set.loc[(facilities, slice(None), item), col]
+                    ).any()
+
+                    if not items:
+                        category_recorded_at_other_facilities_of_same_level = False
+                    else:
+                        # Filter only items that exist in the MultiIndex at this facility
+                        valid_items = [
+                            itm for itm in items
+                            if any((fac, m, itm) in full_set.index for m in months)
+                        ]
+
+                        category_recorded_at_other_facilities_of_same_level = pd.notnull(
+                            full_set.loc[(fac, slice(None), valid_items), col]
+                        ).any()
+
+                    if recorded_at_other_facilities_of_same_level:
+                        # If the item is recorded at other facilities of the same level, use the average availability
+                        # of the item at those other facilities.
+                        print("Data for facility ", fac, " extrapolated from other facilities within level - ", facilities)
+                        facilities = list(get_other_facilities_of_same_level(fac))
+                        _monthly_records = interpolate_missing_with_mean(
+                            full_set.loc[(facilities, slice(None), item), col].groupby(level=1).mean()
+                        )
+
+                    elif category_recorded_at_other_facilities_of_same_level and valid_items:
+                        # Otherwise, if other items of the same category are recorded at this facility, use the average
+                        # availability of those items at this facility.
+                        print("Data for item ", item, " extrapolated from other items within category - ", valid_items)
+
+                        _monthly_records = interpolate_missing_with_mean(
+                            full_set.loc[(fac, slice(None), valid_items), col].groupby(level=1).mean()
+                        )
+
+                    else:
+                        # If neither rule applies, then assume that there is no change
+                        print("No interpolation worked")
+                        _monthly_records = _monthly_records.fillna(1.0)
+
+                # Insert values (including corrections) into the resulting dataset.
+                full_set_interpolated.loc[(fac, slice(None), item), col] = _monthly_records.values
+                # temporary code
+                assert full_set_interpolated.loc[(fac, slice(None), item), col].mean() >= 0
+
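+    # Recap of the fallback cascade above (documentation only, no new logic):
+    #   1. months observed at this facility              -> fill gaps with the facility's own mean
+    #   2. same item at other same-level facilities      -> use the across-facility monthly mean
+    #   3. other items of this category at this facility -> use the across-item monthly mean
+    #   4. otherwise                                     -> assume no change (factor of 1.0)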
+    # 3. Generate regression-based scenario data on consumable availability
+    #*************************************************************************
+    # Create new consumable availability estimates for TLO model consumables using
+    # estimates of proportional change from the regression analysis based on HHFA data
+    #------------------------------------------------------
+    prefix = 'change_proportion_'
+    list_of_scenario_suffixes = [s.replace(prefix, '') for s in list_of_scenario_variables]
+
+    for scenario in list_of_scenario_suffixes:
+        full_set_interpolated['available_prop_' + scenario] = full_set_interpolated['available_prop'] * full_set_interpolated['change_proportion_' + scenario]
+        availability_greater_than_1 = full_set_interpolated['available_prop_' + scenario] > 1
+        full_set_interpolated.loc[availability_greater_than_1, 'available_prop_' + scenario] = 1
+
+        assert(sum(full_set_interpolated['available_prop_' + scenario].isna()) ==
+               sum(full_set_interpolated['change_proportion_' + scenario].isna())) # make sure that there is an entry for every row in which there was previously data
+
+    # 4. Generate best performing facility-based scenario data on consumable availability
+    #***************************************************************************************
+    df = full_set_interpolated.reset_index().copy()
+
+    # Update the availability to represent the 75th, 90th and 99th percentiles by consumable
+    facility_levels = ['1a', '1b', '2']
+    target_percentiles = [75, 90, 99]
+
+    best_performing_facilities = {}
+    # Populate the dictionary
+    for level in facility_levels:
+        # Create an empty dictionary for the current level
+        best_performing_facilities[level] = {}
-                else:
-                    # If it is not recorded at other facilities of same level, then assume that there is no change
-                    print("No interpolation worked")
-                    _monthly_records = _monthly_records.fillna(1.0)
-
-                # Insert values (including corrections) into the resulting dataset.
-                full_set_interpolated.loc[(fac, slice(None), item), col] = _monthly_records.values
-                # temporary code
-                assert full_set_interpolated.loc[(fac, slice(None), item), col].mean() >= 0
-
-# 3. Generate regression-based scenario data on consumable availablity
-#*************************************************************************
-# Create new consumable availability estimates for TLO model consumables using
-# estimates of proportional change from the regression analysis based on HHFA data
-#------------------------------------------------------
-prefix = 'change_proportion_'
-list_of_scenario_suffixes = [s.replace(prefix, '') for s in list_of_scenario_variables]
-
-for scenario in list_of_scenario_suffixes:
-    full_set_interpolated['available_prop_' + scenario] = full_set_interpolated['available_prop'] * full_set_interpolated['change_proportion_' + scenario]
-    availability_greater_than_1 = full_set_interpolated['available_prop_' + scenario] > 1
-    full_set_interpolated.loc[availability_greater_than_1, 'available_prop_' + scenario] = 1
-
-    assert(sum(full_set_interpolated['available_prop_' + scenario].isna()) ==
-           sum(full_set_interpolated['change_proportion_' + scenario].isna())) # make sure that there is an entry for every row in which there was previously data
-
-# 4. 
Generate best performing facility-based scenario data on consumable availability -#*************************************************************************************** -df = full_set_interpolated.reset_index().copy() - -# Try updating the avaiability to represent the 75th percentile by consumable -facility_levels = ['1a', '1b', '2'] -target_percentiles = [75, 90, 99] - -best_performing_facilities = {} -# Populate the dictionary -for level in facility_levels: - # Create an empty dictionary for the current level - best_performing_facilities[level] = {} - - for item in item_codes: - best_performing_facilities[level][item] = {} - # Get the mean availability by Facility for the current level - mean_consumable_availability = pd.DataFrame(df[(df.Facility_ID.isin(facilities_by_level[level])) & (df.item_code == item)] - .groupby('Facility_ID')['available_prop'].mean()).reset_index() - - # Calculate the percentile rank of each row for 'available_prop' - mean_consumable_availability['percentile_rank'] = mean_consumable_availability['available_prop'].rank(pct=True) * 100 - - # Find the row which is closest to the nth percentile rank for each target percentile - for target_perc in target_percentiles: - # Calculate the difference to target percentile - mean_consumable_availability['diff_to_target_' + str(target_perc)] = np.abs( - mean_consumable_availability['percentile_rank'] - target_perc) - - # Find the row with the minimum difference to the target percentile - closest_row = mean_consumable_availability.loc[ - mean_consumable_availability['diff_to_target_' + str(target_perc)].idxmin()] - - # Store the Facility_ID of the closest row in the dictionary for the current level - best_performing_facilities[level][item][str(target_perc) + 'th percentile'] = closest_row['Facility_ID'] - -print("Reference facilities at each level for each item: ", best_performing_facilities) - -# Obtain the updated availability estimates for level 1a for scenarios 6-8 -updated_availability_1a = df[['item_code', 'month']].drop_duplicates() -updated_availability_1b = df[['item_code', 'month']].drop_duplicates() -updated_availability_2 = df[['item_code', 'month']].drop_duplicates() -temporary_df = pd.DataFrame([]) -availability_dataframes = [updated_availability_1a, updated_availability_1b, updated_availability_2] - -i = 6 # start scenario counter -j = 0 # start level counter -for level in facility_levels: - for target_perc in target_percentiles: for item in item_codes: + best_performing_facilities[level][item] = {} + # Get the mean availability by Facility for the current level + mean_consumable_availability = pd.DataFrame(df[(df.Facility_ID.isin(facilities_by_level[level])) & (df.item_code == item)] + .groupby('Facility_ID')['available_prop'].mean()).reset_index() + + # Calculate the percentile rank of each row for 'available_prop' + mean_consumable_availability['percentile_rank'] = mean_consumable_availability['available_prop'].rank(pct=True) * 100 + + # Find the row which is closest to the nth percentile rank for each target percentile + for target_perc in target_percentiles: + # Calculate the difference to target percentile + mean_consumable_availability['diff_to_target_' + str(target_perc)] = np.abs( + mean_consumable_availability['percentile_rank'] - target_perc) + + # Find the row with the minimum difference to the target percentile + closest_row = mean_consumable_availability.loc[ + mean_consumable_availability['diff_to_target_' + str(target_perc)].idxmin()] + + # Store the Facility_ID of the closest row in the 
dictionary for the current level + best_performing_facilities[level][item][str(target_perc) + 'th percentile'] = closest_row['Facility_ID'] + + print("Reference facilities at each level for each item: ", best_performing_facilities) + + # Obtain the updated availability estimates for level 1a for scenarios 6-8 + updated_availability_1a = df[['item_code', 'month']].drop_duplicates() + updated_availability_1b = df[['item_code', 'month']].drop_duplicates() + updated_availability_2 = df[['item_code', 'month']].drop_duplicates() + temporary_df = pd.DataFrame([]) + availability_dataframes = [updated_availability_1a, updated_availability_1b, updated_availability_2] + + i = 6 # start scenario counter + j = 0 # start level counter + for level in facility_levels: + for target_perc in target_percentiles: + for item in item_codes: + + print("Running level ", level, "; Running scenario ", str(i), "; Running item ", item) + reference_facility = df['Facility_ID'] == best_performing_facilities[level][item][str(target_perc) + 'th percentile'] + current_item = df['item_code'] == item + availability_at_reference_facility = df[reference_facility & current_item][['item_code', 'month', 'available_prop']] - print("Running level ", level, "; Running scenario ", str(i), "; Running item ", item) - reference_facility = df['Facility_ID'] == best_performing_facilities[level][item][str(target_perc) + 'th percentile'] - current_item = df['item_code'] == item - availability_at_reference_facility = df[reference_facility & current_item][['item_code', 'month', 'available_prop']] - - if temporary_df.empty: - temporary_df = availability_at_reference_facility - else: - temporary_df = pd.concat([temporary_df,availability_at_reference_facility], ignore_index = True) - - column_name = 'available_prop_scenario' + str(i) - temporary_df = temporary_df.rename(columns = {'available_prop': column_name }) - availability_dataframes[j] = availability_dataframes[j].merge(temporary_df, on = ['item_code', 'month'], how = 'left', validate = '1:1') - temporary_df = pd.DataFrame([]) - i = i + 1 - i = 6 # restart scenario counter - j = j + 1 # move to the next level - -# Merge the above scenario data to the full availability scenario dataframe -# 75, 90 and 99th percentile availability data for level 1a -df_new_1a = df[df['Facility_ID'].isin(facilities_by_level['1a'])].merge(availability_dataframes[0],on = ['item_code', 'month'], - how = 'left', - validate = "m:1") -# 75, 90 and 99th percentile availability data for level 1b -df_new_1b = df[df['Facility_ID'].isin(facilities_by_level['1b'])].merge(availability_dataframes[1],on = ['item_code', 'month'], - how = 'left', - validate = "m:1") - -# For scenarios 6-8, replace the new availability probabilities by the max(original availability, availability at target percentile) -for scen in [6,7,8]: - df_new_1a['available_prop_scenario' + str(scen)] = df_new_1a.apply( - lambda row: max(row['available_prop_scenario' + str(scen) ], row['available_prop']), axis=1) - df_new_1b['available_prop_scenario' + str(scen)] = df_new_1b.apply( - lambda row: max(row['available_prop_scenario' + str(scen) ], row['available_prop']), axis=1) - -# 75, 90 and 99th percentile availability data for level 2 -df_new_2 = df[df['Facility_ID'].isin(facilities_by_level['2'])].merge(availability_dataframes[2],on = ['item_code', 'month'], - how = 'left', - validate = "m:1") - -# Generate scenarios 6-8 -#------------------------ -# scenario 6: only levels 1a and 1b changed to availability at 75th percentile for the corresponding level 
-# scenario 7: only levels 1a and 1b changed to availability at 90th percentile for the corresponding level -# scenario 8: only levels 1a and 1b changed to availability at 99th percentile for the corresponding level -# Scenario 6-8 availability data for other levels -df_new_otherlevels = df[~df['Facility_ID'].isin(facilities_by_level['1a']|facilities_by_level['1b'])] -new_scenario_columns = ['available_prop_scenario6', 'available_prop_scenario7', 'available_prop_scenario8'] -for col in new_scenario_columns: - df_new_otherlevels[col] = df_new_otherlevels['available_prop'] -# Append the above dataframes -df_facility_level_benchmark_scenarios = pd.concat([df_new_1a, df_new_1b, df_new_otherlevels], ignore_index = True) - - -# Generate scenario 9 -#------------------------ -# scenario 9: levels 1a, 1b and 2 changed to availability at 99th percentile for the corresponding level -df_new_otherlevels = df_facility_level_benchmark_scenarios[~df_facility_level_benchmark_scenarios['Facility_ID'].isin(facilities_by_level['1a']|facilities_by_level['1b']|facilities_by_level['2'])].reset_index(drop = True) -df_new_1a_scenario9 = df_facility_level_benchmark_scenarios[df_facility_level_benchmark_scenarios['Facility_ID'].isin(facilities_by_level['1a'])].reset_index(drop = True) -df_new_1b_scenario9 = df_facility_level_benchmark_scenarios[df_facility_level_benchmark_scenarios['Facility_ID'].isin(facilities_by_level['1b'])].reset_index(drop = True) -df_new_2_scenario9 = df_new_2[df_new_2['Facility_ID'].isin(facilities_by_level['2'])].reset_index(drop = True) -new_scenario_columns = ['available_prop_scenario9'] -for col in new_scenario_columns: - df_new_otherlevels[col] = df_new_otherlevels['available_prop'] - df_new_1a_scenario9[col] = df_new_1a_scenario9['available_prop_scenario8'] - df_new_1b_scenario9[col] = df_new_1b_scenario9['available_prop_scenario8'] - df_new_2_scenario9[col] = df_new_2_scenario9.apply(lambda row: max(row['available_prop_scenario8'], row['available_prop']), axis=1) - -# Append the above dataframes -df_new_scenarios9 = pd.concat([df_new_1a_scenario9, df_new_1b_scenario9, df_new_2_scenario9, df_new_otherlevels], ignore_index = True) - -# 6. 
Generate scenarios based on the performance of vertical programs -#*************************************************************************************** -df_program_benchmark_scenarios = tlo_availability_df.copy() -# Define common conditions needed for the next set of scenarios -cond_levels1a1b = df_program_benchmark_scenarios.Facility_Level.isin(['1a', '1b']) -cond_hiv = df_program_benchmark_scenarios.item_category == 'hiv' -cond_epi = df_program_benchmark_scenarios.item_category == 'epi' -cond_cancer = df_program_benchmark_scenarios.item_category == 'cancer' -cond_malaria = df_program_benchmark_scenarios.item_category == 'malaria' -cond_tb = df_program_benchmark_scenarios.item_category == 'tb' -cond_non_epi_hiv_cancer = ~(cond_epi | cond_hiv | cond_cancer) - -# Calculate average availabilities -avg_hiv = df_program_benchmark_scenarios[cond_hiv & cond_levels1a1b].groupby('Facility_Level')['available_prop'].mean() -avg_epi = df_program_benchmark_scenarios[cond_epi & cond_levels1a1b].groupby('Facility_Level')['available_prop'].mean() -avg_other_by_level = df_program_benchmark_scenarios[cond_non_epi_hiv_cancer & cond_levels1a1b].groupby('Facility_Level')['available_prop'].mean() -avg_other_by_facility = df_program_benchmark_scenarios[~(cond_epi | cond_hiv | cond_cancer | cond_malaria | cond_tb)].groupby('Facility_ID')['available_prop'].mean() - -# Initialize scenario columns with baseline values -for i in range(10, 16): - df_program_benchmark_scenarios[f'available_prop_scenario{i}'] = df_program_benchmark_scenarios['available_prop'] - -# Define update logic using a loop -scenario_logic = { - 10: lambda row: max(row['available_prop'], avg_hiv[row['Facility_Level']]) - if row['Facility_Level'] in ['1a', '1b'] and row['item_category'] not in ['hiv', 'epi'] else row['available_prop'], - 11: lambda row: max(row['available_prop'], avg_epi[row['Facility_Level']]) - if row['Facility_Level'] in ['1a', '1b'] and row['item_category'] not in ['hiv', 'epi'] else row['available_prop'], - 12: lambda row: min(row['available_prop'], avg_other_by_level[row['Facility_Level']]) - if row['Facility_Level'] in ['1a', '1b'] and row['item_category'] == 'hiv' else row['available_prop'], - 13: lambda row: min(row['available_prop'], avg_other_by_facility[row['Facility_ID']]) - if row['item_category'] == 'hiv' else row['available_prop'], - 14: lambda row: min(row['available_prop'], avg_other_by_facility[row['Facility_ID']] * 1.25) - if row['item_category'] == 'hiv' else row['available_prop'], - 15: lambda row: min(row['available_prop'], avg_other_by_facility[row['Facility_ID']] * 0.75) - if row['item_category'] == 'hiv' else row['available_prop'], -} - -# Apply logic to each scenario -for scen, func in scenario_logic.items(): - df_program_benchmark_scenarios[f'available_prop_scenario{scen}'] = df_program_benchmark_scenarios.apply(func, axis=1) - -# Add scenarios 6 to 12 to the original dataframe -#------------------------------------------------------ -# Combine all scenario suffixes into a single list -scenario_suffixes = list_of_scenario_suffixes + [f'scenario{i}' for i in range(6, 16)] -scenario_vars = [f'available_prop_{s}' for s in scenario_suffixes] -old_vars = ['Facility_ID', 'month', 'item_code'] - -# Prepare the full base dataframe from scenarios 6–8 -full_df_with_scenario = df_facility_level_benchmark_scenarios[ - old_vars + ['available_prop'] + [f'available_prop_scenario{i}' for i in range(1, 9)] -].reset_index(drop=True) - -# Add scenario 9 -full_df_with_scenario = full_df_with_scenario.merge( - 
df_new_scenarios9[old_vars + ['available_prop_scenario9']], - on=old_vars, how='left', validate='1:1' -) - -# Add scenarios 10–15 from the program benchmark dataframe -full_df_with_scenario = full_df_with_scenario.merge( - df_program_benchmark_scenarios[old_vars + [f'available_prop_scenario{i}' for i in range(10, 16)]], - on=old_vars, how='left', validate='1:1' -) - -# --- Check that the scenarios 6-11 always have higher prob of availability than baseline --- # -for scen in range(6,12): - assert sum(full_df_with_scenario['available_prop_scenario' + str(scen)] < full_df_with_scenario['available_prop']) == 0 - assert sum(full_df_with_scenario['available_prop_scenario' + str(scen)] >= full_df_with_scenario['available_prop']) == len(full_df_with_scenario) -# Check that scenarios 12-15 always have equal to or lower prob of availability than baselin --- # -for scen in range(12,16): - assert sum(full_df_with_scenario['available_prop_scenario' + str(scen)] > full_df_with_scenario['available_prop']) == 0 - assert sum(full_df_with_scenario['available_prop_scenario' + str(scen)] <= full_df_with_scenario['available_prop']) == len(full_df_with_scenario) - -# --- Check that the exported file has the properties required of it by the model code. --- # -check_format_of_consumables_file(df=full_df_with_scenario, fac_ids=fac_ids) - -# Save updated consumable availability resource file with scenario data -full_df_with_scenario.to_csv( - path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv", - index=False -) -# TODO: Create a column providing the source of scenario data - -# 8. Plot new availability estimates by scenario -#********************************************************************************************* -full_df_with_scenario = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv") - -# Create the directory if it doesn't exist -figurespath = outputfilepath / 'consumable_scenario_analysis' -if not os.path.exists(figurespath): - os.makedirs(figurespath) - -# Prepare the availability dataframe for descriptive plots -df_for_plots = full_df_with_scenario.merge(mfl[['Facility_ID', 'Facility_Level']], on = 'Facility_ID', how = 'left', validate = "m:1") -df_for_plots = df_for_plots.merge(program_item_mapping, on = 'item_code', how = 'left', validate = "m:1") -scenario_list = [1,2,3,6,7,8,10,11,12,13,14,15] -chosen_availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in - scenario_list] -scenario_names_dict = {'available_prop': 'Actual', 'available_prop_scenario1': 'Non-therapeutic \n consumables', 'available_prop_scenario2': 'Vital medicines', - 'available_prop_scenario3': 'Pharmacist-\n managed', 'available_prop_scenario4': 'Level 1b', 'available_prop_scenario5': 'CHAM', - 'available_prop_scenario6': '75th percentile\n facility', 'available_prop_scenario7': '90th percentile \n facility', 'available_prop_scenario8': 'Best \n facility', - 'available_prop_scenario9': 'Best facility \n (including DHO)','available_prop_scenario10': 'HIV supply \n chain', 'available_prop_scenario11': 'EPI supply \n chain', - 'available_prop_scenario12': 'HIV moved to \n Govt supply chain \n (Avg by Level)', 'available_prop_scenario13': 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID)', - 'available_prop_scenario14': 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID times 1.25)', - 'available_prop_scenario15': 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID times 0.75)'} -# recreate the chosen columns list based 
on the mapping above -chosen_availability_columns = [scenario_names_dict[col] for col in chosen_availability_columns] -df_for_plots = df_for_plots.rename(columns = scenario_names_dict) - -# Generate a bar plot of average availability under each scenario by item_category and Facility_Level -def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): - df_for_bar_plot = _df.groupby([_x_axis_var])[chosen_availability_columns].mean() - df_for_bar_plot = df_for_bar_plot.reset_index().melt(id_vars=[_x_axis_var], value_vars=chosen_availability_columns, - var_name='Scenario', value_name='Value') - plot = (ggplot(df_for_bar_plot.reset_index(), aes(x=_x_axis_var, y='Value', fill = 'Scenario')) - + geom_bar(stat='identity', position='dodge') - + ylim(0, 1) - + labs(title = "Probability of availability across scenarios", - x=_x_axis_var, - y='Probability of availability') - + theme(axis_text_x=element_text(angle=45, hjust=1)) - ) - - plot.save(filename= figurespath / _filename, dpi=300, width=10, height=8, units='in') -generate_barplot_of_scenarios(_df = df_for_plots, _x_axis_var = 'item_category', _filename = 'availability_by_category.png') -generate_barplot_of_scenarios(_df = df_for_plots, _x_axis_var = 'Facility_Level', _filename = 'availability_by_level.png') - -# Create heatmaps by Facility_Level of average availability by item_category across chosen scenarios -for level in fac_levels: - # Generate a heatmap + if temporary_df.empty: + temporary_df = availability_at_reference_facility + else: + temporary_df = pd.concat([temporary_df,availability_at_reference_facility], ignore_index = True) + + column_name = 'available_prop_scenario' + str(i) + temporary_df = temporary_df.rename(columns = {'available_prop': column_name }) + availability_dataframes[j] = availability_dataframes[j].merge(temporary_df, on = ['item_code', 'month'], how = 'left', validate = '1:1') + temporary_df = pd.DataFrame([]) + i = i + 1 + i = 6 # restart scenario counter + j = j + 1 # move to the next level + + # Merge the above scenario data to the full availability scenario dataframe + # 75, 90 and 99th percentile availability data for level 1a + df_new_1a = df[df['Facility_ID'].isin(facilities_by_level['1a'])].merge(availability_dataframes[0],on = ['item_code', 'month'], + how = 'left', + validate = "m:1") + # 75, 90 and 99th percentile availability data for level 1b + df_new_1b = df[df['Facility_ID'].isin(facilities_by_level['1b'])].merge(availability_dataframes[1],on = ['item_code', 'month'], + how = 'left', + validate = "m:1") + + # For scenarios 6-8, replace the new availability probabilities by the max(original availability, availability at target percentile) + for scen in [6,7,8]: + df_new_1a['available_prop_scenario' + str(scen)] = df_new_1a.apply( + lambda row: max(row['available_prop_scenario' + str(scen) ], row['available_prop']), axis=1) + df_new_1b['available_prop_scenario' + str(scen)] = df_new_1b.apply( + lambda row: max(row['available_prop_scenario' + str(scen) ], row['available_prop']), axis=1) + + # 75, 90 and 99th percentile availability data for level 2 + df_new_2 = df[df['Facility_ID'].isin(facilities_by_level['2'])].merge(availability_dataframes[2],on = ['item_code', 'month'], + how = 'left', + validate = "m:1") + + # Generate scenarios 6-8 + #------------------------ + # scenario 6: only levels 1a and 1b changed to availability at 75th percentile for the corresponding level + # scenario 7: only levels 1a and 1b changed to availability at 90th percentile for the corresponding level + # scenario 8: only 
levels 1a and 1b changed to availability at 99th percentile for the corresponding level + # Scenario 6-8 availability data for other levels + df_new_otherlevels = df[~df['Facility_ID'].isin(facilities_by_level['1a']|facilities_by_level['1b'])] + new_scenario_columns = ['available_prop_scenario6', 'available_prop_scenario7', 'available_prop_scenario8'] + for col in new_scenario_columns: + df_new_otherlevels[col] = df_new_otherlevels['available_prop'] + # Append the above dataframes + df_facility_level_benchmark_scenarios = pd.concat([df_new_1a, df_new_1b, df_new_otherlevels], ignore_index = True) + + + # Generate scenario 9 + #------------------------ + # scenario 9: levels 1a, 1b and 2 changed to availability at 99th percentile for the corresponding level + df_new_otherlevels = df_facility_level_benchmark_scenarios[~df_facility_level_benchmark_scenarios['Facility_ID'].isin(facilities_by_level['1a']|facilities_by_level['1b']|facilities_by_level['2'])].reset_index(drop = True) + df_new_1a_scenario9 = df_facility_level_benchmark_scenarios[df_facility_level_benchmark_scenarios['Facility_ID'].isin(facilities_by_level['1a'])].reset_index(drop = True) + df_new_1b_scenario9 = df_facility_level_benchmark_scenarios[df_facility_level_benchmark_scenarios['Facility_ID'].isin(facilities_by_level['1b'])].reset_index(drop = True) + df_new_2_scenario9 = df_new_2[df_new_2['Facility_ID'].isin(facilities_by_level['2'])].reset_index(drop = True) + new_scenario_columns = ['available_prop_scenario9'] + for col in new_scenario_columns: + df_new_otherlevels[col] = df_new_otherlevels['available_prop'] + df_new_1a_scenario9[col] = df_new_1a_scenario9['available_prop_scenario8'] + df_new_1b_scenario9[col] = df_new_1b_scenario9['available_prop_scenario8'] + df_new_2_scenario9[col] = df_new_2_scenario9.apply(lambda row: max(row['available_prop_scenario8'], row['available_prop']), axis=1) + + # Append the above dataframes + df_new_scenarios9 = pd.concat([df_new_1a_scenario9, df_new_1b_scenario9, df_new_2_scenario9, df_new_otherlevels], ignore_index = True) + + # 6. 
Generate scenarios based on the performance of vertical programs
+    #***************************************************************************************
+    df_program_benchmark_scenarios = tlo_availability_df.copy()
+    # Define common conditions needed for the next set of scenarios
+    cond_levels1a1b = df_program_benchmark_scenarios.Facility_Level.isin(['1a', '1b'])
+    cond_hiv = df_program_benchmark_scenarios.item_category == 'hiv'
+    cond_epi = df_program_benchmark_scenarios.item_category == 'epi'
+    cond_cancer = df_program_benchmark_scenarios.item_category == 'cancer'
+    cond_malaria = df_program_benchmark_scenarios.item_category == 'malaria'
+    cond_tb = df_program_benchmark_scenarios.item_category == 'tb'
+    cond_non_epi_hiv_cancer = ~(cond_epi | cond_hiv | cond_cancer)
+
+    # Calculate average availabilities
+    avg_hiv = df_program_benchmark_scenarios[cond_hiv & cond_levels1a1b].groupby('Facility_Level')['available_prop'].mean()
+    avg_epi = df_program_benchmark_scenarios[cond_epi & cond_levels1a1b].groupby('Facility_Level')['available_prop'].mean()
+    avg_other_by_level = df_program_benchmark_scenarios[cond_non_epi_hiv_cancer & cond_levels1a1b].groupby('Facility_Level')['available_prop'].mean()
+    avg_other_by_facility = df_program_benchmark_scenarios[~(cond_epi | cond_hiv | cond_cancer | cond_malaria | cond_tb)].groupby('Facility_ID')['available_prop'].mean()
+
+    # Initialize scenario columns with baseline values
+    for i in range(10, 16):
+        df_program_benchmark_scenarios[f'available_prop_scenario{i}'] = df_program_benchmark_scenarios['available_prop']
+
+    # Define update logic using a loop
+    scenario_logic = {
+        10: lambda row: max(row['available_prop'], avg_hiv[row['Facility_Level']])
+            if row['Facility_Level'] in ['1a', '1b'] and row['item_category'] not in ['hiv', 'epi'] else row['available_prop'],
+        11: lambda row: max(row['available_prop'], avg_epi[row['Facility_Level']])
+            if row['Facility_Level'] in ['1a', '1b'] and row['item_category'] not in ['hiv', 'epi'] else row['available_prop'],
+        12: lambda row: min(row['available_prop'], avg_other_by_level[row['Facility_Level']])
+            if row['Facility_Level'] in ['1a', '1b'] and row['item_category'] == 'hiv' else row['available_prop'],
+        13: lambda row: min(row['available_prop'], avg_other_by_facility[row['Facility_ID']])
+            if row['item_category'] == 'hiv' else row['available_prop'],
+        14: lambda row: min(row['available_prop'], avg_other_by_facility[row['Facility_ID']] * 1.25)
+            if row['item_category'] == 'hiv' else row['available_prop'],
+        15: lambda row: min(row['available_prop'], avg_other_by_facility[row['Facility_ID']] * 0.75)
+            if row['item_category'] == 'hiv' else row['available_prop'],
+    }
+
+    # Apply logic to each scenario
+    for scen, func in scenario_logic.items():
+        df_program_benchmark_scenarios[f'available_prop_scenario{scen}'] = df_program_benchmark_scenarios.apply(func, axis=1)
+
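+    # Worked example of the benchmarking rules above (hypothetical numbers): at a level '1a'
+    # facility where avg_hiv['1a'] = 0.85, a malaria item with available_prop = 0.60 becomes
+    # max(0.60, 0.85) = 0.85 under scenario 10, while an HIV item with available_prop = 0.90
+    # becomes min(0.90, avg_other_by_level['1a']) under scenario 12; scenarios 10-11 can only
+    # raise availability and scenarios 12-15 can only lower it.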
+    # Add scenarios 9 to 15 to the dataframe containing scenarios 1 to 8
+    #------------------------------------------------------
+    # Combine all scenario suffixes into a single list
+    scenario_suffixes = list_of_scenario_suffixes + [f'scenario{i}' for i in range(6, 16)]
+    scenario_vars = [f'available_prop_{s}' for s in scenario_suffixes]
+    old_vars = ['Facility_ID', 'month', 'item_code']
+
+    # Prepare the full base dataframe with scenarios 1-8
+    full_df_with_scenario = df_facility_level_benchmark_scenarios[
+        old_vars + ['available_prop'] + [f'available_prop_scenario{i}' for i in range(1, 9)]
+    ].reset_index(drop=True)
+
+    # Add scenario 9
+    full_df_with_scenario = full_df_with_scenario.merge(
+        df_new_scenarios9[old_vars + ['available_prop_scenario9']],
+        on=old_vars, how='left', validate='1:1'
+    )
+
+    # Add scenarios 10–15 from the program benchmark dataframe
+    full_df_with_scenario = full_df_with_scenario.merge(
+        df_program_benchmark_scenarios[old_vars + [f'available_prop_scenario{i}' for i in range(10, 16)]],
+        on=old_vars, how='left', validate='1:1'
+    )
+
+    # --- Check that scenarios 6-11 always have an equal or higher prob of availability than baseline --- #
+    for scen in range(6,12):
+        assert sum(full_df_with_scenario['available_prop_scenario' + str(scen)] < full_df_with_scenario['available_prop']) == 0
+        assert sum(full_df_with_scenario['available_prop_scenario' + str(scen)] >= full_df_with_scenario['available_prop']) == len(full_df_with_scenario)
+    # --- Check that scenarios 12-15 always have an equal or lower prob of availability than baseline --- #
+    for scen in range(12,16):
+        assert sum(full_df_with_scenario['available_prop_scenario' + str(scen)] > full_df_with_scenario['available_prop']) == 0
+        assert sum(full_df_with_scenario['available_prop_scenario' + str(scen)] <= full_df_with_scenario['available_prop']) == len(full_df_with_scenario)
+
+    # --- Check that the exported file has the properties required of it by the model code. --- #
+    check_format_of_consumables_file(df=full_df_with_scenario, fac_ids=fac_ids)
+
+    # Return the updated consumable availability resource file with scenario data
+    return full_df_with_scenario
+
+def generate_descriptive_consumable_availability_plots(tlo_availability_df: pd.DataFrame = None,
+                                                       figurespath: str = None,
+                                                       mfl: pd.DataFrame = None,
+                                                       program_item_mapping: pd.DataFrame = None,
+                                                       chosen_availability_columns: list = None,
+                                                       scenario_names_dict: dict = None,):
+    # Prepare the availability dataframe for descriptive plots
+    df_for_plots = tlo_availability_df.merge(mfl[['Facility_ID', 'Facility_Level']], on = 'Facility_ID', how = 'left', validate = "m:1")
+    df_for_plots = df_for_plots.merge(program_item_mapping, on = 'item_code', how = 'left', validate = "m:1")
+    if chosen_availability_columns is None:
+        # All availability columns
+        chosen_availability_columns = [c for c in df_for_plots.columns if c.startswith("available_prop")]
+
+    # Recreate the chosen columns list based on the scenario-name mapping, if one is provided
+    if scenario_names_dict is not None:
+        chosen_availability_columns = [scenario_names_dict[col] for col in chosen_availability_columns]
+        df_for_plots = df_for_plots.rename(columns = scenario_names_dict)
+
+    '''
+    # Generate a bar plot of average availability under each scenario by item_category and Facility_Level
+    def generate_barplot_of_scenarios(_df, _x_axis_var, _filename):
+        df_for_bar_plot = _df.groupby([_x_axis_var])[chosen_availability_columns].mean()
+        df_for_bar_plot = df_for_bar_plot.reset_index().melt(id_vars=[_x_axis_var], value_vars=chosen_availability_columns,
+                                                             var_name='Scenario', value_name='Value')
+        plot = (ggplot(df_for_bar_plot.reset_index(), aes(x=_x_axis_var, y='Value', fill = 'Scenario'))
+                + geom_bar(stat='identity', position='dodge')
+                + ylim(0, 1)
+                + labs(title = "Probability of availability across scenarios",
+                       x=_x_axis_var,
+                       y='Probability of availability')
+                + theme(axis_text_x=element_text(angle=45, hjust=1))
+                )
+
+        plot.save(filename= figurespath / _filename, dpi=300, width=10, height=8, units='in')
+    generate_barplot_of_scenarios(_df = df_for_plots, _x_axis_var = 'item_category', _filename = 'availability_by_category.png')
+    generate_barplot_of_scenarios(_df = df_for_plots, _x_axis_var = 
+
+    # --- Check that the exported file has the properties required of it by the model code. --- #
+    check_format_of_consumables_file(df=full_df_with_scenario, fac_ids=fac_ids)
+
+    # Return updated consumable availability resource file with scenario data
+    return full_df_with_scenario
+
+def generate_descriptive_consumable_availability_plots(tlo_availability_df: pd.DataFrame = None,
+                                                       figurespath: str = None,
+                                                       mfl: pd.DataFrame = None,
+                                                       program_item_mapping: pd.DataFrame = None,
+                                                       chosen_availability_columns: list = None,
+                                                       scenario_names_dict: dict = None):
+    # Prepare the availability dataframe for descriptive plots
+    df_for_plots = tlo_availability_df.merge(mfl[['Facility_ID', 'Facility_Level']], on='Facility_ID', how='left', validate="m:1")
+    df_for_plots = df_for_plots.merge(program_item_mapping, on='item_code', how='left', validate="m:1")
+    if chosen_availability_columns is None:
+        # Default to all availability columns
+        chosen_availability_columns = [c for c in df_for_plots.columns if c.startswith("available_prop")]
+
+    # Recreate the chosen columns list based on the mapping above
+    if scenario_names_dict is not None:
+        # Fall back to the original column name where no display name is mapped
+        chosen_availability_columns = [scenario_names_dict.get(col, col) for col in chosen_availability_columns]
+        df_for_plots = df_for_plots.rename(columns=scenario_names_dict)
+
+    # Disabled draft kept for reference: running it would require plotnine
+    # imports (ggplot, aes, geom_bar, ylim, labs, theme, element_text)
+    '''
+    # Generate a bar plot of average availability under each scenario by item_category and Facility_Level
+    def generate_barplot_of_scenarios(_df, _x_axis_var, _filename):
+        df_for_bar_plot = _df.groupby([_x_axis_var])[chosen_availability_columns].mean()
+        df_for_bar_plot = df_for_bar_plot.reset_index().melt(id_vars=[_x_axis_var], value_vars=chosen_availability_columns,
+                                                             var_name='Scenario', value_name='Value')
+        plot = (ggplot(df_for_bar_plot.reset_index(), aes(x=_x_axis_var, y='Value', fill='Scenario'))
+                + geom_bar(stat='identity', position='dodge')
+                + ylim(0, 1)
+                + labs(title="Probability of availability across scenarios",
+                       x=_x_axis_var,
+                       y='Probability of availability')
+                + theme(axis_text_x=element_text(angle=45, hjust=1))
+                )
+        plot.save(filename=figurespath / _filename, dpi=300, width=10, height=8, units='in')
+    generate_barplot_of_scenarios(_df=df_for_plots, _x_axis_var='item_category', _filename='availability_by_category.png')
+    generate_barplot_of_scenarios(_df=df_for_plots, _x_axis_var='Facility_Level', _filename='availability_by_level.png')
+    '''
+
+    # Plot 1
+    # Create heatmaps by Facility_Level of average availability by item_category across chosen scenarios
+    fac_levels = {'0', '1a', '1b', '2', '3', '4'}
+    for level in fac_levels:
+        # Generate a heatmap
+        # Pivot the DataFrame
+        aggregated_df = df_for_plots.groupby(['item_category', 'Facility_Level'])[chosen_availability_columns].mean().reset_index()
+        aggregated_df = aggregated_df[aggregated_df.Facility_Level.isin([level])]
+        heatmap_data = aggregated_df.set_index('item_category').drop(columns='Facility_Level')
+
+        # Calculate the aggregate row and column
+        aggregate_col = df_for_plots.loc[df_for_plots.Facility_Level.isin([level]), chosen_availability_columns].mean()
+        #overall_aggregate = aggregate_col.mean()
+
+        # Add aggregate row and column
+        #heatmap_data['Average'] = aggregate_row
+        #aggregate_col['Average'] = overall_aggregate
+        heatmap_data.loc['Average'] = aggregate_col
+
+        # Generate the heatmap
+        sns.set(font_scale=0.8)
+        plt.figure(figsize=(10, 8))
+        sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'})
+
+        # Customize the plot
+        plt.title(f'Facility Level {level}')
+        plt.xlabel('Scenarios')
+        plt.ylabel('Disease/Public health \n program')
+        plt.xticks(rotation=90, fontsize=8)
+        plt.yticks(rotation=0, fontsize=8)
+
+        plt.savefig(figurespath / f'consumable_availability_heatmap_{level}.png', dpi=300, bbox_inches='tight')
+        plt.close()
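+    # The groupby-then-filter above could also be written as one pivot per level
+    # (sketch only; the result matches up to row/column ordering):
+    #
+    #     heatmap_data = pd.pivot_table(df_for_plots[df_for_plots.Facility_Level == level],
+    #                                   index='item_category',
+    #                                   values=chosen_availability_columns,
+    #                                   aggfunc='mean')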
+
+    # Plot 2
+    # Create heatmap of average availability by Facility_Level across chosen scenarios
     # Pivot the DataFrame
-    aggregated_df = df_for_plots.groupby(['item_category', 'Facility_Level'])[chosen_availability_columns].mean().reset_index()
-    aggregated_df = aggregated_df[aggregated_df.Facility_Level.isin([level])]
-    heatmap_data = aggregated_df.set_index('item_category').drop(columns = 'Facility_Level')
+    aggregated_df = df_for_plots.groupby(['Facility_Level'])[chosen_availability_columns].mean().reset_index()
+    heatmap_data = aggregated_df.set_index('Facility_Level')

     # Calculate the aggregate row and column
-    aggregate_col= df_for_plots.loc[df_for_plots.Facility_Level.isin([level]), chosen_availability_columns].mean()
+    aggregate_col= df_for_plots[chosen_availability_columns].mean()
     #overall_aggregate = aggregate_col.mean()

     # Add aggregate row and column
@@ -705,346 +727,91 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename):
     sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'})

     # Customize the plot
-    plt.title(f'Facility Level {level}')
+    plt.title('Availability across scenarios')
     plt.xlabel('Scenarios')
-    plt.ylabel('Disease/Public health \n program')
+    plt.ylabel('Facility Level')
     plt.xticks(rotation=90, fontsize=8)
     plt.yticks(rotation=0, fontsize=8)

-    plt.savefig(figurespath /f'consumable_availability_heatmap_{level}.png', dpi=300, bbox_inches='tight')
+    plt.savefig(figurespath /'consumable_availability_heatmap_alllevels.png', dpi=300, bbox_inches='tight')
     plt.close()

-# Create heatmap of average availability by Facility_Level across chosen scenarios
-# Pivot the DataFrame
-aggregated_df = df_for_plots.groupby(['Facility_Level'])[chosen_availability_columns].mean().reset_index()
-heatmap_data = aggregated_df.set_index('Facility_Level')
-
-# Calculate the aggregate row and column
-aggregate_col= df_for_plots[chosen_availability_columns].mean()
-#overall_aggregate = aggregate_col.mean()
-
-# Add aggregate row and column
-#heatmap_data['Average'] = aggregate_row
-#aggregate_col['Average'] = overall_aggregate
-heatmap_data.loc['Average'] = aggregate_col
-
-# Generate the heatmap
-sns.set(font_scale=0.8)
-plt.figure(figsize=(10, 8))
-sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'})
-
-# Customize the plot
-plt.title('Availability across scenarios')
-plt.xlabel('Scenarios')
-plt.ylabel('Facility Level')
-plt.xticks(rotation=90, fontsize=8)
-plt.yticks(rotation=0, fontsize=8)
-
-plt.savefig(figurespath /'consumable_availability_heatmap_alllevels.png', dpi=300, bbox_inches='tight')
-plt.close()
-
-# Create heatmap of average availability by Facility_Level and program for actual and 75th percentile (Costing paper)
-clean_category_names = {'cancer': 'Cancer', 'cardiometabolicdisorders': 'Cardiometabolic Disorders',
-                        'contraception': 'Contraception', 'general': 'General', 'hiv': 'HIV', 'malaria': 'Malaria',
-                        'ncds': 'Non-communicable Diseases', 'neonatal_health': 'Neonatal Health',
-                        'other_childhood_illnesses': 'Other Childhood Illnesses', 'reproductive_health': 'Reproductive Health',
-                        'road_traffic_injuries': 'Road Traffic Injuries', 'tb': 'Tuberculosis',
-                        'undernutrition': 'Undernutrition', 'epi': 'Expanded programme on immunization'}
-df_with_cleaned_item_category = df_for_plots.copy()
-df_with_cleaned_item_category['item_category'] = df_for_plots['item_category'].map(clean_category_names)
-
-# Actual
-aggregated_df = df_with_cleaned_item_category.groupby(['Facility_Level', 'item_category'])['Actual'].mean().reset_index()
-heatmap_data = aggregated_df.pivot(index='item_category', columns='Facility_Level', values='Actual')
-
-# Calculate the aggregate row and column
-aggregate_col= df_with_cleaned_item_category.groupby('item_category')['Actual'].mean()
-overall_aggregate = df_with_cleaned_item_category['Actual'].mean()
-aggregate_row = df_with_cleaned_item_category.groupby('Facility_Level')['Actual'].mean()
-
-# Add aggregate row and column
-heatmap_data['Average'] = aggregate_col
-aggregate_row['Average'] = overall_aggregate
-heatmap_data.loc['Average'] = aggregate_row
-
-# Generate the heatmap
-sns.set(font_scale=1.2)
-plt.figure(figsize=(10, 8))
-sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'})
-
-# Customize the plot
-plt.xlabel('Facility Level')
-plt.ylabel('Program')
-plt.xticks(rotation=90)
-plt.yticks(rotation=0)
-
-plt.savefig(figurespath /'heatmap_program_and_level_actual.png', dpi=300, bbox_inches='tight')
-plt.show()
-plt.close()
-
-# 75th percentile
-aggregated_df = df_with_cleaned_item_category.groupby(['Facility_Level', 'item_category'])['75th percentile\n facility'].mean().reset_index()
-heatmap_data = aggregated_df.pivot(index='item_category', columns='Facility_Level', values='75th percentile\n facility')
-
-# Calculate the aggregate row and column
-aggregate_col= df_with_cleaned_item_category.groupby('item_category')['75th percentile\n facility'].mean()
-overall_aggregate = df_with_cleaned_item_category['75th percentile\n facility'].mean()
-aggregate_row = df_with_cleaned_item_category.groupby('Facility_Level')['75th percentile\n facility'].mean()
-
-
-# Add aggregate row and column
-heatmap_data['Average'] = aggregate_col
-aggregate_row['Average'] = overall_aggregate
-heatmap_data.loc['Average'] = aggregate_row
-
-# Generate the heatmap
-sns.set(font_scale=1.2)
-plt.figure(figsize=(10, 8))
-sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'})
-
-# Customize the plot
-plt.xlabel('Facility Level')
-plt.ylabel('Program')
-plt.xticks(rotation=90)
-plt.yticks(rotation=0)
-
-plt.savefig(figurespath /'heatmap_program_and_level_75perc.png', dpi=300, bbox_inches='tight')
-plt.show()
-plt.close()
-
-
-# Create a heatmap of average availability by item_category and scenario
-# Base scenario list
-base_scenarios = [['Actual']]
-# Additional scenarios to add iteratively
-additional_scenarios = [
-    ['Non-therapeutic \n consumables', 'Vital medicines', 'Pharmacist-\n managed'],
-    ['75th percentile\n facility', '90th percentile \n facility', 'Best \n facility'],
-    ['HIV supply \n chain', 'EPI supply \n chain', 'HIV moved to \n Govt supply chain \n (Avg by Level)']
-]
-# Generate iteratively chosen availability columns
-iteratively_chosen_availability_columns = [
-    base_scenarios[0] + sum(additional_scenarios[:i], []) for i in range(len(additional_scenarios) + 1)
-]
-
-i = 1
-for column_list in iteratively_chosen_availability_columns:
-    # Create heatmap of average availability by item_category across chosen scenarios
+    # Plot 3
+    # Create a barplot of average consumable availability based on the colours used in analysis_impact_of_consumable_scenarios
+    average_availability = df_for_plots[chosen_availability_columns].mean().reset_index()
+    average_availability.columns = ['scenario', 'average_availability']
+    new_row = pd.DataFrame([['Perfect', 1]], columns=['scenario', 'average_availability'])  # new row for perfect availability
+    average_availability = pd.concat([average_availability, new_row], axis=0, ignore_index=True)  # Concatenate the new row with the existing DataFrame
+
+    # Define color mapping for each scenario
+    color_mapping = {
+        'Actual': '#1f77b4',
+        'Non-therapeutic \n consumables': '#ff7f0e',
+        'Vital medicines': '#2ca02c',
+        'Pharmacist-\n managed': '#d62728',
+        '75th percentile\n facility': '#9467bd',
+        '90th percentile \n facility': '#8c564b',
+        'Best \n facility': '#e377c2',
+        'Best facility \n (including DHO)': '#7f7f7f',
+        'HIV supply \n chain': '#bcbd22',
+        'EPI supply \n chain': '#17becf',
+        'HIV moved to \n Govt supply chain \n (Avg by Level)': '#ff6347',
+        'HIV moved to \n Govt supply chain \n (Avg by Facility_ID)': '#ff7f50',
+        'HIV moved to \n Govt supply chain \n (Avg by Facility_ID times 1.25)': '#fa8072',
+        'HIV moved to \n Govt supply chain \n (Avg by Facility_ID times 0.75)': '#cd5c5c',
+        'Perfect': '#31a354'
+    }
+
+    # Create a color list for the bars
+    colors = [
+        color_mapping.get(scenario, '#808080')  # default to grey if scenario not in mapping
+        for scenario in average_availability['scenario']
+    ]
+    # Create the bar plot and capture the bars
+    plt.figure(figsize=(10, 6))
+    bars = plt.bar(average_availability['scenario'], average_availability['average_availability'], color=colors)
+    plt.title('Average Availability by Scenario')
+    plt.xlabel('Scenario')
+    plt.ylabel('Average Availability')
+    plt.xticks(rotation=90, fontsize=8)
+    plt.ylim(0, 1)  # availability is a proportion, so the y-axis is bounded at 1
+    plt.grid(axis='y')
+    # Add data labels
+    for bar in bars:
+        yval = bar.get_height()
+        plt.text(bar.get_x() + bar.get_width() / 2, yval + 0.02, round(yval, 2), ha='center', va='bottom')
+
+    # Save the plot and close the figure (the duplicated tight_layout call is dropped)
+    plt.tight_layout()
+    plt.savefig(figurespath / 'scenarios_average_availability.png', dpi=300, bbox_inches='tight')
+    plt.close()
+
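+    # The manual `plt.text` labelling above can be replaced by a single call on
+    # matplotlib >= 3.4 (sketch only):
+    #
+    #     plt.gca().bar_label(bars, fmt='%.2f', padding=2)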
+    # Plot 4
+    # Create heatmap of average availability by item for HIV program across chosen scenarios
     # Pivot the DataFrame
-    chosen_levels = ['1a', '1b']
-    aggregated_df = df_for_plots[df_for_plots.Facility_Level.isin(chosen_levels)]
-    aggregated_df = aggregated_df.groupby(['item_category'])[column_list].mean().reset_index()
-    heatmap_data = aggregated_df.set_index('item_category')
+    aggregated_df = df_for_plots[df_for_plots.item_category == 'hiv'].groupby(['item_code'])[chosen_availability_columns].mean().reset_index()
+    heatmap_data = aggregated_df.set_index('item_code')

     # Calculate the aggregate row and column
-    aggregate_col= df_for_plots.loc[df_for_plots.Facility_Level.isin(chosen_levels), column_list].mean()
+    aggregate_col= df_for_plots[chosen_availability_columns].mean()  # NB: the 'Average' row is computed across all programmes, not HIV items alone
     #overall_aggregate = aggregate_col.mean()

     # Add aggregate row and column
-    heatmap_data['Perfect'] = 1 # Add a column representing the perfect scenario
-    aggregate_col['Perfect'] = 1
+    #heatmap_data['Average'] = aggregate_row
+    #aggregate_col['Average'] = overall_aggregate
     heatmap_data.loc['Average'] = aggregate_col
-
-    # Ensure all scenarios are represented for consistent column widths
-    all_scenarios = chosen_availability_columns + ['Perfect']
-    heatmap_data = heatmap_data.reindex(columns=all_scenarios)
-
-    # Update column names for x-axis labels
     # Generate the heatmap
-    sns.set(font_scale=1.2)
+    sns.set(font_scale=0.8)
     plt.figure(figsize=(10, 8))
     sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'})

     # Customize the plot
     plt.title('Availability across scenarios')
     plt.xlabel('Scenarios')
-    plt.ylabel('Facility Level')
-    plt.xticks(rotation=90)
-    plt.yticks(rotation=0)
+    plt.ylabel('Item Code')
+    plt.xticks(rotation=90, fontsize=8)
+    plt.yticks(rotation=0, fontsize=8)
-
-    plt.savefig(figurespath /f'consumable_availability_heatmap_bycategory_iter{i}.png', dpi=300, bbox_inches='tight')
+    plt.savefig(figurespath /'consumable_availability_heatmap_hiv_alllevels_byconsumable.png', dpi=300, bbox_inches='tight')
     plt.close()
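+    # For a more readable y-axis, the item codes could be mapped to item names
+    # before setting the index (sketch; assumes a code-to-name lookup dict built
+    # from the consumables items resource file):
+    #
+    #     aggregated_df['item_name'] = aggregated_df['item_code'].map(consumables_dict)
+    #     heatmap_data = aggregated_df.set_index('item_name')[chosen_availability_columns]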
-    i = i + 1
-
-# Create a barplot of average consumable availability based on the colours used in analysis_impact_of_consumable_scenarios
-average_availability = df_for_plots[chosen_availability_columns].mean().reset_index()
-average_availability.columns = ['scenario', 'average_availability']
-new_row = pd.DataFrame([['Perfect', 1]], columns=['scenario', 'average_availability']) # new row for perfect availability
-average_availability = pd.concat([average_availability, new_row], axis=0, ignore_index=True) # Concatenate the new row with the existing DataFrame
-
-# Define color mapping for each scenario
-color_mapping = {
-    'Actual': '#1f77b4',
-    'Non-therapeutic \n consumables': '#ff7f0e',
-    'Vital medicines': '#2ca02c',
-    'Pharmacist-\n managed': '#d62728',
-    '75th percentile\n facility': '#9467bd',
-    '90th percentile \n facility': '#8c564b',
-    'Best \n facility': '#e377c2',
-    'Best facility \n (including DHO)': '#7f7f7f',
-    'HIV supply \n chain': '#bcbd22',
-    'EPI supply \n chain': '#17becf',
-    'HIV moved to \n Govt supply chain \n (Avg by Level)': '#ff6347', # original tomato
-    'HIV moved to \n Govt supply chain \n (Avg by Facility_ID)': '#ff7f50', # coral
-    'HIV moved to \n Govt supply chain \n (Avg by Facility_ID times 1.25)': '#fa8072', # salmon
-    'HIV moved to \n Govt supply chain \n (Avg by Facility_ID times 0.75)': '#cd5c5c', # indian red
-    'Perfect': '#31a354'
-}
-
-# Create a color list for the bars
-colors = [color_mapping[scenario] for scenario in average_availability['scenario']]
-# Create the bar plot and capture the bars
-plt.figure(figsize=(10, 6))
-bars = plt.bar(average_availability['scenario'], average_availability['average_availability'], color=colors)
-plt.title('Average Availability by Scenario')
-plt.xlabel('Scenario')
-plt.ylabel('Average Availability')
-plt.xticks(rotation=90, fontsize=8)
-plt.ylim(0, 1) # Adjust based on your data range
-plt.grid(axis='y')
-# Add data labels
-for bar in bars:
-    yval = bar.get_height()
-    plt.text(bar.get_x() + bar.get_width() / 2, yval + 0.02, round(yval, 2), ha='center', va='bottom')
-
-# Save the plot
-plt.tight_layout()
-plt.tight_layout()
-plt.savefig(figurespath / 'scenarios_average_availability.png', dpi=300, bbox_inches='tight')
-
-
-# Create the directory if it doesn't exist
-roi_plots_path = outputfilepath / 'horizontal_v_vertical/roi/'
-if not os.path.exists(roi_plots_path):
-    os.makedirs(roi_plots_path)
-
-# Create a combined plot of heatmaps of average availability for levels 1a and 1b under actual, 75th percentile, HIV and EPI scenarios
-# Scenario list
-scenarios_for_roi_paper = ['Actual', '75th percentile\n facility', 'HIV supply \n chain', 'EPI supply \n chain']
-# Define facility levels
-chosen_levels = ['1a', '1b']
-
-# Create a figure with subplots for each level
-fig, axes = plt.subplots(nrows=1, ncols=len(chosen_levels), figsize=(20, 8), sharex=True, sharey=True)
-# Create a single colorbar axis
-cbar_ax = fig.add_axes([.91, .3, .02, .4]) # Position of the colorbar
-
-for ax, level in zip(axes, chosen_levels):
-    # Filter data for the current facility level
-    aggregated_df = df_for_plots[df_for_plots.Facility_Level.isin([level])]
-    aggregated_df = aggregated_df.groupby(['item_category'])[scenarios_for_roi_paper].mean().reset_index()
-    heatmap_data = aggregated_df.set_index('item_category')
-
-    # Calculate the aggregate row
-    aggregate_col = df_for_plots.loc[df_for_plots.Facility_Level.isin([level]), scenarios_for_roi_paper].mean()
-    heatmap_data.loc['Average'] = aggregate_col
-    heatmap_data = heatmap_data.rename(columns = {'75th percentile\n facility': "Consumables increased \n to 75th percentile",
-                                                  'HIV supply \n chain': "Consuambles increased \n to HIV level",
-                                                  'EPI supply \n chain': "Consumables increased \n to EPI level"})
-
-    # Generate the heatmap on the current subplot
-    sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', vmin = 0, vmax = 1,
-                ax=ax, cbar=(ax == axes[-1]), cbar_ax=(cbar_ax if ax == axes[-1] else None),
-                annot_kws={"size": 14})
-
-    # Set labels
-    ax.set_title(f'Level {level}')
-    ax.set_xlabel('Scenarios')
-    ax.set_ylabel('Program' if ax == axes[0] else "")
-
-cbar_ax.set_ylabel('Proportion of days consumable is available')
-# Save the combined heatmap
-plt.savefig(roi_plots_path / 'combined_consumable_availability_heatmap_1a_1b.png', dpi=300, bbox_inches='tight')
-plt.close()
-
-# Create a combined plot of heatmaps of average availability for all levels under actual, 75th percentile, HIV and EPI scenarios
-chosen_levels = ['0', '1a', '1b', '2', '3']
-# Create a figure with subplots
-fig, axes = plt.subplots(nrows=1, ncols=len(chosen_levels), figsize=(20, 8), sharex=True, sharey=True)
-
-# Create a single colorbar axis
-cbar_ax = fig.add_axes([.91, .3, .02, .4]) # Position of the colorbar
-
-for ax, level in zip(axes, chosen_levels):
-    # Filter data for the current facility level
-    aggregated_df = df_for_plots[df_for_plots.Facility_Level.isin([level])]
-    aggregated_df = aggregated_df.groupby(['item_category'])[scenarios_for_roi_paper].mean().reset_index()
-    heatmap_data = aggregated_df.set_index('item_category')
-
-    # Calculate the aggregate row
-    aggregate_col = df_for_plots.loc[df_for_plots.Facility_Level.isin([level]), scenarios_for_roi_paper].mean()
-    heatmap_data.loc['Average'] = aggregate_col
-
-    # Generate the heatmap on the current subplot
-    sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', ax=ax, cbar=(ax == axes[-1]), cbar_ax=(cbar_ax if ax == axes[-1] else None))
-
-    # Set labels
-    ax.set_title(f'Level {level}')
-    ax.set_xlabel('Scenarios')
-    ax.set_ylabel('Program' if ax == axes[0] else "")
-
-# Adjust layout
-cbar_ax.set_ylabel('Proportion of days consumable is available')
-# Save the combined heatmap
-plt.savefig(roi_plots_path / 'combined_consumable_availability_heatmap_all_levels.png', dpi=300, bbox_inches='tight')
-plt.close()
-
-
-# Create heatmap of average availability by Facility_Level across chosen scenarios
-# Pivot the DataFrame
-aggregated_df = df_for_plots[df_for_plots.item_category == 'hiv'].groupby(['Facility_Level'])[chosen_availability_columns].mean().reset_index()
-heatmap_data = aggregated_df.set_index('Facility_Level')
-
-# Calculate the aggregate row and column
-aggregate_col= df_for_plots[chosen_availability_columns].mean()
-#overall_aggregate = aggregate_col.mean()
-
-# Add aggregate row and column
-#heatmap_data['Average'] = aggregate_row
-#aggregate_col['Average'] = overall_aggregate
-heatmap_data.loc['Average'] = aggregate_col
-
-# Generate the heatmap
-sns.set(font_scale=0.8)
-plt.figure(figsize=(10, 8))
-sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'})
-
-# Customize the plot
-plt.title('Availability across scenarios')
-plt.xlabel('Scenarios')
-plt.ylabel('Facility Level')
-plt.xticks(rotation=90, fontsize=8)
-plt.yticks(rotation=0, fontsize=8)
-
-plt.savefig(figurespath /'consumable_availability_heatmap_hiv_alllevels.png', dpi=300, bbox_inches='tight')
-plt.close()
-
-
-# Create heatmap of average availability by item_code for the HIV program across chosen scenarios
-# Pivot the DataFrame
-aggregated_df = df_for_plots[df_for_plots.item_category == 'hiv'].groupby(['item_code'])[chosen_availability_columns].mean().reset_index()
-heatmap_data = aggregated_df.set_index('item_code')
-
-# Calculate the aggregate row and column
-aggregate_col= df_for_plots[chosen_availability_columns].mean()
-#overall_aggregate = aggregate_col.mean()
-
-# Add aggregate row and column
-#heatmap_data['Average'] = aggregate_row
-#aggregate_col['Average'] = overall_aggregate
-heatmap_data.loc['Average'] = aggregate_col
-
-# Generate the heatmap
-sns.set(font_scale=0.8)
-plt.figure(figsize=(10, 8))
-sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'})
-
-# Customize the plot
-plt.title('Availability across scenarios')
-plt.xlabel('Scenarios')
-plt.ylabel('Item Code')
-plt.xticks(rotation=90, fontsize=8)
-plt.yticks(rotation=0, fontsize=8)
-
-plt.savefig(figurespath /'consumable_availability_heatmap_hiv_alllevels_byconsumable.png', dpi=300, bbox_inches='tight')
-plt.close()
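+# Usage sketch (illustrative only - the caller and the output path are
+# assumptions, not part of this change; the scenario-generating function above
+# returns the updated dataframe rather than writing it):
+#
+#     updated_df = <scenario-generating function defined above>(...)
+#     updated_df.to_csv(
+#         "./resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv",
+#         index=False)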