22import json
33import uuid
44import logging
5+ import hashlib
6+
57from modules .BatchDataLoader import BatchDataLoader
68from modules .DestinationTableManager import DestinationTableManager
79from modules .data_load_tracking .DataLoadTracker import DataLoadTracker
@@ -25,35 +27,54 @@ def start_single_import(self, target_engine, model_name, requested_full_refresh)
2527
2628 config_file = os .path .abspath (self .configuration_path + model_name )
2729 self .logger .debug ("Using configuration file : {0}" .format (config_file ))
28- with open (config_file ) as json_data :
29- pipeline_configuration = json .load (json_data )
30+ with open (config_file ) as json_file :
31+ model_checksum = hashlib .md5 (json_file .read ().encode ('utf-8' )).hexdigest ()
32+ json_file .seek (0 )
33+ pipeline_configuration = json .load (json_file )
3034
3135 self .logger .info ("Execute Starting for: {0} requested_full_refresh: {1}" .format (model_name , requested_full_refresh ))
3236
3337 destination_table_manager = DestinationTableManager (target_engine )
3438
39+ full_refresh_reason = "Command Line Argument" if requested_full_refresh else "N/A"
3540 full_refresh = requested_full_refresh
3641 if not requested_full_refresh and not destination_table_manager .table_exists (pipeline_configuration ['target_schema' ],
3742 pipeline_configuration ['load_table' ]):
3843 self .logger .warning ("The load table {0}.{1} does not exist. Swapping to full-refresh mode" .format (pipeline_configuration ['target_schema' ],
3944 pipeline_configuration ['load_table' ]))
45+
46+ full_refresh_reason = "Destination table does not exist"
4047 full_refresh = True
4148
4249 self .data_source .assert_data_source_is_valid (pipeline_configuration ['source_table' ],
4350 pipeline_configuration ['columns' ])
4451
45- last_sync_version = self .data_load_tracker_repository .get_last_sync_version (model_name )
52+ last_successful_data_load_execution = self .data_load_tracker_repository .get_last_successful_data_load_execution (model_name )
53+
54+ if last_successful_data_load_execution is None :
55+ last_sync_version = 0
56+ full_refresh_reason = "First Execution"
57+ full_refresh = True ,
58+ else :
59+ self .logger .debug ("Previous Checksum {0}. Current Checksum {1}" .format (last_successful_data_load_execution .model_checksum , model_checksum ))
60+ last_sync_version = last_successful_data_load_execution .next_sync_version
61+ if not full_refresh and last_successful_data_load_execution .model_checksum != model_checksum :
62+ self .logger .info ("A model checksum change has forced this to be a full load" )
63+ full_refresh = True
64+ full_refresh_reason = "Model Change"
4665
4766 change_tracking_info = self .data_source .init_change_tracking (pipeline_configuration ['source_table' ],
4867 last_sync_version )
4968
5069
51- data_load_tracker = DataLoadTracker (model_name , json_data , full_refresh , change_tracking_info , self .correlation_id )
5270
53- self .logger .debug (" Change Tracking: this_sync_version: {0} next_sync_version: {1} force_full_load:{2} : " .format (change_tracking_info .this_sync_version , change_tracking_info .next_sync_version , change_tracking_info .force_full_load ))
5471 if not full_refresh and change_tracking_info .force_full_load :
5572 self .logger .info ("Change tracking has forced this to be a full load" )
5673 full_refresh = True
74+ full_refresh_reason = "Change Tracking Invalid"
75+
76+ data_load_tracker = DataLoadTracker (model_name , model_checksum , json_file , full_refresh , change_tracking_info ,
77+ self .correlation_id , full_refresh_reason )
5778
5879 columns = pipeline_configuration ['columns' ]
5980 destination_table_manager .create_schema (pipeline_configuration ['target_schema' ])
0 commit comments