This repository was archived by the owner on Mar 13, 2020. It is now read-only.

Commit 8c97e80
Merge pull request #4 from PageUpPeopleOrg/ExecutionLogs: Execution logs
2 parents d9855f1 + fd27b9b

17 files changed: +260 -70 lines

.gitignore

Lines changed: 2 additions & 0 deletions

@@ -1,4 +1,6 @@
+# IDE-specific settings
 .idea/
+.vscode/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

README.md

Lines changed: 1 addition & 8 deletions

@@ -18,15 +18,8 @@ In the above example, dwsource is a 64bit ODBC system dsn
 
 
 ### Examples
-#### CSV Source
-
-`py rdl.py csv://.\test_data\full_refresh postgresql+psycopg2://postgres:xxxx@localhost/dest_dw .\configuration\ --log-level INFO --full-refresh yes`
-`py rdl.py csv://.\test_data\incremental_refresh postgresql+psycopg2://postgres:xxxx@localhost/dest_dw .\configuration\ --log-level INFO --full-refresh no`
-
-
-#### MSSQL Source
-
 
+See `test_*.cmd` scripts for usage samples.
 
 ### Troubleshooting
 Run with `--log-level DEBUG` on the command line.

appveyor.yml

Lines changed: 2 additions & 2 deletions

@@ -39,15 +39,15 @@ build_script:
 - psql -c "SELECT VERSION()"
 - createdb %DBNAME%
 - psql -d %DBNAME% -c "CREATE EXTENSION IF NOT EXISTS citext"
-
+- C:\projects\relational-data-loader\venv\Scripts\activate.bat
 #Install the dependencies for rdl.
 - pip install -r requirements.txt
 
 
 test_script:
 - test_full_refresh_from_csv.cmd
 - test_incremental_refresh_from_csv.cmd
-- test_full_refresh_from_mssql.cmd
+- test_incremental_refresh_from_mssql.cmd
 
 on_finish:
 #Enable this line to make the build pause after completion for RDP troubleshooting.

integration_tests/mssql_source/config/LargeTableTest.json

Lines changed: 0 additions & 1 deletion

@@ -1,5 +1,4 @@
 {
-
   "source_table": {
     "name": "LargeTable",
     "schema": "dbo",
Lines changed: 3 additions & 0 deletions (file name not shown in this view)

@@ -0,0 +1,3 @@
+FROM microsoft/mssql-server-linux:2017-latest
+
+ENV ACCEPT_EULA=Y

Lines changed: 4 additions & 1 deletion (file name not shown in this view)

@@ -1,2 +1,5 @@
 IF NOT EXISTS (SELECT * FROM sys.databases WHERE Name = 'RelationalDataLoaderIntegrationTestSource')
-CREATE DATABASE RelationalDataLoaderIntegrationTestSource
+CREATE DATABASE RelationalDataLoaderIntegrationTestSource
+
+ALTER DATABASE RelationalDataLoaderIntegrationTestSource
+SET CHANGE_TRACKING = ON (CHANGE_RETENTION = 2 DAYS, AUTO_CLEANUP = ON)
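
The Python side of this commit consumes a `change_tracking_info` object exposing `force_full_load` and `this_sync_version` (see `BatchDataLoader` and `DataLoadManager` below), but the data source's `init_change_tracking` implementation is not one of the files expanded in this view. A minimal sketch of the standard SQL Server change-tracking pattern it would rest on, with the function and tuple names assumed from the attributes the diff uses:

# Minimal sketch, assuming a SQLAlchemy engine pointed at a SQL Server source
# with change tracking enabled (as in the test script above). Not code from
# this commit; names mirror the attributes the diff consumes.
from collections import namedtuple

ChangeTrackingInfo = namedtuple(
    'ChangeTrackingInfo', ['force_full_load', 'last_sync_version', 'this_sync_version'])


def init_change_tracking(source_engine, qualified_table_name, last_sync_version):
    # The version this run will sync up to.
    this_sync_version = source_engine.execute(
        "SELECT CHANGE_TRACKING_CURRENT_VERSION()").scalar()

    # If retention (2 days above) has already discarded versions newer than the
    # stored bookmark, an incremental sync would silently miss changes, so a
    # full load must be forced.
    min_valid_version = source_engine.execute(
        "SELECT CHANGE_TRACKING_MIN_VALID_VERSION(OBJECT_ID('{0}'))".format(
            qualified_table_name)).scalar()

    force_full_load = min_valid_version is None or last_sync_version < min_valid_version
    return ChangeTrackingInfo(force_full_load, last_sync_version, this_sync_version)

Reading the changed rows themselves would then go through `CHANGETABLE(CHANGES ...)`, which is outside this sketch.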

modules/BatchDataLoader.py

Lines changed: 11 additions & 4 deletions

@@ -6,7 +6,7 @@
 
 class BatchDataLoader(object):
     def __init__(self, data_source, source_table_configuration, target_schema, target_table, columns, data_load_tracker,
-                 batch_configuration, target_engine, logger=None):
+                 batch_configuration, target_engine, full_refresh, change_tracking_info, logger=None):
         self.logger = logger or logging.getLogger(__name__)
         self.source_table_configuration = source_table_configuration
         self.columns = columns
@@ -16,15 +16,18 @@ def __init__(self, data_source, source_table_configuration, target_schema, targe
         self.data_load_tracker = data_load_tracker
         self.batch_configuration = batch_configuration
         self.target_engine = target_engine
+        self.full_refresh = full_refresh
+        self.change_tracking_info = change_tracking_info
 
     # Imports rows, returns True if >0 rows were found
     def load_batch(self, previous_batch_key):
         batch_tracker = self.data_load_tracker.start_batch()
 
-        self.logger.debug("ImportBatch Starting from previous_batch_key: {0}".format(previous_batch_key))
+        self.logger.debug("ImportBatch Starting from previous_batch_key: {0}. Full Refresh: {1} this_sync_version: {2}".format(previous_batch_key, self.full_refresh, self.change_tracking_info.this_sync_version))
 
         data_frame = self.data_source.get_next_data_frame(self.source_table_configuration, self.columns,
-                                                          self.batch_configuration, batch_tracker, previous_batch_key)
+                                                          self.batch_configuration, batch_tracker, previous_batch_key,
+                                                          self.full_refresh, self.change_tracking_info)
 
         if data_frame is None or len(data_frame) == 0:
             self.logger.debug("There are no rows to import, returning -1")
@@ -44,7 +47,6 @@ def write_data_frame_to_table(self, data_frame):
         qualified_target_table = "{0}.{1}".format(self.target_schema, self.target_table)
         self.logger.debug("Starting write to table {0}".format(qualified_target_table))
         data = StringIO()
-
         data_frame.to_csv(data, header=False, index=False, na_rep='', float_format='%.16g')
         # Float_format is used to truncate any insignificant digits. Unfortunately it gives us an artificial limitation
 
@@ -58,6 +60,7 @@
 
         sql = "COPY {0}({1}) FROM STDIN with csv".format(qualified_target_table, column_list)
         self.logger.debug("Writing to table using command {0}".format(sql))
+
         curs.copy_expert(sql=sql, file=data)
 
         self.logger.debug("Completed write to table {0}".format(qualified_target_table))
@@ -70,6 +73,10 @@ def get_destination_column_name(self, source_column_name):
             if column['source_name'] == source_column_name:
                 return column['destination']['name']
 
+        # Internal columns - map them straight through
+        if source_column_name.startswith("data_pipeline_"):
+            return source_column_name
+
         message = 'A source column with name {0} was not found in the column configuration'.format(source_column_name)
         raise ValueError(message)
 
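
The write path in `write_data_frame_to_table` is PostgreSQL's `COPY ... FROM STDIN`, driven through psycopg2's `copy_expert` with an in-memory CSV buffer. A self-contained sketch of the same pattern, assuming a psycopg2-backed SQLAlchemy engine; the connection URL is taken from the README examples and the table and column names are illustrative only:

# Standalone sketch of the COPY write path; table/column names are assumptions.
from io import StringIO

import pandas
from sqlalchemy import create_engine


def copy_data_frame(engine, data_frame, qualified_target_table, column_list):
    data = StringIO()
    # Same serialisation flags as BatchDataLoader: no header or index, NULLs as
    # empty fields, floats truncated to 16 significant digits.
    data_frame.to_csv(data, header=False, index=False, na_rep='', float_format='%.16g')
    data.seek(0)

    connection = engine.raw_connection()
    try:
        with connection.cursor() as curs:
            sql = "COPY {0}({1}) FROM STDIN with csv".format(qualified_target_table, column_list)
            curs.copy_expert(sql=sql, file=data)
        connection.commit()
    finally:
        connection.close()


engine = create_engine('postgresql+psycopg2://postgres:xxxx@localhost/dest_dw')
frame = pandas.DataFrame({'id': [1, 2], 'name': ['a', 'b']})
copy_data_frame(engine, frame, 'public.stage_example', 'id,name')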

modules/DataLoadManager.py

Lines changed: 50 additions & 14 deletions

@@ -1,48 +1,81 @@
 import os
 import json
+import uuid
 import logging
+import hashlib
+
 from modules.BatchDataLoader import BatchDataLoader
 from modules.DestinationTableManager import DestinationTableManager
-from modules.DataLoadTracker import DataLoadTracker
-
+from modules.data_load_tracking.DataLoadTracker import DataLoadTracker
 
 
 class DataLoadManager(object):
-    def __init__(self, configuration_path, data_source, logger=None):
+    def __init__(self, configuration_path, data_source, data_load_tracker_repository, logger=None):
         self.logger = logger or logging.getLogger(__name__)
         self.configuration_path = configuration_path
         self.data_source = data_source
-
+        self.data_load_tracker_repository = data_load_tracker_repository
+        self.correlation_id = uuid.uuid4()
     def start_imports(self, target_engine, full_refresh):
         for file in os.listdir(self.configuration_path):
             self.start_single_import(target_engine, file, full_refresh)
 
         self.logger.info("Execution completed.")
 
-    def start_single_import(self, target_engine, configuration_name, requested_full_refresh):
-        self.logger.debug("Using configuration file : {0}".format(configuration_name))
+    def start_single_import(self, target_engine, model_name, requested_full_refresh):
+        self.logger.debug("Using configuration file : {0}".format(model_name))
 
-        config_file = os.path.abspath(self.configuration_path + configuration_name)
+        config_file = os.path.abspath(self.configuration_path + model_name)
         self.logger.debug("Using configuration file : {0}".format(config_file))
-        with open(config_file) as json_data:
-            pipeline_configuration = json.load(json_data)
+        with open(config_file) as json_file:
+            model_checksum = hashlib.md5(json_file.read().encode('utf-8')).hexdigest()
+            json_file.seek(0)
+            pipeline_configuration = json.load(json_file)
 
-        self.logger.info("Execute Starting for: {0} requested_full_refresh: {1}".format(configuration_name, requested_full_refresh))
+        self.logger.info("Execute Starting for: {0} requested_full_refresh: {1}".format(model_name, requested_full_refresh))
 
         destination_table_manager = DestinationTableManager(target_engine)
 
+        full_refresh_reason = "Command Line Argument" if requested_full_refresh else "N/A"
         full_refresh = requested_full_refresh
         if not requested_full_refresh and not destination_table_manager.table_exists(pipeline_configuration['target_schema'],
                                                                                      pipeline_configuration['load_table']):
            self.logger.warning("The load table {0}.{1} does not exist. Swapping to full-refresh mode".format(pipeline_configuration['target_schema'],
                                                                                                              pipeline_configuration['load_table']))
-            full_refresh = True
 
-        data_load_tracker = DataLoadTracker(configuration_name, json_data, full_refresh)
+            full_refresh_reason = "Destination table does not exist"
+            full_refresh = True
 
         self.data_source.assert_data_source_is_valid(pipeline_configuration['source_table'],
                                                      pipeline_configuration['columns'])
 
+        last_successful_data_load_execution = self.data_load_tracker_repository.get_last_successful_data_load_execution(model_name)
+
+        if last_successful_data_load_execution is None:
+            last_sync_version = 0
+            full_refresh_reason = "First Execution"
+            full_refresh = True
+        else:
+            self.logger.debug("Previous Checksum {0}. Current Checksum {1}".format(last_successful_data_load_execution.model_checksum, model_checksum))
+            last_sync_version = last_successful_data_load_execution.next_sync_version
+            if not full_refresh and last_successful_data_load_execution.model_checksum != model_checksum:
+                self.logger.info("A model checksum change has forced this to be a full load")
+                full_refresh = True
+                full_refresh_reason = "Model Change"
+
+        change_tracking_info = self.data_source.init_change_tracking(pipeline_configuration['source_table'],
+                                                                     last_sync_version)
+
+
+
+        if not full_refresh and change_tracking_info.force_full_load:
+            self.logger.info("Change tracking has forced this to be a full load")
+            full_refresh = True
+            full_refresh_reason = "Change Tracking Invalid"
+
+        data_load_tracker = DataLoadTracker(model_name, model_checksum, json_file, full_refresh, change_tracking_info,
+                                            self.correlation_id, full_refresh_reason)
+
         columns = pipeline_configuration['columns']
         destination_table_manager.create_schema(pipeline_configuration['target_schema'])
 
@@ -60,7 +93,9 @@ def start_single_import(self, target_engine, configuration_name, requested_full_
                                           columns,
                                           data_load_tracker,
                                           pipeline_configuration['batch'],
-                                          target_engine)
+                                          target_engine,
+                                          full_refresh,
+                                          change_tracking_info)
 
         previous_unique_column_value = 0
         while previous_unique_column_value > -1:
@@ -82,5 +117,6 @@ def start_single_import(self, target_engine, configuration_name, requested_full_
         destination_table_manager.drop_table(pipeline_configuration['target_schema'],
                                              pipeline_configuration['stage_table'])
         data_load_tracker.completed_successfully()
-        self.logger.info("Import for configuration: {0} Complete. {1}".format(configuration_name, data_load_tracker.get_statistics()))
+        self.data_load_tracker_repository.save(data_load_tracker)
+        self.logger.info("Import Complete for: {0}. {1}".format(model_name, data_load_tracker.get_statistics()))
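
The checksum step above hashes the raw configuration text and then rewinds the handle so `json.load` still reads from the start; any edit to the model file therefore changes the digest and forces a full load on the next run. The same idea in isolation, with the file path assumed:

import hashlib
import json

# Hash the exact bytes of the model file, then rewind so json.load() can parse
# the same handle; the path here is illustrative.
with open('configuration/LargeTableTest.json') as json_file:
    model_checksum = hashlib.md5(json_file.read().encode('utf-8')).hexdigest()
    json_file.seek(0)
    pipeline_configuration = json.load(json_file)

# Compared against the checksum stored by the previous successful run; a
# mismatch sets full_refresh_reason = "Model Change" and forces a full load.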

modules/DestinationTableManager.py

Lines changed: 18 additions & 12 deletions

@@ -3,14 +3,15 @@
 import logging
 from modules.ColumnTypeResolver import ColumnTypeResolver
 
-from sqlalchemy import MetaData, DateTime, Boolean
+from sqlalchemy import MetaData, DateTime, Boolean, BigInteger
 from sqlalchemy.schema import Column, Table
 from sqlalchemy.sql import func
 
 
 class DestinationTableManager(object):
     TIMESTAMP_COLUMN_NAME = "data_pipeline_timestamp"
     IS_DELETED_COLUMN_NAME = "data_pipeline_is_deleted"
+    CHANGE_VERSION_COLUMN_NAME = "data_pipeline_change_version"
 
     def __init__(self, target_engine, logger=None):
         self.logger = logger or logging.getLogger(__name__)
@@ -23,7 +24,6 @@ def create_schema(self, schema_name):
     def table_exists(self, schema_name, table_name):
         return self.target_engine.dialect.has_table(self.target_engine, table_name, schema_name)
 
-
     def drop_table(self, schema_name, table_name):
         metadata = MetaData()
         self.logger.debug(
@@ -35,9 +35,6 @@ def drop_table(self, schema_name, table_name):
         self.logger.debug(
             "Dropped table {0}.{1}".format(schema_name, table_name))
 
-
-
-
     def create_table(self, schema_name, table_name, columns_configuration, drop_first):
         metadata = MetaData()
 
@@ -52,9 +49,8 @@ def create_table(self, schema_name, table_name, columns_configuration, drop_firs
         table.append_column(
             Column(self.IS_DELETED_COLUMN_NAME, Boolean, server_default='f', default=False))
 
-
-
-
+        table.append_column(
+            Column(self.CHANGE_VERSION_COLUMN_NAME, BigInteger))
 
         if drop_first:
             self.logger.debug(
@@ -66,6 +62,7 @@ def create_table(self, schema_name, table_name, columns_configuration, drop_firs
         self.logger.debug("Creating table {0}.{1}".format(schema_name, table_name))
         table.create(self.target_engine, checkfirst=False)
         self.logger.debug("Created table {0}.{1}".format(schema_name, table_name))
+
         return
 
     def create_column(self, configuration):
@@ -116,8 +113,12 @@ def upsert_table(self, schema_name, source_table_name, target_table_name, column
         column_array = list(map(lambda column: column['destination']['name'], columns_configuration))
         column_list = ','.join(map(str, column_array))
         column_list = column_list + ",{0}".format(self.TIMESTAMP_COLUMN_NAME)
+        column_list = column_list + ",{0}".format(self.IS_DELETED_COLUMN_NAME)
+        column_list = column_list + ",{0}".format(self.CHANGE_VERSION_COLUMN_NAME)
 
-        primary_key_column_array = [column_configuration['destination']['name'] for column_configuration in columns_configuration if 'primary_key' in column_configuration['destination'] and column_configuration['destination']['primary_key']]
+        primary_key_column_array = [column_configuration['destination']['name'] for column_configuration in
+                                    columns_configuration if 'primary_key' in column_configuration['destination'] and
+                                    column_configuration['destination']['primary_key']]
 
         primary_key_column_list = ','.join(map(str, primary_key_column_array))
 
@@ -128,11 +129,16 @@ def upsert_table(self, schema_name, source_table_name, target_table_name, column
         sql_builder.write(os.linesep)
         sql_builder.write(" ON CONFLICT({0}) DO UPDATE SET ".format(primary_key_column_list))
 
-        for column_configuratiomn in columns_configuration:
-            sql_builder.write("{0} = EXCLUDED.{0},".format(column_configuratiomn['destination']['name']))
+        for column_configuration in columns_configuration:
+            sql_builder.write("{0} = EXCLUDED.{0},".format(column_configuration['destination']['name']))
         sql_builder.write(os.linesep)
 
-        sql_builder.write("{0} = EXCLUDED.{0}".format(self.TIMESTAMP_COLUMN_NAME))
+        sql_builder.write("{0} = EXCLUDED.{0},".format(self.TIMESTAMP_COLUMN_NAME))
+        sql_builder.write(os.linesep)
+        sql_builder.write("{0} = EXCLUDED.{0},".format(self.IS_DELETED_COLUMN_NAME))
+        sql_builder.write(os.linesep)
+        sql_builder.write("{0} = EXCLUDED.{0}".format(self.CHANGE_VERSION_COLUMN_NAME))
+        sql_builder.write(os.linesep)
 
         self.logger.debug("Upsert executing {0}".format(sql_builder.getvalue()))
         self.target_engine.execute(sql_builder.getvalue())
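
With the two extra writes, the generated `ON CONFLICT` clause now refreshes the pipeline bookkeeping columns (`data_pipeline_is_deleted`, `data_pipeline_change_version`) alongside the configured ones. A compact restatement of the SET-clause construction (newline placement differs slightly from the original loop, and the column names in the usage line are hypothetical):

import os
from io import StringIO

PIPELINE_COLUMNS = ['data_pipeline_timestamp', 'data_pipeline_is_deleted',
                    'data_pipeline_change_version']


def build_set_clause(destination_column_names):
    # One "col = EXCLUDED.col" pair per configured column plus the three
    # pipeline columns, comma-separated, with no trailing comma at the end.
    sql_builder = StringIO()
    pairs = ['{0} = EXCLUDED.{0}'.format(name)
             for name in destination_column_names + PIPELINE_COLUMNS]
    sql_builder.write((',' + os.linesep).join(pairs))
    sql_builder.write(os.linesep)
    return sql_builder.getvalue()


print(build_set_clause(['id', 'name']))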

modules/RelationalDataLoader.py

Lines changed: 6 additions & 1 deletion

@@ -2,7 +2,9 @@
 import argparse
 from sqlalchemy import create_engine
 from modules.DataLoadManager import DataLoadManager
+from modules.data_load_tracking.DataLoadTrackerRepository import DataLoadTrackerRepository
 from modules.data_sources.DataSourceFactory import DataSourceFactory
+from sqlalchemy.orm import sessionmaker
 
 _LOG_LEVEL_STRINGS = ['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG']
 
@@ -20,7 +22,10 @@ def main(self):
 
         destination_engine = create_engine(args['destination-engine'])
 
-        data_load_manager = DataLoadManager(args['configuration-folder'], data_source)
+        session_maker = sessionmaker(bind=destination_engine)
+        repository = DataLoadTrackerRepository(session_maker)
+        repository.create_tables(destination_engine)
+        data_load_manager = DataLoadManager(args['configuration-folder'], data_source, repository)
         data_load_manager.start_imports(destination_engine, args['full_refresh'])
 
     def configure_logging(self, log_level):
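
`DataLoadTrackerRepository` itself is one of the files not expanded in this view. From its call sites, it must expose `create_tables` (here in `main`) plus `get_last_successful_data_load_execution` and `save` (in `DataLoadManager`). A plausible sketch against that surface; the `DataLoadExecution` model, its columns, and the tracker attributes read in `save` are all assumptions:

# Plausible sketch only: the real repository and its ORM model are not shown
# in this commit view; every column and attribute name below is assumed.
from sqlalchemy import BigInteger, Column, DateTime, Integer, String, func
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class DataLoadExecution(Base):
    __tablename__ = 'data_load_execution'
    id = Column(Integer, primary_key=True)
    model_name = Column(String(250))
    model_checksum = Column(String(100))
    status = Column(String(50))
    next_sync_version = Column(BigInteger)
    completed_on = Column(DateTime, server_default=func.now())


class DataLoadTrackerRepository(object):
    def __init__(self, session_maker):
        self.session_maker = session_maker

    def create_tables(self, engine):
        # Idempotent: create_all skips tables that already exist.
        Base.metadata.create_all(engine)

    def get_last_successful_data_load_execution(self, model_name):
        session = self.session_maker()
        try:
            return (session.query(DataLoadExecution)
                    .filter_by(model_name=model_name, status='Successful')
                    .order_by(DataLoadExecution.completed_on.desc())
                    .first())
        finally:
            session.close()

    def save(self, data_load_tracker):
        session = self.session_maker()
        try:
            session.add(DataLoadExecution(
                model_name=data_load_tracker.model_name,
                model_checksum=data_load_tracker.model_checksum,
                status=data_load_tracker.status,
                next_sync_version=data_load_tracker.change_tracking_info.this_sync_version))
            session.commit()
        finally:
            session.close()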
