Skip to content

Commit 3c5397a

Browse files
authored
Merge pull request #26 from andreasscherman/add-vacuum-metrics
Add mxid and xid metrics
2 parents 6c16a8b + e39c822 commit 3c5397a

File tree

7 files changed

+147
-35
lines changed

7 files changed

+147
-35
lines changed

README.md

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -217,14 +217,21 @@ Called once per your Postgres cluster.
217217
per each slave. If the slave and the master are in synchronous state,
218218
the replication delay is zero.
219219

220-
221-
### Database Local Directory Based (Global) Metrics
222-
223220
* **get_stats_wal_file_amount**:
224221
This graph shows the number of files in your database cluster's WAL log
225222
directory (pg_wal or pg_xlog). If the WAL file amount starts to suddenly
226223
increase, you probably have issues with your WAL archiving process, which
227224
might lead to the disk filling up, and your database cluster crashing.
225+
226+
* **get_xid_remaining_ratio, get_multixact_remaining_ratio, get_multixact_members_remaining_ratio**:
227+
These metrics show the remaining percentage of transaction ids ("xid"), multixact ids ("mxid"),
228+
and multixact members that are available for postgres to use before exhaustion.
229+
Useful for ensuring that the vacuuming is working as intended for your postgres instance.
230+
231+
* **get_multixact_members_per_mxid**:
232+
This metric emits the number of multixact members there are per multixact ID. A larger number means
233+
that it'll be quicker for the multixact members exhaustion to happen (as can
234+
be seen in **get_multixact_members_remaining_ratio**).
228235

229236

230237
## Short Overview of Python Modules

etc/postgresql-metrics/default/postgresql-metrics.yml

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,7 @@ ffwd:
3838
# Each entry must be a tuple with the function name, and a time interval in seconds
3939
# to call that metrics function.
4040
#
41-
# db_functions: Functions taking DB connection and returning a list of metrics,
42-
# called once per each database in cluster.
41+
# db_functions: Functions called once per each database in cluster.
4342
db_functions:
4443
- ["get_stats_disk_usage_for_database", 180]
4544
- ["get_stats_tx_rate_for_database", 60]
@@ -52,15 +51,14 @@ db_functions:
5251
# replication status relies on `pg_stat_wal_receiver`, which is only available on postgres 9.6+
5352
# - ["get_stats_incoming_replication_status", 30]
5453

55-
# global_db_functions: Functions taking DB connection and returning a list of metrics,
56-
# called once per the whole database cluster.
54+
# global_db_functions: Functions called once per the whole database cluster.
5755
global_db_functions:
5856
- ["get_stats_client_connections", 60]
5957
- ["get_stats_lock_statistics", 60]
6058
- ["get_stats_heap_hit_statistics", 60]
6159
- ["get_stats_replication_delays", 60]
62-
63-
# data_dir_functions: Functions taking a file path to Postgres data dir and returning
64-
# a list of metrics, called once per the whole database cluster.
65-
data_dir_functions:
6660
- ["get_stats_wal_file_amount", 180]
61+
- ["get_multixact_members_per_mxid", 60]
62+
- ["get_multixact_members_remaining_ratio", 60]
63+
- ["get_multixact_remaining_ratio", 60]
64+
- ["get_xid_remaining_ratio", 60]

postgresql_metrics/default_metrics.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,29 @@ def metric_sec_since_oldest_xact_start(database_name, value):
116116
'unit': 's'})
117117

118118

119+
def metric_xid_remaining_ratio(value):
    """Build the 'xid-remaining' metric, reported with unit '%'.

    `value` is handed unchanged to `create_default_metric` together with
    the attribute dictionary describing the metric.
    """
    attributes = {'what': 'xid-remaining', 'unit': '%'}
    return create_default_metric(value, attributes)
123+
124+
125+
def metric_multixact_remaining_ratio(value):
    """Build the 'mxid-remaining' metric, reported with unit '%'.

    `value` is handed unchanged to `create_default_metric` together with
    the attribute dictionary describing the metric.
    """
    attributes = {'what': 'mxid-remaining', 'unit': '%'}
    return create_default_metric(value, attributes)
129+
130+
131+
def metric_multixact_members_per_mxid(value):
    """Build the 'multixact-members-per-mxid' metric, unit 'members/id'.

    `value` is handed unchanged to `create_default_metric` together with
    the attribute dictionary describing the metric.
    """
    attributes = {'what': 'multixact-members-per-mxid', 'unit': 'members/id'}
    return create_default_metric(value, attributes)
135+
136+
137+
def metric_multixact_members_remaining_ratio(value):
    """Build the 'multixact-members-remaining' metric, reported with unit '%'.

    `value` is handed unchanged to `create_default_metric` together with
    the attribute dictionary describing the metric.
    """
    attributes = {'what': 'multixact-members-remaining', 'unit': '%'}
    return create_default_metric(value, attributes)
141+
119142
def metric_wal_file_amount(value):
120143
return create_default_metric(value,
121144
{'what': 'wal-file-amount',

postgresql_metrics/localhost_postgres_stats.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,18 @@
2424
LOG = get_logger()
2525

2626

27+
def get_multixact_member_files(data_dir):
    """Count the regular files in `<data_dir>/pg_multixact/members`.

    Returns the number of regular files found directly inside the members
    directory (sub-directories are not counted). Returns 0 when the
    directory is missing or when it cannot be accessed (OSError); both
    situations are logged.
    """
    try:
        members_dir = os.path.join(data_dir, "pg_multixact", "members")
        if not os.path.isdir(members_dir):
            # No exception is active here, so log a plain error rather than
            # LOG.exception (which would attach an empty traceback).
            LOG.error(f"Missing pg_multixact/members directory in data_dir: {data_dir}")
            return 0
        return sum(1 for name in os.listdir(members_dir)
                   if os.path.isfile(os.path.join(members_dir, name)))
    except OSError:
        # f-string so the offending data dir actually shows up in the message.
        LOG.exception(f'Failed accessing multixact member files in: {data_dir}. Is data dir readable by user?')
    return 0
37+
38+
2739
def get_amount_of_wal_files(data_dir):
2840
amount_of_wal_files = 0
2941
try:

postgresql_metrics/metrics_gatherer.py

Lines changed: 61 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,13 @@
4343
metric_replication_delay_bytes,
4444
metric_wal_file_amount,
4545
metric_incoming_replication_running,
46+
metric_multixact_members_per_mxid,
47+
metric_multixact_remaining_ratio,
48+
metric_xid_remaining_ratio,
49+
metric_multixact_members_remaining_ratio,
4650
)
4751

48-
from postgresql_metrics.localhost_postgres_stats import get_amount_of_wal_files
52+
from postgresql_metrics.localhost_postgres_stats import get_amount_of_wal_files, get_multixact_member_files
4953

5054
from postgresql_metrics.postgres_queries import (
5155
get_client_connections_amount,
@@ -60,23 +64,29 @@
6064
get_replication_delays,
6165
get_tables_with_oids_for_current_db,
6266
get_wal_receiver_status,
67+
get_max_mxid_age,
68+
get_max_xid_age,
6369
)
6470

71+
# Multiplier applied to the number of files under pg_multixact/members to
# estimate the total multixact member count.
# NOTE(review): presumably 32 pages per segment file x 1636 members per
# page -- confirm against PostgreSQL's multixact implementation.
MEMBERS_PER_MEMBER_FILE = 52352
# Denominator used for the multixact members remaining ratio.
MAX_MULTIXACT_MEMBERS = 2 ** 32
# Denominator used for the xid / mxid age ratios. True division makes this
# the float 2147483647.0.
WRAPAROUND_LIMIT = 2 ** 32 / 2 - 1
6574

6675
# Notice that all functions here are expected to return a list of metrics.
6776
# Notice also that the names of these functions should match the configuration.
6877

69-
def get_stats_client_connections(db_connection):
78+
79+
def get_stats_client_connections(_data_dir, db_connection):
    """Return a one-element list holding the client connections metric.

    `_data_dir` is accepted but not used by this function.
    """
    return [metric_client_connections(get_client_connections_amount(db_connection))]
7282

7383

74-
def get_stats_disk_usage_for_database(db_connection):
84+
def get_stats_disk_usage_for_database(_data_dir, db_connection):
    """Return a one-element list holding the database size metric.

    The first two items of the `get_disk_usage_for_database` result are
    forwarded, in order, to `metric_database_size`. `_data_dir` is accepted
    but not used by this function.
    """
    usage = get_disk_usage_for_database(db_connection)
    first_item, second_item = usage[0], usage[1]
    return [metric_database_size(first_item, second_item)]
7787

7888

79-
def get_stats_tx_rate_for_database(db_connection):
89+
def get_stats_tx_rate_for_database(_data_dir, db_connection):
8090
db_name, tx_rate, tx_rollbacks = get_transaction_rate_for_database(db_connection)
8191
if tx_rate is not None:
8292
return [metric_transaction_rate(db_name, tx_rate),
@@ -85,15 +95,15 @@ def get_stats_tx_rate_for_database(db_connection):
8595
return []
8696

8797

88-
def get_stats_seconds_since_last_vacuum_per_table(db_connection):
98+
def get_stats_seconds_since_last_vacuum_per_table(_data_dir, db_connection):
    """Return one seconds-since-last-vacuum metric per row from the query.

    Each row is unpacked as (db_name, table_name, seconds_since).
    `_data_dir` is accepted but not used by this function.
    """
    rows = get_seconds_since_last_vacuum_per_table(db_connection)
    return [metric_seconds_since_last_vacuum(db_name, table_name, seconds_since)
            for db_name, table_name, seconds_since in rows]
94104

95105

96-
def get_stats_heap_hit_statistics(db_connection):
106+
def get_stats_heap_hit_statistics(_data_dir, db_connection):
97107
db_name, heap_read, heap_hit, heap_hit_ratio = get_heap_hit_statistics(db_connection)
98108
metrics = []
99109
if heap_hit_ratio is not None:
@@ -103,7 +113,7 @@ def get_stats_heap_hit_statistics(db_connection):
103113
return metrics
104114

105115

106-
def get_stats_lock_statistics(db_connection):
116+
def get_stats_lock_statistics(_data_dir, db_connection):
107117
locks_by_type, [total_locks_waiting, total_locks_granted] = get_lock_statistics(db_connection)
108118
metrics = []
109119
for lock_type, [locks_waiting, locks_granted] in locks_by_type.items():
@@ -114,15 +124,15 @@ def get_stats_lock_statistics(db_connection):
114124
return metrics
115125

116126

117-
def get_stats_oldest_transaction_timestamp(db_connection):
127+
def get_stats_oldest_transaction_timestamp(_data_dir, db_connection):
    """Return the seconds-since-oldest-transaction-start metric, if available.

    An empty list is returned when the query yields no value (None).
    `_data_dir` is accepted but not used by this function.
    """
    db_name, seconds_since_start = get_oldest_transaction_timestamp(db_connection)
    if seconds_since_start is None:
        return []
    return [metric_sec_since_oldest_xact_start(db_name, seconds_since_start)]
123133

124134

125-
def get_stats_table_bloat(db_connection):
135+
def get_stats_table_bloat(_data_dir, db_connection):
126136
tables_with_oids = get_tables_with_oids_for_current_db(db_connection)
127137
metrics = []
128138
for table_oid, table_name in tables_with_oids:
@@ -132,7 +142,7 @@ def get_stats_table_bloat(db_connection):
132142
return metrics
133143

134144

135-
def get_stats_index_hit_rates(db_connection):
145+
def get_stats_index_hit_rates(_data_dir, db_connection):
136146
index_hit_rates = get_index_hit_rates(db_connection)
137147
metrics = []
138148
for db_name, table_name, index_hit_ratio in index_hit_rates:
@@ -141,18 +151,56 @@ def get_stats_index_hit_rates(db_connection):
141151
return metrics
142152

143153

144-
def get_stats_replication_delays(db_connection):
154+
def get_stats_replication_delays(_data_dir, db_connection):
    """Return one replication-delay-in-bytes metric per (client_addr, delay) row.

    `_data_dir` is accepted but not used by this function.
    """
    return [metric_replication_delay_bytes(client_addr, delay_in_bytes)
            for client_addr, delay_in_bytes in get_replication_delays(db_connection)]
150160

151161

152-
def get_stats_wal_file_amount(data_dir):
162+
def _get_multixact_members(data_dir):
    """Estimate the multixact member count from the member files in `data_dir`.

    The number of member files is multiplied by MEMBERS_PER_MEMBER_FILE.
    """
    member_file_count = get_multixact_member_files(data_dir)
    return member_file_count * MEMBERS_PER_MEMBER_FILE
164+
165+
166+
def get_multixact_members_per_mxid(data_dir, db_connection):
    """Return the estimated multixact members per multixact ID as a metric list.

    The estimated member count is divided by the maximum mxid age and
    rounded to two decimals. An empty list is returned when the age is
    unavailable or zero, which also guards the division.
    """
    member_count = _get_multixact_members(data_dir)
    max_mxid_age = get_max_mxid_age(db_connection)
    if max_mxid_age:
        average_members = round(member_count / max_mxid_age, 2)
        return [metric_multixact_members_per_mxid(average_members)]
    return []
173+
174+
175+
def get_multixact_members_remaining_ratio(data_dir, _db_connection):
    """Return the percentage of multixact member space still unused.

    The estimated member count is compared against MAX_MULTIXACT_MEMBERS;
    the used ratio is rounded to two decimals, so the result has
    whole-percent granularity. `_db_connection` is accepted but not used.

    Returns a one-element list with the metric.
    """
    members = _get_multixact_members(data_dir)
    ratio = round(members / MAX_MULTIXACT_MEMBERS, 2)
    # Round again after scaling so binary float noise (for example
    # 92.99999999999999 instead of 93.0) is not emitted as the metric value.
    percentage_remaining = round((1.0 - ratio) * 100, 2)
    return [metric_multixact_members_remaining_ratio(percentage_remaining)]
180+
181+
182+
def get_multixact_remaining_ratio(_data_dir, db_connection):
    """Return the percentage of multixact IDs remaining before WRAPAROUND_LIMIT.

    The maximum mxid age is divided by WRAPAROUND_LIMIT; the used ratio is
    rounded to two decimals, so the result has whole-percent granularity.
    `_data_dir` is accepted but not used.

    Returns a one-element list with the metric, or an empty list when the
    mxid age could not be determined.
    """
    mxid_age = get_max_mxid_age(db_connection)
    # Only a missing value means "no data". An age of 0 is a valid reading
    # and must still be reported (as 100% remaining).
    if mxid_age is None:
        return []
    ratio = round(mxid_age / WRAPAROUND_LIMIT, 2)
    # Round again after scaling so binary float noise (for example
    # 92.99999999999999 instead of 93.0) is not emitted as the metric value.
    percentage_remaining = round((1.0 - ratio) * 100, 2)
    return [metric_multixact_remaining_ratio(percentage_remaining)]
189+
190+
191+
def get_xid_remaining_ratio(_data_dir, db_connection):
    """Return the percentage of transaction IDs remaining before WRAPAROUND_LIMIT.

    The maximum xid age is divided by WRAPAROUND_LIMIT; the used ratio is
    rounded to two decimals, so the result has whole-percent granularity.
    `_data_dir` is accepted but not used.

    Returns a one-element list with the metric, or an empty list when the
    xid age could not be determined.
    """
    xid_age = get_max_xid_age(db_connection)
    # Only a missing value means "no data". An age of 0 is a valid reading
    # and must still be reported (as 100% remaining).
    if xid_age is None:
        return []
    ratio = round(xid_age / WRAPAROUND_LIMIT, 2)
    # Round again after scaling so binary float noise (for example
    # 92.99999999999999 instead of 93.0) is not emitted as the metric value.
    percentage_remaining = round((1.0 - ratio) * 100, 2)
    return [metric_xid_remaining_ratio(percentage_remaining)]
198+
199+
200+
def get_stats_wal_file_amount(data_dir, _db_connection):
    """Return a one-element list holding the WAL file amount metric.

    `_db_connection` is accepted but not used by this function.
    """
    wal_file_count = get_amount_of_wal_files(data_dir)
    return [metric_wal_file_amount(wal_file_count)]
154202

155203

156-
def get_stats_incoming_replication_status(db_connection):
204+
def get_stats_incoming_replication_status(_data_dir, db_connection):
    """Return one incoming-replication-running metric per WAL receiver row.

    Each row is unpacked as (host, is_streaming). `_data_dir` is accepted
    but not used by this function.
    """
    metrics = []
    for host, is_streaming in get_wal_receiver_status(db_connection):
        metrics.append(metric_incoming_replication_running(host, is_streaming))
    return metrics

postgresql_metrics/metrics_logic.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def _is_time_to_call_stats_func_and_update_ts(database_name, metrics_func, run_i
8282
return False
8383

8484

85-
def _call_all_db_functions(db_parameter, db_stats_functions, schedule=False, db_name=None):
85+
def _call_all_db_functions(db_stats_functions, db_parameters, schedule=False, db_name=None):
8686
"""Iterates through all given statistics functions, calling them with the given parameter.
8787
The db_parameter can be a database connection or a file path to Postgres data directory,
8888
depending on the statistics function to call.
@@ -100,7 +100,7 @@ def _call_all_db_functions(db_parameter, db_stats_functions, schedule=False, db_
100100
if is_call_required:
101101
try:
102102
LOG.debug('calling stats function {}', db_metrics_func.__name__)
103-
metrics.extend(db_metrics_func(db_parameter))
103+
metrics.extend(db_metrics_func(*db_parameters))
104104
except Exception:
105105
LOG.exception('failed calling stats function: ' + db_metrics_func.__name__)
106106
return metrics
@@ -123,22 +123,25 @@ def get_stats_functions_from_conf(func_key_name, conf):
123123
def get_all_stats_functions_from_conf(conf):
124124
db_functions = get_stats_functions_from_conf('db_functions', conf)
125125
global_db_functions = get_stats_functions_from_conf('global_db_functions', conf)
126+
# `data_dir_functions` is deprecated, but to preserve backwards compatibility still read
126127
data_dir_functions = get_stats_functions_from_conf('data_dir_functions', conf)
127-
return db_functions, global_db_functions, data_dir_functions
128+
if data_dir_functions:
129+
LOG.warn("data_dir_functions field in config is deprecated -- consider moving functions to global_db_functions")
130+
all_global_db_functions = data_dir_functions + global_db_functions
131+
return db_functions, all_global_db_functions
128132

129133

130134
def get_all_metrics_now(db_connections, conf):
131135
"""Get all the metrics immediately without any scheduling.
132136
First gets the global stats with first available database connection,
133137
and then gets the rest per database.
134138
"""
135-
db_functions, global_db_functions, data_dir_functions = get_all_stats_functions_from_conf(conf)
139+
db_functions, global_db_functions = get_all_stats_functions_from_conf(conf)
136140
data_dir = figure_out_postgres_data_dir(db_connections[0], conf)
137141

138-
all_metrics = _call_all_db_functions(db_connections[0], global_db_functions)
139-
all_metrics.extend(_call_all_db_functions(data_dir, data_dir_functions))
142+
all_metrics = _call_all_db_functions(global_db_functions, (data_dir, db_connections[0]))
140143
for db_connection in db_connections:
141-
all_metrics.extend(_call_all_db_functions(db_connection, db_functions))
144+
all_metrics.extend(_call_all_db_functions(db_functions, (data_dir, db_connection)))
142145
return all_metrics
143146

144147

@@ -147,14 +150,13 @@ def get_all_metrics_scheduled(db_connections, conf):
147150
First gets the global stats with first available database connection,
148151
and then gets the rest per database.
149152
"""
150-
db_functions, global_db_functions, data_dir_functions = get_all_stats_functions_from_conf(conf)
153+
db_functions, global_db_functions = get_all_stats_functions_from_conf(conf)
151154
data_dir = figure_out_postgres_data_dir(db_connections[0], conf)
152155

153-
all_metrics = _call_all_db_functions(db_connections[0], global_db_functions, schedule=True)
154-
all_metrics.extend(_call_all_db_functions(data_dir, data_dir_functions, schedule=True))
156+
all_metrics = _call_all_db_functions(global_db_functions, (data_dir, db_connections[0]), schedule=True)
155157
for db_connection in db_connections:
156158
db_name = get_db_name_from_connection(db_connection)
157-
all_metrics.extend(_call_all_db_functions(db_connection, db_functions,
159+
all_metrics.extend(_call_all_db_functions(db_functions, (data_dir, db_connection),
158160
schedule=True, db_name=db_name))
159161
return all_metrics
160162

postgresql_metrics/postgres_queries.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,28 @@ def get_oldest_transaction_timestamp(conn):
195195
return None, None
196196

197197

198+
def get_max_mxid_age(conn):
    """Return the largest `mxid_age(relminmxid)` found in pg_class.

    Returns the age as an int, or None when the server is older than 9.5,
    when the query returns no rows, or when the aggregate is NULL.
    """
    # `mxid_age` is only available on postgres 9.5 and newer.
    # The integer server version uses two digits per component, so 9.5.0 is
    # 90500 (not 95000, which would wrongly reject 9.5 and 9.6 servers).
    # NOTE(review): assumes `conn` is a psycopg2-style connection -- confirm.
    if conn.server_version < 90500:
        LOG.error("Unable to check mxid_age on versions of postgres below 9.5")
        return None
    sql = "SELECT max(mxid_age(relminmxid)) FROM pg_class WHERE relminmxid <> '0'"
    results = query(conn, sql)
    if not results:
        return None
    mxid_age, = results[0]
    # max() yields NULL when no row matches the WHERE clause; int(None)
    # would raise a TypeError.
    if mxid_age is None:
        return None
    return int(mxid_age)
209+
210+
211+
def get_max_xid_age(conn):
    """Return the largest `age(datfrozenxid)` over pg_database as an int.

    Returns None when the query produces no rows.
    """
    rows = query(conn, "SELECT max(age(datfrozenxid)) FROM pg_database")
    if rows:
        (oldest_xid_age,) = rows[0]
        return int(oldest_xid_age)
    return None
218+
219+
198220
def get_replication_delays(conn):
199221
sql = ("SELECT client_addr, "
200222
"pg_xlog_location_diff(pg_current_xlog_location(), replay_location) AS bytes_diff "

0 commit comments

Comments
 (0)