Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,9 @@ public final class HddsConfigKeys {
public static final String HDDS_METRICS_PERCENTILES_INTERVALS_KEY =
"hdds.metrics.percentiles.intervals";

/** Configuration key used to enable or disable the datanode disk IO checks. */
public static final String HDDS_DATANODE_DISK_CHECK_IO_TEST_ENABLED_KEY =
"hdds.datanode.disk.check.io.test.enabled";

/** Private constructor: this class of configuration key constants must not be instantiated. */
private HddsConfigKeys() {
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,8 @@ private static void addDeprecatedKeys() {
HddsConfigKeys.HDDS_METRICS_PERCENTILES_INTERVALS_KEY),
new DeprecationDelta("hdds.recon.heartbeat.interval",
HddsConfigKeys.HDDS_RECON_HEARTBEAT_INTERVAL),
new DeprecationDelta("hdds.datanode.disk.check.io.test.count",
HddsConfigKeys.HDDS_DATANODE_DISK_CHECK_IO_TEST_ENABLED_KEY)
});
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,12 +149,17 @@ private long getCurrentTime() {
return clock.millis();
}

/**
 * Returns the expiry duration held by this instance.
 *
 * @return the value of {@code expiryDurationMillis}; the field name indicates milliseconds
 */
public long getExpiryDurationMillis() {
  return this.expiryDurationMillis;
}

/**
* A custom monotonic clock implementation.
* A custom monotonic clock implementation to allow overriding the current time for testing purposes.
* Implementation of Clock that uses System.nanoTime() for real usage.
* See {@see org.apache.ozone.test.TestClock}
* The class {@code org.apache.ozone.test.TestClock} provides a mock clock which can be used
* to manipulate the current time in tests.
*/
private static final class MonotonicClock extends Clock {
public static final class MonotonicClock extends Clock {
@Override
public long millis() {
return TimeUnit.NANOSECONDS.toMillis(System.nanoTime());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import static org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration.CONFIG_PREFIX;

import java.time.Duration;
import org.apache.commons.lang3.time.DurationFormatUtils;
import org.apache.hadoop.hdds.conf.Config;
import org.apache.hadoop.hdds.conf.ConfigGroup;
import org.apache.hadoop.hdds.conf.ConfigTag;
Expand Down Expand Up @@ -61,6 +62,7 @@ public class DatanodeConfiguration extends ReconfigurableConfig {
public static final String FAILED_DB_VOLUMES_TOLERATED_KEY = "hdds.datanode.failed.db.volumes.tolerated";
public static final String DISK_CHECK_MIN_GAP_KEY = "hdds.datanode.disk.check.min.gap";
public static final String DISK_CHECK_TIMEOUT_KEY = "hdds.datanode.disk.check.timeout";
public static final String DISK_CHECK_SLIDING_WINDOW_TIMEOUT_KEY = "hdds.datanode.disk.check.sliding.window.timeout";

// Minimum space should be left on volume.
// Ex: If volume has 1000GB and minFreeSpace is configured as 10GB,
Expand Down Expand Up @@ -99,6 +101,9 @@ public class DatanodeConfiguration extends ReconfigurableConfig {

static final Duration DISK_CHECK_TIMEOUT_DEFAULT = Duration.ofMinutes(10);

// Default expiry for disk check failure results kept in the sliding window:
// the default periodic disk check interval plus the default disk check timeout.
static final Duration DISK_CHECK_SLIDING_WINDOW_TIMEOUT_DEFAULT =
Duration.ofMinutes(PERIODIC_DISK_CHECK_INTERVAL_MINUTES_DEFAULT).plus(DISK_CHECK_TIMEOUT_DEFAULT);

static final boolean CONTAINER_SCHEMA_V3_ENABLED_DEFAULT = true;
static final long ROCKSDB_LOG_MAX_FILE_SIZE_BYTES_DEFAULT = 32 * 1024 * 1024;
static final int ROCKSDB_LOG_MAX_FILE_NUM_DEFAULT = 64;
Expand Down Expand Up @@ -350,7 +355,7 @@ public class DatanodeConfiguration extends ReconfigurableConfig {
@Config(key = "hdds.datanode.disk.check.io.test.count",
defaultValue = "3",
type = ConfigType.INT,
tags = { DATANODE },
tags = {DATANODE},
description = "The number of IO tests required to determine if a disk " +
" has failed. Each disk check does one IO test. The volume will be " +
"failed if more than " +
Expand All @@ -360,6 +365,14 @@ public class DatanodeConfiguration extends ReconfigurableConfig {
)
private int volumeIOTestCount = DISK_CHECK_IO_TEST_COUNT_DEFAULT;

// Toggle for the disk IO health tests. Enabled by default; when it is false,
// validate() logs that the disk IO health tests have been disabled.
@Config(key = "hdds.datanode.disk.check.io.test.enabled",
defaultValue = "true",
type = ConfigType.BOOLEAN,
tags = { DATANODE },
description = "The configuration to enable or disable disk IO checks."
)
private boolean isDiskCheckEnabled = true;

@Config(key = "hdds.datanode.disk.check.io.failures.tolerated",
defaultValue = "1",
type = ConfigType.INT,
Expand Down Expand Up @@ -404,6 +417,19 @@ public class DatanodeConfiguration extends ReconfigurableConfig {
)
private Duration diskCheckTimeout = DISK_CHECK_TIMEOUT_DEFAULT;

@Config(key = "hdds.datanode.disk.check.sliding.window.timeout",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With this sliding window introduced, the function of the "hdds.datanode.disk.check.io.test.count" property is half removed. We should consider deprecating "hdds.datanode.disk.check.io.test.count" and introducing a new boolean property with a name like "hdds.datanode.disk.check.io.test.enabled".

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To avoid breaking existing users, it's recommended to add a new property "hdds.datanode.disk.check.io.test.enabled" instead of changing the current property "hdds.datanode.disk.check.io.test.count" to "hdds.datanode.disk.check.io.test.enabled".

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I reverted the change which removed the config and updated the deprecated config list.

defaultValue = "70m",
type = ConfigType.TIME,
tags = {ConfigTag.DATANODE},
description = "Time interval after which a disk check"
+ " failure result stored in the sliding window will expire."
+ " Do not set the window timeout period to less than or equal to the disk check interval period"
+ " or failures can be missed across sparse checks"
+ " e.g., every 120m interval with a 60m window rarely accumulates enough failed events"
+ " Unit could be defined with postfix (ns,ms,s,m,h,d)."
)
private Duration diskCheckSlidingWindowTimeout = DISK_CHECK_SLIDING_WINDOW_TIMEOUT_DEFAULT;

@Config(key = "hdds.datanode.chunk.data.validation.check",
defaultValue = "false",
type = ConfigType.BOOLEAN,
Expand Down Expand Up @@ -632,45 +658,21 @@ public void validate() {
failedDbVolumesTolerated = FAILED_VOLUMES_TOLERATED_DEFAULT;
}

if (volumeIOTestCount == 0) {
LOG.info("{} set to {}. Disk IO health tests have been disabled.",
DISK_CHECK_IO_TEST_COUNT_KEY, volumeIOTestCount);
if (!isDiskCheckEnabled) {
LOG.info("Disk IO health tests have been disabled.");
} else {
if (volumeIOTestCount < 0) {
LOG.warn("{} must be greater than 0 but was set to {}." +
"Defaulting to {}",
DISK_CHECK_IO_TEST_COUNT_KEY, volumeIOTestCount,
DISK_CHECK_IO_TEST_COUNT_DEFAULT);
volumeIOTestCount = DISK_CHECK_IO_TEST_COUNT_DEFAULT;
}

if (volumeIOFailureTolerance < 0) {
LOG.warn("{} must be greater than or equal to 0 but was set to {}. " +
"Defaulting to {}",
LOG.warn("{} must be greater than or equal to 0 but was set to {}. Defaulting to {}",
DISK_CHECK_IO_FAILURES_TOLERATED_KEY, volumeIOFailureTolerance,
DISK_CHECK_IO_FAILURES_TOLERATED_DEFAULT);
volumeIOFailureTolerance = DISK_CHECK_IO_FAILURES_TOLERATED_DEFAULT;
}

if (volumeIOFailureTolerance >= volumeIOTestCount) {
LOG.warn("{} was set to {} but cannot be greater or equals to {} " +
"set to {}. Defaulting {} to {} and {} to {}",
DISK_CHECK_IO_FAILURES_TOLERATED_KEY, volumeIOFailureTolerance,
DISK_CHECK_IO_TEST_COUNT_KEY, volumeIOTestCount,
DISK_CHECK_IO_FAILURES_TOLERATED_KEY,
DISK_CHECK_IO_FAILURES_TOLERATED_DEFAULT,
DISK_CHECK_IO_TEST_COUNT_KEY, DISK_CHECK_IO_TEST_COUNT_DEFAULT);
volumeIOTestCount = DISK_CHECK_IO_TEST_COUNT_DEFAULT;
volumeIOFailureTolerance = DISK_CHECK_IO_FAILURES_TOLERATED_DEFAULT;
}

if (volumeHealthCheckFileSize < 1) {
LOG.warn(DISK_CHECK_FILE_SIZE_KEY +
"must be at least 1 byte and was set to {}. Defaulting to {}",
volumeHealthCheckFileSize,
LOG.warn("{} must be at least 1 byte and was set to {}. Defaulting to {}",
DISK_CHECK_FILE_SIZE_KEY, volumeHealthCheckFileSize,
DISK_CHECK_FILE_SIZE_DEFAULT);
volumeHealthCheckFileSize =
DISK_CHECK_FILE_SIZE_DEFAULT;
volumeHealthCheckFileSize = DISK_CHECK_FILE_SIZE_DEFAULT;
}
}

Expand All @@ -688,6 +690,25 @@ public void validate() {
diskCheckTimeout = DISK_CHECK_TIMEOUT_DEFAULT;
}

if (diskCheckSlidingWindowTimeout.isNegative()) {
Duration defaultTimeout = Duration.ofMinutes(periodicDiskCheckIntervalMinutes).plus(diskCheckTimeout);
LOG.warn("{} must be greater than zero and was set to {}. Defaulting to {}",
DISK_CHECK_SLIDING_WINDOW_TIMEOUT_KEY, diskCheckSlidingWindowTimeout,
DISK_CHECK_SLIDING_WINDOW_TIMEOUT_DEFAULT);
diskCheckSlidingWindowTimeout = defaultTimeout;
}

// Do not set window timeout <= periodic disk check interval period, or failures can be missed across sparse checks
// e.g., every 120m interval with a 60m window rarely accumulates enough failed events
if (diskCheckSlidingWindowTimeout.compareTo(Duration.ofMinutes(periodicDiskCheckIntervalMinutes)) < 0) {
Duration defaultTimeout = Duration.ofMinutes(periodicDiskCheckIntervalMinutes).plus(diskCheckTimeout);
LOG.warn("{} must be greater than or equal to {} minutes and was set to {} minutes. Defaulting to {}",
DISK_CHECK_SLIDING_WINDOW_TIMEOUT_KEY, periodicDiskCheckIntervalMinutes,
diskCheckSlidingWindowTimeout.toMinutes(),
DurationFormatUtils.formatDurationHMS(defaultTimeout.toMillis()));
diskCheckSlidingWindowTimeout = defaultTimeout;
}

if (blockDeleteCommandWorkerInterval.isNegative()) {
LOG.warn(BLOCK_DELETE_COMMAND_WORKER_INTERVAL +
" must be greater than zero and was set to {}. Defaulting to {}",
Expand Down Expand Up @@ -907,6 +928,22 @@ public void setDiskCheckTimeout(Duration duration) {
diskCheckTimeout = duration;
}

/**
 * Sets the flag that enables or disables the disk IO checks.
 *
 * @param diskCheckEnabled the new value of the flag
 */
public void setDiskCheckEnabled(boolean diskCheckEnabled) {
  this.isDiskCheckEnabled = diskCheckEnabled;
}

/**
 * Reports the current value of the flag that enables or disables the disk IO checks.
 *
 * @return the value of the {@code isDiskCheckEnabled} field
 */
public boolean isDiskCheckEnabled() {
  return this.isDiskCheckEnabled;
}

/**
 * Returns the sliding window timeout used for disk checks.
 *
 * @return the current value of {@code diskCheckSlidingWindowTimeout}
 */
public Duration getDiskCheckSlidingWindowTimeout() {
  return this.diskCheckSlidingWindowTimeout;
}

/**
 * Replaces the sliding window timeout used for disk checks.
 * The value is stored as given; this setter performs no validation.
 *
 * @param duration the new sliding window timeout
 */
public void setDiskCheckSlidingWindowTimeout(Duration duration) {
  this.diskCheckSlidingWindowTimeout = duration;
}

/**
 * Returns the block delete thread setting held by this configuration.
 *
 * @return the current value of {@code blockDeleteThreads}
 */
public int getBlockDeleteThreads() {
  return this.blockDeleteThreads;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,10 @@
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function;
import org.apache.commons.io.FileUtils;
Expand Down Expand Up @@ -110,11 +107,6 @@ public class HddsVolume extends StorageVolume {
private final AtomicBoolean dbLoaded = new AtomicBoolean(false);
private final AtomicBoolean dbLoadFailure = new AtomicBoolean(false);

private final int volumeTestCount;
private final int volumeTestFailureTolerance;
private AtomicInteger volumeTestFailureCount;
private Queue<Boolean> volumeTestResultQueue;

/**
* Builder for HddsVolume.
*/
Expand Down Expand Up @@ -147,20 +139,13 @@ private HddsVolume(Builder b) throws IOException {
this.volumeInfoMetrics =
new VolumeInfoMetrics(b.getVolumeRootStr(), this);

this.volumeTestCount = getDatanodeConfig().getVolumeIOTestCount();
this.volumeTestFailureTolerance = getDatanodeConfig().getVolumeIOFailureTolerance();
this.volumeTestFailureCount = new AtomicInteger(0);
this.volumeTestResultQueue = new LinkedList<>();

initialize();
} else {
// Builder is called with failedVolume set, so create a failed volume
// HddsVolume Object.
this.setState(VolumeState.FAILED);
volumeIOStats = null;
volumeInfoMetrics = new VolumeInfoMetrics(b.getVolumeRootStr(), this);
this.volumeTestCount = 0;
this.volumeTestFailureTolerance = 0;
}

LOG.info("HddsVolume: {}", getReport());
Expand Down Expand Up @@ -322,38 +307,32 @@ public synchronized VolumeCheckResult check(@Nullable Boolean unused)

@VisibleForTesting
public VolumeCheckResult checkDbHealth(File dbFile) throws InterruptedException {
if (volumeTestCount == 0) {
if (!getDiskCheckEnabled()) {
return VolumeCheckResult.HEALTHY;
}

final boolean isVolumeTestResultHealthy = true;
try (ManagedOptions managedOptions = new ManagedOptions();
ManagedRocksDB ignored = ManagedRocksDB.openReadOnly(managedOptions, dbFile.toString())) {
volumeTestResultQueue.add(isVolumeTestResultHealthy);
// Do nothing. Only check if rocksdb is accessible.
LOG.debug("Successfully opened the database at \"{}\" for HDDS volume {}.", dbFile, getStorageDir());
} catch (Exception e) {
if (Thread.currentThread().isInterrupted()) {
throw new InterruptedException("Check of database for volume " + this + " interrupted.");
}
LOG.warn("Could not open Volume DB located at {}", dbFile, e);
volumeTestResultQueue.add(!isVolumeTestResultHealthy);
volumeTestFailureCount.incrementAndGet();
}

if (volumeTestResultQueue.size() > volumeTestCount
&& (Boolean.TRUE.equals(volumeTestResultQueue.poll()) != isVolumeTestResultHealthy)) {
volumeTestFailureCount.decrementAndGet();
getIoTestSlidingWindow().add();
}

if (volumeTestFailureCount.get() > volumeTestFailureTolerance) {
if (getIoTestSlidingWindow().isExceeded()) {
LOG.error("Failed to open the database at \"{}\" for HDDS volume {}: " +
"the last {} runs encountered {} out of {} tolerated failures.",
dbFile, this, volumeTestResultQueue.size(), volumeTestFailureCount.get(), volumeTestFailureTolerance);
"encountered more than the {} tolerated failures.",
dbFile, this, getIoTestSlidingWindow().getWindowSize());
return VolumeCheckResult.FAILED;
}

LOG.debug("Successfully opened the database at \"{}\" for HDDS volume {}: " +
"the last {} runs encountered {} out of {} tolerated failures",
dbFile, this, volumeTestResultQueue.size(), volumeTestFailureTolerance, volumeTestFailureTolerance);
"encountered {} out of {} tolerated failures",
dbFile, this, getIoTestSlidingWindow().getNumEventsInWindow(), getIoTestSlidingWindow().getWindowSize());
return VolumeCheckResult.HEALTHY;
}

Expand Down
Loading