diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/ContainerChoosingPolicyFactory.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/ContainerChoosingPolicyFactory.java index 652e0c29781b..33814be0499d 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/ContainerChoosingPolicyFactory.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/ContainerChoosingPolicyFactory.java @@ -24,7 +24,6 @@ import org.apache.hadoop.ozone.container.common.volume.VolumeChoosingPolicyFactory; import org.apache.hadoop.ozone.container.diskbalancer.policy.ContainerChoosingPolicy; import org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultContainerChoosingPolicy; -import org.apache.ratis.util.ReflectionUtils; /** * A factory to create {@link ContainerChoosingPolicy} instances for the DiskBalancer. @@ -49,7 +48,14 @@ public static ContainerChoosingPolicy getDiskBalancerPolicy(ConfigurationSource Class policyClass = conf.getClass( HDDS_DATANODE_DISKBALANCER_CONTAINER_CHOOSING_POLICY, DEFAULT_CONTAINER_CHOOSING_POLICY, ContainerChoosingPolicy.class); - return ReflectionUtils.newInstance(policyClass, new Class[]{ReentrantLock.class}, - VolumeChoosingPolicyFactory.getVolumeSpaceReservationLock()); + ReentrantLock lock = VolumeChoosingPolicyFactory.getVolumeSpaceReservationLock(); + try { + return policyClass.getConstructor(ReentrantLock.class, ConfigurationSource.class) + .newInstance(lock, conf); + } catch (ReflectiveOperationException e) { + throw new IllegalStateException( + "Disk balancer container choosing policy must implement " + + "(ReentrantLock, ConfigurationSource): " + policyClass.getName(), e); + } } } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerConfiguration.java 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerConfiguration.java index 9f35a1f3edfe..7e21644f6761 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerConfiguration.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerConfiguration.java @@ -121,6 +121,15 @@ public final class DiskBalancerConfiguration { "Unit could be defined with postfix (ns,ms,s,m,h,d).") private long replicaDeletionDelay = Duration.ofMinutes(5).toMillis(); + @Config(key = "hdds.datanode.disk.balancer.include.non.standard.containers", + defaultValue = "false", + type = ConfigType.BOOLEAN, + tags = { DATANODE, ConfigTag.DISKBALANCER }, + description = "If true, the balancer includes non-standard states, i.e., QUASI_CLOSED." + + " So both CLOSED and QUASI_CLOSED state containers are eligible for move. " + + "If false (default), the balancer only moves CLOSED containers.") + private boolean includeNonStandardContainers = false; + public DiskBalancerConfiguration(Double threshold, Long bandwidthInMB, Integer parallelThread, @@ -262,6 +271,14 @@ public void setParallelThread(int parallelThread) { this.parallelThread = parallelThread; } + public boolean getIncludeNonStandardContainers() { + return includeNonStandardContainers; + } + + public void setIncludeNonStandardContainers(boolean includeNonStandardContainers) { + this.includeNonStandardContainers = includeNonStandardContainers; + } + @Override public String toString() { return String.format("Disk Balancer Configuration values:%n" + diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerService.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerService.java index b52a08e26412..1e8df67c1d08 --- 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerService.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerService.java @@ -70,7 +70,7 @@ import org.apache.hadoop.ozone.container.diskbalancer.DiskBalancerVolumeCalculation.VolumeFixedUsage; import org.apache.hadoop.ozone.container.diskbalancer.policy.ContainerCandidate; import org.apache.hadoop.ozone.container.diskbalancer.policy.ContainerChoosingPolicy; -import org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultContainerChoosingPolicy; +import org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultContainerChoosingPolicy.MovableContainerStates; import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData; import org.apache.hadoop.ozone.container.keyvalue.helpers.KeyValueContainerLocationUtil; import org.apache.hadoop.ozone.container.keyvalue.helpers.KeyValueContainerUtil; @@ -130,6 +130,8 @@ public class DiskBalancerService extends BackgroundService { private DiskBalancerServiceMetrics metrics; + private final MovableContainerStates movableContainerStates; + public DiskBalancerService(OzoneContainer ozoneContainer, long serviceCheckInterval, long serviceCheckTimeout, TimeUnit timeUnit, int workerSize, ConfigurationSource conf) throws IOException { @@ -153,8 +155,9 @@ public DiskBalancerService(OzoneContainer ozoneContainer, throw new IOException(e); } - replicaDeletionDelay = conf.getObject(DiskBalancerConfiguration.class) - .getReplicaDeletionDelay(); + DiskBalancerConfiguration diskBalancerConfiguration = conf.getObject(DiskBalancerConfiguration.class); + replicaDeletionDelay = diskBalancerConfiguration.getReplicaDeletionDelay(); + movableContainerStates = MovableContainerStates.fromConfig(conf); metrics = DiskBalancerServiceMetrics.create(); loadDiskBalancerInfo(); @@ -485,14 +488,10 @@ public BackgroundTaskResult call() { } // Double check container state before 
acquiring lock to start move process. - // Container state may have changed after selection. Only CLOSED containers can be moved. - // QUASI_CLOSED is allowed when test mode is enabled, this is done to test in production - // these containers are rejected. + // Container state may have changed after selection. State containerState = container.getContainerData().getState(); - boolean isTestMode = DefaultContainerChoosingPolicy.isTest(); - if (containerState != State.CLOSED && !(isTestMode && containerState == State.QUASI_CLOSED)) { - LOG.warn("Container {} is in {} state, skipping move process. Only CLOSED containers can be moved.", - containerId, containerState); + if (!movableContainerStates.allows(containerState)) { + LOG.warn("Container {} is in {} state, skipping move process.", containerId, containerState); postCall(false, startTime); return BackgroundTaskResult.EmptyTaskResult.newResult(); } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/policy/DefaultContainerChoosingPolicy.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/policy/DefaultContainerChoosingPolicy.java index 5ead73db9681..0bbdb4430fcd 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/policy/DefaultContainerChoosingPolicy.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/policy/DefaultContainerChoosingPolicy.java @@ -22,7 +22,6 @@ import static org.apache.hadoop.ozone.container.diskbalancer.DiskBalancerVolumeCalculation.getIdealUsage; import static org.apache.hadoop.ozone.container.diskbalancer.DiskBalancerVolumeCalculation.newVolumeFixedUsage; -import com.google.common.annotations.VisibleForTesting; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import java.util.Comparator; @@ -33,13 +32,16 @@ import java.util.concurrent.ExecutionException; import 
java.util.concurrent.locks.ReentrantLock; import java.util.stream.Collectors; +import org.apache.hadoop.hdds.conf.ConfigurationSource; import org.apache.hadoop.hdds.fs.SpaceUsageSource; +import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State; import org.apache.hadoop.hdds.scm.container.ContainerID; import org.apache.hadoop.ozone.container.common.impl.ContainerData; import org.apache.hadoop.ozone.container.common.interfaces.Container; import org.apache.hadoop.ozone.container.common.volume.HddsVolume; import org.apache.hadoop.ozone.container.common.volume.MutableVolumeSet; import org.apache.hadoop.ozone.container.common.volume.StorageVolume; +import org.apache.hadoop.ozone.container.diskbalancer.DiskBalancerConfiguration; import org.apache.hadoop.ozone.container.diskbalancer.DiskBalancerVolumeCalculation.VolumeFixedUsage; import org.apache.hadoop.ozone.container.ozoneimpl.OzoneContainer; import org.slf4j.Logger; @@ -50,6 +52,8 @@ * then chooses a container from the source volume that can be moved to the destination without * exceeding the upper threshold. Space is reserved on the destination only when a container is * chosen, using the actual container size. + * + * Which replica states may move is defined by {@link MovableContainerStates}. 
*/ public class DefaultContainerChoosingPolicy implements ContainerChoosingPolicy { public static final Logger LOG = LoggerFactory.getLogger( @@ -59,13 +63,12 @@ public class DefaultContainerChoosingPolicy implements ContainerChoosingPolicy { ThreadLocal.withInitial( () -> CacheBuilder.newBuilder().recordStats().expireAfterAccess(1, HOURS).build()); - // for test - private static boolean test = false; - private final ReentrantLock lock; + private final MovableContainerStates movableContainerStates; - public DefaultContainerChoosingPolicy(ReentrantLock globalLock) { + public DefaultContainerChoosingPolicy(ReentrantLock globalLock, ConfigurationSource conf) { this.lock = globalLock; + this.movableContainerStates = MovableContainerStates.fromConfig(conf); } @Override @@ -128,15 +131,15 @@ public ContainerCandidate chooseVolumesAndContainer(OzoneContainer ozoneContaine dst.incCommittedBytes(containerSize); LOG.debug("Chosen volume pair for disk balancing: source={} (utilization={}), " + "destination={} (utilization={})", - src.getStorageID(), srcUsage.getUtilization(), - dst.getStorageID(), dstUsage.getUtilization()); + src.getStorageDir().getPath(), srcUsage.getUtilization(), + dst.getStorageDir().getPath(), dstUsage.getUtilization()); return new ContainerCandidate(containerData, src, dst); } - LOG.debug("No suitable container found for destination {}, trying next volume.", - dst.getStorageID()); + LOG.debug("No container to move for destination {}, trying next volume.", + dst.getStorageDir().getPath()); } else { LOG.debug("Destination volume {} does not have enough space, trying next volume.", - dst.getStorageID()); + dst.getStorageDir().getPath()); } } LOG.debug("Failed to find appropriate destination volume and container."); @@ -147,7 +150,7 @@ public ContainerCandidate chooseVolumesAndContainer(OzoneContainer ozoneContaine } /** - * Choose a container from source volume that can be moved to destination volume. 
+ * Finds a container on {@code src} that can move to {@code dst}. */ private ContainerData chooseContainer(OzoneContainer ozoneContainer, HddsVolume src, HddsVolume dst, VolumeFixedUsage dstUsage, @@ -166,23 +169,63 @@ private ContainerData chooseContainer(OzoneContainer ozoneContainer, while (itr.hasNext()) { ContainerData containerData = itr.next().getContainerData(); + long containerId = containerData.getContainerID(); - // Skip containers removed from containerSet after iterator was cached - if (ozoneContainer.getContainerSet().getContainer(containerData.getContainerID()) == null) { + if (ozoneContainer.getContainerSet().getContainer(containerId) == null) { + if (LOG.isDebugEnabled()) { + LOG.debug("Skipping container {} from volume {}: not in container set " + + "(removed after iterator was cached)", + containerId, src.getStorageDir().getPath()); + } + continue; + } + + if (inProgressContainerIDs.contains(ContainerID.valueOf(containerId))) { + if (LOG.isDebugEnabled()) { + LOG.debug("Skipping container {} from volume {}: disk balancer move already in progress", + containerId, src.getStorageDir().getPath()); + } continue; } - if (containerData.getBytesUsed() > 0 && - !inProgressContainerIDs.contains(ContainerID.valueOf(containerData.getContainerID())) && - (containerData.isClosed() || (test && containerData.isQuasiClosed()))) { + long containerSize = containerData.getBytesUsed(); + if (containerSize <= 0) { + if (LOG.isDebugEnabled()) { + LOG.debug("Skipping container {} from volume {}: bytes used is {}", + containerId, src.getStorageDir().getPath(), containerData.getBytesUsed()); + } + continue; + } - long containerSize = containerData.getBytesUsed(); - // Check if dst has enough space and can accept the container without exceeding threshold - if (containerSize < usableSpace && - computeUtilization(dstSpaceUsage, dstCommittedBytes, containerSize) < upperThreshold) { - return containerData; + if (!movableContainerStates.allows(containerData.getState())) { + if 
(LOG.isDebugEnabled()) { + LOG.debug("Skipping container {} from volume {}: state is {}. Policy: {}", + containerId, src.getStorageDir().getPath(), containerData.getState(), movableContainerStates); } + continue; } + + if (containerSize >= usableSpace) { + if (LOG.isDebugEnabled()) { + LOG.debug("Skipping container {} ({}B) from volume {}: exceeds destination {} " + + "usable space {}B", containerId, containerSize, src.getStorageDir().getPath(), + dst.getStorageDir().getPath(), usableSpace); + } + continue; + } + + double newUtilization = computeUtilization(dstSpaceUsage, dstCommittedBytes, containerSize); + if (newUtilization >= upperThreshold) { + if (LOG.isDebugEnabled()) { + LOG.debug("Skipping container {} ({}B) from volume {}: moving to {} would " + + "result in utilization {} exceeding upper threshold {}", + containerId, containerSize, src.getStorageDir().getPath(), dst.getStorageDir().getPath(), + newUtilization, upperThreshold); + } + continue; + } + + return containerData; } CACHE.get().invalidate(src); @@ -206,19 +249,37 @@ private void logVolumeBalancingState(List volumeUsages, long usableSpace = vfu.computeUsableSpace(); LOG.debug("Volume[{}] - disk={}, utilization={}, capacity={}, " + "effectiveUsed={}, available={}, usableSpace={}, committedBytes={}, delta={}", - i, vol.getStorageID(), String.format("%.10f", vfu.getUtilization()), + i, vol.getStorageDir().getPath(), String.format("%.10f", vfu.getUtilization()), usage.getCapacity(), vfu.getEffectiveUsed(), usage.getAvailable(), usableSpace, vol.getCommittedBytes(), deltaMap.getOrDefault(vol, 0L)); } } - @VisibleForTesting - public static void setTest(boolean isTest) { - test = isTest; - } + /** + * This shows which container {@link State}s can be moved. + * {@link #CLOSED_ONLY} + * {@link #NON_STANDARD_INCLUDED}: CLOSED as well as QUASI_CLOSED (add more container states in + * {@link #allows} when needed). 
+ */ + public enum MovableContainerStates { + CLOSED_ONLY, + NON_STANDARD_INCLUDED; + + public static MovableContainerStates fromConfig(ConfigurationSource conf) { + boolean includeNonStandard = + conf.getObject(DiskBalancerConfiguration.class).getIncludeNonStandardContainers(); + return includeNonStandard ? NON_STANDARD_INCLUDED : CLOSED_ONLY; + } - @VisibleForTesting - public static boolean isTest() { - return test; + public boolean allows(State state) { + switch (state) { + case CLOSED: + return true; + case QUASI_CLOSED: + return this == NON_STANDARD_INCLUDED; + default: + return false; + } + } } } diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/diskbalancer/TestDefaultContainerChoosingPolicy.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/diskbalancer/TestDefaultContainerChoosingPolicy.java index 1c9e4ddeaa60..2c21507755d1 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/diskbalancer/TestDefaultContainerChoosingPolicy.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/diskbalancer/TestDefaultContainerChoosingPolicy.java @@ -69,6 +69,7 @@ import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; +import org.junit.jupiter.params.provider.ValueSource; /** * This class tests for volume and container selection in DefaultContainerChoosingPolicy. 
@@ -95,7 +96,7 @@ public class TestDefaultContainerChoosingPolicy { @BeforeEach public void setup() throws Exception { datanodeUuid = UUID.randomUUID().toString(); - policy = new DefaultContainerChoosingPolicy(new ReentrantLock()); + policy = new DefaultContainerChoosingPolicy(new ReentrantLock(), CONF); deltaMap = new HashMap<>(); inProgressContainerIDs = new HashSet<>(); } @@ -295,13 +296,18 @@ private void createContainer(long id, long usedBytes, HddsVolume vol) throws IOE private void createContainer(long id, long usedBytes, HddsVolume vol, ContainerSet targetSet) throws IOException { + createContainer(id, usedBytes, vol, targetSet, ContainerDataProto.State.CLOSED); + } + + private void createContainer(long id, long usedBytes, HddsVolume vol, ContainerSet targetSet, + ContainerDataProto.State state) throws IOException { long maxSize = usedBytes > 0 ? usedBytes : (long) CONF.getStorageSize( ScmConfigKeys.OZONE_SCM_CONTAINER_SIZE, ScmConfigKeys.OZONE_SCM_CONTAINER_SIZE_DEFAULT, StorageUnit.BYTES); KeyValueContainerData containerData = new KeyValueContainerData(id, ContainerLayoutVersion.FILE_PER_BLOCK, maxSize, UUID.randomUUID().toString(), UUID.randomUUID().toString()); - containerData.setState(ContainerDataProto.State.CLOSED); + containerData.setState(state); containerData.setVolume(vol); containerData.getStatistics().setBlockBytesForTesting(usedBytes); KeyValueContainer container = new KeyValueContainer(containerData, CONF); @@ -343,11 +349,14 @@ private void setupVolumeSetAndContainers(List volumes, inProgressContainerIDs.add(ContainerID.valueOf(id)); } } + mockContainerSet(containerSet); + } + private void mockContainerSet(ContainerSet cs) { ozoneContainer = mock(OzoneContainer.class); - ContainerController controller = new ContainerController(containerSet, null); + ContainerController controller = new ContainerController(cs, null); when(ozoneContainer.getController()).thenReturn(controller); - when(ozoneContainer.getContainerSet()).thenReturn(containerSet); 
+ when(ozoneContainer.getContainerSet()).thenReturn(cs); } private MutableVolumeSet createVolumeSetForUsages(List volumes) throws IOException { @@ -363,6 +372,62 @@ private MutableVolumeSet createVolumeSetForUsages(List volumes) thro return vs; } + /** + * With {@link DiskBalancerConfiguration#getIncludeNonStandardContainers()} true, QUASI_CLOSED is + * chosen; + * with false (default), only CLOSED is eligible to move. + */ + @ParameterizedTest(name = "includeNonStandardContainers={0}") + @ValueSource(booleans = {true, false}) + public void testQuasiClosedEligibilityDependsOnIncludeNonStandard(boolean includeNonStandardContainers) + throws IOException { + OzoneConfiguration testConf = new OzoneConfiguration(); + DiskBalancerConfiguration dbc = testConf.getObject(DiskBalancerConfiguration.class); + dbc.setIncludeNonStandardContainers(includeNonStandardContainers); + testConf.setFromObject(dbc); + ContainerChoosingPolicy policyUnderTest = new DefaultContainerChoosingPolicy( + new ReentrantLock(), testConf); + + inProgressContainerIDs.clear(); + deltaMap.clear(); + + List configs = Arrays.asList( + new VolumeTestConfig("disk3", 0.15), + new VolumeTestConfig("disk2", 0.85), + new VolumeTestConfig("disk1", 0.90) + ); + List volumes = createVolumes(configs); + volumeSet = createVolumeSetForUsages(volumes); + List sortedUsages = getVolumeUsages(volumeSet, deltaMap); + sortedUsages.sort(Comparator.comparingDouble(VolumeFixedUsage::getUtilization)); + HddsVolume sourceVolume = sortedUsages.get(sortedUsages.size() - 1).getVolume(); + HddsVolume destVolume = sortedUsages.get(0).getVolume(); + + containerSet = newContainerSet(); + createContainer(1L, 1500L * MB, sourceVolume, containerSet, ContainerDataProto.State.CLOSED); + createContainer(2L, 50L * MB, sourceVolume, containerSet, ContainerDataProto.State.QUASI_CLOSED); + createContainer(3L, 200L * MB, sourceVolume, containerSet, ContainerDataProto.State.CLOSED); + + mockContainerSet(containerSet); + + ContainerCandidate 
candidate = policyUnderTest.chooseVolumesAndContainer(ozoneContainer, + volumeSet, deltaMap, inProgressContainerIDs, THRESHOLD); + + if (includeNonStandardContainers) { + assertNotNull(candidate); + assertEquals(sourceVolume, candidate.getSourceVolume()); + assertEquals(destVolume, candidate.getDestVolume()); + assertEquals(2L, candidate.getContainerData().getContainerID()); + assertTrue(candidate.getContainerData().isQuasiClosed()); + } else { + assertNotNull(candidate); + assertEquals(sourceVolume, candidate.getSourceVolume()); + assertEquals(destVolume, candidate.getDestVolume()); + assertEquals(3L, candidate.getContainerData().getContainerID()); + assertTrue(candidate.getContainerData().isClosed()); + } + } + /** * Generic test method that can be reused for different scenarios. * diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/diskbalancer/TestDiskBalancerTask.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/diskbalancer/TestDiskBalancerTask.java index 0dc92bef87a9..8aa13f472a6e 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/diskbalancer/TestDiskBalancerTask.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/diskbalancer/TestDiskBalancerTask.java @@ -71,7 +71,6 @@ import org.apache.hadoop.ozone.container.common.volume.MutableVolumeSet; import org.apache.hadoop.ozone.container.common.volume.StorageVolume; import org.apache.hadoop.ozone.container.common.volume.VolumeSet; -import org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultContainerChoosingPolicy; import org.apache.hadoop.ozone.container.keyvalue.ContainerTestVersionInfo; import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainer; import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData; @@ -242,6 +241,7 @@ public void setup() throws Exception { DiskBalancerConfiguration diskBalancerConfiguration = 
conf.getObject(DiskBalancerConfiguration.class); diskBalancerConfiguration.setDiskBalancerShouldRun(true); + diskBalancerConfiguration.setIncludeNonStandardContainers(true); conf.setFromObject(diskBalancerConfiguration); diskBalancerService = new DiskBalancerServiceTestImpl(ozoneContainer, 100, conf, 1); @@ -266,7 +266,6 @@ public void cleanup() throws IOException { kvFaultInjector.reset(); KeyValueContainer.setInjector(null); DiskBalancerService.setInjector(null); - DefaultContainerChoosingPolicy.setTest(false); } @ParameterizedTest @@ -289,9 +288,6 @@ public void moveSuccess(State containerState) throws IOException { assertFalse(containerIterator.hasNext()); String oldContainerPath = container.getContainerData().getContainerPath(); - if (containerState == State.QUASI_CLOSED) { - DefaultContainerChoosingPolicy.setTest(true); - } DiskBalancerService.DiskBalancerTask task = getTask(); task.call(); assertEquals(State.DELETED, container.getContainerState()); @@ -459,7 +455,6 @@ public void moveFailsDuringInMemoryUpdate(ContainerTestVersionInfo versionInfo) .when(spyContainerSet).updateContainer(any(Container.class)); when(ozoneContainer.getContainerSet()).thenReturn(spyContainerSet); - DefaultContainerChoosingPolicy.setTest(true); DiskBalancerService.DiskBalancerTask task = getTask(); CompletableFuture completableFuture = CompletableFuture.runAsync(() -> task.call()); @@ -612,12 +607,10 @@ public void testOldReplicaDelayedDeletion(ContainerTestVersionInfo versionInfo) } /** - * Testing that invalid states (including QUASI_CLOSED in production mode) are correctly rejected. - * Here, with QUASI_CLOSED state, we ensure that the test runs in production mode - * where QUASI_CLOSED is not allowed for move. + * Testing that invalid states are correctly rejected. 
*/ @ParameterizedTest - @EnumSource(names = {"OPEN", "CLOSING", "QUASI_CLOSED", "UNHEALTHY", "INVALID", "DELETED", "RECOVERING"}) + @EnumSource(names = {"OPEN", "CLOSING", "UNHEALTHY", "INVALID", "DELETED", "RECOVERING"}) public void testMoveSkippedWhenContainerStateChanged(State invalidState) throws IOException, InterruptedException, TimeoutException { LogCapturer serviceLog = LogCapturer.captureLogs(DiskBalancerService.class); diff --git a/hadoop-hdds/docs/content/design/diskbalancer.md b/hadoop-hdds/docs/content/design/diskbalancer.md index ab1547c4c491..595d8cfaf82e 100644 --- a/hadoop-hdds/docs/content/design/diskbalancer.md +++ b/hadoop-hdds/docs/content/design/diskbalancer.md @@ -104,6 +104,8 @@ D1 ----> C1-CLOSED --- (5) ---> C1-DELETED D2 ----> Temp C1-CLOSED --- (2) ---> Temp C1-RECOVERING --- (3) ---> C1-RECOVERING --- (4) ---> C1-CLOSED ``` +**Note:** By default only **CLOSED** containers are moved. If `hdds.datanode.disk.balancer.include.non.standard.containers` is set to **true**, **QUASI_CLOSED** replicas are also eligible. The same move process applies to non-standard containers (QUASI_CLOSED). + ### Lazy Deletion of Source Container Replica The source container on D1 is **not** deleted immediately after the move completes. Instead, it is scheduled for deletion after a configurable delay using config `hdds.datanode.disk.balancer.replica.deletion.delay`, **default: 5 minutes**. @@ -123,7 +125,7 @@ are configurable, but the default implementations provide robust and safe behavi * **`DefaultContainerChoosingPolicy`**: This is the default policy that consolidates both volume selection and container selection into a single operation. It identifies the most over-utilized volume as the source and the most under-utilized volume with sufficient space as the destination, then iterates through containers on the source to pick the first one -that is in a **CLOSED** state and is not already being moved. 
It caches the list of containers for each volume which auto expires after one hour. +that is movable (**CLOSED** by default, and both **CLOSED** as well as **QUASI_CLOSED** when `hdds.datanode.disk.balancer.include.non.standard.containers` is true) and is not already being moved. It caches the list of containers for each volume which auto expires after one hour. ## Security Design DiskBalancer follows the same security model as other services: diff --git a/hadoop-hdds/docs/content/feature/DiskBalancer.md b/hadoop-hdds/docs/content/feature/DiskBalancer.md index 65079064f900..ab7b215b30e2 100644 --- a/hadoop-hdds/docs/content/feature/DiskBalancer.md +++ b/hadoop-hdds/docs/content/feature/DiskBalancer.md @@ -238,16 +238,17 @@ ozone admin datanode diskbalancer report --in-service-datanodes --json The DiskBalancer's behavior can be controlled using the following configuration properties in `ozone-site.xml`. -| Property | Default Value | Description | -|-------------------------------------------------------------|----------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `hdds.datanode.disk.balancer.enabled` | `false` | If false, the DiskBalancer service on the Datanode is disabled. Configure it to true for diskBalancer to be enabled. | -| `hdds.datanode.disk.balancer.volume.density.threshold.percent` | `10.0` | A percentage (0-100). A datanode is considered balanced if for each volume, its utilization differs from the average datanode utilization by no more than this threshold. | -| `hdds.datanode.disk.balancer.max.disk.throughputInMBPerSec` | `10` | The maximum bandwidth (in MB/s) that the balancer can use for moving data, to avoid impacting client I/O. 
| -| `hdds.datanode.disk.balancer.parallel.thread` | `5` | The number of worker threads to use for moving containers in parallel. | -| `hdds.datanode.disk.balancer.service.interval` | `60s` | The time interval at which the Datanode DiskBalancer service checks for imbalance and updates its configuration. | -| `hdds.datanode.disk.balancer.stop.after.disk.even` | `true` | If true, the DiskBalancer will automatically stop its balancing activity once disks are considered balanced (i.e., all volume densities are within the threshold). | -| `hdds.datanode.disk.balancer.replica.deletion.delay` | `5m` | The delay after a container is successfully moved from source volume to destination volume before the source container replica is deleted. This lazy deletion provides a grace period before failing the read thread holding the old container replica. Unit: ns, ms, s, m, h, d. | -| `hdds.datanode.disk.balancer.container.choosing.policy` | `org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultContainerChoosingPolicy` | The policy for selecting source/destination volumes and which containers to move. | -| `hdds.datanode.disk.balancer.service.timeout` | `300s` | Timeout for the Datanode DiskBalancer service operations. | -| `hdds.datanode.disk.balancer.should.run.default` | `false` | If the balancer fails to read its persisted configuration, this value determines if the service should run by default. 
| +| Property | Default Value | Description | +|----------------------------------------------------------------|----------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `hdds.datanode.disk.balancer.enabled` | `false` | If false, the DiskBalancer service on the Datanode is disabled. Configure it to true for diskBalancer to be enabled. | +| `hdds.datanode.disk.balancer.volume.density.threshold.percent` | `10.0` | A percentage (0-100). A datanode is considered balanced if for each volume, its utilization differs from the average datanode utilization by no more than this threshold. | +| `hdds.datanode.disk.balancer.max.disk.throughputInMBPerSec` | `10` | The maximum bandwidth (in MB/s) that the balancer can use for moving data, to avoid impacting client I/O. | +| `hdds.datanode.disk.balancer.parallel.thread` | `5` | The number of worker threads to use for moving containers in parallel. | +| `hdds.datanode.disk.balancer.service.interval` | `60s` | The time interval at which the Datanode DiskBalancer service checks for imbalance and updates its configuration. | +| `hdds.datanode.disk.balancer.stop.after.disk.even` | `true` | If true, the DiskBalancer will automatically stop its balancing activity once disks are considered balanced (i.e., all volume densities are within the threshold). | +| `hdds.datanode.disk.balancer.replica.deletion.delay` | `5m` | The delay after a container is successfully moved from source volume to destination volume before the source container replica is deleted. This lazy deletion provides a grace period before failing the read thread holding the old container replica. Unit: ns, ms, s, m, h, d. 
| +| `hdds.datanode.disk.balancer.include.non.standard.containers` | `false` | If true, balancer include non-standard states, i.e, QUASI_CLOSED. So both CLOSED and QUASI_CLOSED state containers are eligible for move. If false (default), balancer only moves CLOSED containers. | +| `hdds.datanode.disk.balancer.container.choosing.policy` | `org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultContainerChoosingPolicy` | The policy for selecting source/destination volumes and which containers to move. | +| `hdds.datanode.disk.balancer.service.timeout` | `300s` | Timeout for the Datanode DiskBalancer service operations. | +| `hdds.datanode.disk.balancer.should.run.default` | `false` | If the balancer fails to read its persisted configuration, this value determines if the service should run by default. | diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/node/TestDiskBalancerPolicyPerformance.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/node/TestDiskBalancerPolicyPerformance.java index 1b621629e7b3..e77d94a60e87 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/node/TestDiskBalancerPolicyPerformance.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/node/TestDiskBalancerPolicyPerformance.java @@ -119,7 +119,7 @@ public void setup() throws Exception { ContainerController containerController = new ContainerController(containerSet, null); when(ozoneContainer.getController()).thenReturn(containerController); when(ozoneContainer.getContainerSet()).thenReturn(containerSet); - policy = new DefaultContainerChoosingPolicy(new ReentrantLock()); + policy = new DefaultContainerChoosingPolicy(new ReentrantLock(), conf); executor = Executors.newFixedThreadPool(NUM_THREADS); }