diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/IntegrationTestCompactionWithDeviceSimulator.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/IntegrationTestCompactionWithDeviceSimulator.java new file mode 100644 index 000000000000..5ed4cb92fccf --- /dev/null +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/IntegrationTestCompactionWithDeviceSimulator.java @@ -0,0 +1,564 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hbase; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.Set; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.client.Admin; +import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.client.TableDescriptorBuilder; +import org.apache.hadoop.hbase.io.compress.Compression; +import org.apache.hadoop.hbase.io.devsim.EBSDevice; +import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding; +import org.apache.hadoop.hbase.io.hfile.BlockCompressedSizePredicator; +import org.apache.hadoop.hbase.io.hfile.PreviousBlockCompressionRatePredicator; +import org.apache.hadoop.hbase.io.hfile.UncompressedBlockSizePredicator; +import org.apache.hadoop.hbase.regionserver.BloomType; +import org.apache.hadoop.hbase.regionserver.DefaultStoreEngine; +import org.apache.hadoop.hbase.regionserver.HRegion; +import org.apache.hadoop.hbase.regionserver.HRegionServer; +import org.apache.hadoop.hbase.regionserver.HStore; +import org.apache.hadoop.hbase.regionserver.Region; +import org.apache.hadoop.hbase.regionserver.StoreEngine; +import org.apache.hadoop.hbase.regionserver.compactions.CompactionConfiguration; +import org.apache.hadoop.hbase.regionserver.throttle.CompactionThroughputControllerFactory; +import org.apache.hadoop.hbase.regionserver.throttle.NoLimitThroughputController; +import org.apache.hadoop.hbase.testclassification.IntegrationTests; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; +import org.apache.hadoop.hbase.util.JVMClusterUtil; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.util.ToolRunner; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; 
+import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hbase.thirdparty.com.google.common.collect.Sets; +import org.apache.hbase.thirdparty.org.apache.commons.cli.CommandLine; + +/** + * Integration test that demonstrates the EBS device layer simulator's value for diagnosing + * compaction throughput under different storage device constraints. Runs HBase major compactions + * under two simulated EBS volume bandwidth limits (constrained and baseline) with realistic HFile + * configuration (Snappy compression, tunable compression ratio, configurable block parameters) and + * produces a diagnostic throughput analysis comparing the scenarios. + *
+ * This test always starts a local MiniDFSCluster with the EBS device layer installed — the + * distributed cluster mode of IntegrationTestBase is not applicable because the device simulator + * requires the {@code ThrottledFsDatasetFactory} to be injected at DataNode startup. + *
+ * JUnit / maven-failsafe execution: + * + *
+ * mvn verify -pl hbase-it -Dtest=IntegrationTestCompactionWithDeviceSimulator + *+ * + * CLI execution via hbase script: + * + *
+ * hbase org.apache.hadoop.hbase.IntegrationTestCompactionWithDeviceSimulator \ + * -totalDataBytes 536870912 -constrainedBwMbps 10 -compression LZ4 + *+ * + * Configurable parameters (settable via CLI flags or + * {@code -Dhbase.IntegrationTestCompactionWithDeviceSimulator.}): + * + *
+ * # Data generation + * totalDataBytes (default 1073741824 = 1 GB) + * valueSize (default 102400 = 100 KB) + * numFlushCycles (default 10) + * targetCompressionRatio (default 3.0) + * + * # HFile / column family + * compression (default SNAPPY) + * dataBlockEncoding (default NONE) + * blockSize (default 65536 = 64 KB) + * bloomType (default ROW) + * blockPredicator (default PreviousBlockCompressionRatePredicator) + * + * # Device simulator + * constrainedBwMbps (default 25) + * baselineBwMbps (default 250) + * budgetIops (default 100000) + * deviceLatencyUs (default 0) + * volumesPerDataNode (default 1) + *+ */ +@Tag(IntegrationTests.TAG) +public class IntegrationTestCompactionWithDeviceSimulator extends IntegrationTestBase { + + private static final Logger LOG = + LoggerFactory.getLogger(IntegrationTestCompactionWithDeviceSimulator.class); + + private static final String CLASS_NAME = + IntegrationTestCompactionWithDeviceSimulator.class.getSimpleName(); + private static final String CONF_PREFIX = "hbase." + CLASS_NAME + "."; + + private static final TableName TABLE_NAME = TableName.valueOf(CLASS_NAME); + private static final byte[] FAMILY = Bytes.toBytes("f"); + private static final byte[] QUALIFIER = Bytes.toBytes("q"); + + private static final Map
+ * Test isolation: This class uses static mutable state. Tests must call {@link #shutdown()}
+ * in their {@code @AfterClass} method. Parallel test execution in the same JVM is not supported.
+ */
+public final class EBSDevice {
+
+ private static final Logger LOG = LoggerFactory.getLogger(EBSDevice.class);
+
+ // ---- Configuration keys ----
+
+ public static final String IO_BUDGET_BYTES_PER_SEC_KEY = "hbase.test.devsim.budget.bytes.per.sec";
+ public static final String IO_BUDGET_IOPS_KEY = "hbase.test.devsim.budget.iops";
+ public static final String IO_BUDGET_WINDOW_MS_KEY = "hbase.test.devsim.budget.window.ms";
+ public static final int DEFAULT_WINDOW_MS = 100;
+
+ public static final String IO_BUDGET_REPORT_INTERVAL_SEC_KEY =
+ "hbase.test.devsim.budget.report.interval.sec";
+ public static final int DEFAULT_REPORT_INTERVAL_SEC = 10;
+
+ public static final String IO_MAX_IO_SIZE_KB_KEY = "hbase.test.devsim.max.iosize.kb";
+ public static final int DEFAULT_MAX_IO_SIZE_KB = 1024;
+
+ public static final String IO_INSTANCE_MBPS_KEY = "hbase.test.devsim.instance.mbps";
+ public static final int DEFAULT_INSTANCE_MBPS = 0;
+
+ public static final String IO_DEVICE_LATENCY_US_KEY = "hbase.test.devsim.device.latency.us";
+ public static final int DEFAULT_DEVICE_LATENCY_US = 1000;
+
+ // ---- Per-DataNode context ----
+
+  /**
+   * Per-DataNode view of the simulated EBS hardware: the set of volume devices attached to a
+   * single DataNode plus the optional instance-level aggregate bandwidth budget they all share.
+   * One instance is created per {@link ThrottledFsDataset} proxy.
+   */
+  public static final class DataNodeContext {
+    private final String datanodeId;
+    private final EBSVolumeDevice[] volumes;
+    private final IOBudget instanceBwBudget;
+    private final long budgetBytesPerSec;
+    private final int budgetIops;
+    private final int deviceLatencyUs;
+
+    DataNodeContext(String datanodeId, EBSVolumeDevice[] volumes, IOBudget instanceBwBudget,
+      long budgetBytesPerSec, int budgetIops, int deviceLatencyUs) {
+      this.datanodeId = datanodeId;
+      this.volumes = volumes;
+      this.instanceBwBudget = instanceBwBudget;
+      this.budgetBytesPerSec = budgetBytesPerSec;
+      this.budgetIops = budgetIops;
+      this.deviceLatencyUs = deviceLatencyUs;
+    }
+
+    public String getDatanodeId() {
+      return datanodeId;
+    }
+
+    public EBSVolumeDevice[] getVolumes() {
+      return volumes;
+    }
+
+    public int getNumVolumes() {
+      return volumes.length;
+    }
+
+    public IOBudget getInstanceBwBudget() {
+      return instanceBwBudget;
+    }
+
+    public long getBudgetBytesPerSec() {
+      return budgetBytesPerSec;
+    }
+
+    public int getBudgetIops() {
+      return budgetIops;
+    }
+
+    public int getDeviceLatencyUs() {
+      return deviceLatencyUs;
+    }
+
+    /**
+     * Charge bytes against the instance-level bandwidth budget, which is shared by every volume
+     * on this DataNode.
+     * @param bytes number of bytes to charge
+     * @return milliseconds slept waiting for budget (0 when no instance budget is configured)
+     */
+    public long consumeInstanceBw(long bytes) {
+      // No instance-level cap configured means the charge is free.
+      return instanceBwBudget == null ? 0 : instanceBwBudget.consume(bytes);
+    }
+
+    /** Clears the counters and budgets of every volume, and the instance budget if present. */
+    public void reset() {
+      for (int i = 0; i < volumes.length; i++) {
+        volumes[i].reset();
+      }
+      if (instanceBwBudget != null) {
+        instanceBwBudget.reset();
+      }
+    }
+  }
+
+ // ---- Static registry ----
+
+ private static final List
+ * Read and write coalescing buffers are independent: a write to the same volume does not break read
+ * coalescing (they are separate physical operations at the device level). However, reads and writes
+ * share the per-volume IOPS budget.
+ *
+ * Each instance is fully self-contained with its own counters. Aggregation across volumes and
+ * DataNodes is handled by {@link EBSDevice}.
+ */
+public class EBSVolumeDevice {
+
+  // Volume index within the owning DataNode.
+  final int id;
+  // Per-volume bandwidth token bucket; null = unlimited bandwidth.
+  final IOBudget bwBudget;
+  // Per-volume IOPS token bucket, shared by reads and writes; null = unlimited IOPS.
+  final IOBudget iopsBudget;
+  // Max coalesced IO size in bytes; 0 disables coalescing (every call costs one IOPS token).
+  final int maxIoSizeBytes;
+  // Simulated per-device-IO latency in microseconds; 0 disables the latency sleep.
+  final int deviceLatencyUs;
+
+  // Read coalescing state: identity of the last reading stream, the byte position where its
+  // last read ended (-1 = none), and bytes accumulated toward the next coalesced device IO.
+  private Object lastReader;
+  private long lastReadEnd = -1;
+  private long pendingReadBytes;
+
+  // Write coalescing state is independent of read state: a write does not break read coalescing.
+  private Object lastWriter;
+  private long pendingWriteBytes;
+
+  // Per-volume counters
+  final AtomicLong volumeDeviceReadOps = new AtomicLong();
+  final AtomicLong volumeDeviceWriteOps = new AtomicLong();
+  final AtomicLong totalBytesRead = new AtomicLong();
+  final AtomicLong totalBytesWritten = new AtomicLong();
+  final AtomicLong readOpCount = new AtomicLong();
+  final AtomicLong writeOpCount = new AtomicLong();
+  final AtomicLong bwReadSleepTimeMs = new AtomicLong();
+  final AtomicLong bwReadSleepCount = new AtomicLong();
+  final AtomicLong bwWriteSleepTimeMs = new AtomicLong();
+  final AtomicLong bwWriteSleepCount = new AtomicLong();
+  final AtomicLong iopsReadSleepTimeMs = new AtomicLong();
+  final AtomicLong iopsReadSleepCount = new AtomicLong();
+  final AtomicLong iopsWriteSleepTimeMs = new AtomicLong();
+  final AtomicLong iopsWriteSleepCount = new AtomicLong();
+  final AtomicLong latencySleepCount = new AtomicLong();
+  final AtomicLong latencySleepTimeUs = new AtomicLong();
+
+  /**
+   * @param id volume index within the DataNode
+   * @param bwBudget per-volume BW token bucket (null = unlimited)
+   * @param iopsBudget per-volume IOPS token bucket (null = unlimited)
+   * @param maxIoSizeBytes max coalesced IO size in bytes (0 = no coalescing)
+   * @param deviceLatencyUs per-IO device latency in microseconds (0 = disabled)
+   */
+  public EBSVolumeDevice(int id, IOBudget bwBudget, IOBudget iopsBudget, int maxIoSizeBytes,
+    int deviceLatencyUs) {
+    this.id = id;
+    this.bwBudget = bwBudget;
+    this.iopsBudget = iopsBudget;
+    this.maxIoSizeBytes = maxIoSizeBytes;
+    this.deviceLatencyUs = deviceLatencyUs;
+  }
+
+  public int getId() {
+    return id;
+  }
+
+  /**
+   * Account a read operation against this volume's BW and IOPS budgets. Sequential reads from the
+   * same stream are coalesced up to {@code maxIoSizeBytes} before consuming an IOPS token.
+   * Interleaving (a different stream, or a non-sequential position) flushes pending coalesced IO.
+   * NOTE: budget sleeps happen while holding this volume's monitor, intentionally serializing IO
+   * on the volume like a real device queue — TODO confirm this is the intended model.
+   */
+  public synchronized void accountRead(Object stream, long position, int bytes) {
+    readOpCount.incrementAndGet();
+    totalBytesRead.addAndGet(bytes);
+    if (iopsBudget != null) {
+      if (maxIoSizeBytes > 0) {
+        // A different stream, or a seek within the same stream, ends the sequential run.
+        if (lastReader != stream || (lastReadEnd >= 0 && position != lastReadEnd)) {
+          flushPendingReadIopsInternal();
+        }
+        lastReader = stream;
+        pendingReadBytes += bytes;
+        lastReadEnd = position + bytes;
+        // Each full maxIoSizeBytes worth of accumulated data costs one device read op.
+        while (pendingReadBytes >= maxIoSizeBytes) {
+          pendingReadBytes -= maxIoSizeBytes;
+          consumeReadIops();
+        }
+      } else {
+        consumeReadIops();
+      }
+    }
+    if (bwBudget != null) {
+      long slept = bwBudget.consume(bytes);
+      if (slept > 0) {
+        bwReadSleepTimeMs.addAndGet(slept);
+        bwReadSleepCount.incrementAndGet();
+      }
+    }
+  }
+
+  /** Flush pending coalesced read bytes, but only if {@code stream} owns the current run. */
+  public synchronized void flushPendingReadIops(Object stream) {
+    if (lastReader == stream) {
+      flushPendingReadIopsInternal();
+    }
+  }
+
+  // Charges one final (partial) device read op for any leftover bytes and clears the run state.
+  private void flushPendingReadIopsInternal() {
+    if (pendingReadBytes > 0 && iopsBudget != null) {
+      consumeReadIops();
+    }
+    pendingReadBytes = 0;
+    lastReader = null;
+    lastReadEnd = -1;
+  }
+
+  // Only called when iopsBudget != null (callers guard).
+  private void consumeReadIops() {
+    volumeDeviceReadOps.incrementAndGet();
+    long slept = iopsBudget.consume(1);
+    if (slept > 0) {
+      iopsReadSleepTimeMs.addAndGet(slept);
+      iopsReadSleepCount.incrementAndGet();
+    }
+    applyDeviceLatency();
+  }
+
+  /**
+   * Account a write operation against this volume's BW and IOPS budgets. Sequential writes from the
+   * same stream are coalesced up to {@code maxIoSizeBytes}. Writes are treated as append-only, so
+   * unlike reads no position check is needed to detect interleaving.
+   */
+  public synchronized void accountWrite(Object stream, int bytes) {
+    writeOpCount.incrementAndGet();
+    totalBytesWritten.addAndGet(bytes);
+    if (iopsBudget != null) {
+      if (maxIoSizeBytes > 0) {
+        if (lastWriter != stream) {
+          flushPendingWriteIopsInternal();
+        }
+        lastWriter = stream;
+        pendingWriteBytes += bytes;
+        while (pendingWriteBytes >= maxIoSizeBytes) {
+          pendingWriteBytes -= maxIoSizeBytes;
+          consumeWriteIops();
+        }
+      } else {
+        consumeWriteIops();
+      }
+    }
+    if (bwBudget != null) {
+      long slept = bwBudget.consume(bytes);
+      if (slept > 0) {
+        bwWriteSleepTimeMs.addAndGet(slept);
+        bwWriteSleepCount.incrementAndGet();
+      }
+    }
+  }
+
+  /**
+   * Charge a bulk write against this volume's budgets. Used for block-level write throttling at
+   * finalize time, where the total bytes are known but were not individually intercepted.
+   * NOTE(review): this path does not touch the streaming coalescing state (pendingWriteBytes), so
+   * a bulk charge interleaved with streamed writes could slightly over-count ops — confirm callers
+   * never mix the two for the same block.
+   * @param totalBytes total bytes written
+   */
+  public synchronized void accountBulkWrite(long totalBytes) {
+    writeOpCount.incrementAndGet();
+    totalBytesWritten.addAndGet(totalBytes);
+    if (iopsBudget != null) {
+      // Ceiling division: charge one op per maxIoSizeBytes chunk (minimum one op).
+      int opsToCharge =
+        maxIoSizeBytes > 0 ? (int) ((totalBytes + maxIoSizeBytes - 1) / maxIoSizeBytes) : 1;
+      for (int i = 0; i < opsToCharge; i++) {
+        consumeWriteIops();
+      }
+    }
+    if (bwBudget != null) {
+      long slept = bwBudget.consume(totalBytes);
+      if (slept > 0) {
+        bwWriteSleepTimeMs.addAndGet(slept);
+        bwWriteSleepCount.incrementAndGet();
+      }
+    }
+  }
+
+  /** Flush pending coalesced write bytes, but only if {@code stream} owns the current run. */
+  public synchronized void flushPendingWriteIops(Object stream) {
+    if (lastWriter == stream) {
+      flushPendingWriteIopsInternal();
+    }
+  }
+
+  // Charges one final (partial) device write op for any leftover bytes and clears the run state.
+  private void flushPendingWriteIopsInternal() {
+    if (pendingWriteBytes > 0 && iopsBudget != null) {
+      consumeWriteIops();
+    }
+    pendingWriteBytes = 0;
+    lastWriter = null;
+  }
+
+  // Only called when iopsBudget != null (callers guard).
+  private void consumeWriteIops() {
+    volumeDeviceWriteOps.incrementAndGet();
+    long slept = iopsBudget.consume(1);
+    if (slept > 0) {
+      iopsWriteSleepTimeMs.addAndGet(slept);
+      iopsWriteSleepCount.incrementAndGet();
+    }
+    applyDeviceLatency();
+  }
+
+  // Simulates fixed per-IO device latency by sleeping; records actual (not requested) sleep time.
+  private void applyDeviceLatency() {
+    if (deviceLatencyUs > 0) {
+      latencySleepCount.incrementAndGet();
+      long startNs = System.nanoTime();
+      try {
+        // Split microseconds into the (ms, ns) pair Thread.sleep expects.
+        Thread.sleep(deviceLatencyUs / 1000, (deviceLatencyUs % 1000) * 1000);
+      } catch (InterruptedException e) {
+        Thread.currentThread().interrupt();
+      }
+      latencySleepTimeUs.addAndGet((System.nanoTime() - startNs) / 1000);
+    }
+  }
+
+  /** Resets all coalescing state, counters, and (if configured) the underlying token buckets. */
+  public synchronized void reset() {
+    lastReader = null;
+    lastReadEnd = -1;
+    pendingReadBytes = 0;
+    lastWriter = null;
+    pendingWriteBytes = 0;
+    volumeDeviceReadOps.set(0);
+    volumeDeviceWriteOps.set(0);
+    totalBytesRead.set(0);
+    totalBytesWritten.set(0);
+    readOpCount.set(0);
+    writeOpCount.set(0);
+    bwReadSleepTimeMs.set(0);
+    bwReadSleepCount.set(0);
+    bwWriteSleepTimeMs.set(0);
+    bwWriteSleepCount.set(0);
+    iopsReadSleepTimeMs.set(0);
+    iopsReadSleepCount.set(0);
+    iopsWriteSleepTimeMs.set(0);
+    iopsWriteSleepCount.set(0);
+    latencySleepCount.set(0);
+    latencySleepTimeUs.set(0);
+    if (bwBudget != null) {
+      bwBudget.reset();
+    }
+    if (iopsBudget != null) {
+      iopsBudget.reset();
+    }
+  }
+}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/devsim/IOBudget.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/devsim/IOBudget.java
new file mode 100644
index 000000000000..703101fa7932
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/devsim/IOBudget.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.devsim;
+
+/**
+ * Token-bucket rate limiter. Supports a configurable tokens-per-second rate with windowed refill.
+ * Used for per-volume BW budgets, per-volume IOPS budgets, and instance-level aggregate BW caps.
+ *
+ * When all tokens in the current window are consumed, {@link #consume(long)} blocks until the next
+ * window opens, providing backpressure to the caller.
+ */
+public class IOBudget {
+
+ private final long tokensPerSec;
+ private final long tokensPerWindow;
+ private final long windowMs;
+ private final long millisPerToken;
+ private final boolean lowRateMode;
+ private long availableTokens;
+ private long windowStartTime;
+ private long nextTokenTimeMs;
+
+ /**
+ * @param tokensPerSec tokens replenished per second (e.g. bytes/sec for BW, ops/sec for IOPS)
+ * @param windowMs refill window duration in milliseconds
+ */
+ public IOBudget(long tokensPerSec, long windowMs) {
+ this.tokensPerSec = tokensPerSec;
+ this.windowMs = windowMs;
+ this.tokensPerWindow = tokensPerSec * windowMs / 1000;
+ this.lowRateMode = tokensPerSec > 0 && this.tokensPerWindow == 0;
+ this.millisPerToken = lowRateMode ? Math.max(1, (1000 + tokensPerSec - 1) / tokensPerSec) : 0;
+ this.availableTokens = lowRateMode ? 0 : tokensPerWindow;
+ this.windowStartTime = System.currentTimeMillis();
+ this.nextTokenTimeMs = this.windowStartTime;
+ }
+
+ public synchronized void reset() {
+ this.availableTokens = lowRateMode ? 0 : tokensPerWindow;
+ this.windowStartTime = System.currentTimeMillis();
+ this.nextTokenTimeMs = this.windowStartTime;
+ }
+
+ /**
+ * Consume tokens from the budget. Blocks (sleeps) if the budget is exhausted until enough tokens
+ * are available.
+ * @param tokens number of tokens to consume
+ * @return total milliseconds slept waiting for tokens
+ */
+ public synchronized long consume(long tokens) {
+ if (tokens <= 0) {
+ return 0;
+ }
+ if (tokensPerSec <= 0) {
+ return 0;
+ }
+ if (lowRateMode) {
+ return consumeLowRate(tokens);
+ }
+ long totalSlept = 0;
+ long remaining = tokens;
+ while (remaining > 0) {
+ long now = System.currentTimeMillis();
+ long elapsed = now - windowStartTime;
+ if (elapsed >= windowMs) {
+ long windowsPassed = elapsed / windowMs;
+ availableTokens = tokensPerWindow;
+ windowStartTime += windowsPassed * windowMs;
+ }
+ long toConsume = Math.min(remaining, availableTokens);
+ if (toConsume > 0) {
+ availableTokens -= toConsume;
+ remaining -= toConsume;
+ }
+ if (remaining > 0) {
+ long sleepTime = windowMs - (System.currentTimeMillis() - windowStartTime);
+ if (sleepTime <= 0) {
+ sleepTime = 1;
+ }
+ totalSlept += sleepTime;
+ try {
+ wait(sleepTime);
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ break;
+ }
+ }
+ }
+ notifyAll();
+ return totalSlept;
+ }
+
+ /**
+ * Low-rate path for configurations where {@code tokensPerWindow == 0}. In this case we schedule
+ * one token every {@code millisPerToken} and block callers until the next token time.
+ */
+ private long consumeLowRate(long tokens) {
+ long totalSlept = 0;
+ for (long i = 0; i < tokens; i++) {
+ long now = System.currentTimeMillis();
+ if (now < nextTokenTimeMs) {
+ long sleepTime = nextTokenTimeMs - now;
+ totalSlept += sleepTime;
+ try {
+ wait(sleepTime);
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ break;
+ }
+ } else {
+ nextTokenTimeMs = now;
+ }
+ nextTokenTimeMs += millisPerToken;
+ }
+ notifyAll();
+ return totalSlept;
+ }
+}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/devsim/TestEBSDeviceLayer.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/devsim/TestEBSDeviceLayer.java
new file mode 100644
index 000000000000..c008b484124a
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/devsim/TestEBSDeviceLayer.java
@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.devsim;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.HBaseTestingUtil;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
+import org.apache.hadoop.hbase.client.Get;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.ResultScanner;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.client.Table;
+import org.apache.hadoop.hbase.client.TableDescriptor;
+import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
+import org.apache.hadoop.hbase.regionserver.HRegion;
+import org.apache.hadoop.hbase.testclassification.IOTests;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Integration test verifying that the EBS device layer proxy is correctly installed by
+ * MiniDFSCluster and intercepts read and write IO at the DataNode storage level.
+ *
+ * Starts a single-DataNode MiniDFSCluster with 2 storage volumes and the EBS device layer
+ * configured with high bandwidth/IOPS budgets (so throttling does not slow the test) but with
+ * device latency disabled. Writes data through HBase, flushes, reads it back via scan and get, and
+ * asserts that the device layer metrics reflect the IO.
+ */
+@Category({ IOTests.class, MediumTests.class })
+public class TestEBSDeviceLayer {
+
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+    HBaseClassTestRule.forClass(TestEBSDeviceLayer.class);
+
+  private static final Logger LOG = LoggerFactory.getLogger(TestEBSDeviceLayer.class);
+
+  private static final TableName TABLE_NAME = TableName.valueOf("TestEBSDeviceLayer");
+  private static final byte[] FAMILY = Bytes.toBytes("f");
+  private static final byte[] QUALIFIER = Bytes.toBytes("q");
+  private static final int NUM_ROWS = 200;
+  private static final int VALUE_SIZE = 4096;
+  private static final int NUM_VOLUMES = 2;
+
+  private static HBaseTestingUtil UTIL;
+
+  @BeforeClass
+  public static void setUp() throws Exception {
+    Configuration conf = HBaseConfiguration.create();
+    // Disable splits so all IO stays in one region, keeping the metrics deterministic.
+    conf.set(HConstants.HBASE_REGION_SPLIT_POLICY_KEY,
+      "org.apache.hadoop.hbase.regionserver.DisabledRegionSplitPolicy");
+    // Short-circuit reads would bypass the DataNode and thus the device layer interceptors.
+    conf.setBoolean("hbase.tests.use.shortcircuit.reads", false);
+    // High BW/IOPS budgets so throttling does not slow the test; latency disabled.
+    // NOTE(review): argument meanings taken from the class javadoc — confirm against the
+    // EBSDevice.configure signature.
+    EBSDevice.configure(conf, 1000, 100000, 0, 1024, 0);
+
+    UTIL = new HBaseTestingUtil(conf);
+    // Start ZK first, then the DFS cluster explicitly so the device layer is installed at
+    // DataNode startup, then the HBase mini cluster on top of that DFS.
+    UTIL.startMiniZKCluster();
+    MiniDFSCluster dfsCluster =
+      new MiniDFSCluster.Builder(conf).numDataNodes(1).storagesPerDatanode(NUM_VOLUMES).build();
+    dfsCluster.waitClusterUp();
+    UTIL.setDFSCluster(dfsCluster);
+    UTIL.startMiniCluster(1);
+  }
+
+  @AfterClass
+  public static void tearDown() throws Exception {
+    // Clear the EBSDevice static registry first; it is shared JVM-wide state.
+    EBSDevice.shutdown();
+    if (UTIL != null) {
+      UTIL.shutdownMiniCluster();
+    }
+  }
+
+  @Test
+  public void testDeviceLayerInterceptsIO() throws Exception {
+    // Sanity-check the device layer was installed during DataNode startup.
+    assertEquals("Expected 1 DataNode registered with EBSDevice", 1, EBSDevice.getNumDataNodes());
+    EBSDevice.DataNodeContext dnCtx = EBSDevice.getDataNodeContext(0);
+    assertEquals("Expected " + NUM_VOLUMES + " volumes", NUM_VOLUMES, dnCtx.getNumVolumes());
+
+    TableDescriptor desc = TableDescriptorBuilder.newBuilder(TABLE_NAME).setColumnFamily(
+      ColumnFamilyDescriptorBuilder.newBuilder(FAMILY).setBlocksize(64 * 1024).build()).build();
+    UTIL.getAdmin().createTable(desc);
+    UTIL.waitUntilAllRegionsAssigned(TABLE_NAME);
+
+    // Discard any IO generated by table creation / region assignment.
+    EBSDevice.resetMetrics();
+
+    byte[] value = new byte[VALUE_SIZE];
+    // Random values resist compression, keeping on-disk sizes predictable.
+    Bytes.random(value);
+    try (Table table = UTIL.getConnection().getTable(TABLE_NAME)) {
+      for (int i = 0; i < NUM_ROWS; i++) {
+        Put put = new Put(Bytes.toBytes(String.format("row-%05d", i)));
+        put.addColumn(FAMILY, QUALIFIER, value);
+        table.put(put);
+      }
+    }
+
+    UTIL.getAdmin().flush(TABLE_NAME);
+    waitForFlush();
+
+    long writeBytesAfterFlush = EBSDevice.getTotalBytesWritten();
+    long writeInterceptsAfterFlush = EBSDevice.getWriteInterceptCount();
+    LOG.info("After write+flush: bytesWritten={}, writeIntercepts={}, deviceWriteOps={}",
+      writeBytesAfterFlush, writeInterceptsAfterFlush, EBSDevice.getDeviceWriteOps());
+
+    assertTrue("Expected write intercepts after flush, got " + writeInterceptsAfterFlush,
+      writeInterceptsAfterFlush > 0);
+    assertTrue("Expected bytes written > 0 after flush, got " + writeBytesAfterFlush,
+      writeBytesAfterFlush > 0);
+
+    // Reset between phases so each phase's metrics are attributable to its own IO only.
+    EBSDevice.resetMetrics();
+    int rowCount = 0;
+    try (Table table = UTIL.getConnection().getTable(TABLE_NAME)) {
+      try (ResultScanner scanner = table.getScanner(new Scan())) {
+        Result result;
+        while ((result = scanner.next()) != null) {
+          assertTrue("Row should not be empty", !result.isEmpty());
+          rowCount++;
+        }
+      }
+    }
+    assertEquals("Expected to read back all rows", NUM_ROWS, rowCount);
+
+    long readBytes = EBSDevice.getTotalBytesRead();
+    long readIntercepts = EBSDevice.getReadInterceptCount();
+    long deviceReadOps = EBSDevice.getDeviceReadOps();
+    long readOps = EBSDevice.getReadOpCount();
+    LOG.info("After scan: bytesRead={}, readIntercepts={}, appReadOps={}, deviceReadOps={}",
+      readBytes, readIntercepts, readOps, deviceReadOps);
+
+    assertTrue("Expected read intercepts after scan, got " + readIntercepts, readIntercepts > 0);
+    assertTrue("Expected bytes read > 0 after scan, got " + readBytes, readBytes > 0);
+    assertTrue("Expected device read ops > 0 (IOPS coalescing should still produce ops), got "
+      + deviceReadOps, deviceReadOps > 0);
+
+    EBSDevice.resetMetrics();
+    try (Table table = UTIL.getConnection().getTable(TABLE_NAME)) {
+      Result result = table.get(new Get(Bytes.toBytes("row-00050")));
+      assertTrue("Get should return data", !result.isEmpty());
+    }
+    long getReadIntercepts = EBSDevice.getReadInterceptCount();
+    LOG.info("After get: readIntercepts={}, bytesRead={}", getReadIntercepts,
+      EBSDevice.getTotalBytesRead());
+    assertTrue("Expected read intercepts after get, got " + getReadIntercepts,
+      getReadIntercepts > 0);
+
+    // Nearly all intercepted IO should have been attributed to a concrete volume; allow up to 1%
+    // unresolved (e.g. IO against paths the proxy cannot map to a storage directory).
+    long totalIntercepts = EBSDevice.getReadInterceptCount() + EBSDevice.getWriteInterceptCount();
+    long unresolved = EBSDevice.getUnresolvedVolumeCount();
+    if (totalIntercepts > 0) {
+      double unresolvedRatio = (double) unresolved / totalIntercepts;
+      assertTrue("Unresolved volume ratio too high: " + unresolvedRatio + " (unresolved="
+        + unresolved + ", total=" + totalIntercepts + ")", unresolvedRatio <= 0.01);
+    }
+
+    LOG.info("Per-volume stats: {}", EBSDevice.getPerVolumeStats());
+  }
+
+  // Polls until every region's memstore for TABLE_NAME is empty, i.e. the flush has completed;
+  // fails after 60s.
+  private void waitForFlush() throws Exception {
+    long deadline = System.currentTimeMillis() + 60000;
+    while (System.currentTimeMillis() < deadline) {
+      long memstoreSize = 0;
+      for (HRegion region : UTIL.getMiniHBaseCluster().getRegionServer(0).getRegions(TABLE_NAME)) {
+        memstoreSize += region.getMemStoreDataSize();
+      }
+      if (memstoreSize == 0) {
+        return;
+      }
+      Thread.sleep(500);
+    }
+    throw new IOException("Flush did not complete within timeout");
+  }
+}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/devsim/TestIOBudget.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/devsim/TestIOBudget.java
new file mode 100644
index 000000000000..58f79b0e56a0
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/devsim/TestIOBudget.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.devsim;
+
+import static org.junit.Assert.assertTrue;
+
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.testclassification.IOTests;
+import org.apache.hadoop.hbase.testclassification.SmallTests;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+@Category({ IOTests.class, SmallTests.class })
+public class TestIOBudget {
+
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+    HBaseClassTestRule.forClass(TestIOBudget.class);
+
+  @Test
+  public void testLowRateModeDoesNotDeadlock() {
+    // 2 tokens/sec with a 100ms window yields 0 tokens per window, which forces the
+    // low-rate (per-token scheduling) path in IOBudget.
+    final IOBudget lowRateBudget = new IOBudget(2, 100);
+    final long start = System.currentTimeMillis();
+    lowRateBudget.consume(1);
+    final long afterFirst = System.currentTimeMillis();
+    lowRateBudget.consume(1);
+    final long elapsedSecondTokenMs = System.currentTimeMillis() - afterFirst;
+
+    // At 2 tokens/sec, second token should require waiting roughly 500ms.
+    assertTrue("Expected low-rate budget to throttle second token, elapsed=" + elapsedSecondTokenMs,
+      elapsedSecondTokenMs >= 300);
+    assertTrue("Unexpectedly long low-rate throttle delay, elapsed=" + elapsedSecondTokenMs,
+      elapsedSecondTokenMs < 3000);
+    assertTrue("Clock sanity check", afterFirst >= start);
+  }
+
+  @Test
+  public void testRegularWindowModeStillThrottles() {
+    // 100 tokens/sec with 100ms windows -> 10 tokens/window, so a 15-token request must block.
+    final IOBudget windowedBudget = new IOBudget(100, 100);
+    final long sleptMs = windowedBudget.consume(15);
+    assertTrue("Expected consume to sleep when exceeding window budget", sleptMs > 0);
+  }
+}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/devsim/ThrottledBlockInputStream.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/devsim/ThrottledBlockInputStream.java
new file mode 100644
index 000000000000..268fb93f9028
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/devsim/ThrottledBlockInputStream.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.devsim;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Throttling wrapper around the block data {@link InputStream} handed out by the DataNode's
+ * {@code getBlockInputStream}. Every byte delivered to the caller is charged against the owning
+ * volume's {@link EBSVolumeDevice} BW/IOPS budgets, and the current read position is maintained
+ * so the volume can detect sequential IO and coalesce IOPS charges.
+ *
+ * When a per-DataNode instance-level BW budget is configured, reads are charged there as well.
+ */
+public class ThrottledBlockInputStream extends InputStream {
+
+  private final InputStream wrapped;
+  private final EBSVolumeDevice vol;
+  private final EBSDevice.DataNodeContext ctx;
+  private long pos;
+
+  /**
+   * @param delegate the real block input stream from FsDatasetImpl
+   * @param volume the EBS volume device this block resides on
+   * @param dnContext the DataNode context for instance-level BW budget (may be null)
+   * @param offset the initial seek offset into the block
+   */
+  public ThrottledBlockInputStream(InputStream delegate, EBSVolumeDevice volume,
+    EBSDevice.DataNodeContext dnContext, long offset) {
+    this.wrapped = delegate;
+    this.vol = volume;
+    this.ctx = dnContext;
+    this.pos = offset;
+  }
+
+  /** Single-byte read; one byte is charged when data (not EOF) is returned. */
+  @Override
+  public int read() throws IOException {
+    final int value = wrapped.read();
+    if (value < 0) {
+      return value; // EOF: nothing transferred, nothing to charge
+    }
+    charge(1);
+    return value;
+  }
+
+  /** Bulk read; charges exactly the number of bytes actually returned. */
+  @Override
+  public int read(byte[] buf, int off, int len) throws IOException {
+    final int n = wrapped.read(buf, off, len);
+    if (n > 0) {
+      charge(n);
+    }
+    return n;
+  }
+
+  /** A skip breaks the sequential pattern, so flush pending coalesced IOPS first. */
+  @Override
+  public long skip(long n) throws IOException {
+    final long advanced = wrapped.skip(n);
+    if (advanced > 0) {
+      vol.flushPendingReadIops(this);
+      pos += advanced;
+    }
+    return advanced;
+  }
+
+  @Override
+  public int available() throws IOException {
+    return wrapped.available();
+  }
+
+  /** Flushes any pending IOPS charges for this stream before closing the delegate. */
+  @Override
+  public void close() throws IOException {
+    vol.flushPendingReadIops(this);
+    wrapped.close();
+  }
+
+  /** Charge {@code bytes} to the volume budgets (and optional instance budget), advance pos. */
+  private void charge(int bytes) {
+    vol.accountRead(this, pos, bytes);
+    pos += bytes;
+    if (ctx != null) {
+      ctx.consumeInstanceBw(bytes);
+    }
+  }
+}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/devsim/ThrottledFsDataset.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/devsim/ThrottledFsDataset.java
new file mode 100644
index 000000000000..81478685bb6d
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/devsim/ThrottledFsDataset.java
@@ -0,0 +1,310 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.devsim;
+
+import java.io.InputStream;
+import java.lang.reflect.InvocationHandler;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.lang.reflect.Proxy;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
+import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
+import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Dynamic proxy wrapping a real {@code FsDatasetImpl} to apply EBS volume emulation at the DataNode
+ * storage level. One proxy is created per DataNode by {@link ThrottledFsDatasetFactory}.
+ *
+ * Models the Linux ext4 page cache behavior: application writes go to the page cache with no EBS
+ * charge. EBS is engaged only when dirty pages are flushed to the block device, which happens at
+ * two interception points on {@code FsDatasetSpi}: the {@code sync_file_range} intercept (which
+ * charges intermediate flushes) and the {@code finalizeBlock} intercept (which charges the
+ * remaining bytes at the closing fsync).
+ *
+ * All other methods are delegated transparently to the inner {@code FsDatasetImpl}.
+ */
+public final class ThrottledFsDataset {
+
+ private static final Logger LOG = LoggerFactory.getLogger(ThrottledFsDataset.class);
+
+ private ThrottledFsDataset() {
+ }
+
+ /**
+ * Create a dynamic proxy wrapping the given {@code FsDatasetSpi} delegate with EBS throttling.
+ * @param delegate the real FsDatasetImpl
+ * @param dnId DataNode identifier for metrics registration
+ * @param conf Hadoop configuration with EBS parameters
+ * @return a proxy implementing FsDatasetSpi with throttling
+ */
+ @SuppressWarnings("unchecked")
+ public static
+ * We charge {@code nbytes} against the volume's BW and IOPS budgets here, and track the
+ * cumulative synced bytes per block so that {@code finalizeBlock} only charges the remaining
+ * unflushed delta.
+ */
+    private Object handleSyncFileRange(Method method, Object[] args) throws Throwable {
+      // args layout: (ExtendedBlock, ReplicaOutputStreams, long offset, long nbytes, int flags)
+      ExtendedBlock block = (ExtendedBlock) args[0];
+      long flushedBytes = 0L;
+      if (args[3] instanceof Number) {
+        flushedBytes = ((Number) args[3]).longValue();
+      }
+      EBSDevice.recordWriteIntercept();
+      EBSVolumeDevice volume = resolveVolume(block);
+      if (volume == null) {
+        EBSDevice.recordUnresolvedVolume();
+      } else if (flushedBytes > 0) {
+        // Charge the flushed range now and remember how much of this block has already
+        // been synced, so finalizeBlock only charges the remaining delta.
+        volume.accountBulkWrite(flushedBytes);
+        dnContext.consumeInstanceBw(flushedBytes);
+        syncedBytesPerBlock.merge(block.getBlockId(), flushedBytes, Long::sum);
+      }
+      return delegateInvoke(method, args);
+    }
+
+    /**
+     * Intercepts block finalization, which in the DataNode follows immediately after
+     * {@code BlockReceiver.close()} (which includes {@code syncDataOut()} / fsync). Charges only
+     * the remaining bytes not already flushed via {@code sync_file_range} calls.
+     */
+    private Object handleFinalizeBlock(Method method, Object[] args) throws Throwable {
+      ExtendedBlock block = (ExtendedBlock) args[0];
+      EBSDevice.recordWriteIntercept();
+      EBSVolumeDevice vol = resolveVolume(block);
+      // The block is finalized either way, so always drop its sync-tracking entry here;
+      // removing it only on the resolved-volume path would leak one map entry per block
+      // whenever the volume cannot be resolved at finalize time.
+      Long removed = syncedBytesPerBlock.remove(block.getBlockId());
+      long alreadySynced = removed != null ? removed : 0L;
+      if (vol != null) {
+        long remaining = Math.max(0, block.getNumBytes() - alreadySynced);
+        if (remaining > 0) {
+          vol.accountBulkWrite(remaining);
+          dnContext.consumeInstanceBw(remaining);
+        }
+      } else {
+        EBSDevice.recordUnresolvedVolume();
+      }
+      return delegateInvoke(method, args);
+    }
+
+    /**
+     * Maps a block to its simulated EBS volume via the delegate dataset. Returns {@code null}
+     * when the block's storage cannot be resolved (replica gone, or the lookup throws).
+     */
+    private EBSVolumeDevice resolveVolume(ExtendedBlock block) {
+      try {
+        FsVolumeSpi fsVolume = delegate.getVolume(block);
+        return fsVolume == null ? null : storageIdToVolume.get(fsVolume.getStorageID());
+      } catch (Exception e) {
+        LOG.debug("Could not resolve volume for block {}", block, e);
+        return null;
+      }
+    }
+
+    /**
+     * Invokes {@code method} on the wrapped dataset, unwrapping reflection's
+     * {@link InvocationTargetException} so callers observe the delegate's original exception.
+     */
+    private Object delegateInvoke(Method method, Object[] args) throws Throwable {
+      try {
+        return method.invoke(delegate, args);
+      } catch (InvocationTargetException e) {
+        // getCause() can be null in principle; never replace the real failure with an NPE.
+        Throwable cause = e.getCause();
+        throw cause != null ? cause : e;
+      }
+    }
+ }
+}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/devsim/ThrottledFsDatasetFactory.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/devsim/ThrottledFsDatasetFactory.java
new file mode 100644
index 000000000000..595afa82ccc0
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/devsim/ThrottledFsDatasetFactory.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.devsim;
+
+import java.io.IOException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.server.datanode.DataNode;
+import org.apache.hadoop.hdfs.server.datanode.DataStorage;
+import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Factory that creates {@link ThrottledFsDataset} proxies wrapping the standard
+ * {@code FsDatasetImpl}. Configured via:
+ *
+ *
+ *
+ * Cumulative synced bytes are tracked per block to avoid double-counting between the intermediate
+ * sync_file_range charges and the final fsync charge.
+ *
+ * conf.set("dfs.datanode.fsdataset.factory", ThrottledFsDatasetFactory.class.getName());
+ *
+ *
+ * Hadoop calls {@link #newInstance} once per DataNode during startup. Each invocation creates an
+ * independent proxy with its own set of {@link EBSVolumeDevice} instances.
+ */
+@SuppressWarnings("rawtypes")
+public class ThrottledFsDatasetFactory extends FsDatasetSpi.Factory {
+
+ private static final Logger LOG = LoggerFactory.getLogger(ThrottledFsDatasetFactory.class);
+
+ private static final String DEFAULT_FACTORY_CLASS =
+ "org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetFactory";
+
+ @Override
+ @SuppressWarnings("unchecked")
+ public FsDatasetSpi newInstance(DataNode datanode, DataStorage storage, Configuration conf)
+ throws IOException {
+ FsDatasetSpi.Factory defaultFactory = loadDefaultFactory();
+ FsDatasetSpi> inner = defaultFactory.newInstance(datanode, storage, conf);
+ String dnId = datanode.getDatanodeId() != null
+ ? datanode.getDatanodeId().getDatanodeUuid()
+ : "DN-" + System.identityHashCode(datanode);
+ LOG.info("ThrottledFsDatasetFactory: creating EBS device layer for DataNode {}", dnId);
+ return ThrottledFsDataset.wrap(inner, dnId, conf);
+ }
+
+ @SuppressWarnings("unchecked")
+ private static FsDatasetSpi.Factory loadDefaultFactory() throws IOException {
+ try {
+ Class> clazz = Class.forName(DEFAULT_FACTORY_CLASS);
+ return (FsDatasetSpi.Factory) clazz.getDeclaredConstructor().newInstance();
+ } catch (Exception e) {
+ throw new IOException("Failed to load default FsDataset factory: " + DEFAULT_FACTORY_CLASS,
+ e);
+ }
+ }
+}