diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/system/SystemMetrics.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/system/SystemMetrics.java index e14d8910791a1..a1d8f8fa879b5 100644 --- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/system/SystemMetrics.java +++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/system/SystemMetrics.java @@ -41,6 +41,8 @@ import java.io.InputStreamReader; import java.lang.management.ManagementFactory; import java.nio.file.FileStore; +import java.util.ArrayList; +import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; @@ -60,7 +62,8 @@ public class SystemMetrics implements IMetricSet { static final String SYSTEM = "system"; private final com.sun.management.OperatingSystemMXBean osMxBean; - private Set fileStores = new HashSet<>(); + private volatile Set fileStores = new HashSet<>(); + private volatile List diskDirs = Collections.emptyList(); private static final String FAILED_TO_STATISTIC = "Failed to statistic the size of {}, because"; public SystemMetrics() { @@ -72,6 +75,7 @@ public void setDiskDirs(List diskDirs) { .getMetricConfig() .getMetricLevel() .equals(MetricLevel.OFF)) { + this.diskDirs = new ArrayList<>(diskDirs); this.fileStores = getFileStores(diskDirs); } } @@ -334,39 +338,62 @@ private void removeSystemDiskInfo(AbstractMetricService metricService) { } public long getSystemDiskTotalSpace() { - long sysTotalSpace = 0L; - for (FileStore fileStore : fileStores) { - try { - sysTotalSpace += fileStore.getTotalSpace(); - } catch (IOException e) { - logger.error(FAILED_TO_STATISTIC, fileStore, e); - } - } - return sysTotalSpace; + return collectDiskSpace(FileStore::getTotalSpace); } public long getSystemDiskFreeSpace() { - long sysFreeSpace = 0L; - for (FileStore fileStore : fileStores) { - try { - sysFreeSpace += fileStore.getUnallocatedSpace(); - } catch (IOException e) { - logger.error(FAILED_TO_STATISTIC, fileStore, e); - } - } - return sysFreeSpace; + return collectDiskSpace(FileStore::getUnallocatedSpace); } public long getSystemDiskAvailableSpace() { - long sysAvailableSpace = 0L; - for (FileStore fileStore : fileStores) { - try { - sysAvailableSpace += fileStore.getUsableSpace(); - } catch (IOException e) { - logger.error(FAILED_TO_STATISTIC, fileStore, e); + return collectDiskSpace(FileStore::getUsableSpace); + } + + /** + * Sum up a disk-space metric across all cached {@link FileStore}s. + * + *

A cached {@code FileStore} pins the exact path it was resolved from. That path can be + * removed while IoTDB is running (for example an empty data region directory deleted during + * region migration), after which every space query against the stale {@code FileStore} throws + * {@link java.nio.file.NoSuchFileException}. When that happens we re-resolve the {@code + * FileStore}s once: {@link org.apache.iotdb.metrics.utils.FileStoreUtils#getFileStore} walks up + * to an existing ancestor directory on the same device, so the metric recovers on the next + * sampling instead of flooding the log with errors on every heartbeat. + */ + private long collectDiskSpace(DiskSpaceReader reader) { + boolean refreshed = false; + while (true) { + Set currentFileStores = fileStores; + long space = 0L; + boolean stale = false; + for (FileStore fileStore : currentFileStores) { + try { + space += reader.read(fileStore); + } catch (IOException e) { + stale = true; + if (refreshed) { + // Still failing after re-resolving: log once at warn level (instead of error on every + // sampling) and skip this file store to keep the metric best-effort. + logger.warn(FAILED_TO_STATISTIC, fileStore, e); + } + } + } + if (!stale || refreshed) { + return space; } + refreshFileStores(); + refreshed = true; } - return sysAvailableSpace; + } + + /** Re-resolve the cached {@link FileStore}s from the configured disk dirs. */ + private synchronized void refreshFileStores() { + this.fileStores = getFileStores(diskDirs); + } + + @FunctionalInterface + private interface DiskSpaceReader { + long read(FileStore fileStore) throws IOException; } public static SystemMetrics getInstance() { diff --git a/iotdb-core/metrics/interface/src/test/java/org/apache/iotdb/metrics/metricsets/system/SystemMetricsTest.java b/iotdb-core/metrics/interface/src/test/java/org/apache/iotdb/metrics/metricsets/system/SystemMetricsTest.java new file mode 100644 index 0000000000000..42d65173ea542 --- /dev/null +++ b/iotdb-core/metrics/interface/src/test/java/org/apache/iotdb/metrics/metricsets/system/SystemMetricsTest.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.metrics.metricsets.system; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.io.File; +import java.util.Collections; + +import static org.junit.Assert.assertTrue; + +public class SystemMetricsTest { + + private File tempDir; + + @Before + public void setUp() { + tempDir = new File("target", "system-metrics-test-" + System.nanoTime()); + File dataDir = new File(tempDir, "data"); + assertTrue(dataDir.mkdirs()); + } + + @After + public void tearDown() { + if (tempDir != null) { + // Best-effort cleanup; the data subdir may already be gone after the test. + new File(tempDir, "data").delete(); + tempDir.delete(); + } + } + + /** + * Regression test for the case where the directory backing a cached {@link + * java.nio.file.FileStore} is removed while IoTDB is running (e.g. an empty data region directory + * deleted during region migration). The disk-space metrics must recover by re-resolving the file + * stores instead of throwing / permanently returning the stale value. + */ + @Test + public void testDiskSpaceRecoversWhenBackingDirIsRemoved() { + SystemMetrics systemMetrics = new SystemMetrics(); + File dataDir = new File(tempDir, "data"); + systemMetrics.setDiskDirs(Collections.singletonList(dataDir.getAbsolutePath())); + + // Sanity check: the file store resolves to a real device with a positive size. + assertTrue(systemMetrics.getSystemDiskTotalSpace() > 0L); + assertTrue(systemMetrics.getSystemDiskFreeSpace() > 0L); + assertTrue(systemMetrics.getSystemDiskAvailableSpace() > 0L); + + // The directory the FileStore was pinned to is removed; its parent still lives on the same + // device. The metrics should re-resolve the file store and keep reporting a positive size + // rather than flooding the log with NoSuchFileException. + assertTrue(dataDir.delete()); + + assertTrue(systemMetrics.getSystemDiskTotalSpace() > 0L); + assertTrue(systemMetrics.getSystemDiskFreeSpace() > 0L); + assertTrue(systemMetrics.getSystemDiskAvailableSpace() > 0L); + } +}