From 2e3563ef52a4775129bbb6670cad34319351c6d4 Mon Sep 17 00:00:00 2001 From: Yongzao <532741407@qq.com> Date: Fri, 12 Jun 2026 16:22:31 +0800 Subject: [PATCH] Re-resolve stale FileStore in SystemMetrics when its backing dir is removed A cached FileStore pins the exact path it was resolved from. When that path is deleted while IoTDB is running (e.g. an empty data region directory removed during region migration), every disk-space query against the stale FileStore throws NoSuchFileException, which was logged at ERROR on every heartbeat and flooded the DataNode log. Store the configured disk dirs and, when a space query fails, re-resolve the FileStores once via FileStoreUtils#getFileStore (which walks up to an existing ancestor on the same device) and retry the query, so the metric recovers within the same sampling. Remaining failures are logged at WARN instead of ERROR. --- .../metricsets/system/SystemMetrics.java | 79 +++++++++++++------ .../metricsets/system/SystemMetricsTest.java | 77 ++++++++++++++++++ 2 files changed, 130 insertions(+), 26 deletions(-) create mode 100644 iotdb-core/metrics/interface/src/test/java/org/apache/iotdb/metrics/metricsets/system/SystemMetricsTest.java diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/system/SystemMetrics.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/system/SystemMetrics.java index e14d8910791a1..a1d8f8fa879b5 100644 --- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/system/SystemMetrics.java +++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/system/SystemMetrics.java @@ -41,6 +41,8 @@ import java.io.InputStreamReader; import java.lang.management.ManagementFactory; import java.nio.file.FileStore; +import java.util.ArrayList; +import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; @@ -60,7 +62,8 @@ public class SystemMetrics implements IMetricSet { 
static final String SYSTEM = "system"; private final com.sun.management.OperatingSystemMXBean osMxBean; - private Set fileStores = new HashSet<>(); + private volatile Set fileStores = new HashSet<>(); + private volatile List diskDirs = Collections.emptyList(); private static final String FAILED_TO_STATISTIC = "Failed to statistic the size of {}, because"; public SystemMetrics() { @@ -72,6 +75,7 @@ public void setDiskDirs(List diskDirs) { .getMetricConfig() .getMetricLevel() .equals(MetricLevel.OFF)) { + this.diskDirs = new ArrayList<>(diskDirs); this.fileStores = getFileStores(diskDirs); } } @@ -334,39 +338,62 @@ private void removeSystemDiskInfo(AbstractMetricService metricService) { } public long getSystemDiskTotalSpace() { - long sysTotalSpace = 0L; - for (FileStore fileStore : fileStores) { - try { - sysTotalSpace += fileStore.getTotalSpace(); - } catch (IOException e) { - logger.error(FAILED_TO_STATISTIC, fileStore, e); - } - } - return sysTotalSpace; + return collectDiskSpace(FileStore::getTotalSpace); } public long getSystemDiskFreeSpace() { - long sysFreeSpace = 0L; - for (FileStore fileStore : fileStores) { - try { - sysFreeSpace += fileStore.getUnallocatedSpace(); - } catch (IOException e) { - logger.error(FAILED_TO_STATISTIC, fileStore, e); - } - } - return sysFreeSpace; + return collectDiskSpace(FileStore::getUnallocatedSpace); } public long getSystemDiskAvailableSpace() { - long sysAvailableSpace = 0L; - for (FileStore fileStore : fileStores) { - try { - sysAvailableSpace += fileStore.getUsableSpace(); - } catch (IOException e) { - logger.error(FAILED_TO_STATISTIC, fileStore, e); + return collectDiskSpace(FileStore::getUsableSpace); + } + + /** + * Sum up a disk-space metric across all cached {@link FileStore}s. + * + *

A cached {@code FileStore} pins the exact path it was resolved from. That path can be + * removed while IoTDB is running (for example an empty data region directory deleted during + * region migration), after which every space query against the stale {@code FileStore} throws + * {@link java.nio.file.NoSuchFileException}. When that happens we re-resolve the {@code + * FileStore}s once: {@link org.apache.iotdb.metrics.utils.FileStoreUtils#getFileStore} walks up + * to an existing ancestor directory on the same device, so the metric recovers on the next + * sampling instead of flooding the log with errors on every heartbeat. + */ + private long collectDiskSpace(DiskSpaceReader reader) { + boolean refreshed = false; + while (true) { + Set currentFileStores = fileStores; + long space = 0L; + boolean stale = false; + for (FileStore fileStore : currentFileStores) { + try { + space += reader.read(fileStore); + } catch (IOException e) { + stale = true; + if (refreshed) { + // Still failing after re-resolving: log once at warn level (instead of error on every + // sampling) and skip this file store to keep the metric best-effort. + logger.warn(FAILED_TO_STATISTIC, fileStore, e); + } + } + } + if (!stale || refreshed) { + return space; } + refreshFileStores(); + refreshed = true; } - return sysAvailableSpace; + } + + /** Re-resolve the cached {@link FileStore}s from the configured disk dirs. 
*/ + private synchronized void refreshFileStores() { + this.fileStores = getFileStores(diskDirs); + } + + @FunctionalInterface + private interface DiskSpaceReader { + long read(FileStore fileStore) throws IOException; } public static SystemMetrics getInstance() { diff --git a/iotdb-core/metrics/interface/src/test/java/org/apache/iotdb/metrics/metricsets/system/SystemMetricsTest.java b/iotdb-core/metrics/interface/src/test/java/org/apache/iotdb/metrics/metricsets/system/SystemMetricsTest.java new file mode 100644 index 0000000000000..42d65173ea542 --- /dev/null +++ b/iotdb-core/metrics/interface/src/test/java/org/apache/iotdb/metrics/metricsets/system/SystemMetricsTest.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.metrics.metricsets.system; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.io.File; +import java.util.Collections; + +import static org.junit.Assert.assertTrue; + +public class SystemMetricsTest { + + private File tempDir; + + @Before + public void setUp() { + tempDir = new File("target", "system-metrics-test-" + System.nanoTime()); + File dataDir = new File(tempDir, "data"); + assertTrue(dataDir.mkdirs()); + } + + @After + public void tearDown() { + if (tempDir != null) { + // Best-effort cleanup; the data subdir may already be gone after the test. + new File(tempDir, "data").delete(); + tempDir.delete(); + } + } + + /** + * Regression test for the case where the directory backing a cached {@link + * java.nio.file.FileStore} is removed while IoTDB is running (e.g. an empty data region directory + * deleted during region migration). The disk-space metrics must recover by re-resolving the file + * stores instead of throwing / permanently returning the stale value. + */ + @Test + public void testDiskSpaceRecoversWhenBackingDirIsRemoved() { + SystemMetrics systemMetrics = new SystemMetrics(); + File dataDir = new File(tempDir, "data"); + systemMetrics.setDiskDirs(Collections.singletonList(dataDir.getAbsolutePath())); + + // Sanity check: the file store resolves to a real device with a positive size. + assertTrue(systemMetrics.getSystemDiskTotalSpace() > 0L); + assertTrue(systemMetrics.getSystemDiskFreeSpace() > 0L); + assertTrue(systemMetrics.getSystemDiskAvailableSpace() > 0L); + + // The directory the FileStore was pinned to is removed; its parent still lives on the same + // device. The metrics should re-resolve the file store and keep reporting a positive size + // rather than flooding the log with NoSuchFileException. 
+ assertTrue(dataDir.delete()); + + assertTrue(systemMetrics.getSystemDiskTotalSpace() > 0L); + assertTrue(systemMetrics.getSystemDiskFreeSpace() > 0L); + assertTrue(systemMetrics.getSystemDiskAvailableSpace() > 0L); + } +}