From 21c6110b71b9811a261db688edc91c88e9632298 Mon Sep 17 00:00:00 2001 From: James Peru Date: Tue, 28 Apr 2026 11:31:16 +0300 Subject: [PATCH] fix(linstor): verify resource definition deletion completes; warn if stuck MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The LINSTOR plugin treats a successful HTTP response from resourceDefinitionDelete as proof the resource is gone, then drops the volume from CloudStack's accounting. In practice LINSTOR can return success while the resource lingers in DELETING state — for example when a DRBD peer is unreachable, quorum was lost, or a satellite is down. The plugin had no retry, no verification, and no sweeper. Operators have been finding hundreds of stuck DELETING resources accumulated over weeks because nothing surfaced the divergence between the CS view and the LINSTOR view. This change adds two helpers to LinstorUtil: isResourceDefinitionGone(api, rscName) - quick existence check via resourceDefinitionList waitForResourceDefinitionDeleted(api, rscName, timeoutMillis) - polls every second until the resource is gone OR timeout elapses - returns true on confirmed-gone, false on timeout and calls waitForResourceDefinitionDeleted from both delete sites (driver: LinstorPrimaryDataStoreDriverImpl.deleteResourceDefinition; adaptor: LinstorStorageAdaptor.deRefOrDeleteResource) with a 30s default timeout. On timeout the plugin logs a WARN with the resource name and a hint pointing at `linstor resource list`. We deliberately do NOT throw on timeout: the CS-side accounting has already moved on, and throwing would create a different inconsistency. This is the minimal Tier-1 fix that surfaces the problem in the operator's view. A follow-up could add a periodic sweeper that attempts force-delete on long-stuck DELETING resources. --- .../kvm/storage/LinstorStorageAdaptor.java | 11 ++++ .../LinstorPrimaryDataStoreDriverImpl.java | 14 +++++ .../storage/datastore/util/LinstorUtil.java | 51 +++++++++++++++++++ 3 files changed, 76 insertions(+) diff --git a/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStorageAdaptor.java b/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStorageAdaptor.java index 2b11c83c8021..01e4eee470b0 100644 --- a/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStorageAdaptor.java +++ b/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStorageAdaptor.java @@ -514,6 +514,17 @@ private boolean deRefOrDeleteResource(DevelopersApi api, String rscName, String ApiCallRcList answers = api.resourceDefinitionDelete(rd.getName()); checkLinstorAnswersThrow(answers); deleted = true; + + // LINSTOR can return success here while the resource lingers in DELETING state + // on the controller (down peer, lost quorum, etc.). Confirm it's actually gone + // — if not, log a WARN so operators can clear it manually. Don't throw: the + // CloudStack-side accounting has already moved on. + if (!LinstorUtil.waitForResourceDefinitionDeleted(api, rd.getName(), + LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS)) { + logger.warn("Linstor: resource {} still present {}ms after delete returned success — " + + "may be stuck in DELETING. Check the LINSTOR controller (linstor resource list).", + rd.getName(), LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS); + } } } return deleted; diff --git a/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/driver/LinstorPrimaryDataStoreDriverImpl.java b/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/driver/LinstorPrimaryDataStoreDriverImpl.java index 3f06bee8ac83..e68ec45e8224 100644 --- a/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/driver/LinstorPrimaryDataStoreDriverImpl.java +++ b/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/driver/LinstorPrimaryDataStoreDriverImpl.java @@ -230,6 +230,20 @@ private void deleteResourceDefinition(StoragePoolVO storagePoolVO, String rscDef throw new CloudRuntimeException("Linstor: Unable to delete resource definition: " + rscDefName); } logger.info("Linstor: Deleted resource {}", rscDefName); + + // LINSTOR can return success on the delete API call while the resource lingers in + // DELETING state (peer issues, lost quorum, satellite down). Verify the resource is + // actually gone — if not, log a WARN so operators see it. We deliberately do NOT + // throw here: the volume is already considered gone on the CloudStack side, and + // throwing would leave the CS DB and LINSTOR in different states. + if (!LinstorUtil.waitForResourceDefinitionDeleted(linstorApi, rscDefName, + LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS)) + { + logger.warn("Linstor: resource {} still present {}ms after delete returned success — " + + "may be stuck in DELETING. Check the LINSTOR controller (linstor resource list) " + + "and clear manually if the resource has no live peers.", + rscDefName, LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS); + } } catch (ApiException apiEx) { logger.error("Linstor: ApiEx - " + apiEx.getMessage()); diff --git a/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/util/LinstorUtil.java b/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/util/LinstorUtil.java index 7c45493dddc4..d64c3b64c68a 100644 --- a/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/util/LinstorUtil.java +++ b/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/util/LinstorUtil.java @@ -401,6 +401,57 @@ public static List getRDListStartingWith(DevelopersApi api, .collect(Collectors.toList()); } + /** + * Default per-call timeout for {@link #waitForResourceDefinitionDeleted}. Long enough for a + * healthy LINSTOR controller to finish a normal delete; short enough not to block the calling + * agent thread for too long if the delete is genuinely stuck. + */ + public static final long DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS = 30_000L; + + /** + * Returns {@code true} if the named resource definition is no longer present on the LINSTOR + * controller. Used after a {@code resourceDefinitionDelete} to verify the delete actually + * completed (LINSTOR can return success on the API call while the resource lingers in + * DELETING state due to peer issues, lost quorum, or down satellites). + */ + public static boolean isResourceDefinitionGone(DevelopersApi api, String rscName) throws ApiException { + List all = api.resourceDefinitionList(null, false, null, null, null); + if (all == null) { + return true; + } + return all.stream().noneMatch(rd -> rscName.equalsIgnoreCase(rd.getName())); + } + + /** + * Polls the controller until the named resource definition is gone or the timeout elapses. + * Returns {@code true} if the resource was confirmed gone, {@code false} if it was still + * present (or the controller kept erroring) at the deadline. Callers should NOT throw on a + * {@code false} return — the upstream API call already reported success and the operator + * may need to investigate manually. Log a WARN with the resource name instead. + */ + public static boolean waitForResourceDefinitionDeleted(DevelopersApi api, String rscName, long timeoutMillis) { + final long deadline = System.currentTimeMillis() + timeoutMillis; + while (true) { + try { + if (isResourceDefinitionGone(api, rscName)) { + return true; + } + } catch (ApiException e) { + LOGGER.debug("LINSTOR delete-verify poll failed for {}: {}", rscName, e.getMessage()); + // Keep polling — controller may be transiently unavailable. + } + if (System.currentTimeMillis() >= deadline) { + return false; + } + try { + Thread.sleep(1_000L); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + return false; + } + } + } + /** * Returns a pair list of resource-definitions with ther 1:1 mapped resource-group objects that start with the * resource name `startWith`