diff --git a/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStorageAdaptor.java b/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStorageAdaptor.java index 2b11c83c8021..01e4eee470b0 100644 --- a/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStorageAdaptor.java +++ b/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStorageAdaptor.java @@ -514,6 +514,17 @@ private boolean deRefOrDeleteResource(DevelopersApi api, String rscName, String ApiCallRcList answers = api.resourceDefinitionDelete(rd.getName()); checkLinstorAnswersThrow(answers); deleted = true; + + // LINSTOR can return success here while the resource lingers in DELETING state + // on the controller (down peer, lost quorum, etc.). Confirm it's actually gone + // — if not, log a WARN so operators can clear it manually. Don't throw: the + // CloudStack-side accounting has already moved on. + if (!LinstorUtil.waitForResourceDefinitionDeleted(api, rd.getName(), + LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS)) { + logger.warn("Linstor: resource {} still present {}ms after delete returned success — " + + "may be stuck in DELETING. Check the LINSTOR controller (linstor resource list).", + rd.getName(), LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS); + } } } return deleted; diff --git a/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/driver/LinstorPrimaryDataStoreDriverImpl.java b/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/driver/LinstorPrimaryDataStoreDriverImpl.java index 3f06bee8ac83..e68ec45e8224 100644 --- a/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/driver/LinstorPrimaryDataStoreDriverImpl.java +++ b/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/driver/LinstorPrimaryDataStoreDriverImpl.java @@ -230,6 +230,20 @@ private void deleteResourceDefinition(StoragePoolVO storagePoolVO, String rscDef throw new CloudRuntimeException("Linstor: Unable to delete resource definition: " + rscDefName); } logger.info("Linstor: Deleted resource {}", rscDefName); + + // LINSTOR can return success on the delete API call while the resource lingers in + // DELETING state (peer issues, lost quorum, satellite down). Verify the resource is + // actually gone — if not, log a WARN so operators see it. We deliberately do NOT + // throw here: the volume is already considered gone on the CloudStack side, and + // throwing would leave the CS DB and LINSTOR in different states. + if (!LinstorUtil.waitForResourceDefinitionDeleted(linstorApi, rscDefName, + LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS)) + { + logger.warn("Linstor: resource {} still present {}ms after delete returned success — " + + "may be stuck in DELETING. Check the LINSTOR controller (linstor resource list) " + + "and clear manually if the resource has no live peers.", + rscDefName, LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS); + } } catch (ApiException apiEx) { logger.error("Linstor: ApiEx - " + apiEx.getMessage()); diff --git a/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/util/LinstorUtil.java b/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/util/LinstorUtil.java index 7c45493dddc4..d64c3b64c68a 100644 --- a/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/util/LinstorUtil.java +++ b/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/util/LinstorUtil.java @@ -401,6 +401,57 @@ public static List getRDListStartingWith(DevelopersApi api, .collect(Collectors.toList()); } + /** + * Default per-call timeout for {@link #waitForResourceDefinitionDeleted}. Long enough for a + * healthy LINSTOR controller to finish a normal delete; short enough not to block the calling + * agent thread for too long if the delete is genuinely stuck. + */ + public static final long DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS = 30_000L; + + /** + * Returns {@code true} if the named resource definition is no longer present on the LINSTOR + * controller. Used after a {@code resourceDefinitionDelete} to verify the delete actually + * completed (LINSTOR can return success on the API call while the resource lingers in + * DELETING state due to peer issues, lost quorum, or down satellites). + */ + public static boolean isResourceDefinitionGone(DevelopersApi api, String rscName) throws ApiException { + List all = api.resourceDefinitionList(null, false, null, null, null); + if (all == null) { + return true; + } + return all.stream().noneMatch(rd -> rscName.equalsIgnoreCase(rd.getName())); + } + + /** + * Polls the controller until the named resource definition is gone or the timeout elapses. + * Returns {@code true} if the resource was confirmed gone, {@code false} if it was still + * present (or the controller kept erroring) at the deadline. Callers should NOT throw on a + * {@code false} return — the upstream API call already reported success and the operator + * may need to investigate manually. Log a WARN with the resource name instead. + */ + public static boolean waitForResourceDefinitionDeleted(DevelopersApi api, String rscName, long timeoutMillis) { + final long deadline = System.currentTimeMillis() + timeoutMillis; + while (true) { + try { + if (isResourceDefinitionGone(api, rscName)) { + return true; + } + } catch (ApiException e) { + LOGGER.debug("LINSTOR delete-verify poll failed for {}: {}", rscName, e.getMessage()); + // Keep polling — controller may be transiently unavailable. + } + if (System.currentTimeMillis() >= deadline) { + return false; + } + try { + Thread.sleep(1_000L); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + return false; + } + } + } + /** * Returns a pair list of resource-definitions with ther 1:1 mapped resource-group objects that start with the * resource name `startWith`