From 1a1f8ff653c2a813015350231d7e48b9cd1aacb6 Mon Sep 17 00:00:00 2001
From: James Peru
Date: Tue, 28 Apr 2026 11:40:41 +0300
Subject: [PATCH] fix(linstor): pre-flight check destination is a LINSTOR satellite before live migration

LinstorDataMotionStrategy.copyAsync would call createResource on the
destination pool's controller without first verifying that the
destination KVM host is registered as a LINSTOR satellite there. Two
failure modes:

1. The resource group's auto-placement filter happens to match a
   different node (a registered satellite that is NOT the migration
   destination), and the resource is silently created on the wrong
   node. The subsequent migrate then fails because the destination KVM
   host has no DRBD device for the resource.

2. The auto-placement filter has no candidates and the LINSTOR API
   returns an opaque error. The operator has to correlate the migration
   failure with an unrelated controller log entry to understand what
   happened.

This change adds verifyDestinationIsLinstorSatellite(), called near the
top of copyAsync (right after the hypervisor-type check). For each
LINSTOR-typed destination pool it:

- fetches the controller's node list via LinstorUtil.getLinstorNodeNames
- throws CloudRuntimeException with a clear, actionable message (listing
  the known satellites) if destHost.getName() is missing from that list
- logs a warning and skips the check on transient controller errors, so
  a network blip against the controller doesn't block an otherwise
  valid migration

Non-LINSTOR destination pools in the volumeDataStoreMap are skipped
(mixed-storage migrations are unaffected).
---
 .../motion/LinstorDataMotionStrategy.java     | 61 +++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/motion/LinstorDataMotionStrategy.java b/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/motion/LinstorDataMotionStrategy.java
index cab2820f09ae..aa450a8290fb 100644
--- a/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/motion/LinstorDataMotionStrategy.java
+++ b/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/motion/LinstorDataMotionStrategy.java
@@ -314,6 +314,58 @@ private boolean needsExactSizeProp(VolumeInfo srcVolumeInfo) {
         return true;
     }
 
+    /**
+     * Verify that the destination KVM host is a registered LINSTOR satellite on the controller
+     * backing every LINSTOR destination pool of this migration, so a bad target fails fast
+     * instead of failing obscurely inside auto-placement. Best-effort: a controller error or
+     * null node list for one pool is logged and only that pool is skipped.
+     *
+     * @param volumeDataStoreMap volumes to migrate, mapped to their destination data stores
+     * @param destHost migration destination; a null host or host name disables the check
+     * @throws CloudRuntimeException if a controller's node list does not contain the host name
+     */
+    private void verifyDestinationIsLinstorSatellite(Map<VolumeInfo, DataStore> volumeDataStoreMap, Host destHost) {
+        if (destHost == null || destHost.getName() == null) {
+            // Without a destination host name to match, the only sensible thing is to let the
+            // existing flow run and report whatever it would have reported.
+            return;
+        }
+        for (Map.Entry<VolumeInfo, DataStore> entry : volumeDataStoreMap.entrySet()) {
+            DataStore destDataStore = entry.getValue();
+            StoragePoolVO destStoragePool = _storagePool.findById(destDataStore.getId());
+            if (destStoragePool == null
+                    || destStoragePool.getPoolType() != Storage.StoragePoolType.Linstor) {
+                continue;
+            }
+            DevelopersApi api = LinstorUtil.getLinstorAPI(destStoragePool.getHostAddress());
+            try {
+                List<String> nodes = LinstorUtil.getLinstorNodeNames(api);
+                if (nodes == null) {
+                    logger.warn("LINSTOR controller {} returned null node list; skipping pre-flight",
+                            destStoragePool.getHostAddress());
+                    continue;
+                }
+                if (!nodes.contains(destHost.getName())) {
+                    throw new CloudRuntimeException(String.format(
+                            "Cannot migrate to host '%s': it is not a registered LINSTOR satellite on "
+                                    + "controller %s (pool '%s'). Known satellites: %s. Either register the "
+                                    + "host with `linstor node create` or pick a different destination.",
+                            destHost.getName(),
+                            destStoragePool.getHostAddress(),
+                            destStoragePool.getName(),
+                            nodes));
+                }
+            } catch (ApiException apiEx) {
+                // Don't block migration on a transient controller hiccup — skip this pool only,
+                // so that pools backed by other controllers are still verified.
+                logger.warn("LINSTOR pre-flight check could not contact controller {}: {}; "
+                                + "letting downstream resource creation proceed",
+                        destStoragePool.getHostAddress(), apiEx.getBestMessage());
+                continue;
+            }
+        }
+    }
+
     @Override
     public void copyAsync(Map<VolumeInfo, DataStore> volumeDataStoreMap, VirtualMachineTO vmTO, Host srcHost,
                           Host destHost, AsyncCompletionCallback<CopyCommandResult> callback) {
@@ -323,6 +375,15 @@ public void copyAsync(Map<VolumeInfo, DataStore> volumeDataStoreMap, VirtualMach
                     String.format("Invalid hypervisor type [%s]. Only KVM supported", srcHost.getHypervisorType()));
         }
 
+        // Pre-flight: verify the destination KVM host is registered as a satellite on the
+        // LINSTOR controller backing each destination pool. Without this check, resource
+        // creation falls through to the resource-group's auto-placement filters and may
+        // either silently place the resource on the wrong node or fail with an opaque
+        // auto-place error from the LINSTOR API. Failing fast here gives operators a clear
+        // actionable message instead of having to correlate the live-migration failure with
+        // an unrelated LINSTOR controller log entry.
+        verifyDestinationIsLinstorSatellite(volumeDataStoreMap, destHost);
+
         String errMsg = null;
         VMInstanceVO vmInstance = _vmDao.findById(vmTO.getId());
         vmTO.setState(vmInstance.getState());