From ad8a4c51107b47225e4562b6dc805fdbb207d44d Mon Sep 17 00:00:00 2001 From: "yingzhe.hu" Date: Mon, 27 Apr 2026 14:51:39 +0800 Subject: [PATCH] [kvm]: decouple TLS cert detection from libvirtd restart toggle Resolves: ZSTAC-84446 Change-Id: I9bed31c0cefddd6ed11f59cd13e36eb1c2abc029 --- .../src/main/java/org/zstack/kvm/KVMHost.java | 99 +++++++++++-------- .../java/org/zstack/kvm/KVMHostUtils.java | 15 +++ .../org/zstack/test/kvm/KVMHostUtilsTest.java | 29 ++++++ 3 files changed, 101 insertions(+), 42 deletions(-) diff --git a/plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java b/plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java index 5052ba7aa8..b485c343d1 100755 --- a/plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java +++ b/plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java @@ -5716,54 +5716,61 @@ public void run(FlowTrigger trigger, Map data) { @Override public boolean skip(Map data) { + // ZSTAC-84446: run detection whenever TLS is enabled so check + // and first-deploy share the same IP source. return CoreGlobalProperty.UNIT_TEST_ON - || !KVMGlobalConfig.LIBVIRT_TLS_ENABLED.value(Boolean.class) - || !rcf.getResourceConfigValue( - KVMGlobalConfig.RECONNECT_HOST_RESTART_LIBVIRTD_SERVICE, - self.getUuid(), Boolean.class); + || !KVMGlobalConfig.LIBVIRT_TLS_ENABLED.value(Boolean.class); } @Override public void run(FlowTrigger trigger, Map data) { - String managementIp = getSelf().getManagementIp(); - - SshShell sshShell = new SshShell(); - sshShell.setHostname(managementIp); - sshShell.setUsername(getSelf().getUsername()); - sshShell.setPassword(getSelf().getPassword()); - sshShell.setPort(getSelf().getPort()); - - // Use the same logic as zstack-utility host_plugin.fact() so MN's - // expectation matches what the host itself reports. - String certIpList = KVMHostUtils.collectHostIps( - sshShell, self.getUuid(), managementIp); - List allIps = new ArrayList<>(Arrays.asList(certIpList.split(","))); - // Save detected IPs so apply-ansible-playbook can union with EXTRA_IPS - // without running a second SSH. - data.put("TLS_DETECTED_IPS", certIpList); - - SshResult sanResult = sshShell.runCommand( - "openssl x509 -in /etc/pki/libvirt/servercert.pem -noout -ext subjectAltName 2>/dev/null"); - - boolean needDeploy = false; - if (sanResult.isSshFailure() || sanResult.getReturnCode() != 0 - || sanResult.getStdout() == null || sanResult.getStdout().trim().isEmpty()) { - // cert doesn't exist or can't be read - logger.info(String.format("TLS cert not found or unreadable on host[uuid:%s], need deploy", self.getUuid())); - needDeploy = true; - } else { - Set sanIps = parseSanIps(sanResult.getStdout()); - for (String ip : allIps) { - if (!sanIps.contains(ip)) { - logger.info(String.format("TLS cert SAN missing IP %s on host[uuid:%s], need deploy", ip, self.getUuid())); - needDeploy = true; - break; + // ZSTAC-84446: detection is best-effort. SSH failures must NOT + // break reconnect; on error we skip and let the deploy step + // fall back to mgmtIp + EXTRA_IPS. + try { + String managementIp = getSelf().getManagementIp(); + + SshShell sshShell = new SshShell(); + sshShell.setHostname(managementIp); + sshShell.setUsername(getSelf().getUsername()); + sshShell.setPassword(getSelf().getPassword()); + sshShell.setPort(getSelf().getPort()); + + // Same logic as zstack-utility host_plugin.fact() so MN's + // expectation matches what the host itself reports. + String certIpList = KVMHostUtils.collectHostIps( + sshShell, self.getUuid(), managementIp); + List allIps = new ArrayList<>(Arrays.asList(certIpList.split(","))); + // Save detected IPs so apply-ansible-playbook can union with + // EXTRA_IPS without running a second SSH. + data.put("TLS_DETECTED_IPS", certIpList); + + SshResult sanResult = sshShell.runCommand( + "openssl x509 -in /etc/pki/libvirt/servercert.pem -noout -ext subjectAltName 2>/dev/null"); + + boolean needDeploy = false; + if (sanResult.isSshFailure() || sanResult.getReturnCode() != 0 + || sanResult.getStdout() == null || sanResult.getStdout().trim().isEmpty()) { + logger.info(String.format("TLS cert not found or unreadable on host[uuid:%s], need deploy", self.getUuid())); + needDeploy = true; + } else { + Set sanIps = parseSanIps(sanResult.getStdout()); + for (String ip : allIps) { + if (!sanIps.contains(ip)) { + logger.info(String.format("TLS cert SAN missing IP %s on host[uuid:%s], need deploy", ip, self.getUuid())); + needDeploy = true; + break; + } } } - } - if (needDeploy) { - data.put("NEED_DEPLOY_TLS_CERT", true); + if (needDeploy) { + data.put("NEED_DEPLOY_TLS_CERT", true); + } + } catch (Exception e) { + logger.warn(String.format( + "TLS cert detection failed on host[uuid:%s], continue connect flow: %s", + self.getUuid(), e.getMessage()), e); } trigger.next(); @@ -5910,11 +5917,19 @@ public void run(final FlowTrigger trigger, Map data) { self.getUuid(), managementIp, detectedIps); deployArguments.setTlsCertIps(tlsCertIps); - // Force ansible deploy when TLS cert needs update (detected by check-tls-certs flow) + // ZSTAC-84446: force ansible re-run only when policy allows; + // see KVMHostUtils#shouldForceTlsRedeploy. Boolean needDeployTlsCert = (Boolean) data.get("NEED_DEPLOY_TLS_CERT"); - if (Boolean.TRUE.equals(needDeployTlsCert)) { + boolean allowRestart = rcf.getResourceConfigValue( + KVMGlobalConfig.RECONNECT_HOST_RESTART_LIBVIRTD_SERVICE, + self.getUuid(), Boolean.class); + if (KVMHostUtils.shouldForceTlsRedeploy( + Boolean.TRUE.equals(needDeployTlsCert), allowRestart, info.isNewAdded())) { runner.setForceRun(true); deployArguments.setRestartLibvirtd("true"); + } else if (Boolean.TRUE.equals(needDeployTlsCert)) { + logger.info(String.format("TLS cert needs deploy on host[uuid:%s], skip " + + "force-run to keep kvmagent PID stable", self.getUuid())); } if (deployArguments.isForceRun()) { diff --git a/plugin/kvm/src/main/java/org/zstack/kvm/KVMHostUtils.java b/plugin/kvm/src/main/java/org/zstack/kvm/KVMHostUtils.java index 01b42b4b66..6934d7112d 100644 --- a/plugin/kvm/src/main/java/org/zstack/kvm/KVMHostUtils.java +++ b/plugin/kvm/src/main/java/org/zstack/kvm/KVMHostUtils.java @@ -131,6 +131,21 @@ public static String collectHostIps(String hostUuid, String managementIp, return collectHostIps(newSsh(managementIp, username, password, sshPort), hostUuid, managementIp); } + /** + * ZSTAC-84446: force ansible re-run + libvirtd restart only when operator + * opted in (RECONNECT_HOST_RESTART_LIBVIRTD_SERVICE) or it's a fresh add + * (full-deploy will start libvirtd anyway). Skipping on plain reconnect + * keeps kvmagent PID stable. + */ + public static boolean shouldForceTlsRedeploy(boolean needDeployTlsCert, + boolean allowRestartLibvirtd, + boolean isNewAdded) { + if (!needDeployTlsCert) { + return false; + } + return allowRestartLibvirtd || isNewAdded; + } + private static SshShell newSsh(String host, String user, String pwd, int port) { SshShell s = new SshShell(); s.setHostname(host); diff --git a/test/src/test/java/org/zstack/test/kvm/KVMHostUtilsTest.java b/test/src/test/java/org/zstack/test/kvm/KVMHostUtilsTest.java index 2f70c327bc..5dc53aee89 100644 --- a/test/src/test/java/org/zstack/test/kvm/KVMHostUtilsTest.java +++ b/test/src/test/java/org/zstack/test/kvm/KVMHostUtilsTest.java @@ -169,4 +169,33 @@ public void zstac84446_deployUnionContainsAllDetected() { } Assert.assertTrue("deploy must include extra", deployIps.contains("10.0.0.7")); } + + // ----- shouldForceTlsRedeploy coverage (ZSTAC-84446) ----- + + /** needDeployTlsCert=false short-circuits regardless of other flags. */ + @Test + public void shouldForceTlsRedeploy_noNeedNeverForces() { + Assert.assertFalse(KVMHostUtils.shouldForceTlsRedeploy(false, true, true)); + Assert.assertFalse(KVMHostUtils.shouldForceTlsRedeploy(false, true, false)); + Assert.assertFalse(KVMHostUtils.shouldForceTlsRedeploy(false, false, true)); + Assert.assertFalse(KVMHostUtils.shouldForceTlsRedeploy(false, false, false)); + } + + /** Operator opted in: force-run allowed on reconnect. */ + @Test + public void shouldForceTlsRedeploy_allowRestartForces() { + Assert.assertTrue(KVMHostUtils.shouldForceTlsRedeploy(true, true, false)); + } + + /** First-add: full-deploy will start libvirtd, so force-run is safe. */ + @Test + public void shouldForceTlsRedeploy_newAddedForces() { + Assert.assertTrue(KVMHostUtils.shouldForceTlsRedeploy(true, false, true)); + } + + /** Reconnect + toggle off: must NOT force-run (would change kvmagent PID). */ + @Test + public void shouldForceTlsRedeploy_reconnectWithoutRestartSkips() { + Assert.assertFalse(KVMHostUtils.shouldForceTlsRedeploy(true, false, false)); + } }