diff --git a/ci-operator/step-registry/ipi/conf/vsphere/check/vcm/ipi-conf-vsphere-check-vcm-commands.sh b/ci-operator/step-registry/ipi/conf/vsphere/check/vcm/ipi-conf-vsphere-check-vcm-commands.sh index 1e72f8bac3051..e89eb067f776a 100755 --- a/ci-operator/step-registry/ipi/conf/vsphere/check/vcm/ipi-conf-vsphere-check-vcm-commands.sh +++ b/ci-operator/step-registry/ipi/conf/vsphere/check/vcm/ipi-conf-vsphere-check-vcm-commands.sh @@ -577,6 +577,68 @@ EOF log "Creating vsphere_context.sh file..." cp "${SHARED_DIR}/govc.sh" "${SHARED_DIR}/vsphere_context.sh" +log "Creating individual govc files for each pool..." +for _leaseJSON in "${SHARED_DIR}"/LEASE*; do + # Skip the single lease file - we already processed it + if [[ ${_leaseJSON} =~ "single" ]]; then + continue + fi + + # Get the number of pools in this lease's poolInfo array + pool_count=$(jq -r '.status.poolInfo | length' < "${_leaseJSON}") + + if [[ "${pool_count}" == "null" || "${pool_count}" -eq 0 ]]; then + log "No poolInfo found in ${_leaseJSON}, skipping" + continue + fi + + log "Processing ${pool_count} pool(s) from lease $(basename ${_leaseJSON})" + + # Iterate through each pool in the poolInfo array + for ((pool_idx = 0; pool_idx < pool_count; pool_idx++)); do + # Get the pool name from poolInfo + pool_name=$(jq -r ".status.poolInfo[${pool_idx}].name" < "${_leaseJSON}") + + log "Processing pool: ${pool_name}" + + # Get envVars for this specific pool from envVarsMap using the pool name as key + jq -r ".status.envVarsMap.\"${pool_name}\"" < "${_leaseJSON}" > /tmp/envvars_pool + + # Get topology info from poolInfo + vcenter_cluster=$(jq -r ".status.poolInfo[${pool_idx}].topology.computeCluster" < "${_leaseJSON}") + + # Source the envVars to get vsphere_url and other variables + # shellcheck source=/dev/null + source /tmp/envvars_pool + + # Build resource pool path + if [ $IPI -eq 0 ]; then + vcenter_resource_pool=${vcenter_cluster}/Resources/${NAMESPACE}-${UNIQUE_HASH} + else + vcenter_resource_pool=${vcenter_cluster}/Resources/ipi-ci-clusters + fi + + # Create a sanitized filename from the pool name + # Pool names look like: vcenter-1.ci.ibmc.devcluster.openshift.com-cidatacenter-2-cicluster-3 + pool_filename=$(echo "${pool_name}" | tr '.' '_' | tr ':' '_') + + log "Creating govc_${pool_filename}.sh for pool ${pool_name}" + cat >"${SHARED_DIR}/govc_${pool_filename}.sh" </dev/null || echo "") + if [[ -z "$clustervms" ]]; then + clustervms=$(govc ls "/${GOVC_DATACENTER}/vm/${infra_id}" 2>/dev/null || echo "") + fi + + if [[ -z "$clustervms" ]]; then + echo "$(date -u --rfc-3339=seconds) - No VMs found in ${GOVC_DATACENTER} for lease ${lease_name}" + continue + fi + + found_vms=true + echo "$(date -u --rfc-3339=seconds) - Found VMs in ${GOVC_DATACENTER}" + + # Process VMs for single lease (code reuse below) + else + # New multi-pool lease - iterate through each pool + pool_count=$(jq -r '.status.poolInfo | length' < "${lease_file}") + + if [[ "${pool_count}" == "null" || "${pool_count}" -eq 0 ]]; then + echo "$(date -u --rfc-3339=seconds) - No poolInfo found in ${lease_name}, skipping" + continue + fi + + echo "$(date -u --rfc-3339=seconds) - Found ${pool_count} pool(s) in ${lease_name}" + + # Iterate through each pool in the poolInfo array + for ((pool_idx = 0; pool_idx < pool_count; pool_idx++)); do + # Get the pool name from poolInfo + pool_name=$(jq -r ".status.poolInfo[${pool_idx}].name" < "${lease_file}") + + echo "$(date -u --rfc-3339=seconds) - Processing pool: ${pool_name}" + + # Create a sanitized filename from the pool name + pool_filename=$(echo "${pool_name}" | tr '.' '_' | tr ':' '_') + govc_file="${SHARED_DIR}/govc_${pool_filename}.sh" + + # Check if govc file exists + if [[ ! -f "${govc_file}" ]]; then + echo "$(date -u --rfc-3339=seconds) - Warning: ${govc_file} not found, skipping pool ${pool_name}" + continue + fi + + # Source the pool-specific govc configuration + echo "$(date -u --rfc-3339=seconds) - Sourcing ${govc_file}..." + # shellcheck source=/dev/null + source "${govc_file}" + unset SSL_CERT_FILE + unset GOVC_TLS_CA_CERTS + + # List all virtual machines in this pool's datacenter + clustervms=$(govc ls "/${GOVC_DATACENTER}/vm/${cluster_name}" 2>/dev/null || echo "") + if [[ -z "$clustervms" ]]; then + clustervms=$(govc ls "/${GOVC_DATACENTER}/vm/${infra_id}" 2>/dev/null || echo "") + fi + + if [[ -z "$clustervms" ]]; then + echo "$(date -u --rfc-3339=seconds) - No VMs found in ${GOVC_DATACENTER} for pool ${pool_name}" + continue + fi + + found_vms=true + echo "$(date -u --rfc-3339=seconds) - Found VMs in ${GOVC_DATACENTER} for pool ${pool_name}" + + # Process each VM in this pool + for ipath in $clustervms; do + # split on / + # shellcheck disable=SC2162 + IFS=/ read -a ipath_array <<< "$ipath"; + hostname=${ipath_array[-1]} + + # Create png of the current console to determine if a virtual machine has a problem + echo "$(date -u --rfc-3339=seconds) - Capturing console image for ${hostname}" + govc vm.console -vm.ipath="$ipath" -capture "${ARTIFACT_DIR}/${hostname}.png" || \ + echo "$(date -u --rfc-3339=seconds) - Warning: Failed to capture console for ${hostname}" + + # If hostname has cluster name in it (newer powercli installs), strip the host name off + if [[ "${hostname}" == "${infra_id}"* ]]; then + hostname=${hostname#${infra_id}-} + fi + + # Older UPI installs have "-0" after bootstrap. Remove those to be consistent with newer builds. + if [[ "${hostname}" == "bootstrap-0" ]]; then + hostname="bootstrap" + fi + + # Get IP address for this VM + echo "$(date -u --rfc-3339=seconds) - Getting IP for ${hostname}" + vm_ip="$(govc vm.ip -wait=1m -vm.ipath="$ipath" 2>/dev/null | awk -F',' '{print $1}' || echo "")" + + if [[ -z "${vm_ip}" ]]; then + echo "$(date -u --rfc-3339=seconds) - Warning: Could not get IP for ${hostname}" + continue + fi + + echo "$(date -u --rfc-3339=seconds) - ${hostname} IP: ${vm_ip}" + + # Categorize the VM and store its IP + if [[ "${hostname}" == "bootstrap" ]]; then + all_bootstrap_ips+=("${vm_ip}") + # Also set the variable for backwards compatibility + # shellcheck disable=SC2140 + declare "bootstrap_ip"="${vm_ip}" + elif [[ "${hostname}" =~ ^control[-_]plane[-_][0-9]+$ ]]; then + all_control_plane_ips+=("${vm_ip}") + # Set individual control plane variables for backwards compatibility + # shellcheck disable=SC2140 + declare "${hostname//-/_}_ip"="${vm_ip}" + fi + done # end VM processing for this pool + done # end pool iteration + + # For multi-pool leases, skip the shared VM processing below since we handled it in the pool loop + continue fi - echo "declaring ${hostname//-/_}_ip" - # shellcheck disable=SC2140 - declare "${hostname//-/_}_ip"="$(govc vm.ip -wait=1m -vm.ipath="$ipath" | awk -F',' '{print $1}')" + # Shared VM processing code (only runs for LEASE_single) + for ipath in $clustervms; do + # split on / + # shellcheck disable=SC2162 + IFS=/ read -a ipath_array <<< "$ipath"; + hostname=${ipath_array[-1]} + + # Create png of the current console to determine if a virtual machine has a problem + echo "$(date -u --rfc-3339=seconds) - Capturing console image for ${hostname}" + govc vm.console -vm.ipath="$ipath" -capture "${ARTIFACT_DIR}/${hostname}.png" || \ + echo "$(date -u --rfc-3339=seconds) - Warning: Failed to capture console for ${hostname}" + + # If hostname has cluster name in it (newer powercli installs), strip the host name off + if [[ "${hostname}" == "${infra_id}"* ]]; then + hostname=${hostname#${infra_id}-} + fi + + # Older UPI installs have "-0" after bootstrap. Remove those to be consistent with newer builds. + if [[ "${hostname}" == "bootstrap-0" ]]; then + hostname="bootstrap" + fi + + # Get IP address for this VM + echo "$(date -u --rfc-3339=seconds) - Getting IP for ${hostname}" + vm_ip="$(govc vm.ip -wait=1m -vm.ipath="$ipath" 2>/dev/null | awk -F',' '{print $1}' || echo "")" + + if [[ -z "${vm_ip}" ]]; then + echo "$(date -u --rfc-3339=seconds) - Warning: Could not get IP for ${hostname}" + continue + fi + + echo "$(date -u --rfc-3339=seconds) - ${hostname} IP: ${vm_ip}" + + # Categorize the VM and store its IP + if [[ "${hostname}" == "bootstrap" ]]; then + all_bootstrap_ips+=("${vm_ip}") + # Also set the variable for backwards compatibility + # shellcheck disable=SC2140 + declare "bootstrap_ip"="${vm_ip}" + elif [[ "${hostname}" =~ ^control[-_]plane[-_][0-9]+$ ]]; then + all_control_plane_ips+=("${vm_ip}") + # Set individual control plane variables for backwards compatibility + # shellcheck disable=SC2140 + declare "${hostname//-/_}_ip"="${vm_ip}" + fi + done + done + + # Check if we found any VMs at all + if [[ "${found_vms}" == "false" ]]; then + echo "$(date -u --rfc-3339=seconds) - Did not find any cluster virtual machines across all vCenters, skipping gather logs steps" + return 1 + fi + + # Build gather bootstrap arguments from collected IPs + GATHER_BOOTSTRAP_ARGS=() + + # Add bootstrap IPs (usually just one) + for bootstrap_ip in "${all_bootstrap_ips[@]}"; do + GATHER_BOOTSTRAP_ARGS+=('--bootstrap' "${bootstrap_ip}") + done + + # Add control plane IPs (expecting 3 for HA) + for cp_ip in "${all_control_plane_ips[@]}"; do + GATHER_BOOTSTRAP_ARGS+=('--master' "${cp_ip}") done - GATHER_BOOTSTRAP_ARGS+=('--bootstrap' "${bootstrap_ip}") - GATHER_BOOTSTRAP_ARGS+=('--master' "${control_plane_0_ip}" '--master' "${control_plane_1_ip}" '--master' "${control_plane_2_ip}") + echo "$(date -u --rfc-3339=seconds) - Running gather bootstrap with ${#all_bootstrap_ips[@]} bootstrap node(s) and ${#all_control_plane_ips[@]} control plane node(s)" # 4.5 and prior used the terraform.tfstate for gather bootstrap. This causes an error with: # state snapshot was created by Terraform v0.12.24, which is newer than current v0.12.20; upgrade to Terraform v0.12.24 or greater to work with this state"