diff --git a/.github/workflows/e2e-matrix.yml b/.github/workflows/e2e-matrix.yml index b152159a93..0bf8ab0e0f 100644 --- a/.github/workflows/e2e-matrix.yml +++ b/.github/workflows/e2e-matrix.yml @@ -118,6 +118,8 @@ jobs: MIN_MEM_GI_PER_NODE=12 MIN_CPU_PER_NODE=4 MIN_NODES_FOR_PLACEMENT=3 + POWER_OFF_POLL_INTERVAL_SEC=10 + POWER_OFF_WAIT_TIMEOUT_SEC=180 # Helpers: Kubernetes quantity -> numeric (portable, no bash 4+) mem_to_gi() { @@ -155,14 +157,17 @@ jobs: # Gather free resources like the scheduler: allocatable - sum(pod requests) per node. worker_nodes=$(kubectl get nodes -l node-role.kubernetes.io/worker -o jsonpath='{.items[*].metadata.name}') gather_node_resources() { - available_mem_gi=0 - available_cpu=0 - nodes_meeting_min=0 + local available_mem_gi=0 + local available_cpu=0 + local nodes_meeting_min=0 + local node node_json alloc_mem_gi alloc_cpu pods_json requested_mem_gi requested_cpu + local node_free_mem node_free_cpu node_ok_mem node_ok_cpu + for node in $worker_nodes; do [[ -n "$node" ]] || continue node_json=$(kubectl get node "$node" -o json 2>/dev/null) || true if [[ -z "$node_json" ]]; then - echo "[WARN] Node $node: could not get node spec, skipping" + echo "[WARN] Node $node: could not get node spec, skipping" >&2 continue fi @@ -205,29 +210,33 @@ jobs: if [[ "$node_ok_mem" -eq 1 && "$node_ok_cpu" -eq 1 ]]; then nodes_meeting_min=$((nodes_meeting_min + 1)) else - echo "[INFO] Node $node: does not meet placement min — free ${node_free_mem} Gi RAM, ${node_free_cpu} CPU (required: >= ${MIN_MEM_GI_PER_NODE} Gi, >= ${MIN_CPU_PER_NODE} CPU)" + echo "[INFO] Node $node: does not meet placement min — free ${node_free_mem} Gi RAM, ${node_free_cpu} CPU (required: >= ${MIN_MEM_GI_PER_NODE} Gi, >= ${MIN_CPU_PER_NODE} CPU)" >&2 fi done + + printf '%s\t%s\t%s\n' "$available_mem_gi" "$available_cpu" "$nodes_meeting_min" } - gather_node_resources - echo "[INFO] Workers: free ${available_mem_gi} Gi RAM, ${available_cpu} CPU; nodes with enough free resources for placement: ${nodes_meeting_min} (need at least ${MIN_NODES_FOR_PLACEMENT})" - echo "[INFO] Required: ${REQUIRED_MEM_GI} Gi, ${REQUIRED_CPU} CPU; need >= ${MIN_NODES_FOR_PLACEMENT} nodes with >= ${MIN_MEM_GI_PER_NODE} Gi and >= ${MIN_CPU_PER_NODE} CPU" - echo " " + refresh_resource_state() { + IFS=$'\t' read -r available_mem_gi available_cpu nodes_meeting_min < <(gather_node_resources) + deficit_mem=$(echo "$REQUIRED_MEM_GI - $available_mem_gi" | bc 2>/dev/null || echo "$REQUIRED_MEM_GI") + deficit_cpu=$(echo "$REQUIRED_CPU - $available_cpu" | bc 2>/dev/null || echo "$REQUIRED_CPU") - deficit_mem=$(echo "$REQUIRED_MEM_GI - $available_mem_gi" | bc 2>/dev/null || echo "$REQUIRED_MEM_GI") - deficit_cpu=$(echo "$REQUIRED_CPU - $available_cpu" | bc 2>/dev/null || echo "$REQUIRED_CPU") + total_sufficient=false + if float_le "$deficit_mem" 0 && float_le "$deficit_cpu" 0; then + total_sufficient=true + fi - # Check for sufficient free resources and node availability to proceed with placement - total_sufficient=false - placement_sufficient=false - if float_le "$deficit_mem" 0 && float_le "$deficit_cpu" 0; then - total_sufficient=true - fi + placement_sufficient=false + if [[ $nodes_meeting_min -ge $MIN_NODES_FOR_PLACEMENT ]]; then + placement_sufficient=true + fi + } - if [[ $nodes_meeting_min -ge $MIN_NODES_FOR_PLACEMENT ]]; then - placement_sufficient=true - fi + refresh_resource_state + echo "[INFO] Workers: free ${available_mem_gi} Gi RAM, ${available_cpu} CPU; nodes with enough free resources for placement: ${nodes_meeting_min} (need at least ${MIN_NODES_FOR_PLACEMENT})" + echo "[INFO] Required: ${REQUIRED_MEM_GI} Gi, ${REQUIRED_CPU} CPU; need >= ${MIN_NODES_FOR_PLACEMENT} nodes with >= ${MIN_MEM_GI_PER_NODE} Gi and >= ${MIN_CPU_PER_NODE} CPU" + echo " " if $total_sufficient && $placement_sufficient; then echo "[INFO] Resources sufficient (total + placement), no VMs to power off" @@ -256,7 +265,7 @@ jobs: .items[] | select(.metadata.namespace | test("^nightly-e2e-|static-cse") | not) | select(.metadata.labels | tostring | test("e2e-cluster/do-not-stop-vm-on-e2e-run") | not) - | select(.spec.runPolicy != "AlwaysOff") + | select(.status.phase != "Stopped") | [.metadata.namespace, .metadata.name, (.spec.memory.size // "0"), (.spec.cpu.cores // 0), (.spec.cpu.coreFraction // "100%")] | @tsv ' @@ -271,41 +280,120 @@ jobs: done | sort -t$'\t' -k1,1 -rn } - # Keep powering off while: placement not met, or memory/CPU deficit not yet covered + vm_cpu_from_cores_and_fraction() { + local cores="$1" core_frac="$2" frac_pct=100 + [[ "$core_frac" =~ ^([0-9]+)%$ ]] && frac_pct="${BASH_REMATCH[1]}" + echo "scale=2; $cores * $frac_pct / 100" | bc + } + + print_power_off_plan() { + local plan_index=0 cumulative_mem=0 cumulative_cpu=0 + local vm_mem_gi ns name mem_qty cores core_frac vm_cpu + + echo "[INFO] Planned power-off order with projected VM-spec resources:" + echo "[INFO] Projection is based on VM spec memory/cpu; actual placement improvement depends on where workloads are running." + + while IFS=$'\t' read -r vm_mem_gi ns name mem_qty cores core_frac; do + [[ -n "$ns" ]] || continue + plan_index=$((plan_index + 1)) + vm_cpu=$(vm_cpu_from_cores_and_fraction "$cores" "$core_frac") + cumulative_mem=$(echo "$cumulative_mem + $vm_mem_gi" | bc) + cumulative_cpu=$(echo "$cumulative_cpu + $vm_cpu" | bc) + echo "[PLAN] ${plan_index}. ${ns}/${name} -> ${vm_mem_gi} Gi RAM, ${vm_cpu} CPU (cumulative: ${cumulative_mem} Gi RAM, ${cumulative_cpu} CPU)" + done < "$1" + + if [[ $plan_index -eq 0 ]]; then + echo "[WARN] No VM candidates available for power off" + fi + } + + count_stopped_requested_vms() { + local requested_vms_file="$1" + local stopped_requested=0 total_requested=0 + local ns name phase + + while IFS=$'\t' read -r ns name; do + [[ -n "$ns" ]] || continue + total_requested=$((total_requested + 1)) + phase=$(kubectl get vm -n "$ns" "$name" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown") + if [[ "$phase" == "Stopped" ]]; then + stopped_requested=$((stopped_requested + 1)) + fi + done < "$requested_vms_file" + + printf '%s\t%s\n' "$stopped_requested" "$total_requested" + } + + # Keep powering off while current cluster state still does not satisfy placement or total resources. still_need_to_free() { if ! $placement_sufficient; then return 0; fi - if float_gt "$deficit_mem" 0 && float_lt "$freed_mem" "$deficit_mem"; then return 0; fi - if float_gt "$deficit_cpu" 0 && float_lt "$freed_cpu" "$deficit_cpu"; then return 0; fi + if ! $total_sufficient; then return 0; fi return 1 } - freed_mem=0 - freed_cpu=0 + vm_candidates_file=$(mktemp) + requested_vms_file=$(mktemp) + trap 'rm -f "$vm_candidates_file" "$requested_vms_file"' EXIT + get_vms_candidates | sort_by_mem_desc > "$vm_candidates_file" + print_power_off_plan "$vm_candidates_file" + + requested_count=0 while IFS=$'\t' read -r vm_mem_gi ns name mem_qty cores core_frac; do [[ -n "$ns" ]] || continue - still_need_to_free || break + vm_cpu=$(vm_cpu_from_cores_and_fraction "$cores" "$core_frac") - frac_pct=100 - [[ "$core_frac" =~ ^([0-9]+)%$ ]] && frac_pct="${BASH_REMATCH[1]}" - vm_cpu=$(echo "scale=2; $cores * $frac_pct / 100" | bc) - - echo "[INFO] Powering off vm $ns/$name (${vm_mem_gi} Gi, ${vm_cpu} CPU)" - kubectl patch vm -n "$ns" "$name" --type=merge -p '{"spec":{"runPolicy":"AlwaysOff"}}' || true - freed_mem=$(echo "$freed_mem + $vm_mem_gi" | bc) - freed_cpu=$(echo "$freed_cpu + $vm_cpu" | bc) - - # Re-check placement after each power-off (pods may take a few seconds to disappear) - if ! $placement_sufficient; then - kubectl wait --for=jsonpath='{.status.phase}'=Stopped vm -n "$ns" "$name" --timeout=90s || true - gather_node_resources - if [[ $nodes_meeting_min -ge $MIN_NODES_FOR_PLACEMENT ]]; then - placement_sufficient=true - echo "[INFO] Placement now sufficient: ${nodes_meeting_min} nodes with >= ${MIN_MEM_GI_PER_NODE} Gi and >= ${MIN_CPU_PER_NODE} CPU" - fi + echo "[INFO] Request power off for vm $ns/$name (${vm_mem_gi} Gi, ${vm_cpu} CPU)" + if ! kubectl patch vm -n "$ns" "$name" --type=merge -p '{"spec":{"runPolicy":"AlwaysOff"}}'; then + echo "[WARN] Failed to power off vm $ns/$name, skip it and continue with next candidate" + continue + fi + printf '%s\t%s\n' "$ns" "$name" >> "$requested_vms_file" + requested_count=$((requested_count + 1)) + done < "$vm_candidates_file" + + if [[ $requested_count -eq 0 ]]; then + echo "[ERROR] No running VM candidates available for power off, but resources are still insufficient." + echo "[ERROR] Human intervention is required." + rm -f "$vm_candidates_file" "$requested_vms_file" + trap - EXIT + exit 1 + fi + + echo "[INFO] Requested power off for ${requested_count} VM(s). Waiting up to ${POWER_OFF_WAIT_TIMEOUT_SEC}s and checking cluster resources every ${POWER_OFF_POLL_INTERVAL_SEC}s." + + wait_elapsed=0 + prev_nodes_meeting_min="$nodes_meeting_min" + while true; do + refresh_resource_state + IFS=$'\t' read -r stopped_requested total_requested < <(count_stopped_requested_vms "$requested_vms_file") + echo "[INFO] Current workers free: ${available_mem_gi} Gi RAM, ${available_cpu} CPU; nodes with enough free resources for placement: ${nodes_meeting_min}" + echo "[INFO] Requested VMs stopped: ${stopped_requested}/${total_requested}; waited ${wait_elapsed}s/${POWER_OFF_WAIT_TIMEOUT_SEC}s" + if [[ $prev_nodes_meeting_min -lt $MIN_NODES_FOR_PLACEMENT && $nodes_meeting_min -ge $MIN_NODES_FOR_PLACEMENT ]]; then + echo "[INFO] Placement now sufficient: ${nodes_meeting_min} nodes with >= ${MIN_MEM_GI_PER_NODE} Gi and >= ${MIN_CPU_PER_NODE} CPU" + fi + prev_nodes_meeting_min="$nodes_meeting_min" + + if ! still_need_to_free; then + break + fi + + if [[ $total_requested -gt 0 && $stopped_requested -eq $total_requested ]]; then + echo "[INFO] All requested VMs are already stopped; no need to wait further." + break fi - done < <(get_vms_candidates | sort_by_mem_desc) - echo "[INFO] Freed: ${freed_mem} Gi RAM, ${freed_cpu} CPU" + if [[ $wait_elapsed -ge $POWER_OFF_WAIT_TIMEOUT_SEC ]]; then + break + fi + + sleep "$POWER_OFF_POLL_INTERVAL_SEC" + wait_elapsed=$((wait_elapsed + POWER_OFF_POLL_INTERVAL_SEC)) + done + + rm -f "$vm_candidates_file" "$requested_vms_file" + trap - EXIT + + echo "[INFO] Final workers free: ${available_mem_gi} Gi RAM, ${available_cpu} CPU; nodes with enough free resources for placement: ${nodes_meeting_min}" if still_need_to_free; then echo "[ERROR] Stopping VMs did not free enough resources. Human intervention is required."