apache · ayushtkn · May 25, 2026 · May 26, 2026 · May 28, 2026 · May 28, 2026
diff --git a/packaging/src/kubernetes/README.md b/packaging/src/kubernetes/README.md
diff --git a/packaging/src/kubernetes/helm/hive-operator/crds/hiveclusters.hive.apache.org-v1.yml b/packaging/src/kubernetes/helm/hive-operator/crds/hiveclusters.hive.apache.org-v1.yml
@@ -44,6 +44,50 @@ spec:
               hiveServer2:
                 description: HiveServer2 component configuration
                 properties:
+                  autoscaling:
+                    description: Autoscaling configuration (requires KEDA installed
+                      in the cluster)
+                    properties:
+                      activationCpuValue:
+                        description: CPU average value below which the trigger is
+                          inactive. Required if targetCpuValue is set.
+                        type: string
+                      cooldownSeconds:
+                        default: 600
+                        description: Cooldown period in seconds after a scaling event
+                          before another can occur
+                        type: integer
+                      enabled:
+                        default: false
+                        description: Whether autoscaling is enabled for this component
+                        type: boolean
+                      gracePeriodSeconds:
+                        default: 300
+                        description: Maximum time in seconds to wait for graceful
+                          drain during scale-down before the pod is forcibly terminated
+                        type: integer
+                      minReplicas:
+                        default: 0
+                        description: Minimum number of replicas (floor for scale-down).
+                          Set to 0 for scale-to-zero (HS2 requires KEDA HTTP Add-on
+                          for wake-from-zero)
+                        type: integer
+                      scaleDownThreshold:
+                        default: 20
+                        description: Threshold that triggers scale-down for Prometheus-based
+                          metrics
+                        type: integer
+                      scaleUpThreshold:
+                        default: 80
+                        description: "Threshold that triggers scale-up (component-specific:\
+                          \ sessions for HS2, connections for HMS, queue depth for\
+                          \ LLAP, pending tasks for TezAM)"
+                        type: integer
+                      targetCpuValue:
+                        description: "Target CPU average value for scaling (e.g.,\
+                          \ '1500m' or '1'). If omitted, CPU scaling is disabled."
+                        type: string
+                    type: object
                   configOverrides:
                     additionalProperties:
                       type: string
@@ -152,6 +196,50 @@ spec:
               llap:
                 description: LLAP daemon configuration. Enabled by default.
                 properties:
+                  autoscaling:
+                    description: Autoscaling configuration (requires KEDA installed
+                      in the cluster)
+                    properties:
+                      activationCpuValue:
+                        description: CPU average value below which the trigger is
+                          inactive. Required if targetCpuValue is set.
+                        type: string
+                      cooldownSeconds:
+                        default: 600
+                        description: Cooldown period in seconds after a scaling event
+                          before another can occur
+                        type: integer
+                      enabled:
+                        default: false
+                        description: Whether autoscaling is enabled for this component
+                        type: boolean
+                      gracePeriodSeconds:
+                        default: 300
+                        description: Maximum time in seconds to wait for graceful
+                          drain during scale-down before the pod is forcibly terminated
+                        type: integer
+                      minReplicas:
+                        default: 0
+                        description: Minimum number of replicas (floor for scale-down).
+                          Set to 0 for scale-to-zero (HS2 requires KEDA HTTP Add-on
+                          for wake-from-zero)
+                        type: integer
+                      scaleDownThreshold:
+                        default: 20
+                        description: Threshold that triggers scale-down for Prometheus-based
+                          metrics
+                        type: integer
+                      scaleUpThreshold:
+                        default: 80
+                        description: "Threshold that triggers scale-up (component-specific:\
+                          \ sessions for HS2, connections for HMS, queue depth for\
+                          \ LLAP, pending tasks for TezAM)"
+                        type: integer
+                      targetCpuValue:
+                        description: "Target CPU average value for scaling (e.g.,\
+                          \ '1500m' or '1'). If omitted, CPU scaling is disabled."
+                        type: string
+                    type: object
                   configOverrides:
                     additionalProperties:
                       type: string
@@ -235,6 +323,50 @@ spec:
               metastore:
                 description: Metastore component configuration
                 properties:
+                  autoscaling:
+                    description: Autoscaling configuration (requires KEDA installed
+                      in the cluster)
+                    properties:
+                      activationCpuValue:
+                        description: CPU average value below which the trigger is
+                          inactive. Required if targetCpuValue is set.
+                        type: string
+                      cooldownSeconds:
+                        default: 600
+                        description: Cooldown period in seconds after a scaling event
+                          before another can occur
+                        type: integer
+                      enabled:
+                        default: false
+                        description: Whether autoscaling is enabled for this component
+                        type: boolean
+                      gracePeriodSeconds:
+                        default: 300
+                        description: Maximum time in seconds to wait for graceful
+                          drain during scale-down before the pod is forcibly terminated
+                        type: integer
+                      minReplicas:
+                        default: 0
+                        description: Minimum number of replicas (floor for scale-down).
+                          Set to 0 for scale-to-zero (HS2 requires KEDA HTTP Add-on
+                          for wake-from-zero)
+                        type: integer
+                      scaleDownThreshold:
+                        default: 20
+                        description: Threshold that triggers scale-down for Prometheus-based
+                          metrics
+                        type: integer
+                      scaleUpThreshold:
+                        default: 80
+                        description: "Threshold that triggers scale-up (component-specific:\
+                          \ sessions for HS2, connections for HMS, queue depth for\
+                          \ LLAP, pending tasks for TezAM)"
+                        type: integer
+                      targetCpuValue:
+                        description: "Target CPU average value for scaling (e.g.,\
+                          \ '1500m' or '1'). If omitted, CPU scaling is disabled."
+                        type: string
+                    type: object
                   configOverrides:
                     additionalProperties:
                       type: string
@@ -371,6 +503,50 @@ spec:
               tezAm:
                 description: Tez Application Master configuration. Enabled by default.
                 properties:
+                  autoscaling:
+                    description: Autoscaling configuration (requires KEDA installed
+                      in the cluster)
+                    properties:
+                      activationCpuValue:
+                        description: CPU average value below which the trigger is
+                          inactive. Required if targetCpuValue is set.
+                        type: string
+                      cooldownSeconds:
+                        default: 600
+                        description: Cooldown period in seconds after a scaling event
+                          before another can occur
+                        type: integer
+                      enabled:
+                        default: false
+                        description: Whether autoscaling is enabled for this component
+                        type: boolean
+                      gracePeriodSeconds:
+                        default: 300
+                        description: Maximum time in seconds to wait for graceful
+                          drain during scale-down before the pod is forcibly terminated
+                        type: integer
+                      minReplicas:
+                        default: 0
+                        description: Minimum number of replicas (floor for scale-down).
+                          Set to 0 for scale-to-zero (HS2 requires KEDA HTTP Add-on
+                          for wake-from-zero)
+                        type: integer
+                      scaleDownThreshold:
+                        default: 20
+                        description: Threshold that triggers scale-down for Prometheus-based
+                          metrics
+                        type: integer
+                      scaleUpThreshold:
+                        default: 80
+                        description: "Threshold that triggers scale-up (component-specific:\
+                          \ sessions for HS2, connections for HMS, queue depth for\
+                          \ LLAP, pending tasks for TezAM)"
+                        type: integer
+                      targetCpuValue:
+                        description: "Target CPU average value for scaling (e.g.,\
+                          \ '1500m' or '1'). If omitted, CPU scaling is disabled."
+                        type: string
+                    type: object
                   configOverrides:
                     additionalProperties:
                       type: string

diff --git a/packaging/src/kubernetes/helm/hive-operator/templates/clusterrole.yaml b/packaging/src/kubernetes/helm/hive-operator/templates/clusterrole.yaml
@@ -50,3 +50,15 @@ rules:
   - apiGroups: [""]
     resources: ["pods"]
     verbs: ["get", "list", "watch"]
+  # PodDisruptionBudgets for graceful autoscaling
+  - apiGroups: ["policy"]
+    resources: ["poddisruptionbudgets"]
+    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
+  # KEDA ScaledObjects and TriggerAuthentications for autoscaling
+  - apiGroups: ["keda.sh"]
+    resources: ["scaledobjects", "triggerauthentications"]
+    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
+  # KEDA HTTP Add-on for scale-to-zero (wake-from-zero on HTTP request)
+  - apiGroups: ["http.keda.sh"]
+    resources: ["httpscaledobjects", "interceptorroutes"]
+    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
diff --git a/packaging/src/kubernetes/helm/hive-operator/templates/hivecluster.yaml b/packaging/src/kubernetes/helm/hive-operator/templates/hivecluster.yaml
@@ -67,6 +67,21 @@ spec:
     extraVolumeMounts:
       {{- toYaml .Values.cluster.metastore.extraVolumeMounts | nindent 6 }}
     {{- end }}
+    {{- if and .Values.cluster.metastore.autoscaling .Values.cluster.metastore.autoscaling.enabled }}
+    autoscaling:
+      enabled: true
+      minReplicas: {{ .Values.cluster.metastore.autoscaling.minReplicas }}
+      scaleUpThreshold: {{ .Values.cluster.metastore.autoscaling.scaleUpThreshold }}
+      scaleDownThreshold: {{ .Values.cluster.metastore.autoscaling.scaleDownThreshold }}
+      {{- if .Values.cluster.metastore.autoscaling.targetCpuValue }}
+      targetCpuValue: {{ .Values.cluster.metastore.autoscaling.targetCpuValue | quote }}
+      {{- end }}
+      {{- if .Values.cluster.metastore.autoscaling.activationCpuValue }}
+      activationCpuValue: {{ .Values.cluster.metastore.autoscaling.activationCpuValue | quote }}
+      {{- end }}
+      cooldownSeconds: {{ .Values.cluster.metastore.autoscaling.cooldownSeconds }}
+      gracePeriodSeconds: {{ .Values.cluster.metastore.autoscaling.gracePeriodSeconds }}
+    {{- end }}
     {{- else }}
     {{- if .Values.cluster.metastore.externalUri }}
     externalUri: {{ .Values.cluster.metastore.externalUri | quote }}
@@ -96,6 +111,21 @@ spec:
     extraVolumeMounts:
       {{- toYaml .Values.cluster.hiveServer2.extraVolumeMounts | nindent 6 }}
     {{- end }}
+    {{- if and .Values.cluster.hiveServer2.autoscaling .Values.cluster.hiveServer2.autoscaling.enabled }}
+    autoscaling:
+      enabled: true
+      minReplicas: {{ .Values.cluster.hiveServer2.autoscaling.minReplicas }}
+      scaleUpThreshold: {{ .Values.cluster.hiveServer2.autoscaling.scaleUpThreshold }}
+      scaleDownThreshold: {{ .Values.cluster.hiveServer2.autoscaling.scaleDownThreshold }}
+      {{- if .Values.cluster.hiveServer2.autoscaling.targetCpuValue }}
+      targetCpuValue: {{ .Values.cluster.hiveServer2.autoscaling.targetCpuValue | quote }}
+      {{- end }}
+      {{- if .Values.cluster.hiveServer2.autoscaling.activationCpuValue }}
+      activationCpuValue: {{ .Values.cluster.hiveServer2.autoscaling.activationCpuValue | quote }}
+      {{- end }}
+      cooldownSeconds: {{ .Values.cluster.hiveServer2.autoscaling.cooldownSeconds }}
+      gracePeriodSeconds: {{ .Values.cluster.hiveServer2.autoscaling.gracePeriodSeconds }}
+    {{- end }}
 
   llap:
     enabled: {{ .Values.cluster.llap.enabled }}
@@ -120,6 +150,15 @@ spec:
     extraVolumeMounts:
       {{- toYaml .Values.cluster.llap.extraVolumeMounts | nindent 6 }}
     {{- end }}
+    {{- if and .Values.cluster.llap.autoscaling .Values.cluster.llap.autoscaling.enabled }}
+    autoscaling:
+      enabled: true
+      minReplicas: {{ .Values.cluster.llap.autoscaling.minReplicas }}
+      scaleUpThreshold: {{ .Values.cluster.llap.autoscaling.scaleUpThreshold }}
+      scaleDownThreshold: {{ .Values.cluster.llap.autoscaling.scaleDownThreshold }}
+      cooldownSeconds: {{ .Values.cluster.llap.autoscaling.cooldownSeconds }}
+      gracePeriodSeconds: {{ .Values.cluster.llap.autoscaling.gracePeriodSeconds }}
+    {{- end }}
     {{- end }}
 
   tezAm:
@@ -146,6 +185,21 @@ spec:
     extraVolumeMounts:
       {{- toYaml .Values.cluster.tezAm.extraVolumeMounts | nindent 6 }}
     {{- end }}
+    {{- if and .Values.cluster.tezAm.autoscaling .Values.cluster.tezAm.autoscaling.enabled }}
+    autoscaling:
+      enabled: true
+      minReplicas: {{ .Values.cluster.tezAm.autoscaling.minReplicas }}
+      scaleUpThreshold: {{ .Values.cluster.tezAm.autoscaling.scaleUpThreshold }}
+      scaleDownThreshold: {{ .Values.cluster.tezAm.autoscaling.scaleDownThreshold }}
+      {{- if .Values.cluster.tezAm.autoscaling.targetCpuValue }}
+      targetCpuValue: {{ .Values.cluster.tezAm.autoscaling.targetCpuValue | quote }}
+      {{- end }}
+      {{- if .Values.cluster.tezAm.autoscaling.activationCpuValue }}
+      activationCpuValue: {{ .Values.cluster.tezAm.autoscaling.activationCpuValue | quote }}
+      {{- end }}
+      cooldownSeconds: {{ .Values.cluster.tezAm.autoscaling.cooldownSeconds }}
+      gracePeriodSeconds: {{ .Values.cluster.tezAm.autoscaling.gracePeriodSeconds }}
+    {{- end }}
     {{- end }}
 
   zookeeper:

diff --git a/packaging/src/kubernetes/helm/hive-operator/values.yaml b/packaging/src/kubernetes/helm/hive-operator/values.yaml
@@ -112,6 +112,17 @@ cluster:
     configOverrides: {}
     extraVolumes: []
     extraVolumeMounts: []
+    # Autoscaling (requires KEDA + Prometheus in the cluster)
+    # When enabled, 'replicas' above acts as the max replica ceiling
+    autoscaling:
+      enabled: false
+      minReplicas: 1
+      scaleUpThreshold: 75
+      scaleDownThreshold: 30
+      # targetCpuValue: "750m"      # Uncomment to enable CPU-based scaling (AverageValue)
+      # activationCpuValue: "200m"  # CPU trigger inactive below this value
+      cooldownSeconds: 300
+      gracePeriodSeconds: 60
     # Set to use an external Metastore instead of deploying one:
     # enabled: false
     # externalUri: "thrift://external-metastore:9083"
@@ -127,6 +138,18 @@ cluster:
     externalJars: []
     extraVolumes: []
     extraVolumeMounts: []
+    # Autoscaling (requires KEDA + Prometheus + KEDA HTTP Add-on in the cluster)
+    # minReplicas: 0 enables scale-to-zero — incoming Beeline HTTP connections wake HS2 via the KEDA HTTP interceptor
+    # When enabled, 'replicas' above acts as the max replica ceiling
+    autoscaling:
+      enabled: false
+      minReplicas: 0
+      scaleUpThreshold: 80
+      scaleDownThreshold: 20
+      # targetCpuValue: "1600m"     # Uncomment to enable CPU-based scaling (AverageValue)
+      # activationCpuValue: "400m"  # CPU trigger inactive below this value
+      cooldownSeconds: 600
+      gracePeriodSeconds: 300
 
   # ---------------------------------------------------------------------------
   # LLAP — enabled by default for full-HA
@@ -141,6 +164,16 @@ cluster:
     configOverrides: {}
     extraVolumes: []
     extraVolumeMounts: []
+    # Autoscaling (requires KEDA + Prometheus in the cluster)
+    # minReplicas: 0 enables scale-to-zero — scales up immediately when queries need LLAP
+    # When enabled, 'replicas' above acts as the max replica ceiling
+    autoscaling:
+      enabled: false
+      minReplicas: 0
+      scaleUpThreshold: 1
+      scaleDownThreshold: 0
+      cooldownSeconds: 900
+      gracePeriodSeconds: 600
 
   # ---------------------------------------------------------------------------
   # TEZ AM — enabled by default for full-HA
@@ -154,3 +187,16 @@ cluster:
     configOverrides: {}
     extraVolumes: []
     extraVolumeMounts: []
+    # Autoscaling (requires KEDA + Prometheus in the cluster)
+    # minReplicas: 0 enables scale-to-zero — wakes when HS2 receives queries
+    # When enabled, 'replicas' above acts as the max replica ceiling
+    # scaleUpThreshold: pending tasks per AM (e.g., 5 = scale up when 5 or more tasks are waiting)
+    autoscaling:
+      enabled: false
+      minReplicas: 0
+      scaleUpThreshold: 5
+      scaleDownThreshold: 10  # NOTE(review): above scaleUpThreshold (5), unlike the other components' defaults — confirm this is intended
+      # targetCpuValue: "600m"      # Uncomment to enable CPU-based scaling (AverageValue)
+      # activationCpuValue: "100m"  # CPU trigger inactive below this value
+      cooldownSeconds: 600
+      gracePeriodSeconds: 120