Skip to content

Commit 3c13ff3

Browse files
authored
Merge pull request #498 from X1aoZEOuO/feat/0-1-activator
feat(controller): support serverless serving with 0-1 activator.
2 parents 9e7b004 + 43f79d3 commit 3c13ff3

File tree

10 files changed

+770
-56
lines changed

10 files changed

+770
-56
lines changed

api/core/v1alpha1/model_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@ const (
3535
// Once either of them qualified, we'll expose this as a field in Model.
3636
ModelPreheatAnnoKey = "llmaz.io/model-preheat"
3737

38+
// ModelActivatorAnnoKey is used to indicate the model name activated by the activator.
39+
ModelActivatorAnnoKey = "activator.llmaz.io/model-name"
40+
// CachedModelActivatorAnnoKey is used to cache the activator state of the model.
41+
CachedModelActivatorAnnoKey = "activator.llmaz.io/cached-state"
42+
3843
HUGGING_FACE = "Huggingface"
3944
MODEL_SCOPE = "ModelScope"
4045

chart/templates/deployment.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@ spec:
2929
env:
3030
- name: KUBERNETES_CLUSTER_DOMAIN
3131
value: {{ quote .Values.kubernetesClusterDomain }}
32+
- name: POD_IP
33+
valueFrom:
34+
fieldRef:
35+
fieldPath: status.podIP
3236
image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Values.controllerManager.manager.image.tag
3337
| default .Chart.AppVersion }}
3438
livenessProbe:

chart/values.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ controllerManager:
55
- --metrics-bind-address=:8443
66
- --leader-elect
77
- --namespace=llmaz-system
8+
- --enable-service-activator
9+
- --pod-ip=$(POD_IP)
810
containerSecurityContext:
911
allowPrivilegeEscalation: false
1012
capabilities:

cmd/main.go

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626

2727
"k8s.io/apimachinery/pkg/runtime"
2828
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
29+
"k8s.io/client-go/dynamic"
2930
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
3031
ctrl "sigs.k8s.io/controller-runtime"
3132
"sigs.k8s.io/controller-runtime/pkg/healthz"
@@ -63,10 +64,14 @@ func main() {
6364
var enableLeaderElection bool
6465
var probeAddr string
6566
var namespace string
67+
var enableServiceActivator bool
68+
var podIP string
6669

6770
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
6871
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
6972
flag.StringVar(&namespace, "namespace", "llmaz-system", "The namespace of the llmaz to deploy")
73+
flag.BoolVar(&enableServiceActivator, "enable-service-activator", false, "Enable the service activator feature. This is an experimental feature.")
74+
flag.StringVar(&podIP, "pod-ip", "", "The pod IP of the llmaz controller manager. Only used when service activator is enabled.")
7075
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
7176
"Enable leader election for controller manager. "+
7277
"Enabling this will ensure there is only one active controller manager.")
@@ -120,7 +125,7 @@ func main() {
120125
// Cert won't be ready until manager starts, so start a goroutine here which
121126
// will block until the cert is ready before setting up the controllers.
122127
// Controllers who register after manager starts will start directly.
123-
go setupControllers(mgr, certsReady)
128+
go setupControllers(mgr, certsReady, enableServiceActivator, podIP)
124129

125130
//+kubebuilder:scaffold:builder
126131

@@ -140,7 +145,7 @@ func main() {
140145
}
141146
}
142147

143-
func setupControllers(mgr ctrl.Manager, certsReady chan struct{}) {
148+
func setupControllers(mgr ctrl.Manager, certsReady chan struct{}, enableServiceActivator bool, podIP string) {
144149
// The controllers won't work until the webhooks are operating,
145150
// and the webhook won't work until the certs are all in places.
146151
setupLog.Info("waiting for the cert generation to complete")
@@ -176,6 +181,20 @@ func setupControllers(mgr ctrl.Manager, certsReady chan struct{}) {
176181
os.Exit(1)
177182
}
178183

184+
if enableServiceActivator {
185+
dynamicClient, err := dynamic.NewForConfig(mgr.GetConfig())
186+
if err != nil {
187+
setupLog.Error(err, "unable to create dynamic client")
188+
os.Exit(1)
189+
}
190+
191+
activatorReconciler := inferencecontroller.NewActivatorReconciler(mgr, dynamicClient, podIP)
192+
if err := activatorReconciler.SetupWithManager(mgr); err != nil {
193+
setupLog.Error(err, "unable to create controller", "controller", "Activator")
194+
os.Exit(1)
195+
}
196+
}
197+
179198
if os.Getenv("ENABLE_WEBHOOKS") != "false" {
180199
if err := webhook.SetupOpenModelWebhook(mgr); err != nil {
181200
setupLog.Error(err, "unable to create webhook", "webhook", "Model")

config/crd/bases/inference.llmaz.io_backendruntimes.yaml

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,12 @@ spec:
388388
- port
389389
type: object
390390
type: object
391+
stopSignal:
392+
description: |-
393+
StopSignal defines which signal will be sent to a container when it is being stopped.
394+
If not specified, the default is defined by the container runtime in use.
395+
StopSignal can only be set for Pods with a non-empty .spec.os.name
396+
type: string
391397
type: object
392398
livenessProbe:
393399
description: |-
@@ -770,7 +776,9 @@ spec:
770776
policies:
771777
description: |-
772778
policies is a list of potential scaling policies which can be used during scaling.
773-
At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid
779+
If not set, use the default values:
780+
- For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window.
781+
- For scale down: allow all pods to be removed in a 15s window.
774782
items:
775783
description: HPAScalingPolicy is a single
776784
policy which must hold true for a specified
@@ -814,6 +822,24 @@ spec:
814822
- For scale down: 300 (i.e. the stabilization window is 300 seconds long).
815823
format: int32
816824
type: integer
825+
tolerance:
826+
anyOf:
827+
- type: integer
828+
- type: string
829+
description: |-
830+
tolerance is the tolerance on the ratio between the current and desired
831+
metric value under which no updates are made to the desired number of
832+
replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not
833+
set, the default cluster-wide tolerance is applied (by default 10%).
834+
835+
For example, if autoscaling is configured with a memory consumption target of 100Mi,
836+
and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be
837+
triggered when the actual consumption falls below 95Mi or exceeds 101Mi.
838+
839+
This is an alpha field and requires enabling the HPAConfigurableTolerance
840+
feature gate.
841+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
842+
x-kubernetes-int-or-string: true
817843
type: object
818844
scaleUp:
819845
description: |-
@@ -826,7 +852,9 @@ spec:
826852
policies:
827853
description: |-
828854
policies is a list of potential scaling policies which can be used during scaling.
829-
At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid
855+
If not set, use the default values:
856+
- For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window.
857+
- For scale down: allow all pods to be removed in a 15s window.
830858
items:
831859
description: HPAScalingPolicy is a single
832860
policy which must hold true for a specified
@@ -870,6 +898,24 @@ spec:
870898
- For scale down: 300 (i.e. the stabilization window is 300 seconds long).
871899
format: int32
872900
type: integer
901+
tolerance:
902+
anyOf:
903+
- type: integer
904+
- type: string
905+
description: |-
906+
tolerance is the tolerance on the ratio between the current and desired
907+
metric value under which no updates are made to the desired number of
908+
replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not
909+
set, the default cluster-wide tolerance is applied (by default 10%).
910+
911+
For example, if autoscaling is configured with a memory consumption target of 100Mi,
912+
and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be
913+
triggered when the actual consumption falls below 95Mi or exceeds 101Mi.
914+
915+
This is an alpha field and requires enabling the HPAConfigurableTolerance
916+
feature gate.
917+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
918+
x-kubernetes-int-or-string: true
873919
type: object
874920
type: object
875921
metrics:

config/crd/bases/inference.llmaz.io_playgrounds.yaml

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -295,7 +295,9 @@ spec:
295295
policies:
296296
description: |-
297297
policies is a list of potential scaling policies which can be used during scaling.
298-
At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid
298+
If not set, use the default values:
299+
- For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window.
300+
- For scale down: allow all pods to be removed in a 15s window.
299301
items:
300302
description: HPAScalingPolicy is a single policy
301303
which must hold true for a specified past
@@ -339,6 +341,24 @@ spec:
339341
- For scale down: 300 (i.e. the stabilization window is 300 seconds long).
340342
format: int32
341343
type: integer
344+
tolerance:
345+
anyOf:
346+
- type: integer
347+
- type: string
348+
description: |-
349+
tolerance is the tolerance on the ratio between the current and desired
350+
metric value under which no updates are made to the desired number of
351+
replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not
352+
set, the default cluster-wide tolerance is applied (by default 10%).
353+
354+
For example, if autoscaling is configured with a memory consumption target of 100Mi,
355+
and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be
356+
triggered when the actual consumption falls below 95Mi or exceeds 101Mi.
357+
358+
This is an alpha field and requires enabling the HPAConfigurableTolerance
359+
feature gate.
360+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
361+
x-kubernetes-int-or-string: true
342362
type: object
343363
scaleUp:
344364
description: |-
@@ -351,7 +371,9 @@ spec:
351371
policies:
352372
description: |-
353373
policies is a list of potential scaling policies which can be used during scaling.
354-
At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid
374+
If not set, use the default values:
375+
- For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window.
376+
- For scale down: allow all pods to be removed in a 15s window.
355377
items:
356378
description: HPAScalingPolicy is a single policy
357379
which must hold true for a specified past
@@ -395,6 +417,24 @@ spec:
395417
- For scale down: 300 (i.e. the stabilization window is 300 seconds long).
396418
format: int32
397419
type: integer
420+
tolerance:
421+
anyOf:
422+
- type: integer
423+
- type: string
424+
description: |-
425+
tolerance is the tolerance on the ratio between the current and desired
426+
metric value under which no updates are made to the desired number of
427+
replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not
428+
set, the default cluster-wide tolerance is applied (by default 10%).
429+
430+
For example, if autoscaling is configured with a memory consumption target of 100Mi,
431+
and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be
432+
triggered when the actual consumption falls below 95Mi or exceeds 101Mi.
433+
434+
This is an alpha field and requires enabling the HPAConfigurableTolerance
435+
feature gate.
436+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
437+
x-kubernetes-int-or-string: true
398438
type: object
399439
type: object
400440
metrics:

0 commit comments

Comments
 (0)