Skip to content

Commit 84f8bad

Browse files
authored
Managed ml diagnostics and xpk integration (#801)
* Add Diagon installation during cluster creation and modify the workload.py Add wait_for_deployment_ready() Added unit test update goldens.yaml update goldens.yaml update goldens.yaml Fixed parser/cluster.py update goldens.yaml fixed linter fixed linter pyink Test unit test * Resolve conflicts * Resolve conflicts * test unit test * test unit test * test unit test * deleted set_result_for_command
1 parent 7ee8139 commit 84f8bad

File tree

6 files changed

+428
-0
lines changed

6 files changed

+428
-0
lines changed

src/xpk/commands/cluster.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@
8484
from ..utils.templates import get_templates_absolute_path
8585
import shutil
8686
import os
87+
from .managed_ml_diagnostics import install_mldiagnostics_prerequisites
8788

8889
CLUSTER_PREHEAT_JINJA_FILE = 'cluster_preheat.yaml.j2'
8990

@@ -422,6 +423,13 @@ def cluster_create(args) -> None:
422423
# pylint: disable=line-too-long
423424
f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
424425
)
426+
427+
if args.managed_mldiagnostics:
428+
return_code = install_mldiagnostics_prerequisites()
429+
if return_code != 0:
430+
xpk_print('Installation of MLDiagnostics failed.')
431+
xpk_exit(return_code)
432+
425433
xpk_exit(0)
426434

427435

src/xpk/commands/cluster_test.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,9 @@ def mocks(mocker) -> _Mocks:
8484
run_command_with_updates_path=(
8585
'xpk.commands.cluster.run_command_with_updates'
8686
),
87+
run_command_for_value_path=(
88+
'xpk.commands.cluster.run_command_for_value'
89+
),
8790
),
8891
)
8992

@@ -121,6 +124,7 @@ def construct_args(**kwargs: Any) -> Namespace:
121124
cluster_cpu_machine_type='',
122125
create_vertex_tensorboard=False,
123126
enable_autoprovisioning=False,
127+
managed_mldiagnostics=False,
124128
)
125129
args_dict.update(kwargs)
126130
return Namespace(**args_dict)
Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
"""
2+
Copyright 2024 Google LLC
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
https://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
"""
16+
17+
from packaging.version import Version
18+
from ..core.commands import run_command_for_value, run_command_with_updates
19+
from ..utils.console import xpk_print
20+
import os
21+
import tempfile
22+
23+
_KUEUE_DEPLOYMENT_NAME = 'kueue-controller-manager'
24+
_KUEUE_NAMESPACE_NAME = 'kueue-system'
25+
_CERT_WEBHOOK_DEPLOYMENT_NAME = 'cert-manager-webhook'
26+
_CERT_WEBHOOK_NAMESPACE_NAME = 'cert-manager'
27+
_WEBHOOK_PACKAGE = 'mldiagnostics-injection-webhook'
28+
_WEBHOOK_VERSION = Version('v0.5.0')
29+
_WEBHOOK_FILENAME = f'{_WEBHOOK_PACKAGE}-v{_WEBHOOK_VERSION}.yaml'
30+
_OPERATOR_PACKAGE = 'mldiagnostics-connection-operator'
31+
_OPERATOR_VERSION = Version('v0.5.0')
32+
_OPERATOR_FILENAME = f'{_OPERATOR_PACKAGE}-v{_OPERATOR_VERSION}.yaml'
33+
_CERT_MANAGER_VERSION = Version('v1.13.0')
34+
35+
36+
def _install_cert_manager(version: Version = _CERT_MANAGER_VERSION) -> int:
37+
"""
38+
Apply the cert-manager manifest.
39+
40+
Returns:
41+
0 if successful and 1 otherwise.
42+
"""
43+
44+
command = (
45+
'kubectl apply -f'
46+
' https://github.com/cert-manager/cert-manager/releases/download/'
47+
f'v{version}/cert-manager.yaml'
48+
)
49+
50+
return_code = run_command_with_updates(
51+
command, f'Applying cert-manager {version} manifest...'
52+
)
53+
54+
return return_code
55+
56+
57+
def _download_mldiagnostics_yaml(package_name: str, version: Version) -> int:
58+
"""
59+
Downloads the mldiagnostics injection webhook YAML from Artifact Registry.
60+
61+
Returns:
62+
0 if successful and 1 otherwise.
63+
"""
64+
65+
command = (
66+
'gcloud artifacts generic download'
67+
' --repository=mldiagnostics-webhook-and-operator-yaml --location=us'
68+
f' --package={package_name} --version=v{version} --destination=/tmp/'
69+
' --project=ai-on-gke'
70+
)
71+
72+
return_code, return_output = run_command_for_value(
73+
command,
74+
f'Download {package_name} {version}...',
75+
)
76+
77+
if return_code != 0:
78+
if 'already exists' in return_output:
79+
xpk_print(
80+
f'Artifact file for {package_name} {version} already exists locally.'
81+
' Skipping download.'
82+
)
83+
return 0
84+
85+
return return_code
86+
87+
88+
def _create_mldiagnostics_namespace() -> int:
89+
"""
90+
Creates the 'gke-mldiagnostics' namespace.
91+
92+
Returns:
93+
0 if successful and 1 otherwise.
94+
"""
95+
96+
command = 'kubectl create namespace gke-mldiagnostics'
97+
98+
return_code, return_output = run_command_for_value(
99+
command, 'Create gke-mldiagnostics namespace...'
100+
)
101+
102+
if return_code != 0:
103+
if 'already exists' in return_output:
104+
xpk_print('Namespace already exists. Skipping creation.')
105+
return 0
106+
107+
return return_code
108+
109+
110+
def _install_mldiagnostics_yaml(artifact_filename: str) -> int:
111+
"""
112+
Applies the mldiagnostics injection webhook YAML manifest.
113+
114+
Returns:
115+
0 if successful and 1 otherwise.
116+
"""
117+
full_artifact_path = os.path.join(tempfile.gettempdir(), artifact_filename)
118+
119+
command = f'kubectl apply -f {full_artifact_path} -n gke-mldiagnostics'
120+
121+
return run_command_with_updates(
122+
command,
123+
f'Install {full_artifact_path}...',
124+
)
125+
126+
127+
def _label_default_namespace_mldiagnostics() -> int:
128+
"""
129+
Labels the 'default' namespace with 'managed-mldiagnostics-gke=true'.
130+
131+
Returns:
132+
0 if successful and 1 otherwise.
133+
"""
134+
135+
command = 'kubectl label namespace default managed-mldiagnostics-gke=true'
136+
137+
return run_command_with_updates(
138+
command,
139+
'Label default namespace with managed-mldiagnostics-gke=true',
140+
)
141+
142+
143+
def install_mldiagnostics_prerequisites() -> int:
144+
"""
145+
Mldiagnostics installation requirements.
146+
147+
Returns:
148+
0 if successful and 1 otherwise.
149+
"""
150+
151+
if not _wait_for_deployment_ready(
152+
deployment_name=_KUEUE_DEPLOYMENT_NAME, namespace=_KUEUE_NAMESPACE_NAME
153+
):
154+
xpk_print(
155+
f'Application {_KUEUE_DEPLOYMENT_NAME} failed to become ready within'
156+
' the timeout.'
157+
)
158+
return 1
159+
160+
return_code = _install_cert_manager()
161+
if return_code != 0:
162+
return return_code
163+
164+
cert_webhook_ready = _wait_for_deployment_ready(
165+
deployment_name=_CERT_WEBHOOK_DEPLOYMENT_NAME,
166+
namespace=_CERT_WEBHOOK_NAMESPACE_NAME,
167+
)
168+
if not cert_webhook_ready:
169+
xpk_print('The cert-manager-webhook installation failed.')
170+
return 1
171+
172+
return_code = _download_mldiagnostics_yaml(
173+
package_name=_WEBHOOK_PACKAGE, version=_WEBHOOK_VERSION
174+
)
175+
if return_code != 0:
176+
return return_code
177+
178+
return_code = _create_mldiagnostics_namespace()
179+
if return_code != 0:
180+
return return_code
181+
182+
return_code = _install_mldiagnostics_yaml(artifact_filename=_WEBHOOK_FILENAME)
183+
if return_code != 0:
184+
return return_code
185+
186+
return_code = _label_default_namespace_mldiagnostics()
187+
if return_code != 0:
188+
return return_code
189+
190+
return_code = _download_mldiagnostics_yaml(
191+
package_name=_OPERATOR_PACKAGE, version=_OPERATOR_VERSION
192+
)
193+
if return_code != 0:
194+
return return_code
195+
196+
return_code = _install_mldiagnostics_yaml(
197+
artifact_filename=_OPERATOR_FILENAME
198+
)
199+
if return_code != 0:
200+
return return_code
201+
202+
xpk_print(
203+
'All mldiagnostics installation and setup steps have been'
204+
' successfully completed!'
205+
)
206+
return 0
207+
208+
209+
def _wait_for_deployment_ready(
210+
deployment_name: str, namespace: str, timeout_seconds: int = 300
211+
) -> bool:
212+
"""
213+
Polls the Kubernetes Deployment status using kubectl rollout status
214+
until it successfully rolls out (all replicas are ready) or times out.
215+
216+
Args:
217+
deployment_name: The name of the Kubernetes Deployment (e.g., 'kueue-controller-manager').
218+
namespace: The namespace where the Deployment is located (e.g., 'kueue-system').
219+
timeout_seconds: Timeout duration in seconds (default is 300s / 5 minutes).
220+
221+
Returns:
222+
bool: True if the Deployment successfully rolled out, False otherwise (timeout or error).
223+
"""
224+
225+
command = (
226+
f'kubectl rollout status deployment/{deployment_name} -n {namespace}'
227+
f' --timeout={timeout_seconds}s'
228+
)
229+
230+
return_code = run_command_with_updates(
231+
command, f'Checking status of deployment {deployment_name}...'
232+
)
233+
234+
if return_code != 0:
235+
return False
236+
237+
# When the status changes to 'running,' it might need about 10 seconds to fully stabilize.
238+
stabilization_seconds = 30
239+
stabilization_command = f'sleep {stabilization_seconds}'
240+
stabilization_code = run_command_with_updates(
241+
stabilization_command,
242+
f'Deployment {deployment_name} is ready. Waiting {stabilization_seconds}'
243+
' seconds for full stabilization',
244+
verbose=True,
245+
)
246+
if stabilization_code != 0:
247+
return False
248+
249+
return True

0 commit comments

Comments
 (0)