From 96db6d38ab9f876095fc349ac9b7140fb3e4f16f Mon Sep 17 00:00:00 2001 From: Davanum Srinivas Date: Thu, 25 Jun 2026 11:42:45 -0400 Subject: [PATCH] ateom-microvm: give the micro-VM guest DNS so name-based egress works The guest got an interface, default route, and host NAT but no /etc/resolv.conf: ateom-microvm drops atelet's resolv.conf bind (a host bind is meaningless inside a VM) and never sends CreateSandbox.Dns, so the guest could reach IPs but not resolve names. Workloads doing name-based egress (e.g. the OpenShell helpdesk agent calling Ollama Cloud) failed; only no-egress paths like /status worked. Write /etc/resolv.conf into the bundle rootfs before mkfs.ext4 in both the golden-boot (RunWorkload) and restore (RestoreWorkload) paths, copied verbatim from the worker pod's own resolv.conf (cluster DNS) -- the same source the kata shim's getDNS uses -- with a public-resolver guard only for a degenerate empty file. Mirrors what the kata shim achieves and satisfies kata-agent's requirement that /etc/resolv.conf pre-exist. Verified on bigbox: helpdesk-microvm /chat returns real gpt-oss:20b-cloud completions and survives suspend/resume; counter-microvm and /status unaffected. --- cmd/ateom-microvm/run.go | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/cmd/ateom-microvm/run.go b/cmd/ateom-microvm/run.go index 0325dc949..83890c279 100644 --- a/cmd/ateom-microvm/run.go +++ b/cmd/ateom-microvm/run.go @@ -229,6 +229,11 @@ func (s *AteomService) RunWorkload(ctx context.Context, req *ateompb.RunWorkload } tmpPath := tmp.Name() _ = tmp.Close() + if err := writeGuestResolvConf(filepath.Join(bundle, "rootfs")); err != nil { + buildCancel() + _ = os.Remove(tmpPath) + return nil, fmt.Errorf("while writing guest resolv.conf: %w", err) + } if err := kata.BuildExt4Image(buildCtx, filepath.Join(bundle, "rootfs"), tmpPath); err != nil { buildCancel() _ = os.Remove(tmpPath) @@ -588,6 +593,32 @@ func ensureKataCompatibleSpec(bundle, id, netnsPath string) (*specs.Spec, error) return &spec, nil } +// writeGuestResolvConf gives the micro-VM guest working DNS by writing +// /etc/resolv.conf into the bundle rootfs before it is packed into the ext4 +// disk. ateom drops atelet's /etc/resolv.conf bind (a host bind is meaningless +// inside a VM) and does not send CreateSandbox.Dns, so without this the guest +// can reach IP literals but cannot resolve names. We copy the worker pod's own +// resolv.conf (cluster DNS) verbatim -- the same source the kata shim's getDNS +// uses -- so the guest resolves both cluster and external names (e.g. ollama.com) +// out over the pod's NAT'd egress. Writing the file also satisfies the kata +// agent's requirement that /etc/resolv.conf already exist before it populates DNS. +func writeGuestResolvConf(rootfs string) error { + content, err := os.ReadFile("/etc/resolv.conf") + if err != nil || len(strings.TrimSpace(string(content))) == 0 { + // The worker pod always has a resolv.conf; this guard only trips in a + // degenerate environment, so the guest is never left without a resolver. + content = []byte("nameserver 1.1.1.1\nnameserver 8.8.8.8\n") + } + etc := filepath.Join(rootfs, "etc") + if err := os.MkdirAll(etc, 0o755); err != nil { + return fmt.Errorf("creating %q: %w", etc, err) + } + if err := os.WriteFile(filepath.Join(etc, "resolv.conf"), content, 0o644); err != nil { + return fmt.Errorf("writing guest resolv.conf: %w", err) + } + return nil +} + // defaultKataMounts mirrors the mount set `ctr run --runtime io.containerd.kata.v2` // produces (the proven-good shape for the kata agent). func defaultKataMounts() []specs.Mount { @@ -888,6 +919,9 @@ func (s *AteomService) RestoreWorkload(ctx context.Context, req *ateompb.Restore slog.InfoContext(ctx, "Reset actor rootfs disk to golden (template)", slog.String("id", id)) } else { bundleRootfs := filepath.Join(ateompath.OCIBundlePath(ns, name, id, containers[0].GetName()), "rootfs") + if err := writeGuestResolvConf(bundleRootfs); err != nil { + return nil, fmt.Errorf("while writing guest resolv.conf: %w", err) + } // Detach from the resume RPC deadline: reconstructing a multi-GB rootfs via // mkfs.ext4 -d can take minutes and would otherwise be SIGKILLed mid-write. // (Unlike the golden boot this is reset-each-restore, so it always rebuilds.)