diff --git a/cmd/obol/main.go b/cmd/obol/main.go index d3432da5..9fb019cb 100644 --- a/cmd/obol/main.go +++ b/cmd/obol/main.go @@ -321,6 +321,7 @@ GLOBAL OPTIONS:{{template "visibleFlagTemplate" .}}{{end}} updateCommand(cfg), upgradeCommand(cfg), networkCommand(cfg), + nodeCommand(cfg), hermesCommand(cfg), openclawCommand(cfg), sellCommand(cfg), diff --git a/cmd/obol/node.go b/cmd/obol/node.go new file mode 100644 index 00000000..269af862 --- /dev/null +++ b/cmd/obol/node.go @@ -0,0 +1,102 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + + "github.com/ObolNetwork/obol-stack/internal/config" + "github.com/ObolNetwork/obol-stack/internal/kubectl" + "github.com/ObolNetwork/obol-stack/internal/stack" + "github.com/urfave/cli/v3" +) + +// nodeCommand groups commands for adding and inspecting worker nodes that join +// this stack's cluster. Multi-node only makes sense on the k3s backend — a +// k3d/Docker master's flannel overlay is not routable off-host — so the +// subcommands guard on the active backend. +func nodeCommand(cfg *config.Config) *cli.Command { + return &cli.Command{ + Name: "node", + Usage: "Add and inspect worker nodes that join this stack's cluster (k3s backend)", + Commands: []*cli.Command{ + nodeTokenCommand(cfg), + nodeListCommand(cfg), + }, + } +} + +func nodeTokenCommand(cfg *config.Config) *cli.Command { + return &cli.Command{ + Name: "token", + Usage: "Print the join command for adding a Linux worker node to this k3s cluster", + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "server-url", + Usage: "Override the K3S_URL agents dial (default https://:6443)", + }, + &cli.BoolFlag{Name: "json", Usage: "Output machine-readable JSON"}, + }, + Action: func(ctx context.Context, cmd *cli.Command) error { + u := getUI(cmd) + + backend, err := stack.LoadBackend(cfg) + if err != nil { + return err + } + + if backend.Name() != stack.BackendK3s { + return fmt.Errorf( + "obol node requires the k3s backend (current backend: %q)\n"+ + "A k3d/Docker master cannot accept remote node joins — its flannel overlay is not routable off-host.\n"+ + "Re-init on a Linux host with: obol stack init --backend k3s", + backend.Name()) + } + + token, err := stack.ReadK3sNodeToken(cfg) + if err != nil { + return err + } + + serverURL := stack.K3sServerURL(cmd.String("server-url")) + version := stack.K3sBinaryVersion(cfg) + joinCmd := stack.K3sAgentJoinCommand(serverURL, token, version) + + if cmd.Bool("json") { + out, _ := json.MarshalIndent(map[string]string{ + "serverUrl": serverURL, + "token": token, + "version": version, + "joinCommand": joinCmd, + }, "", " ") + fmt.Println(string(out)) + + return nil + } + + u.Info("Run this on a Linux worker node to join the cluster:") + fmt.Printf("\n %s\n\n", joinCmd) + u.Detail("Server", serverURL) + u.Dim("Multi-homed / Wi-Fi node? append: --node-ip --flannel-iface ") + u.Dim("GPU node? label it at join: --node-label obol.tech/accelerator=nvidia") + + return nil + }, + } +} + +func nodeListCommand(cfg *config.Config) *cli.Command { + return &cli.Command{ + Name: "list", + Usage: "List cluster nodes with their accelerator labels", + Action: func(ctx context.Context, cmd *cli.Command) error { + if err := kubectl.EnsureCluster(cfg); err != nil { + return err + } + + bin, kc := kubectl.Paths(cfg) + + return kubectl.Run(bin, kc, "get", "nodes", "-o", "wide", "-L", "obol.tech/accelerator") + }, + } +} diff --git a/internal/embed/k3s-config.yaml b/internal/embed/k3s-config.yaml index be8e7b77..e9a1af0c 100644 --- a/internal/embed/k3s-config.yaml +++ b/internal/embed/k3s-config.yaml @@ -13,11 +13,15 @@ data-dir: {{DATA_DIR}}/k3s bind-address: 0.0.0.0 https-listen-port: 6443 -# TLS SANs for local access +# TLS SANs for local + LAN access. k3s already auto-adds the node's primary +# IP, but listing it (and the hostname) explicitly lets worker nodes join via +# either address and keeps the API cert deterministic across restarts. tls-san: - "127.0.0.1" - "localhost" - "obol.stack" + - "{{NODE_IP}}" + - "{{NODE_HOSTNAME}}" # Relax eviction thresholds: k3s reports imagefs capacity as 0 on shared # filesystems, causing spurious disk-pressure taints with percentage thresholds. diff --git a/internal/stack/backend_k3s.go b/internal/stack/backend_k3s.go index c8e56be5..bea1bbdc 100644 --- a/internal/stack/backend_k3s.go +++ b/internal/stack/backend_k3s.go @@ -63,6 +63,9 @@ func (b *K3sBackend) Init(cfg *config.Config, u *ui.UI, stackID string) error { k3sConfig := embed.K3sConfig k3sConfig = strings.ReplaceAll(k3sConfig, "{{STACK_ID}}", stackID) k3sConfig = strings.ReplaceAll(k3sConfig, "{{DATA_DIR}}", absDataDir) + // LAN SANs so worker nodes can join this server by IP or hostname. + k3sConfig = strings.ReplaceAll(k3sConfig, "{{NODE_IP}}", OutboundIP()) + k3sConfig = strings.ReplaceAll(k3sConfig, "{{NODE_HOSTNAME}}", nodeHostname()) k3sConfigPath := filepath.Join(cfg.ConfigDir, k3sConfigFile) if err := os.WriteFile(k3sConfigPath, []byte(k3sConfig), 0o600); err != nil { diff --git a/internal/stack/backend_k3s_init_test.go b/internal/stack/backend_k3s_init_test.go new file mode 100644 index 00000000..037426b3 --- /dev/null +++ b/internal/stack/backend_k3s_init_test.go @@ -0,0 +1,48 @@ +package stack + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/ObolNetwork/obol-stack/internal/config" + "github.com/ObolNetwork/obol-stack/internal/ui" +) + +// TestK3sBackend_Init_SubstitutesNodeSANs verifies that Init renders the +// embedded k3s-config.yaml with every {{...}} placeholder resolved and the +// node's LAN IP + hostname injected into the tls-san block, so a worker node +// can join the server by either address. +func TestK3sBackend_Init_SubstitutesNodeSANs(t *testing.T) { + dir := t.TempDir() + cfg := &config.Config{ + ConfigDir: dir, + DataDir: filepath.Join(dir, "data"), + } + + b := &K3sBackend{} + if err := b.Init(cfg, ui.New(false), "teststack"); err != nil { + t.Fatalf("Init: %v", err) + } + + data, err := os.ReadFile(filepath.Join(dir, k3sConfigFile)) + if err != nil { + t.Fatalf("read rendered k3s config: %v", err) + } + + rendered := string(data) + if strings.Contains(rendered, "{{") { + t.Errorf("rendered k3s config still has an unsubstituted placeholder:\n%s", rendered) + } + + if !strings.Contains(rendered, "tls-san") { + t.Fatal("rendered k3s config has no tls-san block") + } + + for _, want := range []string{OutboundIP(), nodeHostname()} { + if !strings.Contains(rendered, want) { + t.Errorf("tls-san missing %q\n%s", want, rendered) + } + } +} diff --git a/internal/stack/node.go b/internal/stack/node.go new file mode 100644 index 00000000..46b393e8 --- /dev/null +++ b/internal/stack/node.go @@ -0,0 +1,128 @@ +package stack + +import ( + "fmt" + "net" + "os" + "os/exec" + "path/filepath" + "strings" + + "github.com/ObolNetwork/obol-stack/internal/config" +) + +// k3sAPIPort is the standard k3s supervisor/apiserver port a joining agent dials. +const k3sAPIPort = 6443 + +// OutboundIP returns this host's primary outbound IPv4 address, discovered by +// opening a UDP socket toward a public address (no packets are actually sent). +// It is the address a LAN peer would use to reach this host and the one k3s +// advertises as the node InternalIP. Falls back to 127.0.0.1. +func OutboundIP() string { + conn, err := net.Dial("udp", "8.8.8.8:80") + if err != nil { + return "127.0.0.1" + } + defer conn.Close() + + if addr, ok := conn.LocalAddr().(*net.UDPAddr); ok && addr.IP != nil { + return addr.IP.String() + } + + return "127.0.0.1" +} + +// nodeHostname returns this host's hostname, or "localhost" if unavailable. +func nodeHostname() string { + h, err := os.Hostname() + if err != nil || strings.TrimSpace(h) == "" { + return "localhost" + } + + return h +} + +// K3sNodeTokenPath returns the path to the k3s server join token for the k3s +// backend's data-dir. It mirrors `data-dir: {{DATA_DIR}}/k3s` in the embedded +// k3s-config.yaml — NOT the default /var/lib/rancher/k3s, which obol overrides. +func K3sNodeTokenPath(cfg *config.Config) string { + absDataDir, err := filepath.Abs(cfg.DataDir) + if err != nil { + absDataDir = cfg.DataDir + } + + return filepath.Join(absDataDir, "k3s", "server", "node-token") +} + +// ReadK3sNodeToken reads the root-owned k3s server join token via sudo. +func ReadK3sNodeToken(cfg *config.Config) (string, error) { + path := K3sNodeTokenPath(cfg) + + out, err := exec.Command("sudo", "cat", path).Output() + if err != nil { + return "", fmt.Errorf("read k3s node-token at %s (is this host the running k3s server?): %w", path, err) + } + + token := strings.TrimSpace(string(out)) + if token == "" { + return "", fmt.Errorf("k3s node-token at %s is empty", path) + } + + return token, nil +} + +// K3sServerURL returns the https URL a joining agent dials. When override is +// empty it uses this host's primary LAN IP and the standard k3s API port. +func K3sServerURL(override string) string { + if override != "" { + return override + } + + return fmt.Sprintf("https://%s:%d", OutboundIP(), k3sAPIPort) +} + +// K3sBinaryVersion returns the k3s release string (e.g. "v1.35.5+k3s1") of the +// k3s binary in BinDir, used to pin a joining agent to the server's version. +// Returns "" when it can't be determined (the installer then picks stable). +func K3sBinaryVersion(cfg *config.Config) string { + out, err := exec.Command(filepath.Join(cfg.BinDir, "k3s"), "--version").Output() + if err != nil { + return "" + } + + return parseK3sVersion(string(out)) +} + +// parseK3sVersion extracts the version token from `k3s --version` output, +// whose first line looks like: "k3s version v1.35.5+k3s1 (6a4781ad)". +func parseK3sVersion(out string) string { + firstLine, _, _ := strings.Cut(out, "\n") + + fields := strings.Fields(firstLine) + for i, f := range fields { + if f == "version" && i+1 < len(fields) { + return fields[i+1] + } + } + + return "" +} + +// K3sAgentJoinCommand builds the copy-pasteable one-liner an operator runs on a +// Linux worker node to join this stack's k3s cluster. When version is non-empty +// the agent install is pinned to it (agents should match the server version). +func K3sAgentJoinCommand(serverURL, token, version string) string { + var b strings.Builder + + b.WriteString("curl -sfL https://get.k3s.io | ") + + if version != "" { + b.WriteString("INSTALL_K3S_VERSION=" + version + " ") + } + + b.WriteString("K3S_URL=" + serverURL + " ") + b.WriteString("K3S_TOKEN='" + token + "' ") + b.WriteString("sh -s - agent") + + return b.String() +} diff --git a/internal/stack/node_test.go b/internal/stack/node_test.go new file mode 100644 index 00000000..cfe27486 --- /dev/null +++ b/internal/stack/node_test.go @@ -0,0 +1,111 @@ +package stack + +import ( + "strings" + "testing" + + "github.com/ObolNetwork/obol-stack/internal/config" +) + +func TestK3sAgentJoinCommand(t *testing.T) { + const ( + token = "K10abc123::server:def456" + server = "https://192.168.50.203:6443" + ) + + tests := []struct { + name string + version string + want []string + absent []string + }{ + { + name: "pinned version", + version: "v1.35.5+k3s1", + want: []string{ + "curl -sfL https://get.k3s.io | ", + "INSTALL_K3S_VERSION=v1.35.5+k3s1 ", + "K3S_URL=https://192.168.50.203:6443 ", + "K3S_TOKEN='K10abc123::server:def456' ", + "sh -s - agent", + }, + }, + { + name: "unpinned version omits INSTALL_K3S_VERSION", + version: "", + want: []string{ + "K3S_URL=https://192.168.50.203:6443 ", + "sh -s - agent", + }, + absent: []string{"INSTALL_K3S_VERSION"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := K3sAgentJoinCommand(server, token, tt.version) + for _, w := range tt.want { + if !strings.Contains(got, w) { + t.Errorf("join command missing %q\n got: %s", w, got) + } + } + + for _, a := range tt.absent { + if strings.Contains(got, a) { + t.Errorf("join command should not contain %q\n got: %s", a, got) + } + } + }) + } +} + +func TestParseK3sVersion(t *testing.T) { + tests := []struct { + name string + in string + want string + }{ + {"standard two-line output", "k3s version v1.35.5+k3s1 (6a4781ad)\ngo version go1.25.9\n", "v1.35.5+k3s1"}, + {"single line no trailing newline", "k3s version v1.30.0+k3s1", "v1.30.0+k3s1"}, + {"empty", "", ""}, + {"unexpected format", "something else entirely", ""}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := parseK3sVersion(tt.in); got != tt.want { + t.Errorf("parseK3sVersion(%q) = %q, want %q", tt.in, got, tt.want) + } + }) + } +} + +func TestK3sServerURL(t *testing.T) { + if got := K3sServerURL("https://example.local:6443"); got != "https://example.local:6443" { + t.Errorf("override should be returned verbatim, got %q", got) + } + + got := K3sServerURL("") + if !strings.HasPrefix(got, "https://") || !strings.HasSuffix(got, ":6443") { + t.Errorf("default server URL malformed: %q", got) + } +} + +func TestK3sNodeTokenPath(t *testing.T) { + cfg := &config.Config{DataDir: "/tmp/obol-data"} + + got := K3sNodeTokenPath(cfg) + if !strings.HasSuffix(got, "/k3s/server/node-token") { + t.Errorf("token path = %q, want suffix /k3s/server/node-token", got) + } + + if !strings.HasPrefix(got, "/tmp/obol-data") { + t.Errorf("token path should be under the data-dir, got %q", got) + } +} + +func TestOutboundIP_NeverEmpty(t *testing.T) { + if got := OutboundIP(); got == "" { + t.Error("OutboundIP must never return empty (falls back to 127.0.0.1)") + } +}