Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmd/obol/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,7 @@ GLOBAL OPTIONS:{{template "visibleFlagTemplate" .}}{{end}}
updateCommand(cfg),
upgradeCommand(cfg),
networkCommand(cfg),
nodeCommand(cfg),
hermesCommand(cfg),
openclawCommand(cfg),
sellCommand(cfg),
Expand Down
102 changes: 102 additions & 0 deletions cmd/obol/node.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package main

import (
"context"
"encoding/json"
"fmt"

"github.com/ObolNetwork/obol-stack/internal/config"
"github.com/ObolNetwork/obol-stack/internal/kubectl"
"github.com/ObolNetwork/obol-stack/internal/stack"
"github.com/urfave/cli/v3"
)

// nodeCommand builds the `obol node` command group: the parent command whose
// subcommands print a worker-node join command and list the cluster's nodes.
//
// Joining remote nodes is only meaningful on the k3s backend — a k3d/Docker
// master's flannel overlay is not routable off-host — which is why the token
// subcommand checks the active backend before doing anything.
func nodeCommand(cfg *config.Config) *cli.Command {
	subcommands := []*cli.Command{
		nodeTokenCommand(cfg),
		nodeListCommand(cfg),
	}

	group := &cli.Command{
		Name:     "node",
		Usage:    "Add and inspect worker nodes that join this stack's cluster (k3s backend)",
		Commands: subcommands,
	}

	return group
}

// nodeTokenCommand builds `obol node token`, which prints the shell one-liner
// an operator runs on a Linux worker node to join this stack's k3s cluster.
//
// The action:
//  1. loads the active backend and refuses to continue unless it is k3s;
//  2. reads the server join token;
//  3. derives the server URL (honouring --server-url) and the version of the
//     local k3s binary, and assembles the join command from them;
//  4. prints the result, either as indented JSON (--json) or as
//     human-readable text followed by a few hints.
//
// Any failure from loading the backend, reading the token, or encoding the
// JSON payload is returned to the CLI framework.
func nodeTokenCommand(cfg *config.Config) *cli.Command {
	return &cli.Command{
		Name:  "token",
		Usage: "Print the join command for adding a Linux worker node to this k3s cluster",
		Flags: []cli.Flag{
			&cli.StringFlag{
				Name:  "server-url",
				Usage: "Override the K3S_URL agents dial (default https://<this-host-LAN-IP>:6443)",
			},
			&cli.BoolFlag{Name: "json", Usage: "Output machine-readable JSON"},
		},
		Action: func(ctx context.Context, cmd *cli.Command) error {
			u := getUI(cmd)

			backend, err := stack.LoadBackend(cfg)
			if err != nil {
				return err
			}

			// Only a k3s server can hand out a usable join token.
			if backend.Name() != stack.BackendK3s {
				return fmt.Errorf(
					"obol node requires the k3s backend (current backend: %q)\n"+
						"A k3d/Docker master cannot accept remote node joins — its flannel overlay is not routable off-host.\n"+
						"Re-init on a Linux host with: obol stack init --backend k3s",
					backend.Name())
			}

			token, err := stack.ReadK3sNodeToken(cfg)
			if err != nil {
				return err
			}

			serverURL := stack.K3sServerURL(cmd.String("server-url"))
			version := stack.K3sBinaryVersion(cfg)
			joinCmd := stack.K3sAgentJoinCommand(serverURL, token, version)

			if cmd.Bool("json") {
				out, err := json.MarshalIndent(map[string]string{
					"serverUrl":   serverURL,
					"token":       token,
					"version":     version,
					"joinCommand": joinCmd,
				}, "", " ")
				if err != nil {
					// Surface the failure instead of printing an empty line.
					return fmt.Errorf("encode join details as JSON: %w", err)
				}

				fmt.Println(string(out))

				return nil
			}

			u.Info("Run this on a Linux worker node to join the cluster:")
			fmt.Printf("\n %s\n\n", joinCmd)
			u.Detail("Server", serverURL)
			u.Dim("Multi-homed / Wi-Fi node? append: --node-ip <node-LAN-IP> --flannel-iface <iface>")
			u.Dim("GPU node? label it at join: --node-label obol.tech/accelerator=nvidia")

			return nil
		},
	}
}

// nodeListCommand builds `obol node list`. Its action first verifies the
// cluster via kubectl.EnsureCluster, then runs
// `kubectl get nodes -o wide -L obol.tech/accelerator` so each node is shown
// with a column for its accelerator label.
func nodeListCommand(cfg *config.Config) *cli.Command {
	listNodes := func(ctx context.Context, cmd *cli.Command) error {
		err := kubectl.EnsureCluster(cfg)
		if err != nil {
			return err
		}

		binPath, kcPath := kubectl.Paths(cfg)

		return kubectl.Run(binPath, kcPath, "get", "nodes", "-o", "wide", "-L", "obol.tech/accelerator")
	}

	return &cli.Command{
		Name:   "list",
		Usage:  "List cluster nodes with their accelerator labels",
		Action: listNodes,
	}
}
6 changes: 5 additions & 1 deletion internal/embed/k3s-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,15 @@ data-dir: {{DATA_DIR}}/k3s
bind-address: 0.0.0.0
https-listen-port: 6443

# TLS SANs for local access
# TLS SANs for local + LAN access. k3s already auto-adds the node's primary
# IP, but listing it (and the hostname) explicitly lets worker nodes join via
# either address and keeps the API cert deterministic across restarts.
tls-san:
- "127.0.0.1"
- "localhost"
- "obol.stack"
- "{{NODE_IP}}"
- "{{NODE_HOSTNAME}}"

# Relax eviction thresholds: k3s reports imagefs capacity as 0 on shared
# filesystems, causing spurious disk-pressure taints with percentage thresholds.
Expand Down
3 changes: 3 additions & 0 deletions internal/stack/backend_k3s.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ func (b *K3sBackend) Init(cfg *config.Config, u *ui.UI, stackID string) error {
k3sConfig := embed.K3sConfig
k3sConfig = strings.ReplaceAll(k3sConfig, "{{STACK_ID}}", stackID)
k3sConfig = strings.ReplaceAll(k3sConfig, "{{DATA_DIR}}", absDataDir)
// LAN SANs so worker nodes can join this server by IP or hostname.
k3sConfig = strings.ReplaceAll(k3sConfig, "{{NODE_IP}}", OutboundIP())
k3sConfig = strings.ReplaceAll(k3sConfig, "{{NODE_HOSTNAME}}", nodeHostname())

k3sConfigPath := filepath.Join(cfg.ConfigDir, k3sConfigFile)
if err := os.WriteFile(k3sConfigPath, []byte(k3sConfig), 0o600); err != nil {
Expand Down
48 changes: 48 additions & 0 deletions internal/stack/backend_k3s_init_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package stack

import (
"os"
"path/filepath"
"strings"
"testing"

"github.com/ObolNetwork/obol-stack/internal/config"
"github.com/ObolNetwork/obol-stack/internal/ui"
)

// TestK3sBackend_Init_SubstitutesNodeSANs runs Init against a temporary
// config/data directory and inspects the k3s config file it writes. The
// rendered file must contain no leftover "{{" placeholder marker, must have a
// tls-san block, and must mention both OutboundIP() and nodeHostname() so a
// worker node can join the server by either address.
func TestK3sBackend_Init_SubstitutesNodeSANs(t *testing.T) {
	tmp := t.TempDir()
	cfg := &config.Config{
		ConfigDir: tmp,
		DataDir:   filepath.Join(tmp, "data"),
	}

	backend := new(K3sBackend)

	initErr := backend.Init(cfg, ui.New(false), "teststack")
	if initErr != nil {
		t.Fatalf("Init: %v", initErr)
	}

	renderedPath := filepath.Join(tmp, k3sConfigFile)

	raw, err := os.ReadFile(renderedPath)
	if err != nil {
		t.Fatalf("read rendered k3s config: %v", err)
	}

	rendered := string(raw)

	// Any surviving "{{" means a template placeholder was not replaced.
	if strings.Index(rendered, "{{") >= 0 {
		t.Errorf("rendered k3s config still has an unsubstituted placeholder:\n%s", rendered)
	}

	if strings.Index(rendered, "tls-san") < 0 {
		t.Fatal("rendered k3s config has no tls-san block")
	}

	expected := []string{OutboundIP(), nodeHostname()}
	for i := range expected {
		if strings.Contains(rendered, expected[i]) {
			continue
		}

		t.Errorf("tls-san missing %q\n%s", expected[i], rendered)
	}
}
128 changes: 128 additions & 0 deletions internal/stack/node.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
package stack

import (
"fmt"
"net"
"os"
"os/exec"
"path/filepath"
"strings"

"github.com/ObolNetwork/obol-stack/internal/config"
)

// k3sAPIPort is the standard k3s supervisor/apiserver port a joining agent dials.
// It is the port K3sServerURL puts in the default server URL when no override
// is given.
const k3sAPIPort = 6443

// OutboundIP reports the local address this host would use for outbound
// traffic. It "connects" a UDP socket to a public address — which only selects
// a route and local address, no packet is sent — and returns the socket's
// local IP as a string.
//
// If the dial fails, or the local address is not a UDP address with an IP,
// the loopback address 127.0.0.1 is returned instead.
func OutboundIP() string {
	const loopback = "127.0.0.1"

	sock, dialErr := net.Dial("udp", "8.8.8.8:80")
	if dialErr != nil {
		return loopback
	}
	defer sock.Close()

	local, isUDP := sock.LocalAddr().(*net.UDPAddr)
	if !isUDP || local.IP == nil {
		return loopback
	}

	return local.IP.String()
}

// nodeHostname reports the hostname from os.Hostname. When that call fails or
// yields only whitespace, it returns "localhost" instead. A usable hostname is
// returned exactly as the OS reported it (not trimmed).
func nodeHostname() string {
	name, err := os.Hostname()

	switch {
	case err != nil:
		return "localhost"
	case strings.TrimSpace(name) == "":
		return "localhost"
	default:
		return name
	}
}

// K3sNodeTokenPath returns where the k3s server join token lives for this
// stack: <DataDir>/k3s/server/node-token, with DataDir made absolute when
// possible. The "k3s" segment mirrors `data-dir: {{DATA_DIR}}/k3s` in the
// embedded k3s-config.yaml — it is NOT the default /var/lib/rancher/k3s,
// which obol overrides.
func K3sNodeTokenPath(cfg *config.Config) string {
	dataDir := cfg.DataDir

	// Keep the configured value as-is if it cannot be made absolute.
	if abs, err := filepath.Abs(dataDir); err == nil {
		dataDir = abs
	}

	return filepath.Join(dataDir, "k3s", "server", "node-token")
}

// ReadK3sNodeToken returns the k3s server join token, read by running
// `sudo cat` on the path from K3sNodeTokenPath (sudo is used because the token
// file is expected to be root-owned). Surrounding whitespace is trimmed.
//
// It returns an error when the command fails or when the file holds nothing
// but whitespace.
func ReadK3sNodeToken(cfg *config.Config) (string, error) {
	tokenPath := K3sNodeTokenPath(cfg)

	catCmd := exec.Command("sudo", "cat", tokenPath)

	raw, err := catCmd.Output()
	if err != nil {
		return "", fmt.Errorf("read k3s node-token at %s (is this host the running k3s server?): %w", tokenPath, err)
	}

	if trimmed := strings.TrimSpace(string(raw)); trimmed != "" {
		return trimmed, nil
	}

	return "", fmt.Errorf("k3s node-token at %s is empty", tokenPath)
}

// K3sServerURL returns the URL a joining agent should dial. A non-empty
// override is returned verbatim; otherwise the URL is built as
// https://<OutboundIP()>:<k3sAPIPort>.
func K3sServerURL(override string) string {
	if override == "" {
		host := OutboundIP()

		return fmt.Sprintf("https://%s:%d", host, k3sAPIPort)
	}

	return override
}

// K3sBinaryVersion runs `<BinDir>/k3s --version` and returns the release
// string parsed from its output (e.g. "v1.35.5+k3s1"), so a joining agent can
// be pinned to the server's version. It returns "" when the binary cannot be
// run or the output has no recognisable version; K3sAgentJoinCommand then
// leaves the version pin out of the join command.
func K3sBinaryVersion(cfg *config.Config) string {
	k3sBin := filepath.Join(cfg.BinDir, "k3s")

	stdout, err := exec.Command(k3sBin, "--version").Output()
	if err == nil {
		return parseK3sVersion(string(stdout))
	}

	return ""
}

// parseK3sVersion pulls the release string out of `k3s --version` output.
// Only the first line is examined; it is expected to look like
// "k3s version v1.35.5+k3s1 (6a4781ad)". The result is the whitespace-separated
// word that follows the first "version" word which has a successor, or "" if
// there is none.
func parseK3sVersion(out string) string {
	line := out
	if nl := strings.IndexByte(out, '\n'); nl >= 0 {
		line = out[:nl]
	}

	words := strings.Fields(line)

	// Stop one short of the end: a trailing "version" has nothing after it.
	for idx := 0; idx+1 < len(words); idx++ {
		if words[idx] == "version" {
			return words[idx+1]
		}
	}

	return ""
}

// K3sAgentJoinCommand builds the copy-pasteable one-liner an operator runs on a
// Linux worker node to join this stack's k3s cluster:
//
//	curl -sfL https://get.k3s.io | [INSTALL_K3S_VERSION=<version>] K3S_URL=<serverURL> K3S_TOKEN='<token>' sh -s - agent
//
// When version is non-empty the agent install is pinned to it (agents should
// match the server version); when empty the INSTALL_K3S_VERSION assignment is
// omitted entirely.
//
// Because the result is pasted into a shell, every value is made shell-safe:
// the token is always single-quoted with embedded single quotes escaped, and
// serverURL / version are single-quoted only when they contain a character
// outside a conservative safe set. For ordinary URLs and k3s version strings
// the output is therefore unchanged from the plain unquoted form.
func K3sAgentJoinCommand(serverURL, token, version string) string {
	// quote wraps s in single quotes; an embedded ' is written as '\'' (close
	// the quote, emit an escaped quote, reopen), the POSIX-portable idiom.
	quote := func(s string) string {
		return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'"
	}

	// word returns s untouched when every byte is inert to the shell, and the
	// quoted form otherwise.
	word := func(s string) string {
		for i := 0; i < len(s); i++ {
			c := s[i]

			switch {
			case c >= 'a' && c <= 'z', c >= 'A' && c <= 'Z', c >= '0' && c <= '9':
			case strings.IndexByte("@%+=:,./_-", c) >= 0:
			default:
				return quote(s)
			}
		}

		return s
	}

	var b strings.Builder

	b.WriteString("curl -sfL https://get.k3s.io | ")

	if version != "" {
		b.WriteString("INSTALL_K3S_VERSION=" + word(version) + " ")
	}

	b.WriteString("K3S_URL=" + word(serverURL) + " ")
	b.WriteString("K3S_TOKEN=" + quote(token) + " ")
	b.WriteString("sh -s - agent")

	return b.String()
}
Loading
Loading