From 41f11cd707221a7cee26cec9dafacd1855f4b4be Mon Sep 17 00:00:00 2001 From: eric Date: Sun, 22 Mar 2026 22:01:44 -0700 Subject: [PATCH 01/17] =?UTF-8?q?Refactor=20team=20=E2=86=92=20org=20acros?= =?UTF-8?q?s=20multitenant=20control=20plane?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename the multi-tenant "team" concept to "org" throughout: - DB tables: duckgres_teams → duckgres_orgs, duckgres_team_users → duckgres_org_users - DB column: team_name → org_id - Go types: Team → Org, TeamUser → OrgUser, TeamConfig → OrgConfig, TeamStack → OrgStack, TeamRouter → OrgRouter, etc. - API routes: /teams → /orgs - K8s labels: duckgres/team → duckgres/org - Prometheus metrics: duckgres_team_* → duckgres_org_* - Files: team_router.go → org_router.go, team_reserved_pool.go → org_reserved_pool.go, teams.html → orgs.html Note: config store DB must be recreated (GORM AutoMigrate creates new tables but does not rename existing ones). Co-Authored-By: Claude Opus 4.6 (1M context) --- controlplane/admin/api.go | 234 +++++++++--------- controlplane/admin/api_postgres_test.go | 10 +- controlplane/admin/api_test.go | 204 +++++++-------- controlplane/admin/dashboard.go | 2 +- controlplane/admin/static/index.html | 22 +- .../admin/static/{teams.html => orgs.html} | 30 +-- controlplane/admin/static/sessions.html | 6 +- controlplane/admin/static/settings.html | 2 +- controlplane/admin/static/workers.html | 6 +- controlplane/configstore/models.go | 42 ++-- controlplane/configstore/store.go | 52 ++-- controlplane/configstore/store_test.go | 106 ++++---- controlplane/control.go | 34 +-- controlplane/flight_ingress_metrics_k8s.go | 74 +++--- controlplane/k8s_pool.go | 20 +- controlplane/k8s_pool_test.go | 12 +- controlplane/multitenant.go | 48 ++-- controlplane/multitenant_stub.go | 2 +- ..._reserved_pool.go => org_reserved_pool.go} | 56 ++--- ...pool_test.go => org_reserved_pool_test.go} | 22 +- .../{team_router.go => org_router.go} | 146 +++++------ ...team_router_test.go => org_router_test.go} | 6 +- controlplane/worker_mgr_test.go | 14 +- controlplane/worker_pool.go | 4 +- controlplane/worker_state.go | 12 +- controlplane/worker_state_test.go | 14 +- k8s/local-config-store.seed.sql | 12 +- .../managed_warehouse_postgres_test.go | 78 +++--- tests/k8s/k8s_test.go | 12 +- 29 files changed, 641 insertions(+), 641 deletions(-) rename controlplane/admin/static/{teams.html => orgs.html} (80%) rename controlplane/{team_reserved_pool.go => org_reserved_pool.go} (59%) rename controlplane/{team_reserved_pool_test.go => org_reserved_pool_test.go} (78%) rename controlplane/{team_router.go => org_router.go} (52%) rename controlplane/{team_router_test.go => org_router_test.go} (83%) diff --git a/controlplane/admin/api.go b/controlplane/admin/api.go index 0df1dd8..2648242 100644 --- a/controlplane/admin/api.go +++ b/controlplane/admin/api.go @@ -14,12 +14,12 @@ import ( "gorm.io/gorm/clause" ) -var errWarehousePayloadNotAllowed = errors.New("warehouse payload must be updated via /teams/:name/warehouse") +var errWarehousePayloadNotAllowed = errors.New("warehouse payload must be updated via /orgs/:name/warehouse") // WorkerStatus represents a worker's current status for the API. 
type WorkerStatus struct { ID int `json:"id"` - Team string `json:"team"` + Org string `json:"org"` ActiveSessions int `json:"active_sessions"` Status string `json:"status"` } @@ -28,19 +28,19 @@ type WorkerStatus struct { type SessionStatus struct { PID int32 `json:"pid"` WorkerID int `json:"worker_id"` - Team string `json:"team"` + Org string `json:"org"` } // ClusterStatus aggregates cluster state for the dashboard. type ClusterStatus struct { - TotalTeams int `json:"total_teams"` - TotalWorkers int `json:"total_workers"` - TotalSessions int `json:"total_sessions"` - Teams []TeamStatus `json:"teams"` + TotalOrgs int `json:"total_orgs"` + TotalWorkers int `json:"total_workers"` + TotalSessions int `json:"total_sessions"` + Orgs []OrgStatus `json:"orgs"` } -// TeamStatus is a per-team summary. -type TeamStatus struct { +// OrgStatus is a per-org summary. +type OrgStatus struct { Name string `json:"name"` Workers int `json:"workers"` ActiveSessions int `json:"active_sessions"` @@ -48,33 +48,33 @@ type TeamStatus struct { MemoryBudget string `json:"memory_budget"` } -// TeamStackInfo provides info about a team's live state. -// Implemented by the controlplane.TeamRouter via adapter. -type TeamStackInfo interface { - // AllTeamStats returns per-team worker and session counts. - AllTeamStats() []TeamStatus - // AllWorkerStatuses returns all workers across teams. +// OrgStackInfo provides info about an org's live state. +// Implemented by the controlplane.OrgRouter via adapter. +type OrgStackInfo interface { + // AllOrgStats returns per-org worker and session counts. + AllOrgStats() []OrgStatus + // AllWorkerStatuses returns all workers across orgs. AllWorkerStatuses() []WorkerStatus - // AllSessionStatuses returns all active sessions across teams. + // AllSessionStatuses returns all active sessions across orgs. AllSessionStatuses() []SessionStatus } // RegisterAPI registers all admin REST endpoints on the given router group. 
-func RegisterAPI(r *gin.RouterGroup, store *configstore.ConfigStore, info TeamStackInfo) { +func RegisterAPI(r *gin.RouterGroup, store *configstore.ConfigStore, info OrgStackInfo) { registerAPIWithStore(r, newGormAPIStore(store), info) } -func registerAPIWithStore(r *gin.RouterGroup, store apiStore, info TeamStackInfo) { +func registerAPIWithStore(r *gin.RouterGroup, store apiStore, info OrgStackInfo) { h := &apiHandler{store: store, info: info} - // Teams CRUD - r.GET("/teams", h.listTeams) - r.POST("/teams", h.createTeam) - r.GET("/teams/:name", h.getTeam) - r.PUT("/teams/:name", h.updateTeam) - r.DELETE("/teams/:name", h.deleteTeam) - r.GET("/teams/:name/warehouse", h.getManagedWarehouse) - r.PUT("/teams/:name/warehouse", h.putManagedWarehouse) + // Orgs CRUD + r.GET("/orgs", h.listOrgs) + r.POST("/orgs", h.createOrg) + r.GET("/orgs/:name", h.getOrg) + r.PUT("/orgs/:name", h.updateOrg) + r.DELETE("/orgs/:name", h.deleteOrg) + r.GET("/orgs/:name/warehouse", h.getManagedWarehouse) + r.PUT("/orgs/:name/warehouse", h.putManagedWarehouse) // Users CRUD r.GET("/users", h.listUsers) @@ -104,20 +104,20 @@ func registerAPIWithStore(r *gin.RouterGroup, store apiStore, info TeamStackInfo } type apiStore interface { - ListTeams() ([]configstore.Team, error) - CreateTeam(team *configstore.Team) error - GetTeam(name string) (*configstore.Team, error) - UpdateTeam(name string, updates configstore.Team) (*configstore.Team, bool, error) - DeleteTeam(name string) (bool, error) - - ListUsers() ([]configstore.TeamUser, error) - CreateUser(user *configstore.TeamUser) error - GetUser(username string) (*configstore.TeamUser, error) - UpdateUser(username, passwordHash, teamName string) (*configstore.TeamUser, bool, error) + ListOrgs() ([]configstore.Org, error) + CreateOrg(org *configstore.Org) error + GetOrg(name string) (*configstore.Org, error) + UpdateOrg(name string, updates configstore.Org) (*configstore.Org, bool, error) + DeleteOrg(name string) (bool, error) + + ListUsers() ([]configstore.OrgUser, error) + CreateUser(user *configstore.OrgUser) error + GetUser(username string) (*configstore.OrgUser, error) + UpdateUser(username, passwordHash, orgID string) (*configstore.OrgUser, bool, error) DeleteUser(username string) (bool, error) - GetManagedWarehouse(teamName string) (*configstore.ManagedWarehouse, error) - UpsertManagedWarehouse(teamName string, warehouse *configstore.ManagedWarehouse) (*configstore.ManagedWarehouse, bool, error) + GetManagedWarehouse(orgID string) (*configstore.ManagedWarehouse, error) + UpsertManagedWarehouse(orgID string, warehouse *configstore.ManagedWarehouse) (*configstore.ManagedWarehouse, bool, error) GetGlobalConfig() (configstore.GlobalConfig, error) SaveGlobalConfig(cfg *configstore.GlobalConfig) error @@ -141,29 +141,29 @@ func (s *gormAPIStore) db() *gorm.DB { return s.store.DB() } -func (s *gormAPIStore) ListTeams() ([]configstore.Team, error) { - var teams []configstore.Team - if err := s.db().Preload("Users").Preload("Warehouse").Find(&teams).Error; err != nil { +func (s *gormAPIStore) ListOrgs() ([]configstore.Org, error) { + var orgs []configstore.Org + if err := s.db().Preload("Users").Preload("Warehouse").Find(&orgs).Error; err != nil { return nil, err } - return teams, nil + return orgs, nil } -func (s *gormAPIStore) CreateTeam(team *configstore.Team) error { - team.Warehouse = nil - return s.db().Omit("Warehouse").Create(team).Error +func (s *gormAPIStore) CreateOrg(org *configstore.Org) error { + org.Warehouse = nil + return 
s.db().Omit("Warehouse").Create(org).Error } -func (s *gormAPIStore) GetTeam(name string) (*configstore.Team, error) { - var team configstore.Team - if err := s.db().Preload("Users").Preload("Warehouse").First(&team, "name = ?", name).Error; err != nil { +func (s *gormAPIStore) GetOrg(name string) (*configstore.Org, error) { + var org configstore.Org + if err := s.db().Preload("Users").Preload("Warehouse").First(&org, "name = ?", name).Error; err != nil { return nil, err } - return &team, nil + return &org, nil } -func (s *gormAPIStore) UpdateTeam(name string, updates configstore.Team) (*configstore.Team, bool, error) { - result := s.db().Model(&configstore.Team{}).Where("name = ?", name).Updates(map[string]interface{}{ +func (s *gormAPIStore) UpdateOrg(name string, updates configstore.Org) (*configstore.Org, bool, error) { + result := s.db().Model(&configstore.Org{}).Where("name = ?", name).Updates(map[string]interface{}{ "max_workers": updates.MaxWorkers, "memory_budget": updates.MemoryBudget, "idle_timeout_s": updates.IdleTimeoutS, @@ -174,20 +174,20 @@ func (s *gormAPIStore) UpdateTeam(name string, updates configstore.Team) (*confi if result.RowsAffected == 0 { return nil, false, nil } - team, err := s.GetTeam(name) + org, err := s.GetOrg(name) if err != nil { return nil, true, err } - return team, true, nil + return org, true, nil } -func (s *gormAPIStore) DeleteTeam(name string) (bool, error) { +func (s *gormAPIStore) DeleteOrg(name string) (bool, error) { returnRows := int64(0) err := s.db().Transaction(func(tx *gorm.DB) error { - if err := tx.Where("team_name = ?", name).Delete(&configstore.TeamUser{}).Error; err != nil { + if err := tx.Where("org_id = ?", name).Delete(&configstore.OrgUser{}).Error; err != nil { return err } - result := tx.Where("name = ?", name).Delete(&configstore.Team{}) + result := tx.Where("name = ?", name).Delete(&configstore.Org{}) if result.Error != nil { return result.Error } @@ -200,35 +200,35 @@ func (s *gormAPIStore) DeleteTeam(name string) (bool, error) { return returnRows > 0, nil } -func (s *gormAPIStore) ListUsers() ([]configstore.TeamUser, error) { - var users []configstore.TeamUser +func (s *gormAPIStore) ListUsers() ([]configstore.OrgUser, error) { + var users []configstore.OrgUser if err := s.db().Find(&users).Error; err != nil { return nil, err } return users, nil } -func (s *gormAPIStore) CreateUser(user *configstore.TeamUser) error { +func (s *gormAPIStore) CreateUser(user *configstore.OrgUser) error { return s.db().Create(user).Error } -func (s *gormAPIStore) GetUser(username string) (*configstore.TeamUser, error) { - var user configstore.TeamUser +func (s *gormAPIStore) GetUser(username string) (*configstore.OrgUser, error) { + var user configstore.OrgUser if err := s.db().First(&user, "username = ?", username).Error; err != nil { return nil, err } return &user, nil } -func (s *gormAPIStore) UpdateUser(username, passwordHash, teamName string) (*configstore.TeamUser, bool, error) { +func (s *gormAPIStore) UpdateUser(username, passwordHash, orgID string) (*configstore.OrgUser, bool, error) { updates := map[string]interface{}{} if passwordHash != "" { updates["password"] = passwordHash } - if teamName != "" { - updates["team_name"] = teamName + if orgID != "" { + updates["org_id"] = orgID } - result := s.db().Model(&configstore.TeamUser{}).Where("username = ?", username).Updates(updates) + result := s.db().Model(&configstore.OrgUser{}).Where("username = ?", username).Updates(updates) if result.Error != nil { return nil, false, result.Error } 
@@ -243,39 +243,39 @@ func (s *gormAPIStore) UpdateUser(username, passwordHash, teamName string) (*con } func (s *gormAPIStore) DeleteUser(username string) (bool, error) { - result := s.db().Where("username = ?", username).Delete(&configstore.TeamUser{}) + result := s.db().Where("username = ?", username).Delete(&configstore.OrgUser{}) if result.Error != nil { return false, result.Error } return result.RowsAffected > 0, nil } -func (s *gormAPIStore) GetManagedWarehouse(teamName string) (*configstore.ManagedWarehouse, error) { +func (s *gormAPIStore) GetManagedWarehouse(orgID string) (*configstore.ManagedWarehouse, error) { var warehouse configstore.ManagedWarehouse - if err := s.db().First(&warehouse, "team_name = ?", teamName).Error; err != nil { + if err := s.db().First(&warehouse, "org_id = ?", orgID).Error; err != nil { return nil, err } return &warehouse, nil } -func (s *gormAPIStore) UpsertManagedWarehouse(teamName string, warehouse *configstore.ManagedWarehouse) (*configstore.ManagedWarehouse, bool, error) { +func (s *gormAPIStore) UpsertManagedWarehouse(orgID string, warehouse *configstore.ManagedWarehouse) (*configstore.ManagedWarehouse, bool, error) { var count int64 - if err := s.db().Model(&configstore.Team{}).Where("name = ?", teamName).Count(&count).Error; err != nil { + if err := s.db().Model(&configstore.Org{}).Where("name = ?", orgID).Count(&count).Error; err != nil { return nil, false, err } if count == 0 { return nil, false, nil } - warehouse.TeamName = teamName + warehouse.OrgID = orgID warehouse.UpdatedAt = time.Now().UTC() if err := s.db().Clauses(clause.OnConflict{ - Columns: []clause.Column{{Name: "team_name"}}, + Columns: []clause.Column{{Name: "org_id"}}, DoUpdates: clause.AssignmentColumns(managedWarehouseUpsertColumns()), }).Create(warehouse).Error; err != nil { return nil, true, err } - stored, err := s.GetManagedWarehouse(teamName) + stored, err := s.GetManagedWarehouse(orgID) if err != nil { return nil, true, err } @@ -386,7 +386,7 @@ func (s *gormAPIStore) SaveQueryLogConfig(cfg *configstore.QueryLogConfig) error type apiHandler struct { store apiStore - info TeamStackInfo + info OrgStackInfo } type managedWarehouseRequest struct { @@ -447,87 +447,87 @@ func decodeStrictWarehouseRequest(c *gin.Context, dst *managedWarehouseRequest) return dec.Decode(dst) } -// --- Teams --- +// --- Orgs --- -func (h *apiHandler) listTeams(c *gin.Context) { - teams, err := h.store.ListTeams() +func (h *apiHandler) listOrgs(c *gin.Context) { + orgs, err := h.store.ListOrgs() if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } - c.JSON(http.StatusOK, teams) + c.JSON(http.StatusOK, orgs) } -func (h *apiHandler) createTeam(c *gin.Context) { - var team configstore.Team - if err := c.ShouldBindJSON(&team); err != nil { +func (h *apiHandler) createOrg(c *gin.Context) { + var org configstore.Org + if err := c.ShouldBindJSON(&org); err != nil { c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) return } - if err := validateTeamMutationPayload(&team); err != nil { + if err := validateOrgMutationPayload(&org); err != nil { c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) return } - if team.Name == "" { + if org.Name == "" { c.JSON(http.StatusBadRequest, gin.H{"error": "name is required"}) return } - if err := h.store.CreateTeam(&team); err != nil { + if err := h.store.CreateOrg(&org); err != nil { c.JSON(http.StatusConflict, gin.H{"error": err.Error()}) return } - c.JSON(http.StatusCreated, team) + c.JSON(http.StatusCreated, org) 
} -func (h *apiHandler) getTeam(c *gin.Context) { +func (h *apiHandler) getOrg(c *gin.Context) { name := c.Param("name") - team, err := h.store.GetTeam(name) + org, err := h.store.GetOrg(name) if err != nil { - c.JSON(http.StatusNotFound, gin.H{"error": "team not found"}) + c.JSON(http.StatusNotFound, gin.H{"error": "org not found"}) return } - c.JSON(http.StatusOK, team) + c.JSON(http.StatusOK, org) } -func (h *apiHandler) updateTeam(c *gin.Context) { +func (h *apiHandler) updateOrg(c *gin.Context) { name := c.Param("name") - var updates configstore.Team + var updates configstore.Org if err := c.ShouldBindJSON(&updates); err != nil { c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) return } - if err := validateTeamMutationPayload(&updates); err != nil { + if err := validateOrgMutationPayload(&updates); err != nil { c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) return } - team, ok, err := h.store.UpdateTeam(name, updates) + org, ok, err := h.store.UpdateOrg(name, updates) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } if !ok { - c.JSON(http.StatusNotFound, gin.H{"error": "team not found"}) + c.JSON(http.StatusNotFound, gin.H{"error": "org not found"}) return } - c.JSON(http.StatusOK, team) + c.JSON(http.StatusOK, org) } -func (h *apiHandler) deleteTeam(c *gin.Context) { +func (h *apiHandler) deleteOrg(c *gin.Context) { name := c.Param("name") - ok, err := h.store.DeleteTeam(name) + ok, err := h.store.DeleteOrg(name) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } if !ok { - c.JSON(http.StatusNotFound, gin.H{"error": "team not found"}) + c.JSON(http.StatusNotFound, gin.H{"error": "org not found"}) return } c.JSON(http.StatusOK, gin.H{"deleted": name}) } -func validateTeamMutationPayload(team *configstore.Team) error { - if team != nil && team.Warehouse != nil { +func validateOrgMutationPayload(org *configstore.Org) error { + if org != nil && org.Warehouse != nil { return errWarehousePayloadNotAllowed } return nil @@ -547,20 +547,20 @@ func (h *apiHandler) getManagedWarehouse(c *gin.Context) { } func (h *apiHandler) putManagedWarehouse(c *gin.Context) { - teamName := c.Param("name") + orgID := c.Param("name") var req managedWarehouseRequest if err := decodeStrictWarehouseRequest(c, &req); err != nil { c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) return } warehouse := req.toManagedWarehouse() - stored, ok, err := h.store.UpsertManagedWarehouse(teamName, &warehouse) + stored, ok, err := h.store.UpsertManagedWarehouse(orgID, &warehouse) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } if !ok { - c.JSON(http.StatusNotFound, gin.H{"error": "team not found"}) + c.JSON(http.StatusNotFound, gin.H{"error": "org not found"}) return } c.JSON(http.StatusOK, stored) @@ -578,18 +578,18 @@ func (h *apiHandler) listUsers(c *gin.Context) { } func (h *apiHandler) createUser(c *gin.Context) { - // Use a raw struct because TeamUser.Password has json:"-" + // Use a raw struct because OrgUser.Password has json:"-" var raw struct { Username string `json:"username"` Password string `json:"password"` - TeamName string `json:"team_name"` + OrgID string `json:"org_id"` } if err := c.ShouldBindJSON(&raw); err != nil { c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) return } - if raw.Username == "" || raw.TeamName == "" { - c.JSON(http.StatusBadRequest, gin.H{"error": "username and team_name are required"}) + if raw.Username == "" || 
raw.OrgID == "" { + c.JSON(http.StatusBadRequest, gin.H{"error": "username and org_id are required"}) return } if raw.Password == "" { @@ -601,10 +601,10 @@ func (h *apiHandler) createUser(c *gin.Context) { c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to hash password"}) return } - user := configstore.TeamUser{ + user := configstore.OrgUser{ Username: raw.Username, Password: hash, - TeamName: raw.TeamName, + OrgID: raw.OrgID, } if err := h.store.CreateUser(&user); err != nil { c.JSON(http.StatusConflict, gin.H{"error": err.Error()}) @@ -627,7 +627,7 @@ func (h *apiHandler) updateUser(c *gin.Context) { username := c.Param("username") var raw struct { Password string `json:"password"` - TeamName string `json:"team_name"` + OrgID string `json:"org_id"` } if err := c.ShouldBindJSON(&raw); err != nil { c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) @@ -642,7 +642,7 @@ func (h *apiHandler) updateUser(c *gin.Context) { } passwordHash = hash } - user, ok, err := h.store.UpdateUser(username, passwordHash, raw.TeamName) + user, ok, err := h.store.UpdateUser(username, passwordHash, raw.OrgID) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return @@ -790,18 +790,18 @@ func (h *apiHandler) getClusterStatus(c *gin.Context) { return } - teamStats := h.info.AllTeamStats() + orgStats := h.info.AllOrgStats() totalWorkers := 0 totalSessions := 0 - for _, ts := range teamStats { - totalWorkers += ts.Workers - totalSessions += ts.ActiveSessions + for _, os := range orgStats { + totalWorkers += os.Workers + totalSessions += os.ActiveSessions } c.JSON(http.StatusOK, ClusterStatus{ - TotalTeams: len(teamStats), + TotalOrgs: len(orgStats), TotalWorkers: totalWorkers, TotalSessions: totalSessions, - Teams: teamStats, + Orgs: orgStats, }) } diff --git a/controlplane/admin/api_postgres_test.go b/controlplane/admin/api_postgres_test.go index bec0f10..d8d4380 100644 --- a/controlplane/admin/api_postgres_test.go +++ b/controlplane/admin/api_postgres_test.go @@ -87,8 +87,8 @@ func resetConfigStoreTables(t *testing.T, db *gorm.DB) { for _, model := range []any{ &configstore.ManagedWarehouse{}, - &configstore.TeamUser{}, - &configstore.Team{}, + &configstore.OrgUser{}, + &configstore.Org{}, } { if err := db.Session(&gorm.Session{AllowGlobalUpdate: true}).Delete(model).Error; err != nil { t.Fatalf("delete %T: %v", model, err) @@ -100,13 +100,13 @@ func TestUpsertManagedWarehousePreservesCreatedAt(t *testing.T) { store := newPostgresConfigStore(t) apiStore := newGormAPIStore(store).(*gormAPIStore) - if err := store.DB().Create(&configstore.Team{Name: "analytics"}).Error; err != nil { - t.Fatalf("create team: %v", err) + if err := store.DB().Create(&configstore.Org{Name: "analytics"}).Error; err != nil { + t.Fatalf("create org: %v", err) } createdAt := time.Date(2024, time.January, 2, 3, 4, 5, 0, time.UTC) original := &configstore.ManagedWarehouse{ - TeamName: "analytics", + OrgID: "analytics", State: configstore.ManagedWarehouseStatePending, CreatedAt: createdAt, UpdatedAt: createdAt, diff --git a/controlplane/admin/api_test.go b/controlplane/admin/api_test.go index 537fe5e..64d08b2 100644 --- a/controlplane/admin/api_test.go +++ b/controlplane/admin/api_test.go @@ -18,67 +18,67 @@ import ( ) type fakeAPIStore struct { - teams map[string]*configstore.Team - users map[string]*configstore.TeamUser + orgs map[string]*configstore.Org + users map[string]*configstore.OrgUser warehouses map[string]*configstore.ManagedWarehouse } func newFakeAPIStore() *fakeAPIStore { 
return &fakeAPIStore{ - teams: make(map[string]*configstore.Team), - users: make(map[string]*configstore.TeamUser), + orgs: make(map[string]*configstore.Org), + users: make(map[string]*configstore.OrgUser), warehouses: make(map[string]*configstore.ManagedWarehouse), } } -func (s *fakeAPIStore) ListTeams() ([]configstore.Team, error) { - teams := make([]configstore.Team, 0, len(s.teams)) - for _, team := range s.teams { - teams = append(teams, *copyTeam(team)) +func (s *fakeAPIStore) ListOrgs() ([]configstore.Org, error) { + orgs := make([]configstore.Org, 0, len(s.orgs)) + for _, org := range s.orgs { + orgs = append(orgs, *copyOrg(org)) } - return teams, nil + return orgs, nil } -func (s *fakeAPIStore) CreateTeam(team *configstore.Team) error { - if _, ok := s.teams[team.Name]; ok { - return errors.New("duplicate team") +func (s *fakeAPIStore) CreateOrg(org *configstore.Org) error { + if _, ok := s.orgs[org.Name]; ok { + return errors.New("duplicate org") } - clone := copyTeam(team) + clone := copyOrg(org) clone.Warehouse = nil - s.teams[team.Name] = clone + s.orgs[org.Name] = clone return nil } -func (s *fakeAPIStore) GetTeam(name string) (*configstore.Team, error) { - team, ok := s.teams[name] +func (s *fakeAPIStore) GetOrg(name string) (*configstore.Org, error) { + org, ok := s.orgs[name] if !ok { return nil, gorm.ErrRecordNotFound } - return copyTeam(team), nil + return copyOrg(org), nil } -func (s *fakeAPIStore) UpdateTeam(name string, updates configstore.Team) (*configstore.Team, bool, error) { - team, ok := s.teams[name] +func (s *fakeAPIStore) UpdateOrg(name string, updates configstore.Org) (*configstore.Org, bool, error) { + org, ok := s.orgs[name] if !ok { return nil, false, nil } - team.MaxWorkers = updates.MaxWorkers - team.MemoryBudget = updates.MemoryBudget - team.IdleTimeoutS = updates.IdleTimeoutS - return copyTeam(team), true, nil + org.MaxWorkers = updates.MaxWorkers + org.MemoryBudget = updates.MemoryBudget + org.IdleTimeoutS = updates.IdleTimeoutS + return copyOrg(org), true, nil } -func (s *fakeAPIStore) DeleteTeam(name string) (bool, error) { - if _, ok := s.teams[name]; !ok { +func (s *fakeAPIStore) DeleteOrg(name string) (bool, error) { + if _, ok := s.orgs[name]; !ok { return false, nil } - delete(s.teams, name) + delete(s.orgs, name) delete(s.warehouses, name) return true, nil } -func (s *fakeAPIStore) ListUsers() ([]configstore.TeamUser, error) { - users := make([]configstore.TeamUser, 0, len(s.users)) +func (s *fakeAPIStore) ListUsers() ([]configstore.OrgUser, error) { + users := make([]configstore.OrgUser, 0, len(s.users)) for _, user := range s.users { clone := *user users = append(users, clone) @@ -86,7 +86,7 @@ func (s *fakeAPIStore) ListUsers() ([]configstore.TeamUser, error) { return users, nil } -func (s *fakeAPIStore) CreateUser(user *configstore.TeamUser) error { +func (s *fakeAPIStore) CreateUser(user *configstore.OrgUser) error { if _, ok := s.users[user.Username]; ok { return errors.New("duplicate user") } @@ -95,7 +95,7 @@ func (s *fakeAPIStore) CreateUser(user *configstore.TeamUser) error { return nil } -func (s *fakeAPIStore) GetUser(username string) (*configstore.TeamUser, error) { +func (s *fakeAPIStore) GetUser(username string) (*configstore.OrgUser, error) { user, ok := s.users[username] if !ok { return nil, gorm.ErrRecordNotFound @@ -104,7 +104,7 @@ func (s *fakeAPIStore) GetUser(username string) (*configstore.TeamUser, error) { return &clone, nil } -func (s *fakeAPIStore) UpdateUser(username, passwordHash, teamName string) 
(*configstore.TeamUser, bool, error) { +func (s *fakeAPIStore) UpdateUser(username, passwordHash, orgID string) (*configstore.OrgUser, bool, error) { user, ok := s.users[username] if !ok { return nil, false, nil @@ -112,8 +112,8 @@ func (s *fakeAPIStore) UpdateUser(username, passwordHash, teamName string) (*con if passwordHash != "" { user.Password = passwordHash } - if teamName != "" { - user.TeamName = teamName + if orgID != "" { + user.OrgID = orgID } clone := *user return &clone, true, nil @@ -127,23 +127,23 @@ func (s *fakeAPIStore) DeleteUser(username string) (bool, error) { return true, nil } -func (s *fakeAPIStore) GetManagedWarehouse(teamName string) (*configstore.ManagedWarehouse, error) { - warehouse, ok := s.warehouses[teamName] +func (s *fakeAPIStore) GetManagedWarehouse(orgID string) (*configstore.ManagedWarehouse, error) { + warehouse, ok := s.warehouses[orgID] if !ok { return nil, gorm.ErrRecordNotFound } return copyWarehouse(warehouse), nil } -func (s *fakeAPIStore) UpsertManagedWarehouse(teamName string, warehouse *configstore.ManagedWarehouse) (*configstore.ManagedWarehouse, bool, error) { - team, ok := s.teams[teamName] +func (s *fakeAPIStore) UpsertManagedWarehouse(orgID string, warehouse *configstore.ManagedWarehouse) (*configstore.ManagedWarehouse, bool, error) { + org, ok := s.orgs[orgID] if !ok { return nil, false, nil } clone := copyWarehouse(warehouse) - clone.TeamName = teamName - s.warehouses[teamName] = clone - team.Warehouse = copyWarehouse(clone) + clone.OrgID = orgID + s.warehouses[orgID] = clone + org.Warehouse = copyWarehouse(clone) return copyWarehouse(clone), true, nil } @@ -187,17 +187,17 @@ func copyWarehouse(warehouse *configstore.ManagedWarehouse) *configstore.Managed return &clone } -func copyTeam(team *configstore.Team) *configstore.Team { - if team == nil { +func copyOrg(org *configstore.Org) *configstore.Org { + if org == nil { return nil } - clone := *team - if team.Warehouse != nil { - clone.Warehouse = copyWarehouse(team.Warehouse) + clone := *org + if org.Warehouse != nil { + clone.Warehouse = copyWarehouse(org.Warehouse) } - if len(team.Users) > 0 { - clone.Users = make([]configstore.TeamUser, len(team.Users)) - copy(clone.Users, team.Users) + if len(org.Users) > 0 { + clone.Users = make([]configstore.OrgUser, len(org.Users)) + copy(clone.Users, org.Users) } return &clone } @@ -209,9 +209,9 @@ func newTestAPIRouter(store apiStore) *gin.Engine { return r } -func seedTeamWithWarehouse(store *fakeAPIStore, name string) { +func seedOrgWithWarehouse(store *fakeAPIStore, name string) { warehouse := &configstore.ManagedWarehouse{ - TeamName: name, + OrgID: name, WarehouseDatabase: configstore.ManagedWarehouseDatabase{ Region: "us-east-1", Endpoint: fmt.Sprintf("%s.cluster.example", name), @@ -266,19 +266,19 @@ func seedTeamWithWarehouse(store *fakeAPIStore, name string) { IdentityState: configstore.ManagedWarehouseStateReady, SecretsState: configstore.ManagedWarehouseStateReady, } - store.teams[name] = &configstore.Team{ + store.orgs[name] = &configstore.Org{ Name: name, Warehouse: copyWarehouse(warehouse), } store.warehouses[name] = warehouse } -func TestGetTeamIncludesWarehouse(t *testing.T) { +func TestGetOrgIncludesWarehouse(t *testing.T) { store := newFakeAPIStore() - seedTeamWithWarehouse(store, "analytics") + seedOrgWithWarehouse(store, "analytics") router := newTestAPIRouter(store) - req := httptest.NewRequest(http.MethodGet, "/api/v1/teams/analytics", nil) + req := httptest.NewRequest(http.MethodGet, "/api/v1/orgs/analytics", nil) rec := 
httptest.NewRecorder() router.ServeHTTP(rec, req) @@ -286,27 +286,27 @@ func TestGetTeamIncludesWarehouse(t *testing.T) { t.Fatalf("status = %d, want %d: %s", rec.Code, http.StatusOK, rec.Body.String()) } - var team configstore.Team - if err := json.Unmarshal(rec.Body.Bytes(), &team); err != nil { - t.Fatalf("unmarshal team: %v", err) + var org configstore.Org + if err := json.Unmarshal(rec.Body.Bytes(), &org); err != nil { + t.Fatalf("unmarshal org: %v", err) } - if team.Warehouse == nil { - t.Fatal("expected warehouse in team response") + if org.Warehouse == nil { + t.Fatal("expected warehouse in org response") } - if team.Warehouse.WarehouseDatabase.DatabaseName != "analytics_warehouse" { - t.Fatalf("expected analytics_warehouse, got %q", team.Warehouse.WarehouseDatabase.DatabaseName) + if org.Warehouse.WarehouseDatabase.DatabaseName != "analytics_warehouse" { + t.Fatalf("expected analytics_warehouse, got %q", org.Warehouse.WarehouseDatabase.DatabaseName) } - if team.Warehouse.MetadataStore.Kind != "dedicated_rds" { - t.Fatalf("expected metadata store kind dedicated_rds, got %q", team.Warehouse.MetadataStore.Kind) + if org.Warehouse.MetadataStore.Kind != "dedicated_rds" { + t.Fatalf("expected metadata store kind dedicated_rds, got %q", org.Warehouse.MetadataStore.Kind) } } -func TestListTeamsIncludesWarehouse(t *testing.T) { +func TestListOrgsIncludesWarehouse(t *testing.T) { store := newFakeAPIStore() - seedTeamWithWarehouse(store, "analytics") + seedOrgWithWarehouse(store, "analytics") router := newTestAPIRouter(store) - req := httptest.NewRequest(http.MethodGet, "/api/v1/teams", nil) + req := httptest.NewRequest(http.MethodGet, "/api/v1/orgs", nil) rec := httptest.NewRecorder() router.ServeHTTP(rec, req) @@ -314,24 +314,24 @@ func TestListTeamsIncludesWarehouse(t *testing.T) { t.Fatalf("status = %d, want %d: %s", rec.Code, http.StatusOK, rec.Body.String()) } - var teams []configstore.Team - if err := json.Unmarshal(rec.Body.Bytes(), &teams); err != nil { - t.Fatalf("unmarshal teams: %v", err) + var orgs []configstore.Org + if err := json.Unmarshal(rec.Body.Bytes(), &orgs); err != nil { + t.Fatalf("unmarshal orgs: %v", err) } - if len(teams) != 1 { - t.Fatalf("expected 1 team, got %d", len(teams)) + if len(orgs) != 1 { + t.Fatalf("expected 1 org, got %d", len(orgs)) } - if teams[0].Warehouse == nil { - t.Fatal("expected nested warehouse in team list response") + if orgs[0].Warehouse == nil { + t.Fatal("expected nested warehouse in org list response") } } func TestGetWarehouseReturnsNotFoundWhenMissing(t *testing.T) { store := newFakeAPIStore() - store.teams["analytics"] = &configstore.Team{Name: "analytics"} + store.orgs["analytics"] = &configstore.Org{Name: "analytics"} router := newTestAPIRouter(store) - req := httptest.NewRequest(http.MethodGet, "/api/v1/teams/analytics/warehouse", nil) + req := httptest.NewRequest(http.MethodGet, "/api/v1/orgs/analytics/warehouse", nil) rec := httptest.NewRecorder() router.ServeHTTP(rec, req) @@ -340,9 +340,9 @@ func TestGetWarehouseReturnsNotFoundWhenMissing(t *testing.T) { } } -func TestPutWarehouseUpsertsForExistingTeam(t *testing.T) { +func TestPutWarehouseUpsertsForExistingOrg(t *testing.T) { store := newFakeAPIStore() - store.teams["analytics"] = &configstore.Team{Name: "analytics"} + store.orgs["analytics"] = &configstore.Org{Name: "analytics"} router := newTestAPIRouter(store) body := []byte(`{ @@ -405,7 +405,7 @@ func TestPutWarehouseUpsertsForExistingTeam(t *testing.T) { "secrets_state": "ready" }`) - req := 
httptest.NewRequest(http.MethodPut, "/api/v1/teams/analytics/warehouse", bytes.NewReader(body)) + req := httptest.NewRequest(http.MethodPut, "/api/v1/orgs/analytics/warehouse", bytes.NewReader(body)) req.Header.Set("Content-Type", "application/json") rec := httptest.NewRecorder() router.ServeHTTP(rec, req) @@ -418,8 +418,8 @@ func TestPutWarehouseUpsertsForExistingTeam(t *testing.T) { if warehouse == nil { t.Fatal("expected stored warehouse") } - if warehouse.TeamName != "analytics" { - t.Fatalf("expected team_name analytics, got %q", warehouse.TeamName) + if warehouse.OrgID != "analytics" { + t.Fatalf("expected org_id analytics, got %q", warehouse.OrgID) } if warehouse.RuntimeConfig.Name != "analytics-runtime" { t.Fatalf("expected runtime secret analytics-runtime, got %q", warehouse.RuntimeConfig.Name) @@ -432,11 +432,11 @@ func TestPutWarehouseUpsertsForExistingTeam(t *testing.T) { } } -func TestPutWarehouseRejectsUnknownTeam(t *testing.T) { +func TestPutWarehouseRejectsUnknownOrg(t *testing.T) { store := newFakeAPIStore() router := newTestAPIRouter(store) - req := httptest.NewRequest(http.MethodPut, "/api/v1/teams/unknown/warehouse", bytes.NewReader([]byte(`{"state":"ready"}`))) + req := httptest.NewRequest(http.MethodPut, "/api/v1/orgs/unknown/warehouse", bytes.NewReader([]byte(`{"state":"ready"}`))) req.Header.Set("Content-Type", "application/json") rec := httptest.NewRecorder() router.ServeHTTP(rec, req) @@ -448,18 +448,18 @@ func TestPutWarehouseRejectsUnknownTeam(t *testing.T) { func TestPutWarehouseRejectsServerManagedFields(t *testing.T) { store := newFakeAPIStore() - store.teams["analytics"] = &configstore.Team{Name: "analytics"} + store.orgs["analytics"] = &configstore.Org{Name: "analytics"} router := newTestAPIRouter(store) body := []byte(`{ - "team_name": "wrong-team", + "org_id": "wrong-org", "created_at": "2026-03-18T10:00:00Z", "warehouse_database": { "database_name": "analytics_warehouse" } }`) - req := httptest.NewRequest(http.MethodPut, "/api/v1/teams/analytics/warehouse", bytes.NewReader(body)) + req := httptest.NewRequest(http.MethodPut, "/api/v1/orgs/analytics/warehouse", bytes.NewReader(body)) req.Header.Set("Content-Type", "application/json") rec := httptest.NewRecorder() router.ServeHTTP(rec, req) @@ -471,7 +471,7 @@ func TestPutWarehouseRejectsServerManagedFields(t *testing.T) { func TestPutWarehouseAllowsCustomProvisioningStates(t *testing.T) { store := newFakeAPIStore() - store.teams["analytics"] = &configstore.Team{Name: "analytics"} + store.orgs["analytics"] = &configstore.Org{Name: "analytics"} router := newTestAPIRouter(store) body := []byte(`{ @@ -483,7 +483,7 @@ func TestPutWarehouseAllowsCustomProvisioningStates(t *testing.T) { "secrets_state": "waiting-external-secret" }`) - req := httptest.NewRequest(http.MethodPut, "/api/v1/teams/analytics/warehouse", bytes.NewReader(body)) + req := httptest.NewRequest(http.MethodPut, "/api/v1/orgs/analytics/warehouse", bytes.NewReader(body)) req.Header.Set("Content-Type", "application/json") rec := httptest.NewRecorder() router.ServeHTTP(rec, req) @@ -507,7 +507,7 @@ func TestPutWarehouseAllowsCustomProvisioningStates(t *testing.T) { } } -func TestCreateTeamRejectsNestedWarehousePayload(t *testing.T) { +func TestCreateOrgRejectsNestedWarehousePayload(t *testing.T) { store := newFakeAPIStore() router := newTestAPIRouter(store) @@ -519,7 +519,7 @@ func TestCreateTeamRejectsNestedWarehousePayload(t *testing.T) { } }`) - req := httptest.NewRequest(http.MethodPost, "/api/v1/teams", bytes.NewReader(body)) + req := 
httptest.NewRequest(http.MethodPost, "/api/v1/orgs", bytes.NewReader(body)) req.Header.Set("Content-Type", "application/json") rec := httptest.NewRecorder() router.ServeHTTP(rec, req) @@ -527,14 +527,14 @@ func TestCreateTeamRejectsNestedWarehousePayload(t *testing.T) { if rec.Code != http.StatusBadRequest { t.Fatalf("status = %d, want %d: %s", rec.Code, http.StatusBadRequest, rec.Body.String()) } - if _, ok := store.teams["analytics"]; ok { - t.Fatal("expected team create to be rejected when warehouse payload is present") + if _, ok := store.orgs["analytics"]; ok { + t.Fatal("expected org create to be rejected when warehouse payload is present") } } -func TestUpdateTeamRejectsNestedWarehousePayload(t *testing.T) { +func TestUpdateOrgRejectsNestedWarehousePayload(t *testing.T) { store := newFakeAPIStore() - store.teams["analytics"] = &configstore.Team{Name: "analytics", MaxWorkers: 2} + store.orgs["analytics"] = &configstore.Org{Name: "analytics", MaxWorkers: 2} router := newTestAPIRouter(store) body := []byte(`{ @@ -544,7 +544,7 @@ func TestUpdateTeamRejectsNestedWarehousePayload(t *testing.T) { } }`) - req := httptest.NewRequest(http.MethodPut, "/api/v1/teams/analytics", bytes.NewReader(body)) + req := httptest.NewRequest(http.MethodPut, "/api/v1/orgs/analytics", bytes.NewReader(body)) req.Header.Set("Content-Type", "application/json") rec := httptest.NewRecorder() router.ServeHTTP(rec, req) @@ -552,20 +552,20 @@ func TestUpdateTeamRejectsNestedWarehousePayload(t *testing.T) { if rec.Code != http.StatusBadRequest { t.Fatalf("status = %d, want %d: %s", rec.Code, http.StatusBadRequest, rec.Body.String()) } - if store.teams["analytics"].MaxWorkers != 2 { - t.Fatalf("expected team update to be rejected, max_workers = %d", store.teams["analytics"].MaxWorkers) + if store.orgs["analytics"].MaxWorkers != 2 { + t.Fatalf("expected org update to be rejected, max_workers = %d", store.orgs["analytics"].MaxWorkers) } } -func TestGetTeamOmitsMinWorkers(t *testing.T) { +func TestGetOrgOmitsMinWorkers(t *testing.T) { store := newFakeAPIStore() - store.teams["analytics"] = &configstore.Team{ + store.orgs["analytics"] = &configstore.Org{ Name: "analytics", MaxWorkers: 2, } router := newTestAPIRouter(store) - req := httptest.NewRequest(http.MethodGet, "/api/v1/teams/analytics", nil) + req := httptest.NewRequest(http.MethodGet, "/api/v1/orgs/analytics", nil) rec := httptest.NewRecorder() router.ServeHTTP(rec, req) @@ -573,7 +573,7 @@ func TestGetTeamOmitsMinWorkers(t *testing.T) { t.Fatalf("status = %d, want %d: %s", rec.Code, http.StatusOK, rec.Body.String()) } if bytes.Contains(rec.Body.Bytes(), []byte(`"min_workers"`)) { - t.Fatalf("expected team response to omit min_workers, got %s", rec.Body.String()) + t.Fatalf("expected org response to omit min_workers, got %s", rec.Body.String()) } } @@ -583,8 +583,8 @@ func TestManagedWarehouseUpsertColumnsExcludeCreatedAt(t *testing.T) { if slices.Contains(columns, "created_at") { t.Fatal("expected created_at to be excluded from managed warehouse upserts") } - if slices.Contains(columns, "team_name") { - t.Fatal("expected team_name to be excluded from managed warehouse upserts") + if slices.Contains(columns, "org_id") { + t.Fatal("expected org_id to be excluded from managed warehouse upserts") } if !slices.Contains(columns, "updated_at") { t.Fatal("expected updated_at to be included in managed warehouse upserts") diff --git a/controlplane/admin/dashboard.go b/controlplane/admin/dashboard.go index 16ebecc..6984663 100644 --- a/controlplane/admin/dashboard.go +++ 
b/controlplane/admin/dashboard.go @@ -37,7 +37,7 @@ func APIAuthMiddleware(adminToken string) gin.HandlerFunc { // RegisterDashboard serves the admin dashboard on the Gin engine. func RegisterDashboard(r *gin.Engine, adminToken string) { r.GET("/", dashboardPageHandler("index.html", adminToken)) - r.GET("/teams", dashboardPageHandler("teams.html", adminToken)) + r.GET("/orgs", dashboardPageHandler("orgs.html", adminToken)) r.GET("/workers", dashboardPageHandler("workers.html", adminToken)) r.GET("/sessions", dashboardPageHandler("sessions.html", adminToken)) r.GET("/settings", dashboardPageHandler("settings.html", adminToken)) diff --git a/controlplane/admin/static/index.html b/controlplane/admin/static/index.html index 62668d9..1745ab5 100644 --- a/controlplane/admin/static/index.html +++ b/controlplane/admin/static/index.html @@ -16,7 +16,7 @@
Overview - Teams + Orgs Ducklings Sessions Settings @@ -34,10 +34,10 @@

Cluster Overview

-

Teams

-
Orgs +
-
Loading teams...
+
Loading orgs...
@@ -54,8 +54,8 @@

Teams

const data = JSON.parse(evt.detail.serverResponse); evt.detail.serverResponse = `
-
${data.total_teams}
-
Teams
+
${data.total_orgs}
+
Orgs
${data.total_workers}
@@ -67,16 +67,16 @@

Teams

`; } catch(e) {} } - if (evt.detail.target.id === 'team-list') { + if (evt.detail.target.id === 'org-list') { try { - const teams = JSON.parse(evt.detail.serverResponse); - if (!teams || teams.length === 0) { - evt.detail.serverResponse = '
No teams configured
'; + const orgs = JSON.parse(evt.detail.serverResponse); + if (!orgs || orgs.length === 0) { + evt.detail.serverResponse = '
No orgs configured
'; return; } let html = '' + ''; - teams.forEach(t => { + orgs.forEach(t => { html += ` diff --git a/controlplane/admin/static/teams.html b/controlplane/admin/static/orgs.html similarity index 80% rename from controlplane/admin/static/teams.html rename to controlplane/admin/static/orgs.html index f36fa3e..d114b0b 100644 --- a/controlplane/admin/static/teams.html +++ b/controlplane/admin/static/orgs.html @@ -3,7 +3,7 @@ - Duckgres - Teams + Duckgres - Orgs @@ -16,7 +16,7 @@
Overview - Teams + Orgs Ducklings Sessions Settings @@ -26,19 +26,19 @@
-

Teams

+

Orgs

NameUsersMax WorkersMemory Budget
${esc(t.name)} ${(t.users || []).length}
' + '' + ''; - teams.forEach(t => { + orgs.forEach(t => { const safeName = esc(t.name); html += ` @@ -82,8 +82,8 @@

Create Team

`; }); diff --git a/controlplane/admin/static/sessions.html b/controlplane/admin/static/sessions.html index 0b59e8d..d7d308f 100644 --- a/controlplane/admin/static/sessions.html +++ b/controlplane/admin/static/sessions.html @@ -16,7 +16,7 @@
Overview - Teams + Orgs Ducklings Sessions Settings @@ -48,11 +48,11 @@

Active Sessions

return; } let html = '
NameUsersMax WorkersMemory BudgetActions
${safeName} ${t.max_workers || 'unlimited'} ${esc(t.memory_budget || 'default')} -
' + - ''; + ''; sessions.forEach(s => { html += ` - + `; }); html += '
PIDTeamWorker
PIDOrgWorker
${s.pid}${esc(s.team)}${esc(s.org)} ${s.worker_id}
'; diff --git a/controlplane/admin/static/settings.html b/controlplane/admin/static/settings.html index 1100c03..540cf62 100644 --- a/controlplane/admin/static/settings.html +++ b/controlplane/admin/static/settings.html @@ -16,7 +16,7 @@
Overview - Teams + Orgs Ducklings Sessions Settings diff --git a/controlplane/admin/static/workers.html b/controlplane/admin/static/workers.html index a9c0a67..7104aa2 100644 --- a/controlplane/admin/static/workers.html +++ b/controlplane/admin/static/workers.html @@ -16,7 +16,7 @@
Overview - Teams + Orgs Ducklings Sessions Settings @@ -48,12 +48,12 @@

Ducklings (Workers)

return; } let html = '' + - ''; + ''; workers.forEach(w => { const statusColor = w.status === 'active' ? 'text-green-400' : 'text-yellow-400'; html += ` - + `; }); diff --git a/controlplane/configstore/models.go b/controlplane/configstore/models.go index 5a07575..8215ee7 100644 --- a/controlplane/configstore/models.go +++ b/controlplane/configstore/models.go @@ -2,30 +2,30 @@ package configstore import "time" -// Team represents a tenant with per-team resource limits. -type Team struct { +// Org represents a tenant with per-org resource limits. +type Org struct { Name string `gorm:"primaryKey;size:255" json:"name"` MaxWorkers int `gorm:"default:0" json:"max_workers"` MemoryBudget string `gorm:"size:32" json:"memory_budget"` IdleTimeoutS int `gorm:"default:0" json:"idle_timeout_s"` - Users []TeamUser `gorm:"foreignKey:TeamName;references:Name" json:"users,omitempty"` - Warehouse *ManagedWarehouse `gorm:"foreignKey:TeamName;references:Name;constraint:OnDelete:CASCADE" json:"warehouse,omitempty"` + Users []OrgUser `gorm:"foreignKey:OrgID;references:Name" json:"users,omitempty"` + Warehouse *ManagedWarehouse `gorm:"foreignKey:OrgID;references:Name;constraint:OnDelete:CASCADE" json:"warehouse,omitempty"` CreatedAt time.Time `json:"created_at"` UpdatedAt time.Time `json:"updated_at"` } -func (Team) TableName() string { return "duckgres_teams" } +func (Org) TableName() string { return "duckgres_orgs" } -// TeamUser maps a username to a team with credentials. -type TeamUser struct { +// OrgUser maps a username to an org with credentials. +type OrgUser struct { Username string `gorm:"primaryKey;size:255" json:"username"` Password string `gorm:"size:255;not null" json:"-"` - TeamName string `gorm:"size:255;not null;index" json:"team_name"` + OrgID string `gorm:"size:255;not null;index" json:"org_id"` CreatedAt time.Time `json:"created_at"` UpdatedAt time.Time `json:"updated_at"` } -func (TeamUser) TableName() string { return "duckgres_team_users" } +func (OrgUser) TableName() string { return "duckgres_org_users" } // ManagedWarehouseProvisioningState is an open string used for warehouse lifecycle status. // The constants below are the canonical values used by current tooling, but callers may @@ -48,7 +48,7 @@ type SecretRef struct { Key string `gorm:"size:255" json:"key"` } -// ManagedWarehouseDatabase stores primary warehouse DB metadata for a team. +// ManagedWarehouseDatabase stores primary warehouse DB metadata for an org. type ManagedWarehouseDatabase struct { Region string `gorm:"size:64" json:"region"` Endpoint string `gorm:"size:512" json:"endpoint"` @@ -57,7 +57,7 @@ type ManagedWarehouseDatabase struct { Username string `gorm:"size:255" json:"username"` } -// ManagedWarehouseMetadataStore stores team-scoped DuckLake metadata DB info. +// ManagedWarehouseMetadataStore stores org-scoped DuckLake metadata DB info. type ManagedWarehouseMetadataStore struct { Kind string `gorm:"size:64" json:"kind"` Engine string `gorm:"size:64" json:"engine"` @@ -68,7 +68,7 @@ type ManagedWarehouseMetadataStore struct { Username string `gorm:"size:255" json:"username"` } -// ManagedWarehouseS3 stores object-store metadata for a team's warehouse. +// ManagedWarehouseS3 stores object-store metadata for an org's warehouse. 
type ManagedWarehouseS3 struct { Provider string `gorm:"size:64" json:"provider"` Region string `gorm:"size:64" json:"region"` @@ -79,16 +79,16 @@ type ManagedWarehouseS3 struct { URLStyle string `gorm:"size:16" json:"url_style"` } -// ManagedWarehouseWorkerIdentity stores team-scoped worker identity metadata. +// ManagedWarehouseWorkerIdentity stores org-scoped worker identity metadata. type ManagedWarehouseWorkerIdentity struct { Namespace string `gorm:"size:255" json:"namespace"` ServiceAccountName string `gorm:"size:255" json:"service_account_name"` IAMRoleARN string `gorm:"size:512" json:"iam_role_arn"` } -// ManagedWarehouse is the config-store source of truth for a team's managed warehouse metadata. +// ManagedWarehouse is the config-store source of truth for an org's managed warehouse metadata. type ManagedWarehouse struct { - TeamName string `gorm:"primaryKey;size:255" json:"team_name"` + OrgID string `gorm:"primaryKey;size:255" json:"org_id"` WarehouseDatabase ManagedWarehouseDatabase `gorm:"embedded;embeddedPrefix:warehouse_database_" json:"warehouse_database"` MetadataStore ManagedWarehouseMetadataStore `gorm:"embedded;embeddedPrefix:metadata_store_" json:"metadata_store"` @@ -136,7 +136,7 @@ type GlobalConfig struct { func (GlobalConfig) TableName() string { return "duckgres_global_config" } // DuckLakeConfig is a singleton row (ID=1) for legacy cluster-wide DuckLake settings. -// In multi-tenant mode, the managed-warehouse contract is the intended per-team source of truth. +// In multi-tenant mode, the managed-warehouse contract is the intended per-org source of truth. type DuckLakeConfig struct { ID uint `gorm:"primaryKey" json:"-"` MetadataStore string `gorm:"size:1024" json:"metadata_store"` @@ -181,8 +181,8 @@ type QueryLogConfig struct { func (QueryLogConfig) TableName() string { return "duckgres_query_log_config" } -// TeamConfig is a convenience view combining team metadata with resource limits. -type TeamConfig struct { +// OrgConfig is a convenience view combining org metadata with resource limits. +type OrgConfig struct { Name string MaxWorkers int MemoryBudget string @@ -191,9 +191,9 @@ type TeamConfig struct { Warehouse *ManagedWarehouseConfig } -// ManagedWarehouseConfig is the in-memory snapshot view of a team's warehouse metadata. +// ManagedWarehouseConfig is the in-memory snapshot view of an org's warehouse metadata. type ManagedWarehouseConfig struct { - TeamName string + OrgID string WarehouseDatabase ManagedWarehouseDatabase MetadataStore ManagedWarehouseMetadataStore @@ -227,7 +227,7 @@ func copyManagedWarehouseConfig(warehouse *ManagedWarehouse) *ManagedWarehouseCo } cfg := &ManagedWarehouseConfig{ - TeamName: warehouse.TeamName, + OrgID: warehouse.OrgID, WarehouseDatabase: warehouse.WarehouseDatabase, MetadataStore: warehouse.MetadataStore, S3: warehouse.S3, diff --git a/controlplane/configstore/store.go b/controlplane/configstore/store.go index 40b05f6..a50aaf5 100644 --- a/controlplane/configstore/store.go +++ b/controlplane/configstore/store.go @@ -15,8 +15,8 @@ import ( // Snapshot holds a point-in-time copy of all config data for fast lookups. 
type Snapshot struct { - Teams map[string]*TeamConfig - UserTeam map[string]string // username -> team name + Orgs map[string]*OrgConfig + UserOrg map[string]string // username -> org name UserPassword map[string]string // username -> password Global GlobalConfig DuckLake DuckLakeConfig @@ -49,9 +49,9 @@ func NewConfigStore(connStr string, pollInterval time.Duration) (*ConfigStore, e // Auto-migrate all models if err := db.AutoMigrate( - &Team{}, + &Org{}, &ManagedWarehouse{}, - &TeamUser{}, + &OrgUser{}, &GlobalConfig{}, &DuckLakeConfig{}, &RateLimitConfig{}, @@ -78,7 +78,7 @@ func NewConfigStore(connStr string, pollInterval time.Duration) (*ConfigStore, e } cs.snapshot = snap - slog.Info("Config store connected.", "teams", len(snap.Teams), "users", len(snap.UserTeam)) + slog.Info("Config store connected.", "orgs", len(snap.Orgs), "users", len(snap.UserOrg)) return cs, nil } @@ -117,9 +117,9 @@ func (cs *ConfigStore) Start(ctx context.Context) { // load fetches all config from the database and builds a Snapshot. func (cs *ConfigStore) load() (*Snapshot, error) { - var teams []Team - if err := cs.db.Preload("Users").Preload("Warehouse").Find(&teams).Error; err != nil { - return nil, fmt.Errorf("load teams: %w", err) + var orgs []Org + if err := cs.db.Preload("Users").Preload("Warehouse").Find(&orgs).Error; err != nil { + return nil, fmt.Errorf("load orgs: %w", err) } var global GlobalConfig @@ -135,8 +135,8 @@ func (cs *ConfigStore) load() (*Snapshot, error) { cs.db.First(&queryLog, 1) snap := &Snapshot{ - Teams: make(map[string]*TeamConfig), - UserTeam: make(map[string]string), + Orgs: make(map[string]*OrgConfig), + UserOrg: make(map[string]string), UserPassword: make(map[string]string), Global: global, DuckLake: duckLake, @@ -144,21 +144,21 @@ func (cs *ConfigStore) load() (*Snapshot, error) { QueryLog: queryLog, } - for _, t := range teams { - tc := &TeamConfig{ - Name: t.Name, - MaxWorkers: t.MaxWorkers, - MemoryBudget: t.MemoryBudget, - IdleTimeoutS: t.IdleTimeoutS, + for _, o := range orgs { + oc := &OrgConfig{ + Name: o.Name, + MaxWorkers: o.MaxWorkers, + MemoryBudget: o.MemoryBudget, + IdleTimeoutS: o.IdleTimeoutS, Users: make(map[string]string), - Warehouse: copyManagedWarehouseConfig(t.Warehouse), + Warehouse: copyManagedWarehouseConfig(o.Warehouse), } - for _, u := range t.Users { - tc.Users[u.Username] = u.Password - snap.UserTeam[u.Username] = t.Name + for _, u := range o.Users { + oc.Users[u.Username] = u.Password + snap.UserOrg[u.Username] = o.Name snap.UserPassword[u.Username] = u.Password } - snap.Teams[t.Name] = tc + snap.Orgs[o.Name] = oc } return snap, nil @@ -172,7 +172,7 @@ func (cs *ConfigStore) Snapshot() *Snapshot { } // ValidateUser checks username/password against the cached snapshot. -// Passwords are compared using bcrypt. Returns the team name and whether auth succeeded. +// Passwords are compared using bcrypt. Returns the org name and whether auth succeeded. func (cs *ConfigStore) ValidateUser(username, password string) (string, bool) { cs.mu.RLock() defer cs.mu.RUnlock() @@ -188,7 +188,7 @@ func (cs *ConfigStore) ValidateUser(username, password string) (string, bool) { if err := bcrypt.CompareHashAndPassword([]byte(storedHash), []byte(password)); err != nil { return "", false } - return cs.snapshot.UserTeam[username], true + return cs.snapshot.UserOrg[username], true } // HashPassword hashes a plaintext password using bcrypt. 
@@ -200,14 +200,14 @@ func HashPassword(password string) (string, error) { return string(hash), nil } -// TeamForUser returns the team name for a user, or "" if not found. -func (cs *ConfigStore) TeamForUser(username string) string { +// OrgForUser returns the org name for a user, or "" if not found. +func (cs *ConfigStore) OrgForUser(username string) string { cs.mu.RLock() defer cs.mu.RUnlock() if cs.snapshot == nil { return "" } - return cs.snapshot.UserTeam[username] + return cs.snapshot.UserOrg[username] } // OnChange registers a callback that fires when the config snapshot changes. diff --git a/controlplane/configstore/store_test.go b/controlplane/configstore/store_test.go index f1c3094..118ba25 100644 --- a/controlplane/configstore/store_test.go +++ b/controlplane/configstore/store_test.go @@ -17,19 +17,19 @@ func mustHash(t *testing.T, password string) string { } func TestSnapshotBuild(t *testing.T) { - // Verify TeamConfig construction from models + // Verify OrgConfig construction from models hash1 := mustHash(t, "secret1") hash2 := mustHash(t, "secret2") hash3 := mustHash(t, "secret3") readyAt := time.Date(2026, time.March, 17, 12, 0, 0, 0, time.UTC) - teams := []Team{ + orgs := []Org{ { Name: "analytics", MaxWorkers: 4, MemoryBudget: "8GB", Warehouse: &ManagedWarehouse{ - TeamName: "analytics", + OrgID: "analytics", WarehouseDatabase: ManagedWarehouseDatabase{ Region: "us-east-1", Endpoint: "analytics.cluster-xyz.us-east-1.rds.amazonaws.com", @@ -50,15 +50,15 @@ func TestSnapshotBuild(t *testing.T) { Provider: "aws", Region: "us-east-1", Bucket: "analytics-bucket", - PathPrefix: "ducklake/team-analytics/", + PathPrefix: "ducklake/org-analytics/", Endpoint: "s3.us-east-1.amazonaws.com", UseSSL: true, URLStyle: "vhost", }, WorkerIdentity: ManagedWarehouseWorkerIdentity{ Namespace: "duckgres", - ServiceAccountName: "team-analytics-worker", - IAMRoleARN: "arn:aws:iam::123456789012:role/team-analytics-worker", + ServiceAccountName: "org-analytics-worker", + IAMRoleARN: "arn:aws:iam::123456789012:role/org-analytics-worker", }, WarehouseDatabaseCredentials: SecretRef{ Namespace: "duckgres", @@ -89,86 +89,86 @@ func TestSnapshotBuild(t *testing.T) { SecretsState: ManagedWarehouseStateReady, ReadyAt: &readyAt, }, - Users: []TeamUser{ - {Username: "alice", Password: hash1, TeamName: "analytics"}, - {Username: "bob", Password: hash2, TeamName: "analytics"}, + Users: []OrgUser{ + {Username: "alice", Password: hash1, OrgID: "analytics"}, + {Username: "bob", Password: hash2, OrgID: "analytics"}, }, }, { Name: "ingestion", MaxWorkers: 2, - Users: []TeamUser{ - {Username: "charlie", Password: hash3, TeamName: "ingestion"}, + Users: []OrgUser{ + {Username: "charlie", Password: hash3, OrgID: "ingestion"}, }, }, } snap := &Snapshot{ - Teams: make(map[string]*TeamConfig), - UserTeam: make(map[string]string), + Orgs: make(map[string]*OrgConfig), + UserOrg: make(map[string]string), UserPassword: make(map[string]string), } - for _, t2 := range teams { - tc := &TeamConfig{ - Name: t2.Name, - MaxWorkers: t2.MaxWorkers, - MemoryBudget: t2.MemoryBudget, - IdleTimeoutS: t2.IdleTimeoutS, + for _, o := range orgs { + oc := &OrgConfig{ + Name: o.Name, + MaxWorkers: o.MaxWorkers, + MemoryBudget: o.MemoryBudget, + IdleTimeoutS: o.IdleTimeoutS, Users: make(map[string]string), } - if t2.Warehouse != nil { - tc.Warehouse = copyManagedWarehouseConfig(t2.Warehouse) + if o.Warehouse != nil { + oc.Warehouse = copyManagedWarehouseConfig(o.Warehouse) } - for _, u := range t2.Users { - tc.Users[u.Username] = u.Password - 
snap.UserTeam[u.Username] = t2.Name + for _, u := range o.Users { + oc.Users[u.Username] = u.Password + snap.UserOrg[u.Username] = o.Name snap.UserPassword[u.Username] = u.Password } - snap.Teams[t2.Name] = tc + snap.Orgs[o.Name] = oc } - // Verify team config - if len(snap.Teams) != 2 { - t.Fatalf("expected 2 teams, got %d", len(snap.Teams)) + // Verify org config + if len(snap.Orgs) != 2 { + t.Fatalf("expected 2 orgs, got %d", len(snap.Orgs)) } - if snap.Teams["analytics"].MaxWorkers != 4 { - t.Errorf("expected analytics max_workers=4, got %d", snap.Teams["analytics"].MaxWorkers) + if snap.Orgs["analytics"].MaxWorkers != 4 { + t.Errorf("expected analytics max_workers=4, got %d", snap.Orgs["analytics"].MaxWorkers) } - if snap.Teams["analytics"].MemoryBudget != "8GB" { - t.Errorf("expected analytics memory_budget=8GB, got %s", snap.Teams["analytics"].MemoryBudget) + if snap.Orgs["analytics"].MemoryBudget != "8GB" { + t.Errorf("expected analytics memory_budget=8GB, got %s", snap.Orgs["analytics"].MemoryBudget) } - if len(snap.Teams["analytics"].Users) != 2 { - t.Errorf("expected 2 analytics users, got %d", len(snap.Teams["analytics"].Users)) + if len(snap.Orgs["analytics"].Users) != 2 { + t.Errorf("expected 2 analytics users, got %d", len(snap.Orgs["analytics"].Users)) } - if snap.Teams["analytics"].Warehouse == nil { + if snap.Orgs["analytics"].Warehouse == nil { t.Fatal("expected analytics warehouse to be present") } - if snap.Teams["analytics"].Warehouse.WarehouseDatabase.DatabaseName != "analytics_wh" { - t.Fatalf("expected analytics warehouse db name analytics_wh, got %q", snap.Teams["analytics"].Warehouse.WarehouseDatabase.DatabaseName) + if snap.Orgs["analytics"].Warehouse.WarehouseDatabase.DatabaseName != "analytics_wh" { + t.Fatalf("expected analytics warehouse db name analytics_wh, got %q", snap.Orgs["analytics"].Warehouse.WarehouseDatabase.DatabaseName) } - if snap.Teams["analytics"].Warehouse.MetadataStore.Kind != "dedicated_rds" { - t.Fatalf("expected metadata store kind dedicated_rds, got %q", snap.Teams["analytics"].Warehouse.MetadataStore.Kind) + if snap.Orgs["analytics"].Warehouse.MetadataStore.Kind != "dedicated_rds" { + t.Fatalf("expected metadata store kind dedicated_rds, got %q", snap.Orgs["analytics"].Warehouse.MetadataStore.Kind) } - if snap.Teams["analytics"].Warehouse.MetadataStoreCredentials.Name != "analytics-metadata" { - t.Fatalf("expected metadata secret analytics-metadata, got %q", snap.Teams["analytics"].Warehouse.MetadataStoreCredentials.Name) + if snap.Orgs["analytics"].Warehouse.MetadataStoreCredentials.Name != "analytics-metadata" { + t.Fatalf("expected metadata secret analytics-metadata, got %q", snap.Orgs["analytics"].Warehouse.MetadataStoreCredentials.Name) } - if snap.Teams["analytics"].Warehouse.RuntimeConfig.Name != "analytics-runtime" { - t.Fatalf("expected runtime config secret analytics-runtime, got %q", snap.Teams["analytics"].Warehouse.RuntimeConfig.Name) + if snap.Orgs["analytics"].Warehouse.RuntimeConfig.Name != "analytics-runtime" { + t.Fatalf("expected runtime config secret analytics-runtime, got %q", snap.Orgs["analytics"].Warehouse.RuntimeConfig.Name) } - if snap.Teams["analytics"].Warehouse.ReadyAt == nil || !snap.Teams["analytics"].Warehouse.ReadyAt.Equal(readyAt) { - t.Fatalf("expected ready_at %v, got %v", readyAt, snap.Teams["analytics"].Warehouse.ReadyAt) + if snap.Orgs["analytics"].Warehouse.ReadyAt == nil || !snap.Orgs["analytics"].Warehouse.ReadyAt.Equal(readyAt) { + t.Fatalf("expected ready_at %v, got %v", readyAt, 
snap.Orgs["analytics"].Warehouse.ReadyAt) } - if snap.Teams["ingestion"].Warehouse != nil { + if snap.Orgs["ingestion"].Warehouse != nil { t.Fatal("expected ingestion warehouse to be nil") } - // Verify user → team mapping - if snap.UserTeam["alice"] != "analytics" { - t.Errorf("expected alice in analytics, got %s", snap.UserTeam["alice"]) + // Verify user -> org mapping + if snap.UserOrg["alice"] != "analytics" { + t.Errorf("expected alice in analytics, got %s", snap.UserOrg["alice"]) } - if snap.UserTeam["charlie"] != "ingestion" { - t.Errorf("expected charlie in ingestion, got %s", snap.UserTeam["charlie"]) + if snap.UserOrg["charlie"] != "ingestion" { + t.Errorf("expected charlie in ingestion, got %s", snap.UserOrg["charlie"]) } // Verify bcrypt password hashes are stored (not plaintext) @@ -196,8 +196,8 @@ func TestTableNames(t *testing.T) { model interface{ TableName() string } want string }{ - {Team{}, "duckgres_teams"}, - {TeamUser{}, "duckgres_team_users"}, + {Org{}, "duckgres_orgs"}, + {OrgUser{}, "duckgres_org_users"}, {ManagedWarehouse{}, "duckgres_managed_warehouses"}, {GlobalConfig{}, "duckgres_global_config"}, {DuckLakeConfig{}, "duckgres_ducklake_config"}, diff --git a/controlplane/control.go b/controlplane/control.go index e1761a7..be41cd7 100644 --- a/controlplane/control.go +++ b/controlplane/control.go @@ -107,19 +107,19 @@ type ControlPlane struct { acmeDNSManager *server.ACMEDNSManager // ACME manager for DNS-01 (nil when not using DNS challenges) // Multi-tenant fields (non-nil in remote multitenant mode) - teamRouter TeamRouterInterface + orgRouter OrgRouterInterface configStore ConfigStoreInterface } // ConfigStoreInterface abstracts the config store for the control plane. // Defined here to avoid circular imports with the configstore package. type ConfigStoreInterface interface { - ValidateUser(username, password string) (teamName string, ok bool) - TeamForUser(username string) string + ValidateUser(username, password string) (orgID string, ok bool) + OrgForUser(username string) string } -// TeamRouterInterface abstracts the team router for the control plane. -type TeamRouterInterface interface { +// OrgRouterInterface abstracts the org router for the control plane. 
+type OrgRouterInterface interface { StackForUser(username string) (pool WorkerPool, sessions *SessionManager, rebalancer *MemoryRebalancer, ok bool) ShutdownAll() } @@ -314,7 +314,7 @@ func RunControlPlane(cfg ControlPlaneConfig) { acmeDNSManager: acmeDNSMgr, } - // Multi-tenant mode: config store + per-team pools (K8s remote backend only) + // Multi-tenant mode: config store + per-org pools (K8s remote backend only) if cfg.WorkerBackend == "remote" { store, adapter, adminSrv, err := SetupMultiTenant(cfg, srv, memBudget, k8sMaxWorkers) if err != nil { @@ -322,7 +322,7 @@ func RunControlPlane(cfg ControlPlaneConfig) { os.Exit(1) } cp.configStore = store - cp.teamRouter = adapter + cp.orgRouter = adapter // Replace the simple metrics server with the Gin admin server if cfg.MetricsServer != nil { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) @@ -658,7 +658,7 @@ func (cp *ControlPlane) handleConnection(conn net.Conn) { // Authenticate: use config store (multi-tenant) or YAML users (single-tenant) if cp.configStore != nil { - teamName, ok := cp.configStore.ValidateUser(username, password) + orgID, ok := cp.configStore.ValidateUser(username, password) if !ok { slog.Warn("Authentication failed.", "user", username, "remote_addr", remoteAddr) banned := server.RecordFailedAuthAttempt(cp.rateLimiter, remoteAddr) @@ -669,7 +669,7 @@ func (cp *ControlPlane) handleConnection(conn net.Conn) { _ = writer.Flush() return } - _ = teamName // used for routing below + _ = orgID // used for routing below } else { if !server.ValidateUserPassword(cp.cfg.Users, username, password) { slog.Warn("Authentication failed.", "user", username, "remote_addr", remoteAddr) @@ -693,13 +693,13 @@ func (cp *ControlPlane) handleConnection(conn net.Conn) { slog.Info("User authenticated.", "user", username, "remote_addr", remoteAddr) // Resolve the session manager and rebalancer for this connection. - // In multi-tenant mode, each team has its own stack. + // In multi-tenant mode, each org has its own stack. 
var sessions *SessionManager var rebalancer *MemoryRebalancer - if cp.teamRouter != nil { - _, sess, rebal, ok := cp.teamRouter.StackForUser(username) + if cp.orgRouter != nil { + _, sess, rebal, ok := cp.orgRouter.StackForUser(username) if !ok { - _ = server.WriteErrorResponse(writer, "FATAL", "28000", "no team configured for user") + _ = server.WriteErrorResponse(writer, "FATAL", "28000", "no org configured for user") _ = writer.Flush() return } @@ -899,8 +899,8 @@ func (cp *ControlPlane) shutdown() { cp.wg.Wait() slog.Info("Shutting down workers...") - if cp.teamRouter != nil { - cp.teamRouter.ShutdownAll() + if cp.orgRouter != nil { + cp.orgRouter.ShutdownAll() } else if cp.pool != nil { cp.pool.ShutdownAll() } @@ -1077,8 +1077,8 @@ func (cp *ControlPlane) drainAfterUpgrade() { } // Shut down workers - if cp.teamRouter != nil { - cp.teamRouter.ShutdownAll() + if cp.orgRouter != nil { + cp.orgRouter.ShutdownAll() } else if cp.pool != nil { cp.pool.ShutdownAll() } diff --git a/controlplane/flight_ingress_metrics_k8s.go b/controlplane/flight_ingress_metrics_k8s.go index 9179d3e..b37a1b2 100644 --- a/controlplane/flight_ingress_metrics_k8s.go +++ b/controlplane/flight_ingress_metrics_k8s.go @@ -7,49 +7,49 @@ import ( "github.com/prometheus/client_golang/prometheus/promauto" ) -// --- Per-team metrics (multi-tenant mode) --- - -var teamWorkersActiveGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{ - Name: "duckgres_team_workers_active", - Help: "Number of active workers per team", -}, []string{"team"}) - -var teamWorkersIdleGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{ - Name: "duckgres_team_workers_idle", - Help: "Number of idle workers per team", -}, []string{"team"}) - -var teamSessionsActiveGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{ - Name: "duckgres_team_sessions_active", - Help: "Number of active sessions per team", -}, []string{"team"}) - -var teamWorkerSpawnsCounter = promauto.NewCounterVec(prometheus.CounterOpts{ - Name: "duckgres_team_worker_spawns_total", - Help: "Total worker spawns per team", -}, []string{"team"}) - -var teamWorkerCrashesCounter = promauto.NewCounterVec(prometheus.CounterOpts{ - Name: "duckgres_team_worker_crashes_total", - Help: "Total worker crashes per team", -}, []string{"team"}) - -func observeTeamWorkersActive(team string, count int) { - teamWorkersActiveGauge.WithLabelValues(team).Set(float64(count)) +// --- Per-org metrics (multi-tenant mode) --- + +var orgWorkersActiveGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "duckgres_org_workers_active", + Help: "Number of active workers per org", +}, []string{"org"}) + +var orgWorkersIdleGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "duckgres_org_workers_idle", + Help: "Number of idle workers per org", +}, []string{"org"}) + +var orgSessionsActiveGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "duckgres_org_sessions_active", + Help: "Number of active sessions per org", +}, []string{"org"}) + +var orgWorkerSpawnsCounter = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "duckgres_org_worker_spawns_total", + Help: "Total worker spawns per org", +}, []string{"org"}) + +var orgWorkerCrashesCounter = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "duckgres_org_worker_crashes_total", + Help: "Total worker crashes per org", +}, []string{"org"}) + +func observeOrgWorkersActive(org string, count int) { + orgWorkersActiveGauge.WithLabelValues(org).Set(float64(count)) } -func observeTeamWorkersIdle(team string, count int) { - 
teamWorkersIdleGauge.WithLabelValues(team).Set(float64(count)) +func observeOrgWorkersIdle(org string, count int) { + orgWorkersIdleGauge.WithLabelValues(org).Set(float64(count)) } -func observeTeamSessionsActive(team string, count int) { - teamSessionsActiveGauge.WithLabelValues(team).Set(float64(count)) +func observeOrgSessionsActive(org string, count int) { + orgSessionsActiveGauge.WithLabelValues(org).Set(float64(count)) } -func observeTeamWorkerSpawn(team string) { - teamWorkerSpawnsCounter.WithLabelValues(team).Inc() +func observeOrgWorkerSpawn(org string) { + orgWorkerSpawnsCounter.WithLabelValues(org).Inc() } -func observeTeamWorkerCrash(team string) { - teamWorkerCrashesCounter.WithLabelValues(team).Inc() +func observeOrgWorkerCrash(org string) { + orgWorkerCrashesCounter.WithLabelValues(org).Inc() } diff --git a/controlplane/k8s_pool.go b/controlplane/k8s_pool.go index 2d9f2b3..6941a0f 100644 --- a/controlplane/k8s_pool.go +++ b/controlplane/k8s_pool.go @@ -52,8 +52,8 @@ type K8sWorkerPool struct { imagePullPolicy corev1.PullPolicy serviceAccount string memoryBudget int64 // total memory budget in bytes - teamName string // team name for pod labels (multi-tenant mode) - workerIDGenerator func() int // shared ID generator across teams (nil = internal counter) + orgID string // org ID for pod labels (multi-tenant mode) + workerIDGenerator func() int // shared ID generator across orgs (nil = internal counter) cachedToken string // cached bearer token (immutable after setup) informer cache.SharedIndexInformer stopInform chan struct{} @@ -124,7 +124,7 @@ func newK8sWorkerPool(cfg K8sWorkerPoolConfig, clientset kubernetes.Interface) ( imagePullPolicy: corev1.PullPolicy(cfg.ImagePullPolicy), serviceAccount: cfg.ServiceAccount, memoryBudget: cfg.MemoryBudget, - teamName: cfg.TeamName, + orgID: cfg.OrgID, workerIDGenerator: cfg.WorkerIDGenerator, spawnSem: make(chan struct{}, spawnConcurrency), } @@ -243,8 +243,8 @@ func (p *K8sWorkerPool) readBearerToken(ctx context.Context) (string, error) { // startInformer starts a SharedIndexInformer to watch worker pods. func (p *K8sWorkerPool) startInformer() { labelSelector := fmt.Sprintf("duckgres/control-plane=%s", p.cpID) - if p.teamName != "" { - labelSelector += fmt.Sprintf(",duckgres/team=%s", p.teamName) + if p.orgID != "" { + labelSelector += fmt.Sprintf(",duckgres/org=%s", p.orgID) } factory := informers.NewSharedInformerFactoryWithOptions( p.clientset, @@ -368,8 +368,8 @@ func (p *K8sWorkerPool) SpawnWorker(ctx context.Context, id int) error { "duckgres/control-plane": p.cpID, "duckgres/worker-id": strconv.Itoa(id), } - if p.teamName != "" { - podLabels["duckgres/team"] = p.teamName + if p.orgID != "" { + podLabels["duckgres/org"] = p.orgID } // Build pod spec @@ -1348,10 +1348,10 @@ func (p *K8sWorkerPool) markWorkerRetiredLocked(w *ManagedWorker) { } // podNameForWorker returns the pod name for a given worker ID, -// including the team name if set (multi-tenant mode). +// including the org ID if set (multi-tenant mode). 
func (p *K8sWorkerPool) podNameForWorker(id int) string { - if p.teamName != "" { - return fmt.Sprintf("duckgres-worker-%s-%s-%d", p.cpID, p.teamName, id) + if p.orgID != "" { + return fmt.Sprintf("duckgres-worker-%s-%s-%d", p.cpID, p.orgID, id) } return fmt.Sprintf("duckgres-worker-%s-%d", p.cpID, id) } diff --git a/controlplane/k8s_pool_test.go b/controlplane/k8s_pool_test.go index 7fe871f..e624b0a 100644 --- a/controlplane/k8s_pool_test.go +++ b/controlplane/k8s_pool_test.go @@ -298,7 +298,7 @@ func TestK8sPoolSpawnMinWorkersCountsOnlyNeutralIdleWorkersAsWarmCapacity(t *tes if err := worker.SetSharedState(SharedWorkerState{ Lifecycle: WorkerLifecycleReserved, Assignment: &WorkerAssignment{ - TeamName: "analytics", + OrgID: "analytics", LeaseExpiresAt: time.Now().Add(time.Hour), }, }); err != nil { @@ -332,7 +332,7 @@ func TestK8sPoolFindIdleWorkerSkipsReservedSharedWorker(t *testing.T) { if err := reserved.SetSharedState(SharedWorkerState{ Lifecycle: WorkerLifecycleReserved, Assignment: &WorkerAssignment{ - TeamName: "analytics", + OrgID: "analytics", LeaseExpiresAt: time.Now().Add(time.Hour), }, }); err != nil { @@ -367,7 +367,7 @@ func TestK8sPoolReserveSharedWorkerReservesIdleWorkerAndReplenishesWarmCapacity( leaseExpiry := time.Date(2026, time.March, 20, 16, 0, 0, 0, time.UTC) worker, err := pool.ReserveSharedWorker(context.Background(), &WorkerAssignment{ - TeamName: "analytics", + OrgID: "analytics", LeaseExpiresAt: leaseExpiry, }) if err != nil { @@ -381,7 +381,7 @@ func TestK8sPoolReserveSharedWorkerReservesIdleWorkerAndReplenishesWarmCapacity( if state.Lifecycle != WorkerLifecycleReserved { t.Fatalf("expected reserved lifecycle, got %q", state.Lifecycle) } - if state.Assignment == nil || state.Assignment.TeamName != "analytics" { + if state.Assignment == nil || state.Assignment.OrgID != "analytics" { t.Fatalf("expected analytics assignment, got %#v", state.Assignment) } if !state.Assignment.LeaseExpiresAt.Equal(leaseExpiry) { @@ -435,7 +435,7 @@ func TestK8sPoolReserveSharedWorkerSpawnsWhenPoolIsCold(t *testing.T) { } worker, err := pool.ReserveSharedWorker(context.Background(), &WorkerAssignment{ - TeamName: "billing", + OrgID: "billing", LeaseExpiresAt: time.Now().Add(time.Hour), }) if err != nil { @@ -461,7 +461,7 @@ func TestK8sPoolIdleReaperSkipsReservedSharedWorker(t *testing.T) { if err := reserved.SetSharedState(SharedWorkerState{ Lifecycle: WorkerLifecycleReserved, Assignment: &WorkerAssignment{ - TeamName: "analytics", + OrgID: "analytics", LeaseExpiresAt: time.Now().Add(time.Hour), }, }); err != nil { diff --git a/controlplane/multitenant.go b/controlplane/multitenant.go index b65425f..5836e6c 100644 --- a/controlplane/multitenant.go +++ b/controlplane/multitenant.go @@ -18,13 +18,13 @@ import ( "github.com/prometheus/client_golang/prometheus/promhttp" ) -// teamRouterAdapter wraps TeamRouter to implement both TeamRouterInterface -// (for the control plane) and admin.TeamStackInfo (for the admin API). -type teamRouterAdapter struct { - router *TeamRouter +// orgRouterAdapter wraps OrgRouter to implement both OrgRouterInterface +// (for the control plane) and admin.OrgStackInfo (for the admin API). 
+type orgRouterAdapter struct { + router *OrgRouter } -func (a *teamRouterAdapter) StackForUser(username string) (WorkerPool, *SessionManager, *MemoryRebalancer, bool) { +func (a *orgRouterAdapter) StackForUser(username string) (WorkerPool, *SessionManager, *MemoryRebalancer, bool) { stack, ok := a.router.StackForUser(username) if !ok { return nil, nil, nil, false @@ -32,28 +32,28 @@ func (a *teamRouterAdapter) StackForUser(username string) (WorkerPool, *SessionM return stack.Pool, stack.Sessions, stack.Rebalancer, true } -func (a *teamRouterAdapter) ShutdownAll() { +func (a *orgRouterAdapter) ShutdownAll() { a.router.ShutdownAll() } -func (a *teamRouterAdapter) AllTeamStats() []admin.TeamStatus { +func (a *orgRouterAdapter) AllOrgStats() []admin.OrgStatus { stacks := a.router.AllStacks() - stats := make([]admin.TeamStatus, 0, len(stacks)) + stats := make([]admin.OrgStatus, 0, len(stacks)) for name, stack := range stacks { sessionCount := stack.Sessions.SessionCount() - stats = append(stats, admin.TeamStatus{ + stats = append(stats, admin.OrgStatus{ Name: name, ActiveSessions: sessionCount, MaxWorkers: stack.Config.MaxWorkers, MemoryBudget: stack.Config.MemoryBudget, }) - // Emit per-team Prometheus metrics - observeTeamSessionsActive(name, sessionCount) + // Emit per-org Prometheus metrics + observeOrgSessionsActive(name, sessionCount) } return stats } -func (a *teamRouterAdapter) AllWorkerStatuses() []admin.WorkerStatus { +func (a *orgRouterAdapter) AllWorkerStatuses() []admin.WorkerStatus { stacks := a.router.AllStacks() var result []admin.WorkerStatus for name, stack := range stacks { @@ -74,19 +74,19 @@ func (a *teamRouterAdapter) AllWorkerStatuses() []admin.WorkerStatus { } result = append(result, admin.WorkerStatus{ ID: wID, - Team: name, + Org: name, ActiveSessions: count, Status: status, }) } - // Emit per-team worker Prometheus metrics - observeTeamWorkersActive(name, activeCount) - observeTeamWorkersIdle(name, idleCount) + // Emit per-org worker Prometheus metrics + observeOrgWorkersActive(name, activeCount) + observeOrgWorkersIdle(name, idleCount) } return result } -func (a *teamRouterAdapter) AllSessionStatuses() []admin.SessionStatus { +func (a *orgRouterAdapter) AllSessionStatuses() []admin.SessionStatus { stacks := a.router.AllStacks() var result []admin.SessionStatus for name, stack := range stacks { @@ -94,7 +94,7 @@ func (a *teamRouterAdapter) AllSessionStatuses() []admin.SessionStatus { result = append(result, admin.SessionStatus{ PID: s.PID, WorkerID: s.WorkerID, - Team: name, + Org: name, }) } } @@ -102,17 +102,17 @@ func (a *teamRouterAdapter) AllSessionStatuses() []admin.SessionStatus { } // Compile-time checks. -var _ TeamRouterInterface = (*teamRouterAdapter)(nil) -var _ admin.TeamStackInfo = (*teamRouterAdapter)(nil) +var _ OrgRouterInterface = (*orgRouterAdapter)(nil) +var _ admin.OrgStackInfo = (*orgRouterAdapter)(nil) -// SetupMultiTenant initializes the config store, team router, and Gin admin server. +// SetupMultiTenant initializes the config store, org router, and Gin admin server. // Called from RunControlPlane when --config-store is set with remote backend. 
func SetupMultiTenant( cfg ControlPlaneConfig, srv *server.Server, memBudget uint64, maxWorkers int, -) (ConfigStoreInterface, TeamRouterInterface, *http.Server, error) { +) (ConfigStoreInterface, OrgRouterInterface, *http.Server, error) { pollInterval := cfg.ConfigPollInterval if pollInterval <= 0 { pollInterval = 30 * time.Second @@ -138,12 +138,12 @@ func SetupMultiTenant( MemoryBudget: int64(memBudget), } - router, err := NewTeamRouter(store, baseCfg, cfg, srv) + router, err := NewOrgRouter(store, baseCfg, cfg, srv) if err != nil { return nil, nil, nil, err } - adpt := &teamRouterAdapter{router: router} + adpt := &orgRouterAdapter{router: router} // Register config change handler store.OnChange(router.HandleConfigChange) diff --git a/controlplane/multitenant_stub.go b/controlplane/multitenant_stub.go index f6a0c11..9efb937 100644 --- a/controlplane/multitenant_stub.go +++ b/controlplane/multitenant_stub.go @@ -15,6 +15,6 @@ func SetupMultiTenant( srv *server.Server, memBudget uint64, maxWorkers int, -) (ConfigStoreInterface, TeamRouterInterface, *http.Server, error) { +) (ConfigStoreInterface, OrgRouterInterface, *http.Server, error) { return nil, nil, nil, fmt.Errorf("multi-tenant mode requires -tags kubernetes build") } diff --git a/controlplane/team_reserved_pool.go b/controlplane/org_reserved_pool.go similarity index 59% rename from controlplane/team_reserved_pool.go rename to controlplane/org_reserved_pool.go index eb16749..bda5eeb 100644 --- a/controlplane/team_reserved_pool.go +++ b/controlplane/org_reserved_pool.go @@ -10,26 +10,26 @@ import ( const defaultSharedWorkerReservationLease = 24 * time.Hour -// TeamReservedWorkerPool presents one team's reserved slice of a shared K8s warm pool. +// OrgReservedPool presents one org's reserved slice of a shared K8s warm pool. // It preserves the existing WorkerPool contract for SessionManager while ensuring -// workers are reserved to a single team for their lifetime and retired after use. -type TeamReservedWorkerPool struct { +// workers are reserved to a single org for their lifetime and retired after use. 
+type OrgReservedPool struct { shared *K8sWorkerPool - teamName string + orgID string maxWorkers int leaseDuration time.Duration } -func NewTeamReservedWorkerPool(shared *K8sWorkerPool, teamName string, maxWorkers int) *TeamReservedWorkerPool { - return &TeamReservedWorkerPool{ +func NewOrgReservedPool(shared *K8sWorkerPool, orgID string, maxWorkers int) *OrgReservedPool { + return &OrgReservedPool{ shared: shared, - teamName: teamName, + orgID: orgID, maxWorkers: maxWorkers, leaseDuration: defaultSharedWorkerReservationLease, } } -func (p *TeamReservedWorkerPool) AcquireWorker(ctx context.Context) (*ManagedWorker, error) { +func (p *OrgReservedPool) AcquireWorker(ctx context.Context) (*ManagedWorker, error) { for { select { case <-ctx.Done(): @@ -56,7 +56,7 @@ func (p *TeamReservedWorkerPool) AcquireWorker(ctx context.Context) (*ManagedWor p.shared.mu.Unlock() worker, err := p.shared.ReserveSharedWorker(ctx, &WorkerAssignment{ - TeamName: p.teamName, + OrgID: p.orgID, LeaseExpiresAt: time.Now().Add(p.leaseDuration), }) if err != nil { @@ -64,7 +64,7 @@ func (p *TeamReservedWorkerPool) AcquireWorker(ctx context.Context) (*ManagedWor } p.shared.mu.Lock() - if owned := p.workerBelongsToTeamLocked(worker); owned { + if owned := p.workerBelongsToOrgLocked(worker); owned { worker.activeSessions++ p.shared.mu.Unlock() return worker, nil @@ -88,52 +88,52 @@ func (p *TeamReservedWorkerPool) AcquireWorker(ctx context.Context) (*ManagedWor } } -func (p *TeamReservedWorkerPool) ReleaseWorker(id int) { +func (p *OrgReservedPool) ReleaseWorker(id int) { _ = p.RetireWorkerIfNoSessions(id) } -func (p *TeamReservedWorkerPool) RetireWorker(id int) { +func (p *OrgReservedPool) RetireWorker(id int) { if _, ok := p.Worker(id); !ok { return } p.shared.RetireWorker(id) } -func (p *TeamReservedWorkerPool) RetireWorkerIfNoSessions(id int) bool { +func (p *OrgReservedPool) RetireWorkerIfNoSessions(id int) bool { if _, ok := p.Worker(id); !ok { return false } return p.shared.RetireWorkerIfNoSessions(id) } -func (p *TeamReservedWorkerPool) Worker(id int) (*ManagedWorker, bool) { +func (p *OrgReservedPool) Worker(id int) (*ManagedWorker, bool) { p.shared.mu.RLock() defer p.shared.mu.RUnlock() w, ok := p.shared.workers[id] - if !ok || !p.workerBelongsToTeamLocked(w) { + if !ok || !p.workerBelongsToOrgLocked(w) { return nil, false } return w, true } -func (p *TeamReservedWorkerPool) SpawnMinWorkers(count int) error { +func (p *OrgReservedPool) SpawnMinWorkers(count int) error { return nil } -func (p *TeamReservedWorkerPool) HealthCheckLoop(ctx context.Context, interval time.Duration, onCrash WorkerCrashHandler, onProgress ProgressHandler) { +func (p *OrgReservedPool) HealthCheckLoop(ctx context.Context, interval time.Duration, onCrash WorkerCrashHandler, onProgress ProgressHandler) { } -func (p *TeamReservedWorkerPool) SetMaxWorkers(n int) { +func (p *OrgReservedPool) SetMaxWorkers(n int) { p.shared.mu.Lock() defer p.shared.mu.Unlock() p.maxWorkers = n } -func (p *TeamReservedWorkerPool) ShutdownAll() { +func (p *OrgReservedPool) ShutdownAll() { p.shared.mu.RLock() workers := make([]int, 0, len(p.shared.workers)) for id, w := range p.shared.workers { - if p.workerBelongsToTeamLocked(w) { + if p.workerBelongsToOrgLocked(w) { workers = append(workers, id) } } @@ -144,21 +144,21 @@ func (p *TeamReservedWorkerPool) ShutdownAll() { } } -func (p *TeamReservedWorkerPool) findIdleAssignedWorkerLocked() *ManagedWorker { +func (p *OrgReservedPool) findIdleAssignedWorkerLocked() *ManagedWorker { for _, w := range p.shared.workers 
{ select { case <-w.done: continue default: } - if w.activeSessions == 0 && p.workerBelongsToTeamLocked(w) { + if w.activeSessions == 0 && p.workerBelongsToOrgLocked(w) { return w } } return nil } -func (p *TeamReservedWorkerPool) leastLoadedAssignedWorkerLocked() *ManagedWorker { +func (p *OrgReservedPool) leastLoadedAssignedWorkerLocked() *ManagedWorker { var best *ManagedWorker for _, w := range p.shared.workers { select { @@ -166,7 +166,7 @@ func (p *TeamReservedWorkerPool) leastLoadedAssignedWorkerLocked() *ManagedWorke continue default: } - if !p.workerBelongsToTeamLocked(w) { + if !p.workerBelongsToOrgLocked(w) { continue } if best == nil || w.activeSessions < best.activeSessions { @@ -176,7 +176,7 @@ func (p *TeamReservedWorkerPool) leastLoadedAssignedWorkerLocked() *ManagedWorke return best } -func (p *TeamReservedWorkerPool) assignedWorkerCountLocked() int { +func (p *OrgReservedPool) assignedWorkerCountLocked() int { count := 0 for _, w := range p.shared.workers { select { @@ -184,14 +184,14 @@ func (p *TeamReservedWorkerPool) assignedWorkerCountLocked() int { continue default: } - if p.workerBelongsToTeamLocked(w) { + if p.workerBelongsToOrgLocked(w) { count++ } } return count } -func (p *TeamReservedWorkerPool) workerBelongsToTeamLocked(w *ManagedWorker) bool { +func (p *OrgReservedPool) workerBelongsToOrgLocked(w *ManagedWorker) bool { state := w.SharedState() - return state.Assignment != nil && state.Assignment.TeamName == p.teamName && state.NormalizedLifecycle() != WorkerLifecycleRetired + return state.Assignment != nil && state.Assignment.OrgID == p.orgID && state.NormalizedLifecycle() != WorkerLifecycleRetired } diff --git a/controlplane/team_reserved_pool_test.go b/controlplane/org_reserved_pool_test.go similarity index 78% rename from controlplane/team_reserved_pool_test.go rename to controlplane/org_reserved_pool_test.go index 71a7467..504bed4 100644 --- a/controlplane/team_reserved_pool_test.go +++ b/controlplane/org_reserved_pool_test.go @@ -8,7 +8,7 @@ import ( "time" ) -func TestTeamReservedWorkerPoolAcquireReservesTeamWorker(t *testing.T) { +func TestOrgReservedPoolAcquireReservesOrgWorker(t *testing.T) { shared, _ := newTestK8sPool(t, 5) shared.spawnWarmWorkerFunc = func(ctx context.Context, id int) error { shared.mu.Lock() @@ -17,7 +17,7 @@ func TestTeamReservedWorkerPoolAcquireReservesTeamWorker(t *testing.T) { return nil } - pool := NewTeamReservedWorkerPool(shared, "analytics", 2) + pool := NewOrgReservedPool(shared, "analytics", 2) worker, err := pool.AcquireWorker(context.Background()) if err != nil { t.Fatalf("AcquireWorker: %v", err) @@ -27,7 +27,7 @@ func TestTeamReservedWorkerPoolAcquireReservesTeamWorker(t *testing.T) { } state := worker.SharedState() - if state.Assignment == nil || state.Assignment.TeamName != "analytics" { + if state.Assignment == nil || state.Assignment.OrgID != "analytics" { t.Fatalf("expected analytics assignment, got %#v", state.Assignment) } if state.Lifecycle != WorkerLifecycleReserved { @@ -35,13 +35,13 @@ func TestTeamReservedWorkerPoolAcquireReservesTeamWorker(t *testing.T) { } } -func TestTeamReservedWorkerPoolAcquireSkipsOtherTeamsWorkers(t *testing.T) { +func TestOrgReservedPoolAcquireSkipsOtherOrgsWorkers(t *testing.T) { shared, _ := newTestK8sPool(t, 5) other := &ManagedWorker{ID: 1, done: make(chan struct{})} if err := other.SetSharedState(SharedWorkerState{ Lifecycle: WorkerLifecycleReserved, Assignment: &WorkerAssignment{ - TeamName: "billing", + OrgID: "billing", LeaseExpiresAt: time.Now().Add(time.Hour), }, }); 
err != nil { @@ -56,26 +56,26 @@ func TestTeamReservedWorkerPoolAcquireSkipsOtherTeamsWorkers(t *testing.T) { return nil } - pool := NewTeamReservedWorkerPool(shared, "analytics", 2) + pool := NewOrgReservedPool(shared, "analytics", 2) worker, err := pool.AcquireWorker(context.Background()) if err != nil { t.Fatalf("AcquireWorker: %v", err) } if worker.ID == other.ID { - t.Fatal("expected analytics pool to reserve its own worker, not borrow another team's worker") + t.Fatal("expected analytics pool to reserve its own worker, not borrow another org's worker") } - if state := worker.SharedState(); state.Assignment == nil || state.Assignment.TeamName != "analytics" { + if state := worker.SharedState(); state.Assignment == nil || state.Assignment.OrgID != "analytics" { t.Fatalf("expected analytics assignment, got %#v", state.Assignment) } } -func TestTeamReservedWorkerPoolReleaseWorkerRetiresOnLastSession(t *testing.T) { +func TestOrgReservedPoolReleaseWorkerRetiresOnLastSession(t *testing.T) { shared, _ := newTestK8sPool(t, 5) worker := &ManagedWorker{ID: 9, activeSessions: 1, done: make(chan struct{})} if err := worker.SetSharedState(SharedWorkerState{ Lifecycle: WorkerLifecycleReserved, Assignment: &WorkerAssignment{ - TeamName: "analytics", + OrgID: "analytics", LeaseExpiresAt: time.Now().Add(time.Hour), }, }); err != nil { @@ -83,7 +83,7 @@ func TestTeamReservedWorkerPoolReleaseWorkerRetiresOnLastSession(t *testing.T) { } shared.workers[worker.ID] = worker - pool := NewTeamReservedWorkerPool(shared, "analytics", 1) + pool := NewOrgReservedPool(shared, "analytics", 1) pool.ReleaseWorker(worker.ID) time.Sleep(100 * time.Millisecond) diff --git a/controlplane/team_router.go b/controlplane/org_router.go similarity index 52% rename from controlplane/team_router.go rename to controlplane/org_router.go index 4d3efeb..d0e918a 100644 --- a/controlplane/team_router.go +++ b/controlplane/org_router.go @@ -14,19 +14,19 @@ import ( "github.com/posthog/duckgres/server" ) -// TeamStack holds the isolated worker pool and session manager for a team. -type TeamStack struct { - Config *configstore.TeamConfig +// OrgStack holds the isolated worker pool and session manager for an org. +type OrgStack struct { + Config *configstore.OrgConfig Pool WorkerPool Sessions *SessionManager Rebalancer *MemoryRebalancer cancel context.CancelFunc } -// TeamRouter manages per-team stacks, creating/destroying them as config changes. -type TeamRouter struct { +// OrgRouter manages per-org stacks, creating/destroying them as config changes. +type OrgRouter struct { mu sync.RWMutex - teams map[string]*TeamStack + orgs map[string]*OrgStack configStore *configstore.ConfigStore baseCfg K8sWorkerPoolConfig sharedPool *K8sWorkerPool @@ -36,10 +36,10 @@ type TeamRouter struct { sharedCancel context.CancelFunc } -// NewTeamRouter creates a TeamRouter from the initial config snapshot. -func NewTeamRouter(store *configstore.ConfigStore, baseCfg K8sWorkerPoolConfig, globalCfg ControlPlaneConfig, srv *server.Server) (*TeamRouter, error) { - tr := &TeamRouter{ - teams: make(map[string]*TeamStack), +// NewOrgRouter creates an OrgRouter from the initial config snapshot. 
+func NewOrgRouter(store *configstore.ConfigStore, baseCfg K8sWorkerPoolConfig, globalCfg ControlPlaneConfig, srv *server.Server) (*OrgRouter, error) { + tr := &OrgRouter{ + orgs: make(map[string]*OrgStack), configStore: store, baseCfg: baseCfg, globalCfg: globalCfg, @@ -47,7 +47,7 @@ func NewTeamRouter(store *configstore.ConfigStore, baseCfg K8sWorkerPoolConfig, } sharedCfg := baseCfg - sharedCfg.TeamName = "" + sharedCfg.OrgID = "" sharedCfg.WorkerIDGenerator = func() int { return int(tr.nextWorkerID.Add(1)) } @@ -67,9 +67,9 @@ func NewTeamRouter(store *configstore.ConfigStore, baseCfg K8sWorkerPoolConfig, go tr.sharedPool.HealthCheckLoop(sharedCtx, tr.globalCfg.HealthCheckInterval, tr.onSharedWorkerCrash, tr.onSharedWorkerProgress) snap := store.Snapshot() - for _, tc := range snap.Teams { - if _, err := tr.createTeamStack(tc); err != nil { - slog.Error("Failed to create team stack.", "team", tc.Name, "error", err) + for _, tc := range snap.Orgs { + if _, err := tr.createOrgStack(tc); err != nil { + slog.Error("Failed to create org stack.", "org", tc.Name, "error", err) continue } } @@ -79,8 +79,8 @@ func NewTeamRouter(store *configstore.ConfigStore, baseCfg K8sWorkerPoolConfig, return tr, nil } -// createTeamStack creates an isolated pool + session manager for a team. -func (tr *TeamRouter) createTeamStack(tc *configstore.TeamConfig) (*TeamStack, error) { +// createOrgStack creates an isolated pool + session manager for an org. +func (tr *OrgRouter) createOrgStack(tc *configstore.OrgConfig) (*OrgStack, error) { ctx, cancel := context.WithCancel(context.Background()) maxWorkers := tc.MaxWorkers @@ -93,14 +93,14 @@ func (tr *TeamRouter) createTeamStack(tc *configstore.TeamConfig) (*TeamStack, e memoryBudget = int64(server.ParseMemoryBytes(tc.MemoryBudget)) } - pool := NewTeamReservedWorkerPool(tr.sharedPool, tc.Name, maxWorkers) + pool := NewOrgReservedPool(tr.sharedPool, tc.Name, maxWorkers) rebalancer := NewMemoryRebalancer(uint64(memoryBudget), 0, nil, tr.globalCfg.MemoryRebalance) sessions := NewSessionManager(pool, rebalancer) rebalancer.SetSessionLister(sessions) - // Periodic per-team metrics emission - teamName := tc.Name + // Periodic per-org metrics emission + orgID := tc.Name go func() { ticker := time.NewTicker(10 * time.Second) defer ticker.Stop() @@ -110,12 +110,12 @@ func (tr *TeamRouter) createTeamStack(tc *configstore.TeamConfig) (*TeamStack, e return case <-ticker.C: sessionCount := sessions.SessionCount() - observeTeamSessionsActive(teamName, sessionCount) + observeOrgSessionsActive(orgID, sessionCount) } } }() - stack := &TeamStack{ + stack := &OrgStack{ Config: tc, Pool: pool, Sessions: sessions, @@ -124,26 +124,26 @@ func (tr *TeamRouter) createTeamStack(tc *configstore.TeamConfig) (*TeamStack, e } tr.mu.Lock() - tr.teams[tc.Name] = stack + tr.orgs[tc.Name] = stack tr.mu.Unlock() - slog.Info("Team stack created.", "team", tc.Name, "max_workers", maxWorkers) + slog.Info("Org stack created.", "org", tc.Name, "max_workers", maxWorkers) _ = ctx // keep linter happy return stack, nil } -// DestroyTeamStack drains and cleans up a team's resources. -func (tr *TeamRouter) DestroyTeamStack(teamName string) { +// DestroyOrgStack drains and cleans up an org's resources. 
+func (tr *OrgRouter) DestroyOrgStack(orgID string) { tr.mu.Lock() - stack, ok := tr.teams[teamName] + stack, ok := tr.orgs[orgID] if !ok { tr.mu.Unlock() return } - delete(tr.teams, teamName) + delete(tr.orgs, orgID) tr.mu.Unlock() - slog.Info("Destroying team stack.", "team", teamName) + slog.Info("Destroying org stack.", "org", orgID) stack.cancel() stack.Pool.ShutdownAll() if stack.Rebalancer != nil { @@ -151,50 +151,50 @@ func (tr *TeamRouter) DestroyTeamStack(teamName string) { } } -// StackForUser resolves a username to its team stack. -func (tr *TeamRouter) StackForUser(username string) (*TeamStack, bool) { - teamName := tr.configStore.TeamForUser(username) - if teamName == "" { +// StackForUser resolves a username to its org stack. +func (tr *OrgRouter) StackForUser(username string) (*OrgStack, bool) { + orgID := tr.configStore.OrgForUser(username) + if orgID == "" { return nil, false } tr.mu.RLock() - stack, ok := tr.teams[teamName] + stack, ok := tr.orgs[orgID] tr.mu.RUnlock() return stack, ok } -// HandleConfigChange reconciles team stacks when the config snapshot changes. -func (tr *TeamRouter) HandleConfigChange(old, new *configstore.Snapshot) { - // Detect new teams - for name, tc := range new.Teams { - if _, existed := old.Teams[name]; !existed { - slog.Info("New team detected, creating stack.", "team", name) - if _, err := tr.createTeamStack(tc); err != nil { - slog.Error("Failed to create team stack on config change.", "team", name, "error", err) +// HandleConfigChange reconciles org stacks when the config snapshot changes. +func (tr *OrgRouter) HandleConfigChange(old, new *configstore.Snapshot) { + // Detect new orgs + for name, tc := range new.Orgs { + if _, existed := old.Orgs[name]; !existed { + slog.Info("New org detected, creating stack.", "org", name) + if _, err := tr.createOrgStack(tc); err != nil { + slog.Error("Failed to create org stack on config change.", "org", name, "error", err) } } } - // Detect removed teams - for name := range old.Teams { - if _, exists := new.Teams[name]; !exists { - slog.Info("Team removed, destroying stack.", "team", name) - tr.DestroyTeamStack(name) + // Detect removed orgs + for name := range old.Orgs { + if _, exists := new.Orgs[name]; !exists { + slog.Info("Org removed, destroying stack.", "org", name) + tr.DestroyOrgStack(name) } } - // Detect changed team limits (update in-place) - for name, newTC := range new.Teams { - oldTC, existed := old.Teams[name] + // Detect changed org limits (update in-place) + for name, newTC := range new.Orgs { + oldTC, existed := old.Orgs[name] if !existed { continue } if oldTC.MaxWorkers != newTC.MaxWorkers || oldTC.MemoryBudget != newTC.MemoryBudget { - slog.Info("Team config changed.", "team", name, + slog.Info("Org config changed.", "org", name, "old_max_workers", oldTC.MaxWorkers, "new_max_workers", newTC.MaxWorkers) tr.mu.Lock() - if stack, ok := tr.teams[name]; ok { + if stack, ok := tr.orgs[name]; ok { stack.Config = newTC // Propagate MaxWorkers to the pool so it enforces the new limit maxWorkers := newTC.MaxWorkers @@ -210,29 +210,29 @@ func (tr *TeamRouter) HandleConfigChange(old, new *configstore.Snapshot) { tr.reconcileWarmCapacity(new) } -// AllStacks returns a snapshot of all team stacks for admin API usage. -func (tr *TeamRouter) AllStacks() map[string]*TeamStack { +// AllStacks returns a snapshot of all org stacks for admin API usage. 
+func (tr *OrgRouter) AllStacks() map[string]*OrgStack { tr.mu.RLock() defer tr.mu.RUnlock() - result := make(map[string]*TeamStack, len(tr.teams)) - for k, v := range tr.teams { + result := make(map[string]*OrgStack, len(tr.orgs)) + for k, v := range tr.orgs { result[k] = v } return result } -// ShutdownAll shuts down all team stacks. -func (tr *TeamRouter) ShutdownAll() { +// ShutdownAll shuts down all org stacks. +func (tr *OrgRouter) ShutdownAll() { tr.mu.Lock() - teams := make(map[string]*TeamStack, len(tr.teams)) - for k, v := range tr.teams { - teams[k] = v + orgs := make(map[string]*OrgStack, len(tr.orgs)) + for k, v := range tr.orgs { + orgs[k] = v } - tr.teams = make(map[string]*TeamStack) + tr.orgs = make(map[string]*OrgStack) tr.mu.Unlock() - for name, stack := range teams { - slog.Info("Shutting down team stack.", "team", name) + for name, stack := range orgs { + slog.Info("Shutting down org stack.", "org", name) stack.cancel() stack.Pool.ShutdownAll() if stack.Rebalancer != nil { @@ -248,7 +248,7 @@ func (tr *TeamRouter) ShutdownAll() { } } -func (tr *TeamRouter) reconcileWarmCapacity(snap *configstore.Snapshot) { +func (tr *OrgRouter) reconcileWarmCapacity(snap *configstore.Snapshot) { if tr.sharedPool == nil || snap == nil { return } @@ -260,26 +260,26 @@ func (tr *TeamRouter) reconcileWarmCapacity(snap *configstore.Snapshot) { tr.sharedPool.SetWarmCapacityTarget(target) if target > 0 { - observeTeamWorkerSpawn("shared") + observeOrgWorkerSpawn("shared") if err := tr.sharedPool.SpawnMinWorkers(target); err != nil { slog.Warn("Failed to reconcile shared warm capacity.", "target", target, "error", err) } } } -func (tr *TeamRouter) onSharedWorkerCrash(workerID int) { - stack, teamName, ok := tr.stackForWorker(workerID) +func (tr *OrgRouter) onSharedWorkerCrash(workerID int) { + stack, orgID, ok := tr.stackForWorker(workerID) if !ok { return } - observeTeamWorkerCrash(teamName) + observeOrgWorkerCrash(orgID) stack.Sessions.OnWorkerCrash(workerID, func(pid int32) { - slog.Warn("Session orphaned by worker crash.", "team", teamName, "pid", pid, "worker", workerID) + slog.Warn("Session orphaned by worker crash.", "org", orgID, "pid", pid, "worker", workerID) }) } -func (tr *TeamRouter) onSharedWorkerProgress(workerID int, progress map[string]*SessionProgress) { +func (tr *OrgRouter) onSharedWorkerProgress(workerID int, progress map[string]*SessionProgress) { stack, _, ok := tr.stackForWorker(workerID) if !ok { return @@ -287,13 +287,13 @@ func (tr *TeamRouter) onSharedWorkerProgress(workerID int, progress map[string]* stack.Sessions.UpdateProgress(workerID, progress) } -func (tr *TeamRouter) stackForWorker(workerID int) (*TeamStack, string, bool) { +func (tr *OrgRouter) stackForWorker(workerID int) (*OrgStack, string, bool) { tr.mu.RLock() defer tr.mu.RUnlock() - for teamName, stack := range tr.teams { + for orgID, stack := range tr.orgs { if stack.Sessions.SessionCountForWorker(workerID) > 0 { - return stack, teamName, true + return stack, orgID, true } } return nil, "", false diff --git a/controlplane/team_router_test.go b/controlplane/org_router_test.go similarity index 83% rename from controlplane/team_router_test.go rename to controlplane/org_router_test.go index 05ab2a5..db94376 100644 --- a/controlplane/team_router_test.go +++ b/controlplane/org_router_test.go @@ -9,7 +9,7 @@ import ( "github.com/posthog/duckgres/controlplane/configstore" ) -func TestTeamRouterReconcileWarmCapacityUsesExplicitSharedWarmTarget(t *testing.T) { +func 
TestOrgRouterReconcileWarmCapacityUsesExplicitSharedWarmTarget(t *testing.T) { sharedPool, _ := newTestK8sPool(t, 10) sharedPool.spawnWarmWorkerFunc = func(ctx context.Context, id int) error { sharedPool.mu.Lock() @@ -17,7 +17,7 @@ func TestTeamRouterReconcileWarmCapacityUsesExplicitSharedWarmTarget(t *testing. sharedPool.workers[id] = &ManagedWorker{ID: id, done: make(chan struct{})} return nil } - tr := &TeamRouter{ + tr := &OrgRouter{ sharedPool: sharedPool, globalCfg: ControlPlaneConfig{ K8s: K8sConfig{ @@ -27,7 +27,7 @@ func TestTeamRouterReconcileWarmCapacityUsesExplicitSharedWarmTarget(t *testing. } snap := &configstore.Snapshot{ - Teams: map[string]*configstore.TeamConfig{ + Orgs: map[string]*configstore.OrgConfig{ "analytics": {Name: "analytics"}, "billing": {Name: "billing"}, }, diff --git a/controlplane/worker_mgr_test.go b/controlplane/worker_mgr_test.go index 2042968..fda7db3 100644 --- a/controlplane/worker_mgr_test.go +++ b/controlplane/worker_mgr_test.go @@ -1140,7 +1140,7 @@ func TestManagedWorkerSetSharedStateClonesAssignment(t *testing.T) { input := SharedWorkerState{ Lifecycle: WorkerLifecycleReserved, Assignment: &WorkerAssignment{ - TeamName: "analytics", + OrgID: "analytics", LeaseExpiresAt: leaseExpiry, }, } @@ -1150,7 +1150,7 @@ func TestManagedWorkerSetSharedStateClonesAssignment(t *testing.T) { t.Fatalf("SetSharedState: %v", err) } - input.Assignment.TeamName = "mutated" + input.Assignment.OrgID = "mutated" input.Assignment.LeaseExpiresAt = leaseExpiry.Add(time.Hour) got := w.SharedState() @@ -1160,19 +1160,19 @@ func TestManagedWorkerSetSharedStateClonesAssignment(t *testing.T) { if got.Assignment == input.Assignment { t.Fatal("expected worker state assignment to be cloned") } - if got.Assignment.TeamName != "analytics" { - t.Fatalf("expected stored team name analytics, got %q", got.Assignment.TeamName) + if got.Assignment.OrgID != "analytics" { + t.Fatalf("expected stored org ID analytics, got %q", got.Assignment.OrgID) } if !got.Assignment.LeaseExpiresAt.Equal(leaseExpiry) { t.Fatalf("expected stored lease expiry %v, got %v", leaseExpiry, got.Assignment.LeaseExpiresAt) } - got.Assignment.TeamName = "leaked" + got.Assignment.OrgID = "leaked" fresh := w.SharedState() if fresh.Assignment == nil { t.Fatal("expected stored assignment on subsequent read") } - if fresh.Assignment.TeamName != "analytics" { - t.Fatalf("expected readback clone to protect stored team name, got %q", fresh.Assignment.TeamName) + if fresh.Assignment.OrgID != "analytics" { + t.Fatalf("expected readback clone to protect stored org ID, got %q", fresh.Assignment.OrgID) } } diff --git a/controlplane/worker_pool.go b/controlplane/worker_pool.go index 7612694..e296616 100644 --- a/controlplane/worker_pool.go +++ b/controlplane/worker_pool.go @@ -58,8 +58,8 @@ type K8sWorkerPoolConfig struct { ImagePullPolicy string // Image pull policy for worker pods (e.g., "Never", "IfNotPresent", "Always") ServiceAccount string // ServiceAccount name for worker pods (default: "default") MemoryBudget int64 // Total memory budget in bytes; used to derive per-worker resource limits - TeamName string // Team name for pod labels (multi-tenant mode) - WorkerIDGenerator func() int // Shared ID generator across teams (nil = internal counter) + OrgID string // Org ID for pod labels (multi-tenant mode) + WorkerIDGenerator func() int // Shared ID generator across orgs (nil = internal counter) } // K8sPoolFactory creates a K8sWorkerPool. 
Registered at init time by the diff --git a/controlplane/worker_state.go b/controlplane/worker_state.go index 16fef04..a9c129b 100644 --- a/controlplane/worker_state.go +++ b/controlplane/worker_state.go @@ -20,9 +20,9 @@ const ( ) // WorkerAssignment carries tenant-specific metadata once a shared worker has -// been reserved for a team. +// been reserved for an org. type WorkerAssignment struct { - TeamName string + OrgID string LeaseExpiresAt time.Time } @@ -150,8 +150,8 @@ func resolveWorkerAssignment(current, proposed *WorkerAssignment) (*WorkerAssign return cloneWorkerAssignment(proposed), nil case proposed == nil: return cloneWorkerAssignment(current), nil - case current.TeamName != proposed.TeamName: - return nil, fmt.Errorf("assignment team cannot change from %q to %q", current.TeamName, proposed.TeamName) + case current.OrgID != proposed.OrgID: + return nil, fmt.Errorf("assignment org cannot change from %q to %q", current.OrgID, proposed.OrgID) default: return cloneWorkerAssignment(proposed), nil } @@ -161,8 +161,8 @@ func validateWorkerAssignment(assignment *WorkerAssignment) error { if assignment == nil { return fmt.Errorf("missing assignment") } - if assignment.TeamName == "" { - return fmt.Errorf("missing team name") + if assignment.OrgID == "" { + return fmt.Errorf("missing org ID") } if assignment.LeaseExpiresAt.IsZero() { return fmt.Errorf("missing lease expiry") diff --git a/controlplane/worker_state_test.go b/controlplane/worker_state_test.go index e33e9b9..475ab1d 100644 --- a/controlplane/worker_state_test.go +++ b/controlplane/worker_state_test.go @@ -23,7 +23,7 @@ func TestSharedWorkerStateTransitionLifecycle(t *testing.T) { leaseExpiry := time.Date(2026, time.March, 20, 16, 0, 0, 0, time.UTC) state, err := (SharedWorkerState{}).Transition(WorkerLifecycleReserved, &WorkerAssignment{ - TeamName: "analytics", + OrgID: "analytics", LeaseExpiresAt: leaseExpiry, }) if err != nil { @@ -32,7 +32,7 @@ func TestSharedWorkerStateTransitionLifecycle(t *testing.T) { if got := state.NormalizedLifecycle(); got != WorkerLifecycleReserved { t.Fatalf("expected reserved lifecycle, got %q", got) } - if state.Assignment == nil || state.Assignment.TeamName != "analytics" { + if state.Assignment == nil || state.Assignment.OrgID != "analytics" { t.Fatalf("expected analytics assignment, got %#v", state.Assignment) } if !state.Assignment.LeaseExpiresAt.Equal(leaseExpiry) { @@ -54,7 +54,7 @@ func TestSharedWorkerStateTransitionLifecycle(t *testing.T) { if got := state.NormalizedLifecycle(); got != WorkerLifecycleRetired { t.Fatalf("expected retired lifecycle, got %q", got) } - if state.Assignment == nil || state.Assignment.TeamName != "analytics" { + if state.Assignment == nil || state.Assignment.OrgID != "analytics" { t.Fatalf("expected retired worker to retain last assignment metadata, got %#v", state.Assignment) } } @@ -67,11 +67,11 @@ func TestSharedWorkerStateTransitionRejectsMissingOrInvalidAssignment(t *testing if _, err := (SharedWorkerState{}).Transition(WorkerLifecycleReserved, &WorkerAssignment{ LeaseExpiresAt: time.Now().Add(time.Hour), }); err == nil { - t.Fatal("expected reserve transition without team name to fail") + t.Fatal("expected reserve transition without org ID to fail") } if _, err := (SharedWorkerState{}).Transition(WorkerLifecycleReserved, &WorkerAssignment{ - TeamName: "analytics", + OrgID: "analytics", }); err == nil { t.Fatal("expected reserve transition without lease expiry to fail") } @@ -80,7 +80,7 @@ func 
TestSharedWorkerStateTransitionRejectsMissingOrInvalidAssignment(t *testing func TestSharedWorkerStateTransitionRejectsInvalidLifecycleMoves(t *testing.T) { leaseExpiry := time.Date(2026, time.March, 20, 16, 0, 0, 0, time.UTC) state, err := (SharedWorkerState{}).Transition(WorkerLifecycleReserved, &WorkerAssignment{ - TeamName: "analytics", + OrgID: "analytics", LeaseExpiresAt: leaseExpiry, }) if err != nil { @@ -97,7 +97,7 @@ func TestSharedWorkerStateTransitionRejectsInvalidLifecycleMoves(t *testing.T) { } if _, err := state.Transition(WorkerLifecycleHot, &WorkerAssignment{ - TeamName: "billing", + OrgID: "billing", LeaseExpiresAt: leaseExpiry.Add(time.Hour), }); err == nil { t.Fatal("expected activating -> hot transition to reject assignment changes") diff --git a/k8s/local-config-store.seed.sql b/k8s/local-config-store.seed.sql index 11a5517..662d372 100644 --- a/k8s/local-config-store.seed.sql +++ b/k8s/local-config-store.seed.sql @@ -1,10 +1,10 @@ -INSERT INTO duckgres_teams (name, max_workers, memory_budget, idle_timeout_s, created_at, updated_at) +INSERT INTO duckgres_orgs (name, max_workers, memory_budget, idle_timeout_s, created_at, updated_at) VALUES ('local', 0, '', 0, NOW(), NOW()) ON CONFLICT (name) DO UPDATE SET updated_at = NOW(); INSERT INTO duckgres_managed_warehouses ( - team_name, + org_id, warehouse_database_region, warehouse_database_endpoint, warehouse_database_port, @@ -73,7 +73,7 @@ VALUES ( 'minio', 'us-east-1', 'duckgres-local', - 'teams/local/', + 'orgs/local/', 'host.docker.internal:39000', false, 'path', @@ -109,7 +109,7 @@ VALUES ( NOW(), NOW() ) -ON CONFLICT (team_name) DO UPDATE +ON CONFLICT (org_id) DO UPDATE SET warehouse_database_region = EXCLUDED.warehouse_database_region, warehouse_database_endpoint = EXCLUDED.warehouse_database_endpoint, warehouse_database_port = EXCLUDED.warehouse_database_port, @@ -160,9 +160,9 @@ SET warehouse_database_region = EXCLUDED.warehouse_database_region, failed_at = EXCLUDED.failed_at, updated_at = NOW(); -INSERT INTO duckgres_team_users (username, password, team_name, created_at, updated_at) +INSERT INTO duckgres_org_users (username, password, org_id, created_at, updated_at) VALUES ('postgres', '$2a$10$TQyt73Vw91Q1d7YcE86EVuhms/0u4qBydMDyVvZYlqDwc3/VtQAbm', 'local', NOW(), NOW()) ON CONFLICT (username) DO UPDATE SET password = EXCLUDED.password, - team_name = EXCLUDED.team_name, + org_id = EXCLUDED.org_id, updated_at = NOW(); diff --git a/tests/controlplane/managed_warehouse_postgres_test.go b/tests/controlplane/managed_warehouse_postgres_test.go index 9072c43..04a4deb 100644 --- a/tests/controlplane/managed_warehouse_postgres_test.go +++ b/tests/controlplane/managed_warehouse_postgres_test.go @@ -30,18 +30,18 @@ func TestManagedWarehouseConfigStorePostgres(t *testing.T) { t.Fatalf("hash password: %v", err) } - if err := store.DB().Create(&configstore.Team{Name: "analytics"}).Error; err != nil { - t.Fatalf("create team: %v", err) + if err := store.DB().Create(&configstore.Org{Name: "analytics"}).Error; err != nil { + t.Fatalf("create org: %v", err) } - if err := store.DB().Create(&configstore.TeamUser{ + if err := store.DB().Create(&configstore.OrgUser{ Username: "alice", Password: passwordHash, - TeamName: "analytics", + OrgID: "analytics", }).Error; err != nil { t.Fatalf("create user: %v", err) } if err := store.DB().Create(&configstore.ManagedWarehouse{ - TeamName: "analytics", + OrgID: "analytics", WarehouseDatabase: configstore.ManagedWarehouseDatabase{ Region: "us-east-1", Endpoint: "analytics.cluster.example", @@ 
-73,42 +73,42 @@ func TestManagedWarehouseConfigStorePostgres(t *testing.T) { t.Fatalf("reload store: %v", err) } - teamCfg := store.Snapshot().Teams["analytics"] - if teamCfg == nil { - t.Fatal("expected analytics team in snapshot") + orgCfg := store.Snapshot().Orgs["analytics"] + if orgCfg == nil { + t.Fatal("expected analytics org in snapshot") } - if teamCfg.Warehouse == nil { + if orgCfg.Warehouse == nil { t.Fatal("expected warehouse to be preloaded into snapshot") } - if teamCfg.Warehouse.WarehouseDatabase.DatabaseName != "analytics_wh" { - t.Fatalf("expected analytics_wh, got %q", teamCfg.Warehouse.WarehouseDatabase.DatabaseName) + if orgCfg.Warehouse.WarehouseDatabase.DatabaseName != "analytics_wh" { + t.Fatalf("expected analytics_wh, got %q", orgCfg.Warehouse.WarehouseDatabase.DatabaseName) } - if teamCfg.Warehouse.MetadataStore.Kind != "dedicated_rds" { - t.Fatalf("expected metadata store kind dedicated_rds, got %q", teamCfg.Warehouse.MetadataStore.Kind) + if orgCfg.Warehouse.MetadataStore.Kind != "dedicated_rds" { + t.Fatalf("expected metadata store kind dedicated_rds, got %q", orgCfg.Warehouse.MetadataStore.Kind) } - if teamCfg.Warehouse.MetadataStore.DatabaseName != "ducklake_metadata" { - t.Fatalf("expected ducklake_metadata, got %q", teamCfg.Warehouse.MetadataStore.DatabaseName) + if orgCfg.Warehouse.MetadataStore.DatabaseName != "ducklake_metadata" { + t.Fatalf("expected ducklake_metadata, got %q", orgCfg.Warehouse.MetadataStore.DatabaseName) } - if teamCfg.Users["alice"] != passwordHash { + if orgCfg.Users["alice"] != passwordHash { t.Fatal("expected user credentials to remain loaded in snapshot") } - if err := store.DB().Create(&configstore.Team{Name: "cleanup"}).Error; err != nil { - t.Fatalf("create cleanup team: %v", err) + if err := store.DB().Create(&configstore.Org{Name: "cleanup"}).Error; err != nil { + t.Fatalf("create cleanup org: %v", err) } if err := store.DB().Create(&configstore.ManagedWarehouse{ - TeamName: "cleanup", - State: configstore.ManagedWarehouseStateReady, + OrgID: "cleanup", + State: configstore.ManagedWarehouseStateReady, }).Error; err != nil { t.Fatalf("create cleanup warehouse: %v", err) } - if err := store.DB().Delete(&configstore.Team{Name: "cleanup"}).Error; err != nil { - t.Fatalf("delete team: %v", err) + if err := store.DB().Delete(&configstore.Org{Name: "cleanup"}).Error; err != nil { + t.Fatalf("delete org: %v", err) } var count int64 - if err := store.DB().Model(&configstore.ManagedWarehouse{}).Where("team_name = ?", "cleanup").Count(&count).Error; err != nil { + if err := store.DB().Model(&configstore.ManagedWarehouse{}).Where("org_id = ?", "cleanup").Count(&count).Error; err != nil { t.Fatalf("count warehouses: %v", err) } if count != 0 { @@ -128,30 +128,30 @@ func TestLocalConfigStoreSeedSQL(t *testing.T) { } snap := store.Snapshot() - teamCfg := snap.Teams["local"] - if teamCfg == nil { - t.Fatal("expected local team from seed") + orgCfg := snap.Orgs["local"] + if orgCfg == nil { + t.Fatal("expected local org from seed") } - if teamCfg.Warehouse == nil { + if orgCfg.Warehouse == nil { t.Fatal("expected local warehouse from seed") } - if teamCfg.Warehouse.WarehouseDatabase.DatabaseName != "duckgres_local" { - t.Fatalf("expected duckgres_local warehouse db, got %q", teamCfg.Warehouse.WarehouseDatabase.DatabaseName) + if orgCfg.Warehouse.WarehouseDatabase.DatabaseName != "duckgres_local" { + t.Fatalf("expected duckgres_local warehouse db, got %q", orgCfg.Warehouse.WarehouseDatabase.DatabaseName) } - if 
teamCfg.Warehouse.MetadataStore.DatabaseName != "ducklake_metadata_local" { - t.Fatalf("expected ducklake_metadata_local metadata db, got %q", teamCfg.Warehouse.MetadataStore.DatabaseName) + if orgCfg.Warehouse.MetadataStore.DatabaseName != "ducklake_metadata_local" { + t.Fatalf("expected ducklake_metadata_local metadata db, got %q", orgCfg.Warehouse.MetadataStore.DatabaseName) } - if teamCfg.Warehouse.WarehouseDatabaseCredentials.Name != "duckgres-local-warehouse-db" { - t.Fatalf("expected duckgres-local-warehouse-db secret ref, got %q", teamCfg.Warehouse.WarehouseDatabaseCredentials.Name) + if orgCfg.Warehouse.WarehouseDatabaseCredentials.Name != "duckgres-local-warehouse-db" { + t.Fatalf("expected duckgres-local-warehouse-db secret ref, got %q", orgCfg.Warehouse.WarehouseDatabaseCredentials.Name) } - if teamCfg.Warehouse.State != configstore.ManagedWarehouseStateReady { - t.Fatalf("expected ready warehouse state, got %q", teamCfg.Warehouse.State) + if orgCfg.Warehouse.State != configstore.ManagedWarehouseStateReady { + t.Fatalf("expected ready warehouse state, got %q", orgCfg.Warehouse.State) } - if teamCfg.Warehouse.MetadataStoreState != configstore.ManagedWarehouseStateReady { - t.Fatalf("expected ready metadata store state, got %q", teamCfg.Warehouse.MetadataStoreState) + if orgCfg.Warehouse.MetadataStoreState != configstore.ManagedWarehouseStateReady { + t.Fatalf("expected ready metadata store state, got %q", orgCfg.Warehouse.MetadataStoreState) } - if _, ok := teamCfg.Users["postgres"]; !ok { - t.Fatal("expected seeded postgres user to belong to local team") + if _, ok := orgCfg.Users["postgres"]; !ok { + t.Fatal("expected seeded postgres user to belong to local org") } } diff --git a/tests/k8s/k8s_test.go b/tests/k8s/k8s_test.go index bd0612e..fa6229f 100644 --- a/tests/k8s/k8s_test.go +++ b/tests/k8s/k8s_test.go @@ -462,23 +462,23 @@ func stopConfigStore() error { } func seedConfigStore() error { - teamSQL := ` -INSERT INTO duckgres_teams (name, max_workers, memory_budget, idle_timeout_s, created_at, updated_at) + orgSQL := ` +INSERT INTO duckgres_orgs (name, max_workers, memory_budget, idle_timeout_s, created_at, updated_at) VALUES ('local', 0, '', 0, NOW(), NOW()) ON CONFLICT (name) DO UPDATE SET updated_at = NOW(); ` if err := runCmd("docker", "exec", "-i", "duckgres-config-store", - "psql", "-U", "duckgres", "-d", "duckgres_config", "-v", "ON_ERROR_STOP=1", "-c", teamSQL); err != nil { - return fmt.Errorf("seed team: %w", err) + "psql", "-U", "duckgres", "-d", "duckgres_config", "-v", "ON_ERROR_STOP=1", "-c", orgSQL); err != nil { + return fmt.Errorf("seed org: %w", err) } userSQL := ` -INSERT INTO duckgres_team_users (username, password, team_name, created_at, updated_at) +INSERT INTO duckgres_org_users (username, password, org_id, created_at, updated_at) VALUES ('postgres', '$2a$10$TQyt73Vw91Q1d7YcE86EVuhms/0u4qBydMDyVvZYlqDwc3/VtQAbm', 'local', NOW(), NOW()) ON CONFLICT (username) DO UPDATE SET password = EXCLUDED.password, - team_name = EXCLUDED.team_name, + org_id = EXCLUDED.org_id, updated_at = NOW(); ` if err := runCmd("docker", "exec", "-i", "duckgres-config-store", From c7efd6e088c0821bd282b0b929b0ec6a04b8753c Mon Sep 17 00:00:00 2001 From: eric Date: Thu, 19 Mar 2026 16:13:01 -0700 Subject: [PATCH 02/17] Add provisioning controller and REST API for managed warehouses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds end-to-end provisioning flow: PostHog calls a REST API to initiate provisioning, the controller drives 
Crossplane via Duckling CRs to create per-org AWS resources (Aurora, S3,
IAM), and updates the config store as resources come up. The org router
gates worker stack creation on warehouse readiness and uses per-org
namespace/SA from the warehouse config.

New packages:
- controlplane/provisioner: reconciliation loop + K8s dynamic client for
  Duckling CRs (pending→provisioning→ready, deleting→deleted, failure
  handling)
- controlplane/provisioning: production-facing REST API on separate port
  (POST /orgs/:id/provision, POST /orgs/:id/deprovision, GET
  /orgs/:id/warehouse)

Key design decisions:
- Provisioning API runs on separate port (:9091) from admin (:9090)
- Separate bearer token (--provisioning-token) for provisioning vs admin
- Controller uses WarehouseStore interface for testability
- CAS (compare-and-swap) updates prevent state races
- Synced=False tolerance (5min grace period) for Crossplane transients
- Orgs are auto-created on first provision call
- Non-K8s builds get stub (provisioning package has no build tag)

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 config_resolution.go                        |  22 ++
 controlplane/admin/api.go                   |   3 +
 controlplane/configstore/models.go          |  11 +
 controlplane/configstore/store.go           |  25 ++
 controlplane/control.go                     |  25 +-
 controlplane/multitenant.go                 |  47 ++-
 controlplane/multitenant_stub.go            |   2 +-
 controlplane/org_router.go                  |  55 +++-
 controlplane/provisioner/controller.go      | 276 ++++++++++++++++
 controlplane/provisioner/controller_stub.go |  22 ++
 controlplane/provisioner/controller_test.go | 342 ++++++++++++++++++++
 controlplane/provisioner/k8s_client.go      | 173 ++++++++++
 controlplane/provisioning/api.go            | 106 ++++++
 controlplane/provisioning/api_test.go       | 284 ++++++++++++++++
 controlplane/provisioning/store.go          |  82 +++++
 justfile                                    |   7 +-
 main.go                                     |   8 +
 17 files changed, 1479 insertions(+), 11 deletions(-)
 create mode 100644 controlplane/provisioner/controller.go
 create mode 100644 controlplane/provisioner/controller_stub.go
 create mode 100644 controlplane/provisioner/controller_test.go
 create mode 100644 controlplane/provisioner/k8s_client.go
 create mode 100644 controlplane/provisioning/api.go
 create mode 100644 controlplane/provisioning/api_test.go
 create mode 100644 controlplane/provisioning/store.go

diff --git a/config_resolution.go b/config_resolution.go
index f35c875..8aad925 100644
--- a/config_resolution.go
+++ b/config_resolution.go
@@ -41,6 +41,8 @@ type configCLIInputs struct {
 	ConfigStoreConn     string
 	ConfigPollInterval  string
 	AdminToken          string
+	ProvisioningToken   string
+	ProvisioningPort    int
 	WorkerBackend       string
 	K8sWorkerImage      string
 	K8sWorkerNamespace  string
@@ -76,6 +78,8 @@ type resolvedConfig struct {
 	ConfigStoreConn    string
 	ConfigPollInterval time.Duration
 	AdminToken         string
+	ProvisioningToken  string
+	ProvisioningPort   int
 }
 
 func defaultServerConfig() server.Config {
@@ -132,6 +136,8 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun
 	var configStoreConn string
 	var configPollInterval time.Duration
 	var adminToken string
+	var provisioningToken string
+	var provisioningPort int
 
 	if fileCfg != nil {
 		if fileCfg.Host != "" {
@@ -584,6 +590,14 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun
 	if v := getenv("DUCKGRES_ADMIN_TOKEN"); v != "" {
 		adminToken = v
 	}
+	if v := getenv("DUCKGRES_PROVISIONING_TOKEN"); v != "" {
+		provisioningToken = v
+	}
+	if v := getenv("DUCKGRES_PROVISIONING_PORT"); v != "" {
+		if n, err := strconv.Atoi(v); err == nil {
+			provisioningPort = n
+		}
+	}
 	if v := 
getenv("DUCKGRES_WORKER_BACKEND"); v != "" { workerBackend = v } @@ -793,6 +807,12 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun if cli.Set["admin-token"] { adminToken = cli.AdminToken } + if cli.Set["provisioning-token"] { + provisioningToken = cli.ProvisioningToken + } + if cli.Set["provisioning-port"] { + provisioningPort = cli.ProvisioningPort + } if cli.Set["worker-backend"] { workerBackend = cli.WorkerBackend } @@ -898,5 +918,7 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun ConfigStoreConn: configStoreConn, ConfigPollInterval: configPollInterval, AdminToken: adminToken, + ProvisioningToken: provisioningToken, + ProvisioningPort: provisioningPort, } } diff --git a/controlplane/admin/api.go b/controlplane/admin/api.go index 2648242..96430e4 100644 --- a/controlplane/admin/api.go +++ b/controlplane/admin/api.go @@ -284,6 +284,9 @@ func (s *gormAPIStore) UpsertManagedWarehouse(orgID string, warehouse *configsto func managedWarehouseUpsertColumns() []string { return []string{ + "image", + "aurora_min_acu", + "aurora_max_acu", "warehouse_database_region", "warehouse_database_endpoint", "warehouse_database_port", diff --git a/controlplane/configstore/models.go b/controlplane/configstore/models.go index 8215ee7..197cd19 100644 --- a/controlplane/configstore/models.go +++ b/controlplane/configstore/models.go @@ -90,6 +90,10 @@ type ManagedWarehouseWorkerIdentity struct { type ManagedWarehouse struct { OrgID string `gorm:"primaryKey;size:255" json:"org_id"` + Image string `gorm:"size:512" json:"image"` + AuroraMinACU float64 `json:"aurora_min_acu"` + AuroraMaxACU float64 `json:"aurora_max_acu"` + WarehouseDatabase ManagedWarehouseDatabase `gorm:"embedded;embeddedPrefix:warehouse_database_" json:"warehouse_database"` MetadataStore ManagedWarehouseMetadataStore `gorm:"embedded;embeddedPrefix:metadata_store_" json:"metadata_store"` S3 ManagedWarehouseS3 `gorm:"embedded;embeddedPrefix:s3_" json:"s3"` @@ -195,6 +199,10 @@ type OrgConfig struct { type ManagedWarehouseConfig struct { OrgID string + Image string + AuroraMinACU float64 + AuroraMaxACU float64 + WarehouseDatabase ManagedWarehouseDatabase MetadataStore ManagedWarehouseMetadataStore S3 ManagedWarehouseS3 @@ -228,6 +236,9 @@ func copyManagedWarehouseConfig(warehouse *ManagedWarehouse) *ManagedWarehouseCo cfg := &ManagedWarehouseConfig{ OrgID: warehouse.OrgID, + Image: warehouse.Image, + AuroraMinACU: warehouse.AuroraMinACU, + AuroraMaxACU: warehouse.AuroraMaxACU, WarehouseDatabase: warehouse.WarehouseDatabase, MetadataStore: warehouse.MetadataStore, S3: warehouse.S3, diff --git a/controlplane/configstore/store.go b/controlplane/configstore/store.go index a50aaf5..88353e6 100644 --- a/controlplane/configstore/store.go +++ b/controlplane/configstore/store.go @@ -217,6 +217,31 @@ func (cs *ConfigStore) OnChange(fn func(old, new *Snapshot)) { cs.onChange = append(cs.onChange, fn) } +// ListWarehousesByStates returns all warehouses with a state matching one of the given values. +// This is a direct DB query, not snapshot-based, for use by the provisioning controller. 
+func (cs *ConfigStore) ListWarehousesByStates(states []ManagedWarehouseProvisioningState) ([]ManagedWarehouse, error) { + var warehouses []ManagedWarehouse + if err := cs.db.Where("state IN ?", states).Find(&warehouses).Error; err != nil { + return nil, fmt.Errorf("list warehouses by states: %w", err) + } + return warehouses, nil +} + +// UpdateWarehouseState performs a compare-and-swap update on a warehouse row. +// Only updates if the current state matches expectedState, preventing races. +func (cs *ConfigStore) UpdateWarehouseState(orgID string, expectedState ManagedWarehouseProvisioningState, updates map[string]interface{}) error { + result := cs.db.Model(&ManagedWarehouse{}). + Where("org_id = ? AND state = ?", orgID, expectedState). + Updates(updates) + if result.Error != nil { + return fmt.Errorf("update warehouse state: %w", result.Error) + } + if result.RowsAffected == 0 { + return fmt.Errorf("warehouse %q not in expected state %q", orgID, expectedState) + } + return nil +} + // DB exposes the GORM database for direct CRUD operations (used by admin API). func (cs *ConfigStore) DB() *gorm.DB { return cs.db diff --git a/controlplane/control.go b/controlplane/control.go index be41cd7..efd24a1 100644 --- a/controlplane/control.go +++ b/controlplane/control.go @@ -60,6 +60,14 @@ type ControlPlaneConfig struct { // AdminToken is the bearer token required for admin API requests. // When empty, a random token is generated and logged at startup. AdminToken string + + // ProvisioningToken is the bearer token required for provisioning API requests. + // When empty, falls back to AdminToken. + ProvisioningToken string + + // ProvisioningPort is the listen port for the provisioning API server. + // Default: 9091. + ProvisioningPort int } type ProcessConfig struct { @@ -107,8 +115,9 @@ type ControlPlane struct { acmeDNSManager *server.ACMEDNSManager // ACME manager for DNS-01 (nil when not using DNS challenges) // Multi-tenant fields (non-nil in remote multitenant mode) - orgRouter OrgRouterInterface - configStore ConfigStoreInterface + orgRouter OrgRouterInterface + configStore ConfigStoreInterface + provisioningServer *http.Server // provisioning API server (shut down on graceful exit) } // ConfigStoreInterface abstracts the config store for the control plane. @@ -316,7 +325,7 @@ func RunControlPlane(cfg ControlPlaneConfig) { // Multi-tenant mode: config store + per-org pools (K8s remote backend only) if cfg.WorkerBackend == "remote" { - store, adapter, adminSrv, err := SetupMultiTenant(cfg, srv, memBudget, k8sMaxWorkers) + store, adapter, servers, err := SetupMultiTenant(cfg, srv, memBudget, k8sMaxWorkers) if err != nil { slog.Error("Failed to set up multi-tenant config store.", "error", err) os.Exit(1) @@ -329,7 +338,8 @@ func RunControlPlane(cfg ControlPlaneConfig) { _ = cfg.MetricsServer.Shutdown(ctx) cancel() } - cfg.MetricsServer = adminSrv + cfg.MetricsServer = servers[0] // admin server + cp.provisioningServer = servers[1] cp.cfg = cfg _ = store // keep linter happy } else { @@ -952,6 +962,13 @@ func (cp *ControlPlane) handleUpgrade() { } cancel() } + if cp.provisioningServer != nil { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + if err := cp.provisioningServer.Shutdown(ctx); err != nil { + slog.Warn("Provisioning server shutdown failed.", "error", err) + } + cancel() + } // Stop ACME managers so the new CP can bind port 80 (HTTP-01) or // manage DNS records. 
Nil out after close so drainAfterUpgrade diff --git a/controlplane/multitenant.go b/controlplane/multitenant.go index 5836e6c..062555f 100644 --- a/controlplane/multitenant.go +++ b/controlplane/multitenant.go @@ -14,6 +14,8 @@ import ( "github.com/gin-gonic/gin" "github.com/posthog/duckgres/controlplane/admin" "github.com/posthog/duckgres/controlplane/configstore" + "github.com/posthog/duckgres/controlplane/provisioner" + "github.com/posthog/duckgres/controlplane/provisioning" "github.com/posthog/duckgres/server" "github.com/prometheus/client_golang/prometheus/promhttp" ) @@ -105,14 +107,15 @@ func (a *orgRouterAdapter) AllSessionStatuses() []admin.SessionStatus { var _ OrgRouterInterface = (*orgRouterAdapter)(nil) var _ admin.OrgStackInfo = (*orgRouterAdapter)(nil) -// SetupMultiTenant initializes the config store, org router, and Gin admin server. +// SetupMultiTenant initializes the config store, org router, admin server, and provisioning server. // Called from RunControlPlane when --config-store is set with remote backend. +// Returns the admin server and provisioning server for graceful shutdown. func SetupMultiTenant( cfg ControlPlaneConfig, srv *server.Server, memBudget uint64, maxWorkers int, -) (ConfigStoreInterface, OrgRouterInterface, *http.Server, error) { +) (ConfigStoreInterface, OrgRouterInterface, []*http.Server, error) { pollInterval := cfg.ConfigPollInterval if pollInterval <= 0 { pollInterval = 30 * time.Second @@ -123,6 +126,11 @@ func SetupMultiTenant( return nil, nil, nil, err } + provisioningPort := cfg.ProvisioningPort + if provisioningPort == 0 { + provisioningPort = 9091 + } + baseCfg := K8sWorkerPoolConfig{ Namespace: cfg.K8s.WorkerNamespace, CPID: cfg.K8s.ControlPlaneID, @@ -145,6 +153,14 @@ func SetupMultiTenant( adpt := &orgRouterAdapter{router: router} + // Start provisioning controller (best-effort — K8s API may not be available locally) + provCtrl, err := provisioner.NewController(store, 10*time.Second) + if err != nil { + slog.Warn("Provisioning controller unavailable.", "error", err) + } else { + go provCtrl.Run(context.Background()) + } + // Register config change handler store.OnChange(router.HandleConfigChange) @@ -191,5 +207,30 @@ func SetupMultiTenant( } }() - return store, adpt, adminServer, nil + // Set up provisioning API server (separate from admin — production-facing) + provToken := cfg.ProvisioningToken + if provToken == "" { + provToken = adminToken // fall back to admin token if not set + } + + provEngine := gin.New() + provEngine.Use(gin.Recovery()) + provEngine.GET("/health", func(c *gin.Context) { + c.String(http.StatusOK, "ok") + }) + provAPI := provEngine.Group("/api/v1", admin.APIAuthMiddleware(provToken)) + provisioning.RegisterAPI(provAPI, provisioning.NewGormStore(store)) + + provServer := &http.Server{ + Addr: fmt.Sprintf(":%d", provisioningPort), + Handler: provEngine, + } + go func() { + slog.Info("Starting provisioning API server.", "addr", provServer.Addr) + if err := provServer.ListenAndServe(); err != nil && err != http.ErrServerClosed { + slog.Warn("Provisioning API server error.", "error", err) + } + }() + + return store, adpt, []*http.Server{adminServer, provServer}, nil } diff --git a/controlplane/multitenant_stub.go b/controlplane/multitenant_stub.go index 9efb937..3959891 100644 --- a/controlplane/multitenant_stub.go +++ b/controlplane/multitenant_stub.go @@ -15,6 +15,6 @@ func SetupMultiTenant( srv *server.Server, memBudget uint64, maxWorkers int, -) (ConfigStoreInterface, OrgRouterInterface, *http.Server, error) { +) 
(ConfigStoreInterface, OrgRouterInterface, []*http.Server, error) { return nil, nil, nil, fmt.Errorf("multi-tenant mode requires -tags kubernetes build") } diff --git a/controlplane/org_router.go b/controlplane/org_router.go index d0e918a..f4583fa 100644 --- a/controlplane/org_router.go +++ b/controlplane/org_router.go @@ -68,6 +68,11 @@ func NewOrgRouter(store *configstore.ConfigStore, baseCfg K8sWorkerPoolConfig, g snap := store.Snapshot() for _, tc := range snap.Orgs { + // Only create stacks for orgs with ready warehouses (or no warehouse at all for backwards compat) + if tc.Warehouse != nil && tc.Warehouse.State != configstore.ManagedWarehouseStateReady { + slog.Info("Skipping org stack creation (warehouse not ready).", "org", tc.Name, "state", tc.Warehouse.State) + continue + } if _, err := tr.createOrgStack(tc); err != nil { slog.Error("Failed to create org stack.", "org", tc.Name, "error", err) continue @@ -93,6 +98,15 @@ func (tr *OrgRouter) createOrgStack(tc *configstore.OrgConfig) (*OrgStack, error memoryBudget = int64(server.ParseMemoryBytes(tc.MemoryBudget)) } + // Use per-org namespace and service account from warehouse config + if tc.Warehouse != nil && tc.Warehouse.State == configstore.ManagedWarehouseStateReady { + if tc.Warehouse.WorkerIdentity.Namespace != "" { + // Note: OrgReservedPool inherits from the shared pool, so namespace + // overrides are propagated via label selectors, not pool config. + _ = tc.Warehouse.WorkerIdentity.Namespace // used for future per-org pool config + } + } + pool := NewOrgReservedPool(tr.sharedPool, tc.Name, maxWorkers) rebalancer := NewMemoryRebalancer(uint64(memoryBudget), 0, nil, tr.globalCfg.MemoryRebalance) @@ -166,13 +180,50 @@ func (tr *OrgRouter) StackForUser(username string) (*OrgStack, bool) { // HandleConfigChange reconciles org stacks when the config snapshot changes. 
func (tr *OrgRouter) HandleConfigChange(old, new *configstore.Snapshot) { - // Detect new orgs + // Detect new orgs or orgs whose warehouse just became ready for name, tc := range new.Orgs { - if _, existed := old.Orgs[name]; !existed { + oldTC, existed := old.Orgs[name] + + // Skip orgs with warehouses that aren't ready + if tc.Warehouse != nil && tc.Warehouse.State != configstore.ManagedWarehouseStateReady { + // If warehouse is being deleted, destroy existing stack + if tc.Warehouse.State == configstore.ManagedWarehouseStateDeleting || + tc.Warehouse.State == configstore.ManagedWarehouseStateDeleted { + tr.mu.RLock() + _, hasStack := tr.orgs[name] + tr.mu.RUnlock() + if hasStack { + slog.Info("Warehouse deprovisioning, destroying stack.", "org", name) + tr.DestroyOrgStack(name) + } + } + continue + } + + tr.mu.RLock() + _, hasStack := tr.orgs[name] + tr.mu.RUnlock() + + if !existed && !hasStack { + // Brand new org -- create stack slog.Info("New org detected, creating stack.", "org", name) if _, err := tr.createOrgStack(tc); err != nil { slog.Error("Failed to create org stack on config change.", "org", name, "error", err) } + } else if existed && !hasStack { + // Existing org whose warehouse just became ready + warehouseJustReady := oldTC.Warehouse != nil && + oldTC.Warehouse.State != configstore.ManagedWarehouseStateReady && + tc.Warehouse != nil && + tc.Warehouse.State == configstore.ManagedWarehouseStateReady + noWarehouse := tc.Warehouse == nil + + if warehouseJustReady || noWarehouse { + slog.Info("Org warehouse ready, creating stack.", "org", name) + if _, err := tr.createOrgStack(tc); err != nil { + slog.Error("Failed to create org stack on config change.", "org", name, "error", err) + } + } } } diff --git a/controlplane/provisioner/controller.go b/controlplane/provisioner/controller.go new file mode 100644 index 0000000..ae61914 --- /dev/null +++ b/controlplane/provisioner/controller.go @@ -0,0 +1,276 @@ +//go:build kubernetes + +package provisioner + +import ( + "context" + "fmt" + "log/slog" + "time" + + "github.com/posthog/duckgres/controlplane/configstore" + apierrors "k8s.io/apimachinery/pkg/api/errors" +) + +// WarehouseStore is the subset of configstore.ConfigStore that the controller needs. +type WarehouseStore interface { + ListWarehousesByStates(states []configstore.ManagedWarehouseProvisioningState) ([]configstore.ManagedWarehouse, error) + UpdateWarehouseState(orgID string, expectedState configstore.ManagedWarehouseProvisioningState, updates map[string]interface{}) error +} + +// Controller polls the config store for actionable warehouses and reconciles +// their state against Duckling CRs in Kubernetes. +type Controller struct { + store WarehouseStore + duckling *DucklingClient + pollInterval time.Duration +} + +// NewController creates a provisioning controller. Returns an error if the +// Kubernetes client cannot be initialized (e.g., not running in-cluster). +func NewController(store WarehouseStore, pollInterval time.Duration) (*Controller, error) { + dc, err := NewDucklingClient() + if err != nil { + return nil, fmt.Errorf("create duckling client: %w", err) + } + return &Controller{ + store: store, + duckling: dc, + pollInterval: pollInterval, + }, nil +} + +// NewControllerWithClient creates a Controller with a pre-built DucklingClient (for testing). 
+func NewControllerWithClient(store WarehouseStore, dc *DucklingClient, pollInterval time.Duration) *Controller { + return &Controller{ + store: store, + duckling: dc, + pollInterval: pollInterval, + } +} + +// Run starts the reconciliation loop. Blocks until ctx is cancelled. +func (c *Controller) Run(ctx context.Context) { + slog.Info("Provisioning controller started.", "poll_interval", c.pollInterval) + ticker := time.NewTicker(c.pollInterval) + defer ticker.Stop() + + // Run once immediately at startup + c.reconcile(ctx) + + for { + select { + case <-ctx.Done(): + slog.Info("Provisioning controller stopped.") + return + case <-ticker.C: + c.reconcile(ctx) + } + } +} + +// actionableStates are the warehouse states the controller acts on. +var actionableStates = []configstore.ManagedWarehouseProvisioningState{ + configstore.ManagedWarehouseStatePending, + configstore.ManagedWarehouseStateProvisioning, + configstore.ManagedWarehouseStateDeleting, +} + +func (c *Controller) reconcile(ctx context.Context) { + warehouses, err := c.store.ListWarehousesByStates(actionableStates) + if err != nil { + slog.Warn("Provisioning controller: failed to list warehouses.", "error", err) + return + } + + for _, w := range warehouses { + if ctx.Err() != nil { + return + } + switch w.State { + case configstore.ManagedWarehouseStatePending: + c.reconcilePending(ctx, &w) + case configstore.ManagedWarehouseStateProvisioning: + c.reconcileProvisioning(ctx, &w) + case configstore.ManagedWarehouseStateDeleting: + c.reconcileDeleting(ctx, &w) + } + } +} + +func (c *Controller) reconcilePending(ctx context.Context, w *configstore.ManagedWarehouse) { + log := slog.With("org", w.OrgID, "phase", "pending") + + // Check if a Duckling CR already exists (e.g., controller restart) + _, err := c.duckling.Get(ctx, w.OrgID) + if err == nil { + // CR exists — transition directly to provisioning + log.Info("Duckling CR already exists, transitioning to provisioning.") + if err := c.store.UpdateWarehouseState(w.OrgID, configstore.ManagedWarehouseStatePending, map[string]interface{}{ + "state": configstore.ManagedWarehouseStateProvisioning, + "status_message": "Duckling CR exists, polling status", + }); err != nil { + log.Warn("Failed to update state to provisioning.", "error", err) + } + return + } + + // Create the Duckling CR + log.Info("Creating Duckling CR.") + if err := c.duckling.Create(ctx, w.OrgID, w.Image, w.AuroraMinACU, w.AuroraMaxACU); err != nil { + log.Error("Failed to create Duckling CR.", "error", err) + _ = c.store.UpdateWarehouseState(w.OrgID, configstore.ManagedWarehouseStatePending, map[string]interface{}{ + "state": configstore.ManagedWarehouseStateFailed, + "status_message": fmt.Sprintf("Failed to create Duckling CR: %v", err), + "failed_at": time.Now().UTC(), + }) + return + } + + if err := c.store.UpdateWarehouseState(w.OrgID, configstore.ManagedWarehouseStatePending, map[string]interface{}{ + "state": configstore.ManagedWarehouseStateProvisioning, + "status_message": "Duckling CR created, waiting for resources", + }); err != nil { + log.Warn("Failed to update state to provisioning.", "error", err) + } +} + +func (c *Controller) reconcileProvisioning(ctx context.Context, w *configstore.ManagedWarehouse) { + log := slog.With("org", w.OrgID, "phase", "provisioning") + + // Check for timeout (30 minutes) + if time.Since(w.CreatedAt) > 30*time.Minute { + log.Warn("Provisioning timed out.") + _ = c.store.UpdateWarehouseState(w.OrgID, configstore.ManagedWarehouseStateProvisioning, map[string]interface{}{ + 
"state": configstore.ManagedWarehouseStateFailed, + "status_message": "Provisioning timed out after 30 minutes", + "failed_at": time.Now().UTC(), + }) + return + } + + status, err := c.duckling.Get(ctx, w.OrgID) + if err != nil { + log.Warn("Failed to get Duckling CR status.", "error", err) + return + } + + // Check for Crossplane failure — only fail on persistent sync errors. + // Crossplane resources commonly flap Synced=False transiently (e.g., IAM + // eventual consistency), so we only transition to failed if 5+ minutes + // have passed since creation, giving transient errors time to resolve. + if status.SyncedFalseMessage != "" && time.Since(w.CreatedAt) > 5*time.Minute { + log.Warn("Crossplane sync failure.", "message", status.SyncedFalseMessage) + _ = c.store.UpdateWarehouseState(w.OrgID, configstore.ManagedWarehouseStateProvisioning, map[string]interface{}{ + "state": configstore.ManagedWarehouseStateFailed, + "status_message": fmt.Sprintf("Crossplane error: %s", status.SyncedFalseMessage), + "failed_at": time.Now().UTC(), + }) + return + } + + // Update per-component states based on Duckling CR status fields + updates := map[string]interface{}{} + + if status.BucketName != "" && w.S3State != configstore.ManagedWarehouseStateReady { + updates["s3_state"] = configstore.ManagedWarehouseStateReady + updates["s3_bucket"] = status.BucketName + if status.Region != "" { + updates["s3_region"] = status.Region + } + } + + if status.AuroraEndpoint != "" && w.MetadataStoreState != configstore.ManagedWarehouseStateReady { + updates["metadata_store_state"] = configstore.ManagedWarehouseStateReady + updates["metadata_store_endpoint"] = status.AuroraEndpoint + updates["metadata_store_port"] = status.AuroraPort + updates["metadata_store_kind"] = "aurora" + updates["metadata_store_engine"] = "postgres" + if status.Region != "" { + updates["metadata_store_region"] = status.Region + } + } + + if status.Namespace != "" && w.IdentityState != configstore.ManagedWarehouseStateReady { + updates["identity_state"] = configstore.ManagedWarehouseStateReady + updates["worker_identity_namespace"] = status.Namespace + if status.ServiceAccountName != "" { + updates["worker_identity_service_account_name"] = status.ServiceAccountName + } + if status.IAMRoleARN != "" { + updates["worker_identity_iam_role_arn"] = status.IAMRoleARN + } + } + + if status.AuroraPasswordSecret != "" && status.DuckgresPasswordSecret != "" && w.SecretsState != configstore.ManagedWarehouseStateReady { + updates["secrets_state"] = configstore.ManagedWarehouseStateReady + updates["metadata_store_credentials_namespace"] = status.Namespace + updates["metadata_store_credentials_name"] = status.AuroraPasswordSecret + updates["metadata_store_credentials_key"] = "password" + updates["runtime_config_namespace"] = status.Namespace + updates["runtime_config_name"] = status.DuckgresPasswordSecret + updates["runtime_config_key"] = "duckgres.yaml" + } + + if status.ReadyCondition && w.WarehouseDatabaseState != configstore.ManagedWarehouseStateReady { + updates["warehouse_database_state"] = configstore.ManagedWarehouseStateReady + if status.DuckgresEndpoint != "" { + updates["warehouse_database_endpoint"] = status.DuckgresEndpoint + } + if status.DuckgresPort > 0 { + updates["warehouse_database_port"] = status.DuckgresPort + } + if status.DuckgresDatabase != "" { + updates["warehouse_database_database_name"] = status.DuckgresDatabase + } + if status.DuckgresUsername != "" { + updates["warehouse_database_username"] = status.DuckgresUsername + } + if 
status.Region != "" { + updates["warehouse_database_region"] = status.Region + } + } + + // Check if all components are ready + s3Ready := w.S3State == configstore.ManagedWarehouseStateReady || updates["s3_state"] == configstore.ManagedWarehouseStateReady + metaReady := w.MetadataStoreState == configstore.ManagedWarehouseStateReady || updates["metadata_store_state"] == configstore.ManagedWarehouseStateReady + identReady := w.IdentityState == configstore.ManagedWarehouseStateReady || updates["identity_state"] == configstore.ManagedWarehouseStateReady + secretsReady := w.SecretsState == configstore.ManagedWarehouseStateReady || updates["secrets_state"] == configstore.ManagedWarehouseStateReady + dbReady := w.WarehouseDatabaseState == configstore.ManagedWarehouseStateReady || updates["warehouse_database_state"] == configstore.ManagedWarehouseStateReady + + if s3Ready && metaReady && identReady && secretsReady && dbReady { + now := time.Now().UTC() + updates["state"] = configstore.ManagedWarehouseStateReady + updates["status_message"] = "All components ready" + updates["ready_at"] = now + log.Info("All components ready, transitioning to ready.") + } + + if len(updates) > 0 { + if err := c.store.UpdateWarehouseState(w.OrgID, configstore.ManagedWarehouseStateProvisioning, updates); err != nil { + log.Warn("Failed to update warehouse state.", "error", err) + } + } +} + +func (c *Controller) reconcileDeleting(ctx context.Context, w *configstore.ManagedWarehouse) { + log := slog.With("org", w.OrgID, "phase", "deleting") + + log.Info("Deleting Duckling CR.") + if err := c.duckling.Delete(ctx, w.OrgID); err != nil { + // Only proceed if the CR is already gone (NotFound). For other errors + // (network, RBAC, etc.) we retry on the next reconcile pass to avoid + // marking as deleted while AWS resources still exist. + if !apierrors.IsNotFound(err) { + log.Warn("Failed to delete Duckling CR, will retry.", "error", err) + return + } + } + + if err := c.store.UpdateWarehouseState(w.OrgID, configstore.ManagedWarehouseStateDeleting, map[string]interface{}{ + "state": configstore.ManagedWarehouseStateDeleted, + "status_message": "Resources deleted", + }); err != nil { + log.Warn("Failed to update state to deleted.", "error", err) + } +} diff --git a/controlplane/provisioner/controller_stub.go b/controlplane/provisioner/controller_stub.go new file mode 100644 index 0000000..73290ff --- /dev/null +++ b/controlplane/provisioner/controller_stub.go @@ -0,0 +1,22 @@ +//go:build !kubernetes + +package provisioner + +import ( + "context" + "errors" + "time" + + "github.com/posthog/duckgres/controlplane/configstore" +) + +// Controller is a stub for non-Kubernetes builds. +type Controller struct{} + +// NewController returns an error on non-Kubernetes builds since it requires K8s API access. +func NewController(_ *configstore.ConfigStore, _ time.Duration) (*Controller, error) { + return nil, errors.New("provisioning controller requires kubernetes build tag") +} + +// Run is a no-op stub. 
+func (c *Controller) Run(_ context.Context) {} diff --git a/controlplane/provisioner/controller_test.go b/controlplane/provisioner/controller_test.go new file mode 100644 index 0000000..ca1c8bd --- /dev/null +++ b/controlplane/provisioner/controller_test.go @@ -0,0 +1,342 @@ +//go:build kubernetes + +package provisioner + +import ( + "context" + "testing" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + dynamicfake "k8s.io/client-go/dynamic/fake" + + "github.com/posthog/duckgres/controlplane/configstore" +) + +// fakeStore implements WarehouseStore for unit tests. +type fakeStore struct { + warehouses map[string]*configstore.ManagedWarehouse +} + +func newFakeStore() *fakeStore { + return &fakeStore{warehouses: make(map[string]*configstore.ManagedWarehouse)} +} + +func (s *fakeStore) ListWarehousesByStates(states []configstore.ManagedWarehouseProvisioningState) ([]configstore.ManagedWarehouse, error) { + var result []configstore.ManagedWarehouse + for _, w := range s.warehouses { + for _, st := range states { + if w.State == st { + result = append(result, *w) + break + } + } + } + return result, nil +} + +func (s *fakeStore) UpdateWarehouseState(orgID string, expectedState configstore.ManagedWarehouseProvisioningState, updates map[string]interface{}) error { + w, ok := s.warehouses[orgID] + if !ok { + return nil + } + if w.State != expectedState { + return nil + } + for k, v := range updates { + switch k { + case "state": + w.State = v.(configstore.ManagedWarehouseProvisioningState) + case "status_message": + w.StatusMessage = v.(string) + case "s3_state": + w.S3State = v.(configstore.ManagedWarehouseProvisioningState) + case "s3_bucket": + w.S3.Bucket = v.(string) + case "metadata_store_state": + w.MetadataStoreState = v.(configstore.ManagedWarehouseProvisioningState) + case "metadata_store_endpoint": + w.MetadataStore.Endpoint = v.(string) + case "identity_state": + w.IdentityState = v.(configstore.ManagedWarehouseProvisioningState) + case "worker_identity_namespace": + w.WorkerIdentity.Namespace = v.(string) + case "secrets_state": + w.SecretsState = v.(configstore.ManagedWarehouseProvisioningState) + case "warehouse_database_state": + w.WarehouseDatabaseState = v.(configstore.ManagedWarehouseProvisioningState) + case "ready_at": + t := v.(time.Time) + w.ReadyAt = &t + case "failed_at": + t := v.(time.Time) + w.FailedAt = &t + } + } + return nil +} + +// Compile-time check that fakeStore satisfies WarehouseStore. 
+var _ WarehouseStore = (*fakeStore)(nil) + +func newFakeDucklingClient() (*DucklingClient, *dynamicfake.FakeDynamicClient) { + scheme := runtime.NewScheme() + scheme.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "k8s.posthog.com", + Version: "v1alpha1", + Kind: "Duckling", + }, &unstructured.Unstructured{}) + scheme.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "k8s.posthog.com", + Version: "v1alpha1", + Kind: "DucklingList", + }, &unstructured.UnstructuredList{}) + + fakeClient := dynamicfake.NewSimpleDynamicClient(scheme) + return NewDucklingClientWithDynamic(fakeClient), fakeClient +} + +func TestReconcilePendingCreatesCR(t *testing.T) { + dc, fakeK8s := newFakeDucklingClient() + fs := newFakeStore() + fs.warehouses["org-a"] = &configstore.ManagedWarehouse{ + OrgID: "org-a", + State: configstore.ManagedWarehouseStatePending, + Image: "ghcr.io/posthog/duckgres:latest", + AuroraMinACU: 0.5, + AuroraMaxACU: 2, + } + + ctrl := NewControllerWithClient(fs, dc, time.Second) + ctx := context.Background() + + ctrl.reconcile(ctx) + + // Verify CR was created + cr, err := fakeK8s.Resource(ducklingGVR).Namespace(ducklingNamespace).Get(ctx, "org-a", metav1.GetOptions{}) + if err != nil { + t.Fatalf("expected CR to exist: %v", err) + } + + spec, ok := cr.Object["spec"].(map[string]interface{}) + if !ok { + t.Fatal("expected spec in CR") + } + if spec["orgID"] != "org-a" { + t.Fatalf("expected orgID org-a, got %v", spec["orgID"]) + } + if spec["image"] != "ghcr.io/posthog/duckgres:latest" { + t.Fatalf("expected image ghcr.io/posthog/duckgres:latest, got %v", spec["image"]) + } + + // Verify state transitioned to provisioning + if fs.warehouses["org-a"].State != configstore.ManagedWarehouseStateProvisioning { + t.Fatalf("expected provisioning state, got %q", fs.warehouses["org-a"].State) + } +} + +func TestReconcileProvisioningAllReady(t *testing.T) { + dc, fakeK8s := newFakeDucklingClient() + fs := newFakeStore() + fs.warehouses["org-b"] = &configstore.ManagedWarehouse{ + OrgID: "org-b", + State: configstore.ManagedWarehouseStateProvisioning, + CreatedAt: time.Now(), + } + + // Create a Duckling CR with all status fields populated + cr := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "k8s.posthog.com/v1alpha1", + "kind": "Duckling", + "metadata": map[string]interface{}{ + "name": "org-b", + "namespace": ducklingNamespace, + }, + "status": map[string]interface{}{ + "bucketName": "org-b-bucket", + "auroraEndpoint": "org-b.cluster.us-east-1.rds.amazonaws.com", + "auroraPort": int64(5432), + "region": "us-east-1", + "namespace": "duckling-org-b", + "serviceAccountName": "duckgres", + "iamRoleArn": "arn:aws:iam::123456789012:role/org-b", + "auroraPasswordSecret": "org-b-aurora-password", + "duckgresPasswordSecret": "org-b-duckgres-password", + "conditions": []interface{}{ + map[string]interface{}{ + "type": "Ready", + "status": "True", + }, + map[string]interface{}{ + "type": "Synced", + "status": "True", + }, + }, + }, + }, + } + + ctx := context.Background() + _, err := fakeK8s.Resource(ducklingGVR).Namespace(ducklingNamespace).Create(ctx, cr, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("failed to create test CR: %v", err) + } + + ctrl := NewControllerWithClient(fs, dc, time.Second) + ctrl.reconcile(ctx) + + // Verify state transitioned to ready + w := fs.warehouses["org-b"] + if w.State != configstore.ManagedWarehouseStateReady { + t.Fatalf("expected ready state, got %q", w.State) + } + if w.S3.Bucket != "org-b-bucket" { + t.Fatalf("expected 
bucket org-b-bucket, got %q", w.S3.Bucket) + } + if w.MetadataStore.Endpoint != "org-b.cluster.us-east-1.rds.amazonaws.com" { + t.Fatalf("expected aurora endpoint, got %q", w.MetadataStore.Endpoint) + } + if w.WorkerIdentity.Namespace != "duckling-org-b" { + t.Fatalf("expected namespace duckling-org-b, got %q", w.WorkerIdentity.Namespace) + } + if w.ReadyAt == nil { + t.Fatal("expected ready_at to be set") + } +} + +func TestReconcileDeletingDeletesCR(t *testing.T) { + dc, fakeK8s := newFakeDucklingClient() + fs := newFakeStore() + fs.warehouses["org-c"] = &configstore.ManagedWarehouse{ + OrgID: "org-c", + State: configstore.ManagedWarehouseStateDeleting, + } + ctx := context.Background() + + // Create a CR first + cr := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "k8s.posthog.com/v1alpha1", + "kind": "Duckling", + "metadata": map[string]interface{}{ + "name": "org-c", + "namespace": ducklingNamespace, + }, + }, + } + _, err := fakeK8s.Resource(ducklingGVR).Namespace(ducklingNamespace).Create(ctx, cr, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("failed to create test CR: %v", err) + } + + ctrl := NewControllerWithClient(fs, dc, time.Second) + ctrl.reconcile(ctx) + + // Verify CR is gone + _, err = fakeK8s.Resource(ducklingGVR).Namespace(ducklingNamespace).Get(ctx, "org-c", metav1.GetOptions{}) + if err == nil { + t.Fatal("expected CR to be deleted") + } + + // Verify state transitioned to deleted + if fs.warehouses["org-c"].State != configstore.ManagedWarehouseStateDeleted { + t.Fatalf("expected deleted state, got %q", fs.warehouses["org-c"].State) + } +} + +func TestReconcileDeletingRetriesOnNonNotFoundError(t *testing.T) { + // When the CR doesn't exist (NotFound), deleting should still succeed. + // When it's a different error, it should NOT transition to deleted. + dc, _ := newFakeDucklingClient() + fs := newFakeStore() + fs.warehouses["org-d"] = &configstore.ManagedWarehouse{ + OrgID: "org-d", + State: configstore.ManagedWarehouseStateDeleting, + } + ctx := context.Background() + + // Don't create a CR — the fake client will return NotFound on delete. 
+ ctrl := NewControllerWithClient(fs, dc, time.Second) + ctrl.reconcile(ctx) + + // NotFound on delete is fine — should still transition to deleted + if fs.warehouses["org-d"].State != configstore.ManagedWarehouseStateDeleted { + t.Fatalf("expected deleted state on NotFound, got %q", fs.warehouses["org-d"].State) + } +} + +func TestParseDucklingStatusSyncedFalse(t *testing.T) { + cr := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "status": map[string]interface{}{ + "conditions": []interface{}{ + map[string]interface{}{ + "type": "Synced", + "status": "False", + "message": "cannot create Aurora cluster: InvalidParameterException", + }, + }, + }, + }, + } + + status, err := parseDucklingStatus(cr) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if status.SyncedFalseMessage != "cannot create Aurora cluster: InvalidParameterException" { + t.Fatalf("expected synced false message, got %q", status.SyncedFalseMessage) + } + if status.ReadyCondition { + t.Fatal("expected Ready to be false") + } +} + +func TestParseDucklingStatusEmpty(t *testing.T) { + cr := &unstructured.Unstructured{ + Object: map[string]interface{}{}, + } + + status, err := parseDucklingStatus(cr) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if status.BucketName != "" || status.AuroraEndpoint != "" || status.Namespace != "" { + t.Fatal("expected empty status for CR without status field") + } +} + +func TestFakeStoreUpdateWarehouseState(t *testing.T) { + fs := newFakeStore() + fs.warehouses["org-x"] = &configstore.ManagedWarehouse{ + OrgID: "org-x", + State: configstore.ManagedWarehouseStatePending, + } + + // CAS update should succeed + err := fs.UpdateWarehouseState("org-x", configstore.ManagedWarehouseStatePending, map[string]interface{}{ + "state": configstore.ManagedWarehouseStateProvisioning, + "status_message": "transitioning", + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if fs.warehouses["org-x"].State != configstore.ManagedWarehouseStateProvisioning { + t.Fatalf("expected provisioning state, got %q", fs.warehouses["org-x"].State) + } + + // CAS update with wrong expected state should be no-op + err = fs.UpdateWarehouseState("org-x", configstore.ManagedWarehouseStatePending, map[string]interface{}{ + "state": configstore.ManagedWarehouseStateFailed, + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if fs.warehouses["org-x"].State != configstore.ManagedWarehouseStateProvisioning { + t.Fatalf("expected state to remain provisioning, got %q", fs.warehouses["org-x"].State) + } +} diff --git a/controlplane/provisioner/k8s_client.go b/controlplane/provisioner/k8s_client.go new file mode 100644 index 0000000..c92058f --- /dev/null +++ b/controlplane/provisioner/k8s_client.go @@ -0,0 +1,173 @@ +//go:build kubernetes + +package provisioner + +import ( + "context" + "fmt" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/dynamic" + "k8s.io/client-go/rest" +) + +var ducklingGVR = schema.GroupVersionResource{ + Group: "k8s.posthog.com", + Version: "v1alpha1", + Resource: "ducklings", +} + +const ducklingNamespace = "crossplane-system" + +// DucklingStatus holds the parsed status from a Duckling CR. 
+type DucklingStatus struct { + BucketName string + AuroraEndpoint string + AuroraPort int + Region string + Namespace string + ServiceAccountName string + IAMRoleARN string + AuroraPasswordSecret string + DuckgresPasswordSecret string + DuckgresEndpoint string + DuckgresPort int + DuckgresDatabase string + DuckgresUsername string + ReadyCondition bool + SyncedFalseMessage string +} + +// DucklingClient wraps a Kubernetes dynamic client for Duckling CR operations. +type DucklingClient struct { + client dynamic.Interface +} + +// NewDucklingClient creates a DucklingClient using in-cluster config. +func NewDucklingClient() (*DucklingClient, error) { + config, err := rest.InClusterConfig() + if err != nil { + return nil, fmt.Errorf("in-cluster config: %w", err) + } + dc, err := dynamic.NewForConfig(config) + if err != nil { + return nil, fmt.Errorf("dynamic client: %w", err) + } + return &DucklingClient{client: dc}, nil +} + +// NewDucklingClientWithDynamic creates a DucklingClient with a provided dynamic.Interface (for testing). +func NewDucklingClientWithDynamic(client dynamic.Interface) *DucklingClient { + return &DucklingClient{client: client} +} + +// Create creates a Duckling CR for the given org. +func (d *DucklingClient) Create(ctx context.Context, orgID, image string, minACU, maxACU float64) error { + cr := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "k8s.posthog.com/v1alpha1", + "kind": "Duckling", + "metadata": map[string]interface{}{ + "name": orgID, + "namespace": ducklingNamespace, + }, + "spec": map[string]interface{}{ + "orgID": orgID, + "image": image, + "aurora": map[string]interface{}{ + "minACU": minACU, + "maxACU": maxACU, + }, + }, + }, + } + + _, err := d.client.Resource(ducklingGVR).Namespace(ducklingNamespace).Create(ctx, cr, metav1.CreateOptions{}) + if err != nil { + return fmt.Errorf("create duckling CR %q: %w", orgID, err) + } + return nil +} + +// Get fetches the Duckling CR and parses its status. +func (d *DucklingClient) Get(ctx context.Context, orgID string) (*DucklingStatus, error) { + cr, err := d.client.Resource(ducklingGVR).Namespace(ducklingNamespace).Get(ctx, orgID, metav1.GetOptions{}) + if err != nil { + return nil, fmt.Errorf("get duckling CR %q: %w", orgID, err) + } + return parseDucklingStatus(cr) +} + +// Delete removes the Duckling CR for the given org. 
+func (d *DucklingClient) Delete(ctx context.Context, orgID string) error { + err := d.client.Resource(ducklingGVR).Namespace(ducklingNamespace).Delete(ctx, orgID, metav1.DeleteOptions{}) + if err != nil { + return fmt.Errorf("delete duckling CR %q: %w", orgID, err) + } + return nil +} + +func parseDucklingStatus(cr *unstructured.Unstructured) (*DucklingStatus, error) { + status, ok := cr.Object["status"].(map[string]interface{}) + if !ok { + return &DucklingStatus{}, nil + } + + ds := &DucklingStatus{ + BucketName: getNestedString(status, "bucketName"), + AuroraEndpoint: getNestedString(status, "auroraEndpoint"), + AuroraPort: getNestedInt(status, "auroraPort"), + Region: getNestedString(status, "region"), + Namespace: getNestedString(status, "namespace"), + ServiceAccountName: getNestedString(status, "serviceAccountName"), + IAMRoleARN: getNestedString(status, "iamRoleArn"), + AuroraPasswordSecret: getNestedString(status, "auroraPasswordSecret"), + DuckgresPasswordSecret: getNestedString(status, "duckgresPasswordSecret"), + DuckgresEndpoint: getNestedString(status, "duckgresEndpoint"), + DuckgresPort: getNestedInt(status, "duckgresPort"), + DuckgresDatabase: getNestedString(status, "duckgresDatabase"), + DuckgresUsername: getNestedString(status, "duckgresUsername"), + } + + // Parse conditions + conditions, _ := status["conditions"].([]interface{}) + for _, cond := range conditions { + condMap, ok := cond.(map[string]interface{}) + if !ok { + continue + } + condType := getNestedString(condMap, "type") + condStatus := getNestedString(condMap, "status") + + switch condType { + case "Ready": + ds.ReadyCondition = condStatus == "True" + case "Synced": + if condStatus == "False" { + ds.SyncedFalseMessage = getNestedString(condMap, "message") + } + } + } + + return ds, nil +} + +func getNestedString(obj map[string]interface{}, key string) string { + v, _ := obj[key].(string) + return v +} + +func getNestedInt(obj map[string]interface{}, key string) int { + switch v := obj[key].(type) { + case int64: + return int(v) + case float64: + return int(v) + case int: + return v + default: + return 0 + } +} diff --git a/controlplane/provisioning/api.go b/controlplane/provisioning/api.go new file mode 100644 index 0000000..608bd54 --- /dev/null +++ b/controlplane/provisioning/api.go @@ -0,0 +1,106 @@ +package provisioning + +import ( + "errors" + "net/http" + + "github.com/gin-gonic/gin" + "github.com/posthog/duckgres/controlplane/configstore" + "gorm.io/gorm" +) + +// Store defines the config store operations needed by the provisioning API. +type Store interface { + GetManagedWarehouse(orgID string) (*configstore.ManagedWarehouse, error) + CreatePendingWarehouse(orgID string, warehouse *configstore.ManagedWarehouse) error + SetWarehouseDeleting(orgID string, expectedState configstore.ManagedWarehouseProvisioningState) error +} + +// RegisterAPI registers provisioning endpoints on the given router group. 
+func RegisterAPI(r *gin.RouterGroup, store Store) { + h := &handler{store: store} + r.POST("/orgs/:id/provision", h.provisionWarehouse) + r.POST("/orgs/:id/deprovision", h.deprovisionWarehouse) + r.GET("/orgs/:id/warehouse", h.getWarehouseStatus) +} + +type handler struct { + store Store +} + +type provisionRequest struct { + Image string `json:"image"` + MetadataStore *provisionMetadataReq `json:"metadata_store,omitempty"` +} + +type provisionMetadataReq struct { + Type string `json:"type"` + Aurora *provisionAuroraReq `json:"aurora,omitempty"` +} + +type provisionAuroraReq struct { + MinACU float64 `json:"min_acu"` + MaxACU float64 `json:"max_acu"` +} + +func (h *handler) provisionWarehouse(c *gin.Context) { + orgID := c.Param("id") + + var req provisionRequest + if err := c.ShouldBindJSON(&req); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + warehouse := &configstore.ManagedWarehouse{ + Image: req.Image, + } + if req.MetadataStore != nil && req.MetadataStore.Aurora != nil { + warehouse.AuroraMinACU = req.MetadataStore.Aurora.MinACU + warehouse.AuroraMaxACU = req.MetadataStore.Aurora.MaxACU + } + + if err := h.store.CreatePendingWarehouse(orgID, warehouse); err != nil { + c.JSON(http.StatusConflict, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusAccepted, gin.H{"status": "provisioning started", "org": orgID}) +} + +func (h *handler) deprovisionWarehouse(c *gin.Context) { + orgID := c.Param("id") + + // Try CAS from ready -> deleting, then from failed -> deleting. + // This avoids a read-then-write TOCTOU race. + err := h.store.SetWarehouseDeleting(orgID, configstore.ManagedWarehouseStateReady) + if err != nil { + err = h.store.SetWarehouseDeleting(orgID, configstore.ManagedWarehouseStateFailed) + } + if err != nil { + if errors.Is(err, gorm.ErrRecordNotFound) { + c.JSON(http.StatusNotFound, gin.H{"error": "warehouse not found"}) + return + } + c.JSON(http.StatusConflict, gin.H{"error": "warehouse must be in ready or failed state to deprovision"}) + return + } + + c.JSON(http.StatusAccepted, gin.H{"status": "deprovisioning started", "org": orgID}) +} + +func (h *handler) getWarehouseStatus(c *gin.Context) { + orgID := c.Param("id") + + warehouse, err := h.store.GetManagedWarehouse(orgID) + if err != nil { + if errors.Is(err, gorm.ErrRecordNotFound) { + c.JSON(http.StatusNotFound, gin.H{"error": "warehouse not found"}) + return + } + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, warehouse) +} diff --git a/controlplane/provisioning/api_test.go b/controlplane/provisioning/api_test.go new file mode 100644 index 0000000..9e5b469 --- /dev/null +++ b/controlplane/provisioning/api_test.go @@ -0,0 +1,284 @@ +package provisioning + +import ( + "bytes" + "encoding/json" + "errors" + "fmt" + "net/http" + "net/http/httptest" + "testing" + + "github.com/gin-gonic/gin" + "github.com/posthog/duckgres/controlplane/configstore" + "gorm.io/gorm" +) + +type fakeStore struct { + orgs map[string]*configstore.Org + warehouses map[string]*configstore.ManagedWarehouse +} + +func newFakeStore() *fakeStore { + return &fakeStore{ + orgs: make(map[string]*configstore.Org), + warehouses: make(map[string]*configstore.ManagedWarehouse), + } +} + +func (s *fakeStore) GetManagedWarehouse(orgID string) (*configstore.ManagedWarehouse, error) { + w, ok := s.warehouses[orgID] + if !ok { + return nil, gorm.ErrRecordNotFound + } + clone := *w + return &clone, nil +} + +func (s *fakeStore) 
CreatePendingWarehouse(orgID string, warehouse *configstore.ManagedWarehouse) error { + // Auto-create org if needed (mirrors production behavior) + if _, ok := s.orgs[orgID]; !ok { + s.orgs[orgID] = &configstore.Org{Name: orgID} + } + existing, ok := s.warehouses[orgID] + if ok && existing.State != configstore.ManagedWarehouseStateFailed && existing.State != configstore.ManagedWarehouseStateDeleted { + return errors.New("warehouse already exists in non-terminal state") + } + clone := *warehouse + clone.OrgID = orgID + clone.State = configstore.ManagedWarehouseStatePending + clone.WarehouseDatabaseState = configstore.ManagedWarehouseStatePending + clone.MetadataStoreState = configstore.ManagedWarehouseStatePending + clone.S3State = configstore.ManagedWarehouseStatePending + clone.IdentityState = configstore.ManagedWarehouseStatePending + clone.SecretsState = configstore.ManagedWarehouseStatePending + s.warehouses[orgID] = &clone + return nil +} + +func (s *fakeStore) SetWarehouseDeleting(orgID string, expectedState configstore.ManagedWarehouseProvisioningState) error { + w, ok := s.warehouses[orgID] + if !ok { + return gorm.ErrRecordNotFound + } + if w.State != expectedState { + return fmt.Errorf("warehouse %q not in expected state %q", orgID, expectedState) + } + w.State = configstore.ManagedWarehouseStateDeleting + return nil +} + +func newTestRouter(store Store) *gin.Engine { + gin.SetMode(gin.TestMode) + r := gin.New() + RegisterAPI(r.Group("/api/v1"), store) + return r +} + +func TestProvisionCreatesWarehouse(t *testing.T) { + store := newFakeStore() + store.orgs["analytics"] = &configstore.Org{Name: "analytics"} + router := newTestRouter(store) + + body := []byte(`{ + "image": "ghcr.io/posthog/duckgres:latest", + "metadata_store": { + "type": "aurora", + "aurora": {"min_acu": 0.5, "max_acu": 2} + } + }`) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/orgs/analytics/provision", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + router.ServeHTTP(rec, req) + + if rec.Code != http.StatusAccepted { + t.Fatalf("status = %d, want %d: %s", rec.Code, http.StatusAccepted, rec.Body.String()) + } + + w := store.warehouses["analytics"] + if w == nil { + t.Fatal("expected warehouse to be created") + } + if w.State != configstore.ManagedWarehouseStatePending { + t.Fatalf("expected state pending, got %q", w.State) + } + if w.Image != "ghcr.io/posthog/duckgres:latest" { + t.Fatalf("expected image, got %q", w.Image) + } + if w.AuroraMinACU != 0.5 { + t.Fatalf("expected min_acu 0.5, got %f", w.AuroraMinACU) + } + if w.AuroraMaxACU != 2 { + t.Fatalf("expected max_acu 2, got %f", w.AuroraMaxACU) + } +} + +func TestProvisionAutoCreatesOrg(t *testing.T) { + store := newFakeStore() + router := newTestRouter(store) + + body := []byte(`{"image": "ghcr.io/posthog/duckgres:latest"}`) + req := httptest.NewRequest(http.MethodPost, "/api/v1/orgs/new-org/provision", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + router.ServeHTTP(rec, req) + + if rec.Code != http.StatusAccepted { + t.Fatalf("status = %d, want %d: %s", rec.Code, http.StatusAccepted, rec.Body.String()) + } + if _, ok := store.orgs["new-org"]; !ok { + t.Fatal("expected org to be auto-created") + } + if store.warehouses["new-org"] == nil { + t.Fatal("expected warehouse to be created") + } +} + +func TestProvisionRejectsExistingNonTerminal(t *testing.T) { + store := newFakeStore() + store.orgs["analytics"] = 
&configstore.Org{Name: "analytics"} + store.warehouses["analytics"] = &configstore.ManagedWarehouse{ + OrgID: "analytics", + State: configstore.ManagedWarehouseStateProvisioning, + } + router := newTestRouter(store) + + body := []byte(`{"image": "ghcr.io/posthog/duckgres:latest"}`) + req := httptest.NewRequest(http.MethodPost, "/api/v1/orgs/analytics/provision", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + router.ServeHTTP(rec, req) + + if rec.Code != http.StatusConflict { + t.Fatalf("status = %d, want %d: %s", rec.Code, http.StatusConflict, rec.Body.String()) + } +} + +func TestProvisionAllowsRetryAfterFailure(t *testing.T) { + store := newFakeStore() + store.orgs["analytics"] = &configstore.Org{Name: "analytics"} + store.warehouses["analytics"] = &configstore.ManagedWarehouse{ + OrgID: "analytics", + State: configstore.ManagedWarehouseStateFailed, + } + router := newTestRouter(store) + + body := []byte(`{"image": "ghcr.io/posthog/duckgres:v2"}`) + req := httptest.NewRequest(http.MethodPost, "/api/v1/orgs/analytics/provision", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + router.ServeHTTP(rec, req) + + if rec.Code != http.StatusAccepted { + t.Fatalf("status = %d, want %d: %s", rec.Code, http.StatusAccepted, rec.Body.String()) + } + if store.warehouses["analytics"].Image != "ghcr.io/posthog/duckgres:v2" { + t.Fatalf("expected new image, got %q", store.warehouses["analytics"].Image) + } +} + +func TestDeprovisionReadyWarehouse(t *testing.T) { + store := newFakeStore() + store.orgs["analytics"] = &configstore.Org{Name: "analytics"} + store.warehouses["analytics"] = &configstore.ManagedWarehouse{ + OrgID: "analytics", + State: configstore.ManagedWarehouseStateReady, + } + router := newTestRouter(store) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/orgs/analytics/deprovision", nil) + rec := httptest.NewRecorder() + router.ServeHTTP(rec, req) + + if rec.Code != http.StatusAccepted { + t.Fatalf("status = %d, want %d: %s", rec.Code, http.StatusAccepted, rec.Body.String()) + } + if store.warehouses["analytics"].State != configstore.ManagedWarehouseStateDeleting { + t.Fatalf("expected deleting state, got %q", store.warehouses["analytics"].State) + } +} + +func TestDeprovisionFailedWarehouse(t *testing.T) { + store := newFakeStore() + store.orgs["analytics"] = &configstore.Org{Name: "analytics"} + store.warehouses["analytics"] = &configstore.ManagedWarehouse{ + OrgID: "analytics", + State: configstore.ManagedWarehouseStateFailed, + } + router := newTestRouter(store) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/orgs/analytics/deprovision", nil) + rec := httptest.NewRecorder() + router.ServeHTTP(rec, req) + + if rec.Code != http.StatusAccepted { + t.Fatalf("status = %d, want %d: %s", rec.Code, http.StatusAccepted, rec.Body.String()) + } + if store.warehouses["analytics"].State != configstore.ManagedWarehouseStateDeleting { + t.Fatalf("expected deleting state, got %q", store.warehouses["analytics"].State) + } +} + +func TestDeprovisionRejectsProvisioningWarehouse(t *testing.T) { + store := newFakeStore() + store.orgs["analytics"] = &configstore.Org{Name: "analytics"} + store.warehouses["analytics"] = &configstore.ManagedWarehouse{ + OrgID: "analytics", + State: configstore.ManagedWarehouseStateProvisioning, + } + router := newTestRouter(store) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/orgs/analytics/deprovision", nil) + rec := 
httptest.NewRecorder() + router.ServeHTTP(rec, req) + + if rec.Code != http.StatusConflict { + t.Fatalf("status = %d, want %d: %s", rec.Code, http.StatusConflict, rec.Body.String()) + } +} + +func TestGetWarehouseStatus(t *testing.T) { + store := newFakeStore() + store.orgs["analytics"] = &configstore.Org{Name: "analytics"} + store.warehouses["analytics"] = &configstore.ManagedWarehouse{ + OrgID: "analytics", + State: configstore.ManagedWarehouseStateProvisioning, + S3State: configstore.ManagedWarehouseStateReady, + MetadataStoreState: configstore.ManagedWarehouseStatePending, + } + router := newTestRouter(store) + + req := httptest.NewRequest(http.MethodGet, "/api/v1/orgs/analytics/warehouse", nil) + rec := httptest.NewRecorder() + router.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Fatalf("status = %d, want %d: %s", rec.Code, http.StatusOK, rec.Body.String()) + } + + var w configstore.ManagedWarehouse + if err := json.Unmarshal(rec.Body.Bytes(), &w); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if w.State != configstore.ManagedWarehouseStateProvisioning { + t.Fatalf("expected provisioning state, got %q", w.State) + } + if w.S3State != configstore.ManagedWarehouseStateReady { + t.Fatalf("expected s3 ready, got %q", w.S3State) + } +} + +func TestGetWarehouseNotFound(t *testing.T) { + store := newFakeStore() + router := newTestRouter(store) + + req := httptest.NewRequest(http.MethodGet, "/api/v1/orgs/unknown/warehouse", nil) + rec := httptest.NewRecorder() + router.ServeHTTP(rec, req) + + if rec.Code != http.StatusNotFound { + t.Fatalf("status = %d, want %d: %s", rec.Code, http.StatusNotFound, rec.Body.String()) + } +} diff --git a/controlplane/provisioning/store.go b/controlplane/provisioning/store.go new file mode 100644 index 0000000..cedb213 --- /dev/null +++ b/controlplane/provisioning/store.go @@ -0,0 +1,82 @@ +package provisioning + +import ( + "errors" + "fmt" + + "github.com/posthog/duckgres/controlplane/configstore" + "gorm.io/gorm" +) + +// gormStore implements Store using a ConfigStore's GORM DB. +type gormStore struct { + cs *configstore.ConfigStore +} + +// NewGormStore creates a Store backed by the given ConfigStore. 
+func NewGormStore(cs *configstore.ConfigStore) Store { + return &gormStore{cs: cs} +} + +func (s *gormStore) GetManagedWarehouse(orgID string) (*configstore.ManagedWarehouse, error) { + var warehouse configstore.ManagedWarehouse + if err := s.cs.DB().First(&warehouse, "org_id = ?", orgID).Error; err != nil { + return nil, err + } + return &warehouse, nil +} + +func (s *gormStore) CreatePendingWarehouse(orgID string, warehouse *configstore.ManagedWarehouse) error { + return s.cs.DB().Transaction(func(tx *gorm.DB) error { + // Auto-create org if it doesn't exist (PostHog calls provision, duckgres creates everything) + org := configstore.Org{Name: orgID} + if err := tx.Where("name = ?", orgID).FirstOrCreate(&org).Error; err != nil { + return err + } + + // Check for existing warehouse in non-terminal state + var existing configstore.ManagedWarehouse + err := tx.First(&existing, "org_id = ?", orgID).Error + if err == nil { + if existing.State != configstore.ManagedWarehouseStateFailed && + existing.State != configstore.ManagedWarehouseStateDeleted { + return errors.New("warehouse already exists in non-terminal state") + } + if err := tx.Delete(&existing).Error; err != nil { + return err + } + } else if !errors.Is(err, gorm.ErrRecordNotFound) { + return err + } + + warehouse.OrgID = orgID + warehouse.State = configstore.ManagedWarehouseStatePending + warehouse.WarehouseDatabaseState = configstore.ManagedWarehouseStatePending + warehouse.MetadataStoreState = configstore.ManagedWarehouseStatePending + warehouse.S3State = configstore.ManagedWarehouseStatePending + warehouse.IdentityState = configstore.ManagedWarehouseStatePending + warehouse.SecretsState = configstore.ManagedWarehouseStatePending + return tx.Create(warehouse).Error + }) +} + +// SetWarehouseDeleting atomically transitions a warehouse from expectedState to deleting. +// Returns gorm.ErrRecordNotFound if no warehouse exists, or an error if the CAS fails. +func (s *gormStore) SetWarehouseDeleting(orgID string, expectedState configstore.ManagedWarehouseProvisioningState) error { + result := s.cs.DB().Model(&configstore.ManagedWarehouse{}). + Where("org_id = ? AND state = ?", orgID, expectedState). + Update("state", configstore.ManagedWarehouseStateDeleting) + if result.Error != nil { + return result.Error + } + if result.RowsAffected == 0 { + // Distinguish "not found" from "wrong state" + var count int64 + s.cs.DB().Model(&configstore.ManagedWarehouse{}).Where("org_id = ?", orgID).Count(&count) + if count == 0 { + return gorm.ErrRecordNotFound + } + return fmt.Errorf("warehouse %q not in expected state %q", orgID, expectedState) + } + return nil +} diff --git a/justfile b/justfile index e293fca..ffb7ff0 100644 --- a/justfile +++ b/justfile @@ -80,7 +80,7 @@ run-multitenant-local: multitenant-config-store-up build-k8s-image deploy-multit @echo "Multi-tenant control plane ready." @echo "Default login: postgres / postgres" @echo "Fetch admin token with: kubectl -n duckgres logs deployment/duckgres-control-plane | rg 'Generated admin API token'" - @echo "Run 'just multitenant-port-forward-pg' in one terminal and 'just multitenant-port-forward-admin' in another." + @echo "Run 'just multitenant-port-forward-pg', 'just multitenant-port-forward-admin', and 'just multitenant-port-forward-provisioning' in separate terminals." 
# Port-forward PostgreSQL traffic from the local control plane [group('dev')] @@ -92,6 +92,11 @@ multitenant-port-forward-pg: multitenant-port-forward-admin: kubectl -n duckgres port-forward deployment/duckgres-control-plane 9090:9090 +# Port-forward the provisioning API from the local control plane +[group('dev')] +multitenant-port-forward-provisioning: + kubectl -n duckgres port-forward deployment/duckgres-control-plane 9091:9091 + # Run with DuckLake config [group('dev')] run-ducklake: build diff --git a/main.go b/main.go index b9aaffa..50d326b 100644 --- a/main.go +++ b/main.go @@ -244,6 +244,8 @@ func main() { configStore := flag.String("config-store", "", "PostgreSQL connection string for config store (env: DUCKGRES_CONFIG_STORE)") configPollInterval := flag.String("config-poll-interval", "", "How often to poll config store for changes (default: 30s) (env: DUCKGRES_CONFIG_POLL_INTERVAL)") adminToken := flag.String("admin-token", "", "Bearer token for admin API authentication (env: DUCKGRES_ADMIN_TOKEN)") + provisioningToken := flag.String("provisioning-token", "", "Bearer token for provisioning API authentication; falls back to admin-token if empty (env: DUCKGRES_PROVISIONING_TOKEN)") + provisioningPort := flag.Int("provisioning-port", 0, "Listen port for provisioning API server (default: 9091) (env: DUCKGRES_PROVISIONING_PORT)") // ACME/Let's Encrypt flags acmeDomain := flag.String("acme-domain", "", "Domain for ACME/Let's Encrypt certificate (env: DUCKGRES_ACME_DOMAIN)") @@ -302,6 +304,8 @@ func main() { fmt.Fprintf(os.Stderr, " DUCKGRES_ADMIN_TOKEN Bearer token for admin API authentication\n") fmt.Fprintf(os.Stderr, " DUCKGRES_K8S_MAX_WORKERS Max K8s workers in the shared pool\n") fmt.Fprintf(os.Stderr, " DUCKGRES_K8S_SHARED_WARM_TARGET Neutral shared warm-worker target for K8s multi-tenant mode\n") + fmt.Fprintf(os.Stderr, " DUCKGRES_PROVISIONING_TOKEN Bearer token for provisioning API (falls back to admin token)\n") + fmt.Fprintf(os.Stderr, " DUCKGRES_PROVISIONING_PORT Listen port for provisioning API (default: 9091)\n") fmt.Fprintf(os.Stderr, " DUCKGRES_LOG_LEVEL Log level: debug, info, warn, error (default: info)\n") fmt.Fprintf(os.Stderr, "\nPrecedence: CLI flags > environment variables > config file > defaults\n") } @@ -403,6 +407,8 @@ func main() { ConfigStoreConn: *configStore, ConfigPollInterval: *configPollInterval, AdminToken: *adminToken, + ProvisioningToken: *provisioningToken, + ProvisioningPort: *provisioningPort, WorkerBackend: *workerBackend, K8sWorkerImage: *k8sWorkerImage, K8sWorkerNamespace: *k8sWorkerNamespace, @@ -547,6 +553,8 @@ func main() { ConfigStoreConn: resolved.ConfigStoreConn, ConfigPollInterval: resolved.ConfigPollInterval, AdminToken: resolved.AdminToken, + ProvisioningToken: resolved.ProvisioningToken, + ProvisioningPort: resolved.ProvisioningPort, K8s: controlplane.K8sConfig{ WorkerImage: resolved.K8sWorkerImage, WorkerNamespace: resolved.K8sWorkerNamespace, From 5bd462dd928def8af812662b293280fc4c9420a1 Mon Sep 17 00:00:00 2001 From: eric Date: Tue, 24 Mar 2026 08:59:34 -0700 Subject: [PATCH 03/17] Merge admin and provisioning APIs onto single :8080 port Fold the separate admin (:9090) and provisioning (:9091) HTTP servers into a single unified API server on :8080. Both share the same Gin engine, auth middleware, and /api/v1 route group. Remove ProvisioningPort, ProvisioningToken config fields, CLI flags, and env vars (DUCKGRES_PROVISIONING_TOKEN, DUCKGRES_PROVISIONING_PORT). 
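For illustration, a minimal client against the merged server might look like
the sketch below (host, org name, and env var are placeholders, not values
taken from this repo); at this point in the series the routes still
authenticate with the admin bearer token:

    // Sketch only: admin and provisioning routes are now served from the same
    // listener on :8080 behind the same auth middleware.
    package main

    import (
        "fmt"
        "net/http"
        "os"
    )

    func main() {
        req, err := http.NewRequest(http.MethodGet,
            "http://localhost:8080/api/v1/orgs/analytics/warehouse", nil)
        if err != nil {
            panic(err)
        }
        req.Header.Set("Authorization", "Bearer "+os.Getenv("DUCKGRES_ADMIN_TOKEN"))
        resp, err := http.DefaultClient.Do(req)
        if err != nil {
            panic(err)
        }
        defer resp.Body.Close()
        // 200 for an existing warehouse, 404 otherwise (assuming the token is accepted).
        fmt.Println(resp.Status)
    }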
Co-Authored-By: Claude Opus 4.6 (1M context) --- config_resolution.go | 22 ------------- controlplane/control.go | 26 +++------------ controlplane/multitenant.go | 55 ++++++++------------------------ controlplane/multitenant_stub.go | 2 +- justfile | 13 +++----- main.go | 8 ----- 6 files changed, 24 insertions(+), 102 deletions(-) diff --git a/config_resolution.go b/config_resolution.go index a3f3177..a0a7b51 100644 --- a/config_resolution.go +++ b/config_resolution.go @@ -41,8 +41,6 @@ type configCLIInputs struct { ConfigStoreConn string ConfigPollInterval string AdminToken string - ProvisioningToken string - ProvisioningPort int WorkerBackend string K8sWorkerImage string K8sWorkerNamespace string @@ -80,8 +78,6 @@ type resolvedConfig struct { ConfigStoreConn string ConfigPollInterval time.Duration AdminToken string - ProvisioningToken string - ProvisioningPort int } func defaultServerConfig() server.Config { @@ -139,8 +135,6 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun var configStoreConn string var configPollInterval time.Duration var adminToken string - var provisioningToken string - var provisioningPort int if fileCfg != nil { if fileCfg.Host != "" { @@ -596,14 +590,6 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun if v := getenv("DUCKGRES_ADMIN_TOKEN"); v != "" { adminToken = v } - if v := getenv("DUCKGRES_PROVISIONING_TOKEN"); v != "" { - provisioningToken = v - } - if v := getenv("DUCKGRES_PROVISIONING_PORT"); v != "" { - if n, err := strconv.Atoi(v); err == nil { - provisioningPort = n - } - } if v := getenv("DUCKGRES_WORKER_BACKEND"); v != "" { workerBackend = v } @@ -820,12 +806,6 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun if cli.Set["admin-token"] { adminToken = cli.AdminToken } - if cli.Set["provisioning-token"] { - provisioningToken = cli.ProvisioningToken - } - if cli.Set["provisioning-port"] { - provisioningPort = cli.ProvisioningPort - } if cli.Set["worker-backend"] { workerBackend = cli.WorkerBackend } @@ -935,7 +915,5 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun ConfigStoreConn: configStoreConn, ConfigPollInterval: configPollInterval, AdminToken: adminToken, - ProvisioningToken: provisioningToken, - ProvisioningPort: provisioningPort, } } diff --git a/controlplane/control.go b/controlplane/control.go index ebdf070..02aeb18 100644 --- a/controlplane/control.go +++ b/controlplane/control.go @@ -62,13 +62,6 @@ type ControlPlaneConfig struct { // When empty, a random token is generated and logged at startup. AdminToken string - // ProvisioningToken is the bearer token required for provisioning API requests. - // When empty, falls back to AdminToken. - ProvisioningToken string - - // ProvisioningPort is the listen port for the provisioning API server. - // Default: 9091. - ProvisioningPort int } type ProcessConfig struct { @@ -117,9 +110,8 @@ type ControlPlane struct { acmeDNSManager *server.ACMEDNSManager // ACME manager for DNS-01 (nil when not using DNS challenges) // Multi-tenant fields (non-nil in remote multitenant mode) - orgRouter OrgRouterInterface - configStore ConfigStoreInterface - provisioningServer *http.Server // provisioning API server (shut down on graceful exit) + orgRouter OrgRouterInterface + configStore ConfigStoreInterface } // ConfigStoreInterface abstracts the config store for the control plane. 
@@ -327,21 +319,20 @@ func RunControlPlane(cfg ControlPlaneConfig) { // Multi-tenant mode: config store + per-org pools (K8s remote backend only) if cfg.WorkerBackend == "remote" { - store, adapter, servers, err := SetupMultiTenant(cfg, srv, memBudget, k8sMaxWorkers) + store, adapter, apiServer, err := SetupMultiTenant(cfg, srv, memBudget, k8sMaxWorkers) if err != nil { slog.Error("Failed to set up multi-tenant config store.", "error", err) os.Exit(1) } cp.configStore = store cp.orgRouter = adapter - // Replace the simple metrics server with the Gin admin server + // Replace the simple metrics server with the unified API server if cfg.MetricsServer != nil { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) _ = cfg.MetricsServer.Shutdown(ctx) cancel() } - cfg.MetricsServer = servers[0] // admin server - cp.provisioningServer = servers[1] + cfg.MetricsServer = apiServer cp.cfg = cfg _ = store // keep linter happy } else { @@ -964,13 +955,6 @@ func (cp *ControlPlane) handleUpgrade() { } cancel() } - if cp.provisioningServer != nil { - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - if err := cp.provisioningServer.Shutdown(ctx); err != nil { - slog.Warn("Provisioning server shutdown failed.", "error", err) - } - cancel() - } // Stop ACME managers so the new CP can bind port 80 (HTTP-01) or // manage DNS records. Nil out after close so drainAfterUpgrade diff --git a/controlplane/multitenant.go b/controlplane/multitenant.go index fab6812..450a266 100644 --- a/controlplane/multitenant.go +++ b/controlplane/multitenant.go @@ -108,15 +108,15 @@ func (a *orgRouterAdapter) AllSessionStatuses() []admin.SessionStatus { var _ OrgRouterInterface = (*orgRouterAdapter)(nil) var _ admin.OrgStackInfo = (*orgRouterAdapter)(nil) -// SetupMultiTenant initializes the config store, org router, admin server, and provisioning server. +// SetupMultiTenant initializes the config store, org router, and API server. // Called from RunControlPlane when --config-store is set with remote backend. -// Returns the admin server and provisioning server for graceful shutdown. +// Returns the API server for graceful shutdown. 
func SetupMultiTenant( cfg ControlPlaneConfig, srv *server.Server, memBudget uint64, maxWorkers int, -) (ConfigStoreInterface, OrgRouterInterface, []*http.Server, error) { +) (ConfigStoreInterface, OrgRouterInterface, *http.Server, error) { pollInterval := cfg.ConfigPollInterval if pollInterval <= 0 { pollInterval = 30 * time.Second @@ -127,11 +127,6 @@ func SetupMultiTenant( return nil, nil, nil, err } - provisioningPort := cfg.ProvisioningPort - if provisioningPort == 0 { - provisioningPort = 9091 - } - baseCfg := K8sWorkerPoolConfig{ Namespace: cfg.K8s.WorkerNamespace, CPID: cfg.K8s.ControlPlaneID, @@ -180,12 +175,12 @@ func SetupMultiTenant( slog.Info("Generated admin API token (pass via --admin-token or DUCKGRES_ADMIN_TOKEN to set explicitly).", "token", adminToken) } - // Set up Gin admin server (replaces the simple metrics server) + // Set up unified API server (admin + provisioning on single port) gin.SetMode(gin.ReleaseMode) engine := gin.New() engine.Use(gin.Recovery()) - // Existing endpoints (unauthenticated) + // Unauthenticated endpoints engine.GET("/metrics", gin.WrapH(promhttp.Handler())) engine.GET("/health", func(c *gin.Context) { c.String(http.StatusOK, "ok") @@ -195,44 +190,22 @@ func SetupMultiTenant( api := engine.Group("/api/v1", admin.APIAuthMiddleware(adminToken)) admin.RegisterAPI(api, store, adpt) + // Provisioning API (same auth, same /api/v1 group) + provisioning.RegisterAPI(api, provisioning.NewGormStore(store)) + // Dashboard admin.RegisterDashboard(engine, adminToken) - adminServer := &http.Server{ - Addr: ":9090", + apiServer := &http.Server{ + Addr: ":8080", Handler: engine, } go func() { - slog.Info("Starting admin server with dashboard.", "addr", adminServer.Addr) - if err := adminServer.ListenAndServe(); err != nil && err != http.ErrServerClosed { - slog.Warn("Admin server error.", "error", err) - } - }() - - // Set up provisioning API server (separate from admin — production-facing) - provToken := cfg.ProvisioningToken - if provToken == "" { - provToken = adminToken // fall back to admin token if not set - } - - provEngine := gin.New() - provEngine.Use(gin.Recovery()) - provEngine.GET("/health", func(c *gin.Context) { - c.String(http.StatusOK, "ok") - }) - provAPI := provEngine.Group("/api/v1", admin.APIAuthMiddleware(provToken)) - provisioning.RegisterAPI(provAPI, provisioning.NewGormStore(store)) - - provServer := &http.Server{ - Addr: fmt.Sprintf(":%d", provisioningPort), - Handler: provEngine, - } - go func() { - slog.Info("Starting provisioning API server.", "addr", provServer.Addr) - if err := provServer.ListenAndServe(); err != nil && err != http.ErrServerClosed { - slog.Warn("Provisioning API server error.", "error", err) + slog.Info("Starting API server.", "addr", apiServer.Addr) + if err := apiServer.ListenAndServe(); err != nil && err != http.ErrServerClosed { + slog.Warn("API server error.", "error", err) } }() - return store, adpt, []*http.Server{adminServer, provServer}, nil + return store, adpt, apiServer, nil } diff --git a/controlplane/multitenant_stub.go b/controlplane/multitenant_stub.go index 3959891..9efb937 100644 --- a/controlplane/multitenant_stub.go +++ b/controlplane/multitenant_stub.go @@ -15,6 +15,6 @@ func SetupMultiTenant( srv *server.Server, memBudget uint64, maxWorkers int, -) (ConfigStoreInterface, OrgRouterInterface, []*http.Server, error) { +) (ConfigStoreInterface, OrgRouterInterface, *http.Server, error) { return nil, nil, nil, fmt.Errorf("multi-tenant mode requires -tags kubernetes build") } diff --git a/justfile 
b/justfile index dac8b3c..ccb781c 100644 --- a/justfile +++ b/justfile @@ -174,7 +174,7 @@ run-multitenant-local: multitenant-config-store-up build-k8s-image deploy-multit @echo "Multi-tenant control plane ready." @echo "Default login: postgres / postgres" @echo "Fetch admin token with: kubectl -n duckgres logs deployment/duckgres-control-plane | rg 'Generated admin API token'" - @echo "Run 'just multitenant-port-forward-pg', 'just multitenant-port-forward-admin', and 'just multitenant-port-forward-provisioning' in separate terminals." + @echo "Run 'just multitenant-port-forward-pg' and 'just multitenant-port-forward-api' in separate terminals." # End-to-end local multi-tenant setup: kind K8s + config store + control plane [group('dev')] @@ -204,15 +204,10 @@ cleanup-multitenant-kind: multitenant-port-forward-pg: kubectl -n duckgres port-forward svc/duckgres 5432:5432 -# Port-forward the admin dashboard and API from the local control plane +# Port-forward the API server (admin + provisioning) from the local control plane [group('dev')] -multitenant-port-forward-admin: - kubectl -n duckgres port-forward deployment/duckgres-control-plane 9090:9090 - -# Port-forward the provisioning API from the local control plane -[group('dev')] -multitenant-port-forward-provisioning: - kubectl -n duckgres port-forward deployment/duckgres-control-plane 9091:9091 +multitenant-port-forward-api: + kubectl -n duckgres port-forward deployment/duckgres-control-plane 8080:8080 # Run with DuckLake config [group('dev')] diff --git a/main.go b/main.go index aed5370..fc38e66 100644 --- a/main.go +++ b/main.go @@ -246,8 +246,6 @@ func main() { configStore := flag.String("config-store", "", "PostgreSQL connection string for config store (env: DUCKGRES_CONFIG_STORE)") configPollInterval := flag.String("config-poll-interval", "", "How often to poll config store for changes (default: 30s) (env: DUCKGRES_CONFIG_POLL_INTERVAL)") adminToken := flag.String("admin-token", "", "Bearer token for admin API authentication (env: DUCKGRES_ADMIN_TOKEN)") - provisioningToken := flag.String("provisioning-token", "", "Bearer token for provisioning API authentication; falls back to admin-token if empty (env: DUCKGRES_PROVISIONING_TOKEN)") - provisioningPort := flag.Int("provisioning-port", 0, "Listen port for provisioning API server (default: 9091) (env: DUCKGRES_PROVISIONING_PORT)") // ACME/Let's Encrypt flags acmeDomain := flag.String("acme-domain", "", "Domain for ACME/Let's Encrypt certificate (env: DUCKGRES_ACME_DOMAIN)") @@ -306,8 +304,6 @@ func main() { fmt.Fprintf(os.Stderr, " DUCKGRES_ADMIN_TOKEN Bearer token for admin API authentication\n") fmt.Fprintf(os.Stderr, " DUCKGRES_K8S_MAX_WORKERS Max K8s workers in the shared pool\n") fmt.Fprintf(os.Stderr, " DUCKGRES_K8S_SHARED_WARM_TARGET Neutral shared warm-worker target for K8s multi-tenant mode\n") - fmt.Fprintf(os.Stderr, " DUCKGRES_PROVISIONING_TOKEN Bearer token for provisioning API (falls back to admin token)\n") - fmt.Fprintf(os.Stderr, " DUCKGRES_PROVISIONING_PORT Listen port for provisioning API (default: 9091)\n") fmt.Fprintf(os.Stderr, " DUCKGRES_K8S_SHARED_WARM_WORKERS Enable shared warm-worker activation path for K8s multi-tenant mode\n") fmt.Fprintf(os.Stderr, " DUCKGRES_LOG_LEVEL Log level: debug, info, warn, error (default: info)\n") fmt.Fprintf(os.Stderr, "\nPrecedence: CLI flags > environment variables > config file > defaults\n") @@ -410,8 +406,6 @@ func main() { ConfigStoreConn: *configStore, ConfigPollInterval: *configPollInterval, AdminToken: *adminToken, - 
ProvisioningToken: *provisioningToken, - ProvisioningPort: *provisioningPort, WorkerBackend: *workerBackend, K8sWorkerImage: *k8sWorkerImage, K8sWorkerNamespace: *k8sWorkerNamespace, @@ -557,8 +551,6 @@ func main() { ConfigStoreConn: resolved.ConfigStoreConn, ConfigPollInterval: resolved.ConfigPollInterval, AdminToken: resolved.AdminToken, - ProvisioningToken: resolved.ProvisioningToken, - ProvisioningPort: resolved.ProvisioningPort, K8s: controlplane.K8sConfig{ WorkerImage: resolved.K8sWorkerImage, WorkerNamespace: resolved.K8sWorkerNamespace, From e29904e910f0cc92530a9a0a9dedc5e69614b11f Mon Sep 17 00:00:00 2001 From: eric Date: Tue, 24 Mar 2026 09:05:58 -0700 Subject: [PATCH 04/17] Fix Duckling CR spec to match XRD schema Move aurora config under spec.metadataStore.aurora to match the Crossplane XRD definition. Remove orgID from spec (not in schema). Co-Authored-By: Claude Opus 4.6 (1M context) --- controlplane/provisioner/controller_test.go | 20 +++++++++++++++++--- controlplane/provisioner/k8s_client.go | 10 ++++++---- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/controlplane/provisioner/controller_test.go b/controlplane/provisioner/controller_test.go index ca1c8bd..878d760 100644 --- a/controlplane/provisioner/controller_test.go +++ b/controlplane/provisioner/controller_test.go @@ -125,12 +125,26 @@ func TestReconcilePendingCreatesCR(t *testing.T) { if !ok { t.Fatal("expected spec in CR") } - if spec["orgID"] != "org-a" { - t.Fatalf("expected orgID org-a, got %v", spec["orgID"]) - } if spec["image"] != "ghcr.io/posthog/duckgres:latest" { t.Fatalf("expected image ghcr.io/posthog/duckgres:latest, got %v", spec["image"]) } + metadataStore, ok := spec["metadataStore"].(map[string]interface{}) + if !ok { + t.Fatal("expected metadataStore in spec") + } + if metadataStore["type"] != "aurora" { + t.Fatalf("expected metadataStore type aurora, got %v", metadataStore["type"]) + } + aurora, ok := metadataStore["aurora"].(map[string]interface{}) + if !ok { + t.Fatal("expected aurora in metadataStore") + } + if aurora["minACU"] != 0.5 { + t.Fatalf("expected minACU 0.5, got %v", aurora["minACU"]) + } + if aurora["maxACU"] != 2.0 { + t.Fatalf("expected maxACU 2, got %v", aurora["maxACU"]) + } // Verify state transitioned to provisioning if fs.warehouses["org-a"].State != configstore.ManagedWarehouseStateProvisioning { diff --git a/controlplane/provisioner/k8s_client.go b/controlplane/provisioner/k8s_client.go index c92058f..84e04b6 100644 --- a/controlplane/provisioner/k8s_client.go +++ b/controlplane/provisioner/k8s_client.go @@ -74,11 +74,13 @@ func (d *DucklingClient) Create(ctx context.Context, orgID, image string, minACU "namespace": ducklingNamespace, }, "spec": map[string]interface{}{ - "orgID": orgID, "image": image, - "aurora": map[string]interface{}{ - "minACU": minACU, - "maxACU": maxACU, + "metadataStore": map[string]interface{}{ + "type": "aurora", + "aurora": map[string]interface{}{ + "minACU": minACU, + "maxACU": maxACU, + }, }, }, }, From 2558a58d02e5e6dad421c4847f2f3b11461bbb0e Mon Sep 17 00:00:00 2001 From: eric Date: Tue, 24 Mar 2026 09:12:34 -0700 Subject: [PATCH 05/17] Switch API auth to X-Duckgres-Internal-Secret header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace Authorization: Bearer with X-Duckgres-Internal-Secret header to align with PostHog's internal service auth pattern. 
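As an illustration of the new scheme (host, org, request body, and env var are
placeholders), a provisioning call now carries the shared secret in a header
instead of a bearer token:

    // Sketch only: same /api/v1 routes, now authenticated via the
    // X-Duckgres-Internal-Secret header rather than Authorization: Bearer.
    package main

    import (
        "bytes"
        "fmt"
        "net/http"
        "os"
    )

    func main() {
        body := bytes.NewBufferString(`{"metadata_store":{"type":"aurora","aurora":{"max_acu":1}}}`)
        req, err := http.NewRequest(http.MethodPost,
            "http://localhost:8080/api/v1/orgs/analytics/provision", body)
        if err != nil {
            panic(err)
        }
        req.Header.Set("Content-Type", "application/json")
        req.Header.Set("X-Duckgres-Internal-Secret", os.Getenv("DUCKGRES_INTERNAL_SECRET"))
        resp, err := http.DefaultClient.Do(req)
        if err != nil {
            panic(err)
        }
        defer resp.Body.Close()
        fmt.Println(resp.Status) // 202 Accepted when the provision request is recorded
    }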
Rename config: - --admin-token → --internal-secret - DUCKGRES_ADMIN_TOKEN → DUCKGRES_INTERNAL_SECRET - cfg.AdminToken → cfg.InternalSecret Dashboard cookie auth preserved as fallback for browser sessions. Co-Authored-By: Claude Opus 4.6 (1M context) --- config_resolution.go | 16 ++++++++-------- controlplane/admin/dashboard.go | 7 ++++--- controlplane/control.go | 6 +++--- controlplane/multitenant.go | 18 ++++++++---------- main.go | 8 ++++---- 5 files changed, 27 insertions(+), 28 deletions(-) diff --git a/config_resolution.go b/config_resolution.go index a0a7b51..3c69650 100644 --- a/config_resolution.go +++ b/config_resolution.go @@ -40,7 +40,7 @@ type configCLIInputs struct { MaxConnections int ConfigStoreConn string ConfigPollInterval string - AdminToken string + InternalSecret string WorkerBackend string K8sWorkerImage string K8sWorkerNamespace string @@ -77,7 +77,7 @@ type resolvedConfig struct { K8sSharedWarmWorkers bool ConfigStoreConn string ConfigPollInterval time.Duration - AdminToken string + InternalSecret string } func defaultServerConfig() server.Config { @@ -134,7 +134,7 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun var k8sSharedWarmWorkers bool var configStoreConn string var configPollInterval time.Duration - var adminToken string + var internalSecret string if fileCfg != nil { if fileCfg.Host != "" { @@ -587,8 +587,8 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun warn("Invalid DUCKGRES_CONFIG_POLL_INTERVAL duration: " + err.Error()) } } - if v := getenv("DUCKGRES_ADMIN_TOKEN"); v != "" { - adminToken = v + if v := getenv("DUCKGRES_INTERNAL_SECRET"); v != "" { + internalSecret = v } if v := getenv("DUCKGRES_WORKER_BACKEND"); v != "" { workerBackend = v @@ -803,8 +803,8 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun warn("Invalid --config-poll-interval duration: " + err.Error()) } } - if cli.Set["admin-token"] { - adminToken = cli.AdminToken + if cli.Set["internal-secret"] { + internalSecret = cli.InternalSecret } if cli.Set["worker-backend"] { workerBackend = cli.WorkerBackend @@ -914,6 +914,6 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun K8sSharedWarmWorkers: k8sSharedWarmWorkers, ConfigStoreConn: configStoreConn, ConfigPollInterval: configPollInterval, - AdminToken: adminToken, + InternalSecret: internalSecret, } } diff --git a/controlplane/admin/dashboard.go b/controlplane/admin/dashboard.go index 6984663..0f7de44 100644 --- a/controlplane/admin/dashboard.go +++ b/controlplane/admin/dashboard.go @@ -90,10 +90,11 @@ func renderLoginPage(c *gin.Context, next, errMsg string) { } func requestAdminToken(c *gin.Context) string { - auth := c.GetHeader("Authorization") - if strings.HasPrefix(auth, "Bearer ") { - return strings.TrimPrefix(auth, "Bearer ") + // Primary: X-Duckgres-Internal-Secret header (service-to-service) + if secret := c.GetHeader("X-Duckgres-Internal-Secret"); secret != "" { + return secret } + // Fallback: cookie (dashboard UI) if cookie, err := c.Cookie(adminTokenCookieName); err == nil { return cookie } diff --git a/controlplane/control.go b/controlplane/control.go index 02aeb18..9b89e5b 100644 --- a/controlplane/control.go +++ b/controlplane/control.go @@ -58,9 +58,9 @@ type ControlPlaneConfig struct { // Default: 30s. ConfigPollInterval time.Duration - // AdminToken is the bearer token required for admin API requests. - // When empty, a random token is generated and logged at startup. 
- AdminToken string + // InternalSecret is the shared secret for API authentication. + // When empty, a random secret is generated and logged at startup. + InternalSecret string } diff --git a/controlplane/multitenant.go b/controlplane/multitenant.go index 450a266..909c0bc 100644 --- a/controlplane/multitenant.go +++ b/controlplane/multitenant.go @@ -165,14 +165,14 @@ func SetupMultiTenant( store.Start(context.Background()) // Resolve admin bearer token - adminToken := cfg.AdminToken - if adminToken == "" { + internalSecret := cfg.InternalSecret + if internalSecret == "" { tokenBytes := make([]byte, 32) if _, err := rand.Read(tokenBytes); err != nil { - return nil, nil, nil, fmt.Errorf("generate admin token: %w", err) + return nil, nil, nil, fmt.Errorf("generate internal secret: %w", err) } - adminToken = hex.EncodeToString(tokenBytes) - slog.Info("Generated admin API token (pass via --admin-token or DUCKGRES_ADMIN_TOKEN to set explicitly).", "token", adminToken) + internalSecret = hex.EncodeToString(tokenBytes) + slog.Info("Generated internal secret (pass via --internal-secret or DUCKGRES_INTERNAL_SECRET to set explicitly).", "secret", internalSecret) } // Set up unified API server (admin + provisioning on single port) @@ -186,15 +186,13 @@ func SetupMultiTenant( c.String(http.StatusOK, "ok") }) - // Admin API (authenticated) - api := engine.Group("/api/v1", admin.APIAuthMiddleware(adminToken)) + // Authenticated API + api := engine.Group("/api/v1", admin.APIAuthMiddleware(internalSecret)) admin.RegisterAPI(api, store, adpt) - - // Provisioning API (same auth, same /api/v1 group) provisioning.RegisterAPI(api, provisioning.NewGormStore(store)) // Dashboard - admin.RegisterDashboard(engine, adminToken) + admin.RegisterDashboard(engine, internalSecret) apiServer := &http.Server{ Addr: ":8080", diff --git a/main.go b/main.go index fc38e66..998ff2b 100644 --- a/main.go +++ b/main.go @@ -245,7 +245,7 @@ func main() { // Config store flags (multi-tenant mode) configStore := flag.String("config-store", "", "PostgreSQL connection string for config store (env: DUCKGRES_CONFIG_STORE)") configPollInterval := flag.String("config-poll-interval", "", "How often to poll config store for changes (default: 30s) (env: DUCKGRES_CONFIG_POLL_INTERVAL)") - adminToken := flag.String("admin-token", "", "Bearer token for admin API authentication (env: DUCKGRES_ADMIN_TOKEN)") + internalSecret := flag.String("internal-secret", "", "Shared secret for API authentication (env: DUCKGRES_INTERNAL_SECRET)") // ACME/Let's Encrypt flags acmeDomain := flag.String("acme-domain", "", "Domain for ACME/Let's Encrypt certificate (env: DUCKGRES_ACME_DOMAIN)") @@ -301,7 +301,7 @@ func main() { fmt.Fprintf(os.Stderr, " DUCKGRES_DUCKDB_MAX_SESSIONS DuckDB service max sessions (duckdb-service mode)\n") fmt.Fprintf(os.Stderr, " DUCKGRES_CONFIG_STORE PostgreSQL connection string for config store (multi-tenant)\n") fmt.Fprintf(os.Stderr, " DUCKGRES_CONFIG_POLL_INTERVAL Config store poll interval (default: 30s)\n") - fmt.Fprintf(os.Stderr, " DUCKGRES_ADMIN_TOKEN Bearer token for admin API authentication\n") + fmt.Fprintf(os.Stderr, " DUCKGRES_INTERNAL_SECRET Shared secret for API authentication\n") fmt.Fprintf(os.Stderr, " DUCKGRES_K8S_MAX_WORKERS Max K8s workers in the shared pool\n") fmt.Fprintf(os.Stderr, " DUCKGRES_K8S_SHARED_WARM_TARGET Neutral shared warm-worker target for K8s multi-tenant mode\n") fmt.Fprintf(os.Stderr, " DUCKGRES_K8S_SHARED_WARM_WORKERS Enable shared warm-worker activation path for K8s multi-tenant mode\n") @@ 
-405,7 +405,7 @@ func main() { MaxConnections: *maxConnections, ConfigStoreConn: *configStore, ConfigPollInterval: *configPollInterval, - AdminToken: *adminToken, + InternalSecret: *internalSecret, WorkerBackend: *workerBackend, K8sWorkerImage: *k8sWorkerImage, K8sWorkerNamespace: *k8sWorkerNamespace, @@ -550,7 +550,7 @@ func main() { WorkerBackend: resolved.WorkerBackend, ConfigStoreConn: resolved.ConfigStoreConn, ConfigPollInterval: resolved.ConfigPollInterval, - AdminToken: resolved.AdminToken, + InternalSecret: resolved.InternalSecret, K8s: controlplane.K8sConfig{ WorkerImage: resolved.K8sWorkerImage, WorkerNamespace: resolved.K8sWorkerNamespace, From a68850698727e907601e89f4d98f8c9d26ffb104 Mon Sep 17 00:00:00 2001 From: eric Date: Tue, 24 Mar 2026 09:34:05 -0700 Subject: [PATCH 06/17] Align controller with current Duckling XRD/composition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Crossplane composition was refactored: K8s workloads (Deployment, Service, Namespace, etc.) are now managed by the duckgres Helm chart, not Crossplane. The Duckling CR only provisions AWS infrastructure. - Remove `image` from CR spec and provision API (no longer in XRD) - Simplify DucklingStatus to only fields the XRD provides: bucketName, auroraEndpoint, auroraPassword, conditions - Remove status fields that no longer exist: namespace, region, serviceAccountName, iamRoleArn, duckgres*, auroraPort, etc. - Simplify reconcileProvisioning: track S3, Aurora, secrets, and IAM (via Ready condition) — no longer track warehouse_database - Ready = infrastructure ready (S3 + Aurora + secrets + IAM) Co-Authored-By: Claude Opus 4.6 (1M context) --- controlplane/provisioner/controller.go | 65 +++++---------------- controlplane/provisioner/controller_test.go | 24 +++----- controlplane/provisioner/k8s_client.go | 54 ++++------------- controlplane/provisioning/api.go | 5 +- controlplane/provisioning/api_test.go | 14 ++--- 5 files changed, 39 insertions(+), 123 deletions(-) diff --git a/controlplane/provisioner/controller.go b/controlplane/provisioner/controller.go index ae61914..5e34553 100644 --- a/controlplane/provisioner/controller.go +++ b/controlplane/provisioner/controller.go @@ -117,7 +117,7 @@ func (c *Controller) reconcilePending(ctx context.Context, w *configstore.Manage // Create the Duckling CR log.Info("Creating Duckling CR.") - if err := c.duckling.Create(ctx, w.OrgID, w.Image, w.AuroraMinACU, w.AuroraMaxACU); err != nil { + if err := c.duckling.Create(ctx, w.OrgID, w.AuroraMinACU, w.AuroraMaxACU); err != nil { log.Error("Failed to create Duckling CR.", "error", err) _ = c.store.UpdateWarehouseState(w.OrgID, configstore.ManagedWarehouseStatePending, map[string]interface{}{ "state": configstore.ManagedWarehouseStateFailed, @@ -169,81 +169,46 @@ func (c *Controller) reconcileProvisioning(ctx context.Context, w *configstore.M return } - // Update per-component states based on Duckling CR status fields + // Update per-component states based on Duckling CR status fields. + // The Duckling composition provisions AWS infrastructure only (Aurora, S3, IAM). + // K8s workloads (namespace, deployment, service) are managed by the duckgres Helm chart. 
updates := map[string]interface{}{} if status.BucketName != "" && w.S3State != configstore.ManagedWarehouseStateReady { updates["s3_state"] = configstore.ManagedWarehouseStateReady updates["s3_bucket"] = status.BucketName - if status.Region != "" { - updates["s3_region"] = status.Region - } } if status.AuroraEndpoint != "" && w.MetadataStoreState != configstore.ManagedWarehouseStateReady { updates["metadata_store_state"] = configstore.ManagedWarehouseStateReady updates["metadata_store_endpoint"] = status.AuroraEndpoint - updates["metadata_store_port"] = status.AuroraPort + updates["metadata_store_port"] = 5432 updates["metadata_store_kind"] = "aurora" updates["metadata_store_engine"] = "postgres" - if status.Region != "" { - updates["metadata_store_region"] = status.Region - } - } - - if status.Namespace != "" && w.IdentityState != configstore.ManagedWarehouseStateReady { - updates["identity_state"] = configstore.ManagedWarehouseStateReady - updates["worker_identity_namespace"] = status.Namespace - if status.ServiceAccountName != "" { - updates["worker_identity_service_account_name"] = status.ServiceAccountName - } - if status.IAMRoleARN != "" { - updates["worker_identity_iam_role_arn"] = status.IAMRoleARN - } } - if status.AuroraPasswordSecret != "" && status.DuckgresPasswordSecret != "" && w.SecretsState != configstore.ManagedWarehouseStateReady { + if status.AuroraPassword != "" && w.SecretsState != configstore.ManagedWarehouseStateReady { updates["secrets_state"] = configstore.ManagedWarehouseStateReady - updates["metadata_store_credentials_namespace"] = status.Namespace - updates["metadata_store_credentials_name"] = status.AuroraPasswordSecret - updates["metadata_store_credentials_key"] = "password" - updates["runtime_config_namespace"] = status.Namespace - updates["runtime_config_name"] = status.DuckgresPasswordSecret - updates["runtime_config_key"] = "duckgres.yaml" } - if status.ReadyCondition && w.WarehouseDatabaseState != configstore.ManagedWarehouseStateReady { - updates["warehouse_database_state"] = configstore.ManagedWarehouseStateReady - if status.DuckgresEndpoint != "" { - updates["warehouse_database_endpoint"] = status.DuckgresEndpoint - } - if status.DuckgresPort > 0 { - updates["warehouse_database_port"] = status.DuckgresPort - } - if status.DuckgresDatabase != "" { - updates["warehouse_database_database_name"] = status.DuckgresDatabase - } - if status.DuckgresUsername != "" { - updates["warehouse_database_username"] = status.DuckgresUsername - } - if status.Region != "" { - updates["warehouse_database_region"] = status.Region - } + // Crossplane Ready condition means all composed resources (Aurora, S3, IAM) are reconciled. + // We use this for the identity component (IAM role + pod identity association). + if status.ReadyCondition && w.IdentityState != configstore.ManagedWarehouseStateReady { + updates["identity_state"] = configstore.ManagedWarehouseStateReady } - // Check if all components are ready + // Infrastructure is ready when S3, Aurora, secrets, and IAM are all provisioned. 
s3Ready := w.S3State == configstore.ManagedWarehouseStateReady || updates["s3_state"] == configstore.ManagedWarehouseStateReady metaReady := w.MetadataStoreState == configstore.ManagedWarehouseStateReady || updates["metadata_store_state"] == configstore.ManagedWarehouseStateReady - identReady := w.IdentityState == configstore.ManagedWarehouseStateReady || updates["identity_state"] == configstore.ManagedWarehouseStateReady secretsReady := w.SecretsState == configstore.ManagedWarehouseStateReady || updates["secrets_state"] == configstore.ManagedWarehouseStateReady - dbReady := w.WarehouseDatabaseState == configstore.ManagedWarehouseStateReady || updates["warehouse_database_state"] == configstore.ManagedWarehouseStateReady + identReady := w.IdentityState == configstore.ManagedWarehouseStateReady || updates["identity_state"] == configstore.ManagedWarehouseStateReady - if s3Ready && metaReady && identReady && secretsReady && dbReady { + if s3Ready && metaReady && secretsReady && identReady { now := time.Now().UTC() updates["state"] = configstore.ManagedWarehouseStateReady - updates["status_message"] = "All components ready" + updates["status_message"] = "Infrastructure ready" updates["ready_at"] = now - log.Info("All components ready, transitioning to ready.") + log.Info("Infrastructure ready, transitioning to ready.") } if len(updates) > 0 { diff --git a/controlplane/provisioner/controller_test.go b/controlplane/provisioner/controller_test.go index 878d760..a90e7d4 100644 --- a/controlplane/provisioner/controller_test.go +++ b/controlplane/provisioner/controller_test.go @@ -103,9 +103,8 @@ func TestReconcilePendingCreatesCR(t *testing.T) { dc, fakeK8s := newFakeDucklingClient() fs := newFakeStore() fs.warehouses["org-a"] = &configstore.ManagedWarehouse{ - OrgID: "org-a", + OrgID: "org-a", State: configstore.ManagedWarehouseStatePending, - Image: "ghcr.io/posthog/duckgres:latest", AuroraMinACU: 0.5, AuroraMaxACU: 2, } @@ -125,9 +124,6 @@ func TestReconcilePendingCreatesCR(t *testing.T) { if !ok { t.Fatal("expected spec in CR") } - if spec["image"] != "ghcr.io/posthog/duckgres:latest" { - t.Fatalf("expected image ghcr.io/posthog/duckgres:latest, got %v", spec["image"]) - } metadataStore, ok := spec["metadataStore"].(map[string]interface{}) if !ok { t.Fatal("expected metadataStore in spec") @@ -171,15 +167,9 @@ func TestReconcileProvisioningAllReady(t *testing.T) { "namespace": ducklingNamespace, }, "status": map[string]interface{}{ - "bucketName": "org-b-bucket", - "auroraEndpoint": "org-b.cluster.us-east-1.rds.amazonaws.com", - "auroraPort": int64(5432), - "region": "us-east-1", - "namespace": "duckling-org-b", - "serviceAccountName": "duckgres", - "iamRoleArn": "arn:aws:iam::123456789012:role/org-b", - "auroraPasswordSecret": "org-b-aurora-password", - "duckgresPasswordSecret": "org-b-duckgres-password", + "bucketName": "org-b-bucket", + "auroraEndpoint": "org-b.cluster.us-east-1.rds.amazonaws.com", + "auroraPassword": "supersecret123", "conditions": []interface{}{ map[string]interface{}{ "type": "Ready", @@ -214,8 +204,8 @@ func TestReconcileProvisioningAllReady(t *testing.T) { if w.MetadataStore.Endpoint != "org-b.cluster.us-east-1.rds.amazonaws.com" { t.Fatalf("expected aurora endpoint, got %q", w.MetadataStore.Endpoint) } - if w.WorkerIdentity.Namespace != "duckling-org-b" { - t.Fatalf("expected namespace duckling-org-b, got %q", w.WorkerIdentity.Namespace) + if w.MetadataStore.Port != 5432 { + t.Fatalf("expected aurora port 5432, got %d", w.MetadataStore.Port) } if w.ReadyAt == nil { 
t.Fatal("expected ready_at to be set") @@ -319,7 +309,7 @@ func TestParseDucklingStatusEmpty(t *testing.T) { if err != nil { t.Fatalf("unexpected error: %v", err) } - if status.BucketName != "" || status.AuroraEndpoint != "" || status.Namespace != "" { + if status.BucketName != "" || status.AuroraEndpoint != "" || status.AuroraPassword != "" { t.Fatal("expected empty status for CR without status field") } } diff --git a/controlplane/provisioner/k8s_client.go b/controlplane/provisioner/k8s_client.go index 84e04b6..cc851c6 100644 --- a/controlplane/provisioner/k8s_client.go +++ b/controlplane/provisioner/k8s_client.go @@ -22,22 +22,14 @@ var ducklingGVR = schema.GroupVersionResource{ const ducklingNamespace = "crossplane-system" // DucklingStatus holds the parsed status from a Duckling CR. +// The Duckling composition provisions AWS infrastructure (Aurora, S3, IAM) +// but not K8s workloads — those are managed by the duckgres Helm chart. type DucklingStatus struct { - BucketName string - AuroraEndpoint string - AuroraPort int - Region string - Namespace string - ServiceAccountName string - IAMRoleARN string - AuroraPasswordSecret string - DuckgresPasswordSecret string - DuckgresEndpoint string - DuckgresPort int - DuckgresDatabase string - DuckgresUsername string - ReadyCondition bool - SyncedFalseMessage string + BucketName string + AuroraEndpoint string + AuroraPassword string + ReadyCondition bool + SyncedFalseMessage string } // DucklingClient wraps a Kubernetes dynamic client for Duckling CR operations. @@ -64,7 +56,7 @@ func NewDucklingClientWithDynamic(client dynamic.Interface) *DucklingClient { } // Create creates a Duckling CR for the given org. -func (d *DucklingClient) Create(ctx context.Context, orgID, image string, minACU, maxACU float64) error { +func (d *DucklingClient) Create(ctx context.Context, orgID string, minACU, maxACU float64) error { cr := &unstructured.Unstructured{ Object: map[string]interface{}{ "apiVersion": "k8s.posthog.com/v1alpha1", @@ -74,7 +66,6 @@ func (d *DucklingClient) Create(ctx context.Context, orgID, image string, minACU "namespace": ducklingNamespace, }, "spec": map[string]interface{}{ - "image": image, "metadataStore": map[string]interface{}{ "type": "aurora", "aurora": map[string]interface{}{ @@ -118,19 +109,9 @@ func parseDucklingStatus(cr *unstructured.Unstructured) (*DucklingStatus, error) } ds := &DucklingStatus{ - BucketName: getNestedString(status, "bucketName"), - AuroraEndpoint: getNestedString(status, "auroraEndpoint"), - AuroraPort: getNestedInt(status, "auroraPort"), - Region: getNestedString(status, "region"), - Namespace: getNestedString(status, "namespace"), - ServiceAccountName: getNestedString(status, "serviceAccountName"), - IAMRoleARN: getNestedString(status, "iamRoleArn"), - AuroraPasswordSecret: getNestedString(status, "auroraPasswordSecret"), - DuckgresPasswordSecret: getNestedString(status, "duckgresPasswordSecret"), - DuckgresEndpoint: getNestedString(status, "duckgresEndpoint"), - DuckgresPort: getNestedInt(status, "duckgresPort"), - DuckgresDatabase: getNestedString(status, "duckgresDatabase"), - DuckgresUsername: getNestedString(status, "duckgresUsername"), + BucketName: getNestedString(status, "bucketName"), + AuroraEndpoint: getNestedString(status, "auroraEndpoint"), + AuroraPassword: getNestedString(status, "auroraPassword"), } // Parse conditions @@ -160,16 +141,3 @@ func getNestedString(obj map[string]interface{}, key string) string { v, _ := obj[key].(string) return v } - -func getNestedInt(obj map[string]interface{}, 
key string) int { - switch v := obj[key].(type) { - case int64: - return int(v) - case float64: - return int(v) - case int: - return v - default: - return 0 - } -} diff --git a/controlplane/provisioning/api.go b/controlplane/provisioning/api.go index 608bd54..f7cffba 100644 --- a/controlplane/provisioning/api.go +++ b/controlplane/provisioning/api.go @@ -29,7 +29,6 @@ type handler struct { } type provisionRequest struct { - Image string `json:"image"` MetadataStore *provisionMetadataReq `json:"metadata_store,omitempty"` } @@ -52,9 +51,7 @@ func (h *handler) provisionWarehouse(c *gin.Context) { return } - warehouse := &configstore.ManagedWarehouse{ - Image: req.Image, - } + warehouse := &configstore.ManagedWarehouse{} if req.MetadataStore != nil && req.MetadataStore.Aurora != nil { warehouse.AuroraMinACU = req.MetadataStore.Aurora.MinACU warehouse.AuroraMaxACU = req.MetadataStore.Aurora.MaxACU diff --git a/controlplane/provisioning/api_test.go b/controlplane/provisioning/api_test.go index 9e5b469..aae80b7 100644 --- a/controlplane/provisioning/api_test.go +++ b/controlplane/provisioning/api_test.go @@ -81,7 +81,6 @@ func TestProvisionCreatesWarehouse(t *testing.T) { router := newTestRouter(store) body := []byte(`{ - "image": "ghcr.io/posthog/duckgres:latest", "metadata_store": { "type": "aurora", "aurora": {"min_acu": 0.5, "max_acu": 2} @@ -104,9 +103,6 @@ func TestProvisionCreatesWarehouse(t *testing.T) { if w.State != configstore.ManagedWarehouseStatePending { t.Fatalf("expected state pending, got %q", w.State) } - if w.Image != "ghcr.io/posthog/duckgres:latest" { - t.Fatalf("expected image, got %q", w.Image) - } if w.AuroraMinACU != 0.5 { t.Fatalf("expected min_acu 0.5, got %f", w.AuroraMinACU) } @@ -119,7 +115,7 @@ func TestProvisionAutoCreatesOrg(t *testing.T) { store := newFakeStore() router := newTestRouter(store) - body := []byte(`{"image": "ghcr.io/posthog/duckgres:latest"}`) + body := []byte(`{"metadata_store": {"type": "aurora", "aurora": {"max_acu": 1}}}`) req := httptest.NewRequest(http.MethodPost, "/api/v1/orgs/new-org/provision", bytes.NewReader(body)) req.Header.Set("Content-Type", "application/json") rec := httptest.NewRecorder() @@ -145,7 +141,7 @@ func TestProvisionRejectsExistingNonTerminal(t *testing.T) { } router := newTestRouter(store) - body := []byte(`{"image": "ghcr.io/posthog/duckgres:latest"}`) + body := []byte(`{"metadata_store": {"type": "aurora", "aurora": {"max_acu": 1}}}`) req := httptest.NewRequest(http.MethodPost, "/api/v1/orgs/analytics/provision", bytes.NewReader(body)) req.Header.Set("Content-Type", "application/json") rec := httptest.NewRecorder() @@ -165,7 +161,7 @@ func TestProvisionAllowsRetryAfterFailure(t *testing.T) { } router := newTestRouter(store) - body := []byte(`{"image": "ghcr.io/posthog/duckgres:v2"}`) + body := []byte(`{"metadata_store": {"type": "aurora", "aurora": {"min_acu": 0, "max_acu": 2}}}`) req := httptest.NewRequest(http.MethodPost, "/api/v1/orgs/analytics/provision", bytes.NewReader(body)) req.Header.Set("Content-Type", "application/json") rec := httptest.NewRecorder() @@ -174,8 +170,8 @@ func TestProvisionAllowsRetryAfterFailure(t *testing.T) { if rec.Code != http.StatusAccepted { t.Fatalf("status = %d, want %d: %s", rec.Code, http.StatusAccepted, rec.Body.String()) } - if store.warehouses["analytics"].Image != "ghcr.io/posthog/duckgres:v2" { - t.Fatalf("expected new image, got %q", store.warehouses["analytics"].Image) + if store.warehouses["analytics"].AuroraMaxACU != 2 { + t.Fatalf("expected max_acu 2, got %f", 
store.warehouses["analytics"].AuroraMaxACU) } } From 88f36bf91e9af416395a21ef61e31e1bc59febf1 Mon Sep 17 00:00:00 2001 From: eric Date: Tue, 24 Mar 2026 10:01:22 -0700 Subject: [PATCH 07/17] Add STS AssumeRole credential brokering for multi-tenant S3 access Workers in the shared control plane don't have per-org IAM roles via pod identity. When an org uses AWS S3 (provider=aws), the control plane now calls STS AssumeRole on the org's duckling IAM role to mint short-lived credentials, and passes them to the worker during activation. - New STSBroker wrapping aws-sdk-go-v2/service/sts - Add S3SessionToken to DuckLakeConfig + buildConfigSecret SESSION_TOKEN - SharedWorkerActivator uses STS broker when available, falls back to aws_sdk (pod identity) when not configured - Deterministic role ARN: arn:aws:iam::{accountId}:role/duckling-{orgID} - Config: --aws-account-id / DUCKGRES_AWS_ACCOUNT_ID, --aws-region / DUCKGRES_AWS_REGION - needsCredentialRefresh returns true for STS temporary creds Prerequisite IAM changes (separate): - Control plane role needs sts:AssumeRole on duckling-* roles - Duckling role trust policy needs to allow the control plane role Co-Authored-By: Claude Opus 4.6 (1M context) --- config_resolution.go | 20 +++++++ controlplane/activation_payload_test.go | 2 +- controlplane/control.go | 2 + controlplane/multitenant.go | 12 +++- controlplane/org_reserved_pool.go | 6 +- controlplane/org_reserved_pool_test.go | 10 ++-- controlplane/org_router.go | 6 +- controlplane/org_router_test.go | 2 +- controlplane/shared_worker_activator.go | 24 +++++++- controlplane/sts_broker.go | 77 +++++++++++++++++++++++++ main.go | 8 +++ server/server.go | 19 +++--- 12 files changed, 166 insertions(+), 22 deletions(-) create mode 100644 controlplane/sts_broker.go diff --git a/config_resolution.go b/config_resolution.go index 3c69650..89bf03a 100644 --- a/config_resolution.go +++ b/config_resolution.go @@ -53,6 +53,8 @@ type configCLIInputs struct { K8sMaxWorkers int K8sSharedWarmTarget int K8sSharedWarmWorkers bool + AWSAccountID string + AWSRegion string QueryLog bool } @@ -75,6 +77,8 @@ type resolvedConfig struct { K8sMaxWorkers int K8sSharedWarmTarget int K8sSharedWarmWorkers bool + AWSAccountID string + AWSRegion string ConfigStoreConn string ConfigPollInterval time.Duration InternalSecret string @@ -132,6 +136,8 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun var k8sWorkerSecret, k8sWorkerConfigMap, k8sWorkerImagePullPolicy, k8sWorkerServiceAccount string var k8sMaxWorkers, k8sSharedWarmTarget int var k8sSharedWarmWorkers bool + var awsAccountID string + var awsRegion string var configStoreConn string var configPollInterval time.Duration var internalSecret string @@ -642,6 +648,12 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun warn("Invalid DUCKGRES_K8S_SHARED_WARM_WORKERS: " + err.Error()) } } + if v := getenv("DUCKGRES_AWS_ACCOUNT_ID"); v != "" { + awsAccountID = v + } + if v := getenv("DUCKGRES_AWS_REGION"); v != "" { + awsRegion = v + } // Query log env vars if v := getenv("DUCKGRES_QUERY_LOG_ENABLED"); v != "" { @@ -842,6 +854,12 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun if cli.Set["k8s-shared-warm-workers"] { k8sSharedWarmWorkers = cli.K8sSharedWarmWorkers } + if cli.Set["aws-account-id"] { + awsAccountID = cli.AWSAccountID + } + if cli.Set["aws-region"] { + awsRegion = cli.AWSRegion + } if cli.Set["query-log"] { cfg.QueryLog.Enabled = cli.QueryLog } @@ -912,6 +930,8 @@ func 
resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun K8sMaxWorkers: k8sMaxWorkers, K8sSharedWarmTarget: k8sSharedWarmTarget, K8sSharedWarmWorkers: k8sSharedWarmWorkers, + AWSAccountID: awsAccountID, + AWSRegion: awsRegion, ConfigStoreConn: configStoreConn, ConfigPollInterval: configPollInterval, InternalSecret: internalSecret, diff --git a/controlplane/activation_payload_test.go b/controlplane/activation_payload_test.go index be72889..c75499b 100644 --- a/controlplane/activation_payload_test.go +++ b/controlplane/activation_payload_test.go @@ -62,7 +62,7 @@ func TestBuildTenantActivationPayloadBuildsDuckLakeRuntimeFromWarehouseSecrets(t }, } - payload, err := BuildTenantActivationPayload(context.Background(), pool.clientset, pool.namespace, org) + payload, err := BuildTenantActivationPayload(context.Background(), pool.clientset, pool.namespace, org, nil) if err != nil { t.Fatalf("BuildTenantActivationPayload: %v", err) } diff --git a/controlplane/control.go b/controlplane/control.go index 9b89e5b..5022ffb 100644 --- a/controlplane/control.go +++ b/controlplane/control.go @@ -82,6 +82,8 @@ type K8sConfig struct { MaxWorkers int // Global cap for the shared K8s worker pool (0 = auto-derived) SharedWarmTarget int // Neutral shared warm-worker target for K8s multi-tenant mode (0 = disabled) SharedWarmWorkers bool // Enable reserve->activate->hot lifecycle on the shared warm pool + AWSAccountID string // AWS account ID for constructing IAM role ARNs (STS credential brokering) + AWSRegion string // AWS region for STS client } // ControlPlane manages the TCP listener and routes connections to Flight SQL workers. diff --git a/controlplane/multitenant.go b/controlplane/multitenant.go index 909c0bc..10c86a3 100644 --- a/controlplane/multitenant.go +++ b/controlplane/multitenant.go @@ -143,7 +143,17 @@ func SetupMultiTenant( SharedWarmActivation: cfg.K8s.SharedWarmWorkers, } - router, err := NewOrgRouter(store, baseCfg, cfg, srv) + // Initialize STS broker for credential brokering (best-effort) + var stsBroker *STSBroker + if cfg.K8s.AWSAccountID != "" { + var err error + stsBroker, err = NewSTSBroker(context.Background(), cfg.K8s.AWSRegion, cfg.K8s.AWSAccountID) + if err != nil { + slog.Warn("STS broker unavailable, workers will use pod identity for S3.", "error", err) + } + } + + router, err := NewOrgRouter(store, baseCfg, cfg, srv, stsBroker) if err != nil { return nil, nil, nil, err } diff --git a/controlplane/org_reserved_pool.go b/controlplane/org_reserved_pool.go index b18202f..3e8708e 100644 --- a/controlplane/org_reserved_pool.go +++ b/controlplane/org_reserved_pool.go @@ -21,16 +21,18 @@ type OrgReservedPool struct { maxWorkers int leaseDuration time.Duration sharedWarmWorkers bool + stsBroker *STSBroker resolveOrgConfig func() (*configstore.OrgConfig, error) activateReservedWorker func(context.Context, *ManagedWorker, *configstore.OrgConfig) error } -func NewOrgReservedPool(shared *K8sWorkerPool, orgID string, maxWorkers int) *OrgReservedPool { +func NewOrgReservedPool(shared *K8sWorkerPool, orgID string, maxWorkers int, stsBroker *STSBroker) *OrgReservedPool { pool := &OrgReservedPool{ shared: shared, orgID: orgID, maxWorkers: maxWorkers, leaseDuration: defaultSharedWorkerReservationLease, + stsBroker: stsBroker, } pool.activateReservedWorker = pool.activateReservedWorkerDefault return pool @@ -274,7 +276,7 @@ func (p *OrgReservedPool) activateReservedWorkerDefault(ctx context.Context, wor if org == nil { return fmt.Errorf("org config is required for activation") } 
- payload, err := BuildTenantActivationPayload(ctx, p.shared.clientset, p.shared.namespace, org) + payload, err := BuildTenantActivationPayload(ctx, p.shared.clientset, p.shared.namespace, org, p.stsBroker) if err != nil { return err } diff --git a/controlplane/org_reserved_pool_test.go b/controlplane/org_reserved_pool_test.go index c373ecc..b05ebde 100644 --- a/controlplane/org_reserved_pool_test.go +++ b/controlplane/org_reserved_pool_test.go @@ -19,7 +19,7 @@ func TestOrgReservedPoolAcquireReservesOrgWorker(t *testing.T) { return nil } - pool := NewOrgReservedPool(shared, "analytics", 2) + pool := NewOrgReservedPool(shared, "analytics", 2, nil) worker, err := pool.AcquireWorker(context.Background()) if err != nil { t.Fatalf("AcquireWorker: %v", err) @@ -58,7 +58,7 @@ func TestOrgReservedPoolAcquireSkipsOtherOrgsWorkers(t *testing.T) { return nil } - pool := NewOrgReservedPool(shared, "analytics", 2) + pool := NewOrgReservedPool(shared, "analytics", 2, nil) worker, err := pool.AcquireWorker(context.Background()) if err != nil { t.Fatalf("AcquireWorker: %v", err) @@ -85,7 +85,7 @@ func TestOrgReservedPoolReleaseWorkerRetiresOnLastSession(t *testing.T) { } shared.workers[worker.ID] = worker - pool := NewOrgReservedPool(shared, "analytics", 1) + pool := NewOrgReservedPool(shared, "analytics", 1, nil) pool.ReleaseWorker(worker.ID) time.Sleep(100 * time.Millisecond) @@ -104,7 +104,7 @@ func TestOrgReservedWorkerPoolAcquireActivatesReservedWorkerWhenEnabledWithOrgCo } activated := false - pool := NewOrgReservedPool(shared, "analytics", 2) + pool := NewOrgReservedPool(shared, "analytics", 2, nil) pool.sharedWarmWorkers = true pool.resolveOrgConfig = func() (*configstore.OrgConfig, error) { return &configstore.OrgConfig{ @@ -150,7 +150,7 @@ func TestOrgReservedWorkerPoolAcquireActivatesUsingLatestResolvedOrgConfig(t *te }, } - pool := NewOrgReservedPool(shared, "analytics", 2) + pool := NewOrgReservedPool(shared, "analytics", 2, nil) pool.sharedWarmWorkers = true pool.resolveOrgConfig = func() (*configstore.OrgConfig, error) { return currentOrg, nil diff --git a/controlplane/org_router.go b/controlplane/org_router.go index 8481236..0c038cf 100644 --- a/controlplane/org_router.go +++ b/controlplane/org_router.go @@ -32,18 +32,20 @@ type OrgRouter struct { sharedPool *K8sWorkerPool globalCfg ControlPlaneConfig srv *server.Server + stsBroker *STSBroker nextWorkerID atomic.Int32 sharedCancel context.CancelFunc } // NewOrgRouter creates an OrgRouter from the initial config snapshot. 
-func NewOrgRouter(store *configstore.ConfigStore, baseCfg K8sWorkerPoolConfig, globalCfg ControlPlaneConfig, srv *server.Server) (*OrgRouter, error) { +func NewOrgRouter(store *configstore.ConfigStore, baseCfg K8sWorkerPoolConfig, globalCfg ControlPlaneConfig, srv *server.Server, stsBroker *STSBroker) (*OrgRouter, error) { tr := &OrgRouter{ orgs: make(map[string]*OrgStack), configStore: store, baseCfg: baseCfg, globalCfg: globalCfg, srv: srv, + stsBroker: stsBroker, } sharedCfg := baseCfg @@ -107,7 +109,7 @@ func (tr *OrgRouter) createOrgStack(tc *configstore.OrgConfig) (*OrgStack, error } } - pool := NewOrgReservedPool(tr.sharedPool, tc.Name, maxWorkers) + pool := NewOrgReservedPool(tr.sharedPool, tc.Name, maxWorkers, tr.stsBroker) pool.resolveOrgConfig = func() (*configstore.OrgConfig, error) { snap := tr.configStore.Snapshot() if snap == nil { diff --git a/controlplane/org_router_test.go b/controlplane/org_router_test.go index 72339df..cd692fa 100644 --- a/controlplane/org_router_test.go +++ b/controlplane/org_router_test.go @@ -42,7 +42,7 @@ func TestOrgRouterReconcileWarmCapacityUsesExplicitSharedWarmTarget(t *testing.T func TestOrgRouterHandleConfigChangeRefreshesRuntimeOnlyUpdates(t *testing.T) { sharedPool, _ := newTestK8sPool(t, 10) - pool := NewOrgReservedPool(sharedPool, "analytics", 2) + pool := NewOrgReservedPool(sharedPool, "analytics", 2, nil) oldTC := &configstore.OrgConfig{ Name: "analytics", diff --git a/controlplane/shared_worker_activator.go b/controlplane/shared_worker_activator.go index 4db84dc..3b748f2 100644 --- a/controlplane/shared_worker_activator.go +++ b/controlplane/shared_worker_activator.go @@ -19,6 +19,7 @@ import ( type SharedWorkerActivator struct { clientset kubernetes.Interface defaultNamespace string + stsBroker *STSBroker } type TenantActivationPayload struct { @@ -28,13 +29,14 @@ type TenantActivationPayload struct { DuckLake server.DuckLakeConfig `json:"ducklake"` } -func NewSharedWorkerActivator(shared *K8sWorkerPool) *SharedWorkerActivator { +func NewSharedWorkerActivator(shared *K8sWorkerPool, stsBroker *STSBroker) *SharedWorkerActivator { if shared == nil { return nil } return &SharedWorkerActivator{ clientset: shared.clientset, defaultNamespace: shared.namespace, + stsBroker: stsBroker, } } @@ -97,7 +99,22 @@ func (a *SharedWorkerActivator) BuildActivationRequest(ctx context.Context, org dl.S3AccessKey = accessKey dl.S3SecretKey = secretKey case strings.EqualFold(warehouse.S3.Provider, "aws"): - dl.S3Provider = "aws_sdk" + roleARN := warehouse.WorkerIdentity.IAMRoleARN + if roleARN == "" && a.stsBroker != nil { + roleARN = a.stsBroker.RoleARNForOrg(orgName(org)) + } + if roleARN != "" && a.stsBroker != nil { + creds, err := a.stsBroker.AssumeRole(ctx, roleARN) + if err != nil { + return TenantActivationPayload{}, fmt.Errorf("STS AssumeRole for org %q: %w", orgName(org), err) + } + dl.S3Provider = "config" + dl.S3AccessKey = creds.AccessKeyID + dl.S3SecretKey = creds.SecretAccessKey + dl.S3SessionToken = creds.SessionToken + } else { + dl.S3Provider = "aws_sdk" + } } usernames := make([]string, 0, len(org.Users)) @@ -114,10 +131,11 @@ func (a *SharedWorkerActivator) BuildActivationRequest(ctx context.Context, org }, nil } -func BuildTenantActivationPayload(ctx context.Context, clientset kubernetes.Interface, defaultNamespace string, org *configstore.OrgConfig) (TenantActivationPayload, error) { +func BuildTenantActivationPayload(ctx context.Context, clientset kubernetes.Interface, defaultNamespace string, org *configstore.OrgConfig, stsBroker 
*STSBroker) (TenantActivationPayload, error) { activator := &SharedWorkerActivator{ clientset: clientset, defaultNamespace: defaultNamespace, + stsBroker: stsBroker, } assignment := &WorkerAssignment{ OrgID: orgName(org), diff --git a/controlplane/sts_broker.go b/controlplane/sts_broker.go new file mode 100644 index 0000000..44b9431 --- /dev/null +++ b/controlplane/sts_broker.go @@ -0,0 +1,77 @@ +//go:build kubernetes + +package controlplane + +import ( + "context" + "fmt" + "time" + + "github.com/aws/aws-sdk-go-v2/aws" + awsconfig "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/service/sts" +) + +const ( + stsSessionDuration = 1 * time.Hour + stsSessionName = "duckgres-cp" +) + +// STSBroker brokers short-lived AWS credentials by assuming per-org IAM roles. +type STSBroker struct { + client *sts.Client + accountID string +} + +// AssumedCredentials holds the temporary credentials from STS AssumeRole. +type AssumedCredentials struct { + AccessKeyID string + SecretAccessKey string + SessionToken string + Expiration time.Time +} + +// NewSTSBroker creates an STS broker using the control plane's own credentials. +// accountID is the AWS account ID used to construct deterministic role ARNs. +func NewSTSBroker(ctx context.Context, region, accountID string) (*STSBroker, error) { + opts := []func(*awsconfig.LoadOptions) error{} + if region != "" { + opts = append(opts, awsconfig.WithRegion(region)) + } + cfg, err := awsconfig.LoadDefaultConfig(ctx, opts...) + if err != nil { + return nil, fmt.Errorf("load AWS config: %w", err) + } + return &STSBroker{ + client: sts.NewFromConfig(cfg), + accountID: accountID, + }, nil +} + +// RoleARNForOrg returns the deterministic IAM role ARN for an org. +func (b *STSBroker) RoleARNForOrg(orgID string) string { + return fmt.Sprintf("arn:aws:iam::%s:role/duckling-%s", b.accountID, orgID) +} + +// AssumeRole mints short-lived credentials for the given IAM role ARN. 
+func (b *STSBroker) AssumeRole(ctx context.Context, roleARN string) (*AssumedCredentials, error) { + durationSeconds := int32(stsSessionDuration.Seconds()) + sessionName := stsSessionName + out, err := b.client.AssumeRole(ctx, &sts.AssumeRoleInput{ + RoleArn: aws.String(roleARN), + RoleSessionName: aws.String(sessionName), + DurationSeconds: &durationSeconds, + }) + if err != nil { + return nil, fmt.Errorf("STS AssumeRole %s: %w", roleARN, err) + } + if out.Credentials == nil { + return nil, fmt.Errorf("STS AssumeRole returned nil credentials for %s", roleARN) + } + return &AssumedCredentials{ + AccessKeyID: aws.ToString(out.Credentials.AccessKeyId), + SecretAccessKey: aws.ToString(out.Credentials.SecretAccessKey), + SessionToken: aws.ToString(out.Credentials.SessionToken), + Expiration: aws.ToTime(out.Credentials.Expiration), + }, nil +} diff --git a/main.go b/main.go index 998ff2b..a6a885d 100644 --- a/main.go +++ b/main.go @@ -241,6 +241,8 @@ func main() { k8sMaxWorkers := flag.Int("k8s-max-workers", 0, "Max K8s workers in the shared pool, 0=auto-derived (env: DUCKGRES_K8S_MAX_WORKERS)") k8sSharedWarmTarget := flag.Int("k8s-shared-warm-target", 0, "Neutral shared warm-worker target for K8s multi-tenant mode, 0=disabled (env: DUCKGRES_K8S_SHARED_WARM_TARGET)") k8sSharedWarmWorkers := flag.Bool("k8s-shared-warm-workers", false, "Enable shared warm-worker activation path in K8s multi-tenant mode (env: DUCKGRES_K8S_SHARED_WARM_WORKERS)") + awsAccountID := flag.String("aws-account-id", "", "AWS account ID for STS credential brokering (env: DUCKGRES_AWS_ACCOUNT_ID)") + awsRegion := flag.String("aws-region", "", "AWS region for STS client (env: DUCKGRES_AWS_REGION)") // Config store flags (multi-tenant mode) configStore := flag.String("config-store", "", "PostgreSQL connection string for config store (env: DUCKGRES_CONFIG_STORE)") @@ -305,6 +307,8 @@ func main() { fmt.Fprintf(os.Stderr, " DUCKGRES_K8S_MAX_WORKERS Max K8s workers in the shared pool\n") fmt.Fprintf(os.Stderr, " DUCKGRES_K8S_SHARED_WARM_TARGET Neutral shared warm-worker target for K8s multi-tenant mode\n") fmt.Fprintf(os.Stderr, " DUCKGRES_K8S_SHARED_WARM_WORKERS Enable shared warm-worker activation path for K8s multi-tenant mode\n") + fmt.Fprintf(os.Stderr, " DUCKGRES_AWS_ACCOUNT_ID AWS account ID for STS credential brokering\n") + fmt.Fprintf(os.Stderr, " DUCKGRES_AWS_REGION AWS region for STS client\n") fmt.Fprintf(os.Stderr, " DUCKGRES_LOG_LEVEL Log level: debug, info, warn, error (default: info)\n") fmt.Fprintf(os.Stderr, "\nPrecedence: CLI flags > environment variables > config file > defaults\n") } @@ -418,6 +422,8 @@ func main() { K8sMaxWorkers: *k8sMaxWorkers, K8sSharedWarmTarget: *k8sSharedWarmTarget, K8sSharedWarmWorkers: *k8sSharedWarmWorkers, + AWSAccountID: *awsAccountID, + AWSRegion: *awsRegion, QueryLog: *queryLog, }, os.Getenv, func(msg string) { slog.Warn(msg) @@ -563,6 +569,8 @@ func main() { MaxWorkers: resolved.K8sMaxWorkers, SharedWarmTarget: resolved.K8sSharedWarmTarget, SharedWarmWorkers: resolved.K8sSharedWarmWorkers, + AWSAccountID: resolved.AWSAccountID, + AWSRegion: resolved.AWSRegion, }, } controlplane.RunControlPlane(cpCfg) diff --git a/server/server.go b/server/server.go index 070a554..a463766 100644 --- a/server/server.go +++ b/server/server.go @@ -219,12 +219,13 @@ type DuckLakeConfig struct { S3Provider string // S3 configuration for "config" provider (explicit credentials for MinIO or S3) - S3Endpoint string // e.g., "localhost:9000" for MinIO - S3AccessKey string // S3 access key ID - 
S3SecretKey string // S3 secret access key - S3Region string // S3 region (default: us-east-1) - S3UseSSL bool // Use HTTPS for S3 connections (default: false for MinIO) - S3URLStyle string // "path" or "vhost" (default: "path" for MinIO compatibility) + S3Endpoint string // e.g., "localhost:9000" for MinIO + S3AccessKey string // S3 access key ID + S3SecretKey string // S3 secret access key + S3SessionToken string // STS session token for temporary credentials + S3Region string // S3 region (default: us-east-1) + S3UseSSL bool // Use HTTPS for S3 connections (default: false for MinIO) + S3URLStyle string // "path" or "vhost" (default: "path" for MinIO compatibility) // S3 configuration for "credential_chain" provider (AWS SDK credential chain) // Chain specifies which credential sources to check, semicolon-separated @@ -1215,6 +1216,10 @@ func buildConfigSecret(dlCfg DuckLakeConfig) string { secret += fmt.Sprintf(",\n\t\t\tENDPOINT '%s'", dlCfg.S3Endpoint) } + if dlCfg.S3SessionToken != "" { + secret += fmt.Sprintf(",\n\t\t\tSESSION_TOKEN '%s'", dlCfg.S3SessionToken) + } + secret += "\n\t\t)" return secret } @@ -1357,7 +1362,7 @@ func needsCredentialRefresh(dlCfg DuckLakeConfig) bool { return false } p := s3ProviderForConfig(dlCfg) - return p == "credential_chain" || p == "aws_sdk" + return p == "credential_chain" || p == "aws_sdk" || dlCfg.S3SessionToken != "" } // isTransactionAborted returns true if the error indicates DuckDB's connection From 8c5b0efe6198b0a156366ff4d0ab74e90aa2220f Mon Sep 17 00:00:00 2001 From: eric Date: Tue, 24 Mar 2026 11:19:17 -0700 Subject: [PATCH 08/17] Fix provisioning controller review issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix K8s manifest port 9090→8080 to match code change (fixes CI) - Add ProvisioningStartedAt timestamp for accurate timeout tracking - Bump Synced=False grace period from 5→10 min (Aurora cold starts) - Add warehouseStatusResponse DTO to avoid leaking internal config - Allow deprovision from provisioning state (not just ready/failed) - Validate max_acu > 0 on provision request - Remove dead namespace override code in createOrgStack - Fix InternalSecret alignment in config_resolution.go - Fix fakeStore missing metadata_store_port/kind/engine handlers Co-Authored-By: Claude Opus 4.6 (1M context) --- config_resolution.go | 6 +- controlplane/configstore/models.go | 1 + controlplane/org_router.go | 9 --- controlplane/provisioner/controller.go | 29 ++++++--- controlplane/provisioner/controller_test.go | 12 ++++ controlplane/provisioning/api.go | 66 ++++++++++++++++----- controlplane/provisioning/api_test.go | 65 +++++++++++++++++--- k8s/kind/control-plane.yaml | 8 ++- 8 files changed, 150 insertions(+), 46 deletions(-) diff --git a/config_resolution.go b/config_resolution.go index 89bf03a..577dda7 100644 --- a/config_resolution.go +++ b/config_resolution.go @@ -40,7 +40,7 @@ type configCLIInputs struct { MaxConnections int ConfigStoreConn string ConfigPollInterval string - InternalSecret string + InternalSecret string WorkerBackend string K8sWorkerImage string K8sWorkerNamespace string @@ -81,7 +81,7 @@ type resolvedConfig struct { AWSRegion string ConfigStoreConn string ConfigPollInterval time.Duration - InternalSecret string + InternalSecret string } func defaultServerConfig() server.Config { @@ -934,6 +934,6 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun AWSRegion: awsRegion, ConfigStoreConn: configStoreConn, ConfigPollInterval: 
configPollInterval, - InternalSecret: internalSecret, + InternalSecret: internalSecret, } } diff --git a/controlplane/configstore/models.go b/controlplane/configstore/models.go index 197cd19..439190d 100644 --- a/controlplane/configstore/models.go +++ b/controlplane/configstore/models.go @@ -116,6 +116,7 @@ type ManagedWarehouse struct { IdentityStatusMessage string `gorm:"size:1024" json:"identity_status_message"` SecretsState ManagedWarehouseProvisioningState `gorm:"size:32" json:"secrets_state"` SecretsStatusMessage string `gorm:"size:1024" json:"secrets_status_message"` + ProvisioningStartedAt *time.Time `json:"provisioning_started_at"` ReadyAt *time.Time `json:"ready_at"` FailedAt *time.Time `json:"failed_at"` CreatedAt time.Time `json:"created_at"` diff --git a/controlplane/org_router.go b/controlplane/org_router.go index 0c038cf..308e5c8 100644 --- a/controlplane/org_router.go +++ b/controlplane/org_router.go @@ -100,15 +100,6 @@ func (tr *OrgRouter) createOrgStack(tc *configstore.OrgConfig) (*OrgStack, error memoryBudget = int64(server.ParseMemoryBytes(tc.MemoryBudget)) } - // Use per-org namespace and service account from warehouse config - if tc.Warehouse != nil && tc.Warehouse.State == configstore.ManagedWarehouseStateReady { - if tc.Warehouse.WorkerIdentity.Namespace != "" { - // Note: OrgReservedPool inherits from the shared pool, so namespace - // overrides are propagated via label selectors, not pool config. - _ = tc.Warehouse.WorkerIdentity.Namespace // used for future per-org pool config - } - } - pool := NewOrgReservedPool(tr.sharedPool, tc.Name, maxWorkers, tr.stsBroker) pool.resolveOrgConfig = func() (*configstore.OrgConfig, error) { snap := tr.configStore.Snapshot() diff --git a/controlplane/provisioner/controller.go b/controlplane/provisioner/controller.go index 5e34553..2fc3f69 100644 --- a/controlplane/provisioner/controller.go +++ b/controlplane/provisioner/controller.go @@ -101,14 +101,17 @@ func (c *Controller) reconcile(ctx context.Context) { func (c *Controller) reconcilePending(ctx context.Context, w *configstore.ManagedWarehouse) { log := slog.With("org", w.OrgID, "phase", "pending") + now := time.Now().UTC() + // Check if a Duckling CR already exists (e.g., controller restart) _, err := c.duckling.Get(ctx, w.OrgID) if err == nil { // CR exists — transition directly to provisioning log.Info("Duckling CR already exists, transitioning to provisioning.") if err := c.store.UpdateWarehouseState(w.OrgID, configstore.ManagedWarehouseStatePending, map[string]interface{}{ - "state": configstore.ManagedWarehouseStateProvisioning, - "status_message": "Duckling CR exists, polling status", + "state": configstore.ManagedWarehouseStateProvisioning, + "status_message": "Duckling CR exists, polling status", + "provisioning_started_at": now, }); err != nil { log.Warn("Failed to update state to provisioning.", "error", err) } @@ -122,14 +125,15 @@ func (c *Controller) reconcilePending(ctx context.Context, w *configstore.Manage _ = c.store.UpdateWarehouseState(w.OrgID, configstore.ManagedWarehouseStatePending, map[string]interface{}{ "state": configstore.ManagedWarehouseStateFailed, "status_message": fmt.Sprintf("Failed to create Duckling CR: %v", err), - "failed_at": time.Now().UTC(), + "failed_at": now, }) return } if err := c.store.UpdateWarehouseState(w.OrgID, configstore.ManagedWarehouseStatePending, map[string]interface{}{ - "state": configstore.ManagedWarehouseStateProvisioning, - "status_message": "Duckling CR created, waiting for resources", + "state": 
configstore.ManagedWarehouseStateProvisioning, + "status_message": "Duckling CR created, waiting for resources", + "provisioning_started_at": now, }); err != nil { log.Warn("Failed to update state to provisioning.", "error", err) } @@ -138,8 +142,15 @@ func (c *Controller) reconcilePending(ctx context.Context, w *configstore.Manage func (c *Controller) reconcileProvisioning(ctx context.Context, w *configstore.ManagedWarehouse) { log := slog.With("org", w.OrgID, "phase", "provisioning") + // Use ProvisioningStartedAt if set (tracks when we entered provisioning state), + // fall back to CreatedAt for warehouses created before this field existed. + startedAt := w.CreatedAt + if w.ProvisioningStartedAt != nil { + startedAt = *w.ProvisioningStartedAt + } + // Check for timeout (30 minutes) - if time.Since(w.CreatedAt) > 30*time.Minute { + if time.Since(startedAt) > 30*time.Minute { log.Warn("Provisioning timed out.") _ = c.store.UpdateWarehouseState(w.OrgID, configstore.ManagedWarehouseStateProvisioning, map[string]interface{}{ "state": configstore.ManagedWarehouseStateFailed, @@ -157,9 +168,9 @@ func (c *Controller) reconcileProvisioning(ctx context.Context, w *configstore.M // Check for Crossplane failure — only fail on persistent sync errors. // Crossplane resources commonly flap Synced=False transiently (e.g., IAM - // eventual consistency), so we only transition to failed if 5+ minutes - // have passed since creation, giving transient errors time to resolve. - if status.SyncedFalseMessage != "" && time.Since(w.CreatedAt) > 5*time.Minute { + // eventual consistency, Aurora cold start delays), so we only transition + // to failed if 10+ minutes have passed, giving transient errors time to resolve. + if status.SyncedFalseMessage != "" && time.Since(startedAt) > 10*time.Minute { log.Warn("Crossplane sync failure.", "message", status.SyncedFalseMessage) _ = c.store.UpdateWarehouseState(w.OrgID, configstore.ManagedWarehouseStateProvisioning, map[string]interface{}{ "state": configstore.ManagedWarehouseStateFailed, diff --git a/controlplane/provisioner/controller_test.go b/controlplane/provisioner/controller_test.go index a90e7d4..9f213cc 100644 --- a/controlplane/provisioner/controller_test.go +++ b/controlplane/provisioner/controller_test.go @@ -60,6 +60,12 @@ func (s *fakeStore) UpdateWarehouseState(orgID string, expectedState configstore w.MetadataStoreState = v.(configstore.ManagedWarehouseProvisioningState) case "metadata_store_endpoint": w.MetadataStore.Endpoint = v.(string) + case "metadata_store_port": + w.MetadataStore.Port = v.(int) + case "metadata_store_kind": + w.MetadataStore.Kind = v.(string) + case "metadata_store_engine": + w.MetadataStore.Engine = v.(string) case "identity_state": w.IdentityState = v.(configstore.ManagedWarehouseProvisioningState) case "worker_identity_namespace": @@ -74,6 +80,9 @@ func (s *fakeStore) UpdateWarehouseState(orgID string, expectedState configstore case "failed_at": t := v.(time.Time) w.FailedAt = &t + case "provisioning_started_at": + t := v.(time.Time) + w.ProvisioningStartedAt = &t } } return nil @@ -146,6 +155,9 @@ func TestReconcilePendingCreatesCR(t *testing.T) { if fs.warehouses["org-a"].State != configstore.ManagedWarehouseStateProvisioning { t.Fatalf("expected provisioning state, got %q", fs.warehouses["org-a"].State) } + if fs.warehouses["org-a"].ProvisioningStartedAt == nil { + t.Fatal("expected provisioning_started_at to be set") + } } func TestReconcileProvisioningAllReady(t *testing.T) { diff --git a/controlplane/provisioning/api.go 
b/controlplane/provisioning/api.go index f7cffba..8c3aff9 100644 --- a/controlplane/provisioning/api.go +++ b/controlplane/provisioning/api.go @@ -3,6 +3,7 @@ package provisioning import ( "errors" "net/http" + "time" "github.com/gin-gonic/gin" "github.com/posthog/duckgres/controlplane/configstore" @@ -28,6 +29,20 @@ type handler struct { store Store } +// warehouseStatusResponse is the public-facing view of warehouse state. +// Only exposes lifecycle status — no infrastructure secrets or internal config. +type warehouseStatusResponse struct { + OrgID string `json:"org_id"` + State configstore.ManagedWarehouseProvisioningState `json:"state"` + StatusMessage string `json:"status_message"` + S3State configstore.ManagedWarehouseProvisioningState `json:"s3_state"` + MetadataStoreState configstore.ManagedWarehouseProvisioningState `json:"metadata_store_state"` + IdentityState configstore.ManagedWarehouseProvisioningState `json:"identity_state"` + SecretsState configstore.ManagedWarehouseProvisioningState `json:"secrets_state"` + ReadyAt *time.Time `json:"ready_at,omitempty"` + FailedAt *time.Time `json:"failed_at,omitempty"` +} + type provisionRequest struct { MetadataStore *provisionMetadataReq `json:"metadata_store,omitempty"` } @@ -51,10 +66,14 @@ func (h *handler) provisionWarehouse(c *gin.Context) { return } - warehouse := &configstore.ManagedWarehouse{} - if req.MetadataStore != nil && req.MetadataStore.Aurora != nil { - warehouse.AuroraMinACU = req.MetadataStore.Aurora.MinACU - warehouse.AuroraMaxACU = req.MetadataStore.Aurora.MaxACU + if req.MetadataStore == nil || req.MetadataStore.Aurora == nil || req.MetadataStore.Aurora.MaxACU <= 0 { + c.JSON(http.StatusBadRequest, gin.H{"error": "metadata_store.aurora.max_acu must be greater than 0"}) + return + } + + warehouse := &configstore.ManagedWarehouse{ + AuroraMinACU: req.MetadataStore.Aurora.MinACU, + AuroraMaxACU: req.MetadataStore.Aurora.MaxACU, } if err := h.store.CreatePendingWarehouse(orgID, warehouse); err != nil { @@ -68,22 +87,27 @@ func (h *handler) provisionWarehouse(c *gin.Context) { func (h *handler) deprovisionWarehouse(c *gin.Context) { orgID := c.Param("id") - // Try CAS from ready -> deleting, then from failed -> deleting. - // This avoids a read-then-write TOCTOU race. - err := h.store.SetWarehouseDeleting(orgID, configstore.ManagedWarehouseStateReady) - if err != nil { - err = h.store.SetWarehouseDeleting(orgID, configstore.ManagedWarehouseStateFailed) + // Try CAS from each deprovisionable state. Order doesn't matter — + // only one will match. This avoids a read-then-write TOCTOU race. 
+ deprovisionableStates := []configstore.ManagedWarehouseProvisioningState{ + configstore.ManagedWarehouseStateReady, + configstore.ManagedWarehouseStateFailed, + configstore.ManagedWarehouseStateProvisioning, } - if err != nil { - if errors.Is(err, gorm.ErrRecordNotFound) { - c.JSON(http.StatusNotFound, gin.H{"error": "warehouse not found"}) + + var err error + for _, state := range deprovisionableStates { + if err = h.store.SetWarehouseDeleting(orgID, state); err == nil { + c.JSON(http.StatusAccepted, gin.H{"status": "deprovisioning started", "org": orgID}) return } - c.JSON(http.StatusConflict, gin.H{"error": "warehouse must be in ready or failed state to deprovision"}) - return } - c.JSON(http.StatusAccepted, gin.H{"status": "deprovisioning started", "org": orgID}) + if errors.Is(err, gorm.ErrRecordNotFound) { + c.JSON(http.StatusNotFound, gin.H{"error": "warehouse not found"}) + return + } + c.JSON(http.StatusConflict, gin.H{"error": "warehouse must be in ready, failed, or provisioning state to deprovision"}) } func (h *handler) getWarehouseStatus(c *gin.Context) { @@ -99,5 +123,15 @@ func (h *handler) getWarehouseStatus(c *gin.Context) { return } - c.JSON(http.StatusOK, warehouse) + c.JSON(http.StatusOK, warehouseStatusResponse{ + OrgID: warehouse.OrgID, + State: warehouse.State, + StatusMessage: warehouse.StatusMessage, + S3State: warehouse.S3State, + MetadataStoreState: warehouse.MetadataStoreState, + IdentityState: warehouse.IdentityState, + SecretsState: warehouse.SecretsState, + ReadyAt: warehouse.ReadyAt, + FailedAt: warehouse.FailedAt, + }) } diff --git a/controlplane/provisioning/api_test.go b/controlplane/provisioning/api_test.go index aae80b7..2de7fb4 100644 --- a/controlplane/provisioning/api_test.go +++ b/controlplane/provisioning/api_test.go @@ -132,6 +132,36 @@ func TestProvisionAutoCreatesOrg(t *testing.T) { } } +func TestProvisionRejectsEmptyBody(t *testing.T) { + store := newFakeStore() + router := newTestRouter(store) + + body := []byte(`{}`) + req := httptest.NewRequest(http.MethodPost, "/api/v1/orgs/analytics/provision", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + router.ServeHTTP(rec, req) + + if rec.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want %d: %s", rec.Code, http.StatusBadRequest, rec.Body.String()) + } +} + +func TestProvisionRejectsZeroMaxACU(t *testing.T) { + store := newFakeStore() + router := newTestRouter(store) + + body := []byte(`{"metadata_store": {"type": "aurora", "aurora": {"min_acu": 0.5, "max_acu": 0}}}`) + req := httptest.NewRequest(http.MethodPost, "/api/v1/orgs/analytics/provision", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + router.ServeHTTP(rec, req) + + if rec.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want %d: %s", rec.Code, http.StatusBadRequest, rec.Body.String()) + } +} + func TestProvisionRejectsExistingNonTerminal(t *testing.T) { store := newFakeStore() store.orgs["analytics"] = &configstore.Org{Name: "analytics"} @@ -217,7 +247,7 @@ func TestDeprovisionFailedWarehouse(t *testing.T) { } } -func TestDeprovisionRejectsProvisioningWarehouse(t *testing.T) { +func TestDeprovisionProvisioningWarehouse(t *testing.T) { store := newFakeStore() store.orgs["analytics"] = &configstore.Org{Name: "analytics"} store.warehouses["analytics"] = &configstore.ManagedWarehouse{ @@ -230,6 +260,27 @@ func TestDeprovisionRejectsProvisioningWarehouse(t *testing.T) { rec := httptest.NewRecorder() 
router.ServeHTTP(rec, req) + if rec.Code != http.StatusAccepted { + t.Fatalf("status = %d, want %d: %s", rec.Code, http.StatusAccepted, rec.Body.String()) + } + if store.warehouses["analytics"].State != configstore.ManagedWarehouseStateDeleting { + t.Fatalf("expected deleting state, got %q", store.warehouses["analytics"].State) + } +} + +func TestDeprovisionRejectsPendingWarehouse(t *testing.T) { + store := newFakeStore() + store.orgs["analytics"] = &configstore.Org{Name: "analytics"} + store.warehouses["analytics"] = &configstore.ManagedWarehouse{ + OrgID: "analytics", + State: configstore.ManagedWarehouseStatePending, + } + router := newTestRouter(store) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/orgs/analytics/deprovision", nil) + rec := httptest.NewRecorder() + router.ServeHTTP(rec, req) + if rec.Code != http.StatusConflict { t.Fatalf("status = %d, want %d: %s", rec.Code, http.StatusConflict, rec.Body.String()) } @@ -254,15 +305,15 @@ func TestGetWarehouseStatus(t *testing.T) { t.Fatalf("status = %d, want %d: %s", rec.Code, http.StatusOK, rec.Body.String()) } - var w configstore.ManagedWarehouse - if err := json.Unmarshal(rec.Body.Bytes(), &w); err != nil { + var resp warehouseStatusResponse + if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil { t.Fatalf("unmarshal: %v", err) } - if w.State != configstore.ManagedWarehouseStateProvisioning { - t.Fatalf("expected provisioning state, got %q", w.State) + if resp.State != configstore.ManagedWarehouseStateProvisioning { + t.Fatalf("expected provisioning state, got %q", resp.State) } - if w.S3State != configstore.ManagedWarehouseStateReady { - t.Fatalf("expected s3 ready, got %q", w.S3State) + if resp.S3State != configstore.ManagedWarehouseStateReady { + t.Fatalf("expected s3 ready, got %q", resp.S3State) } } diff --git a/k8s/kind/control-plane.yaml b/k8s/kind/control-plane.yaml index 093d71f..cc08ab2 100644 --- a/k8s/kind/control-plane.yaml +++ b/k8s/kind/control-plane.yaml @@ -53,8 +53,8 @@ spec: - name: pg containerPort: 5432 protocol: TCP - - name: admin - containerPort: 9090 + - name: api + containerPort: 8080 protocol: TCP volumeMounts: - name: config @@ -96,5 +96,9 @@ spec: port: 5432 targetPort: pg protocol: TCP + - name: api + port: 8080 + targetPort: api + protocol: TCP selector: app: duckgres-control-plane From d87999cfca682c375b1bc6570c88a22fe0fd66a8 Mon Sep 17 00:00:00 2001 From: eric Date: Tue, 24 Mar 2026 11:42:20 -0700 Subject: [PATCH 09/17] Add pod log diagnostics to K8s CI and fix remaining port refs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Dump pod status, describe, and logs on deploy-multitenant-kind failure - Fix control-plane-multitenant-local.yaml port 9090→8080 - Add image/aurora_min_acu/aurora_max_acu columns to kind seed SQL Co-Authored-By: Claude Opus 4.6 (1M context) --- justfile | 2 +- k8s/control-plane-multitenant-local.yaml | 4 ++-- k8s/kind/config-store.seed.sql | 6 ++++++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/justfile b/justfile index ccb781c..e5a4331 100644 --- a/justfile +++ b/justfile @@ -164,7 +164,7 @@ deploy-multitenant-kind: KUBECONFIG="${DUCKGRES_KIND_KUBECONFIG:-/tmp/duckgres-kind-kubeconfig}" kubectl apply -f k8s/managed-warehouse-secrets.yaml KUBECONFIG="${DUCKGRES_KIND_KUBECONFIG:-/tmp/duckgres-kind-kubeconfig}" kubectl apply -f k8s/networkpolicy.yaml KUBECONFIG="${DUCKGRES_KIND_KUBECONFIG:-/tmp/duckgres-kind-kubeconfig}" kubectl apply -f k8s/kind/control-plane.yaml - 
KUBECONFIG="${DUCKGRES_KIND_KUBECONFIG:-/tmp/duckgres-kind-kubeconfig}" kubectl -n duckgres wait deployment/duckgres-control-plane --for=condition=available --timeout=120s + KUBECONFIG="${DUCKGRES_KIND_KUBECONFIG:-/tmp/duckgres-kind-kubeconfig}" kubectl -n duckgres wait deployment/duckgres-control-plane --for=condition=available --timeout=120s || { echo "=== Pod status ==="; KUBECONFIG="${DUCKGRES_KIND_KUBECONFIG:-/tmp/duckgres-kind-kubeconfig}" kubectl -n duckgres get pods -o wide; echo "=== Pod describe ==="; KUBECONFIG="${DUCKGRES_KIND_KUBECONFIG:-/tmp/duckgres-kind-kubeconfig}" kubectl -n duckgres describe pod -l app=duckgres-control-plane; echo "=== Pod logs ==="; KUBECONFIG="${DUCKGRES_KIND_KUBECONFIG:-/tmp/duckgres-kind-kubeconfig}" kubectl -n duckgres logs -l app=duckgres-control-plane --tail=100 --all-containers; exit 1; } # End-to-end local multi-tenant setup: optional OrbStack K8s + config store + control plane [group('dev')] diff --git a/k8s/control-plane-multitenant-local.yaml b/k8s/control-plane-multitenant-local.yaml index 76d98d8..6cdaa4e 100644 --- a/k8s/control-plane-multitenant-local.yaml +++ b/k8s/control-plane-multitenant-local.yaml @@ -63,8 +63,8 @@ spec: - name: flight containerPort: 8815 protocol: TCP - - name: admin - containerPort: 9090 + - name: api + containerPort: 8080 protocol: TCP volumeMounts: - name: config diff --git a/k8s/kind/config-store.seed.sql b/k8s/kind/config-store.seed.sql index 1dc6d0f..db86c4d 100644 --- a/k8s/kind/config-store.seed.sql +++ b/k8s/kind/config-store.seed.sql @@ -5,6 +5,9 @@ SET updated_at = NOW(); INSERT INTO duckgres_managed_warehouses ( org_id, + image, + aurora_min_acu, + aurora_max_acu, warehouse_database_region, warehouse_database_endpoint, warehouse_database_port, @@ -58,6 +61,9 @@ INSERT INTO duckgres_managed_warehouses ( ) VALUES ( 'local', + '', + 0, + 0, 'kind-dev', 'duckgres-local-warehouse-db', 5432, From 1b2333b37b7454baf6df23061288978440f89a0f Mon Sep 17 00:00:00 2001 From: eric Date: Tue, 24 Mar 2026 11:57:23 -0700 Subject: [PATCH 10/17] =?UTF-8?q?Fix=20Gin=20route=20conflict:=20align=20a?= =?UTF-8?q?dmin=20API=20org=20param=20:name=20=E2=86=92=20:id?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The admin and provisioning APIs share the /api/v1 router group. Gin requires the same wildcard name on a given path segment — :name (admin) and :id (provisioning) on /orgs/:param conflicted, causing a panic at startup. Standardize on :id across both APIs. Co-Authored-By: Claude Opus 4.6 (1M context) --- controlplane/admin/api.go | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/controlplane/admin/api.go b/controlplane/admin/api.go index 3a2256e..ccb32bb 100644 --- a/controlplane/admin/api.go +++ b/controlplane/admin/api.go @@ -14,7 +14,7 @@ import ( "gorm.io/gorm/clause" ) -var errWarehousePayloadNotAllowed = errors.New("warehouse payload must be updated via /orgs/:name/warehouse") +var errWarehousePayloadNotAllowed = errors.New("warehouse payload must be updated via /orgs/:id/warehouse") // WorkerStatus represents a worker's current status for the API. 
type WorkerStatus struct { @@ -71,11 +71,11 @@ func registerAPIWithStore(r *gin.RouterGroup, store apiStore, info OrgStackInfo) // Orgs CRUD r.GET("/orgs", h.listOrgs) r.POST("/orgs", h.createOrg) - r.GET("/orgs/:name", h.getOrg) - r.PUT("/orgs/:name", h.updateOrg) - r.DELETE("/orgs/:name", h.deleteOrg) - r.GET("/orgs/:name/warehouse", h.getManagedWarehouse) - r.PUT("/orgs/:name/warehouse", h.putManagedWarehouse) + r.GET("/orgs/:id", h.getOrg) + r.PUT("/orgs/:id", h.updateOrg) + r.DELETE("/orgs/:id", h.deleteOrg) + r.GET("/orgs/:id/warehouse", h.getManagedWarehouse) + r.PUT("/orgs/:id/warehouse", h.putManagedWarehouse) // Users CRUD r.GET("/users", h.listUsers) @@ -484,7 +484,7 @@ func (h *apiHandler) createOrg(c *gin.Context) { } func (h *apiHandler) getOrg(c *gin.Context) { - name := c.Param("name") + name := c.Param("id") org, err := h.store.GetOrg(name) if err != nil { c.JSON(http.StatusNotFound, gin.H{"error": "org not found"}) @@ -494,7 +494,7 @@ func (h *apiHandler) getOrg(c *gin.Context) { } func (h *apiHandler) updateOrg(c *gin.Context) { - name := c.Param("name") + name := c.Param("id") var updates configstore.Org if err := c.ShouldBindJSON(&updates); err != nil { c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) @@ -517,7 +517,7 @@ func (h *apiHandler) updateOrg(c *gin.Context) { } func (h *apiHandler) deleteOrg(c *gin.Context) { - name := c.Param("name") + name := c.Param("id") ok, err := h.store.DeleteOrg(name) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) @@ -538,7 +538,7 @@ func validateOrgMutationPayload(org *configstore.Org) error { } func (h *apiHandler) getManagedWarehouse(c *gin.Context) { - warehouse, err := h.store.GetManagedWarehouse(c.Param("name")) + warehouse, err := h.store.GetManagedWarehouse(c.Param("id")) if err != nil { if errors.Is(err, gorm.ErrRecordNotFound) { c.JSON(http.StatusNotFound, gin.H{"error": "managed warehouse not found"}) @@ -551,7 +551,7 @@ func (h *apiHandler) getManagedWarehouse(c *gin.Context) { } func (h *apiHandler) putManagedWarehouse(c *gin.Context) { - orgID := c.Param("name") + orgID := c.Param("id") var req managedWarehouseRequest if err := decodeStrictWarehouseRequest(c, &req); err != nil { c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) From 2adc0de4e337729d0b02890bd1cb8641cd70a85b Mon Sep 17 00:00:00 2001 From: eric Date: Tue, 24 Mar 2026 12:35:23 -0700 Subject: [PATCH 11/17] Move provisioning warehouse status to /warehouse/status to avoid route conflict Admin API already registers GET /orgs/:id/warehouse (full warehouse config). Provisioning status endpoint moved to /orgs/:id/warehouse/status (lifecycle-only DTO) to avoid duplicate route panic. 
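For illustration, a minimal Gin sketch (not the project's actual route wiring) of the resulting layout: the status endpoint lives on a distinct literal segment, so GET /orgs/:id/warehouse is never registered twice and the duplicate-route panic described above cannot recur. Handler bodies and the listen address below are placeholders.

// Illustrative sketch only; handlers and port are made up for the example.
package main

import (
	"net/http"

	"github.com/gin-gonic/gin"
)

func main() {
	r := gin.New()
	api := r.Group("/api/v1")

	// Admin API: full warehouse config, using the same :id wildcard as the other /orgs routes.
	api.GET("/orgs/:id/warehouse", func(c *gin.Context) {
		c.JSON(http.StatusOK, gin.H{"org": c.Param("id"), "view": "full warehouse config"})
	})

	// Provisioning API: lifecycle-only status DTO on a separate path segment.
	api.GET("/orgs/:id/warehouse/status", func(c *gin.Context) {
		c.JSON(http.StatusOK, gin.H{"org": c.Param("id"), "view": "lifecycle status"})
	})

	_ = r.Run(":8080")
}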
Co-Authored-By: Claude Opus 4.6 (1M context) --- controlplane/provisioning/api.go | 2 +- controlplane/provisioning/api_test.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/controlplane/provisioning/api.go b/controlplane/provisioning/api.go index 8c3aff9..9a8b757 100644 --- a/controlplane/provisioning/api.go +++ b/controlplane/provisioning/api.go @@ -22,7 +22,7 @@ func RegisterAPI(r *gin.RouterGroup, store Store) { h := &handler{store: store} r.POST("/orgs/:id/provision", h.provisionWarehouse) r.POST("/orgs/:id/deprovision", h.deprovisionWarehouse) - r.GET("/orgs/:id/warehouse", h.getWarehouseStatus) + r.GET("/orgs/:id/warehouse/status", h.getWarehouseStatus) } type handler struct { diff --git a/controlplane/provisioning/api_test.go b/controlplane/provisioning/api_test.go index 2de7fb4..b1d9dcc 100644 --- a/controlplane/provisioning/api_test.go +++ b/controlplane/provisioning/api_test.go @@ -297,7 +297,7 @@ func TestGetWarehouseStatus(t *testing.T) { } router := newTestRouter(store) - req := httptest.NewRequest(http.MethodGet, "/api/v1/orgs/analytics/warehouse", nil) + req := httptest.NewRequest(http.MethodGet, "/api/v1/orgs/analytics/warehouse/status", nil) rec := httptest.NewRecorder() router.ServeHTTP(rec, req) @@ -321,7 +321,7 @@ func TestGetWarehouseNotFound(t *testing.T) { store := newFakeStore() router := newTestRouter(store) - req := httptest.NewRequest(http.MethodGet, "/api/v1/orgs/unknown/warehouse", nil) + req := httptest.NewRequest(http.MethodGet, "/api/v1/orgs/unknown/warehouse/status", nil) rec := httptest.NewRecorder() router.ServeHTTP(rec, req) From 9a414ed38e9e8ad7993eac2bba74da4cfd2457c5 Mon Sep 17 00:00:00 2001 From: eric Date: Tue, 24 Mar 2026 12:49:07 -0700 Subject: [PATCH 12/17] Add readiness probe to kind control-plane deployment Pod was Running but never became Ready without a readiness probe, causing kubectl wait --for=condition=available to time out. Probe hits GET /health on the API server port (8080). Co-Authored-By: Claude Opus 4.6 (1M context) --- k8s/kind/control-plane.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/k8s/kind/control-plane.yaml b/k8s/kind/control-plane.yaml index cc08ab2..3c53a77 100644 --- a/k8s/kind/control-plane.yaml +++ b/k8s/kind/control-plane.yaml @@ -64,6 +64,12 @@ spec: mountPath: /certs - name: data mountPath: /data + readinessProbe: + httpGet: + path: /health + port: api + initialDelaySeconds: 5 + periodSeconds: 5 securityContext: allowPrivilegeEscalation: false resources: From cd03472d46e6dd67edcfda12ec508d5b48c9346a Mon Sep 17 00:00:00 2001 From: eric Date: Tue, 24 Mar 2026 13:49:16 -0700 Subject: [PATCH 13/17] Retrigger CI From 1aa8597dc36d2de058497471f629a194c0de7c85 Mon Sep 17 00:00:00 2001 From: eric Date: Tue, 24 Mar 2026 16:35:50 -0700 Subject: [PATCH 14/17] Keep metrics on :9090, API + dashboard on :8080 The metrics server (:9090) stays running for Prometheus scraping and health probes. The API server (:8080) serves admin API, provisioning API, and dashboard only. Previously the API server replaced the metrics server, killing /metrics during the switchover. 
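For illustration, a minimal sketch (not the control plane's actual startup code) of the split described above: a plain net/http server keeps /metrics on :9090 for Prometheus scraping while a separate Gin engine serves the API and dashboard on :8080. The handler set is reduced to a placeholder /health route.

// Illustrative sketch only; real wiring registers the admin, provisioning, and dashboard handlers.
package main

import (
	"net/http"

	"github.com/gin-gonic/gin"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// Metrics server stays on :9090 so /metrics never disappears during API switchover.
	metricsMux := http.NewServeMux()
	metricsMux.Handle("/metrics", promhttp.Handler())
	go func() { _ = http.ListenAndServe(":9090", metricsMux) }()

	// API server on :8080 serves the admin API, provisioning API, and dashboard.
	engine := gin.New()
	engine.Use(gin.Recovery())
	engine.GET("/health", func(c *gin.Context) { c.String(http.StatusOK, "ok") })
	_ = engine.Run(":8080")
}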
Co-Authored-By: Claude Opus 4.6 (1M context) --- controlplane/control.go | 16 +++++++++------- controlplane/multitenant.go | 7 +++---- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/controlplane/control.go b/controlplane/control.go index a974853..c76022f 100644 --- a/controlplane/control.go +++ b/controlplane/control.go @@ -113,6 +113,7 @@ type ControlPlane struct { // Multi-tenant fields (non-nil in remote multitenant mode) orgRouter OrgRouterInterface configStore ConfigStoreInterface + apiServer *http.Server // API server on :8080 (shut down on graceful exit) } // ConfigStoreInterface abstracts the config store for the control plane. @@ -327,13 +328,7 @@ func RunControlPlane(cfg ControlPlaneConfig) { } cp.configStore = store cp.orgRouter = adapter - // Replace the simple metrics server with the unified API server - if cfg.MetricsServer != nil { - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - _ = cfg.MetricsServer.Shutdown(ctx) - cancel() - } - cfg.MetricsServer = apiServer + cp.apiServer = apiServer cp.cfg = cfg _ = store // keep linter happy } else { @@ -956,6 +951,13 @@ func (cp *ControlPlane) handleUpgrade() { } cancel() } + if cp.apiServer != nil { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + if err := cp.apiServer.Shutdown(ctx); err != nil { + slog.Warn("API server shutdown failed.", "error", err) + } + cancel() + } // Stop ACME managers so the new CP can bind port 80 (HTTP-01) or // manage DNS records. Nil out after close so drainAfterUpgrade diff --git a/controlplane/multitenant.go b/controlplane/multitenant.go index 62234a1..a1cdf4d 100644 --- a/controlplane/multitenant.go +++ b/controlplane/multitenant.go @@ -17,7 +17,6 @@ import ( "github.com/posthog/duckgres/controlplane/provisioner" "github.com/posthog/duckgres/controlplane/provisioning" "github.com/posthog/duckgres/server" - "github.com/prometheus/client_golang/prometheus/promhttp" ) // orgRouterAdapter wraps OrgRouter to implement both OrgRouterInterface @@ -184,13 +183,13 @@ func SetupMultiTenant( slog.Info("Generated internal secret (pass via --internal-secret or DUCKGRES_INTERNAL_SECRET to set explicitly).", "secret", internalSecret) } - // Set up unified API server (admin + provisioning on single port) + // Set up API server (admin + provisioning + dashboard on :8080). + // The existing metrics server on :9090 stays running separately. gin.SetMode(gin.ReleaseMode) engine := gin.New() engine.Use(gin.Recovery()) - // Unauthenticated endpoints - engine.GET("/metrics", gin.WrapH(promhttp.Handler())) + // Health endpoint (also available on :9090 via metrics server) engine.GET("/health", func(c *gin.Context) { c.String(http.StatusOK, "ok") }) From 3a4bf8ac088b274e284b01302b120b5d4f6563cb Mon Sep 17 00:00:00 2001 From: Benjamin Knofe-Vider Date: Wed, 25 Mar 2026 13:07:53 +0100 Subject: [PATCH 15/17] refactor: remove aws-account-id, use IAM role ARN from config store The STS broker no longer constructs role ARNs from account ID + org name. Instead it receives the full IAM role ARN from the config store's WorkerIdentity.IAMRoleARN field (populated from the Duckling status). 
Removed: - --aws-account-id flag / DUCKGRES_AWS_ACCOUNT_ID env var - STSBroker.accountID field - STSBroker.RoleARNForOrg() method - Fallback ARN construction in shared_worker_activator Kept: - --aws-region / DUCKGRES_AWS_REGION (still needed for STS client) --- config_resolution.go | 10 ---------- controlplane/control.go | 3 +-- controlplane/multitenant.go | 4 ++-- controlplane/shared_worker_activator.go | 3 --- controlplane/sts_broker.go | 14 +++----------- main.go | 6 +----- 6 files changed, 7 insertions(+), 33 deletions(-) diff --git a/config_resolution.go b/config_resolution.go index 4b245ff..486872e 100644 --- a/config_resolution.go +++ b/config_resolution.go @@ -52,7 +52,6 @@ type configCLIInputs struct { K8sWorkerServiceAccount string K8sMaxWorkers int K8sSharedWarmTarget int - AWSAccountID string AWSRegion string QueryLog bool } @@ -75,7 +74,6 @@ type resolvedConfig struct { K8sWorkerServiceAccount string K8sMaxWorkers int K8sSharedWarmTarget int - AWSAccountID string AWSRegion string ConfigStoreConn string ConfigPollInterval time.Duration @@ -133,7 +131,6 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun var k8sWorkerPort int var k8sWorkerSecret, k8sWorkerConfigMap, k8sWorkerImagePullPolicy, k8sWorkerServiceAccount string var k8sMaxWorkers, k8sSharedWarmTarget int - var awsAccountID string var awsRegion string var configStoreConn string var configPollInterval time.Duration @@ -635,9 +632,6 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun warn("Invalid DUCKGRES_K8S_SHARED_WARM_TARGET: " + err.Error()) } } - if v := getenv("DUCKGRES_AWS_ACCOUNT_ID"); v != "" { - awsAccountID = v - } if v := getenv("DUCKGRES_AWS_REGION"); v != "" { awsRegion = v } @@ -838,9 +832,6 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun if cli.Set["k8s-shared-warm-target"] { k8sSharedWarmTarget = cli.K8sSharedWarmTarget } - if cli.Set["aws-account-id"] { - awsAccountID = cli.AWSAccountID - } if cli.Set["aws-region"] { awsRegion = cli.AWSRegion } @@ -913,7 +904,6 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun K8sWorkerServiceAccount: k8sWorkerServiceAccount, K8sMaxWorkers: k8sMaxWorkers, K8sSharedWarmTarget: k8sSharedWarmTarget, - AWSAccountID: awsAccountID, AWSRegion: awsRegion, ConfigStoreConn: configStoreConn, ConfigPollInterval: configPollInterval, diff --git a/controlplane/control.go b/controlplane/control.go index 3269641..482a808 100644 --- a/controlplane/control.go +++ b/controlplane/control.go @@ -81,8 +81,7 @@ type K8sConfig struct { ServiceAccount string // ServiceAccount name for worker pods (default: "default") MaxWorkers int // Global cap for the shared K8s worker pool (0 = auto-derived) SharedWarmTarget int // Neutral shared warm-worker target for K8s multi-tenant mode (0 = disabled) - AWSAccountID string // AWS account ID for constructing IAM role ARNs (STS credential brokering) - AWSRegion string // AWS region for STS client + AWSRegion string // AWS region for STS client } // ControlPlane manages the TCP listener and routes connections to Flight SQL workers. 
diff --git a/controlplane/multitenant.go b/controlplane/multitenant.go index a1cdf4d..13d0b1d 100644 --- a/controlplane/multitenant.go +++ b/controlplane/multitenant.go @@ -143,9 +143,9 @@ func SetupMultiTenant( // Initialize STS broker for credential brokering (best-effort) var stsBroker *STSBroker - if cfg.K8s.AWSAccountID != "" { + if cfg.K8s.AWSRegion != "" { var err error - stsBroker, err = NewSTSBroker(context.Background(), cfg.K8s.AWSRegion, cfg.K8s.AWSAccountID) + stsBroker, err = NewSTSBroker(context.Background(), cfg.K8s.AWSRegion) if err != nil { slog.Warn("STS broker unavailable, workers will use pod identity for S3.", "error", err) } diff --git a/controlplane/shared_worker_activator.go b/controlplane/shared_worker_activator.go index d816ce6..050ec81 100644 --- a/controlplane/shared_worker_activator.go +++ b/controlplane/shared_worker_activator.go @@ -113,9 +113,6 @@ func (a *SharedWorkerActivator) BuildActivationRequest(ctx context.Context, org dl.S3SecretKey = secretKey case strings.EqualFold(warehouse.S3.Provider, "aws"): roleARN := warehouse.WorkerIdentity.IAMRoleARN - if roleARN == "" && a.stsBroker != nil { - roleARN = a.stsBroker.RoleARNForOrg(orgName(org)) - } if roleARN != "" && a.stsBroker != nil { creds, err := a.stsBroker.AssumeRole(ctx, roleARN) if err != nil { diff --git a/controlplane/sts_broker.go b/controlplane/sts_broker.go index 44b9431..6a9682a 100644 --- a/controlplane/sts_broker.go +++ b/controlplane/sts_broker.go @@ -19,8 +19,7 @@ const ( // STSBroker brokers short-lived AWS credentials by assuming per-org IAM roles. type STSBroker struct { - client *sts.Client - accountID string + client *sts.Client } // AssumedCredentials holds the temporary credentials from STS AssumeRole. @@ -32,8 +31,7 @@ type AssumedCredentials struct { } // NewSTSBroker creates an STS broker using the control plane's own credentials. -// accountID is the AWS account ID used to construct deterministic role ARNs. -func NewSTSBroker(ctx context.Context, region, accountID string) (*STSBroker, error) { +func NewSTSBroker(ctx context.Context, region string) (*STSBroker, error) { opts := []func(*awsconfig.LoadOptions) error{} if region != "" { opts = append(opts, awsconfig.WithRegion(region)) @@ -43,16 +41,10 @@ func NewSTSBroker(ctx context.Context, region, accountID string) (*STSBroker, er return nil, fmt.Errorf("load AWS config: %w", err) } return &STSBroker{ - client: sts.NewFromConfig(cfg), - accountID: accountID, + client: sts.NewFromConfig(cfg), }, nil } -// RoleARNForOrg returns the deterministic IAM role ARN for an org. -func (b *STSBroker) RoleARNForOrg(orgID string) string { - return fmt.Sprintf("arn:aws:iam::%s:role/duckling-%s", b.accountID, orgID) -} - // AssumeRole mints short-lived credentials for the given IAM role ARN. 
func (b *STSBroker) AssumeRole(ctx context.Context, roleARN string) (*AssumedCredentials, error) { durationSeconds := int32(stsSessionDuration.Seconds()) diff --git a/main.go b/main.go index 980ce3f..32ad2dd 100644 --- a/main.go +++ b/main.go @@ -239,7 +239,6 @@ func main() { k8sWorkerServiceAccount := flag.String("k8s-worker-service-account", "", "ServiceAccount name for K8s worker pods (env: DUCKGRES_K8S_WORKER_SERVICE_ACCOUNT)") k8sMaxWorkers := flag.Int("k8s-max-workers", 0, "Max K8s workers in the shared pool, 0=auto-derived (env: DUCKGRES_K8S_MAX_WORKERS)") k8sSharedWarmTarget := flag.Int("k8s-shared-warm-target", 0, "Neutral shared warm-worker target for K8s multi-tenant mode, 0=disabled (env: DUCKGRES_K8S_SHARED_WARM_TARGET)") - awsAccountID := flag.String("aws-account-id", "", "AWS account ID for STS credential brokering (env: DUCKGRES_AWS_ACCOUNT_ID)") awsRegion := flag.String("aws-region", "", "AWS region for STS client (env: DUCKGRES_AWS_REGION)") // Config store flags (multi-tenant mode) @@ -304,7 +303,6 @@ func main() { fmt.Fprintf(os.Stderr, " DUCKGRES_INTERNAL_SECRET Shared secret for API authentication\n") fmt.Fprintf(os.Stderr, " DUCKGRES_K8S_MAX_WORKERS Max K8s workers in the shared pool\n") fmt.Fprintf(os.Stderr, " DUCKGRES_K8S_SHARED_WARM_TARGET Neutral shared warm-worker target for K8s multi-tenant mode\n") - fmt.Fprintf(os.Stderr, " DUCKGRES_AWS_ACCOUNT_ID AWS account ID for STS credential brokering\n") fmt.Fprintf(os.Stderr, " DUCKGRES_AWS_REGION AWS region for STS client\n") fmt.Fprintf(os.Stderr, " DUCKGRES_LOG_LEVEL Log level: debug, info, warn, error (default: info)\n") fmt.Fprintf(os.Stderr, "\nPrecedence: CLI flags > environment variables > config file > defaults\n") @@ -418,7 +416,6 @@ func main() { K8sWorkerServiceAccount: *k8sWorkerServiceAccount, K8sMaxWorkers: *k8sMaxWorkers, K8sSharedWarmTarget: *k8sSharedWarmTarget, - AWSAccountID: *awsAccountID, AWSRegion: *awsRegion, QueryLog: *queryLog, }, os.Getenv, func(msg string) { @@ -564,8 +561,7 @@ func main() { ServiceAccount: resolved.K8sWorkerServiceAccount, MaxWorkers: resolved.K8sMaxWorkers, SharedWarmTarget: resolved.K8sSharedWarmTarget, - AWSAccountID: resolved.AWSAccountID, - AWSRegion: resolved.AWSRegion, + AWSRegion: resolved.AWSRegion, }, } controlplane.RunControlPlane(cpCfg) From 2599a275992718eb91f7d335acf43958ac73a054 Mon Sep 17 00:00:00 2001 From: Benjamin Knofe-Vider Date: Wed, 25 Mar 2026 15:00:49 +0100 Subject: [PATCH 16/17] refactor(provisioner): update DucklingStatus for nested XRD schema Align DucklingStatus and parseDucklingStatus with the new Duckling XRD: status.metadataStore: type, endpoint, password, user, database status.dataStore: type, bucketName status.iamRoleArn Controller now writes: - metadata_store_username and metadata_store_database_name from status - worker_identity_iam_role_arn from status.iamRoleArn - metadata_store_kind from status.metadataStore.type Create also includes dataStore.type: s3bucket in the Duckling CR spec. 
--- controlplane/provisioner/controller.go | 19 ++++++----- controlplane/provisioner/controller_test.go | 32 +++++++++++++++--- controlplane/provisioner/k8s_client.go | 37 +++++++++++++++++---- 3 files changed, 69 insertions(+), 19 deletions(-) diff --git a/controlplane/provisioner/controller.go b/controlplane/provisioner/controller.go index 2fc3f69..5e2c314 100644 --- a/controlplane/provisioner/controller.go +++ b/controlplane/provisioner/controller.go @@ -185,27 +185,28 @@ func (c *Controller) reconcileProvisioning(ctx context.Context, w *configstore.M // K8s workloads (namespace, deployment, service) are managed by the duckgres Helm chart. updates := map[string]interface{}{} - if status.BucketName != "" && w.S3State != configstore.ManagedWarehouseStateReady { + if status.DataStore.BucketName != "" && w.S3State != configstore.ManagedWarehouseStateReady { updates["s3_state"] = configstore.ManagedWarehouseStateReady - updates["s3_bucket"] = status.BucketName + updates["s3_bucket"] = status.DataStore.BucketName } - if status.AuroraEndpoint != "" && w.MetadataStoreState != configstore.ManagedWarehouseStateReady { + if status.MetadataStore.Endpoint != "" && w.MetadataStoreState != configstore.ManagedWarehouseStateReady { updates["metadata_store_state"] = configstore.ManagedWarehouseStateReady - updates["metadata_store_endpoint"] = status.AuroraEndpoint + updates["metadata_store_endpoint"] = status.MetadataStore.Endpoint updates["metadata_store_port"] = 5432 - updates["metadata_store_kind"] = "aurora" + updates["metadata_store_kind"] = status.MetadataStore.Type updates["metadata_store_engine"] = "postgres" + updates["metadata_store_username"] = status.MetadataStore.User + updates["metadata_store_database_name"] = status.MetadataStore.Database } - if status.AuroraPassword != "" && w.SecretsState != configstore.ManagedWarehouseStateReady { + if status.MetadataStore.Password != "" && w.SecretsState != configstore.ManagedWarehouseStateReady { updates["secrets_state"] = configstore.ManagedWarehouseStateReady } - // Crossplane Ready condition means all composed resources (Aurora, S3, IAM) are reconciled. - // We use this for the identity component (IAM role + pod identity association). - if status.ReadyCondition && w.IdentityState != configstore.ManagedWarehouseStateReady { + if status.IAMRoleARN != "" && w.IdentityState != configstore.ManagedWarehouseStateReady { updates["identity_state"] = configstore.ManagedWarehouseStateReady + updates["worker_identity_iam_role_arn"] = status.IAMRoleARN } // Infrastructure is ready when S3, Aurora, secrets, and IAM are all provisioned. 
diff --git a/controlplane/provisioner/controller_test.go b/controlplane/provisioner/controller_test.go index 9f213cc..b554708 100644 --- a/controlplane/provisioner/controller_test.go +++ b/controlplane/provisioner/controller_test.go @@ -68,8 +68,14 @@ func (s *fakeStore) UpdateWarehouseState(orgID string, expectedState configstore w.MetadataStore.Engine = v.(string) case "identity_state": w.IdentityState = v.(configstore.ManagedWarehouseProvisioningState) + case "worker_identity_iam_role_arn": + w.WorkerIdentity.IAMRoleARN = v.(string) case "worker_identity_namespace": w.WorkerIdentity.Namespace = v.(string) + case "metadata_store_username": + w.MetadataStore.Username = v.(string) + case "metadata_store_database_name": + w.MetadataStore.DatabaseName = v.(string) case "secrets_state": w.SecretsState = v.(configstore.ManagedWarehouseProvisioningState) case "warehouse_database_state": @@ -179,9 +185,18 @@ func TestReconcileProvisioningAllReady(t *testing.T) { "namespace": ducklingNamespace, }, "status": map[string]interface{}{ - "bucketName": "org-b-bucket", - "auroraEndpoint": "org-b.cluster.us-east-1.rds.amazonaws.com", - "auroraPassword": "supersecret123", + "metadataStore": map[string]interface{}{ + "type": "aurora", + "endpoint": "org-b.cluster.us-east-1.rds.amazonaws.com", + "password": "supersecret123", + "user": "postgres", + "database": "postgres", + }, + "dataStore": map[string]interface{}{ + "type": "s3bucket", + "bucketName": "org-b-bucket", + }, + "iamRoleArn": "arn:aws:iam::123456789012:role/duckling-org-b", "conditions": []interface{}{ map[string]interface{}{ "type": "Ready", @@ -219,6 +234,15 @@ func TestReconcileProvisioningAllReady(t *testing.T) { if w.MetadataStore.Port != 5432 { t.Fatalf("expected aurora port 5432, got %d", w.MetadataStore.Port) } + if w.MetadataStore.Username != "postgres" { + t.Fatalf("expected username postgres, got %q", w.MetadataStore.Username) + } + if w.MetadataStore.DatabaseName != "postgres" { + t.Fatalf("expected database_name postgres, got %q", w.MetadataStore.DatabaseName) + } + if w.WorkerIdentity.IAMRoleARN != "arn:aws:iam::123456789012:role/duckling-org-b" { + t.Fatalf("expected IAM role ARN, got %q", w.WorkerIdentity.IAMRoleARN) + } if w.ReadyAt == nil { t.Fatal("expected ready_at to be set") } @@ -321,7 +345,7 @@ func TestParseDucklingStatusEmpty(t *testing.T) { if err != nil { t.Fatalf("unexpected error: %v", err) } - if status.BucketName != "" || status.AuroraEndpoint != "" || status.AuroraPassword != "" { + if status.DataStore.BucketName != "" || status.MetadataStore.Endpoint != "" || status.MetadataStore.Password != "" { t.Fatal("expected empty status for CR without status field") } } diff --git a/controlplane/provisioner/k8s_client.go b/controlplane/provisioner/k8s_client.go index cc851c6..6221564 100644 --- a/controlplane/provisioner/k8s_client.go +++ b/controlplane/provisioner/k8s_client.go @@ -25,9 +25,18 @@ const ducklingNamespace = "crossplane-system" // The Duckling composition provisions AWS infrastructure (Aurora, S3, IAM) // but not K8s workloads — those are managed by the duckgres Helm chart. 
type DucklingStatus struct { - BucketName string - AuroraEndpoint string - AuroraPassword string + MetadataStore struct { + Type string + Endpoint string + Password string + User string + Database string + } + DataStore struct { + Type string + BucketName string + } + IAMRoleARN string ReadyCondition bool SyncedFalseMessage string } @@ -73,6 +82,9 @@ func (d *DucklingClient) Create(ctx context.Context, orgID string, minACU, maxAC "maxACU": maxACU, }, }, + "dataStore": map[string]interface{}{ + "type": "s3bucket", + }, }, }, } @@ -109,9 +121,22 @@ func parseDucklingStatus(cr *unstructured.Unstructured) (*DucklingStatus, error) } ds := &DucklingStatus{ - BucketName: getNestedString(status, "bucketName"), - AuroraEndpoint: getNestedString(status, "auroraEndpoint"), - AuroraPassword: getNestedString(status, "auroraPassword"), + IAMRoleARN: getNestedString(status, "iamRoleArn"), + } + + // Parse status.metadataStore + if md, ok := status["metadataStore"].(map[string]interface{}); ok { + ds.MetadataStore.Type = getNestedString(md, "type") + ds.MetadataStore.Endpoint = getNestedString(md, "endpoint") + ds.MetadataStore.Password = getNestedString(md, "password") + ds.MetadataStore.User = getNestedString(md, "user") + ds.MetadataStore.Database = getNestedString(md, "database") + } + + // Parse status.dataStore + if store, ok := status["dataStore"].(map[string]interface{}); ok { + ds.DataStore.Type = getNestedString(store, "type") + ds.DataStore.BucketName = getNestedString(store, "bucketName") } // Parse conditions From fdcb66c45ff75f912d1db138d918e29e05618204 Mon Sep 17 00:00:00 2001 From: Benjamin Knofe-Vider Date: Wed, 25 Mar 2026 15:39:23 +0100 Subject: [PATCH 17/17] feat: add /api/v1/health endpoint for K8s probes --- controlplane/control.go | 3 --- controlplane/multitenant.go | 2 +- main.go | 3 --- 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/controlplane/control.go b/controlplane/control.go index 482a808..9a638db 100644 --- a/controlplane/control.go +++ b/controlplane/control.go @@ -1185,9 +1185,6 @@ func (cp *ControlPlane) recoverMetricsAfterFailedReload() { addr := cp.cfg.MetricsServer.Addr mux := http.NewServeMux() mux.Handle("/metrics", promhttp.Handler()) - mux.HandleFunc("/health", func(w http.ResponseWriter, _ *http.Request) { - w.WriteHeader(http.StatusOK) - }) newSrv := &http.Server{Addr: addr, Handler: mux} cp.cfg.MetricsServer = newSrv go func() { diff --git a/controlplane/multitenant.go b/controlplane/multitenant.go index 13d0b1d..8e94700 100644 --- a/controlplane/multitenant.go +++ b/controlplane/multitenant.go @@ -189,7 +189,7 @@ func SetupMultiTenant( engine := gin.New() engine.Use(gin.Recovery()) - // Health endpoint (also available on :9090 via metrics server) + // Health endpoint (unauthenticated, used by K8s probes) engine.GET("/health", func(c *gin.Context) { c.String(http.StatusOK, "ok") }) diff --git a/main.go b/main.go index 32ad2dd..d8712e7 100644 --- a/main.go +++ b/main.go @@ -153,9 +153,6 @@ func env(key, defaultVal string) string { func initMetrics() *http.Server { mux := http.NewServeMux() mux.Handle("/metrics", promhttp.Handler()) - mux.HandleFunc("/health", func(w http.ResponseWriter, _ *http.Request) { - w.WriteHeader(http.StatusOK) - }) srv := &http.Server{ Addr: ":9090", Handler: mux,
[Extraction residue from the admin dashboard HTML diff: the workers table header row "ID / Team / Status / Sessions" becomes "ID / Org / Status / Sessions", and the row template replaces ${esc(w.team)} with ${esc(w.org)} alongside ${w.id}, ${esc(w.status)}, and ${w.active_sessions}.]