diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3f3cf00..52a6d74 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
 
 ## [Unreleased]
 
+### Added
+
+- Add the role group as a node attribute ([#63]).
+
+[#63]: https://github.com/stackabletech/opensearch-operator/pull/63
+
 ## [25.11.0] - 2025-11-07
 
 ## [25.11.0-rc1] - 2025-11-06
diff --git a/docs/modules/opensearch/pages/usage-guide/scaling.adoc b/docs/modules/opensearch/pages/usage-guide/scaling.adoc
new file mode 100644
index 0000000..275b467
--- /dev/null
+++ b/docs/modules/opensearch/pages/usage-guide/scaling.adoc
@@ -0,0 +1,258 @@
= Scaling OpenSearch clusters
:description: OpenSearch clusters can be scaled after provisioning, but manual steps are required.

OpenSearch clusters can be scaled after provisioning.
CPU and memory settings are easy to adjust, as described in xref:opensearch:usage-guide/storage-resource-configuration.adoc#_resource_requests[Resource Requests].
However, when changing the number of nodes or resizing volumes, the following considerations must be kept in mind.

Horizontal scaling, i.e. adjusting the replica count of role groups, is straightforward for non-data nodes: simply modify the OpenSearchCluster specification.
The number of data nodes can also be increased this way.
Reducing the number of data nodes, however, requires manual intervention:
if a pod that manages data is simply shut down, its data becomes inaccessible.
The data must therefore be drained from the affected nodes before they are removed.

Vertical scaling of storage, i.e. changing the volume size of nodes, is not supported by the operator.
Whether the size of a volume can be changed at all depends on its CSI driver.
OpenSearch allows multiple data paths within a single data node, but adding volumes as additional data paths typically does not resolve low disk space issues, because existing data is not automatically rebalanced across the data paths.
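For reference, Kubernetes only permits resizing a PersistentVolumeClaim if the StorageClass that provisioned it enables volume expansion.
The following sketch shows what such a StorageClass could look like; the name and the provisioner are purely illustrative and depend on your environment:

[source,yaml]
----
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: expandable-ssd        # illustrative name
provisioner: ebs.csi.aws.com  # example CSI driver; use the one deployed in your cluster
allowVolumeExpansion: true    # without this flag, PersistentVolumeClaims of this class cannot be resized
----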
[NOTE]
====
The OpenSearch operator is currently in the early stages of development.
Smart scaling (adapting resources without data loss) and auto scaling (scaling the cluster based on load) are not yet supported.
====

== Scaling manually

As noted earlier, scaling down data nodes requires several manual steps;
this section walks through a simple procedure for doing so safely.

For example, the following OpenSearchCluster has been deployed with three cluster-manager nodes and five small data nodes:

[source,yaml]
----
spec:
  nodes:
    roleGroups:
      cluster-manager:
        config:
          nodeRoles:
            - cluster_manager
        replicas: 3
      data-small:
        config:
          nodeRoles:
            - data
            - ingest
            - remote_cluster_client
          resources:
            storage:
              data:
                capacity: 10Gi
        replicas: 5
----

Suppose you decide that three large data nodes would be more suitable than five small ones.
To implement this change, replace the role group `data-small` with a new role group, as shown in the following steps.

First, add the new role group `data-large` with three replicas and a storage capacity of 100 Gi per node:

[source,yaml]
----
spec:
  nodes:
    roleGroups:
      cluster-manager:
        config:
          nodeRoles:
            - cluster_manager
        replicas: 3
      data-small:
        config:
          nodeRoles:
            - data
            - ingest
            - remote_cluster_client
          resources:
            storage:
              data:
                capacity: 10Gi
        replicas: 5
      data-large:
        config:
          nodeRoles:
            - data
            - ingest
            - remote_cluster_client
          resources:
            storage:
              data:
                capacity: 100Gi
        replicas: 3
----

The data must now be transferred from `data-small` to `data-large`.
The cluster setting `cluster.routing.allocation.exclude` excludes nodes from shard allocation.
If rebalancing has not been disabled, existing data automatically moves from the excluded nodes to the allowed ones, in this case from `data-small` to `data-large`.

[TIP]
====
The OpenSearch operator assigns a role group attribute to each OpenSearch node, making it easy to reference all nodes of a specific role group.
====

The following REST call excludes all nodes of the `data-small` role group from shard allocation:

[source,http]
----
PUT _cluster/settings
{
  "persistent": {
    "cluster": {
      "routing": {
        "allocation.exclude": {
          "role-group": "data-small"
        }
      }
    }
  }
}
----

Wait until all data has been transferred from `data-small` to `data-large`.
The current shard allocation can be requested at the `_cat/shards` endpoint.
In the first call below, a primary shard is still relocating to `data-large`; in the second call, made some time later, all shards have arrived on their new nodes:

[source,http]
----
GET _cat/shards?v
index shard prirep state      docs  store ip          node
logs  0     r      STARTED    14074 6.9mb 10.244.0.60 opensearch-nodes-data-large-2
logs  0     p      RELOCATING 14074 8.5mb 10.244.0.52 opensearch-nodes-data-small-4 -> 10.244.0.59 NFjQBBmWSm-pijXcxrXnvQ opensearch-nodes-data-large-1
...

GET _cat/shards?v
index shard prirep state   docs  store ip          node
logs  0     r      STARTED 14074 6.9mb 10.244.0.60 opensearch-nodes-data-large-2
logs  0     p      STARTED 14074 6.9mb 10.244.0.59 opensearch-nodes-data-large-1
...
----

Statistics, in particular the document count, can be retrieved from the `_nodes/role-group:data-small/stats` endpoint.
In the first response below, the `data-small` nodes still hold documents; in the second, the document count has dropped to zero:

[source,http]
----
GET _nodes/role-group:data-small/stats/indices/docs
{
  "_nodes": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "cluster_name": "opensearch",
  "nodes": {
    "wjaeQJUXQX6eNWYUeiScgQ": {
      "timestamp": 1761992580239,
      "name": "opensearch-nodes-data-small-4",
      "transport_address": "10.244.0.52:9300",
      "host": "10.244.0.52",
      "ip": "10.244.0.52:9300",
      "roles": [
        "data",
        "ingest",
        "remote_cluster_client"
      ],
      "attributes": {
        "role-group": "data-small",
        "shard_indexing_pressure_enabled": "true"
      },
      "indices": {
        "docs": {
          "count": 14686,
          "deleted": 0
        }
      }
    },
    ...
  }
}

GET _nodes/role-group:data-small/stats/indices/docs
{
  "_nodes": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "cluster_name": "opensearch",
  "nodes": {
    "wjaeQJUXQX6eNWYUeiScgQ": {
      "timestamp": 1761992817422,
      "name": "opensearch-nodes-data-small-4",
      "transport_address": "10.244.0.52:9300",
      "host": "10.244.0.52",
      "ip": "10.244.0.52:9300",
      "roles": [
        "data",
        "ingest",
        "remote_cluster_client"
      ],
      "attributes": {
        "role-group": "data-small",
        "shard_indexing_pressure_enabled": "true"
      },
      "indices": {
        "docs": {
          "count": 0,
          "deleted": 0
        }
      }
    },
    ...
  }
}
----
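The cluster health endpoint offers an additional, coarser check: a `green` status combined with zero relocating shards indicates that no shard movements are pending anymore.
The response below is abbreviated to the relevant fields:

[source,http]
----
GET _cluster/health
{
  "cluster_name": "opensearch",
  "status": "green",
  ...
  "relocating_shards": 0,
  ...
}
----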
Once all shards have been transferred, the `data-small` role group can be removed from the OpenSearchCluster specification:

[source,yaml]
----
spec:
  nodes:
    roleGroups:
      cluster-manager:
        config:
          nodeRoles:
            - cluster_manager
        replicas: 3
      data-large:
        config:
          nodeRoles:
            - data
            - ingest
            - remote_cluster_client
          resources:
            storage:
              data:
                capacity: 100Gi
        replicas: 3
----

Finally, remove the shard exclusion from the cluster settings:

[source,http]
----
PUT _cluster/settings
{
  "persistent": {
    "cluster": {
      "routing": {
        "allocation.exclude": {
          "role-group": null
        }
      }
    }
  }
}
----
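Once the `data-small` pods have terminated, the node filter should no longer match any nodes.
A response along the following lines (abbreviated) confirms that the nodes are gone:

[source,http]
----
GET _nodes/role-group:data-small
{
  "_nodes": {
    "total": 0,
    "successful": 0,
    "failed": 0
  },
  "cluster_name": "opensearch",
  "nodes": {}
}
----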
If your OpenSearch clients connect to the cluster exclusively through the cluster-manager nodes, the switch from one data role group to the other is seamless for them.
diff --git a/docs/modules/opensearch/pages/usage-guide/storage-resource-configuration.adoc b/docs/modules/opensearch/pages/usage-guide/storage-resource-configuration.adoc
index cc86873..890cc96 100644
--- a/docs/modules/opensearch/pages/usage-guide/storage-resource-configuration.adoc
+++ b/docs/modules/opensearch/pages/usage-guide/storage-resource-configuration.adoc
@@ -13,7 +13,7 @@ nodes:
     config:
       resources:
         storage:
-          logDirs:
+          data:
             capacity: 50Gi
 ----
diff --git a/docs/modules/opensearch/partials/nav.adoc b/docs/modules/opensearch/partials/nav.adoc
index 0b47320..1994a6e 100644
--- a/docs/modules/opensearch/partials/nav.adoc
+++ b/docs/modules/opensearch/partials/nav.adoc
@@ -9,6 +9,7 @@
 ** xref:opensearch:usage-guide/monitoring.adoc[]
 ** xref:opensearch:usage-guide/logging.adoc[]
 ** xref:opensearch:usage-guide/opensearch-dashboards.adoc[]
+** xref:opensearch:usage-guide/scaling.adoc[]
 ** xref:opensearch:usage-guide/operations/index.adoc[]
 *** xref:opensearch:usage-guide/operations/cluster-operations.adoc[]
 *** xref:opensearch:usage-guide/operations/pod-placement.adoc[]
diff --git a/rust/operator-binary/src/controller/build/node_config.rs b/rust/operator-binary/src/controller/build/node_config.rs
index db795f7..9249042 100644
--- a/rust/operator-binary/src/controller/build/node_config.rs
+++ b/rust/operator-binary/src/controller/build/node_config.rs
@@ -10,7 +10,7 @@ use crate::{
     controller::OpenSearchRoleGroupConfig,
     crd::v1alpha1,
     framework::{
-        ServiceName,
+        RoleGroupName, ServiceName,
         builder::pod::container::{EnvVarName, EnvVarSet},
         role_group_utils,
     },
@@ -41,6 +41,10 @@ pub const CONFIG_OPTION_INITIAL_CLUSTER_MANAGER_NODES: &str =
 /// Type: string
 pub const CONFIG_OPTION_NETWORK_HOST: &str = "network.host";
 
+/// The custom node attribute "role-group"
+/// Type: string
+pub const CONFIG_OPTION_NODE_ATTR_ROLE_GROUP: &str = "node.attr.role-group";
+
 /// A descriptive name for the node.
 /// Type: string
 pub const CONFIG_OPTION_NODE_NAME: &str = "node.name";
@@ -61,6 +65,7 @@ pub const CONFIG_OPTION_PLUGINS_SECURITY_SSL_HTTP_ENABLED: &str =
 /// Configuration of an OpenSearch node based on the cluster and role-group configuration
 pub struct NodeConfig {
     cluster: ValidatedCluster,
+    role_group_name: RoleGroupName,
     role_group_config: OpenSearchRoleGroupConfig,
     discovery_service_name: ServiceName,
 }
@@ -70,11 +75,13 @@ pub struct NodeConfig {
 impl NodeConfig {
     pub fn new(
         cluster: ValidatedCluster,
+        role_group_name: RoleGroupName,
         role_group_config: OpenSearchRoleGroupConfig,
         discovery_service_name: ServiceName,
     ) -> Self {
         Self {
             cluster,
+            role_group_name,
             role_group_config,
             discovery_service_name,
         }
     }
@@ -111,6 +118,10 @@ impl NodeConfig {
             CONFIG_OPTION_PLUGINS_SECURITY_NODES_DN.to_owned(),
             json!(["CN=generated certificate for pod".to_owned()]),
         );
+        config.insert(
+            CONFIG_OPTION_NODE_ATTR_ROLE_GROUP.to_owned(),
+            json!(self.role_group_name),
+        );
 
         for (setting, value) in self
             .role_group_config
@@ -311,6 +322,8 @@ mod tests {
         let image: ProductImage = serde_json::from_str(r#"{"productVersion": "3.1.0"}"#)
             .expect("should be a valid ProductImage");
 
+        let role_group_name = RoleGroupName::from_str_unsafe("data");
+
         let role_group_config = OpenSearchRoleGroupConfig {
             replicas: test_config.replicas,
             config: ValidatedOpenSearchConfig {
@@ -374,6 +387,7 @@
 
         NodeConfig::new(
             cluster,
+            role_group_name,
             role_group_config,
             ServiceName::from_str_unsafe("my-opensearch-cluster-manager"),
         )
@@ -391,6 +405,7 @@
                 "cluster.name: \"my-opensearch-cluster\"\n",
                 "discovery.type: \"zen\"\n",
                 "network.host: \"0.0.0.0\"\n",
+                "node.attr.role-group: \"data\"\n",
                 "plugins.security.nodes_dn: [\"CN=generated certificate for pod\"]\n",
                 "test: \"value\""
             )
diff --git a/rust/operator-binary/src/controller/build/role_group_builder.rs b/rust/operator-binary/src/controller/build/role_group_builder.rs
index 431690f..896c344 100644
--- a/rust/operator-binary/src/controller/build/role_group_builder.rs
+++ b/rust/operator-binary/src/controller/build/role_group_builder.rs
@@ -101,6 +101,7 @@ impl<'a> RoleGroupBuilder<'a> {
             cluster: cluster.clone(),
             node_config: NodeConfig::new(
                 cluster.clone(),
+                role_group_name.clone(),
                 role_group_config.clone(),
                 discovery_service_name,
             ),
diff --git a/tests/templates/kuttl/smoke/10-assert.yaml.j2 b/tests/templates/kuttl/smoke/10-assert.yaml.j2
index eb94bd6..6979ac6 100644
--- a/tests/templates/kuttl/smoke/10-assert.yaml.j2
+++ b/tests/templates/kuttl/smoke/10-assert.yaml.j2
@@ -650,6 +650,7 @@ data:
     cluster.routing.allocation.disk.threshold_enabled: "false"
     discovery.type: "zen"
     network.host: "0.0.0.0"
+    node.attr.role-group: "cluster-manager"
     node.store.allow_mmap: "false"
     plugins.security.allow_default_init_securityindex: "true"
     plugins.security.nodes_dn: ["CN=generated certificate for pod"]
@@ -685,6 +686,7 @@ data:
     cluster.routing.allocation.disk.threshold_enabled: "false"
     discovery.type: "zen"
     network.host: "0.0.0.0"
+    node.attr.role-group: "data"
     node.store.allow_mmap: "false"
     plugins.security.allow_default_init_securityindex: "true"
     plugins.security.nodes_dn: ["CN=generated certificate for pod"]