diff --git a/hadoop-hdds/framework/src/main/resources/webapps/static/templates/menu.html b/hadoop-hdds/framework/src/main/resources/webapps/static/templates/menu.html index 9a14f356d7a4..7339f833d11c 100644 --- a/hadoop-hdds/framework/src/main/resources/webapps/static/templates/menu.html +++ b/hadoop-hdds/framework/src/main/resources/webapps/static/templates/menu.html @@ -34,6 +34,7 @@
  • Configuration
  • +
  • Ratis event timeline
  • Documentation
  • ratisEvents = new ArrayList<>(); + private static final int MAX_RATIS_EVENTS = 100; + /** * Container stat metrics, the meaning of following metrics * can be found in {@link ContainerStat}. @@ -155,6 +161,22 @@ public void decrContainerStat(ContainerStat deltaStat) { this.containerReportWriteCount.incr(-1 * deltaStat.getWriteCount().get()); } + public void addRatisEvent(String event) { + synchronized (ratisEvents) { + if (ratisEvents.size() >= MAX_RATIS_EVENTS) { + ratisEvents.remove(0); + } + ratisEvents.add(Time.formatTime(Time.now()) + "|" + event); + } + } + + @Metric("Ratis state machine events") + public String getRatisEvents() { + synchronized (ratisEvents) { + return String.join("\n", ratisEvents); + } + } + public void unRegister() { MetricsSystem ms = DefaultMetricsSystem.instance(); ms.unregisterSource(SOURCE_NAME); diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java index 9d49ca36b6f2..fa48271a745c 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java @@ -26,6 +26,7 @@ import java.io.IOException; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; +import java.util.ArrayList; import java.util.Collection; import java.util.EnumMap; import java.util.List; @@ -39,6 +40,7 @@ import org.apache.hadoop.hdds.protocol.proto.SCMRatisProtocol.RequestType; import org.apache.hadoop.hdds.scm.block.DeletedBlockLog; import org.apache.hadoop.hdds.scm.block.DeletedBlockLogImpl; +import org.apache.hadoop.hdds.scm.container.placement.metrics.SCMMetrics; import org.apache.hadoop.hdds.scm.exceptions.SCMException; import org.apache.hadoop.hdds.scm.exceptions.SCMException.ResultCodes; import org.apache.hadoop.hdds.scm.server.StorageContainerManager; @@ -213,6 +215,10 @@ public void notifyNotLeader(Collection pendingEntries) { return; } LOG.info("current leader SCM steps down."); + SCMMetrics metrics = StorageContainerManager.getMetrics(); + if (metrics != null) { + metrics.addRatisEvent("current leader SCM steps down."); + } scm.getScmContext().updateLeaderAndTerm(false, 0); scm.getSCMServiceManager().notifyStatusChanged(); @@ -243,6 +249,12 @@ public CompletableFuture notifyInstallSnapshotFromLeader( final String leaderNodeId = leaderDetails.get().getNodeId(); LOG.info("Received install snapshot notification from SCM leader: {} with " + "term index: {}", leaderAddress, firstTermIndexInLog); + SCMMetrics metrics = StorageContainerManager.getMetrics(); + if (metrics != null) { + metrics.addRatisEvent( + "Installing snapshot from SCM leader " + leaderNodeId + + ", term index: " + firstTermIndexInLog); + } CompletableFuture future = CompletableFuture.supplyAsync( () -> { @@ -283,6 +295,7 @@ public void notifyLeaderChanged(RaftGroupMemberId groupMemberId, if (!isInitialized) { return; } + SCMMetrics metrics = StorageContainerManager.getMetrics(); currentLeaderTerm.set(scm.getScmHAManager().getRatisServer().getDivision() .getInfo().getCurrentTerm()); @@ -297,10 +310,16 @@ public void notifyLeaderChanged(RaftGroupMemberId groupMemberId, if (!groupMemberId.getPeerId().equals(newLeaderId)) { LOG.info("leader changed, yet current SCM is still follower."); + if (metrics != null) { + metrics.addRatisEvent("Leader changed to " + newLeaderId + ", yet current SCM is still follower."); + } return; } LOG.info("current SCM becomes leader of term {}.", currentLeaderTerm); + if (metrics != null) { + metrics.addRatisEvent("current SCM becomes leader of term " + currentLeaderTerm); + } scm.getScmContext().updateLeaderAndTerm(true, currentLeaderTerm.get()); @@ -394,11 +413,35 @@ public void notifyLeaderReady() { scm.getScmContext().setLeaderReady(); scm.getSCMServiceManager().notifyStatusChanged(); scm.getFinalizationManager().onLeaderReady(); + SCMMetrics metrics = StorageContainerManager.getMetrics(); + if (metrics != null) { + metrics.addRatisEvent("Ready to serve requests as the leader"); + } } @Override public void notifyConfigurationChanged(long term, long index, RaftProtos.RaftConfigurationProto newRaftConfiguration) { + SCMMetrics metrics = StorageContainerManager.getMetrics(); + if (metrics != null) { + List newPeers = + newRaftConfiguration.getPeersList(); + List newListeners = + newRaftConfiguration.getListenersList(); + List newPeerIds = new ArrayList<>(); + List newListenersIds = new ArrayList<>(); + for (RaftProtos.RaftPeerProto raftPeerProto : newPeers) { + newPeerIds.add(RaftPeerId.valueOf(raftPeerProto.getId()).toString()); + } + for (RaftProtos.RaftPeerProto raftListenerProto : newListeners) { + newListenersIds.add(RaftPeerId.valueOf(raftListenerProto.getId()).toString()); + } + metrics.addRatisEvent( + "New peers " + newPeerIds + + (newListenersIds.isEmpty() ? "" : ", new listeners " + newListenersIds) + + " added at term index (" + + term + ", " + index + ")"); + } } @Override @@ -431,6 +474,10 @@ public void reinitialize() throws IOException { } LOG.info("{}: SCMStateMachine is reinitializing. newTermIndex = {}", getId(), termIndex); + SCMMetrics metrics = StorageContainerManager.getMetrics(); + if (metrics != null) { + metrics.addRatisEvent("reinitialize: " + termIndex); + } // re-initialize the DBTransactionBuffer and update the lastAppliedIndex. try { diff --git a/hadoop-hdds/server-scm/src/main/resources/webapps/scm/ratis-events.html b/hadoop-hdds/server-scm/src/main/resources/webapps/scm/ratis-events.html new file mode 100644 index 000000000000..9eb6008d00ff --- /dev/null +++ b/hadoop-hdds/server-scm/src/main/resources/webapps/scm/ratis-events.html @@ -0,0 +1,32 @@ + +

    Ratis event timeline

    + + + + + + + + + + + + + + +
    TimestampEvent description
    {{event.timestamp}}{{event.description}}
    diff --git a/hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm.js b/hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm.js index eca79852e43b..1e44826f980e 100644 --- a/hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm.js +++ b/hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm.js @@ -19,6 +19,32 @@ "use strict"; angular.module('scm', ['ozone', 'nvd3']); + angular.module('scm').config(function ($routeProvider) { + $routeProvider + .when("/ratis_events", { + template: "" + }); + }); + + angular.module('scm').component('ratisEvents', { + templateUrl: 'ratis-events.html', + controller: function ($http) { + var ctrl = this; + $http.get("jmx?qry=Hadoop:service=StorageContainerManager,name=SCMMetrics") + .then(function (result) { + var metrics = result.data.beans[0]; + var rawEvents = metrics['tag.RatisEvents'] ? metrics['tag.RatisEvents'].split('\n') : []; + ctrl.events = rawEvents.map(function(e) { + var parts = e.split('|'); + return { + timestamp: parts[0], + description: parts[1] + }; + }); + }); + } + }); + angular.module('scm').component('scmOverview', { templateUrl: 'scm-overview.html', require: { diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/ha/TestSCMStateMachine.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/ha/TestSCMStateMachine.java new file mode 100644 index 000000000000..6672d764b7c8 --- /dev/null +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/ha/TestSCMStateMachine.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hdds.scm.ha; + +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.mockStatic; +import static org.mockito.Mockito.when; + +import org.apache.hadoop.hdds.scm.container.placement.metrics.SCMMetrics; +import org.apache.hadoop.hdds.scm.server.StorageContainerManager; +import org.apache.hadoop.hdds.utils.TransactionInfo; +import org.apache.ratis.proto.RaftProtos; +import org.apache.ratis.server.protocol.TermIndex; +import org.junit.jupiter.api.Test; +import org.mockito.MockedStatic; + +/** + * Test SCMStateMachine events recording. + */ +public class TestSCMStateMachine { + + @Test + public void testRatisEventsRecording() throws Exception { + StorageContainerManager scm = mock(StorageContainerManager.class); + SCMMetrics metrics = SCMMetrics.create(); + SCMHADBTransactionBuffer buffer = mock(SCMHADBTransactionBuffer.class); + when(buffer.getLatestTrxInfo()).thenReturn(TransactionInfo.valueOf(TermIndex.valueOf(0, 0))); + + SCMStateMachine stateMachine = new SCMStateMachine(scm, buffer); + + try (MockedStatic scmStaticMock = mockStatic(StorageContainerManager.class)) { + scmStaticMock.when(StorageContainerManager::getMetrics).thenReturn(metrics); + + stateMachine.notifyConfigurationChanged(1, 1, RaftProtos.RaftConfigurationProto.getDefaultInstance()); + assertTrue(metrics.getRatisEvents().contains("New peers [] added at term index")); + } + + metrics.unRegister(); + } +} diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OMMetrics.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OMMetrics.java index 3658ec96c658..dabb7d618bda 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OMMetrics.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OMMetrics.java @@ -18,6 +18,8 @@ package org.apache.hadoop.ozone.om; import com.google.common.annotations.VisibleForTesting; +import java.util.ArrayList; +import java.util.List; import org.apache.hadoop.hdds.annotation.InterfaceAudience; import org.apache.hadoop.hdds.utils.DBCheckpointMetrics; import org.apache.hadoop.metrics2.MetricsSystem; @@ -26,6 +28,7 @@ import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.metrics2.lib.MutableCounterLong; import org.apache.hadoop.metrics2.lib.MutableGaugeInt; +import org.apache.hadoop.util.Time; /** * This class is for maintaining Ozone Manager statistics. @@ -36,6 +39,9 @@ public class OMMetrics implements OmMetadataReaderMetrics { private static final String SOURCE_NAME = OMMetrics.class.getSimpleName(); + private final List ratisEvents = new ArrayList<>(); + private static final int MAX_RATIS_EVENTS = 100; + // OM request type op metrics private @Metric MutableCounterLong numVolumeOps; private @Metric MutableCounterLong numBucketOps; @@ -1545,6 +1551,22 @@ public void incNumRecoverLeaseFails() { numRecoverLeaseFails.incr(); } + public void addRatisEvent(String event) { + synchronized (ratisEvents) { + if (ratisEvents.size() >= MAX_RATIS_EVENTS) { + ratisEvents.remove(0); + } + ratisEvents.add(Time.formatTime(Time.now()) + "|" + event); + } + } + + @Metric("Ratis state machine events") + public String getRatisEvents() { + synchronized (ratisEvents) { + return String.join("\n", ratisEvents); + } + } + public void unRegister() { if (dbCheckpointMetrics != null) { dbCheckpointMetrics.unRegister(); diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java index 09a530ab6cc7..2abaf9ae5719 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java @@ -26,6 +26,7 @@ import com.google.common.util.concurrent.ThreadFactoryBuilder; import java.io.IOException; import java.util.ArrayList; +import java.util.Collection; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -41,6 +42,7 @@ import org.apache.hadoop.ozone.audit.AuditLoggerType; import org.apache.hadoop.ozone.audit.OMSystemAction; import org.apache.hadoop.ozone.om.OMConfigKeys; +import org.apache.hadoop.ozone.om.OMMetrics; import org.apache.hadoop.ozone.om.OzoneManager; import org.apache.hadoop.ozone.om.OzoneManagerPrepareState; import org.apache.hadoop.ozone.om.exceptions.OMException; @@ -170,6 +172,10 @@ public synchronized void reinitialize() throws IOException { final TermIndex lastApplied = getLastAppliedTermIndex(); unpause(lastApplied.getIndex(), lastApplied.getTerm()); LOG.info("{}: reinitialize {} with {}", getId(), getGroupId(), lastApplied); + OMMetrics metrics = ozoneManager.getMetrics(); + if (metrics != null) { + metrics.addRatisEvent("reinitialize: " + lastApplied); + } } } @@ -183,6 +189,19 @@ public SnapshotInfo getLatestSnapshot() { @Override public void notifyLeaderReady() { ozoneManager.getOmSnapshotManager().resetInFlightSnapshotCount(); + OMMetrics metrics = ozoneManager.getMetrics(); + if (metrics != null) { + metrics.addRatisEvent("Ready to serve requests as the leader"); + } + } + + @Override + public void notifyNotLeader(Collection pendingEntries) { + LOG.info("current leader OM steps down."); + OMMetrics metrics = ozoneManager.getMetrics(); + if (metrics != null) { + metrics.addRatisEvent("current leader OM steps down."); + } } @Override @@ -208,6 +227,10 @@ public void notifyLeaderChanged(RaftGroupMemberId groupMemberId, AUDIT.logWriteSuccess(ozoneManager.buildAuditMessageForSuccess(OMSystemAction.LEADER_CHANGE, auditParams)); LOG.info("{}: leader changed to {}", groupMemberId, newLeaderId); + OMMetrics metrics = ozoneManager.getMetrics(); + if (metrics != null) { + metrics.addRatisEvent("Leader changed to " + newLeaderId); + } } /** Notified by Ratis for non-StateMachine term-index update. */ @@ -282,10 +305,22 @@ public void notifyConfigurationChanged(long term, long index, for (RaftProtos.RaftPeerProto raftPeerProto : newPeers) { newPeerIds.add(RaftPeerId.valueOf(raftPeerProto.getId()).toString()); } - for (RaftProtos.RaftPeerProto raftPeerProto : newListeners) { - newPeerIds.add(RaftPeerId.valueOf(raftPeerProto.getId()).toString()); + List newListenersIds = new ArrayList<>(); + for (RaftProtos.RaftPeerProto raftListenerProto : newListeners) { + newListenersIds.add(RaftPeerId.valueOf(raftListenerProto.getId()).toString()); + } + + OMMetrics metrics = ozoneManager.getMetrics(); + if (metrics != null) { + metrics.addRatisEvent( + "New peers " + newPeerIds + + (newListenersIds.isEmpty() ? "" : ", new listeners " + newListenersIds) + + " added at term index (" + + term + ", " + index + ")"); } + // Check and update the peer list in OzoneManager + newPeerIds.addAll(newListenersIds); ozoneManager.updatePeerList(newPeerIds); } @@ -301,6 +336,11 @@ public void notifySnapshotInstalled(RaftProtos.InstallSnapshotResult result, long snapshotIndex, RaftPeer peer) { LOG.info("Receive notifySnapshotInstalled event {} for the peer: {}" + " snapshotIndex: {}.", result, peer.getId(), snapshotIndex); + OMMetrics metrics = ozoneManager.getMetrics(); + if (metrics != null) { + metrics.addRatisEvent("Install snapshot " + + result + ", snapshotIndex=" + snapshotIndex + ", peer=" + peer.getId()); + } switch (result) { case SUCCESS: case SNAPSHOT_UNAVAILABLE: @@ -581,6 +621,11 @@ public CompletableFuture notifyInstallSnapshotFromLeader( .getLeaderInfo().getId().getId()).toString(); LOG.info("Received install snapshot notification from OM leader: {} with " + "term index: {}", leaderNodeId, firstTermIndexInLog); + OMMetrics metrics = ozoneManager.getMetrics(); + if (metrics != null) { + metrics.addRatisEvent("Installing snapshot from " + + "OM leader " + leaderNodeId + ", term index: " + firstTermIndexInLog); + } return CompletableFuture.supplyAsync( () -> { diff --git a/hadoop-ozone/ozone-manager/src/main/resources/webapps/ozoneManager/ozoneManager.js b/hadoop-ozone/ozone-manager/src/main/resources/webapps/ozoneManager/ozoneManager.js index 8269b6df0fbb..18c49fbd91b3 100644 --- a/hadoop-ozone/ozone-manager/src/main/resources/webapps/ozoneManager/ozoneManager.js +++ b/hadoop-ozone/ozone-manager/src/main/resources/webapps/ozoneManager/ozoneManager.js @@ -27,8 +27,29 @@ $routeProvider .when("/metrics/ozoneManager", { template: "" + }) + .when("/ratis_events", { + template: "" }); }); + angular.module('ozoneManager').component('ratisEvents', { + templateUrl: 'ratis-events.html', + controller: function ($http) { + var ctrl = this; + $http.get("jmx?qry=Hadoop:service=OzoneManager,name=OMMetrics") + .then(function (result) { + var metrics = result.data.beans[0]; + var rawEvents = metrics['tag.RatisEvents'] ? metrics['tag.RatisEvents'].split('\n') : []; + ctrl.events = rawEvents.map(function(e) { + var parts = e.split('|'); + return { + timestamp: parts[0], + description: parts[1] + }; + }); + }); + } + }); angular.module('ozoneManager').component('omMetrics', { templateUrl: 'om-metrics.html', controller: function ($http) { diff --git a/hadoop-ozone/ozone-manager/src/main/resources/webapps/ozoneManager/ratis-events.html b/hadoop-ozone/ozone-manager/src/main/resources/webapps/ozoneManager/ratis-events.html new file mode 100644 index 000000000000..9eb6008d00ff --- /dev/null +++ b/hadoop-ozone/ozone-manager/src/main/resources/webapps/ozoneManager/ratis-events.html @@ -0,0 +1,32 @@ + +

    Ratis event timeline

    + + + + + + + + + + + + + + +
    TimestampEvent description
    {{event.timestamp}}{{event.description}}
    diff --git a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis/TestOzoneManagerStateMachine.java b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis/TestOzoneManagerStateMachine.java index 36d7a80aeea9..f4be86fcf588 100644 --- a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis/TestOzoneManagerStateMachine.java +++ b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis/TestOzoneManagerStateMachine.java @@ -54,6 +54,7 @@ import org.apache.hadoop.ozone.audit.OMSystemAction; import org.apache.hadoop.ozone.om.OMConfigKeys; import org.apache.hadoop.ozone.om.OMMetadataManager; +import org.apache.hadoop.ozone.om.OMMetrics; import org.apache.hadoop.ozone.om.OmConfig; import org.apache.hadoop.ozone.om.OmMetadataManagerImpl; import org.apache.hadoop.ozone.om.OmSnapshotManager; @@ -1037,4 +1038,27 @@ private RaftClientRequest buildClientRequest( .setType(RaftClientRequest.writeRequestType()) .build(); } + + @Test + public void testRatisEventsRecording() { + OMMetrics metrics = OMMetrics.create(); + when(om.getMetrics()).thenReturn(metrics); + when(om.getOmSnapshotManager()).thenReturn(mock(OmSnapshotManager.class)); + when(om.getConfiguration()).thenReturn(new OzoneConfiguration()); + AuditMessage auditMessage = mock(AuditMessage.class); + when(auditMessage.getOp()).thenReturn("LEADER_CHANGE"); + when(om.buildAuditMessageForSuccess(any(), any())).thenReturn(auditMessage); + sm.notifyLeaderReady(); + assertTrue(metrics.getRatisEvents().contains("Ready to serve requests as the leader")); + + RaftGroupMemberId groupMemberId = mock(RaftGroupMemberId.class); + when(groupMemberId.getPeerId()).thenReturn(RaftPeerId.valueOf("peer1")); + sm.notifyLeaderChanged(groupMemberId, RaftPeerId.valueOf("peer1")); + assertTrue(metrics.getRatisEvents().contains("Leader changed to peer1")); + + sm.notifyConfigurationChanged(1, 1, RaftProtos.RaftConfigurationProto.getDefaultInstance()); + assertTrue(metrics.getRatisEvents().contains("New peers [] added at term index ")); + + metrics.unRegister(); + } }