diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java index cbf5b24129ee..72cf9e3fc62e 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java @@ -21,13 +21,14 @@ import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_COMMAND_STATUS_REPORT_INTERVAL; import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_CONTAINER_REPORT_INTERVAL; import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_HEARTBEAT_INTERVAL; +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_NODE_REPORT_INTERVAL; import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_PIPELINE_REPORT_INTERVAL; import static org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State.QUASI_CLOSED; import static org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State.UNHEALTHY; +import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_DEADNODE_INTERVAL; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertInstanceOf; import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; @@ -64,6 +65,7 @@ import org.apache.hadoop.hdds.protocol.DatanodeID; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos; import org.apache.hadoop.hdds.protocol.proto.HddsProtos; +import org.apache.hadoop.hdds.ratis.RatisHelper; import org.apache.hadoop.hdds.ratis.conf.RatisClientConfig; import org.apache.hadoop.hdds.scm.OzoneClientConfig; import org.apache.hadoop.hdds.scm.ScmConfigKeys; @@ -100,7 +102,6 @@ import org.apache.hadoop.ozone.protocol.commands.SCMCommand; import org.apache.ozone.test.GenericTestUtils; import org.apache.ozone.test.LambdaTestUtils; -import org.apache.ozone.test.tag.Flaky; import org.apache.ratis.protocol.RaftGroupId; import org.apache.ratis.protocol.exceptions.StateMachineException; import org.apache.ratis.server.storage.FileInfo; @@ -109,11 +110,15 @@ import org.apache.ratis.thirdparty.com.google.protobuf.ByteString; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.MethodOrderer; +import org.junit.jupiter.api.Order; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestMethodOrder; /** * Tests the containerStateMachine failure handling. */ +@TestMethodOrder(MethodOrderer.OrderAnnotation.class) public class TestContainerStateMachineFailures { private static MiniOzoneCluster cluster; @@ -138,7 +143,9 @@ public static void init() throws Exception { conf.setTimeDuration(HDDS_PIPELINE_REPORT_INTERVAL, 200, TimeUnit.MILLISECONDS); conf.setTimeDuration(HDDS_HEARTBEAT_INTERVAL, 200, TimeUnit.MILLISECONDS); - conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 30, TimeUnit.SECONDS); + conf.setTimeDuration(HDDS_NODE_REPORT_INTERVAL, 1, TimeUnit.SECONDS); + conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 3, TimeUnit.SECONDS); + conf.setTimeDuration(OZONE_SCM_DEADNODE_INTERVAL, 6, TimeUnit.SECONDS); conf.set(OzoneConfigKeys.OZONE_SCM_CLOSE_CONTAINER_WAIT_DURATION, "2s"); conf.set(ScmConfigKeys.OZONE_SCM_PIPELINE_SCRUB_INTERVAL, "2s"); conf.set(ScmConfigKeys.OZONE_SCM_PIPELINE_DESTROY_TIMEOUT, "5s"); @@ -254,7 +261,6 @@ public void testContainerStateMachineCloseOnMissingPipeline() } @Test - @Flaky("HDDS-12215") public void testContainerStateMachineRestartWithDNChangePipeline() throws Exception { try (OzoneOutputStream key = objectStore.getVolume(volumeName).getBucket(bucketName) @@ -304,7 +310,12 @@ public void testContainerStateMachineRestartWithDNChangePipeline() } } + // This test case is placed at the end because it resets the Ratis storage location. + // This causes pipelines to break. Those pipelines are closed passively + // via client-side retries rather than by the ScrubbingService. + // Running this test earlier would leave a dirty pipeline pool for subsequent tests. @Test + @Order(Integer.MAX_VALUE) public void testContainerStateMachineFailures() throws Exception { OzoneOutputStream key = objectStore.getVolume(volumeName).getBucket(bucketName) @@ -476,16 +487,11 @@ public void testApplyTransactionFailure() throws Exception { getHddsDatanodes().get(index), omKeyLocationInfo.getPipeline()); SimpleStateMachineStorage storage = (SimpleStateMachineStorage) stateMachine.getStateMachineStorage(); - stateMachine.takeSnapshot(); - final FileInfo snapshot = getSnapshotFileInfo(storage); - final Path parentPath = snapshot.getPath(); - // Since the snapshot threshold is set to 1, since there are - // applyTransactions, we should see snapshots - assertThat(parentPath.getParent().toFile().listFiles().length).isGreaterThan(0); - assertNotNull(snapshot); long containerID = omKeyLocationInfo.getContainerID(); // delete the container db file FileUtil.fullyDelete(new File(keyValueContainerData.getContainerPath())); + long bcsid = containerData.getBlockCommitSequenceId(); + Pipeline pipeline = cluster.getStorageContainerLocationClient() .getContainerWithPipeline(containerID).getPipeline(); XceiverClientSpi xceiverClient = @@ -506,33 +512,36 @@ public void testApplyTransactionFailure() throws Exception { xceiverClientManager.releaseClient(xceiverClient, false); } // Make sure the container is marked unhealthy - assertSame(dn.getDatanodeStateMachine() - .getContainer().getContainerSet().getContainer(containerID) - .getContainerState(), UNHEALTHY); + GenericTestUtils.waitFor(() -> { + try { + return !((ContainerStateMachine)((XceiverServerRatis)dn.getDatanodeStateMachine() + .getContainer().getWriteChannel()).getServer().getDivision( + RatisHelper.newRaftGroup(pipeline).getGroupId()).getStateMachine()).isStateMachineHealthy(); + } catch (IOException e) { + throw new RuntimeException(e); + } + }, 100, 5000); try { // try to take a new snapshot, ideally it should just fail stateMachine.takeSnapshot(); + fail("Should have thrown StateMachineException because it is UNHEALTHY"); } catch (IOException ioe) { assertInstanceOf(StateMachineException.class, ioe); } - if (snapshot.getPath().toFile().exists()) { - // Make sure the latest snapshot is same as the previous one - try { - final FileInfo latestSnapshot = getSnapshotFileInfo(storage); - assertEquals(snapshot.getPath(), latestSnapshot.getPath()); - } catch (Throwable e) { - assertFalse(snapshot.getPath().toFile().exists()); - } - } + assertEquals(bcsid, dn.getDatanodeStateMachine() + .getContainer().getContainerSet() + .getContainer(omKeyLocationInfo.getContainerID()) + .getContainerData().getBlockCommitSequenceId()); + + final FileInfo snapshot = getSnapshotFileInfo(storage); // when remove pipeline, group dir including snapshot will be deleted LambdaTestUtils.await(10000, 500, () -> (!snapshot.getPath().toFile().exists())); } @Test - @Flaky("HDDS-6115") void testApplyTransactionIdempotencyWithClosedContainer() throws Exception { OzoneOutputStream key = @@ -590,6 +599,8 @@ void testApplyTransactionIdempotencyWithClosedContainer() .getContainerState(), ContainerProtos.ContainerDataProto.State.CLOSED); assertTrue(stateMachine.isStateMachineHealthy()); + GenericTestUtils.waitFor(() -> stateMachine.getLastAppliedTermIndex().getIndex() != markIndex1, + 1000, 30000); try { stateMachine.takeSnapshot(); } finally { @@ -618,7 +629,6 @@ void testApplyTransactionIdempotencyWithClosedContainer() // not be marked unhealthy and pipeline should not fail if container gets // closed here. @Test - @Flaky("HDDS-13482") void testWriteStateMachineDataIdempotencyWithClosedContainer() throws Exception { OzoneOutputStream key = @@ -683,7 +693,7 @@ void testWriteStateMachineDataIdempotencyWithClosedContainer() }; Runnable r2 = () -> { try { - ByteString data = ByteString.copyFromUtf8("hello"); + ByteString data = ByteString.copyFromUtf8("ratis"); ContainerProtos.ContainerCommandRequestProto.Builder writeChunkRequest = ContainerTestHelper.newWriteChunkRequestBuilder(pipeline, omKeyLocationInfo.getBlockID(), data.size()); @@ -698,7 +708,7 @@ void testWriteStateMachineDataIdempotencyWithClosedContainer() failCount.incrementAndGet(); } String message = e.getMessage(); - assertThat(message).doesNotContain("hello"); + assertThat(message).doesNotContain("ratis"); assertThat(message).contains(HddsUtils.REDACTED.toStringUtf8()); } }; @@ -745,7 +755,6 @@ void testWriteStateMachineDataIdempotencyWithClosedContainer() } @Test - @Flaky("HDDS-14101") void testContainerStateMachineSingleFailureRetry() throws Exception { try (OzoneOutputStream key = objectStore.getVolume(volumeName).getBucket(bucketName) @@ -776,7 +785,6 @@ void testContainerStateMachineSingleFailureRetry() } @Test - @Flaky("HDDS-14101") void testContainerStateMachineDualFailureRetry() throws Exception { OzoneOutputStream key =