diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitWALManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitWALManager.java index 245b34d55172..c47b66e4dc04 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitWALManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitWALManager.java @@ -188,10 +188,16 @@ public void addUsedSplitWALWorker(ServerName worker) { public String renameWALForRetry(String walPath, Integer workerChangeCount) throws IOException { String originalWALPath; - if (workerChangeCount == 0) { - originalWALPath = walPath; - } else { + // If the WAL split is retried with another worker, ".retrying-xxx" is appended to the walPath. + // In the case where a SplitWalProcedure is bypassed or rolled back after being retried with + // another worker, the walPath will still have ".retrying-xxx" appended. During recovery, a new + // SplitWalProcedure (by different SCP) will be created, which will have a workerChangeCount of + // 0 but will still have ".retrying-xxx" appended to the walPath. This is why we cannot use the + // condition workerChangeCount != 0. + if (walPath.substring(walPath.length() - RETRYING_EXT.length() - 3).startsWith(RETRYING_EXT)) { originalWALPath = walPath.substring(0, walPath.length() - RETRYING_EXT.length() - 3); + } else { + originalWALPath = walPath; } String walNewName = originalWALPath + RETRYING_EXT + String.format("%03d", (workerChangeCount + 1) % 1000); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SplitWALProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SplitWALProcedure.java index 28fefe8d9e6e..eb96071785db 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SplitWALProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SplitWALProcedure.java @@ -51,7 +51,7 @@ public class SplitWALProcedure private ServerName worker; private ServerName crashedServer; private RetryCounter retryCounter; - private Integer workerChangeCount = 0; + private int workerChangeCount = 0; public SplitWALProcedure() { } @@ -183,10 +183,6 @@ public ServerName getWorker() { return worker; } - public Integer getWorkerChangeCount() { - return workerChangeCount; - } - @Override public ServerName getServerName() { return this.crashedServer; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractFSWALProvider.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractFSWALProvider.java index d705d9be9915..e7428f3b31c9 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractFSWALProvider.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractFSWALProvider.java @@ -237,6 +237,7 @@ static void requestLogRoll(final WAL wal) { /** File Extension used while splitting an WAL into regions (HBASE-2312) */ public static final String SPLITTING_EXT = "-splitting"; + // Extension for the WAL where the split failed on one worker and is being retried on another. public static final String RETRYING_EXT = ".retrying"; /** diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestSplitWALManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestSplitWALManager.java index f053461fb6e9..5186ea3d865a 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestSplitWALManager.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestSplitWALManager.java @@ -99,7 +99,7 @@ public void teardown() throws Exception { } @Test - public void testRenameWALForRetry() throws Exception { + public void testRenameWALForRetryWALSplit() throws Exception { HRegionServer regionServer = TEST_UTIL.getHBaseCluster().getRegionServer(0); List logDirs = master.getMasterWalManager().getLogDirs(Collections.singleton(regionServer.getServerName())); @@ -134,6 +134,45 @@ public void testRenameWALForRetry() throws Exception { Assert.assertEquals(walFromFileSystem.toString(), newWALPath); } + @Test + public void testRenameWALForFirstRetryWhenWALAlreadyHaveRetryingSuffix() throws Exception { + HRegionServer regionServer = TEST_UTIL.getHBaseCluster().getRegionServer(0); + List logDirs = + master.getMasterWalManager().getLogDirs(Collections.singleton(regionServer.getServerName())); + List list = + SplitLogManager.getFileList(TEST_UTIL.getConfiguration(), logDirs, NON_META_FILTER); + Path testWal = list.get(0).getPath(); + int workerChangeCount = 0; + + String newWALPath = splitWALManager.renameWALForRetry(testWal.toString(), workerChangeCount); + Assert.assertEquals(newWALPath, + testWal + RETRYING_EXT + String.format("%03d", workerChangeCount + 1)); + // Let's say that after this step, the SplitWalProcedure and SCP are bypassed. To reach a + // consistent state, the recoverUnknown or schedulerRecovery command will create another SCP and + // SplitWalProcedure. If that also fails on the first attempt + + int workerChangeCountForNewProcedure = 0; + newWALPath = splitWALManager.renameWALForRetry(newWALPath, workerChangeCountForNewProcedure); + Assert.assertEquals(newWALPath, + testWal + RETRYING_EXT + String.format("%03d", workerChangeCountForNewProcedure + 1)); + + List fileStatusesA = + SplitLogManager.getFileList(TEST_UTIL.getConfiguration(), logDirs, NON_META_FILTER); + + Path walFromFileSystem = null; + for (FileStatus wal : fileStatusesA) { + if ( + wal.getPath().toString() + .endsWith(RETRYING_EXT + String.format("%03d", workerChangeCountForNewProcedure + 1)) + ) { + walFromFileSystem = wal.getPath(); + break; + } + } + Assert.assertNotNull(walFromFileSystem); + Assert.assertEquals(walFromFileSystem.toString(), newWALPath); + } + @Test public void testAcquireAndRelease() throws Exception { List testProcedures = new ArrayList<>();