Skip to content

Commit

Permalink
HBASE-28951 handle first retry of splitWal for a wal with .retrying s…
Browse files Browse the repository at this point in the history
…uffix
  • Loading branch information
ukumawat committed Dec 16, 2024
1 parent 35c6206 commit 10e9b7c
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -188,10 +188,16 @@ public void addUsedSplitWALWorker(ServerName worker) {

public String renameWALForRetry(String walPath, Integer workerChangeCount) throws IOException {
String originalWALPath;
if (workerChangeCount == 0) {
originalWALPath = walPath;
} else {
// If the WAL split is retried with another worker, ".retrying-xxx" is appended to the walPath.
// In the case where a SplitWalProcedure is bypassed or rolled back after being retried with
// another worker, the walPath will still have ".retrying-xxx" appended. During recovery, a new
// SplitWalProcedure (by different SCP) will be created, which will have a workerChangeCount of
// 0 but will still have ".retrying-xxx" appended to the walPath. This is why we cannot use the
// condition workerChangeCount != 0.
if (walPath.substring(walPath.length() - RETRYING_EXT.length() - 3).startsWith(RETRYING_EXT)) {
originalWALPath = walPath.substring(0, walPath.length() - RETRYING_EXT.length() - 3);
} else {
originalWALPath = walPath;
}
String walNewName =
originalWALPath + RETRYING_EXT + String.format("%03d", (workerChangeCount + 1) % 1000);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public class SplitWALProcedure
private ServerName worker;
private ServerName crashedServer;
private RetryCounter retryCounter;
private Integer workerChangeCount = 0;
private int workerChangeCount = 0;

public SplitWALProcedure() {
}
Expand Down Expand Up @@ -183,10 +183,6 @@ public ServerName getWorker() {
return worker;
}

public Integer getWorkerChangeCount() {
return workerChangeCount;
}

@Override
public ServerName getServerName() {
return this.crashedServer;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,7 @@ static void requestLogRoll(final WAL wal) {
/** File Extension used while splitting an WAL into regions (HBASE-2312) */
public static final String SPLITTING_EXT = "-splitting";

// Extension for the WAL where the split failed on one worker and is being retried on another.
public static final String RETRYING_EXT = ".retrying";

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ public void teardown() throws Exception {
}

@Test
public void testRenameWALForRetry() throws Exception {
public void testRenameWALForRetryWALSplit() throws Exception {
HRegionServer regionServer = TEST_UTIL.getHBaseCluster().getRegionServer(0);
List<Path> logDirs =
master.getMasterWalManager().getLogDirs(Collections.singleton(regionServer.getServerName()));
Expand Down Expand Up @@ -134,6 +134,45 @@ public void testRenameWALForRetry() throws Exception {
Assert.assertEquals(walFromFileSystem.toString(), newWALPath);
}

@Test
public void testRenameWALForFirstRetryWhenWALAlreadyHaveRetryingSuffix() throws Exception {
HRegionServer regionServer = TEST_UTIL.getHBaseCluster().getRegionServer(0);
List<Path> logDirs =
master.getMasterWalManager().getLogDirs(Collections.singleton(regionServer.getServerName()));
List<FileStatus> list =
SplitLogManager.getFileList(TEST_UTIL.getConfiguration(), logDirs, NON_META_FILTER);
Path testWal = list.get(0).getPath();
int workerChangeCount = 0;

String newWALPath = splitWALManager.renameWALForRetry(testWal.toString(), workerChangeCount);
Assert.assertEquals(newWALPath,
testWal + RETRYING_EXT + String.format("%03d", workerChangeCount + 1));
// Let's say that after this step, the SplitWalProcedure and SCP are bypassed. To reach a
// consistent state, the recoverUnknown or schedulerRecovery command will create another SCP and
// SplitWalProcedure. If that also fails on the first attempt

int workerChangeCountForNewProcedure = 0;
newWALPath = splitWALManager.renameWALForRetry(newWALPath, workerChangeCountForNewProcedure);
Assert.assertEquals(newWALPath,
testWal + RETRYING_EXT + String.format("%03d", workerChangeCountForNewProcedure + 1));

List<FileStatus> fileStatusesA =
SplitLogManager.getFileList(TEST_UTIL.getConfiguration(), logDirs, NON_META_FILTER);

Path walFromFileSystem = null;
for (FileStatus wal : fileStatusesA) {
if (
wal.getPath().toString()
.endsWith(RETRYING_EXT + String.format("%03d", workerChangeCountForNewProcedure + 1))
) {
walFromFileSystem = wal.getPath();
break;
}
}
Assert.assertNotNull(walFromFileSystem);
Assert.assertEquals(walFromFileSystem.toString(), newWALPath);
}

@Test
public void testAcquireAndRelease() throws Exception {
List<FakeServerProcedure> testProcedures = new ArrayList<>();
Expand Down

0 comments on commit 10e9b7c

Please sign in to comment.