HDDS-12483. Quasi Closed Stuck should have 2 replicas of each origin #8014

Merged: 14 commits merged on Mar 12, 2025
Changes from 1 commit
Add QC Stuck handler to the chain and fix tests
S O'Donnell committed Mar 5, 2025
commit a1f05014a455a05c718700a71dad0ccffc16d92c
@@ -24,6 +24,7 @@
import java.util.Set;
import java.util.UUID;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos;
import org.apache.hadoop.hdds.scm.container.ContainerReplica;

/**
@@ -36,10 +37,14 @@ public class QuasiClosedStuckReplicaCount {
private final Map<UUID, Set<ContainerReplica>> maintenanceReplicasByOrigin = new HashMap<>();
private boolean hasOutOfServiceReplicas = false;
private int minHealthyForMaintenance;
private boolean hasHealthyReplicas = false;

public QuasiClosedStuckReplicaCount(Set<ContainerReplica> replicas, int minHealthyForMaintenance) {
this.minHealthyForMaintenance = minHealthyForMaintenance;
for (ContainerReplica r : replicas) {
if (r.getState() != StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.UNHEALTHY) {
hasHealthyReplicas = true;
}
replicasByOrigin.computeIfAbsent(r.getOriginDatanodeId(), k -> new HashSet<>()).add(r);
HddsProtos.NodeOperationalState opState = r.getDatanodeDetails().getPersistedOpState();
if (opState == HddsProtos.NodeOperationalState.IN_SERVICE) {
@@ -58,6 +63,10 @@ public boolean hasOutOfServiceReplicas() {
return hasOutOfServiceReplicas;
}

public boolean hasHealthyReplicas() {
return hasHealthyReplicas;
}

public boolean isUnderReplicated() {
// If there is only a single origin, we expect 3 copies, otherwise we expect 2 copies of each origin
if (replicasByOrigin.size() == 1) {
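The comment above states the replication rule for quasi-closed-stuck containers: three copies when there is only one origin, otherwise two copies of each origin. The rest of isUnderReplicated() is collapsed in this view; as an illustrative, self-contained sketch of that rule (plain Java collections and hypothetical names, not the real ContainerReplica types):

import java.util.Map;
import java.util.UUID;

// Illustrative sketch only: with a single origin we expect 3 copies,
// otherwise we expect 2 copies of each origin.
final class OriginReplicationRule {
  static boolean isUnderReplicated(Map<UUID, Integer> replicasPerOrigin) {
    if (replicasPerOrigin.size() == 1) {
      return replicasPerOrigin.values().iterator().next() < 3;
    }
    return replicasPerOrigin.values().stream().anyMatch(count -> count < 2);
  }
}

Under this rule, once a container has more than one origin, any origin holding only a single replica makes the container under-replicated, which is what the updated test expectations further down reflect.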
@@ -71,6 +71,7 @@
import org.apache.hadoop.hdds.scm.container.replication.health.MismatchedReplicasHandler;
import org.apache.hadoop.hdds.scm.container.replication.health.OpenContainerHandler;
import org.apache.hadoop.hdds.scm.container.replication.health.QuasiClosedContainerHandler;
import org.apache.hadoop.hdds.scm.container.replication.health.QuasiClosedStuckReplicationCheck;
import org.apache.hadoop.hdds.scm.container.replication.health.RatisReplicationCheckHandler;
import org.apache.hadoop.hdds.scm.container.replication.health.RatisUnhealthyReplicationCheckHandler;
import org.apache.hadoop.hdds.scm.container.replication.health.VulnerableUnhealthyReplicasHandler;
@@ -262,6 +263,7 @@ public ReplicationManager(final ConfigurationSource conf,
.addNext(new MismatchedReplicasHandler(this))
.addNext(new EmptyContainerHandler(this))
.addNext(new DeletingContainerHandler(this))
.addNext(new QuasiClosedStuckReplicationCheck())
.addNext(ecReplicationCheckHandler)
.addNext(ratisReplicationCheckHandler)
.addNext(new ClosedWithUnhealthyReplicasHandler(this))
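The handlers registered with addNext(...) above form a chain that is evaluated in order, and placing QuasiClosedStuckReplicationCheck before the generic Ratis replication checks lets it claim quasi-closed-stuck containers first. As a rough, hypothetical sketch of that chain-of-responsibility pattern (illustrative only, not the real Ozone HealthCheck API):

import java.util.ArrayList;
import java.util.List;
import java.util.function.Predicate;

// Hypothetical sketch: handlers run in registration order until one of them
// reports that it has handled the request; later handlers are then skipped.
final class HandlerChain<R> {
  private final List<Predicate<R>> handlers = new ArrayList<>();

  HandlerChain<R> addNext(Predicate<R> handler) {
    handlers.add(handler);
    return this;
  }

  boolean handle(R request) {
    for (Predicate<R> handler : handlers) {
      if (handler.test(request)) {
        return true; // first handler that handles the request stops the chain
      }
    }
    return false;
  }
}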
@@ -19,6 +19,10 @@

import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.QUASI_CLOSED;

import java.util.Set;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
import org.apache.hadoop.hdds.scm.container.ContainerReplica;
import org.apache.hadoop.hdds.scm.container.ReplicationManagerReport;
import org.apache.hadoop.hdds.scm.container.replication.ContainerCheckRequest;
import org.apache.hadoop.hdds.scm.container.replication.ContainerHealthResult;
@@ -43,9 +47,27 @@ public boolean handle(ContainerCheckRequest request) {
if (!QuasiClosedContainerHandler.isQuasiClosedStuck(request.getContainerInfo(), request.getContainerReplicas())) {
return false;
}
if (hasEnoughOriginsWithOpen(request.getContainerInfo(), request.getContainerReplicas())) {
// If enough origins still have open or quasi-closed (non-unhealthy) replicas, the container should close
// normally once the close commands go through, so this handler should not run.
return false;
}

if (request.getContainerReplicas().isEmpty()) {
// If there are no replicas, then mark as missing and return.
request.getReport().incrementAndSample(
ReplicationManagerReport.HealthState.MISSING, request.getContainerInfo().containerID());
return true;
}

QuasiClosedStuckReplicaCount replicaCount = new QuasiClosedStuckReplicaCount(
request.getContainerReplicas(), request.getMaintenanceRedundancy());

if (!replicaCount.hasHealthyReplicas()) {
// Containers where all replicas are unhealthy are handled by a different handler
return false;
}

int pendingAdd = 0;
int pendingDelete = 0;
for (ContainerReplicaOp op : request.getPendingOps()) {
@@ -56,13 +78,6 @@ public boolean handle(ContainerCheckRequest request) {
}
}

if (request.getContainerReplicas().isEmpty()) {
// If there are no replicas, then mark as missing and return.
request.getReport().incrementAndSample(
ReplicationManagerReport.HealthState.MISSING, request.getContainerInfo().containerID());
return true;
}

if (replicaCount.isUnderReplicated()) {
LOG.debug("Container {} is quasi-closed-stuck under-replicated", request.getContainerInfo());
request.getReport().incrementAndSample(ReplicationManagerReport.HealthState.UNDER_REPLICATED,
@@ -94,4 +109,14 @@ public boolean handle(ContainerCheckRequest request) {
return false;
}

private static boolean hasEnoughOriginsWithOpen(ContainerInfo containerInfo, Set<ContainerReplica> replicas) {
final long uniqueOpenReplicaCount = replicas.stream()
.filter(r -> r.getState() == State.QUASI_CLOSED || r.getState() == State.OPEN)
.map(ContainerReplica::getOriginDatanodeId)
.distinct()
.count();
final int replicationFactor = containerInfo.getReplicationConfig().getRequiredNodes();
return uniqueOpenReplicaCount >= replicationFactor;
}

}
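hasEnoughOriginsWithOpen above counts the distinct origins that still have an OPEN or QUASI_CLOSED replica and compares that count to the replication factor. An illustrative, self-contained version with simplified stand-in types (not the real ContainerReplica API):

import java.util.List;

// Illustrative only: with replication factor 3 and replicas from three distinct
// origins that are still OPEN or QUASI_CLOSED, the handler defers, because the
// container is expected to close normally once the close commands are processed.
final class OriginCountExample {
  record Replica(String origin, String state) { }

  static boolean hasEnoughOriginsWithOpen(List<Replica> replicas, int replicationFactor) {
    long uniqueOpenOrigins = replicas.stream()
        .filter(r -> r.state().equals("OPEN") || r.state().equals("QUASI_CLOSED"))
        .map(Replica::origin)
        .distinct()
        .count();
    return uniqueOpenOrigins >= replicationFactor;
  }

  public static void main(String[] args) {
    List<Replica> replicas = List.of(
        new Replica("o1", "QUASI_CLOSED"),
        new Replica("o2", "QUASI_CLOSED"),
        new Replica("o3", "OPEN"));
    System.out.println(hasEnoughOriginsWithOpen(replicas, 3)); // true
  }
}

This is the same scenario exercised by the new testQuasiClosedStuckWithOpenReturnsFalse test further down.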
@@ -463,11 +463,11 @@ public void testQuasiClosedContainerWithUnhealthyReplicaOnUniqueOrigin()
storeContainerAndReplicas(container, replicas);

replicationManager.processContainer(container, repQueue, repReport);
assertEquals(0, repReport.getStat(
assertEquals(1, repReport.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
assertEquals(0, repReport.getStat(
ReplicationManagerReport.HealthState.OVER_REPLICATED));
assertEquals(0, repQueue.underReplicatedQueueSize());
assertEquals(1, repQueue.underReplicatedQueueSize());
assertEquals(0, repQueue.overReplicatedQueueSize());
}

@@ -109,7 +109,32 @@ public void testQuasiClosedNotStuckReturnsFalse() {
assertEquals(0, report.getStat(ReplicationManagerReport.HealthState.OVER_REPLICATED));
assertEquals(0, queue.underReplicatedQueueSize());
assertEquals(0, queue.overReplicatedQueueSize());
}

@Test
public void testQuasiClosedStuckWithOpenReturnsFalse() {
ContainerInfo containerInfo = ReplicationTestUtil.createContainerInfo(
RatisReplicationConfig.getInstance(THREE), 1, QUASI_CLOSED);

Set<ContainerReplica> containerReplicas = ReplicationTestUtil
.createReplicasWithOriginAndOpState(containerInfo.containerID(), State.QUASI_CLOSED,
Pair.of(origin1, IN_SERVICE), Pair.of(origin2, IN_SERVICE));
containerReplicas.addAll(ReplicationTestUtil
.createReplicasWithOriginAndOpState(containerInfo.containerID(), State.OPEN,
Pair.of(origin3, IN_SERVICE)));
ContainerCheckRequest request = new ContainerCheckRequest.Builder()
.setPendingOps(Collections.emptyList())
.setReport(report)
.setContainerInfo(containerInfo)
.setContainerReplicas(containerReplicas)
.setReplicationQueue(queue)
.build();

assertFalse(handler.handle(request));
assertEquals(0, report.getStat(ReplicationManagerReport.HealthState.UNDER_REPLICATED));
assertEquals(0, report.getStat(ReplicationManagerReport.HealthState.OVER_REPLICATED));
assertEquals(0, queue.underReplicatedQueueSize());
assertEquals(0, queue.overReplicatedQueueSize());
}

@Test
@@ -133,7 +158,6 @@ public void testCorrectlyReplicated() {
assertEquals(0, report.getStat(ReplicationManagerReport.HealthState.OVER_REPLICATED));
assertEquals(0, queue.underReplicatedQueueSize());
assertEquals(0, queue.overReplicatedQueueSize());

}

@Test
@@ -28,7 +28,7 @@
{ "state": "QUASI_CLOSED", "index": 0, "datanode": "d2", "sequenceId": 12, "isEmpty": false, "origin": "o2"},
{ "state": "QUASI_CLOSED", "index": 0, "datanode": "d3", "sequenceId": 12, "isEmpty": false, "origin": "o2"}
],
"expectation": { "underReplicated": 0, "underReplicatedQueue": 0, "overReplicated": 0, "overReplicatedQueue": 0, "quasiClosedStuck": 1},
"expectation": { "underReplicated": 1, "underReplicatedQueue": 1, "overReplicated": 0, "overReplicatedQueue": 0, "quasiClosedStuck": 1},
"checkCommands": [],
"commands": []
},
@@ -108,8 +108,8 @@
{ "state": "QUASI_CLOSED", "index": 0, "datanode": "d3", "sequenceId": 10, "isEmpty": false, "origin": "o2"},
{ "state": "UNHEALTHY", "index": 0, "datanode": "d4", "sequenceId": 11, "isEmpty": false, "origin": "o3"}
],
"expectation": { "underReplicated": 0, "underReplicatedQueue": 0, "overReplicated": 0, "overReplicatedQueue": 0, "quasiClosedStuck": 1, "unhealthy": 0 },
"expectation": { "underReplicated": 1, "underReplicatedQueue": 1, "overReplicated": 0, "overReplicatedQueue": 0, "quasiClosedStuck": 1, "unhealthy": 0 },
"checkCommands": [],
"commands": []
}
]
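Both expectation changes in this JSON test resource flip underReplicated and underReplicatedQueue from 0 to 1: under the per-origin rule, a quasi-closed-stuck container with more than one origin is under-replicated as soon as some origin has fewer than two replicas, and it is queued for repair. A minimal check of that condition with hypothetical per-origin counts (the full replica lists are partly collapsed in this view):

import java.util.Map;
import java.util.UUID;

// Hypothetical per-origin counts for a scenario like the ones above:
// one origin with a single replica, another with the expected two.
public final class ExpectationCheck {
  public static void main(String[] args) {
    Map<UUID, Integer> replicasPerOrigin = Map.of(
        UUID.randomUUID(), 1,   // an origin with only one replica
        UUID.randomUUID(), 2);  // an origin with the expected two replicas
    boolean underReplicated = replicasPerOrigin.size() > 1
        && replicasPerOrigin.values().stream().anyMatch(c -> c < 2);
    System.out.println(underReplicated); // true -> "underReplicated": 1
  }
}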