Skip to content

HDDS-11463. Track and display failed DataNode storage locations in SCM. #7266

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Open
Next Next commit
HDDS-11463. Track and display failed DataNode storage locations in SCM.
  • Loading branch information
slfan1989 committed May 22, 2025
commit 41afe7c59868b3734ff6e98f579b692f3a123fd6
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionInfo;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfoResponseProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.GetVolumeInfosResponseProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartContainerBalancerResponseProto;
import org.apache.hadoop.hdds.scm.DatanodeAdminError;
import org.apache.hadoop.hdds.scm.container.ContainerID;
Expand Down Expand Up @@ -465,4 +466,21 @@ DecommissionScmResponseProto decommissionScm(
String scmId) throws IOException;

String getMetrics(String query) throws IOException;

/**
* Retrieves volume information matching the given query conditions.
*
* @param displayMode Represents the mode for displaying volumes.
* Options include "all" for all volumes, "failed" for failed volumes,
* and "normal" for normal volumes.
* @param uuid datanode uuid String.
* @param hostName datanode hostName String.
* @param pageSize Records displayed per page.
* @param currentPage The current page number (page numbering convention —
* 0-based vs 1-based — is determined by the server implementation; confirm
* against the SCM-side handler).
* @return Volume Information List.
* @throws IOException
* I/O exceptions that may occur during the process of querying the volume.
*/
GetVolumeInfosResponseProto getVolumeInfos(String displayMode, String uuid,
String hostName, int pageSize, int currentPage) throws IOException;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.hdds.scm.datanode;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.google.common.base.Preconditions;
import java.util.Objects;
import org.apache.commons.lang3.builder.CompareToBuilder;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import org.apache.hadoop.hdds.utils.db.Codec;
import org.apache.hadoop.hdds.utils.db.DelegatedCodec;
import org.apache.hadoop.hdds.utils.db.Proto2Codec;

/**
* This class is used to record disk failure conditions.
* The failureTime may be 0, and capacity may be 0, because if the DN restarts,
* we will not know the original capacity of the failed disk.
*/
public final class VolumeInfo implements Comparable<VolumeInfo> {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: please add this class in existing package org.apache.hadoop.hdds.protocol (which contains other datanode-related classes like DatanodeID and DatanodeDetails)


private String uuid;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use DatanodeID.

private String hostName;
private String volumeName;
private boolean failed;
private long failureTime;
private long capacity;

private static final Codec<VolumeInfo> CODEC = new DelegatedCodec<>(
Proto2Codec.get(HddsProtos.VolumeInfoProto.getDefaultInstance()),
VolumeInfo::fromProtobuf,
VolumeInfo::getProtobuf,
VolumeInfo.class);

public static Codec<VolumeInfo> getCodec() {
return CODEC;
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Codec is required only for storing in DB, but VolumeInfo does not seem to be persisted by either datanode or SCM. So I think this can be removed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have removed the CODEC.


private VolumeInfo(VolumeInfo.Builder b) {
this.uuid = b.uuid;
this.volumeName = b.volumeName;
this.failureTime = b.failureTime;
this.hostName = b.hostName;
this.failed = b.failed;
this.capacity = b.capacity;
}

public static VolumeInfo fromProtobuf(HddsProtos.VolumeInfoProto info) {
VolumeInfo.Builder builder = new VolumeInfo.Builder();
builder.setUuid(info.getUuid())
.setHostName(info.getHostName())
.setFailed(info.getFailed())
.setVolumeName(info.getVolumeName())
.setFailureTime(info.getFailureTime())
.setCapacity(info.getCapacity());
return builder.build();
}

@JsonIgnore
public HddsProtos.VolumeInfoProto getProtobuf() {
HddsProtos.VolumeInfoProto.Builder builder =
HddsProtos.VolumeInfoProto.newBuilder();
builder.setUuid(getUuid())
.setHostName(getHostName())
.setFailed(isFailed())
.setVolumeName(getVolumeName())
.setFailureTime(getFailureTime())
.setCapacity(getCapacity());
return builder.build();
}

/**
* Builder class for creating an instance of a complex object.
* <p>
* This Builder provides a fluent interface for gradually setting
* the object's properties. Finally, the build() method is used
* to create the object.
* </p>
*/
public static class Builder {
private String uuid;
private String hostName;
private boolean failed;
private String volumeName;
private long failureTime;
private long capacity;

public VolumeInfo.Builder setUuid(String pUuid) {
this.uuid = pUuid;
return this;
}

public VolumeInfo.Builder setHostName(String pHostName) {
this.hostName = pHostName;
return this;
}

public VolumeInfo.Builder setFailed(boolean pFailed) {
this.failed = pFailed;
return this;
}

public VolumeInfo.Builder setVolumeName(String pVolumeName) {
this.volumeName = pVolumeName;
return this;
}

public VolumeInfo.Builder setFailureTime(long pFailureTime) {
this.failureTime = pFailureTime;
return this;
}

public VolumeInfo.Builder setCapacity(long pCapacity) {
this.capacity = pCapacity;
return this;
}

public VolumeInfo build() {
return new VolumeInfo(this);
}
}

public String getUuid() {
return uuid;
}

public String getVolumeName() {
return volumeName;
}

public long getFailureTime() {
return failureTime;
}

public long getCapacity() {
return capacity;
}

public String getHostName() {
return hostName;
}

public boolean isFailed() {
return failed;
}

@Override
public int compareTo(VolumeInfo that) {
Preconditions.checkNotNull(that);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: prefer builtin Objects.requireNonNull

return new CompareToBuilder()
.append(this.uuid, that.uuid)
.append(this.hostName, that.hostName)
.append(this.failed, that.failed)
.append(this.volumeName, that.volumeName)
.append(this.failureTime, that.failureTime)
.append(this.capacity, that.capacity)
.build();
}

@Override
public int hashCode() {
return new HashCodeBuilder(61, 71)
.append(this.uuid)
.append(this.hostName)
.append(this.failed)
.append(this.volumeName)
.append(this.failureTime)
.append(this.capacity)
.toHashCode();
}

@Override
public boolean equals(final Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
final VolumeInfo that = (VolumeInfo) o;
return new EqualsBuilder()
.append(this.uuid, that.uuid)
.append(this.hostName, that.hostName)
.append(this.failed, that.failed)
.append(this.volumeName, that.volumeName)
.append(this.failureTime, that.failureTime)
.append(this.capacity, that.capacity)
.isEquals();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/**
 * Classes for tracking DataNode volume (storage location) information,
 * including failed volumes, in SCM.
 */
package org.apache.hadoop.hdds.scm.datanode;
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionInfo;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfoResponseProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartContainerBalancerResponseProto;
Expand Down Expand Up @@ -489,4 +490,21 @@ DecommissionScmResponseProto decommissionScm(
String scmId) throws IOException;

String getMetrics(String query) throws IOException;

/**
* Retrieves volume information matching the given query conditions.
*
* @param displayMode Represents the mode for displaying volumes.
* Options include "all" for all volumes, "failed" for failed volumes,
* and "normal" for normal volumes.
* @param uuid datanode uuid String.
* @param hostName datanode hostName String.
* @param pageSize Records displayed per page.
* @param currentPage The current page number (page numbering convention —
* 0-based vs 1-based — is determined by the server implementation; confirm
* against the SCM-side handler).
* @return Volume Information List.
* @throws IOException
* I/O exceptions that may occur during the process of querying the volume.
*/
StorageContainerLocationProtocolProtos.GetVolumeInfosResponseProto getVolumeInfos(String displayMode, String uuid,
String hostName, int pageSize, int currentPage) throws IOException;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.ozone.utils;

import java.util.ArrayList;
import java.util.List;

/**
* This is a memory paging utility that is used to paginate a dataset.
*
* This class is designed to support batch entry queue policies.
*/
public class MemoryPageUtils<T> {
  private final List<T> dataList;
  private final int pageSize;

  /**
   * MemoryPageUtils constructor.
   *
   * @param pageSize Number of records returned per page; must be positive.
   * @throws IllegalArgumentException if pageSize is not positive
   *     (a non-positive page size would make paging arithmetic meaningless).
   */
  public MemoryPageUtils(int pageSize) {
    if (pageSize <= 0) {
      throw new IllegalArgumentException(
          "pageSize must be positive, got " + pageSize);
    }
    this.pageSize = pageSize;
    this.dataList = new ArrayList<>();
  }

  /**
   * Appends a record to the in-memory dataset.
   *
   * @param data the record to add.
   */
  public void addToMemory(T data) {
    dataList.add(data);
  }

  /**
   * Returns the records of the given page.
   *
   * @param pageNumber zero-based page index.
   * @return a view (sublist) of the records on that page, or {@code null}
   *     when pageNumber is past the last page. Callers rely on the null
   *     sentinel to detect the end of the data, so it is kept as-is.
   */
  public List<T> readFromMemory(int pageNumber) {
    int startIndex = pageNumber * pageSize;
    if (startIndex >= dataList.size()) {
      return null;
    }
    int endIndex = Math.min(startIndex + pageSize, dataList.size());
    return dataList.subList(startIndex, endIndex);
  }

  /**
   * Returns the total number of pages.
   *
   * Uses ceiling division: the previous {@code size / pageSize + 1} reported
   * one extra page whenever size was an exact multiple of pageSize (and one
   * page for an empty dataset).
   *
   * @return the number of pages needed to hold all records; 0 when empty.
   */
  public int getPages() {
    return (dataList.size() + pageSize - 1) / pageSize;
  }
}
Loading