Skip to content

HDDS-8783. Add metrics for volume scanner #8448

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
HDDS-8783. Add metrics for volume scanner
  • Loading branch information
sarvekshayr committed May 13, 2025
commit c2431ef0bcd8ecfc8c4cbc6bb8cab0ae491da4b2
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,7 @@ public class StorageVolumeChecker {

private AsyncChecker<Boolean, VolumeCheckResult> delegateChecker;

private final AtomicLong numVolumeChecks = new AtomicLong(0);
private final AtomicLong numAllVolumeChecks = new AtomicLong(0);
private final AtomicLong numAllVolumeSetsChecks = new AtomicLong(0);
private final AtomicLong numSkippedChecks = new AtomicLong(0);
private final StorageVolumeCheckerMetrics metrics;

/**
* Max allowed time for a disk check in milliseconds. If the check
Expand Down Expand Up @@ -108,6 +105,8 @@ public class StorageVolumeChecker {
public StorageVolumeChecker(ConfigurationSource conf, Timer timer,
String threadNamePrefix) {

metrics = StorageVolumeCheckerMetrics.create();

this.timer = timer;

dnConf = conf.getObject(DatanodeConfiguration.class);
Expand Down Expand Up @@ -165,7 +164,7 @@ public synchronized void registerVolumeSet(VolumeSet volumeSet) {
public synchronized void checkAllVolumeSets() {
final long gap = timer.monotonicNow() - lastAllVolumeSetsCheckComplete;
if (gap < minDiskCheckGapMs) {
numSkippedChecks.incrementAndGet();
metrics.incNumSkippedChecks();
if (LOG.isTraceEnabled()) {
LOG.trace(
"Skipped checking all volumes, time since last check {} is less " +
Expand All @@ -181,7 +180,7 @@ public synchronized void checkAllVolumeSets() {
}

lastAllVolumeSetsCheckComplete = timer.monotonicNow();
numAllVolumeSetsChecks.incrementAndGet();
metrics.incNumAllVolumeSetsChecks();
} catch (IOException e) {
LOG.warn("Exception while checking disks", e);
}
Expand Down Expand Up @@ -232,7 +231,7 @@ public Set<? extends StorageVolume> checkAllVolumes(
maxAllowedTimeForCheckMs);
}

numAllVolumeChecks.incrementAndGet();
metrics.incNumAllVolumeChecks();
synchronized (this) {
// All volumes that have not been detected as healthy should be
// considered failed. This is a superset of 'failedVolumes'.
Expand Down Expand Up @@ -277,7 +276,7 @@ public boolean checkVolume(final StorageVolume volume, Callback callback) {
Optional<ListenableFuture<VolumeCheckResult>> olf =
delegateChecker.schedule(volume, null);
if (olf.isPresent()) {
numVolumeChecks.incrementAndGet();
metrics.incNumVolumeChecks();
Futures.addCallback(olf.get(),
new ResultHandler(volume,
ConcurrentHashMap.newKeySet(), ConcurrentHashMap.newKeySet(),
Expand Down Expand Up @@ -401,6 +400,7 @@ public void shutdownAndWait(int gracePeriod, TimeUnit timeUnit) {
periodicDiskChecker.cancel(true);
diskCheckerservice.shutdownNow();
checkVolumeResultHandlerExecutorService.shutdownNow();
metrics.unregister();
try {
delegateChecker.shutdownAndWait(gracePeriod, timeUnit);
} catch (InterruptedException e) {
Expand All @@ -422,32 +422,8 @@ void setDelegateChecker(
delegateChecker = testDelegate;
}

/**
* Return the number of {@link #checkVolume} invocations.
*/
public long getNumVolumeChecks() {
return numVolumeChecks.get();
}

/**
* Return the number of {@link #checkAllVolumes(Collection)} invocations.
*/
public long getNumAllVolumeChecks() {
return numAllVolumeChecks.get();
}

/**
* Return the number of {@link #checkAllVolumeSets()} invocations.
*/
public long getNumAllVolumeSetsChecks() {
return numAllVolumeSetsChecks.get();
}

/**
* Return the number of checks skipped because the minimum gap since the
* last check had not elapsed.
*/
public long getNumSkippedChecks() {
return numSkippedChecks.get();
@VisibleForTesting
public StorageVolumeCheckerMetrics getMetrics() {
return metrics;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.ozone.container.common.volume;

import org.apache.hadoop.hdds.annotation.InterfaceAudience;
import org.apache.hadoop.metrics2.MetricsSystem;
import org.apache.hadoop.metrics2.annotation.Metric;
import org.apache.hadoop.metrics2.annotation.Metrics;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.metrics2.lib.MutableCounterLong;
import java.util.Collection;

/**
 * Metrics source holding the counters maintained by the datanode volume
 * scanner ({@link StorageVolumeChecker}).
 *
 * <p>Use {@link #create()} to build an instance that is registered with the
 * default metrics system, and {@link #unregister()} to remove it again.
 */
@InterfaceAudience.Private
@Metrics(about = "Datanode volume scanner metrics", context = "dfs")
public class StorageVolumeCheckerMetrics {
  /** Source name used by {@link #create()} when registering. */
  private static final String SOURCE_NAME = "Volume scanner metrics";

  /** Name this source was registered under; needed to unregister it. */
  private final String name;
  /** Metrics system this source belongs to. */
  private final MetricsSystem ms;

  @Metric("number of volume checks")
  private MutableCounterLong numVolumeChecks;

  @Metric("number of all volume checks")
  private MutableCounterLong numAllVolumeChecks;

  @Metric("number of all volume sets checks")
  private MutableCounterLong numAllVolumeSetsChecks;

  @Metric("number of checks skipped because the minimum gap since the last check had not elapsed")
  private MutableCounterLong numSkippedChecks;

  /**
   * Builds a metrics holder without registering it.
   *
   * @param name source name later passed to the metrics system by
   *             {@link #unregister()}
   * @param ms   metrics system the source is associated with
   */
  public StorageVolumeCheckerMetrics(String name, MetricsSystem ms) {
    this.name = name;
    this.ms = ms;
  }

  /**
   * Creates a new instance and registers it with the default metrics system.
   *
   * @return the source object handed back by the metrics system
   */
  public static StorageVolumeCheckerMetrics create() {
    final MetricsSystem metricsSystem = DefaultMetricsSystem.instance();
    final StorageVolumeCheckerMetrics source =
        new StorageVolumeCheckerMetrics(SOURCE_NAME, metricsSystem);
    return metricsSystem.register(SOURCE_NAME, null, source);
  }

  /**
   * @return current value of the counter tracking
   *         {@link StorageVolumeChecker#checkVolume} checks
   */
  public long getNumVolumeChecks() {
    return this.numVolumeChecks.value();
  }

  /** Bumps the single-volume check counter. */
  public void incNumVolumeChecks() {
    this.numVolumeChecks.incr();
  }

  /**
   * @return current value of the counter tracking
   *         {@link StorageVolumeChecker#checkAllVolumes(Collection)} invocations
   */
  public long getNumAllVolumeChecks() {
    return this.numAllVolumeChecks.value();
  }

  /** Bumps the all-volumes check counter. */
  public void incNumAllVolumeChecks() {
    this.numAllVolumeChecks.incr();
  }

  /**
   * @return current value of the counter tracking
   *         {@link StorageVolumeChecker#checkAllVolumeSets()} invocations
   */
  public long getNumAllVolumeSetsChecks() {
    return this.numAllVolumeSetsChecks.value();
  }

  /** Bumps the all-volume-sets check counter. */
  public void incNumAllVolumeSetsChecks() {
    this.numAllVolumeSetsChecks.incr();
  }

  /**
   * @return number of checks that were skipped because the minimum gap since
   *         the last check had not yet elapsed
   */
  public long getNumSkippedChecks() {
    return this.numSkippedChecks.value();
  }

  /** Bumps the skipped-check counter. */
  public void incNumSkippedChecks() {
    this.numSkippedChecks.incr();
  }

  /** Removes this source from its metrics system, using its stored name. */
  public void unregister() {
    this.ms.unregisterSource(this.name);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,7 @@
*/
public class TestPeriodicVolumeChecker {

private static final Logger LOG = LoggerFactory.getLogger(
TestPeriodicVolumeChecker.class);
private static final Logger LOG = LoggerFactory.getLogger(TestPeriodicVolumeChecker.class);

@TempDir
private Path folder;
Expand All @@ -63,52 +62,47 @@ public void setup() throws IOException {
public void testPeriodicVolumeChecker(TestInfo testInfo) throws Exception {
LOG.info("Executing {}", testInfo.getTestMethod());

DatanodeConfiguration dnConf =
conf.getObject(DatanodeConfiguration.class);
DatanodeConfiguration dnConf = conf.getObject(DatanodeConfiguration.class);
Duration gap = dnConf.getDiskCheckMinGap();
Duration interval = Duration.ofMinutes(
dnConf.getPeriodicDiskCheckIntervalMinutes());
Duration interval = Duration.ofMinutes(dnConf.getPeriodicDiskCheckIntervalMinutes());

FakeTimer timer = new FakeTimer();

StorageVolumeChecker volumeChecker = new StorageVolumeChecker(conf, timer,
"");
StorageVolumeChecker volumeChecker = new StorageVolumeChecker(conf, timer, "");
StorageVolumeCheckerMetrics metrics = volumeChecker.getMetrics();

try {
volumeChecker.registerVolumeSet(new ImmutableVolumeSet(makeVolumes(
2, HEALTHY)));
volumeChecker.registerVolumeSet(new ImmutableVolumeSet(makeVolumes(
1, HEALTHY)));
volumeChecker.setDelegateChecker(
new TestStorageVolumeChecker.DummyChecker());
volumeChecker.registerVolumeSet(new ImmutableVolumeSet(makeVolumes(2, HEALTHY)));
volumeChecker.registerVolumeSet(new ImmutableVolumeSet(makeVolumes(1, HEALTHY)));
volumeChecker.setDelegateChecker(new TestStorageVolumeChecker.DummyChecker());

assertEquals(0, volumeChecker.getNumAllVolumeChecks());
assertEquals(0, volumeChecker.getNumAllVolumeSetsChecks());
assertEquals(0, metrics.getNumAllVolumeChecks());
assertEquals(0, metrics.getNumAllVolumeSetsChecks());

// first round
timer.advance(gap.toMillis() / 3);
volumeChecker.checkAllVolumeSets();

assertEquals(2, volumeChecker.getNumAllVolumeChecks());
assertEquals(1, volumeChecker.getNumAllVolumeSetsChecks());
assertEquals(0, volumeChecker.getNumSkippedChecks());
assertEquals(2, metrics.getNumAllVolumeChecks());
assertEquals(1, metrics.getNumAllVolumeSetsChecks());
assertEquals(0, metrics.getNumSkippedChecks());

// periodic disk checker next round within gap
timer.advance(gap.toMillis() / 3);
volumeChecker.checkAllVolumeSets();

// skipped next round
assertEquals(2, volumeChecker.getNumAllVolumeChecks());
assertEquals(1, volumeChecker.getNumAllVolumeSetsChecks());
assertEquals(1, volumeChecker.getNumSkippedChecks());
assertEquals(2, metrics.getNumAllVolumeChecks());
assertEquals(1, metrics.getNumAllVolumeSetsChecks());
assertEquals(1, metrics.getNumSkippedChecks());

// periodic disk checker next round
timer.advance(interval.toMillis());
volumeChecker.checkAllVolumeSets();

assertEquals(4, volumeChecker.getNumAllVolumeChecks());
assertEquals(2, volumeChecker.getNumAllVolumeSetsChecks());
assertEquals(1, volumeChecker.getNumSkippedChecks());
assertEquals(4, metrics.getNumAllVolumeChecks());
assertEquals(2, metrics.getNumAllVolumeSetsChecks());
assertEquals(1, metrics.getNumSkippedChecks());
} finally {
volumeChecker.shutdownAndWait(1, TimeUnit.SECONDS);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ public static void restoreBadVolume(StorageVolume vol) {
public static void waitForCheckVolume(MutableVolumeSet volSet,
long numOfChecks) throws Exception {
GenericTestUtils.waitFor(
() -> numOfChecks == volSet.getVolumeChecker().getNumVolumeChecks(),
() -> numOfChecks == volSet.getVolumeChecker().getMetrics().getNumVolumeChecks(),
100, 10000);
}

Expand Down