
[CUDA] Multi-GPU for CUDA Version #6138


Open
wants to merge 83 commits into master

Conversation

shiyu1994
Collaborator

This PR integrates multi-GPU support for the CUDA version, using NCCL.
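For readers new to the approach, below is a minimal single-process, multi-GPU NCCL sketch of the general pattern (not the code added in this PR): one communicator per device, plus an in-place allreduce that sums per-device histogram buffers so every GPU ends up with the global histogram. The buffer name and sizes are hypothetical.

#include <cuda_runtime.h>
#include <nccl.h>
#include <vector>

// Minimal sketch: one process drives all GPUs, one NCCL communicator per
// device, and an in-place allreduce sums the per-device histogram buffers.
int main() {
  int num_gpu = 0;
  cudaGetDeviceCount(&num_gpu);
  std::vector<int> devices(num_gpu);
  for (int i = 0; i < num_gpu; ++i) devices[i] = i;
  std::vector<ncclComm_t> comms(num_gpu);
  ncclCommInitAll(comms.data(), num_gpu, devices.data());

  const size_t hist_size = 1 << 20;  // hypothetical histogram length
  std::vector<double*> hist(num_gpu);
  std::vector<cudaStream_t> streams(num_gpu);
  for (int i = 0; i < num_gpu; ++i) {
    cudaSetDevice(i);
    cudaMalloc(&hist[i], hist_size * sizeof(double));
    cudaMemset(hist[i], 0, hist_size * sizeof(double));
    cudaStreamCreate(&streams[i]);
  }

  // sum the partial histograms across all devices
  ncclGroupStart();
  for (int i = 0; i < num_gpu; ++i) {
    ncclAllReduce(hist[i], hist[i], hist_size, ncclDouble, ncclSum, comms[i], streams[i]);
  }
  ncclGroupEnd();

  for (int i = 0; i < num_gpu; ++i) {
    cudaSetDevice(i);
    cudaStreamSynchronize(streams[i]);
    cudaStreamDestroy(streams[i]);
    cudaFree(hist[i]);
    ncclCommDestroy(comms[i]);
  }
  return 0;
}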

@shiyu1994 shiyu1994 requested a review from StrikerRUS October 10, 2023 15:31
@shiyu1994 shiyu1994 self-assigned this Oct 10, 2023
@shiyu1994 shiyu1994 changed the title [CUDA] Multi-GPU for CUDA Version [WIP] [CUDA] Multi-GPU for CUDA Version Oct 10, 2023
@shiyu1994 shiyu1994 changed the title [WIP] [CUDA] Multi-GPU for CUDA Version [CUDA] Multi-GPU for CUDA Version Dec 15, 2023
@shiyu1994 shiyu1994 closed this Dec 15, 2023
@shiyu1994 shiyu1994 reopened this Dec 15, 2023
@shiyu1994
Collaborator Author

This is ready for review. @guolinke @jameslamb @StrikerRUS Could you help review this when you have time? Let's get this merged soon.

Collaborator

@guolinke guolinke left a comment


Thank you, overall it looks good to me.

Collaborator

@StrikerRUS StrikerRUS left a comment


Unfortunately, I'm not qualified to review cpp/CUDA code, but I left some suggestions that I believe may improve this PR.

Comment on lines +1129 to +1130
// desc = List of CUDA device IDs used when device_type=cuda
// desc = When empty, the devices with the smallest IDs will be used
Collaborator


Suggested change
// desc = List of CUDA device IDs used when device_type=cuda
// desc = When empty, the devices with the smallest IDs will be used
// desc = list of CUDA device IDs
// desc = **Note**: can be used only in CUDA implementation (``device_type="cuda"``) and when ``num_gpu>1``
// desc = if empty, the devices with the smallest IDs will be used

// desc = set this to ``true`` to use double precision math on GPU (by default single precision is used)
// desc = **Note**: can be used only in OpenCL implementation (``device_type="gpu"``), in CUDA implementation only double precision is currently supported
bool gpu_use_dp = false;

// check = >0
// desc = number of GPUs
// desc = **Note**: can be used only in CUDA implementation (``device_type="cuda"``)
// desc = When <= 0, only 1 GPU will be used
Collaborator


Because of // check = >0.

Suggested change
// desc = When <= 0, only 1 GPU will be used
// desc = if ``0``, only 1 GPU will be used
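For illustration, the parameters discussed above would be combined like this in a LightGBM config file (values are hypothetical; num_gpu and gpu_device_list are the names used in this PR):

# hypothetical multi-GPU configuration
device_type=cuda
num_gpu=2
gpu_device_list=0,1

If gpu_device_list is non-empty and its length disagrees with num_gpu, the warning discussed in the next comment is emitted.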

}
}
if (!gpu_list_.empty() && num_gpu_ != static_cast<int>(gpu_list_.size())) {
Log::Warning("num_gpu_ = %d is different from the number of valid device IDs in gpu_device_list (%d), using %d GPUs instead.", \
Collaborator


Users don't know about the internal num_gpu_; they can only set the num_gpu parameter. So this message can be misleading.

Suggested change
Log::Warning("num_gpu_ = %d is different from the number of valid device IDs in gpu_device_list (%d), using %d GPUs instead.", \
Log::Warning("num_gpu = %d is different from the number of valid device IDs in gpu_device_list (%d), using %d GPUs instead.", \

if (Network::num_machines() == 1 || Network::rank() == 0) {
NCCLCHECK(ncclGetUniqueId(&nccl_unique_id));
}
if (Network::num_machines() > 1) {
Collaborator


Is this the multi-node multi-GPU case?
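For context, a typical multi-node NCCL bootstrap looks like the sketch below: rank 0 creates the unique ID, every rank receives a copy, and each process then joins a single world-wide communicator. This PR presumably distributes the ID through LightGBM's Network layer; the sketch uses MPI only as a stand-in transport and assumes one GPU per rank.

#include <cuda_runtime.h>
#include <mpi.h>
#include <nccl.h>

// Illustrative multi-node bootstrap, not this PR's code.
int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank = 0, nranks = 1;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  ncclUniqueId nccl_unique_id;
  if (rank == 0) {
    ncclGetUniqueId(&nccl_unique_id);
  }
  // distribute the ID created on rank 0 to all other ranks
  MPI_Bcast(&nccl_unique_id, sizeof(nccl_unique_id), MPI_BYTE, 0, MPI_COMM_WORLD);

  cudaSetDevice(0);  // assuming one GPU per rank for simplicity
  ncclComm_t comm;
  ncclCommInitRank(&comm, nranks, nccl_unique_id, rank);

  // ... collectives such as ncclAllReduce would run here ...

  ncclCommDestroy(comm);
  MPI_Finalize();
  return 0;
}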

@@ -1126,13 +1126,18 @@ struct Config {
// desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
Collaborator


Suggested change
// desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
// desc = in multi-GPU case (``num_gpu>1``) means ID of the master GPU
// desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details

cuda_hist_pool_,
cuda_leaf_output_, cuda_split_info_buffer_);

#define SPLI_TREE_ARGS \
Collaborator


Suggested change
#define SPLI_TREE_ARGS \
#define SPLIT_TREE_ARGS \

Comment on lines +1037 to +1049
SplitTreeStructureKernel<true, true><<<4, 5, 0, cuda_streams_[0]>>>(SPLI_TREE_ARGS);
} else {
SplitTreeStructureKernel<true, false><<<4, 5, 0, cuda_streams_[0]>>>(SPLI_TREE_ARGS);
}
} else {
if (use_quantized_grad_) {
SplitTreeStructureKernel<false, true><<<4, 5, 0, cuda_streams_[0]>>>(SPLI_TREE_ARGS);
} else {
SplitTreeStructureKernel<false, false><<<4, 5, 0, cuda_streams_[0]>>>(SPLI_TREE_ARGS);
}
}

#undef SPLI_TREE_ARGS
Collaborator


Suggested change
SplitTreeStructureKernel<true, true><<<4, 5, 0, cuda_streams_[0]>>>(SPLI_TREE_ARGS);
} else {
SplitTreeStructureKernel<true, false><<<4, 5, 0, cuda_streams_[0]>>>(SPLI_TREE_ARGS);
}
} else {
if (use_quantized_grad_) {
SplitTreeStructureKernel<false, true><<<4, 5, 0, cuda_streams_[0]>>>(SPLI_TREE_ARGS);
} else {
SplitTreeStructureKernel<false, false><<<4, 5, 0, cuda_streams_[0]>>>(SPLI_TREE_ARGS);
}
}
#undef SPLI_TREE_ARGS
SplitTreeStructureKernel<true, true><<<4, 5, 0, cuda_streams_[0]>>>(SPLIT_TREE_ARGS);
} else {
SplitTreeStructureKernel<true, false><<<4, 5, 0, cuda_streams_[0]>>>(SPLIT_TREE_ARGS);
}
} else {
if (use_quantized_grad_) {
SplitTreeStructureKernel<false, true><<<4, 5, 0, cuda_streams_[0]>>>(SPLIT_TREE_ARGS);
} else {
SplitTreeStructureKernel<false, false><<<4, 5, 0, cuda_streams_[0]>>>(SPLIT_TREE_ARGS);
}
}
#undef SPLIT_TREE_ARGS

double* cuda_sum_of_gradients,
double* cuda_sum_of_hessians,
const data_size_t num_data) {
__shared__ double shared_mem_buffer[32];
Collaborator


I guess it should be WARPSIZE after #6086 is merged.

Suggested change
__shared__ double shared_mem_buffer[32];
__shared__ double shared_mem_buffer[WARPSIZE];

double* cuda_sum_of_hessians,
int64_t* cuda_sum_of_gradients_hessians,
const data_size_t num_data) {
__shared__ double shared_mem_buffer[32];
Collaborator


Suggested change
__shared__ double shared_mem_buffer[32];
__shared__ double shared_mem_buffer[WARPSIZE];
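For context on the magic number: the common pattern is a block-level sum in which each warp first reduces its values with shuffles, the per-warp partial sums go into a shared buffer with one slot per warp (at most 1024 / 32 = 32 slots on current GPUs), and the first warp then reduces that buffer. Below is a minimal sketch of that pattern, assuming a WARPSIZE macro as in the suggestions above; it is not LightGBM's actual reduction helper, and it assumes blockDim.x is a multiple of WARPSIZE.

#include <cuda_runtime.h>

#define WARPSIZE 32  // assumed warp-size constant, mirroring the suggestion above

// Block-level sum: warp-local shuffle reduction, one shared slot per warp,
// then the first warp reduces the per-warp partial sums.
__global__ void BlockSumKernel(const double* values, double* block_sums, const int num_data) {
  __shared__ double shared_mem_buffer[WARPSIZE];  // one entry per warp, at most 32 warps per block
  const int tid = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
  double value = tid < num_data ? values[tid] : 0.0;
  // reduce within each warp via shuffles
  for (int offset = WARPSIZE / 2; offset > 0; offset >>= 1) {
    value += __shfl_down_sync(0xffffffffu, value, offset);
  }
  const int warp_id = threadIdx.x / WARPSIZE;
  const int lane_id = threadIdx.x % WARPSIZE;
  if (lane_id == 0) {
    shared_mem_buffer[warp_id] = value;  // lane 0 holds this warp's sum
  }
  __syncthreads();
  // the first warp reduces the per-warp partial sums
  if (warp_id == 0) {
    const int num_warps = blockDim.x / WARPSIZE;
    value = lane_id < num_warps ? shared_mem_buffer[lane_id] : 0.0;
    for (int offset = WARPSIZE / 2; offset > 0; offset >>= 1) {
      value += __shfl_down_sync(0xffffffffu, value, offset);
    }
    if (lane_id == 0) {
      block_sums[blockIdx.x] = value;
    }
  }
}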

const uint8_t smaller_leaf_num_bits_bin = nccl_communicator_ == nullptr ?
cuda_gradient_discretizer_->GetHistBitsInLeaf<false>(smaller_leaf_index_) :
cuda_gradient_discretizer_->GetHistBitsInLeaf<true>(smaller_leaf_index_);
const uint8_t larger_leaf_num_bits_bin = larger_leaf_index_ < 0 ? 32 : (nccl_communicator_ == nullptr ?
Collaborator


32 is always used regardless of the value of the num_grad_quant_bins parameter, right?
