[CUDA] Multi-GPU for CUDA Version #6138
Conversation
This is ready for review. @guolinke @jameslamb @StrikerRUS Could you help to review this when you have time? Let's get this merged soon.
Thank you, it overall looks good to me.
Unfortunately, I'm not qualified to review cpp/CUDA code, but I left some suggestions that I believe may improve this PR.
// desc = List of CUDA device IDs used when device_type=cuda
// desc = When empty, the devices with the smallest IDs will be used
Suggested change:
- // desc = List of CUDA device IDs used when device_type=cuda
- // desc = When empty, the devices with the smallest IDs will be used
+ // desc = list of CUDA device IDs
+ // desc = **Note**: can be used only in CUDA implementation (``device_type="cuda"``) and when ``num_gpu>1``
+ // desc = if empty, the devices with the smallest IDs will be used
// desc = set this to ``true`` to use double precision math on GPU (by default single precision is used)
// desc = **Note**: can be used only in OpenCL implementation (``device_type="gpu"``), in CUDA implementation only double precision is currently supported
bool gpu_use_dp = false;

// check = >0
// desc = number of GPUs
// desc = **Note**: can be used only in CUDA implementation (``device_type="cuda"``)
// desc = When <= 0, only 1 GPU will be used
Because of `// check = >0`.
Suggested change:
- // desc = When <= 0, only 1 GPU will be used
+ // desc = if ``0``, only 1 GPU will be used
} | ||
} | ||
if (!gpu_list_.empty() && num_gpu_ != static_cast<int>(gpu_list_.size())) { | ||
Log::Warning("num_gpu_ = %d is different from the number of valid device IDs in gpu_device_list (%d), using %d GPUs instead.", \ |
Users don't know about the internal `num_gpu_`; they can only set the `num_gpu` param. So this message can be misleading.
Suggested change:
- Log::Warning("num_gpu_ = %d is different from the number of valid device IDs in gpu_device_list (%d), using %d GPUs instead.", \
+ Log::Warning("num_gpu = %d is different from the number of valid device IDs in gpu_device_list (%d), using %d GPUs instead.", \
if (Network::num_machines() == 1 || Network::rank() == 0) {
  NCCLCHECK(ncclGetUniqueId(&nccl_unique_id));
}
if (Network::num_machines() > 1) {
Is this the multi-node multi-GPU case?
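For context, a rough sketch (hypothetical function and a stand-in `broadcast` callback for the network layer; `NCCLCHECK` redefined locally, not the PR's code) of the usual NCCL bootstrap the snippet above appears to follow: only rank 0 creates the unique id, it is broadcast to the other machines, and every local GPU then joins the communicator with its global rank.

```cpp
#include <cuda_runtime.h>
#include <nccl.h>
#include <vector>

// Stand-in error check; the PR defines its own NCCLCHECK.
#define NCCLCHECK(cmd) do { \
  ncclResult_t r = (cmd);   \
  if (r != ncclSuccess) { /* log and abort in real code */ } \
} while (0)

void InitNCCLExample(int num_machines, int machine_rank, int num_local_gpus,
                     void (*broadcast)(void* data, int size, int root)) {
  ncclUniqueId nccl_unique_id;
  if (num_machines == 1 || machine_rank == 0) {
    NCCLCHECK(ncclGetUniqueId(&nccl_unique_id));
  }
  if (num_machines > 1) {
    // Multi-node multi-GPU: ship rank 0's id to all machines,
    // e.g. over the existing socket/MPI network layer.
    broadcast(&nccl_unique_id, sizeof(nccl_unique_id), /*root=*/0);
  }
  const int total_ranks = num_machines * num_local_gpus;
  std::vector<ncclComm_t> comms(num_local_gpus);
  // Group the per-GPU inits: required when one process/thread
  // initializes several ranks of the same communicator.
  NCCLCHECK(ncclGroupStart());
  for (int local_gpu = 0; local_gpu < num_local_gpus; ++local_gpu) {
    cudaSetDevice(local_gpu);
    const int global_rank = machine_rank * num_local_gpus + local_gpu;
    NCCLCHECK(ncclCommInitRank(&comms[local_gpu], total_ranks,
                               nccl_unique_id, global_rank));
  }
  NCCLCHECK(ncclGroupEnd());
}
```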
@@ -1126,13 +1126,18 @@ struct Config {
// desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
Suggested change:
- // desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
+ // desc = in multi-GPU case (``num_gpu>1``) means ID of the master GPU
+ // desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
                        cuda_hist_pool_,
                        cuda_leaf_output_, cuda_split_info_buffer_);

#define SPLI_TREE_ARGS \
Suggested change:
- #define SPLI_TREE_ARGS \
+ #define SPLIT_TREE_ARGS \
    SplitTreeStructureKernel<true, true><<<4, 5, 0, cuda_streams_[0]>>>(SPLI_TREE_ARGS);
  } else {
    SplitTreeStructureKernel<true, false><<<4, 5, 0, cuda_streams_[0]>>>(SPLI_TREE_ARGS);
  }
} else {
  if (use_quantized_grad_) {
    SplitTreeStructureKernel<false, true><<<4, 5, 0, cuda_streams_[0]>>>(SPLI_TREE_ARGS);
  } else {
    SplitTreeStructureKernel<false, false><<<4, 5, 0, cuda_streams_[0]>>>(SPLI_TREE_ARGS);
  }
}

#undef SPLI_TREE_ARGS
Suggested change (use the corrected macro name throughout):
    SplitTreeStructureKernel<true, true><<<4, 5, 0, cuda_streams_[0]>>>(SPLIT_TREE_ARGS);
  } else {
    SplitTreeStructureKernel<true, false><<<4, 5, 0, cuda_streams_[0]>>>(SPLIT_TREE_ARGS);
  }
} else {
  if (use_quantized_grad_) {
    SplitTreeStructureKernel<false, true><<<4, 5, 0, cuda_streams_[0]>>>(SPLIT_TREE_ARGS);
  } else {
    SplitTreeStructureKernel<false, false><<<4, 5, 0, cuda_streams_[0]>>>(SPLIT_TREE_ARGS);
  }
}
#undef SPLIT_TREE_ARGS
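To illustrate the pattern being discussed, a small self-contained sketch (hypothetical kernel and macro names, not the PR's code): a single argument macro keeps the four template instantiations in sync, the two compile-time flags select one specialization, and the macro is removed right after the dispatch.

```cuda
#include <cuda_runtime.h>

// Dead branches on the compile-time flags are eliminated by the compiler,
// so each of the four instantiations contains only the code it needs.
template <bool USE_SMALLER_LEAF, bool USE_QUANTIZED_GRAD>
__global__ void ExampleDispatchKernel(const int* in, int* out, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = USE_QUANTIZED_GRAD ? (in[i] >> 1) : in[i];
  }
}

void LaunchExampleDispatch(bool use_smaller_leaf, bool use_quantized_grad,
                           const int* in, int* out, int n, cudaStream_t stream) {
  const int block_size = 256;
  const int num_blocks = (n + block_size - 1) / block_size;
// The macro keeps the (usually long) argument list identical across the four
// launches and is #undef'd again right after the dispatch.
#define EXAMPLE_DISPATCH_ARGS in, out, n
  if (use_smaller_leaf) {
    if (use_quantized_grad) {
      ExampleDispatchKernel<true, true><<<num_blocks, block_size, 0, stream>>>(EXAMPLE_DISPATCH_ARGS);
    } else {
      ExampleDispatchKernel<true, false><<<num_blocks, block_size, 0, stream>>>(EXAMPLE_DISPATCH_ARGS);
    }
  } else {
    if (use_quantized_grad) {
      ExampleDispatchKernel<false, true><<<num_blocks, block_size, 0, stream>>>(EXAMPLE_DISPATCH_ARGS);
    } else {
      ExampleDispatchKernel<false, false><<<num_blocks, block_size, 0, stream>>>(EXAMPLE_DISPATCH_ARGS);
    }
  }
#undef EXAMPLE_DISPATCH_ARGS
}
```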
    double* cuda_sum_of_gradients,
    double* cuda_sum_of_hessians,
    const data_size_t num_data) {
  __shared__ double shared_mem_buffer[32];
I guess it should be `WARPSIZE` after #6086 is merged.
Suggested change:
- __shared__ double shared_mem_buffer[32];
+ __shared__ double shared_mem_buffer[WARPSIZE];
    double* cuda_sum_of_hessians,
    int64_t* cuda_sum_of_gradients_hessians,
    const data_size_t num_data) {
  __shared__ double shared_mem_buffer[32];
Suggested change:
- __shared__ double shared_mem_buffer[32];
+ __shared__ double shared_mem_buffer[WARPSIZE];
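For background on why the buffer is sized 32 (i.e. `WARPSIZE`): with at most 1024 threads per block there are at most 32 warps, and the buffer holds one partial result per warp. A rough sketch of the usual block-sum pattern (hypothetical helper names, assuming `blockDim.x` is a multiple of the warp size; not the PR's actual reduction code):

```cuda
#include <cuda_runtime.h>

#ifndef WARPSIZE
#define WARPSIZE 32
#endif

// Reduce a value across one warp using register shuffles.
__device__ double WarpReduceSum(double val) {
  for (int offset = WARPSIZE / 2; offset > 0; offset >>= 1) {
    val += __shfl_down_sync(0xffffffff, val, offset);
  }
  return val;
}

// Reduce a value across the whole block; the result is valid in thread 0.
__device__ double BlockReduceSum(double val) {
  __shared__ double shared_mem_buffer[WARPSIZE];  // one slot per warp (<= 32 warps)
  const int lane = threadIdx.x % WARPSIZE;
  const int warp_id = threadIdx.x / WARPSIZE;
  val = WarpReduceSum(val);
  if (lane == 0) {
    shared_mem_buffer[warp_id] = val;  // warp leaders publish partial sums
  }
  __syncthreads();
  // The first warp reduces the per-warp partial sums.
  const int num_warps = blockDim.x / WARPSIZE;
  val = (threadIdx.x < num_warps) ? shared_mem_buffer[lane] : 0.0;
  if (warp_id == 0) {
    val = WarpReduceSum(val);
  }
  return val;
}
```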
const uint8_t smaller_leaf_num_bits_bin = nccl_communicator_ == nullptr ?
  cuda_gradient_discretizer_->GetHistBitsInLeaf<false>(smaller_leaf_index_) :
  cuda_gradient_discretizer_->GetHistBitsInLeaf<true>(smaller_leaf_index_);
const uint8_t larger_leaf_num_bits_bin = larger_leaf_index_ < 0 ? 32 : (nccl_communicator_ == nullptr ?
`32` is always used here regardless of the value of the `num_grad_quant_bins` parameter, right?
This PR integrates multi-GPU support into the CUDA version, using NCCL.