Lesson 5.1 CUDA Streams
CUDA streams: stream parallelism on the GPU is similar to task parallelism on the CPU. Each stream can be viewed as an independent task, and the operations within a stream execute sequentially.
A CUDA stream represents a queue of GPU operations that execute in the order they were added to the queue. Streams enable task-level parallelism: for example, the GPU can execute a kernel while data is being exchanged between host and device (provided the GPU supports overlap, i.e. the device property deviceOverlap is true).
Before using CUDA streams, first confirm that the device supports them; this can be verified with the code below.
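A minimal sketch of that check, querying device 0's deviceOverlap property (the printed messages are illustrative):
#include "cuda_runtime.h"
#include <stdio.h>

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0); // query device 0
    if (!prop.deviceOverlap) {
        printf("Device does not support overlap; streams will not speed things up.\n");
        return 1;
    }
    printf("Device supports overlap; CUDA streams can be used.\n");
    return 0;
}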
If CUDA streams are available, we can move on to defining, creating, and destroying streams:
cudaStream_t stream;                 // define a stream
cudaStreamCreate(cudaStream_t * s)   // create a stream
cudaStreamDestroy(cudaStream_t s)    // destroy a stream
When streams depend on one another, stream synchronization comes into play:
Explicit synchronization:
cudaStreamSynchronize(cudaStream_t s)                                  // synchronize one stream: wait until all commands in that stream have completed
cudaDeviceSynchronize()                                                // synchronize all streams: wait until all streams on the device have completed (takes no arguments)
cudaStreamWaitEvent(cudaStream_t s, cudaEvent_t e, unsigned int flags) // via an event: commands in the stream wait until the given event has fired
cudaStreamQuery(cudaStream_t s)                                        // query whether a stream's work has completed (the argument is the stream handle)
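As a brief sketch of cross-stream ordering with cudaStreamWaitEvent (the producer/consumer kernels are illustrative, not part of this lesson's example):
__global__ void producer(int* x) { *x = 42; }
__global__ void consumer(int* x, int* y) { *y = *x + 1; }

void orderTwoStreams(int* d_x, int* d_y) {
    cudaStream_t s1, s2;
    cudaStreamCreate(&s1);
    cudaStreamCreate(&s2);
    cudaEvent_t ev;
    cudaEventCreate(&ev);
    producer<<<1, 1, 0, s1>>>(d_x);       // runs in stream s1
    cudaEventRecord(ev, s1);              // mark the completion point in s1
    cudaStreamWaitEvent(s2, ev, 0);       // s2 blocks here until ev has fired
    consumer<<<1, 1, 0, s2>>>(d_x, d_y);  // guaranteed to see producer's write
    cudaStreamDestroy(s1);                // destruction waits for pending work
    cudaStreamDestroy(s2);
    cudaEventDestroy(ev);
}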
Now for how streams are used:
When completing a task on the GPU, there are two places where a stream can be used:
1. A kernel defined with __global__ can be submitted to a stream:
Kernel<<<gridDim, blockDim, shared_memory_size, stream>>>(parameters)
2. Data transfers between GPU and CPU can be handed to a stream:
cudaMemcpyAsync(dst, src, copy_size, copy_direction, stream)
Code example: adding two 100-element arrays with streams. The program times the same 100 one-thread kernel launches twice, once spread across 100 streams and once on the default stream, so the two printed timings can be compared.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
using namespace std;
__global__ void addKernel(int* c, int* a, int* b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}
// Stream parallelism
void myTestCalcStream(void)
{
    int pDataA[100] = { 0 };
    int pDataB[100] = { 0 };
    int pDataC[100] = { 0 };
    for (int i = 0; i < 100; i++) {
        pDataA[i] = i;
        pDataB[i] = 10 + i;
        pDataC[i] = 0;
    }
    // Allocate device memory for A, B, and C
    int* pDevDataA = nullptr, * pDevDataB = nullptr, * pDevDataC = nullptr;
    cudaMalloc(&pDevDataA, sizeof(int) * 100);
    cudaMalloc(&pDevDataB, sizeof(int) * 100);
    cudaMalloc(&pDevDataC, sizeof(int) * 100);
    // Copy the inputs to the device
    cudaMemcpy(pDevDataA, pDataA, sizeof(int) * 100, cudaMemcpyHostToDevice);
    cudaMemcpy(pDevDataB, pDataB, sizeof(int) * 100, cudaMemcpyHostToDevice);
    cudaStream_t streams[100];
    for (int i = 0; i < 100; ++i)
        cudaStreamCreate(streams + i);
    {
        // Launch the kernels across the streams and time them
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
        for (int i = 0; i < 100; ++i)
            addKernel<<<1, 1, 0, streams[i]>>>(pDevDataC + i, pDevDataA + i, pDevDataB + i);
        /*for (int i = 50; i < 100; ++i)
            cudaStreamSynchronize(streams[i]);*/
        cudaDeviceSynchronize();
        /*cudaThreadSynchronize();*/ // deprecated alias of cudaDeviceSynchronize
        // Print the elapsed kernel time
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(start);
        cudaEventSynchronize(stop);
        float elapsedTime;
        cudaEventElapsedTime(&elapsedTime, start, stop);
        printf("Kernel time(ms) : %f\n", elapsedTime);
    }
    {
        // Launch the same kernels on the default stream and time them
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
        for (int i = 0; i < 100; ++i)
            addKernel<<<1, 1, 0>>>(pDevDataC + i, pDevDataA + i, pDevDataB + i);
        // Print the elapsed kernel time
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(start);
        cudaEventSynchronize(stop);
        float elapsedTime;
        cudaEventElapsedTime(&elapsedTime, start, stop);
        printf("Kernel time(ms) : %f\n", elapsedTime);
    }
    // Copy back the first few results and print them
    cudaMemcpy(pDataC, pDevDataC, sizeof(int) * 5, cudaMemcpyDeviceToHost);
    printf("Stream Calc Result is: %d, %d, %d, %d, %d\n", pDataC[0], pDataC[1], pDataC[2], pDataC[3], pDataC[4]);
    for (int i = 0; i < 100; ++i)
        cudaStreamDestroy(streams[i]);
    cudaFree(pDevDataA);
    cudaFree(pDevDataB);
    cudaFree(pDevDataC);
}
int main() {
    myTestCalcStream();
    return 0;
}
Finally, a look at page-locked (pinned) memory, which is usually used together with streams:
Pinned (page-locked) memory is host memory that can exchange data with the GPU directly over PCIe. It differs from ordinary pageable memory in two ways: 1. apart from how it is allocated and freed, it is used much like ordinary memory; 2. physically, a transfer from pageable memory to the GPU is first staged through a pinned buffer, whereas pinned memory exchanges data with the GPU directly, so it is faster.
Allocating pinned memory: cudaHostAlloc(void ** pHost, size_t size, unsigned int flags);
Freeing pinned memory: cudaFreeHost(void * pHost);
When the work per CUDA stream is small, pinned memory is the standard companion to streams: with small workloads, the overall bottleneck is usually the data exchange between CPU and GPU rather than the computation itself. That is, the kernel runs far faster than the host-device transfers, so those transfers should go through pinned memory.
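A minimal sketch of that pairing (buffer names and the fill step are illustrative): a pinned host buffer feeding cudaMemcpyAsync on a stream. Note that if the host buffer were pageable, cudaMemcpyAsync would silently fall back to a synchronous copy.
void pinnedCopyExample(float* d_buf, size_t n, cudaStream_t stream) {
    float* h_buf = nullptr;
    cudaHostAlloc((void**)&h_buf, n * sizeof(float), cudaHostAllocDefault);
    // ... fill h_buf with data to upload ...
    cudaMemcpyAsync(d_buf, h_buf, n * sizeof(float),
                    cudaMemcpyHostToDevice, stream); // can overlap with kernels
    cudaStreamSynchronize(stream); // wait before reusing or freeing h_buf
    cudaFreeHost(h_buf);
}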
Notes on using pinned memory:
1. Pinned memory on the host is limited in size; do not allocate it without restraint.
2. If too much memory is pinned, transfers between pageable memory and the GPU slow down, because those transfers also use pinned buffers as a staging area. For memory-hungry tasks, allocate pinned memory carefully.
Lesson 5.2 A Streams and Pinned Memory Example
Goal: matrix transpose.
gputimer.h:
#ifndef __GPU_TIMER_H__
#define __GPU_TIMER_H__

#include "cuda_runtime.h"

struct GpuTimer
{
    cudaEvent_t start;
    cudaEvent_t stop;

    GpuTimer()
    {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }

    ~GpuTimer()
    {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    void Start()
    {
        cudaEventRecord(start, 0);
    }

    void Stop()
    {
        cudaEventRecord(stop, 0);
    }

    float Elapsed()
    {
        float elapsed;
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsed, start, stop);
        return elapsed;
    }
};

#endif /* __GPU_TIMER_H__ */
transpose.cu:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include "gputimer.h"
const int N= 512; // matrix size is NxN
const int K= 32; // tile size is KxK
// Utility functions: compare, print, and fill matrices
#define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
template<typename T>
void check(T err, const char* const func, const char* const file, const int line)
{
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error at: %s : %d\n", file,line);
fprintf(stderr, "%s %s\n", cudaGetErrorString(err), func);;
exit(1);
}
}
int compare_matrices(float *gpu, float *ref)
{
    int result = 0;
    for (int j = 0; j < N; j++)
        for (int i = 0; i < N; i++)
            if (ref[i + j*N] != gpu[i + j*N])
            {
                // printf("reference(%d,%d) = %f but test(%d,%d) = %f\n",
                //        i, j, ref[i+j*N], i, j, gpu[i+j*N]);
                result = 1;
            }
    return result;
}
void print_matrix(float *mat)
{
    for (int j = 0; j < N; j++)
    {
        for (int i = 0; i < N; i++) { printf("%4.4g ", mat[i + j*N]); }
        printf("\n");
    }
}

// fill a matrix with sequential numbers in the range 0..N*N-1
void fill_matrix(float *mat)
{
    for (int j = 0; j < N * N; j++)
        mat[j] = (float) j;
}
void transpose_CPU(float in[], float out[])
{
    for (int j = 0; j < N; j++)
        for (int i = 0; i < N; i++)
            out[j + i*N] = in[i + j*N]; // out(j,i) = in(i,j)
}

// to be launched on a single thread
__global__ void transpose_serial(float in[], float out[])
{
    for (int j = 0; j < N; j++)
        for (int i = 0; i < N; i++)
            out[j + i*N] = in[i + j*N]; // out(j,i) = in(i,j)
}
// to be launched with one thread per row of output matrix
__global__ void transpose_parallel_per_row(float in[], float out[])
{
    int i = threadIdx.x;
    for (int j = 0; j < N; j++)
        out[j + i*N] = in[i + j*N]; // out(j,i) = in(i,j)
}
// to be launched with one thread per element, in KxK threadblocks
// thread (x,y) in grid writes element (i,j) of output matrix
__global__ void transpose_parallel_per_element(float in[], float out[])
{
    int i = blockIdx.x * K + threadIdx.x;
    int j = blockIdx.y * K + threadIdx.y;
    out[j + i*N] = in[i + j*N]; // out(j,i) = in(i,j)
}
// to be launched with one thread per element, in (tilesize)x(tilesize) threadblocks
// thread blocks read & write tiles, in coalesced fashion
// adjacent threads read adjacent input elements, write adjacent output elmts
__global__ void transpose_parallel_per_element_tiled(float in[], float out[])
{
    // (i,j) locations of the tile corners for input & output matrices:
    int in_corner_i = blockIdx.x * K, in_corner_j = blockIdx.y * K;
    int out_corner_i = blockIdx.y * K, out_corner_j = blockIdx.x * K;
    int x = threadIdx.x, y = threadIdx.y;

    __shared__ float tile[K][K];

    // coalesced read from global mem, TRANSPOSED write into shared mem:
    tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N];
    __syncthreads();
    // read from shared mem, coalesced write to global mem:
    out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y];
}
// to be launched with one thread per element, in (tilesize)x(tilesize) threadblocks
// thread blocks read & write tiles, in coalesced fashion
// adjacent threads read adjacent input elements, write adjacent output elmts
__global__ void transpose_parallel_per_element_tiled16(float in[], float out[])
{
    // (i,j) locations of the tile corners for input & output matrices:
    int in_corner_i = blockIdx.x * 16, in_corner_j = blockIdx.y * 16;
    int out_corner_i = blockIdx.y * 16, out_corner_j = blockIdx.x * 16;
    int x = threadIdx.x, y = threadIdx.y;

    __shared__ float tile[16][16];

    // coalesced read from global mem, TRANSPOSED write into shared mem:
    tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N];
    __syncthreads();
    // read from shared mem, coalesced write to global mem:
    out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y];
}
// to be launched with one thread per element, in KxK threadblocks
// thread blocks read & write tiles, in coalesced fashion
// shared memory array padded to avoid bank conflicts
__global__ void transpose_parallel_per_element_tiled_padded(float in[], float out[])
{
    // (i,j) locations of the tile corners for input & output matrices:
    int in_corner_i = blockIdx.x * K, in_corner_j = blockIdx.y * K;
    int out_corner_i = blockIdx.y * K, out_corner_j = blockIdx.x * K;
    int x = threadIdx.x, y = threadIdx.y;

    __shared__ float tile[K][K+1];

    // coalesced read from global mem, TRANSPOSED write into shared mem:
    tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N];
    __syncthreads();
    // read from shared mem, coalesced write to global mem:
    out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y];
}
// to be launched with one thread per element, in KxK threadblocks
// thread blocks read & write tiles, in coalesced fashion
// shared memory array padded to avoid bank conflicts
__global__ void transpose_parallel_per_element_tiled_padded16(float in[], float out[])
{
    // (i,j) locations of the tile corners for input & output matrices:
    int in_corner_i = blockIdx.x * 16, in_corner_j = blockIdx.y * 16;
    int out_corner_i = blockIdx.y * 16, out_corner_j = blockIdx.x * 16;
    int x = threadIdx.x, y = threadIdx.y;

    __shared__ float tile[16][16+1];

    // coalesced read from global mem, TRANSPOSED write into shared mem:
    tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N];
    __syncthreads();
    // read from shared mem, coalesced write to global mem:
    out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y];
}
// to be launched with one thread per row element; each call transposes
// the single row given by stream_id
__global__ void transpose_parallel_per_stream(float in[], float out[], int stream_id)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = stream_id;
    out[j + i * N] = in[i + j * N]; // out(j,i) = in(i,j)
}

void transpose_with_stream(float* d_in, float* d_out, float* out, float* gold, int numbytes, int stream_num)
{
    GpuTimer timer;
    cudaStream_t streams[N];
    for (int i = 0; i < N; ++i)
        cudaStreamCreate(streams + i);

    timer.Start();
    // transpose the matrix one row per stream
    for (int i = 0; i < stream_num; ++i)
        transpose_parallel_per_stream<<<N/256, 256, 0, streams[i]>>>(d_in, d_out, i);
    cudaDeviceSynchronize();
    timer.Stop();

    cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
    printf("transpose_with_streams: %g ms.\nVerifying transpose...%s\n",
           timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");

    for (int i = 0; i < N; ++i)
        cudaStreamDestroy(streams[i]);
}
void run_streams(int stream_num, bool use_hostmalloc)
{
    // when use_hostmalloc == 1, use pinned (page-locked) memory; when 0, ordinary pageable memory
    int numbytes = N * N * sizeof(float);
    float *in = nullptr;
    float **out = new float*[stream_num];
    float *gold = (float *)malloc(numbytes);
    // pin the input buffer too, so the H2D cudaMemcpyAsync can actually run asynchronously
    if (use_hostmalloc) cudaHostAlloc((void**)&in, numbytes, cudaHostAllocDefault);
    else in = (float *)malloc(numbytes);
    fill_matrix(in);
    transpose_CPU(in, gold);

    float **d_in = new float*[stream_num], **d_out = new float*[stream_num];
    for (int i = 0; i < stream_num; i++)
    {
        if (use_hostmalloc) cudaHostAlloc((void**)&out[i], numbytes, cudaHostAllocDefault);
        else out[i] = (float *)malloc(numbytes);
        cudaMalloc((void **)&(d_in[i]), numbytes);
        cudaMalloc((void **)&(d_out[i]), numbytes);
    }

    dim3 blocks16x16(N / 16, N / 16); // blocks per grid
    dim3 threads16x16(16, 16);        // threads per block
    GpuTimer timer;
    cudaStream_t streams[N];
    for (int i = 0; i < N; ++i)
        cudaStreamCreate(streams + i);

    // async copies and kernels spread across the streams
    timer.Start();
    for (int i = 0; i < stream_num; ++i) {
        cudaMemcpyAsync(d_in[i], in, numbytes, cudaMemcpyHostToDevice, streams[i]);
        transpose_parallel_per_element_tiled16<<<blocks16x16, threads16x16, 0, streams[i]>>>(d_in[i], d_out[i]);
        cudaMemcpyAsync(out[i], d_out[i], numbytes, cudaMemcpyDeviceToHost, streams[i]);
    }
    cudaDeviceSynchronize();
    timer.Stop();
    for (int i = 0; i < stream_num; i++) {
        //printf(" %s ", compare_matrices(out[i], gold) ? "Failed" : "Success");
    }
    printf("transpose_with_streams: %g ms.\nVerifying transpose...\n",
           timer.Elapsed());

    // the same work with synchronous copies on the default stream
    timer.Start();
    for (int i = 0; i < stream_num; ++i) {
        cudaMemcpy(d_in[i], in, numbytes, cudaMemcpyHostToDevice);
        transpose_parallel_per_element_tiled16<<<blocks16x16, threads16x16, 0>>>(d_in[i], d_out[i]);
        cudaMemcpy(out[i], d_out[i], numbytes, cudaMemcpyDeviceToHost);
    }
    cudaDeviceSynchronize();
    timer.Stop();
    for (int i = 0; i < stream_num; i++) {
        //printf(" %s ", compare_matrices(out[i], gold) ? "Failed" : "Success");
    }
    printf("transpose_with_nostreams: %g ms.\nVerifying transpose...\n",
           timer.Elapsed());

    for (int i = 0; i < N; ++i)
        cudaStreamDestroy(streams[i]);
    for (int i = 0; i < stream_num; i++) {
        if (use_hostmalloc) cudaFreeHost(out[i]);
        else free(out[i]);
        cudaFree(d_out[i]);
        cudaFree(d_in[i]);
    }
    if (use_hostmalloc) cudaFreeHost(in);
    else free(in);
    free(gold);
    delete[] out;
    delete[] d_in;
    delete[] d_out;
}
int main(int argc, char **argv)
{
    int numbytes = N * N * sizeof(float);

    float *in = (float *) malloc(numbytes);
    float *out = (float *) malloc(numbytes);
    float *gold = (float *) malloc(numbytes);

    fill_matrix(in);
    transpose_CPU(in, gold);

    float *d_in, *d_out;
    cudaMalloc(&d_in, numbytes);
    cudaMalloc(&d_out, numbytes);
    cudaMemcpy(d_in, in, numbytes, cudaMemcpyHostToDevice);

    GpuTimer timer;
    /*
     * Now time each kernel and verify that it produces the correct result.
     *
     * To be really careful about benchmarking purposes, we should run every kernel once
     * to "warm" the system and avoid any compilation or code-caching effects, then run
     * every kernel 10 or 100 times and average the timings to smooth out any variance.
     * But this makes for messy code and our goal is teaching, not detailed benchmarking.
     */
    timer.Start();
    transpose_serial<<<1,1>>>(d_in, d_out);
    timer.Stop();
    cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
    printf("transpose_serial: %g ms.\nVerifying transpose...%s\n",
           timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");

    timer.Start();
    transpose_parallel_per_row<<<1,N>>>(d_in, d_out);
    timer.Stop();
    cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
    printf("transpose_parallel_per_row: %g ms.\nVerifying transpose...%s\n",
           timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");

    dim3 blocks(N/K, N/K); // blocks per grid
    dim3 threads(K, K);    // threads per block

    timer.Start();
    transpose_parallel_per_element<<<blocks,threads>>>(d_in, d_out);
    timer.Stop();
    cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
    printf("transpose_parallel_per_element: %g ms.\nVerifying transpose...%s\n",
           timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");

    timer.Start();
    transpose_parallel_per_element_tiled<<<blocks,threads>>>(d_in, d_out);
    timer.Stop();
    cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
    printf("transpose_parallel_per_element_tiled %dx%d: %g ms.\nVerifying ...%s\n",
           K, K, timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");

    dim3 blocks16x16(N/16, N/16); // blocks per grid
    dim3 threads16x16(16, 16);    // threads per block

    timer.Start();
    transpose_parallel_per_element_tiled16<<<blocks16x16,threads16x16>>>(d_in, d_out);
    timer.Stop();
    cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
    printf("transpose_parallel_per_element_tiled 16x16: %g ms.\nVerifying ...%s\n",
           timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");

    timer.Start();
    transpose_parallel_per_element_tiled_padded16<<<blocks16x16,threads16x16>>>(d_in, d_out);
    timer.Stop();
    cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
    printf("transpose_parallel_per_element_tiled_padded 16x16: %g ms.\nVerifying...%s\n",
           timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");
    // transpose N times using CUDA streams, one row per stream
    transpose_with_stream(d_in, d_out, out, gold, numbytes, N);

    // process with pinned memory vs. ordinary pageable memory
    printf("\nUsing pinned memory:\n");
    run_streams(100, 1);
    printf("\nUsing pageable memory:\n");
    run_streams(100, 0);

    cudaFree(d_in);
    cudaFree(d_out);
    free(in);
    free(out);
    free(gold);
    return 0;
}
Results: