Lesson 5.1 CUDA Streams
CUDA streams: stream parallelism on the GPU is similar to task parallelism on the CPU. Each stream can be viewed as an independent task, and the operations within a stream execute sequentially.
A CUDA stream represents a queue of GPU operations that execute in the order they were added to the queue. Streams enable task-level parallelism: for example, the GPU can execute a kernel while data is being exchanged between host and device (provided the GPU supports overlap, i.e. the device property deviceOverlap is true).
Before using CUDA streams, first confirm that the device supports them; this can be verified with the code below.
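A minimal sketch of that check, querying device 0's deviceOverlap property (the printed messages are illustrative):
#include "cuda_runtime.h"
#include <stdio.h>

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0); // query device 0
    if (!prop.deviceOverlap) {
        printf("Device does not support overlap; streams will not speed things up.\n");
        return 1;
    }
    printf("Device supports overlap; CUDA streams can be used.\n");
    return 0;
}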
If CUDA streams are available, we can move on to defining, creating, and destroying streams:
cudaStream_t stream;                 // define a stream
cudaStreamCreate(cudaStream_t * s)   // create a stream
cudaStreamDestroy(cudaStream_t s)    // destroy a stream
When streams depend on one another, stream synchronization comes into play:
Explicit synchronization:
cudaStreamSynchronize(cudaStream_t s)                                  // synchronize one stream: wait until all commands in that stream have completed
cudaDeviceSynchronize()                                                // synchronize all streams: wait until all streams on the device have completed (takes no arguments)
cudaStreamWaitEvent(cudaStream_t s, cudaEvent_t e, unsigned int flags) // via an event: commands in the stream wait until the given event has fired
cudaStreamQuery(cudaStream_t s)                                        // query whether a stream's work has completed (the argument is the stream handle)
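As a brief sketch of cross-stream ordering with cudaStreamWaitEvent (the producer/consumer kernels are illustrative, not part of this lesson's example):
__global__ void producer(int* x) { *x = 42; }
__global__ void consumer(int* x, int* y) { *y = *x + 1; }

void orderTwoStreams(int* d_x, int* d_y) {
    cudaStream_t s1, s2;
    cudaStreamCreate(&s1);
    cudaStreamCreate(&s2);
    cudaEvent_t ev;
    cudaEventCreate(&ev);
    producer<<<1, 1, 0, s1>>>(d_x);       // runs in stream s1
    cudaEventRecord(ev, s1);              // mark the completion point in s1
    cudaStreamWaitEvent(s2, ev, 0);       // s2 blocks here until ev has fired
    consumer<<<1, 1, 0, s2>>>(d_x, d_y);  // guaranteed to see producer's write
    cudaStreamDestroy(s1);                // destruction waits for pending work
    cudaStreamDestroy(s2);
    cudaEventDestroy(ev);
}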
Now for how streams are used:
When completing a task on the GPU, there are two places where a stream can be used:
1. A kernel defined with __global__ can be submitted to a stream:
Kernel<<<gridDim, blockDim, shared_memory_size, stream>>>(parameters)
2. Data transfers between GPU and CPU can be handed to a stream:
cudaMemcpyAsync(dst, src, copy_size, copy_direction, stream)
Code example: adding two 100-element arrays with streams. The program times the same 100 one-thread kernel launches twice, once spread across 100 streams and once on the default stream, so the two printed timings can be compared.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
using namespace std;
__global__ void addKernel(int* c, int* a, int* b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}
// Stream parallelism
void myTestCalcStream(void)
{
    int pDataA[100] = { 0 };
    int pDataB[100] = { 0 };
    int pDataC[100] = { 0 };
    for (int i = 0; i < 100; i++) {
        pDataA[i] = i;
        pDataB[i] = 10 + i;
        pDataC[i] = 0;
    }
    // Allocate device memory for A, B, and C
    int* pDevDataA = nullptr, * pDevDataB = nullptr, * pDevDataC = nullptr;
    cudaMalloc(&pDevDataA, sizeof(int) * 100);
    cudaMalloc(&pDevDataB, sizeof(int) * 100);
    cudaMalloc(&pDevDataC, sizeof(int) * 100);
    // Copy the inputs to the device
    cudaMemcpy(pDevDataA, pDataA, sizeof(int) * 100, cudaMemcpyHostToDevice);
    cudaMemcpy(pDevDataB, pDataB, sizeof(int) * 100, cudaMemcpyHostToDevice);
    cudaStream_t streams[100];
    for (int i = 0; i < 100; ++i)
        cudaStreamCreate(streams + i);
    {
        // Launch the kernels across the streams and time them
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
        for (int i = 0; i < 100; ++i)
            addKernel<<<1, 1, 0, streams[i]>>>(pDevDataC + i, pDevDataA + i, pDevDataB + i);
        /*for (int i = 50; i < 100; ++i)
            cudaStreamSynchronize(streams[i]);*/
        cudaDeviceSynchronize();
        /*cudaThreadSynchronize();*/ // deprecated alias of cudaDeviceSynchronize
        // Print the elapsed kernel time
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(start);
        cudaEventSynchronize(stop);
        float elapsedTime;
        cudaEventElapsedTime(&elapsedTime, start, stop);
        printf("Kernel time(ms) : %f\n", elapsedTime);
    }
    {
        // Launch the same kernels on the default stream and time them
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
        for (int i = 0; i < 100; ++i)
            addKernel<<<1, 1, 0>>>(pDevDataC + i, pDevDataA + i, pDevDataB + i);
        // Print the elapsed kernel time
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(start);
        cudaEventSynchronize(stop);
        float elapsedTime;
        cudaEventElapsedTime(&elapsedTime, start, stop);
        printf("Kernel time(ms) : %f\n", elapsedTime);
    }
    // Copy back the first few results and print them
    cudaMemcpy(pDataC, pDevDataC, sizeof(int) * 5, cudaMemcpyDeviceToHost);
    printf("Stream Calc Result is: %d, %d, %d, %d, %d\n", pDataC[0], pDataC[1], pDataC[2], pDataC[3], pDataC[4]);
    for (int i = 0; i < 100; ++i)
        cudaStreamDestroy(streams[i]);
    cudaFree(pDevDataA);
    cudaFree(pDevDataB);
    cudaFree(pDevDataC);
}
int main() {
    myTestCalcStream();
    return 0;
}
Finally, a look at page-locked (pinned) memory, which is usually used together with streams:
Pinned (page-locked) memory is host memory that can exchange data with the GPU directly over PCIe. It differs from ordinary pageable memory in two ways: 1. apart from how it is allocated and freed, it is used much like ordinary memory; 2. physically, a transfer from pageable memory to the GPU is first staged through a pinned buffer, whereas pinned memory exchanges data with the GPU directly, so it is faster.
Allocating pinned memory: cudaHostAlloc(void ** pHost, size_t size, unsigned int flags);
Freeing pinned memory: cudaFreeHost(void * pHost);
When the work per CUDA stream is small, pinned memory is the standard companion to streams: with small workloads, the overall bottleneck is usually the data exchange between CPU and GPU rather than the computation itself. That is, the kernel runs far faster than the host-device transfers, so those transfers should go through pinned memory.
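A minimal sketch of that pairing (buffer names and the fill step are illustrative): a pinned host buffer feeding cudaMemcpyAsync on a stream. Note that if the host buffer were pageable, cudaMemcpyAsync would silently fall back to a synchronous copy.
void pinnedCopyExample(float* d_buf, size_t n, cudaStream_t stream) {
    float* h_buf = nullptr;
    cudaHostAlloc((void**)&h_buf, n * sizeof(float), cudaHostAllocDefault);
    // ... fill h_buf with data to upload ...
    cudaMemcpyAsync(d_buf, h_buf, n * sizeof(float),
                    cudaMemcpyHostToDevice, stream); // can overlap with kernels
    cudaStreamSynchronize(stream); // wait before reusing or freeing h_buf
    cudaFreeHost(h_buf);
}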
Notes on using pinned memory:
1. Pinned memory on the host is limited in size; do not allocate it without restraint.
2. If too much memory is pinned, transfers between pageable memory and the GPU slow down, because those transfers also use pinned buffers as a staging area. For memory-hungry tasks, allocate pinned memory carefully.
Lesson 5.2 A Streams and Pinned Memory Example
Goal: matrix transpose.
gputimer.h:
#ifndef __GPU_TIMER_H__
#define __GPU_TIMER_H__

#include "cuda_runtime.h"

struct GpuTimer
{
    cudaEvent_t start;
    cudaEvent_t stop;

    GpuTimer()
    {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }

    ~GpuTimer()
    {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    void Start()
    {
        cudaEventRecord(start, 0);
    }

    void Stop()
    {
        cudaEventRecord(stop, 0);
    }

    float Elapsed()
    {
        float elapsed;
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsed, start, stop);
        return elapsed;
    }
};

#endif /* __GPU_TIMER_H__ */
transpose.cu:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include "gputimer.h"
const int N= 512; // matrix size is NxN
const int K= 32; // tile size is KxK
// Utility functions: compare, print, and fill matrices
#define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
template<typename T>
void check(T err, const char* const func, const char* const file, const int line)
{
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error at: %s : %d\n", file,line);
fprintf(stderr, "%s %s\n", cudaGetErrorString(err), func);;
exit(1);
}
}
int compare_matrices(float *gpu, float *ref)
{
    int result = 0;
    for (int j = 0; j < N; j++)
        for (int i = 0; i < N; i++)
            if (ref[i + j*N] != gpu[i + j*N])
            {
                // printf("reference(%d,%d) = %f but test(%d,%d) = %f\n",
                //        i, j, ref[i+j*N], i, j, gpu[i+j*N]);
                result = 1;
            }
    return result;
}
void print_matrix(float *mat)
{
    for (int j = 0; j < N; j++)
    {
        for (int i = 0; i < N; i++) { printf("%4.4g ", mat[i + j*N]); }
        printf("\n");
    }
}

// fill a matrix with sequential numbers in the range 0..N*N-1
void fill_matrix(float *mat)
{
    for (int j = 0; j < N * N; j++)
        mat[j] = (float) j;
}
void transpose_CPU(float in[], float out[])
{
    for (int j = 0; j < N; j++)
        for (int i = 0; i < N; i++)
            out[j + i*N] = in[i + j*N]; // out(j,i) = in(i,j)
}

// to be launched on a single thread
__global__ void transpose_serial(float in[], float out[])
{
    for (int j = 0; j < N; j++)
        for (int i = 0; i < N; i++)
            out[j + i*N] = in[i + j*N]; // out(j,i) = in(i,j)
}
// to be launched with one thread per row of output matrix
__global__ void transpose_parallel_per_row(float in[], float out[])
{
    int i = threadIdx.x;
    for (int j = 0; j < N; j++)
        out[j + i*N] = in[i + j*N]; // out(j,i) = in(i,j)
}
// to be launched with one thread per element, in KxK threadblocks
// thread (x,y) in grid writes element (i,j) of output matrix
__global__ void transpose_parallel_per_element(float in[], float out[])
{
    int i = blockIdx.x * K + threadIdx.x;
    int j = blockIdx.y * K + threadIdx.y;
    out[j + i*N] = in[i + j*N]; // out(j,i) = in(i,j)
}
// to be launched with one thread per element, in (tilesize)x(tilesize) threadblocks
// thread blocks read & write tiles, in coalesced fashion
// adjacent threads read adjacent input elements, write adjacent output elmts
__global__ void transpose_parallel_per_element_tiled(float in[], float out[])
{
    // (i,j) locations of the tile corners for input & output matrices:
    int in_corner_i = blockIdx.x * K, in_corner_j = blockIdx.y * K;
    int out_corner_i = blockIdx.y * K, out_corner_j = blockIdx.x * K;
    int x = threadIdx.x, y = threadIdx.y;

    __shared__ float tile[K][K];

    // coalesced read from global mem, TRANSPOSED write into shared mem:
    tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N];
    __syncthreads();
    // read from shared mem, coalesced write to global mem:
    out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y];
}
// to be launched with one thread per element, in (tilesize)x(tilesize) threadblocks
// thread blocks read & write tiles, in coalesced fashion
// adjacent threads read adjacent input elements, write adjacent output elmts
__global__ void transpose_parallel_per_element_tiled16(float in[], float out[])
{
    // (i,j) locations of the tile corners for input & output matrices:
    int in_corner_i = blockIdx.x * 16, in_corner_j = blockIdx.y * 16;
    int out_corner_i = blockIdx.y * 16, out_corner_j = blockIdx.x * 16;
    int x = threadIdx.x, y = threadIdx.y;

    __shared__ float tile[16][16];

    // coalesced read from global mem, TRANSPOSED write into shared mem:
    tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N];
    __syncthreads();
    // read from shared mem, coalesced write to global mem:
    out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y];
}
// to be launched with one thread per element, in KxK threadblocks
// thread blocks read & write tiles, in coalesced fashion
// shared memory array padded to avoid bank conflicts
__global__ void transpose_parallel_per_element_tiled_padded(float in[], float out[])
{
    // (i,j) locations of the tile corners for input & output matrices:
    int in_corner_i = blockIdx.x * K, in_corner_j = blockIdx.y * K;
    int out_corner_i = blockIdx.y * K, out_corner_j = blockIdx.x * K;
    int x = threadIdx.x, y = threadIdx.y;

    __shared__ float tile[K][K+1];

    // coalesced read from global mem, TRANSPOSED write into shared mem:
    tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N];
    __syncthreads();
    // read from shared mem, coalesced write to global mem:
    out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y];
}
// to be launched with one thread per element, in KxK threadblocks
// thread blocks read & write tiles, in coalesced fashion
// shared memory array padded to avoid bank conflicts
__global__ void transpose_parallel_per_element_tiled_padded16(float in[], float out[])
{
    // (i,j) locations of the tile corners for input & output matrices:
    int in_corner_i = blockIdx.x * 16, in_corner_j = blockIdx.y * 16;
    int out_corner_i = blockIdx.y * 16, out_corner_j = blockIdx.x * 16;
    int x = threadIdx.x, y = threadIdx.y;

    __shared__ float tile[16][16+1];

    // coalesced read from global mem, TRANSPOSED write into shared mem:
    tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N];
    __syncthreads();
    // read from shared mem, coalesced write to global mem:
    out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y];
}
// to be launched with one thread per row element; each call transposes
// the single row given by stream_id
__global__ void transpose_parallel_per_stream(float in[], float out[], int stream_id)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = stream_id;
    out[j + i * N] = in[i + j * N]; // out(j,i) = in(i,j)
}

void transpose_with_stream(float* d_in, float* d_out, float* out, float* gold, int numbytes, int stream_num)
{
    GpuTimer timer;
    cudaStream_t streams[N];
    for (int i = 0; i < N; ++i)
        cudaStreamCreate(streams + i);

    timer.Start();
    // transpose the matrix one row per stream
    for (int i = 0; i < stream_num; ++i)
        transpose_parallel_per_stream<<<N/256, 256, 0, streams[i]>>>(d_in, d_out, i);
    cudaDeviceSynchronize();
    timer.Stop();

    cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
    printf("transpose_with_streams: %g ms.\nVerifying transpose...%s\n",
           timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");

    for (int i = 0; i < N; ++i)
        cudaStreamDestroy(streams[i]);
}
void run_streams(int stream_num, bool use_hostmalloc)
{
    // when use_hostmalloc == 1, use pinned (page-locked) memory; when 0, ordinary pageable memory
    int numbytes = N * N * sizeof(float);
    float *in = nullptr;
    float **out = new float*[stream_num];
    float *gold = (float *)malloc(numbytes);
    // pin the input buffer too, so the H2D cudaMemcpyAsync can actually run asynchronously
    if (use_hostmalloc) cudaHostAlloc((void**)&in, numbytes, cudaHostAllocDefault);
    else in = (float *)malloc(numbytes);
    fill_matrix(in);
    transpose_CPU(in, gold);

    float **d_in = new float*[stream_num], **d_out = new float*[stream_num];
    for (int i = 0; i < stream_num; i++)
    {
        if (use_hostmalloc) cudaHostAlloc((void**)&out[i], numbytes, cudaHostAllocDefault);
        else out[i] = (float *)malloc(numbytes);
        cudaMalloc((void **)&(d_in[i]), numbytes);
        cudaMalloc((void **)&(d_out[i]), numbytes);
    }

    dim3 blocks16x16(N / 16, N / 16); // blocks per grid
    dim3 threads16x16(16, 16);        // threads per block
    GpuTimer timer;
    cudaStream_t streams[N];
    for (int i = 0; i < N; ++i)
        cudaStreamCreate(streams + i);

    // async copies and kernels spread across the streams
    timer.Start();
    for (int i = 0; i < stream_num; ++i) {
        cudaMemcpyAsync(d_in[i], in, numbytes, cudaMemcpyHostToDevice, streams[i]);
        transpose_parallel_per_element_tiled16<<<blocks16x16, threads16x16, 0, streams[i]>>>(d_in[i], d_out[i]);
        cudaMemcpyAsync(out[i], d_out[i], numbytes, cudaMemcpyDeviceToHost, streams[i]);
    }
    cudaDeviceSynchronize();
    timer.Stop();
    for (int i = 0; i < stream_num; i++) {
        //printf(" %s ", compare_matrices(out[i], gold) ? "Failed" : "Success");
    }
    printf("transpose_with_streams: %g ms.\nVerifying transpose...\n",
           timer.Elapsed());

    // the same work with synchronous copies on the default stream
    timer.Start();
    for (int i = 0; i < stream_num; ++i) {
        cudaMemcpy(d_in[i], in, numbytes, cudaMemcpyHostToDevice);
        transpose_parallel_per_element_tiled16<<<blocks16x16, threads16x16, 0>>>(d_in[i], d_out[i]);
        cudaMemcpy(out[i], d_out[i], numbytes, cudaMemcpyDeviceToHost);
    }
    cudaDeviceSynchronize();
    timer.Stop();
    for (int i = 0; i < stream_num; i++) {
        //printf(" %s ", compare_matrices(out[i], gold) ? "Failed" : "Success");
    }
    printf("transpose_with_nostreams: %g ms.\nVerifying transpose...\n",
           timer.Elapsed());

    for (int i = 0; i < N; ++i)
        cudaStreamDestroy(streams[i]);
    for (int i = 0; i < stream_num; i++) {
        if (use_hostmalloc) cudaFreeHost(out[i]);
        else free(out[i]);
        cudaFree(d_out[i]);
        cudaFree(d_in[i]);
    }
    if (use_hostmalloc) cudaFreeHost(in);
    else free(in);
    free(gold);
    delete[] out;
    delete[] d_in;
    delete[] d_out;
}
int main(int argc, char **argv)
{
    int numbytes = N * N * sizeof(float);

    float *in = (float *) malloc(numbytes);
    float *out = (float *) malloc(numbytes);
    float *gold = (float *) malloc(numbytes);

    fill_matrix(in);
    transpose_CPU(in, gold);

    float *d_in, *d_out;
    cudaMalloc(&d_in, numbytes);
    cudaMalloc(&d_out, numbytes);
    cudaMemcpy(d_in, in, numbytes, cudaMemcpyHostToDevice);

    GpuTimer timer;
    /*
     * Now time each kernel and verify that it produces the correct result.
     *
     * To be really careful about benchmarking purposes, we should run every kernel once
     * to "warm" the system and avoid any compilation or code-caching effects, then run
     * every kernel 10 or 100 times and average the timings to smooth out any variance.
     * But this makes for messy code and our goal is teaching, not detailed benchmarking.
     */
    timer.Start();
    transpose_serial<<<1,1>>>(d_in, d_out);
    timer.Stop();
    cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
    printf("transpose_serial: %g ms.\nVerifying transpose...%s\n",
           timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");

    timer.Start();
    transpose_parallel_per_row<<<1,N>>>(d_in, d_out);
    timer.Stop();
    cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
    printf("transpose_parallel_per_row: %g ms.\nVerifying transpose...%s\n",
           timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");

    dim3 blocks(N/K, N/K); // blocks per grid
    dim3 threads(K, K);    // threads per block

    timer.Start();
    transpose_parallel_per_element<<<blocks,threads>>>(d_in, d_out);
    timer.Stop();
    cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
    printf("transpose_parallel_per_element: %g ms.\nVerifying transpose...%s\n",
           timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");

    timer.Start();
    transpose_parallel_per_element_tiled<<<blocks,threads>>>(d_in, d_out);
    timer.Stop();
    cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
    printf("transpose_parallel_per_element_tiled %dx%d: %g ms.\nVerifying ...%s\n",
           K, K, timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");

    dim3 blocks16x16(N/16, N/16); // blocks per grid
    dim3 threads16x16(16, 16);    // threads per block

    timer.Start();
    transpose_parallel_per_element_tiled16<<<blocks16x16,threads16x16>>>(d_in, d_out);
    timer.Stop();
    cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
    printf("transpose_parallel_per_element_tiled 16x16: %g ms.\nVerifying ...%s\n",
           timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");

    timer.Start();
    transpose_parallel_per_element_tiled_padded16<<<blocks16x16,threads16x16>>>(d_in, d_out);
    timer.Stop();
    cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
    printf("transpose_parallel_per_element_tiled_padded 16x16: %g ms.\nVerifying...%s\n",
           timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");
    // transpose N times using CUDA streams, one row per stream
    transpose_with_stream(d_in, d_out, out, gold, numbytes, N);

    // process with pinned memory vs. ordinary pageable memory
    printf("\nUsing pinned memory:\n");
    run_streams(100, 1);
    printf("\nUsing pageable memory:\n");
    run_streams(100, 0);

    cudaFree(d_in);
    cudaFree(d_out);
    free(in);
    free(out);
    free(gold);
    return 0;
}
Results: