#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
// Fill a first x second row-major matrix with pseudo-random integers in [1, 20].
//   A      - output buffer with room for at least first*second ints
//   first  - number of rows
//   second - number of columns
void init(int *A, int first,int second)
{
    // Seed the generator only once per process. Re-seeding with time(NULL) on
    // every call makes two calls issued within the same second replay the
    // exact same sequence, so consecutively initialised matrices would start
    // with identical contents.
    static int seeded = 0;
    if (!seeded)
    {
        srand((unsigned int)time(NULL));
        seeded = 1;
    }
    for (int i = 0; i < first; i++)
        for (int j = 0; j < second; j++)
            A[i*second + j] = rand() % 20 + 1;
}
// CPU reference matrix multiplication: C = A * B.
//   A - first  x second input matrix, row-major
//   B - second x third  input matrix, row-major
//   C - first  x third  output matrix, row-major (every element is overwritten)
void cpuMatMul(int *A, int *B, int *C, int first, int second, int third)
{
    for (int row = 0; row < first; ++row)
    {
        // Row `row` of A and of C, so the inner loops index relative to it.
        int *aRow = A + row * second;
        int *cRow = C + row * third;
        for (int col = 0; col < third; ++col)
        {
            // Dot product of row `row` of A with column `col` of B.
            int acc = 0;
            for (int k = 0; k < second; ++k)
                acc += aRow[k] * B[k * third + col];
            cRow[col] = acc;
        }
    }
}
// GPU matrix-multiplication kernel: d_C = d_A * d_B.
//   d_A - first  x second input matrix, row-major
//   d_B - second x third  input matrix, row-major
//   d_C - first  x third  output matrix, row-major
// Only the x dimension of the block and grid indices is used. Each thread
// produces one output element per pass of a grid-stride loop, so any 1D
// launch configuration covers all first*third outputs.
__global__ void matMulKernel(int *d_A,int *d_B,int *d_C,int first,int second,int third)
{
    const int total = first*third;
    for (int idx = threadIdx.x + blockIdx.x*blockDim.x; idx < total; idx += blockDim.x*gridDim.x)
    {
        // Split the flat output index into its (row, col) coordinates.
        const int row = idx / third;
        const int col = idx % third;
        int acc = 0;
        for (int k = 0; k < second; ++k)
            acc += d_A[row*second + k] * d_B[k*third + col];
        // row*third + col is idx again, so store through the flat index.
        d_C[idx] = acc;
    }
}
// Heterogeneous (CPU + GPU) matrix multiplication: computes C = A * B on device 0.
//   A - first  x second host matrix, row-major (input)
//   B - second x third  host matrix, row-major (input)
//   C - first  x third  host matrix, row-major (output)
// The routine copies A and B to the device, launches matMulKernel on a 1D
// grid, waits for it to finish and copies the product back into C.
// Returns cudaSuccess on success, otherwise the first CUDA error encountered
// (a diagnostic is also written to stderr). Device buffers are released on
// every path that allocated them.
cudaError cpuAndGpuMatMul(int*A, int *B, int *C, int first, int second, int third)
{
    cudaError_t cudaStatus;
    cudaDeviceProp prop;
    int blocks, threads;
    // NULL-initialised so the shared cleanup path can call cudaFree on all
    // three pointers even when an earlier allocation failed.
    int *d_A = NULL, *d_B = NULL, *d_C = NULL;
    // Host-side element and byte counts are kept in size_t so they are not
    // computed in int arithmetic.
    size_t bytesA, bytesB, bytesC, numElems, wantedBlocks;

    if (first < 0 || second < 0 || third < 0)
    {
        fprintf(stderr, "cpuAndGpuMatMul: matrix dimensions must be non-negative!\n");
        return cudaErrorInvalidValue;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        return cudaStatus;
    }
    // Get the properties of the device (needed for the launch configuration).
    cudaStatus = cudaGetDeviceProperties(&prop, 0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaGetDeviceProperties failed!");
        return cudaStatus;
    }

    numElems = (size_t)first * (size_t)third;
    // An empty product has no output element to compute; a zero-block launch
    // would be rejected as an invalid configuration, so finish early.
    if (numElems == 0)
        return cudaSuccess;

    bytesA = (size_t)first * (size_t)second * sizeof(int);
    bytesB = (size_t)second * (size_t)third * sizeof(int);
    bytesC = numElems * sizeof(int);

    // Allocate GPU buffers for the three matrices (two inputs, one output).
    cudaStatus = cudaMalloc((void**)&d_A, bytesA);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&d_B, bytesB);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&d_C, bytesC);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the input matrices from host memory to the GPU buffers.
    cudaStatus = cudaMemcpy(d_A, A, bytesA, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(d_B, B, bytesB, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch configuration: half the maximum block size, and enough blocks
    // for one thread per output element, capped at the grid's x-dimension
    // limit (the kernel's stride loop picks up any remainder).
    threads = prop.maxThreadsPerBlock / 2;
    wantedBlocks = (numElems + (size_t)threads - 1) / (size_t)threads;
    if (wantedBlocks > (size_t)prop.maxGridSize[0])
        blocks = prop.maxGridSize[0];
    else
        blocks = (int)wantedBlocks;

    // Launch the kernel on the GPU.
    matMulKernel<<<blocks, threads>>>(d_A, d_B, d_C, first, second, third);
    // Check for any errors launching the kernel.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "matMulKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during its execution.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching matMulKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the result matrix from the GPU buffer to host memory.
    cudaStatus = cudaMemcpy(C, d_C, bytesC, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Shared cleanup for success and failure; pointers never allocated are
    // still NULL here.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return cudaStatus;
}
// Query every CUDA device reported by the runtime and print its properties
// to stdout.
// Returns cudaSuccess, or the error from cudaGetDeviceCount /
// cudaGetDeviceProperties if one of them fails (also reported on stderr).
cudaError_t my_cudaGetDeviceProperties()
{
    int count;
    cudaError_t cudaStatus;
    cudaDeviceProp prop;

    cudaStatus = cudaGetDeviceCount(&count);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaGetDeviceCount failed!");
        return cudaStatus;
    }
    printf("The number of devices: %d\n", count);

    for (int i = 0; i < count; i++)
    {
        cudaStatus = cudaGetDeviceProperties(&prop, i);
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaGetDeviceProperties of device %d failed!", i);
            return cudaStatus;
        }

        printf(" ---General Information of Device %d---\n", i);
        printf("Name: %s\n", prop.name); // device name
        printf("Compute Capability: %d.%d\n", prop.major, prop.minor); // major.minor compute capability
        // Whether the device can overlap a memory copy with kernel execution.
        printf("Device copy overlap: %s\n", prop.deviceOverlap ? "Enabled" : "Disabled");
        // Whether kernels on this device are subject to a run-time limit.
        printf("Kernel execution timeout: %s\n", prop.kernelExecTimeoutEnabled ? "Enabled" : "Disabled");

        // The memory-size fields are size_t; cast to unsigned long long and
        // print with %llu so the output is correct where long is 32 bits.
        printf(" ---Memory Information of Device %d---\n", i);
        printf("Total global mem: %llu\n", (unsigned long long)prop.totalGlobalMem); // total global memory, in bytes
        printf("Total constant mem: %llu\n", (unsigned long long)prop.totalConstMem); // total constant memory, in bytes
        printf("Max mem pitch: %llu\n", (unsigned long long)prop.memPitch); // maximum pitch allowed for memory copies, in bytes
        printf("Texture Alignment: %llu\n", (unsigned long long)prop.textureAlignment); // texture alignment requirement

        printf(" ---MP Information of Device %d---\n", i);
        printf("Multiprocessor count: %d\n", prop.multiProcessorCount); // number of multiprocessors on the device
        printf("Shared mem per block: %llu\n", (unsigned long long)prop.sharedMemPerBlock); // shared memory available per block, in bytes
        printf("Registers per block: %d\n", prop.regsPerBlock); // registers available per block
        printf("Threads in warp: %d\n", prop.warpSize); // threads per warp
        printf("Max threads per block: %d\n", prop.maxThreadsPerBlock); // maximum threads per block
        printf("Max thread dimensions: (%d,%d,%d)\n", prop.maxThreadsDim[0],
               prop.maxThreadsDim[1], prop.maxThreadsDim[2]); // maximum block size in each dimension
        printf("Max grid dimensions: (%d,%d,%d)\n", prop.maxGridSize[0],
               prop.maxGridSize[1], prop.maxGridSize[2]); // maximum grid size in each dimension
    }
    return cudaStatus;
}
// Program entry point: prints the device properties, then benchmarks the CPU
// multiply against the CPU+GPU multiply for six matrix sizes
// (first = 1024, 2048, ..., 6144 rows; second and third stay at 1024).
// Both timings use clock(). The GPU timing covers the whole cpuAndGpuMatMul
// call, i.e. device setup, allocation and transfers as well as the kernel.
// Returns 0 on success and 1 if a host allocation or the GPU path fails.
int main()
{
    printf("-----------------------设备参数----------------------\n");
    my_cudaGetDeviceProperties(); // print the device properties
    clock_t cs,cf,cgs,cgf; // clock samples around the CPU and CPU+GPU runs
    double duration,sp; // elapsed seconds and speed-up ratio
    int first=1024, second=1024, third=1024; // initial matrix dimensions
    // Grow the matrix and compare the CPU run with the CPU+GPU run each time.
    for (int i = 1; i < 7; first += 1024,i++ )
    {
        // Sizes are widened to size_t before multiplying.
        int *A = (int*)malloc((size_t)first*second*sizeof(int));
        int *B = (int*)malloc((size_t)second*third*sizeof(int));
        int *C = (int*)malloc((size_t)first*third*sizeof(int));
        // Stop instead of writing through a NULL pointer in init().
        if (A == NULL || B == NULL || C == NULL)
        {
            fprintf(stderr, "host malloc failed!\n");
            free(A);
            free(B);
            free(C);
            return 1;
        }
        init(A, first, second);
        init(B, second, third);
        printf("----------%d*%d矩阵与%d*%d矩阵相乘----------\n",first,second,second,third);
        // CPU run
        cs = clock();
        cpuMatMul(A, B, C, first, second, third);
        cf = clock();
        duration = (double)(cf - cs) / CLOCKS_PER_SEC;
        printf("CPU处理时间:%fs\n", duration);
        // CPU+GPU heterogeneous run (overwrites the CPU result held in C)
        cgs = clock();
        cudaError_t cudaStatus = cpuAndGpuMatMul(A, B, C, first, second,third);
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "gpuMatMul failed!");
            // Release the host buffers on the failure path as well.
            free(A);
            free(B);
            free(C);
            return 1;
        }
        cgf = clock();
        duration = (double)(cgf - cgs) / CLOCKS_PER_SEC;
        printf("CPU+GPU处理时间:%fs\n", duration);
        sp = (double)(cf - cs) / (cgf - cgs); // speed-up ratio: CPU time / CPU+GPU time
        printf("加速比:%f\n", sp);
        free(A);
        free(B);
        free(C);
    }
    getchar(); // wait for a key press before exiting
    return 0;
}
// Comments: 0  (page footer left over from the web listing; not part of the program)