5 Functions
5 Functions
Rupesh Nasre.
IIT Madras
January 2022
CUDA Function Declarations
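Declaration                       Executed on the:   Only callable from the:
__device__ float DeviceFunc()     device             device
__global__ void  KernelFunc()     device             host
__host__   float HostFunc()       host               host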
●
__global__ defines a kernel. It must return void.
●
A program may have several functions of each kind.
●
The same function of any kind may be called multiple times.
●
Host == CPU, Device == GPU.
2 2
Function Types (1/2)
#include <stdio.h>
#include <cuda.h>
// Prints a fixed message. The __host__ __device__ qualifiers compile this
// function for both sides, so CPU code and GPU code can each call it.
__host__ __device__ void dhfun()
{
    printf("I can run on both CPU and GPU.\n");
}
// Device-only helper. The caller whose id is 0 additionally invokes dhfun().
// If id lies inside the vector, stores id into vector[id] and returns 1;
// otherwise leaves the vector untouched and returns 0.
__device__ unsigned dfun(unsigned *vector, unsigned vectorsize, unsigned id) {
    if (id == 0) {
        dhfun();
    }

    // Out-of-range ids write nothing.
    if (id >= vectorsize) {
        return 0;
    }

    vector[id] = id;
    return 1;
}
// Kernel entry point: each thread derives its global 1-D index from its block
// and thread coordinates and passes it to dfun(), which does the bounds check.
// The value returned by dfun() is not used.
__global__ void dkernel(unsigned *vector, unsigned vectorsize) {
    const unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
    (void)dfun(vector, vectorsize, tid);
}
// Ordinary CPU function: prints a message, then calls the
// __host__ __device__ function dhfun() from host code.
__host__ void hostfun()
{
    printf("I am simply like another function running on CPU. Calling dhfun\n");
    dhfun();
}
3
Function Types (2/2)
#define BLOCKSIZE 1024
int main(int nn, char *str[]) {
unsigned N = atoi(str[1]);
unsigned *vector, *hvector;
cudaMalloc(&vector, N * sizeof(unsigned));
hvector = (unsigned *)malloc(N * sizeof(unsigned));
10
Classwork
Write a CUDA code to increment all elements in an array. Call this code from host as well as device.

__host__ __device__ void fun(int *arr) {
    for (unsigned ii = 0; ii < N; ++ii)
        ++arr[ii];
}
__global__ void dfun(int *arr) {
    fun(arr);
}
int main() {
    int arr[N], *darr;
    cudaMalloc(&darr, N * sizeof(int));
    for (unsigned ii = 0; ii < N; ++ii)
        arr[ii] = ii;
    cudaMemcpy(darr, arr, N * sizeof(int), cudaMemcpyHostToDevice);
    fun(arr);
    dfun<<<1, 1>>>(darr);
    cudaDeviceSynchronize();
    return 0;
}

Host-centric, sequential on GPU
Classwork: Can you avoid the for loop in fun?
11
Classwork
Write a CUDA code to increment all elements in an array. Call this code from host as well as device.

__host__ __device__ void fun(int *arr) {
    ++*arr;   // increment the pointed-to element (a bare ++arr would only advance the local pointer)
}
__global__ void dfun(int *arr) {
    fun(arr + threadIdx.x);
}
int main() {
    int arr[N], *darr;
    cudaMalloc(&darr, N * sizeof(int));
    for (unsigned ii = 0; ii < N; ++ii)
        arr[ii] = ii;
    cudaMemcpy(darr, arr, N * sizeof(int), cudaMemcpyHostToDevice);
    for (unsigned ii = 0; ii < N; ++ii)
        fun(arr + ii);
    dfun<<<1, N>>>(darr);
    cudaDeviceSynchronize();
    return 0;
}

Device-centric, sequential on CPU
Classwork: Can you avoid the for loop in fun?
Classwork: What if I don't like the for loop in main, but still want GPU-parallel code?
12
Classwork: Pranav's idea
Write a CUDA code to increment all elements in an array. Call this code from host as well as device.

__host__ __device__ void fun(int *arr, int nn) {
    for (unsigned ii = 0; ii < nn; ++ii)
        ++arr[ii];
}
__global__ void dfun(int *arr) {
    fun(arr + threadIdx.x, 1);
    // need to change for more blocks.
}
int main() {
    int arr[N], *darr;
    cudaMalloc(&darr, N * sizeof(int));
    for (unsigned ii = 0; ii < N; ++ii)
        arr[ii] = ii;
    cudaMemcpy(darr, arr, N * sizeof(int), cudaMemcpyHostToDevice);
    fun(arr, N);
    dfun<<<1, N>>>(darr);
    cudaDeviceSynchronize();
    return 0;
}

Classwork: Can you avoid the for loop in fun?
Classwork: What if I don't like the for loop in main, but still want GPU-parallel code?
13
Thrust
●
Thrust is a parallel algorithms library
(similar in spirit to STL on CPU).
●
Supports vectors and associated transforms.
●
Programmer is oblivious to where code executes
– on CPU or GPU.
●
Makes use of C++ features such as functors, and
__host__ __device__ functions.
14
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>
// Demonstrates basic host_vector / device_vector usage: element access,
// size queries, resizing, and copying a host vector into a device vector.
int main(void)
{
    // Host-side vector with room for 4 ints.
    thrust::host_vector<int> H(4);

    // Set each element individually.
    H[0] = 14;
    H[1] = 20;
    H[2] = 38;
    H[3] = 46;

    // Report how many elements H holds.
    std::cout << "H has size " << H.size() << std::endl;

    // Dump every element of H.
    for (int idx = 0; idx < H.size(); ++idx) {
        std::cout << "H[" << idx << "] = " << H[idx] << std::endl;
    }

    // Shrink H to two elements and report the new size.
    H.resize(2);
    std::cout << "H now has size " << H.size() << std::endl;

    // Initialising a device_vector from a host_vector copies the elements.
    thrust::device_vector<int> D = H;

    // Device elements can be assigned from host code.
    D[0] = 99;
    D[1] = 88;

    // H and D are destroyed automatically when main returns.
    return 0;
}
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#include <iostream>
// Demonstrates fill, sequence and copy on host and device vectors, then
// prints the resulting device vector element by element.
int main(void)
{
    // Ten device ints, every one initialised to 1.
    thrust::device_vector<int> D(10, 1);

    // Overwrite the first seven elements with 9.
    thrust::fill(D.begin(), D.begin() + 7, 9);

    // Host vector built from the first five elements of D.
    thrust::host_vector<int> H(D.begin(), D.begin() + 5);

    // Renumber H as 0, 1, 2, 3, ...
    thrust::sequence(H.begin(), H.end());

    // Write all of H over the front of D.
    thrust::copy(H.begin(), H.end(), D.begin());

    // Print D.
    for (int idx = 0; idx < D.size(); ++idx) {
        std::cout << "D[" << idx << "] = " << D[idx] << std::endl;
    }

    return 0;
}
Thrust Details
thrust::host_vector<int> hnums(1024);
thrust::device_vector<int> dnums;
// initialization.
thrust::device_vector<int> dnum2(hnums.begin(), hnums.end());
hnums = dnum2; // array resizing happens automatically.
17
Thrust Functions
●
find(begin, end, value);
●
find_if(begin, end, predicate);
●
copy, copy_if.
●
count, count_if.
●
equal.
●
min_element, max_element.
●
merge, sort, reduce.
●
transform.
18
●
...
Thrust Algorithms
●
Dual implementations: host and device
●
Iterators as arguments must be on the same
device
– except copy, which can copy across devices
– Otherwise, compiler issues error
19
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/replace.h>
#include <thrust/functional.h>
#include <iostream>
// Demonstrates thrust::transform with built-in functors (negate, modulus),
// followed by replace and a copy to standard output.
int main(void)
{
    // Three device vectors of 10 elements each.
    thrust::device_vector<int> X(10);
    thrust::device_vector<int> Y(10);
    thrust::device_vector<int> Z(10);

    // X = 0, 1, 2, 3, ...
    thrust::sequence(X.begin(), X.end());

    // Y = -X (unary transform).
    thrust::negate<int> negateOp;
    thrust::transform(X.begin(), X.end(), Y.begin(), negateOp);

    // Z = 2, 2, 2, ...
    thrust::fill(Z.begin(), Z.end(), 2);

    // Y = X mod Z, i.e. X mod 2 (binary transform); overwrites the negated values.
    thrust::modulus<int> modOp;
    thrust::transform(X.begin(), X.end(), Z.begin(), Y.begin(), modOp);

    // Every 1 in Y becomes 10.
    thrust::replace(Y.begin(), Y.end(), 1, 10);

    // Print Y, one element per line.
    std::ostream_iterator<int> out(std::cout, "\n");
    thrust::copy(Y.begin(), Y.end(), out);

    return 0;
}
Thrust User-Defined Functors
// calculate result[] = (a * x[]) + y[]
// Binary functor: holds the scalar a and, for each pair (x, y), returns a * x + y.
struct saxpy {
    const float _a;  // scale factor applied to x

    // Takes the scale as float so fractional values are preserved; an int
    // parameter would silently truncate e.g. 2.5 to 2 before it is stored.
    // Integer arguments still convert implicitly, so existing callers compile.
    saxpy(float a) : _a(a) { }

    // Callable from host and device code. Returns _a * x + y.
    __host__ __device__
    float operator()(const float& x, const float& y) const {
        return _a * x + y;
    }
};
21
Classwork
●
Create two 32-element vectors:
– X on host, Y on device
●
Fill X with 10, fill Y with sequence 0..31
●
Compute X = X – Y
●
Compute Z = X * Y
– // element-wise multiplication
22
Thrust Reductions
●
Recall reductions in log(n) barriers
●
No need to worry about blocks, synchronization.
int x, y;
thrust::host_vector<int> hvec;
thrust::device_vector<int> dvec;
// (thrust::reduce is a sum operation by default)
x = thrust::reduce(hvec.begin(), hvec.end());   // on CPU
y = thrust::reduce(dvec.begin(), dvec.end());   // on GPU
y = thrust::reduce(dvec.begin(), dvec.end(), (int)0, thrust::plus<int>());

Classwork: Implement count using reduction.
For instance, I want to find the number of occurrences of an element in a vector.
23
// Binary functor used to count occurrences of a target value with a reduction:
// x is treated as the running count and y as the next element.
// NOTE(review): because the left operand is a count and the right operand an
// element, this presumably relies on the reduction combining values strictly
// left to right — confirm before using it with a parallel (device) reduce.
struct mycount {
    int _a;  // value whose occurrences are counted

    mycount(int a) : _a(a) {}

    // Returns x + 1 when y equals the target value, otherwise x unchanged.
    __host__ __device__
    int operator()(const int x, const int y) const {
        if (y == _a) {
            return x + 1;
        }
        return x;
    }
};
int main() {
thrust::host_vector<int> vec(10, 0);
vec[1] = 5;
vec[4] = 5;
vec[9] = 5;
25
Classwork
●
What is the output of the following code?
int data[] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
int sizedata = sizeof(data) / sizeof(*data);
thrust::plus<int> binop;
thrust::exclusive_scan(data, data + sizedata, data, 5, binop);
for (unsigned ii = 0; ii < sizedata; ++ii) {
    std::cout << data[ii] << " ";
}
std::cout << std::endl;
26
Output: 5 0 0 2 -1 1 5 5 4 6
Classwork
●
What is the output of the following code?
int data[] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
int sizedata = sizeof(data) / sizeof(*data);
thrust::plus<int> binop;
thrust::inclusive_scan(data, data + sizedata, data, 5, binop);
for (unsigned ii = 0; ii < sizedata; ++ii) {
    std::cout << data[ii] << " ";
}
std::cout << std::endl;
27
Compile-time error: Why?
Classwork
●
What is the output of the following code?
int data[] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
int sizedata = sizeof(data) / sizeof(*data);
thrust::plus<int> binop;
thrust::inclusive_scan(data, data + sizedata, data, 5, binop);
for (unsigned ii = 0; ii < sizedata; ++ii) {
    std::cout << data[ii] << " ";
}
std::cout << std::endl;
28
Output: -5 -5 -3 -6 -4 0 0 -1 1 9
(This output corresponds to the call without the initial value 5, i.e. thrust::inclusive_scan(data, data + sizedata, data, binop).)
Classwork
●
What is the output of the following code?
int data[]  = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
int data2[] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
int sizedata = sizeof(data) / sizeof(*data);
thrust::plus<int> binop;
thrust::inclusive_scan(data, data+sizedata, data, 5, binop);
thrust::exclusive_scan(data, data+sizedata, data2, 0, binop);
for (unsigned ii = 0; ii < sizedata; ++ii)
    std::cout << data2[ii] << " ";
std::cout << std::endl;
29
Output: 0 -5 -10 -13 -19 -23 -23 -23 -24 -23
(This output corresponds to the inclusive scan being performed without the initial value 5, so that data becomes -5 -5 -3 -6 -4 0 0 -1 1 9 before the exclusive scan.)
Classwork: Find output
// Runs an in-place exclusive scan over a small array using max as the
// combining operator and 1 as the initial value, then prints the result.
int main()
{
    int data[] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
    int count = sizeof(data) / sizeof(data[0]);

    // Element i of the output is max(1, data[0], ..., data[i-1]).
    thrust::maximum<int> binop;
    thrust::exclusive_scan(data, data + count, data, 1, binop);

    // Print the scanned values separated by spaces.
    for (int *cursor = data; cursor != data + count; ++cursor) {
        std::cout << *cursor << " ";
    }
    std::cout << std::endl;

    return 0;
}
Output: 1 1 1 2 2 2 4 4 4 4
Set Operations
#include <thrust/set_operations.h>
…
Must be sorted
// Both input ranges must be sorted in ascending order.
// A1 has seven initializers, so it is declared with seven elements (declaring
// it as A1[6] does not compile) and the full range [A1, A1+7) is passed.
int A1[7] = {0, 1, 3, 4, 5, 6, 9};
int A2[5] = {1, 3, 5, 7, 9};
int result[N];
// Writes to result the elements of A1 that do not appear in A2.
thrust::set_difference(A1, A1+7, A2, A2+5, result);
result is {0, 4, 6}.
31
Set Operations
#include <thrust/set_operations.h>
…
int A1[] = {9, 6, 5, 4, 3, 1, 0};
int A2[5] = {9, 7, 5, 3, 1};
int result[N];
thrust::set_difference(A1, A1+7, A2, A2+5, result,
thrust::greater<int>());
result is {6, 4, 0}.
32
Sorting
#include <thrust/sort.h>
...
const int N = 6;
int A[N] = {1, 4, 2, 8, 5, 7};
thrust::sort(A, A + N);
// A is now {1, 2, 4, 5, 7, 8}
34