0% found this document useful (0 votes)

4 views34 pages

5 Functions

The document provides an overview of CUDA function declarations and types, detailing the differences between host and device functions. It includes examples of function definitions, memory allocation, and the execution of kernels in CUDA programming. Additionally, it discusses the use of global variables and the implications of memory access on CPU and GPU.

Uploaded by

Omniverse9 BTFF

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

4 views34 pages

5 Functions

Uploaded by

Omniverse9 BTFF

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 34

Functions

Rupesh Nasre.

IIT Madras
January 2022
CUDA Function Declarations
                                  Executed on the:    Callable only from the:

__device__ float DeviceFunc()     device              device

__global__ void KernelFunc()      device              host + device

__host__ float HostFunc()         host                host

●
__global__ defines a kernel. It must return void.
●
A program may have several functions of each kind.
●
The same function of any kind may be called multiple times.
●
Host == CPU, Device == GPU.

2 2
Function Types (1/2)
#include <stdio.h>
#include <cuda.h>
// Prints a fixed message. The __host__ __device__ qualifiers compile this one
// body for both sides, which is why it is called from CPU code (hostfun, main)
// and from GPU code (dfun) in this file.
__host__ __device__ void dhfun() {
printf("I can run on both CPU and GPU.\n");
}
// Device-only helper, called once per thread by dkernel.
//   vector     : array the calling thread writes into
//   vectorsize : number of valid slots in `vector`
//   id         : index handled by the calling thread
// The thread with id 0 additionally calls dhfun().
// Returns 1 when vector[id] was written, 0 when id is out of range.
__device__ unsigned dfun(unsigned *vector, unsigned vectorsize, unsigned id) {
    if (id == 0) {
        dhfun();
    }

    const bool inRange = id < vectorsize;
    if (inRange) {
        vector[id] = id;
    }
    return inRange ? 1u : 0u;
}
// Kernel: each thread derives its global 1D index from its block and thread
// coordinates and hands it to dfun, which performs the bounds check and store.
// dfun's return value is not used here.
__global__ void dkernel(unsigned *vector, unsigned vectorsize) {
    const unsigned globalIdx = threadIdx.x + blockDim.x * blockIdx.x;
    (void)dfun(vector, vectorsize, globalIdx);
}
// Host-only function: announces itself on stdout, then calls the shared
// __host__ __device__ function dhfun() from the CPU side.
__host__ void hostfun() {
    fputs("I am simply like another function running on CPU. Calling dhfun\n", stdout);
    dhfun();
}
3
Function Types (2/2)
#define BLOCKSIZE 1024
// Stops the program with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver for the "Function Types" example.
//   str[1] : number of vector elements N
// Fills a device vector with vector[i] = i via dkernel, copies it back, prints
// it, then calls hostfun() and dhfun() directly from the CPU.
//
// Calls made by the code in this example (the slide draws them as arrows;
// its labels were interleaved with the code in the extracted text):
//   CPU: main -> hostfun, main -> dhfun, hostfun -> dhfun
//   GPU: dkernel -> dfun, dfun -> dhfun   (dkernel itself is launched by main)
int main(int nn, char *str[]) {
    if (nn < 2) {
        fprintf(stderr, "usage: %s <vector-size>\n", nn > 0 ? str[0] : "prog");
        return 1;
    }

    unsigned N = atoi(str[1]);
    unsigned *vector, *hvector;
    checkCuda(cudaMalloc(&vector, N * sizeof(unsigned)), "cudaMalloc");
    hvector = (unsigned *)malloc(N * sizeof(unsigned));
    if (hvector == NULL && N > 0) {
        fprintf(stderr, "malloc failed\n");
        cudaFree(vector);
        return 1;
    }

    // Integer ceiling division: the float-based ceil() rounds N itself once N
    // exceeds 2^24 and can then come up one block short.
    unsigned nblocks = N / BLOCKSIZE + (N % BLOCKSIZE != 0);

    printf("nblocks = %u\n", nblocks);

    // A grid of zero blocks is an invalid launch configuration, so skip it.
    if (nblocks > 0) {
        dkernel<<<nblocks, BLOCKSIZE>>>(vector, N);
        checkCuda(cudaGetLastError(), "dkernel launch");
    }

    // Blocking copy; it also surfaces errors raised while the kernel ran.
    checkCuda(cudaMemcpy(hvector, vector, N * sizeof(unsigned), cudaMemcpyDeviceToHost),
              "cudaMemcpy");
    for (unsigned ii = 0; ii < N; ++ii) {
        printf("%4u ", hvector[ii]);
    }
    printf("\n");

    hostfun();
    dhfun();

    free(hvector);
    checkCuda(cudaFree(vector), "cudaFree");
    return 0;
}

What are the other arrows possible in this diagram?

4
How about dhfun to dfun?
with HostAlloc'ed Memory
__host__ __device__ functions are friends with HostAlloc’ed memory.

__host__ __device__ void fun(int *counter) {
    ++*counter;
}
__global__ void printk(int *counter) {
    fun(counter);
    printf("printk (after fun): %d\n", *counter);
}
int main() {
    int *counter;
    cudaHostAlloc(&counter, sizeof(int), 0);
    *counter = 0;
    printf("main: %d\n", *counter);
    printk<<<1, 1>>>(counter);
    cudaDeviceSynchronize();
    fun(counter);
    printf("main (after fun): %d\n", *counter);
    return 0;
}

What is the output of this code?

5
with a Device-only Function
__host__ __device__ void fun(int *counter) {
    ++*counter;
    __syncthreads();
}
__global__ void printk(int *counter) {
    fun(counter);
    printf("printk (after fun): %d\n", *counter);
}
int main() {
    int *counter;
    cudaHostAlloc(&counter, sizeof(int), 0);
    *counter = 0;
    printf("main: %d\n", *counter);
    printk<<<1, 1>>>(counter);
    cudaDeviceSynchronize();
    fun(counter);
    printf("main (after fun): %d\n", *counter);
    return 0;
}

__syncthreads() is not available on CPU.

6
with a CPU-only Memory
__host__ __device__ void fun(int *counter) {
    ++*counter;
}
__global__ void printk(int *counter) {
    fun(counter);
    printf("printk (after fun): %d\n", *counter);
}
int main() {
    int *counter;
    // cudaHostAlloc(&counter, sizeof(int), 0);
    cudaMalloc(&counter, sizeof(int));
    *counter = 0;
    printf("main: %d\n", *counter);
    printk<<<1, 1>>>(counter);
    cudaDeviceSynchronize();
    fun(counter);
    printf("main (after fun): %d\n", *counter);
    return 0;
}

counter cannot be accessed on CPU.

7
Global Variables
int counter;
__host__ __device__ void fun() {
    ++counter;
}
__global__ void printk() {
    fun();
    printf("printk (after fun): %d\n", counter);
}
int main() {
    counter = 0;
    printf("main: %d\n", counter);
    printk<<<1, 1>>>();
    cudaDeviceSynchronize();
    fun();
    printf("main (after fun): %d\n", counter);
    return 0;
}

counter cannot be accessed on GPU.

8
Global Variables
__host__ __device__ int counter;
__host__ __device__ void fun() {
    ++counter;
}
__global__ void printk() {
    fun();
    printf("printk (after fun): %d\n", counter);
}
int main() {
    counter = 0;
    printf("main: %d\n", counter);
    printk<<<1, 1>>>();
    cudaDeviceSynchronize();
    fun();
    printf("main (after fun): %d\n", counter);
    return 0;
}

Variables cannot be declared as __host__.

9
Global Variables
__device__ int counter;
__host__ __device__ void fun() {
    ++counter;
}
__global__ void printk() {
    fun();
    printf("printk (after fun): %d\n", counter);
}
int main() {
    printk<<<1, 1>>>();
    cudaDeviceSynchronize();
    return 0;
}

Warning during compilation, but works fine.

10
Classwork
Write a CUDA code to increment all elements in an array. Call this code from host as well as device.

Classwork: Can you avoid the for loop in fun?

__host__ __device__ void fun(int *arr) {
    for (unsigned ii = 0; ii < N; ++ii)
        ++arr[ii];
}
__global__ void dfun(int *arr) {
    fun(arr);
}
int main() {
    int arr[N], *darr;
    cudaMalloc(&darr, N * sizeof(int));
    for (unsigned ii = 0; ii < N; ++ii)
        arr[ii] = ii;
    cudaMemcpy(darr, arr, N * sizeof(int), cudaMemcpyHostToDevice);
    fun(arr);
    dfun<<<1, 1>>>(darr);
    cudaDeviceSynchronize();
    return 0;
}

Host-centric, sequential on GPU

11
Classwork
Write a CUDA code to increment all elements in an array. Call this code from host as well as device.

Classwork: Can you avoid the for loop in fun?

Classwork: What if I don’t like the for loop in main, but still want GPU-parallel code?

__host__ __device__ void fun(int *arr) {
    ++*arr;   // the extracted slide text reads "++arr;", which would only advance the local pointer
}
__global__ void dfun(int *arr) {
    fun(arr + threadIdx.x);
}
int main() {
    int arr[N], *darr;
    cudaMalloc(&darr, N * sizeof(int));
    for (unsigned ii = 0; ii < N; ++ii)
        arr[ii] = ii;
    cudaMemcpy(darr, arr, N * sizeof(int), cudaMemcpyHostToDevice);
    for (unsigned ii = 0; ii < N; ++ii)
        fun(arr + ii);
    dfun<<<1, N>>>(darr);
    cudaDeviceSynchronize();
    return 0;
}

Device-centric, sequential on CPU

12
Classwork: Pranav’s idea
Write a CUDA code to increment all elements in an array. Call this code from host as well as device.

Classwork: Can you avoid the for loop in fun?

Classwork: What if I don’t like the for loop in main, but still want GPU-parallel code?

__host__ __device__ void fun(int *arr, int nn) {
    for (unsigned ii = 0; ii < nn; ++ii)
        ++arr[ii];
}
__global__ void dfun(int *arr) {
    fun(arr + threadIdx.x, 1);
    // need to change for more blocks.
}
int main() {
    int arr[N], *darr;
    cudaMalloc(&darr, N * sizeof(int));
    for (unsigned ii = 0; ii < N; ++ii)
        arr[ii] = ii;
    cudaMemcpy(darr, arr, N * sizeof(int), cudaMemcpyHostToDevice);
    fun(arr, N);
    dfun<<<1, N>>>(darr);
    cudaDeviceSynchronize();
    return 0;
}

13
Thrust
●
Thrust is a parallel algorithms library
(similar in spirit to STL on CPU).
●
Supports vectors and associated transforms.
●
Programmer is oblivious to where code executes
– on CPU or GPU.
●
Makes use of C++ features such as functors, and
__host__ __device__ functions.

14
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>
// Thrust host_vector / device_vector walkthrough: build a host vector, print
// it, shrink it, copy it to the device and modify device elements from host code.
int main(void) {
    // H has storage for 4 integers
    thrust::host_vector<int> H(4);
    // initialize individual elements
    H[0] = 14; H[1] = 20; H[2] = 38; H[3] = 46;
    // H.size() returns the size of vector H
    std::cout << "H has size " << H.size() << std::endl;
    // print contents of H (unsigned index: H.size() is an unsigned type)
    for (std::size_t i = 0; i < H.size(); ++i) {
        std::cout << "H[" << i << "] = " << H[i] << std::endl;
    }
    // resize H
    H.resize(2);
    std::cout << "H now has size " << H.size() << std::endl;
    // Copy host_vector H to device_vector D
    thrust::device_vector<int> D = H;
    // elements of D can be modified
    D[0] = 99; D[1] = 88;
    // H and D are automatically deleted when the function returns
    return 0;
}
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#include <iostream>
// Thrust fill / sequence / copy walkthrough on a 10-element device vector.
// Prints the final contents of D, one "D[i] = v" line per element.
int main(void) {
    // initialize all ten integers of a device_vector to 1
    thrust::device_vector<int> D(10, 1);
    // set the first seven elements of a vector to 9
    thrust::fill(D.begin(), D.begin() + 7, 9);
    // initialize a host_vector with the first five elements of D
    thrust::host_vector<int> H(D.begin(), D.begin() + 5);
    // set the elements of H to 0, 1, 2, 3, ...
    thrust::sequence(H.begin(), H.end());
    // copy all of H back to the beginning of D
    thrust::copy(H.begin(), H.end(), D.begin());
    // print D: bring it to the host in one bulk copy instead of reading
    // D[i] element by element from host code inside the loop
    thrust::host_vector<int> snapshot = D;
    for (std::size_t i = 0; i < snapshot.size(); ++i) {
        std::cout << "D[" << i << "] = " << snapshot[i] << std::endl;
    }
    return 0;
}
Thrust Details
// Host vector of 1024 ints.
thrust::host_vector<int> hnums(1024);
// Device vector declared empty; it is filled by the assignment below.
thrust::device_vector<int> dnums;

dnums = hnums; // calls cudaMemcpy

// initialization.
thrust::device_vector<int> dnum2(hnums.begin(), hnums.end());
hnums = dnum2; // array resizing happens automatically.

// Reads one element of the device vector from host code.
std::cout << dnums[3] << std::endl;

// Two-input transform: combines dsrc and dsrc2 with addFunc and writes to ddst.
// NOTE(review): dsrc, dsrc2, ddst and addFunc are not declared in this snippet —
// presumably device vectors and a binary functor defined elsewhere; confirm.
thrust::transform(dsrc.begin(), dsrc.end(), dsrc2.begin(),

ddst.begin(), addFunc);

17
Thrust Functions
●
find(begin, end, value);
●
find_if(begin, end, predicate);
●
copy, copy_if.
●
count, count_if.
●
equal.
●
min_element, max_element.
●
merge, sort, reduce.
●
transform.
18
●
...
Thrust Algorithms
●
Dual implementations: host and device
●
Iterators as arguments must be on the same
device
– except copy, which can copy across devices
– Otherwise, compiler issues error

19
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/replace.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
#include <iostream>
#include <iterator>
// Thrust transform / fill / replace walkthrough on three 10-element device
// vectors. Prints the final contents of Y, one value per line.
int main(void) {
    // allocate three device_vectors with 10 elements
    thrust::device_vector<int> X(10);
    thrust::device_vector<int> Y(10);
    thrust::device_vector<int> Z(10);
    // initialize X to 0,1,2,3, ....
    thrust::sequence(X.begin(), X.end());
    // compute Y = -X
    // (demonstration only: Y is overwritten by the modulus transform below)
    thrust::transform(X.begin(), X.end(), Y.begin(), thrust::negate<int>());
    // fill Z with twos
    thrust::fill(Z.begin(), Z.end(), 2);
    // compute Y = X mod 2
    thrust::transform(X.begin(), X.end(), Z.begin(), Y.begin(), thrust::modulus<int>());
    // replace all the ones in Y with tens
    thrust::replace(Y.begin(), Y.end(), 1, 10);
    // print Y
    thrust::copy(Y.begin(), Y.end(), std::ostream_iterator<int>(std::cout, "\n"));
    return 0;
}
Thrust User-Defined Functors
// calculate result[] = (a * x[]) + y[]
// Binary functor for thrust::transform computing a * x + y per element.
//   _a : scale factor applied to x, fixed at construction.
struct saxpy {
    const float _a;

    // Takes the scale factor as float so a fractional value such as 2.5f is
    // not truncated on the way in; integer arguments still convert implicitly.
    saxpy(float a) : _a(a) { }

    // Returns _a * x + y. Callable from host and device code.
    __host__ __device__
    float operator()(const float& x, const float& y) const {
        return _a * x + y;
    }
};

thrust::device_vector<float> x, y, result; … // populate x, y.

thrust::transform(x.begin(), x.end(), y.begin(), result.begin(), saxpy(a));

21
Classwork
●
Create two 32-element vectors:
– X on host, Y on device
●
Fill X with 10, fill Y with sequence 0..31
●
Compute X = X – Y
●
Compute Z = X * Y
– // element-wise multiplication

22
Thrust Reductions
●
Recall reductions in log(n) barriers
●
No need to worry about blocks, synchronization.
int x, y;
thrust::host_vector<int> hvec;
thrust::device_vector<int> dvec;
// (thrust::reduce is a sum operation by default)
x = thrust::reduce(hvec.begin(), hvec.end());   // on CPU
y = thrust::reduce(dvec.begin(), dvec.end());   // on GPU

y = thrust::reduce(dvec.begin(), dvec.end(), (int)0, thrust::plus<int>());

Classwork: Implement count using reduction.
For instance, I want to find the number of occurrences of an element in a vector.

23
// Binary functor used with thrust::reduce to count occurrences of one value.
// It treats its first argument as the running count and its second as the
// element being inspected.
// NOTE(review): the two arguments play different roles, so the operator is
// neither commutative nor associative. Thrust's reduce documentation asks for
// a commutative operator, so the result presumably holds only when elements
// are folded in one at a time — confirm before using this on a device_vector.
struct mycount {
// Value whose occurrences are counted.
int _a;
mycount(int a):_a(a){}
__host__ __device__
// x: count so far, y: current element. Returns x + 1 when y equals _a, else x.
int operator()(const int x, const int y) const {
return (y == _a ? x + 1 : x);
}
};
// Counts how often the value 5 occurs in a 10-element host vector by reducing
// with the mycount functor, and prints the count.
int main() {
    // Ten zeros, with 5 stored at indices 1, 4 and 9.
    thrust::host_vector<int> vec(10, 0);
    vec[1] = 5;
    vec[4] = 5;
    vec[9] = 5;

    // Initial count 0; mycount(5) adds one for every element equal to 5.
    // NOTE(review): mycount is not a commutative operator; this presumably
    // relies on the host_vector being reduced element by element — confirm
    // before switching to a device_vector.
    int result = thrust::reduce(vec.begin(), vec.end(), (int)0, mycount(5));
    std::cout << result << std::endl;
    return 0;
}
Prefix Sum / Scan
#include <thrust/scan.h>
int data[6] = {1, 0, 2, 2, 1, 3};
// inclusive scan (in place)
thrust::inclusive_scan(data, data + 6, data);
// data is now {1, 1, 3, 5, 6, 9}

// exclusive scan of the same original input. A fresh array is used because
// `data` was overwritten by the in-place inclusive scan above.
int data2[6] = {1, 0, 2, 2, 1, 3};
thrust::exclusive_scan(data2, data2 + 6, data2);
// data2 is now {0, 1, 1, 3, 5, 6}

25
Classwork
●
What is the output of the following code?
int data[] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
int sizedata = sizeof(data) / sizeof(*data);
thrust::plus<int> binop;
thrust::exclusive_scan(data, data + sizedata, data, 5, binop);
for (unsigned ii = 0; ii < sizedata; ++ii) {
    std::cout << data[ii] << " ";
}
std::cout << std::endl;

26

Output: 5 0 0 2 -1 1 5 5 4 6
Classwork
●
What is the output of the following code?
int data[] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
int sizedata = sizeof(data) / sizeof(*data);
thrust::plus<int> binop;
thrust::inclusive_scan(data, data + sizedata, data, 5, binop);
for (unsigned ii = 0; ii < sizedata; ++ii) {
    std::cout << data[ii] << " ";
}
std::cout << std::endl;

27

Compile-time error: Why?
Classwork
●
What is the output of the following code?
int data[] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
int sizedata = sizeof(data) / sizeof(*data);
thrust::plus<int> binop;
thrust::inclusive_scan(data, data + sizedata, data, binop);
for (unsigned ii = 0; ii < sizedata; ++ii) {
    std::cout << data[ii] << " ";
}
std::cout << std::endl;

(The extracted slide text still shows the initial-value argument "5," in the inclusive_scan call; the output below corresponds to the call without it.)

28

Output: -5 -5 -3 -6 -4 0 0 -1 1 9
Classwork
●
What is the output of the following code?
int data[]  = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
int data2[] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
int sizedata = sizeof(data) / sizeof(*data);
thrust::plus<int> binop;
thrust::inclusive_scan(data, data + sizedata, data, binop);
thrust::exclusive_scan(data, data + sizedata, data2, 0, binop);
for (unsigned ii = 0; ii < sizedata; ++ii)
    std::cout << data2[ii] << " ";
std::cout << std::endl;

(The extracted slide text still shows the initial-value argument "5," in the inclusive_scan call; the output below corresponds to the call without it.)

29

Output: 0 -5 -10 -13 -19 -23 -23 -23 -24 -23
Classwork: Find output
// Classwork example: in-place exclusive scan with thrust::maximum and an
// initial value of 1, followed by printing the scanned array.
int main() {
    int data[] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
    int sizedata = sizeof(data) / sizeof(*data);
    thrust::maximum<int> binop;
    // Each output element is the maximum of 1 and all inputs before it.
    thrust::exclusive_scan(data, data + sizedata,
                           data, 1, binop);
    // Signed index to match sizedata and avoid a signed/unsigned comparison.
    for (int ii = 0; ii < sizedata; ++ii) {
        std::cout << data[ii] << " ";
    }
    std::cout << std::endl;
    return 0;
}
// Output: 1 1 1 2 2 2 4 4 4 4
Output: 1 1 1 2 2 2 4 4 4 4
Set Operations
#include <thrust/set_operations.h>
…
Must be sorted
// Both inputs are listed in ascending order. A1 holds seven values, so it is
// declared with size 7 and the full range A1 .. A1+7 is passed; the original
// size of 6 did not match the initializer list and left the trailing 9 out of
// the range.
int A1[7] = {0, 1, 3, 4, 5, 6, 9};
int A2[5] = {1, 3, 5, 7, 9};
int result[N];
// Writes the elements of A1 that do not appear in A2 into result.
thrust::set_difference(A1, A1 + 7, A2, A2 + 5, result);

result is {0, 4, 6}.
31
Set Operations
#include <thrust/set_operations.h>
…
// Both inputs are listed in descending order, matching the thrust::greater
// comparator passed to set_difference below.
int A1[] = {9, 6, 5, 4, 3, 1, 0};
int A2[5] = {9, 7, 5, 3, 1};
// NOTE(review): N is not defined in this snippet — presumably a constant at
// least as large as the number of results; confirm.
int result[N];
// Passes all seven elements of A1 and all five of A2; output goes to result.
thrust::set_difference(A1, A1+7, A2, A2+5, result,
thrust::greater<int>());
result is {6, 4, 0}.
32
Sorting
#include <thrust/sort.h>
...
// Number of elements in each array below.
const int N = 6;
int A[N] = {1, 4, 2, 8, 5, 7};
// Sorts the plain array A in place.
thrust::sort(A, A + N);
// A is now {1, 2, 4, 5, 7, 8}

int keys[N] = { 1, 4, 2, 8, 5, 7};

char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'};
// Sorts keys and applies the same reordering to values, so each value stays
// paired with its original key.
thrust::sort_by_key(keys, keys + N, values);
// keys is now { 1, 2, 4, 5, 7, 8}
// values is now {'a', 'c', 'b', 'e', 'f', 'd'}
33
Summary
✔
__host__, __device__, __global__
✔
Functors
✔
Thrust
– Aggregate functions
– Iterators
– Reduction
– Prefix sum

Complete 1988 Mazda 323 Workshop Manual
91% (22)
Complete 1988 Mazda 323 Workshop Manual
1,129 pages
3-computation
No ratings yet
3-computation
28 pages
Cuda C/C++ Basics: NVIDIA Corporation
No ratings yet
Cuda C/C++ Basics: NVIDIA Corporation
67 pages
2023-CSC14120-Lecture01-CUDAIntroduction
No ratings yet
2023-CSC14120-Lecture01-CUDAIntroduction
32 pages
21.L18 Intro To GPU and CUDA C
No ratings yet
21.L18 Intro To GPU and CUDA C
89 pages
CUDA PPT Anurita Unit3
No ratings yet
CUDA PPT Anurita Unit3
42 pages
Introduction To CUDA C 3
No ratings yet
Introduction To CUDA C 3
67 pages
cuda
No ratings yet
cuda
4 pages
CUDA Programming Invert
No ratings yet
CUDA Programming Invert
36 pages
Gpu History and Cuda Programming Basics
No ratings yet
Gpu History and Cuda Programming Basics
44 pages
Introduction To CUDA C
No ratings yet
Introduction To CUDA C
67 pages
CUDA Compute Unified Device Architecture
No ratings yet
CUDA Compute Unified Device Architecture
26 pages
2-Computation
No ratings yet
2-Computation
15 pages
NirajTamang Week8
No ratings yet
NirajTamang Week8
10 pages
Introduction To CUDA: CAP 4730 Spring 2012
No ratings yet
Introduction To CUDA: CAP 4730 Spring 2012
35 pages
Lecture2 Cuda Basic 2010
No ratings yet
Lecture2 Cuda Basic 2010
44 pages
CUDA Exercises
No ratings yet
CUDA Exercises
185 pages
5. Moving to Parallel With CUDA - Hello Program
No ratings yet
5. Moving to Parallel With CUDA - Hello Program
14 pages
CUDA_part-1
No ratings yet
CUDA_part-1
52 pages
217 Lec2
No ratings yet
217 Lec2
24 pages
Introduccion CUDA C
No ratings yet
Introduccion CUDA C
51 pages
Cuda Talk
100% (1)
Cuda Talk
82 pages
CUDA_part-1-LMS
No ratings yet
CUDA_part-1-LMS
51 pages
Cuda Firstprograms PDF
No ratings yet
Cuda Firstprograms PDF
6 pages
04 IntroductionGPUsCUDA
No ratings yet
04 IntroductionGPUsCUDA
25 pages
CUDA Putting It All Together
No ratings yet
CUDA Putting It All Together
39 pages
LP 1,,1
No ratings yet
LP 1,,1
5 pages
GPU Series III CUDA Compilation Host Side 1721302802
No ratings yet
GPU Series III CUDA Compilation Host Side 1721302802
8 pages
Lecture 11 Programming On Gpus Part 1 Zxu2acms60212 40212 S15lec 11 Gpupdf
No ratings yet
Lecture 11 Programming On Gpus Part 1 Zxu2acms60212 40212 S15lec 11 Gpupdf
121 pages
3-CUDA
No ratings yet
3-CUDA
5 pages
3 Some Commonly Used CUDA API: 3.1 Function Type Qualifiers
No ratings yet
3 Some Commonly Used CUDA API: 3.1 Function Type Qualifiers
7 pages
Intro To CUDA
No ratings yet
Intro To CUDA
76 pages
PDC assignment
No ratings yet
PDC assignment
9 pages
CUDA_1
No ratings yet
CUDA_1
45 pages
Csnb594csnb4423 Lab 5 01a Harveen Velan Sw0104101
No ratings yet
Csnb594csnb4423 Lab 5 01a Harveen Velan Sw0104101
19 pages
Hetero Lecture Slides 002 Lecture 1 Lecture-1-5-Cuda-API
No ratings yet
Hetero Lecture Slides 002 Lecture 1 Lecture-1-5-Cuda-API
11 pages
CUDAProgModel
No ratings yet
CUDAProgModel
24 pages
CuPrintf Readme
No ratings yet
CuPrintf Readme
6 pages
Lab 10,11
No ratings yet
Lab 10,11
4 pages
Lecture3 Fundamentals of CUDA(Part1)_2025
No ratings yet
Lecture3 Fundamentals of CUDA(Part1)_2025
52 pages
01 Cuda c Basics
No ratings yet
01 Cuda c Basics
32 pages
Cuda Review 1
No ratings yet
Cuda Review 1
13 pages
CUDA Introduction
No ratings yet
CUDA Introduction
39 pages
Gpu Cuda
No ratings yet
Gpu Cuda
204 pages
Lecture5 2
No ratings yet
Lecture5 2
46 pages
Ejercicio 2 Práctica 3: CUDA Desempeño en Función de La Homogeneidad para Acceder A Memoria y de La Regularidad Del Código
No ratings yet
Ejercicio 2 Práctica 3: CUDA Desempeño en Función de La Homogeneidad para Acceder A Memoria y de La Regularidad Del Código
8 pages
Overview of GPGPU's
No ratings yet
Overview of GPGPU's
81 pages
CUDA Memory Architecture: GPGPU Class Week 4
No ratings yet
CUDA Memory Architecture: GPGPU Class Week 4
28 pages
Lec6 Cuda Memory
No ratings yet
Lec6 Cuda Memory
18 pages
GPU_Programming_slides_2
No ratings yet
GPU_Programming_slides_2
37 pages
Basic-Cuda
No ratings yet
Basic-Cuda
49 pages
4. Cuda Add Mult
No ratings yet
4. Cuda Add Mult
3 pages
002 - Introduction To CUDA Programming - 1
No ratings yet
002 - Introduction To CUDA Programming - 1
54 pages
GPU Programming: CUDA
No ratings yet
GPU Programming: CUDA
29 pages
GPUMod 2
No ratings yet
GPUMod 2
64 pages
Recipe For Running Simple CUDA Code On A GPU Based Rocks Cluster
No ratings yet
Recipe For Running Simple CUDA Code On A GPU Based Rocks Cluster
17 pages
Addition_Cuda
No ratings yet
Addition_Cuda
2 pages
Aca Lab Manual Final
No ratings yet
Aca Lab Manual Final
28 pages
Computer Engineering Laboratory Solution Primer
From Everand
Computer Engineering Laboratory Solution Primer
Karan Bhandari
No ratings yet
C Language Programming Codes
From Everand
C Language Programming Codes
Durgesh
No ratings yet
C Programming
From Everand
C Programming
Netra
No ratings yet
Arts Promotion and Preservation
No ratings yet
Arts Promotion and Preservation
2 pages
Kellyanne Conway Transcript
No ratings yet
Kellyanne Conway Transcript
187 pages
Enhancing Skills in English: Ms. Jamaica Tinguha
No ratings yet
Enhancing Skills in English: Ms. Jamaica Tinguha
34 pages
Cambridge Igcse 9 1 Chemistry 0971 Grade Threshold Table June 2022
No ratings yet
Cambridge Igcse 9 1 Chemistry 0971 Grade Threshold Table June 2022
1 page
The Story of Ariadne Theseus Minotaur
No ratings yet
The Story of Ariadne Theseus Minotaur
3 pages
Chun Tian Man Man Yi Dian Dian Fa Ya
No ratings yet
Chun Tian Man Man Yi Dian Dian Fa Ya
3 pages
4.2 Pediatric Nutrition 2
No ratings yet
4.2 Pediatric Nutrition 2
1 page
5 Types of Question
No ratings yet
5 Types of Question
4 pages
Zone of Proximal Development
No ratings yet
Zone of Proximal Development
2 pages
G242e
No ratings yet
G242e
97 pages
321.carole Ann Ainio
No ratings yet
321.carole Ann Ainio
2 pages
Data and AI - Transforming The Future of Engineering
No ratings yet
Data and AI - Transforming The Future of Engineering
18 pages
Class 5th
No ratings yet
Class 5th
6 pages
Area Formulas
No ratings yet
Area Formulas
22 pages
3bse066174r201 - Cba
No ratings yet
3bse066174r201 - Cba
44 pages
Exordium 04 En
No ratings yet
Exordium 04 En
20 pages
Divorce Debate
No ratings yet
Divorce Debate
19 pages
SAP2000 Command Line
No ratings yet
SAP2000 Command Line
2 pages
Word Formation: Prefixes and Suffixes
No ratings yet
Word Formation: Prefixes and Suffixes
8 pages
2024
No ratings yet
2024
54 pages
Walk in Interview Application-Form HPS KP
No ratings yet
Walk in Interview Application-Form HPS KP
4 pages
Matthew Everingham
No ratings yet
Matthew Everingham
29 pages
Atomic Mass and Atomic Number Worksheet
No ratings yet
Atomic Mass and Atomic Number Worksheet
1 page
MiniCase6 - The Mystery Powder
No ratings yet
MiniCase6 - The Mystery Powder
6 pages
Expired Domain Terms
No ratings yet
Expired Domain Terms
5 pages
China's Claim of Sovereignty Over Spratly and Paracel Islands: A Historical and Legal Perspective
No ratings yet
China's Claim of Sovereignty Over Spratly and Paracel Islands: A Historical and Legal Perspective
23 pages
Computer Keyboard Shortcut Keys PDF
No ratings yet
Computer Keyboard Shortcut Keys PDF
7 pages
Keyboard Shortcuts For Windows and Mac
No ratings yet
Keyboard Shortcuts For Windows and Mac
8 pages
List of Adjectives Teens 1 Advanced
No ratings yet
List of Adjectives Teens 1 Advanced
1 page

5 Functions

Uploaded by

5 Functions

Uploaded by

Functions

__device__ float DeviceFunc() device device

__host__ float HostFunc() host host

unsigned nblocks = ceil((float)N / BLOCKSIZE);

dkernel<<<nblocks, BLOCKSIZE>>>(vector, N);

What are the other arrows possible in this diagram?

dnums = hnums; // calls cudaMemcpy

std::cout << dnums[3] << std::endl;

thrust::transform(dsrc.begin(), dsrc.end(), dsrc2.begin(),

thrust::device_vector<float> x, y, result; … // populate x, y.

int result = thrust::reduce(vec.begin(), vec.end(),

int keys[N] = { 1, 4, 2, 8, 5, 7};

You might also like

device float DeviceFunc() device device

host float HostFunc() host host