0% found this document useful (0 votes)
4 views34 pages

5 Functions

The document provides an overview of CUDA function declarations and types, detailing the differences between host and device functions. It includes examples of function definitions, memory allocation, and the execution of kernels in CUDA programming. Additionally, it discusses the use of global variables and the implications of memory access on CPU and GPU.

Uploaded by

Omniverse9 BTFF
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
4 views34 pages

5 Functions

The document provides an overview of CUDA function declarations and types, detailing the differences between host and device functions. It includes examples of function definitions, memory allocation, and the execution of kernels in CUDA programming. Additionally, it discusses the use of global variables and the implications of memory access on CPU and GPU.

Uploaded by

Omniverse9 BTFF
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 34

Functions

Rupesh Nasre.

IIT Madras
January 2022
CUDA Function Declarations
Executed Callable from
on the: only the:

__device__ float DeviceFunc() device device


__global__ void KernelFunc() device host + device

__host__ float HostFunc() host host


__global__ defines a kernel. It must return void.

A program may have several functions of each kind.

The same function of any kind may be called multiple times.

Host == CPU, Device == GPU.

2 2
Function Types (1/2)
#include <stdio.h>
#include <cuda.h>
// Dual-compiled helper: nvcc emits both a CPU and a GPU version,
// so either side may call it.
__host__ __device__ void dhfun() {
    printf("I can run on both CPU and GPU.\n");
}
// Device-only helper: writes its own index into vector[id].
// Returns 1 when the write happened, 0 for out-of-range ids.
__device__ unsigned dfun(unsigned *vector, unsigned vectorsize, unsigned id) {
    if (id == 0)
        dhfun();              // thread 0 demonstrates the host+device helper
    if (id >= vectorsize)
        return 0;             // tail threads past the array do nothing
    vector[id] = id;
    return 1;
}
// Kernel entry point: one thread per element, delegating to dfun.
// dfun performs the id < vectorsize bounds check.
__global__ void dkernel(unsigned *vector, unsigned vectorsize) {
    unsigned id = blockDim.x * blockIdx.x + threadIdx.x;  // flat global thread id
    dfun(vector, vectorsize, id);
}
// Plain CPU-side function; shows that host code may call a
// __host__ __device__ function (the CPU copy of dhfun runs).
__host__ void hostfun() {
    printf("I am simply like another function running on CPU. Calling dhfun\n");
    dhfun();
}
3
Function Types (2/2)
#define BLOCKSIZE 1024
// Driver: fills a device array of N unsigneds via dkernel, copies it
// back, prints it, then exercises hostfun/dhfun from the host side.
// Usage: ./a.out <vectorsize>
int main(int nn, char *str[]) {
    // Fix: original dereferenced str[1] unconditionally and crashed
    // when the argument was missing.
    if (nn < 2) {
        fprintf(stderr, "Usage: %s <vectorsize>\n", str[0]);
        return 1;
    }
    unsigned N = (unsigned)atoi(str[1]);
    unsigned *vector, *hvector;

    if (cudaMalloc(&vector, N * sizeof(unsigned)) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed\n");
        return 1;
    }
    hvector = (unsigned *)malloc(N * sizeof(unsigned));
    if (hvector == NULL) {
        cudaFree(vector);
        return 1;
    }

    // Integer ceil-div replaces the float ceil() round-trip, which can
    // misround for large N.
    unsigned nblocks = (N + BLOCKSIZE - 1) / BLOCKSIZE;
    printf("nblocks = %u\n", nblocks);

    dkernel<<<nblocks, BLOCKSIZE>>>(vector, N);

    // cudaMemcpy on the default stream waits for the kernel, so no
    // explicit cudaDeviceSynchronize is needed before reading back.
    cudaMemcpy(hvector, vector, N * sizeof(unsigned), cudaMemcpyDeviceToHost);
    for (unsigned ii = 0; ii < N; ++ii) {
        printf("%4u ", hvector[ii]);   // %u (was %d) for unsigned values
    }
    printf("\n");

    hostfun();   // host -> host function -> dhfun (CPU copy)
    dhfun();     // host calls the __host__ __device__ function directly

    // Fix: original leaked both allocations.
    free(hvector);
    cudaFree(vector);
    return 0;
}

What are the other arrows possible in this diagram?


4
How about dhfun to dfun?
with HostAlloc'ed Memory
__host__ __device__ functions are friends with HostAlloc'ed memory.

__host__ __device__ void fun(int *counter) {
    ++*counter;
}

__global__ void printk(int *counter) {
    fun(counter);
    printf("printk (after fun): %d\n", *counter);
}

int main() {
    int *counter;
    cudaHostAlloc(&counter, sizeof(int), 0);
    *counter = 0;
    printf("main: %d\n", *counter);
    printk<<<1, 1>>>(counter);
    cudaDeviceSynchronize();
    fun(counter);
    printf("main (after fun): %d\n", *counter);
    return 0;
}

What is the output of this code?                                    5
with a Device-only Function
__host__ __device__ void fun(int *counter) {
    ++*counter;
    __syncthreads();
}

__global__ void printk(int *counter) {
    fun(counter);
    printf("printk (after fun): %d\n", *counter);
}

int main() {
    int *counter;
    cudaHostAlloc(&counter, sizeof(int), 0);
    *counter = 0;
    printf("main: %d\n", *counter);
    printk<<<1, 1>>>(counter);
    cudaDeviceSynchronize();
    fun(counter);
    printf("main (after fun): %d\n", *counter);
    return 0;
}

__syncthreads() is not available on CPU.                            6
with a CPU-only Memory
__host__ __device__ void fun(int *counter) {
    ++*counter;
}

__global__ void printk(int *counter) {
    fun(counter);
    printf("printk (after fun): %d\n", *counter);
}

int main() {
    int *counter;
    // cudaHostAlloc(&counter, sizeof(int), 0);
    cudaMalloc(&counter, sizeof(int));
    *counter = 0;
    printf("main: %d\n", *counter);
    printk<<<1, 1>>>(counter);
    cudaDeviceSynchronize();
    fun(counter);
    printf("main (after fun): %d\n", *counter);
    return 0;
}

counter cannot be accessed on CPU.                                  7
Global Variables
int counter;

__host__ __device__ void fun() {
    ++counter;
}

__global__ void printk() {
    fun();
    printf("printk (after fun): %d\n", counter);
}

int main() {
    counter = 0;
    printf("main: %d\n", counter);
    printk<<<1, 1>>>();
    cudaDeviceSynchronize();
    fun();
    printf("main (after fun): %d\n", counter);
    return 0;
}

counter cannot be accessed on GPU.                                  8
Global Variables
__host__ __device__ int counter;

__host__ __device__ void fun() {
    ++counter;
}

__global__ void printk() {
    fun();
    printf("printk (after fun): %d\n", counter);
}

int main() {
    counter = 0;
    printf("main: %d\n", counter);
    printk<<<1, 1>>>();
    cudaDeviceSynchronize();
    fun();
    printf("main (after fun): %d\n", counter);
    return 0;
}

Variables cannot be declared as __host__.                           9
Global Variables
__device__ int counter;

__host__ __device__ void fun() {
    ++counter;
}

__global__ void printk() {
    fun();
    printf("printk (after fun): %d\n", counter);
}

int main() {
    printk<<<1, 1>>>();
    cudaDeviceSynchronize();
    return 0;
}

Warning during compilation, but works fine.

10
Classwork
Write
WriteaaCUDA
CUDAcodecodetoto __host__
__host____device__
__device__voidvoidfun(int
fun(int*arr)
*arr){{
increment
incrementall
allelements
elementsinin for
for(unsigned
(unsignediiii==0;
0;iiii<<N;
N;++ii)
++ii)
an
anarray.
array.Call
Callthis
thiscode
code ++arr[ii];
++arr[ii];
from host as well as
from host as well as }}
device.
device. __global__
__global__void voiddfun(int
dfun(int*arr)
*arr){{
fun(arr);
fun(arr); Host-centric,
}} Host-centric,
sequential
sequentialon
onGPU
GPU
int main() {
int main() {
Classwork:
Classwork:Can Canyou
youavoid
avoid int
intarr[N],
arr[N],*darr;
*darr;
the
thefor
forloop
loopininfun?
fun?
cudaMalloc(&darr,
cudaMalloc(&darr,NN**sizeof(int));
sizeof(int));
for
for(unsigned
(unsignediiii==0; 0;iiii<<N;
N;++ii)
++ii)
arr[ii]
arr[ii]==ii;ii;
cudaMemcpy(darr,
cudaMemcpy(darr,arr, arr,NN**sizeof(int),
sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpyHostToDevice);
fun(arr);
fun(arr);
dfun<<<1,
dfun<<<1,1>>>(darr);
1>>>(darr);
cudaDeviceSynchronize();
cudaDeviceSynchronize();
return
return0;
0; 11
}}
Classwork
Write
WriteaaCUDA
CUDAcodecodetoto __host__
__host____device__
__device__voidvoidfun(int
fun(int*arr)
*arr){{
increment
incrementall
allelements
elementsinin ++arr;
++arr;
an
anarray.
array.Call
Callthis
thiscode
code }}
from host as well as
from host as well as __global__
__global__void voiddfun(int
dfun(int*arr)
*arr){{
device.
device. fun(arr
fun(arr++threadIdx.x);
threadIdx.x);
}}
int main() { Device-centric,
Device-centric,
int main() { sequential on CPU
int arr[N], *darr;
int arr[N], *darr; sequential on CPU
Classwork:
Classwork:Can Canyou
youavoid
avoid
the
thefor
forloop
loopininfun?
fun? cudaMalloc(&darr,
cudaMalloc(&darr,NN**sizeof(int));
sizeof(int));
for
for(unsigned
(unsignediiii==0; 0;iiii<<N;
N;++ii)
++ii)
arr[ii]
arr[ii]==ii;ii;
cudaMemcpy(darr,
cudaMemcpy(darr,arr, arr,NN**sizeof(int),
sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpyHostToDevice);
Classwork: for
for(unsigned
(unsignediiii==0; 0;iiii<<N;
N;++ii)
++ii)
Classwork:What WhatififI Idon’t
don’t fun(arr
like
likethe
theforforloop
loopininmain,
main, fun(arr++ii); ii);
but dfun<<<1, N>>>(darr);
dfun<<<1, N>>>(darr);
butstill
stillwant
wantGPU-parallel
GPU-parallel cudaDeviceSynchronize();
code?
code? cudaDeviceSynchronize();
return
return0;
0;
}} 12
Classwork: Pranav’s idea
__host__
__host____device__
__device__void voidfun(int
fun(int*arr,
*arr,int
intnn)
nn){{
Write
WriteaaCUDA
CUDAcodecodetoto for
for(unsigned
(unsignediiii==0;
0;iiii<<nn;
nn;++ii)
++ii)
increment
incrementall
allelements
elementsinin ++arr[ii];
++arr[ii];
an
anarray.
array.Call
Callthis
thiscode
code }}
from host as well as
from host as well as __global__
__global__void voiddfun(int
dfun(int*arr)
*arr){{
device.
device. fun(arr
fun(arr++threadIdx.x,
threadIdx.x,1); 1);
////need
needtotochange
changeforformore
moreblocks.
blocks.
}}
Classwork: int
intmain()
main(){{
Classwork:Can Canyou
youavoid
avoid int
the
thefor
forloop
loopininfun?
fun? intarr[N],
arr[N],*darr;
*darr;
cudaMalloc(&darr,
cudaMalloc(&darr,NN**sizeof(int));
sizeof(int));
for
for(unsigned
(unsignediiii==0; 0;iiii<<N;
N;++ii)
++ii)
arr[ii]
arr[ii]==ii;ii;
cudaMemcpy(darr,
cudaMemcpy(darr,arr, arr,NN**sizeof(int),
sizeof(int),
Classwork:
Classwork:What WhatififI Idon’t
don’t cudaMemcpyHostToDevice);
cudaMemcpyHostToDevice);
like
likethe
theforforloop
loopininmain,
main,
but
butstill
stillwant
wantGPU-parallel fun(arr,
GPU-parallel fun(arr,N);
N);
code? dfun<<<1,
code? dfun<<<1,N>>>(darr);
N>>>(darr);
cudaDeviceSynchronize();
cudaDeviceSynchronize();
13
return
return0;
0;
}}
Thrust

Thrust is a parallel algorithms library
(similar in spirit to STL on CPU).

Supports vectors and associated transforms.

Programmer is oblivious to where code executes
– on CPU or GPU.

Makes use of C++ features such as functors, and
__host__ __device__ functions.

14
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>
// host_vector basics: element assignment, size queries, resize, and
// host->device copy via device_vector's assignment.
int main(void) {
    // host-side vector with room for 4 integers
    thrust::host_vector<int> H(4);

    // fill in each slot by hand
    H[0] = 14; H[1] = 20; H[2] = 38; H[3] = 46;

    // report the current element count
    std::cout << "H has size " << H.size() << std::endl;

    // dump every element
    for (int idx = 0; idx < (int)H.size(); ++idx)
        std::cout << "H[" << idx << "] = " << H[idx] << std::endl;

    // shrink to two elements
    H.resize(2);
    std::cout << "H now has size " << H.size() << std::endl;

    // copy host contents into a device vector
    thrust::device_vector<int> D = H;

    // device elements are assignable from host code
    D[0] = 99; D[1] = 88;

    // both vectors release their storage automatically on return
    return 0;
}
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#include <iostream>
// fill / sequence / copy round trip between device and host vectors.
int main(void) {
    // device vector: ten elements, all initialized to 1
    thrust::device_vector<int> D(10, 1);

    // overwrite the first seven entries with 9
    thrust::fill(D.begin(), D.begin() + 7, 9);

    // host vector built from the first five elements of D
    thrust::host_vector<int> H(D.begin(), D.begin() + 5);

    // overwrite H with 0, 1, 2, 3, ...
    thrust::sequence(H.begin(), H.end());

    // write H back over the start of D
    thrust::copy(H.begin(), H.end(), D.begin());

    // show the final contents of D
    for (int idx = 0; idx < (int)D.size(); ++idx)
        std::cout << "D[" << idx << "] = " << D[idx] << std::endl;
    return 0;
}
Thrust Details
thrust::host_vector<int> hnums(1024);
thrust::device_vector<int> dnums;

dnums = hnums; // calls cudaMemcpy

// initialization.
thrust::device_vector<int> dnum2(hnums.begin(), hnums.end());
hnums = dnum2; // array resizing happens automatically.

std::cout << dnums[3] << std::endl;

thrust::transform(dsrc.begin(), dsrc.end(), dsrc2.begin(),


ddst.begin(), addFunc);

17
Thrust Functions

find(begin, end, value);

find_if(begin, end, predicate);

copy, copy_if.

count, count_if.

equal.

min_element, max_element.

merge, sort, reduce.

transform.
18

...
Thrust Algorithms

Dual implementations: host and device

Iterators as arguments must be on the same
device
– except copy, which can copy across devices
– Otherwise, compiler issues error

19
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/replace.h>
#include <thrust/functional.h>
#include <iostream>
// Built-in functors (negate, modulus) driving thrust::transform,
// plus fill/replace and ostream_iterator output.
int main(void) {
    // three device vectors of ten ints each
    thrust::device_vector<int> X(10);
    thrust::device_vector<int> Y(10);
    thrust::device_vector<int> Z(10);

    // X = 0, 1, 2, 3, ...
    thrust::sequence(X.begin(), X.end());

    // Y = -X, element-wise
    thrust::transform(X.begin(), X.end(), Y.begin(), thrust::negate<int>());

    // Z = 2 everywhere
    thrust::fill(Z.begin(), Z.end(), 2);

    // Y = X mod 2, element-wise
    thrust::transform(X.begin(), X.end(), Z.begin(), Y.begin(), thrust::modulus<int>());

    // every 1 in Y becomes 10
    thrust::replace(Y.begin(), Y.end(), 1, 10);

    // stream Y to stdout, one value per line
    thrust::copy(Y.begin(), Y.end(), std::ostream_iterator<int>(std::cout, "\n"));
    return 0;
}
Thrust User-Defined Functors
// calculate result[] = (a * x[]) + y[]
// Binary functor for thrust::transform; callable on host and device.
struct saxpy {
    const float _a;   // scale factor captured at construction

    // Fix: parameter was `int a`, which silently truncated fractional
    // scale factors (saxpy(2.5f) became _a == 2). Taking float is
    // backward compatible: an int argument still converts implicitly.
    saxpy(float a) : _a(a) { }

    __host__ __device__
    float operator()(const float& x, const float& y) const {
        return _a * x + y;
    }
};

thrust::device_vector<float> x, y, result; … // populate x, y.


thrust::transform(x.begin(), x.end(), y.begin(), result.begin(), saxpy(a));

21
Classwork

Create two 32-element vectors:
– X on host, Y on device

Fill X with 10, fill Y with sequence 0..31

Compute X = X – Y

Compute Z = X * Y
– // element-wise multiplication

22
Thrust Reductions

Recall reductions in log(n) barriers

No need to worry about blocks, synchronization.
int x, y;
thrust::host_vector<int> hvec;
thrust::device_vector<int> dvec;

// (thrust::reduce is a sum operation by default)
x = thrust::reduce(hvec.begin(), hvec.end());   // on CPU
y = thrust::reduce(dvec.begin(), dvec.end());   // on GPU

y = thrust::reduce(dvec.begin(), dvec.end(),
                   (int)0, thrust::plus<int>());

Classwork: Implement count using reduction.
For instance, I want to find the number of occurrences of an element in a
vector.                                                             23
// Reduction functor that counts occurrences of _a: the first argument
// is treated as the running count (accumulator), the second as the
// next element of the sequence.
// NOTE(review): this relies on reduce behaving as a sequential left
// fold. thrust::reduce formally requires an associative, commutative
// operator combining values of the same type, so this pattern is not
// guaranteed correct for a parallel device reduction — thrust::count
// is the robust way to do this. Verify on the target backend.
struct mycount {
int _a;   // value whose occurrences are counted
mycount(int a):_a(a){}
__host__ __device__
int operator()(const int x, const int y) const {
return (y == _a ? x + 1 : x);
}
};
// Counts the occurrences of 5 in a host vector by folding with the
// mycount functor, starting the accumulator at 0.
int main() {
    // ten zeros, with 5 planted at three positions
    thrust::host_vector<int> vec(10, 0);
    vec[1] = 5;
    vec[4] = 5;
    vec[9] = 5;

    // accumulator starts at 0; bumped once per element equal to 5
    int result = thrust::reduce(vec.begin(), vec.end(), (int)0, mycount(5));

    std::cout << result << std::endl;
    return 0;
}
Prefix Sum / Scan
#include <thrust/scan.h>
int data[6] = {1, 0, 2, 2, 1, 3};
// inclusive scan
thrust::inclusive_scan(data, data + 6, data);
// data is now {1, 1, 3, 5, 6, 9}
thrust::exclusive_scan(data, data + 6, data);
// data is now {0, 1, 1, 3, 5, 6}

25
Classwork

What is the output of the following code?
int data[] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
int sizedata = sizeof(data) / sizeof(*data);
thrust::plus<int> binop;
thrust::exclusive_scan(data, data + sizedata, data, 5, binop);
for (unsigned ii = 0; ii < sizedata; ++ii) {
    std::cout << data[ii] << " ";
}
std::cout << std::endl;

26
5 0 0 2 -1 1 5 5 4 6
Classwork

What is the output of the following code?
int data[] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
int sizedata = sizeof(data) / sizeof(*data);
thrust::plus<int> binop;
thrust::inclusive_scan(data, data + sizedata, data, 5, binop);
for (unsigned ii = 0; ii < sizedata; ++ii) {
    std::cout << data[ii] << " ";
}
std::cout << std::endl;

27
Compile-time error: Why?
Classwork

What is the output of the following code?
int data[] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
int sizedata = sizeof(data) / sizeof(*data);
thrust::plus<int> binop;
thrust::inclusive_scan(data, data + sizedata, data, binop);
for (unsigned ii = 0; ii < sizedata; ++ii) {
    std::cout << data[ii] << " ";
}
std::cout << std::endl;

28
-5 -5 -3 -6 -4 0 0 -1 1 9
Classwork

What is the output of the following code?
int data [] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
int data2[] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
int sizedata = sizeof(data) / sizeof(*data);
thrust::plus<int> binop;
thrust::inclusive_scan(data, data + sizedata, data, binop);
thrust::exclusive_scan(data, data + sizedata, data2, 0, binop);
for (unsigned ii = 0; ii < sizedata; ++ii)
    std::cout << data2[ii] << " ";
std::cout << std::endl;

29
0 -5 -10 -13 -19 -23 -23 -23 -24 -23
Classwork: Find output
// Exclusive scan with thrust::maximum: each output is the running
// maximum of all earlier elements, seeded with 1.
int main() {
    int data[] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
    int sizedata = sizeof(data) / sizeof(*data);

    thrust::maximum<int> binop;
    thrust::exclusive_scan(data, data + sizedata, data, 1, binop);

    // print the scanned array, space-separated
    for (unsigned idx = 0; idx < (unsigned)sizedata; ++idx) {
        std::cout << data[idx] << " ";
    }
    std::cout << std::endl;
    return 0;
}
1 1 1 2 2 2 4 4 4 4
Set Operations
#include <thrust/set_operations.h>

Must be sorted
int A1[7] = {0, 1, 3, 4, 5, 6, 9};
int A2[5] = {1, 3, 5, 7, 9};
int result[N];
thrust::set_difference(A1, A1+7, A2, A2+5, result);

result is {0, 4, 6}.
31
Set Operations
#include <thrust/set_operations.h>

int A1[] = {9, 6, 5, 4, 3, 1, 0};
int A2[5] = {9, 7, 5, 3, 1};
int result[N];
thrust::set_difference(A1, A1+7, A2, A2+5, result,
thrust::greater<int>());
result is {6, 4, 0}.
32
Sorting
#include <thrust/sort.h>
...
const int N = 6;
int A[N] = {1, 4, 2, 8, 5, 7};
thrust::sort(A, A + N);
// A is now {1, 2, 4, 5, 7, 8}

int keys[N] = { 1, 4, 2, 8, 5, 7};


char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'};
thrust::sort_by_key(keys, keys + N, values);
// keys is now { 1, 2, 4, 5, 7, 8}
// values is now {'a', 'c', 'b', 'e', 'f', 'd'}
33
Summary

__host__, __device__, __global__

Functors

Thrust
– Aggregate functions
– Iterators
– Reduction
– Prefix sum

34

You might also like