0% found this document useful (0 votes)

57 views49 pages

Basic-Cuda

This document provides an overview of a course on GPU architectures and programming. The course covers topics like GPU architecture, CUDA programming, multi-dimensional data and synchronization, warp scheduling and divergence, and optimizing neural network training. It includes a 12-week schedule detailing the topics to be covered each week. The document also provides brief introductions to CUDA C programming and the CUDA program structure, explaining that CUDA programs have host code that runs on the CPU and device code that runs on GPUs.

Uploaded by

Vijay Reddy

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

57 views49 pages

Basic-Cuda

Uploaded by

Vijay Reddy

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 49

GPU Architectures and Programming

Soumyajit Dey, Assistant Professor,

CSE, IIT Kharagpur

December 12, 2019

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Course Organization
Topic Week Hours
Review of basic COA w.r.t. performance 1 2
Intro to GPU architectures 2 3
Intro to CUDA programming 3 2
Multi-dimensional data and synchronization 4 2
Warp Scheduling and Divergence 5 2
Memory Access Coalescing 6 2
Optimizing Reduction Kernels 7 3
Kernel Fusion, Thread and Block Coarsening 8 3
OpenCL - runtime system 9 3
OpenCL - heterogeneous computing 10 2
Efficient Neural Network Training/Inferencing 11-12 6
TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Compute Unified Device Architecture

I CUDA C is an extension of C programming language with special constructs for

supporting parallel computing
I CUDA programmer perspective - CPU is a host : dispatches parallel jobs to GPU
devices

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
CUDA program structure

I host code for a host device (CPU)

I device code for GPU(s)
I Any C program is a valid CUDA host code
I In general CUDA programs (host + device) code cannot be compiled by standard
C compilers
NVIDIA C compiler (NVCC)

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
The compilation flow

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
The execution flow

CPU serial code

⇓
GPU parallel kernel
⇓
CPU serial code
⇓
GPU parallel kernel

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Examples : Vector addition CPU only

/*
 * Element-wise vector addition on the CPU: h_C[i] = h_A[i] + h_B[i]
 * for every i in [0, n).
 *
 * h_A, h_B : input arrays, each holding at least n floats
 * h_C      : output array with room for at least n floats
 * n        : number of elements to add; n <= 0 leaves h_C untouched
 */
void vecAdd(float *h_A, float *h_B, float *h_C, int n)
{
    /* The loop index must be declared; the original listing used it undeclared. */
    for (int i = 0; i < n; i++)
        h_C[i] = h_A[i] + h_B[i];
}
int main ()
{
float * h_A ,* h_B ,* h_C ;
int n ;
h_A =( float *) malloc ( n * sizeof ( float ) )
h_B =( float *) malloc ( n * sizeof ( float ) )
h_C =( float *) malloc ( n * sizeof ( float ) )
vecAdd ( h_A , h_B , h_C , N ) ;
} TE
OF
TECHNO
LO

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Examples : Vector addition CPU-GPU
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda.h>
#include <cuda_runtime.h>
__global__ void vectorAdd ( float * , float * , float * , int ) ;
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
// CUDA kernel definition: every thread handles at most one element and
// writes C[idx] = A[idx] + B[idx]. The element index is built from the
// x components of the thread and block indices; threads whose index is
// n or larger return without touching memory.
__global__ void vectorAdd(float *A, float *B, float *C, int n)
{
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= n)
        return;
    C[idx] = A[idx] + B[idx];
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
// Host wrapper for the GPU vector addition. The statements that follow
// allocate device buffers, copy h_A and h_B to the device, launch
// vectorAdd, copy the result into h_C and verify it on the host.
//   h_A, h_B : host input vectors with n floats each
//   h_C      : host output vector with room for n floats
//   n        : number of elements
void vecAdd(float *h_A, float *h_B, float *h_C, int n)
{ // host program
// Size of one vector in bytes; kept as size_t so the sizeof arithmetic is
// not narrowed to int.
size_t size = n * sizeof(float);
float *d_A = NULL, *d_B = NULL, *d_C = NULL;
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Device Memory Allocation
// Allocate one device buffer of `size` bytes for each vector. cudaMalloc
// reports failure through its return value, so every call is checked and
// the program exits with the CUDA error string if an allocation fails.
err = cudaMalloc((void **) &d_A, size);
if (err != cudaSuccess)
{
    fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}
err = cudaMalloc((void **) &d_B, size);
if (err != cudaSuccess)
{
    fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}
err = cudaMalloc((void **) &d_C, size);
if (err != cudaSuccess)
{
    fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Host to Device Data Transfer
// Copy both input vectors from host memory (h_A, h_B) into the device
// buffers (d_A, d_B). Each copy is checked; a failure aborts the program.
printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

if (err != cudaSuccess)
{
    fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}

err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

if (err != cudaSuccess)
{
    fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}

ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Kernel Launch

// Launch configuration: 256 threads per block and enough blocks to cover
// all n elements (integer ceiling of n / threadsPerBlock).
int threadsPerBlock = 256;
int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
// Argument order must follow the format string: block count first, then
// threads per block (the original passed them swapped).
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
       threadsPerBlock);
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, n);
// device function (CUDA kernel) called from host does not have return type
// CUDA runtime functions (execute in host side) can have return type,
// so a launch failure is retrieved separately with cudaGetLastError().
err = cudaGetLastError();

if (err != cudaSuccess)
{
    fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}
OF
TECHNO
LO

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Device to Host Memory Transfer
// Copy the result vector from the device buffer d_C back into h_C.
// The message matches the sample output shown on the "Compile and Run" slide.
printf("Copy output data from the CUDA device to the host memory\n");
err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
{
    fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}
// The device buffers are no longer read after the copy above.
cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
// Verify that the result vector is correct: every element must match the
// host-side sum within an absolute tolerance of 1e-5.
for (int i = 0; i < n; ++i)
{
    if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
    {
        fprintf(stderr, "Result verification failed at element %d!\n", i);
        exit(EXIT_FAILURE);
    }
}
printf("Test PASSED");
} // End of Function

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Compile and Run

nvcc kernel.cu host.cu -o output

./output
[ Vector addition of 50000 elements ]
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 196 blocks of 256 threads
Copy output data from the CUDA device to the host memory
Test PASSED

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Observations

Host Memory Device Memory

CPU GPU

Figure: CPU/GPU Mem Layout

I cuda.h → includes during compilation CUDA API functions and CUDA system
variables TE
OF
TECHNO
LO

GY
ITU
IAN INST

KH
ARAGPUR
I h_A, h_B, h_C → arrays mapped to main memory locations

IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Observations

cudaMalloc((void **) &d_A, size);

// allocate memory segment from GPU global memory
// expects a generic pointer (void **)
// the low level function is common for all object types
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
// transfer data from CPU to GPU memory
// d_A cannot be dereferenced in host code

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Observations

// d_A cannot be dereferenced in host code

cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
// transfer data from GPU to CPU memory
// can also transfer among different device mem locations
// can also transfer data host to host - we do not need that
// cannot transfer data among different GPU devices
// NOTE(review): the runtime also offers cudaMemcpyPeer for GPU-to-GPU
// copies - confirm whether this restriction still holds for the toolkit in use
cudaFree(d_A);
// free GPU global memory

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
CUDA kernel

A CUDA kernel when invoked launches multiple threads arranged in a 2 level hierarchy,
check the device fn call.
// n / 256 with two ints truncates before ceil() sees it, which drops the
// last partial block; dividing by 256.0 keeps the fraction so ceil() rounds up.
vectorAdd<<<ceil(n / 256.0), 256>>>
(d_A, d_B, d_C, n)

I The call specifies a grid of threads to be launched

I the grid is arranged in a hierarchical manner
I (no. of blocks, no. of thread per block)
I all blocks contain same no. of threads (max 1024)
I blocks can be numbered as (_,_,_) triplets : more on this later
TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Block 0 Block 1 Block N-1

0 1 2 3 252 253 254 255 0 1 2 3 252 253 254 255 0 1 2 3 252 253 254 255

i=blockIdx.x*blockDim.x+threadIdx.x i=blockIdx.x*blockDim.x+threadIdx.x ... i=blockIdx.x*blockDim.x+threadIdx.x

C[i]=A[i]+B[i] C[i]=A[i]+B[i] C[i]=A[i]+B[i]

... ... ...

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Kernel specific system vars

I gridDim - no. of blocks in the grid

I gridDim.x - no. of blocks in dimension x of multi-dim grid !!
I blockDim - no. of threads/block
I blockDim.x - no. of threads/block in dimension x of multi-dim block !!
I For single dimension defn of block composition in grid, blockDim = blockDim.x
I blockIdx.x = block number for a thread
I threadIdx.x = thread no. inside a block

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
// Vector-addition kernel executed by every thread of the grid. Each thread
// derives its own element index from (blockIdx.x, threadIdx.x) and adds one
// pair of elements; indices at or beyond n are skipped.
__global__ void vectorAdd(float *A, float *B, float *C, int n)
{
    const int elem = threadIdx.x + blockIdx.x * blockDim.x;
    const bool inRange = elem < n;
    if (inRange)
    {
        C[elem] = A[elem] + B[elem];
    }
}

I The code is executed by all the threads in the grid

I Every thread has a unique combination of (blockIdx.x, threadIdx.x) which maps to
a unique value of i
I i is private to each thread

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Block 0 Block 1 Block N-1

0 1 2 3 252 253 254 255 0 1 2 3 252 253 254 255 0 1 2 3 252 253 254 255

1 256 253

i=blockIdx.x*blockDim.x+threadIdx.x i=blockIdx.x*blockDim.x+threadIdx.x ... i=blockIdx.x*blockDim.x+threadIdx.x

C[i]=A[i]+B[i] C[i]=A[i]+B[i] C[i]=A[i]+B[i]

... ... ...

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Function declaration Keywords

__global__
void vectorAdd ( float * A , float * B , float * C , int n )

Table: CUDA Keywords for functions and their scope

Keywords and Functions Executed on the Only callable from the
__device__ float DeviceFunc() device device
__global__ void KernelFunc() device host
__host__ float HostFunc() host host

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
CUDA functions

I Every function is a default __host__ function (if not having any CUDA
keywords)
I A function can be declared as both __host__ and __device__ function
I "__host__ __device__ fn()"
I The compilation system generates two versions of the function's object code: one can be called from host fn()s,
another from device fn()s
I __global__ functions can also be called from the device using CUDA kernel
semantics (<<< ... >>>) if you are using dynamic parallelism - that requires
CUDA 5.0 and compute capability 3.5 or higher.
TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
CUDA functions : more observations

I __device__ functions can have a return type other than void but __global__
functions must always return void
I __global__ functions can be called from within other kernels running on the GPU
to launch additional GPU threads (as part of CUDA dynamic parallelism model)
while __device__ functions run on the same thread as the calling kernel.

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Matrix Multiplication (CPU only)

/*
 * CPU reference for square matrix multiplication: P = M x N.
 *
 * All three matrices are N-by-N and stored row-major in flat float arrays,
 * so element (i, j) lives at index i * N + j (the same layout the GPU
 * kernel uses).
 *
 * h_M, h_N : input matrices with N * N floats each
 * h_P      : output matrix with room for N * N floats
 * N        : number of rows and columns
 *
 * The original listing named both a matrix and the dimension `N` and
 * indexed the float pointers as 2D arrays; neither compiles.
 */
void MatrixMulKernel(float *h_M, float *h_N, float *h_P, int N)
{
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
        {
            /* Dot product of row i of h_M with column j of h_N. */
            float Pvalue = 0.0f;
            for (int k = 0; k < N; ++k)
            {
                Pvalue += h_M[i * N + k] * h_N[k * N + j];
            }
            h_P[i * N + j] = Pvalue;
        }
}

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Matrix Multiplication Host Program

// Exit with a message when a CUDA runtime call did not return cudaSuccess.
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "%s failed (error code %s)!\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Host program for the GPU matrix multiplication: allocates host and device
// matrices, copies the inputs to the device, launches MatrixMulKernel on a
// 2D grid of 8x8 blocks and copies the product back.
// The host matrices are named h_M / h_N / h_P because the original used `N`
// for both a matrix and the dimension, and never declared or allocated any
// of the buffers it copied.
int main()
{
    const int N = 16;       // N is the number of rows and columns
    const int size = N * N; // elements per matrix (16*16)
    const size_t bytes = size * sizeof(float);

    float *h_M = (float *) malloc(bytes);
    float *h_N = (float *) malloc(bytes);
    float *h_P = (float *) malloc(bytes);
    if (h_M == NULL || h_N == NULL || h_P == NULL)
    {
        fprintf(stderr, "Failed to allocate host matrices!\n");
        exit(EXIT_FAILURE);
    }
    // Placeholder input values so the kernel reads initialised data.
    for (int idx = 0; idx < size; ++idx)
    {
        h_M[idx] = 1.0f;
        h_N[idx] = 2.0f;
    }

    float *d_M = NULL, *d_N = NULL, *d_P = NULL;
    checkCuda(cudaMalloc((void **) &d_M, bytes), "cudaMalloc d_M");
    checkCuda(cudaMalloc((void **) &d_N, bytes), "cudaMalloc d_N");
    checkCuda(cudaMalloc((void **) &d_P, bytes), "cudaMalloc d_P");

    checkCuda(cudaMemcpy(d_M, h_M, bytes, cudaMemcpyHostToDevice), "copy M to device");
    checkCuda(cudaMemcpy(d_N, h_N, bytes, cudaMemcpyHostToDevice), "copy N to device");

    // 8x8 threads per block; the grid is the ceiling of N / 8 in each
    // dimension, which is 2x2 for N = 16 as in the original listing.
    dim3 block(8, 8, 1);
    dim3 grid((N + block.x - 1) / block.x, (N + block.y - 1) / block.y, 1);
    MatrixMulKernel<<<grid, block>>>(d_M, d_N, d_P, N);
    checkCuda(cudaGetLastError(), "MatrixMulKernel launch");

    checkCuda(cudaMemcpy(h_P, d_P, bytes, cudaMemcpyDeviceToHost), "copy P to host");

    cudaFree(d_M);
    cudaFree(d_N);
    cudaFree(d_P);
    free(h_M);
    free(h_N);
    free(h_P);
    return 0;
}
TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Matrix Multiplication Kernel

// GPU matrix multiplication kernel: d_P = d_M x d_N for N-by-N matrices
// stored row-major in flat arrays.
// Each thread computes one output element. Its row i comes from the y
// components of the block/thread indices and its column j from the x
// components, so the kernel expects a 2D launch. Threads that fall outside
// the N x N matrix do nothing.
__global__
void MatrixMulKernel(float *d_M, float *d_N, float *d_P, int N) {
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    if ((i < N) && (j < N)) {
        // Dot product of row i of d_M with column j of d_N.
        float Pvalue = 0.0f;
        for (int k = 0; k < N; ++k) {
            Pvalue += d_M[i * N + k] * d_N[k * N + j];
        }
        d_P[i * N + j] = Pvalue;
    }
}

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
X =

Figure: Matrix Multiplication

$$ d\_P[i \cdot N + j] = \sum_{k=0}^{N-1} d\_M[i \cdot N + k] \cdot d\_N[k \cdot N + j] $$
TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Slightly Advanced Example: Julia Sets

A Julia Set J (named after the French mathematician Gaston Julia, who worked on
complex dynamics during the early 20th century) represents a set of points
contained in the boundary of a certain class of functions over complex numbers.
I Given a set of points in a complex plane, the set J is constructed by evaluating, for
each point, a simple iterative equation given by $Z_{n+1} = Z_n^2 + C$, where $Z_n$ represents
a complex number and C represents a complex constant.
I A point does not belong to J , if iterative application of the equation yields a
diverging sequence of numbers for that point.

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Complex Numbers (CPU)
/* Complex number with single-precision real (r) and imaginary (i) parts. */
struct Complex {
    float r;
    float i;
};

/*
 * Returns r*r + i*i for a, i.e. the squared magnitude.
 * No square root is taken, so callers compare against a squared threshold.
 */
float magnitude(struct Complex a) {
    return ((a.r * a.r) + (a.i * a.i));
}

/* Stores the sum a + b in *res. */
void add(struct Complex a, struct Complex b, struct Complex *res) {
    res->r = a.r + b.r;
    res->i = a.i + b.i;
}

/*
 * Stores the product a * b in *res.
 * a and b are passed by value, so res may point at either argument's
 * original object without corrupting the computation.
 */
void mul(struct Complex a, struct Complex b, struct Complex *res) {
    res->r = (a.r * b.r) - (a.i * b.i);
    res->i = (a.r * b.i) + (a.i * b.r);
}

19 5 1

yog, kms kOflm

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
From Pixel Grid to Complex Plane
x
b Translate and Scale
y
(aj, bj ) aj = scale*(DIM/2 - xj)/(DIM/2)
bj = scale*(DIM/2 - yj)/(DIM/2)
(xj, yj )
Colour (xj,yj) depending on
DIM membership of aj + ibj in Julia Set
a

Colour (xj,yj) red if it belongs to set

Colour (xj,yj) black if it does not belong
to set.

DIM TE
OF
TECHNO
LO

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

Figure: Coordinate Transformation yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Julia Function for a point (CPU)
int julia ( int x , int y ) {
const float scale = 1.5;
float jx = scale * ( float ) ( DIM /2 - x ) /( DIM /2) ;
float jy = scale * ( float ) ( DIM /2 - y ) /( DIM /2) ;

struct Complex c ,a , r1 , r2 ;
c . r = -0.8; c . i =0.154;
a . r = jx ; a . i = jy ;
int i = 0;
for ( i =0; i <200; i ++) {
// a = a * a + c ;
mul (a ,a ,& r1 ) ;
add ( r1 ,c ,& r2 ) ;
if ( magnitude ( r2 ) > 1000)
return 0; // return 0 if it is not in set
a . r = r2 . r ;
a . i = r2 . i ;
} TE
OF
TECHNO
LO

GY
ITU
IAN INST

KH
ARAGPUR
return 1; // return 1 if point is in set

IND

19 5 1

} yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Driver Code(CPU)
/*
 * Fills a DIM x DIM image, 4 bytes per pixel, by evaluating julia() for
 * every pixel in row-major order. For each pixel the first byte is
 * 255 * julia(x, y), the next two bytes are 0 and the fourth byte is 255.
 *
 * ptr : output buffer with at least DIM * DIM * 4 bytes
 */
void kernel(unsigned char *ptr)
{
    for (int row = 0; row < DIM; row++)
    {
        for (int col = 0; col < DIM; col++)
        {
            unsigned char *pixel = ptr + (col + row * DIM) * 4;
            const int inSet = julia(col, row);
            pixel[0] = 255 * inSet;
            pixel[1] = 0;
            pixel[2] = 0;
            pixel[3] = 255;
        }
    }
}

A 32 bit per pixel color bitmap represents a 2D grid of pixel values where each pixel is
represented by 4 channels (R,G,B,α) and where each channel has values in the range
[0, 255]. (α represents transparency.)

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Driver Code(CPU)

// CPU driver for the Julia-set image: creates a DIM x DIM bitmap, lets
// kernel() fill its pixel buffer and hands the bitmap over for display.
// NOTE(review): CPUBitmap is declared outside this listing; display_and_exit
// presumably shows the image and terminates the program - confirm.
int main(void)
{
    CPUBitmap bitmap(DIM, DIM);
    unsigned char *ptr = bitmap.get_ptr();
    kernel(ptr);
    bitmap.display_and_exit();
}

We leave out intricate details of how bitmap data is constructed. The primary focus of
discussing this application lies in depicting the underlying computation involved in
constructing Julia sets.

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Complex Numbers GPU
// Complex number with single-precision real (r) and imaginary (i) parts,
// used by the device-side Julia code.
struct cuComplex {
    float r;
    float i;
};

// Returns r*r + i*i for a, i.e. the squared magnitude (no square root).
// The __device__ qualifier makes the helper callable from GPU code; the
// listing had lost the underscores.
__device__ float magnitude(struct cuComplex a) {
    return ((a.r * a.r) + (a.i * a.i));
}

// Stores the sum a + b in *res.
__device__ void add(struct cuComplex a, struct cuComplex b, struct cuComplex *res) {
    res->r = a.r + b.r;
    res->i = a.i + b.i;
}

// Stores the product a * b in *res.
__device__ void mul(struct cuComplex a, struct cuComplex b, struct cuComplex *res) {
    res->r = (a.r * b.r) - (a.i * b.i);
    res->i = (a.r * b.i) + (a.i * b.r);
}

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Julia Function GPU
// Device-side Julia-set membership test for pixel (x, y) of a DIM x DIM image.
// The pixel is centred on DIM/2 and scaled by 1.5 to obtain the starting
// complex value, then z = z*z + c is iterated up to 200 times with
// c = -0.8 + 0.154i.
// Returns 0 once magnitude() of the new value exceeds 1000, otherwise 1.
__device__ int julia(int x, int y) {
    const float scale = 1.5;
    const float jx = scale * (float) (DIM / 2 - x) / (DIM / 2);
    const float jy = scale * (float) (DIM / 2 - y) / (DIM / 2);

    struct cuComplex c, z, zSquared, zNext;
    c.r = -0.8;
    c.i = 0.154;
    z.r = jx;
    z.i = jy;

    for (int step = 0; step != 200; ++step) {
        // zNext = z * z + c
        mul(z, z, &zSquared);
        add(zSquared, c, &zNext);
        const bool escaped = magnitude(zNext) > 1000;
        if (escaped) {
            return 0; // not in the set
        }
        z = zNext;
    }
    return 1; // in the set
}

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
CUDA Kernel GPU

// Julia-set kernel: one pixel per block.
// Expects a 2D grid in which blockIdx.x / blockIdx.y are the pixel's x / y
// coordinates and gridDim.x is the image width; threadIdx is not used.
// ptr must hold 4 bytes per pixel; the first byte is set to 255 when
// julia() returns 1 and to 0 otherwise.
// The __global__ qualifier is required for a function launched with <<<...>>>;
// the listing had lost the underscores.
__global__ void kernel(unsigned char *ptr) {
    // map from blockIdx to pixel position
    int x = blockIdx.x;
    int y = blockIdx.y;
    int offset = x + y * gridDim.x;

    int juliaValue = julia(x, y);

    ptr[offset * 4 + 0] = 255 * juliaValue; // red if 1, black if 0
    ptr[offset * 4 + 1] = 0;
    ptr[offset * 4 + 2] = 0;
    ptr[offset * 4 + 3] = 255;
}

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Host Program

// Host program for the GPU Julia set: allocates a device image buffer,
// launches kernel() with one single-thread block per pixel, copies the
// image back into the host bitmap and displays it.
// Every CUDA call is checked; a failure prints the CUDA error string and exits.
int main(void) {
    CPUBitmap bitmap(DIM, DIM);
    unsigned char *dev_bitmap = NULL;

    cudaError_t err = cudaMalloc((void **) &dev_bitmap, bitmap.image_size());
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device bitmap (error code %s)!\n",
                cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    dim3 grid(DIM, DIM);
    kernel<<<grid, 1>>>(dev_bitmap);
    // A launch returns nothing; configuration errors are read back here.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch kernel (error code %s)!\n",
                cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaMemcpy(bitmap.get_ptr(), dev_bitmap, bitmap.image_size(),
                     cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy bitmap from device to host (error code %s)!\n",
                cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // The device buffer is not read after the copy, so release it before
    // display. NOTE(review): display_and_exit presumably does not return,
    // which would have skipped a cudaFree placed after it - confirm.
    cudaFree(dev_bitmap);
    bitmap.display_and_exit();
}

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Julia Fractal Pattern

Figure: Julia Set TE

OF
TECHNO
LO

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

CUDA by Example: An Introduction to General-Purpose GPU Programming by Sanders et al.

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Complex Numbers (CPU) C++

// Complex number with single-precision parts and the operations needed by
// the Julia iteration. The member functions do not modify the object and
// are const-qualified so they also work on const operands.
struct Complex
{
    float r; // real part
    float i; // imaginary part
    Complex(float a, float b) : r(a), i(b)
    {}
    // Squared magnitude r*r + i*i (no square root is taken).
    float magnitude2(void) const { return r * r + i * i; }
    // Complex product (*this) * a.
    Complex operator*(const Complex &a) const
    {
        return Complex(r * a.r - i * a.i, i * a.r + r * a.i);
    }
    // Complex sum (*this) + a.
    Complex operator+(const Complex &a) const
    {
        return Complex(r + a.r, i + a.i);
    }
};
OF
TECHNO
LO

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Julia Function for a point (CPU) C++

// Julia-set membership test for pixel (x, y) of a DIM x DIM image.
// The pixel is centred on DIM/2 and scaled by 1.5 to obtain the starting
// value a, then a = a*a + c is iterated up to 200 times.
// Returns 0 once the squared magnitude exceeds 1000, otherwise 1.
int julia(int x, int y)
{
    const float scale = 1.5f;
    float jx = scale * (float) (DIM / 2 - x) / (DIM / 2);
    float jy = scale * (float) (DIM / 2 - y) / (DIM / 2);
    // constant C; 0.154 is the value used by the other Julia listings in this
    // document, so the CPU and GPU versions render the same set
    Complex c(-0.8f, 0.154f);
    Complex a(jx, jy);
    for (int i = 0; i < 200; i++) {
        a = a * a + c;
        if (a.magnitude2() > 1000)
            return 0; // diverged: (x, y) is not in the set
    }
    return 1; // (x, y) is in the set
}
TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Complex Numbers GPU (C++)

// Device-side complex number with single-precision parts. All members are
// __device__ functions, so objects are constructed and used in GPU code.
// The member functions do not modify the object and are const-qualified so
// they also work on const operands.
struct cuComplex
{
    float r; // real part
    float i; // imaginary part
    __device__ cuComplex(float a, float b) : r(a), i(b) {}
    // Squared magnitude r*r + i*i (no square root is taken).
    __device__ float magnitude2(void) const { return r * r + i * i; }
    // Complex product (*this) * a.
    __device__ cuComplex operator*(const cuComplex &a) const {
        return cuComplex(r * a.r - i * a.i, i * a.r + r * a.i);
    }
    // Complex sum (*this) + a.
    __device__ cuComplex operator+(const cuComplex &a) const {
        return cuComplex(r + a.r, i + a.i);
    }
};

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Julia Function GPU (C++)

// Device-side Julia-set membership test for pixel (x, y) of a DIM x DIM image.
// The pixel is centred on DIM/2 and scaled by 1.5 to obtain the starting
// value a, then a = a*a + c is iterated up to 200 times with c = -0.8 + 0.154i.
// Returns 0 once the squared magnitude exceeds 1000, otherwise 1.
// The __device__ qualifier is required for a function called from a kernel;
// the listing had lost the underscores.
__device__ int julia(int x, int y) {
    const float scale = 1.5f;
    float jx = scale * (float) (DIM / 2 - x) / (DIM / 2);
    float jy = scale * (float) (DIM / 2 - y) / (DIM / 2);

    cuComplex c(-0.8f, 0.154f);
    cuComplex a(jx, jy);

    for (int i = 0; i < 200; i++) {
        a = a * a + c;
        if (a.magnitude2() > 1000)
            return 0; // return 0 if (x, y) is not in set
    }
    return 1; // return 1 if (x, y) is in set
}
OF
TECHNO
LO

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Pre-requisite to run CUDA on your system

To use CUDA on your system, you will need the following installed:
I CUDA-capable GPU
I A supported version of Linux with a gcc compiler and toolchain
I NVIDIA CUDA Toolkit (available at
http://developer.nvidia.com/cuda-downloads)

Please follow the steps to install CUDA from here.

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
How to setup Google Colab
Google Colab can compile and execute CUDA code online.
I Open this link in Google Chrome.
I Open NEW PYTHON 3 NOTEBOOK.

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Cont.
I Click on Runtime -> Change runtime type.
I Select GPU from the drop down menu and click on Save.

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Cont.

I Check your NVCC version using this code in code cell:

!nvcc --version

I The output should be something like this:

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools, release 10.0, V10.0.130

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Cont.

If NOT then, install CUDA Version 9 following these commands in code cell.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
!apt-get install cuda-9.2

TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Cont.

I Execute the given command to install a small extension to run nvcc from
Notebook cells.
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

I Load the extension using this code:

%load_ext nvcc_plugin

I Go to Insert -> Code Cell

I Write %%cu in the first line
I Write the cuda program and execute
TECHNO
OF LO
TE

GY
ITU
IAN INST

KH
ARAGPUR
IND

19 5 1

yog, kms kOflm^

GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur

Essentials of SMT: Practical Know-How
From Everand
Essentials of SMT: Practical Know-How
Young Bong Kang
4.5/5 (6)
GPU_Programming_slides_2
No ratings yet
GPU_Programming_slides_2
37 pages
CUDA PPT Anurita Unit3
No ratings yet
CUDA PPT Anurita Unit3
42 pages
CUDA_part-1
No ratings yet
CUDA_part-1
52 pages
HPC Final 4-8
No ratings yet
HPC Final 4-8
25 pages
Cuda Review 1
No ratings yet
Cuda Review 1
13 pages
CUDAProgModel
No ratings yet
CUDAProgModel
24 pages
CUDA Programming Model
No ratings yet
CUDA Programming Model
14 pages
GPUMod 2
No ratings yet
GPUMod 2
64 pages
Lec 2 PDC
No ratings yet
Lec 2 PDC
31 pages
01 Cuda c Basics
No ratings yet
01 Cuda c Basics
32 pages
Lecture 11 Programming On Gpus Part 1 Zxu2acms60212 40212 S15lec 11 Gpupdf
No ratings yet
Lecture 11 Programming On Gpus Part 1 Zxu2acms60212 40212 S15lec 11 Gpupdf
121 pages
Gpu History and Cuda Programming Basics
No ratings yet
Gpu History and Cuda Programming Basics
44 pages
217 Lec2
No ratings yet
217 Lec2
24 pages
CUDA Compute Unified Device Architecture
No ratings yet
CUDA Compute Unified Device Architecture
26 pages
21.L18 Intro To GPU and CUDA C
No ratings yet
21.L18 Intro To GPU and CUDA C
89 pages
Intro To CUDA
No ratings yet
Intro To CUDA
76 pages
CUDA_part-1-LMS
No ratings yet
CUDA_part-1-LMS
51 pages
Programming Gpus With Cuda: John Mellor-Crummey
No ratings yet
Programming Gpus With Cuda: John Mellor-Crummey
42 pages
Lecture3 Fundamentals of CUDA(Part1)_2025
No ratings yet
Lecture3 Fundamentals of CUDA(Part1)_2025
52 pages
Hetero Lecture Slides 002 Lecture 1 Lecture-1-5-Cuda-API
No ratings yet
Hetero Lecture Slides 002 Lecture 1 Lecture-1-5-Cuda-API
11 pages
CUDA_1
No ratings yet
CUDA_1
45 pages
Lecture 2
No ratings yet
Lecture 2
77 pages
Lecture-12-GPU-Programming
No ratings yet
Lecture-12-GPU-Programming
65 pages
Cuda C/C++ Basics: NVIDIA Corporation
No ratings yet
Cuda C/C++ Basics: NVIDIA Corporation
67 pages
CUDA Programming Invert
No ratings yet
CUDA Programming Invert
36 pages
Threads
No ratings yet
Threads
54 pages
CUDA
No ratings yet
CUDA
33 pages
Laboratory Practice I (410246)
No ratings yet
Laboratory Practice I (410246)
28 pages
GPU Basics
No ratings yet
GPU Basics
93 pages
GPU Series III CUDA Compilation Host Side 1721302802
No ratings yet
GPU Series III CUDA Compilation Host Side 1721302802
8 pages
Lecture12 GPUArchCUDA02-CUDAMem
No ratings yet
Lecture12 GPUArchCUDA02-CUDAMem
67 pages
Lecture2 Cuda Basic 2010
No ratings yet
Lecture2 Cuda Basic 2010
44 pages
лк CUDA - 1 PDCn
No ratings yet
лк CUDA - 1 PDCn
31 pages
Aca Lab Manual Final
No ratings yet
Aca Lab Manual Final
28 pages
A Beginner'S Guide To Programming Gpus With Cuda: Mike Peardon
No ratings yet
A Beginner'S Guide To Programming Gpus With Cuda: Mike Peardon
21 pages
Unit 5 - CUDA Architecture
No ratings yet
Unit 5 - CUDA Architecture
17 pages
Introduction To CUDA C 3
No ratings yet
Introduction To CUDA C 3
67 pages
ECE 498AL The CUDA Programming Model
No ratings yet
ECE 498AL The CUDA Programming Model
37 pages
Group A Assignment 4 (A) : Two Large Vectors
No ratings yet
Group A Assignment 4 (A) : Two Large Vectors
5 pages
Introduction To CUDA C
No ratings yet
Introduction To CUDA C
67 pages
Introduction To CUDA: CAP 4730 Spring 2012
No ratings yet
Introduction To CUDA: CAP 4730 Spring 2012
35 pages
combinepdf
No ratings yet
combinepdf
28 pages
Introduccion CUDA C
No ratings yet
Introduccion CUDA C
51 pages
Gpu Cuda
No ratings yet
Gpu Cuda
204 pages
2023-CSC14120-Lecture01-CUDAIntroduction
No ratings yet
2023-CSC14120-Lecture01-CUDAIntroduction
32 pages
CUDA Tutorial
No ratings yet
CUDA Tutorial
50 pages
Introduction To Programming Massively Parallel Graphics Processors
No ratings yet
Introduction To Programming Massively Parallel Graphics Processors
84 pages
Endsem Imp Hpc Unit 5
No ratings yet
Endsem Imp Hpc Unit 5
24 pages
GPGPU Programming With CUDA: Leandro Avila - University of Northern Iowa
No ratings yet
GPGPU Programming With CUDA: Leandro Avila - University of Northern Iowa
29 pages
27th Aug - Introduction To GPGPU - Part 1
No ratings yet
27th Aug - Introduction To GPGPU - Part 1
32 pages
5. Moving to Parallel With CUDA - Hello Program
No ratings yet
5. Moving to Parallel With CUDA - Hello Program
14 pages
DS1822 - Parallel Computing-unit3
No ratings yet
DS1822 - Parallel Computing-unit3
17 pages
CUDA Introduction Mod
No ratings yet
CUDA Introduction Mod
50 pages
CUDA Programming Basic: High Performance Computing Center Hanoi University of Science & Technology
No ratings yet
CUDA Programming Basic: High Performance Computing Center Hanoi University of Science & Technology
38 pages
Cuda C
No ratings yet
Cuda C
70 pages
GPU Programming: Dr. Florian Ferreira
No ratings yet
GPU Programming: Dr. Florian Ferreira
101 pages
04 IntroductionGPUsCUDA
No ratings yet
04 IntroductionGPUsCUDA
25 pages
Gpu-Arc
No ratings yet
Gpu-Arc
37 pages
Mem-Coalesce
No ratings yet
Mem-Coalesce
69 pages
Module 3 Antenna Part
No ratings yet
Module 3 Antenna Part
35 pages
Reduction
No ratings yet
Reduction
91 pages
Multi - Dim
No ratings yet
Multi - Dim
29 pages
Wave Guides
No ratings yet
Wave Guides
29 pages
Module 1 and Module 2
No ratings yet
Module 1 and Module 2
56 pages
Double Stub and LC Matching Circuit
No ratings yet
Double Stub and LC Matching Circuit
31 pages
ConcurrencyDecomposition Parallel Algorithm
No ratings yet
ConcurrencyDecomposition Parallel Algorithm
40 pages
Comparing SAP Analytics Cloud and Microsoft Power BI - Datavard
No ratings yet
Comparing SAP Analytics Cloud and Microsoft Power BI - Datavard
13 pages
##Cisco Channel Partner Program 0112
No ratings yet
##Cisco Channel Partner Program 0112
12 pages
Software Mining Repository Practical
No ratings yet
Software Mining Repository Practical
28 pages
Cisco Catalyst 3850 Series and Cisco Catalyst 3650 Series Switches Best Practices Guide
No ratings yet
Cisco Catalyst 3850 Series and Cisco Catalyst 3650 Series Switches Best Practices Guide
120 pages
10987C - Performance Tuning and Optimising SQL Databases
No ratings yet
10987C - Performance Tuning and Optimising SQL Databases
4 pages
Widex Moment™ Ite/Itc M-Im: Standard Technology
No ratings yet
Widex Moment™ Ite/Itc M-Im: Standard Technology
2 pages
Uml Diagrams: Use Case Diagram
No ratings yet
Uml Diagrams: Use Case Diagram
9 pages
PHD Coursework Syllabus
No ratings yet
PHD Coursework Syllabus
69 pages
Handout (CS F301)
No ratings yet
Handout (CS F301)
5 pages
How Do LTSP Fat Clients Work
No ratings yet
How Do LTSP Fat Clients Work
3 pages
PW6K1ICE Intelligent Controller: Installation and Configuration Guide
No ratings yet
PW6K1ICE Intelligent Controller: Installation and Configuration Guide
42 pages
Task - Scheduling - Algorithm - in - Cloud Compu
No ratings yet
Task - Scheduling - Algorithm - in - Cloud Compu
20 pages
DDI0501F_cortex_a53_cryptography_trm
No ratings yet
DDI0501F_cortex_a53_cryptography_trm
19 pages
DJ Mixer: This Manual Is Applicable To The Following Model (S) and Type (S)
0% (1)
DJ Mixer: This Manual Is Applicable To The Following Model (S) and Type (S)
156 pages
AN-012-EN Reduce Acoustic Noise of IS31FL3236 EVB Rev.A
No ratings yet
AN-012-EN Reduce Acoustic Noise of IS31FL3236 EVB Rev.A
4 pages
CANoe ProductInformation EN
No ratings yet
CANoe ProductInformation EN
57 pages
Experimenting With A Stellex YIG Oscillator
No ratings yet
Experimenting With A Stellex YIG Oscillator
10 pages
Physics Project
No ratings yet
Physics Project
24 pages
Webpage Development Report
No ratings yet
Webpage Development Report
33 pages
Banking Domain Application Testing
No ratings yet
Banking Domain Application Testing
7 pages
Fanucprofibusdp 04
No ratings yet
Fanucprofibusdp 04
279 pages
A High Precision Optical Position Detector Based on Duo-lateral PSD
No ratings yet
A High Precision Optical Position Detector Based on Duo-lateral PSD
3 pages
Transformerless SoC-based Current Control Switchin PDF
No ratings yet
Transformerless SoC-based Current Control Switchin PDF
5 pages
EPROM Chip Replacement
No ratings yet
EPROM Chip Replacement
5 pages
Omneon SystemManager
No ratings yet
Omneon SystemManager
47 pages
Leica XPro 6.2 Manual
No ratings yet
Leica XPro 6.2 Manual
196 pages
AUP - Raman Ramsin
No ratings yet
AUP - Raman Ramsin
17 pages
PPT of Chapter 2
No ratings yet
PPT of Chapter 2
49 pages
Project Report Format
No ratings yet
Project Report Format
6 pages

Basic-Cuda

Uploaded by

Basic-Cuda

Uploaded by

GPU Architectures and Programming

Soumyajit Dey, Assistant Professor,

December 12, 2019

yog, kms kOflm^

yog, kms kOflm^

I CUDA C is an extension of C programming language with special constructs for

yog, kms kOflm^

I host code for a host device (CPU)

yog, kms kOflm^

yog, kms kOflm^

CPU serial code

yog, kms kOflm^

void vecAdd ( float * h_A , float * h_B ,

yog, kms kOflm^

cudaError_t err = cudaSuccess ; yog, kms kOflm^

exit ( EXIT_FAILURE ) ; yog, kms kOflm^

err = cudaMemcpy ( d_B , h_B , size , cudaMemcpyHostToDevice ) ;

yog, kms kOflm^

yog, kms kOflm^

} // End of Function yog, kms kOflm^

nvcc kernel . cu host . cu -o output

yog, kms kOflm^

Host Memory Device Memory

Figure: CPU/GPU Mem Layout

yog, kms kOflm^

cudaMalloc (( void **) & d_A , size ) ;

yog, kms kOflm^

// d_A cannot be dereferenced in host code

yog, kms kOflm^

I The call specifies a grid of threads to be launched

yog, kms kOflm^

i=blockIdx.x*blockDim.x+threadIdx.x i=blockIdx.x*blockDim.x+threadIdx.x ... i=blockIdx.x*blockDim.x+threadIdx.x

C[i]=A[i]+B[i] C[i]=A[i]+B[i] C[i]=A[i]+B[i]

... ... ...

yog, kms kOflm^

I gridDim - no. of blocks in the grid

yog, kms kOflm^

I The code is executed by all the threads in the grid

yog, kms kOflm^

i=blockIdx.x*blockDim.x+threadIdx.x i=blockIdx.x*blockDim.x+threadIdx.x ... i=blockIdx.x*blockDim.x+threadIdx.x

C[i]=A[i]+B[i] C[i]=A[i]+B[i] C[i]=A[i]+B[i]

... ... ...

yog, kms kOflm^

Table: CUDA Keywords for functions and their scope

yog, kms kOflm^

yog, kms kOflm^

yog, kms kOflm^

void MatrixMulKernel ( float * M , float * N , float * P , int N ) {

yog, kms kOflm^

yog, kms kOflm^

yog, kms kOflm^

Figure: Matrix Multiplication

yog, kms kOflm^

yog, kms kOflm^

float magnitude ( struct Complex a ) {

void add ( struct Complex a , struct Complex b , struct Complex * res ) {

void mul ( struct Complex a , struct Complex b , struct Complex * res ) {

yog, kms kOflm

Colour (xj,yj) red if it belongs to set

Figure: Coordinate Transformation yog, kms kOflm^

} yog, kms kOflm^

[0 − 255]. (α represents transparency).

yog, kms kOflm^

int main ( void )

yog, kms kOflm^

__device__ float magnitude ( struct cuComplex a ) {

__device__ void add ( struct cuComplex a , struct cuComplex b , struct cuComplex *

__device__ void mul ( struct cuComplex a , struct cuComplex b , struct cuComplex *

} yog, kms kOflm^

} yog, kms kOflm^

__global__ void kernel ( unsigned char * ptr ) {

int juliaValue = julia (x , y ) ;

yog, kms kOflm^

int main ( void ) {

cudaMalloc ( ( void **) & dev_bitmap , bitmap . image_size () ) ;

cudaMemcpy ( bitmap . get_ptr () , dev_bitmap , bitmap . image_size () ,

i=blockIdx.x*blockDim.x+threadIdx.x i=blockIdx.x*blockDim.x+threadIdx.x ... i=blockIdx.x*blockDim.x+threadIdx.x

i=blockIdx.x*blockDim.x+threadIdx.x i=blockIdx.x*blockDim.x+threadIdx.x ... i=blockIdx.x*blockDim.x+threadIdx.x

__device__ float magnitude ( struct cuComplex a ) {

__device__ void add ( struct cuComplex a , struct cuComplex b , struct cuComplex *

__device__ void mul ( struct cuComplex a , struct cuComplex b , struct cuComplex *

__global__ void kernel ( unsigned char * ptr ) {

__device__ int julia ( int x , int y ) {