Basic-Cuda
Basic-Cuda
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Course Organization
Topic Week Hours
Review of basic COA w.r.t. performance 1 2
Intro to GPU architectures 2 3
Intro to CUDA programming 3 2
Multi-dimensional data and synchronization 4 2
Warp Scheduling and Divergence 5 2
Memory Access Coalescing 6 2
Optimizing Reduction Kernels 7 3
Kernel Fusion, Thread and Block Coarsening 8 3
OpenCL - runtime system 9 3
OpenCL - heterogeneous computing 10 2
Efficient Neural Network Training/Inferencing 11-12 6
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Compute Unified Device Architecture
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
CUDA program structure
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
The compilation flow
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
The execution flow
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Examples : Vector addition CPU only
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Examples : Vector addition CPU-GPU
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda.h>
#include <cuda_runtime.h>
__global__ void vectorAdd ( float * , float * , float * , int ) ;
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
// CUDA kernel: each thread handles at most one element of C = A + B.
// Only the .x components of threadIdx, blockDim and blockIdx are read, so the
// element index is computed as for a one-dimensional launch.
__global__ void vectorAdd(float *A, float *B, float *C, int n)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads whose index is n or larger have no element to process.
    if (idx >= n) {
        return;
    }
    C[idx] = A[idx] + B[idx];
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
// Host program: takes the n-element host vectors h_A and h_B and the host
// result vector h_C. The body continues on the following slides.
void vecAdd(float *h_A, float *h_B, float *h_C, int n)
{
    // Size of one vector in bytes, kept as size_t so the product is not
    // narrowed back to int.
    size_t size = (size_t)n * sizeof(float);

    // Device pointers, NULL until allocated.
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;

    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Device Memory Allocation
// Allocate the device vector A. `err`, `d_A`, `d_B` and `size` come from the
// enclosing host function (NOTE(review): the declaration of `err` is not
// visible on the slides -- confirm it is a cudaError_t).
err = cudaMalloc((void **)&d_A, size);
if (err != cudaSuccess)
{
    fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}

// Allocate the device vector B.
err = cudaMalloc((void **)&d_B, size);
if (err != cudaSuccess)
{
    fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}
// Allocate the device vector C. A failed allocation is fatal, exactly as for
// the A and B allocations.
err = cudaMalloc((void **)&d_C, size);
if (err != cudaSuccess)
{
    fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Host to Device Data Transfer
// Copy both input vectors from host memory to the device buffers.
printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
    fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}

// The copy of B must be issued before its check; without it the check below
// would only re-test the status of the copy of A.
err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
    fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Kernel Launch
// Launch configuration: 256 threads per block and enough blocks to cover all
// n elements (ceiling division, so the last block may be partly idle).
int threadsPerBlock = 256;
int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;

// Arguments follow the order of the format string: blocks first, then threads.
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
       threadsPerBlock);
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, n);

// A kernel launched from the host has no return value, so the launch status
// is read back through the runtime API, whose functions do return a status.
err = cudaGetLastError();
if (err != cudaSuccess)
{
    fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}
OF
TECHNO
LO
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Device to Host Memory Transfer
// Copy the result vector from the device buffer back to host memory.
printf("Copy output data from the CUDA device to the host memory\n");
err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
{
    fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}

// The device buffers are not used again after the result has been copied.
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);

// Verify that the result vector is correct
for (int i = 0; i < n; ++i)
{
    // Compare against the host-side sum with an absolute tolerance of 1e-5.
    if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
    {
        fprintf(stderr, "Result verification failed at element %d!\n", i);
        exit(EXIT_FAILURE);
    }
}
printf("Test PASSED");
} // end of the host program vecAdd
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Compile and Run
./ output
[ Vector addition of 50000 elements ]
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 196 blocks of 256 threads
Copy output data from the CUDA device to the host memory
Test PASSED
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Observations
CPU GPU
• cuda.h → makes the CUDA API functions and CUDA system variables available
during compilation
OF
TECHNO
LO
GY
ITU
IAN INST
KH
ARAGPUR
I h_A, h_B, h_C → arrays mapped to main memory locations
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Observations
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Observations
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
CUDA kernel
A CUDA kernel, when invoked, launches multiple threads arranged in a two-level hierarchy
(a grid of blocks, each block containing threads); see the device function call below.
// ceil() needs a floating-point quotient: with an int n, n / 256 would be
// truncated before ceil() is applied and the last partial block would be lost.
vectorAdd<<<ceil(n / 256.0), 256>>>(d_A, d_B, d_C, n);
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Block 0 Block 1 Block N-1
0 1 2 3 252 253 254 255 0 1 2 3 252 253 254 255 0 1 2 3 252 253 254 255
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Kernel specific system vars
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
// One thread per output element. The element index is built from the built-in
// variables blockDim, blockIdx and threadIdx (x components only).
__global__ void vectorAdd(float *A, float *B, float *C, int n)
{
    int blockStart = blockDim.x * blockIdx.x;  // index of this block's first element
    int i = blockStart + threadIdx.x;

    // Only threads that map to an existing element write a result.
    if (i < n) {
        C[i] = A[i] + B[i];
    }
}
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Block 0 Block 1 Block N-1
0 1 2 3 252 253 254 255 0 1 2 3 252 253 254 255 0 1 2 3 252 253 254 255
1 256 253
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Function declaration Keywords
__global__
void vectorAdd ( float * A , float * B , float * C , int n )
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
CUDA functions
I Every function is a default __host__ function (if not having any CUDA
keywords)
I A function can be declared as both __host__ and __device__ function
I "__host__ __device__ fn()"
• The compiler generates two versions of such a function: one that can be called from
host fn()s and another that can be called from device fn()s
I __global__ functions can also be called from the device using CUDA kernel
semantics (<<< ... >>>) if you are using dynamic parallelism - that requires
CUDA 5.0 and compute capability 3.5 or higher.
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
CUDA functions : more observations
I __device__ functions can have a return type other than void but __global__
functions must always return void
• __global__ functions can be called from within other kernels running on the GPU
to launch additional GPU threads (as part of the CUDA dynamic parallelism model),
while __device__ functions run on the same thread as the calling kernel.
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Matrix Multiplication (CPU only)
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Matrix Multiplication Host Program
// Host program for the 16 x 16 matrix multiplication example: copies the two
// input matrices to the device, launches MatrixMulKernel and copies the
// product back. Returns 0 on success and 1 if any CUDA call reports an error.
//
// NOTE(review): the host matrices M, N, P and the device buffers d_M, d_N, d_P
// are neither declared nor allocated in this snippet; they are assumed to be
// set up elsewhere -- TODO confirm.
int main()
{
    // Number of rows and columns. Named `width` so it does not clash with the
    // host matrix N that is copied below.
    const int width = 16;
    const int size = width * width;               // elements per matrix
    const size_t bytes = size * sizeof(float);    // bytes per matrix

    if (cudaMemcpy(d_M, M, bytes, cudaMemcpyHostToDevice) != cudaSuccess)
        return 1;
    if (cudaMemcpy(d_N, N, bytes, cudaMemcpyHostToDevice) != cudaSuccess)
        return 1;

    // 8 x 8 threads per block; the grid is sized by ceiling division so that
    // it covers the whole matrix (2 x 2 blocks for width 16).
    dim3 block(8, 8, 1);
    dim3 grid((width + block.x - 1) / block.x,
              (width + block.y - 1) / block.y,
              1);

    MatrixMulKernel<<<grid, block>>>(d_M, d_N, d_P, width);
    // The launch itself returns nothing; its status is read back explicitly.
    if (cudaGetLastError() != cudaSuccess)
        return 1;

    if (cudaMemcpy(P, d_P, bytes, cudaMemcpyDeviceToHost) != cudaSuccess)
        return 1;

    return 0;
}
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Matrix Multiplication Kernel
// Matrix multiplication kernel: each thread computes one element of
// d_P = d_M * d_N. All three matrices are indexed as N x N arrays stored row by
// row (element (i, j) lives at index i * N + j).
//
// The row index comes from the y components and the column index from the
// x components of the built-in index variables, so the kernel expects a
// two-dimensional launch; threads outside the N x N range do nothing.
__global__ void MatrixMulKernel(float *d_M, float *d_N, float *d_P, int N)
{
    int i = blockIdx.y * blockDim.y + threadIdx.y;  // row of the output element
    int j = blockIdx.x * blockDim.x + threadIdx.x;  // column of the output element

    if ((i < N) && (j < N)) {
        // float literal keeps the accumulation in single precision
        float Pvalue = 0.0f;
        for (int k = 0; k < N; ++k) {
            // row i of d_M times column j of d_N
            Pvalue += d_M[i * N + k] * d_N[k * N + j];
        }
        d_P[i * N + j] = Pvalue;
    }
}
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
X =
$$ d\_P[i \cdot N + j] = \sum_{k=0}^{N-1} d\_M[i \cdot N + k] \cdot d\_N[k \cdot N + j] $$
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Slightly Advanced Example: Julia Sets
A Julia set J (named after the French mathematician Gaston Julia, who worked on
complex dynamics during the early 20th century) is a set of points
contained in the boundary of a certain class of functions over complex numbers.
• Given a set of points in a complex plane, the set J is constructed by evaluating for
each point a simple iterative equation, $Z_{n+1} = Z_n^2 + C$, where $Z_n$ represents
a complex number and $C$ represents a complex constant.
I A point does not belong to J , if iterative application of the equation yields a
diverging sequence of numbers for that point.
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Complex Numbers (CPU)
/* Complex number for the CPU version of the Julia set example. */
struct Complex {
    float r, i;  /* real part and imaginary part */
};
res - > i = ( a . r * b . i ) + ( a . i * b . r ) ;
GY
ITU
IAN INST
KH
ARAGPUR
IND
}
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
From Pixel Grid to Complex Plane
x
b Translate and Scale
y
(aj, bj ) $a_j = \mathrm{scale} \cdot (x_j - \mathrm{DIM}/2) / (\mathrm{DIM}/2)$
$b_j = \mathrm{scale} \cdot (y_j - \mathrm{DIM}/2) / (\mathrm{DIM}/2)$
(xj, yj )
Colour (xj,yj) depending on
DIM membership of aj + ibj in Julia Set
a
DIM TE
OF
TECHNO
LO
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Julia Function for a point (CPU)
// Tests whether the pixel (x, y) maps to a point of the Julia set.
//
// The pixel coordinates are translated by DIM/2 and scaled to obtain the
// complex starting value a = jx + i*jy. The iteration a = a*a + c is then
// applied up to 200 times with the constant c = -0.8 + 0.154i.
//
// Returns 0 as soon as magnitude() of an iterate exceeds 1000 (the point is
// treated as not in the set), and 1 if that never happens within 200 steps.
//
// Relies on DIM and on the helpers mul(), add() and magnitude(), which are
// defined outside this block.
int julia(int x, int y) {
    const float scale = 1.5;
    float jx = scale * (float)(DIM / 2 - x) / (DIM / 2);
    float jy = scale * (float)(DIM / 2 - y) / (DIM / 2);

    struct Complex c, a, r1, r2;
    c.r = -0.8; c.i = 0.154;   // constant C of the iteration
    a.r = jx; a.i = jy;        // starting value for this pixel

    for (int i = 0; i < 200; i++) {
        // a = a * a + c, computed via the helpers: r1 = a * a, r2 = r1 + c
        mul(a, a, &r1);
        add(r1, c, &r2);
        if (magnitude(r2) > 1000)
            return 0;  // return 0 if it is not in set
        a.r = r2.r;
        a.i = r2.i;
    }
    return 1;  // return 1 if point is in set
}
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Driver Code(CPU)
// CPU driver: visits every pixel of the DIM x DIM image row by row and writes
// four bytes per pixel into ptr. Byte 0 is 255 when julia() reports the pixel
// as a member of the set and 0 otherwise; bytes 1 and 2 are 0; byte 3 is 255.
// (Presumably the bytes are the R, G, B and alpha channels -- confirm against
// the bitmap code.)
void kernel(unsigned char *ptr)
{
    for (int row = 0; row < DIM; ++row) {
        for (int col = 0; col < DIM; ++col) {
            // First byte of the pixel at (col, row)
            unsigned char *pixel = ptr + (col + row * DIM) * 4;
            const int inSet = julia(col, row);

            pixel[0] = 255 * inSet;
            pixel[1] = 0;
            pixel[2] = 0;
            pixel[3] = 255;
        }
    }
}
A 32-bit-per-pixel color bitmap represents a 2D grid of pixel values where each pixel is
represented by 4 channels (R,G,B,α) and where each channel has values in the range
0 to 255 (8 bits per channel).
GY
ITU
IAN INST
KH
ARAGPUR
IND
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Driver Code(CPU)
We leave out intricate details of how bitmap data is constructed. The primary focus of
discussing this application lies in depicting the underlying computation involved in
constructing Julia sets.
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Complex Numbers GPU
/* Complex number for the GPU version of the Julia set example. */
struct cuComplex {
    float r, i;  /* real part and imaginary part */
};
GY
ITU
IAN INST
KH
ARAGPUR
res - > i = ( a . r * b . i ) + ( a . i * b . r ) ;
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Julia Function GPU
// Device function: tests whether the pixel (x, y) maps to a point of the
// Julia set.
//
// The pixel coordinates are translated by DIM/2 and scaled to obtain the
// complex starting value a = jx + i*jy. The iteration a = a*a + c is then
// applied up to 200 times with the constant c = -0.8 + 0.154i.
//
// Returns 0 as soon as magnitude() of an iterate exceeds 1000 (the point is
// treated as not in the set), and 1 if that never happens within 200 steps.
//
// Relies on DIM and on the helpers mul(), add() and magnitude(), which are
// defined outside this block (NOTE(review): they must be callable from device
// code -- confirm they carry __device__).
__device__ int julia(int x, int y) {
    const float scale = 1.5;
    float jx = scale * (float)(DIM / 2 - x) / (DIM / 2);
    float jy = scale * (float)(DIM / 2 - y) / (DIM / 2);

    struct cuComplex c, a, r1, r2;
    c.r = -0.8; c.i = 0.154;   // constant C of the iteration
    a.r = jx; a.i = jy;        // starting value for this pixel

    for (int i = 0; i < 200; i++) {
        // a = a * a + c, computed via the helpers: r1 = a * a, r2 = r1 + c
        mul(a, a, &r1);
        add(r1, c, &r2);
        if (magnitude(r2) > 1000)
            return 0;  // return 0 if it is not in set
        a.r = r2.r;
        a.i = r2.i;
    }
    return 1;  // return 1 if point is in set
}
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
CUDA Kernel GPU
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Host Program
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Julia Fractal Pattern
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
// Complex number (C++ version for the CPU) with the arithmetic needed by the
// Julia iteration a = a * a + c.
struct Complex
{
    float r;  // real part
    float i;  // imaginary part

    // Builds the number a + b*i.
    Complex(float a, float b) : r(a), i(b)
    {}

    // Squared magnitude r*r + i*i. Const so it can be called on const objects.
    float magnitude2(void) const { return r * r + i * i; }

    // Complex product (*this) * a.
    Complex operator*(const Complex &a) const
    {
        return Complex(r * a.r - i * a.i, i * a.r + r * a.i);
    }

    // Complex sum (*this) + a.
    Complex operator+(const Complex &a) const
    {
        return Complex(r + a.r, i + a.i);
    }
};
OF
TECHNO
LO
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Julia Function for a point (CPU) C++
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Complex Numbers GPU (C++)
// Complex number (C++ version) whose constructor and operations are all
// declared __device__, providing the arithmetic used by the Julia iteration.
struct cuComplex
{
    float r;  // real part
    float i;  // imaginary part

    // Builds the number a + b*i.
    __device__ cuComplex(float a, float b)
    {
        r = a;
        i = b;
    }

    // Squared magnitude r*r + i*i.
    __device__ float magnitude2(void)
    {
        return r * r + i * i;
    }

    // Complex product (*this) * a.
    __device__ cuComplex operator*(const cuComplex &a)
    {
        return cuComplex(r * a.r - i * a.i,
                         i * a.r + r * a.i);
    }

    // Complex sum (*this) + a.
    __device__ cuComplex operator+(const cuComplex &a)
    {
        return cuComplex(r + a.r,
                         i + a.i);
    }
};
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Julia Function GPU (C++)
int i = 0;
for ( i =0; i <200; i ++) {
a = a*a + c;
if ( a . magnitude2 () > 1000)
return 0; // return 0 if (x , y ) is not in set
}
return 1; // return 1 if (x , y ) is in set
} TE
OF
TECHNO
LO
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Pre-requisite to run CUDA on your system
To use CUDA on your system, you will need the following installed:
I CUDA-capable GPU
I A supported version of Linux with a gcc compiler and toolchain
• NVIDIA CUDA Toolkit (available at
https://developer.nvidia.com/cuda-downloads)
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
How to setup Google Colab
Google Colab can compile and execute CUDA code online.
I Open this link in Google Chrome.
I Open NEW PYTHON 3 NOTEBOOK.
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Cont.
I Click on Runtime -> Change runtime type.
I Select GPU from the drop down menu and click on Save.
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Cont.
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Cont.
If not, install CUDA version 9.2 by running the following commands in a code cell.
# Download the CUDA 9.2 local repository package, register it and its key with
# apt, then install the toolkit.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
!apt-get install cuda-9.2
TECHNO
OF LO
TE
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur
Cont.
I Execute the given command to install a small extension to run nvcc from
Notebook cells.
# Install the nvcc notebook extension over HTTPS (GitHub no longer serves the
# unauthenticated git:// transport).
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
GY
ITU
IAN INST
KH
ARAGPUR
IND
19 5 1
GPU Architectures and Programming Soumyajit Dey, Assistant Professor, CSE, IIT Kharagpur