0% found this document useful (0 votes)
18 views3 pages

Vector Addition

Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, or read online on Scribd
0% found this document useful (0 votes)
18 views3 pages

Vector Addition

Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, or read online on Scribd
You are on page 1/ 3

#include <iostream>

#include <cuda_runtime.h>
#include <conio.h>
using namespace std;

// Element-wise vector addition: C[i] = A[i] + B[i].
// Expects a 1D grid of 1D blocks; each thread handles one element.
// The bounds guard makes any launch configuration safe when the grid
// over-covers n (the usual ceil-division launch).
__global__ void addVectors(int* A, int* B, int* C, int n)
{
const int idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx >= n)
{
return;
}
C[idx] = A[idx] + B[idx];
}

// Interactive driver: reads two int vectors, adds them on the GPU with a
// user-chosen block size, reports kernel time via CUDA events, prints the
// result. Returns 0 on success, 1 on invalid input; aborts on CUDA errors.
int main()
{
// Read vector length; reject non-positive or non-numeric sizes up front
// (the original divided by threadsPerBlock and looped over n unchecked).
int n;
cout << "Enter the size of vectors: ";
if (!(cin >> n) || n <= 0)
{
cout << "Invalid vector size." << endl;
return 1;
}

int* A, * B, * C;
// size_t avoids int overflow in n * sizeof(int) for large n.
size_t size = (size_t)n * sizeof(int);

// Every CUDA API call returns a status; abort with a readable message on
// failure instead of silently continuing with bad pointers.
auto check = [](cudaError_t err, const char* what)
{
if (err != cudaSuccess)
{
cout << "CUDA error (" << what << "): " << cudaGetErrorString(err) << endl;
exit(1);
}
};

// Allocate pinned host memory (faster host<->device transfers).
check(cudaMallocHost(&A, size), "cudaMallocHost A");
check(cudaMallocHost(&B, size), "cudaMallocHost B");
check(cudaMallocHost(&C, size), "cudaMallocHost C");

// Initialize vector A from user input
cout << "Enter values for vector A:" << endl;
for (int i = 0; i < n; i++)
{
cout << "A[" << i << "]: ";
cin >> A[i];
}

// Initialize vector B from user input
cout << "Enter values for vector B:" << endl;
for (int i = 0; i < n; i++)
{
cout << "B[" << i << "]: ";
cin >> B[i];
}

// Launch configuration: user-chosen block size, ceil-division grid size.
int threadsPerBlock;
cout << "Enter the number of threads per block: ";
if (!(cin >> threadsPerBlock) || threadsPerBlock <= 0)
{
cout << "Invalid number of threads per block." << endl;
return 1;
}
int numBlocks = (n + threadsPerBlock - 1) / threadsPerBlock;

// Allocate memory on the device
int* dev_A, * dev_B, * dev_C;
check(cudaMalloc(&dev_A, size), "cudaMalloc dev_A");
check(cudaMalloc(&dev_B, size), "cudaMalloc dev_B");
check(cudaMalloc(&dev_C, size), "cudaMalloc dev_C");

// Copy inputs host -> device
check(cudaMemcpy(dev_A, A, size, cudaMemcpyHostToDevice), "copy A");
check(cudaMemcpy(dev_B, B, size, cudaMemcpyHostToDevice), "copy B");

// CUDA events bracket the kernel for GPU-side timing.
cudaEvent_t start, stop;
check(cudaEventCreate(&start), "create start event");
check(cudaEventCreate(&stop), "create stop event");

check(cudaEventRecord(start), "record start");

// Launch the kernel
addVectors<<<numBlocks, threadsPerBlock>>>(dev_A, dev_B, dev_C, n);
// Kernel launches do not return a status directly; query it explicitly
// to catch bad launch configurations (e.g. threadsPerBlock > device max).
check(cudaGetLastError(), "kernel launch");

check(cudaEventRecord(stop), "record stop");

// cudaEventElapsedTime requires both events to have completed; waiting on
// the stop event is the precise (and sufficient) synchronization here.
check(cudaEventSynchronize(stop), "sync stop event");

// Calculate the elapsed time
float milliseconds = 0;
check(cudaEventElapsedTime(&milliseconds, start, stop), "elapsed time");
cout << "Execution Time: " << milliseconds << " ms" << endl;

// Copy result device -> host (blocking cudaMemcpy also synchronizes)
check(cudaMemcpy(C, dev_C, size, cudaMemcpyDeviceToHost), "copy C");

// Print the results
cout << "Resultant vector C:" << endl;
for (int i = 0; i < n; i++)
{
cout << C[i] << " ";
}
cout << endl;

// Free device memory, pinned host memory, and the timing events
cudaFree(dev_A);
cudaFree(dev_B);
cudaFree(dev_C);
cudaFreeHost(A);
cudaFreeHost(B);
cudaFreeHost(C);
cudaEventDestroy(start);
cudaEventDestroy(stop);

// Portable replacement for the non-standard getch(): wait for Enter.
cout << "Press Enter to exit...";
cin.ignore(1024, '\n');
cin.get();
return 0;
}

output:

Enter the size of vectors: 5


Enter values for vector A:
A[0]: 4
A[1]: 6
A[2]: 8
A[3]: 2
A[4]: 3
Enter values for vector B:
B[0]: 8
B[1]: 4
B[2]: 2
B[3]: 6
B[4]: 4
Enter the number of threads per block: 3
Execution Time: 0.006112 ms
Resultant vector C:
12 10 10 8 7

You might also like