Introduction
In this blog post, we describe how to implement vector addition in CUDA.
Code Example
#include <cstdio>
#include <cstdlib>

// Abort with a diagnostic if a CUDA runtime call fails. Kernel launches do
// not return an error directly, so we also call cudaGetLastError() after
// the launch below.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Element-wise vector addition: out[i] = in1[i] + in2[i] for i in [0, len).
// Launch with a 1D grid of 1D blocks; each thread handles one element.
// The bounds check guards the tail block when len is not a multiple of
// the block size.
__global__ void vecAdd(const float *in1, const float *in2, float *out, int len) {
    //@@ code vector addition here
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < len) {
        out[i] = in1[i] + in2[i];
    }
}

// Host driver: allocates device buffers, copies the inputs over, launches
// vecAdd, copies the result back, and prints it.
int main(int argc, char **argv) {
    float input1[] = {1, 2, 3};
    float input2[] = {2, 3, 4};
    int inputLength = 3;

    // Arrays decay to float*; the original `&input1` had type float(*)[3],
    // which is not convertible to float*.
    float *hostInput1 = input1;
    float *hostInput2 = input2;

    float *hostOutput = (float *)malloc(inputLength * sizeof(float));
    if (hostOutput == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    size_t bytes = inputLength * sizeof(float);

    //@@ Allocate GPU memory here
    float *deviceInput1;
    float *deviceInput2;
    float *deviceOutput;
    CUDA_CHECK(cudaMalloc((void **)&deviceInput1, bytes));
    CUDA_CHECK(cudaMalloc((void **)&deviceInput2, bytes));
    CUDA_CHECK(cudaMalloc((void **)&deviceOutput, bytes));

    CUDA_CHECK(cudaMemcpy(deviceInput1, hostInput1, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(deviceInput2, hostInput2, bytes, cudaMemcpyHostToDevice));

    //@@ Initialize the grid and block dimensions here
    // Ceil-divide so every element is covered even when inputLength is not
    // a multiple of the block size.
    dim3 block(256, 1, 1);
    dim3 grid((inputLength + block.x - 1) / block.x, 1, 1);

    //@@ Launch the GPU Kernel here
    vecAdd<<<grid, block>>>(deviceInput1, deviceInput2, deviceOutput, inputLength);
    CUDA_CHECK(cudaGetLastError());  // catches bad launch configuration
    // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is the
    // modern equivalent and also surfaces asynchronous kernel errors.
    CUDA_CHECK(cudaDeviceSynchronize());

    //@@ Copy the GPU memory back to the CPU here
    CUDA_CHECK(cudaMemcpy(hostOutput, deviceOutput, bytes, cudaMemcpyDeviceToHost));

    //@@ Free the GPU memory here
    CUDA_CHECK(cudaFree(deviceInput1));
    CUDA_CHECK(cudaFree(deviceInput2));
    CUDA_CHECK(cudaFree(deviceOutput));

    // The original loop was `for (int = 0; ...)` (missing the variable name)
    // and printed `*hostOutput[i]`, which dereferences a float value.
    for (int i = 0; i < inputLength; i++) {
        printf("%f ", hostOutput[i]);
    }
    printf("\n");

    // input1/input2 are stack arrays: freeing them would be undefined
    // behavior, so only the malloc'd output buffer is released.
    free(hostOutput);
    return 0;
}