Vector Addition in CUDA


Introduction

In this blog post, I describe how to perform vector addition in CUDA.

Code Example


//@@ Element-wise vector addition kernel: out[i] = in1[i] + in2[i].
//@@
//@@ in1, in2 : device pointers to the input vectors (only read here)
//@@ out      : device pointer to the result vector (written here)
//@@ len      : number of elements to process
//@@
//@@ Expects a 1D launch in which each thread handles at most one element,
//@@ so the launch must supply at least `len` threads in total.
__global__ void vecAdd( const float * in1, const float * in2, float * out, int len )
{
	//@@ Flat global index of this thread within the 1D grid
	int i = threadIdx.x + blockIdx.x * blockDim.x;

	//@@ Threads whose index falls past the end of the data do nothing
	if( i < len )
	{
		out[i] = in1[i] + in2[i];
	}
}

//@@ Print a readable message and terminate when a CUDA runtime call fails.
//@@ `what` names the step being checked so the message identifies the failing call.
static void checkCuda( cudaError_t status, const char * what )
{
	if( status != cudaSuccess )
	{
		fprintf( stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString( status ) );
		exit( EXIT_FAILURE );
	}
}

//@@ Adds two small host vectors on the GPU and prints the result.
//@@ Steps: allocate device buffers, copy the inputs to the device, launch
//@@ vecAdd, copy the result back, release the device buffers, print.
//@@ Returns 0 on success; exits with EXIT_FAILURE if any step fails.
int main( int argc, char ** argv )
{
	(void) argc;	//@@ command-line arguments are not used
	(void) argv;

	float input1[] = { 1, 2, 3 };
	float input2[] = { 2, 3, 4 };

	//@@ Derive the element count from the array so the two cannot drift apart
	int inputLength = (int) ( sizeof( input1 ) / sizeof( input1[0] ) );
	size_t bytes = (size_t) inputLength * sizeof( float );

	//@@ An array name already converts to a pointer to its first element
	float * hostInput1 = input1;
	float * hostInput2 = input2;
	float * hostOutput = NULL;
	float * deviceInput1 = NULL;
	float * deviceInput2 = NULL;
	float * deviceOutput = NULL;

	hostOutput = (float *) malloc( bytes );
	if( hostOutput == NULL )
	{
		fprintf( stderr, "Failed to allocate %lu bytes of host memory\n", (unsigned long) bytes );
		return EXIT_FAILURE;
	}

	//@@ Allocate GPU memory here
	checkCuda( cudaMalloc( (void**) &deviceInput1, bytes ), "cudaMalloc(deviceInput1)" );
	checkCuda( cudaMalloc( (void**) &deviceInput2, bytes ), "cudaMalloc(deviceInput2)" );
	checkCuda( cudaMalloc( (void**) &deviceOutput, bytes ), "cudaMalloc(deviceOutput)" );

	//@@ Copy the inputs to the GPU here
	checkCuda( cudaMemcpy( deviceInput1, hostInput1, bytes, cudaMemcpyHostToDevice ), "copy of input1 to device" );
	checkCuda( cudaMemcpy( deviceInput2, hostInput2, bytes, cudaMemcpyHostToDevice ), "copy of input2 to device" );

	//@@ Initialize the grid and block dimensions here
	//@@ Enough blocks of threadsPerBlock threads to cover every element
	const int threadsPerBlock = 256;
	dim3 grid ( ( inputLength - 1 ) / threadsPerBlock + 1, 1, 1 );
	dim3 block ( threadsPerBlock, 1, 1 );

	//@@ Launch the GPU Kernel here
	vecAdd <<< grid, block >>> ( deviceInput1, deviceInput2, deviceOutput, inputLength );
	//@@ A launch returns no status itself: query the launch error explicitly,
	//@@ then synchronize so that errors raised while the kernel runs are reported
	checkCuda( cudaGetLastError(), "vecAdd launch" );
	checkCuda( cudaDeviceSynchronize(), "vecAdd execution" );

	//@@ Copy the GPU memory back to the CPU here
	checkCuda( cudaMemcpy( hostOutput, deviceOutput, bytes, cudaMemcpyDeviceToHost ), "copy of result to host" );

	//@@ Free the GPU memory here
	checkCuda( cudaFree( deviceInput1 ), "cudaFree(deviceInput1)" );
	checkCuda( cudaFree( deviceInput2 ), "cudaFree(deviceInput2)" );
	checkCuda( cudaFree( deviceOutput ), "cudaFree(deviceOutput)" );

	for( int i = 0; i < inputLength; i++ )
	{
		printf( "%f ", hostOutput[i] );
	}

	//@@ Only hostOutput came from malloc; the inputs are stack arrays and
	//@@ must not be passed to free()
	free( hostOutput );

	return 0;
}