cuda examples
DESCRIPTION
Example programs for CUDA C
TRANSCRIPT
-
1 Examples of CUDA code
1) The dot product 2) Matrix-vector multiplication 3) Sparse matrix multiplication 4) Global reduction
Computing y = ax + y with a Serial Loop: void saxpy_serial(int n, float alpha, float *x, float *y) { for (int i = 0; i […]
-
2 __global__ void mm_simple(float *C, float *A, float *B, int n) { int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x;
float sum = 0.0f; for (int k = 0; k […]
-
3 void csrmul_serial(int *Ap, int *Aj, float *Av, int num_rows, float *x, float *y)
{ for (int row = 0; row […]
-
4 Caching in shared memory
=
Block_begin
Block_end
A thread / Cache in shared memory
Expect most of the nonzero elements here (around the diagonal)
the row executed by a thread
__global__ void csrmul_cached(int *Ap, int *Aj, float *Av, int num_rows, const float *x, float *y) {
__shared__ float cache[blocksize];
// Cache the rows of x[] corresponding to this block.
int block_begin = blockIdx.x * blockDim.x;
int block_end = block_begin + blockDim.x;
int row = block_begin + threadIdx.x;
// Fetch and cache our window of x[].
if (row […]
-
5 __global__ void plus_reduce(int *input, int N, int *total) {
int tid = threadIdx.x;
int i = blockIdx.x * blockDim.x + threadIdx.x;
// Each block loads its elements into shared memory
__shared__ int x[blocksize];
x[tid] = (i […] 0; s = s/2) { if (tid […]