[d1]deview 2012 nvidia

2.1�� OpenACC�� example�� :�� Pi��

2.2�� CUDA�� Example�� :�� MatrixMul��

�� step1�� :�� CPU�� code��

�� step2�� :�� CPU�� launcher��

�� step3�� :�� GPU�� launcher��

�� step4�� :�� Memory�� Map��

�� step5�� :�� GPU�� kernel��

�� step6�� :�� Optimization��

1.�� CUDA�� Architecture��

MultiCore�� Era��

multiCore�� singleCore��

singleCore Era: 매년 CPU 업그레이드만으로 S/W 성능 향상. multiCore Era: 이제 CPU를 바꿔도 성능 향상이 전혀 없음. 일반 앱 개발자도 성능 향상을 위해서는 병렬 프로그래밍 고려.

CPU (multiCore) vs GPU (manyCore)

DDR3�� 메모리�� I/O��

L3�� Cache��

Core��

System�� I/O��

Core�� Core�� Core��

내장�� GPU��

Intel�� “Sandy�� Bridge”�� CPU�� NVIDIA�� “Kepler�� GK104”�� GPU�� NVENC��

PCI-E�� I/O��

GDDR5�� 메모리�� I/O��



GPC�� SMX��

I/O��

I/O��

Sch��

Raster��

SMX��

Core��

FP Perf. per chip CPU Sandy Bridge : 32 FP = 4 Core (w/ 2 HT, 4 AVX) GPU GK104 : 1536 FP = 1536 Core

�� GK110�� :�� 3072�� FP�� =�� 3072�� Core��

CUDA�� enable�� GPUs��

Tesla�� K10/20�� (for�� server)��

Geforce�� (for�� gaming�� PC)��

Quadro�� K5000�� (for�� workstation)��

Tegra (for mobile : not yet, soon)

you�� can�� find�� CUDA�� enabled�� GPUs�� everywhere�� !!!��

GPGPU�� Programming�� Model�� History��

1980s�� 1990s�� 2000s�� 2010s��

1985�� :�� Amiga�� -�� (2D)�� graphics�� accelerator��

1990�� :�� SGI,�� IRIS�� GL�� for�� Onyx�� system�� 1992�� :�� OpenGL:�� OpenGL�� ARB�� founded�� 1996�� :�� 3dfx,�� Voodoo�� -�� 3D�� accelerator�� ,�� acquired�� by�� NVIDIA(2000)�� 1996�� :�� Microsoft,�� DirectX�� accelerate�� 3D�� 1997�� :�� OpenGL,�� OpenGL�� 1.1��

2002�� :�� NVIDIA,�� first�� GPUs�� with�� programmable�� vertex,�� fragment�� shaders��

2002�� :�� Mark�� Harris,�� GPGPU��

2003�� :�� Lib�� Sh��

2003�� :�� Ian�� Buck,�� BrookGPU�� language��

2006�� :�� NVIDIA,�� G80�� -�� Unified�� shader�� architecture:�� fully�� programmable�� units��

2006�� :�� NVIDIA,�� CUDA��

2006�� :�� PeakStream,�� PeakStream�� support�� GPU,�� acquired�� by�� Google(2007)��

2007�� :�� AMD,�� Stream�� SDK(Close-to-Metal,�� Brook++)��

2008�� :�� RapidMind,�� sh-like�� language,�� acquired�� by�� Intel(2009)��

2008�� :�� Khronos,�� Apple,�� OpenCL��

2009�� :�� Microsoft,�� DirectCompute��

2010�� :�� NVIDIA,�� Parallel�� NSight�� Debugger�� for�� Visual�� Studio�� 1.0��

2010�� :�� PGI,�� PGI�� Accelerator��

2011�� :�� Microsoft,�� C++�� AMP��

2012�� :�� NVIDIA,�� NSight�� for�� Eclipse�� Edition��

2012�� :�� OpenACC,�� PGI,�� CAPS,�� CRAY,�� NVIDIA�� support�� OpenACC��

CUDA�� Parallel�� Model��

CPU��

CPU��

OpenMP�� Parallel��

core02�� core04��

CPU��

GPU�� Core01�� Core3000��

CUDA�� Parallel��

CUDA�� Work�� Model��

*ptr_cpu�� *ptr_gpu��


upload�� input�� data��

Step3.�� download�� result�� data��

FOR�� LOOP��

Step1.��

Step2.�� launch�� GPU�� Kernel��

CUDA�� Acceleration�� Approach��

5.�� CUDA�� development��

1.�� CUDA�� ISV�� Apps��

2.�� CUDA�� Libraries��

3.�� OpenACC�� Directives��

Ansys�� Mech,�� Fluent,��

4.�� CUDA�� tools�� Matlab,�� Mathematica,�� Octave��

cuFFT,�� cuBLAS,�� CULA��

PGI,�� CAPS,�� Cray�� Compiler��

C/C++,�� Fortran,�� python��

CUDA�� ISV�� Apps��

http://www.nvidia.com/object/gpu-applications.html��

ANSYS�� Mech.�� 14�� Adobe�� CS6��

Library�� :�� cuFFT/cuBLAS��

•  cuFFT : CUDA version FFT –  신호처리, 영상처리 분야 활용 –  FFTW와 함수구조 동일

•  cuBLAS : CUDA version BLAS –  수치해석 분야 활용 –  MKL과 함수구조 동일

•  cuSpMV : CUDA version SpMV –  수치해석 분야 활용

Library�� :�� SAXPY�� with�� cuBLAS��

cublasInit();�� cublasSetVector(N,�� sizeof(x[0]),�� x,�� 1,�� d_x,�� 1);�� cublasSetVector(N,�� sizeof(y[0]),�� y,�� 1,�� d_y,�� 1);�� cublasSaxpy�� (N,�� 2.0,�� d_x,�� 1,�� d_y,�� 1);�� cublasGetVector(N,�� sizeof(y[0]),�� d_y,�� 1,�� y,�� 1);��

*cpu_ptr�� *gpu_ptr��

OpenACC�� :�� website��

http://www.openacc.org��

PGI�� Compiler�� CAPS�� HMPP�� Compiler�� CRAY�� Compiler��

pgcc�� –acc�� saxpy.c�� hmpp�� gcc�� saxpy.c��

OpenACC�� :�� SAXPY�� example��

•  openMP�� level�� GPU�� Acceleration��

void�� saxpy(int�� n,�� float�� a,�� float�� *x,�� float�� *y)�� {�� #pragma�� omp�� parallel�� {�� for�� (int�� i�� =�� 0;�� i�� <�� n;�� ++i)�� {�� y[i]�� =�� a*x[i]�� +�� y[i];�� }�� }�� }��

void�� saxpy(int�� n,�� float�� a,�� float�� *x,�� float�� *y)�� {�� #pragma�� acc�� kernels�� {�� for�� (int�� i�� =�� 0;�� i�� <�� n;�� ++i)�� {�� y[i]�� =�� a*x[i]�� +�� y[i];�� }�� }�� }��

OpenMP�� Parallel�� with�� 4�� Core�� CPU�� OpenMP�� Parallel�� with�� 3000�� Core�� GPU��

CUDA�� :�� SAXPY�� example��

__global__�� void�� saxpy(int�� n,�� float�� a,�� float�� *x,�� float�� *y)�� {�� int�� i�� =�� blockIdx.x*blockDim.x�� +�� threadIdx.x;�� if�� (i�� <�� n)�� y[i]�� =�� a*x[i]�� +�� y[i];�� }�� cudaMemcpy(x,�� d_x,�� N,�� cudaMemcpyHostToDevice);�� cudaMemcpy(y,�� d_y,�� N,�� cudaMemcpyHostToDevice);�� saxpy�� <<<4096,256>>>�� (N,�� 2.0,�� x,�� y);�� cudaMemcpy(d_y,�� y,�� N,�� cudaMemcpyDeviceToHost);��

CUDA�� kernel�� Launch��

upload��

download��

2.�� CUDA�� Converting�� Example��

Example�� :�� PI��

�� #include�� <stdio.h>�� #include�� <stdlib.h>�� int�� main�� (int�� argc,�� char�� *argv[])�� {�� int�� nthreads,�� tid;�� int�� i,�� INTER;�� double�� n_1,�� x,�� pi�� =�� 0.0;�� INTER=100;�� n_1�� =�� 1.0�� /�� (double)INTER�� ;�� for�� (i�� =�� 0;�� i�� <�� INTER;�� i++)�� {�� x�� =�� n_1�� *�� ((double)i�� -�� 0.5);�� pi�� +=�� 4.0�� /�� (1.0�� +�� x�� *�� x);�� }�� pi�� *=�� n_1;�� printf�� ("Pi�� =�� %.12lf\n",�� pi);�� return�� 0;�� }��

Example�� :�� OpenACC�� for�� PI��

�� #include�� <stdio.h>�� #include�� <stdlib.h>�� int�� main�� (int�� argc,�� char�� *argv[])�� {�� int�� nthreads,�� tid;�� int�� i,�� INTER;�� double�� n_1,�� x,�� pi�� =�� 0.0;�� INTER=100;�� n_1�� =�� 1.0�� /�� (double)INTER�� ;�� #pragma�� acc�� kernels�� for�� for�� (i�� =�� 0;�� i�� <�� INTER;�� i++)�� {�� x�� =�� n_1�� *�� ((double)i�� -�� 0.5);�� pi�� +=�� 4.0�� /�� (1.0�� +�� x�� *�� x);�� }�� pi�� *=�� n_1;�� printf�� ("Pi�� =�� %.12lf\n",�� pi);�� return�� 0;�� }��

How to Convert to CUDA

•  CPU�� code�� Profile��

•  Parallelize�� algorithm��

•  Convert�� CUDA�� 1.  separate�� launcher�� for�� Intensive�� FOR-LOOP��

1.  check�� ptr,�� variables,�� sub-routine��

2.  clone�� launcher�� for�� GPU��

3.  add�� Memory�� Model�� on�� launcher�� 1.  cudaMalloc,�� cudaMemcpy��

4.  add CUDA code on launcher 1.  FOR( ; ; ) → __global__ void + <<< B,T >>>

5.  CUDA Profile & Optimize 1.  Occupancy, Coalescing, shared Memory, Bank Conflict, etc.



upload�� input�� data��

Step3.�� download�� result�� data��

FOR�� LOOP��

Step1.��

Step2.�� launch�� GPU�� Kernel��

2�� 3��

4��

matrixMul�� :�� CPU_main��

#include <stdlib.h>�� #include <stdio.h>�� #include <math.h> #define n 8 #define m 4 ��

void datainit_cpu ( double *, double *, double *, int, int );�� void matrixmul_cpu( double *, double *, double *, int, int );��

int main () {�� int i,row,col,k;�� double val;�� double *a, *b, *c; //cpu pointer�� a = (double *) malloc ( m*n*sizeof(double) );�� b = (double *) malloc ( n*m*sizeof(double) );�� c = (double *) malloc ( m*m*sizeof(double) );�� datainit_cpu(a,b,c,m,n);�� matrixmul_cpu(a,b,c,m,n); return 0; }��

step1�� :�� CPU_launcher��

�� void��

matrixmul_cpu(�� double�� *a,�� double�� *b,�� double�� *c,�� int�� m,int�� n�� ){��

�� int�� hA,�� wA,�� hB,�� wB,�� hC,wC;//�� hA�� =�� size�� of�� col��

�� hA=m;wA=n;�� hB=n;wB=m;�� hC=m,wC=m;��

��

�� double�� val;��

�� int�� row,col,k;��

��

for�� (�� row=0;�� row<hC;�� row++�� )�� {�� //hC�� =�� hA��

�� for�� (�� col=0;�� col<wC;�� col++�� )�� {�� //wC�� =�� wB��

�� val�� =�� 0;��

�� for�� (�� k=0;�� k<wA;�� k++�� )�� {�� //wA�� =�� hB��

�� val�� +=�� a[row*wA+k]�� *�� b[k*wB+col];��

�� }��

�� c[row*wC+col]�� =�� val;��

�� }��

�� }��

return�� ;��

}��

step2�� :�� clone�� GPU_launcher��

void��

matrixmul_gpu(�� double�� *a,�� double�� *b,�� double�� *c,�� int�� m,int�� n){��

�� int�� hA,�� wA,�� hB,�� wB,�� hC,wC;��


��

��

�� matrixmul_kernel�� <<<�� size_block�� ,�� size_thread�� >>>�� (a,�� b,�� c,�� m,n);��

��


��

}��

step3�� :�� GPU_Memory��

void��

malloc_gpu(�� double�� **a_d,�� double�� **b_d,�� double�� **c_d,�� int�� m,int�� n){��



��

�� cudaMalloc(�� (void**)a_d,�� sizeof(double)*hA*wA�� );��

�� cudaMalloc(�� (void**)b_d,�� sizeof(double)*hB*wB�� );��

�� cudaMalloc(�� (void**)c_d,�� sizeof(double)*hC*wC�� );��

��

�� cudaMemset(�� *a_d,�� 0.0,�� sizeof(double)*hA*wA);��

�� cudaMemset(�� *b_d,�� 0.0,�� sizeof(double)*hB*wB);��

�� cudaMemset(�� *c_d,�� 0.0,�� sizeof(double)*hC*wC); ��

return;��

}��

��

void��

free_gpu(�� double�� *a_d,�� double�� *b_d,�� double�� *c_d,�� int�� m,int�� n){��

�� cudaFree(�� a_d�� ); ��

�� cudaFree(�� b_d�� ); ��

�� cudaFree(�� c_d�� );��

�� return;��

}��

step3�� :�� GPU_Memory��

��

void��

upload_gpu(�� double�� *a,�� double�� *b,�� double�� *a_d,�� double�� *b_d,�� int�� m,�� int�� n){��



��

�� cudaMemcpy(�� a_d,�� a,�� sizeof(double)*hA*wA,�� cudaMemcpyHostToDevice);��

�� cudaMemcpy(�� b_d,�� b,�� sizeof(double)*hB*wB,�� cudaMemcpyHostToDevice); ��

return;��

}��

��

��

void��

download_gpu(�� double�� *c,�� double�� *c_d,�� int�� m,�� int�� n){��



�� cudaMemcpy(�� c,�� c_d,�� sizeof(double)*hC*wC,�� cudaMemcpyDeviceToHost);��

�� return;��

}��

step4�� :�� Make�� GPU_Kernel�� for�� FOR-LOOP��

__global__��

void�� matrixmul_kernel1(double�� *a,�� double�� *b,�� double�� *c,�� int�� m,int�� n){��

�� int�� hA,�� wA,�� hB,�� wB,�� hC,wC;//�� hA�� =�� size�� of�� col��


��

�� int�� row�� =blockIdx.x�� *�� blockDim.x�� +�� threadIdx.x;�� //hC�� ~�� hA,�� wB�� "2D�� index"��

�� int�� col�� =blockIdx.y�� *�� blockDim.y�� +�� threadIdx.y;�� //wC�� ~�� wA,�� hB��

�� int�� i�� =�� row�� *�� wC�� +�� col;�� //�� total�� index�� of�� C��

�� int�� k=0;�� //�� wA�� =�� hB�� for�� fixed�� C(row,col)��

��

�� double�� val�� =�� 0;��

�� for�� (�� k=0;�� k<m;�� k++�� )�� {�� //wA��

�� val�� +=�� a[row*wA+k]�� *�� b[k*wB+col];��

�� }��

�� c[row*wC+col]�� =�� val;��


}��

step5�� :�� CUDA�� Profile��

step6�� :�� Optimization�� with�� Shared�� Memory��

__global__�� void��

matrixmul_kernel(double�� *C,�� double�� *A,�� double�� *B,�� int�� wA,�� int�� wB)��

{��

�� int�� bx�� =�� blockIdx.x;�� int�� by�� =�� blockIdx.y;��

�� int�� tx�� =�� threadIdx.x;�� int�� ty�� =�� threadIdx.y;��

�� int�� aBegin�� =�� wA�� *�� BLOCK_SIZE�� *�� by;�� int�� aEnd�� =�� aBegin�� +�� wA�� -�� 1;�� int�� aStep�� =�� BLOCK_SIZE;��

�� int�� bBegin�� =�� BLOCK_SIZE�� *�� bx;�� int�� bStep�� =�� BLOCK_SIZE�� *�� wB;��

�� float�� Csub�� =�� 0;��

�� for�� (int�� a�� =�� aBegin,�� b�� =�� bBegin;��

�� a�� <=�� aEnd;��

�� a�� +=�� aStep,�� b�� +=�� bStep)��

�� {��

�� __shared__�� float�� As[BLOCK_SIZE][BLOCK_SIZE];��

�� __shared__�� float�� Bs[BLOCK_SIZE][BLOCK_SIZE];��

�� AS(ty,�� tx)�� =�� A[a�� +�� wA�� *�� ty�� +�� tx];��

�� BS(ty,�� tx)�� =�� B[b�� +�� wB�� *�� ty�� +�� tx];��

�� __syncthreads();��

�� for�� (int�� k�� =�� 0;�� k�� <�� BLOCK_SIZE;�� ++k)��

�� Csub�� +=�� AS(ty,�� k)�� *�� BS(k,�� tx);��

�� __syncthreads();��

�� }��

�� int�� c�� =�� wB�� *�� BLOCK_SIZE�� *�� by�� +�� BLOCK_SIZE�� *�� bx;��

�� C[c�� +�� wB�� *�� ty�� +�� tx]�� =�� Csub;��


��

}��

shared�� Memory��

추천도서��

GPUGems�� 3�� GPU�� Computing�� Gems�� emerald�� edition��

CUDA�� by�� example�� Programming�� Massively�� Parallel�� Processors��

GPU�� Computing�� Gems�� emerald�� edition��

CUDA�� Coding�� Contest��

http://www.nvidiakoreapsc.com��

감사합니다.��

[d1]deview 2012 nvidia

Technology