intro2 cuda moayad
Lecture for High Performance Computing and High Availability on CUDA and OpenCL
1
Introduction to Parallel Programming with CUDA & OpenCL
Moayad H. Almohaishi
Graduate Student, Computer Science, Louisiana Tech University
[email protected]
Outline
• Introduction
• Introduction to CUDA
– Hello World
– Addition application
– Array Addition
– CUDA Memories
– Matrix Multiplication
– Performance considerations
• Introduction to OpenCL
– Addition Kernel
– Differences from the CUDA kernel
– Setting up the OpenCL host code
• Sources and additional resources
2
3
Introduction
• Why the GPU?
– Available in almost all new desktops and laptops
– Many-core: 512 cores on the GTX 580
– High floating-point throughput: the GTX 580 offers a peak performance of ≈1.5 TFLOPS (single precision)
– High memory bandwidth: the GTX 580 offers 192.4 GB/sec
4
Introduction to CUDA
• CUDA architecture
– The physical technology on the GPU
• CUDA C
– The programming language for harvesting the power of the CUDA architecture
– Based on standard C
5
What do you need to know?
Today:
• You will need some knowledge of C
• You don't need to know about parallel programming
• You don't need to know about the CUDA architecture
6
Terminology
• Host
– The CPU and its dedicated system memory (RAM)
• Device
– The GPU and its on-board memory
7
C Hello World
#include <stdio.h>

int main( void ) {
    printf("Hello World!\n");
    return 0;
}

This hello-world C code compiles without problems under the NVIDIA CUDA compiler.
8
CUDA Kernel
__global__ void kernel( void ) {
}

int main( void ) {
    kernel<<<1,1>>>();
    printf("Hello World!\n");
    return 0;
}
9
CUDA Kernel
__global__ void kernel( void ) {
}

int main( void ) {
    kernel<<<1,1>>>();
    printf("Hello World!\n");
    return 0;
}

__global__ void kernel( void ) {
}

kernel<<<1,1>>>();

__global__ is a keyword that defines the function as a CUDA kernel.
kernel<<<1,1>>>(); is the call that launches the CUDA kernel from the host code.
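Putting the pieces together, a minimal complete hello-world program might look like the sketch below (the file name hello.cu and the compile command are assumptions about a typical setup):

#include <stdio.h>

// Empty kernel: __global__ marks this as device code launchable from the host.
__global__ void kernel( void ) {
}

int main( void ) {
    kernel<<<1,1>>>();   // <<<1,1>>> launches one block containing one thread
    printf("Hello World!\n");
    return 0;
}

Compile it with the NVIDIA compiler: nvcc hello.cu -o hello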
10
Single Addition on the CPU
float add( float a, float b ) {
    return a + b;
}

int main( void ) {
    float a, b, c;
    ... // setting a and b values
    c = add(a, b);
    printf("%f + %f = %f\n", a, b, c);
    return 0;
}
11
Single Addition on the GPU
__global__ void add( float *a, float *b, float *c ) {
    *c = *a + *b;
}

int main( void ) {
    float *a, *b, *c;
    ... // setting a and b values
    add<<<1,1>>>(a, b, c);
    printf("%f + %f = %f\n", *a, *b, *c);
    return 0;
}
12
Single Addition on the GPU
__global__ void add( float *a, float *b, float *c ) {
    *c = *a + *b;
}

int main( void ) {
    float *a, *b, *c;
    ... // setting a and b values
    add<<<1,1>>>(a, b, c); // c will need to be copied to the host
    printf("%f + %f = %f\n", *a, *b, *c);
    return 0;
}
?!
13
CUDA Global Memory
• To be able to use the GPU memory you will need to:
– Allocate memory on the GPU using cudaMalloc()
– Copy the host memory to the device memory using cudaMemcpy()
– Free the memory using cudaFree()

The original C memory commands are malloc(), memcpy(), and free().
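As a minimal sketch of the full allocate/copy/free round trip (with basic error checking, which the slides omit for brevity):

float h_x = 1.0f, h_y = 0.0f;   // host data
float *d_x;                     // device pointer

// allocate one float on the device
cudaError_t err = cudaMalloc((void**) &d_x, sizeof(float));
if (err != cudaSuccess) { /* handle the error */ }

cudaMemcpy(d_x, &h_x, sizeof(float), cudaMemcpyHostToDevice); // host -> device
cudaMemcpy(&h_y, d_x, sizeof(float), cudaMemcpyDeviceToHost); // device -> host
cudaFree(d_x);                  // release the device memory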
14
Single Addition on the GPU
__global__ void add( float *a, float *b, float *c ) {
    *c = *a + *b;
}

The kernel is correct and will stay the same.
15
Single Addition on the GPU
int main( void ) {
    float h_a, h_b, h_c;
    float *d_a, *d_b, *d_c;
    int size = sizeof(float);

    cudaMalloc((void**) &d_a, size);
    cudaMalloc((void**) &d_b, size);
    cudaMalloc((void**) &d_c, size);

    h_a = 150; h_b = 89;

We need to define different variables for the host and device memories.
Allocating the device memory.
16
Single Addition on the GPU
    cudaMemcpy(d_a, &h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, &h_b, size, cudaMemcpyHostToDevice);

    add<<<1,1>>>(d_a, d_b, d_c);

    cudaMemcpy(&h_c, d_c, size, cudaMemcpyDeviceToHost);
    printf("%f + %f = %f\n", h_a, h_b, h_c);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}

Copy the memory to and from the device.
Free the device memory.
17
Is that right to do?
• The GPU is about massive parallelism, so running this program on the GPU is inefficient; it will run slower than the CPU version.
• You need large data.
18
Array Addition on the CPU
int main( void ) {
    int n = 512; // 2^9
    float a[n], b[n], c[n];
    ... // setting a and b values
    for (int i = 0; i < n; i++) {
        c[i] = add(a[i], b[i]);
        printf("%f + %f = %f\n", a[i], b[i], c[i]);
    }
    return 0;
}

The add function will stay the same.
19
Array Addition on the GPU
int main( void ) {
    int n = 512;
    float h_a[n], h_b[n], h_c[n];
    float *d_a, *d_b, *d_c;
    int size = sizeof(float) * n;

    cudaMalloc((void**) &d_a, size);
    cudaMalloc((void**) &d_b, size);
    cudaMalloc((void**) &d_c, size);

    ... // setting the input data h_a and h_b

We have to modify the size.
20
Array Addition on the GPU
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    add<<<1,1>>>(d_a, d_b, d_c);

    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);
    // ... print the results

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}

add<<<1,1>>>(d_a, d_b, d_c);
?!
21
Blocks
• CUDA runs the kernel as a block on a grid containing n blocks.
• The maximum value of n can differ from device to device; the limit on current devices is 65535 blocks per grid.
• We will use blockIdx.x to access the block ID from inside the kernel.
22
Array Addition on the GPU

    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    add<<<n,1>>>(d_a, d_b, d_c);

    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);
    // ... print the results

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}

add<<<n,1>>>(d_a, d_b, d_c);

n blocks will run the kernel.
23
Array Addition Kernel

__global__ void add( float *a, float *b, float *c ) {
    int idx = blockIdx.x;
    c[idx] = a[idx] + b[idx];
}
24
Threads
• Each block can contain up to 512 parallel threads in the first and second CUDA architectures.
• In the Fermi architecture, each block can contain up to 1024 parallel threads.
• We will use threadIdx.x to access the thread ID from inside the kernel.
25
Array Addition on the GPU
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    add<<<1,n>>>(d_a, d_b, d_c);

    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);
    // ... print the results

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}

add<<<1,n>>>(d_a, d_b, d_c);

n threads on a single block will run the kernel.
26
Array Addition Kernel
__global__ void add( float *a, float *b, float *c ) {
    int idx = threadIdx.x;
    c[idx] = a[idx] + b[idx];
}

CUDA runs the threads as half-warps, so it is more efficient to have at least 16 threads per block.
27
More
• Is it still massive parallelism?
• What about more than 512 elements?
28
Terminology
• 1D grid

[Figure: a 1D grid of three blocks (blockIdx.x = 0, 1, 2), each containing seven threads (threadIdx.x = 0..6); blockSize = 7]
29
Global memory access

[Figure: the 21 threads of three 7-thread blocks (blockIdx.x = 0, 1, 2) mapped onto global memory addresses 0..20]

How do we point each thread to the right global memory address?
30
Global memory access

• 1D grid

[Figure: the same three blocks of seven threads (blockSize = 7); each thread computes its global index as]

idx = threadIdx.x + blockIdx.x * blockDim.x
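For example, with blockSize = 7, thread 3 of block 2 computes idx = 3 + 2 * 7 = 17, which is exactly its position in global memory in the figure above.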
31
Array Addition on the GPU
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    int blockSize = 256;
    int blocks = n / blockSize;
    add<<<blocks,blockSize>>>(d_a, d_b, d_c);

    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);
    // ... print the results

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}

int blockSize = 256;
int blocks = n / blockSize;
add<<<blocks,blockSize>>>(d_a, d_b, d_c);
32
Array Addition Kernel
__global__ void add( float *a, float *b, float *c ) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    c[idx] = a[idx] + b[idx];
}
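One caveat worth adding here: blocks = n/blockSize truncates, so if n is not a multiple of blockSize the last elements get no thread. A common pattern (a sketch, not part of the original slides) is to round the block count up and guard the kernel:

int blocks = (n + blockSize - 1) / blockSize;   // round up instead of truncating
add<<<blocks,blockSize>>>(d_a, d_b, d_c, n);    // pass n so the kernel can check bounds

__global__ void add( float *a, float *b, float *c, int n ) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < n)   // threads past the end of the arrays do nothing
        c[idx] = a[idx] + b[idx];
}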
33
Exercises
• What is the maximum number of threads that can run on a grid?
• How can we go over that limit?
34
Global memory access

• Allowing each thread to do two computations

[Figure: the 21 threads of three 7-thread blocks (blockIdx.x = 0, 1, 2) mapped onto global memory addresses 0..20]

How do we point each thread to the right global memory address?
Hint: you need to find the idx formula that counts one memory index and jumps over the next one. You will access the second index through idx + 1.
35
Global memory access

• Allowing each thread to do two computations

[Figure: the 21 threads of three 7-thread blocks (blockIdx.x = 0, 1, 2) mapped onto global memory addresses 0..20]

How do we point each thread to the right global memory address?
Hint: you need to find the idx formula that counts one memory index and jumps over the next blockSize. You will access the second index through idx + blockDim.x.
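One kernel that follows this hint (a sketch; other indexings are possible):

__global__ void add2( float *a, float *b, float *c ) {
    // each block now covers 2 * blockDim.x consecutive elements
    int idx = threadIdx.x + blockIdx.x * blockDim.x * 2;
    c[idx] = a[idx] + b[idx];                                    // first element
    c[idx + blockDim.x] = a[idx + blockDim.x] + b[idx + blockDim.x]; // second element, one blockSize away
}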
36
What you learned
• Creating a CUDA kernel
• Calling the kernel from the host
• Allocating CUDA memory
• Copying to/from the device memory
• Freeing the device memory
• Controlling the number of threads through the block size and the number of blocks per grid
37
Dot Product
[Figure: dot product: the elements of A and B are multiplied pairwise (×) and the products are summed (+) into C]
38
• If each thread does one multiplication, which thread will do the addition?
39
Shared Memory
• The shared memory is very fast memory on the GPU chip itself.
• Each block has its own shared memory space.
• It can be declared using the __shared__ CUDA keyword.
• To make sure all the threads have finished computing, use the CUDA keyword __syncthreads().
40
Dot Product Kernel
__global__ void dotP( int *a, int *b, int *c ) {
    __shared__ int temp[N];
    temp[threadIdx.x] = a[threadIdx.x] * b[threadIdx.x];
    __syncthreads();
    if (threadIdx.x == 0) {
        int sum = 0;
        for (int i = 0; i < N; i++)
            sum += temp[i];
        *c = sum;
    }
}
41
Exercise
• In this application the addition runs on thread 0 only. Is that efficient?
• How can we make it better?
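One common answer is a tree reduction: halve the number of active threads at every step so the additions themselves run in parallel. A sketch, assuming a single block (blockDim.x == N) and N a power of two:

__global__ void dotP( int *a, int *b, int *c ) {
    __shared__ int temp[N];
    int t = threadIdx.x;
    temp[t] = a[t] * b[t];
    __syncthreads();
    // pairwise sums: N/2 additions in parallel, then N/4, ..., then 1
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (t < stride)
            temp[t] += temp[t + stride];
        __syncthreads();   // every step must finish before the next begins
    }
    if (t == 0)
        *c = temp[0];      // thread 0 writes the final sum
}

This takes log2(N) steps instead of the N sequential additions done by thread 0 alone.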
42
Matrix multiplication
[Figure: matrix multiplication C = A × B]
43
MatrixMul on the GPU
int main( void ) {
    int n = 16;
    float h_a[n][n], h_b[n][n], h_c[n][n];
    float *d_a, *d_b, *d_c;
    int size = sizeof(float) * n * n;

    cudaMalloc((void**) &d_a, size);
    cudaMalloc((void**) &d_b, size);
    cudaMalloc((void**) &d_c, size);

    ... // setting the input data h_a and h_b
44
MatrixMul on the GPU

    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    dim3 blockSize(n, n, 1);
    matrixMul<<<1,blockSize>>>(d_a, d_b, d_c);

    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}

dim3 blockSize(n, n, 1);
matrixMul<<<1,blockSize>>>(d_a, d_b, d_c);
45
Simple Matrix Multiplication Kernel
__global__ void matrixMul( float *a, float *b, float *c ) {
    int x = threadIdx.x; // row
    int y = threadIdx.y; // column
    float temp = 0;
    for (int i = 0; i < blockDim.x; i++) {
        temp += a[x * blockDim.x + i] * b[i * blockDim.x + y];
    }
    c[x * blockDim.x + y] = temp;
}
46
Exercise
• Use the shared memory to optimize the matrix multiplication algorithm (hint: look at the code in the SDK). One possible solution is sketched below.
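A sketch of one possible solution, loosely following the SDK sample (the tile width TILE = 16, the flattened row-major layout, and a launch with an (n/TILE) × (n/TILE) grid of TILE × TILE blocks are assumptions; n must be a multiple of TILE):

#define TILE 16

__global__ void matrixMulTiled( float *a, float *b, float *c, int n ) {
    __shared__ float sA[TILE][TILE];  // tile of A, reused by the whole block
    __shared__ float sB[TILE][TILE];  // tile of B
    int row = blockIdx.y * TILE + threadIdx.y;
    int col = blockIdx.x * TILE + threadIdx.x;
    float temp = 0;
    for (int m = 0; m < n / TILE; m++) {
        // each thread loads one element of each tile from global memory
        sA[threadIdx.y][threadIdx.x] = a[row * n + (m * TILE + threadIdx.x)];
        sB[threadIdx.y][threadIdx.x] = b[(m * TILE + threadIdx.y) * n + col];
        __syncthreads();              // wait until both tiles are loaded
        for (int i = 0; i < TILE; i++)
            temp += sA[threadIdx.y][i] * sB[i][threadIdx.x];
        __syncthreads();              // wait before overwriting the tiles
    }
    c[row * n + col] = temp;
}

Each element of A and B is now read from global memory n/TILE times instead of n times.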
47
What you learned
• Using the shared memory to share data among the threads in a block
• Synchronizing the threads
• Setting a blockSize of more than one dimension using dim3
48
Performance Considerations
• For maximum performance:
– Reduce the global memory accesses.
– Maximize the occupancy (allow scheduling of 1024 threads per streaming multiprocessor):
• Use the right blockSize.
• Use the right number of registers.
• Use the right size of shared memory.
– Increase the number of independent instructions.
– Coalesce the memory accesses (see the sketch below).
– Use the right instruction:byte ratio.
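To illustrate coalescing with a sketch (stride is a hypothetical parameter): consecutive threads of a warp should read consecutive addresses.

// coalesced: adjacent threads read adjacent floats -> few memory transactions
float v = a[threadIdx.x + blockIdx.x * blockDim.x];

// strided: adjacent threads read addresses stride elements apart -> many transactions
float w = a[threadIdx.x * stride];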
49
Introduction to OpenCL
• OpenCL is an open standard.
• Cross-platform; it can run on:
– Multi-core CPUs
– GPUs (NVIDIA, ATI)
– Cell B/E
– Others
• Close to CUDA.
50
How the program works

[Figure: host memory and device (GPU) memory, with the arrays A[], B[], C[] and the kernel code moving between them]

• Allocating the memory on the host
• Initializing data in the memory objects (A[], B[], C[])
• Allocating the memory on the device (GPU)
• Copying the data from host to device
• Running the kernel on the stream processors
• Copying the results back to the host memory
• Clearing the memory and freeing the resources
51
Basic OpenCL program Structure
• OpenCL kernel
• Host program containing:
– a. Device context
– b. Command queue
– c. Memory objects
– d. OpenCL program
– e. Kernel memory arguments
52
Creating the Kernel
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

const char* OpenCLSource[] = {
    "__kernel void VectorAdd(__global int* c, __global int* a, \n",
    "                        __global int* b) \n",
    "{ \n",
    "    unsigned int n = get_global_id(0); \n",
    "    c[n] = a[n] + b[n]; \n",
    "} \n"
};
53
Creating the Kernel
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

const char* OpenCLSource[] = {
    "__kernel void VectorAdd(__global int* c, __global int* a, \n",
    "                        __global int* b) \n",
    "{ \n",
    "    unsigned int n = get_global_id(0); \n",
    "    c[n] = a[n] + b[n]; \n",
    "} \n"
};

Notice that the whole kernel is stored as an array of char strings:

const char* OpenCLSource[] = {
54
Creating the Kernel
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

const char* OpenCLSource[] = {
    "__kernel void VectorAdd(__global int* c, __global int* a, \n",
    "                        __global int* b) \n",
    "{ \n",
    "    unsigned int n = get_global_id(0); \n",
    "    c[n] = a[n] + b[n]; \n",
    "} \n"
};

The __kernel keyword is equivalent to __global__ in CUDA, and the kernel parameters need to be declared __global, which you don't need in CUDA:

    "__kernel void VectorAdd(__global int* c, __global int* a, \n",
    "                        __global int* b) \n",

get_global_id() is a built-in function that replaces calculating the global ID by hand as in CUDA:

    "    unsigned int n = get_global_id(0); \n",
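For comparison, an equivalent CUDA kernel would look roughly like this (a sketch):

__global__ void VectorAdd( int *c, int *a, int *b ) {
    // computed by hand in CUDA; get_global_id(0) does this for you in OpenCL
    unsigned int n = threadIdx.x + blockIdx.x * blockDim.x;
    c[n] = a[n] + b[n];
}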
55
Initializing data
int InitialData1[12] = {62, 48, 20, -53, 39, 83, 19, 47, 13, 88, 38, -92};
int InitialData2[12] = {-49, 29, 38, 10, 37, 46, -12, 86, 17, 83, -22, 94};
#define SIZE 2048
56
Creating the main function
int main(int argc, char **argv)
{
    int HostVector1[SIZE];
    int HostVector2[SIZE];
    for (int c = 0; c < SIZE; c++) {
        HostVector1[c] = InitialData1[c % 12];
        HostVector2[c] = InitialData2[c % 12];
    }
57
Creating the context
cl_context GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);

cl_context clCreateContextFromType(
    cl_context_properties *properties,
    cl_device_type device_type,
    void (*pfn_notify)(const char *errinfo, const void *private_info, size_t cb, void *user_data),
    void *user_data,
    cl_int *errcode_ret)
58
Creating the context
cl_context GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);

cl_context clCreateContextFromType(
    cl_context_properties *properties,
    cl_device_type device_type,
    void (*pfn_notify)(const char *errinfo, const void *private_info, size_t cb, void *user_data),
    void *user_data,
    cl_int *errcode_ret)
You can also use CL_DEVICE_TYPE_CPU
59
Query compute devices
size_t ParmDataBytes;
clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);

cl_int clGetContextInfo(
    cl_context context,
    cl_context_info param_name,
    size_t param_value_size,
    void *param_value,
    size_t *param_value_size_ret)

param_name: CL_CONTEXT_REFERENCE_COUNT, CL_CONTEXT_DEVICES, CL_CONTEXT_PROPERTIES
60
Query compute devices
cl_device_id* GPUDevices = (cl_device_id*) malloc(ParmDataBytes);
clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL);

cl_int clGetContextInfo(
    cl_context context,
    cl_context_info param_name,
    size_t param_value_size,
    void *param_value,
    size_t *param_value_size_ret)

param_name: CL_CONTEXT_REFERENCE_COUNT, CL_CONTEXT_DEVICES, CL_CONTEXT_PROPERTIES
61
Command queue
cl_command_queue GPUCommandQueue = clCreateCommandQueue(GPUContext, GPUDevices[0], 0, NULL);

cl_command_queue clCreateCommandQueue(
    cl_context context,
    cl_device_id device,
    cl_command_queue_properties properties,
    cl_int *errcode_ret)

properties: CL_QUEUE_PROFILING_ENABLE, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
62
Allocating the Memory
cl_mem GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE, HostVector1, NULL);

cl_mem clCreateBuffer(
    cl_context context,
    cl_mem_flags flags,
    size_t size,
    void *host_ptr,
    cl_int *errcode_ret)

flags: CL_MEM_READ_WRITE, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY, CL_MEM_USE_HOST_PTR, CL_MEM_ALLOC_HOST_PTR, CL_MEM_COPY_HOST_PTR
63
Allocating the Memory
cl_mem GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE, HostVector2, NULL);

cl_mem GPUOutputVector;
GPUOutputVector = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, sizeof(int) * SIZE, NULL, NULL);
64
Creating the program
cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 6, OpenCLSource, NULL, NULL);

The count (6 here) must match the number of strings in OpenCLSource.

cl_program clCreateProgramWithSource(
    cl_context context,
    cl_uint count,
    const char **strings,
    const size_t *lengths,
    cl_int *errcode_ret)
65
Creating the program
clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);

cl_int clBuildProgram(
    cl_program program,
    cl_uint num_devices,
    const cl_device_id *device_list,
    const char *options,
    void (*pfn_notify)(cl_program program, void *user_data),
    void *user_data)
66
Creating the program
cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram, "VectorAdd", NULL);

cl_kernel clCreateKernel(
    cl_program program,
    const char *kernel_name,
    cl_int *errcode_ret)
67
Matching the GPU memory with the kernel

clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*) &GPUOutputVector);

cl_int clSetKernelArg(
    cl_kernel kernel,
    cl_uint arg_index,
    size_t arg_size,
    const void *arg_value)
68
Matching the GPU memory with the kernel

clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*) &GPUVector1);
clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*) &GPUVector2);
69
Launching the Kernel

size_t WorkSize[1] = {SIZE};
clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, WorkSize, NULL, 0, NULL, NULL);

cl_int clEnqueueNDRangeKernel(
    cl_command_queue command_queue,
    cl_kernel kernel,
    cl_uint work_dim,
    const size_t *global_work_offset,
    const size_t *global_work_size,
    const size_t *local_work_size,
    cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list,
    cl_event *event)
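In this call, work_dim is 1 and global_work_size is SIZE, so SIZE work-items are launched in total; passing NULL for local_work_size lets the OpenCL runtime choose the work-group size itself.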
70
Copying the output to the host memory
int HostOutputVector[SIZE];
clEnqueueReadBuffer(GPUCommandQueue, GPUOutputVector, CL_TRUE, 0, SIZE * sizeof(int), HostOutputVector, 0, NULL, NULL);

cl_int clEnqueueReadBuffer(
    cl_command_queue command_queue,
    cl_mem buffer,
    cl_bool blocking_read,
    size_t offset,
    size_t cb,
    void *ptr,
    cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list,
    cl_event *event)
71
Cleaning the GPU device
clReleaseMemObject(GPUVector1);
clReleaseMemObject(GPUVector2);
clReleaseMemObject(GPUOutputVector);
clReleaseKernel(OpenCLVectorAdd);
clReleaseProgram(OpenCLProgram);
clReleaseCommandQueue(GPUCommandQueue);
clReleaseContext(GPUContext);
free(GPUDevices);

for (int c = 0; c < 305; c++)
    printf("%c", (char) HostOutputVector[c]);
return 0;
}
72
What you learned
• Writing an OpenCL kernel
• Writing an OpenCL application:
– Setting the context
– Preparing the command queue
– Setting the memory objects
– Setting the program
– Setting the kernel and its arguments
73
Sources and additional resources
• Jason Sanders, "Introduction to CUDA" book and GTC presentation.
• OpenCL specification document
• NVIDIA CUDA programming guide
• NVIDIA OpenCL getting started guide
• Videos from GTC'10:
http://www.nvidia.com/object/gtc2010-presentation-archive.html#session2131