Lecture for High Performance Computing and High Availability, on CUDA and OpenCL.

Page 1: Intro2 Cuda Moayad

Introduction to Parallel Programming with CUDA & OpenCL

Moayad H. Almohaishi

Graduate student, Computer Science
Louisiana Tech University
[email protected]

Page 2: Intro2 Cuda Moayad

Outline

• Introduction
• Introduction to CUDA
 – Hello World
 – Addition application
 – Array addition
 – CUDA memories
 – Matrix multiplication
 – Performance considerations
• Introduction to OpenCL
 – Addition kernel
 – Differences from the CUDA kernel
 – Setting up the OpenCL host code
• Sources and additional resources

Page 3: Intro2 Cuda Moayad

Introduction

• Why GPU?
 – Available in almost all new desktops and laptops
 – Many-core
  • 512 cores on the GTX 580
 – High floating-point performance
  • The GTX 580 offers peak performance of ≈1.5 TFLOPS (single precision)
 – High memory bandwidth
  • The GTX 580 offers 192.4 GB/sec

Page 4: Intro2 Cuda Moayad

Introduction to CUDA

• CUDA architecture
 – The physical technology on the GPU
• CUDA C
 – The programming language for harnessing the power of the CUDA architecture
 – Based on standard C

Page 5: Intro2 Cuda Moayad

What do you need to know?

Today:

• You will need some knowledge of C
• You don't need to know about parallel programming
• You don't need to know about the CUDA architecture

Page 6: Intro2 Cuda Moayad

Terminology

• Host
 – The CPU and its dedicated system memory (RAM)
• Device
 – The GPU and its on-board memory

Page 7: Intro2 Cuda Moayad

C Hello World

#include <stdio.h>

int main( void ) {
    printf("Hello World!\n");
    return 0;
}

This hello-world C code compiles without problems under the NVIDIA CUDA compiler.

Page 8: Intro2 Cuda Moayad

CUDA Kernel

__global__ void kernel( void ) {
}

int main( void ) {
    kernel<<<1,1>>>();
    printf("Hello World!\n");
    return 0;
}

Page 9: Intro2 Cuda Moayad

CUDA Kernel

__global__ void kernel( void ) {
}

int main( void ) {
    kernel<<<1,1>>>();
    printf("Hello World!\n");
    return 0;
}

__global__ void kernel( void ) {
}

__global__ is a keyword that defines the function as a CUDA kernel.

kernel<<<1,1>>>();

kernel<<<1,1>>>(); is the command that calls the CUDA kernel from the host code.

Page 10: Intro2 Cuda Moayad

Single Addition on the CPU

float add( float a, float b ) {
    return a + b;
}

int main( void ) {
    float a, b, c;

    ... // setting a and b values

    c = add(a, b);
    printf("%f + %f = %f \n", a, b, c);
    return 0;
}

Page 11: Intro2 Cuda Moayad

Single Addition on the GPU

__global__ void add( float *a, float *b, float *c ) {
    *c = *a + *b;
}

int main( void ) {
    float *a, *b, *c;

    ... // setting a and b values

    add<<<1,1>>>(a, b, c);
    printf("%f + %f = %f \n", *a, *b, *c);
    return 0;
}

Page 12: Intro2 Cuda Moayad

Single Addition on the GPU

__global__ void add( float *a, float *b, float *c ) {
    *c = *a + *b;
}

int main( void ) {
    float *a, *b, *c;

    ... // setting a and b values

    add<<<1,1>>>(a, b, c); // c will need to be copied to the host
    printf("%f + %f = %f \n", *a, *b, *c);
    return 0;
}

?!

Page 13: Intro2 Cuda Moayad

CUDA Global Memory

• To be able to use the GPU memory you will need to:
 – Allocate memory on the GPU using
  • cudaMalloc()
 – Copy the host memory to the device memory using
  • cudaMemcpy()
 – Free the memory using
  • cudaFree()

The corresponding standard C memory functions are malloc(), memcpy(), and free().

Page 14: Intro2 Cuda Moayad

Single Addition on the GPU

__global__ void add( float *a, float *b, float *c ) {
    *c = *a + *b;
}

The kernel is correct and will stay the same.

Page 15: Intro2 Cuda Moayad

Single Addition on the GPU

int main( void ) {
    float h_a, h_b, h_c;      // host copies
    float *d_a, *d_b, *d_c;   // device pointers
    int size = sizeof(float);

    cudaMalloc((void**) &d_a, size);
    cudaMalloc((void**) &d_b, size);
    cudaMalloc((void**) &d_c, size);

    h_a = 150; h_b = 89;

We need to define separate variables for the host and device memories.

Allocating the device memory.

Page 16: Intro2 Cuda Moayad

Single Addition on the GPU

    cudaMemcpy(d_a, &h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, &h_b, size, cudaMemcpyHostToDevice);

    add<<<1,1>>>(d_a, d_b, d_c);

    cudaMemcpy(&h_c, d_c, size, cudaMemcpyDeviceToHost);

    printf("%f + %f = %f \n", h_a, h_b, h_c);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}

Copy the memory to and from the device; free the device memory at the end.

Page 17: Intro2 Cuda Moayad

Is this the right thing to do?

• The GPU is about massive parallelism, so running this program on the GPU is inefficient and will run slower than the CPU version.
• You need large data.

Page 18: Intro2 Cuda Moayad

Array Addition on the CPU

int main( void ) {
    int n = 512; // 2^9
    float a[n], b[n], c[n];

    ... // setting a and b values

    for (int i = 0; i < n; i++) {
        c[i] = add(a[i], b[i]);
        printf("%f + %f = %f \n", a[i], b[i], c[i]);
    }
    return 0;
}

The add function will stay the same.

Page 19: Intro2 Cuda Moayad

Array Addition on the GPU

int main( void ) {
    int n = 512;
    float h_a[n], h_b[n], h_c[n];
    float *d_a, *d_b, *d_c;
    int size = sizeof(float) * n;

    cudaMalloc((void**) &d_a, size);
    cudaMalloc((void**) &d_b, size);
    cudaMalloc((void**) &d_c, size);

    ... // setting the input data h_a and h_b

We only have to modify the size (n floats instead of one).

Page 20: Intro2 Cuda Moayad

Array Addition on the GPU

    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    add<<<1,1>>>(d_a, d_b, d_c);

    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    //printf("%f + %f = %f \n", ...);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}

add<<<1,1>>>(d_a, d_b, d_c);

?!

Page 21: Intro2 Cuda Moayad

Blocks

• CUDA runs the kernel as blocks on a grid containing n blocks.
• The maximum value of n can differ from device to device; the current device limit is 65,535 blocks per grid.
• We use blockIdx.x to access the block ID from inside the kernel.

Page 22: Intro2 Cuda Moayad

Array Addition on the GPU

    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    add<<<n,1>>>(d_a, d_b, d_c);

    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    //printf("%f + %f = %f \n", ...);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}

add<<<n,1>>>(d_a, d_b, d_c);

n blocks will be running the kernel.

Page 23: Intro2 Cuda Moayad

Array Addition Kernel

__global__ void add( float *a, float *b, float *c ) {
    int idx = blockIdx.x;
    c[idx] = a[idx] + b[idx];
}

Page 24: Intro2 Cuda Moayad

Threads

• Each block can contain up to 512 parallel threads in the first and second CUDA architectures.
• In the Fermi architecture, each block can contain up to 1024 parallel threads.
• We use threadIdx.x to access the thread ID from inside the kernel.

Page 25: Intro2 Cuda Moayad

Array Addition on the GPU

    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    add<<<1,n>>>(d_a, d_b, d_c);

    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    //printf("%f + %f = %f \n", ...);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}

add<<<1,n>>>(d_a, d_b, d_c);

n threads in a single block will be running the kernel.

Page 26: Intro2 Cuda Moayad

Array Addition Kernel

__global__ void add( float *a, float *b, float *c ) {
    int idx = threadIdx.x;
    c[idx] = a[idx] + b[idx];
}

CUDA schedules threads in half-warps, so it is more efficient to have at least 16 threads per block.

Page 27: Intro2 Cuda Moayad

More

• Is this still massive parallelism?
• What about more than 512 elements?

Page 28: Intro2 Cuda Moayad

Terminology

• 1D grid, blockSize = 7

Threads (threadIdx.x): 0 1 2 3 4 5 6 | 0 1 2 3 4 5 6 | 0 1 2 3 4 5 6
                       blockIdx.x = 0 | blockIdx.x = 1 | blockIdx.x = 2

Page 29: Intro2 Cuda Moayad

global memory access

Threads (threadIdx.x): 0 1 2 3 4 5 6 | 0 1 2 3 4 5 6 | 0 1 2 3 4 5 6
                       blockIdx.x = 0 | blockIdx.x = 1 | blockIdx.x = 2

Global memory: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20

How to point each thread to the right global memory address?

Page 30: Intro2 Cuda Moayad

global memory access

• 1D grid, blockSize = 7

Threads (threadIdx.x): 0 1 2 3 4 5 6 | 0 1 2 3 4 5 6 | 0 1 2 3 4 5 6
                       blockIdx.x = 0 | blockIdx.x = 1 | blockIdx.x = 2

idx = threadIdx.x + blockIdx.x * blockDim.x
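As a quick check against the diagram: thread threadIdx.x = 3 in block blockIdx.x = 1, with blockDim.x = 7, gets idx = 3 + 1 × 7 = 10, which is exactly global memory element 10.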

Page 31: Intro2 Cuda Moayad

Array Addition on the GPU

    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    int blockSize = 256;
    int blocks = n / blockSize;
    add<<<blocks,blockSize>>>(d_a, d_b, d_c);

    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    //printf("%f + %f = %f \n", ...);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}

int blockSize = 256;
int blocks = n / blockSize;
add<<<blocks,blockSize>>>(d_a, d_b, d_c);

Page 32: Intro2 Cuda Moayad

Array Addition Kernel

__global__ void add( float *a, float *b, float *c ) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    c[idx] = a[idx] + b[idx];
}
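One caveat worth noting: n / blockSize is integer division, so the launch above covers every element only because n (512) is an exact multiple of blockSize (256). A common pattern for arbitrary n, shown here as a sketch rather than as part of the original slides (the kernel name add_n and the extra n parameter are assumptions), rounds the block count up and guards the kernel with a bounds check:

__global__ void add_n( float *a, float *b, float *c, int n ) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < n)            // threads past the end of the array do nothing
        c[idx] = a[idx] + b[idx];
}

int blocks = (n + blockSize - 1) / blockSize;   // ceiling division
add_n<<<blocks,blockSize>>>(d_a, d_b, d_c, n);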

Page 33: Intro2 Cuda Moayad

Exercises

• What is the maximum number of threads that can be run on a grid?
• How can we go over that limit?
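For the first exercise, the limits quoted earlier give an upper bound: 65,535 blocks per 1D grid × 512 threads per block = 33,553,920 threads per grid. The next two slides hint at how to go beyond that by letting each thread process more than one element.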

Page 34: Intro2 Cuda Moayad

global memory access

• Allowing each thread to do 2 computations

Threads (threadIdx.x): 0 1 2 3 4 5 6 | 0 1 2 3 4 5 6 | 0 1 2 3 4 5 6
                       blockIdx.x = 0 | blockIdx.x = 1 | blockIdx.x = 2

Global memory: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20

How to point each thread to the right global memory address?
Hint: you need an idx formula that counts one memory index and skips the next one; you then access the second index through idx + 1.

Page 35: Intro2 Cuda Moayad

global memory access

• Allowing each thread to do 2 computations

Threads (threadIdx.x): 0 1 2 3 4 5 6 | 0 1 2 3 4 5 6 | 0 1 2 3 4 5 6
                       blockIdx.x = 0 | blockIdx.x = 1 | blockIdx.x = 2

Global memory: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20

How to point each thread to the right global memory address?
Hint: you need an idx formula that counts one memory index and jumps over the next blockSize; you then access the second index through idx + blockDim.x. A sketch of this kernel follows below.
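A minimal sketch of the indexing this hint describes (the kernel name add2 is an assumption; the slides only give the formula): each block covers 2 × blockDim.x consecutive elements, and every thread adds two of them.

__global__ void add2( float *a, float *b, float *c ) {
    // each block owns 2 * blockDim.x consecutive elements
    int idx = threadIdx.x + blockIdx.x * blockDim.x * 2;
    c[idx] = a[idx] + b[idx];                                         // first element
    c[idx + blockDim.x] = a[idx + blockDim.x] + b[idx + blockDim.x];  // second element
}

With this kernel the launch needs only half as many blocks for the same n, which is one way past the 65,535-blocks-per-grid limit from the exercise.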

Page 36: Intro2 Cuda Moayad

What you learned

• Creating a CUDA kernel
• Calling the kernel from the host
• Allocating CUDA memory
• Copying to/from the device memory
• Freeing the device memory
• Controlling the number of threads through the block size and the number of blocks per grid

Page 37: Intro2 Cuda Moayad

Dot Product

(Diagram: the elements of A and B are multiplied pairwise (×), and the products are summed (+) into the single value C.)

Page 38: Intro2 Cuda Moayad

• If each thread does one multiplication, which thread will do the addition?

Page 39: Intro2 Cuda Moayad

Shared Memory

• Shared memory is very fast memory on the GPU chip itself.
• Each block has its own shared memory space.
• It can be declared using the __shared__ CUDA keyword.
• To make sure all the threads have finished computing, use the CUDA keyword __syncthreads().

Page 40: Intro2 Cuda Moayad

Dot Product Kernel

__global__ void dotP( int *a, int *b, int *c ) {
    __shared__ int temp[N];   // N = number of threads in the block

    temp[threadIdx.x] = a[threadIdx.x] * b[threadIdx.x];

    __syncthreads();

    if (threadIdx.x == 0) {
        int sum = 0;
        for (int i = 0; i < N; i++)
            sum += temp[i];
        *c = sum;
    }
}
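As written, the kernel assumes a single block of N threads, with N a compile-time constant equal to the block size. The host-side launch is not shown on the slide; a matching call would look like this (the value 512 is an assumption):

#define N 512                    // must match the block size used in the launch

dotP<<<1, N>>>(d_a, d_b, d_c);   // one block; c points to a single int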

Page 41: Intro2 Cuda Moayad

Exercise

• In this application the addition runs on thread 0 only. Is that efficient?
• How can we make it better? One possible approach is sketched below.
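One standard answer, sketched here under the same single-block assumption (this is not the slide's own solution): a tree reduction, where at every step half of the remaining threads each add one value from the other half, finishing the sum in log2(N) steps instead of N.

__global__ void dotP_tree( int *a, int *b, int *c ) {
    __shared__ int temp[N];        // N = blockDim.x, assumed a power of two

    temp[threadIdx.x] = a[threadIdx.x] * b[threadIdx.x];
    __syncthreads();

    // halve the number of active threads each step: N/2, N/4, ..., 1
    for (int stride = N / 2; stride > 0; stride /= 2) {
        if (threadIdx.x < stride)
            temp[threadIdx.x] += temp[threadIdx.x + stride];
        __syncthreads();           // wait for the whole step to finish
    }

    if (threadIdx.x == 0)
        *c = temp[0];
}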

Page 42: Intro2 Cuda Moayad

Matrix multiplication

(Diagram: matrices A and B are multiplied to produce matrix C.)

Page 43: Intro2 Cuda Moayad

MatrixMul on the GPU

int main( void ) {
    int n = 16;
    float h_a[n][n], h_b[n][n], h_c[n][n];
    float *d_a, *d_b, *d_c;
    int size = sizeof(float) * n * n;

    cudaMalloc((void**) &d_a, size);
    cudaMalloc((void**) &d_b, size);
    cudaMalloc((void**) &d_c, size);

    ... // setting the input data h_a and h_b

Page 44: Intro2 Cuda Moayad

MatrixMul on the GPU

    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    dim3 blockSize(n, n, 1);
    matrixMul<<<1,blockSize>>>(d_a, d_b, d_c);

    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}

dim3 blockSize(n, n, 1);
matrixMul<<<1,blockSize>>>(d_a, d_b, d_c);

Page 45: Intro2 Cuda Moayad

Simple Matrix Multiplication Kernel

__global__ void matrixMul( float *a, float *b, float *c ) {
    int x = threadIdx.x; // row
    int y = threadIdx.y; // column

    float temp = 0;
    for (int i = 0; i < blockDim.x; i++) {
        temp += a[x * blockDim.x + i] * b[i * blockDim.x + y];
    }
    c[x * blockDim.x + y] = temp;
}

Page 46: Intro2 Cuda Moayad

Exercise

• Use the shared memory to optimize the matrix multiplication kernel (hint: look at the matrixMul code in the SDK). A sketch of the idea follows below.
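For reference, a minimal sketch of the tiled, shared-memory approach the exercise asks for; the tile width TILE, the extra width parameter n, and the kernel name are assumptions, and the SDK version adds further refinements. Each block computes one TILE × TILE tile of c, loading the needed tiles of a and b into shared memory once instead of re-reading global memory for every output element.

#define TILE 16

__global__ void matrixMulTiled( float *a, float *b, float *c, int n ) {
    __shared__ float sA[TILE][TILE];
    __shared__ float sB[TILE][TILE];

    int row = blockIdx.y * TILE + threadIdx.y;
    int col = blockIdx.x * TILE + threadIdx.x;
    float temp = 0;

    // walk over the tiles; assumes n is a multiple of TILE
    for (int t = 0; t < n / TILE; t++) {
        // each thread loads one element of the current a-tile and b-tile
        sA[threadIdx.y][threadIdx.x] = a[row * n + t * TILE + threadIdx.x];
        sB[threadIdx.y][threadIdx.x] = b[(t * TILE + threadIdx.y) * n + col];
        __syncthreads();               // tiles fully loaded

        for (int i = 0; i < TILE; i++)
            temp += sA[threadIdx.y][i] * sB[i][threadIdx.x];
        __syncthreads();               // done reading this tile
    }
    c[row * n + col] = temp;
}

Each element of a and b is now read from global memory once per tile instead of once per output element, which directly addresses the "reduce the global memory access" advice on the performance slide.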

Page 47: Intro2 Cuda Moayad

What you learned

• Using the shared memory to share data among the threads in a block
• Synchronizing the threads
• Setting a blockSize of more than one dimension using dim3

Page 48: Intro2 Cuda Moayad

Performance Considerations

• For maximum performance:
 – Reduce the global memory accesses.
 – Maximize the occupancy (allow scheduling of 1024 threads per streaming multiprocessor):
  • use the right blockSize
  • use the right number of registers
  • use the right size of shared memory
 – Increase the number of independent instructions.
 – Coalesce the memory accesses.
 – Use the right instruction:byte ratio.
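To make the coalescing point concrete, here is an assumed illustration (not from the slides). When consecutive threads of a warp touch consecutive addresses, the hardware merges the accesses into a few wide transactions; strided accesses break that:

__global__ void copyCoalesced( float *in, float *out ) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    out[idx] = in[idx];   // neighboring threads hit neighboring addresses: coalesced
}

__global__ void copyStrided( float *in, float *out, int stride ) {
    int idx = (threadIdx.x + blockIdx.x * blockDim.x) * stride;
    out[idx] = in[idx];   // neighboring threads hit addresses stride apart: uncoalesced
}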

Page 49: Intro2 Cuda Moayad

Introduction to OpenCL

• OpenCL is an open standard.
• Cross-platform; it can run on:
 – Multi-core CPUs
 – GPUs (NVIDIA, ATI)
 – Cell B/E
 – others
• Close to CUDA.

Page 50: Intro2 Cuda Moayad

How the program works

(Diagram: host memory and device memory, each holding A[], B[], and C[], with the kernel code running on the GPU stream processors.)

• Allocate the memory on the host
• Initialize the data in the host memory objects
• Allocate the memory on the device (GPU)
• Copy the data from host to device
• Run the kernel
• Copy the results to the host memory
• Clear the memory and free the resources

Page 51: Intro2 Cuda Moayad

Basic OpenCL Program Structure

• OpenCL kernel
• Host program containing:
 – a. Device context
 – b. Command queue
 – c. Memory objects
 – d. OpenCL program
 – e. Kernel memory arguments

Page 52: Intro2 Cuda Moayad

Creating the Kernel

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

const char* OpenCLSource[] = {
    "__kernel void VectorAdd(__global int* c, __global int* a, \n",
    "                        __global int* b) \n",
    "{ \n",
    "    unsigned int n = get_global_id(0); \n",
    "    c[n] = a[n] + b[n]; \n",
    "} \n"
};

Page 53: Intro2 Cuda Moayad

Creating the Kernel

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

const char* OpenCLSource[] = {
    "__kernel void VectorAdd(__global int* c, __global int* a, \n",
    "                        __global int* b) \n",
    "{ \n",
    "    unsigned int n = get_global_id(0); \n",
    "    c[n] = a[n] + b[n]; \n",
    "} \n"
};

Notice that the whole kernel is stored here as an array of char strings:

const char* OpenCLSource[] = {

Page 54: Intro2 Cuda Moayad

Creating the Kernel

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

const char* OpenCLSource[] = {
    "__kernel void VectorAdd(__global int* c, __global int* a, \n",
    "                        __global int* b) \n",
    "{ \n",
    "    unsigned int n = get_global_id(0); \n",
    "    c[n] = a[n] + b[n]; \n",
    "} \n"
};

get_global_id() is a built-in function, used instead of calculating the global ID as in CUDA.

The __kernel keyword is equivalent to __global__ in CUDA. The pointer parameters need to be defined as __global, while you don't need that in CUDA.

    "__kernel void VectorAdd(__global int* c, __global int* a, \n",
    "                        __global int* b) \n",

    "    unsigned int n = get_global_id(0); \n",
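For readability, here is the same kernel unescaped from the string array above and written as plain OpenCL C:

__kernel void VectorAdd(__global int* c, __global int* a,
                        __global int* b)
{
    unsigned int n = get_global_id(0);
    c[n] = a[n] + b[n];
}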

Page 55: Intro2 Cuda Moayad

Initializing data

int InitialData1[12] = {62, 48, 20, -53, 39, 83, 19, 47, 13, 88, 38, -92};
int InitialData2[12] = {-49, 29, 38, 10, 37, 46, -12, 86, 17, 83, -22, 94};

#define SIZE 2048

Page 56: Intro2 Cuda Moayad

Creating the main function

int main (int argc, char **argv)
{
    int HostVector1[SIZE];
    int HostVector2[SIZE];

    for (int c = 0; c < SIZE; c++) {
        HostVector1[c] = InitialData1[c % 12];
        HostVector2[c] = InitialData2[c % 12];
    }

Page 57: Intro2 Cuda Moayad

Creating the context

cl_context GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);

cl_context clCreateContextFromType(
    cl_context_properties *properties,
    cl_device_type device_type,
    void (*pfn_notify)(const char *errinfo, const void *private_info, size_t cb, void *user_data),
    void *user_data,
    cl_int *errcode_ret)

Page 58: Intro2 Cuda Moayad

Creating the context

cl_context GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);

cl_context clCreateContextFromType(
    cl_context_properties *properties,
    cl_device_type device_type,
    void (*pfn_notify)(const char *errinfo, const void *private_info, size_t cb, void *user_data),
    void *user_data,
    cl_int *errcode_ret)

You can also use CL_DEVICE_TYPE_CPU.
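The NULL passed as the last argument discards the error code. A slightly more defensive variant (an assumed addition, not on the slide) captures and checks it:

cl_int err;
cl_context GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU,
                                                NULL, NULL, &err);
if (err != CL_SUCCESS) {
    fprintf(stderr, "clCreateContextFromType failed (%d)\n", err);
    return 1;
}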

Page 59: Intro2 Cuda Moayad

Query compute devices

size_t ParmDataBytes;
clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);

cl_int clGetContextInfo(
    cl_context context,
    cl_context_info param_name,
    size_t param_value_size,
    void *param_value,
    size_t *param_value_size_ret)

param_name: CL_CONTEXT_REFERENCE_COUNT, CL_CONTEXT_DEVICES, CL_CONTEXT_PROPERTIES

Page 60: Intro2 Cuda Moayad

Query compute devices

cl_device_id* GPUDevices = (cl_device_id*) malloc(ParmDataBytes);
clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL);

cl_int clGetContextInfo(
    cl_context context,
    cl_context_info param_name,
    size_t param_value_size,
    void *param_value,
    size_t *param_value_size_ret)

param_name: CL_CONTEXT_REFERENCE_COUNT, CL_CONTEXT_DEVICES, CL_CONTEXT_PROPERTIES

Page 61: Intro2 Cuda Moayad

Command queue

cl_command_queue GPUCommandQueue = clCreateCommandQueue(GPUContext, GPUDevices[0], 0, NULL);

cl_command_queue clCreateCommandQueue(
    cl_context context,
    cl_device_id device,
    cl_command_queue_properties properties,
    cl_int *errcode_ret)

properties: CL_QUEUE_PROFILING_ENABLE, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE

Page 62: Intro2 Cuda Moayad

Allocating the Memory

cl_mem GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE, HostVector1, NULL);

cl_mem clCreateBuffer(
    cl_context context,
    cl_mem_flags flags,
    size_t size,
    void *host_ptr,
    cl_int *errcode_ret)

flags: CL_MEM_READ_WRITE, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY, CL_MEM_USE_HOST_PTR, CL_MEM_ALLOC_HOST_PTR, CL_MEM_COPY_HOST_PTR

Page 63: Intro2 Cuda Moayad

Allocating the Memory

cl_mem GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE, HostVector2, NULL);

cl_mem GPUOutputVector;
GPUOutputVector = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, sizeof(int) * SIZE, NULL, NULL);

Page 64: Intro2 Cuda Moayad

Creating the program

cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 6, OpenCLSource, NULL, NULL);

cl_program clCreateProgramWithSource(
    cl_context context,
    cl_uint count,
    const char **strings,
    const size_t *lengths,
    cl_int *errcode_ret)

The count (6 here) is the number of strings in the OpenCLSource array.

Page 65: Intro2 Cuda Moayad

Creating the program

clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);

cl_int clBuildProgram(
    cl_program program,
    cl_uint num_devices,
    const cl_device_id *device_list,
    const char *options,
    void (*pfn_notify)(cl_program program, void *user_data),
    void *user_data)

Page 66: Intro2 Cuda Moayad

Creating the program

cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram, "VectorAdd", NULL);

cl_kernel clCreateKernel(
    cl_program program,
    const char *kernel_name,
    cl_int *errcode_ret)

Page 67: Intro2 Cuda Moayad

Matching the GPU memory with the Kernel

clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*) &GPUOutputVector);

cl_int clSetKernelArg(
    cl_kernel kernel,
    cl_uint arg_index,
    size_t arg_size,
    const void *arg_value)

Page 68: Intro2 Cuda Moayad

Matching the GPU memory with the Kernel

clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*) &GPUVector1);
clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*) &GPUVector2);

Page 69: Intro2 Cuda Moayad

Launching the Kernel

size_t WorkSize[1] = {SIZE};
clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, WorkSize, NULL, 0, NULL, NULL);

cl_int clEnqueueNDRangeKernel(
    cl_command_queue command_queue,
    cl_kernel kernel,
    cl_uint work_dim,
    const size_t *global_work_offset,
    const size_t *global_work_size,
    const size_t *local_work_size,
    cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list,
    cl_event *event)

Passing NULL for local_work_size lets the OpenCL implementation choose the work-group size.

Page 70: Intro2 Cuda Moayad

Copying the output to the host memory

int HostOutputVector[SIZE];
clEnqueueReadBuffer(GPUCommandQueue, GPUOutputVector, CL_TRUE, 0, SIZE * sizeof(int), HostOutputVector, 0, NULL, NULL);

cl_int clEnqueueReadBuffer(
    cl_command_queue command_queue,
    cl_mem buffer,
    cl_bool blocking_read,
    size_t offset,
    size_t cb,
    void *ptr,
    cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list,
    cl_event *event)

Page 71: Intro2 Cuda Moayad

Cleaning the GPU device

clReleaseMemObject(GPUVector1);
clReleaseMemObject(GPUVector2);
clReleaseMemObject(GPUOutputVector);

// the kernel, program, queue, and context should also be released
clReleaseKernel(OpenCLVectorAdd);
clReleaseProgram(OpenCLProgram);
clReleaseCommandQueue(GPUCommandQueue);
clReleaseContext(GPUContext);

free(GPUDevices);

for (int c = 0; c < 305; c++)
    printf("%c", (char) HostOutputVector[c]);

return 0;
}

Page 72: Intro2 Cuda Moayad

What you learned

• Writing an OpenCL kernel
• Writing an OpenCL application:
 – Setting the context
 – Preparing the command queue
 – Setting the memory objects
 – Setting the program
 – Setting the kernel and the arguments

Page 73: Intro2 Cuda Moayad

Sources and additional resources

• Jason Sanders, "Introduction to CUDA" book and GTC presentation
• OpenCL specification document
• NVIDIA CUDA programming guide
• NVIDIA OpenCL getting started guide
• Videos from GTC '10 at:
 http://www.nvidia.com/object/gtc2010-presentation-archive.html#session2131