commit
3b978b8e0b
5 changed files with 367 additions and 0 deletions
-
1.gitignore
-
6Makefile
-
20README.md
-
4README.md.old
-
336arrayadd.c
@ -0,0 +1 @@ |
|||
arrayadd |
|||
@ -0,0 +1,6 @@ |
|||
# Build rules for the OpenCL array-add example.
# The include and library paths below match the locations used in this
# repository's setup (/usr/local, beignet); adjust them for other installs.
CFLAGS += -std=c99 -I/usr/local/include
LIBS += -lcl -L/usr/local/lib64/beignet
CC = cc

# 'all' is a command name, not a file, so it must never be considered up to date.
.PHONY: all
all: arrayadd

# Libraries go AFTER the source file: linkers that resolve symbols left to
# right (or use --as-needed) drop a library that is named before the object
# that references it, which leaves the OpenCL symbols undefined.
arrayadd: arrayadd.c
	$(CC) -O2 -march=native $(CFLAGS) -o arrayadd arrayadd.c $(LIBS)
|||
@ -0,0 +1,20 @@ |
|||
# OpenCL array add. |
|||
|
|||
Taken from |
|||
[here](http://www.heterogeneouscompute.org/wordpress/wp-content/uploads/2011/06/Chapter2.txt). |
|||
|
|||
## Description |
|||
|
|||
This is an example of how to implement element-wise array addition with OpenCL. |
|||
|
|||
## Requirements |
|||
|
|||
Some OpenCL-capable hardware and the corresponding OpenCL library exposing the |
|||
OpenCL API. I tested this on an Intel GPU (Intel Corporation Haswell-ULT |
|||
Integrated Graphics Controller (rev 09)) with the |
|||
[beignet](https://www.freedesktop.org/wiki/Software/Beignet/) |
|||
open source library. |
|||
|
|||
## License |
|||
|
|||
unknown |
|||
@ -0,0 +1,4 @@ |
|||
OpenCL tutorial notes |
|||
===================== |
|||
|
|||
URL: http://www.heterogeneouscompute.org/wordpress/wp-content/uploads/2011/06/Chapter2.txt |
|||
@ -0,0 +1,336 @@ |
|||
// This program implements a vector addition using OpenCL |
|||
|
|||
// System includes |
|||
#include <stdio.h> |
|||
#include <stdlib.h> |
|||
|
|||
// OpenCL includes |
|||
#include <CL/cl.h> |
|||
|
|||
// OpenCL kernel to perform an element-wise add of two arrays |
|||
// Source text of the OpenCL program, handed to the OpenCL runtime as a
// string. It defines one kernel, "vecadd", in which every work-item adds one
// pair of elements: C[idx] = A[idx] + B[idx], with idx taken from
// get_global_id(0).
const char *programSource =
    "__kernel \n"
    "void vecadd(__global int *A, \n"
    " __global int *B, \n"
    " __global int *C) \n"
    "{ \n"
    " \n"
    " // Get the work-item’s unique ID \n"
    " int idx = get_global_id(0); \n"
    " \n"
    " // Add the corresponding locations of \n"
    " // 'A' and 'B', and store the result in 'C'. \n"
    " C[idx] = A[idx] + B[idx]; \n"
    "} \n";
|||
|
|||
// Hand-rolled boolean type: false is 0 and true is 1.
typedef enum {
    false = 0,
    true = 1
} bool;
|||
|
|||
int main() { |
|||
// This code executes on the OpenCL host |
|||
|
|||
// Host data |
|||
int *A = NULL; // Input array |
|||
int *B = NULL; // Input array |
|||
int *C = NULL; // Output array |
|||
|
|||
// Elements in each array |
|||
const int elements = 2048; |
|||
|
|||
// Compute the size of the data |
|||
size_t datasize = sizeof(int)*elements; |
|||
|
|||
// Allocate space for input/output data |
|||
A = (int*)malloc(datasize); |
|||
B = (int*)malloc(datasize); |
|||
C = (int*)malloc(datasize); |
|||
// Initialize the input data |
|||
for(int i = 0; i < elements; i++) { |
|||
A[i] = i; |
|||
B[i] = i; |
|||
} |
|||
|
|||
// Use this to check the output of each API call |
|||
cl_int status; |
|||
|
|||
//----------------------------------------------------- |
|||
// STEP 1: Discover and initialize the platforms |
|||
//----------------------------------------------------- |
|||
|
|||
cl_uint numPlatforms = 0; |
|||
cl_platform_id *platforms = NULL; |
|||
|
|||
// Use clGetPlatformIDs() to retrieve the number of platforms |
|||
status = clGetPlatformIDs(0, NULL, &numPlatforms); |
|||
|
|||
// Allocate enough space for each platform |
|||
platforms = |
|||
(cl_platform_id*)malloc( |
|||
numPlatforms*sizeof(cl_platform_id)); |
|||
|
|||
// Fill in platforms with clGetPlatformIDs() |
|||
status = clGetPlatformIDs(numPlatforms, platforms, |
|||
NULL); |
|||
|
|||
//----------------------------------------------------- |
|||
// STEP 2: Discover and initialize the devices |
|||
//----------------------------------------------------- |
|||
|
|||
cl_uint numDevices = 0; |
|||
cl_device_id *devices = NULL; |
|||
|
|||
// Use clGetDeviceIDs() to retrieve the number of |
|||
// devices present |
|||
status = clGetDeviceIDs( |
|||
platforms[0], |
|||
CL_DEVICE_TYPE_ALL, |
|||
0, |
|||
NULL, |
|||
&numDevices); |
|||
|
|||
// Allocate enough space for each device |
|||
devices = |
|||
(cl_device_id*)malloc( |
|||
numDevices*sizeof(cl_device_id)); |
|||
|
|||
// Fill in devices with clGetDeviceIDs() |
|||
status = clGetDeviceIDs( |
|||
platforms[0], |
|||
CL_DEVICE_TYPE_ALL, |
|||
numDevices, |
|||
devices, |
|||
NULL); |
|||
|
|||
//----------------------------------------------------- |
|||
// STEP 3: Create a context |
|||
//----------------------------------------------------- |
|||
|
|||
cl_context context = NULL; |
|||
|
|||
// Create a context using clCreateContext() and |
|||
// associate it with the devices |
|||
context = clCreateContext( |
|||
NULL, |
|||
numDevices, |
|||
devices, |
|||
NULL, |
|||
NULL, |
|||
&status); |
|||
|
|||
//----------------------------------------------------- |
|||
// STEP 4: Create a command queue |
|||
//----------------------------------------------------- |
|||
|
|||
cl_command_queue cmdQueue; |
|||
|
|||
// Create a command queue using clCreateCommandQueue(), |
|||
// and associate it with the device you want to execute |
|||
// on |
|||
cmdQueue = clCreateCommandQueue( |
|||
context, |
|||
devices[0], |
|||
0, |
|||
&status); |
|||
|
|||
//----------------------------------------------------- |
|||
// STEP 5: Create device buffers |
|||
//----------------------------------------------------- |
|||
|
|||
cl_mem bufferA; // Input array on the device |
|||
cl_mem bufferB; // Input array on the device |
|||
cl_mem bufferC; // Output array on the device |
|||
|
|||
// Use clCreateBuffer() to create a buffer object (d_A) |
|||
// that will contain the data from the host array A |
|||
bufferA = clCreateBuffer( |
|||
context, |
|||
CL_MEM_READ_ONLY, |
|||
datasize, |
|||
NULL, |
|||
&status); |
|||
|
|||
// Use clCreateBuffer() to create a buffer object (d_B) |
|||
// that will contain the data from the host array B |
|||
bufferB = clCreateBuffer( |
|||
context, |
|||
CL_MEM_READ_ONLY, |
|||
datasize, |
|||
NULL, |
|||
&status); |
|||
|
|||
// Use clCreateBuffer() to create a buffer object (d_C) |
|||
// with enough space to hold the output data |
|||
bufferC = clCreateBuffer( |
|||
context, |
|||
CL_MEM_WRITE_ONLY, |
|||
datasize, |
|||
NULL, |
|||
&status); |
|||
|
|||
//----------------------------------------------------- |
|||
// STEP 6: Write host data to device buffers |
|||
//----------------------------------------------------- |
|||
|
|||
// Use clEnqueueWriteBuffer() to write input array A to |
|||
// the device buffer bufferA |
|||
status = clEnqueueWriteBuffer( |
|||
cmdQueue, |
|||
bufferA, |
|||
CL_FALSE, |
|||
0, |
|||
datasize, |
|||
A, |
|||
0, |
|||
NULL, |
|||
NULL); |
|||
|
|||
// Use clEnqueueWriteBuffer() to write input array B to |
|||
// the device buffer bufferB |
|||
status = clEnqueueWriteBuffer( |
|||
cmdQueue, |
|||
bufferB, |
|||
CL_FALSE, |
|||
0, |
|||
datasize, |
|||
B, |
|||
0, |
|||
NULL, |
|||
NULL); |
|||
|
|||
//----------------------------------------------------- |
|||
// STEP 7: Create and compile the program |
|||
//----------------------------------------------------- |
|||
|
|||
// Create a program using clCreateProgramWithSource() |
|||
cl_program program = clCreateProgramWithSource( |
|||
context, |
|||
1, |
|||
(const char**)&programSource, |
|||
NULL, |
|||
&status); |
|||
|
|||
// Build (compile) the program for the devices with |
|||
// clBuildProgram() |
|||
status = clBuildProgram( |
|||
program, |
|||
numDevices, |
|||
devices, |
|||
NULL, |
|||
NULL, |
|||
NULL); |
|||
|
|||
//----------------------------------------------------- |
|||
// STEP 8: Create the kernel |
|||
//----------------------------------------------------- |
|||
|
|||
cl_kernel kernel = NULL; |
|||
|
|||
// Use clCreateKernel() to create a kernel from the |
|||
// vector addition function (named "vecadd") |
|||
kernel = clCreateKernel(program, "vecadd", &status); |
|||
|
|||
//----------------------------------------------------- |
|||
// STEP 9: Set the kernel arguments |
|||
//----------------------------------------------------- |
|||
|
|||
// Associate the input and output buffers with the |
|||
// kernel |
|||
// using clSetKernelArg() |
|||
status = clSetKernelArg( |
|||
kernel, |
|||
0, |
|||
sizeof(cl_mem), |
|||
&bufferA); |
|||
status |= clSetKernelArg( |
|||
kernel, |
|||
1, |
|||
sizeof(cl_mem), |
|||
&bufferB); |
|||
status |= clSetKernelArg( |
|||
kernel, |
|||
2, |
|||
sizeof(cl_mem), |
|||
&bufferC); |
|||
|
|||
//----------------------------------------------------- |
|||
// STEP 10: Configure the work-item structure |
|||
//----------------------------------------------------- |
|||
|
|||
// Define an index space (global work size) of work items for |
|||
// execution. A workgroup size (local work size) is not required, |
|||
// but can be used. |
|||
size_t globalWorkSize[1]; |
|||
// There are 'elements' work-items |
|||
globalWorkSize[0] = elements; |
|||
|
|||
//----------------------------------------------------- |
|||
// STEP 11: Enqueue the kernel for execution |
|||
//----------------------------------------------------- |
|||
|
|||
// Execute the kernel by using clEnqueueNDRangeKernel(). |
|||
// 'globalWorkSize' is the 1D dimension of the work-items |
|||
status = clEnqueueNDRangeKernel( |
|||
cmdQueue, |
|||
kernel, |
|||
1, |
|||
NULL, |
|||
globalWorkSize, |
|||
NULL, |
|||
0, |
|||
NULL, |
|||
NULL); |
|||
|
|||
//----------------------------------------------------- |
|||
// STEP 12: Read the output buffer back to the host |
|||
//----------------------------------------------------- |
|||
|
|||
// Use clEnqueueReadBuffer() to read the OpenCL output |
|||
// buffer (bufferC) |
|||
// to the host output array (C) |
|||
clEnqueueReadBuffer( |
|||
cmdQueue, |
|||
bufferC, |
|||
CL_TRUE, |
|||
0, |
|||
datasize, |
|||
C, |
|||
0, |
|||
NULL, |
|||
NULL); |
|||
|
|||
// Verify the output |
|||
bool result = true; |
|||
for(int i = 0; i < elements; i++) { |
|||
if(C[i] != i+i) { |
|||
result = false; |
|||
break; |
|||
} |
|||
} |
|||
if(result) { |
|||
printf("Output is correct\n"); |
|||
} else { |
|||
printf("Output is incorrect\n"); |
|||
} |
|||
|
|||
//----------------------------------------------------- |
|||
// STEP 13: Release OpenCL resources |
|||
//----------------------------------------------------- |
|||
|
|||
// Free OpenCL resources |
|||
clReleaseKernel(kernel); |
|||
clReleaseProgram(program); |
|||
clReleaseCommandQueue(cmdQueue); |
|||
clReleaseMemObject(bufferA); |
|||
clReleaseMemObject(bufferB); |
|||
clReleaseMemObject(bufferC); |
|||
clReleaseContext(context); |
|||
|
|||
// Free host resources |
|||
free(A); |
|||
free(B); |
|||
free(C); |
|||
free(platforms); |
|||
free(devices); |
|||
} |
|||
|
|||
// vim: ft=c ts=4 sw=4: |
|||
Write
Preview
Loading…
Cancel
Save
Reference in new issue