commit
3b978b8e0b
5 changed files with 367 additions and 0 deletions
-
1.gitignore
-
6Makefile
-
20README.md
-
4README.md.old
-
336arrayadd.c
@ -0,0 +1 @@ |
|||||
|
arrayadd |
||||
@ -0,0 +1,6 @@ |
|||||
|
# Build rules for the OpenCL array-add example.
#
# CFLAGS  - compiler flags (C99 plus the OpenCL header search path)
# LDFLAGS - linker search path for the beignet OpenCL library
# LIBS    - libraries to link; kept AFTER the source file on the command
#           line because order-sensitive linkers (e.g. with --as-needed)
#           discard libraries that precede the objects which need them.
CC = cc
CFLAGS += -std=c99 -I/usr/local/include
LDFLAGS += -L/usr/local/lib64/beignet
LIBS += -lcl

.PHONY: all

all: arrayadd

arrayadd: arrayadd.c
	$(CC) -O2 -march=native $(CFLAGS) $(LDFLAGS) -o $@ $< $(LIBS)
||||
@ -0,0 +1,20 @@ |
|||||
|
# OpenCL array add. |
||||
|
|
||||
|
Taken from |
||||
|
[here](http://www.heterogeneouscompute.org/wordpress/wp-content/uploads/2011/06/Chapter2.txt). |
||||
|
|
||||
|
## Description |
||||
|
|
||||
|
This is an example of how to implement an array add with OpenCL. |
||||
|
|
||||
|
## Requirements |
||||
|
|
||||
|
Some OpenCL-capable hardware and the corresponding OpenCL library exposing the |
||||
|
OpenCL API. I tested this on an Intel GPU (Intel Corporation Haswell-ULT |
||||
|
Integrated Graphics Controller (rev 09)) with the |
||||
|
[beignet](https://www.freedesktop.org/wiki/Software/Beignet/) |
||||
|
open source library. |
||||
|
|
||||
|
## License |
||||
|
|
||||
|
unknown |
||||
@ -0,0 +1,4 @@ |
|||||
|
OpenCL tutorial notes |
||||
|
===================== |
||||
|
|
||||
|
URL: http://www.heterogeneouscompute.org/wordpress/wp-content/uploads/2011/06/Chapter2.txt |
||||
@ -0,0 +1,336 @@ |
|||||
|
// This program implements a vector addition using OpenCL |
||||
|
|
||||
|
// System includes |
||||
|
#include <stdio.h> |
||||
|
#include <stdlib.h> |
||||
|
|
||||
|
// OpenCL includes |
||||
|
#include <CL/cl.h> |
||||
|
|
||||
|
// OpenCL kernel to perform an element-wise add of two arrays |
||||
|
// Source text of the OpenCL kernel "vecadd", kept as one NUL-terminated
// string so it can be handed to clCreateProgramWithSource(). Each work-item
// adds one pair of elements: C[idx] = A[idx] + B[idx].
const char *programSource =
    "__kernel \n"
    "void vecadd(__global int *A, \n"
    " __global int *B, \n"
    " __global int *C) \n"
    "{ \n"
    " \n"
    " // Get the work-item’s unique ID \n"
    " int idx = get_global_id(0); \n"
    " \n"
    " // Add the corresponding locations of \n"
    " // 'A' and 'B', and store the result in 'C'. \n"
    " C[idx] = A[idx] + B[idx]; \n"
    "} \n";
||||
|
|
||||
|
// Boolean type for the host code. The standard C99 header is used instead of
// a hand-rolled enum: an enum named bool/true/false fails to compile as soon
// as <stdbool.h> is also included (or under C23, where these are keywords),
// and unlike _Bool it does not normalise non-zero values to true.
#include <stdbool.h>
||||
|
|
||||
|
int main() { |
||||
|
// This code executes on the OpenCL host |
||||
|
|
||||
|
// Host data |
||||
|
int *A = NULL; // Input array |
||||
|
int *B = NULL; // Input array |
||||
|
int *C = NULL; // Output array |
||||
|
|
||||
|
// Elements in each array |
||||
|
const int elements = 2048; |
||||
|
|
||||
|
// Compute the size of the data |
||||
|
size_t datasize = sizeof(int)*elements; |
||||
|
|
||||
|
// Allocate space for input/output data |
||||
|
A = (int*)malloc(datasize); |
||||
|
B = (int*)malloc(datasize); |
||||
|
C = (int*)malloc(datasize); |
||||
|
// Initialize the input data |
||||
|
for(int i = 0; i < elements; i++) { |
||||
|
A[i] = i; |
||||
|
B[i] = i; |
||||
|
} |
||||
|
|
||||
|
// Use this to check the output of each API call |
||||
|
cl_int status; |
||||
|
|
||||
|
//----------------------------------------------------- |
||||
|
// STEP 1: Discover and initialize the platforms |
||||
|
//----------------------------------------------------- |
||||
|
|
||||
|
cl_uint numPlatforms = 0; |
||||
|
cl_platform_id *platforms = NULL; |
||||
|
|
||||
|
// Use clGetPlatformIDs() to retrieve the number of platforms |
||||
|
status = clGetPlatformIDs(0, NULL, &numPlatforms); |
||||
|
|
||||
|
// Allocate enough space for each platform |
||||
|
platforms = |
||||
|
(cl_platform_id*)malloc( |
||||
|
numPlatforms*sizeof(cl_platform_id)); |
||||
|
|
||||
|
// Fill in platforms with clGetPlatformIDs() |
||||
|
status = clGetPlatformIDs(numPlatforms, platforms, |
||||
|
NULL); |
||||
|
|
||||
|
//----------------------------------------------------- |
||||
|
// STEP 2: Discover and initialize the devices |
||||
|
//----------------------------------------------------- |
||||
|
|
||||
|
cl_uint numDevices = 0; |
||||
|
cl_device_id *devices = NULL; |
||||
|
|
||||
|
// Use clGetDeviceIDs() to retrieve the number of |
||||
|
// devices present |
||||
|
status = clGetDeviceIDs( |
||||
|
platforms[0], |
||||
|
CL_DEVICE_TYPE_ALL, |
||||
|
0, |
||||
|
NULL, |
||||
|
&numDevices); |
||||
|
|
||||
|
// Allocate enough space for each device |
||||
|
devices = |
||||
|
(cl_device_id*)malloc( |
||||
|
numDevices*sizeof(cl_device_id)); |
||||
|
|
||||
|
// Fill in devices with clGetDeviceIDs() |
||||
|
status = clGetDeviceIDs( |
||||
|
platforms[0], |
||||
|
CL_DEVICE_TYPE_ALL, |
||||
|
numDevices, |
||||
|
devices, |
||||
|
NULL); |
||||
|
|
||||
|
//----------------------------------------------------- |
||||
|
// STEP 3: Create a context |
||||
|
//----------------------------------------------------- |
||||
|
|
||||
|
cl_context context = NULL; |
||||
|
|
||||
|
// Create a context using clCreateContext() and |
||||
|
// associate it with the devices |
||||
|
context = clCreateContext( |
||||
|
NULL, |
||||
|
numDevices, |
||||
|
devices, |
||||
|
NULL, |
||||
|
NULL, |
||||
|
&status); |
||||
|
|
||||
|
//----------------------------------------------------- |
||||
|
// STEP 4: Create a command queue |
||||
|
//----------------------------------------------------- |
||||
|
|
||||
|
cl_command_queue cmdQueue; |
||||
|
|
||||
|
// Create a command queue using clCreateCommandQueue(), |
||||
|
// and associate it with the device you want to execute |
||||
|
// on |
||||
|
cmdQueue = clCreateCommandQueue( |
||||
|
context, |
||||
|
devices[0], |
||||
|
0, |
||||
|
&status); |
||||
|
|
||||
|
//----------------------------------------------------- |
||||
|
// STEP 5: Create device buffers |
||||
|
//----------------------------------------------------- |
||||
|
|
||||
|
cl_mem bufferA; // Input array on the device |
||||
|
cl_mem bufferB; // Input array on the device |
||||
|
cl_mem bufferC; // Output array on the device |
||||
|
|
||||
|
// Use clCreateBuffer() to create a buffer object (d_A) |
||||
|
// that will contain the data from the host array A |
||||
|
bufferA = clCreateBuffer( |
||||
|
context, |
||||
|
CL_MEM_READ_ONLY, |
||||
|
datasize, |
||||
|
NULL, |
||||
|
&status); |
||||
|
|
||||
|
// Use clCreateBuffer() to create a buffer object (d_B) |
||||
|
// that will contain the data from the host array B |
||||
|
bufferB = clCreateBuffer( |
||||
|
context, |
||||
|
CL_MEM_READ_ONLY, |
||||
|
datasize, |
||||
|
NULL, |
||||
|
&status); |
||||
|
|
||||
|
// Use clCreateBuffer() to create a buffer object (d_C) |
||||
|
// with enough space to hold the output data |
||||
|
bufferC = clCreateBuffer( |
||||
|
context, |
||||
|
CL_MEM_WRITE_ONLY, |
||||
|
datasize, |
||||
|
NULL, |
||||
|
&status); |
||||
|
|
||||
|
//----------------------------------------------------- |
||||
|
// STEP 6: Write host data to device buffers |
||||
|
//----------------------------------------------------- |
||||
|
|
||||
|
// Use clEnqueueWriteBuffer() to write input array A to |
||||
|
// the device buffer bufferA |
||||
|
status = clEnqueueWriteBuffer( |
||||
|
cmdQueue, |
||||
|
bufferA, |
||||
|
CL_FALSE, |
||||
|
0, |
||||
|
datasize, |
||||
|
A, |
||||
|
0, |
||||
|
NULL, |
||||
|
NULL); |
||||
|
|
||||
|
// Use clEnqueueWriteBuffer() to write input array B to |
||||
|
// the device buffer bufferB |
||||
|
status = clEnqueueWriteBuffer( |
||||
|
cmdQueue, |
||||
|
bufferB, |
||||
|
CL_FALSE, |
||||
|
0, |
||||
|
datasize, |
||||
|
B, |
||||
|
0, |
||||
|
NULL, |
||||
|
NULL); |
||||
|
|
||||
|
//----------------------------------------------------- |
||||
|
// STEP 7: Create and compile the program |
||||
|
//----------------------------------------------------- |
||||
|
|
||||
|
// Create a program using clCreateProgramWithSource() |
||||
|
cl_program program = clCreateProgramWithSource( |
||||
|
context, |
||||
|
1, |
||||
|
(const char**)&programSource, |
||||
|
NULL, |
||||
|
&status); |
||||
|
|
||||
|
// Build (compile) the program for the devices with |
||||
|
// clBuildProgram() |
||||
|
status = clBuildProgram( |
||||
|
program, |
||||
|
numDevices, |
||||
|
devices, |
||||
|
NULL, |
||||
|
NULL, |
||||
|
NULL); |
||||
|
|
||||
|
//----------------------------------------------------- |
||||
|
// STEP 8: Create the kernel |
||||
|
//----------------------------------------------------- |
||||
|
|
||||
|
cl_kernel kernel = NULL; |
||||
|
|
||||
|
// Use clCreateKernel() to create a kernel from the |
||||
|
// vector addition function (named "vecadd") |
||||
|
kernel = clCreateKernel(program, "vecadd", &status); |
||||
|
|
||||
|
//----------------------------------------------------- |
||||
|
// STEP 9: Set the kernel arguments |
||||
|
//----------------------------------------------------- |
||||
|
|
||||
|
// Associate the input and output buffers with the |
||||
|
// kernel |
||||
|
// using clSetKernelArg() |
||||
|
status = clSetKernelArg( |
||||
|
kernel, |
||||
|
0, |
||||
|
sizeof(cl_mem), |
||||
|
&bufferA); |
||||
|
status |= clSetKernelArg( |
||||
|
kernel, |
||||
|
1, |
||||
|
sizeof(cl_mem), |
||||
|
&bufferB); |
||||
|
status |= clSetKernelArg( |
||||
|
kernel, |
||||
|
2, |
||||
|
sizeof(cl_mem), |
||||
|
&bufferC); |
||||
|
|
||||
|
//----------------------------------------------------- |
||||
|
// STEP 10: Configure the work-item structure |
||||
|
//----------------------------------------------------- |
||||
|
|
||||
|
// Define an index space (global work size) of work items for |
||||
|
// execution. A workgroup size (local work size) is not required, |
||||
|
// but can be used. |
||||
|
size_t globalWorkSize[1]; |
||||
|
// There are 'elements' work-items |
||||
|
globalWorkSize[0] = elements; |
||||
|
|
||||
|
//----------------------------------------------------- |
||||
|
// STEP 11: Enqueue the kernel for execution |
||||
|
//----------------------------------------------------- |
||||
|
|
||||
|
// Execute the kernel by using clEnqueueNDRangeKernel(). |
||||
|
// 'globalWorkSize' is the 1D dimension of the work-items |
||||
|
status = clEnqueueNDRangeKernel( |
||||
|
cmdQueue, |
||||
|
kernel, |
||||
|
1, |
||||
|
NULL, |
||||
|
globalWorkSize, |
||||
|
NULL, |
||||
|
0, |
||||
|
NULL, |
||||
|
NULL); |
||||
|
|
||||
|
//----------------------------------------------------- |
||||
|
// STEP 12: Read the output buffer back to the host |
||||
|
//----------------------------------------------------- |
||||
|
|
||||
|
// Use clEnqueueReadBuffer() to read the OpenCL output |
||||
|
// buffer (bufferC) |
||||
|
// to the host output array (C) |
||||
|
clEnqueueReadBuffer( |
||||
|
cmdQueue, |
||||
|
bufferC, |
||||
|
CL_TRUE, |
||||
|
0, |
||||
|
datasize, |
||||
|
C, |
||||
|
0, |
||||
|
NULL, |
||||
|
NULL); |
||||
|
|
||||
|
// Verify the output |
||||
|
bool result = true; |
||||
|
for(int i = 0; i < elements; i++) { |
||||
|
if(C[i] != i+i) { |
||||
|
result = false; |
||||
|
break; |
||||
|
} |
||||
|
} |
||||
|
if(result) { |
||||
|
printf("Output is correct\n"); |
||||
|
} else { |
||||
|
printf("Output is incorrect\n"); |
||||
|
} |
||||
|
|
||||
|
//----------------------------------------------------- |
||||
|
// STEP 13: Release OpenCL resources |
||||
|
//----------------------------------------------------- |
||||
|
|
||||
|
// Free OpenCL resources |
||||
|
clReleaseKernel(kernel); |
||||
|
clReleaseProgram(program); |
||||
|
clReleaseCommandQueue(cmdQueue); |
||||
|
clReleaseMemObject(bufferA); |
||||
|
clReleaseMemObject(bufferB); |
||||
|
clReleaseMemObject(bufferC); |
||||
|
clReleaseContext(context); |
||||
|
|
||||
|
// Free host resources |
||||
|
free(A); |
||||
|
free(B); |
||||
|
free(C); |
||||
|
free(platforms); |
||||
|
free(devices); |
||||
|
} |
||||
|
|
||||
|
// vim: ft=c ts=4 sw=4: |
||||
Write
Preview
Loading…
Cancel
Save
Reference in new issue