/* StarPU --- Runtime system for heterogeneous multicore architectures. * * Copyright (C) 2010-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria * * StarPU is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or (at * your option) any later version. * * StarPU is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * See the GNU Lesser General Public License in COPYING.LGPL for more details. */ #include #include #include #include #ifdef __APPLE_CC__ #include #else #include #endif #define error(...) do { fprintf(stderr, "Error: " __VA_ARGS__); exit(EXIT_FAILURE); } while(0) #define check(err, str) do { if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): %s\n",err, str); exit(EXIT_FAILURE); }} while(0) #ifdef UNUSED #elif defined(__GNUC__) # define UNUSED(x) UNUSED_ ## x __attribute__((unused)) #else # define UNUSED(x) x #endif #define SIZE 1024 #define TYPE float #define REALSIZE (SIZE * sizeof(TYPE)) const char * kernel_src = "__kernel void add(__global float*s1, __global float*s2, __global float*d) { \ size_t x = get_global_id(0);\n\ size_t y = get_global_id(1);\n\ size_t w = get_global_size(0); \n\ int idx = y*w+x; \n\ #ifdef SOCL_DEVICE_TYPE_GPU \n\ d[idx] = s1[idx] + s2[idx];\n\ #endif \n\ #ifdef SOCL_DEVICE_TYPE_CPU \n\ d[idx] = s1[idx] + 2* s2[idx];\n\ #endif \n\ #ifdef SOCL_DEVICE_TYPE_ACCELERATOR \n\ d[idx] = s1[idx] + 3 * s2[idx];\n\ #endif \n\ #ifdef SOCL_DEVICE_TYPE_UNKNOWN \n\ d[idx] = s1[idx] + 4 * s2[idx];\n\ #endif \n\ }"; cl_kernel kernel; cl_context context; TYPE s1[SIZE],s2[SIZE],d[SIZE]; typedef cl_int (*split_func_t)(cl_command_queue, cl_uint, cl_uint, const size_t *, const size_t *, const size_t *, const cl_event, cl_event *); void add(cl_command_queue cq, cl_uint size, TYPE * s1, TYPE *s2, TYPE*d, cl_uint num_events, cl_event * events, cl_event *event) { cl_int err; printf("Creating buffers...\n"); cl_mem s1m = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, size * sizeof(TYPE), s1, &err); check(err, "clCreateBuffer s1"); cl_mem s2m = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, size * sizeof(TYPE), s2, &err); check(err, "clCreateBuffer s2"); cl_mem dm = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, size * sizeof(TYPE), d, &err); check(err, "clCreateBuffer d"); err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &s1m); check(err, "clSetKernelArg 0"); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &s2m); check(err, "clSetKernelArg 1"); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &dm); check(err, "clSetKernelArg 2"); printf("Enqueueing NDRangeKernel...\n"); size_t local[3] = {16, 1, 1}; size_t global[3] = {size, 1, 1}; cl_event eventK; err = clEnqueueNDRangeKernel(cq, kernel, 3, NULL, global, local, num_events, events, &eventK); check(err, "clEnqueueNDRangeKernel"); clEnqueueMapBuffer(cq, dm, CL_FALSE, CL_MAP_READ, 0, size * sizeof(TYPE), 1, &eventK, event, &err); check(err, "clEnqueueMapBuffer"); clReleaseMemObject(s1m); clReleaseMemObject(s2m); clReleaseMemObject(dm); } cl_int split_func(cl_command_queue cq, cl_uint split_factor, void * data, cl_event before, cl_event * after) { cl_event evs[split_factor]; printf("Partition with factor %d\n", split_factor); cl_uint size = ((SIZE)/split_factor) - (SIZE/split_factor % 16); cl_uint i; for (i=0; i