Andra Hugo 13 år sedan
förälder
incheckning
9bb769dbf0

+ 1 - 1
doc/chapters/basic-api.texi

@@ -118,7 +118,7 @@ This can also be specified with the @code{STARPU_SINGLE_COMBINED_WORKER} environ
 @item @code{int disable_asynchronous_copy} (default = 0)
 This flag should be set to 1 to disable asynchronous copies between
 CPUs and accelerators. This can also be specified with the
-@code{STARPU_DISABLE_ASYNCHRONOUS_COPY} environment variable.
+@code{DISABLE_STARPU_ASYNCHRONOUS_COPY} environment variable.
 The AMD implementation of OpenCL is known to
 fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.

+ 10 - 0
examples/Makefile.am

@@ -38,6 +38,7 @@ EXTRA_DIST = 					\
 	basic_examples/multiformat_opencl_kernel.cl  \
 	basic_examples/multiformat_conversion_codelets_opencl_kernel.cl \
 	common/blas_model.c			\
+	spmd/vector_scal_spmd.c			\
 	spmv/spmv_cuda.cu			\
 	spmv/spmv_opencl.cl			\
 	gordon/null_kernel_gordon.c		\
@@ -184,6 +185,7 @@ examplebin_PROGRAMS +=				\
 	tag_example/tag_example3		\
 	tag_example/tag_example4		\
 	tag_example/tag_restartable		\
+	spmd/vector_scal_spmd			\
 	spmv/spmv				\
 	callback/callback			\
 	incrementer/incrementer			\
@@ -248,6 +250,7 @@ STARPU_EXAMPLES +=				\
 	tag_example/tag_example3		\
 	tag_example/tag_example4		\
 	tag_example/tag_restartable		\
+	spmd/vector_scal_spmd			\
 	spmv/spmv				\
 	callback/callback			\
 	incrementer/incrementer			\
@@ -666,6 +669,13 @@ cg_cg_LDADD =					\
 endif
 
 ################
+# SPMD example #
+################
+
+spmd_vector_scal_spmd_SOURCES =			\
+	spmd/vector_scal_spmd.c
+
+################
 # SpMV example #
 ################
 

+ 1 - 0
examples/axpy/axpy.c

@@ -21,6 +21,7 @@
 #include <stdio.h>
 #include <assert.h>
 #include <sys/time.h>
+#include <math.h>
 
 #include <common/blas.h>
 

+ 5 - 5
examples/basic_examples/block_opencl.c

@@ -36,11 +36,11 @@ void opencl_codelet(void *descr[], void *_args)
 	cl_event event;
 	int id, devid, err;
 	cl_mem block = (cl_mem)STARPU_BLOCK_GET_DEV_HANDLE(descr[0]);
-	uint32_t nx = (int)STARPU_BLOCK_GET_NX(descr[0]);
-	uint32_t ny = (int)STARPU_BLOCK_GET_NY(descr[0]);
-	uint32_t nz = (int)STARPU_BLOCK_GET_NZ(descr[0]);
-        uint32_t ldy = STARPU_BLOCK_GET_LDY(descr[0]);
-        uint32_t ldz = STARPU_BLOCK_GET_LDZ(descr[0]);
+	int nx = (int)STARPU_BLOCK_GET_NX(descr[0]);
+	int ny = (int)STARPU_BLOCK_GET_NY(descr[0]);
+	int nz = (int)STARPU_BLOCK_GET_NZ(descr[0]);
+        int ldy = (int)STARPU_BLOCK_GET_LDY(descr[0]);
+        int ldz = (int) STARPU_BLOCK_GET_LDZ(descr[0]);
         float *multiplier = (float *)_args;
 
         id = starpu_worker_get_id();

+ 1 - 3
examples/basic_examples/block_opencl_kernel.cl

@@ -14,9 +14,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <stdint.h>
-
-__kernel void block(__global float *b, uint32_t nx, uint32_t ny, uint32_t nz, uint32_t ldy, uint32_t ldz, float multiplier)
+__kernel void block(__global float *b, int nx, int ny, int nz, int ldy, int ldz, float multiplier)
 {
      const int i = get_global_id(0);
      if (i < (nz*ldz)+(ny*ldy)+nx)

+ 146 - 0
examples/spmd/vector_scal_spmd.c

@@ -0,0 +1,146 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* gcc build:
+
+   gcc -O2 -g vector_scal.c -o vector_scal $(pkg-config --cflags starpu-1.0) $(pkg-config --libs starpu-1.0)
+
+ */
+
+#include <starpu.h>
+#include <stdio.h>
+#include <limits.h>
+
+#define MIN(a,b)        ((a)<(b)?(a):(b))
+
+#define	NX	2048000
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
+static int get_first_element_rank(int nel, int rank, int nb_workers)
+{
+	if(rank == 0)
+		return 0;
+
+	/* We get the  number of bigger parts which stand before the part */
+	int nb_big_parts = MIN(nel % nb_workers, rank);
+
+	return nb_big_parts * (nel / nb_workers + 1) + (rank - nb_big_parts) * (nel / nb_workers);
+}
+
+void scal_cpu_func(void *buffers[], void *_args)
+{
+	unsigned i;
+	float *factor = _args, f = *factor;
+	struct starpu_vector_interface *vector = buffers[0];
+	unsigned n = STARPU_VECTOR_GET_NX(vector);
+	float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
+
+	int nb_workers = starpu_combined_worker_get_size();
+	int rank = starpu_combined_worker_get_rank();
+
+	if (rank == 0)
+		FPRINTF(stderr, "running task with %d CPUs.\n", starpu_combined_worker_get_size());
+
+	/* We add 1 to the (nel_total % nb_workers) first workers, thus we get an evenly splitted data. */
+	int nel_worker = (n / nb_workers) + ((rank < (n % nb_workers)) ? 1 : 0);
+
+	int begin = get_first_element_rank(n, rank, nb_workers);
+
+
+	for (i = 0; i < nel_worker; i++) {
+		int rank = i + begin;
+
+		float v = val[rank];
+		int j;
+		for (j = 0; j < 100; j++)
+			v = v * f;
+		val[rank] = v;
+	}
+}
+
+static struct starpu_perfmodel vector_scal_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "vector_scale_parallel"
+};
+
+static struct starpu_codelet cl =
+{
+	.modes = { STARPU_RW },
+	.where = STARPU_CPU,
+	.type = STARPU_SPMD,
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {scal_cpu_func, NULL},
+	.nbuffers = 1,
+	.model = &vector_scal_model,
+};
+
+int main(int argc, char **argv)
+{
+	struct starpu_conf conf;
+	float *vector;
+	unsigned i;
+	int ret;
+
+	vector = malloc(NX*sizeof(*vector));
+
+	for (i = 0; i < NX; i++)
+		vector[i] = (i+1.0f);
+
+	FPRINTF(stderr, "BEFORE: First element was %f\n", vector[0]);
+	FPRINTF(stderr, "BEFORE: Last element was %f\n", vector[NX-1]);
+
+	starpu_conf_init(&conf);
+
+	conf.single_combined_worker = 1;
+	conf.sched_policy_name = "pheft";
+
+	ret = starpu_init(&conf);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_data_handle_t vector_handle;
+	starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));
+
+	float factor = 1.001;
+
+	for (i = 0; i < 100; i++) {
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &cl;
+
+		task->handles[0] = vector_handle;
+		task->cl_arg = &factor;
+		task->cl_arg_size = sizeof(factor);
+
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV) {
+		     ret = 77;
+		     break;
+		}
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	starpu_data_unregister(vector_handle);
+
+	/* terminate StarPU, no task can be submitted after */
+	starpu_shutdown();
+
+	FPRINTF(stderr, "AFTER: First element is %f\n", vector[0]);
+	FPRINTF(stderr, "AFTER: Last element is %f\n", vector[NX-1]);
+
+	return ret;
+}

+ 8 - 8
examples/spmv/spmv_kernels.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -28,18 +28,18 @@ void spmv_kernel_opencl(void *descr[], void *args)
 	cl_event event;
 	int id, devid, err, n;
 
-	uint32_t nnz = STARPU_CSR_GET_NNZ(descr[0]);
-	uint32_t nrow = STARPU_CSR_GET_NROW(descr[0]);
+	int nnz = (int) STARPU_CSR_GET_NNZ(descr[0]);
+	int nrow = (int) STARPU_CSR_GET_NROW(descr[0]);
 	cl_mem nzval = (cl_mem)STARPU_CSR_GET_NZVAL(descr[0]);
-	uint32_t *colind = STARPU_CSR_GET_COLIND(descr[0]);
-	uint32_t *rowptr = STARPU_CSR_GET_ROWPTR(descr[0]);
-	uint32_t firstentry = STARPU_CSR_GET_FIRSTENTRY(descr[0]);
+	cl_mem colind = (cl_mem)STARPU_CSR_GET_COLIND(descr[0]);
+	cl_mem rowptr = (cl_mem)STARPU_CSR_GET_ROWPTR(descr[0]);
+	int firstentry = STARPU_CSR_GET_FIRSTENTRY(descr[0]);
 
 	cl_mem vecin = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[1]);
-	uint32_t nx_in = STARPU_VECTOR_GET_NX(descr[1]);
+	int nx_in = (int)STARPU_VECTOR_GET_NX(descr[1]);
 
 	cl_mem vecout = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[2]);
-	uint32_t nx_out = STARPU_VECTOR_GET_NX(descr[2]);
+	int nx_out = (int)STARPU_VECTOR_GET_NX(descr[2]);
 
         id = starpu_worker_get_id();
         devid = starpu_worker_get_devid(id);

+ 6 - 8
examples/spmv/spmv_opencl.cl

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,15 +14,13 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-__kernel void spmv(unsigned nnz, unsigned nrow,
+__kernel void spmv(int nnz, int nrow,
                    __global float* nzval, __global unsigned* colind,
-                   __global unsigned* rowptr, unsigned firstentry,
-                   __global float *vecin, unsigned nx_in,
-                   __global float *vecout, unsigned nx_out)
+                   __global unsigned* rowptr, int firstentry,
+                   __global float *vecin, int nx_in,
+                   __global float *vecout, int nx_out)
 {
-	unsigned row;
-	// for (row = 0; row < nrow; row++)
-	row = get_global_id(0);
+	const int row = get_global_id(0);
 	if (row < nrow)
 	{
 		float tmp = 0.0f;

+ 1 - 1
include/starpu.h

@@ -107,7 +107,7 @@ unsigned starpu_cuda_worker_get_count(void);
 unsigned starpu_spu_worker_get_count(void);
 unsigned starpu_opencl_worker_get_count(void);
 
-int starpu_disable_asynchronous_copy();
+int starpu_asynchronous_copy_disabled();
 
 /* Return the identifier of the thread in case this is associated to a worker.
  * This will return -1 if this function is called directly from the application

+ 24 - 24
socl/examples/clinfo/clinfo.c

@@ -20,7 +20,7 @@
 
 #include <CL/cl.h>
 
-static inline void 
+static inline void
 checkErr(cl_int err, const char * name) {
     if (err != CL_SUCCESS) {
         fprintf(stderr, "ERROR: %s (%d)\n", name, err);
@@ -45,8 +45,8 @@ main(void) {
    platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platforms);
    err = clGetPlatformIDs(num_platforms, platforms, NULL);
    checkErr(err, "Unable to get platform list");
-   
-   
+
+
    // Iteratate over platforms
    printf("Number of platforms:\t\t\t\t %d\n", num_platforms);
 
@@ -54,25 +54,25 @@ main(void) {
       unsigned int i;
       for (i=0; i<num_platforms; i++) {
          char str[256];
-         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, sizeof(str), &str, NULL);
+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, sizeof(str), str, NULL);
          checkErr(err, "clGetPlatformInfo(CL_PLATFORM_PROFILE)");
-         printf("  Plaform Profile:\t\t\t\t %s\n", str);    
+         printf("  Plaform Profile:\t\t\t\t %s\n", str);
 
-         err= clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, sizeof(str), &str, NULL);
+         err= clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, sizeof(str), str, NULL);
          checkErr(err, "clGetPlatformInfo(CL_PLATFORM_VERSION)");
-         printf("  Plaform Version:\t\t\t\t %s\n", str);    
+         printf("  Plaform Version:\t\t\t\t %s\n", str);
 
-         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(str), &str, NULL);
+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(str), str, NULL);
          checkErr(err, "clGetPlatformInfo(CL_PLATFORM_NAME)");
-         printf("  Plaform Name:\t\t\t\t\t %s\n", str);    
+         printf("  Plaform Name:\t\t\t\t\t %s\n", str);
 
-         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(str), &str, NULL);
+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(str), str, NULL);
          checkErr(err, "clGetPlatformInfo(CL_PLATFORM_VENDOR)");
-         printf("  Plaform Vendor:\t\t\t\t %s\n", str);    
+         printf("  Plaform Vendor:\t\t\t\t %s\n", str);
 
-         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, sizeof(str), &str, NULL);
+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, sizeof(str), str, NULL);
          checkErr(err, "clGetPlatformInfo(CL_PLATFORM_EXTENSIONS)");
-         printf("  Plaform Extensions:\t\t\t %s\n", str);    
+         printf("  Plaform Extensions:\t\t\t %s\n", str);
       }
    }
 
@@ -88,12 +88,12 @@ main(void) {
 
          err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(str), &str, NULL);
          checkErr(err, "clGetPlatformInfo(CL_PLATFORM_NAME)");
-         printf("  Plaform Name:\t\t\t\t\t %s\n", str);    
+         printf("  Plaform Name:\t\t\t\t\t %s\n", str);
 
          err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
          checkErr(err, "clGetDeviceIds(CL_DEVICE_TYPE_ALL)");
          devices = (cl_device_id*)malloc(sizeof(cl_device_id)*num_devices);
-         
+
          err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
          checkErr(err, "clGetDeviceIds(CL_DEVICE_TYPE_ALL)");
 
@@ -103,7 +103,7 @@ main(void) {
             for (j=0; j<num_devices; j++) {
                cl_device_type dev_type;
                printf("\n  DEVICE %d\n", j);
-               
+
                err = clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL);
                checkErr(err, "clGetDeviceInfo(CL_DEVICE_TYPE)");
 
@@ -123,13 +123,13 @@ main(void) {
                   cl_uint vendor_id;
                   err = clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR_ID, sizeof(vendor_id), &vendor_id, NULL);
                   checkErr(err, "clGetDeviceInfo(CL_DEVICE_VENDOR_ID)");
-                  printf("  Device ID:\t\t\t\t\t %d\n", vendor_id); 
+                  printf("  Device ID:\t\t\t\t\t %d\n", vendor_id);
                }
                {
                   cl_uint units;
                   err = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(units), &units, NULL);
                   checkErr(err, "clGetDeviceInfo(CL_DEVICE_MAX_COMPUTE_UNITS)");
-                  printf("  Max compute units:\t\t\t\t %d\n", units); 
+                  printf("  Max compute units:\t\t\t\t %d\n", units);
                }
 
                {
@@ -137,12 +137,12 @@ main(void) {
                   size_t *sizes;
                   err = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(dims), &dims, NULL);
                   checkErr(err, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS)");
-                  printf("  Max work item dimensions:\t\t\t %d\n", dims); 
+                  printf("  Max work item dimensions:\t\t\t %d\n", dims);
 
                   sizes = (size_t*)malloc(dims * sizeof(size_t));
                   err = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*dims, sizes, NULL);
                   checkErr(err, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES)");
-                  printf("  Max work item dimensions:\t\t\t %d\n", dims); 
+                  printf("  Max work item dimensions:\t\t\t %d\n", dims);
 
                   {
                      unsigned int k;
@@ -165,7 +165,7 @@ main(void) {
 
 #define GET_STRING(CL_D,str,size) { \
    char val[size]; \
-   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), val, NULL);	\
    checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
    printf(str, val); \
 }
@@ -204,9 +204,9 @@ main(void) {
    checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
    printf(str, ((val & test) == CL_TRUE ? "Yes" : "No")); \
 }
-      
+
                GET_SIZET(CL_DEVICE_MAX_WORK_GROUP_SIZE, "  Max work group size:\t\t\t\t %u\n")
-               
+
                GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, "  Preferred vector width char:\t\t\t %u\n")
                GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, "  Preferred vector width short:\t\t\t %u\n")
                GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, "  Preferred vector width int:\t\t\t %u\n")
@@ -291,7 +291,7 @@ main(void) {
                GET_STRING(CL_DEVICE_PROFILE, "  Profile:\t\t\t\t\t %s\n", 30);
                GET_STRING(CL_DEVICE_VERSION, "  Version:\t\t\t\t\t %s\n", 50);
                GET_STRING(CL_DEVICE_EXTENSIONS, "  Extensions:\t\t\t\t\t %s\n", 4096);
-            
+
                printf("\n");
             }
          }

+ 6 - 6
socl/src/cl_getdeviceinfo.c

@@ -79,12 +79,12 @@ soclGetDeviceInfo(cl_device_id    device,
          cl_platform_id p = &socl_platform;
          INFO_CASE_EX2(p);
       }
-      INFO_CASE(CL_DEVICE_NAME, device->name)
-      INFO_CASE(CL_DEVICE_VENDOR, SOCL_VENDOR)
-      INFO_CASE(CL_DRIVER_VERSION, SOCL_DRIVER_VERSION)
-      INFO_CASE(CL_DEVICE_PROFILE, SOCL_PROFILE)
-      INFO_CASE(CL_DEVICE_VERSION, SOCL_VERSION)
-      INFO_CASE(CL_DEVICE_EXTENSIONS, device->extensions)
+      INFO_CASE_STRING(CL_DEVICE_NAME, device->name)
+      INFO_CASE_STRING(CL_DEVICE_VENDOR, SOCL_VENDOR)
+      INFO_CASE_STRING(CL_DRIVER_VERSION, SOCL_DRIVER_VERSION)
+      INFO_CASE_STRING(CL_DEVICE_PROFILE, SOCL_PROFILE)
+      INFO_CASE_STRING(CL_DEVICE_VERSION, SOCL_VERSION)
+      INFO_CASE_STRING(CL_DEVICE_EXTENSIONS, device->extensions)
       default:
          return CL_INVALID_VALUE;
    }

+ 9 - 8
socl/src/cl_getplatforminfo.c

@@ -30,15 +30,16 @@ soclGetPlatformInfo(cl_platform_id   platform,
                   size_t *         param_value_size_ret) CL_API_SUFFIX__VERSION_1_0
 {
    if (platform != NULL && platform != &socl_platform)
-      return CL_INVALID_PLATFORM;
+	return CL_INVALID_PLATFORM;
 
-   switch (param_name) {
-      INFO_CASE(CL_PLATFORM_PROFILE, SOCL_PROFILE)
-      INFO_CASE(CL_PLATFORM_VERSION, SOCL_VERSION)
-      INFO_CASE(CL_PLATFORM_NAME,    SOCL_PLATFORM_NAME)
-      INFO_CASE(CL_PLATFORM_VENDOR,  SOCL_VENDOR)
-      INFO_CASE(CL_PLATFORM_EXTENSIONS, SOCL_PLATFORM_EXTENSIONS)
-      default:
+   switch (param_name)
+   {
+	INFO_CASE_STRING(CL_PLATFORM_PROFILE, SOCL_PROFILE);
+	INFO_CASE_STRING(CL_PLATFORM_VERSION, SOCL_VERSION);
+	INFO_CASE_STRING(CL_PLATFORM_NAME,    SOCL_PLATFORM_NAME);
+	INFO_CASE_STRING(CL_PLATFORM_VENDOR,  SOCL_VENDOR);
+	INFO_CASE_STRING(CL_PLATFORM_EXTENSIONS, SOCL_PLATFORM_EXTENSIONS);
+   default:
          return CL_INVALID_VALUE;
    }
 

+ 12 - 0
socl/src/getinfo.h

@@ -29,6 +29,18 @@
 #define INFO_CASE(param, var) case param: \
    INFO_CASE_EX2(var)
 
+#define INFO_CASE_STRING_EX2(var) if (param_value != NULL) { \
+      if (param_value_size < strlen(var)) \
+         return CL_INVALID_VALUE; \
+      strcpy(param_value, var); \
+   } \
+   if (param_value_size_ret != NULL) \
+      *param_value_size_ret = strlen(var); \
+   break;
+
+#define INFO_CASE_STRING(param, var) case param: \
+   INFO_CASE_STRING_EX2(var)
+
 #define INFO_CASE_VALUE(param, type, value) case param: {\
       type tmp = (value);\
       INFO_CASE_EX2(tmp);\

+ 4 - 1
src/core/jobs.c

@@ -204,8 +204,11 @@ void _starpu_handle_job_termination(struct _starpu_job *j, int workerid)
 			_starpu_clock_gettime(&task->profiling_info->callback_end_time);
 	}
 
+	/* If the job was executed on a combined worker there is no need for the
+	 * scheduler to process it : the task structure doesn't contain any valuable
+	 * data as it's not linked to an actual worker */
 	/* control task should not execute post_exec_hook */
-	if(task->cl != NULL && !task->control_task)
+	if(j->task_size == 1 && task->cl != NULL && !task->control_task)
 	{
 	  _starpu_sched_post_exec_hook(task);
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR

+ 5 - 5
src/core/workers.c

@@ -458,16 +458,16 @@ int starpu_init(struct starpu_conf *user_conf)
 
 	if (user_conf)
 	{
-	     int disable_asynchronous_copy = starpu_get_env_number("STARPU_DISABLE_ASYNCHRONOUS_COPY");
-	     if (disable_asynchronous_copy == 1)
+	     int asynchronous_copy_disabled = starpu_get_env_number("DISABLE_STARPU_ASYNCHRONOUS_COPY");
+	     if (asynchronous_copy_disabled == 1)
 		  config.disable_asynchronous_copy = 1;
 	     else
 		  config.disable_asynchronous_copy = (user_conf->disable_asynchronous_copy == 1);
 	}
 	else
 	{
-	     int disable_asynchronous_copy = starpu_get_env_number("STARPU_DISABLE_ASYNCHRONOUS_COPY");
-	     config.disable_asynchronous_copy = (disable_asynchronous_copy == 1);
+	     int asynchronous_copy_disabled = starpu_get_env_number("STARPU_DISABLE_ASYNCHRONOUS_COPY");
+	     config.disable_asynchronous_copy = (asynchronous_copy_disabled == 1);
 	}
 
 	_starpu_init_all_sched_ctxs(&config);
@@ -721,7 +721,7 @@ unsigned starpu_spu_worker_get_count(void)
 	return config.topology.ngordon_spus;
 }
 
-int starpu_disable_asynchronous_copy()
+int starpu_asynchronous_copy_disabled()
 {
 	return config.disable_asynchronous_copy;
 }

+ 2 - 2
src/datawizard/interfaces/data_interface.c

@@ -288,8 +288,8 @@ void starpu_data_register(starpu_data_handle_t *handleptr, uint32_t home_node,
 	*handleptr = handle;
 	handle->mf_node = home_node;
 
-	int disable_asynchronous_copy = starpu_disable_asynchronous_copy();
-	if (STARPU_UNLIKELY(disable_asynchronous_copy))
+	int asynchronous_copy_disabled = starpu_asynchronous_copy_disabled();
+	if (STARPU_UNLIKELY(asynchronous_copy_disabled))
 	{
 #ifdef STARPU_USE_CUDA
 	     if (ops->copy_methods->ram_to_cuda_async)

+ 22 - 6
src/drivers/cpu/driver_cpu.c

@@ -28,9 +28,14 @@
 #include <core/sched_policy.h>
 #include <core/sched_ctx.h>
 
-static int execute_job_on_cpu(struct _starpu_job *j, struct _starpu_worker *cpu_args, int is_parallel_task, int rank, enum starpu_perf_archtype perf_arch)
+/* Actually launch the job on a cpu worker.
+ * Handle binding CPUs on cores.
+ * In the case of a combined worker WORKER_TASK != J->TASK */
+
+static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_task, struct _starpu_worker *cpu_args, int rank, enum starpu_perf_archtype perf_arch)
 {
 	int ret;
+	int is_parallel_task = (j->task_size > 1);
 	int profiling = starpu_profiling_status_get();
 	struct timespec codelet_start, codelet_end;
 
@@ -51,8 +56,14 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct _starpu_worker *cpu_
 	}
 
 	if (is_parallel_task)
+	{
 		_STARPU_PTHREAD_BARRIER_WAIT(&j->before_work_barrier);
 
+		/* In the case of a combined worker, the scheduler needs to know
+		 * when each actual worker begins the execution */
+		_starpu_sched_pre_exec_hook(worker_task);
+	}
+
 	/* Give profiling variable */
 	_starpu_driver_start_job(cpu_args, j, &codelet_start, rank, profiling);
 
@@ -167,10 +178,6 @@ void *_starpu_cpu_worker(void *arg)
 		/* Get the rank in case it is a parallel task */
 		if (is_parallel_task)
 		{
-			/* We can release the fake task */
-			STARPU_ASSERT(task != j->task);
-			free(task);
-
 			_STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 			rank = j->active_task_alias_count++;
 			_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
@@ -194,7 +201,7 @@ void *_starpu_cpu_worker(void *arg)
 		_starpu_set_current_task(j->task);
 		cpu_arg->current_task = j->task;
 
-                res = execute_job_on_cpu(j, cpu_arg, is_parallel_task, rank, perf_arch);
+                res = execute_job_on_cpu(j, task, cpu_arg, rank, perf_arch);
 
 		_starpu_set_current_task(NULL);
 		cpu_arg->current_task = NULL;
@@ -211,6 +218,15 @@ void *_starpu_cpu_worker(void *arg)
 			}
 		}
 
+		/* In the case of combined workers, we need to inform the
+		 * scheduler each worker's execution is over.
+		 * Then we free the workers' task alias */
+		if (is_parallel_task)
+		{
+			_starpu_sched_post_exec_hook(task);
+			free(task);
+		}
+
 		if (rank == 0)
 			_starpu_handle_job_termination(j, workerid);
         }

+ 4 - 1
src/drivers/driver_common/driver_common.c

@@ -39,7 +39,10 @@ void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j
 	if (cl->model && cl->model->benchmarking)
 		calibrate_model = 1;
 
-	if (rank == 0)
+	/* If the job is executed on a combined worker there is no need for the
+	 * scheduler to process it : it doesn't contain any valuable data
+	 * as it's not linked to an actual worker */
+	if (j->task_size == 1)
 		_starpu_sched_pre_exec_hook(task);
 
 	args->status = STATUS_EXECUTING;

+ 6 - 2
src/drivers/opencl/driver_opencl.c

@@ -58,7 +58,9 @@ static void limit_gpu_mem_if_needed(int devid)
 
 	/* Request the size of the current device's memory */
 	cl_ulong totalGlobalMem;
-	clGetDeviceInfo(devices[devid], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(totalGlobalMem), &totalGlobalMem, NULL);
+	err = clGetDeviceInfo(devices[devid], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(totalGlobalMem), &totalGlobalMem, NULL);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
 
 	/* How much memory to waste ? */
 	size_t to_waste = (size_t)totalGlobalMem - (size_t)limit*1024*1024;
@@ -76,7 +78,9 @@ static void unlimit_gpu_mem_if_needed(int devid)
 {
 	if (wasted_memory[devid])
 	{
-		clReleaseMemObject(wasted_memory[devid]);
+		cl_int err = clReleaseMemObject(wasted_memory[devid]);
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
 		wasted_memory[devid] = NULL;
 	}
 }

+ 2 - 3
src/sched_policies/heft.c

@@ -353,6 +353,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 			{
 				ntasks_best_end = ntasks_end;
 				ntasks_best = worker;
+				nimpl_best = nimpl;
 			}
 
 			if (isnan(local_task_length[worker_ctx][nimpl]))
@@ -417,13 +418,11 @@ static int push_conversion_tasks(struct starpu_task *task, unsigned int workerid
 		conversion_task->execute_on_a_specific_worker = 1;
 		conversion_task->workerid = workerid;
 		conversion_task->mf_skip = 1;
+		handle->mf_node = node;
 		ret = _starpu_task_submit_conversion_task(conversion_task, workerid);
 		STARPU_ASSERT(ret == 0);
 	}
 
-	for (i = 0; i < task->cl->nbuffers; i++)
-		task->handles[i]->mf_node = node;
-
 	task->execute_on_a_specific_worker = 1;
 	task->workerid = workerid;
 	task->mf_skip= 1;

+ 5 - 6
src/sched_policies/parallel_heft.c

@@ -130,6 +130,11 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 		j->task_size = worker_size;
 		j->combined_workerid = best_workerid;
 		j->active_task_alias_count = 0;
+
+		/* This task doesn't belong to an actual worker, it belongs
+		 * to a combined worker and thus the scheduler doesn't care
+		 * of its predicted values which are insignificant */
+		task->predicted = 0;
 		task->predicted_transfer = 0;
 
 		_STARPU_PTHREAD_BARRIER_INIT(&j->before_work_barrier, NULL, worker_size);
@@ -368,16 +373,11 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 		{
 			worker = workers->has_next(workers) ? workers->get_next(workers) : worker_ctx;
 
-			unsigned incremented = 0;
 			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 			{
 				if (skip_worker[worker_ctx][nimpl])
 				{
 					/* no one on that queue may execute this task */
-					if(!incremented)
-						worker_ctx++;
-
-					incremented = 1;
 					continue;
 				}
 
@@ -402,7 +402,6 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 
 			//	fprintf(stderr, "FITNESS worker %d -> %e local_exp_end %e - local_data_penalty %e\n", worker, fitness[worker][nimpl], local_exp_end[worker][nimpl] - best_exp_end, local_data_penalty[worker][nimpl]);
 			}
-			if(!incremented)
 				worker_ctx++;
 		}
 	}

+ 18 - 6
tests/datawizard/handle_to_pointer.c

@@ -80,7 +80,7 @@ static void opencl_task(void *buffers[], void *args)
 				NULL,           /* event_wait_list */
 				NULL            /* event */);
 	}
-			
+	clFinish(queue);
 }
 #endif
 
@@ -146,12 +146,24 @@ int main(int argc, char *argv[])
 	starpu_data_acquire(handle, STARPU_R);
 
 	/* Make sure we have a local pointer to it.  */
+	ret = EXIT_SUCCESS;
 	pointer = (int *) starpu_handle_to_pointer(handle, 0);
-	STARPU_ASSERT(pointer != NULL);
-	for(i = 0; i < count; i++)
+	if (pointer == NULL)
 	{
-		int *numbers = (int *)pointer;
-		STARPU_ASSERT(numbers[i] == i);
+	     FPRINTF(stderr, "pointer should be non NULL\n");
+	     ret = EXIT_FAILURE;
+	}
+	else
+	{
+	     for(i = 0; i < count; i++)
+	     {
+		  int *numbers = (int *)pointer;
+		  if (numbers[i] != i)
+		  {
+		       FPRINTF(stderr, "Incorrect value numbers[%d] == %d should be %d\n", i, numbers[i], i);
+		       ret = EXIT_FAILURE;
+		  }
+	     }
 	}
 	starpu_data_release(handle);
 
@@ -159,5 +171,5 @@ int main(int argc, char *argv[])
 
 	starpu_shutdown();
 
-	return EXIT_SUCCESS;
+	return ret;
 }

+ 3 - 0
tests/datawizard/increment_redux_lazy.c

@@ -79,6 +79,7 @@ static void redux_opencl_kernel(void *descr[], void *arg)
 	h_dst += h_src;
 
 	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
+	clFinish(queue);
 }
 
 static void neutral_opencl_kernel(void *descr[], void *arg)
@@ -90,6 +91,7 @@ static void neutral_opencl_kernel(void *descr[], void *arg)
 	starpu_opencl_get_current_queue(&queue);
 
 	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
+	clFinish(queue);
 }
 #endif
 
@@ -151,6 +153,7 @@ static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((u
 	clEnqueueReadBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
 	h_token++;
 	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
+	clFinish(queue);
 }
 #endif
 

+ 1 - 1
tests/datawizard/interfaces/test_interfaces.c

@@ -589,7 +589,7 @@ out:
 static void
 run_async(void)
 {
-	int async = starpu_disable_asynchronous_copy();
+	int async = starpu_asynchronous_copy_disabled();
 	if (async == 1) {
 		FPRINTF(stderr, "Asynchronous copies have been disabled\n");
 		return;

+ 1 - 1
tests/datawizard/readonly.c

@@ -45,7 +45,7 @@ int main(int argc, char **argv)
      if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
      STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-     int copy = starpu_disable_asynchronous_copy();
+     int copy = starpu_asynchronous_copy_disabled();
      FPRINTF(stderr, "copy %d\n", copy);
 
      starpu_variable_data_register(&handle, 0, (uintptr_t)&var, sizeof(var));

+ 1 - 0
tests/datawizard/wt_broadcast.c

@@ -46,6 +46,7 @@ static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((u
 	clEnqueueReadBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
 	h_token++;
 	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
+	clFinish(queue);
 }
 #endif
 

+ 1 - 0
tests/datawizard/wt_host.c

@@ -46,6 +46,7 @@ static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((u
 	clEnqueueReadBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
 	h_token++;
 	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
+	clFinish(queue);
 }
 #endif